#!/usr/bin/env python3
"""
Comprehensive GWAS Trait Analysis Script
Expanded version with 200+ clinically relevant trait-associated SNPs

Scans a (optionally gzip-compressed) VCF for the positions listed in
``TRAIT_SNPS``, counts how many copies of each listed allele one sample
carries, and writes a plain-text report grouped by trait category.

Usage: script.py [vcf_path] [output_path] [sample_index]
"""

import gzip
import sys
import re
from collections import defaultdict
from typing import Dict, List, Tuple

# ============================================================================
# COMPREHENSIVE TRAIT-ASSOCIATED SNPs DATABASE
# Format: rsid -> (chrom, pos, risk_allele, trait, effect, category)
#
# Keys carrying a suffix (e.g. "rs6983267_crc", "rs4988235_lct") reuse the
# position of another entry so that one site can be reported under more than
# one trait/category; the suffixed key is what appears in the report.
# NOTE(review): positions look like GRCh37/hg19 coordinates - confirm that the
# input VCF uses the same genome build.
# ============================================================================

TRAIT_SNPS = {
    # ========================================================================
    # GOUT / URIC ACID METABOLISM (newly added)
    # ========================================================================
    "rs2231142": ("4", 89052323, "T", "Gout / Hyperuricemia", "risk", "Gout"),
    "rs16890979": ("4", 9922166, "T", "Serum uric acid levels", "higher", "Gout"),
    "rs734553": ("4", 9920485, "G", "Gout", "risk", "Gout"),
    "rs1014290": ("4", 10001861, "A", "Serum uric acid levels", "higher", "Gout"),
    "rs505802": ("11", 64357072, "C", "Serum uric acid levels", "higher", "Gout"),
    "rs3775948": ("4", 9999007, "G", "Gout", "risk", "Gout"),
    "rs12498742": ("4", 9993806, "A", "Serum uric acid levels", "higher", "Gout"),
    "rs675209": ("4", 89011046, "T", "Gout", "risk", "Gout"),
    "rs1165151": ("11", 64352047, "T", "Serum uric acid levels", "higher", "Gout"),
    "rs478607": ("17", 19459563, "A", "Serum uric acid levels", "higher", "Gout"),

    # ========================================================================
    # KIDNEY DISEASE (newly added)
    # ========================================================================
    "rs4293393": ("16", 20364808, "T", "Chronic kidney disease", "risk", "Kidney"),
    "rs12917707": ("16", 20369861, "G", "Chronic kidney disease", "protective", "Kidney"),
    "rs11959928": ("5", 39394747, "A", "eGFR decline", "risk", "Kidney"),
    "rs1260326": ("2", 27730940, "T", "Chronic kidney disease", "risk", "Kidney"),
    "rs13329952": ("16", 20393103, "C", "Chronic kidney disease", "risk", "Kidney"),
    "rs267734": ("1", 150950830, "C", "Chronic kidney disease", "risk", "Kidney"),

    # ========================================================================
    # HEARING LOSS (relevant to a family with Usher syndrome)
    # ========================================================================
    "rs7598759": ("2", 70439175, "A", "Age-related hearing loss", "risk", "Hearing"),
    "rs161927": ("5", 88228027, "G", "Hearing impairment", "risk", "Hearing"),
    "rs10497394": ("2", 70477374, "T", "Hearing loss", "risk", "Hearing"),
    "rs3752752": ("7", 129608155, "C", "Noise-induced hearing loss", "risk", "Hearing"),
    "rs7294": ("4", 6303557, "G", "Hearing loss", "risk", "Hearing"),

    # ========================================================================
    # AUTOIMMUNE DISEASES (newly added)
    # ========================================================================
    # Rheumatoid Arthritis
    "rs6679677": ("1", 114179091, "A", "Rheumatoid arthritis", "risk", "Autoimmune"),
    "rs2476601": ("1", 114377568, "A", "Rheumatoid arthritis / Autoimmune", "risk", "Autoimmune"),
    "rs3087243": ("2", 204447164, "G", "Rheumatoid arthritis", "protective", "Autoimmune"),
    "rs4810485": ("20", 44747947, "T", "Rheumatoid arthritis", "risk", "Autoimmune"),
    # Systemic Lupus Erythematosus (SLE)
    "rs1143679": ("16", 31276811, "A", "Systemic lupus erythematosus", "risk", "Autoimmune"),
    "rs7574865": ("2", 191099907, "T", "Systemic lupus erythematosus", "risk", "Autoimmune"),
    "rs2187668": ("6", 32605884, "T", "Systemic lupus erythematosus", "risk", "Autoimmune"),
    # Multiple Sclerosis
    "rs3135388": ("6", 32439887, "A", "Multiple sclerosis", "risk", "Autoimmune"),
    "rs6897932": ("5", 35910332, "C", "Multiple sclerosis", "risk", "Autoimmune"),
    "rs4648356": ("1", 101256530, "C", "Multiple sclerosis", "risk", "Autoimmune"),
    # Inflammatory Bowel Disease
    "rs2241880": ("16", 50756540, "G", "Crohn's disease / IBD", "risk", "Autoimmune"),
    "rs11209026": ("1", 67705958, "A", "Crohn's disease / IBD", "protective", "Autoimmune"),
    "rs10883365": ("10", 64426914, "G", "Ulcerative colitis", "risk", "Autoimmune"),
    "rs2066847": ("16", 50745926, "C", "Crohn's disease", "risk", "Autoimmune"),
    # Type 1 Diabetes
    "rs2292239": ("12", 56482804, "T", "Type 1 diabetes", "risk", "Autoimmune"),
    "rs3129889": ("6", 32609440, "G", "Type 1 diabetes", "risk", "Autoimmune"),
    "rs689": ("11", 2182224, "T", "Type 1 diabetes", "risk", "Autoimmune"),
    # Celiac Disease
    "rs2395182": ("6", 32713854, "T", "Celiac disease", "risk", "Autoimmune"),
    "rs7775228": ("6", 32665438, "C", "Celiac disease", "risk", "Autoimmune"),
    # Hashimoto's Thyroiditis / Graves' Disease
    "rs179247": ("2", 204733986, "A", "Autoimmune thyroid disease", "risk", "Autoimmune"),
    "rs1980422": ("6", 90957406, "C", "Autoimmune thyroid disease", "risk", "Autoimmune"),

    # ========================================================================
    # CANCER RISK (newly added)
    # ========================================================================
    # Breast Cancer
    "rs2981582": ("10", 123337335, "A", "Breast cancer (FGFR2)", "risk", "Cancer"),
    "rs13281615": ("8", 128355618, "G", "Breast cancer", "risk", "Cancer"),
    "rs889312": ("5", 56067641, "C", "Breast cancer (MAP3K1)", "risk", "Cancer"),
    "rs3817198": ("11", 1909006, "C", "Breast cancer (LSP1)", "risk", "Cancer"),
    "rs13387042": ("2", 217905832, "A", "Breast cancer", "risk", "Cancer"),
    # Prostate Cancer
    "rs1447295": ("8", 128554220, "A", "Prostate cancer", "risk", "Cancer"),
    "rs16901979": ("8", 128320346, "A", "Prostate cancer", "risk", "Cancer"),
    "rs6983267": ("8", 128413305, "G", "Prostate cancer / Colorectal cancer", "risk", "Cancer"),
    "rs10993994": ("10", 51549496, "T", "Prostate cancer (MSMB)", "risk", "Cancer"),
    "rs7679673": ("4", 106061534, "C", "Prostate cancer", "risk", "Cancer"),
    # Colorectal Cancer
    "rs4939827": ("18", 46453463, "T", "Colorectal cancer (SMAD7)", "risk", "Cancer"),
    "rs6983267_crc": ("8", 128413305, "G", "Colorectal cancer", "risk", "Cancer"),
    "rs4779584": ("15", 32994756, "T", "Colorectal cancer", "risk", "Cancer"),
    "rs10795668": ("10", 8701219, "G", "Colorectal cancer", "protective", "Cancer"),
    # Lung Cancer
    "rs8034191": ("15", 78894339, "C", "Lung cancer", "risk", "Cancer"),
    "rs1051730": ("15", 78882925, "A", "Lung cancer / Nicotine dependence", "risk", "Cancer"),
    "rs2736100": ("5", 1286516, "C", "Lung cancer (TERT)", "risk", "Cancer"),
    # Melanoma
    "rs910873": ("20", 32665748, "C", "Melanoma", "risk", "Cancer"),
    "rs1801516": ("11", 108175462, "A", "Melanoma (ATM)", "risk", "Cancer"),
    "rs16953002": ("12", 89328335, "A", "Melanoma", "risk", "Cancer"),
    # Thyroid Cancer
    "rs965513": ("9", 100556109, "A", "Thyroid cancer", "risk", "Cancer"),
    "rs944289": ("14", 36649246, "T", "Thyroid cancer", "risk", "Cancer"),
    # Bladder Cancer
    "rs710521": ("3", 189643526, "A", "Bladder cancer", "risk", "Cancer"),
    "rs9642880": ("8", 128787253, "T", "Bladder cancer", "risk", "Cancer"),

    # ========================================================================
    # BLOOD CLOTTING / THROMBOSIS (newly added)
    # ========================================================================
    "rs6025": ("1", 169519049, "T", "Factor V Leiden / DVT risk", "risk", "Thrombosis"),
    "rs1799963": ("11", 46761055, "A", "Prothrombin G20210A / DVT risk", "risk", "Thrombosis"),
    "rs8176719": ("9", 136131322, "C", "Blood type O (protective for VTE)", "protective", "Thrombosis"),
    "rs505922": ("9", 136149229, "C", "Venous thromboembolism", "risk", "Thrombosis"),
    "rs2066865": ("4", 155525276, "G", "Fibrinogen levels / DVT", "risk", "Thrombosis"),

    # ========================================================================
    # THYROID DISORDERS (newly added)
    # ========================================================================
    "rs1991517": ("8", 133020441, "C", "Hypothyroidism", "risk", "Thyroid"),
    "rs925489": ("2", 218283107, "T", "TSH levels", "higher", "Thyroid"),
    "rs10499559": ("6", 166474536, "T", "Hypothyroidism", "risk", "Thyroid"),
    "rs7850258": ("9", 4126287, "G", "Thyroid function", "altered", "Thyroid"),

    # ========================================================================
    # OSTEOPOROSIS / BONE HEALTH (newly added)
    # ========================================================================
    "rs3736228": ("11", 68179081, "T", "Osteoporosis / Low BMD", "risk", "Bone"),
    "rs4988235": ("2", 136608646, "G", "Lactose intolerance (affects Ca)", "risk", "Bone"),
    "rs2282679": ("4", 72608383, "C", "Vitamin D deficiency", "risk", "Bone"),
    "rs1800012": ("17", 48275363, "T", "Osteoporosis (COL1A1)", "risk", "Bone"),
    "rs2062377": ("8", 119964052, "A", "Bone mineral density", "lower", "Bone"),
    "rs4355801": ("8", 119963145, "G", "Bone mineral density", "higher", "Bone"),

    # ========================================================================
    # LIVER DISEASE (newly added)
    # ========================================================================
    "rs738409": ("22", 44324727, "G", "NAFLD / Fatty liver (PNPLA3)", "risk", "Liver"),
    "rs58542926": ("19", 19379549, "T", "NAFLD / Liver fibrosis (TM6SF2)", "risk", "Liver"),
    "rs2228603": ("19", 11350488, "T", "NAFLD", "risk", "Liver"),
    "rs12979860": ("19", 39248147, "C", "Hepatitis C clearance", "favorable", "Liver"),

    # ========================================================================
    # MIGRAINE / HEADACHE (newly added)
    # ========================================================================
    "rs2651899": ("1", 10796866, "C", "Migraine", "risk", "Migraine"),
    "rs10166942": ("2", 234824778, "T", "Migraine", "risk", "Migraine"),
    "rs11172113": ("12", 57527283, "C", "Migraine (LRP1)", "risk", "Migraine"),
    "rs1835740": ("8", 87521374, "A", "Migraine", "risk", "Migraine"),

    # ========================================================================
    # LONGEVITY / AGING (newly added)
    # ========================================================================
    "rs2802292": ("6", 157192662, "G", "Longevity (FOXO3)", "protective", "Longevity"),
    "rs1042522": ("17", 7579472, "C", "Longevity (TP53)", "altered", "Longevity"),
    "rs4420638": ("19", 45422946, "A", "Longevity / Cardiovascular", "risk", "Longevity"),

    # ========================================================================
    # SLEEP / CIRCADIAN (existing + expanded)
    # ========================================================================
    "rs113851554": ("2", 66799986, "T", "Insomnia", "risk", "Sleep"),
    "rs12927162": ("16", 68856985, "A", "Sleep duration", "shorter", "Sleep"),
    "rs1823125": ("1", 205713532, "G", "Chronotype (morning person)", "morning", "Sleep"),
    "rs10493596": ("1", 215803417, "T", "Insomnia", "risk", "Sleep"),
    "rs3104997": ("6", 27424938, "C", "Sleep duration", "shorter", "Sleep"),
    "rs73598374": ("4", 94847526, "A", "Insomnia", "risk", "Sleep"),
    "rs2302729": ("5", 35857091, "G", "Insomnia", "risk", "Sleep"),
    "rs12936231": ("17", 44282378, "C", "Restless legs syndrome", "risk", "Sleep"),
    "rs3923809": ("6", 38642286, "A", "Restless legs syndrome (BTBD9)", "risk", "Sleep"),

    # ========================================================================
    # SKIN CONDITIONS (existing + expanded)
    # ========================================================================
    "rs1800629": ("6", 31543031, "A", "Psoriasis", "risk", "Skin"),
    "rs20541": ("5", 131995964, "A", "Atopic dermatitis", "risk", "Skin"),
    "rs2066808": ("6", 31540784, "A", "Psoriasis", "risk", "Skin"),
    "rs3093662": ("6", 31574339, "G", "Psoriasis", "risk", "Skin"),
    "rs10484554": ("6", 31271836, "A", "Psoriasis", "risk", "Skin"),
    "rs1295686": ("5", 131996447, "A", "Atopic dermatitis", "risk", "Skin"),
    "rs2227956": ("6", 31783279, "T", "Psoriasis", "risk", "Skin"),
    "rs6906021": ("6", 32051991, "C", "Atopic dermatitis", "risk", "Skin"),
    "rs12203592": ("6", 396321, "T", "Skin pigmentation / Freckling", "risk", "Skin"),
    "rs1805007": ("16", 89986117, "T", "Red hair / Fair skin (MC1R)", "risk", "Skin"),
    "rs1805008": ("16", 89986144, "T", "Red hair / Fair skin (MC1R)", "risk", "Skin"),

    # ========================================================================
    # CARDIOVASCULAR (existing + greatly expanded)
    # ========================================================================
    "rs10757274": ("9", 22096055, "G", "Coronary artery disease", "risk", "Cardiovascular"),
    "rs1333049": ("9", 22125503, "C", "Coronary artery disease", "risk", "Cardiovascular"),
    "rs4665058": ("2", 43845437, "C", "Coronary artery disease", "risk", "Cardiovascular"),
    "rs17465637": ("1", 222823529, "A", "Coronary artery disease", "risk", "Cardiovascular"),
    "rs6725887": ("2", 203828796, "C", "Coronary artery disease", "risk", "Cardiovascular"),
    # Hypertension
    "rs699": ("1", 230845794, "G", "Hypertension (AGT)", "risk", "Cardiovascular"),
    "rs5186": ("3", 148459988, "C", "Hypertension (AGTR1)", "risk", "Cardiovascular"),
    "rs4961": ("4", 2906707, "T", "Hypertension / Salt sensitivity", "risk", "Cardiovascular"),
    "rs1799998": ("8", 142876043, "T", "Hypertension (CYP11B2)", "risk", "Cardiovascular"),
    # Atrial Fibrillation
    "rs2200733": ("4", 111718106, "T", "Atrial fibrillation", "risk", "Cardiovascular"),
    "rs10033464": ("4", 111714418, "T", "Atrial fibrillation", "risk", "Cardiovascular"),
    "rs6843082": ("4", 111712344, "G", "Atrial fibrillation (PITX2)", "risk", "Cardiovascular"),
    # Heart Failure
    "rs1739843": ("15", 75086042, "T", "Heart failure", "risk", "Cardiovascular"),
    # Stroke
    "rs11833579": ("12", 115553310, "A", "Ischemic stroke", "risk", "Cardiovascular"),
    "rs12425791": ("12", 115557677, "A", "Stroke (NINJ2)", "risk", "Cardiovascular"),
    # Lipids
    "rs1801177": ("8", 19813529, "A", "LDL cholesterol (LPL)", "higher", "Cardiovascular"),
    "rs12740374": ("1", 109822166, "G", "LDL cholesterol (CELSR2)", "lower", "Cardiovascular"),
    "rs3764261": ("16", 56993324, "A", "HDL cholesterol (CETP)", "higher", "Cardiovascular"),
    "rs1800588": ("15", 58723675, "T", "HDL cholesterol (LIPC)", "higher", "Cardiovascular"),
    "rs328": ("8", 19819724, "G", "Triglycerides (LPL)", "lower", "Cardiovascular"),
    "rs662799": ("11", 116663707, "G", "Triglycerides (APOA5)", "higher", "Cardiovascular"),

    # ========================================================================
    # TYPE 2 DIABETES / METABOLIC (existing + expanded)
    # ========================================================================
    "rs7903146": ("10", 114758349, "T", "Type 2 diabetes (TCF7L2)", "risk", "Metabolic"),
    "rs12255372": ("10", 114808902, "T", "Type 2 diabetes (TCF7L2)", "risk", "Metabolic"),
    "rs1801282": ("3", 12393125, "C", "Type 2 diabetes (PPARG)", "risk", "Metabolic"),
    "rs5219": ("11", 17409572, "T", "Type 2 diabetes (KCNJ11)", "risk", "Metabolic"),
    "rs13266634": ("8", 118184783, "C", "Type 2 diabetes (SLC30A8)", "risk", "Metabolic"),
    "rs7754840": ("6", 20679709, "C", "Type 2 diabetes (CDKAL1)", "risk", "Metabolic"),
    "rs10811661": ("9", 22134095, "T", "Type 2 diabetes (CDKN2A/B)", "risk", "Metabolic"),
    "rs864745": ("7", 28196413, "T", "Type 2 diabetes (JAZF1)", "risk", "Metabolic"),
    "rs4402960": ("3", 185511687, "T", "Type 2 diabetes (IGF2BP2)", "risk", "Metabolic"),
    # Obesity/BMI
    "rs9939609": ("16", 53820527, "A", "Obesity (FTO)", "risk", "Metabolic"),
    "rs17782313": ("18", 57851097, "C", "Obesity (MC4R)", "risk", "Metabolic"),
    "rs6548238": ("2", 634905, "C", "BMI", "higher", "Metabolic"),
    "rs10938397": ("4", 45186139, "G", "BMI (GNPDA2)", "higher", "Metabolic"),
    "rs571312": ("18", 57839769, "A", "BMI (MC4R)", "higher", "Metabolic"),
    "rs10767664": ("11", 27682562, "A", "BMI (BDNF)", "higher", "Metabolic"),

    # ========================================================================
    # EYE CONDITIONS (existing + expanded)
    # ========================================================================
    "rs10490924": ("10", 124214448, "T", "Age-related macular degeneration (ARMS2)", "risk", "Eye"),
    "rs1061170": ("1", 196659237, "C", "Age-related macular degeneration (CFH)", "risk", "Eye"),
    "rs9621532": ("22", 38477587, "C", "Myopia", "risk", "Eye"),
    "rs10034228": ("4", 81951543, "A", "Myopia", "risk", "Eye"),
    "rs1048661": ("1", 165655423, "C", "Glaucoma (LOXL1)", "risk", "Eye"),
    "rs4656461": ("1", 165653012, "G", "Glaucoma (LOXL1)", "risk", "Eye"),
    "rs2165241": ("15", 93600556, "T", "Glaucoma", "risk", "Eye"),
    "rs3753841": ("1", 196704632, "C", "Age-related macular degeneration", "risk", "Eye"),

    # ========================================================================
    # NEUROPSYCHIATRIC (existing)
    # ========================================================================
    # Alzheimer's Disease
    "rs429358": ("19", 45411941, "C", "Alzheimer's disease (APOE e4)", "risk", "Neuropsychiatric"),
    "rs7412": ("19", 45412079, "T", "Alzheimer's disease (APOE e2)", "protective", "Neuropsychiatric"),
    "rs3865444": ("19", 51727962, "C", "Alzheimer's disease (CD33)", "risk", "Neuropsychiatric"),
    "rs744373": ("2", 127892810, "G", "Alzheimer's disease (BIN1)", "risk", "Neuropsychiatric"),
    "rs3851179": ("11", 85868640, "T", "Alzheimer's disease (PICALM)", "protective", "Neuropsychiatric"),
    "rs670139": ("11", 59939307, "G", "Alzheimer's disease (MS4A)", "risk", "Neuropsychiatric"),
    "rs9349407": ("6", 47487762, "C", "Alzheimer's disease (CD2AP)", "risk", "Neuropsychiatric"),
    "rs11136000": ("8", 27468503, "C", "Alzheimer's disease (CLU)", "protective", "Neuropsychiatric"),
    "rs3764650": ("19", 1063443, "G", "Alzheimer's disease (ABCA7)", "risk", "Neuropsychiatric"),
    "rs3818361": ("1", 207692049, "A", "Alzheimer's disease (CR1)", "risk", "Neuropsychiatric"),
    # Parkinson's Disease (newly added)
    "rs356220": ("4", 90626111, "T", "Parkinson's disease (SNCA)", "risk", "Neuropsychiatric"),
    "rs11931074": ("4", 90674917, "G", "Parkinson's disease (SNCA)", "risk", "Neuropsychiatric"),
    "rs34637584": ("12", 40734202, "A", "Parkinson's disease (LRRK2)", "risk", "Neuropsychiatric"),
    "rs34311866": ("4", 951947, "C", "Parkinson's disease (TMEM175)", "risk", "Neuropsychiatric"),
    # Depression
    "rs1545843": ("1", 72761657, "A", "Major depression (NEGR1)", "risk", "Neuropsychiatric"),
    "rs7973260": ("12", 118364392, "A", "Major depression (KSR2)", "risk", "Neuropsychiatric"),
    "rs10514299": ("5", 87992715, "T", "Major depression (TMEM161B)", "risk", "Neuropsychiatric"),
    "rs2422321": ("15", 88945878, "G", "Major depression (NTRK3)", "risk", "Neuropsychiatric"),
    "rs301806": ("1", 8477981, "A", "Major depression (RERE)", "risk", "Neuropsychiatric"),
    "rs1432639": ("3", 117115304, "G", "Major depression (LSAMP)", "risk", "Neuropsychiatric"),
    "rs9530139": ("13", 53645407, "G", "Major depression", "risk", "Neuropsychiatric"),
    "rs4543289": ("10", 106610839, "T", "Major depression (SORCS3)", "risk", "Neuropsychiatric"),
    # Anxiety
    "rs1709393": ("1", 34774088, "A", "Anxiety disorder", "risk", "Neuropsychiatric"),
    "rs7688285": ("4", 123372626, "A", "Anxiety disorder", "risk", "Neuropsychiatric"),
    # Bipolar
    "rs4765913": ("12", 2345295, "A", "Bipolar disorder (CACNA1C)", "risk", "Neuropsychiatric"),
    "rs10994336": ("10", 64649959, "T", "Bipolar disorder (ANK3)", "risk", "Neuropsychiatric"),
    "rs9804190": ("11", 79077426, "C", "Bipolar disorder (ODZ4)", "risk", "Neuropsychiatric"),
    # Schizophrenia
    "rs1625579": ("8", 130635575, "T", "Schizophrenia (MIR137)", "risk", "Neuropsychiatric"),
    "rs2007044": ("6", 28626894, "G", "Schizophrenia (HIST1H2BJ)", "risk", "Neuropsychiatric"),
    "rs6932590": ("6", 27243984, "T", "Schizophrenia", "risk", "Neuropsychiatric"),
    # ADHD (newly added)
    "rs1412005": ("16", 73099702, "T", "ADHD", "risk", "Neuropsychiatric"),
    "rs11210892": ("1", 44185231, "A", "ADHD", "risk", "Neuropsychiatric"),

    # ========================================================================
    # OTHER TRAITS (existing + expanded)
    # ========================================================================
    # Caffeine
    "rs762551": ("15", 75041917, "C", "Caffeine metabolism (slow)", "slow", "Other"),
    "rs2472297": ("15", 75027880, "T", "Caffeine consumption", "higher", "Other"),
    # Alcohol
    "rs671": ("12", 112241766, "A", "Alcohol flush reaction (ALDH2)", "risk", "Other"),
    "rs1229984": ("4", 100239319, "T", "Alcohol metabolism (ADH1B)", "fast", "Other"),
    # Lactose
    "rs4988235_lct": ("2", 136608646, "G", "Lactose intolerance (LCT)", "risk", "Other"),
    # Vitamin D
    "rs12785878": ("11", 71167449, "T", "Vitamin D levels (lower)", "lower", "Other"),
    # Hair
    "rs2180439": ("20", 22162468, "T", "Male pattern baldness", "risk", "Other"),
    "rs1160312": ("X", 67052952, "A", "Male pattern baldness (AR)", "risk", "Other"),
    "rs6625163": ("X", 67177092, "A", "Male pattern baldness", "risk", "Other"),
    # Muscle performance (newly added)
    "rs1815739": ("11", 66560624, "T", "Sprint/Power athlete (ACTN3)", "power", "Other"),
    # Bitter taste (newly added)
    "rs713598": ("7", 141972804, "C", "Bitter taste sensitivity (PTC)", "taster", "Other"),
    "rs1726866": ("7", 141972905, "T", "Bitter taste sensitivity", "taster", "Other"),
    # Cilantro aversion (newly added)
    "rs72921001": ("11", 6889648, "A", "Cilantro aversion", "aversion", "Other"),
}

# Category display order and descriptions.
# Dict order is the order in which categories appear in every summary/report;
# the values are the display names emitted at runtime.
CATEGORIES = {
    "Gout": "痛風 / 尿酸代謝",
    "Kidney": "腎臟疾病",
    "Hearing": "聽力損失",
    "Autoimmune": "自體免疫疾病",
    "Cancer": "癌症風險",
    "Thrombosis": "血栓 / 凝血",
    "Thyroid": "甲狀腺疾病",
    "Bone": "骨質疏鬆 / 骨骼健康",
    "Liver": "肝臟疾病",
    "Migraine": "偏頭痛",
    "Longevity": "長壽 / 老化",
    "Sleep": "睡眠",
    "Skin": "皮膚",
    "Cardiovascular": "心血管疾病",
    "Metabolic": "代謝疾病",
    "Eye": "眼睛疾病",
    "Neuropsychiatric": "神經精神疾病",
    "Other": "其他特性",
}

# Separator between allele indices in a VCF GT value ('/' unphased, '|' phased).
_GT_SEPARATOR = re.compile(r'[/|]')


def _normalize_chrom(chrom: str) -> str:
    """Return *chrom* without a leading 'chr' prefix (case-insensitive)."""
    return chrom[3:] if chrom[:3].lower() == 'chr' else chrom


def get_genotype_class(gt: str) -> str:
    """Classify a VCF GT string.

    Args:
        gt: Genotype such as ``'0/1'``, ``'1|1'`` or ``'./.'``.

    Returns:
        ``'MISSING'`` when no allele is called (every allele is ``'.'`` or the
        string is empty), ``'HOM_REF'`` when every allele is ``'0'``,
        ``'HOM_ALT'`` when every allele is the same non-reference index, and
        ``'HET'`` otherwise (including partially missing calls such as
        ``'./1'`` and calls with two different ALT alleles such as ``'1/2'``).
    """
    alleles = _GT_SEPARATOR.split(gt)
    if all(a in ('.', '') for a in alleles):
        return 'MISSING'
    if all(a == '0' for a in alleles):
        return 'HOM_REF'
    # Homozygous-alternate requires one single ALT allele; '1/2' carries two
    # different alleles and is therefore heterozygous.
    if '.' not in alleles and '0' not in alleles and len(set(alleles)) == 1:
        return 'HOM_ALT'
    return 'HET'


def parse_vcf_for_traits(vcf_path: str, sample_idx: int = 2) -> Tuple[Dict, List]:
    """Parse VCF and look for trait-associated SNPs.

    Records are matched to ``TRAIT_SNPS`` by chromosome and position only; a
    leading ``chr`` on the VCF contig name is ignored. Sites that are absent
    from the VCF produce no entry at all. If several records share a
    position, the last one read wins.

    Args:
        vcf_path: Path to a VCF file; a name ending in ``.gz`` is read through
            ``gzip``.
        sample_idx: 0-based index into the sample columns (the columns after
            FORMAT). If the index is past the last sample column, every
            genotype is treated as missing (``'./.'``).

    Returns:
        ``(found_variants, samples)`` where ``found_variants`` maps each
        matched ``TRAIT_SNPS`` key to a dict describing the call (``pos`` is
        kept as the string read from the VCF) and ``samples`` is the list of
        sample names from the ``#CHROM`` header line (empty if none was seen).
    """
    print(f"Scanning VCF for {len(TRAIT_SNPS)} trait-associated variants...")

    # Build position lookup: (chrom, pos-as-text) -> every table entry there.
    pos_to_snp = defaultdict(list)
    for rsid, (chrom, pos, risk_allele, trait, effect, category) in TRAIT_SNPS.items():
        pos_to_snp[(chrom, str(pos))].append((rsid, risk_allele, trait, effect, category))

    found_variants = {}
    samples = []

    if vcf_path.endswith('.gz'):
        handle = gzip.open(vcf_path, 'rt')
    else:
        handle = open(vcf_path, 'r')

    with handle as f:
        for line in f:
            if line.startswith('##'):
                continue
            if line.startswith('#CHROM'):
                samples = line.strip().split('\t')[9:]
                continue

            parts = line.strip().split('\t')
            if len(parts) < 10:
                continue

            chrom, pos, _vcf_id, ref, alt = parts[:5]
            fmt = parts[8]
            gt_fields = parts[9:]

            # Check if this position has a known trait SNP
            hits = pos_to_snp.get((_normalize_chrom(chrom), pos))
            if not hits:
                continue

            # Get sample genotype. Without a GT key in FORMAT there is no
            # genotype to read, so the call is treated as missing rather than
            # misreading another sub-field (e.g. depth) as allele indices.
            gt = './.'
            fmt_parts = fmt.split(':')
            if 'GT' in fmt_parts and sample_idx < len(gt_fields):
                gt_idx = fmt_parts.index('GT')
                gt_data = gt_fields[sample_idx].split(':')
                if gt_idx < len(gt_data):
                    gt = gt_data[gt_idx]

            gt_class = get_genotype_class(gt)
            alleles = [ref] + alt.split(',')

            # Decode the called allele indices to bases once per record;
            # uncalled ('.') and out-of-range indices are skipped.
            called_bases = []
            if gt_class != 'MISSING':
                for token in _GT_SEPARATOR.split(gt):
                    if token.isdigit() and int(token) < len(alleles):
                        called_bases.append(alleles[int(token)])

            # Process each SNP at this position
            for rsid, risk_allele, trait, effect, category in hits:
                # NOTE: "risk" here means "the allele listed in TRAIT_SNPS";
                # entries whose effect is e.g. 'protective' are counted the
                # same way.
                risk_copies = called_bases.count(risk_allele)
                found_variants[rsid] = {
                    'rsid': rsid,
                    'chrom': chrom,
                    'pos': pos,
                    'ref': ref,
                    'alt': alt,
                    'genotype': gt,
                    'genotype_class': gt_class,
                    'risk_allele': risk_allele,
                    'trait': trait,
                    'effect': effect,
                    'category': category,
                    'has_risk_allele': risk_copies > 0,
                    'risk_copies': risk_copies
                }

    return found_variants, samples


def generate_report(found_variants: Dict, output_path: str, sample_name: str):
    """Generate comprehensive trait analysis report.

    Writes a header, a per-category summary, per-category details (variants
    carrying the listed allele first, then by trait name) and a tab-separated
    table of every found variant, then prints the output location.

    Args:
        found_variants: Mapping as returned by ``parse_vcf_for_traits``.
        output_path: Destination file; overwritten if it exists.
        sample_name: Name shown in the report header.
    """
    # Group by category
    by_category = defaultdict(list)
    for var in found_variants.values():
        by_category[var['category']].append(var)

    rule = "=" * 80

    # The report contains CJK text and emoji, so the encoding is pinned to
    # UTF-8 instead of depending on the platform's locale default.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(rule + "\n")
        f.write("COMPREHENSIVE GWAS TRAIT ANALYSIS REPORT\n")
        f.write(f"Sample: {sample_name}\n")
        f.write(f"Total SNPs analyzed: {len(TRAIT_SNPS)}\n")
        f.write(f"SNPs found in data: {len(found_variants)}\n")
        f.write(rule + "\n\n")

        # Summary statistics
        total_risk = sum(1 for v in found_variants.values() if v['has_risk_allele'])
        f.write(f"OVERALL SUMMARY: {total_risk} risk variants found\n\n")

        # Category summary
        f.write(rule + "\n")
        f.write("SUMMARY BY CATEGORY\n")
        f.write(rule + "\n\n")

        for cat_key, cat_name in CATEGORIES.items():
            if cat_key in by_category:
                variants = by_category[cat_key]
                risk_count = sum(1 for v in variants if v['has_risk_allele'])
                f.write(f"{cat_name}: {risk_count}/{len(variants)} risk variants\n")

        # Detailed results by category
        f.write("\n" + rule + "\n")
        f.write("DETAILED RESULTS BY CATEGORY\n")
        f.write(rule + "\n")

        for cat_key, cat_name in CATEGORIES.items():
            if cat_key not in by_category:
                continue

            variants = by_category[cat_key]
            risk_count = sum(1 for v in variants if v['has_risk_allele'])

            f.write(f"\n\n## {cat_name} ({risk_count}/{len(variants)} risk)\n")
            f.write("-" * 60 + "\n")

            # Sort: risk variants first
            sorted_vars = sorted(variants, key=lambda x: (not x['has_risk_allele'], x['trait']))

            for v in sorted_vars:
                status = "⚠️ RISK" if v['has_risk_allele'] else "✓ OK"
                copies = f"({v['risk_copies']}份)" if v['has_risk_allele'] else ""

                f.write(f"\n{v['trait']}: {v['rsid']} [{status}] {copies}\n")
                f.write(f" 基因型: {v['genotype']} | 風險等位基因: {v['risk_allele']} | 效應: {v['effect']}\n")

        # Full variant table
        f.write("\n\n" + rule + "\n")
        f.write("COMPLETE VARIANT TABLE\n")
        f.write(rule + "\n\n")
        f.write("RSID\tCHROM\tPOS\tGENOTYPE\tRISK_ALLELE\tHAS_RISK\tCOPIES\tTRAIT\tCATEGORY\tEFFECT\n")

        for var in sorted(found_variants.values(), key=lambda x: (x['category'], x['trait'])):
            f.write(f"{var['rsid']}\t{var['chrom']}\t{var['pos']}\t{var['genotype']}\t")
            f.write(f"{var['risk_allele']}\t{var['has_risk_allele']}\t{var['risk_copies']}\t")
            f.write(f"{var['trait']}\t{var['category']}\t{var['effect']}\n")

    print(f"Report saved to: {output_path}")


def main():
    """Command-line entry point.

    Positional arguments (all optional): VCF path, report path, and 0-based
    sample column index (default 2). Prints a per-category summary, writes the
    report, then lists variants with two or more copies of the listed allele.
    """
    vcf_path = sys.argv[1] if len(sys.argv) > 1 else '/Volumes/NV2/genomics_analysis/vcf/trio_joint.rsid.vcf.gz'
    output_path = sys.argv[2] if len(sys.argv) > 2 else '/Volumes/NV2/genomics_analysis/gwas_comprehensive_report.txt'
    sample_idx = int(sys.argv[3]) if len(sys.argv) > 3 else 2

    print("=" * 60)
    print("COMPREHENSIVE GWAS TRAIT ANALYSIS")
    print("=" * 60)
    print(f"VCF: {vcf_path}")
    print(f"Sample index: {sample_idx}")
    print(f"Total trait SNPs in database: {len(TRAIT_SNPS)}")
    print()

    found_variants, samples = parse_vcf_for_traits(vcf_path, sample_idx)

    if sample_idx < len(samples):
        sample_name = samples[sample_idx]
    else:
        sample_name = f"Sample_{sample_idx}"
        if samples:
            # Every genotype was read as missing in this case; say so instead
            # of silently producing an all-clear report.
            print(
                f"Warning: sample index {sample_idx} is out of range; "
                f"the VCF has {len(samples)} sample column(s).",
                file=sys.stderr,
            )
    print(f"Analyzing sample: {sample_name}")

    print(f"\nFound {len(found_variants)} trait-associated variants in VCF")

    # Quick summary by category
    by_category = defaultdict(list)
    for var in found_variants.values():
        by_category[var['category']].append(var)

    print("\n" + "=" * 60)
    print("QUICK SUMMARY BY CATEGORY")
    print("=" * 60)

    for cat_key, cat_name in CATEGORIES.items():
        if cat_key in by_category:
            variants = by_category[cat_key]
            risk_count = sum(1 for v in variants if v['has_risk_allele'])
            marker = "⚠️ " if risk_count > 0 else " "
            print(f"{marker}{cat_name}: {risk_count}/{len(variants)} risk variants")

    generate_report(found_variants, output_path, sample_name)

    # Print high-risk findings
    print("\n" + "=" * 60)
    print("HIGH-PRIORITY FINDINGS (2+ copies of risk allele)")
    print("=" * 60)

    high_risk = [v for v in found_variants.values() if v['risk_copies'] >= 2]
    if high_risk:
        for v in sorted(high_risk, key=lambda x: x['category']):
            print(f"\n{v['trait']} ({v['rsid']})")
            print(f" Category: {CATEGORIES[v['category']]}")
            print(f" Genotype: {v['genotype']} (2 copies of risk allele {v['risk_allele']})")
    else:
        print("\nNo variants with 2 copies of risk allele found.")


if __name__ == '__main__':
    main()