Refactor: Replace scaffolding with working analysis scripts

- Add trio_analysis.py for trio-based variant analysis with de novo detection
- Add clinvar_acmg_annotate.py for ClinVar/ACMG annotation
- Add gwas_comprehensive.py with 201 SNPs across 18 categories
- Add pharmgkb_full_analysis.py for pharmacogenomics analysis
- Add gwas_trait_lookup.py for basic GWAS trait lookup
- Add pharmacogenomics.py for basic PGx analysis
- Remove unused scaffolding code (src/, configs/, docs/, tests/)
- Update README.md with new documentation

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2025-12-01 22:36:02 +08:00
parent f74dc351f7
commit d13d58df8b
56 changed files with 2608 additions and 2347 deletions

590
gwas_comprehensive.py Normal file
View File

@@ -0,0 +1,590 @@
#!/usr/bin/env python3
"""
Comprehensive GWAS Trait Analysis Script
Expanded version with 200+ clinically relevant trait-associated SNPs
"""
import gzip
import sys
import re
from collections import defaultdict
from typing import Dict, List, Tuple
# ============================================================================
# COMPREHENSIVE TRAIT-ASSOCIATED SNPs DATABASE
# Format: rsid -> (chrom, pos, risk_allele, trait, effect, category)
#   - chrom is written without a "chr" prefix; pos is an integer.
#   - category must be one of the keys of CATEGORIES.
#   - Keys carrying a suffix ("rs6983267_crc", "rs4988235_lct") are not real
#     rsIDs: they repeat the chrom/pos of the unsuffixed entry so that a second
#     trait can be attached to the same variant under a unique dict key.
#   - NOTE(review): coordinates look like GRCh37/hg19 -- confirm that this
#     matches the reference build of the VCF being scanned.
# ============================================================================
TRAIT_SNPS = {
# ========================================================================
# GOUT / URIC ACID METABOLISM (newly added)
# ========================================================================
"rs2231142": ("4", 89052323, "T", "Gout / Hyperuricemia", "risk", "Gout"),
"rs16890979": ("4", 9922166, "T", "Serum uric acid levels", "higher", "Gout"),
"rs734553": ("4", 9920485, "G", "Gout", "risk", "Gout"),
"rs1014290": ("4", 10001861, "A", "Serum uric acid levels", "higher", "Gout"),
"rs505802": ("11", 64357072, "C", "Serum uric acid levels", "higher", "Gout"),
"rs3775948": ("4", 9999007, "G", "Gout", "risk", "Gout"),
"rs12498742": ("4", 9993806, "A", "Serum uric acid levels", "higher", "Gout"),
"rs675209": ("4", 89011046, "T", "Gout", "risk", "Gout"),
"rs1165151": ("11", 64352047, "T", "Serum uric acid levels", "higher", "Gout"),
"rs478607": ("17", 19459563, "A", "Serum uric acid levels", "higher", "Gout"),
# ========================================================================
# KIDNEY DISEASE (newly added)
# ========================================================================
"rs4293393": ("16", 20364808, "T", "Chronic kidney disease", "risk", "Kidney"),
"rs12917707": ("16", 20369861, "G", "Chronic kidney disease", "protective", "Kidney"),
"rs11959928": ("5", 39394747, "A", "eGFR decline", "risk", "Kidney"),
"rs1260326": ("2", 27730940, "T", "Chronic kidney disease", "risk", "Kidney"),
"rs13329952": ("16", 20393103, "C", "Chronic kidney disease", "risk", "Kidney"),
"rs267734": ("1", 150950830, "C", "Chronic kidney disease", "risk", "Kidney"),
# ========================================================================
# HEARING LOSS (relevant to the Usher syndrome family)
# ========================================================================
"rs7598759": ("2", 70439175, "A", "Age-related hearing loss", "risk", "Hearing"),
"rs161927": ("5", 88228027, "G", "Hearing impairment", "risk", "Hearing"),
"rs10497394": ("2", 70477374, "T", "Hearing loss", "risk", "Hearing"),
"rs3752752": ("7", 129608155, "C", "Noise-induced hearing loss", "risk", "Hearing"),
"rs7294": ("4", 6303557, "G", "Hearing loss", "risk", "Hearing"),
# ========================================================================
# AUTOIMMUNE DISEASES (newly added)
# ========================================================================
# Rheumatoid Arthritis
"rs6679677": ("1", 114179091, "A", "Rheumatoid arthritis", "risk", "Autoimmune"),
"rs2476601": ("1", 114377568, "A", "Rheumatoid arthritis / Autoimmune", "risk", "Autoimmune"),
"rs3087243": ("2", 204447164, "G", "Rheumatoid arthritis", "protective", "Autoimmune"),
"rs4810485": ("20", 44747947, "T", "Rheumatoid arthritis", "risk", "Autoimmune"),
# Systemic Lupus Erythematosus (SLE)
"rs1143679": ("16", 31276811, "A", "Systemic lupus erythematosus", "risk", "Autoimmune"),
"rs7574865": ("2", 191099907, "T", "Systemic lupus erythematosus", "risk", "Autoimmune"),
"rs2187668": ("6", 32605884, "T", "Systemic lupus erythematosus", "risk", "Autoimmune"),
# Multiple Sclerosis
"rs3135388": ("6", 32439887, "A", "Multiple sclerosis", "risk", "Autoimmune"),
"rs6897932": ("5", 35910332, "C", "Multiple sclerosis", "risk", "Autoimmune"),
"rs4648356": ("1", 101256530, "C", "Multiple sclerosis", "risk", "Autoimmune"),
# Inflammatory Bowel Disease
"rs2241880": ("16", 50756540, "G", "Crohn's disease / IBD", "risk", "Autoimmune"),
"rs11209026": ("1", 67705958, "A", "Crohn's disease / IBD", "protective", "Autoimmune"),
"rs10883365": ("10", 64426914, "G", "Ulcerative colitis", "risk", "Autoimmune"),
"rs2066847": ("16", 50745926, "C", "Crohn's disease", "risk", "Autoimmune"),
# Type 1 Diabetes
"rs2292239": ("12", 56482804, "T", "Type 1 diabetes", "risk", "Autoimmune"),
"rs3129889": ("6", 32609440, "G", "Type 1 diabetes", "risk", "Autoimmune"),
"rs689": ("11", 2182224, "T", "Type 1 diabetes", "risk", "Autoimmune"),
# Celiac Disease
"rs2395182": ("6", 32713854, "T", "Celiac disease", "risk", "Autoimmune"),
"rs7775228": ("6", 32665438, "C", "Celiac disease", "risk", "Autoimmune"),
# Hashimoto's Thyroiditis / Graves' Disease
"rs179247": ("2", 204733986, "A", "Autoimmune thyroid disease", "risk", "Autoimmune"),
"rs1980422": ("6", 90957406, "C", "Autoimmune thyroid disease", "risk", "Autoimmune"),
# ========================================================================
# CANCER RISK (newly added)
# ========================================================================
# Breast Cancer
"rs2981582": ("10", 123337335, "A", "Breast cancer (FGFR2)", "risk", "Cancer"),
"rs13281615": ("8", 128355618, "G", "Breast cancer", "risk", "Cancer"),
"rs889312": ("5", 56067641, "C", "Breast cancer (MAP3K1)", "risk", "Cancer"),
"rs3817198": ("11", 1909006, "C", "Breast cancer (LSP1)", "risk", "Cancer"),
"rs13387042": ("2", 217905832, "A", "Breast cancer", "risk", "Cancer"),
# Prostate Cancer
"rs1447295": ("8", 128554220, "A", "Prostate cancer", "risk", "Cancer"),
"rs16901979": ("8", 128320346, "A", "Prostate cancer", "risk", "Cancer"),
"rs6983267": ("8", 128413305, "G", "Prostate cancer / Colorectal cancer", "risk", "Cancer"),
"rs10993994": ("10", 51549496, "T", "Prostate cancer (MSMB)", "risk", "Cancer"),
"rs7679673": ("4", 106061534, "C", "Prostate cancer", "risk", "Cancer"),
# Colorectal Cancer
"rs4939827": ("18", 46453463, "T", "Colorectal cancer (SMAD7)", "risk", "Cancer"),
"rs6983267_crc": ("8", 128413305, "G", "Colorectal cancer", "risk", "Cancer"),
"rs4779584": ("15", 32994756, "T", "Colorectal cancer", "risk", "Cancer"),
"rs10795668": ("10", 8701219, "G", "Colorectal cancer", "protective", "Cancer"),
# Lung Cancer
"rs8034191": ("15", 78894339, "C", "Lung cancer", "risk", "Cancer"),
"rs1051730": ("15", 78882925, "A", "Lung cancer / Nicotine dependence", "risk", "Cancer"),
"rs2736100": ("5", 1286516, "C", "Lung cancer (TERT)", "risk", "Cancer"),
# Melanoma
"rs910873": ("20", 32665748, "C", "Melanoma", "risk", "Cancer"),
"rs1801516": ("11", 108175462, "A", "Melanoma (ATM)", "risk", "Cancer"),
"rs16953002": ("12", 89328335, "A", "Melanoma", "risk", "Cancer"),
# Thyroid Cancer
"rs965513": ("9", 100556109, "A", "Thyroid cancer", "risk", "Cancer"),
"rs944289": ("14", 36649246, "T", "Thyroid cancer", "risk", "Cancer"),
# Bladder Cancer
"rs710521": ("3", 189643526, "A", "Bladder cancer", "risk", "Cancer"),
"rs9642880": ("8", 128787253, "T", "Bladder cancer", "risk", "Cancer"),
# ========================================================================
# BLOOD CLOTTING / THROMBOSIS (newly added)
# ========================================================================
"rs6025": ("1", 169519049, "T", "Factor V Leiden / DVT risk", "risk", "Thrombosis"),
"rs1799963": ("11", 46761055, "A", "Prothrombin G20210A / DVT risk", "risk", "Thrombosis"),
"rs8176719": ("9", 136131322, "C", "Blood type O (protective for VTE)", "protective", "Thrombosis"),
"rs505922": ("9", 136149229, "C", "Venous thromboembolism", "risk", "Thrombosis"),
"rs2066865": ("4", 155525276, "G", "Fibrinogen levels / DVT", "risk", "Thrombosis"),
# ========================================================================
# THYROID DISORDERS (newly added)
# ========================================================================
"rs1991517": ("8", 133020441, "C", "Hypothyroidism", "risk", "Thyroid"),
"rs925489": ("2", 218283107, "T", "TSH levels", "higher", "Thyroid"),
"rs10499559": ("6", 166474536, "T", "Hypothyroidism", "risk", "Thyroid"),
"rs7850258": ("9", 4126287, "G", "Thyroid function", "altered", "Thyroid"),
# ========================================================================
# OSTEOPOROSIS / BONE HEALTH (newly added)
# ========================================================================
"rs3736228": ("11", 68179081, "T", "Osteoporosis / Low BMD", "risk", "Bone"),
"rs4988235": ("2", 136608646, "G", "Lactose intolerance (affects Ca)", "risk", "Bone"),
"rs2282679": ("4", 72608383, "C", "Vitamin D deficiency", "risk", "Bone"),
"rs1800012": ("17", 48275363, "T", "Osteoporosis (COL1A1)", "risk", "Bone"),
"rs2062377": ("8", 119964052, "A", "Bone mineral density", "lower", "Bone"),
"rs4355801": ("8", 119963145, "G", "Bone mineral density", "higher", "Bone"),
# ========================================================================
# LIVER DISEASE (newly added)
# ========================================================================
"rs738409": ("22", 44324727, "G", "NAFLD / Fatty liver (PNPLA3)", "risk", "Liver"),
"rs58542926": ("19", 19379549, "T", "NAFLD / Liver fibrosis (TM6SF2)", "risk", "Liver"),
"rs2228603": ("19", 11350488, "T", "NAFLD", "risk", "Liver"),
"rs12979860": ("19", 39248147, "C", "Hepatitis C clearance", "favorable", "Liver"),
# ========================================================================
# MIGRAINE / HEADACHE (newly added)
# ========================================================================
"rs2651899": ("1", 10796866, "C", "Migraine", "risk", "Migraine"),
"rs10166942": ("2", 234824778, "T", "Migraine", "risk", "Migraine"),
"rs11172113": ("12", 57527283, "C", "Migraine (LRP1)", "risk", "Migraine"),
"rs1835740": ("8", 87521374, "A", "Migraine", "risk", "Migraine"),
# ========================================================================
# LONGEVITY / AGING (newly added)
# ========================================================================
"rs2802292": ("6", 157192662, "G", "Longevity (FOXO3)", "protective", "Longevity"),
"rs1042522": ("17", 7579472, "C", "Longevity (TP53)", "altered", "Longevity"),
"rs4420638": ("19", 45422946, "A", "Longevity / Cardiovascular", "risk", "Longevity"),
# ========================================================================
# SLEEP / CIRCADIAN (existing + expanded)
# ========================================================================
"rs113851554": ("2", 66799986, "T", "Insomnia", "risk", "Sleep"),
"rs12927162": ("16", 68856985, "A", "Sleep duration", "shorter", "Sleep"),
"rs1823125": ("1", 205713532, "G", "Chronotype (morning person)", "morning", "Sleep"),
"rs10493596": ("1", 215803417, "T", "Insomnia", "risk", "Sleep"),
"rs3104997": ("6", 27424938, "C", "Sleep duration", "shorter", "Sleep"),
"rs73598374": ("4", 94847526, "A", "Insomnia", "risk", "Sleep"),
"rs2302729": ("5", 35857091, "G", "Insomnia", "risk", "Sleep"),
"rs12936231": ("17", 44282378, "C", "Restless legs syndrome", "risk", "Sleep"),
"rs3923809": ("6", 38642286, "A", "Restless legs syndrome (BTBD9)", "risk", "Sleep"),
# ========================================================================
# SKIN CONDITIONS (existing + expanded)
# ========================================================================
"rs1800629": ("6", 31543031, "A", "Psoriasis", "risk", "Skin"),
"rs20541": ("5", 131995964, "A", "Atopic dermatitis", "risk", "Skin"),
"rs2066808": ("6", 31540784, "A", "Psoriasis", "risk", "Skin"),
"rs3093662": ("6", 31574339, "G", "Psoriasis", "risk", "Skin"),
"rs10484554": ("6", 31271836, "A", "Psoriasis", "risk", "Skin"),
"rs1295686": ("5", 131996447, "A", "Atopic dermatitis", "risk", "Skin"),
"rs2227956": ("6", 31783279, "T", "Psoriasis", "risk", "Skin"),
"rs6906021": ("6", 32051991, "C", "Atopic dermatitis", "risk", "Skin"),
"rs12203592": ("6", 396321, "T", "Skin pigmentation / Freckling", "risk", "Skin"),
"rs1805007": ("16", 89986117, "T", "Red hair / Fair skin (MC1R)", "risk", "Skin"),
"rs1805008": ("16", 89986144, "T", "Red hair / Fair skin (MC1R)", "risk", "Skin"),
# ========================================================================
# CARDIOVASCULAR (existing + greatly expanded)
# ========================================================================
"rs10757274": ("9", 22096055, "G", "Coronary artery disease", "risk", "Cardiovascular"),
"rs1333049": ("9", 22125503, "C", "Coronary artery disease", "risk", "Cardiovascular"),
"rs4665058": ("2", 43845437, "C", "Coronary artery disease", "risk", "Cardiovascular"),
"rs17465637": ("1", 222823529, "A", "Coronary artery disease", "risk", "Cardiovascular"),
"rs6725887": ("2", 203828796, "C", "Coronary artery disease", "risk", "Cardiovascular"),
# Hypertension
"rs699": ("1", 230845794, "G", "Hypertension (AGT)", "risk", "Cardiovascular"),
"rs5186": ("3", 148459988, "C", "Hypertension (AGTR1)", "risk", "Cardiovascular"),
"rs4961": ("4", 2906707, "T", "Hypertension / Salt sensitivity", "risk", "Cardiovascular"),
"rs1799998": ("8", 142876043, "T", "Hypertension (CYP11B2)", "risk", "Cardiovascular"),
# Atrial Fibrillation
"rs2200733": ("4", 111718106, "T", "Atrial fibrillation", "risk", "Cardiovascular"),
"rs10033464": ("4", 111714418, "T", "Atrial fibrillation", "risk", "Cardiovascular"),
"rs6843082": ("4", 111712344, "G", "Atrial fibrillation (PITX2)", "risk", "Cardiovascular"),
# Heart Failure
"rs1739843": ("15", 75086042, "T", "Heart failure", "risk", "Cardiovascular"),
# Stroke
"rs11833579": ("12", 115553310, "A", "Ischemic stroke", "risk", "Cardiovascular"),
"rs12425791": ("12", 115557677, "A", "Stroke (NINJ2)", "risk", "Cardiovascular"),
# Lipids
"rs1801177": ("8", 19813529, "A", "LDL cholesterol (LPL)", "higher", "Cardiovascular"),
"rs12740374": ("1", 109822166, "G", "LDL cholesterol (CELSR2)", "lower", "Cardiovascular"),
"rs3764261": ("16", 56993324, "A", "HDL cholesterol (CETP)", "higher", "Cardiovascular"),
"rs1800588": ("15", 58723675, "T", "HDL cholesterol (LIPC)", "higher", "Cardiovascular"),
"rs328": ("8", 19819724, "G", "Triglycerides (LPL)", "lower", "Cardiovascular"),
"rs662799": ("11", 116663707, "G", "Triglycerides (APOA5)", "higher", "Cardiovascular"),
# ========================================================================
# TYPE 2 DIABETES / METABOLIC (existing + expanded)
# ========================================================================
"rs7903146": ("10", 114758349, "T", "Type 2 diabetes (TCF7L2)", "risk", "Metabolic"),
"rs12255372": ("10", 114808902, "T", "Type 2 diabetes (TCF7L2)", "risk", "Metabolic"),
"rs1801282": ("3", 12393125, "C", "Type 2 diabetes (PPARG)", "risk", "Metabolic"),
"rs5219": ("11", 17409572, "T", "Type 2 diabetes (KCNJ11)", "risk", "Metabolic"),
"rs13266634": ("8", 118184783, "C", "Type 2 diabetes (SLC30A8)", "risk", "Metabolic"),
"rs7754840": ("6", 20679709, "C", "Type 2 diabetes (CDKAL1)", "risk", "Metabolic"),
"rs10811661": ("9", 22134095, "T", "Type 2 diabetes (CDKN2A/B)", "risk", "Metabolic"),
"rs864745": ("7", 28196413, "T", "Type 2 diabetes (JAZF1)", "risk", "Metabolic"),
"rs4402960": ("3", 185511687, "T", "Type 2 diabetes (IGF2BP2)", "risk", "Metabolic"),
# Obesity/BMI
"rs9939609": ("16", 53820527, "A", "Obesity (FTO)", "risk", "Metabolic"),
"rs17782313": ("18", 57851097, "C", "Obesity (MC4R)", "risk", "Metabolic"),
"rs6548238": ("2", 634905, "C", "BMI", "higher", "Metabolic"),
"rs10938397": ("4", 45186139, "G", "BMI (GNPDA2)", "higher", "Metabolic"),
"rs571312": ("18", 57839769, "A", "BMI (MC4R)", "higher", "Metabolic"),
"rs10767664": ("11", 27682562, "A", "BMI (BDNF)", "higher", "Metabolic"),
# ========================================================================
# EYE CONDITIONS (existing + expanded)
# ========================================================================
"rs10490924": ("10", 124214448, "T", "Age-related macular degeneration (ARMS2)", "risk", "Eye"),
"rs1061170": ("1", 196659237, "C", "Age-related macular degeneration (CFH)", "risk", "Eye"),
"rs9621532": ("22", 38477587, "C", "Myopia", "risk", "Eye"),
"rs10034228": ("4", 81951543, "A", "Myopia", "risk", "Eye"),
"rs1048661": ("1", 165655423, "C", "Glaucoma (LOXL1)", "risk", "Eye"),
"rs4656461": ("1", 165653012, "G", "Glaucoma (LOXL1)", "risk", "Eye"),
"rs2165241": ("15", 93600556, "T", "Glaucoma", "risk", "Eye"),
"rs3753841": ("1", 196704632, "C", "Age-related macular degeneration", "risk", "Eye"),
# ========================================================================
# NEUROPSYCHIATRIC (existing)
# ========================================================================
# Alzheimer's Disease
"rs429358": ("19", 45411941, "C", "Alzheimer's disease (APOE e4)", "risk", "Neuropsychiatric"),
"rs7412": ("19", 45412079, "T", "Alzheimer's disease (APOE e2)", "protective", "Neuropsychiatric"),
"rs3865444": ("19", 51727962, "C", "Alzheimer's disease (CD33)", "risk", "Neuropsychiatric"),
"rs744373": ("2", 127892810, "G", "Alzheimer's disease (BIN1)", "risk", "Neuropsychiatric"),
"rs3851179": ("11", 85868640, "T", "Alzheimer's disease (PICALM)", "protective", "Neuropsychiatric"),
"rs670139": ("11", 59939307, "G", "Alzheimer's disease (MS4A)", "risk", "Neuropsychiatric"),
"rs9349407": ("6", 47487762, "C", "Alzheimer's disease (CD2AP)", "risk", "Neuropsychiatric"),
"rs11136000": ("8", 27468503, "C", "Alzheimer's disease (CLU)", "protective", "Neuropsychiatric"),
"rs3764650": ("19", 1063443, "G", "Alzheimer's disease (ABCA7)", "risk", "Neuropsychiatric"),
"rs3818361": ("1", 207692049, "A", "Alzheimer's disease (CR1)", "risk", "Neuropsychiatric"),
# Parkinson's Disease (newly added)
"rs356220": ("4", 90626111, "T", "Parkinson's disease (SNCA)", "risk", "Neuropsychiatric"),
"rs11931074": ("4", 90674917, "G", "Parkinson's disease (SNCA)", "risk", "Neuropsychiatric"),
"rs34637584": ("12", 40734202, "A", "Parkinson's disease (LRRK2)", "risk", "Neuropsychiatric"),
"rs34311866": ("4", 951947, "C", "Parkinson's disease (TMEM175)", "risk", "Neuropsychiatric"),
# Depression
"rs1545843": ("1", 72761657, "A", "Major depression (NEGR1)", "risk", "Neuropsychiatric"),
"rs7973260": ("12", 118364392, "A", "Major depression (KSR2)", "risk", "Neuropsychiatric"),
"rs10514299": ("5", 87992715, "T", "Major depression (TMEM161B)", "risk", "Neuropsychiatric"),
"rs2422321": ("15", 88945878, "G", "Major depression (NTRK3)", "risk", "Neuropsychiatric"),
"rs301806": ("1", 8477981, "A", "Major depression (RERE)", "risk", "Neuropsychiatric"),
"rs1432639": ("3", 117115304, "G", "Major depression (LSAMP)", "risk", "Neuropsychiatric"),
"rs9530139": ("13", 53645407, "G", "Major depression", "risk", "Neuropsychiatric"),
"rs4543289": ("10", 106610839, "T", "Major depression (SORCS3)", "risk", "Neuropsychiatric"),
# Anxiety
"rs1709393": ("1", 34774088, "A", "Anxiety disorder", "risk", "Neuropsychiatric"),
"rs7688285": ("4", 123372626, "A", "Anxiety disorder", "risk", "Neuropsychiatric"),
# Bipolar
"rs4765913": ("12", 2345295, "A", "Bipolar disorder (CACNA1C)", "risk", "Neuropsychiatric"),
"rs10994336": ("10", 64649959, "T", "Bipolar disorder (ANK3)", "risk", "Neuropsychiatric"),
"rs9804190": ("11", 79077426, "C", "Bipolar disorder (ODZ4)", "risk", "Neuropsychiatric"),
# Schizophrenia
"rs1625579": ("8", 130635575, "T", "Schizophrenia (MIR137)", "risk", "Neuropsychiatric"),
"rs2007044": ("6", 28626894, "G", "Schizophrenia (HIST1H2BJ)", "risk", "Neuropsychiatric"),
"rs6932590": ("6", 27243984, "T", "Schizophrenia", "risk", "Neuropsychiatric"),
# ADHD (newly added)
"rs1412005": ("16", 73099702, "T", "ADHD", "risk", "Neuropsychiatric"),
"rs11210892": ("1", 44185231, "A", "ADHD", "risk", "Neuropsychiatric"),
# ========================================================================
# OTHER TRAITS (existing + expanded)
# ========================================================================
# Caffeine
"rs762551": ("15", 75041917, "C", "Caffeine metabolism (slow)", "slow", "Other"),
"rs2472297": ("15", 75027880, "T", "Caffeine consumption", "higher", "Other"),
# Alcohol
"rs671": ("12", 112241766, "A", "Alcohol flush reaction (ALDH2)", "risk", "Other"),
"rs1229984": ("4", 100239319, "T", "Alcohol metabolism (ADH1B)", "fast", "Other"),
# Lactose
"rs4988235_lct": ("2", 136608646, "G", "Lactose intolerance (LCT)", "risk", "Other"),
# Vitamin D
"rs12785878": ("11", 71167449, "T", "Vitamin D levels (lower)", "lower", "Other"),
# Hair
"rs2180439": ("20", 22162468, "T", "Male pattern baldness", "risk", "Other"),
"rs1160312": ("X", 67052952, "A", "Male pattern baldness (AR)", "risk", "Other"),
"rs6625163": ("X", 67177092, "A", "Male pattern baldness", "risk", "Other"),
# Muscle performance (newly added)
"rs1815739": ("11", 66560624, "T", "Sprint/Power athlete (ACTN3)", "power", "Other"),
# Bitter taste (newly added)
"rs713598": ("7", 141972804, "C", "Bitter taste sensitivity (PTC)", "taster", "Other"),
"rs1726866": ("7", 141972905, "T", "Bitter taste sensitivity", "taster", "Other"),
# Cilantro aversion (newly added)
"rs72921001": ("11", 6889648, "A", "Cilantro aversion", "aversion", "Other"),
}
# Category display order and descriptions.
# Maps each category key used as the last tuple element in TRAIT_SNPS to the
# label printed in the console summary and the report. The reporting code
# iterates this dict, so insertion order is the display order, and a category
# that is missing here is left out of the per-category sections.
CATEGORIES = {
"Gout": "痛風 / 尿酸代謝",  # Gout / uric acid metabolism
"Kidney": "腎臟疾病",  # Kidney disease
"Hearing": "聽力損失",  # Hearing loss
"Autoimmune": "自體免疫疾病",  # Autoimmune diseases
"Cancer": "癌症風險",  # Cancer risk
"Thrombosis": "血栓 / 凝血",  # Thrombosis / coagulation
"Thyroid": "甲狀腺疾病",  # Thyroid disorders
"Bone": "骨質疏鬆 / 骨骼健康",  # Osteoporosis / bone health
"Liver": "肝臟疾病",  # Liver disease
"Migraine": "偏頭痛",  # Migraine
"Longevity": "長壽 / 老化",  # Longevity / aging
"Sleep": "睡眠",  # Sleep
"Skin": "皮膚",  # Skin
"Cardiovascular": "心血管疾病",  # Cardiovascular disease
"Metabolic": "代謝疾病",  # Metabolic disease
"Eye": "眼睛疾病",  # Eye conditions
"Neuropsychiatric": "神經精神疾病",  # Neuropsychiatric disorders
"Other": "其他特性",  # Other traits
}
def get_genotype_class(gt: str) -> str:
    """Classify a VCF GT string by zygosity.

    Args:
        gt: Genotype call such as ``0/1``, ``1|1`` or ``./.``. Alleles may be
            separated by ``/`` or ``|`` and any ploidy is accepted.

    Returns:
        ``'MISSING'`` when the string is empty or every allele is ``.``;
        ``'HOM_REF'`` when every allele is ``0``;
        ``'HOM_ALT'`` when every allele is the same non-reference index;
        ``'HET'`` otherwise. That includes two different ALT alleles
        (``1/2``) and partially called genotypes (``./1``, ``0/.``), which
        cannot be shown to be homozygous.
    """
    alleles = re.split('[/|]', gt.strip())
    # Covers "./.", ".|.", ".", higher-ploidy all-missing calls and "".
    if all(a in ('', '.') for a in alleles):
        return 'MISSING'
    distinct = set(alleles)
    if distinct == {'0'}:
        return 'HOM_REF'
    # A single distinct non-reference allele; "1/2" has two and is HET.
    if len(distinct) == 1:
        return 'HOM_ALT'
    return 'HET'
def parse_vcf_for_traits(vcf_path: str, sample_idx: int = 2) -> Tuple[Dict, List]:
    """Scan a VCF for the positions listed in TRAIT_SNPS and genotype one sample.

    Records are matched to the database by chromosome and position only (the
    VCF ID column is not consulted). A leading ``chr`` on the VCF contig name
    is ignored for matching. The risk allele is compared literally against
    the REF/ALT strings of the record; no strand flipping is attempted.

    Args:
        vcf_path: Path to a VCF file; a ``.gz`` suffix selects gzip decoding.
        sample_idx: Index of the sample column to read, counted from the
            first column after FORMAT. An index outside the record's sample
            columns yields a missing genotype (``./.``).

    Returns:
        ``(found_variants, samples)``. ``found_variants`` maps each
        TRAIT_SNPS key whose position occurs in the file to a dict with the
        keys rsid, chrom, pos, ref, alt, genotype, genotype_class,
        risk_allele, trait, effect, category, has_risk_allele and
        risk_copies. ``samples`` is the list of sample names taken from the
        ``#CHROM`` header line (empty if the file has none).
    """
    print(f"Scanning VCF for {len(TRAIT_SNPS)} trait-associated variants...")

    # Index the database by (chrom, pos); several entries may share a position.
    pos_to_snp = defaultdict(list)
    for rsid, (chrom, pos, risk_allele, trait, effect, category) in TRAIT_SNPS.items():
        pos_to_snp[(str(chrom), str(pos))].append((rsid, risk_allele, trait, effect, category))

    found_variants = {}
    samples = []
    open_func = gzip.open if vcf_path.endswith('.gz') else open
    with open_func(vcf_path, 'rt') as f:
        for line in f:
            if line.startswith('##'):
                continue
            parts = line.rstrip('\r\n').split('\t')
            if line.startswith('#CHROM'):
                samples = parts[9:]
                continue
            if len(parts) < 10:
                continue
            chrom, pos, _vcf_id, ref, alt, _qual, _filt, _info, fmt = parts[:9]
            gt_fields = parts[9:]

            # TRAIT_SNPS uses bare contig names ("1", "X"), so accept
            # "chr"-prefixed VCFs as well instead of silently matching nothing.
            lookup_chrom = chrom[3:] if chrom.lower().startswith('chr') else chrom
            entries = pos_to_snp.get((lookup_chrom, pos))
            if not entries:
                continue

            # Extract the sample genotype. Without a GT key in FORMAT there is
            # no genotype to read, so the call is treated as missing rather
            # than parsing some unrelated field.
            fmt_parts = fmt.split(':')
            gt = './.'
            if 'GT' in fmt_parts and -len(gt_fields) <= sample_idx < len(gt_fields):
                gt_idx = fmt_parts.index('GT')
                gt_data = gt_fields[sample_idx].split(':')
                if gt_idx < len(gt_data):
                    gt = gt_data[gt_idx]

            gt_class = get_genotype_class(gt)
            alleles = [ref] + alt.split(',')

            # Allele indices actually called for this sample ("." is skipped).
            called = []
            if gt_class != 'MISSING':
                called = [int(a) for a in re.split('[/|]', gt) if a.isdigit()]

            # Process each database entry registered at this position.
            for rsid, risk_allele, trait, effect, category in entries:
                risk_copies = sum(
                    1 for idx in called
                    if idx < len(alleles) and alleles[idx] == risk_allele
                )

                # A position can occur on several records (split multi-allelic
                # sites, an indel anchored on the same base). Keep the most
                # informative call instead of letting the last record win.
                previous = found_variants.get(rsid)
                if previous is not None:
                    upgrades_missing = (
                        previous['genotype_class'] == 'MISSING' and gt_class != 'MISSING'
                    )
                    if risk_copies <= previous['risk_copies'] and not upgrades_missing:
                        continue

                found_variants[rsid] = {
                    'rsid': rsid,
                    'chrom': chrom,
                    'pos': pos,
                    'ref': ref,
                    'alt': alt,
                    'genotype': gt,
                    'genotype_class': gt_class,
                    'risk_allele': risk_allele,
                    'trait': trait,
                    'effect': effect,
                    'category': category,
                    'has_risk_allele': risk_copies > 0,
                    'risk_copies': risk_copies
                }
    return found_variants, samples
def generate_report(found_variants: Dict, output_path: str, sample_name: str):
    """Write the plain-text trait report for one sample.

    The report has four parts: a header with overall counts, a one-line
    summary per category, a detailed listing per category, and a
    tab-separated table of every variant found. Categories are emitted in
    CATEGORIES order; a variant whose category is not a CATEGORIES key only
    appears in the final table.

    Note that "risk" in the output means "the allele listed in TRAIT_SNPS is
    present": entries whose effect is e.g. ``protective`` are counted and
    flagged the same way.

    Args:
        found_variants: Mapping of rsid to the per-variant dict produced by
            parse_vcf_for_traits.
        output_path: Destination file; it is overwritten if it exists.
        sample_name: Sample label written in the report header.
    """
    # Group by category
    by_category = defaultdict(list)
    for rsid, var in found_variants.items():
        by_category[var['category']].append(var)

    # The report mixes CJK text and emoji, so the encoding is pinned to UTF-8;
    # relying on the locale default fails on non-UTF-8 platforms.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("=" * 80 + "\n")
        f.write("COMPREHENSIVE GWAS TRAIT ANALYSIS REPORT\n")
        f.write(f"Sample: {sample_name}\n")
        f.write(f"Total SNPs analyzed: {len(TRAIT_SNPS)}\n")
        f.write(f"SNPs found in data: {len(found_variants)}\n")
        f.write("=" * 80 + "\n\n")

        # Summary statistics
        total_risk = sum(1 for v in found_variants.values() if v['has_risk_allele'])
        f.write(f"OVERALL SUMMARY: {total_risk} risk variants found\n\n")

        # Category summary
        f.write("=" * 80 + "\n")
        f.write("SUMMARY BY CATEGORY\n")
        f.write("=" * 80 + "\n\n")
        for cat_key in CATEGORIES.keys():
            if cat_key in by_category:
                variants = by_category[cat_key]
                risk_count = sum(1 for v in variants if v['has_risk_allele'])
                cat_name = CATEGORIES[cat_key]
                f.write(f"{cat_name}: {risk_count}/{len(variants)} risk variants\n")

        # Detailed results by category
        f.write("\n" + "=" * 80 + "\n")
        f.write("DETAILED RESULTS BY CATEGORY\n")
        f.write("=" * 80 + "\n")
        for cat_key in CATEGORIES.keys():
            if cat_key not in by_category:
                continue
            variants = by_category[cat_key]
            cat_name = CATEGORIES[cat_key]
            risk_count = sum(1 for v in variants if v['has_risk_allele'])
            f.write(f"\n\n## {cat_name} ({risk_count}/{len(variants)} risk)\n")
            f.write("-" * 60 + "\n")
            # Sort: risk variants first, then alphabetically by trait
            sorted_vars = sorted(variants, key=lambda x: (not x['has_risk_allele'], x['trait']))
            for v in sorted_vars:
                status = "⚠️ RISK" if v['has_risk_allele'] else "✓ OK"
                copies = f"({v['risk_copies']}份)" if v['has_risk_allele'] else ""
                f.write(f"\n{v['trait']}: {v['rsid']} [{status}] {copies}\n")
                f.write(f" 基因型: {v['genotype']} | 風險等位基因: {v['risk_allele']} | 效應: {v['effect']}\n")

        # Full variant table, ordered by category key and then trait
        f.write("\n\n" + "=" * 80 + "\n")
        f.write("COMPLETE VARIANT TABLE\n")
        f.write("=" * 80 + "\n\n")
        f.write("RSID\tCHROM\tPOS\tGENOTYPE\tRISK_ALLELE\tHAS_RISK\tCOPIES\tTRAIT\tCATEGORY\tEFFECT\n")
        for rsid, var in sorted(found_variants.items(), key=lambda x: (x[1]['category'], x[1]['trait'])):
            f.write(f"{var['rsid']}\t{var['chrom']}\t{var['pos']}\t{var['genotype']}\t")
            f.write(f"{var['risk_allele']}\t{var['has_risk_allele']}\t{var['risk_copies']}\t")
            f.write(f"{var['trait']}\t{var['category']}\t{var['effect']}\n")
    print(f"Report saved to: {output_path}")
def main():
    """Command-line entry point: scan a VCF, print a summary, write the report.

    Optional positional arguments, in order: VCF path, report output path,
    sample column index. Each falls back to a built-in default when omitted.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) >= 1:
        vcf_path = cli_args[0]
    else:
        vcf_path = '/Volumes/NV2/genomics_analysis/vcf/trio_joint.rsid.vcf.gz'
    if len(cli_args) >= 2:
        output_path = cli_args[1]
    else:
        output_path = '/Volumes/NV2/genomics_analysis/gwas_comprehensive_report.txt'
    sample_idx = int(cli_args[2]) if len(cli_args) >= 3 else 2

    banner = "=" * 60
    print(banner)
    print("COMPREHENSIVE GWAS TRAIT ANALYSIS")
    print(banner)
    print(f"VCF: {vcf_path}")
    print(f"Sample index: {sample_idx}")
    print(f"Total trait SNPs in database: {len(TRAIT_SNPS)}")
    print()

    found_variants, samples = parse_vcf_for_traits(vcf_path, sample_idx)
    if sample_idx < len(samples):
        sample_name = samples[sample_idx]
    else:
        # The header did not provide a name for this column.
        sample_name = f"Sample_{sample_idx}"
    print(f"Analyzing sample: {sample_name}")
    print(f"\nFound {len(found_variants)} trait-associated variants in VCF")

    # Bucket the hits per category for the console summary.
    grouped = {}
    for hit in found_variants.values():
        grouped.setdefault(hit['category'], []).append(hit)

    print("\n" + banner)
    print("QUICK SUMMARY BY CATEGORY")
    print(banner)
    for cat_key, cat_name in CATEGORIES.items():
        hits = grouped.get(cat_key)
        if hits is None:
            continue
        risk_count = len([h for h in hits if h['has_risk_allele']])
        marker = "⚠️ " if risk_count > 0 else " "
        print(f"{marker}{cat_name}: {risk_count}/{len(hits)} risk variants")

    generate_report(found_variants, output_path, sample_name)

    # Call out variants carrying the listed allele on two or more copies.
    print("\n" + banner)
    print("HIGH-PRIORITY FINDINGS (2+ copies of risk allele)")
    print(banner)
    high_risk = [hit for hit in found_variants.values() if hit['risk_copies'] >= 2]
    if not high_risk:
        print("\nNo variants with 2 copies of risk allele found.")
        return
    high_risk.sort(key=lambda hit: hit['category'])
    for hit in high_risk:
        print(f"\n{hit['trait']} ({hit['rsid']})")
        print(f" Category: {CATEGORIES[hit['category']]}")
        print(f" Genotype: {hit['genotype']} (2 copies of risk allele {hit['risk_allele']})")


if __name__ == '__main__':
    main()