- Add trio_analysis.py for trio-based variant analysis with de novo detection - Add clinvar_acmg_annotate.py for ClinVar/ACMG annotation - Add gwas_comprehensive.py with 201 SNPs across 18 categories - Add pharmgkb_full_analysis.py for pharmacogenomics analysis - Add gwas_trait_lookup.py for basic GWAS trait lookup - Add pharmacogenomics.py for basic PGx analysis - Remove unused scaffolding code (src/, configs/, docs/, tests/) - Update README.md with new documentation 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
591 lines
30 KiB
Python
591 lines
30 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Comprehensive GWAS Trait Analysis Script
|
|
Expanded version with 200+ clinically relevant trait-associated SNPs
|
|
"""
|
|
|
|
import gzip
|
|
import sys
|
|
import re
|
|
from collections import defaultdict
|
|
from typing import Dict, List, Tuple
|
|
|
|
# ============================================================================
|
|
# COMPREHENSIVE TRAIT-ASSOCIATED SNPs DATABASE
|
|
# Format: rsid -> (chrom, pos, risk_allele, trait, effect, category)
|
|
# ============================================================================
|
|
|
|
TRAIT_SNPS = {
    # Each entry maps a database key (normally the rsID) to the tuple
    #     (chrom, pos, risk_allele, trait, effect, category)
    # parse_vcf_for_traits() matches VCF records by chrom/pos only; the key
    # itself is never compared with the VCF ID column, it is only copied into
    # the report as the variant's 'rsid'.
    # `category` must be one of the keys of CATEGORIES to appear in the
    # per-category sections of the report.
    # NOTE(review): the genome build of these coordinates is not stated
    # anywhere in this file -- they look like GRCh37/hg19; confirm before
    # running against a VCF aligned to another build.

    # ========================================================================
    # GOUT / URIC ACID METABOLISM (new)
    # ========================================================================
    "rs2231142": ("4", 89052323, "T", "Gout / Hyperuricemia", "risk", "Gout"),
    "rs16890979": ("4", 9922166, "T", "Serum uric acid levels", "higher", "Gout"),
    "rs734553": ("4", 9920485, "G", "Gout", "risk", "Gout"),
    "rs1014290": ("4", 10001861, "A", "Serum uric acid levels", "higher", "Gout"),
    "rs505802": ("11", 64357072, "C", "Serum uric acid levels", "higher", "Gout"),
    "rs3775948": ("4", 9999007, "G", "Gout", "risk", "Gout"),
    "rs12498742": ("4", 9993806, "A", "Serum uric acid levels", "higher", "Gout"),
    "rs675209": ("4", 89011046, "T", "Gout", "risk", "Gout"),
    "rs1165151": ("11", 64352047, "T", "Serum uric acid levels", "higher", "Gout"),
    "rs478607": ("17", 19459563, "A", "Serum uric acid levels", "higher", "Gout"),

    # ========================================================================
    # KIDNEY DISEASE (new)
    # ========================================================================
    "rs4293393": ("16", 20364808, "T", "Chronic kidney disease", "risk", "Kidney"),
    "rs12917707": ("16", 20369861, "G", "Chronic kidney disease", "protective", "Kidney"),
    "rs11959928": ("5", 39394747, "A", "eGFR decline", "risk", "Kidney"),
    "rs1260326": ("2", 27730940, "T", "Chronic kidney disease", "risk", "Kidney"),
    "rs13329952": ("16", 20393103, "C", "Chronic kidney disease", "risk", "Kidney"),
    "rs267734": ("1", 150950830, "C", "Chronic kidney disease", "risk", "Kidney"),

    # ========================================================================
    # HEARING LOSS (relevant to the Usher syndrome family)
    # ========================================================================
    "rs7598759": ("2", 70439175, "A", "Age-related hearing loss", "risk", "Hearing"),
    "rs161927": ("5", 88228027, "G", "Hearing impairment", "risk", "Hearing"),
    "rs10497394": ("2", 70477374, "T", "Hearing loss", "risk", "Hearing"),
    "rs3752752": ("7", 129608155, "C", "Noise-induced hearing loss", "risk", "Hearing"),
    "rs7294": ("4", 6303557, "G", "Hearing loss", "risk", "Hearing"),

    # ========================================================================
    # AUTOIMMUNE DISEASES (new)
    # ========================================================================
    # Rheumatoid Arthritis
    "rs6679677": ("1", 114179091, "A", "Rheumatoid arthritis", "risk", "Autoimmune"),
    "rs2476601": ("1", 114377568, "A", "Rheumatoid arthritis / Autoimmune", "risk", "Autoimmune"),
    "rs3087243": ("2", 204447164, "G", "Rheumatoid arthritis", "protective", "Autoimmune"),
    "rs4810485": ("20", 44747947, "T", "Rheumatoid arthritis", "risk", "Autoimmune"),

    # Systemic Lupus Erythematosus (SLE)
    "rs1143679": ("16", 31276811, "A", "Systemic lupus erythematosus", "risk", "Autoimmune"),
    "rs7574865": ("2", 191099907, "T", "Systemic lupus erythematosus", "risk", "Autoimmune"),
    "rs2187668": ("6", 32605884, "T", "Systemic lupus erythematosus", "risk", "Autoimmune"),

    # Multiple Sclerosis
    "rs3135388": ("6", 32439887, "A", "Multiple sclerosis", "risk", "Autoimmune"),
    "rs6897932": ("5", 35910332, "C", "Multiple sclerosis", "risk", "Autoimmune"),
    "rs4648356": ("1", 101256530, "C", "Multiple sclerosis", "risk", "Autoimmune"),

    # Inflammatory Bowel Disease
    "rs2241880": ("16", 50756540, "G", "Crohn's disease / IBD", "risk", "Autoimmune"),
    "rs11209026": ("1", 67705958, "A", "Crohn's disease / IBD", "protective", "Autoimmune"),
    "rs10883365": ("10", 64426914, "G", "Ulcerative colitis", "risk", "Autoimmune"),
    "rs2066847": ("16", 50745926, "C", "Crohn's disease", "risk", "Autoimmune"),

    # Type 1 Diabetes
    "rs2292239": ("12", 56482804, "T", "Type 1 diabetes", "risk", "Autoimmune"),
    "rs3129889": ("6", 32609440, "G", "Type 1 diabetes", "risk", "Autoimmune"),
    "rs689": ("11", 2182224, "T", "Type 1 diabetes", "risk", "Autoimmune"),

    # Celiac Disease
    "rs2395182": ("6", 32713854, "T", "Celiac disease", "risk", "Autoimmune"),
    "rs7775228": ("6", 32665438, "C", "Celiac disease", "risk", "Autoimmune"),

    # Hashimoto's Thyroiditis / Graves' Disease
    "rs179247": ("2", 204733986, "A", "Autoimmune thyroid disease", "risk", "Autoimmune"),
    "rs1980422": ("6", 90957406, "C", "Autoimmune thyroid disease", "risk", "Autoimmune"),

    # ========================================================================
    # CANCER RISK (new)
    # ========================================================================
    # Breast Cancer
    "rs2981582": ("10", 123337335, "A", "Breast cancer (FGFR2)", "risk", "Cancer"),
    "rs13281615": ("8", 128355618, "G", "Breast cancer", "risk", "Cancer"),
    "rs889312": ("5", 56067641, "C", "Breast cancer (MAP3K1)", "risk", "Cancer"),
    "rs3817198": ("11", 1909006, "C", "Breast cancer (LSP1)", "risk", "Cancer"),
    "rs13387042": ("2", 217905832, "A", "Breast cancer", "risk", "Cancer"),

    # Prostate Cancer
    "rs1447295": ("8", 128554220, "A", "Prostate cancer", "risk", "Cancer"),
    "rs16901979": ("8", 128320346, "A", "Prostate cancer", "risk", "Cancer"),
    "rs6983267": ("8", 128413305, "G", "Prostate cancer / Colorectal cancer", "risk", "Cancer"),
    "rs10993994": ("10", 51549496, "T", "Prostate cancer (MSMB)", "risk", "Cancer"),
    "rs7679673": ("4", 106061534, "C", "Prostate cancer", "risk", "Cancer"),

    # Colorectal Cancer
    "rs4939827": ("18", 46453463, "T", "Colorectal cancer (SMAD7)", "risk", "Cancer"),
    # Same chrom/pos/allele as "rs6983267" above; the "_crc" suffix only keeps
    # the dict key unique, so this key is not a real rsID and is reported
    # verbatim as the variant's rsid.
    "rs6983267_crc": ("8", 128413305, "G", "Colorectal cancer", "risk", "Cancer"),
    "rs4779584": ("15", 32994756, "T", "Colorectal cancer", "risk", "Cancer"),
    "rs10795668": ("10", 8701219, "G", "Colorectal cancer", "protective", "Cancer"),

    # Lung Cancer
    "rs8034191": ("15", 78894339, "C", "Lung cancer", "risk", "Cancer"),
    "rs1051730": ("15", 78882925, "A", "Lung cancer / Nicotine dependence", "risk", "Cancer"),
    "rs2736100": ("5", 1286516, "C", "Lung cancer (TERT)", "risk", "Cancer"),

    # Melanoma
    "rs910873": ("20", 32665748, "C", "Melanoma", "risk", "Cancer"),
    "rs1801516": ("11", 108175462, "A", "Melanoma (ATM)", "risk", "Cancer"),
    "rs16953002": ("12", 89328335, "A", "Melanoma", "risk", "Cancer"),

    # Thyroid Cancer
    "rs965513": ("9", 100556109, "A", "Thyroid cancer", "risk", "Cancer"),
    "rs944289": ("14", 36649246, "T", "Thyroid cancer", "risk", "Cancer"),

    # Bladder Cancer
    "rs710521": ("3", 189643526, "A", "Bladder cancer", "risk", "Cancer"),
    "rs9642880": ("8", 128787253, "T", "Bladder cancer", "risk", "Cancer"),

    # ========================================================================
    # BLOOD CLOTTING / THROMBOSIS (new)
    # ========================================================================
    "rs6025": ("1", 169519049, "T", "Factor V Leiden / DVT risk", "risk", "Thrombosis"),
    "rs1799963": ("11", 46761055, "A", "Prothrombin G20210A / DVT risk", "risk", "Thrombosis"),
    "rs8176719": ("9", 136131322, "C", "Blood type O (protective for VTE)", "protective", "Thrombosis"),
    "rs505922": ("9", 136149229, "C", "Venous thromboembolism", "risk", "Thrombosis"),
    "rs2066865": ("4", 155525276, "G", "Fibrinogen levels / DVT", "risk", "Thrombosis"),

    # ========================================================================
    # THYROID DISORDERS (new)
    # ========================================================================
    "rs1991517": ("8", 133020441, "C", "Hypothyroidism", "risk", "Thyroid"),
    "rs925489": ("2", 218283107, "T", "TSH levels", "higher", "Thyroid"),
    "rs10499559": ("6", 166474536, "T", "Hypothyroidism", "risk", "Thyroid"),
    "rs7850258": ("9", 4126287, "G", "Thyroid function", "altered", "Thyroid"),

    # ========================================================================
    # OSTEOPOROSIS / BONE HEALTH (new)
    # ========================================================================
    "rs3736228": ("11", 68179081, "T", "Osteoporosis / Low BMD", "risk", "Bone"),
    "rs4988235": ("2", 136608646, "G", "Lactose intolerance (affects Ca)", "risk", "Bone"),
    "rs2282679": ("4", 72608383, "C", "Vitamin D deficiency", "risk", "Bone"),
    "rs1800012": ("17", 48275363, "T", "Osteoporosis (COL1A1)", "risk", "Bone"),
    "rs2062377": ("8", 119964052, "A", "Bone mineral density", "lower", "Bone"),
    "rs4355801": ("8", 119963145, "G", "Bone mineral density", "higher", "Bone"),

    # ========================================================================
    # LIVER DISEASE (new)
    # ========================================================================
    "rs738409": ("22", 44324727, "G", "NAFLD / Fatty liver (PNPLA3)", "risk", "Liver"),
    "rs58542926": ("19", 19379549, "T", "NAFLD / Liver fibrosis (TM6SF2)", "risk", "Liver"),
    "rs2228603": ("19", 11350488, "T", "NAFLD", "risk", "Liver"),
    "rs12979860": ("19", 39248147, "C", "Hepatitis C clearance", "favorable", "Liver"),

    # ========================================================================
    # MIGRAINE / HEADACHE (new)
    # ========================================================================
    "rs2651899": ("1", 10796866, "C", "Migraine", "risk", "Migraine"),
    "rs10166942": ("2", 234824778, "T", "Migraine", "risk", "Migraine"),
    "rs11172113": ("12", 57527283, "C", "Migraine (LRP1)", "risk", "Migraine"),
    "rs1835740": ("8", 87521374, "A", "Migraine", "risk", "Migraine"),

    # ========================================================================
    # LONGEVITY / AGING (new)
    # ========================================================================
    "rs2802292": ("6", 157192662, "G", "Longevity (FOXO3)", "protective", "Longevity"),
    "rs1042522": ("17", 7579472, "C", "Longevity (TP53)", "altered", "Longevity"),
    "rs4420638": ("19", 45422946, "A", "Longevity / Cardiovascular", "risk", "Longevity"),

    # ========================================================================
    # SLEEP / CIRCADIAN (existing + expanded)
    # ========================================================================
    "rs113851554": ("2", 66799986, "T", "Insomnia", "risk", "Sleep"),
    "rs12927162": ("16", 68856985, "A", "Sleep duration", "shorter", "Sleep"),
    "rs1823125": ("1", 205713532, "G", "Chronotype (morning person)", "morning", "Sleep"),
    "rs10493596": ("1", 215803417, "T", "Insomnia", "risk", "Sleep"),
    "rs3104997": ("6", 27424938, "C", "Sleep duration", "shorter", "Sleep"),
    "rs73598374": ("4", 94847526, "A", "Insomnia", "risk", "Sleep"),
    "rs2302729": ("5", 35857091, "G", "Insomnia", "risk", "Sleep"),
    "rs12936231": ("17", 44282378, "C", "Restless legs syndrome", "risk", "Sleep"),
    "rs3923809": ("6", 38642286, "A", "Restless legs syndrome (BTBD9)", "risk", "Sleep"),

    # ========================================================================
    # SKIN CONDITIONS (existing + expanded)
    # ========================================================================
    "rs1800629": ("6", 31543031, "A", "Psoriasis", "risk", "Skin"),
    "rs20541": ("5", 131995964, "A", "Atopic dermatitis", "risk", "Skin"),
    "rs2066808": ("6", 31540784, "A", "Psoriasis", "risk", "Skin"),
    "rs3093662": ("6", 31574339, "G", "Psoriasis", "risk", "Skin"),
    "rs10484554": ("6", 31271836, "A", "Psoriasis", "risk", "Skin"),
    "rs1295686": ("5", 131996447, "A", "Atopic dermatitis", "risk", "Skin"),
    "rs2227956": ("6", 31783279, "T", "Psoriasis", "risk", "Skin"),
    "rs6906021": ("6", 32051991, "C", "Atopic dermatitis", "risk", "Skin"),
    "rs12203592": ("6", 396321, "T", "Skin pigmentation / Freckling", "risk", "Skin"),
    "rs1805007": ("16", 89986117, "T", "Red hair / Fair skin (MC1R)", "risk", "Skin"),
    "rs1805008": ("16", 89986144, "T", "Red hair / Fair skin (MC1R)", "risk", "Skin"),

    # ========================================================================
    # CARDIOVASCULAR (existing + greatly expanded)
    # ========================================================================
    "rs10757274": ("9", 22096055, "G", "Coronary artery disease", "risk", "Cardiovascular"),
    "rs1333049": ("9", 22125503, "C", "Coronary artery disease", "risk", "Cardiovascular"),
    "rs4665058": ("2", 43845437, "C", "Coronary artery disease", "risk", "Cardiovascular"),
    "rs17465637": ("1", 222823529, "A", "Coronary artery disease", "risk", "Cardiovascular"),
    "rs6725887": ("2", 203828796, "C", "Coronary artery disease", "risk", "Cardiovascular"),
    # Hypertension
    "rs699": ("1", 230845794, "G", "Hypertension (AGT)", "risk", "Cardiovascular"),
    "rs5186": ("3", 148459988, "C", "Hypertension (AGTR1)", "risk", "Cardiovascular"),
    "rs4961": ("4", 2906707, "T", "Hypertension / Salt sensitivity", "risk", "Cardiovascular"),
    "rs1799998": ("8", 142876043, "T", "Hypertension (CYP11B2)", "risk", "Cardiovascular"),
    # Atrial Fibrillation
    "rs2200733": ("4", 111718106, "T", "Atrial fibrillation", "risk", "Cardiovascular"),
    "rs10033464": ("4", 111714418, "T", "Atrial fibrillation", "risk", "Cardiovascular"),
    "rs6843082": ("4", 111712344, "G", "Atrial fibrillation (PITX2)", "risk", "Cardiovascular"),
    # Heart Failure
    "rs1739843": ("15", 75086042, "T", "Heart failure", "risk", "Cardiovascular"),
    # Stroke
    "rs11833579": ("12", 115553310, "A", "Ischemic stroke", "risk", "Cardiovascular"),
    "rs12425791": ("12", 115557677, "A", "Stroke (NINJ2)", "risk", "Cardiovascular"),
    # Lipids
    "rs1801177": ("8", 19813529, "A", "LDL cholesterol (LPL)", "higher", "Cardiovascular"),
    "rs12740374": ("1", 109822166, "G", "LDL cholesterol (CELSR2)", "lower", "Cardiovascular"),
    "rs3764261": ("16", 56993324, "A", "HDL cholesterol (CETP)", "higher", "Cardiovascular"),
    "rs1800588": ("15", 58723675, "T", "HDL cholesterol (LIPC)", "higher", "Cardiovascular"),
    "rs328": ("8", 19819724, "G", "Triglycerides (LPL)", "lower", "Cardiovascular"),
    "rs662799": ("11", 116663707, "G", "Triglycerides (APOA5)", "higher", "Cardiovascular"),

    # ========================================================================
    # TYPE 2 DIABETES / METABOLIC (existing + expanded)
    # ========================================================================
    "rs7903146": ("10", 114758349, "T", "Type 2 diabetes (TCF7L2)", "risk", "Metabolic"),
    "rs12255372": ("10", 114808902, "T", "Type 2 diabetes (TCF7L2)", "risk", "Metabolic"),
    "rs1801282": ("3", 12393125, "C", "Type 2 diabetes (PPARG)", "risk", "Metabolic"),
    "rs5219": ("11", 17409572, "T", "Type 2 diabetes (KCNJ11)", "risk", "Metabolic"),
    "rs13266634": ("8", 118184783, "C", "Type 2 diabetes (SLC30A8)", "risk", "Metabolic"),
    "rs7754840": ("6", 20679709, "C", "Type 2 diabetes (CDKAL1)", "risk", "Metabolic"),
    "rs10811661": ("9", 22134095, "T", "Type 2 diabetes (CDKN2A/B)", "risk", "Metabolic"),
    "rs864745": ("7", 28196413, "T", "Type 2 diabetes (JAZF1)", "risk", "Metabolic"),
    "rs4402960": ("3", 185511687, "T", "Type 2 diabetes (IGF2BP2)", "risk", "Metabolic"),
    # Obesity/BMI
    "rs9939609": ("16", 53820527, "A", "Obesity (FTO)", "risk", "Metabolic"),
    "rs17782313": ("18", 57851097, "C", "Obesity (MC4R)", "risk", "Metabolic"),
    "rs6548238": ("2", 634905, "C", "BMI", "higher", "Metabolic"),
    "rs10938397": ("4", 45186139, "G", "BMI (GNPDA2)", "higher", "Metabolic"),
    "rs571312": ("18", 57839769, "A", "BMI (MC4R)", "higher", "Metabolic"),
    "rs10767664": ("11", 27682562, "A", "BMI (BDNF)", "higher", "Metabolic"),

    # ========================================================================
    # EYE CONDITIONS (existing + expanded)
    # ========================================================================
    "rs10490924": ("10", 124214448, "T", "Age-related macular degeneration (ARMS2)", "risk", "Eye"),
    "rs1061170": ("1", 196659237, "C", "Age-related macular degeneration (CFH)", "risk", "Eye"),
    "rs9621532": ("22", 38477587, "C", "Myopia", "risk", "Eye"),
    "rs10034228": ("4", 81951543, "A", "Myopia", "risk", "Eye"),
    "rs1048661": ("1", 165655423, "C", "Glaucoma (LOXL1)", "risk", "Eye"),
    "rs4656461": ("1", 165653012, "G", "Glaucoma (LOXL1)", "risk", "Eye"),
    "rs2165241": ("15", 93600556, "T", "Glaucoma", "risk", "Eye"),
    "rs3753841": ("1", 196704632, "C", "Age-related macular degeneration", "risk", "Eye"),

    # ========================================================================
    # NEUROPSYCHIATRIC (existing)
    # ========================================================================
    # Alzheimer's Disease
    "rs429358": ("19", 45411941, "C", "Alzheimer's disease (APOE e4)", "risk", "Neuropsychiatric"),
    "rs7412": ("19", 45412079, "T", "Alzheimer's disease (APOE e2)", "protective", "Neuropsychiatric"),
    "rs3865444": ("19", 51727962, "C", "Alzheimer's disease (CD33)", "risk", "Neuropsychiatric"),
    "rs744373": ("2", 127892810, "G", "Alzheimer's disease (BIN1)", "risk", "Neuropsychiatric"),
    "rs3851179": ("11", 85868640, "T", "Alzheimer's disease (PICALM)", "protective", "Neuropsychiatric"),
    "rs670139": ("11", 59939307, "G", "Alzheimer's disease (MS4A)", "risk", "Neuropsychiatric"),
    "rs9349407": ("6", 47487762, "C", "Alzheimer's disease (CD2AP)", "risk", "Neuropsychiatric"),
    "rs11136000": ("8", 27468503, "C", "Alzheimer's disease (CLU)", "protective", "Neuropsychiatric"),
    "rs3764650": ("19", 1063443, "G", "Alzheimer's disease (ABCA7)", "risk", "Neuropsychiatric"),
    "rs3818361": ("1", 207692049, "A", "Alzheimer's disease (CR1)", "risk", "Neuropsychiatric"),
    # Parkinson's Disease (new)
    "rs356220": ("4", 90626111, "T", "Parkinson's disease (SNCA)", "risk", "Neuropsychiatric"),
    "rs11931074": ("4", 90674917, "G", "Parkinson's disease (SNCA)", "risk", "Neuropsychiatric"),
    "rs34637584": ("12", 40734202, "A", "Parkinson's disease (LRRK2)", "risk", "Neuropsychiatric"),
    "rs34311866": ("4", 951947, "C", "Parkinson's disease (TMEM175)", "risk", "Neuropsychiatric"),
    # Depression
    "rs1545843": ("1", 72761657, "A", "Major depression (NEGR1)", "risk", "Neuropsychiatric"),
    "rs7973260": ("12", 118364392, "A", "Major depression (KSR2)", "risk", "Neuropsychiatric"),
    "rs10514299": ("5", 87992715, "T", "Major depression (TMEM161B)", "risk", "Neuropsychiatric"),
    "rs2422321": ("15", 88945878, "G", "Major depression (NTRK3)", "risk", "Neuropsychiatric"),
    "rs301806": ("1", 8477981, "A", "Major depression (RERE)", "risk", "Neuropsychiatric"),
    "rs1432639": ("3", 117115304, "G", "Major depression (LSAMP)", "risk", "Neuropsychiatric"),
    "rs9530139": ("13", 53645407, "G", "Major depression", "risk", "Neuropsychiatric"),
    "rs4543289": ("10", 106610839, "T", "Major depression (SORCS3)", "risk", "Neuropsychiatric"),
    # Anxiety
    "rs1709393": ("1", 34774088, "A", "Anxiety disorder", "risk", "Neuropsychiatric"),
    "rs7688285": ("4", 123372626, "A", "Anxiety disorder", "risk", "Neuropsychiatric"),
    # Bipolar
    "rs4765913": ("12", 2345295, "A", "Bipolar disorder (CACNA1C)", "risk", "Neuropsychiatric"),
    "rs10994336": ("10", 64649959, "T", "Bipolar disorder (ANK3)", "risk", "Neuropsychiatric"),
    "rs9804190": ("11", 79077426, "C", "Bipolar disorder (ODZ4)", "risk", "Neuropsychiatric"),
    # Schizophrenia
    "rs1625579": ("8", 130635575, "T", "Schizophrenia (MIR137)", "risk", "Neuropsychiatric"),
    "rs2007044": ("6", 28626894, "G", "Schizophrenia (HIST1H2BJ)", "risk", "Neuropsychiatric"),
    "rs6932590": ("6", 27243984, "T", "Schizophrenia", "risk", "Neuropsychiatric"),
    # ADHD (new)
    "rs1412005": ("16", 73099702, "T", "ADHD", "risk", "Neuropsychiatric"),
    "rs11210892": ("1", 44185231, "A", "ADHD", "risk", "Neuropsychiatric"),

    # ========================================================================
    # OTHER TRAITS (existing + expanded)
    # ========================================================================
    # Caffeine
    "rs762551": ("15", 75041917, "C", "Caffeine metabolism (slow)", "slow", "Other"),
    "rs2472297": ("15", 75027880, "T", "Caffeine consumption", "higher", "Other"),
    # Alcohol
    "rs671": ("12", 112241766, "A", "Alcohol flush reaction (ALDH2)", "risk", "Other"),
    "rs1229984": ("4", 100239319, "T", "Alcohol metabolism (ADH1B)", "fast", "Other"),
    # Lactose
    # Same chrom/pos/allele as "rs4988235" in the bone-health section; the
    # "_lct" suffix only keeps the dict key unique (it is not a real rsID).
    "rs4988235_lct": ("2", 136608646, "G", "Lactose intolerance (LCT)", "risk", "Other"),
    # Vitamin D
    "rs12785878": ("11", 71167449, "T", "Vitamin D levels (lower)", "lower", "Other"),
    # Hair
    "rs2180439": ("20", 22162468, "T", "Male pattern baldness", "risk", "Other"),
    "rs1160312": ("X", 67052952, "A", "Male pattern baldness (AR)", "risk", "Other"),
    "rs6625163": ("X", 67177092, "A", "Male pattern baldness", "risk", "Other"),
    # Muscle performance (new)
    "rs1815739": ("11", 66560624, "T", "Sprint/Power athlete (ACTN3)", "power", "Other"),
    # Bitter taste (new)
    "rs713598": ("7", 141972804, "C", "Bitter taste sensitivity (PTC)", "taster", "Other"),
    "rs1726866": ("7", 141972905, "T", "Bitter taste sensitivity", "taster", "Other"),
    # Cilantro aversion (new)
    "rs72921001": ("11", 6889648, "A", "Cilantro aversion", "aversion", "Other"),
}
|
|
|
|
# Category display order and descriptions
|
|
CATEGORIES = {
    # Keys are the category codes stored as the last element of every
    # TRAIT_SNPS tuple; values are the Traditional Chinese labels printed in
    # the console summary and the report. The insertion order below is the
    # order in which generate_report() and main() list the categories.
    "Gout": "痛風 / 尿酸代謝",  # Gout / uric acid metabolism
    "Kidney": "腎臟疾病",  # Kidney disease
    "Hearing": "聽力損失",  # Hearing loss
    "Autoimmune": "自體免疫疾病",  # Autoimmune diseases
    "Cancer": "癌症風險",  # Cancer risk
    "Thrombosis": "血栓 / 凝血",  # Thrombosis / coagulation
    "Thyroid": "甲狀腺疾病",  # Thyroid disease
    "Bone": "骨質疏鬆 / 骨骼健康",  # Osteoporosis / bone health
    "Liver": "肝臟疾病",  # Liver disease
    "Migraine": "偏頭痛",  # Migraine
    "Longevity": "長壽 / 老化",  # Longevity / aging
    "Sleep": "睡眠",  # Sleep
    "Skin": "皮膚",  # Skin
    "Cardiovascular": "心血管疾病",  # Cardiovascular disease
    "Metabolic": "代謝疾病",  # Metabolic disease
    "Eye": "眼睛疾病",  # Eye disease
    "Neuropsychiatric": "神經精神疾病",  # Neuropsychiatric disorders
    "Other": "其他特性",  # Other traits
}
|
|
|
|
|
|
def get_genotype_class(gt: str) -> str:
    """Classify a VCF GT value by zygosity.

    Args:
        gt: Genotype string such as ``0/1``, ``1|1``, ``./.`` or a haploid
            ``1``. Alleles may be separated by ``/`` or ``|``.

    Returns:
        ``'MISSING'`` when no allele is called (``.``, ``./.``, ``././.``,
        empty string), ``'HOM_REF'`` when every allele is ``0``,
        ``'HOM_ALT'`` when every allele is the same non-reference allele,
        and ``'HET'`` otherwise -- including ``1/2``, which carries two
        different ALT alleles and is therefore not homozygous.
    """
    alleles = re.split(r'[/|]', gt)
    called = [a for a in alleles if a not in ('.', '')]

    # Nothing called at all, whatever the ploidy or separator.
    if not called:
        return 'MISSING'

    # Half-calls such as "./1" or "0/." keep their historical 'HET' label so
    # that callers which only test for 'MISSING' still count the called allele.
    if len(called) < len(alleles):
        return 'HET'

    if all(a == '0' for a in called):
        return 'HOM_REF'

    # Homozygous only if every copy is the *same* allele index.
    if len(set(called)) == 1:
        return 'HOM_ALT'

    return 'HET'
|
|
|
|
|
|
def parse_vcf_for_traits(vcf_path: str, sample_idx: int = 2) -> Tuple[Dict, List]:
    """Scan a VCF for the positions listed in TRAIT_SNPS and genotype one sample.

    Records are matched on chromosome and position only (the VCF ID column is
    not consulted). A leading ``chr`` on the VCF chromosome name is ignored
    for matching, so both ``4`` and ``chr4`` hit the unprefixed database keys.

    Args:
        vcf_path: Path to the VCF; a name ending in ``.gz`` is read via gzip.
        sample_idx: Zero-based index into the sample columns (the columns
            after FORMAT). If the record has fewer sample columns, or FORMAT
            has no ``GT`` key, the genotype is recorded as ``./.``.

    Returns:
        ``(found_variants, samples)``. ``found_variants`` maps each matching
        TRAIT_SNPS key to a dict with the keys ``rsid``, ``chrom``, ``pos``,
        ``ref``, ``alt``, ``genotype``, ``genotype_class``, ``risk_allele``,
        ``trait``, ``effect``, ``category``, ``has_risk_allele`` and
        ``risk_copies``. ``samples`` is the list of sample names from the
        ``#CHROM`` header line (empty when no such line was seen).
    """
    print(f"Scanning VCF for {len(TRAIT_SNPS)} trait-associated variants...")

    # Index the database by (chromosome, position-as-text). Several database
    # entries may share one position, hence the list values.
    pos_to_snp = {}
    for rsid, (db_chrom, db_pos, risk_allele, trait, effect, category) in TRAIT_SNPS.items():
        pos_to_snp.setdefault((db_chrom, str(db_pos)), []).append(
            (rsid, risk_allele, trait, effect, category)
        )

    found_variants = {}
    samples = []

    is_gzip = vcf_path.endswith('.gz')
    open_func = gzip.open if is_gzip else open
    mode = 'rt' if is_gzip else 'r'

    with open_func(vcf_path, mode) as f:
        for line in f:
            if line.startswith('##'):
                continue
            if line.startswith('#CHROM'):
                samples = line.strip().split('\t')[9:]
                continue

            parts = line.strip().split('\t')
            if len(parts) < 10:
                # Not a data row with at least one sample column.
                continue

            chrom, pos, ref, alt, fmt = parts[0], parts[1], parts[3], parts[4], parts[8]
            gt_fields = parts[9:]

            # UCSC-style contig names ("chr4") must match the bare names
            # ("4") used by the database.
            chrom_key = chrom[3:] if chrom.lower().startswith('chr') else chrom
            key = (chrom_key, pos)
            if key not in pos_to_snp:
                continue

            # Extract the sample genotype. Without a GT key in FORMAT there is
            # no genotype to read -- do not mistake another sub-field for it.
            gt = './.'
            fmt_parts = fmt.split(':')
            if 'GT' in fmt_parts and sample_idx < len(gt_fields):
                gt_idx = fmt_parts.index('GT')
                gt_data = gt_fields[sample_idx].split(':')
                if gt_idx < len(gt_data):
                    gt = gt_data[gt_idx]

            gt_class = get_genotype_class(gt)
            alleles = [ref] + alt.split(',')

            # Resolve the allele indices in GT to allele strings once per
            # record; uncalled or out-of-range indices are skipped.
            carried = []
            if gt_class != 'MISSING':
                for a in re.split(r'[/|]', gt):
                    if a.isdigit() and int(a) < len(alleles):
                        carried.append(alleles[int(a)])

            # NOTE: if the VCF has several records at one position, a later
            # record overwrites the entry written by an earlier one.
            for rsid, risk_allele, trait, effect, category in pos_to_snp[key]:
                risk_copies = carried.count(risk_allele)
                found_variants[rsid] = {
                    'rsid': rsid,
                    'chrom': chrom,
                    'pos': pos,
                    'ref': ref,
                    'alt': alt,
                    'genotype': gt,
                    'genotype_class': gt_class,
                    'risk_allele': risk_allele,
                    'trait': trait,
                    'effect': effect,
                    'category': category,
                    'has_risk_allele': risk_copies > 0,
                    'risk_copies': risk_copies,
                }

    return found_variants, samples
|
|
|
|
|
|
def generate_report(found_variants: Dict, output_path: str, sample_name: str):
    """Write the plain-text trait report for one sample.

    The report has four parts: a header with overall counts, a one-line
    summary per category, a detailed listing per category (carriers of the
    listed allele first, then by trait name), and a tab-separated table of
    every variant sorted by category and trait. Categories are listed in
    CATEGORIES order; variants whose category is not a CATEGORIES key appear
    only in the final table.

    Args:
        found_variants: Mapping of database key to the per-variant dict
            produced by parse_vcf_for_traits().
        output_path: Destination file; it is overwritten.
        sample_name: Sample label printed in the report header.
    """
    # NOTE(review): "risk" counts and the RISK/OK status depend only on
    # has_risk_allele, so entries whose effect is e.g. 'protective' or
    # 'favorable' are also labelled RISK when the listed allele is carried.
    by_category = defaultdict(list)
    for var in found_variants.values():
        by_category[var['category']].append(var)

    # (display label, variants, carrier count) for each populated category,
    # computed once and reused by the summary and the detailed listing.
    sections = []
    for cat_key, cat_name in CATEGORIES.items():
        if cat_key not in by_category:
            continue
        variants = by_category[cat_key]
        risk_count = sum(1 for v in variants if v['has_risk_allele'])
        sections.append((cat_name, variants, risk_count))

    total_risk = sum(1 for v in found_variants.values() if v['has_risk_allele'])
    rule = "=" * 80

    out = [
        rule + "\n",
        "COMPREHENSIVE GWAS TRAIT ANALYSIS REPORT\n",
        f"Sample: {sample_name}\n",
        f"Total SNPs analyzed: {len(TRAIT_SNPS)}\n",
        f"SNPs found in data: {len(found_variants)}\n",
        rule + "\n\n",
        f"OVERALL SUMMARY: {total_risk} risk variants found\n\n",
        rule + "\n",
        "SUMMARY BY CATEGORY\n",
        rule + "\n\n",
    ]

    for cat_name, variants, risk_count in sections:
        out.append(f"{cat_name}: {risk_count}/{len(variants)} risk variants\n")

    out.append("\n" + rule + "\n")
    out.append("DETAILED RESULTS BY CATEGORY\n")
    out.append(rule + "\n")

    for cat_name, variants, risk_count in sections:
        out.append(f"\n\n## {cat_name} ({risk_count}/{len(variants)} risk)\n")
        out.append("-" * 60 + "\n")

        # Carriers first (False sorts before True), then alphabetically by trait.
        ordered = sorted(variants, key=lambda x: (not x['has_risk_allele'], x['trait']))
        for v in ordered:
            status = "⚠️ RISK" if v['has_risk_allele'] else "✓ OK"
            copies = f"({v['risk_copies']}份)" if v['has_risk_allele'] else ""
            out.append(f"\n{v['trait']}: {v['rsid']} [{status}] {copies}\n")
            out.append(
                f" 基因型: {v['genotype']} | 風險等位基因: {v['risk_allele']} | 效應: {v['effect']}\n"
            )

    out.append("\n\n" + rule + "\n")
    out.append("COMPLETE VARIANT TABLE\n")
    out.append(rule + "\n\n")
    out.append("RSID\tCHROM\tPOS\tGENOTYPE\tRISK_ALLELE\tHAS_RISK\tCOPIES\tTRAIT\tCATEGORY\tEFFECT\n")

    for var in sorted(found_variants.values(), key=lambda x: (x['category'], x['trait'])):
        out.append(
            f"{var['rsid']}\t{var['chrom']}\t{var['pos']}\t{var['genotype']}\t"
            f"{var['risk_allele']}\t{var['has_risk_allele']}\t{var['risk_copies']}\t"
            f"{var['trait']}\t{var['category']}\t{var['effect']}\n"
        )

    # The report contains CJK text and emoji, so the encoding must be pinned:
    # the locale default (e.g. cp1252 or ASCII) cannot encode these characters.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("".join(out))

    print(f"Report saved to: {output_path}")
|
|
|
|
|
|
def main():
    """Command-line entry point.

    Usage: ``script.py [VCF_PATH] [OUTPUT_PATH] [SAMPLE_INDEX]``

    All three arguments are optional and positional; SAMPLE_INDEX is the
    zero-based index of the sample column to analyse (default 2). The function
    scans the VCF, prints a per-category summary, writes the full report to
    OUTPUT_PATH, and finally lists variants carrying two or more copies of the
    listed allele.
    """
    args = sys.argv[1:]
    vcf_path = args[0] if len(args) > 0 else '/Volumes/NV2/genomics_analysis/vcf/trio_joint.rsid.vcf.gz'
    output_path = args[1] if len(args) > 1 else '/Volumes/NV2/genomics_analysis/gwas_comprehensive_report.txt'
    try:
        sample_idx = int(args[2]) if len(args) > 2 else 2
    except ValueError:
        # Give a readable message instead of a traceback for a typo'd index.
        sys.exit(f"Error: sample index must be an integer, got {args[2]!r}")

    print("=" * 60)
    print("COMPREHENSIVE GWAS TRAIT ANALYSIS")
    print("=" * 60)
    print(f"VCF: {vcf_path}")
    print(f"Sample index: {sample_idx}")
    print(f"Total trait SNPs in database: {len(TRAIT_SNPS)}")
    print()

    found_variants, samples = parse_vcf_for_traits(vcf_path, sample_idx)

    if sample_idx < len(samples):
        sample_name = samples[sample_idx]
    else:
        sample_name = f"Sample_{sample_idx}"
        if samples:
            # The header names fewer samples than the requested index, so the
            # parser could not read any genotype; make that visible.
            print(
                f"Warning: sample index {sample_idx} is out of range - the VCF "
                f"header lists {len(samples)} sample(s); genotypes will be "
                f"reported as missing.",
                file=sys.stderr,
            )

    print(f"Analyzing sample: {sample_name}")
    print(f"\nFound {len(found_variants)} trait-associated variants in VCF")

    # Quick summary by category
    by_category = defaultdict(list)
    for var in found_variants.values():
        by_category[var['category']].append(var)

    print("\n" + "=" * 60)
    print("QUICK SUMMARY BY CATEGORY")
    print("=" * 60)

    for cat_key, cat_name in CATEGORIES.items():
        if cat_key not in by_category:
            continue
        variants = by_category[cat_key]
        risk_count = sum(1 for v in variants if v['has_risk_allele'])
        marker = "⚠️ " if risk_count > 0 else " "
        print(f"{marker}{cat_name}: {risk_count}/{len(variants)} risk variants")

    generate_report(found_variants, output_path, sample_name)

    # Print high-risk findings
    print("\n" + "=" * 60)
    print("HIGH-PRIORITY FINDINGS (2+ copies of risk allele)")
    print("=" * 60)

    high_risk = [v for v in found_variants.values() if v['risk_copies'] >= 2]
    if high_risk:
        for v in sorted(high_risk, key=lambda x: x['category']):
            print(f"\n{v['trait']} ({v['rsid']})")
            # Fall back to the raw category code if it has no display label.
            print(f" Category: {CATEGORIES.get(v['category'], v['category'])}")
            print(
                f" Genotype: {v['genotype']} "
                f"({v['risk_copies']} copies of risk allele {v['risk_allele']})"
            )
    else:
        print("\nNo variants with 2 copies of risk allele found.")


if __name__ == '__main__':
    main()
|