Refactor: Replace scaffolding with working analysis scripts

- Add trio_analysis.py for trio-based variant analysis with de novo detection
- Add clinvar_acmg_annotate.py for ClinVar/ACMG annotation
- Add gwas_comprehensive.py with 201 SNPs across 18 categories
- Add pharmgkb_full_analysis.py for pharmacogenomics analysis
- Add gwas_trait_lookup.py for basic GWAS trait lookup
- Add pharmacogenomics.py for basic PGx analysis
- Remove unused scaffolding code (src/, configs/, docs/, tests/)
- Update README.md with new documentation

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2025-12-01 22:36:02 +08:00
parent f74dc351f7
commit d13d58df8b
56 changed files with 2608 additions and 2347 deletions

365
gwas_trait_lookup.py Normal file
View File

@@ -0,0 +1,365 @@
#!/usr/bin/env python3
"""
GWAS Trait Lookup Script
Searches for trait-associated variants in VCF data using GWAS Catalog data.
"""
import gzip
import sys
import os
from collections import defaultdict
from dataclasses import dataclass
from typing import Dict, List, Optional, Set, Tuple
@dataclass
class GWASAssociation:
"""GWAS association entry"""
rsid: str
chrom: str
pos: int
risk_allele: str
trait: str
p_value: float
odds_ratio: Optional[float]
beta: Optional[float]
pubmed_id: str
study: str
@dataclass
class TraitResult:
"""Result for a specific trait"""
trait: str
variants_found: int
risk_variants: int
protective_variants: int
details: List[dict]
# Common trait-associated SNPs (curated from GWAS Catalog)
# Format: rsid -> (chrom, pos, risk_allele, trait, effect_direction)
TRAIT_SNPS = {
# Sleep quality / Insomnia
"rs113851554": ("2", 66799986, "T", "Insomnia", "risk"),
"rs12927162": ("16", 68856985, "A", "Sleep duration", "shorter"),
"rs1823125": ("1", 205713532, "G", "Chronotype (morning person)", "morning"),
"rs10493596": ("1", 215803417, "T", "Insomnia", "risk"),
"rs3104997": ("6", 27424938, "C", "Sleep duration", "shorter"),
"rs73598374": ("4", 94847526, "A", "Insomnia", "risk"),
"rs2302729": ("5", 35857091, "G", "Insomnia", "risk"),
# Skin conditions
"rs1800629": ("6", 31543031, "A", "Psoriasis", "risk"),
"rs20541": ("5", 131995964, "A", "Atopic dermatitis", "risk"),
"rs2066808": ("6", 31540784, "A", "Psoriasis", "risk"),
"rs3093662": ("6", 31574339, "G", "Psoriasis", "risk"),
"rs10484554": ("6", 31271836, "A", "Psoriasis", "risk"),
"rs1295686": ("5", 131996447, "A", "Atopic dermatitis", "risk"),
"rs2227956": ("6", 31783279, "T", "Psoriasis", "risk"),
"rs6906021": ("6", 32051991, "C", "Atopic dermatitis", "risk"),
"rs661313": ("1", 152285861, "G", "Ichthyosis vulgaris (FLG)", "risk"),
"rs2065958": ("11", 35300406, "C", "Atopic dermatitis", "risk"),
# Cardiovascular
"rs10757274": ("9", 22096055, "G", "Coronary artery disease", "risk"),
"rs1333049": ("9", 22125503, "C", "Coronary artery disease", "risk"),
"rs4665058": ("2", 43845437, "C", "Coronary artery disease", "risk"),
"rs17465637": ("1", 222823529, "A", "Coronary artery disease", "risk"),
"rs6725887": ("2", 203828796, "C", "Coronary artery disease", "risk"),
# Type 2 Diabetes
"rs7903146": ("10", 114758349, "T", "Type 2 diabetes", "risk"),
"rs12255372": ("10", 114808902, "T", "Type 2 diabetes", "risk"),
"rs1801282": ("3", 12393125, "C", "Type 2 diabetes", "risk"),
"rs5219": ("11", 17409572, "T", "Type 2 diabetes", "risk"),
"rs13266634": ("8", 118184783, "C", "Type 2 diabetes", "risk"),
# Obesity/BMI
"rs9939609": ("16", 53820527, "A", "Obesity (FTO)", "risk"),
"rs17782313": ("18", 57851097, "C", "Obesity (MC4R)", "risk"),
"rs6548238": ("2", 634905, "C", "BMI", "higher"),
"rs10938397": ("4", 45186139, "G", "BMI", "higher"),
# Hair loss / Baldness
"rs2180439": ("20", 22162468, "T", "Male pattern baldness", "risk"),
"rs1160312": ("X", 67052952, "A", "Male pattern baldness", "risk"),
"rs6625163": ("X", 67177092, "A", "Male pattern baldness", "risk"),
# Eye conditions (relevant to Usher)
"rs10490924": ("10", 124214448, "T", "Age-related macular degeneration", "risk"),
"rs1061170": ("1", 196659237, "C", "Age-related macular degeneration", "risk"),
"rs9621532": ("22", 38477587, "C", "Myopia", "risk"),
# Caffeine metabolism
"rs762551": ("15", 75041917, "C", "Caffeine metabolism (slow)", "slow"),
"rs2472297": ("15", 75027880, "T", "Caffeine consumption", "higher"),
# Alcohol metabolism
"rs671": ("12", 112241766, "A", "Alcohol flush reaction", "risk"),
"rs1229984": ("4", 100239319, "T", "Alcohol metabolism (fast)", "fast"),
# Lactose intolerance
"rs4988235": ("2", 136608646, "G", "Lactose intolerance", "risk"),
# Vitamin D
"rs2282679": ("4", 72608383, "C", "Vitamin D deficiency", "risk"),
"rs12785878": ("11", 71167449, "T", "Vitamin D levels (lower)", "lower"),
# Alzheimer's Disease / Dementia
"rs429358": ("19", 45411941, "C", "Alzheimer's disease (APOE e4)", "risk"), # APOE e4
"rs7412": ("19", 45412079, "T", "Alzheimer's disease (APOE e2)", "protective"), # APOE e2
"rs3865444": ("19", 51727962, "C", "Alzheimer's disease (CD33)", "risk"),
"rs744373": ("2", 127892810, "G", "Alzheimer's disease (BIN1)", "risk"),
"rs3851179": ("11", 85868640, "T", "Alzheimer's disease (PICALM)", "protective"),
"rs670139": ("11", 59939307, "G", "Alzheimer's disease (MS4A)", "risk"),
"rs9349407": ("6", 47487762, "C", "Alzheimer's disease (CD2AP)", "risk"),
"rs11136000": ("8", 27468503, "C", "Alzheimer's disease (CLU)", "protective"),
"rs3764650": ("19", 1063443, "G", "Alzheimer's disease (ABCA7)", "risk"),
"rs3818361": ("1", 207692049, "A", "Alzheimer's disease (CR1)", "risk"),
# Depression / Major Depressive Disorder
"rs1545843": ("1", 72761657, "A", "Major depression (NEGR1)", "risk"),
"rs7973260": ("12", 118364392, "A", "Major depression (KSR2)", "risk"),
"rs10514299": ("5", 87992715, "T", "Major depression (TMEM161B)", "risk"),
"rs2422321": ("15", 88945878, "G", "Major depression (NTRK3)", "risk"),
"rs301806": ("1", 8477981, "A", "Major depression (RERE)", "risk"),
"rs1432639": ("3", 117115304, "G", "Major depression (LSAMP)", "risk"),
"rs9530139": ("13", 53645407, "G", "Major depression", "risk"),
"rs4543289": ("10", 106610839, "T", "Major depression (SORCS3)", "risk"),
# Anxiety
"rs1709393": ("1", 34774088, "A", "Anxiety disorder", "risk"),
"rs7688285": ("4", 123372626, "A", "Anxiety disorder", "risk"),
# Bipolar disorder
"rs4765913": ("12", 2345295, "A", "Bipolar disorder (CACNA1C)", "risk"),
"rs10994336": ("10", 64649959, "T", "Bipolar disorder (ANK3)", "risk"),
"rs9804190": ("11", 79077426, "C", "Bipolar disorder (ODZ4)", "risk"),
# Schizophrenia
"rs1625579": ("8", 130635575, "T", "Schizophrenia (MIR137)", "risk"),
"rs2007044": ("6", 28626894, "G", "Schizophrenia (HIST1H2BJ)", "risk"),
"rs6932590": ("6", 27243984, "T", "Schizophrenia", "risk"),
}
def get_genotype_class(gt: str) -> str:
"""Classify genotype"""
if gt in ['./.', '.|.', '.']:
return 'MISSING'
import re
alleles = re.split('[/|]', gt)
if all(a == '0' for a in alleles):
return 'HOM_REF'
elif all(a != '0' and a != '.' for a in alleles):
return 'HOM_ALT'
else:
return 'HET'
def parse_vcf_for_traits(vcf_path: str, proband_idx: int = 2) -> Dict[str, dict]:
"""Parse VCF and look for trait-associated SNPs"""
print(f"Scanning VCF for trait-associated variants...")
# Build position lookup
pos_to_snp = {}
for rsid, (chrom, pos, risk_allele, trait, effect) in TRAIT_SNPS.items():
key = f"{chrom}-{pos}"
pos_to_snp[key] = (rsid, risk_allele, trait, effect)
found_variants = {}
samples = []
open_func = gzip.open if vcf_path.endswith('.gz') else open
mode = 'rt' if vcf_path.endswith('.gz') else 'r'
with open_func(vcf_path, mode) as f:
for line in f:
if line.startswith('##'):
continue
elif line.startswith('#CHROM'):
parts = line.strip().split('\t')
samples = parts[9:]
continue
parts = line.strip().split('\t')
if len(parts) < 10:
continue
chrom, pos, rsid_vcf, ref, alt, qual, filt, info, fmt = parts[:9]
gt_fields = parts[9:]
# Check if this position has a known trait SNP
key = f"{chrom}-{pos}"
if key not in pos_to_snp:
continue
rsid, risk_allele, trait, effect = pos_to_snp[key]
# Get proband genotype
fmt_parts = fmt.split(':')
gt_idx = fmt_parts.index('GT') if 'GT' in fmt_parts else 0
if proband_idx < len(gt_fields):
gt_data = gt_fields[proband_idx].split(':')
gt = gt_data[gt_idx] if gt_idx < len(gt_data) else './.'
else:
gt = './.'
gt_class = get_genotype_class(gt)
# Determine risk
alleles = [ref] + alt.split(',')
# Check if risk allele is present
has_risk = False
risk_copies = 0
if gt_class != 'MISSING':
import re
gt_alleles = re.split('[/|]', gt)
for a in gt_alleles:
if a.isdigit():
allele_idx = int(a)
if allele_idx < len(alleles) and alleles[allele_idx] == risk_allele:
has_risk = True
risk_copies += 1
found_variants[rsid] = {
'rsid': rsid,
'chrom': chrom,
'pos': pos,
'ref': ref,
'alt': alt,
'genotype': gt,
'genotype_class': gt_class,
'risk_allele': risk_allele,
'trait': trait,
'effect': effect,
'has_risk_allele': has_risk,
'risk_copies': risk_copies
}
return found_variants, samples
def generate_trait_report(found_variants: Dict, output_path: str):
"""Generate trait analysis report"""
# Group by trait
traits = defaultdict(list)
for rsid, var in found_variants.items():
traits[var['trait']].append(var)
with open(output_path, 'w') as f:
f.write("# GWAS Trait Analysis Report\n")
f.write("# Based on curated GWAS Catalog associations\n\n")
f.write("=" * 80 + "\n")
f.write("SUMMARY BY TRAIT CATEGORY\n")
f.write("=" * 80 + "\n\n")
# Categorize traits
categories = {
"Sleep": ["Insomnia", "Sleep duration", "Chronotype (morning person)"],
"Skin": ["Psoriasis", "Atopic dermatitis", "Ichthyosis vulgaris (FLG)"],
"Cardiovascular": ["Coronary artery disease"],
"Metabolic": ["Type 2 diabetes", "Obesity (FTO)", "Obesity (MC4R)", "BMI"],
"Eye": ["Age-related macular degeneration", "Myopia"],
"Neuropsychiatric": [
"Alzheimer's disease (APOE e4)", "Alzheimer's disease (APOE e2)",
"Alzheimer's disease (CD33)", "Alzheimer's disease (BIN1)",
"Alzheimer's disease (PICALM)", "Alzheimer's disease (MS4A)",
"Alzheimer's disease (CD2AP)", "Alzheimer's disease (CLU)",
"Alzheimer's disease (ABCA7)", "Alzheimer's disease (CR1)",
"Major depression (NEGR1)", "Major depression (KSR2)",
"Major depression (TMEM161B)", "Major depression (NTRK3)",
"Major depression (RERE)", "Major depression (LSAMP)",
"Major depression", "Major depression (SORCS3)",
"Anxiety disorder",
"Bipolar disorder (CACNA1C)", "Bipolar disorder (ANK3)", "Bipolar disorder (ODZ4)",
"Schizophrenia (MIR137)", "Schizophrenia (HIST1H2BJ)", "Schizophrenia"
],
"Other": ["Caffeine metabolism (slow)", "Caffeine consumption", "Alcohol flush reaction",
"Alcohol metabolism (fast)", "Lactose intolerance", "Vitamin D deficiency",
"Vitamin D levels (lower)", "Male pattern baldness"]
}
for category, trait_list in categories.items():
f.write(f"\n## {category}\n")
f.write("-" * 40 + "\n")
category_risk = 0
category_total = 0
for trait in trait_list:
if trait in traits:
variants = traits[trait]
risk_count = sum(1 for v in variants if v['has_risk_allele'])
category_risk += risk_count
category_total += len(variants)
for v in variants:
status = "RISK" if v['has_risk_allele'] else "OK"
copies = f"({v['risk_copies']} copies)" if v['has_risk_allele'] else ""
f.write(f" {v['trait']}: {v['rsid']} [{status}] {copies}\n")
f.write(f" Genotype: {v['genotype']} | Risk allele: {v['risk_allele']}\n")
if category_total > 0:
f.write(f"\n Category summary: {category_risk}/{category_total} risk variants found\n")
# Detailed results
f.write("\n" + "=" * 80 + "\n")
f.write("DETAILED RESULTS\n")
f.write("=" * 80 + "\n\n")
f.write("RSID\tCHROM\tPOS\tGENOTYPE\tRISK_ALLELE\tHAS_RISK\tCOPIES\tTRAIT\tEFFECT\n")
for rsid, var in sorted(found_variants.items(), key=lambda x: (x[1]['trait'], x[0])):
f.write(f"{var['rsid']}\t{var['chrom']}\t{var['pos']}\t{var['genotype']}\t")
f.write(f"{var['risk_allele']}\t{var['has_risk_allele']}\t{var['risk_copies']}\t")
f.write(f"{var['trait']}\t{var['effect']}\n")
print(f"Report saved to: {output_path}")
def main():
vcf_path = sys.argv[1] if len(sys.argv) > 1 else '/Volumes/NV2/genomics_analysis/vcf/trio_joint.snpeff.vcf'
output_path = sys.argv[2] if len(sys.argv) > 2 else '/Volumes/NV2/genomics_analysis/gwas_trait_report.txt'
proband_idx = int(sys.argv[3]) if len(sys.argv) > 3 else 2
print(f"GWAS Trait Analysis")
print(f"VCF: {vcf_path}")
print(f"Proband index: {proband_idx}")
print(f"Searching for {len(TRAIT_SNPS)} trait-associated SNPs...\n")
found_variants, samples = parse_vcf_for_traits(vcf_path, proband_idx)
print(f"\nFound {len(found_variants)} trait-associated variants in VCF")
# Quick summary
risk_count = sum(1 for v in found_variants.values() if v['has_risk_allele'])
print(f"Variants with risk allele: {risk_count}")
generate_trait_report(found_variants, output_path)
# Print summary to console
print("\n" + "=" * 60)
print("QUICK SUMMARY")
print("=" * 60)
traits = defaultdict(list)
for rsid, var in found_variants.items():
traits[var['trait']].append(var)
for trait in sorted(traits.keys()):
variants = traits[trait]
risk_vars = [v for v in variants if v['has_risk_allele']]
if risk_vars:
print(f"\n{trait}:")
for v in risk_vars:
print(f" {v['rsid']}: {v['genotype']} (risk allele: {v['risk_allele']}, copies: {v['risk_copies']})")
if __name__ == '__main__':
main()