- Add trio_analysis.py for trio-based variant analysis with de novo detection - Add clinvar_acmg_annotate.py for ClinVar/ACMG annotation - Add gwas_comprehensive.py with 201 SNPs across 18 categories - Add pharmgkb_full_analysis.py for pharmacogenomics analysis - Add gwas_trait_lookup.py for basic GWAS trait lookup - Add pharmacogenomics.py for basic PGx analysis - Remove unused scaffolding code (src/, configs/, docs/, tests/) - Update README.md with new documentation 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
366 lines
15 KiB
Python
366 lines
15 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
GWAS Trait Lookup Script
|
|
Searches for trait-associated variants in VCF data using GWAS Catalog data.
|
|
"""
|
|
|
|
import gzip
|
|
import sys
|
|
import os
|
|
from collections import defaultdict
|
|
from dataclasses import dataclass
|
|
from typing import Dict, List, Optional, Set, Tuple
|
|
|
|
@dataclass
|
|
class GWASAssociation:
|
|
"""GWAS association entry"""
|
|
rsid: str
|
|
chrom: str
|
|
pos: int
|
|
risk_allele: str
|
|
trait: str
|
|
p_value: float
|
|
odds_ratio: Optional[float]
|
|
beta: Optional[float]
|
|
pubmed_id: str
|
|
study: str
|
|
|
|
@dataclass
|
|
class TraitResult:
|
|
"""Result for a specific trait"""
|
|
trait: str
|
|
variants_found: int
|
|
risk_variants: int
|
|
protective_variants: int
|
|
details: List[dict]
|
|
|
|
# Common trait-associated SNPs (curated from GWAS Catalog)
|
|
# Format: rsid -> (chrom, pos, risk_allele, trait, effect_direction)
|
|
TRAIT_SNPS = {
|
|
# Sleep quality / Insomnia
|
|
"rs113851554": ("2", 66799986, "T", "Insomnia", "risk"),
|
|
"rs12927162": ("16", 68856985, "A", "Sleep duration", "shorter"),
|
|
"rs1823125": ("1", 205713532, "G", "Chronotype (morning person)", "morning"),
|
|
"rs10493596": ("1", 215803417, "T", "Insomnia", "risk"),
|
|
"rs3104997": ("6", 27424938, "C", "Sleep duration", "shorter"),
|
|
"rs73598374": ("4", 94847526, "A", "Insomnia", "risk"),
|
|
"rs2302729": ("5", 35857091, "G", "Insomnia", "risk"),
|
|
|
|
# Skin conditions
|
|
"rs1800629": ("6", 31543031, "A", "Psoriasis", "risk"),
|
|
"rs20541": ("5", 131995964, "A", "Atopic dermatitis", "risk"),
|
|
"rs2066808": ("6", 31540784, "A", "Psoriasis", "risk"),
|
|
"rs3093662": ("6", 31574339, "G", "Psoriasis", "risk"),
|
|
"rs10484554": ("6", 31271836, "A", "Psoriasis", "risk"),
|
|
"rs1295686": ("5", 131996447, "A", "Atopic dermatitis", "risk"),
|
|
"rs2227956": ("6", 31783279, "T", "Psoriasis", "risk"),
|
|
"rs6906021": ("6", 32051991, "C", "Atopic dermatitis", "risk"),
|
|
"rs661313": ("1", 152285861, "G", "Ichthyosis vulgaris (FLG)", "risk"),
|
|
"rs2065958": ("11", 35300406, "C", "Atopic dermatitis", "risk"),
|
|
|
|
# Cardiovascular
|
|
"rs10757274": ("9", 22096055, "G", "Coronary artery disease", "risk"),
|
|
"rs1333049": ("9", 22125503, "C", "Coronary artery disease", "risk"),
|
|
"rs4665058": ("2", 43845437, "C", "Coronary artery disease", "risk"),
|
|
"rs17465637": ("1", 222823529, "A", "Coronary artery disease", "risk"),
|
|
"rs6725887": ("2", 203828796, "C", "Coronary artery disease", "risk"),
|
|
|
|
# Type 2 Diabetes
|
|
"rs7903146": ("10", 114758349, "T", "Type 2 diabetes", "risk"),
|
|
"rs12255372": ("10", 114808902, "T", "Type 2 diabetes", "risk"),
|
|
"rs1801282": ("3", 12393125, "C", "Type 2 diabetes", "risk"),
|
|
"rs5219": ("11", 17409572, "T", "Type 2 diabetes", "risk"),
|
|
"rs13266634": ("8", 118184783, "C", "Type 2 diabetes", "risk"),
|
|
|
|
# Obesity/BMI
|
|
"rs9939609": ("16", 53820527, "A", "Obesity (FTO)", "risk"),
|
|
"rs17782313": ("18", 57851097, "C", "Obesity (MC4R)", "risk"),
|
|
"rs6548238": ("2", 634905, "C", "BMI", "higher"),
|
|
"rs10938397": ("4", 45186139, "G", "BMI", "higher"),
|
|
|
|
# Hair loss / Baldness
|
|
"rs2180439": ("20", 22162468, "T", "Male pattern baldness", "risk"),
|
|
"rs1160312": ("X", 67052952, "A", "Male pattern baldness", "risk"),
|
|
"rs6625163": ("X", 67177092, "A", "Male pattern baldness", "risk"),
|
|
|
|
# Eye conditions (relevant to Usher)
|
|
"rs10490924": ("10", 124214448, "T", "Age-related macular degeneration", "risk"),
|
|
"rs1061170": ("1", 196659237, "C", "Age-related macular degeneration", "risk"),
|
|
"rs9621532": ("22", 38477587, "C", "Myopia", "risk"),
|
|
|
|
# Caffeine metabolism
|
|
"rs762551": ("15", 75041917, "C", "Caffeine metabolism (slow)", "slow"),
|
|
"rs2472297": ("15", 75027880, "T", "Caffeine consumption", "higher"),
|
|
|
|
# Alcohol metabolism
|
|
"rs671": ("12", 112241766, "A", "Alcohol flush reaction", "risk"),
|
|
"rs1229984": ("4", 100239319, "T", "Alcohol metabolism (fast)", "fast"),
|
|
|
|
# Lactose intolerance
|
|
"rs4988235": ("2", 136608646, "G", "Lactose intolerance", "risk"),
|
|
|
|
# Vitamin D
|
|
"rs2282679": ("4", 72608383, "C", "Vitamin D deficiency", "risk"),
|
|
"rs12785878": ("11", 71167449, "T", "Vitamin D levels (lower)", "lower"),
|
|
|
|
# Alzheimer's Disease / Dementia
|
|
"rs429358": ("19", 45411941, "C", "Alzheimer's disease (APOE e4)", "risk"), # APOE e4
|
|
"rs7412": ("19", 45412079, "T", "Alzheimer's disease (APOE e2)", "protective"), # APOE e2
|
|
"rs3865444": ("19", 51727962, "C", "Alzheimer's disease (CD33)", "risk"),
|
|
"rs744373": ("2", 127892810, "G", "Alzheimer's disease (BIN1)", "risk"),
|
|
"rs3851179": ("11", 85868640, "T", "Alzheimer's disease (PICALM)", "protective"),
|
|
"rs670139": ("11", 59939307, "G", "Alzheimer's disease (MS4A)", "risk"),
|
|
"rs9349407": ("6", 47487762, "C", "Alzheimer's disease (CD2AP)", "risk"),
|
|
"rs11136000": ("8", 27468503, "C", "Alzheimer's disease (CLU)", "protective"),
|
|
"rs3764650": ("19", 1063443, "G", "Alzheimer's disease (ABCA7)", "risk"),
|
|
"rs3818361": ("1", 207692049, "A", "Alzheimer's disease (CR1)", "risk"),
|
|
|
|
# Depression / Major Depressive Disorder
|
|
"rs1545843": ("1", 72761657, "A", "Major depression (NEGR1)", "risk"),
|
|
"rs7973260": ("12", 118364392, "A", "Major depression (KSR2)", "risk"),
|
|
"rs10514299": ("5", 87992715, "T", "Major depression (TMEM161B)", "risk"),
|
|
"rs2422321": ("15", 88945878, "G", "Major depression (NTRK3)", "risk"),
|
|
"rs301806": ("1", 8477981, "A", "Major depression (RERE)", "risk"),
|
|
"rs1432639": ("3", 117115304, "G", "Major depression (LSAMP)", "risk"),
|
|
"rs9530139": ("13", 53645407, "G", "Major depression", "risk"),
|
|
"rs4543289": ("10", 106610839, "T", "Major depression (SORCS3)", "risk"),
|
|
|
|
# Anxiety
|
|
"rs1709393": ("1", 34774088, "A", "Anxiety disorder", "risk"),
|
|
"rs7688285": ("4", 123372626, "A", "Anxiety disorder", "risk"),
|
|
|
|
# Bipolar disorder
|
|
"rs4765913": ("12", 2345295, "A", "Bipolar disorder (CACNA1C)", "risk"),
|
|
"rs10994336": ("10", 64649959, "T", "Bipolar disorder (ANK3)", "risk"),
|
|
"rs9804190": ("11", 79077426, "C", "Bipolar disorder (ODZ4)", "risk"),
|
|
|
|
# Schizophrenia
|
|
"rs1625579": ("8", 130635575, "T", "Schizophrenia (MIR137)", "risk"),
|
|
"rs2007044": ("6", 28626894, "G", "Schizophrenia (HIST1H2BJ)", "risk"),
|
|
"rs6932590": ("6", 27243984, "T", "Schizophrenia", "risk"),
|
|
}
|
|
|
|
|
|
def get_genotype_class(gt: str) -> str:
|
|
"""Classify genotype"""
|
|
if gt in ['./.', '.|.', '.']:
|
|
return 'MISSING'
|
|
|
|
import re
|
|
alleles = re.split('[/|]', gt)
|
|
if all(a == '0' for a in alleles):
|
|
return 'HOM_REF'
|
|
elif all(a != '0' and a != '.' for a in alleles):
|
|
return 'HOM_ALT'
|
|
else:
|
|
return 'HET'
|
|
|
|
|
|
def parse_vcf_for_traits(vcf_path: str, proband_idx: int = 2) -> Dict[str, dict]:
|
|
"""Parse VCF and look for trait-associated SNPs"""
|
|
|
|
print(f"Scanning VCF for trait-associated variants...")
|
|
|
|
# Build position lookup
|
|
pos_to_snp = {}
|
|
for rsid, (chrom, pos, risk_allele, trait, effect) in TRAIT_SNPS.items():
|
|
key = f"{chrom}-{pos}"
|
|
pos_to_snp[key] = (rsid, risk_allele, trait, effect)
|
|
|
|
found_variants = {}
|
|
samples = []
|
|
|
|
open_func = gzip.open if vcf_path.endswith('.gz') else open
|
|
mode = 'rt' if vcf_path.endswith('.gz') else 'r'
|
|
|
|
with open_func(vcf_path, mode) as f:
|
|
for line in f:
|
|
if line.startswith('##'):
|
|
continue
|
|
elif line.startswith('#CHROM'):
|
|
parts = line.strip().split('\t')
|
|
samples = parts[9:]
|
|
continue
|
|
|
|
parts = line.strip().split('\t')
|
|
if len(parts) < 10:
|
|
continue
|
|
|
|
chrom, pos, rsid_vcf, ref, alt, qual, filt, info, fmt = parts[:9]
|
|
gt_fields = parts[9:]
|
|
|
|
# Check if this position has a known trait SNP
|
|
key = f"{chrom}-{pos}"
|
|
if key not in pos_to_snp:
|
|
continue
|
|
|
|
rsid, risk_allele, trait, effect = pos_to_snp[key]
|
|
|
|
# Get proband genotype
|
|
fmt_parts = fmt.split(':')
|
|
gt_idx = fmt_parts.index('GT') if 'GT' in fmt_parts else 0
|
|
|
|
if proband_idx < len(gt_fields):
|
|
gt_data = gt_fields[proband_idx].split(':')
|
|
gt = gt_data[gt_idx] if gt_idx < len(gt_data) else './.'
|
|
else:
|
|
gt = './.'
|
|
|
|
gt_class = get_genotype_class(gt)
|
|
|
|
# Determine risk
|
|
alleles = [ref] + alt.split(',')
|
|
|
|
# Check if risk allele is present
|
|
has_risk = False
|
|
risk_copies = 0
|
|
|
|
if gt_class != 'MISSING':
|
|
import re
|
|
gt_alleles = re.split('[/|]', gt)
|
|
for a in gt_alleles:
|
|
if a.isdigit():
|
|
allele_idx = int(a)
|
|
if allele_idx < len(alleles) and alleles[allele_idx] == risk_allele:
|
|
has_risk = True
|
|
risk_copies += 1
|
|
|
|
found_variants[rsid] = {
|
|
'rsid': rsid,
|
|
'chrom': chrom,
|
|
'pos': pos,
|
|
'ref': ref,
|
|
'alt': alt,
|
|
'genotype': gt,
|
|
'genotype_class': gt_class,
|
|
'risk_allele': risk_allele,
|
|
'trait': trait,
|
|
'effect': effect,
|
|
'has_risk_allele': has_risk,
|
|
'risk_copies': risk_copies
|
|
}
|
|
|
|
return found_variants, samples
|
|
|
|
|
|
def generate_trait_report(found_variants: Dict, output_path: str):
|
|
"""Generate trait analysis report"""
|
|
|
|
# Group by trait
|
|
traits = defaultdict(list)
|
|
for rsid, var in found_variants.items():
|
|
traits[var['trait']].append(var)
|
|
|
|
with open(output_path, 'w') as f:
|
|
f.write("# GWAS Trait Analysis Report\n")
|
|
f.write("# Based on curated GWAS Catalog associations\n\n")
|
|
|
|
f.write("=" * 80 + "\n")
|
|
f.write("SUMMARY BY TRAIT CATEGORY\n")
|
|
f.write("=" * 80 + "\n\n")
|
|
|
|
# Categorize traits
|
|
categories = {
|
|
"Sleep": ["Insomnia", "Sleep duration", "Chronotype (morning person)"],
|
|
"Skin": ["Psoriasis", "Atopic dermatitis", "Ichthyosis vulgaris (FLG)"],
|
|
"Cardiovascular": ["Coronary artery disease"],
|
|
"Metabolic": ["Type 2 diabetes", "Obesity (FTO)", "Obesity (MC4R)", "BMI"],
|
|
"Eye": ["Age-related macular degeneration", "Myopia"],
|
|
"Neuropsychiatric": [
|
|
"Alzheimer's disease (APOE e4)", "Alzheimer's disease (APOE e2)",
|
|
"Alzheimer's disease (CD33)", "Alzheimer's disease (BIN1)",
|
|
"Alzheimer's disease (PICALM)", "Alzheimer's disease (MS4A)",
|
|
"Alzheimer's disease (CD2AP)", "Alzheimer's disease (CLU)",
|
|
"Alzheimer's disease (ABCA7)", "Alzheimer's disease (CR1)",
|
|
"Major depression (NEGR1)", "Major depression (KSR2)",
|
|
"Major depression (TMEM161B)", "Major depression (NTRK3)",
|
|
"Major depression (RERE)", "Major depression (LSAMP)",
|
|
"Major depression", "Major depression (SORCS3)",
|
|
"Anxiety disorder",
|
|
"Bipolar disorder (CACNA1C)", "Bipolar disorder (ANK3)", "Bipolar disorder (ODZ4)",
|
|
"Schizophrenia (MIR137)", "Schizophrenia (HIST1H2BJ)", "Schizophrenia"
|
|
],
|
|
"Other": ["Caffeine metabolism (slow)", "Caffeine consumption", "Alcohol flush reaction",
|
|
"Alcohol metabolism (fast)", "Lactose intolerance", "Vitamin D deficiency",
|
|
"Vitamin D levels (lower)", "Male pattern baldness"]
|
|
}
|
|
|
|
for category, trait_list in categories.items():
|
|
f.write(f"\n## {category}\n")
|
|
f.write("-" * 40 + "\n")
|
|
|
|
category_risk = 0
|
|
category_total = 0
|
|
|
|
for trait in trait_list:
|
|
if trait in traits:
|
|
variants = traits[trait]
|
|
risk_count = sum(1 for v in variants if v['has_risk_allele'])
|
|
category_risk += risk_count
|
|
category_total += len(variants)
|
|
|
|
for v in variants:
|
|
status = "RISK" if v['has_risk_allele'] else "OK"
|
|
copies = f"({v['risk_copies']} copies)" if v['has_risk_allele'] else ""
|
|
f.write(f" {v['trait']}: {v['rsid']} [{status}] {copies}\n")
|
|
f.write(f" Genotype: {v['genotype']} | Risk allele: {v['risk_allele']}\n")
|
|
|
|
if category_total > 0:
|
|
f.write(f"\n Category summary: {category_risk}/{category_total} risk variants found\n")
|
|
|
|
# Detailed results
|
|
f.write("\n" + "=" * 80 + "\n")
|
|
f.write("DETAILED RESULTS\n")
|
|
f.write("=" * 80 + "\n\n")
|
|
|
|
f.write("RSID\tCHROM\tPOS\tGENOTYPE\tRISK_ALLELE\tHAS_RISK\tCOPIES\tTRAIT\tEFFECT\n")
|
|
|
|
for rsid, var in sorted(found_variants.items(), key=lambda x: (x[1]['trait'], x[0])):
|
|
f.write(f"{var['rsid']}\t{var['chrom']}\t{var['pos']}\t{var['genotype']}\t")
|
|
f.write(f"{var['risk_allele']}\t{var['has_risk_allele']}\t{var['risk_copies']}\t")
|
|
f.write(f"{var['trait']}\t{var['effect']}\n")
|
|
|
|
print(f"Report saved to: {output_path}")
|
|
|
|
|
|
def main():
|
|
vcf_path = sys.argv[1] if len(sys.argv) > 1 else '/Volumes/NV2/genomics_analysis/vcf/trio_joint.snpeff.vcf'
|
|
output_path = sys.argv[2] if len(sys.argv) > 2 else '/Volumes/NV2/genomics_analysis/gwas_trait_report.txt'
|
|
proband_idx = int(sys.argv[3]) if len(sys.argv) > 3 else 2
|
|
|
|
print(f"GWAS Trait Analysis")
|
|
print(f"VCF: {vcf_path}")
|
|
print(f"Proband index: {proband_idx}")
|
|
print(f"Searching for {len(TRAIT_SNPS)} trait-associated SNPs...\n")
|
|
|
|
found_variants, samples = parse_vcf_for_traits(vcf_path, proband_idx)
|
|
|
|
print(f"\nFound {len(found_variants)} trait-associated variants in VCF")
|
|
|
|
# Quick summary
|
|
risk_count = sum(1 for v in found_variants.values() if v['has_risk_allele'])
|
|
print(f"Variants with risk allele: {risk_count}")
|
|
|
|
generate_trait_report(found_variants, output_path)
|
|
|
|
# Print summary to console
|
|
print("\n" + "=" * 60)
|
|
print("QUICK SUMMARY")
|
|
print("=" * 60)
|
|
|
|
traits = defaultdict(list)
|
|
for rsid, var in found_variants.items():
|
|
traits[var['trait']].append(var)
|
|
|
|
for trait in sorted(traits.keys()):
|
|
variants = traits[trait]
|
|
risk_vars = [v for v in variants if v['has_risk_allele']]
|
|
if risk_vars:
|
|
print(f"\n{trait}:")
|
|
for v in risk_vars:
|
|
print(f" {v['rsid']}: {v['genotype']} (risk allele: {v['risk_allele']}, copies: {v['risk_copies']})")
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|