#!/usr/bin/env python3 """ GWAS Trait Lookup Script Searches for trait-associated variants in VCF data using GWAS Catalog data. """ import gzip import sys import os from collections import defaultdict from dataclasses import dataclass from typing import Dict, List, Optional, Set, Tuple @dataclass class GWASAssociation: """GWAS association entry""" rsid: str chrom: str pos: int risk_allele: str trait: str p_value: float odds_ratio: Optional[float] beta: Optional[float] pubmed_id: str study: str @dataclass class TraitResult: """Result for a specific trait""" trait: str variants_found: int risk_variants: int protective_variants: int details: List[dict] # Common trait-associated SNPs (curated from GWAS Catalog) # Format: rsid -> (chrom, pos, risk_allele, trait, effect_direction) TRAIT_SNPS = { # Sleep quality / Insomnia "rs113851554": ("2", 66799986, "T", "Insomnia", "risk"), "rs12927162": ("16", 68856985, "A", "Sleep duration", "shorter"), "rs1823125": ("1", 205713532, "G", "Chronotype (morning person)", "morning"), "rs10493596": ("1", 215803417, "T", "Insomnia", "risk"), "rs3104997": ("6", 27424938, "C", "Sleep duration", "shorter"), "rs73598374": ("4", 94847526, "A", "Insomnia", "risk"), "rs2302729": ("5", 35857091, "G", "Insomnia", "risk"), # Skin conditions "rs1800629": ("6", 31543031, "A", "Psoriasis", "risk"), "rs20541": ("5", 131995964, "A", "Atopic dermatitis", "risk"), "rs2066808": ("6", 31540784, "A", "Psoriasis", "risk"), "rs3093662": ("6", 31574339, "G", "Psoriasis", "risk"), "rs10484554": ("6", 31271836, "A", "Psoriasis", "risk"), "rs1295686": ("5", 131996447, "A", "Atopic dermatitis", "risk"), "rs2227956": ("6", 31783279, "T", "Psoriasis", "risk"), "rs6906021": ("6", 32051991, "C", "Atopic dermatitis", "risk"), "rs661313": ("1", 152285861, "G", "Ichthyosis vulgaris (FLG)", "risk"), "rs2065958": ("11", 35300406, "C", "Atopic dermatitis", "risk"), # Cardiovascular "rs10757274": ("9", 22096055, "G", "Coronary artery disease", "risk"), "rs1333049": ("9", 22125503, "C", "Coronary artery disease", "risk"), "rs4665058": ("2", 43845437, "C", "Coronary artery disease", "risk"), "rs17465637": ("1", 222823529, "A", "Coronary artery disease", "risk"), "rs6725887": ("2", 203828796, "C", "Coronary artery disease", "risk"), # Type 2 Diabetes "rs7903146": ("10", 114758349, "T", "Type 2 diabetes", "risk"), "rs12255372": ("10", 114808902, "T", "Type 2 diabetes", "risk"), "rs1801282": ("3", 12393125, "C", "Type 2 diabetes", "risk"), "rs5219": ("11", 17409572, "T", "Type 2 diabetes", "risk"), "rs13266634": ("8", 118184783, "C", "Type 2 diabetes", "risk"), # Obesity/BMI "rs9939609": ("16", 53820527, "A", "Obesity (FTO)", "risk"), "rs17782313": ("18", 57851097, "C", "Obesity (MC4R)", "risk"), "rs6548238": ("2", 634905, "C", "BMI", "higher"), "rs10938397": ("4", 45186139, "G", "BMI", "higher"), # Hair loss / Baldness "rs2180439": ("20", 22162468, "T", "Male pattern baldness", "risk"), "rs1160312": ("X", 67052952, "A", "Male pattern baldness", "risk"), "rs6625163": ("X", 67177092, "A", "Male pattern baldness", "risk"), # Eye conditions (relevant to Usher) "rs10490924": ("10", 124214448, "T", "Age-related macular degeneration", "risk"), "rs1061170": ("1", 196659237, "C", "Age-related macular degeneration", "risk"), "rs9621532": ("22", 38477587, "C", "Myopia", "risk"), # Caffeine metabolism "rs762551": ("15", 75041917, "C", "Caffeine metabolism (slow)", "slow"), "rs2472297": ("15", 75027880, "T", "Caffeine consumption", "higher"), # Alcohol metabolism "rs671": ("12", 112241766, "A", "Alcohol flush reaction", "risk"), "rs1229984": ("4", 100239319, "T", "Alcohol metabolism (fast)", "fast"), # Lactose intolerance "rs4988235": ("2", 136608646, "G", "Lactose intolerance", "risk"), # Vitamin D "rs2282679": ("4", 72608383, "C", "Vitamin D deficiency", "risk"), "rs12785878": ("11", 71167449, "T", "Vitamin D levels (lower)", "lower"), # Alzheimer's Disease / Dementia "rs429358": ("19", 45411941, "C", "Alzheimer's disease (APOE e4)", "risk"), # APOE e4 "rs7412": ("19", 45412079, "T", "Alzheimer's disease (APOE e2)", "protective"), # APOE e2 "rs3865444": ("19", 51727962, "C", "Alzheimer's disease (CD33)", "risk"), "rs744373": ("2", 127892810, "G", "Alzheimer's disease (BIN1)", "risk"), "rs3851179": ("11", 85868640, "T", "Alzheimer's disease (PICALM)", "protective"), "rs670139": ("11", 59939307, "G", "Alzheimer's disease (MS4A)", "risk"), "rs9349407": ("6", 47487762, "C", "Alzheimer's disease (CD2AP)", "risk"), "rs11136000": ("8", 27468503, "C", "Alzheimer's disease (CLU)", "protective"), "rs3764650": ("19", 1063443, "G", "Alzheimer's disease (ABCA7)", "risk"), "rs3818361": ("1", 207692049, "A", "Alzheimer's disease (CR1)", "risk"), # Depression / Major Depressive Disorder "rs1545843": ("1", 72761657, "A", "Major depression (NEGR1)", "risk"), "rs7973260": ("12", 118364392, "A", "Major depression (KSR2)", "risk"), "rs10514299": ("5", 87992715, "T", "Major depression (TMEM161B)", "risk"), "rs2422321": ("15", 88945878, "G", "Major depression (NTRK3)", "risk"), "rs301806": ("1", 8477981, "A", "Major depression (RERE)", "risk"), "rs1432639": ("3", 117115304, "G", "Major depression (LSAMP)", "risk"), "rs9530139": ("13", 53645407, "G", "Major depression", "risk"), "rs4543289": ("10", 106610839, "T", "Major depression (SORCS3)", "risk"), # Anxiety "rs1709393": ("1", 34774088, "A", "Anxiety disorder", "risk"), "rs7688285": ("4", 123372626, "A", "Anxiety disorder", "risk"), # Bipolar disorder "rs4765913": ("12", 2345295, "A", "Bipolar disorder (CACNA1C)", "risk"), "rs10994336": ("10", 64649959, "T", "Bipolar disorder (ANK3)", "risk"), "rs9804190": ("11", 79077426, "C", "Bipolar disorder (ODZ4)", "risk"), # Schizophrenia "rs1625579": ("8", 130635575, "T", "Schizophrenia (MIR137)", "risk"), "rs2007044": ("6", 28626894, "G", "Schizophrenia (HIST1H2BJ)", "risk"), "rs6932590": ("6", 27243984, "T", "Schizophrenia", "risk"), } def get_genotype_class(gt: str) -> str: """Classify genotype""" if gt in ['./.', '.|.', '.']: return 'MISSING' import re alleles = re.split('[/|]', gt) if all(a == '0' for a in alleles): return 'HOM_REF' elif all(a != '0' and a != '.' for a in alleles): return 'HOM_ALT' else: return 'HET' def parse_vcf_for_traits(vcf_path: str, proband_idx: int = 2) -> Dict[str, dict]: """Parse VCF and look for trait-associated SNPs""" print(f"Scanning VCF for trait-associated variants...") # Build position lookup pos_to_snp = {} for rsid, (chrom, pos, risk_allele, trait, effect) in TRAIT_SNPS.items(): key = f"{chrom}-{pos}" pos_to_snp[key] = (rsid, risk_allele, trait, effect) found_variants = {} samples = [] open_func = gzip.open if vcf_path.endswith('.gz') else open mode = 'rt' if vcf_path.endswith('.gz') else 'r' with open_func(vcf_path, mode) as f: for line in f: if line.startswith('##'): continue elif line.startswith('#CHROM'): parts = line.strip().split('\t') samples = parts[9:] continue parts = line.strip().split('\t') if len(parts) < 10: continue chrom, pos, rsid_vcf, ref, alt, qual, filt, info, fmt = parts[:9] gt_fields = parts[9:] # Check if this position has a known trait SNP key = f"{chrom}-{pos}" if key not in pos_to_snp: continue rsid, risk_allele, trait, effect = pos_to_snp[key] # Get proband genotype fmt_parts = fmt.split(':') gt_idx = fmt_parts.index('GT') if 'GT' in fmt_parts else 0 if proband_idx < len(gt_fields): gt_data = gt_fields[proband_idx].split(':') gt = gt_data[gt_idx] if gt_idx < len(gt_data) else './.' else: gt = './.' gt_class = get_genotype_class(gt) # Determine risk alleles = [ref] + alt.split(',') # Check if risk allele is present has_risk = False risk_copies = 0 if gt_class != 'MISSING': import re gt_alleles = re.split('[/|]', gt) for a in gt_alleles: if a.isdigit(): allele_idx = int(a) if allele_idx < len(alleles) and alleles[allele_idx] == risk_allele: has_risk = True risk_copies += 1 found_variants[rsid] = { 'rsid': rsid, 'chrom': chrom, 'pos': pos, 'ref': ref, 'alt': alt, 'genotype': gt, 'genotype_class': gt_class, 'risk_allele': risk_allele, 'trait': trait, 'effect': effect, 'has_risk_allele': has_risk, 'risk_copies': risk_copies } return found_variants, samples def generate_trait_report(found_variants: Dict, output_path: str): """Generate trait analysis report""" # Group by trait traits = defaultdict(list) for rsid, var in found_variants.items(): traits[var['trait']].append(var) with open(output_path, 'w') as f: f.write("# GWAS Trait Analysis Report\n") f.write("# Based on curated GWAS Catalog associations\n\n") f.write("=" * 80 + "\n") f.write("SUMMARY BY TRAIT CATEGORY\n") f.write("=" * 80 + "\n\n") # Categorize traits categories = { "Sleep": ["Insomnia", "Sleep duration", "Chronotype (morning person)"], "Skin": ["Psoriasis", "Atopic dermatitis", "Ichthyosis vulgaris (FLG)"], "Cardiovascular": ["Coronary artery disease"], "Metabolic": ["Type 2 diabetes", "Obesity (FTO)", "Obesity (MC4R)", "BMI"], "Eye": ["Age-related macular degeneration", "Myopia"], "Neuropsychiatric": [ "Alzheimer's disease (APOE e4)", "Alzheimer's disease (APOE e2)", "Alzheimer's disease (CD33)", "Alzheimer's disease (BIN1)", "Alzheimer's disease (PICALM)", "Alzheimer's disease (MS4A)", "Alzheimer's disease (CD2AP)", "Alzheimer's disease (CLU)", "Alzheimer's disease (ABCA7)", "Alzheimer's disease (CR1)", "Major depression (NEGR1)", "Major depression (KSR2)", "Major depression (TMEM161B)", "Major depression (NTRK3)", "Major depression (RERE)", "Major depression (LSAMP)", "Major depression", "Major depression (SORCS3)", "Anxiety disorder", "Bipolar disorder (CACNA1C)", "Bipolar disorder (ANK3)", "Bipolar disorder (ODZ4)", "Schizophrenia (MIR137)", "Schizophrenia (HIST1H2BJ)", "Schizophrenia" ], "Other": ["Caffeine metabolism (slow)", "Caffeine consumption", "Alcohol flush reaction", "Alcohol metabolism (fast)", "Lactose intolerance", "Vitamin D deficiency", "Vitamin D levels (lower)", "Male pattern baldness"] } for category, trait_list in categories.items(): f.write(f"\n## {category}\n") f.write("-" * 40 + "\n") category_risk = 0 category_total = 0 for trait in trait_list: if trait in traits: variants = traits[trait] risk_count = sum(1 for v in variants if v['has_risk_allele']) category_risk += risk_count category_total += len(variants) for v in variants: status = "RISK" if v['has_risk_allele'] else "OK" copies = f"({v['risk_copies']} copies)" if v['has_risk_allele'] else "" f.write(f" {v['trait']}: {v['rsid']} [{status}] {copies}\n") f.write(f" Genotype: {v['genotype']} | Risk allele: {v['risk_allele']}\n") if category_total > 0: f.write(f"\n Category summary: {category_risk}/{category_total} risk variants found\n") # Detailed results f.write("\n" + "=" * 80 + "\n") f.write("DETAILED RESULTS\n") f.write("=" * 80 + "\n\n") f.write("RSID\tCHROM\tPOS\tGENOTYPE\tRISK_ALLELE\tHAS_RISK\tCOPIES\tTRAIT\tEFFECT\n") for rsid, var in sorted(found_variants.items(), key=lambda x: (x[1]['trait'], x[0])): f.write(f"{var['rsid']}\t{var['chrom']}\t{var['pos']}\t{var['genotype']}\t") f.write(f"{var['risk_allele']}\t{var['has_risk_allele']}\t{var['risk_copies']}\t") f.write(f"{var['trait']}\t{var['effect']}\n") print(f"Report saved to: {output_path}") def main(): vcf_path = sys.argv[1] if len(sys.argv) > 1 else '/Volumes/NV2/genomics_analysis/vcf/trio_joint.snpeff.vcf' output_path = sys.argv[2] if len(sys.argv) > 2 else '/Volumes/NV2/genomics_analysis/gwas_trait_report.txt' proband_idx = int(sys.argv[3]) if len(sys.argv) > 3 else 2 print(f"GWAS Trait Analysis") print(f"VCF: {vcf_path}") print(f"Proband index: {proband_idx}") print(f"Searching for {len(TRAIT_SNPS)} trait-associated SNPs...\n") found_variants, samples = parse_vcf_for_traits(vcf_path, proband_idx) print(f"\nFound {len(found_variants)} trait-associated variants in VCF") # Quick summary risk_count = sum(1 for v in found_variants.values() if v['has_risk_allele']) print(f"Variants with risk allele: {risk_count}") generate_trait_report(found_variants, output_path) # Print summary to console print("\n" + "=" * 60) print("QUICK SUMMARY") print("=" * 60) traits = defaultdict(list) for rsid, var in found_variants.items(): traits[var['trait']].append(var) for trait in sorted(traits.keys()): variants = traits[trait] risk_vars = [v for v in variants if v['has_risk_allele']] if risk_vars: print(f"\n{trait}:") for v in risk_vars: print(f" {v['rsid']}: {v['genotype']} (risk allele: {v['risk_allele']}, copies: {v['risk_copies']})") if __name__ == '__main__': main()