Refactor: Replace scaffolding with working analysis scripts

- Add trio_analysis.py for trio-based variant analysis with de novo detection
- Add clinvar_acmg_annotate.py for ClinVar/ACMG annotation
- Add gwas_comprehensive.py with 201 SNPs across 18 categories
- Add pharmgkb_full_analysis.py for pharmacogenomics analysis
- Add gwas_trait_lookup.py for basic GWAS trait lookup
- Add pharmacogenomics.py for basic PGx analysis
- Remove unused scaffolding code (src/, configs/, docs/, tests/)
- Update README.md with new documentation

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-01 22:36:02 +08:00
parent f74dc351f7
commit d13d58df8b
56 changed files with 2608 additions and 2347 deletions
--- a/trio_analysis.py
+++ b/trio_analysis.py
@@ -0,0 +1,376 @@
+#!/usr/bin/env python3
+"""
+Trio WES Analysis Script
+Analyzes trio VCF for de novo mutations, compound heterozygous variants,
+and potential pathogenic variants.
+"""
+
+import gzip
+import re
+from collections import defaultdict
+from dataclasses import dataclass
+from typing import List, Dict, Optional, Tuple
+import json
+
+@dataclass
+class Variant:
+    """One VCF data line plus the fields taken from its first SnpEff ANN entry."""
+    # The per-field notes below describe how parse_vcf populates each attribute.
+    chrom: str  # CHROM column
+    pos: int  # POS column, converted with int()
+    ref: str  # REF column
+    alt: str  # ALT column, kept as the raw string
+    qual: float  # QUAL column; 0 when the column is '.' or not numeric
+    filter_status: str  # FILTER column
+    info: str  # raw INFO column
+    genotypes: Dict[str, str]  # sample -> genotype
+    annotation: Optional[str] = None  # parse_vcf stores the raw INFO string here too
+    gene: Optional[str] = None  # 'gene' value returned by parse_snpeff_annotation
+    effect: Optional[str] = None  # 'effect' value returned by parse_snpeff_annotation
+    impact: Optional[str] = None  # 'impact' value; the report filters on 'HIGH'/'MODERATE'
+
+def parse_genotype(gt_field: str) -> Tuple[str, int, int]:
+    """Parse genotype field, return (gt_string, ref_count, alt_count)"""
+    parts = gt_field.split(':')
+    gt = parts[0]
+
+    if gt in ['./.', '.|.', '.']:
+        return gt, 0, 0
+
+    alleles = re.split('[/|]', gt)
+    ref_count = sum(1 for a in alleles if a == '0')
+    alt_count = sum(1 for a in alleles if a != '0' and a != '.')
+
+    return gt, ref_count, alt_count
+
+def get_genotype_class(gt: str) -> str:
+    """Classify genotype as HOM_REF, HET, HOM_ALT, or MISSING"""
+    if gt in ['./.', '.|.', '.']:
+        return 'MISSING'
+
+    alleles = re.split('[/|]', gt)
+    if all(a == '0' for a in alleles):
+        return 'HOM_REF'
+    elif all(a != '0' and a != '.' for a in alleles):
+        return 'HOM_ALT'
+    else:
+        return 'HET'
+
+def parse_snpeff_annotation(info: str) -> Dict:
+    """Parse SnpEff ANN field"""
+    result = {
+        'gene': None,
+        'effect': None,
+        'impact': None,
+        'hgvs_c': None,
+        'hgvs_p': None,
+    }
+
+    ann_match = re.search(r'ANN=([^;]+)', info)
+    if not ann_match:
+        return result
+
+    ann_field = ann_match.group(1)
+    annotations = ann_field.split(',')
+
+    if annotations:
+        # Take the first (most severe) annotation
+        parts = annotations[0].split('|')
+        if len(parts) >= 4:
+            result['effect'] = parts[1] if len(parts) > 1 else None
+            result['impact'] = parts[2] if len(parts) > 2 else None
+            result['gene'] = parts[3] if len(parts) > 3 else None
+            if len(parts) > 9:
+                result['hgvs_c'] = parts[9]
+            if len(parts) > 10:
+                result['hgvs_p'] = parts[10]
+
+    return result
+
+def parse_vcf(vcf_path: str) -> Tuple[List[str], List[Variant]]:
+    """Parse VCF file and return sample names and variants"""
+    samples = []
+    variants = []
+
+    open_func = gzip.open if vcf_path.endswith('.gz') else open
+    mode = 'rt' if vcf_path.endswith('.gz') else 'r'
+
+    with open_func(vcf_path, mode) as f:
+        for line in f:
+            if line.startswith('##'):
+                continue
+            elif line.startswith('#CHROM'):
+                parts = line.strip().split('\t')
+                samples = parts[9:]
+                continue
+
+            parts = line.strip().split('\t')
+            if len(parts) < 10:
+                continue
+
+            chrom, pos, _, ref, alt, qual, filt, info, fmt = parts[:9]
+            gt_fields = parts[9:]
+
+            # Parse genotypes
+            genotypes = {}
+            fmt_fields = fmt.split(':')
+            gt_idx = fmt_fields.index('GT') if 'GT' in fmt_fields else 0
+
+            for i, sample in enumerate(samples):
+                gt_parts = gt_fields[i].split(':')
+                genotypes[sample] = gt_parts[gt_idx] if gt_idx < len(gt_parts) else './.'
+
+            # Parse annotation
+            ann = parse_snpeff_annotation(info)
+
+            try:
+                qual_val = float(qual) if qual != '.' else 0
+            except ValueError:
+                qual_val = 0
+
+            variant = Variant(
+                chrom=chrom,
+                pos=int(pos),
+                ref=ref,
+                alt=alt,
+                qual=qual_val,
+                filter_status=filt,
+                info=info,
+                genotypes=genotypes,
+                annotation=info,
+                gene=ann['gene'],
+                effect=ann['effect'],
+                impact=ann['impact']
+            )
+            variants.append(variant)
+
+    return samples, variants
+
+def identify_de_novo(variants: List[Variant], proband: str, father: str, mother: str) -> List[Variant]:
+    """Identify de novo variants: present in proband but absent in both parents"""
+    de_novo = []
+
+    for v in variants:
+        if proband not in v.genotypes or father not in v.genotypes or mother not in v.genotypes:
+            continue
+
+        proband_gt = get_genotype_class(v.genotypes[proband])
+        father_gt = get_genotype_class(v.genotypes[father])
+        mother_gt = get_genotype_class(v.genotypes[mother])
+
+        # De novo: proband has variant, both parents are HOM_REF
+        if proband_gt in ['HET', 'HOM_ALT'] and father_gt == 'HOM_REF' and mother_gt == 'HOM_REF':
+            de_novo.append(v)
+
+    return de_novo
+
+def identify_compound_het(variants: List[Variant], proband: str, father: str, mother: str) -> Dict[str, List[Variant]]:
+    """Identify compound heterozygous variants in genes"""
+    gene_variants = defaultdict(list)
+
+    # Group HET variants by gene
+    for v in variants:
+        if not v.gene:
+            continue
+
+        if proband not in v.genotypes:
+            continue
+
+        proband_gt = get_genotype_class(v.genotypes[proband])
+        if proband_gt != 'HET':
+            continue
+
+        gene_variants[v.gene].append(v)
+
+    # Find compound het (>1 HET variant in same gene, inherited from different parents)
+    compound_het = {}
+
+    for gene, vars_list in gene_variants.items():
+        if len(vars_list) < 2:
+            continue
+
+        maternal_inherited = []
+        paternal_inherited = []
+
+        for v in vars_list:
+            if father not in v.genotypes or mother not in v.genotypes:
+                continue
+
+            father_gt = get_genotype_class(v.genotypes[father])
+            mother_gt = get_genotype_class(v.genotypes[mother])
+
+            if father_gt in ['HET', 'HOM_ALT'] and mother_gt == 'HOM_REF':
+                paternal_inherited.append(v)
+            elif mother_gt in ['HET', 'HOM_ALT'] and father_gt == 'HOM_REF':
+                maternal_inherited.append(v)
+
+        if maternal_inherited and paternal_inherited:
+            compound_het[gene] = maternal_inherited + paternal_inherited
+
+    return compound_het
+
+def identify_homozygous_recessive(variants: List[Variant], proband: str, father: str, mother: str) -> List[Variant]:
+    """Identify homozygous recessive variants: HOM_ALT in proband, both parents HET"""
+    hom_rec = []
+
+    for v in variants:
+        if proband not in v.genotypes or father not in v.genotypes or mother not in v.genotypes:
+            continue
+
+        proband_gt = get_genotype_class(v.genotypes[proband])
+        father_gt = get_genotype_class(v.genotypes[father])
+        mother_gt = get_genotype_class(v.genotypes[mother])
+
+        # Homozygous recessive: proband HOM_ALT, both parents HET
+        if proband_gt == 'HOM_ALT' and father_gt == 'HET' and mother_gt == 'HET':
+            hom_rec.append(v)
+
+    return hom_rec
+
+def filter_by_impact(variants: List[Variant], impacts: List[str] = ['HIGH', 'MODERATE']) -> List[Variant]:
+    """Filter variants by impact level"""
+    return [v for v in variants if v.impact in impacts]
+
+def generate_report(vcf_path: str, output_path: str):
+    """Generate trio analysis report"""
+    print(f"Parsing VCF: {vcf_path}")
+    samples, variants = parse_vcf(vcf_path)
+
+    print(f"Found {len(samples)} samples: {samples}")
+    print(f"Total variants: {len(variants)}")
+
+    # Identify sample roles based on file naming convention
+    # Expected: I-1 (father), I-2 (mother), II-3 (proband)
+    proband = None
+    father = None
+    mother = None
+
+    for s in samples:
+        s_upper = s.upper()
+        if 'II-3' in s_upper or 'PROBAND' in s_upper:
+            proband = s
+        elif 'I-1' in s_upper:
+            father = s
+        elif 'I-2' in s_upper:
+            mother = s
+
+    if not all([proband, father, mother]):
+        # Fallback: assume order is proband, father, mother
+        if len(samples) >= 3:
+            proband = samples[0]
+            father = samples[1]
+            mother = samples[2]
+        else:
+            print("ERROR: Could not identify trio samples")
+            return
+
+    print(f"\nTrio identified:")
+    print(f"  Proband: {proband}")
+    print(f"  Father:  {father}")
+    print(f"  Mother:  {mother}")
+
+    # Analysis
+    print("\n" + "="*80)
+    print("TRIO ANALYSIS RESULTS")
+    print("="*80)
+
+    # De novo variants
+    de_novo = identify_de_novo(variants, proband, father, mother)
+    de_novo_high = filter_by_impact(de_novo, ['HIGH', 'MODERATE'])
+
+    print(f"\n1. DE NOVO VARIANTS")
+    print(f"   Total de novo: {len(de_novo)}")
+    print(f"   HIGH/MODERATE impact: {len(de_novo_high)}")
+
+    # Compound heterozygous
+    compound_het = identify_compound_het(variants, proband, father, mother)
+
+    print(f"\n2. COMPOUND HETEROZYGOUS GENES")
+    print(f"   Genes with compound het: {len(compound_het)}")
+
+    # Homozygous recessive
+    hom_rec = identify_homozygous_recessive(variants, proband, father, mother)
+    hom_rec_high = filter_by_impact(hom_rec, ['HIGH', 'MODERATE'])
+
+    print(f"\n3. HOMOZYGOUS RECESSIVE VARIANTS")
+    print(f"   Total: {len(hom_rec)}")
+    print(f"   HIGH/MODERATE impact: {len(hom_rec_high)}")
+
+    # Generate detailed report
+    with open(output_path, 'w') as f:
+        f.write("# Trio WES Analysis Report\n")
+        f.write(f"# Generated from: {vcf_path}\n")
+        f.write(f"# Samples: Proband={proband}, Father={father}, Mother={mother}\n")
+        f.write(f"# Total variants analyzed: {len(variants)}\n\n")
+
+        # De novo HIGH/MODERATE impact
+        f.write("## DE NOVO VARIANTS (HIGH/MODERATE IMPACT)\n")
+        f.write("CHROM\tPOS\tREF\tALT\tGENE\tEFFECT\tIMPACT\tPROBAND_GT\tFATHER_GT\tMOTHER_GT\n")
+        for v in sorted(de_novo_high, key=lambda x: (x.chrom, x.pos)):
+            f.write(f"{v.chrom}\t{v.pos}\t{v.ref}\t{v.alt}\t{v.gene or 'N/A'}\t")
+            f.write(f"{v.effect or 'N/A'}\t{v.impact or 'N/A'}\t")
+            f.write(f"{v.genotypes.get(proband, './.')}\t")
+            f.write(f"{v.genotypes.get(father, './.')}\t")
+            f.write(f"{v.genotypes.get(mother, './.')}\n")
+
+        # Compound heterozygous
+        f.write("\n## COMPOUND HETEROZYGOUS GENES\n")
+        for gene, vars_list in sorted(compound_het.items()):
+            high_impact = [v for v in vars_list if v.impact in ['HIGH', 'MODERATE']]
+            if high_impact:
+                f.write(f"\n### {gene} ({len(vars_list)} variants, {len(high_impact)} HIGH/MODERATE)\n")
+                f.write("CHROM\tPOS\tREF\tALT\tEFFECT\tIMPACT\tPROBAND_GT\tFATHER_GT\tMOTHER_GT\n")
+                for v in sorted(high_impact, key=lambda x: x.pos):
+                    f.write(f"{v.chrom}\t{v.pos}\t{v.ref}\t{v.alt}\t")
+                    f.write(f"{v.effect or 'N/A'}\t{v.impact or 'N/A'}\t")
+                    f.write(f"{v.genotypes.get(proband, './.')}\t")
+                    f.write(f"{v.genotypes.get(father, './.')}\t")
+                    f.write(f"{v.genotypes.get(mother, './.')}\n")
+
+        # Homozygous recessive HIGH/MODERATE
+        f.write("\n## HOMOZYGOUS RECESSIVE VARIANTS (HIGH/MODERATE IMPACT)\n")
+        f.write("CHROM\tPOS\tREF\tALT\tGENE\tEFFECT\tIMPACT\tPROBAND_GT\tFATHER_GT\tMOTHER_GT\n")
+        for v in sorted(hom_rec_high, key=lambda x: (x.chrom, x.pos)):
+            f.write(f"{v.chrom}\t{v.pos}\t{v.ref}\t{v.alt}\t{v.gene or 'N/A'}\t")
+            f.write(f"{v.effect or 'N/A'}\t{v.impact or 'N/A'}\t")
+            f.write(f"{v.genotypes.get(proband, './.')}\t")
+            f.write(f"{v.genotypes.get(father, './.')}\t")
+            f.write(f"{v.genotypes.get(mother, './.')}\n")
+
+        # Summary statistics
+        f.write("\n## SUMMARY STATISTICS\n")
+        f.write(f"Total variants: {len(variants)}\n")
+        f.write(f"De novo variants: {len(de_novo)}\n")
+        f.write(f"De novo HIGH/MODERATE: {len(de_novo_high)}\n")
+        f.write(f"Compound het genes: {len(compound_het)}\n")
+        f.write(f"Homozygous recessive: {len(hom_rec)}\n")
+        f.write(f"Homozygous recessive HIGH/MODERATE: {len(hom_rec_high)}\n")
+
+    print(f"\nReport saved to: {output_path}")
+
+    # Also print top candidates
+    print("\n" + "="*80)
+    print("TOP CANDIDATE VARIANTS")
+    print("="*80)
+
+    print("\n--- De Novo HIGH Impact ---")
+    de_novo_high_only = [v for v in de_novo if v.impact == 'HIGH']
+    for v in de_novo_high_only[:10]:
+        print(f"  {v.chrom}:{v.pos} {v.ref}>{v.alt} | {v.gene} | {v.effect}")
+
+    print("\n--- Compound Het Genes (with HIGH impact) ---")
+    for gene, vars_list in list(compound_het.items())[:10]:
+        high_count = sum(1 for v in vars_list if v.impact == 'HIGH')
+        if high_count > 0:
+            print(f"  {gene}: {len(vars_list)} variants ({high_count} HIGH)")
+
+    print("\n--- Homozygous Recessive HIGH Impact ---")
+    hom_rec_high_only = [v for v in hom_rec if v.impact == 'HIGH']
+    for v in hom_rec_high_only[:10]:
+        print(f"  {v.chrom}:{v.pos} {v.ref}>{v.alt} | {v.gene} | {v.effect}")
+
+if __name__ == '__main__':
+    import sys
+
+    vcf_path = sys.argv[1] if len(sys.argv) > 1 else '/Volumes/NV2/genomics_analysis/vcf/trio_joint.snpeff.vcf'
+    output_path = sys.argv[2] if len(sys.argv) > 2 else '/Volumes/NV2/genomics_analysis/trio_analysis_report.txt'
+
+    generate_report(vcf_path, output_path)