genomic-consultant/trio_analysis.py

#!/usr/bin/env python3
"""
Trio WES Analysis Script
Analyzes trio VCF for de novo mutations, compound heterozygous variants,
and potential pathogenic variants.
"""

import gzip
import re
from collections import defaultdict
from dataclasses import dataclass
from typing import List, Dict, Optional, Tuple
import json

@dataclass
class Variant:
    chrom: str
    pos: int
    ref: str
    alt: str
    qual: float
    filter_status: str
    info: str
    genotypes: Dict[str, str]  # sample -> genotype
    annotation: Optional[str] = None
    gene: Optional[str] = None
    effect: Optional[str] = None
    impact: Optional[str] = None

def parse_genotype(gt_field: str) -> Tuple[str, int, int]:
    """Parse genotype field, return (gt_string, ref_count, alt_count)"""
    parts = gt_field.split(':')
    gt = parts[0]

    if gt in ['./.', '.|.', '.']:
        return gt, 0, 0

    alleles = re.split('[/|]', gt)
    ref_count = sum(1 for a in alleles if a == '0')
    alt_count = sum(1 for a in alleles if a != '0' and a != '.')

    return gt, ref_count, alt_count

def get_genotype_class(gt: str) -> str:
    """Classify genotype as HOM_REF, HET, HOM_ALT, or MISSING"""
    if gt in ['./.', '.|.', '.']:
        return 'MISSING'

    alleles = re.split('[/|]', gt)
    if all(a == '0' for a in alleles):
        return 'HOM_REF'
    elif all(a != '0' and a != '.' for a in alleles):
        return 'HOM_ALT'
    else:
        return 'HET'

def parse_snpeff_annotation(info: str) -> Dict:
    """Parse SnpEff ANN field"""
    result = {
        'gene': None,
        'effect': None,
        'impact': None,
        'hgvs_c': None,
        'hgvs_p': None,
    }

    ann_match = re.search(r'ANN=([^;]+)', info)
    if not ann_match:
        return result

    ann_field = ann_match.group(1)
    annotations = ann_field.split(',')

    if annotations:
        # Take the first (most severe) annotation
        parts = annotations[0].split('|')
        if len(parts) >= 4:
            result['effect'] = parts[1] if len(parts) > 1 else None
            result['impact'] = parts[2] if len(parts) > 2 else None
            result['gene'] = parts[3] if len(parts) > 3 else None
            if len(parts) > 9:
                result['hgvs_c'] = parts[9]
            if len(parts) > 10:
                result['hgvs_p'] = parts[10]

    return result

def parse_vcf(vcf_path: str) -> Tuple[List[str], List[Variant]]:
    """Parse VCF file and return sample names and variants"""
    samples = []
    variants = []

    open_func = gzip.open if vcf_path.endswith('.gz') else open
    mode = 'rt' if vcf_path.endswith('.gz') else 'r'

    with open_func(vcf_path, mode) as f:
        for line in f:
            if line.startswith('##'):
                continue
            elif line.startswith('#CHROM'):
                parts = line.strip().split('\t')
                samples = parts[9:]
                continue

            parts = line.strip().split('\t')
            if len(parts) < 10:
                continue

            chrom, pos, _, ref, alt, qual, filt, info, fmt = parts[:9]
            gt_fields = parts[9:]

            # Parse genotypes
            genotypes = {}
            fmt_fields = fmt.split(':')
            gt_idx = fmt_fields.index('GT') if 'GT' in fmt_fields else 0

            for i, sample in enumerate(samples):
                gt_parts = gt_fields[i].split(':')
                genotypes[sample] = gt_parts[gt_idx] if gt_idx < len(gt_parts) else './.'

            # Parse annotation
            ann = parse_snpeff_annotation(info)

            try:
                qual_val = float(qual) if qual != '.' else 0
            except ValueError:
                qual_val = 0

            variant = Variant(
                chrom=chrom,
                pos=int(pos),
                ref=ref,
                alt=alt,
                qual=qual_val,
                filter_status=filt,
                info=info,
                genotypes=genotypes,
                annotation=info,
                gene=ann['gene'],
                effect=ann['effect'],
                impact=ann['impact']
            )
            variants.append(variant)

    return samples, variants

def identify_de_novo(variants: List[Variant], proband: str, father: str, mother: str) -> List[Variant]:
    """Identify de novo variants: present in proband but absent in both parents"""
    de_novo = []

    for v in variants:
        if proband not in v.genotypes or father not in v.genotypes or mother not in v.genotypes:
            continue

        proband_gt = get_genotype_class(v.genotypes[proband])
        father_gt = get_genotype_class(v.genotypes[father])
        mother_gt = get_genotype_class(v.genotypes[mother])

        # De novo: proband has variant, both parents are HOM_REF
        if proband_gt in ['HET', 'HOM_ALT'] and father_gt == 'HOM_REF' and mother_gt == 'HOM_REF':
            de_novo.append(v)

    return de_novo

def identify_compound_het(variants: List[Variant], proband: str, father: str, mother: str) -> Dict[str, List[Variant]]:
    """Identify compound heterozygous variants in genes"""
    gene_variants = defaultdict(list)

    # Group HET variants by gene
    for v in variants:
        if not v.gene:
            continue

        if proband not in v.genotypes:
            continue

        proband_gt = get_genotype_class(v.genotypes[proband])
        if proband_gt != 'HET':
            continue

        gene_variants[v.gene].append(v)

    # Find compound het (>1 HET variant in same gene, inherited from different parents)
    compound_het = {}

    for gene, vars_list in gene_variants.items():
        if len(vars_list) < 2:
            continue

        maternal_inherited = []
        paternal_inherited = []

        for v in vars_list:
            if father not in v.genotypes or mother not in v.genotypes:
                continue

            father_gt = get_genotype_class(v.genotypes[father])
            mother_gt = get_genotype_class(v.genotypes[mother])

            if father_gt in ['HET', 'HOM_ALT'] and mother_gt == 'HOM_REF':
                paternal_inherited.append(v)
            elif mother_gt in ['HET', 'HOM_ALT'] and father_gt == 'HOM_REF':
                maternal_inherited.append(v)

        if maternal_inherited and paternal_inherited:
            compound_het[gene] = maternal_inherited + paternal_inherited

    return compound_het

def identify_homozygous_recessive(variants: List[Variant], proband: str, father: str, mother: str) -> List[Variant]:
    """Identify homozygous recessive variants: HOM_ALT in proband, both parents HET"""
    hom_rec = []

    for v in variants:
        if proband not in v.genotypes or father not in v.genotypes or mother not in v.genotypes:
            continue

        proband_gt = get_genotype_class(v.genotypes[proband])
        father_gt = get_genotype_class(v.genotypes[father])
        mother_gt = get_genotype_class(v.genotypes[mother])

        # Homozygous recessive: proband HOM_ALT, both parents HET
        if proband_gt == 'HOM_ALT' and father_gt == 'HET' and mother_gt == 'HET':
            hom_rec.append(v)

    return hom_rec

def filter_by_impact(variants: List[Variant], impacts: List[str] = ['HIGH', 'MODERATE']) -> List[Variant]:
    """Filter variants by impact level"""
    return [v for v in variants if v.impact in impacts]

def generate_report(vcf_path: str, output_path: str):
    """Generate trio analysis report"""
    print(f"Parsing VCF: {vcf_path}")
    samples, variants = parse_vcf(vcf_path)

    print(f"Found {len(samples)} samples: {samples}")
    print(f"Total variants: {len(variants)}")

    # Identify sample roles based on file naming convention
    # Expected: I-1 (father), I-2 (mother), II-3 (proband)
    proband = None
    father = None
    mother = None

    for s in samples:
        s_upper = s.upper()
        if 'II-3' in s_upper or 'PROBAND' in s_upper:
            proband = s
        elif 'I-1' in s_upper:
            father = s
        elif 'I-2' in s_upper:
            mother = s

    if not all([proband, father, mother]):
        # Fallback: assume order is proband, father, mother
        if len(samples) >= 3:
            proband = samples[0]
            father = samples[1]
            mother = samples[2]
        else:
            print("ERROR: Could not identify trio samples")
            return

    print(f"\nTrio identified:")
    print(f"  Proband: {proband}")
    print(f"  Father:  {father}")
    print(f"  Mother:  {mother}")

    # Analysis
    print("\n" + "="*80)
    print("TRIO ANALYSIS RESULTS")
    print("="*80)

    # De novo variants
    de_novo = identify_de_novo(variants, proband, father, mother)
    de_novo_high = filter_by_impact(de_novo, ['HIGH', 'MODERATE'])

    print(f"\n1. DE NOVO VARIANTS")
    print(f"   Total de novo: {len(de_novo)}")
    print(f"   HIGH/MODERATE impact: {len(de_novo_high)}")

    # Compound heterozygous
    compound_het = identify_compound_het(variants, proband, father, mother)

    print(f"\n2. COMPOUND HETEROZYGOUS GENES")
    print(f"   Genes with compound het: {len(compound_het)}")

    # Homozygous recessive
    hom_rec = identify_homozygous_recessive(variants, proband, father, mother)
    hom_rec_high = filter_by_impact(hom_rec, ['HIGH', 'MODERATE'])

    print(f"\n3. HOMOZYGOUS RECESSIVE VARIANTS")
    print(f"   Total: {len(hom_rec)}")
    print(f"   HIGH/MODERATE impact: {len(hom_rec_high)}")

    # Generate detailed report
    with open(output_path, 'w') as f:
        f.write("# Trio WES Analysis Report\n")
        f.write(f"# Generated from: {vcf_path}\n")
        f.write(f"# Samples: Proband={proband}, Father={father}, Mother={mother}\n")
        f.write(f"# Total variants analyzed: {len(variants)}\n\n")

        # De novo HIGH/MODERATE impact
        f.write("## DE NOVO VARIANTS (HIGH/MODERATE IMPACT)\n")
        f.write("CHROM\tPOS\tREF\tALT\tGENE\tEFFECT\tIMPACT\tPROBAND_GT\tFATHER_GT\tMOTHER_GT\n")
        for v in sorted(de_novo_high, key=lambda x: (x.chrom, x.pos)):
            f.write(f"{v.chrom}\t{v.pos}\t{v.ref}\t{v.alt}\t{v.gene or 'N/A'}\t")
            f.write(f"{v.effect or 'N/A'}\t{v.impact or 'N/A'}\t")
            f.write(f"{v.genotypes.get(proband, './.')}\t")
            f.write(f"{v.genotypes.get(father, './.')}\t")
            f.write(f"{v.genotypes.get(mother, './.')}\n")

        # Compound heterozygous
        f.write("\n## COMPOUND HETEROZYGOUS GENES\n")
        for gene, vars_list in sorted(compound_het.items()):
            high_impact = [v for v in vars_list if v.impact in ['HIGH', 'MODERATE']]
            if high_impact:
                f.write(f"\n### {gene} ({len(vars_list)} variants, {len(high_impact)} HIGH/MODERATE)\n")
                f.write("CHROM\tPOS\tREF\tALT\tEFFECT\tIMPACT\tPROBAND_GT\tFATHER_GT\tMOTHER_GT\n")
                for v in sorted(high_impact, key=lambda x: x.pos):
                    f.write(f"{v.chrom}\t{v.pos}\t{v.ref}\t{v.alt}\t")
                    f.write(f"{v.effect or 'N/A'}\t{v.impact or 'N/A'}\t")
                    f.write(f"{v.genotypes.get(proband, './.')}\t")
                    f.write(f"{v.genotypes.get(father, './.')}\t")
                    f.write(f"{v.genotypes.get(mother, './.')}\n")

        # Homozygous recessive HIGH/MODERATE
        f.write("\n## HOMOZYGOUS RECESSIVE VARIANTS (HIGH/MODERATE IMPACT)\n")
        f.write("CHROM\tPOS\tREF\tALT\tGENE\tEFFECT\tIMPACT\tPROBAND_GT\tFATHER_GT\tMOTHER_GT\n")
        for v in sorted(hom_rec_high, key=lambda x: (x.chrom, x.pos)):
            f.write(f"{v.chrom}\t{v.pos}\t{v.ref}\t{v.alt}\t{v.gene or 'N/A'}\t")
            f.write(f"{v.effect or 'N/A'}\t{v.impact or 'N/A'}\t")
            f.write(f"{v.genotypes.get(proband, './.')}\t")
            f.write(f"{v.genotypes.get(father, './.')}\t")
            f.write(f"{v.genotypes.get(mother, './.')}\n")

        # Summary statistics
        f.write("\n## SUMMARY STATISTICS\n")
        f.write(f"Total variants: {len(variants)}\n")
        f.write(f"De novo variants: {len(de_novo)}\n")
        f.write(f"De novo HIGH/MODERATE: {len(de_novo_high)}\n")
        f.write(f"Compound het genes: {len(compound_het)}\n")
        f.write(f"Homozygous recessive: {len(hom_rec)}\n")
        f.write(f"Homozygous recessive HIGH/MODERATE: {len(hom_rec_high)}\n")

    print(f"\nReport saved to: {output_path}")

    # Also print top candidates
    print("\n" + "="*80)
    print("TOP CANDIDATE VARIANTS")
    print("="*80)

    print("\n--- De Novo HIGH Impact ---")
    de_novo_high_only = [v for v in de_novo if v.impact == 'HIGH']
    for v in de_novo_high_only[:10]:
        print(f"  {v.chrom}:{v.pos} {v.ref}>{v.alt} | {v.gene} | {v.effect}")

    print("\n--- Compound Het Genes (with HIGH impact) ---")
    for gene, vars_list in list(compound_het.items())[:10]:
        high_count = sum(1 for v in vars_list if v.impact == 'HIGH')
        if high_count > 0:
            print(f"  {gene}: {len(vars_list)} variants ({high_count} HIGH)")

    print("\n--- Homozygous Recessive HIGH Impact ---")
    hom_rec_high_only = [v for v in hom_rec if v.impact == 'HIGH']
    for v in hom_rec_high_only[:10]:
        print(f"  {v.chrom}:{v.pos} {v.ref}>{v.alt} | {v.gene} | {v.effect}")

if __name__ == '__main__':
    import sys

    vcf_path = sys.argv[1] if len(sys.argv) > 1 else '/Volumes/NV2/genomics_analysis/vcf/trio_joint.snpeff.vcf'
    output_path = sys.argv[2] if len(sys.argv) > 2 else '/Volumes/NV2/genomics_analysis/trio_analysis_report.txt'

    generate_report(vcf_path, output_path)