#!/usr/bin/env python3
"""
Trio WES Analysis Script

Analyzes trio VCF for de novo mutations, compound heterozygous variants,
and potential pathogenic variants.
"""

import gzip
import json
import re
import sys
from collections import defaultdict
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

# Genotype strings that mean "no call at all".
_MISSING_GENOTYPES = ('./.', '.|.', '.')

# Impact levels reported by default.
_DEFAULT_IMPACTS = ('HIGH', 'MODERATE')

# Allele separator: '/' (unphased) or '|' (phased).
_ALLELE_SEP = re.compile(r'[/|]')

# ANN must be a whole INFO key, i.e. at the start of INFO or right after ';'.
_ANN_FIELD = re.compile(r'(?:^|;)ANN=([^;]+)')

# Pedigree-style sample identifiers.  The look-arounds stop 'I-1' from
# matching inside 'II-1' / 'III-1' or 'I-10'.
_PROBAND_ID = re.compile(r'(?<!I)II-3(?!\d)')
_FATHER_ID = re.compile(r'(?<!I)I-1(?!\d)')
_MOTHER_ID = re.compile(r'(?<!I)I-2(?!\d)')


@dataclass
class Variant:
    """One VCF data line together with its per-sample genotypes."""

    chrom: str
    pos: int
    ref: str
    alt: str
    qual: float
    filter_status: str
    info: str
    genotypes: Dict[str, str]  # sample -> genotype
    annotation: Optional[str] = None
    gene: Optional[str] = None
    effect: Optional[str] = None
    impact: Optional[str] = None


def parse_genotype(gt_field: str) -> Tuple[str, int, int]:
    """Parse genotype field, return (gt_string, ref_count, alt_count).

    ``gt_field`` is a colon-separated sample column; only its first
    sub-field is inspected.  Missing alleles ('.') count toward neither
    total.
    """
    gt = gt_field.split(':')[0]
    if gt in _MISSING_GENOTYPES:
        return gt, 0, 0

    alleles = _ALLELE_SEP.split(gt)
    ref_count = sum(1 for a in alleles if a == '0')
    alt_count = sum(1 for a in alleles if a not in ('0', '.'))
    return gt, ref_count, alt_count


def get_genotype_class(gt: str) -> str:
    """Classify genotype as HOM_REF, HET, HOM_ALT, or MISSING.

    Rules:
      * no called allele at all ('./.', '.', '././.')      -> MISSING
      * only reference alleles, fully called ('0/0', '0')  -> HOM_REF
      * reference plus missing alleles ('0/.')             -> MISSING
        (there is no evidence of an ALT allele, so it must not be HET)
      * only non-reference alleles, fully called ('1/1')   -> HOM_ALT
      * anything else ('0/1', './1')                       -> HET

    NOTE(review): a genotype with two different ALT alleles ('1/2') is
    reported as HOM_ALT because every allele is non-reference; callers that
    need true homozygosity should compare the allele indices themselves.
    """
    if gt in _MISSING_GENOTYPES:
        return 'MISSING'

    alleles = _ALLELE_SEP.split(gt)
    called = [a for a in alleles if a != '.']
    if not called:
        return 'MISSING'

    fully_called = len(called) == len(alleles)
    if all(a == '0' for a in called):
        return 'HOM_REF' if fully_called else 'MISSING'
    if fully_called and all(a != '0' for a in called):
        return 'HOM_ALT'
    return 'HET'


def parse_snpeff_annotation(info: str) -> Dict:
    """Parse SnpEff ANN field.

    Only the first comma-separated annotation entry is used.  Returns a dict
    with keys 'gene', 'effect', 'impact', 'hgvs_c' and 'hgvs_p'; a value is
    None when INFO has no ANN key or the entry has too few '|' fields.
    """
    result = {
        'gene': None,
        'effect': None,
        'impact': None,
        'hgvs_c': None,
        'hgvs_p': None,
    }

    ann_match = _ANN_FIELD.search(info)
    if not ann_match:
        return result

    # Take the first (most severe) annotation
    parts = ann_match.group(1).split(',')[0].split('|')
    if len(parts) >= 4:
        result['effect'] = parts[1]
        result['impact'] = parts[2]
        result['gene'] = parts[3]
    if len(parts) > 9:
        result['hgvs_c'] = parts[9]
    if len(parts) > 10:
        result['hgvs_p'] = parts[10]
    return result


def _extract_genotypes(samples: List[str], fmt: str,
                       gt_fields: List[str]) -> Dict[str, str]:
    """Map each sample name to its GT string ('./.' when unavailable)."""
    fmt_keys = fmt.split(':')
    # Without a GT key there is no genotype to read; falling back to the
    # first sub-field would misread e.g. a depth value as an allele.
    gt_idx = fmt_keys.index('GT') if 'GT' in fmt_keys else None

    genotypes = {}
    for i, sample in enumerate(samples):
        gt = './.'
        if gt_idx is not None and i < len(gt_fields):
            sub_fields = gt_fields[i].split(':')
            if gt_idx < len(sub_fields):
                gt = sub_fields[gt_idx]
        genotypes[sample] = gt
    return genotypes


def parse_vcf(vcf_path: str) -> Tuple[List[str], List[Variant]]:
    """Parse VCF file and return sample names and variants.

    Paths ending in '.gz' are read through gzip.  Lines with fewer than ten
    tab-separated columns are skipped.  A QUAL of '.' or a non-numeric QUAL
    is stored as 0.  A non-integer POS raises ValueError.
    """
    samples: List[str] = []
    variants: List[Variant] = []

    opener = gzip.open if vcf_path.endswith('.gz') else open
    with opener(vcf_path, 'rt') as f:
        for line in f:
            if line.startswith('##'):
                continue

            # Strip only the line terminator so trailing empty columns
            # are preserved.
            parts = line.rstrip('\r\n').split('\t')
            if line.startswith('#CHROM'):
                samples = parts[9:]
                continue
            if len(parts) < 10:
                continue

            chrom, pos, _, ref, alt, qual, filt, info, fmt = parts[:9]
            genotypes = _extract_genotypes(samples, fmt, parts[9:])
            ann = parse_snpeff_annotation(info)

            try:
                qual_val = float(qual) if qual != '.' else 0
            except ValueError:
                qual_val = 0

            variants.append(Variant(
                chrom=chrom,
                pos=int(pos),
                ref=ref,
                alt=alt,
                qual=qual_val,
                filter_status=filt,
                info=info,
                genotypes=genotypes,
                # Kept as the raw INFO string, as before.
                annotation=info,
                gene=ann['gene'],
                effect=ann['effect'],
                impact=ann['impact'],
            ))

    return samples, variants


def _trio_classes(variant: Variant, proband: str, father: str,
                  mother: str) -> Optional[Tuple[str, str, str]]:
    """Return (proband, father, mother) genotype classes, or None if any
    of the three samples has no genotype entry for this variant."""
    gts = variant.genotypes
    if proband not in gts or father not in gts or mother not in gts:
        return None
    return (
        get_genotype_class(gts[proband]),
        get_genotype_class(gts[father]),
        get_genotype_class(gts[mother]),
    )


def identify_de_novo(variants: List[Variant], proband: str, father: str,
                     mother: str) -> List[Variant]:
    """Identify de novo variants: present in proband but absent in both parents.

    A variant qualifies when the proband is HET or HOM_ALT and both parents
    are HOM_REF.  Variants lacking a genotype for any trio member are skipped.
    """
    de_novo = []
    for v in variants:
        classes = _trio_classes(v, proband, father, mother)
        if classes is None:
            continue
        proband_gt, father_gt, mother_gt = classes
        if (proband_gt in ('HET', 'HOM_ALT')
                and father_gt == 'HOM_REF' and mother_gt == 'HOM_REF'):
            de_novo.append(v)
    return de_novo


def identify_compound_het(variants: List[Variant], proband: str, father: str,
                          mother: str) -> Dict[str, List[Variant]]:
    """Identify compound heterozygous variants in genes.

    Returns a dict gene -> variants for every gene in which the proband has
    at least two HET variants, at least one carried only by the father and
    at least one carried only by the mother.  The list holds the maternal
    variants first, then the paternal ones.
    """
    # Group the proband's HET variants by gene
    het_by_gene = defaultdict(list)
    for v in variants:
        if not v.gene or proband not in v.genotypes:
            continue
        if get_genotype_class(v.genotypes[proband]) == 'HET':
            het_by_gene[v.gene].append(v)

    # Keep genes with >1 HET variant inherited from different parents
    compound_het = {}
    for gene, candidates in het_by_gene.items():
        if len(candidates) < 2:
            continue

        maternal = []
        paternal = []
        for v in candidates:
            if father not in v.genotypes or mother not in v.genotypes:
                continue
            father_gt = get_genotype_class(v.genotypes[father])
            mother_gt = get_genotype_class(v.genotypes[mother])
            if father_gt in ('HET', 'HOM_ALT') and mother_gt == 'HOM_REF':
                paternal.append(v)
            elif mother_gt in ('HET', 'HOM_ALT') and father_gt == 'HOM_REF':
                maternal.append(v)

        if maternal and paternal:
            compound_het[gene] = maternal + paternal

    return compound_het


def identify_homozygous_recessive(variants: List[Variant], proband: str,
                                  father: str, mother: str) -> List[Variant]:
    """Identify homozygous recessive variants: HOM_ALT in proband, both parents HET.

    Variants lacking a genotype for any trio member are skipped.
    """
    hom_rec = []
    for v in variants:
        classes = _trio_classes(v, proband, father, mother)
        if classes == ('HOM_ALT', 'HET', 'HET'):
            hom_rec.append(v)
    return hom_rec


def filter_by_impact(variants: List[Variant],
                     impacts: Optional[List[str]] = None) -> List[Variant]:
    """Filter variants by impact level.

    ``impacts`` defaults to HIGH and MODERATE.  The default is None rather
    than a list literal so no mutable object is shared between calls.
    """
    wanted = _DEFAULT_IMPACTS if impacts is None else impacts
    return [v for v in variants if v.impact in wanted]


def _chrom_sort_key(chrom: str) -> Tuple[int, int, str]:
    """Sort key giving natural chromosome order (chr2 before chr10);
    non-numeric names (X, Y, MT, ...) follow the numeric ones."""
    name = chrom[3:] if chrom.lower().startswith('chr') else chrom
    if name.isdecimal():
        return (0, int(name), '')
    return (1, 0, name)


def _identify_trio(samples: List[str]) -> Tuple[Optional[str], Optional[str],
                                                Optional[str]]:
    """Pick (proband, father, mother) from sample names.

    Expected naming: I-1 (father), I-2 (mother), II-3 or 'PROBAND' (proband).
    A role that no sample matches is returned as None.
    """
    proband = father = mother = None
    for s in samples:
        s_upper = s.upper()
        if _PROBAND_ID.search(s_upper) or 'PROBAND' in s_upper:
            proband = s
        elif _FATHER_ID.search(s_upper):
            father = s
        elif _MOTHER_ID.search(s_upper):
            mother = s
    return proband, father, mother


def _variant_row(v: Variant, trio: Tuple[str, str, str],
                 with_gene: bool) -> str:
    """Format one tab-separated report row (newline-terminated)."""
    cells = [v.chrom, str(v.pos), v.ref, v.alt]
    if with_gene:
        cells.append(v.gene or 'N/A')
    cells.append(v.effect or 'N/A')
    cells.append(v.impact or 'N/A')
    cells.extend(v.genotypes.get(sample, './.') for sample in trio)
    return '\t'.join(cells) + '\n'


def _write_report(output_path: str, vcf_path: str, trio: Tuple[str, str, str],
                  variants: List[Variant], de_novo: List[Variant],
                  de_novo_high: List[Variant],
                  compound_het: Dict[str, List[Variant]],
                  hom_rec: List[Variant],
                  hom_rec_high: List[Variant]) -> None:
    """Write the detailed text report to ``output_path``."""
    proband, father, mother = trio

    def by_locus(v: Variant):
        return (_chrom_sort_key(v.chrom), v.pos)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("# Trio WES Analysis Report\n")
        f.write(f"# Generated from: {vcf_path}\n")
        f.write(f"# Samples: Proband={proband}, Father={father}, Mother={mother}\n")
        f.write(f"# Total variants analyzed: {len(variants)}\n\n")

        # De novo HIGH/MODERATE impact
        f.write("## DE NOVO VARIANTS (HIGH/MODERATE IMPACT)\n")
        f.write("CHROM\tPOS\tREF\tALT\tGENE\tEFFECT\tIMPACT\tPROBAND_GT\tFATHER_GT\tMOTHER_GT\n")
        for v in sorted(de_novo_high, key=by_locus):
            f.write(_variant_row(v, trio, with_gene=True))

        # Compound heterozygous
        f.write("\n## COMPOUND HETEROZYGOUS GENES\n")
        for gene, vars_list in sorted(compound_het.items()):
            high_impact = filter_by_impact(vars_list)
            if not high_impact:
                continue
            f.write(f"\n### {gene} ({len(vars_list)} variants, {len(high_impact)} HIGH/MODERATE)\n")
            f.write("CHROM\tPOS\tREF\tALT\tEFFECT\tIMPACT\tPROBAND_GT\tFATHER_GT\tMOTHER_GT\n")
            for v in sorted(high_impact, key=lambda x: x.pos):
                f.write(_variant_row(v, trio, with_gene=False))

        # Homozygous recessive HIGH/MODERATE
        f.write("\n## HOMOZYGOUS RECESSIVE VARIANTS (HIGH/MODERATE IMPACT)\n")
        f.write("CHROM\tPOS\tREF\tALT\tGENE\tEFFECT\tIMPACT\tPROBAND_GT\tFATHER_GT\tMOTHER_GT\n")
        for v in sorted(hom_rec_high, key=by_locus):
            f.write(_variant_row(v, trio, with_gene=True))

        # Summary statistics
        f.write("\n## SUMMARY STATISTICS\n")
        f.write(f"Total variants: {len(variants)}\n")
        f.write(f"De novo variants: {len(de_novo)}\n")
        f.write(f"De novo HIGH/MODERATE: {len(de_novo_high)}\n")
        f.write(f"Compound het genes: {len(compound_het)}\n")
        f.write(f"Homozygous recessive: {len(hom_rec)}\n")
        f.write(f"Homozygous recessive HIGH/MODERATE: {len(hom_rec_high)}\n")


def _print_top_candidates(de_novo: List[Variant],
                          compound_het: Dict[str, List[Variant]],
                          hom_rec: List[Variant]) -> None:
    """Print up to ten HIGH-impact candidates per category to stdout."""
    print("\n" + "="*80)
    print("TOP CANDIDATE VARIANTS")
    print("="*80)

    print("\n--- De Novo HIGH Impact ---")
    for v in [v for v in de_novo if v.impact == 'HIGH'][:10]:
        print(f" {v.chrom}:{v.pos} {v.ref}>{v.alt} | {v.gene} | {v.effect}")

    print("\n--- Compound Het Genes (with HIGH impact) ---")
    # Filter first, then truncate, so genes without a HIGH variant do not
    # use up the ten available slots.
    genes_with_high = []
    for gene, vars_list in compound_het.items():
        high_count = sum(1 for v in vars_list if v.impact == 'HIGH')
        if high_count > 0:
            genes_with_high.append((gene, len(vars_list), high_count))
    for gene, total, high_count in genes_with_high[:10]:
        print(f" {gene}: {total} variants ({high_count} HIGH)")

    print("\n--- Homozygous Recessive HIGH Impact ---")
    for v in [v for v in hom_rec if v.impact == 'HIGH'][:10]:
        print(f" {v.chrom}:{v.pos} {v.ref}>{v.alt} | {v.gene} | {v.effect}")


def generate_report(vcf_path: str, output_path: str):
    """Generate trio analysis report.

    Parses ``vcf_path``, determines the trio roles, runs the de novo,
    compound-heterozygous and homozygous-recessive analyses, prints a
    summary to stdout and writes the detailed report to ``output_path``.
    Returns None; when fewer than three samples are available and the roles
    cannot be determined, an error is printed and no report is written.
    """
    print(f"Parsing VCF: {vcf_path}")
    samples, variants = parse_vcf(vcf_path)

    print(f"Found {len(samples)} samples: {samples}")
    print(f"Total variants: {len(variants)}")

    # Identify sample roles based on file naming convention
    # Expected: I-1 (father), I-2 (mother), II-3 (proband)
    proband, father, mother = _identify_trio(samples)
    if not all([proband, father, mother]):
        # Fallback: assume order is proband, father, mother
        if len(samples) < 3:
            print("ERROR: Could not identify trio samples")
            return
        print("WARNING: Sample names do not identify the trio; "
              "assuming column order proband, father, mother")
        proband, father, mother = samples[0], samples[1], samples[2]

    print("\nTrio identified:")
    print(f" Proband: {proband}")
    print(f" Father: {father}")
    print(f" Mother: {mother}")

    # Analysis
    print("\n" + "="*80)
    print("TRIO ANALYSIS RESULTS")
    print("="*80)

    # De novo variants
    de_novo = identify_de_novo(variants, proband, father, mother)
    de_novo_high = filter_by_impact(de_novo, ['HIGH', 'MODERATE'])
    print("\n1. DE NOVO VARIANTS")
    print(f" Total de novo: {len(de_novo)}")
    print(f" HIGH/MODERATE impact: {len(de_novo_high)}")

    # Compound heterozygous
    compound_het = identify_compound_het(variants, proband, father, mother)
    print("\n2. COMPOUND HETEROZYGOUS GENES")
    print(f" Genes with compound het: {len(compound_het)}")

    # Homozygous recessive
    hom_rec = identify_homozygous_recessive(variants, proband, father, mother)
    hom_rec_high = filter_by_impact(hom_rec, ['HIGH', 'MODERATE'])
    print("\n3. HOMOZYGOUS RECESSIVE VARIANTS")
    print(f" Total: {len(hom_rec)}")
    print(f" HIGH/MODERATE impact: {len(hom_rec_high)}")

    # Generate detailed report
    _write_report(output_path, vcf_path, (proband, father, mother), variants,
                  de_novo, de_novo_high, compound_het, hom_rec, hom_rec_high)
    print(f"\nReport saved to: {output_path}")

    # Also print top candidates
    _print_top_candidates(de_novo, compound_het, hom_rec)


def main(argv: Optional[List[str]] = None) -> None:
    """Command-line entry point: ``script.py [VCF_PATH [OUTPUT_PATH]]``."""
    args = sys.argv if argv is None else argv
    vcf_path = args[1] if len(args) > 1 else '/Volumes/NV2/genomics_analysis/vcf/trio_joint.snpeff.vcf'
    output_path = args[2] if len(args) > 2 else '/Volumes/NV2/genomics_analysis/trio_analysis_report.txt'
    generate_report(vcf_path, output_path)


if __name__ == '__main__':
    main()