- Add trio_analysis.py for trio-based variant analysis with de novo detection - Add clinvar_acmg_annotate.py for ClinVar/ACMG annotation - Add gwas_comprehensive.py with 201 SNPs across 18 categories - Add pharmgkb_full_analysis.py for pharmacogenomics analysis - Add gwas_trait_lookup.py for basic GWAS trait lookup - Add pharmacogenomics.py for basic PGx analysis - Remove unused scaffolding code (src/, configs/, docs/, tests/) - Update README.md with new documentation 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
377 lines
13 KiB
Python
377 lines
13 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Trio WES Analysis Script
|
|
Analyzes trio VCF for de novo mutations, compound heterozygous variants,
|
|
and potential pathogenic variants.
|
|
"""
|
|
|
|
import gzip
|
|
import re
|
|
from collections import defaultdict
|
|
from dataclasses import dataclass
|
|
from typing import List, Dict, Optional, Tuple
|
|
import json
|
|
|
|
@dataclass
|
|
class Variant:
|
|
chrom: str
|
|
pos: int
|
|
ref: str
|
|
alt: str
|
|
qual: float
|
|
filter_status: str
|
|
info: str
|
|
genotypes: Dict[str, str] # sample -> genotype
|
|
annotation: Optional[str] = None
|
|
gene: Optional[str] = None
|
|
effect: Optional[str] = None
|
|
impact: Optional[str] = None
|
|
|
|
def parse_genotype(gt_field: str) -> Tuple[str, int, int]:
|
|
"""Parse genotype field, return (gt_string, ref_count, alt_count)"""
|
|
parts = gt_field.split(':')
|
|
gt = parts[0]
|
|
|
|
if gt in ['./.', '.|.', '.']:
|
|
return gt, 0, 0
|
|
|
|
alleles = re.split('[/|]', gt)
|
|
ref_count = sum(1 for a in alleles if a == '0')
|
|
alt_count = sum(1 for a in alleles if a != '0' and a != '.')
|
|
|
|
return gt, ref_count, alt_count
|
|
|
|
def get_genotype_class(gt: str) -> str:
|
|
"""Classify genotype as HOM_REF, HET, HOM_ALT, or MISSING"""
|
|
if gt in ['./.', '.|.', '.']:
|
|
return 'MISSING'
|
|
|
|
alleles = re.split('[/|]', gt)
|
|
if all(a == '0' for a in alleles):
|
|
return 'HOM_REF'
|
|
elif all(a != '0' and a != '.' for a in alleles):
|
|
return 'HOM_ALT'
|
|
else:
|
|
return 'HET'
|
|
|
|
def parse_snpeff_annotation(info: str) -> Dict:
|
|
"""Parse SnpEff ANN field"""
|
|
result = {
|
|
'gene': None,
|
|
'effect': None,
|
|
'impact': None,
|
|
'hgvs_c': None,
|
|
'hgvs_p': None,
|
|
}
|
|
|
|
ann_match = re.search(r'ANN=([^;]+)', info)
|
|
if not ann_match:
|
|
return result
|
|
|
|
ann_field = ann_match.group(1)
|
|
annotations = ann_field.split(',')
|
|
|
|
if annotations:
|
|
# Take the first (most severe) annotation
|
|
parts = annotations[0].split('|')
|
|
if len(parts) >= 4:
|
|
result['effect'] = parts[1] if len(parts) > 1 else None
|
|
result['impact'] = parts[2] if len(parts) > 2 else None
|
|
result['gene'] = parts[3] if len(parts) > 3 else None
|
|
if len(parts) > 9:
|
|
result['hgvs_c'] = parts[9]
|
|
if len(parts) > 10:
|
|
result['hgvs_p'] = parts[10]
|
|
|
|
return result
|
|
|
|
def parse_vcf(vcf_path: str) -> Tuple[List[str], List[Variant]]:
|
|
"""Parse VCF file and return sample names and variants"""
|
|
samples = []
|
|
variants = []
|
|
|
|
open_func = gzip.open if vcf_path.endswith('.gz') else open
|
|
mode = 'rt' if vcf_path.endswith('.gz') else 'r'
|
|
|
|
with open_func(vcf_path, mode) as f:
|
|
for line in f:
|
|
if line.startswith('##'):
|
|
continue
|
|
elif line.startswith('#CHROM'):
|
|
parts = line.strip().split('\t')
|
|
samples = parts[9:]
|
|
continue
|
|
|
|
parts = line.strip().split('\t')
|
|
if len(parts) < 10:
|
|
continue
|
|
|
|
chrom, pos, _, ref, alt, qual, filt, info, fmt = parts[:9]
|
|
gt_fields = parts[9:]
|
|
|
|
# Parse genotypes
|
|
genotypes = {}
|
|
fmt_fields = fmt.split(':')
|
|
gt_idx = fmt_fields.index('GT') if 'GT' in fmt_fields else 0
|
|
|
|
for i, sample in enumerate(samples):
|
|
gt_parts = gt_fields[i].split(':')
|
|
genotypes[sample] = gt_parts[gt_idx] if gt_idx < len(gt_parts) else './.'
|
|
|
|
# Parse annotation
|
|
ann = parse_snpeff_annotation(info)
|
|
|
|
try:
|
|
qual_val = float(qual) if qual != '.' else 0
|
|
except ValueError:
|
|
qual_val = 0
|
|
|
|
variant = Variant(
|
|
chrom=chrom,
|
|
pos=int(pos),
|
|
ref=ref,
|
|
alt=alt,
|
|
qual=qual_val,
|
|
filter_status=filt,
|
|
info=info,
|
|
genotypes=genotypes,
|
|
annotation=info,
|
|
gene=ann['gene'],
|
|
effect=ann['effect'],
|
|
impact=ann['impact']
|
|
)
|
|
variants.append(variant)
|
|
|
|
return samples, variants
|
|
|
|
def identify_de_novo(variants: List[Variant], proband: str, father: str, mother: str) -> List[Variant]:
|
|
"""Identify de novo variants: present in proband but absent in both parents"""
|
|
de_novo = []
|
|
|
|
for v in variants:
|
|
if proband not in v.genotypes or father not in v.genotypes or mother not in v.genotypes:
|
|
continue
|
|
|
|
proband_gt = get_genotype_class(v.genotypes[proband])
|
|
father_gt = get_genotype_class(v.genotypes[father])
|
|
mother_gt = get_genotype_class(v.genotypes[mother])
|
|
|
|
# De novo: proband has variant, both parents are HOM_REF
|
|
if proband_gt in ['HET', 'HOM_ALT'] and father_gt == 'HOM_REF' and mother_gt == 'HOM_REF':
|
|
de_novo.append(v)
|
|
|
|
return de_novo
|
|
|
|
def identify_compound_het(variants: List[Variant], proband: str, father: str, mother: str) -> Dict[str, List[Variant]]:
|
|
"""Identify compound heterozygous variants in genes"""
|
|
gene_variants = defaultdict(list)
|
|
|
|
# Group HET variants by gene
|
|
for v in variants:
|
|
if not v.gene:
|
|
continue
|
|
|
|
if proband not in v.genotypes:
|
|
continue
|
|
|
|
proband_gt = get_genotype_class(v.genotypes[proband])
|
|
if proband_gt != 'HET':
|
|
continue
|
|
|
|
gene_variants[v.gene].append(v)
|
|
|
|
# Find compound het (>1 HET variant in same gene, inherited from different parents)
|
|
compound_het = {}
|
|
|
|
for gene, vars_list in gene_variants.items():
|
|
if len(vars_list) < 2:
|
|
continue
|
|
|
|
maternal_inherited = []
|
|
paternal_inherited = []
|
|
|
|
for v in vars_list:
|
|
if father not in v.genotypes or mother not in v.genotypes:
|
|
continue
|
|
|
|
father_gt = get_genotype_class(v.genotypes[father])
|
|
mother_gt = get_genotype_class(v.genotypes[mother])
|
|
|
|
if father_gt in ['HET', 'HOM_ALT'] and mother_gt == 'HOM_REF':
|
|
paternal_inherited.append(v)
|
|
elif mother_gt in ['HET', 'HOM_ALT'] and father_gt == 'HOM_REF':
|
|
maternal_inherited.append(v)
|
|
|
|
if maternal_inherited and paternal_inherited:
|
|
compound_het[gene] = maternal_inherited + paternal_inherited
|
|
|
|
return compound_het
|
|
|
|
def identify_homozygous_recessive(variants: List[Variant], proband: str, father: str, mother: str) -> List[Variant]:
|
|
"""Identify homozygous recessive variants: HOM_ALT in proband, both parents HET"""
|
|
hom_rec = []
|
|
|
|
for v in variants:
|
|
if proband not in v.genotypes or father not in v.genotypes or mother not in v.genotypes:
|
|
continue
|
|
|
|
proband_gt = get_genotype_class(v.genotypes[proband])
|
|
father_gt = get_genotype_class(v.genotypes[father])
|
|
mother_gt = get_genotype_class(v.genotypes[mother])
|
|
|
|
# Homozygous recessive: proband HOM_ALT, both parents HET
|
|
if proband_gt == 'HOM_ALT' and father_gt == 'HET' and mother_gt == 'HET':
|
|
hom_rec.append(v)
|
|
|
|
return hom_rec
|
|
|
|
def filter_by_impact(variants: List[Variant], impacts: List[str] = ['HIGH', 'MODERATE']) -> List[Variant]:
|
|
"""Filter variants by impact level"""
|
|
return [v for v in variants if v.impact in impacts]
|
|
|
|
def generate_report(vcf_path: str, output_path: str):
|
|
"""Generate trio analysis report"""
|
|
print(f"Parsing VCF: {vcf_path}")
|
|
samples, variants = parse_vcf(vcf_path)
|
|
|
|
print(f"Found {len(samples)} samples: {samples}")
|
|
print(f"Total variants: {len(variants)}")
|
|
|
|
# Identify sample roles based on file naming convention
|
|
# Expected: I-1 (father), I-2 (mother), II-3 (proband)
|
|
proband = None
|
|
father = None
|
|
mother = None
|
|
|
|
for s in samples:
|
|
s_upper = s.upper()
|
|
if 'II-3' in s_upper or 'PROBAND' in s_upper:
|
|
proband = s
|
|
elif 'I-1' in s_upper:
|
|
father = s
|
|
elif 'I-2' in s_upper:
|
|
mother = s
|
|
|
|
if not all([proband, father, mother]):
|
|
# Fallback: assume order is proband, father, mother
|
|
if len(samples) >= 3:
|
|
proband = samples[0]
|
|
father = samples[1]
|
|
mother = samples[2]
|
|
else:
|
|
print("ERROR: Could not identify trio samples")
|
|
return
|
|
|
|
print(f"\nTrio identified:")
|
|
print(f" Proband: {proband}")
|
|
print(f" Father: {father}")
|
|
print(f" Mother: {mother}")
|
|
|
|
# Analysis
|
|
print("\n" + "="*80)
|
|
print("TRIO ANALYSIS RESULTS")
|
|
print("="*80)
|
|
|
|
# De novo variants
|
|
de_novo = identify_de_novo(variants, proband, father, mother)
|
|
de_novo_high = filter_by_impact(de_novo, ['HIGH', 'MODERATE'])
|
|
|
|
print(f"\n1. DE NOVO VARIANTS")
|
|
print(f" Total de novo: {len(de_novo)}")
|
|
print(f" HIGH/MODERATE impact: {len(de_novo_high)}")
|
|
|
|
# Compound heterozygous
|
|
compound_het = identify_compound_het(variants, proband, father, mother)
|
|
|
|
print(f"\n2. COMPOUND HETEROZYGOUS GENES")
|
|
print(f" Genes with compound het: {len(compound_het)}")
|
|
|
|
# Homozygous recessive
|
|
hom_rec = identify_homozygous_recessive(variants, proband, father, mother)
|
|
hom_rec_high = filter_by_impact(hom_rec, ['HIGH', 'MODERATE'])
|
|
|
|
print(f"\n3. HOMOZYGOUS RECESSIVE VARIANTS")
|
|
print(f" Total: {len(hom_rec)}")
|
|
print(f" HIGH/MODERATE impact: {len(hom_rec_high)}")
|
|
|
|
# Generate detailed report
|
|
with open(output_path, 'w') as f:
|
|
f.write("# Trio WES Analysis Report\n")
|
|
f.write(f"# Generated from: {vcf_path}\n")
|
|
f.write(f"# Samples: Proband={proband}, Father={father}, Mother={mother}\n")
|
|
f.write(f"# Total variants analyzed: {len(variants)}\n\n")
|
|
|
|
# De novo HIGH/MODERATE impact
|
|
f.write("## DE NOVO VARIANTS (HIGH/MODERATE IMPACT)\n")
|
|
f.write("CHROM\tPOS\tREF\tALT\tGENE\tEFFECT\tIMPACT\tPROBAND_GT\tFATHER_GT\tMOTHER_GT\n")
|
|
for v in sorted(de_novo_high, key=lambda x: (x.chrom, x.pos)):
|
|
f.write(f"{v.chrom}\t{v.pos}\t{v.ref}\t{v.alt}\t{v.gene or 'N/A'}\t")
|
|
f.write(f"{v.effect or 'N/A'}\t{v.impact or 'N/A'}\t")
|
|
f.write(f"{v.genotypes.get(proband, './.')}\t")
|
|
f.write(f"{v.genotypes.get(father, './.')}\t")
|
|
f.write(f"{v.genotypes.get(mother, './.')}\n")
|
|
|
|
# Compound heterozygous
|
|
f.write("\n## COMPOUND HETEROZYGOUS GENES\n")
|
|
for gene, vars_list in sorted(compound_het.items()):
|
|
high_impact = [v for v in vars_list if v.impact in ['HIGH', 'MODERATE']]
|
|
if high_impact:
|
|
f.write(f"\n### {gene} ({len(vars_list)} variants, {len(high_impact)} HIGH/MODERATE)\n")
|
|
f.write("CHROM\tPOS\tREF\tALT\tEFFECT\tIMPACT\tPROBAND_GT\tFATHER_GT\tMOTHER_GT\n")
|
|
for v in sorted(high_impact, key=lambda x: x.pos):
|
|
f.write(f"{v.chrom}\t{v.pos}\t{v.ref}\t{v.alt}\t")
|
|
f.write(f"{v.effect or 'N/A'}\t{v.impact or 'N/A'}\t")
|
|
f.write(f"{v.genotypes.get(proband, './.')}\t")
|
|
f.write(f"{v.genotypes.get(father, './.')}\t")
|
|
f.write(f"{v.genotypes.get(mother, './.')}\n")
|
|
|
|
# Homozygous recessive HIGH/MODERATE
|
|
f.write("\n## HOMOZYGOUS RECESSIVE VARIANTS (HIGH/MODERATE IMPACT)\n")
|
|
f.write("CHROM\tPOS\tREF\tALT\tGENE\tEFFECT\tIMPACT\tPROBAND_GT\tFATHER_GT\tMOTHER_GT\n")
|
|
for v in sorted(hom_rec_high, key=lambda x: (x.chrom, x.pos)):
|
|
f.write(f"{v.chrom}\t{v.pos}\t{v.ref}\t{v.alt}\t{v.gene or 'N/A'}\t")
|
|
f.write(f"{v.effect or 'N/A'}\t{v.impact or 'N/A'}\t")
|
|
f.write(f"{v.genotypes.get(proband, './.')}\t")
|
|
f.write(f"{v.genotypes.get(father, './.')}\t")
|
|
f.write(f"{v.genotypes.get(mother, './.')}\n")
|
|
|
|
# Summary statistics
|
|
f.write("\n## SUMMARY STATISTICS\n")
|
|
f.write(f"Total variants: {len(variants)}\n")
|
|
f.write(f"De novo variants: {len(de_novo)}\n")
|
|
f.write(f"De novo HIGH/MODERATE: {len(de_novo_high)}\n")
|
|
f.write(f"Compound het genes: {len(compound_het)}\n")
|
|
f.write(f"Homozygous recessive: {len(hom_rec)}\n")
|
|
f.write(f"Homozygous recessive HIGH/MODERATE: {len(hom_rec_high)}\n")
|
|
|
|
print(f"\nReport saved to: {output_path}")
|
|
|
|
# Also print top candidates
|
|
print("\n" + "="*80)
|
|
print("TOP CANDIDATE VARIANTS")
|
|
print("="*80)
|
|
|
|
print("\n--- De Novo HIGH Impact ---")
|
|
de_novo_high_only = [v for v in de_novo if v.impact == 'HIGH']
|
|
for v in de_novo_high_only[:10]:
|
|
print(f" {v.chrom}:{v.pos} {v.ref}>{v.alt} | {v.gene} | {v.effect}")
|
|
|
|
print("\n--- Compound Het Genes (with HIGH impact) ---")
|
|
for gene, vars_list in list(compound_het.items())[:10]:
|
|
high_count = sum(1 for v in vars_list if v.impact == 'HIGH')
|
|
if high_count > 0:
|
|
print(f" {gene}: {len(vars_list)} variants ({high_count} HIGH)")
|
|
|
|
print("\n--- Homozygous Recessive HIGH Impact ---")
|
|
hom_rec_high_only = [v for v in hom_rec if v.impact == 'HIGH']
|
|
for v in hom_rec_high_only[:10]:
|
|
print(f" {v.chrom}:{v.pos} {v.ref}>{v.alt} | {v.gene} | {v.effect}")
|
|
|
|
if __name__ == '__main__':
|
|
import sys
|
|
|
|
vcf_path = sys.argv[1] if len(sys.argv) > 1 else '/Volumes/NV2/genomics_analysis/vcf/trio_joint.snpeff.vcf'
|
|
output_path = sys.argv[2] if len(sys.argv) > 2 else '/Volumes/NV2/genomics_analysis/trio_analysis_report.txt'
|
|
|
|
generate_report(vcf_path, output_path)
|