Refactor: Replace scaffolding with working analysis scripts
- Add trio_analysis.py for trio-based variant analysis with de novo detection - Add clinvar_acmg_annotate.py for ClinVar/ACMG annotation - Add gwas_comprehensive.py with 201 SNPs across 18 categories - Add pharmgkb_full_analysis.py for pharmacogenomics analysis - Add gwas_trait_lookup.py for basic GWAS trait lookup - Add pharmacogenomics.py for basic PGx analysis - Remove unused scaffolding code (src/, configs/, docs/, tests/) - Update README.md with new documentation 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
376
trio_analysis.py
Normal file
376
trio_analysis.py
Normal file
@@ -0,0 +1,376 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Trio WES Analysis Script
|
||||
Analyzes trio VCF for de novo mutations, compound heterozygous variants,
|
||||
and potential pathogenic variants.
|
||||
"""
|
||||
|
||||
import gzip
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Dict, Optional, Tuple
|
||||
import json
|
||||
|
||||
@dataclass
|
||||
class Variant:
|
||||
chrom: str
|
||||
pos: int
|
||||
ref: str
|
||||
alt: str
|
||||
qual: float
|
||||
filter_status: str
|
||||
info: str
|
||||
genotypes: Dict[str, str] # sample -> genotype
|
||||
annotation: Optional[str] = None
|
||||
gene: Optional[str] = None
|
||||
effect: Optional[str] = None
|
||||
impact: Optional[str] = None
|
||||
|
||||
def parse_genotype(gt_field: str) -> Tuple[str, int, int]:
|
||||
"""Parse genotype field, return (gt_string, ref_count, alt_count)"""
|
||||
parts = gt_field.split(':')
|
||||
gt = parts[0]
|
||||
|
||||
if gt in ['./.', '.|.', '.']:
|
||||
return gt, 0, 0
|
||||
|
||||
alleles = re.split('[/|]', gt)
|
||||
ref_count = sum(1 for a in alleles if a == '0')
|
||||
alt_count = sum(1 for a in alleles if a != '0' and a != '.')
|
||||
|
||||
return gt, ref_count, alt_count
|
||||
|
||||
def get_genotype_class(gt: str) -> str:
|
||||
"""Classify genotype as HOM_REF, HET, HOM_ALT, or MISSING"""
|
||||
if gt in ['./.', '.|.', '.']:
|
||||
return 'MISSING'
|
||||
|
||||
alleles = re.split('[/|]', gt)
|
||||
if all(a == '0' for a in alleles):
|
||||
return 'HOM_REF'
|
||||
elif all(a != '0' and a != '.' for a in alleles):
|
||||
return 'HOM_ALT'
|
||||
else:
|
||||
return 'HET'
|
||||
|
||||
def parse_snpeff_annotation(info: str) -> Dict:
|
||||
"""Parse SnpEff ANN field"""
|
||||
result = {
|
||||
'gene': None,
|
||||
'effect': None,
|
||||
'impact': None,
|
||||
'hgvs_c': None,
|
||||
'hgvs_p': None,
|
||||
}
|
||||
|
||||
ann_match = re.search(r'ANN=([^;]+)', info)
|
||||
if not ann_match:
|
||||
return result
|
||||
|
||||
ann_field = ann_match.group(1)
|
||||
annotations = ann_field.split(',')
|
||||
|
||||
if annotations:
|
||||
# Take the first (most severe) annotation
|
||||
parts = annotations[0].split('|')
|
||||
if len(parts) >= 4:
|
||||
result['effect'] = parts[1] if len(parts) > 1 else None
|
||||
result['impact'] = parts[2] if len(parts) > 2 else None
|
||||
result['gene'] = parts[3] if len(parts) > 3 else None
|
||||
if len(parts) > 9:
|
||||
result['hgvs_c'] = parts[9]
|
||||
if len(parts) > 10:
|
||||
result['hgvs_p'] = parts[10]
|
||||
|
||||
return result
|
||||
|
||||
def parse_vcf(vcf_path: str) -> Tuple[List[str], List[Variant]]:
|
||||
"""Parse VCF file and return sample names and variants"""
|
||||
samples = []
|
||||
variants = []
|
||||
|
||||
open_func = gzip.open if vcf_path.endswith('.gz') else open
|
||||
mode = 'rt' if vcf_path.endswith('.gz') else 'r'
|
||||
|
||||
with open_func(vcf_path, mode) as f:
|
||||
for line in f:
|
||||
if line.startswith('##'):
|
||||
continue
|
||||
elif line.startswith('#CHROM'):
|
||||
parts = line.strip().split('\t')
|
||||
samples = parts[9:]
|
||||
continue
|
||||
|
||||
parts = line.strip().split('\t')
|
||||
if len(parts) < 10:
|
||||
continue
|
||||
|
||||
chrom, pos, _, ref, alt, qual, filt, info, fmt = parts[:9]
|
||||
gt_fields = parts[9:]
|
||||
|
||||
# Parse genotypes
|
||||
genotypes = {}
|
||||
fmt_fields = fmt.split(':')
|
||||
gt_idx = fmt_fields.index('GT') if 'GT' in fmt_fields else 0
|
||||
|
||||
for i, sample in enumerate(samples):
|
||||
gt_parts = gt_fields[i].split(':')
|
||||
genotypes[sample] = gt_parts[gt_idx] if gt_idx < len(gt_parts) else './.'
|
||||
|
||||
# Parse annotation
|
||||
ann = parse_snpeff_annotation(info)
|
||||
|
||||
try:
|
||||
qual_val = float(qual) if qual != '.' else 0
|
||||
except ValueError:
|
||||
qual_val = 0
|
||||
|
||||
variant = Variant(
|
||||
chrom=chrom,
|
||||
pos=int(pos),
|
||||
ref=ref,
|
||||
alt=alt,
|
||||
qual=qual_val,
|
||||
filter_status=filt,
|
||||
info=info,
|
||||
genotypes=genotypes,
|
||||
annotation=info,
|
||||
gene=ann['gene'],
|
||||
effect=ann['effect'],
|
||||
impact=ann['impact']
|
||||
)
|
||||
variants.append(variant)
|
||||
|
||||
return samples, variants
|
||||
|
||||
def identify_de_novo(variants: List[Variant], proband: str, father: str, mother: str) -> List[Variant]:
|
||||
"""Identify de novo variants: present in proband but absent in both parents"""
|
||||
de_novo = []
|
||||
|
||||
for v in variants:
|
||||
if proband not in v.genotypes or father not in v.genotypes or mother not in v.genotypes:
|
||||
continue
|
||||
|
||||
proband_gt = get_genotype_class(v.genotypes[proband])
|
||||
father_gt = get_genotype_class(v.genotypes[father])
|
||||
mother_gt = get_genotype_class(v.genotypes[mother])
|
||||
|
||||
# De novo: proband has variant, both parents are HOM_REF
|
||||
if proband_gt in ['HET', 'HOM_ALT'] and father_gt == 'HOM_REF' and mother_gt == 'HOM_REF':
|
||||
de_novo.append(v)
|
||||
|
||||
return de_novo
|
||||
|
||||
def identify_compound_het(variants: List[Variant], proband: str, father: str, mother: str) -> Dict[str, List[Variant]]:
|
||||
"""Identify compound heterozygous variants in genes"""
|
||||
gene_variants = defaultdict(list)
|
||||
|
||||
# Group HET variants by gene
|
||||
for v in variants:
|
||||
if not v.gene:
|
||||
continue
|
||||
|
||||
if proband not in v.genotypes:
|
||||
continue
|
||||
|
||||
proband_gt = get_genotype_class(v.genotypes[proband])
|
||||
if proband_gt != 'HET':
|
||||
continue
|
||||
|
||||
gene_variants[v.gene].append(v)
|
||||
|
||||
# Find compound het (>1 HET variant in same gene, inherited from different parents)
|
||||
compound_het = {}
|
||||
|
||||
for gene, vars_list in gene_variants.items():
|
||||
if len(vars_list) < 2:
|
||||
continue
|
||||
|
||||
maternal_inherited = []
|
||||
paternal_inherited = []
|
||||
|
||||
for v in vars_list:
|
||||
if father not in v.genotypes or mother not in v.genotypes:
|
||||
continue
|
||||
|
||||
father_gt = get_genotype_class(v.genotypes[father])
|
||||
mother_gt = get_genotype_class(v.genotypes[mother])
|
||||
|
||||
if father_gt in ['HET', 'HOM_ALT'] and mother_gt == 'HOM_REF':
|
||||
paternal_inherited.append(v)
|
||||
elif mother_gt in ['HET', 'HOM_ALT'] and father_gt == 'HOM_REF':
|
||||
maternal_inherited.append(v)
|
||||
|
||||
if maternal_inherited and paternal_inherited:
|
||||
compound_het[gene] = maternal_inherited + paternal_inherited
|
||||
|
||||
return compound_het
|
||||
|
||||
def identify_homozygous_recessive(variants: List[Variant], proband: str, father: str, mother: str) -> List[Variant]:
|
||||
"""Identify homozygous recessive variants: HOM_ALT in proband, both parents HET"""
|
||||
hom_rec = []
|
||||
|
||||
for v in variants:
|
||||
if proband not in v.genotypes or father not in v.genotypes or mother not in v.genotypes:
|
||||
continue
|
||||
|
||||
proband_gt = get_genotype_class(v.genotypes[proband])
|
||||
father_gt = get_genotype_class(v.genotypes[father])
|
||||
mother_gt = get_genotype_class(v.genotypes[mother])
|
||||
|
||||
# Homozygous recessive: proband HOM_ALT, both parents HET
|
||||
if proband_gt == 'HOM_ALT' and father_gt == 'HET' and mother_gt == 'HET':
|
||||
hom_rec.append(v)
|
||||
|
||||
return hom_rec
|
||||
|
||||
def filter_by_impact(variants: List[Variant], impacts: List[str] = ['HIGH', 'MODERATE']) -> List[Variant]:
|
||||
"""Filter variants by impact level"""
|
||||
return [v for v in variants if v.impact in impacts]
|
||||
|
||||
def generate_report(vcf_path: str, output_path: str):
|
||||
"""Generate trio analysis report"""
|
||||
print(f"Parsing VCF: {vcf_path}")
|
||||
samples, variants = parse_vcf(vcf_path)
|
||||
|
||||
print(f"Found {len(samples)} samples: {samples}")
|
||||
print(f"Total variants: {len(variants)}")
|
||||
|
||||
# Identify sample roles based on file naming convention
|
||||
# Expected: I-1 (father), I-2 (mother), II-3 (proband)
|
||||
proband = None
|
||||
father = None
|
||||
mother = None
|
||||
|
||||
for s in samples:
|
||||
s_upper = s.upper()
|
||||
if 'II-3' in s_upper or 'PROBAND' in s_upper:
|
||||
proband = s
|
||||
elif 'I-1' in s_upper:
|
||||
father = s
|
||||
elif 'I-2' in s_upper:
|
||||
mother = s
|
||||
|
||||
if not all([proband, father, mother]):
|
||||
# Fallback: assume order is proband, father, mother
|
||||
if len(samples) >= 3:
|
||||
proband = samples[0]
|
||||
father = samples[1]
|
||||
mother = samples[2]
|
||||
else:
|
||||
print("ERROR: Could not identify trio samples")
|
||||
return
|
||||
|
||||
print(f"\nTrio identified:")
|
||||
print(f" Proband: {proband}")
|
||||
print(f" Father: {father}")
|
||||
print(f" Mother: {mother}")
|
||||
|
||||
# Analysis
|
||||
print("\n" + "="*80)
|
||||
print("TRIO ANALYSIS RESULTS")
|
||||
print("="*80)
|
||||
|
||||
# De novo variants
|
||||
de_novo = identify_de_novo(variants, proband, father, mother)
|
||||
de_novo_high = filter_by_impact(de_novo, ['HIGH', 'MODERATE'])
|
||||
|
||||
print(f"\n1. DE NOVO VARIANTS")
|
||||
print(f" Total de novo: {len(de_novo)}")
|
||||
print(f" HIGH/MODERATE impact: {len(de_novo_high)}")
|
||||
|
||||
# Compound heterozygous
|
||||
compound_het = identify_compound_het(variants, proband, father, mother)
|
||||
|
||||
print(f"\n2. COMPOUND HETEROZYGOUS GENES")
|
||||
print(f" Genes with compound het: {len(compound_het)}")
|
||||
|
||||
# Homozygous recessive
|
||||
hom_rec = identify_homozygous_recessive(variants, proband, father, mother)
|
||||
hom_rec_high = filter_by_impact(hom_rec, ['HIGH', 'MODERATE'])
|
||||
|
||||
print(f"\n3. HOMOZYGOUS RECESSIVE VARIANTS")
|
||||
print(f" Total: {len(hom_rec)}")
|
||||
print(f" HIGH/MODERATE impact: {len(hom_rec_high)}")
|
||||
|
||||
# Generate detailed report
|
||||
with open(output_path, 'w') as f:
|
||||
f.write("# Trio WES Analysis Report\n")
|
||||
f.write(f"# Generated from: {vcf_path}\n")
|
||||
f.write(f"# Samples: Proband={proband}, Father={father}, Mother={mother}\n")
|
||||
f.write(f"# Total variants analyzed: {len(variants)}\n\n")
|
||||
|
||||
# De novo HIGH/MODERATE impact
|
||||
f.write("## DE NOVO VARIANTS (HIGH/MODERATE IMPACT)\n")
|
||||
f.write("CHROM\tPOS\tREF\tALT\tGENE\tEFFECT\tIMPACT\tPROBAND_GT\tFATHER_GT\tMOTHER_GT\n")
|
||||
for v in sorted(de_novo_high, key=lambda x: (x.chrom, x.pos)):
|
||||
f.write(f"{v.chrom}\t{v.pos}\t{v.ref}\t{v.alt}\t{v.gene or 'N/A'}\t")
|
||||
f.write(f"{v.effect or 'N/A'}\t{v.impact or 'N/A'}\t")
|
||||
f.write(f"{v.genotypes.get(proband, './.')}\t")
|
||||
f.write(f"{v.genotypes.get(father, './.')}\t")
|
||||
f.write(f"{v.genotypes.get(mother, './.')}\n")
|
||||
|
||||
# Compound heterozygous
|
||||
f.write("\n## COMPOUND HETEROZYGOUS GENES\n")
|
||||
for gene, vars_list in sorted(compound_het.items()):
|
||||
high_impact = [v for v in vars_list if v.impact in ['HIGH', 'MODERATE']]
|
||||
if high_impact:
|
||||
f.write(f"\n### {gene} ({len(vars_list)} variants, {len(high_impact)} HIGH/MODERATE)\n")
|
||||
f.write("CHROM\tPOS\tREF\tALT\tEFFECT\tIMPACT\tPROBAND_GT\tFATHER_GT\tMOTHER_GT\n")
|
||||
for v in sorted(high_impact, key=lambda x: x.pos):
|
||||
f.write(f"{v.chrom}\t{v.pos}\t{v.ref}\t{v.alt}\t")
|
||||
f.write(f"{v.effect or 'N/A'}\t{v.impact or 'N/A'}\t")
|
||||
f.write(f"{v.genotypes.get(proband, './.')}\t")
|
||||
f.write(f"{v.genotypes.get(father, './.')}\t")
|
||||
f.write(f"{v.genotypes.get(mother, './.')}\n")
|
||||
|
||||
# Homozygous recessive HIGH/MODERATE
|
||||
f.write("\n## HOMOZYGOUS RECESSIVE VARIANTS (HIGH/MODERATE IMPACT)\n")
|
||||
f.write("CHROM\tPOS\tREF\tALT\tGENE\tEFFECT\tIMPACT\tPROBAND_GT\tFATHER_GT\tMOTHER_GT\n")
|
||||
for v in sorted(hom_rec_high, key=lambda x: (x.chrom, x.pos)):
|
||||
f.write(f"{v.chrom}\t{v.pos}\t{v.ref}\t{v.alt}\t{v.gene or 'N/A'}\t")
|
||||
f.write(f"{v.effect or 'N/A'}\t{v.impact or 'N/A'}\t")
|
||||
f.write(f"{v.genotypes.get(proband, './.')}\t")
|
||||
f.write(f"{v.genotypes.get(father, './.')}\t")
|
||||
f.write(f"{v.genotypes.get(mother, './.')}\n")
|
||||
|
||||
# Summary statistics
|
||||
f.write("\n## SUMMARY STATISTICS\n")
|
||||
f.write(f"Total variants: {len(variants)}\n")
|
||||
f.write(f"De novo variants: {len(de_novo)}\n")
|
||||
f.write(f"De novo HIGH/MODERATE: {len(de_novo_high)}\n")
|
||||
f.write(f"Compound het genes: {len(compound_het)}\n")
|
||||
f.write(f"Homozygous recessive: {len(hom_rec)}\n")
|
||||
f.write(f"Homozygous recessive HIGH/MODERATE: {len(hom_rec_high)}\n")
|
||||
|
||||
print(f"\nReport saved to: {output_path}")
|
||||
|
||||
# Also print top candidates
|
||||
print("\n" + "="*80)
|
||||
print("TOP CANDIDATE VARIANTS")
|
||||
print("="*80)
|
||||
|
||||
print("\n--- De Novo HIGH Impact ---")
|
||||
de_novo_high_only = [v for v in de_novo if v.impact == 'HIGH']
|
||||
for v in de_novo_high_only[:10]:
|
||||
print(f" {v.chrom}:{v.pos} {v.ref}>{v.alt} | {v.gene} | {v.effect}")
|
||||
|
||||
print("\n--- Compound Het Genes (with HIGH impact) ---")
|
||||
for gene, vars_list in list(compound_het.items())[:10]:
|
||||
high_count = sum(1 for v in vars_list if v.impact == 'HIGH')
|
||||
if high_count > 0:
|
||||
print(f" {gene}: {len(vars_list)} variants ({high_count} HIGH)")
|
||||
|
||||
print("\n--- Homozygous Recessive HIGH Impact ---")
|
||||
hom_rec_high_only = [v for v in hom_rec if v.impact == 'HIGH']
|
||||
for v in hom_rec_high_only[:10]:
|
||||
print(f" {v.chrom}:{v.pos} {v.ref}>{v.alt} | {v.gene} | {v.effect}")
|
||||
|
||||
if __name__ == '__main__':
|
||||
import sys
|
||||
|
||||
vcf_path = sys.argv[1] if len(sys.argv) > 1 else '/Volumes/NV2/genomics_analysis/vcf/trio_joint.snpeff.vcf'
|
||||
output_path = sys.argv[2] if len(sys.argv) > 2 else '/Volumes/NV2/genomics_analysis/trio_analysis_report.txt'
|
||||
|
||||
generate_report(vcf_path, output_path)
|
||||
Reference in New Issue
Block a user