Refactor: Replace scaffolding with working analysis scripts

- Add trio_analysis.py for trio-based variant analysis with de novo detection
- Add clinvar_acmg_annotate.py for ClinVar/ACMG annotation
- Add gwas_comprehensive.py with 201 SNPs across 18 categories
- Add pharmgkb_full_analysis.py for pharmacogenomics analysis
- Add gwas_trait_lookup.py for basic GWAS trait lookup
- Add pharmacogenomics.py for basic PGx analysis
- Remove unused scaffolding code (src/, configs/, docs/, tests/)
- Update README.md with new documentation

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2025-12-01 22:36:02 +08:00
parent f74dc351f7
commit d13d58df8b
56 changed files with 2608 additions and 2347 deletions

376
trio_analysis.py Normal file
View File

@@ -0,0 +1,376 @@
#!/usr/bin/env python3
"""
Trio WES Analysis Script
Analyzes trio VCF for de novo mutations, compound heterozygous variants,
and potential pathogenic variants.
"""
import gzip
import re
from collections import defaultdict
from dataclasses import dataclass
from typing import List, Dict, Optional, Tuple
import json
@dataclass
class Variant:
    """A single variant record together with its per-sample genotypes.

    The four annotation-related fields are optional and default to ``None``,
    so a record can be created from the mandatory fields alone.
    """

    # Locus and alleles.
    chrom: str
    pos: int
    ref: str
    alt: str
    # Call quality and filter state.
    qual: float
    filter_status: str
    # INFO string of the record.
    info: str
    # Maps sample name -> genotype string.
    genotypes: Dict[str, str]
    # Annotation details; None until they are supplied.
    annotation: Optional[str] = None
    gene: Optional[str] = None
    effect: Optional[str] = None
    impact: Optional[str] = None
def parse_genotype(gt_field: str) -> Tuple[str, int, int]:
    """Split a sample field and count its reference and alternate alleles.

    Only the part before the first ``:`` is inspected. The result is
    ``(gt_string, ref_count, alt_count)``; a fully missing genotype
    (``./.``, ``.|.`` or ``.``) yields zero for both counts.
    """
    gt = gt_field.split(':')[0]
    if gt in ('./.', '.|.', '.'):
        return gt, 0, 0

    ref_count = 0
    alt_count = 0
    for allele in re.split('[/|]', gt):
        if allele == '0':
            ref_count += 1
        elif allele != '.':
            # Anything that is neither reference nor missing counts as ALT.
            alt_count += 1
    return gt, ref_count, alt_count
def get_genotype_class(gt: str) -> str:
    """Classify a genotype string as HOM_REF, HET, HOM_ALT, or MISSING.

    Args:
        gt: The GT value only (for example ``0/1`` or ``1|1``); alleles may
            be separated by ``/`` or ``|``.

    Returns:
        ``'MISSING'`` when the genotype is empty or any allele is uncalled
        (``.``), ``'HOM_REF'`` when every allele is ``0``, ``'HOM_ALT'`` when
        every allele is the same non-reference allele, and ``'HET'``
        otherwise.
    """
    alleles = re.split('[/|]', gt)
    # An incomplete call ('./.', '0/.', './././.') cannot be classified
    # reliably, so it is reported as missing instead of falling through
    # to HET.
    if any(allele in ('', '.') for allele in alleles):
        return 'MISSING'

    distinct = set(alleles)
    if distinct == {'0'}:
        return 'HOM_REF'
    # Homozygous means identical alleles: '1/1' qualifies, '1/2' does not.
    if len(distinct) == 1:
        return 'HOM_ALT'
    return 'HET'
def parse_snpeff_annotation(info: str) -> Dict:
    """Extract fields of the first SnpEff ``ANN`` entry from an INFO string.

    Args:
        info: The ``;``-separated INFO column of a VCF record.

    Returns:
        A dict with the keys ``gene``, ``effect``, ``impact``, ``hgvs_c`` and
        ``hgvs_p``. A value is ``None`` when INFO has no ``ANN`` key, when the
        first entry has fewer than four ``|``-separated fields, or when that
        particular field is absent or empty.
    """
    result = {
        'gene': None,
        'effect': None,
        'impact': None,
        'hgvs_c': None,
        'hgvs_p': None,
    }
    # Anchor on the key boundary so that a different INFO key which merely
    # ends in "ANN" is not mistaken for the ANN field.
    ann_match = re.search(r'(?:^|;)ANN=([^;]+)', info)
    if not ann_match:
        return result

    # Only the first comma-separated entry is used. NOTE(review): SnpEff is
    # expected to list the most severe effect first - confirm against the
    # annotator version in use.
    parts = ann_match.group(1).split(',')[0].split('|')
    if len(parts) < 4:
        return result

    # Position of each reported field inside one '|'-separated entry.
    field_index = {
        'effect': 1,
        'impact': 2,
        'gene': 3,
        'hgvs_c': 9,
        'hgvs_p': 10,
    }
    for key, idx in field_index.items():
        if idx < len(parts):
            # Empty fields are normalised to None, matching the defaults.
            result[key] = parts[idx] or None
    return result
def parse_vcf(vcf_path: str) -> Tuple[List[str], List[Variant]]:
    """Parse a VCF file and return its sample names and variants.

    Files whose name ends in ``.gz`` are read through ``gzip``. Meta lines
    (``##``) are ignored, the ``#CHROM`` line supplies the sample names, and
    data lines with fewer than ten tab-separated columns are skipped.

    Args:
        vcf_path: Path to the plain or gzip-compressed VCF file.

    Returns:
        ``(samples, variants)``. Each variant's ``genotypes`` maps every
        header sample to its GT string, or to ``'./.'`` when the record has
        no GT key in FORMAT or lacks a value for that sample.

    Raises:
        ValueError: If a data line has a non-integer POS column.
        OSError: If the file cannot be opened.
    """
    samples: List[str] = []
    variants: List[Variant] = []
    opener = gzip.open if vcf_path.endswith('.gz') else open
    # 'rt' is text mode for both the builtin and the gzip opener.
    with opener(vcf_path, 'rt') as f:
        for line in f:
            if line.startswith('##'):
                continue
            # Remove only the line terminator so that empty trailing
            # columns are not dropped.
            parts = line.rstrip('\r\n').split('\t')
            if line.startswith('#CHROM'):
                samples = parts[9:]
                continue
            if len(parts) < 10:
                continue

            chrom, pos, _, ref, alt, qual, filt, info, fmt = parts[:9]
            gt_fields = parts[9:]

            # Locate GT inside FORMAT; without it there is no genotype to
            # read, so every sample is recorded as missing.
            fmt_fields = fmt.split(':')
            gt_idx = fmt_fields.index('GT') if 'GT' in fmt_fields else None
            genotypes = {}
            for i, sample in enumerate(samples):
                gt = './.'
                # Rows may carry fewer sample columns than the header.
                if gt_idx is not None and i < len(gt_fields):
                    sample_parts = gt_fields[i].split(':')
                    if gt_idx < len(sample_parts):
                        gt = sample_parts[gt_idx]
                genotypes[sample] = gt

            ann = parse_snpeff_annotation(info)

            try:
                qual_val = float(qual)
            except ValueError:
                # '.' (missing) or an unparsable QUAL is stored as 0.0.
                qual_val = 0.0

            variants.append(Variant(
                chrom=chrom,
                pos=int(pos),
                ref=ref,
                alt=alt,
                qual=qual_val,
                filter_status=filt,
                info=info,
                genotypes=genotypes,
                # The annotation field carries the full INFO string.
                annotation=info,
                gene=ann['gene'],
                effect=ann['effect'],
                impact=ann['impact'],
            ))
    return samples, variants
def identify_de_novo(variants: List[Variant], proband: str, father: str, mother: str) -> List[Variant]:
    """Collect variants the proband carries while both parents are HOM_REF.

    Variants that lack a genotype entry for any of the three samples are
    ignored. Input order is preserved in the returned list.
    """
    trio = (proband, father, mother)
    carrier_classes = ('HET', 'HOM_ALT')
    hits = []
    for variant in variants:
        calls = variant.genotypes
        if any(member not in calls for member in trio):
            continue
        child_cls, dad_cls, mum_cls = [
            get_genotype_class(calls[member]) for member in trio
        ]
        # Proband carries the variant; neither parent shows it.
        if child_cls in carrier_classes and dad_cls == mum_cls == 'HOM_REF':
            hits.append(variant)
    return hits
def identify_compound_het(variants: List[Variant], proband: str, father: str, mother: str) -> Dict[str, List[Variant]]:
    """Find genes in which the proband has HET variants from both parents.

    Returns a dict mapping gene name to the qualifying variants, with the
    maternally inherited ones listed before the paternally inherited ones.
    """
    carrier_classes = ('HET', 'HOM_ALT')

    # Step 1: bucket the proband's heterozygous calls by gene.
    het_by_gene = defaultdict(list)
    for variant in variants:
        if variant.gene and proband in variant.genotypes:
            if get_genotype_class(variant.genotypes[proband]) == 'HET':
                het_by_gene[variant.gene].append(variant)

    # Step 2: keep genes with at least one variant seen only in the father
    # and at least one seen only in the mother.
    result = {}
    for gene, candidates in het_by_gene.items():
        if len(candidates) < 2:
            continue
        from_mother = []
        from_father = []
        for variant in candidates:
            calls = variant.genotypes
            if father not in calls or mother not in calls:
                continue
            dad_cls = get_genotype_class(calls[father])
            mum_cls = get_genotype_class(calls[mother])
            if dad_cls in carrier_classes and mum_cls == 'HOM_REF':
                from_father.append(variant)
            elif mum_cls in carrier_classes and dad_cls == 'HOM_REF':
                from_mother.append(variant)
        if from_mother and from_father:
            result[gene] = from_mother + from_father
    return result
def identify_homozygous_recessive(variants: List[Variant], proband: str, father: str, mother: str) -> List[Variant]:
    """Collect variants that are HOM_ALT in the proband and HET in both parents.

    Variants that lack a genotype entry for any of the three samples are
    ignored. Input order is preserved in the returned list.
    """
    trio = (proband, father, mother)
    # Expected classes, in the same order as ``trio``.
    wanted = ('HOM_ALT', 'HET', 'HET')
    matches = []
    for variant in variants:
        calls = variant.genotypes
        if not all(member in calls for member in trio):
            continue
        observed = tuple(get_genotype_class(calls[member]) for member in trio)
        if observed == wanted:
            matches.append(variant)
    return matches
def filter_by_impact(variants: List["Variant"], impacts: Optional[List[str]] = None) -> List["Variant"]:
    """Return the variants whose ``impact`` is one of *impacts*.

    Args:
        variants: Records to filter; only their ``impact`` attribute is read.
        impacts: Impact levels to keep. When omitted or ``None``, ``HIGH``
            and ``MODERATE`` are kept.

    Returns:
        A new list with the matching variants in their original order.
    """
    # A None sentinel replaces the former list-literal default, which was a
    # single object shared by every call.
    wanted = ('HIGH', 'MODERATE') if impacts is None else impacts
    return [v for v in variants if v.impact in wanted]
def _assign_trio_roles(samples):
    """Pick (proband, father, mother) from sample names, or None if impossible."""
    proband = father = mother = None
    for s in samples:
        s_upper = s.upper()
        # The look-arounds stop a pedigree ID from matching inside a longer
        # one: 'I-1' must not match 'II-1' or 'I-10'.
        if re.search(r'(?<!I)II-3(?!\d)', s_upper) or 'PROBAND' in s_upper:
            proband = s
        elif re.search(r'(?<!I)I-1(?!\d)', s_upper):
            father = s
        elif re.search(r'(?<!I)I-2(?!\d)', s_upper):
            mother = s
    if proband and father and mother:
        return proband, father, mother
    # Fallback: assume order is proband, father, mother
    if len(samples) >= 3:
        return samples[0], samples[1], samples[2]
    return None


def _write_variant_rows(f, variants, trio, include_gene):
    """Write one tab-separated row per variant, ending with the trio genotypes."""
    for v in variants:
        cols = [v.chrom, v.pos, v.ref, v.alt]
        if include_gene:
            cols.append(v.gene or 'N/A')
        cols.extend([v.effect or 'N/A', v.impact or 'N/A'])
        cols.extend(v.genotypes.get(sample, './.') for sample in trio)
        f.write('\t'.join(map(str, cols)) + '\n')


def generate_report(vcf_path: str, output_path: str):
    """Generate the trio analysis report.

    Parses the VCF, assigns proband/father/mother roles from the sample
    names (``II-3`` or ``PROBAND``, ``I-1``, ``I-2``; otherwise the first
    three samples in that order), runs the de novo, compound heterozygous
    and homozygous recessive analyses, writes a text report and prints a
    summary with the top candidates.

    Args:
        vcf_path: Path of the VCF file to analyse.
        output_path: Path of the text report to write.

    Returns:
        None. When the trio cannot be identified an error is printed and
        no report is written.
    """
    print(f"Parsing VCF: {vcf_path}")
    samples, variants = parse_vcf(vcf_path)
    print(f"Found {len(samples)} samples: {samples}")
    print(f"Total variants: {len(variants)}")

    roles = _assign_trio_roles(samples)
    if roles is None:
        print("ERROR: Could not identify trio samples")
        return
    proband, father, mother = roles

    print("\nTrio identified:")
    print(f" Proband: {proband}")
    print(f" Father: {father}")
    print(f" Mother: {mother}")

    # Analysis
    print("\n" + "="*80)
    print("TRIO ANALYSIS RESULTS")
    print("="*80)

    # De novo variants
    de_novo = identify_de_novo(variants, proband, father, mother)
    de_novo_high = filter_by_impact(de_novo, ['HIGH', 'MODERATE'])
    print("\n1. DE NOVO VARIANTS")
    print(f" Total de novo: {len(de_novo)}")
    print(f" HIGH/MODERATE impact: {len(de_novo_high)}")

    # Compound heterozygous
    compound_het = identify_compound_het(variants, proband, father, mother)
    print("\n2. COMPOUND HETEROZYGOUS GENES")
    print(f" Genes with compound het: {len(compound_het)}")

    # Homozygous recessive
    hom_rec = identify_homozygous_recessive(variants, proband, father, mother)
    hom_rec_high = filter_by_impact(hom_rec, ['HIGH', 'MODERATE'])
    print("\n3. HOMOZYGOUS RECESSIVE VARIANTS")
    print(f" Total: {len(hom_rec)}")
    print(f" HIGH/MODERATE impact: {len(hom_rec_high)}")

    # Generate detailed report
    trio = (proband, father, mother)
    gene_header = "CHROM\tPOS\tREF\tALT\tGENE\tEFFECT\tIMPACT\tPROBAND_GT\tFATHER_GT\tMOTHER_GT\n"
    with open(output_path, 'w') as f:
        f.write("# Trio WES Analysis Report\n")
        f.write(f"# Generated from: {vcf_path}\n")
        f.write(f"# Samples: Proband={proband}, Father={father}, Mother={mother}\n")
        f.write(f"# Total variants analyzed: {len(variants)}\n\n")

        # De novo HIGH/MODERATE impact
        f.write("## DE NOVO VARIANTS (HIGH/MODERATE IMPACT)\n")
        f.write(gene_header)
        _write_variant_rows(
            f, sorted(de_novo_high, key=lambda x: (x.chrom, x.pos)),
            trio, include_gene=True)

        # Compound heterozygous: only genes with a HIGH/MODERATE variant
        # get a section, and only those variants are listed.
        f.write("\n## COMPOUND HETEROZYGOUS GENES\n")
        for gene, vars_list in sorted(compound_het.items()):
            high_impact = filter_by_impact(vars_list, ['HIGH', 'MODERATE'])
            if not high_impact:
                continue
            f.write(f"\n### {gene} ({len(vars_list)} variants, {len(high_impact)} HIGH/MODERATE)\n")
            f.write("CHROM\tPOS\tREF\tALT\tEFFECT\tIMPACT\tPROBAND_GT\tFATHER_GT\tMOTHER_GT\n")
            _write_variant_rows(
                f, sorted(high_impact, key=lambda x: x.pos),
                trio, include_gene=False)

        # Homozygous recessive HIGH/MODERATE
        f.write("\n## HOMOZYGOUS RECESSIVE VARIANTS (HIGH/MODERATE IMPACT)\n")
        f.write(gene_header)
        _write_variant_rows(
            f, sorted(hom_rec_high, key=lambda x: (x.chrom, x.pos)),
            trio, include_gene=True)

        # Summary statistics
        f.write("\n## SUMMARY STATISTICS\n")
        f.write(f"Total variants: {len(variants)}\n")
        f.write(f"De novo variants: {len(de_novo)}\n")
        f.write(f"De novo HIGH/MODERATE: {len(de_novo_high)}\n")
        f.write(f"Compound het genes: {len(compound_het)}\n")
        f.write(f"Homozygous recessive: {len(hom_rec)}\n")
        f.write(f"Homozygous recessive HIGH/MODERATE: {len(hom_rec_high)}\n")
    print(f"\nReport saved to: {output_path}")

    # Also print top candidates
    print("\n" + "="*80)
    print("TOP CANDIDATE VARIANTS")
    print("="*80)

    print("\n--- De Novo HIGH Impact ---")
    for v in filter_by_impact(de_novo, ['HIGH'])[:10]:
        print(f" {v.chrom}:{v.pos} {v.ref}>{v.alt} | {v.gene} | {v.effect}")

    print("\n--- Compound Het Genes (with HIGH impact) ---")
    # Filter for HIGH first and cap at ten afterwards; capping first would
    # drop qualifying genes that are not among the first ten.
    shown = 0
    for gene, vars_list in compound_het.items():
        high_count = sum(1 for v in vars_list if v.impact == 'HIGH')
        if high_count == 0:
            continue
        print(f" {gene}: {len(vars_list)} variants ({high_count} HIGH)")
        shown += 1
        if shown == 10:
            break

    print("\n--- Homozygous Recessive HIGH Impact ---")
    for v in filter_by_impact(hom_rec, ['HIGH'])[:10]:
        print(f" {v.chrom}:{v.pos} {v.ref}>{v.alt} | {v.gene} | {v.effect}")
if __name__ == '__main__':
    import sys

    # Positional arguments: [vcf_path [output_path]]; each falls back to a
    # built-in default path when it is not given.
    cli_args = sys.argv[1:]
    if cli_args:
        vcf_path = cli_args[0]
    else:
        vcf_path = '/Volumes/NV2/genomics_analysis/vcf/trio_joint.snpeff.vcf'
    if len(cli_args) > 1:
        output_path = cli_args[1]
    else:
        output_path = '/Volumes/NV2/genomics_analysis/trio_analysis_report.txt'
    generate_report(vcf_path, output_path)