- Add trio_analysis.py for trio-based variant analysis with de novo detection - Add clinvar_acmg_annotate.py for ClinVar/ACMG annotation - Add gwas_comprehensive.py with 201 SNPs across 18 categories - Add pharmgkb_full_analysis.py for pharmacogenomics analysis - Add gwas_trait_lookup.py for basic GWAS trait lookup - Add pharmacogenomics.py for basic PGx analysis - Remove unused scaffolding code (src/, configs/, docs/, tests/) - Update README.md with new documentation 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
449 lines
16 KiB
Python
449 lines
16 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
ClinVar Annotation and ACMG Classification Script
|
|
Integrates ClinVar lookup with ACMG auto-classification for trio analysis.
|
|
"""
|
|
|
|
import gzip
|
|
import re
|
|
import sys
|
|
from collections import defaultdict
|
|
from dataclasses import dataclass, field
|
|
from typing import Dict, List, Optional, Set, Tuple
|
|
from pathlib import Path
|
|
|
|
# Make an optional project package importable by putting "<directory of this
# file>/src" at the front of the module search path.
sys.path.insert(0, str(Path(__file__).parent / "src"))

# Optional dependency: try to import the project's ACMG tagger and data models.
# HAS_PROJECT_MODULES records whether that import succeeded; when it fails a
# warning is printed at import time and the module keeps loading.
# NOTE(review): the change note at the top of this file says src/ was removed;
# if so, this import can only succeed when genomic_consultant is installed
# somewhere else on the path -- confirm whether this block is still needed.
try:
    from genomic_consultant.acmg.tagger import ACMGConfig, tag_variant, _is_lof
    from genomic_consultant.utils.models import Variant, EvidenceTag, SuggestedClassification
    HAS_PROJECT_MODULES = True
except ImportError:
    HAS_PROJECT_MODULES = False
    print("Warning: Project modules not found, using built-in ACMG classification")
|
|
|
|
|
|
@dataclass
class ClinVarEntry:
    """A single ClinVar record for one (chrom, pos, ref, alt) allele.

    The ``cln*`` attributes carry the raw text of the ClinVar INFO keys
    CLNSIG, CLNDN, CLNREVSTAT and CLNVC respectively.
    """

    # Locus and alleles
    chrom: str
    pos: int
    ref: str
    alt: str

    # Clinical significance (CLNSIG)
    clnsig: str
    # Disease name (CLNDN)
    clndn: str
    # Review status (CLNREVSTAT)
    clnrevstat: str
    # Variant type (CLNVC)
    clnvc: str

    # Optional numeric field; presumably an allele frequency -- TODO confirm
    af: Optional[float] = None
|
|
|
|
|
|
@dataclass
class AnnotatedVariant:
    """A variant call together with every annotation attached to it.

    Only the locus fields are required; all annotation fields default to
    ``None`` (or to a fresh empty container) until they are filled in.
    """

    # Locus and alleles
    chrom: str
    pos: int
    ref: str
    alt: str

    # Functional annotation
    gene: Optional[str] = None
    effect: Optional[str] = None
    impact: Optional[str] = None

    # Sample name -> genotype string
    genotypes: Dict[str, str] = field(default_factory=dict)

    # ClinVar annotation
    clinvar_sig: Optional[str] = None
    clinvar_disease: Optional[str] = None
    clinvar_review: Optional[str] = None

    # ACMG classification result and the evidence strings behind it
    acmg_class: Optional[str] = None
    acmg_evidence: List[str] = field(default_factory=list)

    # de_novo, compound_het, hom_rec, etc.
    inheritance_pattern: Optional[str] = None

    @property
    def variant_id(self) -> str:
        """Return the ``chrom-pos-ref-alt`` identifier of this variant."""
        key_parts = (self.chrom, self.pos, self.ref, self.alt)
        return "-".join(format(part) for part in key_parts)
|
|
|
|
|
|
def load_clinvar_vcf(clinvar_path: str) -> Dict[str, ClinVarEntry]:
    """Load a ClinVar VCF into a dictionary keyed by ``chrom-pos-ref-alt``.

    Args:
        clinvar_path: Path of the ClinVar VCF. A name ending in ``.gz`` is
            read through ``gzip``; anything else is read as plain text.

    Returns:
        Mapping from ``"chrom-pos-ref-alt"`` to a ClinVarEntry, one entry per
        ALT allele of every record. When a key occurs more than once the
        last record read wins.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a record's POS column is not an integer, or the file
            is not valid UTF-8 (UnicodeDecodeError).
    """
    print(f"Loading ClinVar database from {clinvar_path}...")

    opener = gzip.open if clinvar_path.endswith('.gz') else open
    clinvar_db = {}
    count = 0

    # Decode explicitly as UTF-8 so non-ASCII disease names are read the same
    # way on every platform instead of depending on the locale default.
    with opener(clinvar_path, 'rt', encoding='utf-8') as f:
        for line in f:
            if line.startswith('#'):
                continue

            parts = line.strip().split('\t')
            if len(parts) < 8:
                # Not a complete VCF record (needs CHROM..INFO).
                continue

            chrom, pos, _, ref, alt, _, _, info = parts[:8]

            # INFO is a ';'-separated list; only key=value items are kept,
            # bare flags are ignored.
            info_dict = {}
            for item in info.split(';'):
                if '=' in item:
                    key_name, value = item.split('=', 1)
                    info_dict[key_name] = value

            clnsig = info_dict.get('CLNSIG', '')
            clndn = info_dict.get('CLNDN', '')
            clnrevstat = info_dict.get('CLNREVSTAT', '')
            clnvc = info_dict.get('CLNVC', '')

            # A record may list several ALT alleles; each gets its own entry
            # carrying the record's shared annotations.
            for a in alt.split(','):
                clinvar_db[f"{chrom}-{pos}-{ref}-{a}"] = ClinVarEntry(
                    chrom=chrom,
                    pos=int(pos),
                    ref=ref,
                    alt=a,
                    clnsig=clnsig,
                    clndn=clndn,
                    clnrevstat=clnrevstat,
                    clnvc=clnvc,
                )
                # Counts every allele read, including ones that overwrite an
                # earlier entry with the same key.
                count += 1

    print(f"Loaded {count} ClinVar entries")
    return clinvar_db
|
|
|
|
|
|
def parse_snpeff_annotation(info: str, allele: Optional[str] = None) -> Dict:
    """Extract gene, effect, impact and HGVS strings from a SnpEff ANN entry.

    Args:
        info: The INFO column of a VCF record (';'-separated items).
        allele: Optional ALT allele. When given, the first annotation whose
            allele sub-field (the first '|'-separated value) equals it is
            used; when omitted, the first annotation listed is used.

    Returns:
        Dict with keys 'gene', 'effect', 'impact', 'hgvs_c' and 'hgvs_p'.
        A value is None when there is no ANN entry, no annotation matches
        ``allele``, or the chosen annotation is too short to contain it.
    """
    result = {
        'gene': None,
        'effect': None,
        'impact': None,
        'hgvs_c': None,
        'hgvs_p': None,
    }

    # Find the INFO item whose key is exactly "ANN". A plain substring search
    # for "ANN=" would also hit keys that merely end in ANN (e.g. OLD_ANN=).
    ann_field = None
    for item in info.split(';'):
        if item.startswith('ANN='):
            ann_field = item[len('ANN='):]
            break
    if not ann_field:
        return result

    # Annotations are ','-separated; sub-fields within one are '|'-separated.
    for annotation in ann_field.split(','):
        parts = annotation.split('|')
        if allele is not None and parts[0] != allele:
            continue

        if len(parts) >= 4:
            result['effect'] = parts[1]
            result['impact'] = parts[2]
            result['gene'] = parts[3]
        if len(parts) > 9:
            result['hgvs_c'] = parts[9]
        if len(parts) > 10:
            result['hgvs_p'] = parts[10]
        # Only the first eligible annotation is reported.
        break

    return result
|
|
|
|
|
|
def get_genotype_class(gt: str) -> str:
    """Classify a VCF GT string as MISSING, HOM_REF, HOM_ALT or HET.

    Alleles may be separated by '/' or '|'; any ploidy is accepted.

    Returns:
        'MISSING'  - no allele is called (e.g. './.', '.', ''), or the
                     genotype is partially missing and shows no ALT allele
                     (e.g. '0/.'), so zygosity cannot be determined.
        'HOM_REF'  - every allele is called and is the reference ('0').
        'HOM_ALT'  - every allele is called and all are the same ALT allele.
        'HET'      - anything else carrying at least one ALT allele,
                     including two different ALT alleles (e.g. '1/2') and
                     partially missing calls such as './1'.
    """
    alleles = re.split('[/|]', gt)
    called = [a for a in alleles if a not in ('.', '')]
    if not called:
        return 'MISSING'

    fully_called = len(called) == len(alleles)
    alt_alleles = {a for a in called if a != '0'}

    if not alt_alleles:
        # Only reference alleles were observed.
        return 'HOM_REF' if fully_called else 'MISSING'

    # Homozygous only when a single ALT allele fills every position; two
    # different ALT alleles (1/2) are heterozygous, not homozygous.
    if fully_called and len(alt_alleles) == 1 and '0' not in called:
        return 'HOM_ALT'
    return 'HET'
|
|
|
|
|
|
class ACMGClassifier:
    """Simplified rule-based ACMG-style variant classifier.

    ``classify`` collects evidence strings of the form ``"<CODE>: <text>"``
    from a variant's ClinVar significance, SnpEff effect/impact and de novo
    status; ``_determine_class`` turns them into one of 'Pathogenic',
    'Likely Pathogenic', 'VUS', 'Likely Benign' or 'Benign'.
    """

    # Genes treated as loss-of-function sensitive when the caller gives none.
    DEFAULT_LOF_GENES = frozenset({
        'BRCA1', 'BRCA2', 'TP53', 'PTEN', 'MLH1', 'MSH2', 'MSH6', 'PMS2',
        'APC', 'MEN1', 'RB1', 'VHL', 'WT1', 'NF1', 'NF2', 'TSC1', 'TSC2',
    })

    def __init__(self, lof_genes: Optional[Set[str]] = None):
        """Create a classifier.

        Args:
            lof_genes: Gene symbols considered LoF-sensitive. ``None`` selects
                the built-in default list; an explicitly empty set means "no
                gene is LoF-sensitive". Symbols are upper-cased because the
                lookup in ``classify`` compares against ``gene.upper()``.
        """
        if lof_genes is None:
            self.lof_genes = set(self.DEFAULT_LOF_GENES)
        else:
            self.lof_genes = {gene.upper() for gene in lof_genes}

        # Allele-frequency cut-offs. They are stored here but not consulted by
        # classify() or _determine_class(); presumably reserved for
        # BA1/BS1/PM2 frequency rules -- TODO confirm.
        self.ba1_af = 0.05
        self.bs1_af = 0.01
        self.pm2_af = 0.0005

    def classify(self, variant: "AnnotatedVariant", is_de_novo: bool = False) -> Tuple[str, List[str]]:
        """Apply the classification rules to one variant.

        Args:
            variant: Object exposing ``clinvar_sig``, ``effect``, ``gene`` and
                ``impact`` attributes.
            is_de_novo: Whether the variant was called de novo in the trio.

        Returns:
            ``(classification, evidence)`` where ``evidence`` is the list of
            evidence strings that were triggered, in rule order.
        """
        evidence: List[str] = []

        # ClinVar evidence. Conflicting records are skipped: their text
        # ("..._of_pathogenicity") contains the substring "pathogenic" and
        # would otherwise be recorded as a pathogenic assertion.
        sig_lower = (variant.clinvar_sig or '').lower()
        if sig_lower and 'conflicting' not in sig_lower:
            if 'pathogenic' in sig_lower and 'likely' not in sig_lower:
                evidence.append("PP5: ClinVar pathogenic")
            elif 'likely_pathogenic' in sig_lower:
                evidence.append("PP5: ClinVar likely pathogenic")
            elif 'benign' in sig_lower and 'likely' not in sig_lower:
                evidence.append("BP6: ClinVar benign")
            elif 'likely_benign' in sig_lower:
                evidence.append("BP6: ClinVar likely benign")

        # Loss of function in LoF-sensitive gene (PVS1)
        if variant.effect and variant.gene:
            lof_keywords = ('frameshift', 'stop_gained', 'splice_acceptor', 'splice_donor', 'start_lost')
            effect_lower = variant.effect.lower()
            if any(keyword in effect_lower for keyword in lof_keywords):
                if variant.gene.upper() in self.lof_genes:
                    evidence.append("PVS1: Null variant in LoF-sensitive gene")
                else:
                    evidence.append("PVS1_moderate: Null variant (gene not confirmed LoF-sensitive)")

        # De novo (PS2)
        if is_de_novo:
            evidence.append("PS2: De novo variant")

        # Impact-based evidence
        if variant.impact == 'HIGH':
            evidence.append("PM4: Protein length change (HIGH impact)")
        elif variant.impact == 'MODERATE':
            if variant.effect and 'missense' in variant.effect.lower():
                evidence.append("PP3: Computational evidence (missense)")

        classification = self._determine_class(evidence, variant.clinvar_sig)
        return classification, evidence

    def _determine_class(self, evidence: List[str], clinvar_sig: Optional[str]) -> str:
        """Derive the final class from evidence strings and ClinVar status.

        A non-conflicting ClinVar pathogenic/benign assertion decides the
        class directly (the review status is not taken into account here).
        Otherwise the evidence codes -- the text before the first ':' of each
        evidence string -- are combined by the rules below.
        """
        # ClinVar takes precedence when it is not a conflicting record.
        if clinvar_sig:
            sig_lower = clinvar_sig.lower()
            if 'conflicting' not in sig_lower:
                is_likely = 'likely' in sig_lower
                if 'pathogenic' in sig_lower:
                    return 'Likely Pathogenic' if is_likely else 'Pathogenic'
                if 'benign' in sig_lower:
                    return 'Likely Benign' if is_likely else 'Benign'

        # Rule-based classification on the evidence codes. Comparing whole
        # codes keeps "PVS1_moderate" from counting as "PVS1" and keeps the
        # free-text descriptions from influencing the result.
        codes = {item.split(':', 1)[0].strip() for item in evidence}
        has_pvs1 = 'PVS1' in codes
        has_ps2 = 'PS2' in codes
        has_pm4 = 'PM4' in codes
        has_pp = any(code.startswith('PP') for code in codes)
        has_bp = any(code.startswith('BP') for code in codes)

        if has_pvs1 and has_ps2:
            return 'Pathogenic'
        if has_pvs1 or (has_ps2 and has_pm4):
            return 'Likely Pathogenic'
        if has_bp and not has_pp and not has_pvs1:
            return 'Likely Benign'
        return 'VUS'
|
|
|
|
|
|
def _pick_trio_sample(samples: List[str], idx: int, fallback_idx: int) -> Optional[str]:
    """Return the sample at ``idx``, else at ``fallback_idx``, else None."""
    n = len(samples)
    if -n <= idx < n:
        return samples[idx]
    if fallback_idx < n:
        return samples[fallback_idx]
    return None


def _classify_inheritance(proband_gt: str, father_gt: str, mother_gt: str) -> Tuple[bool, Optional[str]]:
    """Return ``(is_de_novo, inheritance_label)`` from trio genotype classes."""
    carrier = ('HET', 'HOM_ALT')
    if proband_gt in carrier and father_gt == 'HOM_REF' and mother_gt == 'HOM_REF':
        return True, 'de_novo'
    if proband_gt == 'HOM_ALT' and father_gt == 'HET' and mother_gt == 'HET':
        return False, 'homozygous_recessive'
    if proband_gt == 'HET':
        if father_gt in carrier and mother_gt == 'HOM_REF':
            return False, 'paternal'
        if mother_gt in carrier and father_gt == 'HOM_REF':
            return False, 'maternal'
    return False, None


def _write_trio_report(output_path, snpeff_vcf, clinvar_path, samples, total_variants, ranked_variants):
    """Write the tab-separated report and its summary section to ``output_path``."""
    # UTF-8 so disease names read from the ClinVar file can always be written.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("# ClinVar & ACMG Classification Report\n")
        f.write(f"# Input: {snpeff_vcf}\n")
        f.write(f"# ClinVar: {clinvar_path}\n")
        f.write(f"# Samples: {', '.join(samples)}\n")
        f.write(f"# Total variants processed: {total_variants}\n\n")

        f.write("## CLINICALLY RELEVANT VARIANTS\n\n")
        f.write("CHROM\tPOS\tREF\tALT\tGENE\tEFFECT\tIMPACT\tINHERITANCE\tCLINVAR_SIG\tCLINVAR_DISEASE\tACMG_CLASS\tACMG_EVIDENCE\n")

        for v in ranked_variants:
            f.write(f"{v.chrom}\t{v.pos}\t{v.ref}\t{v.alt}\t")
            f.write(f"{v.gene or 'N/A'}\t{v.effect or 'N/A'}\t{v.impact or 'N/A'}\t")
            f.write(f"{v.inheritance_pattern or 'N/A'}\t")
            f.write(f"{v.clinvar_sig or 'N/A'}\t")
            f.write(f"{v.clinvar_disease or 'N/A'}\t")
            f.write(f"{v.acmg_class}\t")
            f.write(f"{'; '.join(v.acmg_evidence)}\n")

        # Summary statistics
        f.write("\n## SUMMARY\n")
        f.write(f"Total variants in proband: {total_variants}\n")
        f.write(f"Clinically relevant variants: {len(ranked_variants)}\n")

        # Count by ACMG class
        acmg_counts = defaultdict(int)
        for v in ranked_variants:
            acmg_counts[v.acmg_class] += 1

        f.write("\nBy ACMG Classification:\n")
        for cls in ['Pathogenic', 'Likely Pathogenic', 'VUS', 'Likely Benign', 'Benign']:
            if cls in acmg_counts:
                f.write(f" {cls}: {acmg_counts[cls]}\n")

        # Count by inheritance
        inh_counts = defaultdict(int)
        for v in ranked_variants:
            inh_counts[v.inheritance_pattern or 'unknown'] += 1

        f.write("\nBy Inheritance Pattern:\n")
        for inh, count in sorted(inh_counts.items()):
            f.write(f" {inh}: {count}\n")

        # ClinVar matches
        clinvar_match = sum(1 for v in ranked_variants if v.clinvar_sig)
        f.write(f"\nVariants with ClinVar annotation: {clinvar_match}\n")


def _print_top_candidates(ranked_variants, limit: int = 20) -> None:
    """Print up to ``limit`` Pathogenic / Likely Pathogenic variants to stdout."""
    print("\n=== TOP PATHOGENIC CANDIDATES ===\n")
    top_variants = [v for v in ranked_variants
                    if v.acmg_class in ['Pathogenic', 'Likely Pathogenic']][:limit]

    for v in top_variants:
        print(f"{v.chrom}:{v.pos} {v.ref}>{v.alt}")
        print(f" Gene: {v.gene} | Effect: {v.effect}")
        print(f" Inheritance: {v.inheritance_pattern}")
        print(f" ClinVar: {v.clinvar_sig or 'Not found'}")
        if v.clinvar_disease:
            # Mark truncation only when the name was actually cut.
            suffix = '...' if len(v.clinvar_disease) > 80 else ''
            print(f" Disease: {v.clinvar_disease[:80]}{suffix}")
        print(f" ACMG: {v.acmg_class}")
        print(f" Evidence: {'; '.join(v.acmg_evidence)}")
        print()


def analyze_trio_with_clinvar(
    snpeff_vcf: str,
    clinvar_path: str,
    output_path: str,
    proband_idx: int = 0,
    father_idx: int = 1,
    mother_idx: int = 2
):
    """Annotate a SnpEff trio VCF with ClinVar and ACMG classes and report it.

    Every record in which the proband carries a non-reference allele is
    looked up in ClinVar (one lookup per ALT allele), classified, and -- if
    ClinVar mentions pathogenicity, the class is (Likely) Pathogenic, or the
    variant is de novo with HIGH/MODERATE impact -- written to the report.

    Args:
        snpeff_vcf: SnpEff-annotated VCF (``.gz`` is read through gzip).
        clinvar_path: ClinVar VCF passed to ``load_clinvar_vcf``.
        output_path: Destination of the text report.
        proband_idx: Index of the proband among the VCF sample columns; an
            out-of-range value falls back to column 0.
        father_idx: Index of the father; falls back to column 1.
        mother_idx: Index of the mother; falls back to column 2.
            A parent that cannot be resolved (fewer sample columns than
            needed) is treated as having a missing genotype.

    Raises:
        ValueError: If a variant record is reached before any ``#CHROM``
            header with sample columns has been seen.
        OSError: If an input cannot be read or the report cannot be written.
    """
    # Load ClinVar
    clinvar_db = load_clinvar_vcf(clinvar_path)

    # Initialize classifier
    classifier = ACMGClassifier()

    # Parse VCF and annotate
    print(f"Processing {snpeff_vcf}...")

    samples = []
    proband = father = mother = None
    # Only the number of processed variants is reported, so keep a counter
    # instead of holding every proband variant in memory.
    total_variants = 0
    pathogenic_variants = []

    open_func = gzip.open if snpeff_vcf.endswith('.gz') else open

    with open_func(snpeff_vcf, 'rt', encoding='utf-8') as f:
        for line in f:
            if line.startswith('##'):
                continue
            elif line.startswith('#CHROM'):
                samples = line.strip().split('\t')[9:]
                # The trio mapping depends only on the header, so resolve it once.
                proband = _pick_trio_sample(samples, proband_idx, 0)
                father = _pick_trio_sample(samples, father_idx, 1)
                mother = _pick_trio_sample(samples, mother_idx, 2)
                continue

            parts = line.strip().split('\t')
            if len(parts) < 10:
                continue

            if proband is None:
                raise ValueError(
                    f"{snpeff_vcf}: variant record found before a #CHROM header "
                    "with sample columns"
                )

            chrom, pos, _, ref, alt, qual, filt, info, fmt = parts[:9]
            gt_fields = parts[9:]

            # Parse genotypes; a sample column missing from this row, or one
            # without a GT slot, counts as a missing genotype.
            fmt_parts = fmt.split(':')
            gt_idx = fmt_parts.index('GT') if 'GT' in fmt_parts else 0

            genotypes = {}
            for i, sample in enumerate(samples):
                gt_data = gt_fields[i].split(':') if i < len(gt_fields) else []
                genotypes[sample] = gt_data[gt_idx] if gt_idx < len(gt_data) else './.'

            # Only process variants in proband
            proband_gt = get_genotype_class(genotypes.get(proband, './.'))
            if proband_gt == 'HOM_REF' or proband_gt == 'MISSING':
                continue

            # Parse SnpEff annotation (only needed for records that are kept)
            ann = parse_snpeff_annotation(info)

            # Check inheritance pattern
            father_gt = get_genotype_class(genotypes.get(father, './.'))
            mother_gt = get_genotype_class(genotypes.get(mother, './.'))
            is_de_novo, inheritance = _classify_inheritance(proband_gt, father_gt, mother_gt)

            # Lookup ClinVar.
            # NOTE(review): genotype class, inheritance and the SnpEff
            # annotation are computed once per record and applied to every
            # ALT allele of a multi-allelic record -- confirm this is intended.
            for a in alt.split(','):
                clinvar_entry = clinvar_db.get(f"{chrom}-{pos}-{ref}-{a}")

                variant = AnnotatedVariant(
                    chrom=chrom,
                    pos=int(pos),
                    ref=ref,
                    alt=a,
                    gene=ann['gene'],
                    effect=ann['effect'],
                    impact=ann['impact'],
                    genotypes=genotypes,
                    inheritance_pattern=inheritance
                )

                if clinvar_entry:
                    variant.clinvar_sig = clinvar_entry.clnsig
                    variant.clinvar_disease = clinvar_entry.clndn
                    variant.clinvar_review = clinvar_entry.clnrevstat

                # ACMG classification
                acmg_class, evidence = classifier.classify(variant, is_de_novo)
                variant.acmg_class = acmg_class
                variant.acmg_evidence = evidence

                # Filter for clinically relevant variants
                if (variant.clinvar_sig and 'pathogenic' in variant.clinvar_sig.lower()) or \
                        acmg_class in ['Pathogenic', 'Likely Pathogenic'] or \
                        (is_de_novo and ann['impact'] in ['HIGH', 'MODERATE']):
                    pathogenic_variants.append(variant)

                total_variants += 1

    # Rank once: Pathogenic first, then Likely Pathogenic, then by position.
    # The same ranking drives both the report and the console summary.
    ranked_variants = sorted(
        pathogenic_variants,
        key=lambda x: (x.acmg_class != 'Pathogenic',
                       x.acmg_class != 'Likely Pathogenic',
                       x.chrom, x.pos),
    )

    # Generate report
    print(f"Writing report to {output_path}...")
    _write_trio_report(output_path, snpeff_vcf, clinvar_path, samples,
                       total_variants, ranked_variants)

    print(f"\nAnalysis complete!")
    print(f"Clinically relevant variants: {len(pathogenic_variants)}")
    print(f"Report saved to: {output_path}")

    # Print top candidates
    _print_top_candidates(ranked_variants)
|
|
|
|
|
|
if __name__ == '__main__':
    # Positional command-line arguments, in order, each with a fallback:
    #   1 SnpEff VCF, 2 ClinVar VCF, 3 report path,
    #   4 proband index, 5 father index, 6 mother index.
    # VCF sample order: NV0066-08_S33 (idx 0), NV0066-09_S34 (idx 1), NV0066-10_S35 (idx 2)
    # Correct mapping: S35 = proband (II-3), S33 = parent, S34 = parent
    fallback_args = [
        '/Volumes/NV2/genomics_analysis/vcf/trio_joint.snpeff.vcf',
        '/Volumes/NV2/genomics_reference/clinvar/clinvar_GRCh37.vcf.gz',
        '/Volumes/NV2/genomics_analysis/clinvar_acmg_report.txt',
        2,  # S35 is proband
        0,  # S33
        1,  # S34
    ]

    # Take what was supplied (extra arguments are ignored) and fill the rest
    # from the fallbacks.
    given_args = sys.argv[1:1 + len(fallback_args)]
    resolved_args = given_args + fallback_args[len(given_args):]

    snpeff_vcf, clinvar_path, output_path = resolved_args[:3]
    proband_idx, father_idx, mother_idx = (int(value) for value in resolved_args[3:])

    analyze_trio_with_clinvar(snpeff_vcf, clinvar_path, output_path, proband_idx, father_idx, mother_idx)
|