Refactor: Replace scaffolding with working analysis scripts
- Add trio_analysis.py for trio-based variant analysis with de novo detection
- Add clinvar_acmg_annotate.py for ClinVar/ACMG annotation
- Add gwas_comprehensive.py with 201 SNPs across 18 categories
- Add pharmgkb_full_analysis.py for pharmacogenomics analysis
- Add gwas_trait_lookup.py for basic GWAS trait lookup
- Add pharmacogenomics.py for basic PGx analysis
- Remove unused scaffolding code (src/, configs/, docs/, tests/)
- Update README.md with new documentation

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
448
clinvar_acmg_annotate.py
Normal file
448
clinvar_acmg_annotate.py
Normal file
@@ -0,0 +1,448 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
ClinVar Annotation and ACMG Classification Script
|
||||
Integrates ClinVar lookup with ACMG auto-classification for trio analysis.
|
||||
"""
|
||||
|
||||
import gzip
|
||||
import re
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, List, Optional, Set, Tuple
|
||||
from pathlib import Path
|
||||
|
||||
# Put the sibling "src" directory first on the import path so the optional
# genomic_consultant package can be picked up when it is present.
sys.path.insert(0, str(Path(__file__).parent.joinpath("src")))

# The project package is optional: record whether it could be imported and
# announce the fallback when it could not.
try:
    from genomic_consultant.acmg.tagger import ACMGConfig, tag_variant, _is_lof
    from genomic_consultant.utils.models import Variant, EvidenceTag, SuggestedClassification
except ImportError:
    HAS_PROJECT_MODULES = False
    print("Warning: Project modules not found, using built-in ACMG classification")
else:
    HAS_PROJECT_MODULES = True
|
||||
|
||||
|
||||
@dataclass
class ClinVarEntry:
    """A single ClinVar record for one alternate allele.

    Only ``af`` has a default (``None``); every other field is required and
    is stored exactly as given.
    """

    # --- locus and alleles ---
    chrom: str
    pos: int
    ref: str
    alt: str

    # --- ClinVar annotations ---
    clnsig: str      # clinical significance
    clndn: str       # disease name
    clnrevstat: str  # review status
    clnvc: str       # variant type

    # --- optional ---
    af: Optional[float] = None
|
||||
|
||||
|
||||
@dataclass
class AnnotatedVariant:
    """A called variant together with every annotation attached to it.

    Only the locus and alleles are required. All annotation fields start out
    as ``None`` (or an empty container) and are filled in by the analysis.
    """

    # --- locus and alleles ---
    chrom: str
    pos: int
    ref: str
    alt: str

    # --- functional annotation ---
    gene: Optional[str] = None
    effect: Optional[str] = None
    impact: Optional[str] = None

    # --- per-sample genotype strings, keyed by sample name ---
    genotypes: Dict[str, str] = field(default_factory=dict)

    # --- ClinVar annotation ---
    clinvar_sig: Optional[str] = None
    clinvar_disease: Optional[str] = None
    clinvar_review: Optional[str] = None

    # --- ACMG result ---
    acmg_class: Optional[str] = None
    acmg_evidence: List[str] = field(default_factory=list)

    # de_novo, compound_het, hom_rec, etc.
    inheritance_pattern: Optional[str] = None

    @property
    def variant_id(self) -> str:
        """Return the ``chrom-pos-ref-alt`` identifier of this variant."""
        pieces = (self.chrom, self.pos, self.ref, self.alt)
        return "-".join(str(piece) for piece in pieces)
|
||||
|
||||
|
||||
def load_clinvar_vcf(clinvar_path: str) -> Dict[str, ClinVarEntry]:
    """Read a ClinVar VCF and index its records by ``chrom-pos-ref-alt``.

    Paths ending in ``.gz`` are read through gzip; anything else is opened
    as plain text. Header/comment lines (starting with ``#``) and lines with
    fewer than eight tab-separated columns are ignored. A record listing
    several comma-separated ALT alleles yields one entry per allele, all
    sharing that record's INFO annotations. When two records produce the
    same key, the later one replaces the earlier one.

    Args:
        clinvar_path: Path to the ClinVar VCF (optionally gzip-compressed).

    Returns:
        Mapping from ``"{chrom}-{pos}-{ref}-{alt}"`` to its ClinVarEntry.
    """
    print(f"Loading ClinVar database from {clinvar_path}...")

    if clinvar_path.endswith('.gz'):
        opener, open_mode = gzip.open, 'rt'
    else:
        opener, open_mode = open, 'r'

    entries = {}
    n_loaded = 0

    with opener(clinvar_path, open_mode) as handle:
        for record in handle:
            if record.startswith('#'):
                continue

            columns = record.strip().split('\t')
            if len(columns) < 8:
                continue

            chrom, pos, ref, alt_field, info = (
                columns[0], columns[1], columns[3], columns[4], columns[7]
            )

            # INFO is a ';'-separated list; flag-style items without '=' are dropped.
            info_map = dict(
                item.split('=', 1) for item in info.split(';') if '=' in item
            )
            significance = info_map.get('CLNSIG', '')
            disease = info_map.get('CLNDN', '')
            review_status = info_map.get('CLNREVSTAT', '')
            variant_type = info_map.get('CLNVC', '')

            # One entry per alternate allele of the record.
            for alt_allele in alt_field.split(','):
                entries[f"{chrom}-{pos}-{ref}-{alt_allele}"] = ClinVarEntry(
                    chrom=chrom,
                    pos=int(pos),
                    ref=ref,
                    alt=alt_allele,
                    clnsig=significance,
                    clndn=disease,
                    clnrevstat=review_status,
                    clnvc=variant_type
                )
                n_loaded += 1

    print(f"Loaded {n_loaded} ClinVar entries")
    return entries
|
||||
|
||||
|
||||
def parse_snpeff_annotation(info: str) -> Dict:
    """Extract the first SnpEff ``ANN`` annotation from a VCF INFO string.

    Only the first comma-separated annotation of the ``ANN`` value is used.
    Its ``|``-separated sub-fields are read by position: 1 = effect,
    2 = impact, 3 = gene, 9 = HGVS.c, 10 = HGVS.p.

    Args:
        info: The raw INFO column of a VCF record.

    Returns:
        A dict with keys ``gene``, ``effect``, ``impact``, ``hgvs_c`` and
        ``hgvs_p``. A value stays ``None`` when there is no ``ANN`` key or
        the annotation is too short to contain that sub-field.
    """
    result = {
        'gene': None,
        'effect': None,
        'impact': None,
        'hgvs_c': None,
        'hgvs_p': None,
    }

    # Anchor on the start of the string or a ';' so that only the INFO key
    # named exactly "ANN" matches, not another key that ends in "ANN".
    ann_match = re.search(r'(?:^|;)ANN=([^;]+)', info)
    if not ann_match:
        return result

    first_annotation = ann_match.group(1).split(',', 1)[0]
    parts = first_annotation.split('|')

    if len(parts) >= 4:
        result['effect'], result['impact'], result['gene'] = parts[1:4]
    if len(parts) > 9:
        result['hgvs_c'] = parts[9]
    if len(parts) > 10:
        result['hgvs_p'] = parts[10]

    return result
|
||||
|
||||
|
||||
def get_genotype_class(gt: str) -> str:
    """Classify a VCF GT string as MISSING, HOM_REF, HET or HOM_ALT.

    Alleles are separated by ``/`` or ``|``. Any allele that is not a
    number (``.`` or an empty string) counts as uncalled.

    * ``MISSING``: no allele is called, or the call is incomplete and shows
      no alternate allele (e.g. ``0/.``). Such a call gives no evidence
      that the sample is homozygous reference.
    * ``HOM_REF``: every allele is called and is ``0``.
    * ``HOM_ALT``: every allele is called and all are the same non-reference
      allele (this includes a single-allele call such as ``1``).
    * ``HET``: anything else that carries an alternate allele, including
      two different alternate alleles (``1/2``) and half-calls such as
      ``./1``.

    Args:
        gt: The GT sub-field of one sample.

    Returns:
        One of ``'MISSING'``, ``'HOM_REF'``, ``'HET'`` or ``'HOM_ALT'``.
    """
    alleles = re.split(r'[/|]', gt.strip()) if gt else []
    called = [a for a in alleles if a.isdigit()]
    if not called:
        return 'MISSING'

    fully_called = len(called) == len(alleles)
    alt_alleles = {a for a in called if a != '0'}

    if not alt_alleles:
        return 'HOM_REF' if fully_called else 'MISSING'
    if fully_called and len(set(called)) == 1:
        return 'HOM_ALT'
    return 'HET'
|
||||
|
||||
|
||||
class ACMGClassifier:
    """Simplified rule-based classifier producing ACMG-style labels.

    Evidence is collected as strings of the form ``"CODE: description"``
    from the ClinVar significance, the predicted effect/impact and the
    de novo status of a variant, then combined into one of ``Pathogenic``,
    ``Likely Pathogenic``, ``VUS``, ``Likely Benign`` or ``Benign``.
    """

    def __init__(self, lof_genes: Optional[Set[str]] = None):
        """Create a classifier.

        Args:
            lof_genes: Upper-case symbols of genes treated as sensitive to
                loss of function. A built-in list is used when omitted
                (or empty).
        """
        self.lof_genes = lof_genes or {
            'BRCA1', 'BRCA2', 'TP53', 'PTEN', 'MLH1', 'MSH2', 'MSH6', 'PMS2',
            'APC', 'MEN1', 'RB1', 'VHL', 'WT1', 'NF1', 'NF2', 'TSC1', 'TSC2'
        }
        # Allele-frequency thresholds; stored on the instance but not
        # consulted by classify() or _determine_class().
        self.ba1_af = 0.05
        self.bs1_af = 0.01
        self.pm2_af = 0.0005

    def classify(self, variant: "AnnotatedVariant", is_de_novo: bool = False) -> Tuple[str, List[str]]:
        """Collect evidence for ``variant`` and derive its classification.

        Args:
            variant: Object exposing ``clinvar_sig``, ``effect``, ``gene``
                and ``impact`` attributes.
            is_de_novo: Whether the variant is absent from both parents.

        Returns:
            ``(classification, evidence)``, where ``evidence`` is the list
            of ``"CODE: description"`` strings that were triggered.
        """
        evidence = []

        # ClinVar evidence. A "conflicting" assertion contains the word
        # "pathogenicity" but supports neither direction, so it contributes
        # nothing here, matching how _determine_class treats it.
        if variant.clinvar_sig:
            sig_lower = variant.clinvar_sig.lower()
            if 'conflicting' in sig_lower:
                pass
            elif 'pathogenic' in sig_lower and 'likely' not in sig_lower:
                evidence.append("PP5: ClinVar pathogenic")
            elif 'likely_pathogenic' in sig_lower:
                evidence.append("PP5: ClinVar likely pathogenic")
            elif 'benign' in sig_lower and 'likely' not in sig_lower:
                evidence.append("BP6: ClinVar benign")
            elif 'likely_benign' in sig_lower:
                evidence.append("BP6: ClinVar likely benign")

        # Loss of function in LoF-sensitive gene (PVS1)
        if variant.effect and variant.gene:
            effect_lower = variant.effect.lower()
            lof_keywords = ('frameshift', 'stop_gained', 'splice_acceptor', 'splice_donor', 'start_lost')
            if any(keyword in effect_lower for keyword in lof_keywords):
                if variant.gene.upper() in self.lof_genes:
                    evidence.append("PVS1: Null variant in LoF-sensitive gene")
                else:
                    evidence.append("PVS1_moderate: Null variant (gene not confirmed LoF-sensitive)")

        # De novo (PS2)
        if is_de_novo:
            evidence.append("PS2: De novo variant")

        # Impact-based evidence
        if variant.impact == 'HIGH':
            evidence.append("PM4: Protein length change (HIGH impact)")
        elif variant.impact == 'MODERATE':
            if variant.effect and 'missense' in variant.effect.lower():
                evidence.append("PP3: Computational evidence (missense)")

        classification = self._determine_class(evidence, variant.clinvar_sig)
        return classification, evidence

    def _determine_class(self, evidence: List[str], clinvar_sig: Optional[str]) -> str:
        """Combine evidence strings and the ClinVar assertion into a class.

        A non-conflicting ClinVar pathogenic/benign assertion decides the
        result on its own (any "likely" in it gives the "Likely ..." form).
        Otherwise the evidence codes are combined by fixed rules.

        Args:
            evidence: ``"CODE: description"`` strings from classify().
            clinvar_sig: Raw ClinVar significance, or ``None``.

        Returns:
            The classification label.
        """
        # ClinVar takes precedence if high confidence
        if clinvar_sig:
            sig_lower = clinvar_sig.lower()
            if 'conflicting' not in sig_lower:
                if 'pathogenic' in sig_lower:
                    return 'Likely Pathogenic' if 'likely' in sig_lower else 'Pathogenic'
                if 'benign' in sig_lower:
                    return 'Likely Benign' if 'likely' in sig_lower else 'Benign'

        # Rule-based classification on the evidence codes (text before ':').
        # Comparing whole codes keeps "PVS1_moderate" distinct from "PVS1".
        codes = {item.split(':', 1)[0] for item in evidence}
        has_pvs1 = 'PVS1' in codes
        has_ps2 = 'PS2' in codes
        has_pm4 = 'PM4' in codes
        has_pp = any(code.startswith('PP') for code in codes)
        has_bp = any(code.startswith('BP') for code in codes)

        if has_pvs1 and has_ps2:
            return 'Pathogenic'
        if has_pvs1 or (has_ps2 and has_pm4):
            return 'Likely Pathogenic'
        if has_bp and not has_pp:
            return 'Likely Benign'
        return 'VUS'
|
||||
|
||||
|
||||
def _open_vcf_text(path: str):
    """Open a plain or gzip-compressed (``.gz``) VCF for text reading."""
    return gzip.open(path, 'rt') if path.endswith('.gz') else open(path, 'r')


def _pick_sample(samples: List[str], idx: int, fallback_idx: int, role: str) -> Optional[str]:
    """Resolve the sample name for one trio role, or None if unavailable.

    An index past the end of the header falls back to ``fallback_idx``. The
    substitution is reported, and ``None`` is returned instead of raising
    when the fallback column does not exist either.
    """
    if idx < len(samples):
        return samples[idx]
    if fallback_idx < len(samples):
        print(f"Warning: {role} index {idx} is out of range; using sample '{samples[fallback_idx]}'")
        return samples[fallback_idx]
    print(f"Warning: no sample column available for {role}; its genotype is treated as missing")
    return None


def _classify_inheritance(proband_gt: str, father_gt: str, mother_gt: str) -> Tuple[bool, Optional[str]]:
    """Return ``(is_de_novo, inheritance_label)`` from three genotype classes."""
    if (proband_gt in ('HET', 'HOM_ALT')
            and father_gt == 'HOM_REF' and mother_gt == 'HOM_REF'):
        return True, 'de_novo'
    if proband_gt == 'HOM_ALT' and father_gt == 'HET' and mother_gt == 'HET':
        return False, 'homozygous_recessive'
    if proband_gt == 'HET':
        if father_gt in ('HET', 'HOM_ALT') and mother_gt == 'HOM_REF':
            return False, 'paternal'
        if mother_gt in ('HET', 'HOM_ALT') and father_gt == 'HOM_REF':
            return False, 'maternal'
    return False, None


def _lookup_clinvar(clinvar_db: Dict[str, ClinVarEntry], chrom: str, pos: str,
                    ref: str, alt: str) -> Optional[ClinVarEntry]:
    """Find a ClinVar entry, tolerating contig-name differences.

    The exact contig name is tried first, then the same name with the
    ``chr`` prefix removed or added, and for the mitochondrial contig the
    ``M``/``MT`` spellings. This lets a ``chr1``-style call set match a
    ``1``-style database and vice versa.
    """
    bare = chrom[3:] if chrom.startswith('chr') else chrom
    candidates = [chrom, bare, f"chr{bare}"]
    if bare in ('M', 'MT'):
        candidates += ['MT', 'M', 'chrM', 'chrMT']
    for name in candidates:
        entry = clinvar_db.get(f"{name}-{pos}-{ref}-{alt}")
        if entry is not None:
            return entry
    return None


def _write_report(output_path: str, snpeff_vcf: str, clinvar_path: str, samples: List[str],
                  results: List[AnnotatedVariant], pathogenic_variants: List[AnnotatedVariant]) -> None:
    """Write the tab-separated report and its summary section."""
    with open(output_path, 'w') as f:
        f.write("# ClinVar & ACMG Classification Report\n")
        f.write(f"# Input: {snpeff_vcf}\n")
        f.write(f"# ClinVar: {clinvar_path}\n")
        f.write(f"# Samples: {', '.join(samples)}\n")
        f.write(f"# Total variants processed: {len(results)}\n\n")

        f.write("## CLINICALLY RELEVANT VARIANTS\n\n")
        f.write("CHROM\tPOS\tREF\tALT\tGENE\tEFFECT\tIMPACT\tINHERITANCE\tCLINVAR_SIG\tCLINVAR_DISEASE\tACMG_CLASS\tACMG_EVIDENCE\n")

        # Pathogenic first, then Likely Pathogenic, then by locus.
        ordered = sorted(pathogenic_variants, key=lambda x: (x.acmg_class != 'Pathogenic',
                                                             x.acmg_class != 'Likely Pathogenic',
                                                             x.chrom, x.pos))
        for v in ordered:
            f.write(f"{v.chrom}\t{v.pos}\t{v.ref}\t{v.alt}\t")
            f.write(f"{v.gene or 'N/A'}\t{v.effect or 'N/A'}\t{v.impact or 'N/A'}\t")
            f.write(f"{v.inheritance_pattern or 'N/A'}\t")
            f.write(f"{v.clinvar_sig or 'N/A'}\t")
            f.write(f"{v.clinvar_disease or 'N/A'}\t")
            f.write(f"{v.acmg_class}\t")
            f.write(f"{'; '.join(v.acmg_evidence)}\n")

        # Summary statistics
        f.write("\n## SUMMARY\n")
        f.write(f"Total variants in proband: {len(results)}\n")
        f.write(f"Clinically relevant variants: {len(pathogenic_variants)}\n")

        acmg_counts = defaultdict(int)
        inh_counts = defaultdict(int)
        for v in pathogenic_variants:
            acmg_counts[v.acmg_class] += 1
            inh_counts[v.inheritance_pattern or 'unknown'] += 1

        f.write("\nBy ACMG Classification:\n")
        for cls in ['Pathogenic', 'Likely Pathogenic', 'VUS', 'Likely Benign', 'Benign']:
            if cls in acmg_counts:
                f.write(f"  {cls}: {acmg_counts[cls]}\n")

        f.write("\nBy Inheritance Pattern:\n")
        for inh, count in sorted(inh_counts.items()):
            f.write(f"  {inh}: {count}\n")

        clinvar_match = sum(1 for v in pathogenic_variants if v.clinvar_sig)
        f.write(f"\nVariants with ClinVar annotation: {clinvar_match}\n")


def _print_top_candidates(pathogenic_variants: List[AnnotatedVariant], limit: int = 20) -> None:
    """Print up to ``limit`` Pathogenic / Likely Pathogenic variants."""
    print("\n=== TOP PATHOGENIC CANDIDATES ===\n")
    top_variants = [v for v in pathogenic_variants
                    if v.acmg_class in ['Pathogenic', 'Likely Pathogenic']][:limit]

    for v in top_variants:
        print(f"{v.chrom}:{v.pos} {v.ref}>{v.alt}")
        print(f"  Gene: {v.gene} | Effect: {v.effect}")
        print(f"  Inheritance: {v.inheritance_pattern}")
        print(f"  ClinVar: {v.clinvar_sig or 'Not found'}")
        if v.clinvar_disease:
            disease = v.clinvar_disease
            # Only mark the text as truncated when it actually was.
            shown = disease if len(disease) <= 80 else disease[:80] + '...'
            print(f"  Disease: {shown}")
        print(f"  ACMG: {v.acmg_class}")
        print(f"  Evidence: {'; '.join(v.acmg_evidence)}")
        print()


def analyze_trio_with_clinvar(
    snpeff_vcf: str,
    clinvar_path: str,
    output_path: str,
    proband_idx: int = 0,
    father_idx: int = 1,
    mother_idx: int = 2
):
    """Annotate a trio VCF with ClinVar and ACMG classes and write a report.

    For every record in which the proband carries an alternate allele, the
    inheritance pattern is derived from the three genotype classes. Each
    ALT allele the proband actually carries is then looked up in ClinVar
    and classified. Variants that are ClinVar-pathogenic, classified
    (Likely) Pathogenic, or de novo with HIGH/MODERATE impact are listed as
    clinically relevant. A report is written to ``output_path`` and a short
    summary is printed.

    NOTE: the SnpEff annotation is taken from the first ANN entry of the
    record and is shared by all of its ALT alleles.

    Args:
        snpeff_vcf: SnpEff-annotated multi-sample VCF (``.gz`` supported).
        clinvar_path: ClinVar VCF (``.gz`` supported).
        output_path: Destination of the text report.
        proband_idx: 0-based sample column of the proband.
        father_idx: 0-based sample column of the father.
        mother_idx: 0-based sample column of the mother.

    Raises:
        ValueError: If a genotype record is met before any ``#CHROM`` header
            that names the samples.
    """
    clinvar_db = load_clinvar_vcf(clinvar_path)
    classifier = ACMGClassifier()

    print(f"Processing {snpeff_vcf}...")

    samples = []
    results = []
    pathogenic_variants = []
    proband = father = mother = None

    with _open_vcf_text(snpeff_vcf) as f:
        for line in f:
            if line.startswith('##'):
                continue
            if line.startswith('#CHROM'):
                samples = line.strip().split('\t')[9:]
                # Resolve the trio roles once per header instead of per record.
                proband = _pick_sample(samples, proband_idx, 0, 'proband')
                father = _pick_sample(samples, father_idx, 1, 'father')
                mother = _pick_sample(samples, mother_idx, 2, 'mother')
                continue

            parts = line.strip().split('\t')
            if len(parts) < 10:
                continue
            if not samples:
                raise ValueError(
                    f"{snpeff_vcf}: genotype record found before a #CHROM header naming the samples"
                )

            chrom, pos, _, ref, alt, _qual, _filt, info, fmt = parts[:9]
            gt_fields = parts[9:]

            # Parse genotypes; zip() tolerates rows with fewer genotype
            # columns than the header (those samples read as missing).
            fmt_parts = fmt.split(':')
            gt_idx = fmt_parts.index('GT') if 'GT' in fmt_parts else 0

            genotypes = {}
            for sample, gt_field in zip(samples, gt_fields):
                gt_data = gt_field.split(':')
                genotypes[sample] = gt_data[gt_idx] if gt_idx < len(gt_data) else './.'

            # Only process variants in proband
            proband_raw = genotypes.get(proband, './.')
            proband_gt = get_genotype_class(proband_raw)
            if proband_gt in ('HOM_REF', 'MISSING'):
                continue

            father_gt = get_genotype_class(genotypes.get(father, './.'))
            mother_gt = get_genotype_class(genotypes.get(mother, './.'))
            is_de_novo, inheritance = _classify_inheritance(proband_gt, father_gt, mother_gt)

            ann = parse_snpeff_annotation(info)

            # Allele numbers present in the proband's call; ALT allele k is
            # written as "k" in GT (1-based).
            proband_alleles = set(re.split(r'[/|]', proband_raw))

            for allele_number, a in enumerate(alt.split(','), start=1):
                # Skip ALT alleles of a multi-allelic record that the
                # proband does not carry.
                if str(allele_number) not in proband_alleles:
                    continue

                variant = AnnotatedVariant(
                    chrom=chrom,
                    pos=int(pos),
                    ref=ref,
                    alt=a,
                    gene=ann['gene'],
                    effect=ann['effect'],
                    impact=ann['impact'],
                    genotypes=genotypes,
                    inheritance_pattern=inheritance
                )

                clinvar_entry = _lookup_clinvar(clinvar_db, chrom, pos, ref, a)
                if clinvar_entry:
                    variant.clinvar_sig = clinvar_entry.clnsig
                    variant.clinvar_disease = clinvar_entry.clndn
                    variant.clinvar_review = clinvar_entry.clnrevstat

                # ACMG classification
                acmg_class, evidence = classifier.classify(variant, is_de_novo)
                variant.acmg_class = acmg_class
                variant.acmg_evidence = evidence

                # Filter for clinically relevant variants
                if (variant.clinvar_sig and 'pathogenic' in variant.clinvar_sig.lower()) or \
                   acmg_class in ['Pathogenic', 'Likely Pathogenic'] or \
                   (is_de_novo and ann['impact'] in ['HIGH', 'MODERATE']):
                    pathogenic_variants.append(variant)

                results.append(variant)

    # Generate report
    print(f"Writing report to {output_path}...")
    _write_report(output_path, snpeff_vcf, clinvar_path, samples, results, pathogenic_variants)

    print("\nAnalysis complete!")
    print(f"Clinically relevant variants: {len(pathogenic_variants)}")
    print(f"Report saved to: {output_path}")

    # Print top candidates
    _print_top_candidates(pathogenic_variants)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    cli_args = sys.argv[1:]

    def _arg_or(position, fallback):
        """Return positional CLI argument ``position`` (0-based) or ``fallback``."""
        return cli_args[position] if position < len(cli_args) else fallback

    # Positional arguments 1-3: input VCF, ClinVar VCF, report path.
    snpeff_vcf = _arg_or(0, '/Volumes/NV2/genomics_analysis/vcf/trio_joint.snpeff.vcf')
    clinvar_path = _arg_or(1, '/Volumes/NV2/genomics_reference/clinvar/clinvar_GRCh37.vcf.gz')
    output_path = _arg_or(2, '/Volumes/NV2/genomics_analysis/clinvar_acmg_report.txt')

    # Positional arguments 4-6: sample column indices of the trio.
    # VCF sample order: NV0066-08_S33 (idx 0), NV0066-09_S34 (idx 1), NV0066-10_S35 (idx 2)
    # Correct mapping: S35 = proband (II-3), S33 = parent, S34 = parent
    proband_idx = int(_arg_or(3, 2))  # S35 is proband
    father_idx = int(_arg_or(4, 0))   # S33
    mother_idx = int(_arg_or(5, 1))   # S34

    analyze_trio_with_clinvar(
        snpeff_vcf, clinvar_path, output_path, proband_idx, father_idx, mother_idx
    )
|
||||
Reference in New Issue
Block a user