# genomic-consultant/clinvar_acmg_annotate.py

#!/usr/bin/env python3
"""
ClinVar Annotation and ACMG Classification Script
Integrates ClinVar lookup with ACMG auto-classification for trio analysis.
"""

import gzip
import re
import sys
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Set, Tuple
from pathlib import Path

# Put the "src" directory that sits next to this script at the front of the
# import path; presumably that is where the genomic_consultant package imported
# below lives (verify against the repository layout).
sys.path.insert(0, str(Path(__file__).parent / "src"))

# Optional integration with the project's own ACMG tagger. HAS_PROJECT_MODULES
# records whether the import succeeded. NOTE(review): none of the names
# imported here, nor the flag itself, is referenced in the rest of the code
# visible in this file, which always uses the built-in ACMGClassifier.
try:
    from genomic_consultant.acmg.tagger import ACMGConfig, tag_variant, _is_lof
    from genomic_consultant.utils.models import Variant, EvidenceTag, SuggestedClassification
    HAS_PROJECT_MODULES = True
except ImportError:
    HAS_PROJECT_MODULES = False
    print("Warning: Project modules not found, using built-in ACMG classification")


@dataclass
class ClinVarEntry:
    """One ClinVar record for a single (chrom, pos, ref, alt) allele.

    Instances are created by ``load_clinvar_vcf``: the positional fields come
    from the fixed VCF columns and the ``cln*`` fields from the INFO keys of
    the same name (upper-cased). An INFO key that is absent from a record is
    stored as an empty string.
    """
    chrom: str  # CHROM column, kept exactly as written in the ClinVar VCF
    pos: int  # POS column converted with int()
    ref: str  # REF column
    alt: str  # a single ALT allele (comma-separated ALT lists are split by the loader)
    clnsig: str  # CLNSIG: clinical significance
    clndn: str   # CLNDN: disease name
    clnrevstat: str  # CLNREVSTAT: review status
    clnvc: str   # CLNVC: variant type
    af: Optional[float] = None  # never filled in by load_clinvar_vcf, so it stays None there


@dataclass
class AnnotatedVariant:
    """A single ALT allele together with every annotation attached to it.

    Only the four positional fields are required; all annotation fields start
    out empty and are filled in as the analysis proceeds.
    """
    # --- locus ---
    chrom: str
    pos: int
    ref: str
    alt: str
    # --- SnpEff annotation ---
    gene: Optional[str] = None
    effect: Optional[str] = None
    impact: Optional[str] = None
    # --- per-sample genotype strings, keyed by sample name ---
    genotypes: Dict[str, str] = field(default_factory=dict)
    # --- ClinVar lookup result (None when the allele was not found) ---
    clinvar_sig: Optional[str] = None
    clinvar_disease: Optional[str] = None
    clinvar_review: Optional[str] = None
    # --- ACMG classification result ---
    acmg_class: Optional[str] = None
    acmg_evidence: List[str] = field(default_factory=list)
    # --- trio segregation label: de_novo, compound_het, hom_rec, etc. ---
    inheritance_pattern: Optional[str] = None

    @property
    def variant_id(self) -> str:
        """Return the ``chrom-pos-ref-alt`` identifier of this allele."""
        locus = (self.chrom, self.pos, self.ref, self.alt)
        return "-".join(map(str, locus))


def load_clinvar_vcf(clinvar_path: str) -> Dict[str, ClinVarEntry]:
    """Read a ClinVar VCF (plain or ``.gz``) into a dict keyed by allele.

    Keys have the form ``chrom-pos-ref-alt``; a record with a comma-separated
    ALT column yields one entry per allele, all sharing the record's INFO
    values. Header lines and rows with fewer than eight columns are ignored.
    Progress messages are printed to stdout.
    """
    print(f"Loading ClinVar database from {clinvar_path}...")

    if clinvar_path.endswith('.gz'):
        opener, open_mode = gzip.open, 'rt'
    else:
        opener, open_mode = open, 'r'

    lookup = {}
    n_entries = 0

    with opener(clinvar_path, open_mode) as handle:
        for record in handle:
            if record.startswith('#'):
                continue

            columns = record.strip().split('\t')
            if len(columns) < 8:
                continue

            chrom, pos, ref, alt_field, info = (
                columns[0], columns[1], columns[3], columns[4], columns[7]
            )

            # key=value INFO items only; bare flags carry nothing we need.
            # A repeated key keeps its last value.
            info_map = dict(
                pair.split('=', 1) for pair in info.split(';') if '=' in pair
            )
            significance = info_map.get('CLNSIG', '')
            disease = info_map.get('CLNDN', '')
            review = info_map.get('CLNREVSTAT', '')
            variant_type = info_map.get('CLNVC', '')

            # One entry per ALT allele
            for allele in alt_field.split(','):
                lookup[f"{chrom}-{pos}-{ref}-{allele}"] = ClinVarEntry(
                    chrom=chrom,
                    pos=int(pos),
                    ref=ref,
                    alt=allele,
                    clnsig=significance,
                    clndn=disease,
                    clnrevstat=review,
                    clnvc=variant_type
                )
                n_entries += 1

    print(f"Loaded {n_entries} ClinVar entries")
    return lookup


def parse_snpeff_annotation(info: str) -> Dict:
    """Extract gene, effect, impact and HGVS names from a SnpEff ``ANN`` entry.

    Args:
        info: The raw INFO column of a VCF record (``key=value`` items
            separated by ``;``).

    Returns:
        A dict with the keys ``gene``, ``effect``, ``impact``, ``hgvs_c`` and
        ``hgvs_p``. Only the first comma-separated annotation of the ``ANN``
        value is read. A value is ``None`` when INFO has no ``ANN`` key, when
        that first annotation has fewer than four ``|``-separated fields, or
        when the corresponding field is absent or empty.
    """
    result = {
        'gene': None,
        'effect': None,
        'impact': None,
        'hgvs_c': None,
        'hgvs_p': None,
    }

    # Anchor on an INFO key boundary so that a key which merely ends in "ANN"
    # (e.g. "XANN=") is not mistaken for the SnpEff annotation.
    ann_match = re.search(r'(?:^|;)ANN=([^;]+)', info)
    if not ann_match:
        return result

    first_annotation = ann_match.group(1).split(',', 1)[0]
    fields = first_annotation.split('|')
    if len(fields) < 4:
        return result

    # Position of each wanted value inside the pipe-separated annotation.
    field_positions = (
        ('effect', 1),
        ('impact', 2),
        ('gene', 3),
        ('hgvs_c', 9),
        ('hgvs_p', 10),
    )
    for key, idx in field_positions:
        # Empty fields stay None so callers see one "no value" marker only.
        if idx < len(fields) and fields[idx]:
            result[key] = fields[idx]

    return result


def get_genotype_class(gt: str) -> str:
    """Classify a VCF GT string as MISSING, HOM_REF, HET or HOM_ALT.

    Alleles may be separated by ``/`` or ``|`` and any ploidy is accepted.

    * ``MISSING``: no allele was called (``./.``, ``.``, an empty string,
      ...), or only reference alleles were called next to an uncalled one
      (``0/.``), where an ALT allele cannot be ruled out.
    * ``HOM_REF``: every allele is called and is the reference (``0``).
    * ``HOM_ALT``: every allele is called and is the same ALT allele.
    * ``HET``: at least one ALT allele is present together with a reference
      allele, an uncalled allele, or a different ALT allele (``0/1``,
      ``./1``, ``1/2``).
    """
    alleles = re.split('[/|]', gt)
    called = [a for a in alleles if a not in ('.', '')]
    if not called:
        return 'MISSING'

    partially_called = len(called) < len(alleles)
    alt_alleles = {a for a in called if a != '0'}

    if not alt_alleles:
        # Only reference alleles were seen; with an uncalled allele the site
        # is not known to be homozygous reference.
        return 'MISSING' if partially_called else 'HOM_REF'

    if partially_called or '0' in called or len(alt_alleles) > 1:
        return 'HET'
    return 'HOM_ALT'


class ACMGClassifier:
    """Heuristic ACMG-style variant classifier.

    Evidence is collected from the ClinVar significance string, the predicted
    effect/impact and the de novo status of a variant, then folded into one of
    ``Pathogenic``, ``Likely Pathogenic``, ``VUS``, ``Likely Benign`` or
    ``Benign``.

    NOTE(review): the criteria used here (e.g. PM4 for every HIGH-impact
    variant, PP3 for every MODERATE missense variant) are coarse heuristics
    of this script; confirm against the ACMG guidelines before relying on the
    output.
    """

    def __init__(self, lof_genes: Optional[Set[str]] = None):
        """Create a classifier.

        Args:
            lof_genes: Upper-case symbols of genes regarded as sensitive to
                loss of function. ``None`` selects the built-in list; an
                explicitly passed empty set is honoured and disables the full
                strength PVS1 criterion.
        """
        if lof_genes is None:
            lof_genes = {
                'BRCA1', 'BRCA2', 'TP53', 'PTEN', 'MLH1', 'MSH2', 'MSH6', 'PMS2',
                'APC', 'MEN1', 'RB1', 'VHL', 'WT1', 'NF1', 'NF2', 'TSC1', 'TSC2'
            }
        self.lof_genes = lof_genes
        # Allele-frequency thresholds; no method of this class reads them yet.
        self.ba1_af = 0.05
        self.bs1_af = 0.01
        self.pm2_af = 0.0005

    def classify(self, variant: "AnnotatedVariant", is_de_novo: bool = False) -> Tuple[str, List[str]]:
        """Collect evidence for ``variant`` and derive its class.

        Args:
            variant: Object exposing ``clinvar_sig``, ``effect``, ``gene`` and
                ``impact`` attributes.
            is_de_novo: Whether the variant is absent from both parents.

        Returns:
            ``(classification, evidence)`` where ``evidence`` is a list of
            ``"CODE: description"`` strings in the order they were collected.
        """
        evidence = []

        # ClinVar evidence. A "Conflicting_..._of_pathogenicity" assertion
        # contains the substring "pathogenic" but supports neither direction,
        # so it must not contribute PP5/BP6.
        if variant.clinvar_sig:
            sig_lower = variant.clinvar_sig.lower()
            if 'conflicting' not in sig_lower:
                if 'pathogenic' in sig_lower and 'likely' not in sig_lower:
                    evidence.append("PP5: ClinVar pathogenic")
                elif 'likely_pathogenic' in sig_lower:
                    evidence.append("PP5: ClinVar likely pathogenic")
                elif 'benign' in sig_lower and 'likely' not in sig_lower:
                    evidence.append("BP6: ClinVar benign")
                elif 'likely_benign' in sig_lower:
                    evidence.append("BP6: ClinVar likely benign")

        # Loss of function in LoF-sensitive gene (PVS1)
        if variant.effect and variant.gene:
            lof_keywords = ['frameshift', 'stop_gained', 'splice_acceptor', 'splice_donor', 'start_lost']
            effect_lower = variant.effect.lower()
            if any(k in effect_lower for k in lof_keywords):
                if variant.gene.upper() in self.lof_genes:
                    evidence.append("PVS1: Null variant in LoF-sensitive gene")
                else:
                    # Deliberately a different code: it does not count as
                    # PVS1 when the final class is determined.
                    evidence.append("PVS1_moderate: Null variant (gene not confirmed LoF-sensitive)")

        # De novo (PS2)
        if is_de_novo:
            evidence.append("PS2: De novo variant")

        # Impact-based evidence
        if variant.impact == 'HIGH':
            evidence.append("PM4: Protein length change (HIGH impact)")
        elif variant.impact == 'MODERATE':
            if variant.effect and 'missense' in variant.effect.lower():
                evidence.append("PP3: Computational evidence (missense)")

        # Determine final classification
        classification = self._determine_class(evidence, variant.clinvar_sig)

        return classification, evidence

    def _determine_class(self, evidence: List[str], clinvar_sig: Optional[str]) -> str:
        """Fold evidence strings and the ClinVar assertion into a final class.

        A non-conflicting ClinVar pathogenic/benign assertion decides the class
        on its own (the review status is not consulted here). Otherwise the
        evidence codes, i.e. the text before the first ``:`` of each item,
        are combined:

        * PVS1 and PS2 -> ``Pathogenic``
        * PVS1, or PS2 and PM4 -> ``Likely Pathogenic``
        * any BP* code, no PP* code and no PVS1 -> ``Likely Benign``
        * anything else -> ``VUS``
        """
        # ClinVar takes precedence unless submitters disagree
        if clinvar_sig:
            sig_lower = clinvar_sig.lower()
            if 'pathogenic' in sig_lower and 'conflicting' not in sig_lower:
                if 'likely' in sig_lower:
                    return 'Likely Pathogenic'
                return 'Pathogenic'
            elif 'benign' in sig_lower and 'conflicting' not in sig_lower:
                if 'likely' in sig_lower:
                    return 'Likely Benign'
                return 'Benign'

        # Rule-based classification on exact codes, so that the free-text
        # descriptions can never be mistaken for a criterion.
        codes = {item.split(':', 1)[0].strip() for item in evidence}
        has_pvs1 = 'PVS1' in codes
        has_ps2 = 'PS2' in codes
        has_pm4 = 'PM4' in codes
        has_pp = any(code.startswith('PP') for code in codes)
        has_bp = any(code.startswith('BP') for code in codes)

        if has_pvs1 and has_ps2:
            return 'Pathogenic'
        elif has_pvs1 or (has_ps2 and has_pm4):
            return 'Likely Pathogenic'
        elif has_bp and not has_pp and not has_pvs1:
            return 'Likely Benign'
        else:
            return 'VUS'


def _resolve_sample(samples: List[str], idx: int, fallback_idx: int) -> Optional[str]:
    """Return the sample name at ``idx``, else at ``fallback_idx``, else None."""
    for candidate in (idx, fallback_idx):
        if -len(samples) <= candidate < len(samples):
            return samples[candidate]
    return None


def _annotation_for_allele(info: str, allele: str) -> Dict:
    """Parse the SnpEff annotation whose allele field equals ``allele``.

    Falls back to the first annotation of the record when none matches.
    """
    ann_match = re.search(r'(?:^|;)ANN=([^;]+)', info)
    if ann_match:
        for entry in ann_match.group(1).split(','):
            if entry.split('|', 1)[0] == allele:
                return parse_snpeff_annotation('ANN=' + entry)
    return parse_snpeff_annotation(info)


def _lookup_clinvar(clinvar_db: Dict, chrom: str, pos: str, ref: str, alt: str):
    """Find the ClinVar entry for an allele, tolerating a ``chr`` prefix mismatch."""
    bare = chrom[3:] if chrom.lower().startswith('chr') else chrom
    for name in (chrom, bare, 'chr' + bare):
        entry = clinvar_db.get(f"{name}-{pos}-{ref}-{alt}")
        if entry is not None:
            return entry
    return None


def _report_rank(variant: "AnnotatedVariant") -> Tuple:
    """Sort key: Pathogenic first, then Likely Pathogenic, then by locus."""
    return (variant.acmg_class != 'Pathogenic',
            variant.acmg_class != 'Likely Pathogenic',
            variant.chrom, variant.pos)


def _write_trio_report(output_path: str, snpeff_vcf: str, clinvar_path: str,
                       samples: List[str], total_variants: int,
                       ranked: List["AnnotatedVariant"]) -> None:
    """Write the tab-separated report and its summary to ``output_path``."""
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("# ClinVar & ACMG Classification Report\n")
        f.write(f"# Input: {snpeff_vcf}\n")
        f.write(f"# ClinVar: {clinvar_path}\n")
        f.write(f"# Samples: {', '.join(samples)}\n")
        f.write(f"# Total variants processed: {total_variants}\n\n")

        f.write("## CLINICALLY RELEVANT VARIANTS\n\n")
        f.write("CHROM\tPOS\tREF\tALT\tGENE\tEFFECT\tIMPACT\tINHERITANCE\tCLINVAR_SIG\tCLINVAR_DISEASE\tACMG_CLASS\tACMG_EVIDENCE\n")

        for v in ranked:
            f.write(f"{v.chrom}\t{v.pos}\t{v.ref}\t{v.alt}\t")
            f.write(f"{v.gene or 'N/A'}\t{v.effect or 'N/A'}\t{v.impact or 'N/A'}\t")
            f.write(f"{v.inheritance_pattern or 'N/A'}\t")
            f.write(f"{v.clinvar_sig or 'N/A'}\t")
            f.write(f"{v.clinvar_disease or 'N/A'}\t")
            f.write(f"{v.acmg_class}\t")
            f.write(f"{'; '.join(v.acmg_evidence)}\n")

        # Summary statistics
        f.write("\n## SUMMARY\n")
        f.write(f"Total variants in proband: {total_variants}\n")
        f.write(f"Clinically relevant variants: {len(ranked)}\n")

        # Count by ACMG class and by inheritance pattern
        acmg_counts = defaultdict(int)
        inh_counts = defaultdict(int)
        for v in ranked:
            acmg_counts[v.acmg_class] += 1
            inh_counts[v.inheritance_pattern or 'unknown'] += 1

        f.write("\nBy ACMG Classification:\n")
        for cls in ['Pathogenic', 'Likely Pathogenic', 'VUS', 'Likely Benign', 'Benign']:
            if cls in acmg_counts:
                f.write(f"  {cls}: {acmg_counts[cls]}\n")

        f.write("\nBy Inheritance Pattern:\n")
        for inh, count in sorted(inh_counts.items()):
            f.write(f"  {inh}: {count}\n")

        # ClinVar matches
        clinvar_match = sum(1 for v in ranked if v.clinvar_sig)
        f.write(f"\nVariants with ClinVar annotation: {clinvar_match}\n")


def analyze_trio_with_clinvar(
    snpeff_vcf: str,
    clinvar_path: str,
    output_path: str,
    proband_idx: int = 0,
    father_idx: int = 1,
    mother_idx: int = 2
):
    """Annotate a SnpEff trio VCF with ClinVar and ACMG classes and write a report.

    Every ALT allele that the proband actually carries is looked up in
    ClinVar, classified with ``ACMGClassifier`` and labelled with a simple
    inheritance pattern. Alleles that are pathogenic according to ClinVar or
    the classifier, or that are de novo with HIGH/MODERATE impact, are listed
    in the report; the best candidates are also printed to stdout.

    Args:
        snpeff_vcf: SnpEff-annotated multi-sample VCF (plain or ``.gz``).
        clinvar_path: ClinVar VCF (plain or ``.gz``).
        output_path: Destination of the text report.
        proband_idx: Zero-based index of the proband among the sample columns.
        father_idx: Zero-based index of the father.
        mother_idx: Zero-based index of the mother.

    An index outside the sample list falls back to the default column
    (0, 1 and 2 respectively) with a warning on stderr; a parent that cannot
    be resolved at all is treated as having missing genotypes.

    Raises:
        IndexError: If a variant row is reached but no proband sample could
            be resolved (no ``#CHROM`` header or no sample columns).

    NOTE: the inheritance label is derived from site-level genotype classes,
    so at multi-allelic sites it does not distinguish which ALT allele a
    parent carries.
    """

    # Load ClinVar
    clinvar_db = load_clinvar_vcf(clinvar_path)

    # Initialize classifier
    classifier = ACMGClassifier()

    # Parse VCF and annotate
    print(f"Processing {snpeff_vcf}...")

    samples = []
    proband = father = mother = None
    total_variants = 0  # only the count is reported, so alleles are not retained
    pathogenic_variants = []

    open_func = gzip.open if snpeff_vcf.endswith('.gz') else open

    with open_func(snpeff_vcf, 'rt', encoding='utf-8') as f:
        for line in f:
            if line.startswith('##'):
                continue
            if line.startswith('#CHROM'):
                samples = line.rstrip('\r\n').split('\t')[9:]
                # The trio members are fixed by the header; resolve them once.
                proband = _resolve_sample(samples, proband_idx, 0)
                father = _resolve_sample(samples, father_idx, 1)
                mother = _resolve_sample(samples, mother_idx, 2)
                for role, idx, name in (('proband', proband_idx, proband),
                                        ('father', father_idx, father),
                                        ('mother', mother_idx, mother)):
                    if not -len(samples) <= idx < len(samples):
                        print(f"Warning: {role} index {idx} is out of range for "
                              f"{len(samples)} sample(s); using {name or 'no sample'}",
                              file=sys.stderr)
                continue

            parts = line.rstrip('\r\n').split('\t')
            if len(parts) < 10:
                continue

            if proband is None:
                raise IndexError(
                    f"No proband sample available in {snpeff_vcf}: "
                    "missing #CHROM header or no sample columns"
                )

            chrom, pos, _, ref, alt, _, _, info, fmt = parts[:9]
            gt_fields = parts[9:]

            # Parse genotypes. Without a GT key in FORMAT there is no genotype
            # to read, and a truncated row leaves trailing samples uncalled.
            fmt_parts = fmt.split(':')
            gt_idx = fmt_parts.index('GT') if 'GT' in fmt_parts else None

            genotypes = {}
            for i, sample in enumerate(samples):
                call = './.'
                if gt_idx is not None and i < len(gt_fields):
                    sample_fields = gt_fields[i].split(':')
                    if gt_idx < len(sample_fields):
                        call = sample_fields[gt_idx]
                genotypes[sample] = call

            # Only process variants in proband
            proband_call = genotypes.get(proband, './.')
            proband_gt = get_genotype_class(proband_call)
            if proband_gt in ('HOM_REF', 'MISSING'):
                continue

            # Check inheritance pattern (an unresolved parent counts as missing)
            father_gt = get_genotype_class(genotypes.get(father, './.'))
            mother_gt = get_genotype_class(genotypes.get(mother, './.'))

            is_de_novo = (proband_gt in ('HET', 'HOM_ALT') and
                          father_gt == 'HOM_REF' and mother_gt == 'HOM_REF')
            is_hom_rec = (proband_gt == 'HOM_ALT' and
                          father_gt == 'HET' and mother_gt == 'HET')

            inheritance = None
            if is_de_novo:
                inheritance = 'de_novo'
            elif is_hom_rec:
                inheritance = 'homozygous_recessive'
            elif proband_gt == 'HET':
                if father_gt in ('HET', 'HOM_ALT') and mother_gt == 'HOM_REF':
                    inheritance = 'paternal'
                elif mother_gt in ('HET', 'HOM_ALT') and father_gt == 'HOM_REF':
                    inheritance = 'maternal'

            # GT allele numbers are 1-based positions in the ALT list; report
            # only the ALT alleles that occur in the proband's genotype.
            proband_alleles = set(re.split('[/|]', proband_call))

            for allele_number, a in enumerate(alt.split(','), start=1):
                if str(allele_number) not in proband_alleles:
                    continue

                # Use the annotation that belongs to this allele
                ann = _annotation_for_allele(info, a)

                variant = AnnotatedVariant(
                    chrom=chrom,
                    pos=int(pos),
                    ref=ref,
                    alt=a,
                    gene=ann['gene'],
                    effect=ann['effect'],
                    impact=ann['impact'],
                    genotypes=genotypes,
                    inheritance_pattern=inheritance
                )

                # Lookup ClinVar
                clinvar_entry = _lookup_clinvar(clinvar_db, chrom, pos, ref, a)
                if clinvar_entry is not None:
                    variant.clinvar_sig = clinvar_entry.clnsig
                    variant.clinvar_disease = clinvar_entry.clndn
                    variant.clinvar_review = clinvar_entry.clnrevstat

                # ACMG classification
                acmg_class, evidence = classifier.classify(variant, is_de_novo)
                variant.acmg_class = acmg_class
                variant.acmg_evidence = evidence

                # Filter for clinically relevant variants
                if (variant.clinvar_sig and 'pathogenic' in variant.clinvar_sig.lower()) or \
                   acmg_class in ('Pathogenic', 'Likely Pathogenic') or \
                   (is_de_novo and ann['impact'] in ('HIGH', 'MODERATE')):
                    pathogenic_variants.append(variant)

                total_variants += 1

    ranked = sorted(pathogenic_variants, key=_report_rank)

    # Generate report
    print(f"Writing report to {output_path}...")
    _write_trio_report(output_path, snpeff_vcf, clinvar_path, samples,
                       total_variants, ranked)

    print("\nAnalysis complete!")
    print(f"Clinically relevant variants: {len(pathogenic_variants)}")
    print(f"Report saved to: {output_path}")

    # Print top candidates, in the same order as the report
    print("\n=== TOP PATHOGENIC CANDIDATES ===\n")
    top_variants = [v for v in ranked
                    if v.acmg_class in ('Pathogenic', 'Likely Pathogenic')][:20]

    for v in top_variants:
        print(f"{v.chrom}:{v.pos} {v.ref}>{v.alt}")
        print(f"  Gene: {v.gene} | Effect: {v.effect}")
        print(f"  Inheritance: {v.inheritance_pattern}")
        print(f"  ClinVar: {v.clinvar_sig or 'Not found'}")
        if v.clinvar_disease:
            disease = v.clinvar_disease
            if len(disease) > 80:
                # Mark the name as cut only when it really was
                disease = disease[:80] + "..."
            print(f"  Disease: {disease}")
        print(f"  ACMG: {v.acmg_class}")
        print(f"  Evidence: {'; '.join(v.acmg_evidence)}")
        print()


if __name__ == '__main__':
    # Up to six optional positional arguments, in this order:
    #   SnpEff VCF, ClinVar VCF, report path, proband idx, father idx, mother idx
    # Anything not supplied on the command line takes the default below.
    #
    # VCF sample order: NV0066-08_S33 (idx 0), NV0066-09_S34 (idx 1), NV0066-10_S35 (idx 2)
    # Correct mapping: S35 = proband (II-3), S33 = parent, S34 = parent
    cli_defaults = [
        '/Volumes/NV2/genomics_analysis/vcf/trio_joint.snpeff.vcf',
        '/Volumes/NV2/genomics_reference/clinvar/clinvar_GRCh37.vcf.gz',
        '/Volumes/NV2/genomics_analysis/clinvar_acmg_report.txt',
        2,  # proband: S35
        0,  # father: S33
        1,  # mother: S34
    ]
    cli_given = sys.argv[1:len(cli_defaults) + 1]
    cli_values = cli_given + cli_defaults[len(cli_given):]

    snpeff_vcf, clinvar_path, output_path = cli_values[:3]
    proband_idx, father_idx, mother_idx = (int(value) for value in cli_values[3:])

    analyze_trio_with_clinvar(
        snpeff_vcf,
        clinvar_path,
        output_path,
        proband_idx,
        father_idx,
        mother_idx,
    )