#!/usr/bin/env python3
"""
Comprehensive PharmGKB Analysis Script
Uses full PharmGKB clinical annotations database for pharmacogenomics analysis.
"""

import gzip
import sys
import os
import re
from collections import defaultdict
from typing import Dict, List, Set, Tuple

# PharmGKB database paths.
# The directory defaults to the original local reference volume; setting the
# PHARMGKB_DIR environment variable to a non-empty value redirects both files,
# so the script can run on machines where the data lives elsewhere.
PHARMGKB_DIR = os.environ.get("PHARMGKB_DIR") or "/Volumes/NV2/genomics_reference/pharmgkb"
ANNOTATIONS_FILE = f"{PHARMGKB_DIR}/clinical_annotations.tsv"
ALLELES_FILE = f"{PHARMGKB_DIR}/clinical_ann_alleles.tsv"


def load_pharmgkb_annotations() -> Tuple[Dict, Dict]:
    """Load PharmGKB clinical annotations and their per-genotype details.

    Reads the two tab-separated files named by the module-level
    ``ANNOTATIONS_FILE`` and ``ALLELES_FILE`` constants. The first line of
    each file is treated as a header and skipped.

    Returns:
        A ``(annotations, allele_info)`` tuple:

        * ``annotations`` maps a variant ID starting with ``rs`` to a list of
          dicts with the keys ``ann_id``, ``gene``, ``evidence_level``,
          ``phenotype_category``, ``drugs`` and ``phenotypes``. Rows whose
          variant column does not start with ``rs``, or that have fewer than
          11 columns, are ignored.
        * ``allele_info`` maps an annotation ID to a dict keyed by the value
          of the second column (genotype), each entry holding ``text`` and
          ``function``. Rows with fewer than 3 columns are ignored.

    Raises:
        OSError: If either file cannot be opened.
    """
    annotations = {}
    print(f"Loading PharmGKB annotations from {ANNOTATIONS_FILE}...")

    # Decode explicitly as UTF-8 so the result does not depend on the locale.
    with open(ANNOTATIONS_FILE, 'r', encoding='utf-8') as f:
        f.readline()  # skip the header row
        for line in f:
            # Remove only the line terminator before splitting: stripping the
            # whole line would also eat leading/trailing tabs and thereby drop
            # empty columns at either edge of the row. Each field is then
            # trimmed individually.
            parts = [field.strip() for field in line.rstrip('\r\n').split('\t')]
            if len(parts) < 11:
                continue

            variant = parts[1]  # rsid or haplotype
            # Only process rs variants (SNPs)
            if not variant.startswith('rs'):
                continue

            annotations.setdefault(variant, []).append({
                'ann_id': parts[0],
                'gene': parts[2],
                'evidence_level': parts[3],
                'phenotype_category': parts[7],
                'drugs': parts[10],
                # Column 11 is optional; the length check above only
                # guarantees indexes 0-10.
                'phenotypes': parts[11] if len(parts) > 11 else "",
            })

    # Load allele-specific information
    allele_info = {}
    print(f"Loading allele information from {ALLELES_FILE}...")

    with open(ALLELES_FILE, 'r', encoding='utf-8') as f:
        f.readline()  # skip the header row
        for line in f:
            parts = [field.strip() for field in line.rstrip('\r\n').split('\t')]
            if len(parts) < 3:
                continue

            ann_id, genotype, annotation_text = parts[0], parts[1], parts[2]
            allele_function = parts[3] if len(parts) > 3 else ""

            allele_info.setdefault(ann_id, {})[genotype] = {
                'text': annotation_text,
                'function': allele_function,
            }

    print(f"Loaded {len(annotations)} unique variants with annotations")
    return annotations, allele_info


def get_genotype_class(gt: str) -> str:
    """Classify a VCF-style GT value.

    Args:
        gt: Genotype string with allele indexes separated by ``/`` or ``|``
            (for example ``0/1`` or ``1|1``); ``.`` marks a missing allele.

    Returns:
        * ``'MISSING'`` when every allele is missing (``.``, ``./.``, ``.|.``,
          an empty string, ...).
        * ``'HOM_REF'`` when every allele is ``0``.
        * ``'HOM_ALT'`` when every allele is the same non-reference allele.
        * ``'HET'`` otherwise. This includes calls with two different
          alternate alleles such as ``1/2`` and partially missing calls such
          as ``0/.``.
    """
    alleles = re.split('[/|]', gt)

    if all(a in ('', '.') for a in alleles):
        return 'MISSING'
    if all(a == '0' for a in alleles):
        return 'HOM_REF'
    # Homozygous-alternate requires identical alleles; "1/2" carries two
    # different alternates and is therefore heterozygous.
    if '0' not in alleles and '.' not in alleles and len(set(alleles)) == 1:
        return 'HOM_ALT'
    return 'HET'


def get_genotype_string(gt: str, ref: str, alt: str) -> str:
    """Translate a numeric GT value into its allele sequences.

    Args:
        gt: Genotype with allele indexes separated by ``/`` or ``|``.
        ref: Reference allele (index 0).
        alt: Comma-separated alternate alleles (indexes 1, 2, ...).

    Returns:
        ``'N/A'`` for the missing genotypes ``./.``, ``.|.`` and ``.``;
        otherwise the alleles joined with ``/``. Any index that is not a
        number or that points past the known alleles is rendered as ``?``.
    """
    if gt in ('./.', '.|.', '.'):
        return 'N/A'

    allele_table = [ref, *alt.split(',')]

    def decode(code: str) -> str:
        # Non-numeric codes (e.g. a single missing allele) have no sequence.
        if not code.isdigit():
            return '?'
        position = int(code)
        return allele_table[position] if position < len(allele_table) else '?'

    return '/'.join(decode(code) for code in re.split('[/|]', gt))


def parse_vcf_for_pharmgkb(vcf_path: str, sample_idx: int, annotations: Dict) -> Tuple[Dict, List[str]]:
    """Scan a VCF for records whose ID has a PharmGKB annotation.

    Args:
        vcf_path: Path to the VCF; a name ending in ``.gz`` is read through
            ``gzip``.
        sample_idx: Zero-based position of the sample among the genotype
            columns (the columns after FORMAT).
        annotations: Mapping keyed by rsID; its values are attached to the
            matching records unchanged.

    Returns:
        A ``(found_variants, samples)`` tuple. ``found_variants`` maps each
        matched rsID to a dict with the keys ``rsid``, ``chrom``, ``pos``,
        ``ref``, ``alt``, ``genotype``, ``genotype_class``,
        ``genotype_string`` and ``annotations``. If the same rsID occurs on
        several records, the last one wins. ``samples`` is the list of sample
        names from the ``#CHROM`` header line (empty if there is none).

    Raises:
        OSError: If the VCF cannot be opened.
    """
    print(f"Scanning VCF for {len(annotations)} PharmGKB variants...")

    found_variants = {}
    samples = []

    is_gzipped = vcf_path.endswith('.gz')
    open_func = gzip.open if is_gzipped else open
    mode = 'rt' if is_gzipped else 'r'

    with open_func(vcf_path, mode, encoding='utf-8') as f:
        for line in f:
            if line.startswith('##'):
                continue
            if line.startswith('#CHROM'):
                samples = line.rstrip('\r\n').split('\t')[9:]
                chosen = samples[sample_idx] if sample_idx < len(samples) else 'N/A'
                print(f"Found {len(samples)} samples, analyzing index {sample_idx}: {chosen}")
                continue

            parts = line.rstrip('\r\n').split('\t')
            if len(parts) < 10:
                continue

            chrom, pos, id_field, ref, alt = parts[:5]
            fmt = parts[8]
            gt_fields = parts[9:]

            # The VCF ID column may hold several identifiers separated by
            # ';', so test each of them rather than the raw field.
            matched_ids = [vid for vid in id_field.split(';') if vid in annotations]
            if not matched_ids:
                continue

            # Get sample genotype. Without a GT key in FORMAT there is no
            # genotype to read; treat it as missing instead of interpreting
            # whichever field happens to come first as a genotype.
            gt = './.'
            fmt_keys = fmt.split(':')
            if 'GT' in fmt_keys and sample_idx < len(gt_fields):
                gt_idx = fmt_keys.index('GT')
                sample_values = gt_fields[sample_idx].split(':')
                if gt_idx < len(sample_values):
                    gt = sample_values[gt_idx]

            gt_class = get_genotype_class(gt)
            gt_string = get_genotype_string(gt, ref, alt)

            for rsid in matched_ids:
                found_variants[rsid] = {
                    'rsid': rsid,
                    'chrom': chrom,
                    'pos': pos,
                    'ref': ref,
                    'alt': alt,
                    'genotype': gt,
                    'genotype_class': gt_class,
                    'genotype_string': gt_string,
                    'annotations': annotations[rsid],
                }

    return found_variants, samples


def _truncate(text: str, limit: int) -> str:
    """Return *text* unchanged if it fits in *limit* characters, else cut it and append '...'."""
    return f"{text[:limit]}..." if len(text) > limit else text


def _genotype_matches(gt_string: str, geno: str) -> bool:
    """Return True if a sample genotype such as 'A/G' corresponds to an allele-table key.

    Two forms are accepted: the strings are equal once '/' separators are
    removed, or the set of sample alleles equals the set of characters in
    *geno* (an order-insensitive comparison, so 'A/G' matches 'GA').
    """
    if gt_string.replace('/', '') == geno.replace('/', ''):
        return True
    return set(gt_string.split('/')) == set(geno)


def generate_comprehensive_report(found_variants: Dict, allele_info: Dict,
                                   output_path: str, sample_name: str):
    """Write the plain-text pharmacogenomics report to *output_path*.

    Args:
        found_variants: Mapping of rsID to a variant dict; the keys read here
            are ``genotype_string``, ``genotype_class`` and ``annotations``
            (a list of dicts with ``ann_id``, ``gene``, ``evidence_level``,
            ``phenotype_category`` and ``drugs``).
        allele_info: Mapping of annotation ID to a dict keyed by genotype,
            each value holding ``text`` and ``function``.
        output_path: Destination file; it is overwritten.
        sample_name: Name printed in the report header.

    The report contains, in order: a header, annotation counts per evidence
    level, the level 1A/1B findings with any matching genotype-specific text,
    up to 50 level 2A/2B findings, a per-category summary and a tab-separated
    list of every annotation.

    Raises:
        OSError: If the output file cannot be opened for writing.
    """
    rule = "=" * 80

    # Categorize by evidence level and drug class
    by_evidence = defaultdict(list)
    by_category = defaultdict(list)

    for rsid, var in found_variants.items():
        for ann in var['annotations']:
            entry = (rsid, var, ann)
            by_evidence[ann['evidence_level']].append(entry)
            if ann['phenotype_category']:
                by_category[ann['phenotype_category']].append(entry)

    def gene_of(entry):
        return entry[2]['gene']

    # Annotation text can contain non-ASCII characters, so write UTF-8
    # explicitly instead of relying on the locale's default encoding.
    with open(output_path, 'w', encoding='utf-8') as f:

        def banner(title: str, prefix: str = "") -> None:
            # Section heading framed by two rules and followed by a blank line.
            f.write(f"{prefix}{rule}\n{title}\n{rule}\n\n")

        banner("COMPREHENSIVE PHARMACOGENOMICS REPORT\n"
               "Based on PharmGKB Clinical Annotations Database")
        f.write(f"Sample: {sample_name}\n")
        f.write(f"Total variants with PharmGKB annotations: {len(found_variants)}\n\n")

        # Summary statistics
        banner("SUMMARY BY EVIDENCE LEVEL")
        f.write("Level 1A: Annotation based on CPIC or DPWG guideline\n")
        f.write("Level 1B: Annotation based on FDA or EMA label\n")
        f.write("Level 2A: Moderate clinical significance\n")
        f.write("Level 2B: Lower clinical significance\n")
        f.write("Level 3: Low evidence\n")
        f.write("Level 4: In vitro/preclinical evidence only\n\n")

        for level in ['1A', '1B', '2A', '2B', '3', '4']:
            f.write(f"  Level {level}: {len(by_evidence.get(level, []))} annotations\n")

        # High evidence findings (1A, 1B)
        banner("HIGH EVIDENCE FINDINGS (Level 1A/1B - CPIC/DPWG Guidelines & FDA Labels)",
               prefix="\n")

        high_evidence = by_evidence.get('1A', []) + by_evidence.get('1B', [])
        if high_evidence:
            for rsid, var, ann in sorted(high_evidence, key=gene_of):
                gt_string = var['genotype_string']
                f.write(f"GENE: {ann['gene']} ({rsid})\n")
                f.write(f"  Genotype: {gt_string} ({var['genotype_class']})\n")
                f.write(f"  Drug(s): {ann['drugs']}\n")
                f.write(f"  Category: {ann['phenotype_category']}\n")
                f.write(f"  Evidence Level: {ann['evidence_level']}\n")

                # Report the first allele-table entry matching the sample's
                # genotype; stop there even if that entry has no text.
                for geno, info in allele_info.get(ann['ann_id'], {}).items():
                    if not _genotype_matches(gt_string, geno):
                        continue
                    if info['text']:
                        f.write(f"  Clinical Annotation: {_truncate(info['text'], 500)}\n")
                    if info['function']:
                        f.write(f"  Allele Function: {info['function']}\n")
                    break
                f.write("\n")
        else:
            f.write("  No high-evidence findings.\n\n")

        # Moderate evidence findings (2A, 2B)
        banner("MODERATE EVIDENCE FINDINGS (Level 2A/2B)")

        moderate_evidence = by_evidence.get('2A', []) + by_evidence.get('2B', [])
        if moderate_evidence:
            # Only the first 50 (ordered by gene name) are listed in full.
            for rsid, var, ann in sorted(moderate_evidence, key=gene_of)[:50]:
                f.write(f"GENE: {ann['gene']} ({rsid})\n")
                f.write(f"  Genotype: {var['genotype_string']}\n")
                f.write(f"  Drug(s): {ann['drugs']}\n")
                f.write(f"  Category: {ann['phenotype_category']}\n")
                f.write(f"  Level: {ann['evidence_level']}\n\n")

            if len(moderate_evidence) > 50:
                f.write(f"  ... and {len(moderate_evidence) - 50} more moderate evidence findings\n\n")
        else:
            f.write("  No moderate-evidence findings.\n\n")

        # Summary by phenotype category
        banner("SUMMARY BY PHENOTYPE CATEGORY")

        for category in sorted(by_category):
            items = by_category[category]
            f.write(f"\n## {category}: {len(items)} annotations\n")
            f.write("-" * 40 + "\n")

            # Show at most five of the better-supported items per category.
            strongest = [e for e in items if e[2]['evidence_level'] in ('1A', '1B', '2A')]
            for rsid, var, ann in strongest[:5]:
                f.write(f"  {ann['gene']} ({rsid}): {_truncate(ann['drugs'], 50)}\n")

        # Full detailed list
        banner("COMPLETE VARIANT LIST", prefix="\n")

        f.write("RSID\tGENE\tGENOTYPE\tLEVEL\tCATEGORY\tDRUGS\n")
        for rsid, var in sorted(found_variants.items()):
            for ann in var['annotations']:
                f.write(f"{rsid}\t{ann['gene']}\t{var['genotype_string']}\t"
                        f"{ann['evidence_level']}\t{ann['phenotype_category']}\t"
                        f"{_truncate(ann['drugs'], 30)}\n")

    print(f"Report saved to: {output_path}")


def main():
    """Run the PharmGKB analysis from the command line.

    Positional arguments, all optional:
        1. Path to the input VCF.
        2. Path of the report file to write.
        3. Zero-based index of the sample column to analyze (default 2).

    Loads the PharmGKB tables, scans the VCF, prints per-level counts, writes
    the full report and finally echoes the level 1A/1B findings to stdout.
    """
    vcf_path = sys.argv[1] if len(sys.argv) > 1 else '/Volumes/NV2/genomics_analysis/vcf/trio_joint.snpeff.vcf'
    output_path = sys.argv[2] if len(sys.argv) > 2 else '/Volumes/NV2/genomics_analysis/pharmgkb_full_report.txt'

    sample_idx = 2
    if len(sys.argv) > 3:
        try:
            sample_idx = int(sys.argv[3])
        except ValueError:
            # Exit with a readable message instead of a traceback.
            sys.exit(f"Sample index must be an integer, got {sys.argv[3]!r}")

    print("=" * 60)
    print("COMPREHENSIVE PHARMGKB ANALYSIS")
    print("=" * 60)
    print(f"VCF: {vcf_path}")
    print(f"Sample index: {sample_idx}")
    print()

    # Load PharmGKB database
    annotations, allele_info = load_pharmgkb_annotations()

    # Parse VCF
    found_variants, samples = parse_vcf_for_pharmgkb(vcf_path, sample_idx, annotations)

    # Fall back to a synthetic name when the index is beyond the VCF's samples.
    sample_name = samples[sample_idx] if sample_idx < len(samples) else f"Sample_{sample_idx}"
    print(f"\nFound {len(found_variants)} variants with PharmGKB annotations")

    # Count by evidence level
    level_counts = defaultdict(int)
    for var in found_variants.values():
        for ann in var['annotations']:
            level_counts[ann['evidence_level']] += 1

    print("\nAnnotations by evidence level:")
    for level in ['1A', '1B', '2A', '2B', '3', '4']:
        print(f"  Level {level}: {level_counts.get(level, 0)}")

    # Generate report
    generate_comprehensive_report(found_variants, allele_info, output_path, sample_name)

    # Print high-evidence findings to console
    print("\n" + "=" * 60)
    print("HIGH EVIDENCE FINDINGS (Level 1A/1B)")
    print("=" * 60)

    for rsid, var in found_variants.items():
        for ann in var['annotations']:
            if ann['evidence_level'] not in ('1A', '1B'):
                continue
            drugs = ann['drugs']
            # Add the ellipsis only when the drug list was actually cut.
            drugs_shown = f"{drugs[:80]}..." if len(drugs) > 80 else drugs
            print(f"\n{ann['gene']} ({rsid})")
            print(f"  Genotype: {var['genotype_string']}")
            print(f"  Drug(s): {drugs_shown}")
            print(f"  Level: {ann['evidence_level']}")

# Run the analysis only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()