#!/usr/bin/env python3
"""
Comprehensive PharmGKB Analysis Script
Uses full PharmGKB clinical annotations database for pharmacogenomics analysis.
"""

import gzip
import sys
import os
import re
from collections import defaultdict
from typing import Dict, List, Set, Tuple

# PharmGKB database paths
PHARMGKB_DIR = "/Volumes/NV2/genomics_reference/pharmgkb"
ANNOTATIONS_FILE = f"{PHARMGKB_DIR}/clinical_annotations.tsv"
ALLELES_FILE = f"{PHARMGKB_DIR}/clinical_ann_alleles.tsv"

# Genotype strings that mean "no call" for the analysed sample.
_MISSING_GENOTYPES = ('./.', '.|.', '.')

# Evidence levels reported in the summaries, in display order.
_EVIDENCE_LEVELS = ('1A', '1B', '2A', '2B', '3', '4')

# Allele separator inside a GT value: '/' (unphased) or '|' (phased).
_ALLELE_SEPARATOR = re.compile(r'[/|]')


def _split_tsv_line(line: str) -> List[str]:
    """Split one TSV line into fields, keeping empty leading/trailing columns."""
    # Only the line terminator is removed: a plain strip() would also eat
    # empty trailing columns and make an otherwise valid row look too short.
    return line.rstrip('\r\n').split('\t')


def _truncate(text: str, limit: int) -> str:
    """Return *text* cut to *limit* characters, with '...' only when cut."""
    return f"{text[:limit]}..." if len(text) > limit else text


def _load_annotations(path: str) -> Dict[str, List[Dict[str, str]]]:
    """Read the clinical annotations TSV at *path*, grouped by rsID.

    Rows with fewer than 11 columns and rows whose variant column does not
    start with 'rs' (e.g. haplotypes) are skipped.
    """
    annotations: Dict[str, List[Dict[str, str]]] = {}
    with open(path, 'r', encoding='utf-8') as f:
        next(f, None)  # skip the header row
        for line in f:
            parts = _split_tsv_line(line)
            if len(parts) < 11:
                continue

            variant = parts[1]  # rsid or haplotype
            # Only process rs variants (SNPs)
            if not variant.startswith('rs'):
                continue

            # Column positions are the ones this script has always used;
            # NOTE(review): confirm against the PharmGKB release in use.
            annotations.setdefault(variant, []).append({
                'ann_id': parts[0],
                'gene': parts[2],
                'evidence_level': parts[3],
                'phenotype_category': parts[7],
                'drugs': parts[10],
                'phenotypes': parts[11] if len(parts) > 11 else "",
            })
    return annotations


def _load_allele_info(path: str) -> Dict[str, Dict[str, Dict[str, str]]]:
    """Read the per-genotype annotation TSV at *path*.

    Returns ``{annotation_id: {genotype: {'text': ..., 'function': ...}}}``.
    Rows with fewer than 3 columns are skipped.
    """
    allele_info: Dict[str, Dict[str, Dict[str, str]]] = {}
    with open(path, 'r', encoding='utf-8') as f:
        next(f, None)  # skip the header row
        for line in f:
            parts = _split_tsv_line(line)
            if len(parts) < 3:
                continue

            ann_id, genotype, annotation_text = parts[:3]
            allele_function = parts[3] if len(parts) > 3 else ""
            allele_info.setdefault(ann_id, {})[genotype] = {
                'text': annotation_text,
                'function': allele_function,
            }
    return allele_info


def load_pharmgkb_annotations() -> Tuple[Dict, Dict]:
    """Load PharmGKB clinical annotations and allele information.

    Reads ``ANNOTATIONS_FILE`` and ``ALLELES_FILE``.

    Returns:
        A pair ``(annotations, allele_info)`` where ``annotations`` maps an
        rsID to the list of its annotation dicts and ``allele_info`` maps an
        annotation id to ``{genotype: {'text', 'function'}}``.

    Raises:
        OSError: if either database file cannot be opened.
    """
    print(f"Loading PharmGKB annotations from {ANNOTATIONS_FILE}...")
    annotations = _load_annotations(ANNOTATIONS_FILE)

    print(f"Loading allele information from {ALLELES_FILE}...")
    allele_info = _load_allele_info(ALLELES_FILE)

    print(f"Loaded {len(annotations)} unique variants with annotations")
    return annotations, allele_info


def get_genotype_class(gt: str) -> str:
    """Classify a VCF GT value.

    Returns:
        'MISSING' for a no-call, 'HOM_REF' when every allele is the
        reference, 'HOM_ALT' when every allele is the *same* called
        non-reference allele, and 'HET' otherwise (including genotypes such
        as ``1/2`` that carry two different ALT alleles, and partially
        missing calls such as ``./1``).
    """
    if gt in _MISSING_GENOTYPES:
        return 'MISSING'

    alleles = _ALLELE_SEPARATOR.split(gt)
    if all(a == '0' for a in alleles):
        return 'HOM_REF'

    # Homozygous means one distinct allele; "no reference allele" alone is
    # not enough, since 1/2 has two different alternates.
    if '0' not in alleles and '.' not in alleles and len(set(alleles)) == 1:
        return 'HOM_ALT'
    return 'HET'


def get_genotype_string(gt: str, ref: str, alt: str) -> str:
    """Convert a numeric GT value to an allele string such as 'A/G'.

    Args:
        gt: GT value, e.g. ``0/1`` or ``1|2``.
        ref: REF allele of the record.
        alt: ALT column of the record (comma-separated when multi-allelic).

    Returns:
        The alleles joined with '/', 'N/A' for a no-call. Any allele index
        that is not a number or is out of range is rendered as '?'.
    """
    if gt in _MISSING_GENOTYPES:
        return 'N/A'

    alleles = [ref] + alt.split(',')
    rendered = []
    for token in _ALLELE_SEPARATOR.split(gt):
        if token.isdigit() and int(token) < len(alleles):
            rendered.append(alleles[int(token)])
        else:
            rendered.append('?')
    return '/'.join(rendered)


def _extract_genotype(fmt: str, sample_fields: List[str], sample_idx: int) -> str:
    """Return the GT value of sample *sample_idx*, or './.' if unavailable."""
    fmt_keys = fmt.split(':')
    # Without a GT key there is no genotype to read; falling back to the
    # first sub-field would interpret some other value (e.g. a depth) as GT.
    if 'GT' not in fmt_keys or sample_idx >= len(sample_fields):
        return './.'

    gt_idx = fmt_keys.index('GT')
    values = sample_fields[sample_idx].split(':')
    return values[gt_idx] if gt_idx < len(values) else './.'


def parse_vcf_for_pharmgkb(vcf_path: str, sample_idx: int,
                           annotations: Dict) -> Tuple[Dict, List[str]]:
    """Parse a VCF and collect the records that have PharmGKB annotations.

    Args:
        vcf_path: Path to a VCF file; read through gzip when it ends in '.gz'.
        sample_idx: Zero-based index of the sample column to genotype.
        annotations: rsID -> annotation list, as returned by
            ``load_pharmgkb_annotations``.

    Returns:
        ``(found_variants, samples)``: ``found_variants`` maps each matched
        rsID to a dict with position, alleles, the sample's genotype and its
        annotations; ``samples`` is the list of sample names from the
        ``#CHROM`` header line. If an rsID occurs on several records, the
        last one wins.

    Raises:
        OSError: if the VCF cannot be opened.
    """
    print(f"Scanning VCF for {len(annotations)} PharmGKB variants...")

    found_variants = {}
    samples: List[str] = []

    open_func = gzip.open if vcf_path.endswith('.gz') else open
    with open_func(vcf_path, 'rt', encoding='utf-8') as f:
        for line in f:
            if line.startswith('##'):
                continue
            if line.startswith('#CHROM'):
                samples = _split_tsv_line(line)[9:]
                print(f"Found {len(samples)} samples, analyzing index {sample_idx}: {samples[sample_idx] if sample_idx < len(samples) else 'N/A'}")
                continue

            parts = _split_tsv_line(line)
            if len(parts) < 10:
                continue

            chrom, pos, id_field, ref, alt, qual, filt, info, fmt = parts[:9]

            # The ID column may list several identifiers separated by ';',
            # so every one of them is checked against the annotation table.
            matched_ids = [vid for vid in id_field.split(';') if vid in annotations]
            if not matched_ids:
                continue

            gt = _extract_genotype(fmt, parts[9:], sample_idx)
            gt_class = get_genotype_class(gt)
            gt_string = get_genotype_string(gt, ref, alt)

            for rsid in matched_ids:
                found_variants[rsid] = {
                    'rsid': rsid,
                    'chrom': chrom,
                    'pos': pos,
                    'ref': ref,
                    'alt': alt,
                    'genotype': gt,
                    'genotype_class': gt_class,
                    'genotype_string': gt_string,
                    'annotations': annotations[rsid],
                }

    return found_variants, samples


def _genotype_matches(gt_string: str, geno: str) -> bool:
    """Tell whether the sample genotype corresponds to an allele-table key.

    *gt_string* is '/'-joined (e.g. 'A/G'). It matches *geno* when both are
    equal once '/' is removed, or when the sample's set of alleles equals
    the set of characters of *geno* (so 'G/A' matches 'AG').
    """
    if gt_string.replace('/', '') == geno.replace('/', ''):
        return True
    return set(gt_string.split('/')) == set(geno)


def _write_section_header(f, title: str, leading_newline: bool = False) -> None:
    """Write *title* framed by two 80-character '=' rules and a blank line."""
    rule = "=" * 80
    prefix = "\n" if leading_newline else ""
    f.write(f"{prefix}{rule}\n{title}\n{rule}\n\n")


def _gene_of(entry) -> str:
    """Sort key: gene name of a ``(rsid, variant, annotation)`` triple."""
    return entry[2]['gene']


def generate_comprehensive_report(found_variants: Dict, allele_info: Dict,
                                  output_path: str, sample_name: str) -> None:
    """Generate the comprehensive pharmacogenomics text report.

    Args:
        found_variants: rsID -> variant dict, as returned by
            ``parse_vcf_for_pharmgkb``.
        allele_info: annotation id -> ``{genotype: {'text', 'function'}}``.
        output_path: File to write; overwritten if it exists.
        sample_name: Sample label printed in the report header.

    Raises:
        OSError: if *output_path* cannot be written.
    """
    # Group (rsid, variant, annotation) triples by evidence level and by
    # phenotype category; annotations with an empty category are not grouped.
    by_evidence = defaultdict(list)
    by_category = defaultdict(list)
    for rsid, var in found_variants.items():
        for ann in var['annotations']:
            entry = (rsid, var, ann)
            by_evidence[ann['evidence_level']].append(entry)
            if ann['phenotype_category']:
                by_category[ann['phenotype_category']].append(entry)

    # Explicit UTF-8 so annotation text with non-ASCII characters can be
    # written regardless of the locale's default encoding.
    with open(output_path, 'w', encoding='utf-8') as f:
        _write_section_header(
            f,
            "COMPREHENSIVE PHARMACOGENOMICS REPORT\n"
            "Based on PharmGKB Clinical Annotations Database",
        )
        f.write(f"Sample: {sample_name}\n")
        f.write(f"Total variants with PharmGKB annotations: {len(found_variants)}\n\n")

        # Summary statistics
        _write_section_header(f, "SUMMARY BY EVIDENCE LEVEL")
        f.write("Level 1A: Annotation based on CPIC or DPWG guideline\n")
        f.write("Level 1B: Annotation based on FDA or EMA label\n")
        f.write("Level 2A: Moderate clinical significance\n")
        f.write("Level 2B: Lower clinical significance\n")
        f.write("Level 3: Low evidence\n")
        f.write("Level 4: In vitro/preclinical evidence only\n\n")
        for level in _EVIDENCE_LEVELS:
            count = len(by_evidence.get(level, []))
            f.write(f" Level {level}: {count} annotations\n")

        # High evidence findings (1A, 1B)
        _write_section_header(
            f,
            "HIGH EVIDENCE FINDINGS (Level 1A/1B - CPIC/DPWG Guidelines & FDA Labels)",
            leading_newline=True,
        )
        high_evidence = by_evidence.get('1A', []) + by_evidence.get('1B', [])
        if high_evidence:
            for rsid, var, ann in sorted(high_evidence, key=_gene_of):
                gt_string = var['genotype_string']
                f.write(f"GENE: {ann['gene']} ({rsid})\n")
                f.write(f" Genotype: {gt_string} ({var['genotype_class']})\n")
                f.write(f" Drug(s): {ann['drugs']}\n")
                f.write(f" Category: {ann['phenotype_category']}\n")
                f.write(f" Evidence Level: {ann['evidence_level']}\n")

                # Report the first allele-table entry matching the genotype.
                for geno, info in allele_info.get(ann['ann_id'], {}).items():
                    if not _genotype_matches(gt_string, geno):
                        continue
                    if info['text']:
                        f.write(f" Clinical Annotation: {_truncate(info['text'], 500)}\n")
                    if info['function']:
                        f.write(f" Allele Function: {info['function']}\n")
                    break
                f.write("\n")
        else:
            f.write(" No high-evidence findings.\n\n")

        # Moderate evidence findings (2A, 2B)
        _write_section_header(f, "MODERATE EVIDENCE FINDINGS (Level 2A/2B)")
        moderate_evidence = by_evidence.get('2A', []) + by_evidence.get('2B', [])
        if moderate_evidence:
            # Limit to the first 50 entries in gene order
            for rsid, var, ann in sorted(moderate_evidence, key=_gene_of)[:50]:
                f.write(f"GENE: {ann['gene']} ({rsid})\n")
                f.write(f" Genotype: {var['genotype_string']}\n")
                f.write(f" Drug(s): {ann['drugs']}\n")
                f.write(f" Category: {ann['phenotype_category']}\n")
                f.write(f" Level: {ann['evidence_level']}\n\n")
            if len(moderate_evidence) > 50:
                f.write(f" ... and {len(moderate_evidence) - 50} more moderate evidence findings\n\n")
        else:
            f.write(" No moderate-evidence findings.\n\n")

        # Summary by phenotype category
        _write_section_header(f, "SUMMARY BY PHENOTYPE CATEGORY")
        for category in sorted(by_category):
            items = by_category[category]
            f.write(f"\n## {category}: {len(items)} annotations\n")
            f.write("-" * 40 + "\n")

            # Show up to five of the better-evidenced items per category
            high_in_cat = [x for x in items
                           if x[2]['evidence_level'] in ('1A', '1B', '2A')]
            for rsid, var, ann in high_in_cat[:5]:
                f.write(f" {ann['gene']} ({rsid}): {_truncate(ann['drugs'], 50)}\n")

        # Full detailed list
        _write_section_header(f, "COMPLETE VARIANT LIST", leading_newline=True)
        f.write("RSID\tGENE\tGENOTYPE\tLEVEL\tCATEGORY\tDRUGS\n")
        for rsid, var in sorted(found_variants.items()):
            for ann in var['annotations']:
                drugs_short = _truncate(ann['drugs'], 30)
                f.write(f"{rsid}\t{ann['gene']}\t{var['genotype_string']}\t{ann['evidence_level']}\t{ann['phenotype_category']}\t{drugs_short}\n")

    print(f"Report saved to: {output_path}")


def main():
    """Command-line entry point.

    Usage: ``script [vcf_path] [output_path] [sample_idx]``; each argument
    falls back to the default below when omitted.
    """
    vcf_path = sys.argv[1] if len(sys.argv) > 1 else '/Volumes/NV2/genomics_analysis/vcf/trio_joint.snpeff.vcf'
    output_path = sys.argv[2] if len(sys.argv) > 2 else '/Volumes/NV2/genomics_analysis/pharmgkb_full_report.txt'
    sample_idx = int(sys.argv[3]) if len(sys.argv) > 3 else 2

    print("=" * 60)
    print("COMPREHENSIVE PHARMGKB ANALYSIS")
    print("=" * 60)
    print(f"VCF: {vcf_path}")
    print(f"Sample index: {sample_idx}")
    print()

    # Load PharmGKB database
    annotations, allele_info = load_pharmgkb_annotations()

    # Parse VCF
    found_variants, samples = parse_vcf_for_pharmgkb(vcf_path, sample_idx, annotations)
    sample_name = samples[sample_idx] if sample_idx < len(samples) else f"Sample_{sample_idx}"

    print(f"\nFound {len(found_variants)} variants with PharmGKB annotations")

    # Count by evidence level
    level_counts = defaultdict(int)
    for var in found_variants.values():
        for ann in var['annotations']:
            level_counts[ann['evidence_level']] += 1

    print("\nAnnotations by evidence level:")
    for level in _EVIDENCE_LEVELS:
        print(f" Level {level}: {level_counts.get(level, 0)}")

    # Generate report
    generate_comprehensive_report(found_variants, allele_info, output_path, sample_name)

    # Print high-evidence findings to console
    print("\n" + "=" * 60)
    print("HIGH EVIDENCE FINDINGS (Level 1A/1B)")
    print("=" * 60)

    for rsid, var in found_variants.items():
        for ann in var['annotations']:
            if ann['evidence_level'] in ('1A', '1B'):
                print(f"\n{ann['gene']} ({rsid})")
                print(f" Genotype: {var['genotype_string']}")
                # Ellipsis only when the drug list was actually shortened
                print(f" Drug(s): {_truncate(ann['drugs'], 80)}")
                print(f" Level: {ann['evidence_level']}")


if __name__ == '__main__':
    main()