Refactor: Replace scaffolding with working analysis scripts

- Add trio_analysis.py for trio-based variant analysis with de novo detection
- Add clinvar_acmg_annotate.py for ClinVar/ACMG annotation
- Add gwas_comprehensive.py with 201 SNPs across 18 categories
- Add pharmgkb_full_analysis.py for pharmacogenomics analysis
- Add gwas_trait_lookup.py for basic GWAS trait lookup
- Add pharmacogenomics.py for basic PGx analysis
- Remove unused scaffolding code (src/, configs/, docs/, tests/)
- Update README.md with new documentation

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-01 22:36:02 +08:00
parent f74dc351f7
commit d13d58df8b
56 changed files with 2608 additions and 2347 deletions
--- a/pharmgkb_full_analysis.py
+++ b/pharmgkb_full_analysis.py
@@ -0,0 +1,349 @@
+#!/usr/bin/env python3
+"""
+Comprehensive PharmGKB Analysis Script
+Uses full PharmGKB clinical annotations database for pharmacogenomics analysis.
+"""
+
+import gzip
+import sys
+import os
+import re
+from collections import defaultdict
+from typing import Dict, List, Set, Tuple
+
+# PharmGKB database paths
+# Directory that holds the two PharmGKB TSV files referenced below.
+PHARMGKB_DIR = "/Volumes/NV2/genomics_reference/pharmgkb"
+# Clinical annotation table; read line by line in load_pharmgkb_annotations().
+ANNOTATIONS_FILE = f"{PHARMGKB_DIR}/clinical_annotations.tsv"
+# Per-genotype annotation text, keyed by the annotation ID in its first column.
+ALLELES_FILE = f"{PHARMGKB_DIR}/clinical_ann_alleles.tsv"
+
+
+def load_pharmgkb_annotations() -> Tuple[Dict, Dict]:
+    """Read both PharmGKB TSV files into in-memory lookup tables.
+
+    Returns:
+        ``(annotations, allele_info)`` where ``annotations`` maps each rsID
+        to a list of dicts (keys ``ann_id``, ``gene``, ``evidence_level``,
+        ``phenotype_category``, ``drugs``, ``phenotypes``) and
+        ``allele_info`` maps annotation ID -> genotype -> dict with keys
+        ``text`` and ``function``.
+    """
+    print(f"Loading PharmGKB annotations from {ANNOTATIONS_FILE}...")
+    annotations = {}
+
+    with open(ANNOTATIONS_FILE, 'r') as tsv:
+        tsv.readline()  # discard the header row
+        for row in tsv:
+            cols = row.strip().split('\t')
+            # Rows need at least 11 columns, and only rs-prefixed variant
+            # identifiers are kept; everything else is skipped.
+            if len(cols) < 11 or not cols[1].startswith('rs'):
+                continue
+
+            annotations.setdefault(cols[1], []).append({
+                'ann_id': cols[0],
+                'gene': cols[2],
+                'evidence_level': cols[3],
+                'phenotype_category': cols[7],
+                'drugs': cols[10],
+                # The 12th column is optional once the 11-column minimum is met.
+                'phenotypes': cols[11] if len(cols) > 11 else "",
+            })
+
+    print(f"Loading allele information from {ALLELES_FILE}...")
+    allele_info = {}
+
+    with open(ALLELES_FILE, 'r') as tsv:
+        tsv.readline()  # discard the header row
+        for row in tsv:
+            cols = row.strip().split('\t')
+            if len(cols) < 3:
+                continue
+
+            ann_id, genotype, annotation_text = cols[:3]
+            # A later row with the same (ann_id, genotype) replaces the earlier one.
+            allele_info.setdefault(ann_id, {})[genotype] = {
+                'text': annotation_text,
+                'function': cols[3] if len(cols) > 3 else "",
+            }
+
+    print(f"Loaded {len(annotations)} unique variants with annotations")
+    return annotations, allele_info
+
+
+def get_genotype_class(gt: str) -> str:
+    """Classify a VCF ``GT`` value by zygosity.
+
+    Args:
+        gt: Genotype string with allele indices separated by ``/`` or ``|``
+            (for example ``0/1``, ``1|1``, ``./.``), or a single haploid call.
+
+    Returns:
+        ``'MISSING'`` when every allele is ``.``;
+        ``'HOM_REF'`` when every allele is ``0``;
+        ``'HOM_ALT'`` when every allele is the same non-reference index;
+        ``'HET'`` otherwise. That includes calls with two different ALT
+        alleles such as ``1/2`` and partially missing calls such as ``0/.``.
+    """
+    alleles = re.split(r'[/|]', gt)
+
+    # Covers './.', '.|.', '.' and all-missing calls of any other ploidy.
+    if all(a == '.' for a in alleles):
+        return 'MISSING'
+
+    if all(a == '0' for a in alleles):
+        return 'HOM_REF'
+
+    # Homozygous means the alleles are identical, not merely all non-reference:
+    # 1/2 carries two different ALT alleles and is therefore heterozygous.
+    first = alleles[0]
+    if first not in ('0', '.') and all(a == first for a in alleles):
+        return 'HOM_ALT'
+
+    return 'HET'
+
+
+def get_genotype_string(gt: str, ref: str, alt: str) -> str:
+    """Translate a numeric VCF genotype into its allele sequences.
+
+    Args:
+        gt: Genotype with allele indices separated by ``/`` or ``|``.
+        ref: Reference allele (index 0).
+        alt: Comma-separated alternate alleles (indices 1, 2, ...).
+
+    Returns:
+        ``'N/A'`` for ``./.``, ``.|.`` or ``.``; otherwise the alleles joined
+        with ``/`` (phasing is not preserved). An index that is not a number
+        or is out of range is rendered as ``?``.
+    """
+    if gt in ('./.', '.|.', '.'):
+        return 'N/A'
+
+    allele_table = [ref, *alt.split(',')]
+
+    def decode(code: str) -> str:
+        # Anything that is not a valid index into the allele table becomes '?'.
+        if not code.isdigit():
+            return '?'
+        position = int(code)
+        return allele_table[position] if position < len(allele_table) else '?'
+
+    return '/'.join(decode(code) for code in re.split('[/|]', gt))
+
+
+def parse_vcf_for_pharmgkb(vcf_path: str, sample_idx: int,
+                           annotations: Dict) -> Tuple[Dict, List[str]]:
+    """Scan a VCF for records whose ID matches a PharmGKB-annotated rsID.
+
+    Args:
+        vcf_path: Path to the VCF; a ``.gz`` suffix selects gzip decoding.
+        sample_idx: Zero-based index into the sample columns (column 10 on).
+        annotations: Mapping of rsID -> list of annotation dicts.
+
+    Returns:
+        ``(found_variants, samples)``. ``found_variants`` maps each matched
+        rsID to a dict with keys ``rsid``, ``chrom``, ``pos``, ``ref``,
+        ``alt``, ``genotype``, ``genotype_class``, ``genotype_string`` and
+        ``annotations``. ``samples`` is the list of sample names from the
+        ``#CHROM`` header line (empty if that line is absent).
+
+    The genotype is recorded as ``./.`` when ``sample_idx`` is beyond the
+    sample columns, when FORMAT has no ``GT`` key, or when the sample column
+    has fewer subfields than FORMAT. If the same rsID occurs on several
+    records, the last one wins.
+    """
+
+    print(f"Scanning VCF for {len(annotations)} PharmGKB variants...")
+
+    found_variants = {}
+    samples = []
+
+    is_gzipped = vcf_path.endswith('.gz')
+    open_func = gzip.open if is_gzipped else open
+    mode = 'rt' if is_gzipped else 'r'
+
+    with open_func(vcf_path, mode) as f:
+        for line in f:
+            if line.startswith('##'):
+                continue
+            if line.startswith('#CHROM'):
+                samples = line.strip().split('\t')[9:]
+                print(f"Found {len(samples)} samples, analyzing index {sample_idx}: {samples[sample_idx] if sample_idx < len(samples) else 'N/A'}")
+                continue
+
+            parts = line.strip().split('\t')
+            if len(parts) < 10:
+                continue
+
+            chrom, pos, id_field, ref, alt, _qual, _filt, _info, fmt = parts[:9]
+            gt_fields = parts[9:]
+
+            # The ID column may list several identifiers separated by ';',
+            # so each one is checked rather than the raw column text.
+            matched_rsids = [vid for vid in id_field.split(';') if vid in annotations]
+            if not matched_rsids:
+                continue
+
+            # Default to a missing call; only a real GT subfield overrides it.
+            # Without a GT key there is no genotype to read, so the first
+            # subfield must not be mistaken for one.
+            gt = './.'
+            fmt_parts = fmt.split(':')
+            if 'GT' in fmt_parts and sample_idx < len(gt_fields):
+                gt_idx = fmt_parts.index('GT')
+                gt_data = gt_fields[sample_idx].split(':')
+                if gt_idx < len(gt_data):
+                    gt = gt_data[gt_idx]
+
+            gt_class = get_genotype_class(gt)
+            gt_string = get_genotype_string(gt, ref, alt)
+
+            for rsid in matched_rsids:
+                found_variants[rsid] = {
+                    'rsid': rsid,
+                    'chrom': chrom,
+                    'pos': pos,
+                    'ref': ref,
+                    'alt': alt,
+                    'genotype': gt,
+                    'genotype_class': gt_class,
+                    'genotype_string': gt_string,
+                    'annotations': annotations[rsid]
+                }
+
+    return found_variants, samples
+
+
+def _truncate(text: str, limit: int) -> str:
+    """Return ``text`` cut to ``limit`` characters, with ``...`` added only if cut."""
+    return text[:limit] + "..." if len(text) > limit else text
+
+
+def _write_banner(out, *titles: str, lead: str = "") -> None:
+    """Write a section banner: a rule, the title line(s), a rule, a blank line."""
+    rule = "=" * 80
+    out.write(f"{lead}{rule}\n")
+    for title in titles:
+        out.write(f"{title}\n")
+    out.write(f"{rule}\n\n")
+
+
+def _find_allele_annotation(gt_string: str, genotype_entries: Dict):
+    """Return the first allele entry whose genotype matches ``gt_string``, else None."""
+    for geno, info in genotype_entries.items():
+        # Match either with '/' separators ignored ("A/G" vs "AG") or
+        # order-insensitively, treating each character of the database
+        # genotype as one allele ("G/A" vs "AG").
+        if gt_string.replace('/', '') == geno.replace('/', '') or \
+           set(gt_string.split('/')) == set(geno):
+            return info
+    return None
+
+
+def generate_comprehensive_report(found_variants: Dict, allele_info: Dict,
+                                   output_path: str, sample_name: str):
+    """Write the plain-text pharmacogenomics report to ``output_path``.
+
+    Args:
+        found_variants: Mapping rsID -> variant dict; each needs the keys
+            ``genotype_string``, ``genotype_class`` and ``annotations``.
+        allele_info: Mapping annotation ID -> genotype -> dict with ``text``
+            and ``function``.
+        output_path: Destination file; overwritten, written as UTF-8.
+        sample_name: Sample label printed in the report header.
+
+    The report has these sections: header, counts per evidence level, level
+    1A/1B findings with any matching allele annotation, up to 50 level 2A/2B
+    findings, a per-category summary, and a tab-separated list of every
+    annotation.
+    """
+
+    # Group every (variant, annotation) pair by evidence level and category.
+    by_evidence = defaultdict(list)
+    by_category = defaultdict(list)
+
+    for rsid, var in found_variants.items():
+        for ann in var['annotations']:
+            entry = (rsid, var, ann)
+            by_evidence[ann['evidence_level']].append(entry)
+            if ann['phenotype_category']:
+                by_category[ann['phenotype_category']].append(entry)
+
+    def gene_of(entry):
+        return entry[2]['gene']
+
+    # Explicit encoding: annotation text is free-form and may contain
+    # non-ASCII characters that a platform default codec cannot encode.
+    with open(output_path, 'w', encoding='utf-8') as f:
+        _write_banner(f, "COMPREHENSIVE PHARMACOGENOMICS REPORT",
+                      "Based on PharmGKB Clinical Annotations Database")
+        f.write(f"Sample: {sample_name}\n")
+        f.write(f"Total variants with PharmGKB annotations: {len(found_variants)}\n\n")
+
+        # Summary statistics
+        _write_banner(f, "SUMMARY BY EVIDENCE LEVEL")
+        f.write("Level 1A: Annotation based on CPIC or DPWG guideline\n")
+        f.write("Level 1B: Annotation based on FDA or EMA label\n")
+        f.write("Level 2A: Moderate clinical significance\n")
+        f.write("Level 2B: Lower clinical significance\n")
+        f.write("Level 3: Low evidence\n")
+        f.write("Level 4: In vitro/preclinical evidence only\n\n")
+
+        for level in ['1A', '1B', '2A', '2B', '3', '4']:
+            count = len(by_evidence.get(level, []))
+            f.write(f"  Level {level}: {count} annotations\n")
+
+        # High evidence findings (1A, 1B)
+        _write_banner(f, "HIGH EVIDENCE FINDINGS (Level 1A/1B - CPIC/DPWG Guidelines & FDA Labels)",
+                      lead="\n")
+
+        high_evidence = by_evidence.get('1A', []) + by_evidence.get('1B', [])
+        if high_evidence:
+            for rsid, var, ann in sorted(high_evidence, key=gene_of):
+                gt_string = var['genotype_string']
+                f.write(f"GENE: {ann['gene']} ({rsid})\n")
+                f.write(f"  Genotype: {gt_string} ({var['genotype_class']})\n")
+                f.write(f"  Drug(s): {ann['drugs']}\n")
+                f.write(f"  Category: {ann['phenotype_category']}\n")
+                f.write(f"  Evidence Level: {ann['evidence_level']}\n")
+
+                # Add the genotype-specific text when the database has an
+                # entry matching this sample's genotype.
+                info = _find_allele_annotation(gt_string, allele_info.get(ann['ann_id'], {}))
+                if info is not None:
+                    if info['text']:
+                        f.write(f"  Clinical Annotation: {_truncate(info['text'], 500)}\n")
+                    if info['function']:
+                        f.write(f"  Allele Function: {info['function']}\n")
+                f.write("\n")
+        else:
+            f.write("  No high-evidence findings.\n\n")
+
+        # Moderate evidence findings (2A, 2B)
+        _write_banner(f, "MODERATE EVIDENCE FINDINGS (Level 2A/2B)")
+
+        moderate_evidence = by_evidence.get('2A', []) + by_evidence.get('2B', [])
+        if moderate_evidence:
+            # Only the first 50 (by gene name) are listed in full.
+            for rsid, var, ann in sorted(moderate_evidence, key=gene_of)[:50]:
+                f.write(f"GENE: {ann['gene']} ({rsid})\n")
+                f.write(f"  Genotype: {var['genotype_string']}\n")
+                f.write(f"  Drug(s): {ann['drugs']}\n")
+                f.write(f"  Category: {ann['phenotype_category']}\n")
+                f.write(f"  Level: {ann['evidence_level']}\n\n")
+
+            if len(moderate_evidence) > 50:
+                f.write(f"  ... and {len(moderate_evidence) - 50} more moderate evidence findings\n\n")
+        else:
+            f.write("  No moderate-evidence findings.\n\n")
+
+        # Summary by phenotype category
+        _write_banner(f, "SUMMARY BY PHENOTYPE CATEGORY")
+
+        for category in sorted(by_category):
+            items = by_category[category]
+            f.write(f"\n## {category}: {len(items)} annotations\n")
+            f.write("-" * 40 + "\n")
+
+            # Show at most five of the better-supported items per category.
+            high_in_cat = [x for x in items if x[2]['evidence_level'] in ['1A', '1B', '2A']]
+            for rsid, var, ann in high_in_cat[:5]:
+                f.write(f"  {ann['gene']} ({rsid}): {_truncate(ann['drugs'], 50)}\n")
+
+        # Full detailed list
+        _write_banner(f, "COMPLETE VARIANT LIST", lead="\n")
+
+        f.write("RSID\tGENE\tGENOTYPE\tLEVEL\tCATEGORY\tDRUGS\n")
+        for rsid, var in sorted(found_variants.items()):
+            for ann in var['annotations']:
+                drugs_short = _truncate(ann['drugs'], 30)
+                f.write(f"{rsid}\t{ann['gene']}\t{var['genotype_string']}\t{ann['evidence_level']}\t{ann['phenotype_category']}\t{drugs_short}\n")
+
+    print(f"Report saved to: {output_path}")
+
+
+def main():
+    """Command-line entry point for the PharmGKB analysis.
+
+    Positional arguments (all optional, read from ``sys.argv``):
+        1. VCF path
+        2. Output report path
+        3. Zero-based sample column index (default ``2``)
+
+    Loads the PharmGKB tables, scans the VCF for annotated rsIDs, prints
+    annotation counts per evidence level, writes the full report and finally
+    echoes the level 1A/1B findings to the console.
+    """
+    vcf_path = sys.argv[1] if len(sys.argv) > 1 else '/Volumes/NV2/genomics_analysis/vcf/trio_joint.snpeff.vcf'
+    output_path = sys.argv[2] if len(sys.argv) > 2 else '/Volumes/NV2/genomics_analysis/pharmgkb_full_report.txt'
+    sample_idx = int(sys.argv[3]) if len(sys.argv) > 3 else 2
+
+    print("=" * 60)
+    print("COMPREHENSIVE PHARMGKB ANALYSIS")
+    print("=" * 60)
+    print(f"VCF: {vcf_path}")
+    print(f"Sample index: {sample_idx}")
+    print()
+
+    # Load PharmGKB database
+    annotations, allele_info = load_pharmgkb_annotations()
+
+    # Parse VCF
+    found_variants, samples = parse_vcf_for_pharmgkb(vcf_path, sample_idx, annotations)
+
+    if sample_idx < len(samples):
+        sample_name = samples[sample_idx]
+    else:
+        # Keep the placeholder name, but say why every genotype will be
+        # reported as missing instead of leaving it unexplained.
+        sample_name = f"Sample_{sample_idx}"
+        print(f"WARNING: sample index {sample_idx} is out of range "
+              f"(VCF has {len(samples)} samples); all genotypes will be missing",
+              file=sys.stderr)
+    print(f"\nFound {len(found_variants)} variants with PharmGKB annotations")
+
+    # Count by evidence level
+    level_counts = defaultdict(int)
+    for var in found_variants.values():
+        for ann in var['annotations']:
+            level_counts[ann['evidence_level']] += 1
+
+    print("\nAnnotations by evidence level:")
+    for level in ['1A', '1B', '2A', '2B', '3', '4']:
+        print(f"  Level {level}: {level_counts.get(level, 0)}")
+
+    # Generate report
+    generate_comprehensive_report(found_variants, allele_info, output_path, sample_name)
+
+    # Print high-evidence findings to console
+    print("\n" + "=" * 60)
+    print("HIGH EVIDENCE FINDINGS (Level 1A/1B)")
+    print("=" * 60)
+
+    for rsid, var in found_variants.items():
+        for ann in var['annotations']:
+            if ann['evidence_level'] in ['1A', '1B']:
+                drugs = ann['drugs']
+                # Add the ellipsis only when the drug list was actually cut.
+                drugs_shown = drugs[:80] + "..." if len(drugs) > 80 else drugs
+                print(f"\n{ann['gene']} ({rsid})")
+                print(f"  Genotype: {var['genotype_string']}")
+                print(f"  Drug(s): {drugs_shown}")
+                print(f"  Level: {ann['evidence_level']}")
+
+
+# Run the analysis only when this file is executed as a script, not on import.
+if __name__ == '__main__':
+    main()