Refactor: Replace scaffolding with working analysis scripts
- Add trio_analysis.py for trio-based variant analysis with de novo detection
- Add clinvar_acmg_annotate.py for ClinVar/ACMG annotation
- Add gwas_comprehensive.py with 201 SNPs across 18 categories
- Add pharmgkb_full_analysis.py for pharmacogenomics analysis
- Add gwas_trait_lookup.py for basic GWAS trait lookup
- Add pharmacogenomics.py for basic PGx analysis
- Remove unused scaffolding code (src/, configs/, docs/, tests/)
- Update README.md with new documentation

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
349
pharmgkb_full_analysis.py
Normal file
349
pharmgkb_full_analysis.py
Normal file
@@ -0,0 +1,349 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Comprehensive PharmGKB Analysis Script
|
||||
Uses full PharmGKB clinical annotations database for pharmacogenomics analysis.
|
||||
"""
|
||||
|
||||
import gzip
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from typing import Dict, List, Set, Tuple
|
||||
|
||||
# Locations of the PharmGKB exports read by load_pharmgkb_annotations().
PHARMGKB_DIR = "/Volumes/NV2/genomics_reference/pharmgkb"
# One row per clinical annotation (variant, gene, evidence level, drugs, ...).
ANNOTATIONS_FILE = PHARMGKB_DIR + "/clinical_annotations.tsv"
# One row per (annotation ID, genotype/allele) with the annotation text.
ALLELES_FILE = PHARMGKB_DIR + "/clinical_ann_alleles.tsv"
|
||||
|
||||
|
||||
def load_pharmgkb_annotations() -> Tuple[Dict, Dict]:
    """Load PharmGKB clinical annotations and allele-level information.

    Reads the two tab-separated files named by ``ANNOTATIONS_FILE`` and
    ``ALLELES_FILE``. The first line of each file is treated as a header
    and skipped; columns are addressed by fixed position.

    Returns:
        A ``(annotations, allele_info)`` tuple.

        ``annotations`` maps every variant name that starts with ``rs`` to a
        list of dicts with the keys ``ann_id``, ``gene``, ``evidence_level``,
        ``phenotype_category``, ``drugs`` and ``phenotypes``. Rows whose
        variant does not start with ``rs`` (e.g. haplotypes) are ignored.

        ``allele_info`` maps an annotation ID to
        ``{genotype: {'text': ..., 'function': ...}}``.

    Raises:
        OSError: If either file cannot be opened.
    """
    # Load main annotations
    annotations = {}
    print(f"Loading PharmGKB annotations from {ANNOTATIONS_FILE}...")

    # Explicit UTF-8 so the result does not depend on the platform locale.
    with open(ANNOTATIONS_FILE, 'r', encoding='utf-8') as f:
        f.readline()  # skip the header row
        for line in f:
            # Remove only the line terminator. A bare strip() would also eat
            # trailing tabs, so rows with empty trailing columns would look
            # too short and be dropped by the length check below.
            parts = line.rstrip('\r\n').split('\t')
            if len(parts) < 11:
                continue

            variant = parts[1]  # rsid or haplotype
            # Only process rs variants (SNPs)
            if not variant.startswith('rs'):
                continue

            annotations.setdefault(variant, []).append({
                'ann_id': parts[0],
                'gene': parts[2],
                'evidence_level': parts[3],
                'phenotype_category': parts[7],
                'drugs': parts[10],
                # Column 11 is optional; the length guard only ensures 0..10.
                'phenotypes': parts[11] if len(parts) > 11 else "",
            })

    # Load allele-specific information
    allele_info = {}
    print(f"Loading allele information from {ALLELES_FILE}...")

    with open(ALLELES_FILE, 'r', encoding='utf-8') as f:
        f.readline()  # skip the header row
        for line in f:
            parts = line.rstrip('\r\n').split('\t')
            if len(parts) < 3:
                continue

            ann_id, genotype, annotation_text = parts[0], parts[1], parts[2]
            allele_function = parts[3] if len(parts) > 3 else ""

            allele_info.setdefault(ann_id, {})[genotype] = {
                'text': annotation_text,
                'function': allele_function,
            }

    print(f"Loaded {len(annotations)} unique variants with annotations")
    return annotations, allele_info
|
||||
|
||||
|
||||
def get_genotype_class(gt: str) -> str:
    """Classify a VCF GT value.

    Args:
        gt: Genotype string with allele indices separated by ``/`` or ``|``
            (for example ``0/1``, ``1|1``, ``./.``).

    Returns:
        ``'MISSING'`` when every allele is ``.`` (or the string is empty),
        ``'HOM_REF'`` when every allele is ``0``,
        ``'HOM_ALT'`` when every allele is the same non-reference allele,
        ``'HET'`` otherwise (including two different ALT alleles such as
        ``1/2`` and partially missing calls such as ``./1``).
    """
    alleles = re.split(r'[/|]', gt)

    # Covers './.', '.|.', '.', '' and higher-ploidy forms such as './././.'.
    if all(a in ('.', '') for a in alleles):
        return 'MISSING'

    if all(a == '0' for a in alleles):
        return 'HOM_REF'

    # Homozygous means one distinct allele; "no reference allele present" is
    # not enough, since '1/2' carries two different ALT alleles.
    if len(set(alleles)) == 1 and alleles[0] not in ('0', '.'):
        return 'HOM_ALT'

    return 'HET'
|
||||
|
||||
|
||||
def get_genotype_string(gt: str, ref: str, alt: str) -> str:
    """Convert a numeric VCF genotype to a ``/``-joined allele string.

    Args:
        gt: Genotype with allele indices separated by ``/`` or ``|``.
        ref: Reference allele (index 0).
        alt: Comma-separated ALT alleles (indices 1, 2, ...).

    Returns:
        ``'N/A'`` when every allele is missing (``.``) or ``gt`` is empty.
        Otherwise the alleles joined with ``/`` (phasing is not preserved).
        An allele that is not a number, or whose index is beyond the
        available alleles, is rendered as ``?``.
    """
    gt_alleles = re.split(r'[/|]', gt)

    # Any fully missing call, whatever its ploidy or separator, is 'N/A'.
    if all(a in ('.', '') for a in gt_alleles):
        return 'N/A'

    alleles = [ref] + alt.split(',')

    def resolve(token: str) -> str:
        """Map one GT token to its allele, or '?' if it cannot be resolved."""
        if token.isdigit():
            idx = int(token)
            if idx < len(alleles):
                return alleles[idx]
        return '?'

    return '/'.join(resolve(token) for token in gt_alleles)
|
||||
|
||||
|
||||
def parse_vcf_for_pharmgkb(vcf_path: str, sample_idx: int,
                           annotations: Dict) -> Tuple[Dict, List[str]]:
    """Scan a VCF for records whose ID has a PharmGKB annotation.

    Args:
        vcf_path: Path to a VCF file; a ``.gz`` suffix selects gzip reading.
        sample_idx: Zero-based index of the sample column to genotype.
        annotations: Mapping of rsID to its annotation list, as produced by
            ``load_pharmgkb_annotations``.

    Returns:
        A ``(found_variants, samples)`` tuple. ``found_variants`` maps each
        matched rsID to a dict with the keys ``rsid``, ``chrom``, ``pos``,
        ``ref``, ``alt``, ``genotype``, ``genotype_class``,
        ``genotype_string`` and ``annotations``. ``samples`` is the list of
        sample names from the ``#CHROM`` header line (empty if none was seen).

    Notes:
        * The ID column may hold several semicolon-separated identifiers;
          each one is looked up individually.
        * If the sample column is absent, or FORMAT has no ``GT`` key, the
          genotype is recorded as ``./.``.
        * When the same rsID appears on several records, the last one wins.
    """
    print(f"Scanning VCF for {len(annotations)} PharmGKB variants...")

    found_variants = {}
    samples = []

    # 'rt' is valid for both the builtin open() and gzip.open().
    open_func = gzip.open if vcf_path.endswith('.gz') else open

    with open_func(vcf_path, 'rt', encoding='utf-8') as f:
        for line in f:
            if line.startswith('##'):
                continue
            if line.startswith('#CHROM'):
                samples = line.strip().split('\t')[9:]
                print(f"Found {len(samples)} samples, analyzing index {sample_idx}: {samples[sample_idx] if sample_idx < len(samples) else 'N/A'}")
                continue

            # Cheap pre-filter: split only far enough to reach the ID column,
            # so the vast majority of records are never fully tokenised.
            head = line.split('\t', 3)
            if len(head) < 4:
                continue
            matched_ids = [vid for vid in head[2].split(';') if vid in annotations]
            if not matched_ids:
                continue

            parts = line.strip().split('\t')
            if len(parts) < 10:
                continue

            chrom, pos, _ids, ref, alt = parts[:5]
            fmt_parts = parts[8].split(':')
            gt_fields = parts[9:]

            # Default to a missing call; only overwrite it when the record
            # really carries a GT subfield for the requested sample.
            gt = './.'
            if 'GT' in fmt_parts and sample_idx < len(gt_fields):
                gt_idx = fmt_parts.index('GT')
                gt_data = gt_fields[sample_idx].split(':')
                if gt_idx < len(gt_data):
                    gt = gt_data[gt_idx]

            gt_class = get_genotype_class(gt)
            gt_string = get_genotype_string(gt, ref, alt)

            for rsid in matched_ids:
                found_variants[rsid] = {
                    'rsid': rsid,
                    'chrom': chrom,
                    'pos': pos,
                    'ref': ref,
                    'alt': alt,
                    'genotype': gt,
                    'genotype_class': gt_class,
                    'genotype_string': gt_string,
                    'annotations': annotations[rsid],
                }

    return found_variants, samples
|
||||
|
||||
|
||||
def _truncate(text: str, limit: int) -> str:
    """Return *text* cut to *limit* characters, with '...' added if it was cut."""
    return text[:limit] + "..." if len(text) > limit else text


def _match_allele_annotation(gt_string: str, genotype_annotations: Dict):
    """Return the first allele annotation matching *gt_string*, or None.

    A stored genotype matches when it equals the observed genotype with or
    without the ``/`` separators, or when the set of observed alleles equals
    the set of characters in the stored genotype (order-insensitive match for
    single-character alleles, e.g. ``G/A`` against ``AG``).
    """
    observed_joined = gt_string.replace('/', '')
    observed_alleles = set(gt_string.split('/'))
    for geno, info in genotype_annotations.items():
        if (observed_joined == geno.replace('/', '')
                or gt_string == geno
                or observed_alleles == set(geno)):
            return info
    return None


def generate_comprehensive_report(found_variants: Dict, allele_info: Dict,
                                  output_path: str, sample_name: str) -> None:
    """Write a plain-text pharmacogenomics report.

    Args:
        found_variants: Mapping of rsID to a variant dict that has at least
            the keys ``genotype_string``, ``genotype_class`` and
            ``annotations`` (a list of dicts with ``ann_id``, ``gene``,
            ``evidence_level``, ``phenotype_category`` and ``drugs``).
        allele_info: Mapping of annotation ID to
            ``{genotype: {'text': ..., 'function': ...}}``.
        output_path: File to create or overwrite; written as UTF-8.
        sample_name: Name printed in the report header.

    The report contains, in order: a header, annotation counts per evidence
    level, level 1A/1B findings with matching allele text, up to 50 level
    2A/2B findings, a per-category summary (at most five 1A/1B/2A entries
    each) and a tab-separated list of every annotation.

    Raises:
        OSError: If ``output_path`` cannot be opened for writing.
    """
    rule = "=" * 80

    # Categorize by evidence level and phenotype category
    by_evidence = defaultdict(list)
    by_category = defaultdict(list)
    for rsid, var in found_variants.items():
        for ann in var['annotations']:
            entry = (rsid, var, ann)
            by_evidence[ann['evidence_level']].append(entry)
            if ann['phenotype_category']:
                by_category[ann['phenotype_category']].append(entry)

    def by_gene(entry):
        """Sort key: gene name of the annotation."""
        return entry[2]['gene']

    # Explicit UTF-8: annotation text may contain non-ASCII characters that a
    # locale-dependent default encoding cannot represent.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(rule + "\n")
        f.write("COMPREHENSIVE PHARMACOGENOMICS REPORT\n")
        f.write("Based on PharmGKB Clinical Annotations Database\n")
        f.write(rule + "\n\n")
        f.write(f"Sample: {sample_name}\n")
        f.write(f"Total variants with PharmGKB annotations: {len(found_variants)}\n\n")

        # Summary statistics
        f.write(rule + "\n")
        f.write("SUMMARY BY EVIDENCE LEVEL\n")
        f.write(rule + "\n\n")
        f.write("Level 1A: Annotation based on CPIC or DPWG guideline\n")
        f.write("Level 1B: Annotation based on FDA or EMA label\n")
        f.write("Level 2A: Moderate clinical significance\n")
        f.write("Level 2B: Lower clinical significance\n")
        f.write("Level 3: Low evidence\n")
        f.write("Level 4: In vitro/preclinical evidence only\n\n")

        for level in ['1A', '1B', '2A', '2B', '3', '4']:
            count = len(by_evidence.get(level, []))
            f.write(f" Level {level}: {count} annotations\n")

        # High evidence findings (1A, 1B)
        f.write("\n" + rule + "\n")
        f.write("HIGH EVIDENCE FINDINGS (Level 1A/1B - CPIC/DPWG Guidelines & FDA Labels)\n")
        f.write(rule + "\n\n")

        high_evidence = by_evidence.get('1A', []) + by_evidence.get('1B', [])
        if high_evidence:
            for rsid, var, ann in sorted(high_evidence, key=by_gene):
                gt_string = var['genotype_string']
                f.write(f"GENE: {ann['gene']} ({rsid})\n")
                f.write(f" Genotype: {gt_string} ({var['genotype_class']})\n")
                f.write(f" Drug(s): {ann['drugs']}\n")
                f.write(f" Category: {ann['phenotype_category']}\n")
                f.write(f" Evidence Level: {ann['evidence_level']}\n")

                # Attach the allele-specific text for the observed genotype.
                info = _match_allele_annotation(
                    gt_string, allele_info.get(ann['ann_id'], {}))
                if info is not None:
                    if info['text']:
                        f.write(f" Clinical Annotation: {_truncate(info['text'], 500)}\n")
                    if info['function']:
                        f.write(f" Allele Function: {info['function']}\n")
                f.write("\n")
        else:
            f.write(" No high-evidence findings.\n\n")

        # Moderate evidence findings (2A, 2B)
        f.write(rule + "\n")
        f.write("MODERATE EVIDENCE FINDINGS (Level 2A/2B)\n")
        f.write(rule + "\n\n")

        moderate_evidence = by_evidence.get('2A', []) + by_evidence.get('2B', [])
        if moderate_evidence:
            # Limit to the first 50 after sorting by gene
            for rsid, var, ann in sorted(moderate_evidence, key=by_gene)[:50]:
                f.write(f"GENE: {ann['gene']} ({rsid})\n")
                f.write(f" Genotype: {var['genotype_string']}\n")
                f.write(f" Drug(s): {ann['drugs']}\n")
                f.write(f" Category: {ann['phenotype_category']}\n")
                f.write(f" Level: {ann['evidence_level']}\n\n")

            if len(moderate_evidence) > 50:
                f.write(f" ... and {len(moderate_evidence) - 50} more moderate evidence findings\n\n")
        else:
            f.write(" No moderate-evidence findings.\n\n")

        # Summary by phenotype category
        f.write(rule + "\n")
        f.write("SUMMARY BY PHENOTYPE CATEGORY\n")
        f.write(rule + "\n\n")

        for category in sorted(by_category):
            items = by_category[category]
            f.write(f"\n## {category}: {len(items)} annotations\n")
            f.write("-" * 40 + "\n")

            # Show high-evidence items for each category
            high_in_cat = [x for x in items
                           if x[2]['evidence_level'] in ('1A', '1B', '2A')]
            for rsid, var, ann in high_in_cat[:5]:
                f.write(f" {ann['gene']} ({rsid}): {_truncate(ann['drugs'], 50)}\n")

        # Full detailed list
        f.write("\n" + rule + "\n")
        f.write("COMPLETE VARIANT LIST\n")
        f.write(rule + "\n\n")

        f.write("RSID\tGENE\tGENOTYPE\tLEVEL\tCATEGORY\tDRUGS\n")
        for rsid, var in sorted(found_variants.items()):
            for ann in var['annotations']:
                drugs_short = _truncate(ann['drugs'], 30)
                f.write(f"{rsid}\t{ann['gene']}\t{var['genotype_string']}\t{ann['evidence_level']}\t{ann['phenotype_category']}\t{drugs_short}\n")

    print(f"Report saved to: {output_path}")
|
||||
|
||||
|
||||
def main():
    """Run the PharmGKB analysis from the command line.

    Positional arguments (all optional, read from ``sys.argv``):
        1. VCF path (default: the trio joint snpEff VCF).
        2. Output report path.
        3. Zero-based sample index (default ``2``).

    Loads the PharmGKB tables, scans the VCF, writes the report file and
    prints a summary plus every level 1A/1B finding to stdout.

    Raises:
        ValueError: If the third argument is not an integer.
    """
    vcf_path = sys.argv[1] if len(sys.argv) > 1 else '/Volumes/NV2/genomics_analysis/vcf/trio_joint.snpeff.vcf'
    output_path = sys.argv[2] if len(sys.argv) > 2 else '/Volumes/NV2/genomics_analysis/pharmgkb_full_report.txt'
    sample_idx = int(sys.argv[3]) if len(sys.argv) > 3 else 2

    banner = "=" * 60
    print(banner)
    print("COMPREHENSIVE PHARMGKB ANALYSIS")
    print(banner)
    print(f"VCF: {vcf_path}")
    print(f"Sample index: {sample_idx}")
    print()

    # Load PharmGKB database
    annotations, allele_info = load_pharmgkb_annotations()

    # Parse VCF
    found_variants, samples = parse_vcf_for_pharmgkb(vcf_path, sample_idx, annotations)

    if sample_idx < len(samples):
        sample_name = samples[sample_idx]
    else:
        sample_name = f"Sample_{sample_idx}"
        # The run still completes, but no genotype could be read for this
        # index, so make that visible instead of failing silently.
        print(
            f"Warning: sample index {sample_idx} is out of range "
            f"({len(samples)} samples in VCF); genotypes will be reported as N/A",
            file=sys.stderr,
        )
    print(f"\nFound {len(found_variants)} variants with PharmGKB annotations")

    # Count by evidence level
    level_counts = defaultdict(int)
    for var in found_variants.values():
        for ann in var['annotations']:
            level_counts[ann['evidence_level']] += 1

    print("\nAnnotations by evidence level:")
    for level in ['1A', '1B', '2A', '2B', '3', '4']:
        print(f" Level {level}: {level_counts.get(level, 0)}")

    # Generate report
    generate_comprehensive_report(found_variants, allele_info, output_path, sample_name)

    # Print high-evidence findings to console
    print("\n" + banner)
    print("HIGH EVIDENCE FINDINGS (Level 1A/1B)")
    print(banner)

    for rsid, var in found_variants.items():
        for ann in var['annotations']:
            if ann['evidence_level'] not in ('1A', '1B'):
                continue
            drugs = ann['drugs']
            # Only mark the list as truncated when it actually was.
            drugs_shown = drugs[:80] + "..." if len(drugs) > 80 else drugs
            print(f"\n{ann['gene']} ({rsid})")
            print(f" Genotype: {var['genotype_string']}")
            print(f" Drug(s): {drugs_shown}")
            print(f" Level: {ann['evidence_level']}")
|
||||
|
||||
|
||||
# Script entry point: run the analysis only when executed directly.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user