Refactor: Replace scaffolding with working analysis scripts

- Add trio_analysis.py for trio-based variant analysis with de novo detection
- Add clinvar_acmg_annotate.py for ClinVar/ACMG annotation
- Add gwas_comprehensive.py with 201 SNPs across 18 categories
- Add pharmgkb_full_analysis.py for pharmacogenomics analysis
- Add gwas_trait_lookup.py for basic GWAS trait lookup
- Add pharmacogenomics.py for basic PGx analysis
- Remove unused scaffolding code (src/, configs/, docs/, tests/)
- Update README.md with new documentation

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2025-12-01 22:36:02 +08:00
parent f74dc351f7
commit d13d58df8b
56 changed files with 2608 additions and 2347 deletions

349
pharmgkb_full_analysis.py Normal file
View File

@@ -0,0 +1,349 @@
#!/usr/bin/env python3
"""
Comprehensive PharmGKB Analysis Script
Uses full PharmGKB clinical annotations database for pharmacogenomics analysis.
"""
import gzip
import sys
import os
import re
from collections import defaultdict
from typing import Dict, List, Set, Tuple
# PharmGKB database paths
# Directory that holds the PharmGKB TSV files used by this script.
# NOTE(review): this is an absolute, machine-specific path -- confirm it exists
# on the host before running.
PHARMGKB_DIR = "/Volumes/NV2/genomics_reference/pharmgkb"
# Tab-separated clinical annotation table with one header line; read by
# load_pharmgkb_annotations(), which uses columns 0, 1, 2, 3, 7, 10 and 11.
ANNOTATIONS_FILE = f"{PHARMGKB_DIR}/clinical_annotations.tsv"
# Tab-separated per-genotype table with one header line; read by
# load_pharmgkb_annotations(), which uses columns 0-3 (annotation id,
# genotype, annotation text, allele function).
ALLELES_FILE = f"{PHARMGKB_DIR}/clinical_ann_alleles.tsv"
def load_pharmgkb_annotations(annotations_file: "str | None" = None,
                              alleles_file: "str | None" = None) -> Tuple[Dict, Dict]:
    """Load PharmGKB clinical annotations and allele information.

    Args:
        annotations_file: Path to the clinical annotations TSV. Defaults to
            the module-level ``ANNOTATIONS_FILE``.
        alleles_file: Path to the per-genotype TSV. Defaults to the
            module-level ``ALLELES_FILE``.

    Returns:
        A ``(annotations, allele_info)`` tuple where

        * ``annotations`` maps an rsID to a list of dicts with the keys
          ``ann_id``, ``gene``, ``evidence_level``, ``phenotype_category``,
          ``drugs`` and ``phenotypes``;
        * ``allele_info`` maps an annotation id to
          ``{genotype: {'text': ..., 'function': ...}}``.

    Raises:
        OSError: If either file cannot be opened.
    """
    annotations_path = ANNOTATIONS_FILE if annotations_file is None else annotations_file
    alleles_path = ALLELES_FILE if alleles_file is None else alleles_file

    # Load main annotations
    annotations = {}
    print(f"Loading PharmGKB annotations from {annotations_path}...")
    with open(annotations_path, 'r', encoding='utf-8') as f:
        f.readline()  # skip the header row
        for line in f:
            # Strip only the line terminator: a plain strip() would also eat
            # trailing tabs and silently drop empty trailing columns.
            parts = line.rstrip('\r\n').split('\t')
            if len(parts) < 11:
                continue
            variant = parts[1]  # rsid or haplotype
            # Only process rs variants (SNPs); star alleles etc. are skipped.
            if not variant.startswith('rs'):
                continue
            annotations.setdefault(variant, []).append({
                'ann_id': parts[0],
                'gene': parts[2],
                'evidence_level': parts[3],
                'phenotype_category': parts[7],
                'drugs': parts[10],
                'phenotypes': parts[11] if len(parts) > 11 else "",
            })

    # Load allele-specific information
    allele_info = {}
    print(f"Loading allele information from {alleles_path}...")
    with open(alleles_path, 'r', encoding='utf-8') as f:
        f.readline()  # skip the header row
        for line in f:
            parts = line.rstrip('\r\n').split('\t')
            if len(parts) < 3:
                continue
            ann_id, genotype, annotation_text = parts[:3]
            allele_function = parts[3] if len(parts) > 3 else ""
            allele_info.setdefault(ann_id, {})[genotype] = {
                'text': annotation_text,
                'function': allele_function,
            }

    print(f"Loaded {len(annotations)} unique variants with annotations")
    return annotations, allele_info
def get_genotype_class(gt: str) -> str:
    """Classify a VCF GT value by zygosity.

    Args:
        gt: Raw GT string such as ``"0/1"``, ``"1|1"`` or ``"./."``.

    Returns:
        ``'MISSING'`` when no allele is called, ``'HOM_REF'`` when every
        allele is the reference, ``'HOM_ALT'`` when every allele is the same
        non-reference allele, and ``'HET'`` otherwise (including two
        different ALT alleles such as ``"1/2"`` and half-calls such as
        ``"./1"``).
    """
    alleles = re.split(r'[/|]', gt)
    called = [a for a in alleles if a.isdigit()]
    if not called:
        # Covers "./.", ".|.", ".", the empty string and any ploidy of dots.
        return 'MISSING'
    if len(called) < len(alleles):
        # Half-call: one allele missing, one called. Kept as HET, as before.
        return 'HET'
    if all(a == '0' for a in called):
        return 'HOM_REF'
    if '0' not in called and len(set(called)) == 1:
        return 'HOM_ALT'
    # Mixed REF/ALT, or two different ALT alleles (e.g. "1/2").
    return 'HET'
def get_genotype_string(gt: str, ref: str, alt: str) -> str:
    """Translate a numeric VCF genotype into slash-joined allele sequences.

    Args:
        gt: Raw GT value, e.g. ``"0/1"`` or ``"1|2"``.
        ref: REF allele of the record.
        alt: ALT column of the record (comma-separated when multi-allelic).

    Returns:
        ``'N/A'`` for a fully missing genotype; otherwise the alleles joined
        with ``'/'``, with ``'?'`` standing in for any allele code that is
        not a number or points past the REF/ALT list.
    """
    if gt in ('./.', '.|.', '.'):
        return 'N/A'

    # Index 0 is REF, 1..n are the ALT alleles in column order.
    allele_table = [ref, *alt.split(',')]

    def _decode(code: str) -> str:
        if not code.isdigit():
            return '?'
        position = int(code)
        return allele_table[position] if position < len(allele_table) else '?'

    return '/'.join(_decode(code) for code in re.split('[/|]', gt))
def parse_vcf_for_pharmgkb(vcf_path: str, sample_idx: int, annotations: Dict) -> Tuple[Dict, List[str]]:
    """Scan a VCF and collect the records whose ID has a PharmGKB annotation.

    Args:
        vcf_path: Path to a plain or gzip-compressed (``.gz``) VCF file.
        sample_idx: Zero-based index of the sample column to genotype.
        annotations: Mapping of rsID to its annotation list, as returned by
            ``load_pharmgkb_annotations``.

    Returns:
        A ``(found_variants, samples)`` tuple. ``found_variants`` maps each
        matched rsID to a dict with the keys ``rsid``, ``chrom``, ``pos``,
        ``ref``, ``alt``, ``genotype``, ``genotype_class``,
        ``genotype_string`` and ``annotations``. ``samples`` is the list of
        sample names from the ``#CHROM`` header line.

    Raises:
        OSError: If the VCF cannot be opened.
    """
    print(f"Scanning VCF for {len(annotations)} PharmGKB variants...")
    found_variants = {}
    samples = []

    # Text mode works for both the builtin open() and gzip.open().
    open_func = gzip.open if vcf_path.endswith('.gz') else open
    with open_func(vcf_path, 'rt') as f:
        for line in f:
            if line.startswith('##'):
                continue
            if line.startswith('#CHROM'):
                samples = line.rstrip('\r\n').split('\t')[9:]
                print(f"Found {len(samples)} samples, analyzing index {sample_idx}: {samples[sample_idx] if sample_idx < len(samples) else 'N/A'}")
                continue

            parts = line.rstrip('\r\n').split('\t')
            if len(parts) < 10:
                # No FORMAT/sample columns: nothing to genotype.
                continue
            chrom, pos, id_field, ref, alt, _qual, _filt, _info, fmt = parts[:9]
            gt_fields = parts[9:]

            # The VCF ID column may carry several identifiers separated by
            # ';' -- check each of them against the PharmGKB table.
            matched_ids = [vid for vid in id_field.split(';') if vid in annotations]
            if not matched_ids:
                continue

            # Get sample genotype; treat it as missing when FORMAT has no GT
            # key rather than misreading some other sub-field as a genotype.
            gt = './.'
            fmt_parts = fmt.split(':')
            if 'GT' in fmt_parts and sample_idx < len(gt_fields):
                gt_idx = fmt_parts.index('GT')
                gt_data = gt_fields[sample_idx].split(':')
                if gt_idx < len(gt_data):
                    gt = gt_data[gt_idx]

            gt_class = get_genotype_class(gt)
            gt_string = get_genotype_string(gt, ref, alt)
            for rsid in matched_ids:
                found_variants[rsid] = {
                    'rsid': rsid,
                    'chrom': chrom,
                    'pos': pos,
                    'ref': ref,
                    'alt': alt,
                    'genotype': gt,
                    'genotype_class': gt_class,
                    'genotype_string': gt_string,
                    'annotations': annotations[rsid],
                }
    return found_variants, samples
def generate_comprehensive_report(found_variants: Dict, allele_info: Dict,
                                  output_path: str, sample_name: str):
    """Write the plain-text pharmacogenomics report to *output_path*.

    Args:
        found_variants: Mapping of rsID to the variant dict built by
            ``parse_vcf_for_pharmgkb`` (needs ``genotype_string``,
            ``genotype_class`` and ``annotations``).
        allele_info: Mapping of annotation id to
            ``{genotype: {'text': ..., 'function': ...}}``.
        output_path: Destination file; overwritten if it exists.
        sample_name: Sample label printed in the report header.

    Raises:
        OSError: If the report file cannot be written.
    """
    # Categorize by evidence level and phenotype category
    by_evidence = defaultdict(list)
    by_category = defaultdict(list)
    for rsid, var in found_variants.items():
        for ann in var['annotations']:
            by_evidence[ann['evidence_level']].append((rsid, var, ann))
            if ann['phenotype_category']:
                by_category[ann['phenotype_category']].append((rsid, var, ann))

    rule = "=" * 80

    def by_gene(entry):
        return entry[2]['gene']

    # Explicit UTF-8: annotation text can contain non-ASCII characters.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(rule + "\n")
        f.write("COMPREHENSIVE PHARMACOGENOMICS REPORT\n")
        f.write("Based on PharmGKB Clinical Annotations Database\n")
        f.write(rule + "\n\n")
        f.write(f"Sample: {sample_name}\n")
        f.write(f"Total variants with PharmGKB annotations: {len(found_variants)}\n\n")

        # Summary statistics
        f.write(rule + "\n")
        f.write("SUMMARY BY EVIDENCE LEVEL\n")
        f.write(rule + "\n\n")
        f.write("Level 1A: Annotation based on CPIC or DPWG guideline\n")
        f.write("Level 1B: Annotation based on FDA or EMA label\n")
        f.write("Level 2A: Moderate clinical significance\n")
        f.write("Level 2B: Lower clinical significance\n")
        f.write("Level 3: Low evidence\n")
        f.write("Level 4: In vitro/preclinical evidence only\n\n")
        for level in ['1A', '1B', '2A', '2B', '3', '4']:
            count = len(by_evidence.get(level, []))
            f.write(f" Level {level}: {count} annotations\n")

        # High evidence findings (1A, 1B)
        f.write("\n" + rule + "\n")
        f.write("HIGH EVIDENCE FINDINGS (Level 1A/1B - CPIC/DPWG Guidelines & FDA Labels)\n")
        f.write(rule + "\n\n")
        high_evidence = by_evidence.get('1A', []) + by_evidence.get('1B', [])
        if high_evidence:
            for rsid, var, ann in sorted(high_evidence, key=by_gene):
                gt_string = var['genotype_string']
                f.write(f"GENE: {ann['gene']} ({rsid})\n")
                f.write(f" Genotype: {gt_string} ({var['genotype_class']})\n")
                f.write(f" Drug(s): {ann['drugs']}\n")
                f.write(f" Category: {ann['phenotype_category']}\n")
                f.write(f" Evidence Level: {ann['evidence_level']}\n")
                # Allele-specific annotation: first database genotype that
                # matches the sample's genotype, if any.
                for geno, info in allele_info.get(ann['ann_id'], {}).items():
                    if not _genotypes_match(gt_string, geno):
                        continue
                    if info['text']:
                        f.write(f" Clinical Annotation: {_truncate(info['text'], 500)}\n")
                    if info['function']:
                        f.write(f" Allele Function: {info['function']}\n")
                    break
                f.write("\n")
        else:
            f.write(" No high-evidence findings.\n\n")

        # Moderate evidence findings (2A, 2B)
        f.write(rule + "\n")
        f.write("MODERATE EVIDENCE FINDINGS (Level 2A/2B)\n")
        f.write(rule + "\n\n")
        moderate_evidence = by_evidence.get('2A', []) + by_evidence.get('2B', [])
        if moderate_evidence:
            # Only the first 50 entries (in gene order) are listed in full.
            for rsid, var, ann in sorted(moderate_evidence, key=by_gene)[:50]:
                f.write(f"GENE: {ann['gene']} ({rsid})\n")
                f.write(f" Genotype: {var['genotype_string']}\n")
                f.write(f" Drug(s): {ann['drugs']}\n")
                f.write(f" Category: {ann['phenotype_category']}\n")
                f.write(f" Level: {ann['evidence_level']}\n\n")
            if len(moderate_evidence) > 50:
                f.write(f" ... and {len(moderate_evidence) - 50} more moderate evidence findings\n\n")
        else:
            f.write(" No moderate-evidence findings.\n\n")

        # Summary by phenotype category
        f.write(rule + "\n")
        f.write("SUMMARY BY PHENOTYPE CATEGORY\n")
        f.write(rule + "\n\n")
        for category in sorted(by_category):
            items = by_category[category]
            f.write(f"\n## {category}: {len(items)} annotations\n")
            f.write("-" * 40 + "\n")
            # Show up to five of the higher-evidence items for each category
            high_in_cat = [x for x in items if x[2]['evidence_level'] in ('1A', '1B', '2A')]
            for rsid, var, ann in high_in_cat[:5]:
                f.write(f" {ann['gene']} ({rsid}): {_truncate(ann['drugs'], 50)}\n")

        # Full detailed list
        f.write("\n" + rule + "\n")
        f.write("COMPLETE VARIANT LIST\n")
        f.write(rule + "\n\n")
        f.write("RSID\tGENE\tGENOTYPE\tLEVEL\tCATEGORY\tDRUGS\n")
        for rsid in sorted(found_variants):
            var = found_variants[rsid]
            for ann in var['annotations']:
                f.write(f"{rsid}\t{ann['gene']}\t{var['genotype_string']}\t{ann['evidence_level']}\t{ann['phenotype_category']}\t{_truncate(ann['drugs'], 30)}\n")

    print(f"Report saved to: {output_path}")


def _truncate(text: str, limit: int) -> str:
    """Return *text* cut to *limit* characters, with "..." appended if cut."""
    return text[:limit] + "..." if len(text) > limit else text


def _genotype_alleles(genotype: str) -> List[str]:
    """Split a genotype string into a sorted list of its alleles.

    Delimited forms ("A/G", "A|G") are split on the delimiter; compact
    nucleotide forms ("AG") are split per base; anything else is kept whole.
    """
    if '/' in genotype or '|' in genotype:
        alleles = genotype.replace('|', '/').split('/')
    elif genotype and all(base in 'ACGT' for base in genotype):
        alleles = list(genotype)
    else:
        alleles = [genotype]
    return sorted(alleles)


def _genotypes_match(sample_genotype: str, db_genotype: str) -> bool:
    """Tell whether two genotype strings hold the same alleles in any order."""
    if sample_genotype == 'N/A':
        # Missing call: never attach a genotype-specific annotation.
        return False
    return _genotype_alleles(sample_genotype) == _genotype_alleles(db_genotype)
def main():
    """Run the PharmGKB analysis from the command line.

    Positional arguments (all optional):
        1. VCF path (plain or ``.gz``).
        2. Output report path.
        3. Zero-based sample column index (default 2).

    Loads the PharmGKB tables, scans the VCF for annotated variants, writes
    the text report and prints a console summary of Level 1A/1B findings.
    """
    vcf_path = sys.argv[1] if len(sys.argv) > 1 else '/Volumes/NV2/genomics_analysis/vcf/trio_joint.snpeff.vcf'
    output_path = sys.argv[2] if len(sys.argv) > 2 else '/Volumes/NV2/genomics_analysis/pharmgkb_full_report.txt'
    sample_idx = int(sys.argv[3]) if len(sys.argv) > 3 else 2

    print("=" * 60)
    print("COMPREHENSIVE PHARMGKB ANALYSIS")
    print("=" * 60)
    print(f"VCF: {vcf_path}")
    print(f"Sample index: {sample_idx}")
    print()

    # Load PharmGKB database
    annotations, allele_info = load_pharmgkb_annotations()

    # Parse VCF
    found_variants, samples = parse_vcf_for_pharmgkb(vcf_path, sample_idx, annotations)
    if sample_idx < len(samples):
        sample_name = samples[sample_idx]
    else:
        sample_name = f"Sample_{sample_idx}"
        # Every genotype is reported as N/A in this case, so say so loudly
        # instead of producing a report that merely looks empty.
        print(f"WARNING: sample index {sample_idx} is out of range "
              f"(VCF has {len(samples)} samples); genotypes will be N/A",
              file=sys.stderr)
    print(f"\nFound {len(found_variants)} variants with PharmGKB annotations")

    # Count by evidence level
    level_counts = defaultdict(int)
    for var in found_variants.values():
        for ann in var['annotations']:
            level_counts[ann['evidence_level']] += 1
    print("\nAnnotations by evidence level:")
    for level in ['1A', '1B', '2A', '2B', '3', '4']:
        print(f" Level {level}: {level_counts.get(level, 0)}")

    # Generate report
    generate_comprehensive_report(found_variants, allele_info, output_path, sample_name)

    # Print high-evidence findings to console
    print("\n" + "=" * 60)
    print("HIGH EVIDENCE FINDINGS (Level 1A/1B)")
    print("=" * 60)
    for rsid, var in found_variants.items():
        for ann in var['annotations']:
            if ann['evidence_level'] not in ('1A', '1B'):
                continue
            drugs = ann['drugs']
            # Append the ellipsis only when the drug list was actually cut.
            drugs_shown = drugs[:80] + "..." if len(drugs) > 80 else drugs
            print(f"\n{ann['gene']} ({rsid})")
            print(f" Genotype: {var['genotype_string']}")
            print(f" Drug(s): {drugs_shown}")
            print(f" Level: {ann['evidence_level']}")


if __name__ == '__main__':
    main()