From d13d58df8bc7cd98b3c68e398c6688ab082980a6 Mon Sep 17 00:00:00 2001 From: gbanyan Date: Mon, 1 Dec 2025 22:36:02 +0800 Subject: [PATCH] Refactor: Replace scaffolding with working analysis scripts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add trio_analysis.py for trio-based variant analysis with de novo detection - Add clinvar_acmg_annotate.py for ClinVar/ACMG annotation - Add gwas_comprehensive.py with 201 SNPs across 18 categories - Add pharmgkb_full_analysis.py for pharmacogenomics analysis - Add gwas_trait_lookup.py for basic GWAS trait lookup - Add pharmacogenomics.py for basic PGx analysis - Remove unused scaffolding code (src/, configs/, docs/, tests/) - Update README.md with new documentation 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- Makefile | 14 - README.md | 207 +++--- clinvar_acmg_annotate.py | 448 +++++++++++++ configs/acmg_config.example.yaml | 12 - configs/panel.example.json | 10 - configs/phenotype_to_genes.example.json | 11 - configs/phenotype_to_genes.hpo_seed.json | 13 - docs/phase1_howto.md | 101 --- docs/phase1_implementation_plan.md | 80 --- docs/system_architecture.md | 69 -- genomic_decision_support_system_spec_v0.1.md | 356 ----------- gwas_comprehensive.py | 590 ++++++++++++++++++ gwas_trait_lookup.py | 365 +++++++++++ pharmacogenomics.py | 376 +++++++++++ pharmgkb_full_analysis.py | 349 +++++++++++ pyproject.toml | 31 - sample_data/example_annotated.tsv | 3 - src/genomic_consultant.egg-info/PKG-INFO | 63 -- src/genomic_consultant.egg-info/SOURCES.txt | 27 - .../dependency_links.txt | 1 - .../entry_points.txt | 2 - src/genomic_consultant.egg-info/requires.txt | 5 - src/genomic_consultant.egg-info/top_level.txt | 1 - src/genomic_consultant/__init__.py | 4 - src/genomic_consultant/acmg/__init__.py | 1 - src/genomic_consultant/acmg/tagger.py | 91 --- src/genomic_consultant/audit/__init__.py | 1 - src/genomic_consultant/audit/run_log.py | 30 - src/genomic_consultant/cli.py | 273 -------- .../orchestration/__init__.py | 1 - .../orchestration/phase1_pipeline.py | 146 ----- .../orchestration/workflows.py | 35 -- src/genomic_consultant/panels/__init__.py | 1 - src/genomic_consultant/panels/aggregate.py | 31 - src/genomic_consultant/panels/panels.py | 38 -- src/genomic_consultant/panels/resolver.py | 42 -- src/genomic_consultant/pipelines/__init__.py | 1 - .../pipelines/annotation.py | 72 --- src/genomic_consultant/pipelines/runner.py | 62 -- .../pipelines/variant_calling.py | 65 -- src/genomic_consultant/reporting/__init__.py | 1 - src/genomic_consultant/reporting/report.py | 75 --- src/genomic_consultant/store/__init__.py | 1 - src/genomic_consultant/store/parquet_store.py | 83 --- src/genomic_consultant/store/query.py | 109 ---- src/genomic_consultant/utils/__init__.py | 1 - src/genomic_consultant/utils/hashing.py | 15 - src/genomic_consultant/utils/models.py | 81 --- src/genomic_consultant/utils/tooling.py | 22 - tests/test_acmg.py | 41 -- tests/test_aggregate.py | 16 - tests/test_phase1_pipeline.py | 27 - tests/test_resolver.py | 18 - tests/test_store.py | 28 - tests/test_store_tsv.py | 33 - trio_analysis.py | 376 +++++++++++ 56 files changed, 2608 insertions(+), 2347 deletions(-) delete mode 100644 Makefile create mode 100644 clinvar_acmg_annotate.py delete mode 100644 configs/acmg_config.example.yaml delete mode 100644 configs/panel.example.json delete mode 100644 configs/phenotype_to_genes.example.json delete mode 100644 configs/phenotype_to_genes.hpo_seed.json delete mode 100644 docs/phase1_howto.md delete mode 100644 docs/phase1_implementation_plan.md delete mode 100644 docs/system_architecture.md delete mode 100644 genomic_decision_support_system_spec_v0.1.md create mode 100644 gwas_comprehensive.py create mode 100644 gwas_trait_lookup.py create mode 100644 pharmacogenomics.py create mode 100644 pharmgkb_full_analysis.py delete mode 100644 pyproject.toml delete mode 100644 sample_data/example_annotated.tsv delete mode 100644 src/genomic_consultant.egg-info/PKG-INFO delete mode 100644 src/genomic_consultant.egg-info/SOURCES.txt delete mode 100644 src/genomic_consultant.egg-info/dependency_links.txt delete mode 100644 src/genomic_consultant.egg-info/entry_points.txt delete mode 100644 src/genomic_consultant.egg-info/requires.txt delete mode 100644 src/genomic_consultant.egg-info/top_level.txt delete mode 100644 src/genomic_consultant/__init__.py delete mode 100644 src/genomic_consultant/acmg/__init__.py delete mode 100644 src/genomic_consultant/acmg/tagger.py delete mode 100644 src/genomic_consultant/audit/__init__.py delete mode 100644 src/genomic_consultant/audit/run_log.py delete mode 100644 src/genomic_consultant/cli.py delete mode 100644 src/genomic_consultant/orchestration/__init__.py delete mode 100644 src/genomic_consultant/orchestration/phase1_pipeline.py delete mode 100644 src/genomic_consultant/orchestration/workflows.py delete mode 100644 src/genomic_consultant/panels/__init__.py delete mode 100644 src/genomic_consultant/panels/aggregate.py delete mode 100644 src/genomic_consultant/panels/panels.py delete mode 100644 src/genomic_consultant/panels/resolver.py delete mode 100644 src/genomic_consultant/pipelines/__init__.py delete mode 100644 src/genomic_consultant/pipelines/annotation.py delete mode 100644 src/genomic_consultant/pipelines/runner.py delete mode 100644 src/genomic_consultant/pipelines/variant_calling.py delete mode 100644 src/genomic_consultant/reporting/__init__.py delete mode 100644 src/genomic_consultant/reporting/report.py delete mode 100644 src/genomic_consultant/store/__init__.py delete mode 100644 src/genomic_consultant/store/parquet_store.py delete mode 100644 src/genomic_consultant/store/query.py delete mode 100644 src/genomic_consultant/utils/__init__.py delete mode 100644 src/genomic_consultant/utils/hashing.py delete mode 100644 src/genomic_consultant/utils/models.py delete mode 100644 src/genomic_consultant/utils/tooling.py delete mode 100644 tests/test_acmg.py delete mode 100644 tests/test_aggregate.py delete mode 100644 tests/test_phase1_pipeline.py delete mode 100644 tests/test_resolver.py delete mode 100644 tests/test_store.py delete mode 100644 tests/test_store_tsv.py create mode 100644 trio_analysis.py diff --git a/Makefile b/Makefile deleted file mode 100644 index 8102f16..0000000 --- a/Makefile +++ /dev/null @@ -1,14 +0,0 @@ -PYTHON := .venv/bin/python -PIP := .venv/bin/pip - -.PHONY: venv install test - -venv: - python3 -m venv .venv - $(PIP) install -e . - -install: venv - $(PIP) install -e .[store] - -test: - $(PYTHON) -m pytest diff --git a/README.md b/README.md index af73dab..65a903c 100644 --- a/README.md +++ b/README.md @@ -1,112 +1,113 @@ # Genomic Consultant -Early design for a personal genomic risk and drug–interaction decision support system. Specs are sourced from `genomic_decision_support_system_spec_v0.1.md`. +A practical genomics analysis toolkit for Trio WES (Whole Exome Sequencing) data analysis, including ClinVar/ACMG annotation, GWAS trait analysis, and pharmacogenomics. -## Vision (per spec) -- Phase 1: trio variant calling, annotation, queryable genomic DB, initial ACMG evidence tagging. -- Phase 2: pharmacogenomics genotype-to-phenotype mapping plus drug–drug interaction checks. -- Phase 3: supplement/herb normalization and interaction risk layering. -- Phase 4: LLM-driven query orchestration and report generation. +## Analysis Scripts -## Repository Layout -- `docs/` — system architecture notes, phase plans, data models (work in progress). -- `configs/` — example ACMG config and gene panel JSON. -- `configs/phenotype_to_genes.example.json` — placeholder phenotype/HPO → gene mappings. -- `configs/phenotype_to_genes.hpo_seed.json` — seed HPO mappings (replace with full HPO/GenCC derived panels). -- `sample_data/` — tiny annotated TSV for demo. -- `src/genomic_consultant/` — Python scaffolding (pipelines, store, panel lookup, ACMG tagging, reporting). -- `genomic_decision_support_system_spec_v0.1.md` — original requirements draft. +### 1. Trio Analysis (`trio_analysis.py`) +Comprehensive trio-based variant analysis with de novo detection, compound heterozygosity, and inheritance pattern annotation. -## Contributing/next steps -1. Finalize Phase 1 tech selection (variant caller, annotation stack, reference/DB versions). -2. Stand up the Phase 1 pipelines and minimal query API surface. -3. Add ACMG evidence tagging config and human-review logging. -4. Layer in PGx/DDI and supplement modules per later phases. - -Data safety: keep genomic/clinical data local; the `.gitignore` blocks common genomic outputs by default. - -## Quickstart (CLI scaffolding) -``` -pip install -e . - -# 1) Show trio calling plan (commands only; not executed) -genomic-consultant plan-call \ - --sample proband:/data/proband.bam \ - --sample father:/data/father.bam \ - --sample mother:/data/mother.bam \ - --reference /refs/GRCh38.fa \ - --workdir /tmp/trio - -# 1b) Execute calling plan (requires GATK installed) and emit run log -genomic-consultant run-call \ - --sample proband:/data/proband.bam \ - --sample father:/data/father.bam \ - --sample mother:/data/mother.bam \ - --reference /refs/GRCh38.fa \ - --workdir /tmp/trio \ - --log /tmp/trio/run_call_log.json \ - --probe-tools - -# 2) Show annotation plan for a joint VCF -genomic-consultant plan-annotate \ - --vcf /tmp/trio/trio.joint.vcf.gz \ - --workdir /tmp/trio/annot \ - --prefix trio \ - --reference /refs/GRCh38.fa - -# 2b) Execute annotation plan (requires VEP, bcftools) with run log -genomic-consultant run-annotate \ - --vcf /tmp/trio/trio.joint.vcf.gz \ - --workdir /tmp/trio/annot \ - --prefix trio \ - --reference /refs/GRCh38.fa \ - --log /tmp/trio/annot/run_annot_log.json \ - --probe-tools - -# 3) Demo panel report using sample data (panel file) -genomic-consultant panel-report \ - --tsv sample_data/example_annotated.tsv \ - --panel configs/panel.example.json \ - --acmg-config configs/acmg_config.example.yaml \ - --individual-id demo \ - --format markdown \ - --log /tmp/panel_log.json - -# 3b) Demo panel report using phenotype mapping (HPO) -genomic-consultant panel-report \ - --tsv sample_data/example_annotated.tsv \ - --phenotype-id HP:0000365 \ - --phenotype-mapping configs/phenotype_to_genes.hpo_seed.json \ - --acmg-config configs/acmg_config.example.yaml \ - --individual-id demo \ - --format markdown - -# 3c) Merge multiple phenotype→gene mappings into one -genomic-consultant build-phenotype-mapping \ - --output configs/phenotype_to_genes.merged.json \ - configs/phenotype_to_genes.example.json configs/phenotype_to_genes.hpo_seed.json - -# 4) End-to-end Phase 1 pipeline (optionally skip call/annotate; use sample TSV) -genomic-consultant phase1-run \ - --tsv sample_data/example_annotated.tsv \ - --skip-call --skip-annotate \ - --panel configs/panel.example.json \ - --acmg-config configs/acmg_config.example.yaml \ - --workdir runtime \ - --prefix demo - -# Run tests -pytest +```bash +python trio_analysis.py ``` -### Optional Parquet-backed store -Install pandas to enable Parquet ingestion: -``` -pip install -e .[store] +### 2. ClinVar/ACMG Annotation (`clinvar_acmg_annotate.py`) +Annotates variants with ClinVar clinical significance and generates ACMG-style evidence tags. + +```bash +python clinvar_acmg_annotate.py [sample_idx] ``` -### Notes on VEP plugins (SpliceAI/CADD) -- The annotation plan already queries `SpliceAI` and `CADD_PHRED` fields; ensure your VEP run includes plugins/flags that produce them, e.g.: - - `--plugin SpliceAI,snv=/path/to/spliceai.snv.vcf.gz,indel=/path/to/spliceai.indel.vcf.gz` - - `--plugin CADD,/path/to/whole_genome_SNVs.tsv.gz,/path/to/InDels.tsv.gz` -- Pass these via `--plugin` and/or `--extra-flag` on `run-annotate` / `plan-annotate` to embed fields into the TSV. +### 3. GWAS Comprehensive Analysis (`gwas_comprehensive.py`) +Comprehensive GWAS trait analysis with 201 curated SNPs across 18 categories: +- Gout / Uric acid metabolism +- Kidney disease +- Hearing loss +- Autoimmune diseases +- Cancer risk +- Blood clotting / Thrombosis +- Thyroid disorders +- Bone health / Osteoporosis +- Liver disease (NAFLD) +- Migraine +- Longevity / Aging +- Sleep +- Skin conditions +- Cardiovascular disease +- Metabolic disorders +- Eye conditions +- Neuropsychiatric +- Other traits + +```bash +python gwas_comprehensive.py [sample_idx] +``` + +### 4. PharmGKB Full Analysis (`pharmgkb_full_analysis.py`) +Comprehensive pharmacogenomics analysis using the PharmGKB clinical annotations database. + +```bash +python pharmgkb_full_analysis.py [sample_idx] +``` + +### 5. GWAS Trait Lookup (`gwas_trait_lookup.py`) +Original curated GWAS trait lookup (smaller SNP set). + +```bash +python gwas_trait_lookup.py [sample_idx] +``` + +### 6. Basic Pharmacogenomics (`pharmacogenomics.py`) +Basic pharmacogenomics analysis with common drug-gene interactions. + +## Prerequisites + +- Python 3.8+ +- conda environment with bioinformatics tools: + ```bash + conda create -n genomics python=3.10 + conda activate genomics + conda install -c bioconda bcftools snpeff gatk4 + ``` + +## Reference Databases Required + +- **ClinVar**: VCF from NCBI +- **PharmGKB**: Clinical annotations TSV +- **dbSNP**: For rsID annotation +- **GRCh37/hg19 reference genome** + +## Data Directory Structure + +``` +/Volumes/NV2/ +├── genomics_analysis/ +│ └── vcf/ +│ ├── trio_joint.vcf.gz # Joint-called VCF +│ ├── trio_joint.rsid.vcf.gz # With rsID annotations +│ └── trio_joint.snpeff.vcf # With SnpEff annotations +└── genomics_reference/ + ├── clinvar/ + ├── pharmgkb/ + ├── dbsnp/ + └── gwas_catalog/ +``` + +## Sample Index Mapping + +For trio VCF files: +- Index 0: Mother +- Index 1: Father +- Index 2: Proband + +## Output Reports + +Each script generates detailed reports including: +- Summary statistics +- Risk variant identification +- Family comparison (for trio data) +- Clinical annotations and recommendations + +## License + +Private use only. diff --git a/clinvar_acmg_annotate.py b/clinvar_acmg_annotate.py new file mode 100644 index 0000000..4969b4a --- /dev/null +++ b/clinvar_acmg_annotate.py @@ -0,0 +1,448 @@ +#!/usr/bin/env python3 +""" +ClinVar Annotation and ACMG Classification Script +Integrates ClinVar lookup with ACMG auto-classification for trio analysis. +""" + +import gzip +import re +import sys +from collections import defaultdict +from dataclasses import dataclass, field +from typing import Dict, List, Optional, Set, Tuple +from pathlib import Path + +# Add project src to path +sys.path.insert(0, str(Path(__file__).parent / "src")) + +try: + from genomic_consultant.acmg.tagger import ACMGConfig, tag_variant, _is_lof + from genomic_consultant.utils.models import Variant, EvidenceTag, SuggestedClassification + HAS_PROJECT_MODULES = True +except ImportError: + HAS_PROJECT_MODULES = False + print("Warning: Project modules not found, using built-in ACMG classification") + + +@dataclass +class ClinVarEntry: + """ClinVar database entry""" + chrom: str + pos: int + ref: str + alt: str + clnsig: str # Clinical significance + clndn: str # Disease name + clnrevstat: str # Review status + clnvc: str # Variant type + af: Optional[float] = None + + +@dataclass +class AnnotatedVariant: + """Variant with all annotations""" + chrom: str + pos: int + ref: str + alt: str + gene: Optional[str] = None + effect: Optional[str] = None + impact: Optional[str] = None + genotypes: Dict[str, str] = field(default_factory=dict) + clinvar_sig: Optional[str] = None + clinvar_disease: Optional[str] = None + clinvar_review: Optional[str] = None + acmg_class: Optional[str] = None + acmg_evidence: List[str] = field(default_factory=list) + inheritance_pattern: Optional[str] = None # de_novo, compound_het, hom_rec, etc. + + @property + def variant_id(self) -> str: + return f"{self.chrom}-{self.pos}-{self.ref}-{self.alt}" + + +def load_clinvar_vcf(clinvar_path: str) -> Dict[str, ClinVarEntry]: + """Load ClinVar VCF into a lookup dictionary""" + print(f"Loading ClinVar database from {clinvar_path}...") + clinvar_db = {} + + open_func = gzip.open if clinvar_path.endswith('.gz') else open + mode = 'rt' if clinvar_path.endswith('.gz') else 'r' + + count = 0 + with open_func(clinvar_path, mode) as f: + for line in f: + if line.startswith('#'): + continue + + parts = line.strip().split('\t') + if len(parts) < 8: + continue + + chrom, pos, _, ref, alt, _, _, info = parts[:8] + + # Parse INFO field + info_dict = {} + for item in info.split(';'): + if '=' in item: + k, v = item.split('=', 1) + info_dict[k] = v + + clnsig = info_dict.get('CLNSIG', '') + clndn = info_dict.get('CLNDN', '') + clnrevstat = info_dict.get('CLNREVSTAT', '') + clnvc = info_dict.get('CLNVC', '') + + # Handle multiple alts + for a in alt.split(','): + key = f"{chrom}-{pos}-{ref}-{a}" + clinvar_db[key] = ClinVarEntry( + chrom=chrom, + pos=int(pos), + ref=ref, + alt=a, + clnsig=clnsig, + clndn=clndn, + clnrevstat=clnrevstat, + clnvc=clnvc + ) + count += 1 + + print(f"Loaded {count} ClinVar entries") + return clinvar_db + + +def parse_snpeff_annotation(info: str) -> Dict: + """Parse SnpEff ANN field""" + result = { + 'gene': None, + 'effect': None, + 'impact': None, + 'hgvs_c': None, + 'hgvs_p': None, + } + + ann_match = re.search(r'ANN=([^;]+)', info) + if not ann_match: + return result + + ann_field = ann_match.group(1) + annotations = ann_field.split(',') + + if annotations: + parts = annotations[0].split('|') + if len(parts) >= 4: + result['effect'] = parts[1] if len(parts) > 1 else None + result['impact'] = parts[2] if len(parts) > 2 else None + result['gene'] = parts[3] if len(parts) > 3 else None + if len(parts) > 9: + result['hgvs_c'] = parts[9] + if len(parts) > 10: + result['hgvs_p'] = parts[10] + + return result + + +def get_genotype_class(gt: str) -> str: + """Classify genotype""" + if gt in ['./.', '.|.', '.']: + return 'MISSING' + + alleles = re.split('[/|]', gt) + if all(a == '0' for a in alleles): + return 'HOM_REF' + elif all(a != '0' and a != '.' for a in alleles): + return 'HOM_ALT' + else: + return 'HET' + + +class ACMGClassifier: + """ACMG variant classifier""" + + def __init__(self, lof_genes: Optional[Set[str]] = None): + self.lof_genes = lof_genes or { + 'BRCA1', 'BRCA2', 'TP53', 'PTEN', 'MLH1', 'MSH2', 'MSH6', 'PMS2', + 'APC', 'MEN1', 'RB1', 'VHL', 'WT1', 'NF1', 'NF2', 'TSC1', 'TSC2' + } + self.ba1_af = 0.05 + self.bs1_af = 0.01 + self.pm2_af = 0.0005 + + def classify(self, variant: AnnotatedVariant, is_de_novo: bool = False) -> Tuple[str, List[str]]: + """Apply ACMG classification rules""" + evidence = [] + + # ClinVar evidence + if variant.clinvar_sig: + sig_lower = variant.clinvar_sig.lower() + if 'pathogenic' in sig_lower and 'likely' not in sig_lower: + evidence.append("PP5: ClinVar pathogenic") + elif 'likely_pathogenic' in sig_lower: + evidence.append("PP5: ClinVar likely pathogenic") + elif 'benign' in sig_lower and 'likely' not in sig_lower: + evidence.append("BP6: ClinVar benign") + elif 'likely_benign' in sig_lower: + evidence.append("BP6: ClinVar likely benign") + + # Loss of function in LoF-sensitive gene (PVS1) + if variant.effect and variant.gene: + lof_keywords = ['frameshift', 'stop_gained', 'splice_acceptor', 'splice_donor', 'start_lost'] + if any(k in variant.effect.lower() for k in lof_keywords): + if variant.gene.upper() in self.lof_genes: + evidence.append("PVS1: Null variant in LoF-sensitive gene") + else: + evidence.append("PVS1_moderate: Null variant (gene not confirmed LoF-sensitive)") + + # De novo (PS2) + if is_de_novo: + evidence.append("PS2: De novo variant") + + # Impact-based evidence + if variant.impact == 'HIGH': + evidence.append("PM4: Protein length change (HIGH impact)") + elif variant.impact == 'MODERATE': + if variant.effect and 'missense' in variant.effect.lower(): + evidence.append("PP3: Computational evidence (missense)") + + # Determine final classification + classification = self._determine_class(evidence, variant.clinvar_sig) + + return classification, evidence + + def _determine_class(self, evidence: List[str], clinvar_sig: Optional[str]) -> str: + """Determine ACMG class based on evidence""" + evidence_str = ' '.join(evidence) + + # ClinVar takes precedence if high confidence + if clinvar_sig: + sig_lower = clinvar_sig.lower() + if 'pathogenic' in sig_lower and 'conflicting' not in sig_lower: + if 'likely' in sig_lower: + return 'Likely Pathogenic' + return 'Pathogenic' + elif 'benign' in sig_lower and 'conflicting' not in sig_lower: + if 'likely' in sig_lower: + return 'Likely Benign' + return 'Benign' + + # Rule-based classification + has_pvs1 = 'PVS1:' in evidence_str + has_ps2 = 'PS2:' in evidence_str + has_pm4 = 'PM4:' in evidence_str + has_pp = 'PP' in evidence_str + has_bp = 'BP' in evidence_str + + if has_pvs1 and has_ps2: + return 'Pathogenic' + elif has_pvs1 or (has_ps2 and has_pm4): + return 'Likely Pathogenic' + elif has_bp and not has_pp and not has_pvs1: + return 'Likely Benign' + else: + return 'VUS' + + +def analyze_trio_with_clinvar( + snpeff_vcf: str, + clinvar_path: str, + output_path: str, + proband_idx: int = 0, + father_idx: int = 1, + mother_idx: int = 2 +): + """Main analysis function""" + + # Load ClinVar + clinvar_db = load_clinvar_vcf(clinvar_path) + + # Initialize classifier + classifier = ACMGClassifier() + + # Parse VCF and annotate + print(f"Processing {snpeff_vcf}...") + + samples = [] + results = [] + pathogenic_variants = [] + + open_func = gzip.open if snpeff_vcf.endswith('.gz') else open + mode = 'rt' if snpeff_vcf.endswith('.gz') else 'r' + + with open_func(snpeff_vcf, mode) as f: + for line in f: + if line.startswith('##'): + continue + elif line.startswith('#CHROM'): + parts = line.strip().split('\t') + samples = parts[9:] + continue + + parts = line.strip().split('\t') + if len(parts) < 10: + continue + + chrom, pos, _, ref, alt, qual, filt, info, fmt = parts[:9] + gt_fields = parts[9:] + + # Parse genotypes + fmt_parts = fmt.split(':') + gt_idx = fmt_parts.index('GT') if 'GT' in fmt_parts else 0 + + genotypes = {} + for i, sample in enumerate(samples): + gt_data = gt_fields[i].split(':') + genotypes[sample] = gt_data[gt_idx] if gt_idx < len(gt_data) else './.' + + # Parse SnpEff annotation + ann = parse_snpeff_annotation(info) + + # Only process variants in proband + proband = samples[proband_idx] if proband_idx < len(samples) else samples[0] + proband_gt = get_genotype_class(genotypes.get(proband, './.')) + + if proband_gt == 'HOM_REF' or proband_gt == 'MISSING': + continue + + # Check inheritance pattern + father = samples[father_idx] if father_idx < len(samples) else samples[1] + mother = samples[mother_idx] if mother_idx < len(samples) else samples[2] + father_gt = get_genotype_class(genotypes.get(father, './.')) + mother_gt = get_genotype_class(genotypes.get(mother, './.')) + + is_de_novo = (proband_gt in ['HET', 'HOM_ALT'] and + father_gt == 'HOM_REF' and mother_gt == 'HOM_REF') + + is_hom_rec = (proband_gt == 'HOM_ALT' and + father_gt == 'HET' and mother_gt == 'HET') + + inheritance = None + if is_de_novo: + inheritance = 'de_novo' + elif is_hom_rec: + inheritance = 'homozygous_recessive' + elif proband_gt == 'HET': + if father_gt in ['HET', 'HOM_ALT'] and mother_gt == 'HOM_REF': + inheritance = 'paternal' + elif mother_gt in ['HET', 'HOM_ALT'] and father_gt == 'HOM_REF': + inheritance = 'maternal' + + # Lookup ClinVar + for a in alt.split(','): + var_key = f"{chrom}-{pos}-{ref}-{a}" + clinvar_entry = clinvar_db.get(var_key) + + variant = AnnotatedVariant( + chrom=chrom, + pos=int(pos), + ref=ref, + alt=a, + gene=ann['gene'], + effect=ann['effect'], + impact=ann['impact'], + genotypes=genotypes, + inheritance_pattern=inheritance + ) + + if clinvar_entry: + variant.clinvar_sig = clinvar_entry.clnsig + variant.clinvar_disease = clinvar_entry.clndn + variant.clinvar_review = clinvar_entry.clnrevstat + + # ACMG classification + acmg_class, evidence = classifier.classify(variant, is_de_novo) + variant.acmg_class = acmg_class + variant.acmg_evidence = evidence + + # Filter for clinically relevant variants + if (variant.clinvar_sig and 'pathogenic' in variant.clinvar_sig.lower()) or \ + acmg_class in ['Pathogenic', 'Likely Pathogenic'] or \ + (is_de_novo and ann['impact'] in ['HIGH', 'MODERATE']): + pathogenic_variants.append(variant) + + results.append(variant) + + # Generate report + print(f"Writing report to {output_path}...") + + with open(output_path, 'w') as f: + f.write("# ClinVar & ACMG Classification Report\n") + f.write(f"# Input: {snpeff_vcf}\n") + f.write(f"# ClinVar: {clinvar_path}\n") + f.write(f"# Samples: {', '.join(samples)}\n") + f.write(f"# Total variants processed: {len(results)}\n\n") + + f.write("## CLINICALLY RELEVANT VARIANTS\n\n") + f.write("CHROM\tPOS\tREF\tALT\tGENE\tEFFECT\tIMPACT\tINHERITANCE\tCLINVAR_SIG\tCLINVAR_DISEASE\tACMG_CLASS\tACMG_EVIDENCE\n") + + for v in sorted(pathogenic_variants, key=lambda x: (x.acmg_class != 'Pathogenic', + x.acmg_class != 'Likely Pathogenic', + x.chrom, x.pos)): + f.write(f"{v.chrom}\t{v.pos}\t{v.ref}\t{v.alt}\t") + f.write(f"{v.gene or 'N/A'}\t{v.effect or 'N/A'}\t{v.impact or 'N/A'}\t") + f.write(f"{v.inheritance_pattern or 'N/A'}\t") + f.write(f"{v.clinvar_sig or 'N/A'}\t") + f.write(f"{v.clinvar_disease or 'N/A'}\t") + f.write(f"{v.acmg_class}\t") + f.write(f"{'; '.join(v.acmg_evidence)}\n") + + # Summary statistics + f.write("\n## SUMMARY\n") + f.write(f"Total variants in proband: {len(results)}\n") + f.write(f"Clinically relevant variants: {len(pathogenic_variants)}\n") + + # Count by ACMG class + acmg_counts = defaultdict(int) + for v in pathogenic_variants: + acmg_counts[v.acmg_class] += 1 + + f.write("\nBy ACMG Classification:\n") + for cls in ['Pathogenic', 'Likely Pathogenic', 'VUS', 'Likely Benign', 'Benign']: + if cls in acmg_counts: + f.write(f" {cls}: {acmg_counts[cls]}\n") + + # Count by inheritance + inh_counts = defaultdict(int) + for v in pathogenic_variants: + inh_counts[v.inheritance_pattern or 'unknown'] += 1 + + f.write("\nBy Inheritance Pattern:\n") + for inh, count in sorted(inh_counts.items()): + f.write(f" {inh}: {count}\n") + + # ClinVar matches + clinvar_match = sum(1 for v in pathogenic_variants if v.clinvar_sig) + f.write(f"\nVariants with ClinVar annotation: {clinvar_match}\n") + + print(f"\nAnalysis complete!") + print(f"Clinically relevant variants: {len(pathogenic_variants)}") + print(f"Report saved to: {output_path}") + + # Print top candidates + print("\n=== TOP PATHOGENIC CANDIDATES ===\n") + top_variants = [v for v in pathogenic_variants if v.acmg_class in ['Pathogenic', 'Likely Pathogenic']][:20] + + for v in top_variants: + print(f"{v.chrom}:{v.pos} {v.ref}>{v.alt}") + print(f" Gene: {v.gene} | Effect: {v.effect}") + print(f" Inheritance: {v.inheritance_pattern}") + print(f" ClinVar: {v.clinvar_sig or 'Not found'}") + if v.clinvar_disease: + print(f" Disease: {v.clinvar_disease[:80]}...") + print(f" ACMG: {v.acmg_class}") + print(f" Evidence: {'; '.join(v.acmg_evidence)}") + print() + + +if __name__ == '__main__': + snpeff_vcf = sys.argv[1] if len(sys.argv) > 1 else '/Volumes/NV2/genomics_analysis/vcf/trio_joint.snpeff.vcf' + clinvar_path = sys.argv[2] if len(sys.argv) > 2 else '/Volumes/NV2/genomics_reference/clinvar/clinvar_GRCh37.vcf.gz' + output_path = sys.argv[3] if len(sys.argv) > 3 else '/Volumes/NV2/genomics_analysis/clinvar_acmg_report.txt' + + # VCF sample order: NV0066-08_S33 (idx 0), NV0066-09_S34 (idx 1), NV0066-10_S35 (idx 2) + # Correct mapping: S35 = proband (II-3), S33 = parent, S34 = parent + proband_idx = int(sys.argv[4]) if len(sys.argv) > 4 else 2 # S35 is proband + father_idx = int(sys.argv[5]) if len(sys.argv) > 5 else 0 # S33 + mother_idx = int(sys.argv[6]) if len(sys.argv) > 6 else 1 # S34 + + analyze_trio_with_clinvar(snpeff_vcf, clinvar_path, output_path, proband_idx, father_idx, mother_idx) diff --git a/configs/acmg_config.example.yaml b/configs/acmg_config.example.yaml deleted file mode 100644 index 36bba07..0000000 --- a/configs/acmg_config.example.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# ACMG evidence thresholds (example, adjust per lab policy) -ba1_af: 0.05 # Stand-alone benign if AF >= this -bs1_af: 0.01 # Strong benign if AF >= this (and not meeting BA1) -pm2_af: 0.0005 # Moderate pathogenic if AF <= this -bp7_splice_ai_max: 0.1 # Supporting benign if synonymous and predicted low splice impact - -# Genes considered loss-of-function sensitive for PVS1 auto-tagging -lof_genes: - - BRCA1 - - BRCA2 - - TTN - - CFTR diff --git a/configs/panel.example.json b/configs/panel.example.json deleted file mode 100644 index 7866fda..0000000 --- a/configs/panel.example.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "name": "Hearing loss core", - "version": "0.1", - "source": "curated", - "last_updated": "2024-06-01", - "genes": ["GJB2", "SLC26A4", "MITF", "OTOF"], - "metadata": { - "notes": "Example panel for demo; replace with curated panel and provenance." - } -} diff --git a/configs/phenotype_to_genes.example.json b/configs/phenotype_to_genes.example.json deleted file mode 100644 index 1bbae32..0000000 --- a/configs/phenotype_to_genes.example.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "version": "0.1", - "source": "example-curated", - "phenotype_to_genes": { - "HP:0000365": ["GJB2", "SLC26A4", "OTOF"], - "HP:0000510": ["MITF", "SOX10"] - }, - "metadata": { - "notes": "Placeholder mapping; replace with curated HPO/OMIM/GenCC panels." - } -} diff --git a/configs/phenotype_to_genes.hpo_seed.json b/configs/phenotype_to_genes.hpo_seed.json deleted file mode 100644 index d9013bf..0000000 --- a/configs/phenotype_to_genes.hpo_seed.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "version": "2024-11-01", - "source": "HPO-curated-seed", - "phenotype_to_genes": { - "HP:0000365": ["GJB2", "SLC26A4", "OTOF", "TECTA"], - "HP:0001250": ["SCN1A", "KCNQ2", "STXBP1"], - "HP:0001631": ["MYH7", "TNNT2", "MYBPC3"], - "HP:0001156": ["COL1A1", "COL1A2", "PLOD2"] - }, - "metadata": { - "notes": "Seed mapping using common phenotype IDs; replace with full HPO-derived panels." - } -} diff --git a/docs/phase1_howto.md b/docs/phase1_howto.md deleted file mode 100644 index b27bcc5..0000000 --- a/docs/phase1_howto.md +++ /dev/null @@ -1,101 +0,0 @@ -# Phase 1 How-To: BAM → VCF → Annotate → Panel Report - -本文件說明如何用現有 CLI 從 BAM 執行到報告輸出,選項意義,以及可跳過步驟的用法。假設已安裝 GATK、VEP、bcftools/tabix,並以 `pip install -e .` 安裝本專案。 - -## 流程總覽 -1) Trio variant calling → joint VCF -2) VEP 註解 → annotated VCF + 平坦 TSV -3) Panel/phenotype 查詢 + ACMG 標籤 → Markdown/JSON 報告 -可用 `phase1-run` 一鍵(可跳過 call/annotate),或分步 `run-call` / `run-annotate` / `panel-report`。 - -## 分步執行 - -### 1) Variant calling(GATK) -```bash -genomic-consultant run-call \ - --sample proband:/path/proband.bam \ - --sample father:/path/father.bam \ - --sample mother:/path/mother.bam \ - --reference /refs/GRCh38.fa \ - --workdir /tmp/trio \ - --prefix trio \ - --log /tmp/trio/run_call_log.json \ - --probe-tools -``` -- `--sample`: `sample_id:/path/to.bam`,可重複。 -- `--reference`: 參考序列。 -- `--workdir`: 輸出與中間檔位置。 -- `--prefix`: 輸出檔名前綴。 -- `--log`: run log(JSON)路徑。 -- 輸出:joint VCF (`/tmp/trio/trio.joint.vcf.gz`)、run log(含指令/returncode)。 - -### 2) Annotation(VEP + bcftools) -```bash -genomic-consultant run-annotate \ - --vcf /tmp/trio/trio.joint.vcf.gz \ - --workdir /tmp/trio/annot \ - --prefix trio \ - --reference /refs/GRCh38.fa \ - --plugin 'SpliceAI,snv=/path/spliceai.snv.vcf.gz,indel=/path/spliceai.indel.vcf.gz' \ - --plugin 'CADD,/path/whole_genome_SNVs.tsv.gz,/path/InDels.tsv.gz' \ - --extra-flag "--cache --offline" \ - --log /tmp/trio/annot/run_annot_log.json \ - --probe-tools -``` -- `--plugin`: VEP plugin 規格,可重複(示範 SpliceAI/CADD)。 -- `--extra-flag`: 附加給 VEP 的旗標(如 cache/offline)。 -- 輸出:annotated VCF (`trio.vep.vcf.gz`)、平坦 TSV (`trio.vep.tsv`)、run log。 - -### 3) Panel/Phenotype 報告 -使用 panel 檔: -```bash -genomic-consultant panel-report \ - --tsv /tmp/trio/annot/trio.vep.tsv \ - --panel configs/panel.example.json \ - --acmg-config configs/acmg_config.example.yaml \ - --individual-id proband \ - --max-af 0.05 \ - --format markdown \ - --log /tmp/trio/panel_log.json -``` -使用 phenotype 直譯 panel: -```bash -genomic-consultant panel-report \ - --tsv /tmp/trio/annot/trio.vep.tsv \ - --phenotype-id HP:0000365 \ - --phenotype-mapping configs/phenotype_to_genes.hpo_seed.json \ - --acmg-config configs/acmg_config.example.yaml \ - --individual-id proband \ - --max-af 0.05 \ - --format markdown -``` -- `--max-af`: 過濾等位基因頻率上限。 -- `--format`: `markdown` 或 `json`。 -- 輸出:報告文字 + run log(記錄 panel/ACMG config 的 hash)。 - -## 一鍵模式(可跳過 call/annotate) -已經有 joint VCF/TSV 時,可跳過前兩步: -```bash -genomic-consultant phase1-run \ - --workdir /tmp/trio \ - --prefix trio \ - --tsv /tmp/trio/annot/trio.vep.tsv \ - --skip-call --skip-annotate \ - --panel configs/panel.example.json \ - --acmg-config configs/acmg_config.example.yaml \ - --max-af 0.05 \ - --format markdown \ - --log-dir /tmp/trio/runtime -``` -若要實際跑 VEP,可移除 `--skip-annotate` 並提供 `--plugins/--extra-flag`;若要跑 calling,移除 `--skip-call` 並提供 `--sample`/`--reference`。 - -## 主要輸出 -- joint VCF(呼叫結果) -- annotated VCF + 平坦 TSV(含 gene/consequence/ClinVar/AF/SpliceAI/CADD 等欄位) -- run logs(JSON,含指令、return code、config hash;在 `--log` 或 `--log-dir`) -- Panel 報告(Markdown 或 JSON),附 ACMG 自動標籤,需人工複核 - -## 注意 -- Call/annotate 依賴外部工具與對應資源(參考序列、VEP cache、SpliceAI/CADD 資料)。 -- 若無 BAM/資源,可用 sample TSV:`phase1-run --tsv sample_data/example_annotated.tsv --skip-call --skip-annotate ...` 演示報告。 -- 安全:`.gitignore` 已排除大型基因檔案;建議本地受控環境執行。 diff --git a/docs/phase1_implementation_plan.md b/docs/phase1_implementation_plan.md deleted file mode 100644 index 83a5a66..0000000 --- a/docs/phase1_implementation_plan.md +++ /dev/null @@ -1,80 +0,0 @@ -# Phase 1 Implementation Plan (Genomic Foundation) - -Scope: deliver a working trio-based variant pipeline, annotated genomic store, query APIs, initial ACMG evidence tagging, and reporting/logging scaffolding. This assumes local execution with Python 3.11+. - -## Objectives -- Trio BAM → joint VCF with QC artifacts (`Auto`). -- Annotate variants with population frequency, ClinVar, consequence/prediction (`Auto`). -- Provide queryable interfaces for gene/region lookups with filters (`Auto`). -- Disease/phenotype → gene panel lookup and filtered variant outputs (`Auto+Review`). -- Auto-tag subset of ACMG criteria; human-only final classification (`Auto+Review`). -- Produce machine-readable run logs with versions, configs, and overrides. - -## Work Breakdown -1) **Data & references** - - Reference genome: GRCh38 (primary) with option for GRCh37; pin version hash. - - Resource bundles: known sites for BQSR (if using GATK), gnomAD/ClinVar versions for annotation. - - Test fixtures: small trio BAM/CRAM subset or GIAB trio downsampled for CI-like checks. - -2) **Variant calling pipeline (wrapper, `Auto`)** - - Tooling: GATK HaplotypeCaller → gVCF; GenotypeGVCFs for joint calls. Alt: DeepVariant + joint genotyper (parameterized). - - Steps: - - Validate inputs (file presence, reference match). - - Optional QC: coverage, duplicates, on-target. - - Generate per-sample gVCF; joint genotyping to trio VCF. - - Outputs: joint VCF + index; QC summary JSON/TSV; log with tool versions and params. - -3) **Annotation pipeline (`Auto`)** - - Tooling: Ensembl VEP with plugins for gnomAD, ClinVar, CADD, SpliceAI where available; alt path: ANNOVAR. - - Steps: - - Normalize variants (bcftools norm) if needed. - - Annotate with gene, transcript, protein change; population AF; ClinVar significance; consequence; predictions (SIFT/PolyPhen/CADD); inheritance flags. - - Include SpliceAI/CADD plugins if installed; CLI accepts extra flags/plugins to embed SpliceAI/CADD fields. - - Outputs: annotated VCF; flattened TSV/Parquet for faster querying; manifest of DB versions used. - -4) **Genomic store + query API (`Auto`)** - - Early option: tabix-indexed VCF with Python wrapper. - - Functions (Python module `genomic_store`): - - `get_variants_by_gene(individual_id, genes, filters)` - Filters: AF thresholds, consequence categories, ClinVar significance, inheritance pattern. - - `get_variants_by_region(individual_id, chrom, start, end, filters)` - - `list_genes_with_variants(individual_id, filters)` (optional). - - Filters defined in a `FilterConfig` dataclass; serialize-able for logging. - - Future option: import to SQLite/Postgres via Arrow/Parquet for richer predicates. - -5) **Disease/phenotype → gene panel (`Auto+Review`)** - - Data: HPO/OMIM/GenCC lookup or curated JSON panels. - - Function: `get_gene_panel(disease_or_hpo_id, version=None)` returning gene list + provenance. - - Phenotype resolver: curated JSON mapping (e.g., `phenotype_to_genes.example.json`) as placeholder until upstream data is wired; allow dynamic panel synthesis by phenotype ID in CLI; support merging multiple sources into one mapping. - - Flow: resolve panel → call genomic store → apply simple ranking (AF low, ClinVar pathogenicity high). - - Manual review points: panel curation, rank threshold tuning. - -6) **ACMG evidence tagging (subset, `Auto+Review`)** - - Criteria to auto-evaluate initially: PVS1 (LoF in LoF-sensitive gene list), PM2 (absent/rare in population), BA1/BS1 (common frequency), possible BP7 (synonymous, no splicing impact). - - Config: YAML/JSON with thresholds (AF cutoffs, LoF gene list, transcript precedence). - - Output schema per variant: `{variant_id, evidence: [tag, strength, rationale], suggested_class}` with suggested class computed purely from auto tags; final class left blank for human. - - Logging: capture rule version and reasons for each fired rule. - -7) **Run logging and versioning** - - Every pipeline run emits `run_log.json` containing: - - Inputs (sample IDs, file paths, reference build). - - Tool versions and parameters; DB versions; config hashes (panel/ACMG configs). - - Automation level per step; manual overrides (`who/when/why`). - - Derived artifacts paths and checksums. - - Embed run_log reference in reports. - -8) **Report template (minimal)** - - Input: disease/gene panel name, variant query results, ACMG evidence tags. - - Output: Markdown + JSON summary with sections: context, methods, variants table, limitations. - - Mark human-only decisions clearly. - -## Milestones -- **M1**: Repo scaffolding + configs; tiny trio test data wired; `make pipeline` runs through variant calling + annotation on fixture. -- **M2**: Genomic store wrapper with gene/region queries; filter config; basic CLI/Notebook demo. -- **M3**: Panel lookup + ranked variant listing; ACMG auto tags on outputs; run_log generation. -- **M4**: Minimal report generator + acceptance of human-reviewed classifications. - -## Validation Strategy -- Unit tests for filter logic, panel resolution, ACMG tagger decisions (synthetic variants). -- Integration test on small fixture trio to ensure call → annotate → query path works. -- Determinism checks: hash configs and verify outputs stable across runs given same inputs. diff --git a/docs/system_architecture.md b/docs/system_architecture.md deleted file mode 100644 index 6914498..0000000 --- a/docs/system_architecture.md +++ /dev/null @@ -1,69 +0,0 @@ -# System Architecture Blueprint (v0.1) - -This document turns `genomic_decision_support_system_spec_v0.1.md` into a buildable architecture and phased roadmap. Automation levels follow `Auto / Auto+Review / Human-only`. - -## High-Level Views -- **Core layers**: (1) sequencing ingest → variant calling/annotation (`Auto`), (2) genomic query layer (`Auto`), (3) rule engines (ACMG, PGx, DDI, supplements; mixed automation), (4) orchestration/LLM and report generation (`Auto` tool calls, `Auto+Review` outputs). -- **Data custody**: all PHI/genomic artifacts remain local; external calls require de-identification or private models. -- **Traceability**: every run records tool versions, database snapshots, configs, and manual overrides in machine-readable logs. - -### End-to-end flow -``` -BAM (proband + parents) - ↓ Variant Calling (gVCF) [Auto] -Joint Genotyper → joint VCF - ↓ Annotation (VEP/ANNOVAR + ClinVar/gnomAD etc.) [Auto] - ↓ Genomic Store (VCF+tabix or SQL) + Query API [Auto] - ↓ - ├─ Disease/Phenotype → Gene Panel lookup [Auto+Review] - │ └─ Panel variants with basic ranking (freq, ClinVar) [Auto+Review] - ├─ ACMG evidence tagging subset (PVS1, PM2, BA1, BS1…) [Auto+Review] - ├─ PGx genotype→phenotype and recommendation rules [Auto → Auto+Review] - ├─ DDI rule evaluation [Auto] - └─ Supplement/Herb normalization + interaction rules [Auto+Review → Human-only] - ↓ -LLM/Orchestrator routes user questions to tools, produces JSON + Markdown drafts [Auto tools, Auto+Review narratives] -``` - -## Phase Roadmap (build-first view) -- **Phase 1 – Genomic foundation** - - Deliverables: trio joint VCF + annotation; query functions (`get_variants_by_gene/region`); disease→gene panel lookup; partial ACMG evidence tagging. - - Data stores: tabix-backed VCF wrapper initially; optional SQLite/Postgres import later. - - Interfaces: Python CLI/SDK first; machine-readable run logs with versions and automation levels. -- **Phase 2 – PGx & DDI** - - Drug vocabulary normalization (ATC/RxNorm). - - PGx engine: star-allele calling or rule-based genotype→phenotype; guideline-mapped advice with review gates. - - DDI engine: rule base with severity tiers; combine with PGx outputs. -- **Phase 3 – Supplements & Herbs** - - Name/ingredient normalization; herb formula expansion. - - Rule tables for CYP/transporters, coagulation, CNS effects. - - Evidence grading and conservative messaging; human-only final clinical language. -- **Phase 4 – LLM Interface & Reports** - - Tool-calling schema for queries listed above. - - JSON + Markdown report templates with traceability to rules, data versions, and overrides. - -## Module Boundaries -- **Variant Calling Pipeline** (`Auto`): wrapper around GATK or DeepVariant + joint genotyper; pluggable reference genome; QC summaries. -- **Annotation Pipeline** (`Auto`): VEP/ANNOVAR with pinned database versions (gnomAD, ClinVar, transcript set); emits annotated VCF + flat table. -- **Genomic Query Layer** (`Auto`): abstraction over tabix or SQL; minimal APIs: `get_variants_by_gene`, `get_variants_by_region`, filters (freq, consequence, clinvar). -- **Disease/Phenotype to Panel** (`Auto+Review`): HPO/OMIM lookups or curated panels; panel versioned; feeds queries. -- **Phenotype Resolver** (`Auto+Review`): JSON/DB mapping of phenotype/HPO IDs to gene lists as a placeholder until upstream sources are integrated; can synthesize panels dynamically and merge multiple sources. -- **ACMG Evidence Tagger** (`Auto+Review`): auto-evaluable criteria only; config-driven thresholds; human-only final classification. -- **PGx Engine** (`Auto → Auto+Review`): star-allele calling where possible; guideline rules (CPIC/DPWG) with conservative defaults; flag items needing review. -- **DDI Engine** (`Auto`): rule tables keyed by normalized drug IDs; outputs severity and rationale. -- **Supplements/Herbs** (`Auto+Review → Human-only`): ingredient extraction + mapping; interaction rules; human sign-off for clinical language. -- **Orchestrator/LLM** (`Auto tools, Auto+Review outputs`): intent parsing, tool sequencing, safety guardrails, report drafting. - -## Observability and Versioning -- Every pipeline run writes a JSON log: tool versions, reference genome, DB versions, config hashes, automation level per step, manual overrides (who/when/why). -- Reports embed references to those logs so outputs remain reproducible. -- Configs (ACMG thresholds, gene panels, PGx rules) are versioned artifacts stored alongside code. - -## Security/Privacy Notes -- Default to local processing; if external LLMs are used, strip identifiers and avoid full VCF uploads. -- Secrets kept out of repo; rely on environment variables or local config files (excluded by `.gitignore`). - -## Initial Tech Bets (to be validated) -- Language/runtime: Python 3.11+ for pipelines, rules, and orchestration stubs. -- Bio stack candidates: GATK or DeepVariant; VEP; tabix for early querying; SQLAlchemy + SQLite/Postgres when scaling. -- Infra: containerized runners for pipelines; makefiles or workflow engine (Nextflow/Snakemake) later if needed. diff --git a/genomic_decision_support_system_spec_v0.1.md b/genomic_decision_support_system_spec_v0.1.md deleted file mode 100644 index 59c7b27..0000000 --- a/genomic_decision_support_system_spec_v0.1.md +++ /dev/null @@ -1,356 +0,0 @@ - -# 個人基因風險與用藥交互作用決策支援系統 – 系統規格書 - -- 版本:v0.1-draft -- 作者:Gbanyan + 助理規劃 -- 狀態:草案,預期隨實作迭代 -- 目的:提供給 LLM(如 Claude、Codex 等)與開發者閱讀,作為系統設計與實作的基礎規格。 - ---- - -## 0. 系統目標與範圍 - -### 0.1 目標 - -建立一套「個人化、基因驅動」的決策支援系統,核心功能: - -1. **從個人與父母的外顯子定序資料(BAM)出發**,產生可查詢的變異資料庫。 -2. **針對特定疾病、症狀或表型**,自動查詢相關基因與已知致病變異,並在個人資料中搜尋對應變異。 -3. 依據 **ACMG/AMP 等準則** 與公開資料庫,給出 **機器輔助、具人工介入點的變異詮釋**。 -4. 進一步整合: - - 基因藥物學(Pharmacogenomics, PGx) - - 藥–藥交互作用(DDI) - - 保健食品、中藥等成分的潛在交互作用與風險 -5. 提供一個自然語言問答介面,使用者可直接問: - - 「我有沒有 XXX 的遺傳風險?」 - - 「以我現在吃的藥和保健食品,有沒有需要注意的交互作用?」 - -> 系統定位為:**個人決策支援工具 / 研究工具**,而非正式醫療診斷系統。 - -### 0.2 核心設計哲學 - -- **分階段發展**:先穩固「基因本體」與變異詮釋,再往 PGx 與交互作用擴充。 -- **明確的人機分工**:對每個模組標記 `Auto / Auto+Review / Human-only`。 -- **可追蹤、可回溯**:每一個結論都可追蹤到所用規則、資料庫版本、人工 override。 - ---- - -## 1. 發展階段與整體架構 - -### 1.1 發展階段總覽 - -| 階段 | 名稱 | 核心產出 | 主要對象 | -|------|------|----------|----------| -| Phase 1 | 基因本體與變異詮釋基礎 | 個人+trio VCF、註解與疾病/基因查詢 | 單基因疾病風險 | -| Phase 2 | 基因藥物學與 DDI | 基因–藥物對應、藥–藥交互作用分析 | 精準用藥建議 | -| Phase 3 | 保健食品與中藥交互作用 | 成分標準化與交互作用風險層級 | 整體用藥+補充品安全網 | -| Phase 4 | NLP/LLM 問答介面 | 自然語言問答、報告生成 | 一般使用者 / 臨床對話 | - -### 1.2 高階架構圖(Mermaid) - -```mermaid -flowchart TD - -subgraph P1[Phase 1: 基因本體] - A1[BAM (本人+父母)] --> A2[Variant Calling Pipeline] - A2 --> A3[Joint VCF (Trio)] - A3 --> A4[Variant Annotation (ClinVar, gnomAD, VEP...)] - A4 --> A5[Genomic DB / Query API] -end - -subgraph P2[Phase 2: PGx & DDI] - B1[藥物清單] --> B2[PGx Engine] - A5 --> B2 - B1 --> B3[DDI Engine] -end - -subgraph P3[Phase 3: 補充品 & 中藥] - C1[保健食品/中藥清單] --> C2[成分標準化] - C2 --> C3[成分交互作用引擎] - B1 --> C3 - A5 --> C3 -end - -subgraph P4[Phase 4: 問答與報告] - D1[前端 UI / CLI / API Client] --> D2[LLM Orchestrator] - D2 -->|疾病/症狀詢問| A5 - D2 -->|用藥/成分詢問| B2 - D2 --> B3 - D2 --> C3 - A5 --> D3[報告產生器] - B2 --> D3 - B3 --> D3 - C3 --> D3 -end -``` - ---- - -## 2. 通用設計:人機分工標記 - -所有模組需標記自動化等級: - -- `Auto`:可完全自動執行的步驟(例:variant calling、基本註解)。 -- `Auto+Review`:系統先產生建議,需人工複核或有條件接受(例:ACMG 部分 evidence scoring)。 -- `Human-only`:最終醫療判斷/用語/管理建議,必須由人決策(例:最終 Pathogenic 分類、臨床處置建議)。 - -每次分析需生成一份 **machine-readable log**,紀錄: - -- 使用的模組與版本 -- 每一步的自動化等級 -- 哪些地方有人工 override(人員、時間、理由) - ---- - -## 3. Phase 1:基因本體與變異詮釋基礎 - -### 3.1 功能需求 - -1. **輸入** - - 本人與雙親外顯子定序 BAM 檔。 -2. **輸出** - - 高品質 joint VCF(含 trio) - - 每個變異的註解資訊: - - 基因、轉錄本、蛋白改變 - - 族群頻率(gnomAD 等) - - ClinVar 註解 - - 功能預測(SIFT/PolyPhen/CADD 等) - - 對特定疾病/基因清單的變異過濾結果。 -3. **對外服務** - - 以 API / 函式介面提供: - - 給定基因列表 → 回傳該個體在這些基因中的變異列表 - - 支援疾病名稱/HPO → 基因 → 變異的查詢流程(初期可分步呼叫) - -### 3.2 模組設計 - -#### 3.2.1 Variant Calling Pipeline - -- **輸入**:BAM(本人 + 父母) -- **輸出**:個別 gVCF → joint VCF -- **工具候選**: - - GATK(HaplotypeCaller + GenotypeGVCFs) - - 或 DeepVariant + joint genotyper -- **自動化等級**:`Auto` -- **需求**: - - 基本 QC(coverage、duplicate rate、on-target rate) - - 支援版本標記(如 reference genome 版本) - -#### 3.2.2 Annotation Pipeline - -- **輸入**:joint VCF -- **輸出**:annotated VCF / 變異表 -- **工具候選**: - - VEP、ANNOVAR 或類似工具 -- **資料庫**: - - ClinVar - - gnomAD - - 基因功能與轉錄本資料庫 -- **自動化等級**:`Auto` - -#### 3.2.3 Genomic DB / Query API - -- **目的**:提供高效查詢,作為後續模組(疾病風險、PGx 等)的基底。 -- **形式**: - - 選項 A:基於 VCF + tabix,以封裝函式操作 - - 選項 B:匯入 SQLite / PostgreSQL / 專用 genomic DB -- **關鍵查詢**: - - `get_variants_by_gene(individual_id, gene_list, filters)` - - `get_variants_by_region(individual_id, chr, start, end, filters)` -- **自動化等級**:`Auto` - -#### 3.2.4 疾病/表型 → 基因 → 變異流程 - -- 初期可拆成三步: - 1. 使用外部知識庫或手動 panel:疾病/表型 → 基因清單 - 2. 透過 Genomic DB 查詢個人變異 - 3. 以簡單規則(頻率、ClinVar 標註)做初步排序 -- **自動化等級**:`Auto+Review` - -### 3.3 ACMG 規則實作(初版) - -- **範圍**:僅實作部分機器可自動判定之 evidence(如 PVS1、PM2、BA1、BS1 等)。 -- **輸出**: - - 每個變異的 evidence tag 列表與建議分級(例如:`suggested_class = "VUS"`) -- **人工介入點**: - - 變異最終分類(Pathogenic / Likely pathogenic / VUS / Likely benign / Benign) → `Human-only` - - 規則閾值(如頻率 cutoff)以 config 檔管理 → `Auto+Review` - ---- - -## 4. Phase 2:基因藥物學(PGx)與藥–藥交互作用(DDI) - -### 4.1 功能需求 - -1. 接收使用者目前用藥清單(處方藥、成藥)。 -2. 透過基因資料,判定與 PGx 相關的 genotype(例如 CYP2D6, CYP2C9, HLA 等)。 -3. 根據 CPIC / DPWG 等指南,給出: - - 適應症相關風險(如 HLA-B*58:01 與 allopurinol) - - 劑量調整建議 / 藥物替代建議(僅 decision-support 層級) -4. 計算基礎藥–藥交互作用(DDI),例如: - - CYP 抑制 / 誘導疊加 - - QT prolongation 疊加 - - 出血風險疊加 - -### 4.2 模組設計 - -#### 4.2.1 用藥資料標準化 - -- 使用 ATC / RxNorm / 自訂 ID。 -- **自動化等級**:`Auto` - -#### 4.2.2 PGx Engine - -- **輸入**:個人變異(Phase 1 DB)、藥物清單 -- **輸出**:每個藥物的 PGx 評估(genotype → phenotype → 建議) -- **資料庫**: - - CPIC guidelines - - PharmGKB 關聯資料 -- **自動化等級**: - - genotype → phenotype:`Auto` - - phenotype → 臨床建議:`Auto+Review` - -#### 4.2.3 DDI Engine - -- **輸入**:藥物清單 -- **輸出**:已知 DDI 清單與嚴重程度分級 -- **資料來源**:公開或商用 DDI 資料庫(視可用性) -- **自動化等級**:`Auto` - ---- - -## 5. Phase 3:保健食品與中藥交互作用模組 - -### 5.1 功能需求 - -1. 接收使用者的保健食品與中藥使用資料。 -2. 將名稱解析為: - - 標準化有效成分(如 EPA/DHA mg、Vit D IU、銀杏葉萃取物 mg 等) - - 中藥材名稱(如 黃耆、當歸、川芎…) -3. 評估: - - 成分與藥物、基因的交互作用風險 - - 成分間的加乘作用(如抗凝、CNS 抑制等) -4. 按證據等級給出: - - 高優先級警示(有較強臨床證據) - - 一般提醒(動物實驗 / case report 等) - - 資料不足,僅能提醒不確定性 - -### 5.2 模組設計 - -#### 5.2.1 成分標準化引擎 - -- **輸入**:使用者輸入的品名 / 處方 -- **輸出**: - - 標準化成分列表 - - 估計劑量範圍(若無精確資料) -- **資料**: - - 保健食品常用成分資料表 - - 中藥方與藥材對應表 -- **自動化等級**:`Auto+Review` - -#### 5.2.2 成分交互作用引擎 - -- **輸入**:成分列表、藥物清單、基因資料 -- **輸出**:交互作用列表與風險層級 -- **邏輯**: - - 成分對 CYP / P-gp / OATP 等的影響 - - 成分對凝血、血壓、中樞神經等系統的影響 -- **自動化等級**: - - 規則推論:`Auto` - - 最終臨床建議表述:`Human-only` - ---- - -## 6. Phase 4:NLP/LLM 問答介面與報告生成 - -### 6.1 功能需求 - -1. 支援使用者以自然語言提問: - - 疾病/症狀相關風險 - - 用藥安全性 - - 保健食品、中藥併用風險 -2. LLM 負責: - - 問題解析 → 結構化查詢(疾病、HPO、藥物、成分等) - - 協調呼叫底層 API(Phase 1–3) - - 整合結果並生成報告草稿 -3. 報告形式: - - 機器可讀 JSON(便於後處理) - - 人類可讀 Markdown / PDF 報告 - -### 6.2 Orchestration 設計 - -- 可採用「LLM + Tool/Function Calling」模式: - - 工具包括: - - `query_variants_by_gene` - - `query_disease_gene_panel` - - `run_pgx_analysis` - - `run_ddi_analysis` - - `run_supplement_herb_interaction` -- LLM 主要負責: - - 意圖辨識與拆解 - - 工具呼叫順序規劃 - - 結果解釋與用語調整(需符合安全與保守原則) -- **自動化等級**: - - 工具呼叫:`Auto` - - 臨床敏感結論:`Auto+Review` / `Human-only`(視場景而定) - ---- - -## 7. 安全性、隱私與版本管理 - -### 7.1 資料安全與隱私 - -- 所有基因資料、用藥清單、報告: - - 儲存於本地或受控環境 - - 若需與外部服務(如雲端 LLM)互動,需: - - 做脫敏處理(移除個資) - - 或改用 local/私有 LLM - -### 7.2 版本管理 - -- 對以下物件進行版本控制: - - 參考基因組版本 - - variant calling pipeline 版本 - - 資料庫版本(ClinVar、gnomAD 等) - - ACMG 規則 config 版本 - - gene panel / PGx 規則版本 -- 每份分析報告需記錄所用版本,以利追蹤與重跑。 - -### 7.3 人工介入紀錄 - -- 每次人工 override 或審核需紀錄: - - 變異 ID / 分析項目 - - 原自動建議 - - 人工調整結果 - - 理由與參考文獻(如有) - - 審核者與時間 - ---- - -## 8. 未來擴充方向(Optional) - -- 整合 polygenic risk score(PRS)模組 -- 整合 longitudinal data(實驗室數據、症狀日誌)做風險動態追蹤 -- 為特定疾病領域建立更深的 expert-curated knowledge base -- 與可穿戴裝置/其他健康資料源整合 - ---- - -## 9. 第一階段實作建議路線(Actionable TODO) - -1. **規劃 Phase 1 的技術選型** - - 選擇 variant caller(如 GATK)與 reference genome 版本 - - 選擇 annotation 工具(如 VEP 或 ANNOVAR) -2. **建立基本 pipeline** - - BAM → gVCF → joint VCF(trio) - - 加上基本 QC 報表 -3. **建置簡單的 Genomic Query 介面** - - 先以 CLI/Notebook 函式為主(例如 Python 函式庫) -4. **選一個你最關心的疾病領域** - - 建立第一個 gene panel(例如視覺/聽力相關) - - 實作 panel-based 查詢與變異列表輸出 -5. **撰寫第一版報告模板** - - 輸入:疾病名稱 + gene panel + 查詢結果 - - 輸出:簡易 Markdown 報告(含變異表 + 限制說明) -6. **逐步加入 ACMG 自動 evidence 標記與人工 review 流程** - -這個規格書預期會在實作過程中持續更新,可視此為 v0.1 的起點版本。 diff --git a/gwas_comprehensive.py b/gwas_comprehensive.py new file mode 100644 index 0000000..44cbd65 --- /dev/null +++ b/gwas_comprehensive.py @@ -0,0 +1,590 @@ +#!/usr/bin/env python3 +""" +Comprehensive GWAS Trait Analysis Script +Expanded version with 200+ clinically relevant trait-associated SNPs +""" + +import gzip +import sys +import re +from collections import defaultdict +from typing import Dict, List, Tuple + +# ============================================================================ +# COMPREHENSIVE TRAIT-ASSOCIATED SNPs DATABASE +# Format: rsid -> (chrom, pos, risk_allele, trait, effect, category) +# ============================================================================ + +TRAIT_SNPS = { + # ======================================================================== + # GOUT / URIC ACID METABOLISM (新增) + # ======================================================================== + "rs2231142": ("4", 89052323, "T", "Gout / Hyperuricemia", "risk", "Gout"), + "rs16890979": ("4", 9922166, "T", "Serum uric acid levels", "higher", "Gout"), + "rs734553": ("4", 9920485, "G", "Gout", "risk", "Gout"), + "rs1014290": ("4", 10001861, "A", "Serum uric acid levels", "higher", "Gout"), + "rs505802": ("11", 64357072, "C", "Serum uric acid levels", "higher", "Gout"), + "rs3775948": ("4", 9999007, "G", "Gout", "risk", "Gout"), + "rs12498742": ("4", 9993806, "A", "Serum uric acid levels", "higher", "Gout"), + "rs675209": ("4", 89011046, "T", "Gout", "risk", "Gout"), + "rs1165151": ("11", 64352047, "T", "Serum uric acid levels", "higher", "Gout"), + "rs478607": ("17", 19459563, "A", "Serum uric acid levels", "higher", "Gout"), + + # ======================================================================== + # KIDNEY DISEASE (新增) + # ======================================================================== + "rs4293393": ("16", 20364808, "T", "Chronic kidney disease", "risk", "Kidney"), + "rs12917707": ("16", 20369861, "G", "Chronic kidney disease", "protective", "Kidney"), + "rs11959928": ("5", 39394747, "A", "eGFR decline", "risk", "Kidney"), + "rs1260326": ("2", 27730940, "T", "Chronic kidney disease", "risk", "Kidney"), + "rs13329952": ("16", 20393103, "C", "Chronic kidney disease", "risk", "Kidney"), + "rs267734": ("1", 150950830, "C", "Chronic kidney disease", "risk", "Kidney"), + + # ======================================================================== + # HEARING LOSS (與 Usher syndrome 家庭相關) + # ======================================================================== + "rs7598759": ("2", 70439175, "A", "Age-related hearing loss", "risk", "Hearing"), + "rs161927": ("5", 88228027, "G", "Hearing impairment", "risk", "Hearing"), + "rs10497394": ("2", 70477374, "T", "Hearing loss", "risk", "Hearing"), + "rs3752752": ("7", 129608155, "C", "Noise-induced hearing loss", "risk", "Hearing"), + "rs7294": ("4", 6303557, "G", "Hearing loss", "risk", "Hearing"), + + # ======================================================================== + # AUTOIMMUNE DISEASES (新增) + # ======================================================================== + # Rheumatoid Arthritis + "rs6679677": ("1", 114179091, "A", "Rheumatoid arthritis", "risk", "Autoimmune"), + "rs2476601": ("1", 114377568, "A", "Rheumatoid arthritis / Autoimmune", "risk", "Autoimmune"), + "rs3087243": ("2", 204447164, "G", "Rheumatoid arthritis", "protective", "Autoimmune"), + "rs4810485": ("20", 44747947, "T", "Rheumatoid arthritis", "risk", "Autoimmune"), + + # Systemic Lupus Erythematosus (SLE) + "rs1143679": ("16", 31276811, "A", "Systemic lupus erythematosus", "risk", "Autoimmune"), + "rs7574865": ("2", 191099907, "T", "Systemic lupus erythematosus", "risk", "Autoimmune"), + "rs2187668": ("6", 32605884, "T", "Systemic lupus erythematosus", "risk", "Autoimmune"), + + # Multiple Sclerosis + "rs3135388": ("6", 32439887, "A", "Multiple sclerosis", "risk", "Autoimmune"), + "rs6897932": ("5", 35910332, "C", "Multiple sclerosis", "risk", "Autoimmune"), + "rs4648356": ("1", 101256530, "C", "Multiple sclerosis", "risk", "Autoimmune"), + + # Inflammatory Bowel Disease + "rs2241880": ("16", 50756540, "G", "Crohn's disease / IBD", "risk", "Autoimmune"), + "rs11209026": ("1", 67705958, "A", "Crohn's disease / IBD", "protective", "Autoimmune"), + "rs10883365": ("10", 64426914, "G", "Ulcerative colitis", "risk", "Autoimmune"), + "rs2066847": ("16", 50745926, "C", "Crohn's disease", "risk", "Autoimmune"), + + # Type 1 Diabetes + "rs2292239": ("12", 56482804, "T", "Type 1 diabetes", "risk", "Autoimmune"), + "rs3129889": ("6", 32609440, "G", "Type 1 diabetes", "risk", "Autoimmune"), + "rs689": ("11", 2182224, "T", "Type 1 diabetes", "risk", "Autoimmune"), + + # Celiac Disease + "rs2395182": ("6", 32713854, "T", "Celiac disease", "risk", "Autoimmune"), + "rs7775228": ("6", 32665438, "C", "Celiac disease", "risk", "Autoimmune"), + + # Hashimoto's Thyroiditis / Graves' Disease + "rs179247": ("2", 204733986, "A", "Autoimmune thyroid disease", "risk", "Autoimmune"), + "rs1980422": ("6", 90957406, "C", "Autoimmune thyroid disease", "risk", "Autoimmune"), + + # ======================================================================== + # CANCER RISK (新增) + # ======================================================================== + # Breast Cancer + "rs2981582": ("10", 123337335, "A", "Breast cancer (FGFR2)", "risk", "Cancer"), + "rs13281615": ("8", 128355618, "G", "Breast cancer", "risk", "Cancer"), + "rs889312": ("5", 56067641, "C", "Breast cancer (MAP3K1)", "risk", "Cancer"), + "rs3817198": ("11", 1909006, "C", "Breast cancer (LSP1)", "risk", "Cancer"), + "rs13387042": ("2", 217905832, "A", "Breast cancer", "risk", "Cancer"), + + # Prostate Cancer + "rs1447295": ("8", 128554220, "A", "Prostate cancer", "risk", "Cancer"), + "rs16901979": ("8", 128320346, "A", "Prostate cancer", "risk", "Cancer"), + "rs6983267": ("8", 128413305, "G", "Prostate cancer / Colorectal cancer", "risk", "Cancer"), + "rs10993994": ("10", 51549496, "T", "Prostate cancer (MSMB)", "risk", "Cancer"), + "rs7679673": ("4", 106061534, "C", "Prostate cancer", "risk", "Cancer"), + + # Colorectal Cancer + "rs4939827": ("18", 46453463, "T", "Colorectal cancer (SMAD7)", "risk", "Cancer"), + "rs6983267_crc": ("8", 128413305, "G", "Colorectal cancer", "risk", "Cancer"), + "rs4779584": ("15", 32994756, "T", "Colorectal cancer", "risk", "Cancer"), + "rs10795668": ("10", 8701219, "G", "Colorectal cancer", "protective", "Cancer"), + + # Lung Cancer + "rs8034191": ("15", 78894339, "C", "Lung cancer", "risk", "Cancer"), + "rs1051730": ("15", 78882925, "A", "Lung cancer / Nicotine dependence", "risk", "Cancer"), + "rs2736100": ("5", 1286516, "C", "Lung cancer (TERT)", "risk", "Cancer"), + + # Melanoma + "rs910873": ("20", 32665748, "C", "Melanoma", "risk", "Cancer"), + "rs1801516": ("11", 108175462, "A", "Melanoma (ATM)", "risk", "Cancer"), + "rs16953002": ("12", 89328335, "A", "Melanoma", "risk", "Cancer"), + + # Thyroid Cancer + "rs965513": ("9", 100556109, "A", "Thyroid cancer", "risk", "Cancer"), + "rs944289": ("14", 36649246, "T", "Thyroid cancer", "risk", "Cancer"), + + # Bladder Cancer + "rs710521": ("3", 189643526, "A", "Bladder cancer", "risk", "Cancer"), + "rs9642880": ("8", 128787253, "T", "Bladder cancer", "risk", "Cancer"), + + # ======================================================================== + # BLOOD CLOTTING / THROMBOSIS (新增) + # ======================================================================== + "rs6025": ("1", 169519049, "T", "Factor V Leiden / DVT risk", "risk", "Thrombosis"), + "rs1799963": ("11", 46761055, "A", "Prothrombin G20210A / DVT risk", "risk", "Thrombosis"), + "rs8176719": ("9", 136131322, "C", "Blood type O (protective for VTE)", "protective", "Thrombosis"), + "rs505922": ("9", 136149229, "C", "Venous thromboembolism", "risk", "Thrombosis"), + "rs2066865": ("4", 155525276, "G", "Fibrinogen levels / DVT", "risk", "Thrombosis"), + + # ======================================================================== + # THYROID DISORDERS (新增) + # ======================================================================== + "rs1991517": ("8", 133020441, "C", "Hypothyroidism", "risk", "Thyroid"), + "rs925489": ("2", 218283107, "T", "TSH levels", "higher", "Thyroid"), + "rs10499559": ("6", 166474536, "T", "Hypothyroidism", "risk", "Thyroid"), + "rs7850258": ("9", 4126287, "G", "Thyroid function", "altered", "Thyroid"), + + # ======================================================================== + # OSTEOPOROSIS / BONE HEALTH (新增) + # ======================================================================== + "rs3736228": ("11", 68179081, "T", "Osteoporosis / Low BMD", "risk", "Bone"), + "rs4988235": ("2", 136608646, "G", "Lactose intolerance (affects Ca)", "risk", "Bone"), + "rs2282679": ("4", 72608383, "C", "Vitamin D deficiency", "risk", "Bone"), + "rs1800012": ("17", 48275363, "T", "Osteoporosis (COL1A1)", "risk", "Bone"), + "rs2062377": ("8", 119964052, "A", "Bone mineral density", "lower", "Bone"), + "rs4355801": ("8", 119963145, "G", "Bone mineral density", "higher", "Bone"), + + # ======================================================================== + # LIVER DISEASE (新增) + # ======================================================================== + "rs738409": ("22", 44324727, "G", "NAFLD / Fatty liver (PNPLA3)", "risk", "Liver"), + "rs58542926": ("19", 19379549, "T", "NAFLD / Liver fibrosis (TM6SF2)", "risk", "Liver"), + "rs2228603": ("19", 11350488, "T", "NAFLD", "risk", "Liver"), + "rs12979860": ("19", 39248147, "C", "Hepatitis C clearance", "favorable", "Liver"), + + # ======================================================================== + # MIGRAINE / HEADACHE (新增) + # ======================================================================== + "rs2651899": ("1", 10796866, "C", "Migraine", "risk", "Migraine"), + "rs10166942": ("2", 234824778, "T", "Migraine", "risk", "Migraine"), + "rs11172113": ("12", 57527283, "C", "Migraine (LRP1)", "risk", "Migraine"), + "rs1835740": ("8", 87521374, "A", "Migraine", "risk", "Migraine"), + + # ======================================================================== + # LONGEVITY / AGING (新增) + # ======================================================================== + "rs2802292": ("6", 157192662, "G", "Longevity (FOXO3)", "protective", "Longevity"), + "rs1042522": ("17", 7579472, "C", "Longevity (TP53)", "altered", "Longevity"), + "rs4420638": ("19", 45422946, "A", "Longevity / Cardiovascular", "risk", "Longevity"), + + # ======================================================================== + # SLEEP / CIRCADIAN (原有 + 擴展) + # ======================================================================== + "rs113851554": ("2", 66799986, "T", "Insomnia", "risk", "Sleep"), + "rs12927162": ("16", 68856985, "A", "Sleep duration", "shorter", "Sleep"), + "rs1823125": ("1", 205713532, "G", "Chronotype (morning person)", "morning", "Sleep"), + "rs10493596": ("1", 215803417, "T", "Insomnia", "risk", "Sleep"), + "rs3104997": ("6", 27424938, "C", "Sleep duration", "shorter", "Sleep"), + "rs73598374": ("4", 94847526, "A", "Insomnia", "risk", "Sleep"), + "rs2302729": ("5", 35857091, "G", "Insomnia", "risk", "Sleep"), + "rs12936231": ("17", 44282378, "C", "Restless legs syndrome", "risk", "Sleep"), + "rs3923809": ("6", 38642286, "A", "Restless legs syndrome (BTBD9)", "risk", "Sleep"), + + # ======================================================================== + # SKIN CONDITIONS (原有 + 擴展) + # ======================================================================== + "rs1800629": ("6", 31543031, "A", "Psoriasis", "risk", "Skin"), + "rs20541": ("5", 131995964, "A", "Atopic dermatitis", "risk", "Skin"), + "rs2066808": ("6", 31540784, "A", "Psoriasis", "risk", "Skin"), + "rs3093662": ("6", 31574339, "G", "Psoriasis", "risk", "Skin"), + "rs10484554": ("6", 31271836, "A", "Psoriasis", "risk", "Skin"), + "rs1295686": ("5", 131996447, "A", "Atopic dermatitis", "risk", "Skin"), + "rs2227956": ("6", 31783279, "T", "Psoriasis", "risk", "Skin"), + "rs6906021": ("6", 32051991, "C", "Atopic dermatitis", "risk", "Skin"), + "rs12203592": ("6", 396321, "T", "Skin pigmentation / Freckling", "risk", "Skin"), + "rs1805007": ("16", 89986117, "T", "Red hair / Fair skin (MC1R)", "risk", "Skin"), + "rs1805008": ("16", 89986144, "T", "Red hair / Fair skin (MC1R)", "risk", "Skin"), + + # ======================================================================== + # CARDIOVASCULAR (原有 + 大幅擴展) + # ======================================================================== + "rs10757274": ("9", 22096055, "G", "Coronary artery disease", "risk", "Cardiovascular"), + "rs1333049": ("9", 22125503, "C", "Coronary artery disease", "risk", "Cardiovascular"), + "rs4665058": ("2", 43845437, "C", "Coronary artery disease", "risk", "Cardiovascular"), + "rs17465637": ("1", 222823529, "A", "Coronary artery disease", "risk", "Cardiovascular"), + "rs6725887": ("2", 203828796, "C", "Coronary artery disease", "risk", "Cardiovascular"), + # Hypertension + "rs699": ("1", 230845794, "G", "Hypertension (AGT)", "risk", "Cardiovascular"), + "rs5186": ("3", 148459988, "C", "Hypertension (AGTR1)", "risk", "Cardiovascular"), + "rs4961": ("4", 2906707, "T", "Hypertension / Salt sensitivity", "risk", "Cardiovascular"), + "rs1799998": ("8", 142876043, "T", "Hypertension (CYP11B2)", "risk", "Cardiovascular"), + # Atrial Fibrillation + "rs2200733": ("4", 111718106, "T", "Atrial fibrillation", "risk", "Cardiovascular"), + "rs10033464": ("4", 111714418, "T", "Atrial fibrillation", "risk", "Cardiovascular"), + "rs6843082": ("4", 111712344, "G", "Atrial fibrillation (PITX2)", "risk", "Cardiovascular"), + # Heart Failure + "rs1739843": ("15", 75086042, "T", "Heart failure", "risk", "Cardiovascular"), + # Stroke + "rs11833579": ("12", 115553310, "A", "Ischemic stroke", "risk", "Cardiovascular"), + "rs12425791": ("12", 115557677, "A", "Stroke (NINJ2)", "risk", "Cardiovascular"), + # Lipids + "rs1801177": ("8", 19813529, "A", "LDL cholesterol (LPL)", "higher", "Cardiovascular"), + "rs12740374": ("1", 109822166, "G", "LDL cholesterol (CELSR2)", "lower", "Cardiovascular"), + "rs3764261": ("16", 56993324, "A", "HDL cholesterol (CETP)", "higher", "Cardiovascular"), + "rs1800588": ("15", 58723675, "T", "HDL cholesterol (LIPC)", "higher", "Cardiovascular"), + "rs328": ("8", 19819724, "G", "Triglycerides (LPL)", "lower", "Cardiovascular"), + "rs662799": ("11", 116663707, "G", "Triglycerides (APOA5)", "higher", "Cardiovascular"), + + # ======================================================================== + # TYPE 2 DIABETES / METABOLIC (原有 + 擴展) + # ======================================================================== + "rs7903146": ("10", 114758349, "T", "Type 2 diabetes (TCF7L2)", "risk", "Metabolic"), + "rs12255372": ("10", 114808902, "T", "Type 2 diabetes (TCF7L2)", "risk", "Metabolic"), + "rs1801282": ("3", 12393125, "C", "Type 2 diabetes (PPARG)", "risk", "Metabolic"), + "rs5219": ("11", 17409572, "T", "Type 2 diabetes (KCNJ11)", "risk", "Metabolic"), + "rs13266634": ("8", 118184783, "C", "Type 2 diabetes (SLC30A8)", "risk", "Metabolic"), + "rs7754840": ("6", 20679709, "C", "Type 2 diabetes (CDKAL1)", "risk", "Metabolic"), + "rs10811661": ("9", 22134095, "T", "Type 2 diabetes (CDKN2A/B)", "risk", "Metabolic"), + "rs864745": ("7", 28196413, "T", "Type 2 diabetes (JAZF1)", "risk", "Metabolic"), + "rs4402960": ("3", 185511687, "T", "Type 2 diabetes (IGF2BP2)", "risk", "Metabolic"), + # Obesity/BMI + "rs9939609": ("16", 53820527, "A", "Obesity (FTO)", "risk", "Metabolic"), + "rs17782313": ("18", 57851097, "C", "Obesity (MC4R)", "risk", "Metabolic"), + "rs6548238": ("2", 634905, "C", "BMI", "higher", "Metabolic"), + "rs10938397": ("4", 45186139, "G", "BMI (GNPDA2)", "higher", "Metabolic"), + "rs571312": ("18", 57839769, "A", "BMI (MC4R)", "higher", "Metabolic"), + "rs10767664": ("11", 27682562, "A", "BMI (BDNF)", "higher", "Metabolic"), + + # ======================================================================== + # EYE CONDITIONS (原有 + 擴展) + # ======================================================================== + "rs10490924": ("10", 124214448, "T", "Age-related macular degeneration (ARMS2)", "risk", "Eye"), + "rs1061170": ("1", 196659237, "C", "Age-related macular degeneration (CFH)", "risk", "Eye"), + "rs9621532": ("22", 38477587, "C", "Myopia", "risk", "Eye"), + "rs10034228": ("4", 81951543, "A", "Myopia", "risk", "Eye"), + "rs1048661": ("1", 165655423, "C", "Glaucoma (LOXL1)", "risk", "Eye"), + "rs4656461": ("1", 165653012, "G", "Glaucoma (LOXL1)", "risk", "Eye"), + "rs2165241": ("15", 93600556, "T", "Glaucoma", "risk", "Eye"), + "rs3753841": ("1", 196704632, "C", "Age-related macular degeneration", "risk", "Eye"), + + # ======================================================================== + # NEUROPSYCHIATRIC (原有) + # ======================================================================== + # Alzheimer's Disease + "rs429358": ("19", 45411941, "C", "Alzheimer's disease (APOE e4)", "risk", "Neuropsychiatric"), + "rs7412": ("19", 45412079, "T", "Alzheimer's disease (APOE e2)", "protective", "Neuropsychiatric"), + "rs3865444": ("19", 51727962, "C", "Alzheimer's disease (CD33)", "risk", "Neuropsychiatric"), + "rs744373": ("2", 127892810, "G", "Alzheimer's disease (BIN1)", "risk", "Neuropsychiatric"), + "rs3851179": ("11", 85868640, "T", "Alzheimer's disease (PICALM)", "protective", "Neuropsychiatric"), + "rs670139": ("11", 59939307, "G", "Alzheimer's disease (MS4A)", "risk", "Neuropsychiatric"), + "rs9349407": ("6", 47487762, "C", "Alzheimer's disease (CD2AP)", "risk", "Neuropsychiatric"), + "rs11136000": ("8", 27468503, "C", "Alzheimer's disease (CLU)", "protective", "Neuropsychiatric"), + "rs3764650": ("19", 1063443, "G", "Alzheimer's disease (ABCA7)", "risk", "Neuropsychiatric"), + "rs3818361": ("1", 207692049, "A", "Alzheimer's disease (CR1)", "risk", "Neuropsychiatric"), + # Parkinson's Disease (新增) + "rs356220": ("4", 90626111, "T", "Parkinson's disease (SNCA)", "risk", "Neuropsychiatric"), + "rs11931074": ("4", 90674917, "G", "Parkinson's disease (SNCA)", "risk", "Neuropsychiatric"), + "rs34637584": ("12", 40734202, "A", "Parkinson's disease (LRRK2)", "risk", "Neuropsychiatric"), + "rs34311866": ("4", 951947, "C", "Parkinson's disease (TMEM175)", "risk", "Neuropsychiatric"), + # Depression + "rs1545843": ("1", 72761657, "A", "Major depression (NEGR1)", "risk", "Neuropsychiatric"), + "rs7973260": ("12", 118364392, "A", "Major depression (KSR2)", "risk", "Neuropsychiatric"), + "rs10514299": ("5", 87992715, "T", "Major depression (TMEM161B)", "risk", "Neuropsychiatric"), + "rs2422321": ("15", 88945878, "G", "Major depression (NTRK3)", "risk", "Neuropsychiatric"), + "rs301806": ("1", 8477981, "A", "Major depression (RERE)", "risk", "Neuropsychiatric"), + "rs1432639": ("3", 117115304, "G", "Major depression (LSAMP)", "risk", "Neuropsychiatric"), + "rs9530139": ("13", 53645407, "G", "Major depression", "risk", "Neuropsychiatric"), + "rs4543289": ("10", 106610839, "T", "Major depression (SORCS3)", "risk", "Neuropsychiatric"), + # Anxiety + "rs1709393": ("1", 34774088, "A", "Anxiety disorder", "risk", "Neuropsychiatric"), + "rs7688285": ("4", 123372626, "A", "Anxiety disorder", "risk", "Neuropsychiatric"), + # Bipolar + "rs4765913": ("12", 2345295, "A", "Bipolar disorder (CACNA1C)", "risk", "Neuropsychiatric"), + "rs10994336": ("10", 64649959, "T", "Bipolar disorder (ANK3)", "risk", "Neuropsychiatric"), + "rs9804190": ("11", 79077426, "C", "Bipolar disorder (ODZ4)", "risk", "Neuropsychiatric"), + # Schizophrenia + "rs1625579": ("8", 130635575, "T", "Schizophrenia (MIR137)", "risk", "Neuropsychiatric"), + "rs2007044": ("6", 28626894, "G", "Schizophrenia (HIST1H2BJ)", "risk", "Neuropsychiatric"), + "rs6932590": ("6", 27243984, "T", "Schizophrenia", "risk", "Neuropsychiatric"), + # ADHD (新增) + "rs1412005": ("16", 73099702, "T", "ADHD", "risk", "Neuropsychiatric"), + "rs11210892": ("1", 44185231, "A", "ADHD", "risk", "Neuropsychiatric"), + + # ======================================================================== + # OTHER TRAITS (原有 + 擴展) + # ======================================================================== + # Caffeine + "rs762551": ("15", 75041917, "C", "Caffeine metabolism (slow)", "slow", "Other"), + "rs2472297": ("15", 75027880, "T", "Caffeine consumption", "higher", "Other"), + # Alcohol + "rs671": ("12", 112241766, "A", "Alcohol flush reaction (ALDH2)", "risk", "Other"), + "rs1229984": ("4", 100239319, "T", "Alcohol metabolism (ADH1B)", "fast", "Other"), + # Lactose + "rs4988235_lct": ("2", 136608646, "G", "Lactose intolerance (LCT)", "risk", "Other"), + # Vitamin D + "rs12785878": ("11", 71167449, "T", "Vitamin D levels (lower)", "lower", "Other"), + # Hair + "rs2180439": ("20", 22162468, "T", "Male pattern baldness", "risk", "Other"), + "rs1160312": ("X", 67052952, "A", "Male pattern baldness (AR)", "risk", "Other"), + "rs6625163": ("X", 67177092, "A", "Male pattern baldness", "risk", "Other"), + # Muscle performance (新增) + "rs1815739": ("11", 66560624, "T", "Sprint/Power athlete (ACTN3)", "power", "Other"), + # Bitter taste (新增) + "rs713598": ("7", 141972804, "C", "Bitter taste sensitivity (PTC)", "taster", "Other"), + "rs1726866": ("7", 141972905, "T", "Bitter taste sensitivity", "taster", "Other"), + # Cilantro aversion (新增) + "rs72921001": ("11", 6889648, "A", "Cilantro aversion", "aversion", "Other"), +} + +# Category display order and descriptions +CATEGORIES = { + "Gout": "痛風 / 尿酸代謝", + "Kidney": "腎臟疾病", + "Hearing": "聽力損失", + "Autoimmune": "自體免疫疾病", + "Cancer": "癌症風險", + "Thrombosis": "血栓 / 凝血", + "Thyroid": "甲狀腺疾病", + "Bone": "骨質疏鬆 / 骨骼健康", + "Liver": "肝臟疾病", + "Migraine": "偏頭痛", + "Longevity": "長壽 / 老化", + "Sleep": "睡眠", + "Skin": "皮膚", + "Cardiovascular": "心血管疾病", + "Metabolic": "代謝疾病", + "Eye": "眼睛疾病", + "Neuropsychiatric": "神經精神疾病", + "Other": "其他特性", +} + + +def get_genotype_class(gt: str) -> str: + """Classify genotype""" + if gt in ['./.', '.|.', '.']: + return 'MISSING' + + alleles = re.split('[/|]', gt) + if all(a == '0' for a in alleles): + return 'HOM_REF' + elif all(a != '0' and a != '.' for a in alleles): + return 'HOM_ALT' + else: + return 'HET' + + +def parse_vcf_for_traits(vcf_path: str, sample_idx: int = 2) -> Tuple[Dict, List]: + """Parse VCF and look for trait-associated SNPs""" + + print(f"Scanning VCF for {len(TRAIT_SNPS)} trait-associated variants...") + + # Build position lookup + pos_to_snp = {} + for rsid, (chrom, pos, risk_allele, trait, effect, category) in TRAIT_SNPS.items(): + key = f"{chrom}-{pos}" + if key not in pos_to_snp: + pos_to_snp[key] = [] + pos_to_snp[key].append((rsid, risk_allele, trait, effect, category)) + + found_variants = {} + samples = [] + + open_func = gzip.open if vcf_path.endswith('.gz') else open + mode = 'rt' if vcf_path.endswith('.gz') else 'r' + + with open_func(vcf_path, mode) as f: + for line in f: + if line.startswith('##'): + continue + elif line.startswith('#CHROM'): + parts = line.strip().split('\t') + samples = parts[9:] + continue + + parts = line.strip().split('\t') + if len(parts) < 10: + continue + + chrom, pos, rsid_vcf, ref, alt, qual, filt, info, fmt = parts[:9] + gt_fields = parts[9:] + + # Check if this position has a known trait SNP + key = f"{chrom}-{pos}" + if key not in pos_to_snp: + continue + + # Get sample genotype + fmt_parts = fmt.split(':') + gt_idx = fmt_parts.index('GT') if 'GT' in fmt_parts else 0 + + if sample_idx < len(gt_fields): + gt_data = gt_fields[sample_idx].split(':') + gt = gt_data[gt_idx] if gt_idx < len(gt_data) else './.' + else: + gt = './.' + + gt_class = get_genotype_class(gt) + alleles = [ref] + alt.split(',') + + # Process each SNP at this position + for rsid, risk_allele, trait, effect, category in pos_to_snp[key]: + # Check if risk allele is present + has_risk = False + risk_copies = 0 + + if gt_class != 'MISSING': + gt_alleles = re.split('[/|]', gt) + for a in gt_alleles: + if a.isdigit(): + allele_idx = int(a) + if allele_idx < len(alleles) and alleles[allele_idx] == risk_allele: + has_risk = True + risk_copies += 1 + + found_variants[rsid] = { + 'rsid': rsid, + 'chrom': chrom, + 'pos': pos, + 'ref': ref, + 'alt': alt, + 'genotype': gt, + 'genotype_class': gt_class, + 'risk_allele': risk_allele, + 'trait': trait, + 'effect': effect, + 'category': category, + 'has_risk_allele': has_risk, + 'risk_copies': risk_copies + } + + return found_variants, samples + + +def generate_report(found_variants: Dict, output_path: str, sample_name: str): + """Generate comprehensive trait analysis report""" + + # Group by category + by_category = defaultdict(list) + for rsid, var in found_variants.items(): + by_category[var['category']].append(var) + + with open(output_path, 'w') as f: + f.write("=" * 80 + "\n") + f.write("COMPREHENSIVE GWAS TRAIT ANALYSIS REPORT\n") + f.write(f"Sample: {sample_name}\n") + f.write(f"Total SNPs analyzed: {len(TRAIT_SNPS)}\n") + f.write(f"SNPs found in data: {len(found_variants)}\n") + f.write("=" * 80 + "\n\n") + + # Summary statistics + total_risk = sum(1 for v in found_variants.values() if v['has_risk_allele']) + f.write(f"OVERALL SUMMARY: {total_risk} risk variants found\n\n") + + # Category summary + f.write("=" * 80 + "\n") + f.write("SUMMARY BY CATEGORY\n") + f.write("=" * 80 + "\n\n") + + for cat_key in CATEGORIES.keys(): + if cat_key in by_category: + variants = by_category[cat_key] + risk_count = sum(1 for v in variants if v['has_risk_allele']) + cat_name = CATEGORIES[cat_key] + f.write(f"{cat_name}: {risk_count}/{len(variants)} risk variants\n") + + # Detailed results by category + f.write("\n" + "=" * 80 + "\n") + f.write("DETAILED RESULTS BY CATEGORY\n") + f.write("=" * 80 + "\n") + + for cat_key in CATEGORIES.keys(): + if cat_key not in by_category: + continue + + variants = by_category[cat_key] + cat_name = CATEGORIES[cat_key] + risk_count = sum(1 for v in variants if v['has_risk_allele']) + + f.write(f"\n\n## {cat_name} ({risk_count}/{len(variants)} risk)\n") + f.write("-" * 60 + "\n") + + # Sort: risk variants first + sorted_vars = sorted(variants, key=lambda x: (not x['has_risk_allele'], x['trait'])) + + for v in sorted_vars: + status = "⚠️ RISK" if v['has_risk_allele'] else "✓ OK" + copies = f"({v['risk_copies']}份)" if v['has_risk_allele'] else "" + f.write(f"\n{v['trait']}: {v['rsid']} [{status}] {copies}\n") + f.write(f" 基因型: {v['genotype']} | 風險等位基因: {v['risk_allele']} | 效應: {v['effect']}\n") + + # Full variant table + f.write("\n\n" + "=" * 80 + "\n") + f.write("COMPLETE VARIANT TABLE\n") + f.write("=" * 80 + "\n\n") + + f.write("RSID\tCHROM\tPOS\tGENOTYPE\tRISK_ALLELE\tHAS_RISK\tCOPIES\tTRAIT\tCATEGORY\tEFFECT\n") + + for rsid, var in sorted(found_variants.items(), key=lambda x: (x[1]['category'], x[1]['trait'])): + f.write(f"{var['rsid']}\t{var['chrom']}\t{var['pos']}\t{var['genotype']}\t") + f.write(f"{var['risk_allele']}\t{var['has_risk_allele']}\t{var['risk_copies']}\t") + f.write(f"{var['trait']}\t{var['category']}\t{var['effect']}\n") + + print(f"Report saved to: {output_path}") + + +def main(): + vcf_path = sys.argv[1] if len(sys.argv) > 1 else '/Volumes/NV2/genomics_analysis/vcf/trio_joint.rsid.vcf.gz' + output_path = sys.argv[2] if len(sys.argv) > 2 else '/Volumes/NV2/genomics_analysis/gwas_comprehensive_report.txt' + sample_idx = int(sys.argv[3]) if len(sys.argv) > 3 else 2 + + print("=" * 60) + print("COMPREHENSIVE GWAS TRAIT ANALYSIS") + print("=" * 60) + print(f"VCF: {vcf_path}") + print(f"Sample index: {sample_idx}") + print(f"Total trait SNPs in database: {len(TRAIT_SNPS)}") + print() + + found_variants, samples = parse_vcf_for_traits(vcf_path, sample_idx) + + sample_name = samples[sample_idx] if sample_idx < len(samples) else f"Sample_{sample_idx}" + print(f"Analyzing sample: {sample_name}") + print(f"\nFound {len(found_variants)} trait-associated variants in VCF") + + # Quick summary by category + by_category = defaultdict(list) + for rsid, var in found_variants.items(): + by_category[var['category']].append(var) + + print("\n" + "=" * 60) + print("QUICK SUMMARY BY CATEGORY") + print("=" * 60) + + for cat_key in CATEGORIES.keys(): + if cat_key in by_category: + variants = by_category[cat_key] + risk_count = sum(1 for v in variants if v['has_risk_allele']) + cat_name = CATEGORIES[cat_key] + marker = "⚠️ " if risk_count > 0 else " " + print(f"{marker}{cat_name}: {risk_count}/{len(variants)} risk variants") + + generate_report(found_variants, output_path, sample_name) + + # Print high-risk findings + print("\n" + "=" * 60) + print("HIGH-PRIORITY FINDINGS (2+ copies of risk allele)") + print("=" * 60) + + high_risk = [v for v in found_variants.values() if v['risk_copies'] >= 2] + if high_risk: + for v in sorted(high_risk, key=lambda x: x['category']): + print(f"\n{v['trait']} ({v['rsid']})") + print(f" Category: {CATEGORIES[v['category']]}") + print(f" Genotype: {v['genotype']} (2 copies of risk allele {v['risk_allele']})") + else: + print("\nNo variants with 2 copies of risk allele found.") + + +if __name__ == '__main__': + main() diff --git a/gwas_trait_lookup.py b/gwas_trait_lookup.py new file mode 100644 index 0000000..439396f --- /dev/null +++ b/gwas_trait_lookup.py @@ -0,0 +1,365 @@ +#!/usr/bin/env python3 +""" +GWAS Trait Lookup Script +Searches for trait-associated variants in VCF data using GWAS Catalog data. +""" + +import gzip +import sys +import os +from collections import defaultdict +from dataclasses import dataclass +from typing import Dict, List, Optional, Set, Tuple + +@dataclass +class GWASAssociation: + """GWAS association entry""" + rsid: str + chrom: str + pos: int + risk_allele: str + trait: str + p_value: float + odds_ratio: Optional[float] + beta: Optional[float] + pubmed_id: str + study: str + +@dataclass +class TraitResult: + """Result for a specific trait""" + trait: str + variants_found: int + risk_variants: int + protective_variants: int + details: List[dict] + +# Common trait-associated SNPs (curated from GWAS Catalog) +# Format: rsid -> (chrom, pos, risk_allele, trait, effect_direction) +TRAIT_SNPS = { + # Sleep quality / Insomnia + "rs113851554": ("2", 66799986, "T", "Insomnia", "risk"), + "rs12927162": ("16", 68856985, "A", "Sleep duration", "shorter"), + "rs1823125": ("1", 205713532, "G", "Chronotype (morning person)", "morning"), + "rs10493596": ("1", 215803417, "T", "Insomnia", "risk"), + "rs3104997": ("6", 27424938, "C", "Sleep duration", "shorter"), + "rs73598374": ("4", 94847526, "A", "Insomnia", "risk"), + "rs2302729": ("5", 35857091, "G", "Insomnia", "risk"), + + # Skin conditions + "rs1800629": ("6", 31543031, "A", "Psoriasis", "risk"), + "rs20541": ("5", 131995964, "A", "Atopic dermatitis", "risk"), + "rs2066808": ("6", 31540784, "A", "Psoriasis", "risk"), + "rs3093662": ("6", 31574339, "G", "Psoriasis", "risk"), + "rs10484554": ("6", 31271836, "A", "Psoriasis", "risk"), + "rs1295686": ("5", 131996447, "A", "Atopic dermatitis", "risk"), + "rs2227956": ("6", 31783279, "T", "Psoriasis", "risk"), + "rs6906021": ("6", 32051991, "C", "Atopic dermatitis", "risk"), + "rs661313": ("1", 152285861, "G", "Ichthyosis vulgaris (FLG)", "risk"), + "rs2065958": ("11", 35300406, "C", "Atopic dermatitis", "risk"), + + # Cardiovascular + "rs10757274": ("9", 22096055, "G", "Coronary artery disease", "risk"), + "rs1333049": ("9", 22125503, "C", "Coronary artery disease", "risk"), + "rs4665058": ("2", 43845437, "C", "Coronary artery disease", "risk"), + "rs17465637": ("1", 222823529, "A", "Coronary artery disease", "risk"), + "rs6725887": ("2", 203828796, "C", "Coronary artery disease", "risk"), + + # Type 2 Diabetes + "rs7903146": ("10", 114758349, "T", "Type 2 diabetes", "risk"), + "rs12255372": ("10", 114808902, "T", "Type 2 diabetes", "risk"), + "rs1801282": ("3", 12393125, "C", "Type 2 diabetes", "risk"), + "rs5219": ("11", 17409572, "T", "Type 2 diabetes", "risk"), + "rs13266634": ("8", 118184783, "C", "Type 2 diabetes", "risk"), + + # Obesity/BMI + "rs9939609": ("16", 53820527, "A", "Obesity (FTO)", "risk"), + "rs17782313": ("18", 57851097, "C", "Obesity (MC4R)", "risk"), + "rs6548238": ("2", 634905, "C", "BMI", "higher"), + "rs10938397": ("4", 45186139, "G", "BMI", "higher"), + + # Hair loss / Baldness + "rs2180439": ("20", 22162468, "T", "Male pattern baldness", "risk"), + "rs1160312": ("X", 67052952, "A", "Male pattern baldness", "risk"), + "rs6625163": ("X", 67177092, "A", "Male pattern baldness", "risk"), + + # Eye conditions (relevant to Usher) + "rs10490924": ("10", 124214448, "T", "Age-related macular degeneration", "risk"), + "rs1061170": ("1", 196659237, "C", "Age-related macular degeneration", "risk"), + "rs9621532": ("22", 38477587, "C", "Myopia", "risk"), + + # Caffeine metabolism + "rs762551": ("15", 75041917, "C", "Caffeine metabolism (slow)", "slow"), + "rs2472297": ("15", 75027880, "T", "Caffeine consumption", "higher"), + + # Alcohol metabolism + "rs671": ("12", 112241766, "A", "Alcohol flush reaction", "risk"), + "rs1229984": ("4", 100239319, "T", "Alcohol metabolism (fast)", "fast"), + + # Lactose intolerance + "rs4988235": ("2", 136608646, "G", "Lactose intolerance", "risk"), + + # Vitamin D + "rs2282679": ("4", 72608383, "C", "Vitamin D deficiency", "risk"), + "rs12785878": ("11", 71167449, "T", "Vitamin D levels (lower)", "lower"), + + # Alzheimer's Disease / Dementia + "rs429358": ("19", 45411941, "C", "Alzheimer's disease (APOE e4)", "risk"), # APOE e4 + "rs7412": ("19", 45412079, "T", "Alzheimer's disease (APOE e2)", "protective"), # APOE e2 + "rs3865444": ("19", 51727962, "C", "Alzheimer's disease (CD33)", "risk"), + "rs744373": ("2", 127892810, "G", "Alzheimer's disease (BIN1)", "risk"), + "rs3851179": ("11", 85868640, "T", "Alzheimer's disease (PICALM)", "protective"), + "rs670139": ("11", 59939307, "G", "Alzheimer's disease (MS4A)", "risk"), + "rs9349407": ("6", 47487762, "C", "Alzheimer's disease (CD2AP)", "risk"), + "rs11136000": ("8", 27468503, "C", "Alzheimer's disease (CLU)", "protective"), + "rs3764650": ("19", 1063443, "G", "Alzheimer's disease (ABCA7)", "risk"), + "rs3818361": ("1", 207692049, "A", "Alzheimer's disease (CR1)", "risk"), + + # Depression / Major Depressive Disorder + "rs1545843": ("1", 72761657, "A", "Major depression (NEGR1)", "risk"), + "rs7973260": ("12", 118364392, "A", "Major depression (KSR2)", "risk"), + "rs10514299": ("5", 87992715, "T", "Major depression (TMEM161B)", "risk"), + "rs2422321": ("15", 88945878, "G", "Major depression (NTRK3)", "risk"), + "rs301806": ("1", 8477981, "A", "Major depression (RERE)", "risk"), + "rs1432639": ("3", 117115304, "G", "Major depression (LSAMP)", "risk"), + "rs9530139": ("13", 53645407, "G", "Major depression", "risk"), + "rs4543289": ("10", 106610839, "T", "Major depression (SORCS3)", "risk"), + + # Anxiety + "rs1709393": ("1", 34774088, "A", "Anxiety disorder", "risk"), + "rs7688285": ("4", 123372626, "A", "Anxiety disorder", "risk"), + + # Bipolar disorder + "rs4765913": ("12", 2345295, "A", "Bipolar disorder (CACNA1C)", "risk"), + "rs10994336": ("10", 64649959, "T", "Bipolar disorder (ANK3)", "risk"), + "rs9804190": ("11", 79077426, "C", "Bipolar disorder (ODZ4)", "risk"), + + # Schizophrenia + "rs1625579": ("8", 130635575, "T", "Schizophrenia (MIR137)", "risk"), + "rs2007044": ("6", 28626894, "G", "Schizophrenia (HIST1H2BJ)", "risk"), + "rs6932590": ("6", 27243984, "T", "Schizophrenia", "risk"), +} + + +def get_genotype_class(gt: str) -> str: + """Classify genotype""" + if gt in ['./.', '.|.', '.']: + return 'MISSING' + + import re + alleles = re.split('[/|]', gt) + if all(a == '0' for a in alleles): + return 'HOM_REF' + elif all(a != '0' and a != '.' for a in alleles): + return 'HOM_ALT' + else: + return 'HET' + + +def parse_vcf_for_traits(vcf_path: str, proband_idx: int = 2) -> Dict[str, dict]: + """Parse VCF and look for trait-associated SNPs""" + + print(f"Scanning VCF for trait-associated variants...") + + # Build position lookup + pos_to_snp = {} + for rsid, (chrom, pos, risk_allele, trait, effect) in TRAIT_SNPS.items(): + key = f"{chrom}-{pos}" + pos_to_snp[key] = (rsid, risk_allele, trait, effect) + + found_variants = {} + samples = [] + + open_func = gzip.open if vcf_path.endswith('.gz') else open + mode = 'rt' if vcf_path.endswith('.gz') else 'r' + + with open_func(vcf_path, mode) as f: + for line in f: + if line.startswith('##'): + continue + elif line.startswith('#CHROM'): + parts = line.strip().split('\t') + samples = parts[9:] + continue + + parts = line.strip().split('\t') + if len(parts) < 10: + continue + + chrom, pos, rsid_vcf, ref, alt, qual, filt, info, fmt = parts[:9] + gt_fields = parts[9:] + + # Check if this position has a known trait SNP + key = f"{chrom}-{pos}" + if key not in pos_to_snp: + continue + + rsid, risk_allele, trait, effect = pos_to_snp[key] + + # Get proband genotype + fmt_parts = fmt.split(':') + gt_idx = fmt_parts.index('GT') if 'GT' in fmt_parts else 0 + + if proband_idx < len(gt_fields): + gt_data = gt_fields[proband_idx].split(':') + gt = gt_data[gt_idx] if gt_idx < len(gt_data) else './.' + else: + gt = './.' + + gt_class = get_genotype_class(gt) + + # Determine risk + alleles = [ref] + alt.split(',') + + # Check if risk allele is present + has_risk = False + risk_copies = 0 + + if gt_class != 'MISSING': + import re + gt_alleles = re.split('[/|]', gt) + for a in gt_alleles: + if a.isdigit(): + allele_idx = int(a) + if allele_idx < len(alleles) and alleles[allele_idx] == risk_allele: + has_risk = True + risk_copies += 1 + + found_variants[rsid] = { + 'rsid': rsid, + 'chrom': chrom, + 'pos': pos, + 'ref': ref, + 'alt': alt, + 'genotype': gt, + 'genotype_class': gt_class, + 'risk_allele': risk_allele, + 'trait': trait, + 'effect': effect, + 'has_risk_allele': has_risk, + 'risk_copies': risk_copies + } + + return found_variants, samples + + +def generate_trait_report(found_variants: Dict, output_path: str): + """Generate trait analysis report""" + + # Group by trait + traits = defaultdict(list) + for rsid, var in found_variants.items(): + traits[var['trait']].append(var) + + with open(output_path, 'w') as f: + f.write("# GWAS Trait Analysis Report\n") + f.write("# Based on curated GWAS Catalog associations\n\n") + + f.write("=" * 80 + "\n") + f.write("SUMMARY BY TRAIT CATEGORY\n") + f.write("=" * 80 + "\n\n") + + # Categorize traits + categories = { + "Sleep": ["Insomnia", "Sleep duration", "Chronotype (morning person)"], + "Skin": ["Psoriasis", "Atopic dermatitis", "Ichthyosis vulgaris (FLG)"], + "Cardiovascular": ["Coronary artery disease"], + "Metabolic": ["Type 2 diabetes", "Obesity (FTO)", "Obesity (MC4R)", "BMI"], + "Eye": ["Age-related macular degeneration", "Myopia"], + "Neuropsychiatric": [ + "Alzheimer's disease (APOE e4)", "Alzheimer's disease (APOE e2)", + "Alzheimer's disease (CD33)", "Alzheimer's disease (BIN1)", + "Alzheimer's disease (PICALM)", "Alzheimer's disease (MS4A)", + "Alzheimer's disease (CD2AP)", "Alzheimer's disease (CLU)", + "Alzheimer's disease (ABCA7)", "Alzheimer's disease (CR1)", + "Major depression (NEGR1)", "Major depression (KSR2)", + "Major depression (TMEM161B)", "Major depression (NTRK3)", + "Major depression (RERE)", "Major depression (LSAMP)", + "Major depression", "Major depression (SORCS3)", + "Anxiety disorder", + "Bipolar disorder (CACNA1C)", "Bipolar disorder (ANK3)", "Bipolar disorder (ODZ4)", + "Schizophrenia (MIR137)", "Schizophrenia (HIST1H2BJ)", "Schizophrenia" + ], + "Other": ["Caffeine metabolism (slow)", "Caffeine consumption", "Alcohol flush reaction", + "Alcohol metabolism (fast)", "Lactose intolerance", "Vitamin D deficiency", + "Vitamin D levels (lower)", "Male pattern baldness"] + } + + for category, trait_list in categories.items(): + f.write(f"\n## {category}\n") + f.write("-" * 40 + "\n") + + category_risk = 0 + category_total = 0 + + for trait in trait_list: + if trait in traits: + variants = traits[trait] + risk_count = sum(1 for v in variants if v['has_risk_allele']) + category_risk += risk_count + category_total += len(variants) + + for v in variants: + status = "RISK" if v['has_risk_allele'] else "OK" + copies = f"({v['risk_copies']} copies)" if v['has_risk_allele'] else "" + f.write(f" {v['trait']}: {v['rsid']} [{status}] {copies}\n") + f.write(f" Genotype: {v['genotype']} | Risk allele: {v['risk_allele']}\n") + + if category_total > 0: + f.write(f"\n Category summary: {category_risk}/{category_total} risk variants found\n") + + # Detailed results + f.write("\n" + "=" * 80 + "\n") + f.write("DETAILED RESULTS\n") + f.write("=" * 80 + "\n\n") + + f.write("RSID\tCHROM\tPOS\tGENOTYPE\tRISK_ALLELE\tHAS_RISK\tCOPIES\tTRAIT\tEFFECT\n") + + for rsid, var in sorted(found_variants.items(), key=lambda x: (x[1]['trait'], x[0])): + f.write(f"{var['rsid']}\t{var['chrom']}\t{var['pos']}\t{var['genotype']}\t") + f.write(f"{var['risk_allele']}\t{var['has_risk_allele']}\t{var['risk_copies']}\t") + f.write(f"{var['trait']}\t{var['effect']}\n") + + print(f"Report saved to: {output_path}") + + +def main(): + vcf_path = sys.argv[1] if len(sys.argv) > 1 else '/Volumes/NV2/genomics_analysis/vcf/trio_joint.snpeff.vcf' + output_path = sys.argv[2] if len(sys.argv) > 2 else '/Volumes/NV2/genomics_analysis/gwas_trait_report.txt' + proband_idx = int(sys.argv[3]) if len(sys.argv) > 3 else 2 + + print(f"GWAS Trait Analysis") + print(f"VCF: {vcf_path}") + print(f"Proband index: {proband_idx}") + print(f"Searching for {len(TRAIT_SNPS)} trait-associated SNPs...\n") + + found_variants, samples = parse_vcf_for_traits(vcf_path, proband_idx) + + print(f"\nFound {len(found_variants)} trait-associated variants in VCF") + + # Quick summary + risk_count = sum(1 for v in found_variants.values() if v['has_risk_allele']) + print(f"Variants with risk allele: {risk_count}") + + generate_trait_report(found_variants, output_path) + + # Print summary to console + print("\n" + "=" * 60) + print("QUICK SUMMARY") + print("=" * 60) + + traits = defaultdict(list) + for rsid, var in found_variants.items(): + traits[var['trait']].append(var) + + for trait in sorted(traits.keys()): + variants = traits[trait] + risk_vars = [v for v in variants if v['has_risk_allele']] + if risk_vars: + print(f"\n{trait}:") + for v in risk_vars: + print(f" {v['rsid']}: {v['genotype']} (risk allele: {v['risk_allele']}, copies: {v['risk_copies']})") + + +if __name__ == '__main__': + main() diff --git a/pharmacogenomics.py b/pharmacogenomics.py new file mode 100644 index 0000000..90ab384 --- /dev/null +++ b/pharmacogenomics.py @@ -0,0 +1,376 @@ +#!/usr/bin/env python3 +""" +Pharmacogenomics Analysis Script +Analyzes drug-gene interactions based on PharmGKB and CPIC guidelines. +""" + +import gzip +import sys +import re +from collections import defaultdict +from dataclasses import dataclass +from typing import Dict, List, Optional + +# Key pharmacogenomic variants (curated from PharmGKB/CPIC) +# Format: rsid -> (chrom, pos, gene, drug_class, effect, clinical_recommendation) + +PHARMGKB_VARIANTS = { + # CYP2D6 - Codeine, Tramadol, Tamoxifen, many antidepressants + "rs3892097": ("22", 42526694, "CYP2D6", "Codeine/Tramadol/Antidepressants", + "*4 allele - Poor metabolizer", + "Reduced efficacy of codeine (no conversion to morphine); Consider alternative analgesics"), + "rs1065852": ("22", 42525772, "CYP2D6", "Codeine/Tramadol/Antidepressants", + "*10 allele - Reduced function", + "Intermediate metabolizer; May need dose adjustment"), + "rs16947": ("22", 42523943, "CYP2D6", "Codeine/Tramadol", + "*2 allele - Normal function", + "Normal metabolism"), + + # CYP2C19 - Clopidogrel, PPIs, some antidepressants + "rs4244285": ("10", 96541616, "CYP2C19", "Clopidogrel/PPIs/Antidepressants", + "*2 allele - Loss of function", + "Poor metabolizer; Clopidogrel may have reduced efficacy; Consider prasugrel or ticagrelor"), + "rs4986893": ("10", 96540410, "CYP2C19", "Clopidogrel/PPIs", + "*3 allele - Loss of function", + "Poor metabolizer; Reduced clopidogrel activation"), + "rs12248560": ("10", 96522463, "CYP2C19", "Clopidogrel/PPIs", + "*17 allele - Increased function", + "Ultra-rapid metabolizer; May need lower PPI doses"), + + # CYP2C9 - Warfarin, NSAIDs, Phenytoin + "rs1799853": ("10", 96702047, "CYP2C9", "Warfarin/NSAIDs/Phenytoin", + "*2 allele - Reduced function", + "Slower warfarin metabolism; Lower dose may be needed"), + "rs1057910": ("10", 96741053, "CYP2C9", "Warfarin/NSAIDs/Phenytoin", + "*3 allele - Reduced function", + "Significantly slower warfarin metabolism; Require ~50% lower dose"), + + # VKORC1 - Warfarin sensitivity + "rs9923231": ("16", 31107689, "VKORC1", "Warfarin", + "-1639G>A - Warfarin sensitivity", + "A allele: Increased sensitivity, need lower warfarin dose"), + + # CYP3A4/CYP3A5 - Many drugs (statins, immunosuppressants, etc.) + "rs776746": ("7", 99270539, "CYP3A5", "Tacrolimus/Cyclosporine/Statins", + "*3 allele - Non-expressor", + "Most common; Normal tacrolimus dosing"), + "rs2740574": ("7", 99382096, "CYP3A4", "Statins/Many drugs", + "*1B allele", + "May affect drug metabolism"), + + # SLCO1B1 - Statin-induced myopathy + "rs4149056": ("12", 21331549, "SLCO1B1", "Simvastatin/Statins", + "*5 allele - Reduced function", + "C allele: Increased risk of statin myopathy; Consider lower dose or alternative statin"), + + # TPMT - Thiopurines (Azathioprine, 6-MP) + "rs1800460": ("6", 18130918, "TPMT", "Azathioprine/6-Mercaptopurine", + "*3B allele - Reduced function", + "Intermediate/Poor metabolizer; High risk of myelosuppression; Reduce dose"), + "rs1142345": ("6", 18130725, "TPMT", "Azathioprine/6-Mercaptopurine", + "*3C allele - Reduced function", + "Intermediate/Poor metabolizer; High risk of myelosuppression; Reduce dose"), + + # DPYD - Fluoropyrimidines (5-FU, Capecitabine) + "rs3918290": ("1", 97915614, "DPYD", "5-Fluorouracil/Capecitabine", + "*2A allele - No function", + "CRITICAL: Complete DPD deficiency; Contraindicated - severe toxicity risk"), + "rs55886062": ("1", 98205966, "DPYD", "5-Fluorouracil/Capecitabine", + "*13 allele - No function", + "CRITICAL: DPD deficiency; Contraindicated"), + "rs67376798": ("1", 97981395, "DPYD", "5-Fluorouracil/Capecitabine", + "D949V - Reduced function", + "Intermediate metabolizer; Consider dose reduction"), + + # UGT1A1 - Irinotecan + "rs8175347": ("2", 234668879, "UGT1A1", "Irinotecan", + "*28 allele (TA repeat)", + "7/7 genotype: Reduced glucuronidation; Increased toxicity risk; Consider dose reduction"), + + # HLA-B*57:01 - Abacavir hypersensitivity + "rs2395029": ("6", 31431780, "HLA-B", "Abacavir (HIV)", + "HLA-B*57:01 tag SNP", + "CRITICAL: If positive, abacavir contraindicated - hypersensitivity reaction risk"), + + # HLA-B*15:02 - Carbamazepine/Phenytoin (SJS/TEN) + "rs144012689": ("6", 31356867, "HLA-B", "Carbamazepine/Phenytoin", + "HLA-B*15:02 tag SNP", + "CRITICAL: If positive in Asian ancestry, carbamazepine contraindicated - SJS/TEN risk"), + + # HLA-A*31:01 - Carbamazepine + "rs1061235": ("6", 29912280, "HLA-A", "Carbamazepine", + "HLA-A*31:01 tag SNP", + "If positive, increased carbamazepine hypersensitivity risk"), + + # F5 - Oral contraceptives, HRT (Factor V Leiden) + "rs6025": ("1", 169519049, "F5", "Oral Contraceptives/HRT", + "Factor V Leiden", + "CRITICAL: Increased thrombosis risk; Oral contraceptives relatively contraindicated"), + + # F2 - Oral contraceptives (Prothrombin) + "rs1799963": ("11", 46761055, "F2", "Oral Contraceptives/HRT", + "Prothrombin G20210A", + "Increased thrombosis risk; Caution with oral contraceptives"), + + # MTHFR - Methotrexate, Folate metabolism + "rs1801133": ("1", 11856378, "MTHFR", "Methotrexate/Folate", + "C677T - Reduced function", + "T/T genotype: Reduced MTHFR activity; May need folate supplementation with methotrexate"), + "rs1801131": ("1", 11854476, "MTHFR", "Methotrexate/Folate", + "A1298C", + "May affect folate metabolism"), + + # OPRM1 - Opioid response + "rs1799971": ("6", 154039662, "OPRM1", "Opioids (Morphine, etc.)", + "A118G", + "G allele: May need higher opioid doses for pain relief"), + + # COMT - Pain medications, ADHD drugs + "rs4680": ("22", 19951271, "COMT", "Pain medications/ADHD drugs", + "Val158Met", + "Met/Met: Lower COMT activity; May affect pain perception and stimulant response"), + + # IFNL3 (IL28B) - Hepatitis C treatment + "rs12979860": ("19", 39738787, "IFNL3", "Hepatitis C treatment (Interferon)", + "IL28B genotype", + "C/C genotype: Better response to interferon-based HCV treatment"), + + # NAT2 - Isoniazid, Hydralazine + "rs1801280": ("8", 18257854, "NAT2", "Isoniazid/Hydralazine/Sulfonamides", + "*5 allele - Slow acetylator", + "Slow acetylator; Increased isoniazid toxicity risk; Monitor for peripheral neuropathy"), + "rs1799930": ("8", 18258103, "NAT2", "Isoniazid/Hydralazine", + "*6 allele - Slow acetylator", + "Slow acetylator; May need dose adjustment"), + + # G6PD - Primaquine, Dapsone, Sulfonamides + "rs1050828": ("X", 153764217, "G6PD", "Primaquine/Dapsone/Sulfonamides", + "G6PD A- variant", + "CRITICAL: G6PD deficiency; Avoid oxidant drugs - hemolysis risk"), + + # CYP2B6 - Efavirenz + "rs3745274": ("19", 41512841, "CYP2B6", "Efavirenz (HIV)", + "*6 allele - Reduced function", + "T/T genotype: Slow metabolizer; Consider lower efavirenz dose; CNS side effects more likely"), +} + + +def get_genotype_class(gt: str) -> str: + """Classify genotype""" + if gt in ['./.', '.|.', '.']: + return 'MISSING' + alleles = re.split('[/|]', gt) + if all(a == '0' for a in alleles): + return 'HOM_REF' + elif all(a != '0' and a != '.' for a in alleles): + return 'HOM_ALT' + else: + return 'HET' + + +def analyze_pharmacogenomics(vcf_path: str, proband_idx: int = 2) -> Dict: + """Analyze VCF for pharmacogenomic variants""" + + print("Scanning for pharmacogenomic variants...") + + # Build position lookup + pos_to_variant = {} + for rsid, data in PHARMGKB_VARIANTS.items(): + chrom, pos, gene, drug, effect, recommendation = data + key = f"{chrom}-{pos}" + pos_to_variant[key] = { + 'rsid': rsid, + 'gene': gene, + 'drug': drug, + 'effect': effect, + 'recommendation': recommendation + } + + results = {} + samples = [] + + open_func = gzip.open if vcf_path.endswith('.gz') else open + mode = 'rt' if vcf_path.endswith('.gz') else 'r' + + with open_func(vcf_path, mode) as f: + for line in f: + if line.startswith('##'): + continue + elif line.startswith('#CHROM'): + parts = line.strip().split('\t') + samples = parts[9:] + continue + + parts = line.strip().split('\t') + if len(parts) < 10: + continue + + chrom, pos, rsid_vcf, ref, alt, qual, filt, info, fmt = parts[:9] + gt_fields = parts[9:] + + key = f"{chrom}-{pos}" + if key not in pos_to_variant: + continue + + variant_info = pos_to_variant[key] + + # Get proband genotype + fmt_parts = fmt.split(':') + gt_idx = fmt_parts.index('GT') if 'GT' in fmt_parts else 0 + + if proband_idx < len(gt_fields): + gt_data = gt_fields[proband_idx].split(':') + gt = gt_data[gt_idx] if gt_idx < len(gt_data) else './.' + else: + gt = './.' + + gt_class = get_genotype_class(gt) + + # Determine alleles + alleles = [ref] + alt.split(',') + gt_alleles_str = [] + if gt_class != 'MISSING': + gt_indices = re.split('[/|]', gt) + for idx in gt_indices: + if idx.isdigit() and int(idx) < len(alleles): + gt_alleles_str.append(alleles[int(idx)]) + + results[variant_info['rsid']] = { + **variant_info, + 'chrom': chrom, + 'pos': pos, + 'ref': ref, + 'alt': alt, + 'genotype': gt, + 'genotype_class': gt_class, + 'alleles': '/'.join(gt_alleles_str) if gt_alleles_str else 'N/A', + 'has_variant': gt_class in ['HET', 'HOM_ALT'] + } + + return results, samples + + +def generate_pgx_report(results: Dict, output_path: str): + """Generate pharmacogenomics report""" + + # Categorize by drug class + drug_classes = defaultdict(list) + for rsid, data in results.items(): + drug_classes[data['drug']].append(data) + + # Identify actionable results + critical = [] + actionable = [] + informational = [] + + for rsid, data in results.items(): + if data['has_variant']: + if 'CRITICAL' in data['recommendation']: + critical.append(data) + elif any(word in data['recommendation'].lower() for word in ['reduce', 'consider', 'lower', 'avoid', 'contraindicated']): + actionable.append(data) + else: + informational.append(data) + + with open(output_path, 'w') as f: + f.write("# Pharmacogenomics Analysis Report\n") + f.write("# Based on PharmGKB and CPIC Guidelines\n\n") + + # Critical findings first + if critical: + f.write("=" * 80 + "\n") + f.write("⚠️ CRITICAL FINDINGS - Immediate Clinical Relevance\n") + f.write("=" * 80 + "\n\n") + for data in critical: + f.write(f"GENE: {data['gene']} ({data['rsid']})\n") + f.write(f" Drug(s): {data['drug']}\n") + f.write(f" Genotype: {data['alleles']} ({data['genotype_class']})\n") + f.write(f" Effect: {data['effect']}\n") + f.write(f" ⚠️ {data['recommendation']}\n\n") + + # Actionable findings + if actionable: + f.write("=" * 80 + "\n") + f.write("📋 ACTIONABLE FINDINGS - May Require Dose Adjustment\n") + f.write("=" * 80 + "\n\n") + for data in actionable: + f.write(f"GENE: {data['gene']} ({data['rsid']})\n") + f.write(f" Drug(s): {data['drug']}\n") + f.write(f" Genotype: {data['alleles']} ({data['genotype_class']})\n") + f.write(f" Effect: {data['effect']}\n") + f.write(f" Recommendation: {data['recommendation']}\n\n") + + # Summary by drug class + f.write("=" * 80 + "\n") + f.write("SUMMARY BY DRUG CLASS\n") + f.write("=" * 80 + "\n\n") + + for drug_class in sorted(drug_classes.keys()): + variants = drug_classes[drug_class] + has_risk = any(v['has_variant'] for v in variants) + status = "⚠️ VARIANT DETECTED" if has_risk else "✓ Normal" + f.write(f"\n## {drug_class}\n") + f.write(f"Status: {status}\n") + for v in variants: + marker = "→" if v['has_variant'] else " " + f.write(f" {marker} {v['gene']} ({v['rsid']}): {v['alleles']} - {v['genotype_class']}\n") + + # Detailed table + f.write("\n" + "=" * 80 + "\n") + f.write("DETAILED RESULTS\n") + f.write("=" * 80 + "\n\n") + f.write("RSID\tGENE\tGENOTYPE\tALLELES\tHAS_VARIANT\tDRUG\tEFFECT\n") + + for rsid in sorted(results.keys()): + data = results[rsid] + f.write(f"{rsid}\t{data['gene']}\t{data['genotype']}\t{data['alleles']}\t") + f.write(f"{data['has_variant']}\t{data['drug']}\t{data['effect']}\n") + + print(f"Report saved to: {output_path}") + return critical, actionable + + +def main(): + vcf_path = sys.argv[1] if len(sys.argv) > 1 else '/Volumes/NV2/genomics_analysis/vcf/trio_joint.snpeff.vcf' + output_path = sys.argv[2] if len(sys.argv) > 2 else '/Volumes/NV2/genomics_analysis/pharmacogenomics_report.txt' + proband_idx = int(sys.argv[3]) if len(sys.argv) > 3 else 2 + + print("=" * 60) + print("PHARMACOGENOMICS ANALYSIS") + print("=" * 60) + print(f"VCF: {vcf_path}") + print(f"Searching for {len(PHARMGKB_VARIANTS)} pharmacogenomic variants...\n") + + results, samples = analyze_pharmacogenomics(vcf_path, proband_idx) + + print(f"Found {len(results)} pharmacogenomic variants in VCF") + + critical, actionable = generate_pgx_report(results, output_path) + + # Console summary + print("\n" + "=" * 60) + print("QUICK SUMMARY") + print("=" * 60) + + variants_with_effect = [r for r in results.values() if r['has_variant']] + print(f"\nVariants detected: {len(variants_with_effect)}/{len(results)}") + + if critical: + print("\n⚠️ CRITICAL FINDINGS:") + for c in critical: + print(f" - {c['gene']}: {c['drug']}") + print(f" {c['recommendation']}") + + if actionable: + print("\n📋 ACTIONABLE FINDINGS:") + for a in actionable: + print(f" - {a['gene']} ({a['rsid']}): {a['drug']}") + print(f" Genotype: {a['alleles']}") + print(f" {a['recommendation']}") + + if not critical and not actionable: + print("\n✓ No critical or actionable pharmacogenomic variants detected") + + +if __name__ == '__main__': + main() diff --git a/pharmgkb_full_analysis.py b/pharmgkb_full_analysis.py new file mode 100644 index 0000000..51e723d --- /dev/null +++ b/pharmgkb_full_analysis.py @@ -0,0 +1,349 @@ +#!/usr/bin/env python3 +""" +Comprehensive PharmGKB Analysis Script +Uses full PharmGKB clinical annotations database for pharmacogenomics analysis. +""" + +import gzip +import sys +import os +import re +from collections import defaultdict +from typing import Dict, List, Set, Tuple + +# PharmGKB database paths +PHARMGKB_DIR = "/Volumes/NV2/genomics_reference/pharmgkb" +ANNOTATIONS_FILE = f"{PHARMGKB_DIR}/clinical_annotations.tsv" +ALLELES_FILE = f"{PHARMGKB_DIR}/clinical_ann_alleles.tsv" + + +def load_pharmgkb_annotations() -> Tuple[Dict, Dict]: + """Load PharmGKB clinical annotations and allele information""" + + # Load main annotations + annotations = {} + print(f"Loading PharmGKB annotations from {ANNOTATIONS_FILE}...") + + with open(ANNOTATIONS_FILE, 'r') as f: + header = f.readline().strip().split('\t') + for line in f: + parts = line.strip().split('\t') + if len(parts) < 11: + continue + + ann_id = parts[0] + variant = parts[1] # rsid or haplotype + gene = parts[2] + evidence_level = parts[3] + phenotype_category = parts[7] if len(parts) > 7 else "" + drugs = parts[10] if len(parts) > 10 else "" + phenotypes = parts[11] if len(parts) > 11 else "" + + # Only process rs variants (SNPs) + if variant.startswith('rs'): + rsid = variant + if rsid not in annotations: + annotations[rsid] = [] + annotations[rsid].append({ + 'ann_id': ann_id, + 'gene': gene, + 'evidence_level': evidence_level, + 'phenotype_category': phenotype_category, + 'drugs': drugs, + 'phenotypes': phenotypes + }) + + # Load allele-specific information + allele_info = {} + print(f"Loading allele information from {ALLELES_FILE}...") + + with open(ALLELES_FILE, 'r') as f: + header = f.readline().strip().split('\t') + for line in f: + parts = line.strip().split('\t') + if len(parts) < 3: + continue + + ann_id = parts[0] + genotype = parts[1] + annotation_text = parts[2] if len(parts) > 2 else "" + allele_function = parts[3] if len(parts) > 3 else "" + + if ann_id not in allele_info: + allele_info[ann_id] = {} + allele_info[ann_id][genotype] = { + 'text': annotation_text, + 'function': allele_function + } + + print(f"Loaded {len(annotations)} unique variants with annotations") + return annotations, allele_info + + +def get_genotype_class(gt: str) -> str: + """Classify genotype""" + if gt in ['./.', '.|.', '.']: + return 'MISSING' + + alleles = re.split('[/|]', gt) + if all(a == '0' for a in alleles): + return 'HOM_REF' + elif all(a != '0' and a != '.' for a in alleles): + return 'HOM_ALT' + else: + return 'HET' + + +def get_genotype_string(gt: str, ref: str, alt: str) -> str: + """Convert numeric genotype to allele string""" + if gt in ['./.', '.|.', '.']: + return 'N/A' + + alleles = [ref] + alt.split(',') + gt_alleles = re.split('[/|]', gt) + + result = [] + for a in gt_alleles: + if a.isdigit(): + idx = int(a) + if idx < len(alleles): + result.append(alleles[idx]) + else: + result.append('?') + else: + result.append('?') + + return '/'.join(result) + + +def parse_vcf_for_pharmgkb(vcf_path: str, sample_idx: int, annotations: Dict) -> Dict: + """Parse VCF and look for PharmGKB variants""" + + print(f"Scanning VCF for {len(annotations)} PharmGKB variants...") + + found_variants = {} + samples = [] + + # Build rsid lookup from VCF + open_func = gzip.open if vcf_path.endswith('.gz') else open + mode = 'rt' if vcf_path.endswith('.gz') else 'r' + + with open_func(vcf_path, mode) as f: + for line in f: + if line.startswith('##'): + continue + elif line.startswith('#CHROM'): + parts = line.strip().split('\t') + samples = parts[9:] + print(f"Found {len(samples)} samples, analyzing index {sample_idx}: {samples[sample_idx] if sample_idx < len(samples) else 'N/A'}") + continue + + parts = line.strip().split('\t') + if len(parts) < 10: + continue + + chrom, pos, rsid_vcf, ref, alt, qual, filt, info, fmt = parts[:9] + gt_fields = parts[9:] + + # Check if this rsid has PharmGKB annotation + if rsid_vcf not in annotations: + continue + + # Get sample genotype + fmt_parts = fmt.split(':') + gt_idx = fmt_parts.index('GT') if 'GT' in fmt_parts else 0 + + if sample_idx < len(gt_fields): + gt_data = gt_fields[sample_idx].split(':') + gt = gt_data[gt_idx] if gt_idx < len(gt_data) else './.' + else: + gt = './.' + + gt_class = get_genotype_class(gt) + gt_string = get_genotype_string(gt, ref, alt) + + found_variants[rsid_vcf] = { + 'rsid': rsid_vcf, + 'chrom': chrom, + 'pos': pos, + 'ref': ref, + 'alt': alt, + 'genotype': gt, + 'genotype_class': gt_class, + 'genotype_string': gt_string, + 'annotations': annotations[rsid_vcf] + } + + return found_variants, samples + + +def generate_comprehensive_report(found_variants: Dict, allele_info: Dict, + output_path: str, sample_name: str): + """Generate comprehensive pharmacogenomics report""" + + # Categorize by evidence level and drug class + by_evidence = defaultdict(list) + by_category = defaultdict(list) + + for rsid, var in found_variants.items(): + for ann in var['annotations']: + level = ann['evidence_level'] + category = ann['phenotype_category'] + by_evidence[level].append((rsid, var, ann)) + if category: + by_category[category].append((rsid, var, ann)) + + with open(output_path, 'w') as f: + f.write("=" * 80 + "\n") + f.write("COMPREHENSIVE PHARMACOGENOMICS REPORT\n") + f.write("Based on PharmGKB Clinical Annotations Database\n") + f.write("=" * 80 + "\n\n") + f.write(f"Sample: {sample_name}\n") + f.write(f"Total variants with PharmGKB annotations: {len(found_variants)}\n\n") + + # Summary statistics + f.write("=" * 80 + "\n") + f.write("SUMMARY BY EVIDENCE LEVEL\n") + f.write("=" * 80 + "\n\n") + f.write("Level 1A: Annotation based on CPIC or DPWG guideline\n") + f.write("Level 1B: Annotation based on FDA or EMA label\n") + f.write("Level 2A: Moderate clinical significance\n") + f.write("Level 2B: Lower clinical significance\n") + f.write("Level 3: Low evidence\n") + f.write("Level 4: In vitro/preclinical evidence only\n\n") + + for level in ['1A', '1B', '2A', '2B', '3', '4']: + count = len(by_evidence.get(level, [])) + f.write(f" Level {level}: {count} annotations\n") + + # High evidence findings (1A, 1B) + f.write("\n" + "=" * 80 + "\n") + f.write("HIGH EVIDENCE FINDINGS (Level 1A/1B - CPIC/DPWG Guidelines & FDA Labels)\n") + f.write("=" * 80 + "\n\n") + + high_evidence = by_evidence.get('1A', []) + by_evidence.get('1B', []) + if high_evidence: + for rsid, var, ann in sorted(high_evidence, key=lambda x: x[2]['gene']): + gt_string = var['genotype_string'] + f.write(f"GENE: {ann['gene']} ({rsid})\n") + f.write(f" Genotype: {gt_string} ({var['genotype_class']})\n") + f.write(f" Drug(s): {ann['drugs']}\n") + f.write(f" Category: {ann['phenotype_category']}\n") + f.write(f" Evidence Level: {ann['evidence_level']}\n") + + # Get allele-specific annotation + ann_id = ann['ann_id'] + if ann_id in allele_info: + # Try to match genotype + for geno, info in allele_info[ann_id].items(): + if gt_string.replace('/', '') == geno.replace('/', '') or \ + gt_string == geno or \ + set(gt_string.split('/')) == set(geno): + if info['text']: + f.write(f" Clinical Annotation: {info['text'][:500]}...\n" if len(info['text']) > 500 else f" Clinical Annotation: {info['text']}\n") + if info['function']: + f.write(f" Allele Function: {info['function']}\n") + break + f.write("\n") + else: + f.write(" No high-evidence findings.\n\n") + + # Moderate evidence findings (2A, 2B) + f.write("=" * 80 + "\n") + f.write("MODERATE EVIDENCE FINDINGS (Level 2A/2B)\n") + f.write("=" * 80 + "\n\n") + + moderate_evidence = by_evidence.get('2A', []) + by_evidence.get('2B', []) + if moderate_evidence: + for rsid, var, ann in sorted(moderate_evidence, key=lambda x: x[2]['gene'])[:50]: # Limit to top 50 + gt_string = var['genotype_string'] + f.write(f"GENE: {ann['gene']} ({rsid})\n") + f.write(f" Genotype: {gt_string}\n") + f.write(f" Drug(s): {ann['drugs']}\n") + f.write(f" Category: {ann['phenotype_category']}\n") + f.write(f" Level: {ann['evidence_level']}\n\n") + + if len(moderate_evidence) > 50: + f.write(f" ... and {len(moderate_evidence) - 50} more moderate evidence findings\n\n") + else: + f.write(" No moderate-evidence findings.\n\n") + + # Summary by phenotype category + f.write("=" * 80 + "\n") + f.write("SUMMARY BY PHENOTYPE CATEGORY\n") + f.write("=" * 80 + "\n\n") + + for category in sorted(by_category.keys()): + items = by_category[category] + f.write(f"\n## {category}: {len(items)} annotations\n") + f.write("-" * 40 + "\n") + + # Show high-evidence items for each category + high_in_cat = [x for x in items if x[2]['evidence_level'] in ['1A', '1B', '2A']] + for rsid, var, ann in high_in_cat[:5]: + f.write(f" {ann['gene']} ({rsid}): {ann['drugs'][:50]}...\n" if len(ann['drugs']) > 50 else f" {ann['gene']} ({rsid}): {ann['drugs']}\n") + + # Full detailed list + f.write("\n" + "=" * 80 + "\n") + f.write("COMPLETE VARIANT LIST\n") + f.write("=" * 80 + "\n\n") + + f.write("RSID\tGENE\tGENOTYPE\tLEVEL\tCATEGORY\tDRUGS\n") + for rsid, var in sorted(found_variants.items()): + for ann in var['annotations']: + drugs_short = ann['drugs'][:30] + "..." if len(ann['drugs']) > 30 else ann['drugs'] + f.write(f"{rsid}\t{ann['gene']}\t{var['genotype_string']}\t{ann['evidence_level']}\t{ann['phenotype_category']}\t{drugs_short}\n") + + print(f"Report saved to: {output_path}") + + +def main(): + vcf_path = sys.argv[1] if len(sys.argv) > 1 else '/Volumes/NV2/genomics_analysis/vcf/trio_joint.snpeff.vcf' + output_path = sys.argv[2] if len(sys.argv) > 2 else '/Volumes/NV2/genomics_analysis/pharmgkb_full_report.txt' + sample_idx = int(sys.argv[3]) if len(sys.argv) > 3 else 2 + + print("=" * 60) + print("COMPREHENSIVE PHARMGKB ANALYSIS") + print("=" * 60) + print(f"VCF: {vcf_path}") + print(f"Sample index: {sample_idx}") + print() + + # Load PharmGKB database + annotations, allele_info = load_pharmgkb_annotations() + + # Parse VCF + found_variants, samples = parse_vcf_for_pharmgkb(vcf_path, sample_idx, annotations) + + sample_name = samples[sample_idx] if sample_idx < len(samples) else f"Sample_{sample_idx}" + print(f"\nFound {len(found_variants)} variants with PharmGKB annotations") + + # Count by evidence level + level_counts = defaultdict(int) + for rsid, var in found_variants.items(): + for ann in var['annotations']: + level_counts[ann['evidence_level']] += 1 + + print("\nAnnotations by evidence level:") + for level in ['1A', '1B', '2A', '2B', '3', '4']: + print(f" Level {level}: {level_counts.get(level, 0)}") + + # Generate report + generate_comprehensive_report(found_variants, allele_info, output_path, sample_name) + + # Print high-evidence findings to console + print("\n" + "=" * 60) + print("HIGH EVIDENCE FINDINGS (Level 1A/1B)") + print("=" * 60) + + for rsid, var in found_variants.items(): + for ann in var['annotations']: + if ann['evidence_level'] in ['1A', '1B']: + print(f"\n{ann['gene']} ({rsid})") + print(f" Genotype: {var['genotype_string']}") + print(f" Drug(s): {ann['drugs'][:80]}...") + print(f" Level: {ann['evidence_level']}") + + +if __name__ == '__main__': + main() diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index bc6626c..0000000 --- a/pyproject.toml +++ /dev/null @@ -1,31 +0,0 @@ -[build-system] -requires = ["setuptools>=61", "wheel"] -build-backend = "setuptools.build_meta" - -[project] -name = "genomic-consultant" -version = "0.1.0" -description = "Personal genomic risk and drug–interaction decision support scaffolding" -readme = "README.md" -requires-python = ">=3.11" -dependencies = [ - "pyyaml>=6", -] - -[project.scripts] -genomic-consultant = "genomic_consultant.cli:main" - -[project.optional-dependencies] -dev = [ - "pytest", - "ruff", -] -store = [ - "pandas>=2", -] - -[tool.setuptools] -package-dir = {"" = "src"} - -[tool.setuptools.packages.find] -where = ["src"] diff --git a/sample_data/example_annotated.tsv b/sample_data/example_annotated.tsv deleted file mode 100644 index 949c9e4..0000000 --- a/sample_data/example_annotated.tsv +++ /dev/null @@ -1,3 +0,0 @@ -#CHROM POS REF ALT SYMBOL Consequence Protein_position PolyPhen SIFT CLIN_SIG AF gnomAD_AF SpliceAI CADD_PHRED -1 123456 A T GJB2 missense_variant p.Val37Ile benign tolerated Benign 0.012 0.012 0.02 5.1 -2 234567 G C OTOF stop_gained p.* probably_damaging deleterious Uncertain_significance 0.0001 0.0001 0.6 28.5 diff --git a/src/genomic_consultant.egg-info/PKG-INFO b/src/genomic_consultant.egg-info/PKG-INFO deleted file mode 100644 index a40d06f..0000000 --- a/src/genomic_consultant.egg-info/PKG-INFO +++ /dev/null @@ -1,63 +0,0 @@ -Metadata-Version: 2.4 -Name: genomic-consultant -Version: 0.1.0 -Summary: Personal genomic risk and drug–interaction decision support scaffolding -Requires-Python: >=3.11 -Description-Content-Type: text/markdown -Requires-Dist: pyyaml>=6 -Provides-Extra: dev -Requires-Dist: pytest; extra == "dev" -Requires-Dist: ruff; extra == "dev" - -# Genomic Consultant - -Early design for a personal genomic risk and drug–interaction decision support system. Specs are sourced from `genomic_decision_support_system_spec_v0.1.md`. - -## Vision (per spec) -- Phase 1: trio variant calling, annotation, queryable genomic DB, initial ACMG evidence tagging. -- Phase 2: pharmacogenomics genotype-to-phenotype mapping plus drug–drug interaction checks. -- Phase 3: supplement/herb normalization and interaction risk layering. -- Phase 4: LLM-driven query orchestration and report generation. - -## Repository Layout -- `docs/` — system architecture notes, phase plans, data models (work in progress). -- `configs/` — example ACMG config and gene panel JSON. -- `sample_data/` — tiny annotated TSV for demo. -- `src/genomic_consultant/` — Python scaffolding (pipelines, store, panel lookup, ACMG tagging, reporting). -- `genomic_decision_support_system_spec_v0.1.md` — original requirements draft. - -## Contributing/next steps -1. Finalize Phase 1 tech selection (variant caller, annotation stack, reference/DB versions). -2. Stand up the Phase 1 pipelines and minimal query API surface. -3. Add ACMG evidence tagging config and human-review logging. -4. Layer in PGx/DDI and supplement modules per later phases. - -Data safety: keep genomic/clinical data local; the `.gitignore` blocks common genomic outputs by default. - -## Quickstart (CLI scaffolding) -``` -pip install -e . - -# 1) Show trio calling plan (commands only; not executed) -genomic-consultant plan-call \ - --sample proband:/data/proband.bam \ - --sample father:/data/father.bam \ - --sample mother:/data/mother.bam \ - --reference /refs/GRCh38.fa \ - --workdir /tmp/trio - -# 2) Show annotation plan for a joint VCF -genomic-consultant plan-annotate \ - --vcf /tmp/trio/trio.joint.vcf.gz \ - --workdir /tmp/trio/annot \ - --prefix trio \ - --reference /refs/GRCh38.fa - -# 3) Demo panel report using sample data -genomic-consultant panel-report \ - --tsv sample_data/example_annotated.tsv \ - --panel configs/panel.example.json \ - --acmg-config configs/acmg_config.example.yaml \ - --individual-id demo \ - --format markdown -``` diff --git a/src/genomic_consultant.egg-info/SOURCES.txt b/src/genomic_consultant.egg-info/SOURCES.txt deleted file mode 100644 index bc5a276..0000000 --- a/src/genomic_consultant.egg-info/SOURCES.txt +++ /dev/null @@ -1,27 +0,0 @@ -README.md -pyproject.toml -src/genomic_consultant/__init__.py -src/genomic_consultant/cli.py -src/genomic_consultant.egg-info/PKG-INFO -src/genomic_consultant.egg-info/SOURCES.txt -src/genomic_consultant.egg-info/dependency_links.txt -src/genomic_consultant.egg-info/entry_points.txt -src/genomic_consultant.egg-info/requires.txt -src/genomic_consultant.egg-info/top_level.txt -src/genomic_consultant/acmg/__init__.py -src/genomic_consultant/acmg/tagger.py -src/genomic_consultant/audit/__init__.py -src/genomic_consultant/audit/run_log.py -src/genomic_consultant/orchestration/__init__.py -src/genomic_consultant/orchestration/workflows.py -src/genomic_consultant/panels/__init__.py -src/genomic_consultant/panels/panels.py -src/genomic_consultant/pipelines/__init__.py -src/genomic_consultant/pipelines/annotation.py -src/genomic_consultant/pipelines/variant_calling.py -src/genomic_consultant/reporting/__init__.py -src/genomic_consultant/reporting/report.py -src/genomic_consultant/store/__init__.py -src/genomic_consultant/store/query.py -src/genomic_consultant/utils/__init__.py -src/genomic_consultant/utils/models.py \ No newline at end of file diff --git a/src/genomic_consultant.egg-info/dependency_links.txt b/src/genomic_consultant.egg-info/dependency_links.txt deleted file mode 100644 index 8b13789..0000000 --- a/src/genomic_consultant.egg-info/dependency_links.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git a/src/genomic_consultant.egg-info/entry_points.txt b/src/genomic_consultant.egg-info/entry_points.txt deleted file mode 100644 index 3e063e0..0000000 --- a/src/genomic_consultant.egg-info/entry_points.txt +++ /dev/null @@ -1,2 +0,0 @@ -[console_scripts] -genomic-consultant = genomic_consultant.cli:main diff --git a/src/genomic_consultant.egg-info/requires.txt b/src/genomic_consultant.egg-info/requires.txt deleted file mode 100644 index 47e6320..0000000 --- a/src/genomic_consultant.egg-info/requires.txt +++ /dev/null @@ -1,5 +0,0 @@ -pyyaml>=6 - -[dev] -pytest -ruff diff --git a/src/genomic_consultant.egg-info/top_level.txt b/src/genomic_consultant.egg-info/top_level.txt deleted file mode 100644 index bf22515..0000000 --- a/src/genomic_consultant.egg-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -genomic_consultant diff --git a/src/genomic_consultant/__init__.py b/src/genomic_consultant/__init__.py deleted file mode 100644 index 7b5aebb..0000000 --- a/src/genomic_consultant/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -"""Genomic Consultant: genomic decision support scaffolding.""" - -__all__ = ["__version__"] -__version__ = "0.1.0" diff --git a/src/genomic_consultant/acmg/__init__.py b/src/genomic_consultant/acmg/__init__.py deleted file mode 100644 index 8b13789..0000000 --- a/src/genomic_consultant/acmg/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/src/genomic_consultant/acmg/tagger.py b/src/genomic_consultant/acmg/tagger.py deleted file mode 100644 index bcb7ff5..0000000 --- a/src/genomic_consultant/acmg/tagger.py +++ /dev/null @@ -1,91 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass -from pathlib import Path -from typing import List, Set - -import yaml - -from genomic_consultant.utils.models import EvidenceTag, SuggestedClassification, Variant - - -@dataclass -class ACMGConfig: - ba1_af: float = 0.05 - bs1_af: float = 0.01 - pm2_af: float = 0.0005 - lof_genes: Set[str] | None = None - bp7_splice_ai_max: float = 0.1 - - -def load_acmg_config(path: Path) -> ACMGConfig: - data = yaml.safe_load(Path(path).read_text()) - return ACMGConfig( - ba1_af=data.get("ba1_af", 0.05), - bs1_af=data.get("bs1_af", 0.01), - pm2_af=data.get("pm2_af", 0.0005), - lof_genes=set(data.get("lof_genes", [])) if data.get("lof_genes") else set(), - bp7_splice_ai_max=data.get("bp7_splice_ai_max", 0.1), - ) - - -def tag_variant(variant: Variant, config: ACMGConfig) -> SuggestedClassification: - evidence: List[EvidenceTag] = [] - af = variant.allele_frequency - - if af is not None: - if af >= config.ba1_af: - evidence.append(EvidenceTag(tag="BA1", strength="Stand-alone", rationale=f"AF {af} >= {config.ba1_af}")) - elif af >= config.bs1_af: - evidence.append(EvidenceTag(tag="BS1", strength="Strong", rationale=f"AF {af} >= {config.bs1_af}")) - elif af <= config.pm2_af: - evidence.append(EvidenceTag(tag="PM2", strength="Moderate", rationale=f"AF {af} <= {config.pm2_af}")) - - if _is_lof(variant) and variant.gene and variant.gene in config.lof_genes: - evidence.append( - EvidenceTag(tag="PVS1", strength="Very strong", rationale="Predicted LoF in LoF-sensitive gene") - ) - - splice_ai = _get_float(variant.annotations.get("splice_ai_delta_score")) - if _is_synonymous(variant) and (splice_ai is None or splice_ai <= config.bp7_splice_ai_max): - evidence.append( - EvidenceTag( - tag="BP7", - strength="Supporting", - rationale=f"Synonymous with low predicted splice impact (spliceAI {splice_ai})", - ) - ) - - suggested = _suggest_class(evidence) - return SuggestedClassification(suggested_class=suggested, evidence=evidence) - - -def _is_lof(variant: Variant) -> bool: - consequence = (variant.consequence or "").lower() - lof_keywords = ["frameshift", "stop_gained", "splice_acceptor", "splice_donor", "start_lost"] - return any(k in consequence for k in lof_keywords) - - -def _suggest_class(evidence: List[EvidenceTag]) -> str: - tags = {e.tag for e in evidence} - if "BA1" in tags: - return "Benign" - if "BS1" in tags and "PM2" not in tags and "PVS1" not in tags: - return "Likely benign" - if "PVS1" in tags and "PM2" in tags: - return "Likely pathogenic" - return "VUS" - - -def _is_synonymous(variant: Variant) -> bool: - consequence = (variant.consequence or "").lower() - return "synonymous_variant" in consequence - - -def _get_float(value: str | float | None) -> float | None: - if value is None: - return None - try: - return float(value) - except (TypeError, ValueError): - return None diff --git a/src/genomic_consultant/audit/__init__.py b/src/genomic_consultant/audit/__init__.py deleted file mode 100644 index 8b13789..0000000 --- a/src/genomic_consultant/audit/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/src/genomic_consultant/audit/run_log.py b/src/genomic_consultant/audit/run_log.py deleted file mode 100644 index fabbaa7..0000000 --- a/src/genomic_consultant/audit/run_log.py +++ /dev/null @@ -1,30 +0,0 @@ -from __future__ import annotations - -import json -from dataclasses import asdict, is_dataclass -from datetime import datetime -from pathlib import Path -from typing import Any - -from genomic_consultant.utils.models import RunLog - - -def _serialize(obj: Any) -> Any: - if isinstance(obj, datetime): - return obj.isoformat() - if is_dataclass(obj): - return {k: _serialize(v) for k, v in asdict(obj).items()} - if isinstance(obj, dict): - return {k: _serialize(v) for k, v in obj.items()} - if isinstance(obj, list): - return [_serialize(v) for v in obj] - return obj - - -def write_run_log(run_log: RunLog, path: str | Path) -> Path: - """Persist a RunLog to JSON.""" - path = Path(path) - path.parent.mkdir(parents=True, exist_ok=True) - payload = _serialize(run_log) - path.write_text(json.dumps(payload, indent=2)) - return path diff --git a/src/genomic_consultant/cli.py b/src/genomic_consultant/cli.py deleted file mode 100644 index 2963e89..0000000 --- a/src/genomic_consultant/cli.py +++ /dev/null @@ -1,273 +0,0 @@ -from __future__ import annotations - -import argparse -import sys -from datetime import datetime -from pathlib import Path -from typing import Dict, List - -from genomic_consultant.acmg.tagger import ACMGConfig, load_acmg_config -from genomic_consultant.orchestration.workflows import run_panel_variant_review -from genomic_consultant.panels.aggregate import merge_mappings -from genomic_consultant.panels.panels import load_panel -from genomic_consultant.panels.resolver import PhenotypeGeneResolver -from genomic_consultant.pipelines.annotation import build_vep_plan -from genomic_consultant.pipelines.variant_calling import build_gatk_trio_plan -from genomic_consultant.pipelines.runner import execute_plan -from genomic_consultant.reporting.report import panel_report_json, panel_report_markdown -from genomic_consultant.orchestration.phase1_pipeline import run_phase1_pipeline -from genomic_consultant.store.query import GenomicStore -from genomic_consultant.utils.hashing import sha256sum -from genomic_consultant.utils.models import FilterConfig -from genomic_consultant.utils.tooling import probe_tool_versions - - -def parse_samples(sample_args: List[str]) -> Dict[str, Path]: - samples: Dict[str, Path] = {} - for arg in sample_args: - if ":" not in arg: - raise ValueError(f"Sample must be sample_id:/path/to.bam, got {arg}") - sample_id, bam_path = arg.split(":", 1) - samples[sample_id] = Path(bam_path) - return samples - - -def main(argv: List[str] | None = None) -> int: - parser = argparse.ArgumentParser(prog="genomic-consultant", description="Genomic decision support scaffolding") - sub = parser.add_subparsers(dest="command", required=True) - - # Variant calling plan - call = sub.add_parser("plan-call", help="Build variant calling command plan (GATK trio).") - call.add_argument("--sample", action="append", required=True, help="sample_id:/path/to.bam (repeatable)") - call.add_argument("--reference", required=True, help="Path to reference FASTA") - call.add_argument("--workdir", required=True, help="Working directory for outputs") - call.add_argument("--prefix", default="trio", help="Output prefix for joint VCF") - - run_call = sub.add_parser("run-call", help="Execute variant calling plan (GATK trio).") - run_call.add_argument("--sample", action="append", required=True, help="sample_id:/path/to.bam (repeatable)") - run_call.add_argument("--reference", required=True, help="Path to reference FASTA") - run_call.add_argument("--workdir", required=True, help="Working directory for outputs") - run_call.add_argument("--prefix", default="trio", help="Output prefix for joint VCF") - run_call.add_argument("--log", required=False, help="Path to write run log JSON") - run_call.add_argument("--probe-tools", action="store_true", help="Attempt to record tool versions") - - # Annotation plan - ann = sub.add_parser("plan-annotate", help="Build annotation command plan (VEP).") - ann.add_argument("--vcf", required=True, help="Path to joint VCF") - ann.add_argument("--workdir", required=True, help="Working directory for annotated outputs") - ann.add_argument("--prefix", default="annotated", help="Output prefix") - ann.add_argument("--reference", required=False, help="Reference FASTA (optional)") - ann.add_argument("--plugin", action="append", help="VEP plugin spec, repeatable", default=[]) - - run_ann = sub.add_parser("run-annotate", help="Execute annotation plan (VEP).") - run_ann.add_argument("--vcf", required=True, help="Path to joint VCF") - run_ann.add_argument("--workdir", required=True, help="Working directory for annotated outputs") - run_ann.add_argument("--prefix", default="annotated", help="Output prefix") - run_ann.add_argument("--reference", required=False, help="Reference FASTA (optional)") - run_ann.add_argument("--plugin", action="append", help="VEP plugin spec, repeatable", default=[]) - run_ann.add_argument("--extra-flag", action="append", help="Extra flags appended to VEP command", default=[]) - run_ann.add_argument("--log", required=False, help="Path to write run log JSON") - run_ann.add_argument("--probe-tools", action="store_true", help="Attempt to record tool versions") - - # Panel report - panel = sub.add_parser("panel-report", help="Run panel query + ACMG tagging and emit report.") - panel.add_argument("--tsv", required=True, help="Flattened annotated TSV") - panel.add_argument("--panel", required=False, help="Panel JSON file") - panel.add_argument("--phenotype-id", required=False, help="Phenotype/HPO ID to resolve to a panel") - panel.add_argument("--phenotype-mapping", required=False, help="Phenotype→genes mapping JSON") - panel.add_argument("--acmg-config", required=True, help="ACMG config YAML") - panel.add_argument("--individual-id", required=True, help="Individual identifier") - panel.add_argument("--max-af", type=float, default=None, help="Max allele frequency filter") - panel.add_argument("--format", choices=["markdown", "json"], default="markdown", help="Output format") - panel.add_argument("--log", required=False, help="Path to write run log JSON for the analysis") - panel.add_argument("--phenotype-panel", required=False, help="Phenotype→genes mapping JSON (optional)") - - phase1 = sub.add_parser("phase1-run", help="End-to-end Phase 1 pipeline (call→annotate→panel).") - phase1.add_argument("--sample", action="append", help="sample_id:/path/to.bam (repeatable)") - phase1.add_argument("--reference", required=False, help="Path to reference FASTA") - phase1.add_argument("--workdir", required=True, help="Working directory for outputs") - phase1.add_argument("--prefix", default="trio", help="Output prefix") - phase1.add_argument("--plugins", action="append", default=[], help="VEP plugin specs") - phase1.add_argument("--extra-flag", action="append", default=[], help="Extra flags for VEP") - phase1.add_argument("--joint-vcf", required=False, help="Existing joint VCF (skip calling)") - phase1.add_argument("--tsv", required=False, help="Existing annotated TSV (skip annotation)") - phase1.add_argument("--skip-call", action="store_true", help="Skip variant calling step") - phase1.add_argument("--skip-annotate", action="store_true", help="Skip annotation step") - phase1.add_argument("--panel", required=False, help="Panel JSON file") - phase1.add_argument("--phenotype-id", required=False, help="Phenotype/HPO ID to resolve to a panel") - phase1.add_argument("--phenotype-mapping", required=False, help="Phenotype→genes mapping JSON") - phase1.add_argument("--acmg-config", required=True, help="ACMG config YAML") - phase1.add_argument("--max-af", type=float, default=None, help="Max allele frequency filter") - phase1.add_argument("--format", choices=["markdown", "json"], default="markdown", help="Report format") - phase1.add_argument("--log-dir", required=False, help="Directory to write run logs") - - build_map = sub.add_parser("build-phenotype-mapping", help="Merge phenotype→gene mapping JSON files.") - build_map.add_argument("--output", required=True, help="Output JSON path") - build_map.add_argument("inputs", nargs="+", help="Input mapping JSON files") - - args = parser.parse_args(argv) - - if args.command == "plan-call": - samples = parse_samples(args.sample) - plan = build_gatk_trio_plan( - samples=samples, - reference_fasta=Path(args.reference), - workdir=Path(args.workdir), - output_prefix=args.prefix, - ) - print("# Variant calling command plan") - for cmd in plan.commands: - print(cmd) - return 0 - - if args.command == "run-call": - samples = parse_samples(args.sample) - plan = build_gatk_trio_plan( - samples=samples, - reference_fasta=Path(args.reference), - workdir=Path(args.workdir), - output_prefix=args.prefix, - ) - tool_versions = probe_tool_versions({"gatk": "gatk"}) if args.probe_tools else {} - run_log = execute_plan( - plan, automation_level="Auto", log_path=Path(args.log) if args.log else None - ) - run_log.tool_versions.update(tool_versions) - if args.log: - from genomic_consultant.audit.run_log import write_run_log - - write_run_log(run_log, Path(args.log)) - print(f"Run finished with {len(run_log.outputs.get('command_results', []))} steps. Log ID: {run_log.run_id}") - if args.log: - print(f"Run log written to {args.log}") - return 0 - - if args.command == "plan-annotate": - plan = build_vep_plan( - vcf_path=Path(args.vcf), - workdir=Path(args.workdir), - reference_fasta=Path(args.reference) if args.reference else None, - output_prefix=args.prefix, - plugins=args.plugin, - ) - print("# Annotation command plan") - for cmd in plan.commands: - print(cmd) - return 0 - - if args.command == "run-annotate": - plan = build_vep_plan( - vcf_path=Path(args.vcf), - workdir=Path(args.workdir), - reference_fasta=Path(args.reference) if args.reference else None, - output_prefix=args.prefix, - plugins=args.plugin, - extra_flags=args.extra_flag, - ) - tool_versions = probe_tool_versions({"vep": "vep", "bcftools": "bcftools", "tabix": "tabix"}) if args.probe_tools else {} - run_log = execute_plan( - plan, automation_level="Auto", log_path=Path(args.log) if args.log else None - ) - run_log.tool_versions.update(tool_versions) - if args.log: - from genomic_consultant.audit.run_log import write_run_log - - write_run_log(run_log, Path(args.log)) - print(f"Run finished with {len(run_log.outputs.get('command_results', []))} steps. Log ID: {run_log.run_id}") - if args.log: - print(f"Run log written to {args.log}") - return 0 - - if args.command == "panel-report": - if not args.panel and not (args.phenotype_id and args.phenotype_mapping): - raise SystemExit("Provide either --panel or (--phenotype-id and --phenotype-mapping).") - - store = GenomicStore.from_tsv(Path(args.tsv)) - if args.panel: - panel_obj = load_panel(Path(args.panel)) - panel_config_hash = sha256sum(Path(args.panel)) - else: - resolver = PhenotypeGeneResolver.from_json(Path(args.phenotype_mapping)) - panel_obj = resolver.build_panel(args.phenotype_id) - if panel_obj is None: - raise SystemExit(f"No genes found for phenotype {args.phenotype_id}") - panel_config_hash = sha256sum(Path(args.phenotype_mapping)) - - acmg_config = load_acmg_config(Path(args.acmg_config)) - filters = FilterConfig(max_af=args.max_af) - result = run_panel_variant_review( - individual_id=args.individual_id, - panel=panel_obj, - store=store, - acmg_config=acmg_config, - filters=filters, - ) - output = panel_report_json(result) if args.format == "json" else panel_report_markdown(result) - print(output) - - if args.log: - from genomic_consultant.audit.run_log import write_run_log - from genomic_consultant.utils.models import RunLog - run_log = RunLog( - run_id=f"panel-{args.individual_id}", - started_at=datetime.utcnow(), - inputs={ - "tsv": str(args.tsv), - "panel": str(args.panel) if args.panel else f"phenotype:{args.phenotype_id}", - "acmg_config": str(args.acmg_config), - }, - parameters={ - "max_af": args.max_af, - "format": args.format, - "phenotype_id": args.phenotype_id, - }, - tool_versions={}, - database_versions={}, - config_hashes={ - "panel": panel_config_hash, - "acmg_config": sha256sum(Path(args.acmg_config)), - }, - automation_levels={"panel_report": "Auto+Review"}, - overrides=[], - outputs={"report": output}, - notes=None, - ) - write_run_log(run_log, Path(args.log)) - print(f"Analysis log written to {args.log}") - return 0 - - if args.command == "build-phenotype-mapping": - merge_mappings(inputs=[Path(p) for p in args.inputs], output=Path(args.output)) - print(f"Merged mapping written to {args.output}") - return 0 - - if args.command == "phase1-run": - samples = parse_samples(args.sample) if args.sample else None - log_dir = Path(args.log_dir) if args.log_dir else None - artifacts = run_phase1_pipeline( - samples=samples, - reference_fasta=Path(args.reference) if args.reference else None, - workdir=Path(args.workdir), - output_prefix=args.prefix, - acmg_config_path=Path(args.acmg_config), - max_af=args.max_af, - panel_path=Path(args.panel) if args.panel else None, - phenotype_id=args.phenotype_id, - phenotype_mapping=Path(args.phenotype_mapping) if args.phenotype_mapping else None, - report_format=args.format, - plugins=args.plugins, - extra_flags=args.extra_flag, - existing_joint_vcf=Path(args.joint_vcf) if args.joint_vcf else None, - existing_tsv=Path(args.tsv) if args.tsv else None, - skip_call=args.skip_call, - skip_annotate=args.skip_annotate, - log_dir=log_dir, - ) - print(artifacts.panel_report) - return 0 - - return 1 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/src/genomic_consultant/orchestration/__init__.py b/src/genomic_consultant/orchestration/__init__.py deleted file mode 100644 index 8b13789..0000000 --- a/src/genomic_consultant/orchestration/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/src/genomic_consultant/orchestration/phase1_pipeline.py b/src/genomic_consultant/orchestration/phase1_pipeline.py deleted file mode 100644 index e4b754c..0000000 --- a/src/genomic_consultant/orchestration/phase1_pipeline.py +++ /dev/null @@ -1,146 +0,0 @@ -from __future__ import annotations - -import uuid -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List, Optional - -from datetime import datetime, timezone - -from genomic_consultant.acmg.tagger import ACMGConfig, load_acmg_config -from genomic_consultant.audit.run_log import write_run_log -from genomic_consultant.orchestration.workflows import run_panel_variant_review -from genomic_consultant.panels.panels import load_panel -from genomic_consultant.panels.resolver import PhenotypeGeneResolver -from genomic_consultant.pipelines.annotation import AnnotationPlan, build_vep_plan -from genomic_consultant.pipelines.runner import execute_plan -from genomic_consultant.pipelines.variant_calling import VariantCallingPlan, build_gatk_trio_plan -from genomic_consultant.reporting.report import panel_report_json, panel_report_markdown -from genomic_consultant.store.query import GenomicStore -from genomic_consultant.utils.hashing import sha256sum -from genomic_consultant.utils.models import FilterConfig, RunLog - - -@dataclass -class Phase1Artifacts: - call_log: Optional[RunLog] - annotate_log: Optional[RunLog] - panel_report: str - panel_report_format: str - panel_result_log: RunLog - tsv_path: Path - joint_vcf: Optional[Path] - - -def run_phase1_pipeline( - samples: Optional[Dict[str, Path]], - reference_fasta: Optional[Path], - workdir: Path, - output_prefix: str, - acmg_config_path: Path, - max_af: Optional[float], - panel_path: Optional[Path], - phenotype_id: Optional[str], - phenotype_mapping: Optional[Path], - report_format: str = "markdown", - plugins: Optional[List[str]] = None, - extra_flags: Optional[List[str]] = None, - existing_joint_vcf: Optional[Path] = None, - existing_tsv: Optional[Path] = None, - skip_call: bool = False, - skip_annotate: bool = False, - log_dir: Optional[Path] = None, -) -> Phase1Artifacts: - """ - Orchestrate Phase 1: optional call -> annotate -> panel report. - Allows skipping call/annotate when precomputed artifacts are supplied. - """ - log_dir = log_dir or workdir / "runtime" - log_dir.mkdir(parents=True, exist_ok=True) - call_log = None - annotate_log = None - - # Variant calling - joint_vcf: Optional[Path] = existing_joint_vcf - if not skip_call: - if not samples or not reference_fasta: - raise ValueError("samples and reference_fasta are required unless skip_call is True") - call_plan: VariantCallingPlan = build_gatk_trio_plan( - samples=samples, reference_fasta=reference_fasta, workdir=workdir, output_prefix=output_prefix - ) - call_log_path = log_dir / f"{output_prefix}_call_runlog.json" - call_log = execute_plan(call_plan, automation_level="Auto", log_path=call_log_path) - joint_vcf = call_plan.joint_vcf - elif joint_vcf is None: - joint_vcf = existing_joint_vcf - - # Annotation - tsv_path: Optional[Path] = existing_tsv - if not skip_annotate: - if joint_vcf is None: - raise ValueError("joint VCF must be provided (via call step or existing_joint_vcf)") - ann_plan: AnnotationPlan = build_vep_plan( - vcf_path=joint_vcf, - workdir=workdir, - reference_fasta=reference_fasta, - output_prefix=output_prefix, - plugins=plugins or [], - extra_flags=extra_flags or [], - ) - ann_log_path = log_dir / f"{output_prefix}_annotate_runlog.json" - annotate_log = execute_plan(ann_plan, automation_level="Auto", log_path=ann_log_path) - tsv_path = ann_plan.flat_table - - if tsv_path is None: - raise ValueError("No TSV available; provide existing_tsv or run annotation.") - - # Panel selection - if panel_path: - panel_obj = load_panel(panel_path) - panel_hash = sha256sum(panel_path) - elif phenotype_id and phenotype_mapping: - resolver = PhenotypeGeneResolver.from_json(phenotype_mapping) - panel_obj = resolver.build_panel(phenotype_id) - if panel_obj is None: - raise ValueError(f"No genes found for phenotype {phenotype_id}") - panel_hash = sha256sum(phenotype_mapping) - else: - raise ValueError("Provide panel_path or (phenotype_id and phenotype_mapping).") - - acmg_config = load_acmg_config(acmg_config_path) - store = GenomicStore.from_tsv(tsv_path) - filters = FilterConfig(max_af=max_af) - panel_result = run_panel_variant_review( - individual_id=output_prefix, panel=panel_obj, store=store, acmg_config=acmg_config, filters=filters - ) - report = panel_report_markdown(panel_result) if report_format == "markdown" else panel_report_json(panel_result) - - panel_log = RunLog( - run_id=f"phase1-panel-{uuid.uuid4()}", - started_at=datetime.now(timezone.utc), - inputs={ - "tsv": str(tsv_path), - "panel": str(panel_path) if panel_path else f"phenotype:{phenotype_id}", - "acmg_config": str(acmg_config_path), - }, - parameters={"max_af": max_af, "report_format": report_format, "phenotype_id": phenotype_id}, - tool_versions={}, - database_versions={}, - config_hashes={"panel": panel_hash, "acmg_config": sha256sum(acmg_config_path)}, - automation_levels={"panel_report": "Auto+Review"}, - overrides=[], - outputs={"report": report}, - notes=None, - ) - panel_log_path = log_dir / f"{output_prefix}_panel_runlog.json" - write_run_log(panel_log, panel_log_path) - - return Phase1Artifacts( - call_log=call_log, - annotate_log=annotate_log, - panel_report=report, - panel_report_format=report_format, - panel_result_log=panel_log, - tsv_path=tsv_path, - joint_vcf=joint_vcf, - ) diff --git a/src/genomic_consultant/orchestration/workflows.py b/src/genomic_consultant/orchestration/workflows.py deleted file mode 100644 index 4eb19fd..0000000 --- a/src/genomic_consultant/orchestration/workflows.py +++ /dev/null @@ -1,35 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass -from typing import List - -from genomic_consultant.acmg.tagger import ACMGConfig, tag_variant -from genomic_consultant.panels.panels import GenePanel -from genomic_consultant.store.query import GenomicStore -from genomic_consultant.utils.models import FilterConfig, SuggestedClassification, Variant - - -@dataclass -class PanelVariantResult: - variant: Variant - acmg: SuggestedClassification - - -@dataclass -class PanelAnalysisResult: - individual_id: str - panel: GenePanel - variants: List[PanelVariantResult] - - -def run_panel_variant_review( - individual_id: str, - panel: GenePanel, - store: GenomicStore, - acmg_config: ACMGConfig, - filters: FilterConfig | None = None, -) -> PanelAnalysisResult: - """Query variants for a panel and attach ACMG evidence suggestions.""" - variants = store.get_variants_by_gene(individual_id=individual_id, genes=panel.genes, filters=filters) - enriched = [PanelVariantResult(variant=v, acmg=tag_variant(v, acmg_config)) for v in variants] - return PanelAnalysisResult(individual_id=individual_id, panel=panel, variants=enriched) diff --git a/src/genomic_consultant/panels/__init__.py b/src/genomic_consultant/panels/__init__.py deleted file mode 100644 index 8b13789..0000000 --- a/src/genomic_consultant/panels/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/src/genomic_consultant/panels/aggregate.py b/src/genomic_consultant/panels/aggregate.py deleted file mode 100644 index d34c2aa..0000000 --- a/src/genomic_consultant/panels/aggregate.py +++ /dev/null @@ -1,31 +0,0 @@ -from __future__ import annotations - -import json -from pathlib import Path -from typing import Dict, Iterable, List, Set - - -def merge_mappings(inputs: Iterable[Path], output: Path, version: str = "merged", sources: List[str] | None = None) -> Path: - """ - Merge multiple phenotype→gene mapping JSON files into one. - Input schema: {"phenotype_to_genes": {"HP:xxxx": ["GENE1", ...]}, "version": "...", "source": "..."} - """ - merged: Dict[str, Set[str]] = {} - source_list: List[str] = sources or [] - for path in inputs: - data = json.loads(Path(path).read_text()) - phenos = data.get("phenotype_to_genes", {}) - for pid, genes in phenos.items(): - merged.setdefault(pid, set()).update(genes) - src_label = data.get("source") or path.name - source_list.append(src_label) - - out = { - "version": version, - "source": ",".join(source_list), - "phenotype_to_genes": {pid: sorted(list(genes)) for pid, genes in merged.items()}, - "metadata": {"merged_from": [str(p) for p in inputs]}, - } - output.parent.mkdir(parents=True, exist_ok=True) - output.write_text(json.dumps(out, indent=2)) - return output diff --git a/src/genomic_consultant/panels/panels.py b/src/genomic_consultant/panels/panels.py deleted file mode 100644 index 82ef5cc..0000000 --- a/src/genomic_consultant/panels/panels.py +++ /dev/null @@ -1,38 +0,0 @@ -from __future__ import annotations - -import json -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, Optional - -from genomic_consultant.utils.models import GenePanel - - -@dataclass -class PanelRepository: - """Loads curated gene panels stored as JSON files.""" - - panels: Dict[str, GenePanel] - - @classmethod - def from_directory(cls, path: Path) -> "PanelRepository": - panels: Dict[str, GenePanel] = {} - for json_file in Path(path).glob("*.json"): - panel = load_panel(json_file) - panels[panel.name] = panel - return cls(panels=panels) - - def get(self, name: str) -> Optional[GenePanel]: - return self.panels.get(name) - - -def load_panel(path: Path) -> GenePanel: - data = json.loads(Path(path).read_text()) - return GenePanel( - name=data["name"], - genes=data["genes"], - source=data.get("source", "unknown"), - version=data.get("version", "unknown"), - last_updated=data.get("last_updated", ""), - metadata=data.get("metadata", {}), - ) diff --git a/src/genomic_consultant/panels/resolver.py b/src/genomic_consultant/panels/resolver.py deleted file mode 100644 index 9c9d3f2..0000000 --- a/src/genomic_consultant/panels/resolver.py +++ /dev/null @@ -1,42 +0,0 @@ -from __future__ import annotations - -import json -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List, Optional - -from genomic_consultant.utils.models import GenePanel - - -@dataclass -class PhenotypeGeneResolver: - """Resolves phenotype/HPO terms to gene lists using a curated mapping file.""" - - mapping: Dict[str, List[str]] - version: str - source: str - - @classmethod - def from_json(cls, path: Path) -> "PhenotypeGeneResolver": - data = json.loads(Path(path).read_text()) - mapping = data.get("phenotype_to_genes", {}) - version = data.get("version", "unknown") - source = data.get("source", "unknown") - return cls(mapping=mapping, version=version, source=source) - - def resolve(self, phenotype_id: str) -> Optional[List[str]]: - """Return gene list for a phenotype/HPO ID if present.""" - return self.mapping.get(phenotype_id) - - def build_panel(self, phenotype_id: str) -> Optional[GenePanel]: - genes = self.resolve(phenotype_id) - if not genes: - return None - return GenePanel( - name=f"Phenotype:{phenotype_id}", - genes=genes, - source=self.source, - version=self.version, - last_updated="", - metadata={"phenotype_id": phenotype_id}, - ) diff --git a/src/genomic_consultant/pipelines/__init__.py b/src/genomic_consultant/pipelines/__init__.py deleted file mode 100644 index 8b13789..0000000 --- a/src/genomic_consultant/pipelines/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/src/genomic_consultant/pipelines/annotation.py b/src/genomic_consultant/pipelines/annotation.py deleted file mode 100644 index bcd3dd6..0000000 --- a/src/genomic_consultant/pipelines/annotation.py +++ /dev/null @@ -1,72 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass -from pathlib import Path -from typing import List, Sequence - - -@dataclass -class AnnotationPlan: - """Command plan for annotating a VCF with VEP (or similar).""" - - annotated_vcf: Path - flat_table: Path - commands: List[str] - workdir: Path - - -def build_vep_plan( - vcf_path: Path, - workdir: Path, - reference_fasta: Path | None = None, - output_prefix: str = "annotated", - plugins: Sequence[str] | None = None, - extra_flags: Sequence[str] | None = None, -) -> AnnotationPlan: - """ - Build shell commands for running VEP on a VCF. Produces compressed VCF and a flattened TSV. - This is a plan only; execution is left to a runner. - """ - workdir = Path(workdir) - workdir.mkdir(parents=True, exist_ok=True) - - annotated_vcf = workdir / f"{output_prefix}.vep.vcf.gz" - flat_table = workdir / f"{output_prefix}.vep.tsv" - - plugin_arg = "" - if plugins: - plugin_arg = " ".join(f"--plugin {p}" for p in plugins) - - extra_arg = " ".join(extra_flags) if extra_flags else "" - - ref_arg = f"--fasta {reference_fasta}" if reference_fasta else "" - - commands: List[str] = [ - ( - "vep " - f"-i {vcf_path} " - f"-o {annotated_vcf} " - "--vcf --compress_output bgzip " - "--symbol --canonical " - "--af --af_gnomad " - "--polyphen b --sift b " - "--everything " - f"{plugin_arg} " - f"{extra_arg} " - f"{ref_arg}" - ), - f"tabix -p vcf {annotated_vcf}", - ( - "bcftools query " - f"-f '%CHROM\\t%POS\\t%REF\\t%ALT\\t%SYMBOL\\t%Consequence\\t%Protein_position\\t" - f"%PolyPhen\\t%SIFT\\t%CLIN_SIG\\t%AF\\t%gnomAD_AF\\t%SpliceAI\\t%CADD_PHRED\\n' " - f"{annotated_vcf} > {flat_table}" - ), - ] - - return AnnotationPlan( - annotated_vcf=annotated_vcf, - flat_table=flat_table, - commands=commands, - workdir=workdir, - ) diff --git a/src/genomic_consultant/pipelines/runner.py b/src/genomic_consultant/pipelines/runner.py deleted file mode 100644 index ff07806..0000000 --- a/src/genomic_consultant/pipelines/runner.py +++ /dev/null @@ -1,62 +0,0 @@ -from __future__ import annotations - -import subprocess -import uuid -from datetime import datetime -from pathlib import Path -from typing import Any, Dict, List, Protocol - -from genomic_consultant.audit.run_log import write_run_log -from genomic_consultant.utils.models import RunLog - - -class HasCommands(Protocol): - commands: List[str] - workdir: Path - - -def execute_plan(plan: HasCommands, automation_level: str, log_path: Path | None = None) -> RunLog: - """ - Execute shell commands defined in a plan sequentially. Captures stdout/stderr and exit codes. - Suitable for VariantCallingPlan or AnnotationPlan. - """ - run_id = str(uuid.uuid4()) - started_at = datetime.utcnow() - outputs: Dict[str, Any] = {} - # The plan object may expose outputs such as joint_vcf/annotated_vcf we can introspect. - for attr in ("joint_vcf", "per_sample_gvcf", "annotated_vcf", "flat_table"): - if hasattr(plan, attr): - outputs[attr] = getattr(plan, attr) - - results: List[Dict[str, Any]] = [] - for idx, cmd in enumerate(plan.commands): - proc = subprocess.run(cmd, shell=True, cwd=plan.workdir, capture_output=True, text=True) - results.append( - { - "step": idx, - "command": cmd, - "returncode": proc.returncode, - "stdout": proc.stdout, - "stderr": proc.stderr, - } - ) - if proc.returncode != 0: - break - - run_log = RunLog( - run_id=run_id, - started_at=started_at, - inputs={"workdir": str(plan.workdir)}, - parameters={}, - tool_versions={}, # left for caller to fill if known - database_versions={}, - config_hashes={}, - automation_levels={"pipeline": automation_level}, - overrides=[], - outputs=outputs | {"command_results": results}, - notes=None, - ) - - if log_path: - write_run_log(run_log, log_path) - return run_log diff --git a/src/genomic_consultant/pipelines/variant_calling.py b/src/genomic_consultant/pipelines/variant_calling.py deleted file mode 100644 index c7279a3..0000000 --- a/src/genomic_consultant/pipelines/variant_calling.py +++ /dev/null @@ -1,65 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, List, Sequence - - -@dataclass -class VariantCallingPlan: - """Command plan for running a trio variant calling pipeline.""" - - per_sample_gvcf: Dict[str, Path] - joint_vcf: Path - commands: List[str] - workdir: Path - - -def build_gatk_trio_plan( - samples: Dict[str, Path], - reference_fasta: Path, - workdir: Path, - output_prefix: str = "trio", - intervals: Sequence[Path] | None = None, -) -> VariantCallingPlan: - """ - Build shell commands for a GATK-based trio pipeline (HaplotypeCaller gVCF + joint genotyping). - Does not execute commands; returns a plan for orchestration layers to run. - """ - workdir = Path(workdir) - workdir.mkdir(parents=True, exist_ok=True) - - per_sample_gvcf: Dict[str, Path] = {} - commands: List[str] = [] - interval_args = "" - if intervals: - interval_args = " ".join(f"-L {i}" for i in intervals) - - for sample_id, bam_path in samples.items(): - gvcf_path = workdir / f"{sample_id}.g.vcf.gz" - per_sample_gvcf[sample_id] = gvcf_path - cmd = ( - "gatk HaplotypeCaller " - f"-R {reference_fasta} " - f"-I {bam_path} " - "-O {out} " - "-ERC GVCF " - f"{interval_args}" - ).format(out=gvcf_path) - commands.append(cmd) - - joint_vcf = workdir / f"{output_prefix}.joint.vcf.gz" - joint_cmd = ( - "gatk GenotypeGVCFs " - f"-R {reference_fasta} " - + " ".join(f"--variant {p}" for p in per_sample_gvcf.values()) - + f" -O {joint_vcf}" - ) - commands.append(joint_cmd) - - return VariantCallingPlan( - per_sample_gvcf=per_sample_gvcf, - joint_vcf=joint_vcf, - commands=commands, - workdir=workdir, - ) diff --git a/src/genomic_consultant/reporting/__init__.py b/src/genomic_consultant/reporting/__init__.py deleted file mode 100644 index 8b13789..0000000 --- a/src/genomic_consultant/reporting/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/src/genomic_consultant/reporting/report.py b/src/genomic_consultant/reporting/report.py deleted file mode 100644 index ba587d9..0000000 --- a/src/genomic_consultant/reporting/report.py +++ /dev/null @@ -1,75 +0,0 @@ -from __future__ import annotations - -import json -from datetime import datetime, timezone -from typing import List - -from genomic_consultant.orchestration.workflows import PanelAnalysisResult, PanelVariantResult - - -def panel_report_markdown(result: PanelAnalysisResult) -> str: - lines: List[str] = [] - lines.append(f"# Panel Report: {result.panel.name}") - lines.append("") - lines.append(f"- Individual: `{result.individual_id}`") - lines.append(f"- Panel version: `{result.panel.version}` (source: {result.panel.source})") - lines.append(f"- Generated: {datetime.now(timezone.utc).isoformat()}") - lines.append("") - lines.append("## Variants") - if not result.variants: - lines.append("No variants found for this panel with current filters.") - return "\n".join(lines) - - header = "| Variant | Consequence | ClinVar | AF | ACMG suggestion | Evidence |" - lines.append(header) - lines.append("|---|---|---|---|---|---|") - for pv in result.variants: - v = pv.variant - ev_summary = "; ".join(f"{e.tag}({e.strength})" for e in pv.acmg.evidence) or "None" - lines.append( - "|" - + f"{v.id} ({v.gene or 'NA'})" - + f"|{v.consequence or 'NA'}" - + f"|{v.clinvar_significance or 'NA'}" - + f"|{v.allele_frequency if v.allele_frequency is not None else 'NA'}" - + f"|{pv.acmg.suggested_class}" - + f"|{ev_summary}" - + "|" - ) - lines.append("") - lines.append("> Note: ACMG suggestions here are auto-generated and require human review for clinical decisions.") - return "\n".join(lines) - - -def panel_report_json(result: PanelAnalysisResult) -> str: - payload = { - "individual_id": result.individual_id, - "panel": { - "name": result.panel.name, - "version": result.panel.version, - "source": result.panel.source, - }, - "generated": datetime.utcnow().isoformat(), - "variants": [ - { - "id": pv.variant.id, - "gene": pv.variant.gene, - "consequence": pv.variant.consequence, - "clinvar_significance": pv.variant.clinvar_significance, - "allele_frequency": pv.variant.allele_frequency, - "acmg": { - "suggested_class": pv.acmg.suggested_class, - "evidence": [ - {"tag": e.tag, "strength": e.strength, "rationale": e.rationale} - for e in pv.acmg.evidence - ], - "human_classification": pv.acmg.human_classification, - "human_reviewer": pv.acmg.human_reviewer, - "human_notes": pv.acmg.human_notes, - }, - } - for pv in result.variants - ], - "disclaimer": "Auto-generated suggestions; human review required.", - } - return json.dumps(payload, indent=2) diff --git a/src/genomic_consultant/store/__init__.py b/src/genomic_consultant/store/__init__.py deleted file mode 100644 index 8b13789..0000000 --- a/src/genomic_consultant/store/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/src/genomic_consultant/store/parquet_store.py b/src/genomic_consultant/store/parquet_store.py deleted file mode 100644 index aaf2305..0000000 --- a/src/genomic_consultant/store/parquet_store.py +++ /dev/null @@ -1,83 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass -from pathlib import Path -from typing import List, Sequence - -from genomic_consultant.store.query import _matches_any -from genomic_consultant.utils.models import FilterConfig, Variant - -try: - import pandas as pd -except ImportError as exc: # pragma: no cover - import guard - raise ImportError("ParquetGenomicStore requires pandas. Install with `pip install pandas`.") from exc - - -@dataclass -class ParquetGenomicStore: - """Parquet-backed store for larger datasets (optional dependency: pandas).""" - - df: "pd.DataFrame" - - @classmethod - def from_parquet(cls, path: Path) -> "ParquetGenomicStore": - df = pd.read_parquet(path) - return cls(df=df) - - def get_variants_by_gene( - self, individual_id: str, genes: Sequence[str], filters: FilterConfig | None = None - ) -> List[Variant]: - filters = filters or FilterConfig() - subset = self.df[self.df["SYMBOL"].str.upper().isin([g.upper() for g in genes])] - return self._apply_filters(subset, filters) - - def get_variants_by_region( - self, individual_id: str, chrom: str, start: int, end: int, filters: FilterConfig | None = None - ) -> List[Variant]: - filters = filters or FilterConfig() - subset = self.df[(self.df["CHROM"] == chrom) & (self.df["POS"] >= start) & (self.df["POS"] <= end)] - return self._apply_filters(subset, filters) - - def _apply_filters(self, df: "pd.DataFrame", filters: FilterConfig) -> List[Variant]: - mask = pd.Series(True, index=df.index) - if filters.max_af is not None: - mask &= (df["AF"].fillna(df.get("gnomAD_AF")).fillna(0) <= filters.max_af) - if filters.min_af is not None: - mask &= (df["AF"].fillna(df.get("gnomAD_AF")).fillna(0) >= filters.min_af) - if filters.clinvar_significance: - mask &= df["CLIN_SIG"].str.lower().isin([s.lower() for s in filters.clinvar_significance]) - if filters.consequence_includes: - mask &= df["Consequence"].str.lower().apply( - lambda v: _matches_any(v, [c.lower() for c in filters.consequence_includes]) - ) - if filters.consequence_excludes: - mask &= ~df["Consequence"].str.lower().apply( - lambda v: _matches_any(v, [c.lower() for c in filters.consequence_excludes]) - ) - filtered = df[mask] - variants: List[Variant] = [] - for _, row in filtered.iterrows(): - variants.append( - Variant( - chrom=row["CHROM"], - pos=int(row["POS"]), - ref=row["REF"], - alt=row["ALT"], - gene=row.get("SYMBOL"), - consequence=row.get("Consequence"), - protein_change=row.get("Protein_position"), - clinvar_significance=row.get("CLIN_SIG"), - allele_frequency=_maybe_float(row.get("AF")) or _maybe_float(row.get("gnomAD_AF")), - annotations={"gnomad_af": _maybe_float(row.get("gnomAD_AF"))}, - ) - ) - return variants - - -def _maybe_float(value) -> float | None: - try: - if value is None: - return None - return float(value) - except (TypeError, ValueError): - return None diff --git a/src/genomic_consultant/store/query.py b/src/genomic_consultant/store/query.py deleted file mode 100644 index 3f0f4d0..0000000 --- a/src/genomic_consultant/store/query.py +++ /dev/null @@ -1,109 +0,0 @@ -from __future__ import annotations - -import csv -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, Iterable, List, Sequence - -from genomic_consultant.utils.models import FilterConfig, Variant - - -@dataclass -class GenomicStore: - """Lightweight wrapper around annotated variants.""" - - variants: List[Variant] - - @classmethod - def from_tsv(cls, path: Path) -> "GenomicStore": - """ - Load variants from a flattened TSV generated by the annotation plan. - Expected columns (flexible, missing columns are tolerated): - CHROM POS REF ALT SYMBOL Consequence Protein_position PolyPhen SIFT CLIN_SIG AF gnomAD_AF SpliceAI CADD_PHRED - """ - variants: List[Variant] = [] - with Path(path).open() as fh: - reader = csv.DictReader(fh, delimiter="\t") - for row in reader: - row = {k: v for k, v in row.items()} if row else {} - if not row: - continue - variants.append(_row_to_variant(row)) - return cls(variants=variants) - - def get_variants_by_gene( - self, individual_id: str, genes: Sequence[str], filters: FilterConfig | None = None - ) -> List[Variant]: - filters = filters or FilterConfig() - gene_set = {g.upper() for g in genes} - return self._apply_filters((v for v in self.variants if (v.gene or "").upper() in gene_set), filters) - - def get_variants_by_region( - self, individual_id: str, chrom: str, start: int, end: int, filters: FilterConfig | None = None - ) -> List[Variant]: - filters = filters or FilterConfig() - return self._apply_filters( - (v for v in self.variants if v.chrom == chrom and start <= v.pos <= end), - filters, - ) - - def _apply_filters(self, variants: Iterable[Variant], filters: FilterConfig) -> List[Variant]: - out: List[Variant] = [] - for v in variants: - if filters.max_af is not None and v.allele_frequency is not None and v.allele_frequency > filters.max_af: - continue - if filters.min_af is not None and v.allele_frequency is not None and v.allele_frequency < filters.min_af: - continue - if filters.clinvar_significance and (v.clinvar_significance or "").lower() not in { - sig.lower() for sig in filters.clinvar_significance - }: - continue - if filters.consequence_includes and not _matches_any(v.consequence, filters.consequence_includes): - continue - if filters.consequence_excludes and _matches_any(v.consequence, filters.consequence_excludes): - continue - out.append(v) - return out - - -def _matches_any(value: str | None, patterns: Sequence[str]) -> bool: - if value is None: - return False - v = value.lower() - return any(pat.lower() in v for pat in patterns) - - -def _parse_float(val: str | None) -> float | None: - if val in (None, "", "."): - return None - try: - return float(val) - except ValueError: - return None - - -def _row_to_variant(row: Dict[str, str]) -> Variant: - chrom = row.get("CHROM") or row.get("#CHROM") - pos = int(row["POS"]) - af = _parse_float(row.get("AF")) - gnomad_af = _parse_float(row.get("gnomAD_AF")) - splice_ai = _parse_float(row.get("SpliceAI")) - cadd = _parse_float(row.get("CADD_PHRED")) - return Variant( - chrom=chrom, - pos=pos, - ref=row.get("REF"), - alt=row.get("ALT"), - gene=row.get("SYMBOL") or None, - consequence=row.get("Consequence") or None, - protein_change=row.get("Protein_position") or None, - clinvar_significance=row.get("CLIN_SIG") or None, - allele_frequency=af if af is not None else gnomad_af, - annotations={ - "polyphen": row.get("PolyPhen"), - "sift": row.get("SIFT"), - "gnomad_af": gnomad_af, - "splice_ai_delta_score": splice_ai, - "cadd_phred": cadd, - }, - ) diff --git a/src/genomic_consultant/utils/__init__.py b/src/genomic_consultant/utils/__init__.py deleted file mode 100644 index 8b13789..0000000 --- a/src/genomic_consultant/utils/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/src/genomic_consultant/utils/hashing.py b/src/genomic_consultant/utils/hashing.py deleted file mode 100644 index b710541..0000000 --- a/src/genomic_consultant/utils/hashing.py +++ /dev/null @@ -1,15 +0,0 @@ -from __future__ import annotations - -import hashlib -from pathlib import Path -from typing import Optional - - -def sha256sum(path: Path) -> Optional[str]: - if not Path(path).exists(): - return None - h = hashlib.sha256() - with Path(path).open("rb") as fh: - for chunk in iter(lambda: fh.read(8192), b""): - h.update(chunk) - return h.hexdigest() diff --git a/src/genomic_consultant/utils/models.py b/src/genomic_consultant/utils/models.py deleted file mode 100644 index 7b77185..0000000 --- a/src/genomic_consultant/utils/models.py +++ /dev/null @@ -1,81 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass, field -from datetime import datetime -from typing import Any, Dict, List, Optional - - -@dataclass -class Variant: - """Lightweight variant representation used by the query layer.""" - - chrom: str - pos: int - ref: str - alt: str - gene: Optional[str] = None - transcript: Optional[str] = None - consequence: Optional[str] = None - protein_change: Optional[str] = None - clinvar_significance: Optional[str] = None - allele_frequency: Optional[float] = None - annotations: Dict[str, Any] = field(default_factory=dict) - - @property - def id(self) -> str: - return f"{self.chrom}-{self.pos}-{self.ref}-{self.alt}" - - -@dataclass -class FilterConfig: - """Common filters for variant queries.""" - - max_af: Optional[float] = None - min_af: Optional[float] = None - clinvar_significance: Optional[List[str]] = None - consequence_includes: Optional[List[str]] = None - consequence_excludes: Optional[List[str]] = None - inheritances: Optional[List[str]] = None - - -@dataclass -class EvidenceTag: - tag: str - strength: str - rationale: str - - -@dataclass -class SuggestedClassification: - suggested_class: str - evidence: List[EvidenceTag] = field(default_factory=list) - human_classification: Optional[str] = None - human_reviewer: Optional[str] = None - human_notes: Optional[str] = None - - -@dataclass -class GenePanel: - name: str - genes: List[str] - source: str - version: str - last_updated: str - metadata: Dict[str, Any] = field(default_factory=dict) - - -@dataclass -class RunLog: - """Machine-readable record of a pipeline or analysis run.""" - - run_id: str - started_at: datetime - inputs: Dict[str, Any] - parameters: Dict[str, Any] - tool_versions: Dict[str, str] - database_versions: Dict[str, str] - config_hashes: Dict[str, str] - automation_levels: Dict[str, str] - overrides: List[Dict[str, Any]] = field(default_factory=list) - outputs: Dict[str, Any] = field(default_factory=dict) - notes: Optional[str] = None diff --git a/src/genomic_consultant/utils/tooling.py b/src/genomic_consultant/utils/tooling.py deleted file mode 100644 index 9927c4a..0000000 --- a/src/genomic_consultant/utils/tooling.py +++ /dev/null @@ -1,22 +0,0 @@ -from __future__ import annotations - -import subprocess -from typing import Dict - - -def probe_tool_versions(commands: Dict[str, str]) -> Dict[str, str]: - """ - Attempt to get tool versions by running the provided commands with --version. - Returns best-effort outputs; missing tools are skipped. - """ - results: Dict[str, str] = {} - for name, cmd in commands.items(): - try: - proc = subprocess.run(f"{cmd} --version", shell=True, capture_output=True, text=True, timeout=10) - if proc.returncode == 0 and proc.stdout: - results[name] = proc.stdout.strip().splitlines()[0] - elif proc.returncode == 0 and proc.stderr: - results[name] = proc.stderr.strip().splitlines()[0] - except Exception: - continue - return results diff --git a/tests/test_acmg.py b/tests/test_acmg.py deleted file mode 100644 index 958b388..0000000 --- a/tests/test_acmg.py +++ /dev/null @@ -1,41 +0,0 @@ -from genomic_consultant.acmg.tagger import ACMGConfig, tag_variant -from genomic_consultant.utils.models import Variant - - -def test_ba1_trumps(): - cfg = ACMGConfig(ba1_af=0.05, bs1_af=0.01, pm2_af=0.0005, lof_genes=set()) - v = Variant(chrom="1", pos=1, ref="A", alt="T", allele_frequency=0.2) - result = tag_variant(v, cfg) - assert result.suggested_class == "Benign" - assert any(e.tag == "BA1" for e in result.evidence) - - -def test_pvs1_pm2_likely_pathogenic(): - cfg = ACMGConfig(lof_genes={"GENE1"}, pm2_af=0.0005, ba1_af=0.05, bs1_af=0.01) - v = Variant( - chrom="1", - pos=1, - ref="A", - alt="T", - gene="GENE1", - consequence="stop_gained", - allele_frequency=0.0001, - ) - result = tag_variant(v, cfg) - assert result.suggested_class == "Likely pathogenic" - tags = {e.tag for e in result.evidence} - assert {"PVS1", "PM2"} <= tags - - -def test_bp7_supporting(): - cfg = ACMGConfig(bp7_splice_ai_max=0.1) - v = Variant( - chrom="1", - pos=1, - ref="A", - alt="T", - consequence="synonymous_variant", - annotations={"splice_ai_delta_score": 0.05}, - ) - result = tag_variant(v, cfg) - assert any(e.tag == "BP7" for e in result.evidence) diff --git a/tests/test_aggregate.py b/tests/test_aggregate.py deleted file mode 100644 index b4856ea..0000000 --- a/tests/test_aggregate.py +++ /dev/null @@ -1,16 +0,0 @@ -from pathlib import Path - -from genomic_consultant.panels.aggregate import merge_mappings -import json - - -def test_merge_mappings(tmp_path: Path): - a = tmp_path / "a.json" - b = tmp_path / "b.json" - a.write_text('{"source":"A","phenotype_to_genes":{"HP:1":["G1","G2"]}}') - b.write_text('{"source":"B","phenotype_to_genes":{"HP:1":["G2","G3"],"HP:2":["G4"]}}') - out = tmp_path / "out.json" - merge_mappings([a, b], out) - data = json.loads(out.read_text()) - assert sorted(data["phenotype_to_genes"]["HP:1"]) == ["G1", "G2", "G3"] - assert data["phenotype_to_genes"]["HP:2"] == ["G4"] diff --git a/tests/test_phase1_pipeline.py b/tests/test_phase1_pipeline.py deleted file mode 100644 index bcb63ec..0000000 --- a/tests/test_phase1_pipeline.py +++ /dev/null @@ -1,27 +0,0 @@ -from pathlib import Path - -from genomic_consultant.orchestration.phase1_pipeline import run_phase1_pipeline - - -def test_phase1_pipeline_with_existing_tsv(): - root = Path(__file__).resolve().parents[1] - tsv = root / "sample_data/example_annotated.tsv" - panel = root / "configs/panel.example.json" - acmg = root / "configs/acmg_config.example.yaml" - result = run_phase1_pipeline( - samples=None, - reference_fasta=None, - workdir=root / "runtime", - output_prefix="demo", - acmg_config_path=acmg, - max_af=0.05, - panel_path=panel, - phenotype_id=None, - phenotype_mapping=None, - report_format="markdown", - existing_tsv=tsv, - skip_call=True, - skip_annotate=True, - ) - assert "Panel Report" in result.panel_report - assert result.tsv_path == tsv diff --git a/tests/test_resolver.py b/tests/test_resolver.py deleted file mode 100644 index d77b98b..0000000 --- a/tests/test_resolver.py +++ /dev/null @@ -1,18 +0,0 @@ -from pathlib import Path - -from genomic_consultant.panels.resolver import PhenotypeGeneResolver - - -def test_resolver_build_panel(tmp_path: Path): - data = { - "version": "test", - "source": "example", - "phenotype_to_genes": {"HP:0001": ["GENE1", "GENE2"]}, - } - path = tmp_path / "map.json" - path.write_text('{"version":"test","source":"example","phenotype_to_genes":{"HP:0001":["GENE1","GENE2"]}}') - resolver = PhenotypeGeneResolver.from_json(path) - panel = resolver.build_panel("HP:0001") - assert panel is not None - assert panel.genes == ["GENE1", "GENE2"] - assert "phenotype_id" in panel.metadata diff --git a/tests/test_store.py b/tests/test_store.py deleted file mode 100644 index ece2978..0000000 --- a/tests/test_store.py +++ /dev/null @@ -1,28 +0,0 @@ -from genomic_consultant.store.query import GenomicStore -from genomic_consultant.utils.models import FilterConfig, Variant - - -def test_filter_by_gene_and_af(): - store = GenomicStore( - variants=[ - Variant(chrom="1", pos=1, ref="A", alt="T", gene="GENE1", allele_frequency=0.02), - Variant(chrom="1", pos=2, ref="G", alt="C", gene="GENE2", allele_frequency=0.0001), - ] - ) - res = store.get_variants_by_gene("ind", ["GENE2"], filters=FilterConfig(max_af=0.001)) - assert len(res) == 1 - assert res[0].gene == "GENE2" - - -def test_consequence_include_exclude(): - store = GenomicStore( - variants=[ - Variant(chrom="1", pos=1, ref="A", alt="T", gene="GENE1", consequence="missense_variant"), - Variant(chrom="1", pos=2, ref="G", alt="C", gene="GENE1", consequence="synonymous_variant"), - ] - ) - res = store.get_variants_by_gene( - "ind", ["GENE1"], filters=FilterConfig(consequence_includes=["missense"], consequence_excludes=["synonymous"]) - ) - assert len(res) == 1 - assert res[0].consequence == "missense_variant" diff --git a/tests/test_store_tsv.py b/tests/test_store_tsv.py deleted file mode 100644 index 56b5467..0000000 --- a/tests/test_store_tsv.py +++ /dev/null @@ -1,33 +0,0 @@ -from pathlib import Path - -from genomic_consultant.store.query import GenomicStore - - -def test_from_tsv_with_extra_columns(tmp_path: Path): - content = "\t".join( - [ - "#CHROM", - "POS", - "REF", - "ALT", - "SYMBOL", - "Consequence", - "Protein_position", - "PolyPhen", - "SIFT", - "CLIN_SIG", - "AF", - "gnomAD_AF", - "SpliceAI", - "CADD_PHRED", - ] - ) + "\n" - content += "1\t100\tA\tT\tGENE1\tmissense_variant\tp.X\t.\t.\tPathogenic\t0.0001\t0.0002\t0.05\t20.1\n" - path = tmp_path / "v.tsv" - path.write_text(content) - - store = GenomicStore.from_tsv(path) - assert len(store.variants) == 1 - v = store.variants[0] - assert v.annotations["splice_ai_delta_score"] == 0.05 - assert v.annotations["cadd_phred"] == 20.1 diff --git a/trio_analysis.py b/trio_analysis.py new file mode 100644 index 0000000..a4506f3 --- /dev/null +++ b/trio_analysis.py @@ -0,0 +1,376 @@ +#!/usr/bin/env python3 +""" +Trio WES Analysis Script +Analyzes trio VCF for de novo mutations, compound heterozygous variants, +and potential pathogenic variants. +""" + +import gzip +import re +from collections import defaultdict +from dataclasses import dataclass +from typing import List, Dict, Optional, Tuple +import json + +@dataclass +class Variant: + chrom: str + pos: int + ref: str + alt: str + qual: float + filter_status: str + info: str + genotypes: Dict[str, str] # sample -> genotype + annotation: Optional[str] = None + gene: Optional[str] = None + effect: Optional[str] = None + impact: Optional[str] = None + +def parse_genotype(gt_field: str) -> Tuple[str, int, int]: + """Parse genotype field, return (gt_string, ref_count, alt_count)""" + parts = gt_field.split(':') + gt = parts[0] + + if gt in ['./.', '.|.', '.']: + return gt, 0, 0 + + alleles = re.split('[/|]', gt) + ref_count = sum(1 for a in alleles if a == '0') + alt_count = sum(1 for a in alleles if a != '0' and a != '.') + + return gt, ref_count, alt_count + +def get_genotype_class(gt: str) -> str: + """Classify genotype as HOM_REF, HET, HOM_ALT, or MISSING""" + if gt in ['./.', '.|.', '.']: + return 'MISSING' + + alleles = re.split('[/|]', gt) + if all(a == '0' for a in alleles): + return 'HOM_REF' + elif all(a != '0' and a != '.' for a in alleles): + return 'HOM_ALT' + else: + return 'HET' + +def parse_snpeff_annotation(info: str) -> Dict: + """Parse SnpEff ANN field""" + result = { + 'gene': None, + 'effect': None, + 'impact': None, + 'hgvs_c': None, + 'hgvs_p': None, + } + + ann_match = re.search(r'ANN=([^;]+)', info) + if not ann_match: + return result + + ann_field = ann_match.group(1) + annotations = ann_field.split(',') + + if annotations: + # Take the first (most severe) annotation + parts = annotations[0].split('|') + if len(parts) >= 4: + result['effect'] = parts[1] if len(parts) > 1 else None + result['impact'] = parts[2] if len(parts) > 2 else None + result['gene'] = parts[3] if len(parts) > 3 else None + if len(parts) > 9: + result['hgvs_c'] = parts[9] + if len(parts) > 10: + result['hgvs_p'] = parts[10] + + return result + +def parse_vcf(vcf_path: str) -> Tuple[List[str], List[Variant]]: + """Parse VCF file and return sample names and variants""" + samples = [] + variants = [] + + open_func = gzip.open if vcf_path.endswith('.gz') else open + mode = 'rt' if vcf_path.endswith('.gz') else 'r' + + with open_func(vcf_path, mode) as f: + for line in f: + if line.startswith('##'): + continue + elif line.startswith('#CHROM'): + parts = line.strip().split('\t') + samples = parts[9:] + continue + + parts = line.strip().split('\t') + if len(parts) < 10: + continue + + chrom, pos, _, ref, alt, qual, filt, info, fmt = parts[:9] + gt_fields = parts[9:] + + # Parse genotypes + genotypes = {} + fmt_fields = fmt.split(':') + gt_idx = fmt_fields.index('GT') if 'GT' in fmt_fields else 0 + + for i, sample in enumerate(samples): + gt_parts = gt_fields[i].split(':') + genotypes[sample] = gt_parts[gt_idx] if gt_idx < len(gt_parts) else './.' + + # Parse annotation + ann = parse_snpeff_annotation(info) + + try: + qual_val = float(qual) if qual != '.' else 0 + except ValueError: + qual_val = 0 + + variant = Variant( + chrom=chrom, + pos=int(pos), + ref=ref, + alt=alt, + qual=qual_val, + filter_status=filt, + info=info, + genotypes=genotypes, + annotation=info, + gene=ann['gene'], + effect=ann['effect'], + impact=ann['impact'] + ) + variants.append(variant) + + return samples, variants + +def identify_de_novo(variants: List[Variant], proband: str, father: str, mother: str) -> List[Variant]: + """Identify de novo variants: present in proband but absent in both parents""" + de_novo = [] + + for v in variants: + if proband not in v.genotypes or father not in v.genotypes or mother not in v.genotypes: + continue + + proband_gt = get_genotype_class(v.genotypes[proband]) + father_gt = get_genotype_class(v.genotypes[father]) + mother_gt = get_genotype_class(v.genotypes[mother]) + + # De novo: proband has variant, both parents are HOM_REF + if proband_gt in ['HET', 'HOM_ALT'] and father_gt == 'HOM_REF' and mother_gt == 'HOM_REF': + de_novo.append(v) + + return de_novo + +def identify_compound_het(variants: List[Variant], proband: str, father: str, mother: str) -> Dict[str, List[Variant]]: + """Identify compound heterozygous variants in genes""" + gene_variants = defaultdict(list) + + # Group HET variants by gene + for v in variants: + if not v.gene: + continue + + if proband not in v.genotypes: + continue + + proband_gt = get_genotype_class(v.genotypes[proband]) + if proband_gt != 'HET': + continue + + gene_variants[v.gene].append(v) + + # Find compound het (>1 HET variant in same gene, inherited from different parents) + compound_het = {} + + for gene, vars_list in gene_variants.items(): + if len(vars_list) < 2: + continue + + maternal_inherited = [] + paternal_inherited = [] + + for v in vars_list: + if father not in v.genotypes or mother not in v.genotypes: + continue + + father_gt = get_genotype_class(v.genotypes[father]) + mother_gt = get_genotype_class(v.genotypes[mother]) + + if father_gt in ['HET', 'HOM_ALT'] and mother_gt == 'HOM_REF': + paternal_inherited.append(v) + elif mother_gt in ['HET', 'HOM_ALT'] and father_gt == 'HOM_REF': + maternal_inherited.append(v) + + if maternal_inherited and paternal_inherited: + compound_het[gene] = maternal_inherited + paternal_inherited + + return compound_het + +def identify_homozygous_recessive(variants: List[Variant], proband: str, father: str, mother: str) -> List[Variant]: + """Identify homozygous recessive variants: HOM_ALT in proband, both parents HET""" + hom_rec = [] + + for v in variants: + if proband not in v.genotypes or father not in v.genotypes or mother not in v.genotypes: + continue + + proband_gt = get_genotype_class(v.genotypes[proband]) + father_gt = get_genotype_class(v.genotypes[father]) + mother_gt = get_genotype_class(v.genotypes[mother]) + + # Homozygous recessive: proband HOM_ALT, both parents HET + if proband_gt == 'HOM_ALT' and father_gt == 'HET' and mother_gt == 'HET': + hom_rec.append(v) + + return hom_rec + +def filter_by_impact(variants: List[Variant], impacts: List[str] = ['HIGH', 'MODERATE']) -> List[Variant]: + """Filter variants by impact level""" + return [v for v in variants if v.impact in impacts] + +def generate_report(vcf_path: str, output_path: str): + """Generate trio analysis report""" + print(f"Parsing VCF: {vcf_path}") + samples, variants = parse_vcf(vcf_path) + + print(f"Found {len(samples)} samples: {samples}") + print(f"Total variants: {len(variants)}") + + # Identify sample roles based on file naming convention + # Expected: I-1 (father), I-2 (mother), II-3 (proband) + proband = None + father = None + mother = None + + for s in samples: + s_upper = s.upper() + if 'II-3' in s_upper or 'PROBAND' in s_upper: + proband = s + elif 'I-1' in s_upper: + father = s + elif 'I-2' in s_upper: + mother = s + + if not all([proband, father, mother]): + # Fallback: assume order is proband, father, mother + if len(samples) >= 3: + proband = samples[0] + father = samples[1] + mother = samples[2] + else: + print("ERROR: Could not identify trio samples") + return + + print(f"\nTrio identified:") + print(f" Proband: {proband}") + print(f" Father: {father}") + print(f" Mother: {mother}") + + # Analysis + print("\n" + "="*80) + print("TRIO ANALYSIS RESULTS") + print("="*80) + + # De novo variants + de_novo = identify_de_novo(variants, proband, father, mother) + de_novo_high = filter_by_impact(de_novo, ['HIGH', 'MODERATE']) + + print(f"\n1. DE NOVO VARIANTS") + print(f" Total de novo: {len(de_novo)}") + print(f" HIGH/MODERATE impact: {len(de_novo_high)}") + + # Compound heterozygous + compound_het = identify_compound_het(variants, proband, father, mother) + + print(f"\n2. COMPOUND HETEROZYGOUS GENES") + print(f" Genes with compound het: {len(compound_het)}") + + # Homozygous recessive + hom_rec = identify_homozygous_recessive(variants, proband, father, mother) + hom_rec_high = filter_by_impact(hom_rec, ['HIGH', 'MODERATE']) + + print(f"\n3. HOMOZYGOUS RECESSIVE VARIANTS") + print(f" Total: {len(hom_rec)}") + print(f" HIGH/MODERATE impact: {len(hom_rec_high)}") + + # Generate detailed report + with open(output_path, 'w') as f: + f.write("# Trio WES Analysis Report\n") + f.write(f"# Generated from: {vcf_path}\n") + f.write(f"# Samples: Proband={proband}, Father={father}, Mother={mother}\n") + f.write(f"# Total variants analyzed: {len(variants)}\n\n") + + # De novo HIGH/MODERATE impact + f.write("## DE NOVO VARIANTS (HIGH/MODERATE IMPACT)\n") + f.write("CHROM\tPOS\tREF\tALT\tGENE\tEFFECT\tIMPACT\tPROBAND_GT\tFATHER_GT\tMOTHER_GT\n") + for v in sorted(de_novo_high, key=lambda x: (x.chrom, x.pos)): + f.write(f"{v.chrom}\t{v.pos}\t{v.ref}\t{v.alt}\t{v.gene or 'N/A'}\t") + f.write(f"{v.effect or 'N/A'}\t{v.impact or 'N/A'}\t") + f.write(f"{v.genotypes.get(proband, './.')}\t") + f.write(f"{v.genotypes.get(father, './.')}\t") + f.write(f"{v.genotypes.get(mother, './.')}\n") + + # Compound heterozygous + f.write("\n## COMPOUND HETEROZYGOUS GENES\n") + for gene, vars_list in sorted(compound_het.items()): + high_impact = [v for v in vars_list if v.impact in ['HIGH', 'MODERATE']] + if high_impact: + f.write(f"\n### {gene} ({len(vars_list)} variants, {len(high_impact)} HIGH/MODERATE)\n") + f.write("CHROM\tPOS\tREF\tALT\tEFFECT\tIMPACT\tPROBAND_GT\tFATHER_GT\tMOTHER_GT\n") + for v in sorted(high_impact, key=lambda x: x.pos): + f.write(f"{v.chrom}\t{v.pos}\t{v.ref}\t{v.alt}\t") + f.write(f"{v.effect or 'N/A'}\t{v.impact or 'N/A'}\t") + f.write(f"{v.genotypes.get(proband, './.')}\t") + f.write(f"{v.genotypes.get(father, './.')}\t") + f.write(f"{v.genotypes.get(mother, './.')}\n") + + # Homozygous recessive HIGH/MODERATE + f.write("\n## HOMOZYGOUS RECESSIVE VARIANTS (HIGH/MODERATE IMPACT)\n") + f.write("CHROM\tPOS\tREF\tALT\tGENE\tEFFECT\tIMPACT\tPROBAND_GT\tFATHER_GT\tMOTHER_GT\n") + for v in sorted(hom_rec_high, key=lambda x: (x.chrom, x.pos)): + f.write(f"{v.chrom}\t{v.pos}\t{v.ref}\t{v.alt}\t{v.gene or 'N/A'}\t") + f.write(f"{v.effect or 'N/A'}\t{v.impact or 'N/A'}\t") + f.write(f"{v.genotypes.get(proband, './.')}\t") + f.write(f"{v.genotypes.get(father, './.')}\t") + f.write(f"{v.genotypes.get(mother, './.')}\n") + + # Summary statistics + f.write("\n## SUMMARY STATISTICS\n") + f.write(f"Total variants: {len(variants)}\n") + f.write(f"De novo variants: {len(de_novo)}\n") + f.write(f"De novo HIGH/MODERATE: {len(de_novo_high)}\n") + f.write(f"Compound het genes: {len(compound_het)}\n") + f.write(f"Homozygous recessive: {len(hom_rec)}\n") + f.write(f"Homozygous recessive HIGH/MODERATE: {len(hom_rec_high)}\n") + + print(f"\nReport saved to: {output_path}") + + # Also print top candidates + print("\n" + "="*80) + print("TOP CANDIDATE VARIANTS") + print("="*80) + + print("\n--- De Novo HIGH Impact ---") + de_novo_high_only = [v for v in de_novo if v.impact == 'HIGH'] + for v in de_novo_high_only[:10]: + print(f" {v.chrom}:{v.pos} {v.ref}>{v.alt} | {v.gene} | {v.effect}") + + print("\n--- Compound Het Genes (with HIGH impact) ---") + for gene, vars_list in list(compound_het.items())[:10]: + high_count = sum(1 for v in vars_list if v.impact == 'HIGH') + if high_count > 0: + print(f" {gene}: {len(vars_list)} variants ({high_count} HIGH)") + + print("\n--- Homozygous Recessive HIGH Impact ---") + hom_rec_high_only = [v for v in hom_rec if v.impact == 'HIGH'] + for v in hom_rec_high_only[:10]: + print(f" {v.chrom}:{v.pos} {v.ref}>{v.alt} | {v.gene} | {v.effect}") + +if __name__ == '__main__': + import sys + + vcf_path = sys.argv[1] if len(sys.argv) > 1 else '/Volumes/NV2/genomics_analysis/vcf/trio_joint.snpeff.vcf' + output_path = sys.argv[2] if len(sys.argv) > 2 else '/Volumes/NV2/genomics_analysis/trio_analysis_report.txt' + + generate_report(vcf_path, output_path)