commit f74dc351f7c550f900aba91a0665eb71136c1d21 Author: gbanyan Date: Fri Nov 28 11:52:04 2025 +0800 Initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..863dda5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,46 @@ +# Python and environments +__pycache__/ +*.py[cod] +.venv/ +env/ +.env +.git-mcp/ +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ +.coverage + +# Notebooks +.ipynb_checkpoints/ + +# Editor +.DS_Store +.idea/ +.vscode/ + +# Genomics data (avoid accidental commits) +*.bam +*.bai +*.sam +*.cram +*.crai +*.vcf +*.vcf.gz +*.vcf.gz.tbi +*.g.vcf +*.g.vcf.gz +*.g.vcf.gz.tbi +*.bed +*.bed.gz +*.bed.gz.tbi +*.fastq +*.fastq.gz +*.fq +*.fq.gz + +# Reports / tmp +*.log +*.tmp +tmp/ +reports/ +runtime/ diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..8102f16 --- /dev/null +++ b/Makefile @@ -0,0 +1,14 @@ +PYTHON := .venv/bin/python +PIP := .venv/bin/pip + +.PHONY: venv install test + +venv: + python3 -m venv .venv + $(PIP) install -e . + +install: venv + $(PIP) install -e .[store] + +test: + $(PYTHON) -m pytest diff --git a/README.md b/README.md new file mode 100644 index 0000000..af73dab --- /dev/null +++ b/README.md @@ -0,0 +1,112 @@ +# Genomic Consultant + +Early design for a personal genomic risk and drug–interaction decision support system. Specs are sourced from `genomic_decision_support_system_spec_v0.1.md`. + +## Vision (per spec) +- Phase 1: trio variant calling, annotation, queryable genomic DB, initial ACMG evidence tagging. +- Phase 2: pharmacogenomics genotype-to-phenotype mapping plus drug–drug interaction checks. +- Phase 3: supplement/herb normalization and interaction risk layering. +- Phase 4: LLM-driven query orchestration and report generation. + +## Repository Layout +- `docs/` — system architecture notes, phase plans, data models (work in progress). +- `configs/` — example ACMG config and gene panel JSON. +- `configs/phenotype_to_genes.example.json` — placeholder phenotype/HPO → gene mappings. 
+- `configs/phenotype_to_genes.hpo_seed.json` — seed HPO mappings (replace with full HPO/GenCC derived panels). +- `sample_data/` — tiny annotated TSV for demo. +- `src/genomic_consultant/` — Python scaffolding (pipelines, store, panel lookup, ACMG tagging, reporting). +- `genomic_decision_support_system_spec_v0.1.md` — original requirements draft. + +## Contributing/next steps +1. Finalize Phase 1 tech selection (variant caller, annotation stack, reference/DB versions). +2. Stand up the Phase 1 pipelines and minimal query API surface. +3. Add ACMG evidence tagging config and human-review logging. +4. Layer in PGx/DDI and supplement modules per later phases. + +Data safety: keep genomic/clinical data local; the `.gitignore` blocks common genomic outputs by default. + +## Quickstart (CLI scaffolding) +``` +pip install -e . + +# 1) Show trio calling plan (commands only; not executed) +genomic-consultant plan-call \ + --sample proband:/data/proband.bam \ + --sample father:/data/father.bam \ + --sample mother:/data/mother.bam \ + --reference /refs/GRCh38.fa \ + --workdir /tmp/trio + +# 1b) Execute calling plan (requires GATK installed) and emit run log +genomic-consultant run-call \ + --sample proband:/data/proband.bam \ + --sample father:/data/father.bam \ + --sample mother:/data/mother.bam \ + --reference /refs/GRCh38.fa \ + --workdir /tmp/trio \ + --log /tmp/trio/run_call_log.json \ + --probe-tools + +# 2) Show annotation plan for a joint VCF +genomic-consultant plan-annotate \ + --vcf /tmp/trio/trio.joint.vcf.gz \ + --workdir /tmp/trio/annot \ + --prefix trio \ + --reference /refs/GRCh38.fa + +# 2b) Execute annotation plan (requires VEP, bcftools) with run log +genomic-consultant run-annotate \ + --vcf /tmp/trio/trio.joint.vcf.gz \ + --workdir /tmp/trio/annot \ + --prefix trio \ + --reference /refs/GRCh38.fa \ + --log /tmp/trio/annot/run_annot_log.json \ + --probe-tools + +# 3) Demo panel report using sample data (panel file) +genomic-consultant panel-report \ + 
--tsv sample_data/example_annotated.tsv \ + --panel configs/panel.example.json \ + --acmg-config configs/acmg_config.example.yaml \ + --individual-id demo \ + --format markdown \ + --log /tmp/panel_log.json + +# 3b) Demo panel report using phenotype mapping (HPO) +genomic-consultant panel-report \ + --tsv sample_data/example_annotated.tsv \ + --phenotype-id HP:0000365 \ + --phenotype-mapping configs/phenotype_to_genes.hpo_seed.json \ + --acmg-config configs/acmg_config.example.yaml \ + --individual-id demo \ + --format markdown + +# 3c) Merge multiple phenotype→gene mappings into one +genomic-consultant build-phenotype-mapping \ + --output configs/phenotype_to_genes.merged.json \ + configs/phenotype_to_genes.example.json configs/phenotype_to_genes.hpo_seed.json + +# 4) End-to-end Phase 1 pipeline (optionally skip call/annotate; use sample TSV) +genomic-consultant phase1-run \ + --tsv sample_data/example_annotated.tsv \ + --skip-call --skip-annotate \ + --panel configs/panel.example.json \ + --acmg-config configs/acmg_config.example.yaml \ + --workdir runtime \ + --prefix demo + +# Run tests +pytest +``` + +### Optional Parquet-backed store +Install pandas to enable Parquet ingestion: +``` +pip install -e .[store] +``` + +### Notes on VEP plugins (SpliceAI/CADD) +- The annotation plan already queries `SpliceAI` and `CADD_PHRED` fields; ensure your VEP run includes plugins/flags that produce them, e.g.: + - `--plugin SpliceAI,snv=/path/to/spliceai.snv.vcf.gz,indel=/path/to/spliceai.indel.vcf.gz` + - `--plugin CADD,/path/to/whole_genome_SNVs.tsv.gz,/path/to/InDels.tsv.gz` +- Pass these via `--plugin` and/or `--extra-flag` on `run-annotate` / `plan-annotate` to embed fields into the TSV. 
diff --git a/configs/acmg_config.example.yaml b/configs/acmg_config.example.yaml new file mode 100644 index 0000000..36bba07 --- /dev/null +++ b/configs/acmg_config.example.yaml @@ -0,0 +1,12 @@ +# ACMG evidence thresholds (example, adjust per lab policy) +ba1_af: 0.05 # Stand-alone benign if AF >= this +bs1_af: 0.01 # Strong benign if AF >= this (and not meeting BA1) +pm2_af: 0.0005 # Moderate pathogenic if AF <= this +bp7_splice_ai_max: 0.1 # Supporting benign if synonymous and predicted low splice impact + +# Genes considered loss-of-function sensitive for PVS1 auto-tagging +lof_genes: + - BRCA1 + - BRCA2 + - TTN + - CFTR diff --git a/configs/panel.example.json b/configs/panel.example.json new file mode 100644 index 0000000..7866fda --- /dev/null +++ b/configs/panel.example.json @@ -0,0 +1,10 @@ +{ + "name": "Hearing loss core", + "version": "0.1", + "source": "curated", + "last_updated": "2024-06-01", + "genes": ["GJB2", "SLC26A4", "MITF", "OTOF"], + "metadata": { + "notes": "Example panel for demo; replace with curated panel and provenance." + } +} diff --git a/configs/phenotype_to_genes.example.json b/configs/phenotype_to_genes.example.json new file mode 100644 index 0000000..1bbae32 --- /dev/null +++ b/configs/phenotype_to_genes.example.json @@ -0,0 +1,11 @@ +{ + "version": "0.1", + "source": "example-curated", + "phenotype_to_genes": { + "HP:0000365": ["GJB2", "SLC26A4", "OTOF"], + "HP:0000510": ["MITF", "SOX10"] + }, + "metadata": { + "notes": "Placeholder mapping; replace with curated HPO/OMIM/GenCC panels." 
+ } +} diff --git a/configs/phenotype_to_genes.hpo_seed.json b/configs/phenotype_to_genes.hpo_seed.json new file mode 100644 index 0000000..d9013bf --- /dev/null +++ b/configs/phenotype_to_genes.hpo_seed.json @@ -0,0 +1,13 @@ +{ + "version": "2024-11-01", + "source": "HPO-curated-seed", + "phenotype_to_genes": { + "HP:0000365": ["GJB2", "SLC26A4", "OTOF", "TECTA"], + "HP:0001250": ["SCN1A", "KCNQ2", "STXBP1"], + "HP:0001631": ["MYH7", "TNNT2", "MYBPC3"], + "HP:0001156": ["COL1A1", "COL1A2", "PLOD2"] + }, + "metadata": { + "notes": "Seed mapping using common phenotype IDs; replace with full HPO-derived panels." + } +} diff --git a/docs/phase1_howto.md b/docs/phase1_howto.md new file mode 100644 index 0000000..b27bcc5 --- /dev/null +++ b/docs/phase1_howto.md @@ -0,0 +1,101 @@ +# Phase 1 How-To: BAM → VCF → Annotate → Panel Report + +本文件說明如何用現有 CLI 從 BAM 執行到報告輸出,選項意義,以及可跳過步驟的用法。假設已安裝 GATK、VEP、bcftools/tabix,並以 `pip install -e .` 安裝本專案。 + +## 流程總覽 +1) Trio variant calling → joint VCF +2) VEP 註解 → annotated VCF + 平坦 TSV +3) Panel/phenotype 查詢 + ACMG 標籤 → Markdown/JSON 報告 +可用 `phase1-run` 一鍵(可跳過 call/annotate),或分步 `run-call` / `run-annotate` / `panel-report`。 + +## 分步執行 + +### 1) Variant calling(GATK) +```bash +genomic-consultant run-call \ + --sample proband:/path/proband.bam \ + --sample father:/path/father.bam \ + --sample mother:/path/mother.bam \ + --reference /refs/GRCh38.fa \ + --workdir /tmp/trio \ + --prefix trio \ + --log /tmp/trio/run_call_log.json \ + --probe-tools +``` +- `--sample`: `sample_id:/path/to.bam`,可重複。 +- `--reference`: 參考序列。 +- `--workdir`: 輸出與中間檔位置。 +- `--prefix`: 輸出檔名前綴。 +- `--log`: run log(JSON)路徑。 +- 輸出:joint VCF (`/tmp/trio/trio.joint.vcf.gz`)、run log(含指令/returncode)。 + +### 2) Annotation(VEP + bcftools) +```bash +genomic-consultant run-annotate \ + --vcf /tmp/trio/trio.joint.vcf.gz \ + --workdir /tmp/trio/annot \ + --prefix trio \ + --reference /refs/GRCh38.fa \ + --plugin 
'SpliceAI,snv=/path/spliceai.snv.vcf.gz,indel=/path/spliceai.indel.vcf.gz' \ + --plugin 'CADD,/path/whole_genome_SNVs.tsv.gz,/path/InDels.tsv.gz' \ + --extra-flag "--cache --offline" \ + --log /tmp/trio/annot/run_annot_log.json \ + --probe-tools +``` +- `--plugin`: VEP plugin 規格,可重複(示範 SpliceAI/CADD)。 +- `--extra-flag`: 附加給 VEP 的旗標(如 cache/offline)。 +- 輸出:annotated VCF (`trio.vep.vcf.gz`)、平坦 TSV (`trio.vep.tsv`)、run log。 + +### 3) Panel/Phenotype 報告 +使用 panel 檔: +```bash +genomic-consultant panel-report \ + --tsv /tmp/trio/annot/trio.vep.tsv \ + --panel configs/panel.example.json \ + --acmg-config configs/acmg_config.example.yaml \ + --individual-id proband \ + --max-af 0.05 \ + --format markdown \ + --log /tmp/trio/panel_log.json +``` +使用 phenotype 直譯 panel: +```bash +genomic-consultant panel-report \ + --tsv /tmp/trio/annot/trio.vep.tsv \ + --phenotype-id HP:0000365 \ + --phenotype-mapping configs/phenotype_to_genes.hpo_seed.json \ + --acmg-config configs/acmg_config.example.yaml \ + --individual-id proband \ + --max-af 0.05 \ + --format markdown +``` +- `--max-af`: 過濾等位基因頻率上限。 +- `--format`: `markdown` 或 `json`。 +- 輸出:報告文字 + run log(記錄 panel/ACMG config 的 hash)。 + +## 一鍵模式(可跳過 call/annotate) +已經有 joint VCF/TSV 時,可跳過前兩步: +```bash +genomic-consultant phase1-run \ + --workdir /tmp/trio \ + --prefix trio \ + --tsv /tmp/trio/annot/trio.vep.tsv \ + --skip-call --skip-annotate \ + --panel configs/panel.example.json \ + --acmg-config configs/acmg_config.example.yaml \ + --max-af 0.05 \ + --format markdown \ + --log-dir /tmp/trio/runtime +``` +若要實際跑 VEP,可移除 `--skip-annotate` 並提供 `--plugins/--extra-flag`;若要跑 calling,移除 `--skip-call` 並提供 `--sample`/`--reference`。 + +## 主要輸出 +- joint VCF(呼叫結果) +- annotated VCF + 平坦 TSV(含 gene/consequence/ClinVar/AF/SpliceAI/CADD 等欄位) +- run logs(JSON,含指令、return code、config hash;在 `--log` 或 `--log-dir`) +- Panel 報告(Markdown 或 JSON),附 ACMG 自動標籤,需人工複核 + +## 注意 +- Call/annotate 依賴外部工具與對應資源(參考序列、VEP cache、SpliceAI/CADD 資料)。 +- 若無 BAM/資源,可用 
sample TSV:`phase1-run --tsv sample_data/example_annotated.tsv --skip-call --skip-annotate ...` 演示報告。 +- 安全:`.gitignore` 已排除大型基因檔案;建議本地受控環境執行。 diff --git a/docs/phase1_implementation_plan.md b/docs/phase1_implementation_plan.md new file mode 100644 index 0000000..83a5a66 --- /dev/null +++ b/docs/phase1_implementation_plan.md @@ -0,0 +1,80 @@ +# Phase 1 Implementation Plan (Genomic Foundation) + +Scope: deliver a working trio-based variant pipeline, annotated genomic store, query APIs, initial ACMG evidence tagging, and reporting/logging scaffolding. This assumes local execution with Python 3.11+. + +## Objectives +- Trio BAM → joint VCF with QC artifacts (`Auto`). +- Annotate variants with population frequency, ClinVar, consequence/prediction (`Auto`). +- Provide queryable interfaces for gene/region lookups with filters (`Auto`). +- Disease/phenotype → gene panel lookup and filtered variant outputs (`Auto+Review`). +- Auto-tag subset of ACMG criteria; human-only final classification (`Auto+Review`). +- Produce machine-readable run logs with versions, configs, and overrides. + +## Work Breakdown +1) **Data & references** + - Reference genome: GRCh38 (primary) with option for GRCh37; pin version hash. + - Resource bundles: known sites for BQSR (if using GATK), gnomAD/ClinVar versions for annotation. + - Test fixtures: small trio BAM/CRAM subset or GIAB trio downsampled for CI-like checks. + +2) **Variant calling pipeline (wrapper, `Auto`)** + - Tooling: GATK HaplotypeCaller → gVCF; GenotypeGVCFs for joint calls. Alt: DeepVariant + joint genotyper (parameterized). + - Steps: + - Validate inputs (file presence, reference match). + - Optional QC: coverage, duplicates, on-target. + - Generate per-sample gVCF; joint genotyping to trio VCF. + - Outputs: joint VCF + index; QC summary JSON/TSV; log with tool versions and params. + +3) **Annotation pipeline (`Auto`)** + - Tooling: Ensembl VEP with plugins for gnomAD, ClinVar, CADD, SpliceAI where available; alt path: ANNOVAR. 
+ - Steps: + - Normalize variants (bcftools norm) if needed. + - Annotate with gene, transcript, protein change; population AF; ClinVar significance; consequence; predictions (SIFT/PolyPhen/CADD); inheritance flags. + - Include SpliceAI/CADD plugins if installed; CLI accepts extra flags/plugins to embed SpliceAI/CADD fields. + - Outputs: annotated VCF; flattened TSV/Parquet for faster querying; manifest of DB versions used. + +4) **Genomic store + query API (`Auto`)** + - Early option: tabix-indexed VCF with Python wrapper. + - Functions (Python module `genomic_store`): + - `get_variants_by_gene(individual_id, genes, filters)` + Filters: AF thresholds, consequence categories, ClinVar significance, inheritance pattern. + - `get_variants_by_region(individual_id, chrom, start, end, filters)` + - `list_genes_with_variants(individual_id, filters)` (optional). + - Filters defined in a `FilterConfig` dataclass; serialize-able for logging. + - Future option: import to SQLite/Postgres via Arrow/Parquet for richer predicates. + +5) **Disease/phenotype → gene panel (`Auto+Review`)** + - Data: HPO/OMIM/GenCC lookup or curated JSON panels. + - Function: `get_gene_panel(disease_or_hpo_id, version=None)` returning gene list + provenance. + - Phenotype resolver: curated JSON mapping (e.g., `phenotype_to_genes.example.json`) as placeholder until upstream data is wired; allow dynamic panel synthesis by phenotype ID in CLI; support merging multiple sources into one mapping. + - Flow: resolve panel → call genomic store → apply simple ranking (AF low, ClinVar pathogenicity high). + - Manual review points: panel curation, rank threshold tuning. + +6) **ACMG evidence tagging (subset, `Auto+Review`)** + - Criteria to auto-evaluate initially: PVS1 (LoF in LoF-sensitive gene list), PM2 (absent/rare in population), BA1/BS1 (common frequency), possible BP7 (synonymous, no splicing impact). + - Config: YAML/JSON with thresholds (AF cutoffs, LoF gene list, transcript precedence). 
+ - Output schema per variant: `{variant_id, evidence: [tag, strength, rationale], suggested_class}` with suggested class computed purely from auto tags; final class left blank for human. + - Logging: capture rule version and reasons for each fired rule. + +7) **Run logging and versioning** + - Every pipeline run emits `run_log.json` containing: + - Inputs (sample IDs, file paths, reference build). + - Tool versions and parameters; DB versions; config hashes (panel/ACMG configs). + - Automation level per step; manual overrides (`who/when/why`). + - Derived artifacts paths and checksums. + - Embed run_log reference in reports. + +8) **Report template (minimal)** + - Input: disease/gene panel name, variant query results, ACMG evidence tags. + - Output: Markdown + JSON summary with sections: context, methods, variants table, limitations. + - Mark human-only decisions clearly. + +## Milestones +- **M1**: Repo scaffolding + configs; tiny trio test data wired; `make pipeline` runs through variant calling + annotation on fixture. +- **M2**: Genomic store wrapper with gene/region queries; filter config; basic CLI/Notebook demo. +- **M3**: Panel lookup + ranked variant listing; ACMG auto tags on outputs; run_log generation. +- **M4**: Minimal report generator + acceptance of human-reviewed classifications. + +## Validation Strategy +- Unit tests for filter logic, panel resolution, ACMG tagger decisions (synthetic variants). +- Integration test on small fixture trio to ensure call → annotate → query path works. +- Determinism checks: hash configs and verify outputs stable across runs given same inputs. diff --git a/docs/system_architecture.md b/docs/system_architecture.md new file mode 100644 index 0000000..6914498 --- /dev/null +++ b/docs/system_architecture.md @@ -0,0 +1,69 @@ +# System Architecture Blueprint (v0.1) + +This document turns `genomic_decision_support_system_spec_v0.1.md` into a buildable architecture and phased roadmap. 
Automation levels follow `Auto / Auto+Review / Human-only`. + +## High-Level Views +- **Core layers**: (1) sequencing ingest → variant calling/annotation (`Auto`), (2) genomic query layer (`Auto`), (3) rule engines (ACMG, PGx, DDI, supplements; mixed automation), (4) orchestration/LLM and report generation (`Auto` tool calls, `Auto+Review` outputs). +- **Data custody**: all PHI/genomic artifacts remain local; external calls require de-identification or private models. +- **Traceability**: every run records tool versions, database snapshots, configs, and manual overrides in machine-readable logs. + +### End-to-end flow +``` +BAM (proband + parents) + ↓ Variant Calling (gVCF) [Auto] +Joint Genotyper → joint VCF + ↓ Annotation (VEP/ANNOVAR + ClinVar/gnomAD etc.) [Auto] + ↓ Genomic Store (VCF+tabix or SQL) + Query API [Auto] + ↓ + ├─ Disease/Phenotype → Gene Panel lookup [Auto+Review] + │ └─ Panel variants with basic ranking (freq, ClinVar) [Auto+Review] + ├─ ACMG evidence tagging subset (PVS1, PM2, BA1, BS1…) [Auto+Review] + ├─ PGx genotype→phenotype and recommendation rules [Auto → Auto+Review] + ├─ DDI rule evaluation [Auto] + └─ Supplement/Herb normalization + interaction rules [Auto+Review → Human-only] + ↓ +LLM/Orchestrator routes user questions to tools, produces JSON + Markdown drafts [Auto tools, Auto+Review narratives] +``` + +## Phase Roadmap (build-first view) +- **Phase 1 – Genomic foundation** + - Deliverables: trio joint VCF + annotation; query functions (`get_variants_by_gene/region`); disease→gene panel lookup; partial ACMG evidence tagging. + - Data stores: tabix-backed VCF wrapper initially; optional SQLite/Postgres import later. + - Interfaces: Python CLI/SDK first; machine-readable run logs with versions and automation levels. +- **Phase 2 – PGx & DDI** + - Drug vocabulary normalization (ATC/RxNorm). + - PGx engine: star-allele calling or rule-based genotype→phenotype; guideline-mapped advice with review gates. 
+ - DDI engine: rule base with severity tiers; combine with PGx outputs. +- **Phase 3 – Supplements & Herbs** + - Name/ingredient normalization; herb formula expansion. + - Rule tables for CYP/transporters, coagulation, CNS effects. + - Evidence grading and conservative messaging; human-only final clinical language. +- **Phase 4 – LLM Interface & Reports** + - Tool-calling schema for queries listed above. + - JSON + Markdown report templates with traceability to rules, data versions, and overrides. + +## Module Boundaries +- **Variant Calling Pipeline** (`Auto`): wrapper around GATK or DeepVariant + joint genotyper; pluggable reference genome; QC summaries. +- **Annotation Pipeline** (`Auto`): VEP/ANNOVAR with pinned database versions (gnomAD, ClinVar, transcript set); emits annotated VCF + flat table. +- **Genomic Query Layer** (`Auto`): abstraction over tabix or SQL; minimal APIs: `get_variants_by_gene`, `get_variants_by_region`, filters (freq, consequence, clinvar). +- **Disease/Phenotype to Panel** (`Auto+Review`): HPO/OMIM lookups or curated panels; panel versioned; feeds queries. +- **Phenotype Resolver** (`Auto+Review`): JSON/DB mapping of phenotype/HPO IDs to gene lists as a placeholder until upstream sources are integrated; can synthesize panels dynamically and merge multiple sources. +- **ACMG Evidence Tagger** (`Auto+Review`): auto-evaluable criteria only; config-driven thresholds; human-only final classification. +- **PGx Engine** (`Auto → Auto+Review`): star-allele calling where possible; guideline rules (CPIC/DPWG) with conservative defaults; flag items needing review. +- **DDI Engine** (`Auto`): rule tables keyed by normalized drug IDs; outputs severity and rationale. +- **Supplements/Herbs** (`Auto+Review → Human-only`): ingredient extraction + mapping; interaction rules; human sign-off for clinical language. +- **Orchestrator/LLM** (`Auto tools, Auto+Review outputs`): intent parsing, tool sequencing, safety guardrails, report drafting. 
+ +## Observability and Versioning +- Every pipeline run writes a JSON log: tool versions, reference genome, DB versions, config hashes, automation level per step, manual overrides (who/when/why). +- Reports embed references to those logs so outputs remain reproducible. +- Configs (ACMG thresholds, gene panels, PGx rules) are versioned artifacts stored alongside code. + +## Security/Privacy Notes +- Default to local processing; if external LLMs are used, strip identifiers and avoid full VCF uploads. +- Secrets kept out of repo; rely on environment variables or local config files (excluded by `.gitignore`). + +## Initial Tech Bets (to be validated) +- Language/runtime: Python 3.11+ for pipelines, rules, and orchestration stubs. +- Bio stack candidates: GATK or DeepVariant; VEP; tabix for early querying; SQLAlchemy + SQLite/Postgres when scaling. +- Infra: containerized runners for pipelines; makefiles or workflow engine (Nextflow/Snakemake) later if needed. diff --git a/genomic_decision_support_system_spec_v0.1.md b/genomic_decision_support_system_spec_v0.1.md new file mode 100644 index 0000000..59c7b27 --- /dev/null +++ b/genomic_decision_support_system_spec_v0.1.md @@ -0,0 +1,356 @@ + +# 個人基因風險與用藥交互作用決策支援系統 – 系統規格書 + +- 版本:v0.1-draft +- 作者:Gbanyan + 助理規劃 +- 狀態:草案,預期隨實作迭代 +- 目的:提供給 LLM(如 Claude、Codex 等)與開發者閱讀,作為系統設計與實作的基礎規格。 + +--- + +## 0. 系統目標與範圍 + +### 0.1 目標 + +建立一套「個人化、基因驅動」的決策支援系統,核心功能: + +1. **從個人與父母的外顯子定序資料(BAM)出發**,產生可查詢的變異資料庫。 +2. **針對特定疾病、症狀或表型**,自動查詢相關基因與已知致病變異,並在個人資料中搜尋對應變異。 +3. 依據 **ACMG/AMP 等準則** 與公開資料庫,給出 **機器輔助、具人工介入點的變異詮釋**。 +4. 進一步整合: + - 基因藥物學(Pharmacogenomics, PGx) + - 藥–藥交互作用(DDI) + - 保健食品、中藥等成分的潛在交互作用與風險 +5. 提供一個自然語言問答介面,使用者可直接問: + - 「我有沒有 XXX 的遺傳風險?」 + - 「以我現在吃的藥和保健食品,有沒有需要注意的交互作用?」 + +> 系統定位為:**個人決策支援工具 / 研究工具**,而非正式醫療診斷系統。 + +### 0.2 核心設計哲學 + +- **分階段發展**:先穩固「基因本體」與變異詮釋,再往 PGx 與交互作用擴充。 +- **明確的人機分工**:對每個模組標記 `Auto / Auto+Review / Human-only`。 +- **可追蹤、可回溯**:每一個結論都可追蹤到所用規則、資料庫版本、人工 override。 + +--- + +## 1. 
發展階段與整體架構 + +### 1.1 發展階段總覽 + +| 階段 | 名稱 | 核心產出 | 主要對象 | +|------|------|----------|----------| +| Phase 1 | 基因本體與變異詮釋基礎 | 個人+trio VCF、註解與疾病/基因查詢 | 單基因疾病風險 | +| Phase 2 | 基因藥物學與 DDI | 基因–藥物對應、藥–藥交互作用分析 | 精準用藥建議 | +| Phase 3 | 保健食品與中藥交互作用 | 成分標準化與交互作用風險層級 | 整體用藥+補充品安全網 | +| Phase 4 | NLP/LLM 問答介面 | 自然語言問答、報告生成 | 一般使用者 / 臨床對話 | + +### 1.2 高階架構圖(Mermaid) + +```mermaid +flowchart TD + +subgraph P1[Phase 1: 基因本體] + A1[BAM (本人+父母)] --> A2[Variant Calling Pipeline] + A2 --> A3[Joint VCF (Trio)] + A3 --> A4[Variant Annotation (ClinVar, gnomAD, VEP...)] + A4 --> A5[Genomic DB / Query API] +end + +subgraph P2[Phase 2: PGx & DDI] + B1[藥物清單] --> B2[PGx Engine] + A5 --> B2 + B1 --> B3[DDI Engine] +end + +subgraph P3[Phase 3: 補充品 & 中藥] + C1[保健食品/中藥清單] --> C2[成分標準化] + C2 --> C3[成分交互作用引擎] + B1 --> C3 + A5 --> C3 +end + +subgraph P4[Phase 4: 問答與報告] + D1[前端 UI / CLI / API Client] --> D2[LLM Orchestrator] + D2 -->|疾病/症狀詢問| A5 + D2 -->|用藥/成分詢問| B2 + D2 --> B3 + D2 --> C3 + A5 --> D3[報告產生器] + B2 --> D3 + B3 --> D3 + C3 --> D3 +end +``` + +--- + +## 2. 通用設計:人機分工標記 + +所有模組需標記自動化等級: + +- `Auto`:可完全自動執行的步驟(例:variant calling、基本註解)。 +- `Auto+Review`:系統先產生建議,需人工複核或有條件接受(例:ACMG 部分 evidence scoring)。 +- `Human-only`:最終醫療判斷/用語/管理建議,必須由人決策(例:最終 Pathogenic 分類、臨床處置建議)。 + +每次分析需生成一份 **machine-readable log**,紀錄: + +- 使用的模組與版本 +- 每一步的自動化等級 +- 哪些地方有人工 override(人員、時間、理由) + +--- + +## 3. Phase 1:基因本體與變異詮釋基礎 + +### 3.1 功能需求 + +1. **輸入** + - 本人與雙親外顯子定序 BAM 檔。 +2. **輸出** + - 高品質 joint VCF(含 trio) + - 每個變異的註解資訊: + - 基因、轉錄本、蛋白改變 + - 族群頻率(gnomAD 等) + - ClinVar 註解 + - 功能預測(SIFT/PolyPhen/CADD 等) + - 對特定疾病/基因清單的變異過濾結果。 +3. 
**對外服務** + - 以 API / 函式介面提供: + - 給定基因列表 → 回傳該個體在這些基因中的變異列表 + - 支援疾病名稱/HPO → 基因 → 變異的查詢流程(初期可分步呼叫) + +### 3.2 模組設計 + +#### 3.2.1 Variant Calling Pipeline + +- **輸入**:BAM(本人 + 父母) +- **輸出**:個別 gVCF → joint VCF +- **工具候選**: + - GATK(HaplotypeCaller + GenotypeGVCFs) + - 或 DeepVariant + joint genotyper +- **自動化等級**:`Auto` +- **需求**: + - 基本 QC(coverage、duplicate rate、on-target rate) + - 支援版本標記(如 reference genome 版本) + +#### 3.2.2 Annotation Pipeline + +- **輸入**:joint VCF +- **輸出**:annotated VCF / 變異表 +- **工具候選**: + - VEP、ANNOVAR 或類似工具 +- **資料庫**: + - ClinVar + - gnomAD + - 基因功能與轉錄本資料庫 +- **自動化等級**:`Auto` + +#### 3.2.3 Genomic DB / Query API + +- **目的**:提供高效查詢,作為後續模組(疾病風險、PGx 等)的基底。 +- **形式**: + - 選項 A:基於 VCF + tabix,以封裝函式操作 + - 選項 B:匯入 SQLite / PostgreSQL / 專用 genomic DB +- **關鍵查詢**: + - `get_variants_by_gene(individual_id, gene_list, filters)` + - `get_variants_by_region(individual_id, chr, start, end, filters)` +- **自動化等級**:`Auto` + +#### 3.2.4 疾病/表型 → 基因 → 變異流程 + +- 初期可拆成三步: + 1. 使用外部知識庫或手動 panel:疾病/表型 → 基因清單 + 2. 透過 Genomic DB 查詢個人變異 + 3. 以簡單規則(頻率、ClinVar 標註)做初步排序 +- **自動化等級**:`Auto+Review` + +### 3.3 ACMG 規則實作(初版) + +- **範圍**:僅實作部分機器可自動判定之 evidence(如 PVS1、PM2、BA1、BS1 等)。 +- **輸出**: + - 每個變異的 evidence tag 列表與建議分級(例如:`suggested_class = "VUS"`) +- **人工介入點**: + - 變異最終分類(Pathogenic / Likely pathogenic / VUS / Likely benign / Benign) → `Human-only` + - 規則閾值(如頻率 cutoff)以 config 檔管理 → `Auto+Review` + +--- + +## 4. Phase 2:基因藥物學(PGx)與藥–藥交互作用(DDI) + +### 4.1 功能需求 + +1. 接收使用者目前用藥清單(處方藥、成藥)。 +2. 透過基因資料,判定與 PGx 相關的 genotype(例如 CYP2D6, CYP2C9, HLA 等)。 +3. 根據 CPIC / DPWG 等指南,給出: + - 適應症相關風險(如 HLA-B*58:01 與 allopurinol) + - 劑量調整建議 / 藥物替代建議(僅 decision-support 層級) +4. 
計算基礎藥–藥交互作用(DDI),例如: + - CYP 抑制 / 誘導疊加 + - QT prolongation 疊加 + - 出血風險疊加 + +### 4.2 模組設計 + +#### 4.2.1 用藥資料標準化 + +- 使用 ATC / RxNorm / 自訂 ID。 +- **自動化等級**:`Auto` + +#### 4.2.2 PGx Engine + +- **輸入**:個人變異(Phase 1 DB)、藥物清單 +- **輸出**:每個藥物的 PGx 評估(genotype → phenotype → 建議) +- **資料庫**: + - CPIC guidelines + - PharmGKB 關聯資料 +- **自動化等級**: + - genotype → phenotype:`Auto` + - phenotype → 臨床建議:`Auto+Review` + +#### 4.2.3 DDI Engine + +- **輸入**:藥物清單 +- **輸出**:已知 DDI 清單與嚴重程度分級 +- **資料來源**:公開或商用 DDI 資料庫(視可用性) +- **自動化等級**:`Auto` + +--- + +## 5. Phase 3:保健食品與中藥交互作用模組 + +### 5.1 功能需求 + +1. 接收使用者的保健食品與中藥使用資料。 +2. 將名稱解析為: + - 標準化有效成分(如 EPA/DHA mg、Vit D IU、銀杏葉萃取物 mg 等) + - 中藥材名稱(如 黃耆、當歸、川芎…) +3. 評估: + - 成分與藥物、基因的交互作用風險 + - 成分間的加乘作用(如抗凝、CNS 抑制等) +4. 按證據等級給出: + - 高優先級警示(有較強臨床證據) + - 一般提醒(動物實驗 / case report 等) + - 資料不足,僅能提醒不確定性 + +### 5.2 模組設計 + +#### 5.2.1 成分標準化引擎 + +- **輸入**:使用者輸入的品名 / 處方 +- **輸出**: + - 標準化成分列表 + - 估計劑量範圍(若無精確資料) +- **資料**: + - 保健食品常用成分資料表 + - 中藥方與藥材對應表 +- **自動化等級**:`Auto+Review` + +#### 5.2.2 成分交互作用引擎 + +- **輸入**:成分列表、藥物清單、基因資料 +- **輸出**:交互作用列表與風險層級 +- **邏輯**: + - 成分對 CYP / P-gp / OATP 等的影響 + - 成分對凝血、血壓、中樞神經等系統的影響 +- **自動化等級**: + - 規則推論:`Auto` + - 最終臨床建議表述:`Human-only` + +--- + +## 6. Phase 4:NLP/LLM 問答介面與報告生成 + +### 6.1 功能需求 + +1. 支援使用者以自然語言提問: + - 疾病/症狀相關風險 + - 用藥安全性 + - 保健食品、中藥併用風險 +2. LLM 負責: + - 問題解析 → 結構化查詢(疾病、HPO、藥物、成分等) + - 協調呼叫底層 API(Phase 1–3) + - 整合結果並生成報告草稿 +3. 報告形式: + - 機器可讀 JSON(便於後處理) + - 人類可讀 Markdown / PDF 報告 + +### 6.2 Orchestration 設計 + +- 可採用「LLM + Tool/Function Calling」模式: + - 工具包括: + - `query_variants_by_gene` + - `query_disease_gene_panel` + - `run_pgx_analysis` + - `run_ddi_analysis` + - `run_supplement_herb_interaction` +- LLM 主要負責: + - 意圖辨識與拆解 + - 工具呼叫順序規劃 + - 結果解釋與用語調整(需符合安全與保守原則) +- **自動化等級**: + - 工具呼叫:`Auto` + - 臨床敏感結論:`Auto+Review` / `Human-only`(視場景而定) + +--- + +## 7. 
安全性、隱私與版本管理 + +### 7.1 資料安全與隱私 + +- 所有基因資料、用藥清單、報告: + - 儲存於本地或受控環境 + - 若需與外部服務(如雲端 LLM)互動,需: + - 做脫敏處理(移除個資) + - 或改用 local/私有 LLM + +### 7.2 版本管理 + +- 對以下物件進行版本控制: + - 參考基因組版本 + - variant calling pipeline 版本 + - 資料庫版本(ClinVar、gnomAD 等) + - ACMG 規則 config 版本 + - gene panel / PGx 規則版本 +- 每份分析報告需記錄所用版本,以利追蹤與重跑。 + +### 7.3 人工介入紀錄 + +- 每次人工 override 或審核需紀錄: + - 變異 ID / 分析項目 + - 原自動建議 + - 人工調整結果 + - 理由與參考文獻(如有) + - 審核者與時間 + +--- + +## 8. 未來擴充方向(Optional) + +- 整合 polygenic risk score(PRS)模組 +- 整合 longitudinal data(實驗室數據、症狀日誌)做風險動態追蹤 +- 為特定疾病領域建立更深的 expert-curated knowledge base +- 與可穿戴裝置/其他健康資料源整合 + +--- + +## 9. 第一階段實作建議路線(Actionable TODO) + +1. **規劃 Phase 1 的技術選型** + - 選擇 variant caller(如 GATK)與 reference genome 版本 + - 選擇 annotation 工具(如 VEP 或 ANNOVAR) +2. **建立基本 pipeline** + - BAM → gVCF → joint VCF(trio) + - 加上基本 QC 報表 +3. **建置簡單的 Genomic Query 介面** + - 先以 CLI/Notebook 函式為主(例如 Python 函式庫) +4. **選一個你最關心的疾病領域** + - 建立第一個 gene panel(例如視覺/聽力相關) + - 實作 panel-based 查詢與變異列表輸出 +5. **撰寫第一版報告模板** + - 輸入:疾病名稱 + gene panel + 查詢結果 + - 輸出:簡易 Markdown 報告(含變異表 + 限制說明) +6. 
**逐步加入 ACMG 自動 evidence 標記與人工 review 流程** + +這個規格書預期會在實作過程中持續更新,可視此為 v0.1 的起點版本。 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..bc6626c --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,31 @@ +[build-system] +requires = ["setuptools>=61", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "genomic-consultant" +version = "0.1.0" +description = "Personal genomic risk and drug–interaction decision support scaffolding" +readme = "README.md" +requires-python = ">=3.11" +dependencies = [ + "pyyaml>=6", +] + +[project.scripts] +genomic-consultant = "genomic_consultant.cli:main" + +[project.optional-dependencies] +dev = [ + "pytest", + "ruff", +] +store = [ + "pandas>=2", +] + +[tool.setuptools] +package-dir = {"" = "src"} + +[tool.setuptools.packages.find] +where = ["src"] diff --git a/sample_data/example_annotated.tsv b/sample_data/example_annotated.tsv new file mode 100644 index 0000000..949c9e4 --- /dev/null +++ b/sample_data/example_annotated.tsv @@ -0,0 +1,3 @@ +#CHROM POS REF ALT SYMBOL Consequence Protein_position PolyPhen SIFT CLIN_SIG AF gnomAD_AF SpliceAI CADD_PHRED +1 123456 A T GJB2 missense_variant p.Val37Ile benign tolerated Benign 0.012 0.012 0.02 5.1 +2 234567 G C OTOF stop_gained p.* probably_damaging deleterious Uncertain_significance 0.0001 0.0001 0.6 28.5 diff --git a/src/genomic_consultant.egg-info/PKG-INFO b/src/genomic_consultant.egg-info/PKG-INFO new file mode 100644 index 0000000..a40d06f --- /dev/null +++ b/src/genomic_consultant.egg-info/PKG-INFO @@ -0,0 +1,63 @@ +Metadata-Version: 2.4 +Name: genomic-consultant +Version: 0.1.0 +Summary: Personal genomic risk and drug–interaction decision support scaffolding +Requires-Python: >=3.11 +Description-Content-Type: text/markdown +Requires-Dist: pyyaml>=6 +Provides-Extra: dev +Requires-Dist: pytest; extra == "dev" +Requires-Dist: ruff; extra == "dev" + +# Genomic Consultant + +Early design for a personal genomic risk and drug–interaction decision support 
system. Specs are sourced from `genomic_decision_support_system_spec_v0.1.md`. + +## Vision (per spec) +- Phase 1: trio variant calling, annotation, queryable genomic DB, initial ACMG evidence tagging. +- Phase 2: pharmacogenomics genotype-to-phenotype mapping plus drug–drug interaction checks. +- Phase 3: supplement/herb normalization and interaction risk layering. +- Phase 4: LLM-driven query orchestration and report generation. + +## Repository Layout +- `docs/` — system architecture notes, phase plans, data models (work in progress). +- `configs/` — example ACMG config and gene panel JSON. +- `sample_data/` — tiny annotated TSV for demo. +- `src/genomic_consultant/` — Python scaffolding (pipelines, store, panel lookup, ACMG tagging, reporting). +- `genomic_decision_support_system_spec_v0.1.md` — original requirements draft. + +## Contributing/next steps +1. Finalize Phase 1 tech selection (variant caller, annotation stack, reference/DB versions). +2. Stand up the Phase 1 pipelines and minimal query API surface. +3. Add ACMG evidence tagging config and human-review logging. +4. Layer in PGx/DDI and supplement modules per later phases. + +Data safety: keep genomic/clinical data local; the `.gitignore` blocks common genomic outputs by default. + +## Quickstart (CLI scaffolding) +``` +pip install -e . 
+ +# 1) Show trio calling plan (commands only; not executed) +genomic-consultant plan-call \ + --sample proband:/data/proband.bam \ + --sample father:/data/father.bam \ + --sample mother:/data/mother.bam \ + --reference /refs/GRCh38.fa \ + --workdir /tmp/trio + +# 2) Show annotation plan for a joint VCF +genomic-consultant plan-annotate \ + --vcf /tmp/trio/trio.joint.vcf.gz \ + --workdir /tmp/trio/annot \ + --prefix trio \ + --reference /refs/GRCh38.fa + +# 3) Demo panel report using sample data +genomic-consultant panel-report \ + --tsv sample_data/example_annotated.tsv \ + --panel configs/panel.example.json \ + --acmg-config configs/acmg_config.example.yaml \ + --individual-id demo \ + --format markdown +``` diff --git a/src/genomic_consultant.egg-info/SOURCES.txt b/src/genomic_consultant.egg-info/SOURCES.txt new file mode 100644 index 0000000..bc5a276 --- /dev/null +++ b/src/genomic_consultant.egg-info/SOURCES.txt @@ -0,0 +1,27 @@ +README.md +pyproject.toml +src/genomic_consultant/__init__.py +src/genomic_consultant/cli.py +src/genomic_consultant.egg-info/PKG-INFO +src/genomic_consultant.egg-info/SOURCES.txt +src/genomic_consultant.egg-info/dependency_links.txt +src/genomic_consultant.egg-info/entry_points.txt +src/genomic_consultant.egg-info/requires.txt +src/genomic_consultant.egg-info/top_level.txt +src/genomic_consultant/acmg/__init__.py +src/genomic_consultant/acmg/tagger.py +src/genomic_consultant/audit/__init__.py +src/genomic_consultant/audit/run_log.py +src/genomic_consultant/orchestration/__init__.py +src/genomic_consultant/orchestration/workflows.py +src/genomic_consultant/panels/__init__.py +src/genomic_consultant/panels/panels.py +src/genomic_consultant/pipelines/__init__.py +src/genomic_consultant/pipelines/annotation.py +src/genomic_consultant/pipelines/variant_calling.py +src/genomic_consultant/reporting/__init__.py +src/genomic_consultant/reporting/report.py +src/genomic_consultant/store/__init__.py +src/genomic_consultant/store/query.py 
+src/genomic_consultant/utils/__init__.py +src/genomic_consultant/utils/models.py \ No newline at end of file diff --git a/src/genomic_consultant.egg-info/dependency_links.txt b/src/genomic_consultant.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/genomic_consultant.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/src/genomic_consultant.egg-info/entry_points.txt b/src/genomic_consultant.egg-info/entry_points.txt new file mode 100644 index 0000000..3e063e0 --- /dev/null +++ b/src/genomic_consultant.egg-info/entry_points.txt @@ -0,0 +1,2 @@ +[console_scripts] +genomic-consultant = genomic_consultant.cli:main diff --git a/src/genomic_consultant.egg-info/requires.txt b/src/genomic_consultant.egg-info/requires.txt new file mode 100644 index 0000000..47e6320 --- /dev/null +++ b/src/genomic_consultant.egg-info/requires.txt @@ -0,0 +1,5 @@ +pyyaml>=6 + +[dev] +pytest +ruff diff --git a/src/genomic_consultant.egg-info/top_level.txt b/src/genomic_consultant.egg-info/top_level.txt new file mode 100644 index 0000000..bf22515 --- /dev/null +++ b/src/genomic_consultant.egg-info/top_level.txt @@ -0,0 +1 @@ +genomic_consultant diff --git a/src/genomic_consultant/__init__.py b/src/genomic_consultant/__init__.py new file mode 100644 index 0000000..7b5aebb --- /dev/null +++ b/src/genomic_consultant/__init__.py @@ -0,0 +1,4 @@ +"""Genomic Consultant: genomic decision support scaffolding.""" + +__all__ = ["__version__"] +__version__ = "0.1.0" diff --git a/src/genomic_consultant/acmg/__init__.py b/src/genomic_consultant/acmg/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/genomic_consultant/acmg/__init__.py @@ -0,0 +1 @@ + diff --git a/src/genomic_consultant/acmg/tagger.py b/src/genomic_consultant/acmg/tagger.py new file mode 100644 index 0000000..bcb7ff5 --- /dev/null +++ b/src/genomic_consultant/acmg/tagger.py @@ -0,0 +1,91 @@ +from __future__ import annotations + +from dataclasses import 
dataclass +from pathlib import Path +from typing import List, Set + +import yaml + +from genomic_consultant.utils.models import EvidenceTag, SuggestedClassification, Variant + + +@dataclass +class ACMGConfig: + ba1_af: float = 0.05 + bs1_af: float = 0.01 + pm2_af: float = 0.0005 + lof_genes: Set[str] | None = None + bp7_splice_ai_max: float = 0.1 + + +def load_acmg_config(path: Path) -> ACMGConfig: + data = yaml.safe_load(Path(path).read_text()) + return ACMGConfig( + ba1_af=data.get("ba1_af", 0.05), + bs1_af=data.get("bs1_af", 0.01), + pm2_af=data.get("pm2_af", 0.0005), + lof_genes=set(data.get("lof_genes", [])) if data.get("lof_genes") else set(), + bp7_splice_ai_max=data.get("bp7_splice_ai_max", 0.1), + ) + + +def tag_variant(variant: Variant, config: ACMGConfig) -> SuggestedClassification: + evidence: List[EvidenceTag] = [] + af = variant.allele_frequency + + if af is not None: + if af >= config.ba1_af: + evidence.append(EvidenceTag(tag="BA1", strength="Stand-alone", rationale=f"AF {af} >= {config.ba1_af}")) + elif af >= config.bs1_af: + evidence.append(EvidenceTag(tag="BS1", strength="Strong", rationale=f"AF {af} >= {config.bs1_af}")) + elif af <= config.pm2_af: + evidence.append(EvidenceTag(tag="PM2", strength="Moderate", rationale=f"AF {af} <= {config.pm2_af}")) + + if _is_lof(variant) and variant.gene and variant.gene in config.lof_genes: + evidence.append( + EvidenceTag(tag="PVS1", strength="Very strong", rationale="Predicted LoF in LoF-sensitive gene") + ) + + splice_ai = _get_float(variant.annotations.get("splice_ai_delta_score")) + if _is_synonymous(variant) and (splice_ai is None or splice_ai <= config.bp7_splice_ai_max): + evidence.append( + EvidenceTag( + tag="BP7", + strength="Supporting", + rationale=f"Synonymous with low predicted splice impact (spliceAI {splice_ai})", + ) + ) + + suggested = _suggest_class(evidence) + return SuggestedClassification(suggested_class=suggested, evidence=evidence) + + +def _is_lof(variant: Variant) -> bool: + 
consequence = (variant.consequence or "").lower() + lof_keywords = ["frameshift", "stop_gained", "splice_acceptor", "splice_donor", "start_lost"] + return any(k in consequence for k in lof_keywords) + + +def _suggest_class(evidence: List[EvidenceTag]) -> str: + tags = {e.tag for e in evidence} + if "BA1" in tags: + return "Benign" + if "BS1" in tags and "PM2" not in tags and "PVS1" not in tags: + return "Likely benign" + if "PVS1" in tags and "PM2" in tags: + return "Likely pathogenic" + return "VUS" + + +def _is_synonymous(variant: Variant) -> bool: + consequence = (variant.consequence or "").lower() + return "synonymous_variant" in consequence + + +def _get_float(value: str | float | None) -> float | None: + if value is None: + return None + try: + return float(value) + except (TypeError, ValueError): + return None diff --git a/src/genomic_consultant/audit/__init__.py b/src/genomic_consultant/audit/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/genomic_consultant/audit/__init__.py @@ -0,0 +1 @@ + diff --git a/src/genomic_consultant/audit/run_log.py b/src/genomic_consultant/audit/run_log.py new file mode 100644 index 0000000..fabbaa7 --- /dev/null +++ b/src/genomic_consultant/audit/run_log.py @@ -0,0 +1,30 @@ +from __future__ import annotations + +import json +from dataclasses import asdict, is_dataclass +from datetime import datetime +from pathlib import Path +from typing import Any + +from genomic_consultant.utils.models import RunLog + + +def _serialize(obj: Any) -> Any: + if isinstance(obj, datetime): + return obj.isoformat() + if is_dataclass(obj): + return {k: _serialize(v) for k, v in asdict(obj).items()} + if isinstance(obj, dict): + return {k: _serialize(v) for k, v in obj.items()} + if isinstance(obj, list): + return [_serialize(v) for v in obj] + return obj + + +def write_run_log(run_log: RunLog, path: str | Path) -> Path: + """Persist a RunLog to JSON.""" + path = Path(path) + path.parent.mkdir(parents=True, 
exist_ok=True) + payload = _serialize(run_log) + path.write_text(json.dumps(payload, indent=2)) + return path diff --git a/src/genomic_consultant/cli.py b/src/genomic_consultant/cli.py new file mode 100644 index 0000000..2963e89 --- /dev/null +++ b/src/genomic_consultant/cli.py @@ -0,0 +1,273 @@ +from __future__ import annotations + +import argparse +import sys +from datetime import datetime +from pathlib import Path +from typing import Dict, List + +from genomic_consultant.acmg.tagger import ACMGConfig, load_acmg_config +from genomic_consultant.orchestration.workflows import run_panel_variant_review +from genomic_consultant.panels.aggregate import merge_mappings +from genomic_consultant.panels.panels import load_panel +from genomic_consultant.panels.resolver import PhenotypeGeneResolver +from genomic_consultant.pipelines.annotation import build_vep_plan +from genomic_consultant.pipelines.variant_calling import build_gatk_trio_plan +from genomic_consultant.pipelines.runner import execute_plan +from genomic_consultant.reporting.report import panel_report_json, panel_report_markdown +from genomic_consultant.orchestration.phase1_pipeline import run_phase1_pipeline +from genomic_consultant.store.query import GenomicStore +from genomic_consultant.utils.hashing import sha256sum +from genomic_consultant.utils.models import FilterConfig +from genomic_consultant.utils.tooling import probe_tool_versions + + +def parse_samples(sample_args: List[str]) -> Dict[str, Path]: + samples: Dict[str, Path] = {} + for arg in sample_args: + if ":" not in arg: + raise ValueError(f"Sample must be sample_id:/path/to.bam, got {arg}") + sample_id, bam_path = arg.split(":", 1) + samples[sample_id] = Path(bam_path) + return samples + + +def main(argv: List[str] | None = None) -> int: + parser = argparse.ArgumentParser(prog="genomic-consultant", description="Genomic decision support scaffolding") + sub = parser.add_subparsers(dest="command", required=True) + + # Variant calling plan + call = 
sub.add_parser("plan-call", help="Build variant calling command plan (GATK trio).") + call.add_argument("--sample", action="append", required=True, help="sample_id:/path/to.bam (repeatable)") + call.add_argument("--reference", required=True, help="Path to reference FASTA") + call.add_argument("--workdir", required=True, help="Working directory for outputs") + call.add_argument("--prefix", default="trio", help="Output prefix for joint VCF") + + run_call = sub.add_parser("run-call", help="Execute variant calling plan (GATK trio).") + run_call.add_argument("--sample", action="append", required=True, help="sample_id:/path/to.bam (repeatable)") + run_call.add_argument("--reference", required=True, help="Path to reference FASTA") + run_call.add_argument("--workdir", required=True, help="Working directory for outputs") + run_call.add_argument("--prefix", default="trio", help="Output prefix for joint VCF") + run_call.add_argument("--log", required=False, help="Path to write run log JSON") + run_call.add_argument("--probe-tools", action="store_true", help="Attempt to record tool versions") + + # Annotation plan + ann = sub.add_parser("plan-annotate", help="Build annotation command plan (VEP).") + ann.add_argument("--vcf", required=True, help="Path to joint VCF") + ann.add_argument("--workdir", required=True, help="Working directory for annotated outputs") + ann.add_argument("--prefix", default="annotated", help="Output prefix") + ann.add_argument("--reference", required=False, help="Reference FASTA (optional)") + ann.add_argument("--plugin", action="append", help="VEP plugin spec, repeatable", default=[]) + + run_ann = sub.add_parser("run-annotate", help="Execute annotation plan (VEP).") + run_ann.add_argument("--vcf", required=True, help="Path to joint VCF") + run_ann.add_argument("--workdir", required=True, help="Working directory for annotated outputs") + run_ann.add_argument("--prefix", default="annotated", help="Output prefix") + run_ann.add_argument("--reference", 
required=False, help="Reference FASTA (optional)") + run_ann.add_argument("--plugin", action="append", help="VEP plugin spec, repeatable", default=[]) + run_ann.add_argument("--extra-flag", action="append", help="Extra flags appended to VEP command", default=[]) + run_ann.add_argument("--log", required=False, help="Path to write run log JSON") + run_ann.add_argument("--probe-tools", action="store_true", help="Attempt to record tool versions") + + # Panel report + panel = sub.add_parser("panel-report", help="Run panel query + ACMG tagging and emit report.") + panel.add_argument("--tsv", required=True, help="Flattened annotated TSV") + panel.add_argument("--panel", required=False, help="Panel JSON file") + panel.add_argument("--phenotype-id", required=False, help="Phenotype/HPO ID to resolve to a panel") + panel.add_argument("--phenotype-mapping", required=False, help="Phenotype→genes mapping JSON") + panel.add_argument("--acmg-config", required=True, help="ACMG config YAML") + panel.add_argument("--individual-id", required=True, help="Individual identifier") + panel.add_argument("--max-af", type=float, default=None, help="Max allele frequency filter") + panel.add_argument("--format", choices=["markdown", "json"], default="markdown", help="Output format") + panel.add_argument("--log", required=False, help="Path to write run log JSON for the analysis") + panel.add_argument("--phenotype-panel", required=False, help="Phenotype→genes mapping JSON (optional)") + + phase1 = sub.add_parser("phase1-run", help="End-to-end Phase 1 pipeline (call→annotate→panel).") + phase1.add_argument("--sample", action="append", help="sample_id:/path/to.bam (repeatable)") + phase1.add_argument("--reference", required=False, help="Path to reference FASTA") + phase1.add_argument("--workdir", required=True, help="Working directory for outputs") + phase1.add_argument("--prefix", default="trio", help="Output prefix") + phase1.add_argument("--plugins", action="append", default=[], help="VEP plugin 
specs") + phase1.add_argument("--extra-flag", action="append", default=[], help="Extra flags for VEP") + phase1.add_argument("--joint-vcf", required=False, help="Existing joint VCF (skip calling)") + phase1.add_argument("--tsv", required=False, help="Existing annotated TSV (skip annotation)") + phase1.add_argument("--skip-call", action="store_true", help="Skip variant calling step") + phase1.add_argument("--skip-annotate", action="store_true", help="Skip annotation step") + phase1.add_argument("--panel", required=False, help="Panel JSON file") + phase1.add_argument("--phenotype-id", required=False, help="Phenotype/HPO ID to resolve to a panel") + phase1.add_argument("--phenotype-mapping", required=False, help="Phenotype→genes mapping JSON") + phase1.add_argument("--acmg-config", required=True, help="ACMG config YAML") + phase1.add_argument("--max-af", type=float, default=None, help="Max allele frequency filter") + phase1.add_argument("--format", choices=["markdown", "json"], default="markdown", help="Report format") + phase1.add_argument("--log-dir", required=False, help="Directory to write run logs") + + build_map = sub.add_parser("build-phenotype-mapping", help="Merge phenotype→gene mapping JSON files.") + build_map.add_argument("--output", required=True, help="Output JSON path") + build_map.add_argument("inputs", nargs="+", help="Input mapping JSON files") + + args = parser.parse_args(argv) + + if args.command == "plan-call": + samples = parse_samples(args.sample) + plan = build_gatk_trio_plan( + samples=samples, + reference_fasta=Path(args.reference), + workdir=Path(args.workdir), + output_prefix=args.prefix, + ) + print("# Variant calling command plan") + for cmd in plan.commands: + print(cmd) + return 0 + + if args.command == "run-call": + samples = parse_samples(args.sample) + plan = build_gatk_trio_plan( + samples=samples, + reference_fasta=Path(args.reference), + workdir=Path(args.workdir), + output_prefix=args.prefix, + ) + tool_versions = 
probe_tool_versions({"gatk": "gatk"}) if args.probe_tools else {} + run_log = execute_plan( + plan, automation_level="Auto", log_path=Path(args.log) if args.log else None + ) + run_log.tool_versions.update(tool_versions) + if args.log: + from genomic_consultant.audit.run_log import write_run_log + + write_run_log(run_log, Path(args.log)) + print(f"Run finished with {len(run_log.outputs.get('command_results', []))} steps. Log ID: {run_log.run_id}") + if args.log: + print(f"Run log written to {args.log}") + return 0 + + if args.command == "plan-annotate": + plan = build_vep_plan( + vcf_path=Path(args.vcf), + workdir=Path(args.workdir), + reference_fasta=Path(args.reference) if args.reference else None, + output_prefix=args.prefix, + plugins=args.plugin, + ) + print("# Annotation command plan") + for cmd in plan.commands: + print(cmd) + return 0 + + if args.command == "run-annotate": + plan = build_vep_plan( + vcf_path=Path(args.vcf), + workdir=Path(args.workdir), + reference_fasta=Path(args.reference) if args.reference else None, + output_prefix=args.prefix, + plugins=args.plugin, + extra_flags=args.extra_flag, + ) + tool_versions = probe_tool_versions({"vep": "vep", "bcftools": "bcftools", "tabix": "tabix"}) if args.probe_tools else {} + run_log = execute_plan( + plan, automation_level="Auto", log_path=Path(args.log) if args.log else None + ) + run_log.tool_versions.update(tool_versions) + if args.log: + from genomic_consultant.audit.run_log import write_run_log + + write_run_log(run_log, Path(args.log)) + print(f"Run finished with {len(run_log.outputs.get('command_results', []))} steps. 
Log ID: {run_log.run_id}") + if args.log: + print(f"Run log written to {args.log}") + return 0 + + if args.command == "panel-report": + if not args.panel and not (args.phenotype_id and args.phenotype_mapping): + raise SystemExit("Provide either --panel or (--phenotype-id and --phenotype-mapping).") + + store = GenomicStore.from_tsv(Path(args.tsv)) + if args.panel: + panel_obj = load_panel(Path(args.panel)) + panel_config_hash = sha256sum(Path(args.panel)) + else: + resolver = PhenotypeGeneResolver.from_json(Path(args.phenotype_mapping)) + panel_obj = resolver.build_panel(args.phenotype_id) + if panel_obj is None: + raise SystemExit(f"No genes found for phenotype {args.phenotype_id}") + panel_config_hash = sha256sum(Path(args.phenotype_mapping)) + + acmg_config = load_acmg_config(Path(args.acmg_config)) + filters = FilterConfig(max_af=args.max_af) + result = run_panel_variant_review( + individual_id=args.individual_id, + panel=panel_obj, + store=store, + acmg_config=acmg_config, + filters=filters, + ) + output = panel_report_json(result) if args.format == "json" else panel_report_markdown(result) + print(output) + + if args.log: + from genomic_consultant.audit.run_log import write_run_log + from genomic_consultant.utils.models import RunLog + run_log = RunLog( + run_id=f"panel-{args.individual_id}", + started_at=datetime.utcnow(), + inputs={ + "tsv": str(args.tsv), + "panel": str(args.panel) if args.panel else f"phenotype:{args.phenotype_id}", + "acmg_config": str(args.acmg_config), + }, + parameters={ + "max_af": args.max_af, + "format": args.format, + "phenotype_id": args.phenotype_id, + }, + tool_versions={}, + database_versions={}, + config_hashes={ + "panel": panel_config_hash, + "acmg_config": sha256sum(Path(args.acmg_config)), + }, + automation_levels={"panel_report": "Auto+Review"}, + overrides=[], + outputs={"report": output}, + notes=None, + ) + write_run_log(run_log, Path(args.log)) + print(f"Analysis log written to {args.log}") + return 0 + + if 
args.command == "build-phenotype-mapping": + merge_mappings(inputs=[Path(p) for p in args.inputs], output=Path(args.output)) + print(f"Merged mapping written to {args.output}") + return 0 + + if args.command == "phase1-run": + samples = parse_samples(args.sample) if args.sample else None + log_dir = Path(args.log_dir) if args.log_dir else None + artifacts = run_phase1_pipeline( + samples=samples, + reference_fasta=Path(args.reference) if args.reference else None, + workdir=Path(args.workdir), + output_prefix=args.prefix, + acmg_config_path=Path(args.acmg_config), + max_af=args.max_af, + panel_path=Path(args.panel) if args.panel else None, + phenotype_id=args.phenotype_id, + phenotype_mapping=Path(args.phenotype_mapping) if args.phenotype_mapping else None, + report_format=args.format, + plugins=args.plugins, + extra_flags=args.extra_flag, + existing_joint_vcf=Path(args.joint_vcf) if args.joint_vcf else None, + existing_tsv=Path(args.tsv) if args.tsv else None, + skip_call=args.skip_call, + skip_annotate=args.skip_annotate, + log_dir=log_dir, + ) + print(artifacts.panel_report) + return 0 + + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/genomic_consultant/orchestration/__init__.py b/src/genomic_consultant/orchestration/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/genomic_consultant/orchestration/__init__.py @@ -0,0 +1 @@ + diff --git a/src/genomic_consultant/orchestration/phase1_pipeline.py b/src/genomic_consultant/orchestration/phase1_pipeline.py new file mode 100644 index 0000000..e4b754c --- /dev/null +++ b/src/genomic_consultant/orchestration/phase1_pipeline.py @@ -0,0 +1,146 @@ +from __future__ import annotations + +import uuid +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List, Optional + +from datetime import datetime, timezone + +from genomic_consultant.acmg.tagger import ACMGConfig, load_acmg_config +from genomic_consultant.audit.run_log import 
write_run_log +from genomic_consultant.orchestration.workflows import run_panel_variant_review +from genomic_consultant.panels.panels import load_panel +from genomic_consultant.panels.resolver import PhenotypeGeneResolver +from genomic_consultant.pipelines.annotation import AnnotationPlan, build_vep_plan +from genomic_consultant.pipelines.runner import execute_plan +from genomic_consultant.pipelines.variant_calling import VariantCallingPlan, build_gatk_trio_plan +from genomic_consultant.reporting.report import panel_report_json, panel_report_markdown +from genomic_consultant.store.query import GenomicStore +from genomic_consultant.utils.hashing import sha256sum +from genomic_consultant.utils.models import FilterConfig, RunLog + + +@dataclass +class Phase1Artifacts: + call_log: Optional[RunLog] + annotate_log: Optional[RunLog] + panel_report: str + panel_report_format: str + panel_result_log: RunLog + tsv_path: Path + joint_vcf: Optional[Path] + + +def run_phase1_pipeline( + samples: Optional[Dict[str, Path]], + reference_fasta: Optional[Path], + workdir: Path, + output_prefix: str, + acmg_config_path: Path, + max_af: Optional[float], + panel_path: Optional[Path], + phenotype_id: Optional[str], + phenotype_mapping: Optional[Path], + report_format: str = "markdown", + plugins: Optional[List[str]] = None, + extra_flags: Optional[List[str]] = None, + existing_joint_vcf: Optional[Path] = None, + existing_tsv: Optional[Path] = None, + skip_call: bool = False, + skip_annotate: bool = False, + log_dir: Optional[Path] = None, +) -> Phase1Artifacts: + """ + Orchestrate Phase 1: optional call -> annotate -> panel report. + Allows skipping call/annotate when precomputed artifacts are supplied. 
+ """ + log_dir = log_dir or workdir / "runtime" + log_dir.mkdir(parents=True, exist_ok=True) + call_log = None + annotate_log = None + + # Variant calling + joint_vcf: Optional[Path] = existing_joint_vcf + if not skip_call: + if not samples or not reference_fasta: + raise ValueError("samples and reference_fasta are required unless skip_call is True") + call_plan: VariantCallingPlan = build_gatk_trio_plan( + samples=samples, reference_fasta=reference_fasta, workdir=workdir, output_prefix=output_prefix + ) + call_log_path = log_dir / f"{output_prefix}_call_runlog.json" + call_log = execute_plan(call_plan, automation_level="Auto", log_path=call_log_path) + joint_vcf = call_plan.joint_vcf + elif joint_vcf is None: + joint_vcf = existing_joint_vcf + + # Annotation + tsv_path: Optional[Path] = existing_tsv + if not skip_annotate: + if joint_vcf is None: + raise ValueError("joint VCF must be provided (via call step or existing_joint_vcf)") + ann_plan: AnnotationPlan = build_vep_plan( + vcf_path=joint_vcf, + workdir=workdir, + reference_fasta=reference_fasta, + output_prefix=output_prefix, + plugins=plugins or [], + extra_flags=extra_flags or [], + ) + ann_log_path = log_dir / f"{output_prefix}_annotate_runlog.json" + annotate_log = execute_plan(ann_plan, automation_level="Auto", log_path=ann_log_path) + tsv_path = ann_plan.flat_table + + if tsv_path is None: + raise ValueError("No TSV available; provide existing_tsv or run annotation.") + + # Panel selection + if panel_path: + panel_obj = load_panel(panel_path) + panel_hash = sha256sum(panel_path) + elif phenotype_id and phenotype_mapping: + resolver = PhenotypeGeneResolver.from_json(phenotype_mapping) + panel_obj = resolver.build_panel(phenotype_id) + if panel_obj is None: + raise ValueError(f"No genes found for phenotype {phenotype_id}") + panel_hash = sha256sum(phenotype_mapping) + else: + raise ValueError("Provide panel_path or (phenotype_id and phenotype_mapping).") + + acmg_config = 
load_acmg_config(acmg_config_path) + store = GenomicStore.from_tsv(tsv_path) + filters = FilterConfig(max_af=max_af) + panel_result = run_panel_variant_review( + individual_id=output_prefix, panel=panel_obj, store=store, acmg_config=acmg_config, filters=filters + ) + report = panel_report_markdown(panel_result) if report_format == "markdown" else panel_report_json(panel_result) + + panel_log = RunLog( + run_id=f"phase1-panel-{uuid.uuid4()}", + started_at=datetime.now(timezone.utc), + inputs={ + "tsv": str(tsv_path), + "panel": str(panel_path) if panel_path else f"phenotype:{phenotype_id}", + "acmg_config": str(acmg_config_path), + }, + parameters={"max_af": max_af, "report_format": report_format, "phenotype_id": phenotype_id}, + tool_versions={}, + database_versions={}, + config_hashes={"panel": panel_hash, "acmg_config": sha256sum(acmg_config_path)}, + automation_levels={"panel_report": "Auto+Review"}, + overrides=[], + outputs={"report": report}, + notes=None, + ) + panel_log_path = log_dir / f"{output_prefix}_panel_runlog.json" + write_run_log(panel_log, panel_log_path) + + return Phase1Artifacts( + call_log=call_log, + annotate_log=annotate_log, + panel_report=report, + panel_report_format=report_format, + panel_result_log=panel_log, + tsv_path=tsv_path, + joint_vcf=joint_vcf, + ) diff --git a/src/genomic_consultant/orchestration/workflows.py b/src/genomic_consultant/orchestration/workflows.py new file mode 100644 index 0000000..4eb19fd --- /dev/null +++ b/src/genomic_consultant/orchestration/workflows.py @@ -0,0 +1,35 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import List + +from genomic_consultant.acmg.tagger import ACMGConfig, tag_variant +from genomic_consultant.panels.panels import GenePanel +from genomic_consultant.store.query import GenomicStore +from genomic_consultant.utils.models import FilterConfig, SuggestedClassification, Variant + + +@dataclass +class PanelVariantResult: + variant: Variant + acmg: 
SuggestedClassification + + +@dataclass +class PanelAnalysisResult: + individual_id: str + panel: GenePanel + variants: List[PanelVariantResult] + + +def run_panel_variant_review( + individual_id: str, + panel: GenePanel, + store: GenomicStore, + acmg_config: ACMGConfig, + filters: FilterConfig | None = None, +) -> PanelAnalysisResult: + """Query variants for a panel and attach ACMG evidence suggestions.""" + variants = store.get_variants_by_gene(individual_id=individual_id, genes=panel.genes, filters=filters) + enriched = [PanelVariantResult(variant=v, acmg=tag_variant(v, acmg_config)) for v in variants] + return PanelAnalysisResult(individual_id=individual_id, panel=panel, variants=enriched) diff --git a/src/genomic_consultant/panels/__init__.py b/src/genomic_consultant/panels/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/genomic_consultant/panels/__init__.py @@ -0,0 +1 @@ + diff --git a/src/genomic_consultant/panels/aggregate.py b/src/genomic_consultant/panels/aggregate.py new file mode 100644 index 0000000..d34c2aa --- /dev/null +++ b/src/genomic_consultant/panels/aggregate.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +import json +from pathlib import Path +from typing import Dict, Iterable, List, Set + + +def merge_mappings(inputs: Iterable[Path], output: Path, version: str = "merged", sources: List[str] | None = None) -> Path: + """ + Merge multiple phenotype→gene mapping JSON files into one. 
+ Input schema: {"phenotype_to_genes": {"HP:xxxx": ["GENE1", ...]}, "version": "...", "source": "..."} + """ + merged: Dict[str, Set[str]] = {} + source_list: List[str] = sources or [] + for path in inputs: + data = json.loads(Path(path).read_text()) + phenos = data.get("phenotype_to_genes", {}) + for pid, genes in phenos.items(): + merged.setdefault(pid, set()).update(genes) + src_label = data.get("source") or path.name + source_list.append(src_label) + + out = { + "version": version, + "source": ",".join(source_list), + "phenotype_to_genes": {pid: sorted(list(genes)) for pid, genes in merged.items()}, + "metadata": {"merged_from": [str(p) for p in inputs]}, + } + output.parent.mkdir(parents=True, exist_ok=True) + output.write_text(json.dumps(out, indent=2)) + return output diff --git a/src/genomic_consultant/panels/panels.py b/src/genomic_consultant/panels/panels.py new file mode 100644 index 0000000..82ef5cc --- /dev/null +++ b/src/genomic_consultant/panels/panels.py @@ -0,0 +1,38 @@ +from __future__ import annotations + +import json +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, Optional + +from genomic_consultant.utils.models import GenePanel + + +@dataclass +class PanelRepository: + """Loads curated gene panels stored as JSON files.""" + + panels: Dict[str, GenePanel] + + @classmethod + def from_directory(cls, path: Path) -> "PanelRepository": + panels: Dict[str, GenePanel] = {} + for json_file in Path(path).glob("*.json"): + panel = load_panel(json_file) + panels[panel.name] = panel + return cls(panels=panels) + + def get(self, name: str) -> Optional[GenePanel]: + return self.panels.get(name) + + +def load_panel(path: Path) -> GenePanel: + data = json.loads(Path(path).read_text()) + return GenePanel( + name=data["name"], + genes=data["genes"], + source=data.get("source", "unknown"), + version=data.get("version", "unknown"), + last_updated=data.get("last_updated", ""), + metadata=data.get("metadata", {}), + ) diff --git 
a/src/genomic_consultant/panels/resolver.py b/src/genomic_consultant/panels/resolver.py new file mode 100644 index 0000000..9c9d3f2 --- /dev/null +++ b/src/genomic_consultant/panels/resolver.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +import json +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List, Optional + +from genomic_consultant.utils.models import GenePanel + + +@dataclass +class PhenotypeGeneResolver: + """Resolves phenotype/HPO terms to gene lists using a curated mapping file.""" + + mapping: Dict[str, List[str]] + version: str + source: str + + @classmethod + def from_json(cls, path: Path) -> "PhenotypeGeneResolver": + data = json.loads(Path(path).read_text()) + mapping = data.get("phenotype_to_genes", {}) + version = data.get("version", "unknown") + source = data.get("source", "unknown") + return cls(mapping=mapping, version=version, source=source) + + def resolve(self, phenotype_id: str) -> Optional[List[str]]: + """Return gene list for a phenotype/HPO ID if present.""" + return self.mapping.get(phenotype_id) + + def build_panel(self, phenotype_id: str) -> Optional[GenePanel]: + genes = self.resolve(phenotype_id) + if not genes: + return None + return GenePanel( + name=f"Phenotype:{phenotype_id}", + genes=genes, + source=self.source, + version=self.version, + last_updated="", + metadata={"phenotype_id": phenotype_id}, + ) diff --git a/src/genomic_consultant/pipelines/__init__.py b/src/genomic_consultant/pipelines/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/genomic_consultant/pipelines/__init__.py @@ -0,0 +1 @@ + diff --git a/src/genomic_consultant/pipelines/annotation.py b/src/genomic_consultant/pipelines/annotation.py new file mode 100644 index 0000000..bcd3dd6 --- /dev/null +++ b/src/genomic_consultant/pipelines/annotation.py @@ -0,0 +1,72 @@ +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import 
List, Sequence + + +@dataclass +class AnnotationPlan: + """Command plan for annotating a VCF with VEP (or similar).""" + + annotated_vcf: Path + flat_table: Path + commands: List[str] + workdir: Path + + +def build_vep_plan( + vcf_path: Path, + workdir: Path, + reference_fasta: Path | None = None, + output_prefix: str = "annotated", + plugins: Sequence[str] | None = None, + extra_flags: Sequence[str] | None = None, +) -> AnnotationPlan: + """ + Build shell commands for running VEP on a VCF. Produces compressed VCF and a flattened TSV. + This is a plan only; execution is left to a runner. + """ + workdir = Path(workdir) + workdir.mkdir(parents=True, exist_ok=True) + + annotated_vcf = workdir / f"{output_prefix}.vep.vcf.gz" + flat_table = workdir / f"{output_prefix}.vep.tsv" + + plugin_arg = "" + if plugins: + plugin_arg = " ".join(f"--plugin {p}" for p in plugins) + + extra_arg = " ".join(extra_flags) if extra_flags else "" + + ref_arg = f"--fasta {reference_fasta}" if reference_fasta else "" + + commands: List[str] = [ + ( + "vep " + f"-i {vcf_path} " + f"-o {annotated_vcf} " + "--vcf --compress_output bgzip " + "--symbol --canonical " + "--af --af_gnomad " + "--polyphen b --sift b " + "--everything " + f"{plugin_arg} " + f"{extra_arg} " + f"{ref_arg}" + ), + f"tabix -p vcf {annotated_vcf}", + ( + "bcftools query " + f"-f '%CHROM\\t%POS\\t%REF\\t%ALT\\t%SYMBOL\\t%Consequence\\t%Protein_position\\t" + f"%PolyPhen\\t%SIFT\\t%CLIN_SIG\\t%AF\\t%gnomAD_AF\\t%SpliceAI\\t%CADD_PHRED\\n' " + f"{annotated_vcf} > {flat_table}" + ), + ] + + return AnnotationPlan( + annotated_vcf=annotated_vcf, + flat_table=flat_table, + commands=commands, + workdir=workdir, + ) diff --git a/src/genomic_consultant/pipelines/runner.py b/src/genomic_consultant/pipelines/runner.py new file mode 100644 index 0000000..ff07806 --- /dev/null +++ b/src/genomic_consultant/pipelines/runner.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +import subprocess +import uuid +from datetime import 
def execute_plan(plan: HasCommands, automation_level: str, log_path: Path | None = None) -> RunLog:
    """Run a plan's shell commands sequentially, recording each step in a RunLog.

    Execution stops at the first command that exits non-zero; results for the
    steps that did run (including the failing one) are still recorded. When
    *log_path* is given, the RunLog is also persisted via write_run_log.
    """
    run_id = str(uuid.uuid4())
    # NOTE(review): naive UTC timestamp; consider datetime.now(timezone.utc).
    started_at = datetime.utcnow()

    # Harvest well-known output attributes if the plan exposes them
    # (VariantCallingPlan and AnnotationPlan each provide a subset).
    known_outputs = ("joint_vcf", "per_sample_gvcf", "annotated_vcf", "flat_table")
    outputs: Dict[str, Any] = {
        attr: getattr(plan, attr) for attr in known_outputs if hasattr(plan, attr)
    }

    step_results: List[Dict[str, Any]] = []
    for step_no, command in enumerate(plan.commands):
        proc = subprocess.run(command, shell=True, cwd=plan.workdir, capture_output=True, text=True)
        step_results.append(
            {
                "step": step_no,
                "command": command,
                "returncode": proc.returncode,
                "stdout": proc.stdout,
                "stderr": proc.stderr,
            }
        )
        if proc.returncode != 0:
            # Fail fast: later commands depend on earlier outputs.
            break

    run_log = RunLog(
        run_id=run_id,
        started_at=started_at,
        inputs={"workdir": str(plan.workdir)},
        parameters={},
        tool_versions={},  # left for caller to fill if known
        database_versions={},
        config_hashes={},
        automation_levels={"pipeline": automation_level},
        overrides=[],
        outputs=outputs | {"command_results": step_results},
        notes=None,
    )

    if log_path:
        write_run_log(run_log, log_path)
    return run_log
typing import Dict, List, Sequence + + +@dataclass +class VariantCallingPlan: + """Command plan for running a trio variant calling pipeline.""" + + per_sample_gvcf: Dict[str, Path] + joint_vcf: Path + commands: List[str] + workdir: Path + + +def build_gatk_trio_plan( + samples: Dict[str, Path], + reference_fasta: Path, + workdir: Path, + output_prefix: str = "trio", + intervals: Sequence[Path] | None = None, +) -> VariantCallingPlan: + """ + Build shell commands for a GATK-based trio pipeline (HaplotypeCaller gVCF + joint genotyping). + Does not execute commands; returns a plan for orchestration layers to run. + """ + workdir = Path(workdir) + workdir.mkdir(parents=True, exist_ok=True) + + per_sample_gvcf: Dict[str, Path] = {} + commands: List[str] = [] + interval_args = "" + if intervals: + interval_args = " ".join(f"-L {i}" for i in intervals) + + for sample_id, bam_path in samples.items(): + gvcf_path = workdir / f"{sample_id}.g.vcf.gz" + per_sample_gvcf[sample_id] = gvcf_path + cmd = ( + "gatk HaplotypeCaller " + f"-R {reference_fasta} " + f"-I {bam_path} " + "-O {out} " + "-ERC GVCF " + f"{interval_args}" + ).format(out=gvcf_path) + commands.append(cmd) + + joint_vcf = workdir / f"{output_prefix}.joint.vcf.gz" + joint_cmd = ( + "gatk GenotypeGVCFs " + f"-R {reference_fasta} " + + " ".join(f"--variant {p}" for p in per_sample_gvcf.values()) + + f" -O {joint_vcf}" + ) + commands.append(joint_cmd) + + return VariantCallingPlan( + per_sample_gvcf=per_sample_gvcf, + joint_vcf=joint_vcf, + commands=commands, + workdir=workdir, + ) diff --git a/src/genomic_consultant/reporting/__init__.py b/src/genomic_consultant/reporting/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/genomic_consultant/reporting/__init__.py @@ -0,0 +1 @@ + diff --git a/src/genomic_consultant/reporting/report.py b/src/genomic_consultant/reporting/report.py new file mode 100644 index 0000000..ba587d9 --- /dev/null +++ b/src/genomic_consultant/reporting/report.py @@ 
def panel_report_markdown(result: "PanelAnalysisResult") -> str:
    """Render a PanelAnalysisResult as a human-readable Markdown report.

    When the result carries no variants the report ends after a short notice;
    otherwise a Markdown table of variants plus a human-review disclaimer is emitted.
    """
    header = [
        f"# Panel Report: {result.panel.name}",
        "",
        f"- Individual: `{result.individual_id}`",
        f"- Panel version: `{result.panel.version}` (source: {result.panel.source})",
        f"- Generated: {datetime.now(timezone.utc).isoformat()}",
        "",
        "## Variants",
    ]

    if not result.variants:
        return "\n".join(header + ["No variants found for this panel with current filters."])

    def row_for(pv: "PanelVariantResult") -> str:
        v = pv.variant
        evidence = "; ".join(f"{e.tag}({e.strength})" for e in pv.acmg.evidence) or "None"
        af = v.allele_frequency if v.allele_frequency is not None else "NA"
        cells = [
            f"{v.id} ({v.gene or 'NA'})",
            f"{v.consequence or 'NA'}",
            f"{v.clinvar_significance or 'NA'}",
            f"{af}",
            f"{pv.acmg.suggested_class}",
            evidence,
        ]
        return "|" + "|".join(cells) + "|"

    table = [
        "| Variant | Consequence | ClinVar | AF | ACMG suggestion | Evidence |",
        "|---|---|---|---|---|---|",
        *(row_for(pv) for pv in result.variants),
        "",
        "> Note: ACMG suggestions here are auto-generated and require human review for clinical decisions.",
    ]
    return "\n".join(header + table)
pv.variant.allele_frequency, + "acmg": { + "suggested_class": pv.acmg.suggested_class, + "evidence": [ + {"tag": e.tag, "strength": e.strength, "rationale": e.rationale} + for e in pv.acmg.evidence + ], + "human_classification": pv.acmg.human_classification, + "human_reviewer": pv.acmg.human_reviewer, + "human_notes": pv.acmg.human_notes, + }, + } + for pv in result.variants + ], + "disclaimer": "Auto-generated suggestions; human review required.", + } + return json.dumps(payload, indent=2) diff --git a/src/genomic_consultant/store/__init__.py b/src/genomic_consultant/store/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/genomic_consultant/store/__init__.py @@ -0,0 +1 @@ + diff --git a/src/genomic_consultant/store/parquet_store.py b/src/genomic_consultant/store/parquet_store.py new file mode 100644 index 0000000..aaf2305 --- /dev/null +++ b/src/genomic_consultant/store/parquet_store.py @@ -0,0 +1,83 @@ +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import List, Sequence + +from genomic_consultant.store.query import _matches_any +from genomic_consultant.utils.models import FilterConfig, Variant + +try: + import pandas as pd +except ImportError as exc: # pragma: no cover - import guard + raise ImportError("ParquetGenomicStore requires pandas. 
Install with `pip install pandas`.") from exc + + +@dataclass +class ParquetGenomicStore: + """Parquet-backed store for larger datasets (optional dependency: pandas).""" + + df: "pd.DataFrame" + + @classmethod + def from_parquet(cls, path: Path) -> "ParquetGenomicStore": + df = pd.read_parquet(path) + return cls(df=df) + + def get_variants_by_gene( + self, individual_id: str, genes: Sequence[str], filters: FilterConfig | None = None + ) -> List[Variant]: + filters = filters or FilterConfig() + subset = self.df[self.df["SYMBOL"].str.upper().isin([g.upper() for g in genes])] + return self._apply_filters(subset, filters) + + def get_variants_by_region( + self, individual_id: str, chrom: str, start: int, end: int, filters: FilterConfig | None = None + ) -> List[Variant]: + filters = filters or FilterConfig() + subset = self.df[(self.df["CHROM"] == chrom) & (self.df["POS"] >= start) & (self.df["POS"] <= end)] + return self._apply_filters(subset, filters) + + def _apply_filters(self, df: "pd.DataFrame", filters: FilterConfig) -> List[Variant]: + mask = pd.Series(True, index=df.index) + if filters.max_af is not None: + mask &= (df["AF"].fillna(df.get("gnomAD_AF")).fillna(0) <= filters.max_af) + if filters.min_af is not None: + mask &= (df["AF"].fillna(df.get("gnomAD_AF")).fillna(0) >= filters.min_af) + if filters.clinvar_significance: + mask &= df["CLIN_SIG"].str.lower().isin([s.lower() for s in filters.clinvar_significance]) + if filters.consequence_includes: + mask &= df["Consequence"].str.lower().apply( + lambda v: _matches_any(v, [c.lower() for c in filters.consequence_includes]) + ) + if filters.consequence_excludes: + mask &= ~df["Consequence"].str.lower().apply( + lambda v: _matches_any(v, [c.lower() for c in filters.consequence_excludes]) + ) + filtered = df[mask] + variants: List[Variant] = [] + for _, row in filtered.iterrows(): + variants.append( + Variant( + chrom=row["CHROM"], + pos=int(row["POS"]), + ref=row["REF"], + alt=row["ALT"], + 
gene=row.get("SYMBOL"), + consequence=row.get("Consequence"), + protein_change=row.get("Protein_position"), + clinvar_significance=row.get("CLIN_SIG"), + allele_frequency=_maybe_float(row.get("AF")) or _maybe_float(row.get("gnomAD_AF")), + annotations={"gnomad_af": _maybe_float(row.get("gnomAD_AF"))}, + ) + ) + return variants + + +def _maybe_float(value) -> float | None: + try: + if value is None: + return None + return float(value) + except (TypeError, ValueError): + return None diff --git a/src/genomic_consultant/store/query.py b/src/genomic_consultant/store/query.py new file mode 100644 index 0000000..3f0f4d0 --- /dev/null +++ b/src/genomic_consultant/store/query.py @@ -0,0 +1,109 @@ +from __future__ import annotations + +import csv +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, Iterable, List, Sequence + +from genomic_consultant.utils.models import FilterConfig, Variant + + +@dataclass +class GenomicStore: + """Lightweight wrapper around annotated variants.""" + + variants: List[Variant] + + @classmethod + def from_tsv(cls, path: Path) -> "GenomicStore": + """ + Load variants from a flattened TSV generated by the annotation plan. 
+ Expected columns (flexible, missing columns are tolerated): + CHROM POS REF ALT SYMBOL Consequence Protein_position PolyPhen SIFT CLIN_SIG AF gnomAD_AF SpliceAI CADD_PHRED + """ + variants: List[Variant] = [] + with Path(path).open() as fh: + reader = csv.DictReader(fh, delimiter="\t") + for row in reader: + row = {k: v for k, v in row.items()} if row else {} + if not row: + continue + variants.append(_row_to_variant(row)) + return cls(variants=variants) + + def get_variants_by_gene( + self, individual_id: str, genes: Sequence[str], filters: FilterConfig | None = None + ) -> List[Variant]: + filters = filters or FilterConfig() + gene_set = {g.upper() for g in genes} + return self._apply_filters((v for v in self.variants if (v.gene or "").upper() in gene_set), filters) + + def get_variants_by_region( + self, individual_id: str, chrom: str, start: int, end: int, filters: FilterConfig | None = None + ) -> List[Variant]: + filters = filters or FilterConfig() + return self._apply_filters( + (v for v in self.variants if v.chrom == chrom and start <= v.pos <= end), + filters, + ) + + def _apply_filters(self, variants: Iterable[Variant], filters: FilterConfig) -> List[Variant]: + out: List[Variant] = [] + for v in variants: + if filters.max_af is not None and v.allele_frequency is not None and v.allele_frequency > filters.max_af: + continue + if filters.min_af is not None and v.allele_frequency is not None and v.allele_frequency < filters.min_af: + continue + if filters.clinvar_significance and (v.clinvar_significance or "").lower() not in { + sig.lower() for sig in filters.clinvar_significance + }: + continue + if filters.consequence_includes and not _matches_any(v.consequence, filters.consequence_includes): + continue + if filters.consequence_excludes and _matches_any(v.consequence, filters.consequence_excludes): + continue + out.append(v) + return out + + +def _matches_any(value: str | None, patterns: Sequence[str]) -> bool: + if value is None: + return False + v = 
def sha256sum(path: Path) -> Optional[str]:
    """Return the hex SHA-256 digest of *path*, or None when the file is absent."""
    target = Path(path)
    if not target.exists():
        return None
    digest = hashlib.sha256()
    # Stream in fixed-size chunks so large genomic files are not read into memory at once.
    with target.open("rb") as fh:
        while chunk := fh.read(8192):
            digest.update(chunk)
    return digest.hexdigest()
@dataclass
class Variant:
    """Lightweight variant representation used by the query layer.

    Identity is the (chrom, pos, ref, alt) tuple; all other fields are
    optional annotations carried over from the flattened VEP/bcftools TSV.
    """

    chrom: str  # chromosome name as it appears in the source VCF/TSV (no normalization)
    pos: int  # 1-based position (POS column)
    ref: str  # reference allele
    alt: str  # alternate allele
    gene: Optional[str] = None  # gene symbol (VEP SYMBOL column), if annotated
    transcript: Optional[str] = None  # transcript identifier, if annotated
    consequence: Optional[str] = None  # VEP consequence term(s)
    protein_change: Optional[str] = None  # protein-level change (Protein_position column)
    clinvar_significance: Optional[str] = None  # ClinVar significance string (CLIN_SIG column)
    allele_frequency: Optional[float] = None  # population AF; None when unknown
    annotations: Dict[str, Any] = field(default_factory=dict)  # extra scores: PolyPhen, SIFT, gnomAD AF, SpliceAI, CADD, ...

    @property
    def id(self) -> str:
        """Human-readable stable identifier: "chrom-pos-ref-alt"."""
        return f"{self.chrom}-{self.pos}-{self.ref}-{self.alt}"
def probe_tool_versions(commands: Dict[str, str]) -> Dict[str, str]:
    """Best-effort probe of tool versions by running `<cmd> --version`.

    Tools that cannot be run, time out, exit non-zero, or print nothing are
    silently omitted from the result.
    """
    versions: Dict[str, str] = {}
    for tool, base_cmd in commands.items():
        try:
            # shell=True so callers may pass commands with embedded arguments.
            proc = subprocess.run(
                f"{base_cmd} --version",
                shell=True,
                capture_output=True,
                text=True,
                timeout=10,
            )
        except Exception:  # deliberate best-effort: skip anything that fails to launch
            continue
        if proc.returncode != 0:
            continue
        # Prefer stdout; some tools print their banner to stderr instead.
        banner = (proc.stdout or proc.stderr).strip()
        if banner:
            versions[tool] = banner.splitlines()[0]
    return versions
def test_merge_mappings(tmp_path: Path):
    """Merging two mapping files unions gene lists per phenotype and dedupes."""
    first = tmp_path / "a.json"
    second = tmp_path / "b.json"
    first.write_text('{"source":"A","phenotype_to_genes":{"HP:1":["G1","G2"]}}')
    second.write_text('{"source":"B","phenotype_to_genes":{"HP:1":["G2","G3"],"HP:2":["G4"]}}')

    merged_path = tmp_path / "out.json"
    merge_mappings([first, second], merged_path)

    merged = json.loads(merged_path.read_text())
    assert sorted(merged["phenotype_to_genes"]["HP:1"]) == ["G1", "G2", "G3"]
    assert merged["phenotype_to_genes"]["HP:2"] == ["G4"]
def test_resolver_build_panel(tmp_path: Path):
    """build_panel wraps the mapped genes in a GenePanel carrying provenance metadata."""
    import json  # local import: the module header does not import json

    data = {
        "version": "test",
        "source": "example",
        "phenotype_to_genes": {"HP:0001": ["GENE1", "GENE2"]},
    }
    path = tmp_path / "map.json"
    # Serialize the fixture dict instead of duplicating it as a hand-written
    # JSON literal (the original built `data` but never used it).
    path.write_text(json.dumps(data))

    resolver = PhenotypeGeneResolver.from_json(path)
    panel = resolver.build_panel("HP:0001")
    assert panel is not None
    assert panel.genes == ["GENE1", "GENE2"]
    assert "phenotype_id" in panel.metadata
def test_from_tsv_with_extra_columns(tmp_path: Path):
    """from_tsv ingests every annotation column, including the extra score columns."""
    columns = [
        "#CHROM", "POS", "REF", "ALT", "SYMBOL", "Consequence", "Protein_position",
        "PolyPhen", "SIFT", "CLIN_SIG", "AF", "gnomAD_AF", "SpliceAI", "CADD_PHRED",
    ]
    row = [
        "1", "100", "A", "T", "GENE1", "missense_variant", "p.X",
        ".", ".", "Pathogenic", "0.0001", "0.0002", "0.05", "20.1",
    ]
    tsv = tmp_path / "v.tsv"
    tsv.write_text("\t".join(columns) + "\n" + "\t".join(row) + "\n")

    store = GenomicStore.from_tsv(tsv)
    assert len(store.variants) == 1
    variant = store.variants[0]
    assert variant.annotations["splice_ai_delta_score"] == 0.05
    assert variant.annotations["cadd_phred"] == 20.1