Initial commit
46 .gitignore vendored Normal file
@@ -0,0 +1,46 @@
# Python and environments
__pycache__/
*.py[cod]
.venv/
env/
.env
.git-mcp/
.pytest_cache/
.mypy_cache/
.ruff_cache/
.coverage

# Notebooks
.ipynb_checkpoints/

# Editor
.DS_Store
.idea/
.vscode/

# Genomics data (avoid accidental commits)
*.bam
*.bai
*.sam
*.cram
*.crai
*.vcf
*.vcf.gz
*.vcf.gz.tbi
*.g.vcf
*.g.vcf.gz
*.g.vcf.gz.tbi
*.bed
*.bed.gz
*.bed.gz.tbi
*.fastq
*.fastq.gz
*.fq
*.fq.gz

# Reports / tmp
*.log
*.tmp
tmp/
reports/
runtime/
14 Makefile Normal file
@@ -0,0 +1,14 @@
PYTHON := .venv/bin/python
PIP := .venv/bin/pip

.PHONY: venv install test

venv:
	python3 -m venv .venv
	$(PIP) install -e .

install: venv
	$(PIP) install -e .[store]

test:
	$(PYTHON) -m pytest
112 README.md Normal file
@@ -0,0 +1,112 @@
# Genomic Consultant

Early design for a personal genomic risk and drug–interaction decision support system. Specs are sourced from `genomic_decision_support_system_spec_v0.1.md`.

## Vision (per spec)
- Phase 1: trio variant calling, annotation, queryable genomic DB, initial ACMG evidence tagging.
- Phase 2: pharmacogenomics genotype-to-phenotype mapping plus drug–drug interaction checks.
- Phase 3: supplement/herb normalization and interaction risk layering.
- Phase 4: LLM-driven query orchestration and report generation.

## Repository Layout
- `docs/` — system architecture notes, phase plans, data models (work in progress).
- `configs/` — example ACMG config and gene panel JSON.
  - `configs/phenotype_to_genes.example.json` — placeholder phenotype/HPO → gene mappings.
  - `configs/phenotype_to_genes.hpo_seed.json` — seed HPO mappings (replace with full HPO/GenCC derived panels).
- `sample_data/` — tiny annotated TSV for demo.
- `src/genomic_consultant/` — Python scaffolding (pipelines, store, panel lookup, ACMG tagging, reporting).
- `genomic_decision_support_system_spec_v0.1.md` — original requirements draft.

## Contributing/next steps
1. Finalize Phase 1 tech selection (variant caller, annotation stack, reference/DB versions).
2. Stand up the Phase 1 pipelines and minimal query API surface.
3. Add ACMG evidence tagging config and human-review logging.
4. Layer in PGx/DDI and supplement modules per later phases.

Data safety: keep genomic/clinical data local; the `.gitignore` blocks common genomic outputs by default.

## Quickstart (CLI scaffolding)
```
pip install -e .

# 1) Show trio calling plan (commands only; not executed)
genomic-consultant plan-call \
  --sample proband:/data/proband.bam \
  --sample father:/data/father.bam \
  --sample mother:/data/mother.bam \
  --reference /refs/GRCh38.fa \
  --workdir /tmp/trio

# 1b) Execute calling plan (requires GATK installed) and emit run log
genomic-consultant run-call \
  --sample proband:/data/proband.bam \
  --sample father:/data/father.bam \
  --sample mother:/data/mother.bam \
  --reference /refs/GRCh38.fa \
  --workdir /tmp/trio \
  --log /tmp/trio/run_call_log.json \
  --probe-tools

# 2) Show annotation plan for a joint VCF
genomic-consultant plan-annotate \
  --vcf /tmp/trio/trio.joint.vcf.gz \
  --workdir /tmp/trio/annot \
  --prefix trio \
  --reference /refs/GRCh38.fa

# 2b) Execute annotation plan (requires VEP, bcftools) with run log
genomic-consultant run-annotate \
  --vcf /tmp/trio/trio.joint.vcf.gz \
  --workdir /tmp/trio/annot \
  --prefix trio \
  --reference /refs/GRCh38.fa \
  --log /tmp/trio/annot/run_annot_log.json \
  --probe-tools

# 3) Demo panel report using sample data (panel file)
genomic-consultant panel-report \
  --tsv sample_data/example_annotated.tsv \
  --panel configs/panel.example.json \
  --acmg-config configs/acmg_config.example.yaml \
  --individual-id demo \
  --format markdown \
  --log /tmp/panel_log.json

# 3b) Demo panel report using phenotype mapping (HPO)
genomic-consultant panel-report \
  --tsv sample_data/example_annotated.tsv \
  --phenotype-id HP:0000365 \
  --phenotype-mapping configs/phenotype_to_genes.hpo_seed.json \
  --acmg-config configs/acmg_config.example.yaml \
  --individual-id demo \
  --format markdown

# 3c) Merge multiple phenotype→gene mappings into one
genomic-consultant build-phenotype-mapping \
  --output configs/phenotype_to_genes.merged.json \
  configs/phenotype_to_genes.example.json configs/phenotype_to_genes.hpo_seed.json

# 4) End-to-end Phase 1 pipeline (optionally skip call/annotate; use sample TSV)
genomic-consultant phase1-run \
  --tsv sample_data/example_annotated.tsv \
  --skip-call --skip-annotate \
  --panel configs/panel.example.json \
  --acmg-config configs/acmg_config.example.yaml \
  --workdir runtime \
  --prefix demo

# Run tests
pytest
```

### Optional Parquet-backed store
Install pandas to enable Parquet ingestion:
```
pip install -e .[store]
```

### Notes on VEP plugins (SpliceAI/CADD)
- The annotation plan already queries `SpliceAI` and `CADD_PHRED` fields; ensure your VEP run includes plugins/flags that produce them, e.g.:
  - `--plugin SpliceAI,snv=/path/to/spliceai.snv.vcf.gz,indel=/path/to/spliceai.indel.vcf.gz`
  - `--plugin CADD,/path/to/whole_genome_SNVs.tsv.gz,/path/to/InDels.tsv.gz`
- Pass these via `--plugin` and/or `--extra-flag` on `run-annotate` / `plan-annotate` to embed fields into the TSV.
12 configs/acmg_config.example.yaml Normal file
@@ -0,0 +1,12 @@
# ACMG evidence thresholds (example, adjust per lab policy)
ba1_af: 0.05 # Stand-alone benign if AF >= this
bs1_af: 0.01 # Strong benign if AF >= this (and not meeting BA1)
pm2_af: 0.0005 # Moderate pathogenic if AF <= this
bp7_splice_ai_max: 0.1 # Supporting benign if synonymous and predicted low splice impact

# Genes considered loss-of-function sensitive for PVS1 auto-tagging
lof_genes:
  - BRCA1
  - BRCA2
  - TTN
  - CFTR
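For orientation, a minimal sketch of how the tagger shipped in this commit (`src/genomic_consultant/acmg/tagger.py`) consumes these thresholds. `load_acmg_config` and `tag_variant` exist in this commit; the `Variant` keyword arguments are assumptions inferred from the attributes the tagger reads:

```python
# Sketch only: the Variant constructor signature is an assumption inferred from
# tagger.py's reads of .gene, .consequence, .allele_frequency, and .annotations.
from pathlib import Path

from genomic_consultant.acmg.tagger import load_acmg_config, tag_variant
from genomic_consultant.utils.models import Variant

config = load_acmg_config(Path("configs/acmg_config.example.yaml"))
variant = Variant(
    variant_id="2-234567-G-C",  # hypothetical field name
    gene="BRCA1",
    consequence="stop_gained",
    allele_frequency=0.0001,
    annotations={"splice_ai_delta_score": "0.02"},
)
result = tag_variant(variant, config)
# stop_gained in a lof_genes entry fires PVS1; AF <= pm2_af fires PM2,
# so the suggested class is "Likely pathogenic" (the final call stays human-only).
print(result.suggested_class, [e.tag for e in result.evidence])
```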
10 configs/panel.example.json Normal file
@@ -0,0 +1,10 @@
{
  "name": "Hearing loss core",
  "version": "0.1",
  "source": "curated",
  "last_updated": "2024-06-01",
  "genes": ["GJB2", "SLC26A4", "MITF", "OTOF"],
  "metadata": {
    "notes": "Example panel for demo; replace with curated panel and provenance."
  }
}
11 configs/phenotype_to_genes.example.json Normal file
@@ -0,0 +1,11 @@
{
  "version": "0.1",
  "source": "example-curated",
  "phenotype_to_genes": {
    "HP:0000365": ["GJB2", "SLC26A4", "OTOF"],
    "HP:0000510": ["MITF", "SOX10"]
  },
  "metadata": {
    "notes": "Placeholder mapping; replace with curated HPO/OMIM/GenCC panels."
  }
}
13 configs/phenotype_to_genes.hpo_seed.json Normal file
@@ -0,0 +1,13 @@
{
  "version": "2024-11-01",
  "source": "HPO-curated-seed",
  "phenotype_to_genes": {
    "HP:0000365": ["GJB2", "SLC26A4", "OTOF", "TECTA"],
    "HP:0001250": ["SCN1A", "KCNQ2", "STXBP1"],
    "HP:0001631": ["MYH7", "TNNT2", "MYBPC3"],
    "HP:0001156": ["COL1A1", "COL1A2", "PLOD2"]
  },
  "metadata": {
    "notes": "Seed mapping using common phenotype IDs; replace with full HPO-derived panels."
  }
}
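A minimal sketch of resolving one of these phenotype IDs into a panel, mirroring the calls made by this commit's `cli.py` in its `panel-report` handler; the `genes` attribute on the returned panel is an assumption:

```python
from pathlib import Path

from genomic_consultant.panels.resolver import PhenotypeGeneResolver

resolver = PhenotypeGeneResolver.from_json(Path("configs/phenotype_to_genes.hpo_seed.json"))
panel = resolver.build_panel("HP:0000365")  # cli.py treats None as "no genes found"
if panel is not None:
    print(panel.genes)  # assumed attribute; expected ["GJB2", "SLC26A4", "OTOF", "TECTA"]
```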
101 docs/phase1_howto.md Normal file
@@ -0,0 +1,101 @@
# Phase 1 How-To: BAM → VCF → Annotate → Panel Report

This document explains how to use the existing CLI to go from BAM files to report output, what the options mean, and how to skip steps. It assumes GATK, VEP, and bcftools/tabix are installed, and that the project is installed with `pip install -e .`.

## Pipeline Overview
1) Trio variant calling → joint VCF
2) VEP annotation → annotated VCF + flattened TSV
3) Panel/phenotype query + ACMG tagging → Markdown/JSON report

Use `phase1-run` for a one-shot run (call/annotate can be skipped), or run `run-call` / `run-annotate` / `panel-report` step by step.

## Step-by-Step Execution

### 1) Variant calling (GATK)
```bash
genomic-consultant run-call \
  --sample proband:/path/proband.bam \
  --sample father:/path/father.bam \
  --sample mother:/path/mother.bam \
  --reference /refs/GRCh38.fa \
  --workdir /tmp/trio \
  --prefix trio \
  --log /tmp/trio/run_call_log.json \
  --probe-tools
```
- `--sample`: `sample_id:/path/to.bam`, repeatable.
- `--reference`: reference sequence.
- `--workdir`: location for outputs and intermediate files.
- `--prefix`: output filename prefix.
- `--log`: path for the run log (JSON).
- Outputs: joint VCF (`/tmp/trio/trio.joint.vcf.gz`) and a run log (commands/return codes).

### 2) Annotation (VEP + bcftools)
```bash
genomic-consultant run-annotate \
  --vcf /tmp/trio/trio.joint.vcf.gz \
  --workdir /tmp/trio/annot \
  --prefix trio \
  --reference /refs/GRCh38.fa \
  --plugin 'SpliceAI,snv=/path/spliceai.snv.vcf.gz,indel=/path/spliceai.indel.vcf.gz' \
  --plugin 'CADD,/path/whole_genome_SNVs.tsv.gz,/path/InDels.tsv.gz' \
  --extra-flag "--cache --offline" \
  --log /tmp/trio/annot/run_annot_log.json \
  --probe-tools
```
- `--plugin`: VEP plugin spec, repeatable (SpliceAI/CADD shown).
- `--extra-flag`: extra flags passed through to VEP (e.g., cache/offline).
- Outputs: annotated VCF (`trio.vep.vcf.gz`), flattened TSV (`trio.vep.tsv`), and a run log.

### 3) Panel/Phenotype Report
Using a panel file:
```bash
genomic-consultant panel-report \
  --tsv /tmp/trio/annot/trio.vep.tsv \
  --panel configs/panel.example.json \
  --acmg-config configs/acmg_config.example.yaml \
  --individual-id proband \
  --max-af 0.05 \
  --format markdown \
  --log /tmp/trio/panel_log.json
```
Resolving a panel directly from a phenotype:
```bash
genomic-consultant panel-report \
  --tsv /tmp/trio/annot/trio.vep.tsv \
  --phenotype-id HP:0000365 \
  --phenotype-mapping configs/phenotype_to_genes.hpo_seed.json \
  --acmg-config configs/acmg_config.example.yaml \
  --individual-id proband \
  --max-af 0.05 \
  --format markdown
```
- `--max-af`: upper bound on allele frequency for filtering.
- `--format`: `markdown` or `json`.
- Output: report text + run log (records hashes of the panel/ACMG configs).

## One-Shot Mode (call/annotate skippable)
If you already have a joint VCF/TSV, skip the first two steps:
```bash
genomic-consultant phase1-run \
  --workdir /tmp/trio \
  --prefix trio \
  --tsv /tmp/trio/annot/trio.vep.tsv \
  --skip-call --skip-annotate \
  --panel configs/panel.example.json \
  --acmg-config configs/acmg_config.example.yaml \
  --max-af 0.05 \
  --format markdown \
  --log-dir /tmp/trio/runtime
```
To actually run VEP, remove `--skip-annotate` and supply `--plugins`/`--extra-flag`; to run calling, remove `--skip-call` and supply `--sample`/`--reference`.

## Main Outputs
- joint VCF (calling result)
- annotated VCF + flattened TSV (with gene/consequence/ClinVar/AF/SpliceAI/CADD columns, etc.)
- run logs (JSON with commands, return codes, config hashes; written to `--log` or `--log-dir`); see the sketch below
- panel report (Markdown or JSON) with automatic ACMG tags, requiring human review
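For reference, the run log's shape mirrors the `RunLog` that this commit's `cli.py` constructs for `panel-report`; the field names below are taken from that code, while the values are illustrative:

```python
from datetime import datetime

from genomic_consultant.utils.models import RunLog

run_log = RunLog(
    run_id="panel-proband",
    started_at=datetime.utcnow(),
    inputs={
        "tsv": "/tmp/trio/annot/trio.vep.tsv",
        "panel": "configs/panel.example.json",
        "acmg_config": "configs/acmg_config.example.yaml",
    },
    parameters={"max_af": 0.05, "format": "markdown", "phenotype_id": None},
    tool_versions={},
    database_versions={},
    config_hashes={"panel": "<sha256>", "acmg_config": "<sha256>"},
    automation_levels={"panel_report": "Auto+Review"},
    overrides=[],
    outputs={"report": "..."},
    notes=None,
)
```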

## Notes
- Calling/annotation depend on external tools and matching resources (reference FASTA, VEP cache, SpliceAI/CADD data).
- Without BAMs/resources, demo the report with the sample TSV: `phase1-run --tsv sample_data/example_annotated.tsv --skip-call --skip-annotate ...`.
- Safety: `.gitignore` already excludes large genomic files; run in a controlled local environment.
80 docs/phase1_implementation_plan.md Normal file
@@ -0,0 +1,80 @@
# Phase 1 Implementation Plan (Genomic Foundation)

Scope: deliver a working trio-based variant pipeline, annotated genomic store, query APIs, initial ACMG evidence tagging, and reporting/logging scaffolding. This assumes local execution with Python 3.11+.

## Objectives
- Trio BAM → joint VCF with QC artifacts (`Auto`).
- Annotate variants with population frequency, ClinVar, consequence/prediction (`Auto`).
- Provide queryable interfaces for gene/region lookups with filters (`Auto`).
- Disease/phenotype → gene panel lookup and filtered variant outputs (`Auto+Review`).
- Auto-tag a subset of ACMG criteria; human-only final classification (`Auto+Review`).
- Produce machine-readable run logs with versions, configs, and overrides.

## Work Breakdown
1) **Data & references**
   - Reference genome: GRCh38 (primary) with option for GRCh37; pin version hash.
   - Resource bundles: known sites for BQSR (if using GATK), gnomAD/ClinVar versions for annotation.
   - Test fixtures: small trio BAM/CRAM subset or GIAB trio downsampled for CI-like checks.

2) **Variant calling pipeline (wrapper, `Auto`)**
   - Tooling: GATK HaplotypeCaller → gVCF; GenotypeGVCFs for joint calls. Alt: DeepVariant + joint genotyper (parameterized).
   - Steps:
     - Validate inputs (file presence, reference match).
     - Optional QC: coverage, duplicates, on-target.
     - Generate per-sample gVCF; joint genotyping to trio VCF.
   - Outputs: joint VCF + index; QC summary JSON/TSV; log with tool versions and params.

3) **Annotation pipeline (`Auto`)**
   - Tooling: Ensembl VEP with plugins for gnomAD, ClinVar, CADD, SpliceAI where available; alt path: ANNOVAR.
   - Steps:
     - Normalize variants (bcftools norm) if needed.
     - Annotate with gene, transcript, protein change; population AF; ClinVar significance; consequence; predictions (SIFT/PolyPhen/CADD); inheritance flags.
     - Include SpliceAI/CADD plugins if installed; the CLI accepts extra flags/plugins to embed SpliceAI/CADD fields.
   - Outputs: annotated VCF; flattened TSV/Parquet for faster querying; manifest of DB versions used.

4) **Genomic store + query API (`Auto`)**
   - Early option: tabix-indexed VCF with a Python wrapper.
   - Functions (Python module `genomic_store`):
     - `get_variants_by_gene(individual_id, genes, filters)`
       Filters: AF thresholds, consequence categories, ClinVar significance, inheritance pattern.
     - `get_variants_by_region(individual_id, chrom, start, end, filters)`
     - `list_genes_with_variants(individual_id, filters)` (optional).
   - Filters defined in a `FilterConfig` dataclass; serializable for logging (see the sketch below).
   - Future option: import to SQLite/Postgres via Arrow/Parquet for richer predicates.

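A minimal sketch of this query surface. `GenomicStore.from_tsv` and `FilterConfig` exist in this commit; the exact `get_variants_by_gene` method on the store is an assumption taken from the function list above:

```python
from pathlib import Path

from genomic_consultant.store.query import GenomicStore
from genomic_consultant.utils.models import FilterConfig

store = GenomicStore.from_tsv(Path("sample_data/example_annotated.tsv"))
filters = FilterConfig(max_af=0.05)  # serializable, so it can be embedded in run logs
# Assumed signature per the list above: get_variants_by_gene(individual_id, genes, filters)
for v in store.get_variants_by_gene("demo", ["GJB2", "OTOF"], filters):
    print(v.gene, v.consequence, v.allele_frequency)
```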
5) **Disease/phenotype → gene panel (`Auto+Review`)**
   - Data: HPO/OMIM/GenCC lookup or curated JSON panels.
   - Function: `get_gene_panel(disease_or_hpo_id, version=None)` returning gene list + provenance.
   - Phenotype resolver: curated JSON mapping (e.g., `phenotype_to_genes.example.json`) as a placeholder until upstream data is wired; allow dynamic panel synthesis by phenotype ID in the CLI; support merging multiple sources into one mapping.
   - Flow: resolve panel → call genomic store → apply simple ranking (AF low, ClinVar pathogenicity high).
   - Manual review points: panel curation, rank threshold tuning.

6) **ACMG evidence tagging (subset, `Auto+Review`)**
   - Criteria to auto-evaluate initially: PVS1 (LoF in LoF-sensitive gene list), PM2 (absent/rare in population), BA1/BS1 (common frequency), possibly BP7 (synonymous, no splicing impact).
   - Config: YAML/JSON with thresholds (AF cutoffs, LoF gene list, transcript precedence).
   - Output schema per variant: `{variant_id, evidence: [tag, strength, rationale], suggested_class}` with the suggested class computed purely from auto tags; the final class is left blank for a human (see the example below).
   - Logging: capture the rule version and reasons for each fired rule.

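An illustrative instance of the per-variant output schema named above; the rationale strings follow the wording the commit's tagger emits:

```python
tagged = {
    "variant_id": "2-234567-G-C",
    "evidence": [
        {"tag": "PVS1", "strength": "Very strong", "rationale": "Predicted LoF in LoF-sensitive gene"},
        {"tag": "PM2", "strength": "Moderate", "rationale": "AF 0.0001 <= 0.0005"},
    ],
    "suggested_class": "Likely pathogenic",  # computed from auto tags only
    # the final classification is deliberately absent: Human-only
}
```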
7) **Run logging and versioning**
   - Every pipeline run emits `run_log.json` containing:
     - Inputs (sample IDs, file paths, reference build).
     - Tool versions and parameters; DB versions; config hashes (panel/ACMG configs).
     - Automation level per step; manual overrides (`who/when/why`).
     - Derived artifact paths and checksums.
   - Embed a run_log reference in reports.

8) **Report template (minimal)**
   - Input: disease/gene panel name, variant query results, ACMG evidence tags.
   - Output: Markdown + JSON summary with sections: context, methods, variants table, limitations.
   - Mark human-only decisions clearly.

## Milestones
- **M1**: Repo scaffolding + configs; tiny trio test data wired; `make pipeline` runs through variant calling + annotation on the fixture.
- **M2**: Genomic store wrapper with gene/region queries; filter config; basic CLI/notebook demo.
- **M3**: Panel lookup + ranked variant listing; ACMG auto tags on outputs; run_log generation.
- **M4**: Minimal report generator + acceptance of human-reviewed classifications.

## Validation Strategy
- Unit tests for filter logic, panel resolution, and ACMG tagger decisions (synthetic variants).
- Integration test on a small fixture trio to ensure the call → annotate → query path works.
- Determinism checks: hash configs and verify outputs are stable across runs given the same inputs.
69 docs/system_architecture.md Normal file
@@ -0,0 +1,69 @@
# System Architecture Blueprint (v0.1)

This document turns `genomic_decision_support_system_spec_v0.1.md` into a buildable architecture and phased roadmap. Automation levels follow `Auto / Auto+Review / Human-only`.

## High-Level Views
- **Core layers**: (1) sequencing ingest → variant calling/annotation (`Auto`), (2) genomic query layer (`Auto`), (3) rule engines (ACMG, PGx, DDI, supplements; mixed automation), (4) orchestration/LLM and report generation (`Auto` tool calls, `Auto+Review` outputs).
- **Data custody**: all PHI/genomic artifacts remain local; external calls require de-identification or private models.
- **Traceability**: every run records tool versions, database snapshots, configs, and manual overrides in machine-readable logs.

### End-to-end flow
```
BAM (proband + parents)
  ↓ Variant Calling (gVCF) [Auto]
Joint Genotyper → joint VCF
  ↓ Annotation (VEP/ANNOVAR + ClinVar/gnomAD etc.) [Auto]
  ↓ Genomic Store (VCF+tabix or SQL) + Query API [Auto]
  ↓
  ├─ Disease/Phenotype → Gene Panel lookup [Auto+Review]
  │    └─ Panel variants with basic ranking (freq, ClinVar) [Auto+Review]
  ├─ ACMG evidence tagging subset (PVS1, PM2, BA1, BS1…) [Auto+Review]
  ├─ PGx genotype→phenotype and recommendation rules [Auto → Auto+Review]
  ├─ DDI rule evaluation [Auto]
  └─ Supplement/Herb normalization + interaction rules [Auto+Review → Human-only]
  ↓
LLM/Orchestrator routes user questions to tools, produces JSON + Markdown drafts [Auto tools, Auto+Review narratives]
```

## Phase Roadmap (build-first view)
- **Phase 1 – Genomic foundation**
  - Deliverables: trio joint VCF + annotation; query functions (`get_variants_by_gene/region`); disease→gene panel lookup; partial ACMG evidence tagging.
  - Data stores: tabix-backed VCF wrapper initially; optional SQLite/Postgres import later.
  - Interfaces: Python CLI/SDK first; machine-readable run logs with versions and automation levels.
- **Phase 2 – PGx & DDI**
  - Drug vocabulary normalization (ATC/RxNorm).
  - PGx engine: star-allele calling or rule-based genotype→phenotype; guideline-mapped advice with review gates.
  - DDI engine: rule base with severity tiers; combine with PGx outputs.
- **Phase 3 – Supplements & Herbs**
  - Name/ingredient normalization; herb formula expansion.
  - Rule tables for CYP/transporters, coagulation, CNS effects.
  - Evidence grading and conservative messaging; human-only final clinical language.
- **Phase 4 – LLM Interface & Reports**
  - Tool-calling schema for the queries listed above.
  - JSON + Markdown report templates with traceability to rules, data versions, and overrides.

## Module Boundaries
- **Variant Calling Pipeline** (`Auto`): wrapper around GATK or DeepVariant + joint genotyper; pluggable reference genome; QC summaries.
- **Annotation Pipeline** (`Auto`): VEP/ANNOVAR with pinned database versions (gnomAD, ClinVar, transcript set); emits annotated VCF + flat table.
- **Genomic Query Layer** (`Auto`): abstraction over tabix or SQL; minimal APIs: `get_variants_by_gene`, `get_variants_by_region`, filters (freq, consequence, clinvar).
- **Disease/Phenotype to Panel** (`Auto+Review`): HPO/OMIM lookups or curated panels; panels versioned; feeds queries.
- **Phenotype Resolver** (`Auto+Review`): JSON/DB mapping of phenotype/HPO IDs to gene lists as a placeholder until upstream sources are integrated; can synthesize panels dynamically and merge multiple sources.
- **ACMG Evidence Tagger** (`Auto+Review`): auto-evaluable criteria only; config-driven thresholds; human-only final classification.
- **PGx Engine** (`Auto → Auto+Review`): star-allele calling where possible; guideline rules (CPIC/DPWG) with conservative defaults; flag items needing review.
- **DDI Engine** (`Auto`): rule tables keyed by normalized drug IDs; outputs severity and rationale.
- **Supplements/Herbs** (`Auto+Review → Human-only`): ingredient extraction + mapping; interaction rules; human sign-off for clinical language.
- **Orchestrator/LLM** (`Auto tools, Auto+Review outputs`): intent parsing, tool sequencing, safety guardrails, report drafting.

## Observability and Versioning
- Every pipeline run writes a JSON log: tool versions, reference genome, DB versions, config hashes, automation level per step, manual overrides (who/when/why).
- Reports embed references to those logs so outputs remain reproducible.
- Configs (ACMG thresholds, gene panels, PGx rules) are versioned artifacts stored alongside code.

## Security/Privacy Notes
- Default to local processing; if external LLMs are used, strip identifiers and avoid full VCF uploads.
- Secrets kept out of the repo; rely on environment variables or local config files (excluded by `.gitignore`).

## Initial Tech Bets (to be validated)
- Language/runtime: Python 3.11+ for pipelines, rules, and orchestration stubs.
- Bio stack candidates: GATK or DeepVariant; VEP; tabix for early querying; SQLAlchemy + SQLite/Postgres when scaling.
- Infra: containerized runners for pipelines; makefiles or a workflow engine (Nextflow/Snakemake) later if needed.
356 genomic_decision_support_system_spec_v0.1.md Normal file
@@ -0,0 +1,356 @@
# Personal Genomic Risk and Drug–Interaction Decision Support System – System Specification

- Version: v0.1-draft
- Author: Gbanyan + assistant planning
- Status: draft, expected to iterate alongside implementation
- Purpose: a baseline specification for system design and implementation, written to be read by LLMs (e.g., Claude, Codex) and developers.

---

## 0. System Goals and Scope

### 0.1 Goals

Build a "personalized, gene-driven" decision support system with the following core functions:

1. **Starting from exome sequencing data (BAM) of the individual and their parents**, produce a queryable variant database.
2. **For a given disease, symptom, or phenotype**, automatically look up related genes and known pathogenic variants, and search the personal data for matching variants.
3. Based on **guidelines such as ACMG/AMP** and public databases, provide **machine-assisted variant interpretation with human intervention points**.
4. Further integrate:
   - pharmacogenomics (PGx)
   - drug–drug interactions (DDI)
   - potential interactions and risks from supplements, herbal medicines, and other ingredients
5. Provide a natural-language Q&A interface where users can ask directly:
   - "Do I have a hereditary risk of XXX?"
   - "Given the drugs and supplements I currently take, are there interactions I should watch for?"

> The system is positioned as a **personal decision-support / research tool**, not a formal medical diagnostic system.

### 0.2 Core Design Philosophy

- **Phased development**: first stabilize the "genomic foundation" and variant interpretation, then expand toward PGx and interactions.
- **Explicit human–machine division of labor**: label every module `Auto / Auto+Review / Human-only`.
- **Traceable and auditable**: every conclusion can be traced back to the rules used, database versions, and manual overrides.

---

## 1. Development Phases and Overall Architecture

### 1.1 Phase Overview

| Phase | Name | Core deliverables | Primary target |
|------|------|----------|----------|
| Phase 1 | Genomic foundation & variant interpretation | Personal + trio VCF, annotation, disease/gene queries | Monogenic disease risk |
| Phase 2 | Pharmacogenomics & DDI | Gene–drug mapping, drug–drug interaction analysis | Precision medication advice |
| Phase 3 | Supplement & herbal interactions | Ingredient normalization, interaction risk tiers | Overall medication + supplement safety net |
| Phase 4 | NLP/LLM Q&A interface | Natural-language Q&A, report generation | General users / clinical conversations |

### 1.2 High-Level Architecture (Mermaid)

```mermaid
flowchart TD

    subgraph P1["Phase 1: Genomic foundation"]
        A1["BAM (proband + parents)"] --> A2[Variant Calling Pipeline]
        A2 --> A3["Joint VCF (trio)"]
        A3 --> A4["Variant Annotation (ClinVar, gnomAD, VEP...)"]
        A4 --> A5[Genomic DB / Query API]
    end

    subgraph P2["Phase 2: PGx & DDI"]
        B1[Drug list] --> B2[PGx Engine]
        A5 --> B2
        B1 --> B3[DDI Engine]
    end

    subgraph P3["Phase 3: Supplements & herbs"]
        C1[Supplement/herb list] --> C2[Ingredient normalization]
        C2 --> C3[Ingredient interaction engine]
        B1 --> C3
        A5 --> C3
    end

    subgraph P4["Phase 4: Q&A and reports"]
        D1[Frontend UI / CLI / API client] --> D2[LLM Orchestrator]
        D2 -->|disease/symptom questions| A5
        D2 -->|drug/ingredient questions| B2
        D2 --> B3
        D2 --> C3
        A5 --> D3[Report generator]
        B2 --> D3
        B3 --> D3
        C3 --> D3
    end
```

---

## 2. Cross-Cutting Design: Human–Machine Labels

Every module must be labeled with an automation level:

- `Auto`: steps that can run fully automatically (e.g., variant calling, basic annotation).
- `Auto+Review`: the system produces a suggestion first, which requires human review or conditional acceptance (e.g., parts of ACMG evidence scoring).
- `Human-only`: final medical judgments/wording/management advice must be made by a human (e.g., final Pathogenic classification, clinical management recommendations).

Each analysis must produce a **machine-readable log** recording:

- the modules and versions used
- the automation level of every step
- where manual overrides occurred (who, when, why)

---

## 3. Phase 1: Genomic Foundation and Variant Interpretation

### 3.1 Functional Requirements

1. **Inputs**
   - Exome sequencing BAM files for the individual and both parents.
2. **Outputs**
   - A high-quality joint VCF (trio)
   - Annotation for every variant:
     - gene, transcript, protein change
     - population frequency (gnomAD, etc.)
     - ClinVar annotations
     - functional predictions (SIFT/PolyPhen/CADD, etc.)
   - Filtered variant results against specific disease/gene lists.
3. **External services**
   - Provided via API / function interfaces:
     - given a gene list → return the individual's variants in those genes
     - support a disease name/HPO → genes → variants query flow (may be called step by step initially)

### 3.2 Module Design

#### 3.2.1 Variant Calling Pipeline

- **Input**: BAM (proband + parents)
- **Output**: per-sample gVCFs → joint VCF
- **Candidate tools**:
  - GATK (HaplotypeCaller + GenotypeGVCFs)
  - or DeepVariant + a joint genotyper
- **Automation level**: `Auto`
- **Requirements**:
  - basic QC (coverage, duplicate rate, on-target rate)
  - version tagging support (e.g., reference genome version)

#### 3.2.2 Annotation Pipeline

- **Input**: joint VCF
- **Output**: annotated VCF / variant table
- **Candidate tools**:
  - VEP, ANNOVAR, or similar
- **Databases**:
  - ClinVar
  - gnomAD
  - gene function and transcript databases
- **Automation level**: `Auto`

#### 3.2.3 Genomic DB / Query API

- **Purpose**: provide efficient queries as the foundation for downstream modules (disease risk, PGx, etc.).
- **Form**:
  - Option A: VCF + tabix, operated through wrapper functions
  - Option B: import into SQLite / PostgreSQL / a dedicated genomic DB
- **Key queries**:
  - `get_variants_by_gene(individual_id, gene_list, filters)`
  - `get_variants_by_region(individual_id, chr, start, end, filters)`
- **Automation level**: `Auto`

#### 3.2.4 Disease/Phenotype → Genes → Variants Flow

- Initially split into three steps:
  1. Use an external knowledge base or a manual panel: disease/phenotype → gene list
  2. Query personal variants through the Genomic DB
  3. Do initial ranking with simple rules (frequency, ClinVar labels)
- **Automation level**: `Auto+Review`

### 3.3 ACMG Rule Implementation (Initial Version)

- **Scope**: implement only the subset of evidence a machine can determine automatically (e.g., PVS1, PM2, BA1, BS1).
- **Output**:
  - a per-variant evidence tag list and a suggested classification (e.g., `suggested_class = "VUS"`)
- **Human intervention points**:
  - final variant classification (Pathogenic / Likely pathogenic / VUS / Likely benign / Benign) → `Human-only`
  - rule thresholds (e.g., frequency cutoffs) managed via config files → `Auto+Review`

---

## 4. Phase 2: Pharmacogenomics (PGx) and Drug–Drug Interactions (DDI)

### 4.1 Functional Requirements

1. Accept the user's current medication list (prescription and over-the-counter drugs).
2. Use the genomic data to determine PGx-relevant genotypes (e.g., CYP2D6, CYP2C9, HLA).
3. Based on guidelines such as CPIC / DPWG, provide:
   - indication-related risks (e.g., HLA-B*58:01 and allopurinol)
   - dose adjustment / drug substitution suggestions (decision-support level only)
4. Compute basic drug–drug interactions (DDI), for example:
   - stacked CYP inhibition / induction
   - stacked QT prolongation
   - stacked bleeding risk

### 4.2 Module Design

#### 4.2.1 Medication Data Normalization

- Use ATC / RxNorm / custom IDs.
- **Automation level**: `Auto`

#### 4.2.2 PGx Engine

- **Inputs**: personal variants (Phase 1 DB), drug list
- **Output**: a PGx assessment for each drug (genotype → phenotype → recommendation)
- **Databases**:
  - CPIC guidelines
  - PharmGKB association data
- **Automation levels**:
  - genotype → phenotype: `Auto`
  - phenotype → clinical recommendation: `Auto+Review`

#### 4.2.3 DDI Engine

- **Input**: drug list
- **Output**: list of known DDIs with severity grading
- **Data sources**: public or commercial DDI databases (depending on availability)
- **Automation level**: `Auto`

---

## 5. Phase 3: Supplement and Herbal Interaction Module

### 5.1 Functional Requirements

1. Accept the user's supplement and herbal medicine usage data.
2. Resolve names into:
   - standardized active ingredients (e.g., EPA/DHA mg, Vit D IU, ginkgo leaf extract mg)
   - herbal ingredient names (e.g., astragalus, dong quai, chuanxiong…)
3. Assess:
   - interaction risks between ingredients and drugs or genes
   - additive effects among ingredients (e.g., anticoagulation, CNS depression)
4. Report by evidence level:
   - high-priority alerts (stronger clinical evidence)
   - general reminders (animal studies / case reports, etc.)
   - insufficient data: can only flag the uncertainty

### 5.2 Module Design

#### 5.2.1 Ingredient Normalization Engine

- **Input**: user-entered product names / prescriptions
- **Outputs**:
  - standardized ingredient list
  - estimated dose ranges (when no exact data exist)
- **Data**:
  - reference tables of common supplement ingredients
  - mappings from herbal formulas to constituent herbs
- **Automation level**: `Auto+Review`

#### 5.2.2 Ingredient Interaction Engine

- **Inputs**: ingredient list, drug list, genomic data
- **Output**: interaction list with risk tiers
- **Logic**:
  - ingredient effects on CYP / P-gp / OATP and similar systems
  - ingredient effects on coagulation, blood pressure, the central nervous system, etc.
- **Automation levels**:
  - rule inference: `Auto`
  - final clinical recommendation wording: `Human-only`

---

## 6. Phase 4: NLP/LLM Q&A Interface and Report Generation

### 6.1 Functional Requirements

1. Support natural-language questions from users about:
   - disease/symptom-related risks
   - medication safety
   - risks of combining supplements and herbal medicines
2. The LLM is responsible for:
   - parsing questions → structured queries (disease, HPO, drugs, ingredients, etc.)
   - orchestrating calls to the underlying APIs (Phases 1–3)
   - integrating results and generating report drafts
3. Report formats:
   - machine-readable JSON (for post-processing)
   - human-readable Markdown / PDF reports

### 6.2 Orchestration Design

- An "LLM + tool/function calling" pattern can be adopted:
  - Tools include (see the sketch after this list):
    - `query_variants_by_gene`
    - `query_disease_gene_panel`
    - `run_pgx_analysis`
    - `run_ddi_analysis`
    - `run_supplement_herb_interaction`
  - The LLM is mainly responsible for:
    - intent recognition and decomposition
    - planning the tool-call sequence
    - interpreting results and adjusting wording (must follow safety and conservatism principles)
- **Automation levels**:
  - tool calls: `Auto`
  - clinically sensitive conclusions: `Auto+Review` / `Human-only` (depending on the scenario)
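A hypothetical sketch of that tool-calling surface; the tool names come from the list above, while the parameter shapes and the dispatcher are illustrative assumptions, not part of this commit:

```python
# Hypothetical tool registry for the orchestrator (parameter shapes assumed).
TOOLS = {
    "query_variants_by_gene": {"individual_id": str, "genes": list, "filters": dict},
    "query_disease_gene_panel": {"disease_or_hpo_id": str, "version": str},
    "run_pgx_analysis": {"individual_id": str, "drugs": list},
    "run_ddi_analysis": {"drugs": list},
    "run_supplement_herb_interaction": {"supplements": list, "drugs": list},
}


def dispatch(tool_name: str, **kwargs):
    """Route a structured LLM tool call to the matching Phase 1-3 API."""
    if tool_name not in TOOLS:
        raise ValueError(f"Unknown tool: {tool_name}")
    ...  # call the underlying module and return machine-readable JSON
```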

---

## 7. Security, Privacy, and Version Management

### 7.1 Data Security and Privacy

- All genomic data, medication lists, and reports:
  - are stored locally or in a controlled environment
  - if interaction with external services (e.g., cloud LLMs) is needed:
    - de-identify the data (remove personal information)
    - or use a local/private LLM instead

### 7.2 Version Management

- Version-control the following artifacts:
  - reference genome version
  - variant calling pipeline version
  - database versions (ClinVar, gnomAD, etc.)
  - ACMG rule config version
  - gene panel / PGx rule versions
- Every analysis report must record the versions used, to support tracing and re-runs.

### 7.3 Manual Intervention Records

- Every manual override or review must record:
  - the variant ID / analysis item
  - the original automated suggestion
  - the manually adjusted result
  - the rationale and references (if any)
  - the reviewer and timestamp

---

## 8. Future Extensions (Optional)

- Integrate a polygenic risk score (PRS) module
- Integrate longitudinal data (lab results, symptom diaries) for dynamic risk tracking
- Build deeper expert-curated knowledge bases for specific disease areas
- Integrate with wearables / other health data sources

---

## 9. Suggested Phase 1 Implementation Path (Actionable TODO)

1. **Plan Phase 1 technology selection**
   - choose a variant caller (e.g., GATK) and a reference genome version
   - choose an annotation tool (e.g., VEP or ANNOVAR)
2. **Build the basic pipeline**
   - BAM → gVCF → joint VCF (trio)
   - add basic QC reports
3. **Build a simple genomic query interface**
   - start with CLI/notebook functions (e.g., a Python library)
4. **Pick the disease area you care about most**
   - build the first gene panel (e.g., vision/hearing related)
   - implement panel-based queries and variant list output
5. **Write the first version of the report template**
   - input: disease name + gene panel + query results
   - output: a simple Markdown report (variant table + limitations)
6. **Gradually add automated ACMG evidence tagging and a human review workflow**

This specification is expected to be updated continuously during implementation; treat it as the v0.1 starting point.
31 pyproject.toml Normal file
@@ -0,0 +1,31 @@
[build-system]
requires = ["setuptools>=61", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "genomic-consultant"
version = "0.1.0"
description = "Personal genomic risk and drug–interaction decision support scaffolding"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
    "pyyaml>=6",
]

[project.scripts]
genomic-consultant = "genomic_consultant.cli:main"

[project.optional-dependencies]
dev = [
    "pytest",
    "ruff",
]
store = [
    "pandas>=2",
]

[tool.setuptools]
package-dir = {"" = "src"}

[tool.setuptools.packages.find]
where = ["src"]
3 sample_data/example_annotated.tsv Normal file
@@ -0,0 +1,3 @@
#CHROM	POS	REF	ALT	SYMBOL	Consequence	Protein_position	PolyPhen	SIFT	CLIN_SIG	AF	gnomAD_AF	SpliceAI	CADD_PHRED
1	123456	A	T	GJB2	missense_variant	p.Val37Ile	benign	tolerated	Benign	0.012	0.012	0.02	5.1
2	234567	G	C	OTOF	stop_gained	p.*	probably_damaging	deleterious	Uncertain_significance	0.0001	0.0001	0.6	28.5
63 src/genomic_consultant.egg-info/PKG-INFO Normal file
@@ -0,0 +1,63 @@
Metadata-Version: 2.4
Name: genomic-consultant
Version: 0.1.0
Summary: Personal genomic risk and drug–interaction decision support scaffolding
Requires-Python: >=3.11
Description-Content-Type: text/markdown
Requires-Dist: pyyaml>=6
Provides-Extra: dev
Requires-Dist: pytest; extra == "dev"
Requires-Dist: ruff; extra == "dev"

# Genomic Consultant

Early design for a personal genomic risk and drug–interaction decision support system. Specs are sourced from `genomic_decision_support_system_spec_v0.1.md`.

## Vision (per spec)
- Phase 1: trio variant calling, annotation, queryable genomic DB, initial ACMG evidence tagging.
- Phase 2: pharmacogenomics genotype-to-phenotype mapping plus drug–drug interaction checks.
- Phase 3: supplement/herb normalization and interaction risk layering.
- Phase 4: LLM-driven query orchestration and report generation.

## Repository Layout
- `docs/` — system architecture notes, phase plans, data models (work in progress).
- `configs/` — example ACMG config and gene panel JSON.
- `sample_data/` — tiny annotated TSV for demo.
- `src/genomic_consultant/` — Python scaffolding (pipelines, store, panel lookup, ACMG tagging, reporting).
- `genomic_decision_support_system_spec_v0.1.md` — original requirements draft.

## Contributing/next steps
1. Finalize Phase 1 tech selection (variant caller, annotation stack, reference/DB versions).
2. Stand up the Phase 1 pipelines and minimal query API surface.
3. Add ACMG evidence tagging config and human-review logging.
4. Layer in PGx/DDI and supplement modules per later phases.

Data safety: keep genomic/clinical data local; the `.gitignore` blocks common genomic outputs by default.

## Quickstart (CLI scaffolding)
```
pip install -e .

# 1) Show trio calling plan (commands only; not executed)
genomic-consultant plan-call \
  --sample proband:/data/proband.bam \
  --sample father:/data/father.bam \
  --sample mother:/data/mother.bam \
  --reference /refs/GRCh38.fa \
  --workdir /tmp/trio

# 2) Show annotation plan for a joint VCF
genomic-consultant plan-annotate \
  --vcf /tmp/trio/trio.joint.vcf.gz \
  --workdir /tmp/trio/annot \
  --prefix trio \
  --reference /refs/GRCh38.fa

# 3) Demo panel report using sample data
genomic-consultant panel-report \
  --tsv sample_data/example_annotated.tsv \
  --panel configs/panel.example.json \
  --acmg-config configs/acmg_config.example.yaml \
  --individual-id demo \
  --format markdown
```
27 src/genomic_consultant.egg-info/SOURCES.txt Normal file
@@ -0,0 +1,27 @@
README.md
pyproject.toml
src/genomic_consultant/__init__.py
src/genomic_consultant/cli.py
src/genomic_consultant.egg-info/PKG-INFO
src/genomic_consultant.egg-info/SOURCES.txt
src/genomic_consultant.egg-info/dependency_links.txt
src/genomic_consultant.egg-info/entry_points.txt
src/genomic_consultant.egg-info/requires.txt
src/genomic_consultant.egg-info/top_level.txt
src/genomic_consultant/acmg/__init__.py
src/genomic_consultant/acmg/tagger.py
src/genomic_consultant/audit/__init__.py
src/genomic_consultant/audit/run_log.py
src/genomic_consultant/orchestration/__init__.py
src/genomic_consultant/orchestration/workflows.py
src/genomic_consultant/panels/__init__.py
src/genomic_consultant/panels/panels.py
src/genomic_consultant/pipelines/__init__.py
src/genomic_consultant/pipelines/annotation.py
src/genomic_consultant/pipelines/variant_calling.py
src/genomic_consultant/reporting/__init__.py
src/genomic_consultant/reporting/report.py
src/genomic_consultant/store/__init__.py
src/genomic_consultant/store/query.py
src/genomic_consultant/utils/__init__.py
src/genomic_consultant/utils/models.py
1 src/genomic_consultant.egg-info/dependency_links.txt Normal file
@@ -0,0 +1 @@
2 src/genomic_consultant.egg-info/entry_points.txt Normal file
@@ -0,0 +1,2 @@
[console_scripts]
genomic-consultant = genomic_consultant.cli:main
5 src/genomic_consultant.egg-info/requires.txt Normal file
@@ -0,0 +1,5 @@
pyyaml>=6

[dev]
pytest
ruff
1 src/genomic_consultant.egg-info/top_level.txt Normal file
@@ -0,0 +1 @@
genomic_consultant
4 src/genomic_consultant/__init__.py Normal file
@@ -0,0 +1,4 @@
"""Genomic Consultant: genomic decision support scaffolding."""

__all__ = ["__version__"]
__version__ = "0.1.0"
1 src/genomic_consultant/acmg/__init__.py Normal file
@@ -0,0 +1 @@
91 src/genomic_consultant/acmg/tagger.py Normal file
@@ -0,0 +1,91 @@
from __future__ import annotations

from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Set

import yaml

from genomic_consultant.utils.models import EvidenceTag, SuggestedClassification, Variant


@dataclass
class ACMGConfig:
    ba1_af: float = 0.05
    bs1_af: float = 0.01
    pm2_af: float = 0.0005
    # Default to an empty set rather than None so `gene in config.lof_genes`
    # cannot raise TypeError when ACMGConfig is constructed directly.
    lof_genes: Set[str] = field(default_factory=set)
    bp7_splice_ai_max: float = 0.1


def load_acmg_config(path: Path) -> ACMGConfig:
    data = yaml.safe_load(Path(path).read_text())
    return ACMGConfig(
        ba1_af=data.get("ba1_af", 0.05),
        bs1_af=data.get("bs1_af", 0.01),
        pm2_af=data.get("pm2_af", 0.0005),
        lof_genes=set(data.get("lof_genes", [])) if data.get("lof_genes") else set(),
        bp7_splice_ai_max=data.get("bp7_splice_ai_max", 0.1),
    )


def tag_variant(variant: Variant, config: ACMGConfig) -> SuggestedClassification:
    evidence: List[EvidenceTag] = []
    af = variant.allele_frequency

    if af is not None:
        if af >= config.ba1_af:
            evidence.append(EvidenceTag(tag="BA1", strength="Stand-alone", rationale=f"AF {af} >= {config.ba1_af}"))
        elif af >= config.bs1_af:
            evidence.append(EvidenceTag(tag="BS1", strength="Strong", rationale=f"AF {af} >= {config.bs1_af}"))
        elif af <= config.pm2_af:
            evidence.append(EvidenceTag(tag="PM2", strength="Moderate", rationale=f"AF {af} <= {config.pm2_af}"))

    if _is_lof(variant) and variant.gene and variant.gene in config.lof_genes:
        evidence.append(
            EvidenceTag(tag="PVS1", strength="Very strong", rationale="Predicted LoF in LoF-sensitive gene")
        )

    splice_ai = _get_float(variant.annotations.get("splice_ai_delta_score"))
    if _is_synonymous(variant) and (splice_ai is None or splice_ai <= config.bp7_splice_ai_max):
        evidence.append(
            EvidenceTag(
                tag="BP7",
                strength="Supporting",
                rationale=f"Synonymous with low predicted splice impact (spliceAI {splice_ai})",
            )
        )

    suggested = _suggest_class(evidence)
    return SuggestedClassification(suggested_class=suggested, evidence=evidence)


def _is_lof(variant: Variant) -> bool:
    consequence = (variant.consequence or "").lower()
    lof_keywords = ["frameshift", "stop_gained", "splice_acceptor", "splice_donor", "start_lost"]
    return any(k in consequence for k in lof_keywords)


def _suggest_class(evidence: List[EvidenceTag]) -> str:
    tags = {e.tag for e in evidence}
    if "BA1" in tags:
        return "Benign"
    if "BS1" in tags and "PM2" not in tags and "PVS1" not in tags:
        return "Likely benign"
    if "PVS1" in tags and "PM2" in tags:
        return "Likely pathogenic"
    return "VUS"


def _is_synonymous(variant: Variant) -> bool:
    consequence = (variant.consequence or "").lower()
    return "synonymous_variant" in consequence


def _get_float(value: str | float | None) -> float | None:
    if value is None:
        return None
    try:
        return float(value)
    except (TypeError, ValueError):
        return None
1 src/genomic_consultant/audit/__init__.py Normal file
@@ -0,0 +1 @@
30 src/genomic_consultant/audit/run_log.py Normal file
@@ -0,0 +1,30 @@
from __future__ import annotations

import json
from dataclasses import asdict, is_dataclass
from datetime import datetime
from pathlib import Path
from typing import Any

from genomic_consultant.utils.models import RunLog


def _serialize(obj: Any) -> Any:
    if isinstance(obj, datetime):
        return obj.isoformat()
    if is_dataclass(obj):
        return {k: _serialize(v) for k, v in asdict(obj).items()}
    if isinstance(obj, dict):
        return {k: _serialize(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [_serialize(v) for v in obj]
    return obj


def write_run_log(run_log: RunLog, path: str | Path) -> Path:
    """Persist a RunLog to JSON."""
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = _serialize(run_log)
    path.write_text(json.dumps(payload, indent=2))
    return path
273
src/genomic_consultant/cli.py
Normal file
273
src/genomic_consultant/cli.py
Normal file
@@ -0,0 +1,273 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
|
||||
from genomic_consultant.acmg.tagger import ACMGConfig, load_acmg_config
|
||||
from genomic_consultant.orchestration.workflows import run_panel_variant_review
|
||||
from genomic_consultant.panels.aggregate import merge_mappings
|
||||
from genomic_consultant.panels.panels import load_panel
|
||||
from genomic_consultant.panels.resolver import PhenotypeGeneResolver
|
||||
from genomic_consultant.pipelines.annotation import build_vep_plan
|
||||
from genomic_consultant.pipelines.variant_calling import build_gatk_trio_plan
|
||||
from genomic_consultant.pipelines.runner import execute_plan
|
||||
from genomic_consultant.reporting.report import panel_report_json, panel_report_markdown
|
||||
from genomic_consultant.orchestration.phase1_pipeline import run_phase1_pipeline
|
||||
from genomic_consultant.store.query import GenomicStore
|
||||
from genomic_consultant.utils.hashing import sha256sum
|
||||
from genomic_consultant.utils.models import FilterConfig
|
||||
from genomic_consultant.utils.tooling import probe_tool_versions
|
||||
|
||||
|
||||
def parse_samples(sample_args: List[str]) -> Dict[str, Path]:
|
||||
samples: Dict[str, Path] = {}
|
||||
for arg in sample_args:
|
||||
if ":" not in arg:
|
||||
raise ValueError(f"Sample must be sample_id:/path/to.bam, got {arg}")
|
||||
sample_id, bam_path = arg.split(":", 1)
|
||||
samples[sample_id] = Path(bam_path)
|
||||
return samples
|
||||
|
||||
|
||||
def main(argv: List[str] | None = None) -> int:
|
||||
parser = argparse.ArgumentParser(prog="genomic-consultant", description="Genomic decision support scaffolding")
|
||||
sub = parser.add_subparsers(dest="command", required=True)
|
||||
|
||||
# Variant calling plan
|
||||
call = sub.add_parser("plan-call", help="Build variant calling command plan (GATK trio).")
|
||||
call.add_argument("--sample", action="append", required=True, help="sample_id:/path/to.bam (repeatable)")
|
||||
call.add_argument("--reference", required=True, help="Path to reference FASTA")
|
||||
call.add_argument("--workdir", required=True, help="Working directory for outputs")
|
||||
call.add_argument("--prefix", default="trio", help="Output prefix for joint VCF")
|
||||
|
||||
run_call = sub.add_parser("run-call", help="Execute variant calling plan (GATK trio).")
|
||||
run_call.add_argument("--sample", action="append", required=True, help="sample_id:/path/to.bam (repeatable)")
|
||||
run_call.add_argument("--reference", required=True, help="Path to reference FASTA")
|
||||
run_call.add_argument("--workdir", required=True, help="Working directory for outputs")
|
||||
run_call.add_argument("--prefix", default="trio", help="Output prefix for joint VCF")
|
||||
run_call.add_argument("--log", required=False, help="Path to write run log JSON")
|
||||
run_call.add_argument("--probe-tools", action="store_true", help="Attempt to record tool versions")
|
||||
|
||||
# Annotation plan
|
||||
ann = sub.add_parser("plan-annotate", help="Build annotation command plan (VEP).")
|
||||
ann.add_argument("--vcf", required=True, help="Path to joint VCF")
|
||||
ann.add_argument("--workdir", required=True, help="Working directory for annotated outputs")
|
||||
ann.add_argument("--prefix", default="annotated", help="Output prefix")
|
||||
ann.add_argument("--reference", required=False, help="Reference FASTA (optional)")
|
||||
ann.add_argument("--plugin", action="append", help="VEP plugin spec, repeatable", default=[])
|
||||
|
||||
run_ann = sub.add_parser("run-annotate", help="Execute annotation plan (VEP).")
|
||||
run_ann.add_argument("--vcf", required=True, help="Path to joint VCF")
|
||||
run_ann.add_argument("--workdir", required=True, help="Working directory for annotated outputs")
|
||||
run_ann.add_argument("--prefix", default="annotated", help="Output prefix")
|
||||
run_ann.add_argument("--reference", required=False, help="Reference FASTA (optional)")
|
||||
run_ann.add_argument("--plugin", action="append", help="VEP plugin spec, repeatable", default=[])
|
||||
run_ann.add_argument("--extra-flag", action="append", help="Extra flags appended to VEP command", default=[])
|
||||
run_ann.add_argument("--log", required=False, help="Path to write run log JSON")
|
||||
run_ann.add_argument("--probe-tools", action="store_true", help="Attempt to record tool versions")
|
||||
|
||||
# Panel report
|
||||
panel = sub.add_parser("panel-report", help="Run panel query + ACMG tagging and emit report.")
|
||||
panel.add_argument("--tsv", required=True, help="Flattened annotated TSV")
|
||||
panel.add_argument("--panel", required=False, help="Panel JSON file")
|
||||
panel.add_argument("--phenotype-id", required=False, help="Phenotype/HPO ID to resolve to a panel")
|
||||
panel.add_argument("--phenotype-mapping", required=False, help="Phenotype→genes mapping JSON")
|
||||
panel.add_argument("--acmg-config", required=True, help="ACMG config YAML")
|
||||
panel.add_argument("--individual-id", required=True, help="Individual identifier")
|
||||
panel.add_argument("--max-af", type=float, default=None, help="Max allele frequency filter")
|
||||
panel.add_argument("--format", choices=["markdown", "json"], default="markdown", help="Output format")
|
||||
panel.add_argument("--log", required=False, help="Path to write run log JSON for the analysis")
|
||||
panel.add_argument("--phenotype-panel", required=False, help="Phenotype→genes mapping JSON (optional)")
|
||||
|
||||
phase1 = sub.add_parser("phase1-run", help="End-to-end Phase 1 pipeline (call→annotate→panel).")
|
||||
phase1.add_argument("--sample", action="append", help="sample_id:/path/to.bam (repeatable)")
|
||||
phase1.add_argument("--reference", required=False, help="Path to reference FASTA")
|
||||
phase1.add_argument("--workdir", required=True, help="Working directory for outputs")
|
||||
phase1.add_argument("--prefix", default="trio", help="Output prefix")
|
||||
phase1.add_argument("--plugins", action="append", default=[], help="VEP plugin specs")
|
||||
phase1.add_argument("--extra-flag", action="append", default=[], help="Extra flags for VEP")
|
||||
phase1.add_argument("--joint-vcf", required=False, help="Existing joint VCF (skip calling)")
|
||||
phase1.add_argument("--tsv", required=False, help="Existing annotated TSV (skip annotation)")
|
||||
phase1.add_argument("--skip-call", action="store_true", help="Skip variant calling step")
|
||||
phase1.add_argument("--skip-annotate", action="store_true", help="Skip annotation step")
|
||||
phase1.add_argument("--panel", required=False, help="Panel JSON file")
|
||||
phase1.add_argument("--phenotype-id", required=False, help="Phenotype/HPO ID to resolve to a panel")
|
||||
phase1.add_argument("--phenotype-mapping", required=False, help="Phenotype→genes mapping JSON")
|
||||
phase1.add_argument("--acmg-config", required=True, help="ACMG config YAML")
|
||||
phase1.add_argument("--max-af", type=float, default=None, help="Max allele frequency filter")
|
||||
phase1.add_argument("--format", choices=["markdown", "json"], default="markdown", help="Report format")
|
||||
phase1.add_argument("--log-dir", required=False, help="Directory to write run logs")
|
||||
|
||||
build_map = sub.add_parser("build-phenotype-mapping", help="Merge phenotype→gene mapping JSON files.")
|
||||
build_map.add_argument("--output", required=True, help="Output JSON path")
|
||||
build_map.add_argument("inputs", nargs="+", help="Input mapping JSON files")
|
||||
|
||||
args = parser.parse_args(argv)
|
||||
|
||||
if args.command == "plan-call":
|
||||
samples = parse_samples(args.sample)
|
||||
plan = build_gatk_trio_plan(
|
||||
samples=samples,
|
||||
reference_fasta=Path(args.reference),
|
||||
workdir=Path(args.workdir),
|
||||
output_prefix=args.prefix,
|
||||
)
|
||||
print("# Variant calling command plan")
|
||||
for cmd in plan.commands:
|
||||
print(cmd)
|
||||
return 0
|
||||
|
||||
if args.command == "run-call":
|
||||
samples = parse_samples(args.sample)
|
||||
plan = build_gatk_trio_plan(
|
||||
samples=samples,
|
||||
reference_fasta=Path(args.reference),
|
||||
workdir=Path(args.workdir),
|
||||
output_prefix=args.prefix,
|
||||
)
|
||||
tool_versions = probe_tool_versions({"gatk": "gatk"}) if args.probe_tools else {}
|
||||
run_log = execute_plan(
|
||||
plan, automation_level="Auto", log_path=Path(args.log) if args.log else None
|
||||
)
|
||||
run_log.tool_versions.update(tool_versions)
|
||||
        if args.log:
            from genomic_consultant.audit.run_log import write_run_log

            write_run_log(run_log, Path(args.log))
        print(f"Run finished with {len(run_log.outputs.get('command_results', []))} steps. Log ID: {run_log.run_id}")
        if args.log:
            print(f"Run log written to {args.log}")
        return 0

    if args.command == "plan-annotate":
        plan = build_vep_plan(
            vcf_path=Path(args.vcf),
            workdir=Path(args.workdir),
            reference_fasta=Path(args.reference) if args.reference else None,
            output_prefix=args.prefix,
            plugins=args.plugin,
        )
        print("# Annotation command plan")
        for cmd in plan.commands:
            print(cmd)
        return 0

    if args.command == "run-annotate":
        plan = build_vep_plan(
            vcf_path=Path(args.vcf),
            workdir=Path(args.workdir),
            reference_fasta=Path(args.reference) if args.reference else None,
            output_prefix=args.prefix,
            plugins=args.plugin,
            extra_flags=args.extra_flag,
        )
        tool_versions = probe_tool_versions({"vep": "vep", "bcftools": "bcftools", "tabix": "tabix"}) if args.probe_tools else {}
        run_log = execute_plan(
            plan, automation_level="Auto", log_path=Path(args.log) if args.log else None
        )
        run_log.tool_versions.update(tool_versions)
        if args.log:
            from genomic_consultant.audit.run_log import write_run_log

            write_run_log(run_log, Path(args.log))
        print(f"Run finished with {len(run_log.outputs.get('command_results', []))} steps. Log ID: {run_log.run_id}")
        if args.log:
            print(f"Run log written to {args.log}")
        return 0

    if args.command == "panel-report":
        if not args.panel and not (args.phenotype_id and args.phenotype_mapping):
            raise SystemExit("Provide either --panel or (--phenotype-id and --phenotype-mapping).")

        store = GenomicStore.from_tsv(Path(args.tsv))
        if args.panel:
            panel_obj = load_panel(Path(args.panel))
            panel_config_hash = sha256sum(Path(args.panel))
        else:
            resolver = PhenotypeGeneResolver.from_json(Path(args.phenotype_mapping))
            panel_obj = resolver.build_panel(args.phenotype_id)
            if panel_obj is None:
                raise SystemExit(f"No genes found for phenotype {args.phenotype_id}")
            panel_config_hash = sha256sum(Path(args.phenotype_mapping))

        acmg_config = load_acmg_config(Path(args.acmg_config))
        filters = FilterConfig(max_af=args.max_af)
        result = run_panel_variant_review(
            individual_id=args.individual_id,
            panel=panel_obj,
            store=store,
            acmg_config=acmg_config,
            filters=filters,
        )
        output = panel_report_json(result) if args.format == "json" else panel_report_markdown(result)
        print(output)

        if args.log:
            from genomic_consultant.audit.run_log import write_run_log
            from genomic_consultant.utils.models import RunLog

            run_log = RunLog(
                run_id=f"panel-{args.individual_id}",
                started_at=datetime.utcnow(),
                inputs={
                    "tsv": str(args.tsv),
                    "panel": str(args.panel) if args.panel else f"phenotype:{args.phenotype_id}",
                    "acmg_config": str(args.acmg_config),
                },
                parameters={
                    "max_af": args.max_af,
                    "format": args.format,
                    "phenotype_id": args.phenotype_id,
                },
                tool_versions={},
                database_versions={},
                config_hashes={
                    "panel": panel_config_hash,
                    "acmg_config": sha256sum(Path(args.acmg_config)),
                },
                automation_levels={"panel_report": "Auto+Review"},
                overrides=[],
                outputs={"report": output},
                notes=None,
            )
            write_run_log(run_log, Path(args.log))
            print(f"Analysis log written to {args.log}")
        return 0

    if args.command == "build-phenotype-mapping":
        merge_mappings(inputs=[Path(p) for p in args.inputs], output=Path(args.output))
        print(f"Merged mapping written to {args.output}")
        return 0

    if args.command == "phase1-run":
        samples = parse_samples(args.sample) if args.sample else None
        log_dir = Path(args.log_dir) if args.log_dir else None
        artifacts = run_phase1_pipeline(
            samples=samples,
            reference_fasta=Path(args.reference) if args.reference else None,
            workdir=Path(args.workdir),
            output_prefix=args.prefix,
            acmg_config_path=Path(args.acmg_config),
            max_af=args.max_af,
            panel_path=Path(args.panel) if args.panel else None,
            phenotype_id=args.phenotype_id,
            phenotype_mapping=Path(args.phenotype_mapping) if args.phenotype_mapping else None,
            report_format=args.format,
            plugins=args.plugins,
            extra_flags=args.extra_flag,
            existing_joint_vcf=Path(args.joint_vcf) if args.joint_vcf else None,
            existing_tsv=Path(args.tsv) if args.tsv else None,
            skip_call=args.skip_call,
            skip_annotate=args.skip_annotate,
            log_dir=log_dir,
        )
        print(artifacts.panel_report)
        return 0

    return 1


if __name__ == "__main__":
    sys.exit(main())
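# Illustrative end-to-end invocation using the bundled demo data. Flag spellings
# are inferred from the args attributes used above (the argparse setup is not
# shown in this hunk), so treat them as assumptions:
#   genomic-consultant phase1-run --workdir runtime --prefix demo \
#     --tsv sample_data/example_annotated.tsv --skip-call --skip-annotate \
#     --panel configs/panel.example.json --acmg-config configs/acmg_config.example.yaml \
#     --max-af 0.05 --format markdown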
1
src/genomic_consultant/orchestration/__init__.py
Normal file
@@ -0,0 +1 @@

146
src/genomic_consultant/orchestration/phase1_pipeline.py
Normal file
@@ -0,0 +1,146 @@
from __future__ import annotations

import uuid
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional

from datetime import datetime, timezone

from genomic_consultant.acmg.tagger import load_acmg_config
from genomic_consultant.audit.run_log import write_run_log
from genomic_consultant.orchestration.workflows import run_panel_variant_review
from genomic_consultant.panels.panels import load_panel
from genomic_consultant.panels.resolver import PhenotypeGeneResolver
from genomic_consultant.pipelines.annotation import AnnotationPlan, build_vep_plan
from genomic_consultant.pipelines.runner import execute_plan
from genomic_consultant.pipelines.variant_calling import VariantCallingPlan, build_gatk_trio_plan
from genomic_consultant.reporting.report import panel_report_json, panel_report_markdown
from genomic_consultant.store.query import GenomicStore
from genomic_consultant.utils.hashing import sha256sum
from genomic_consultant.utils.models import FilterConfig, RunLog


@dataclass
class Phase1Artifacts:
    call_log: Optional[RunLog]
    annotate_log: Optional[RunLog]
    panel_report: str
    panel_report_format: str
    panel_result_log: RunLog
    tsv_path: Path
    joint_vcf: Optional[Path]


def run_phase1_pipeline(
    samples: Optional[Dict[str, Path]],
    reference_fasta: Optional[Path],
    workdir: Path,
    output_prefix: str,
    acmg_config_path: Path,
    max_af: Optional[float],
    panel_path: Optional[Path],
    phenotype_id: Optional[str],
    phenotype_mapping: Optional[Path],
    report_format: str = "markdown",
    plugins: Optional[List[str]] = None,
    extra_flags: Optional[List[str]] = None,
    existing_joint_vcf: Optional[Path] = None,
    existing_tsv: Optional[Path] = None,
    skip_call: bool = False,
    skip_annotate: bool = False,
    log_dir: Optional[Path] = None,
) -> Phase1Artifacts:
    """
    Orchestrate Phase 1: optional call -> annotate -> panel report.
    Allows skipping call/annotate when precomputed artifacts are supplied.
    """
    log_dir = log_dir or workdir / "runtime"
    log_dir.mkdir(parents=True, exist_ok=True)
    call_log = None
    annotate_log = None

    # Variant calling (joint_vcf starts from any precomputed VCF and is
    # overwritten by the call step when that step runs)
    joint_vcf: Optional[Path] = existing_joint_vcf
    if not skip_call:
        if not samples or not reference_fasta:
            raise ValueError("samples and reference_fasta are required unless skip_call is True")
        call_plan: VariantCallingPlan = build_gatk_trio_plan(
            samples=samples, reference_fasta=reference_fasta, workdir=workdir, output_prefix=output_prefix
        )
        call_log_path = log_dir / f"{output_prefix}_call_runlog.json"
        call_log = execute_plan(call_plan, automation_level="Auto", log_path=call_log_path)
        joint_vcf = call_plan.joint_vcf

    # Annotation
    tsv_path: Optional[Path] = existing_tsv
    if not skip_annotate:
        if joint_vcf is None:
            raise ValueError("joint VCF must be provided (via call step or existing_joint_vcf)")
        ann_plan: AnnotationPlan = build_vep_plan(
            vcf_path=joint_vcf,
            workdir=workdir,
            reference_fasta=reference_fasta,
            output_prefix=output_prefix,
            plugins=plugins or [],
            extra_flags=extra_flags or [],
        )
        ann_log_path = log_dir / f"{output_prefix}_annotate_runlog.json"
        annotate_log = execute_plan(ann_plan, automation_level="Auto", log_path=ann_log_path)
        tsv_path = ann_plan.flat_table

    if tsv_path is None:
        raise ValueError("No TSV available; provide existing_tsv or run annotation.")

    # Panel selection
    if panel_path:
        panel_obj = load_panel(panel_path)
        panel_hash = sha256sum(panel_path)
    elif phenotype_id and phenotype_mapping:
        resolver = PhenotypeGeneResolver.from_json(phenotype_mapping)
        panel_obj = resolver.build_panel(phenotype_id)
        if panel_obj is None:
            raise ValueError(f"No genes found for phenotype {phenotype_id}")
        panel_hash = sha256sum(phenotype_mapping)
    else:
        raise ValueError("Provide panel_path or (phenotype_id and phenotype_mapping).")

    acmg_config = load_acmg_config(acmg_config_path)
    store = GenomicStore.from_tsv(tsv_path)
    filters = FilterConfig(max_af=max_af)
    panel_result = run_panel_variant_review(
        individual_id=output_prefix, panel=panel_obj, store=store, acmg_config=acmg_config, filters=filters
    )
    report = panel_report_markdown(panel_result) if report_format == "markdown" else panel_report_json(panel_result)

    panel_log = RunLog(
        run_id=f"phase1-panel-{uuid.uuid4()}",
        started_at=datetime.now(timezone.utc),
        inputs={
            "tsv": str(tsv_path),
            "panel": str(panel_path) if panel_path else f"phenotype:{phenotype_id}",
            "acmg_config": str(acmg_config_path),
        },
        parameters={"max_af": max_af, "report_format": report_format, "phenotype_id": phenotype_id},
        tool_versions={},
        database_versions={},
        config_hashes={"panel": panel_hash, "acmg_config": sha256sum(acmg_config_path)},
        automation_levels={"panel_report": "Auto+Review"},
        overrides=[],
        outputs={"report": report},
        notes=None,
    )
    panel_log_path = log_dir / f"{output_prefix}_panel_runlog.json"
    write_run_log(panel_log, panel_log_path)

    return Phase1Artifacts(
        call_log=call_log,
        annotate_log=annotate_log,
        panel_report=report,
        panel_report_format=report_format,
        panel_result_log=panel_log,
        tsv_path=tsv_path,
        joint_vcf=joint_vcf,
    )
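# Illustrative use with precomputed artifacts (mirrors
# tests/test_phase1_pipeline.py; paths assume the repo's sample data):
#   artifacts = run_phase1_pipeline(
#       samples=None, reference_fasta=None, workdir=Path("runtime"),
#       output_prefix="demo", acmg_config_path=Path("configs/acmg_config.example.yaml"),
#       max_af=0.05, panel_path=Path("configs/panel.example.json"),
#       phenotype_id=None, phenotype_mapping=None,
#       existing_tsv=Path("sample_data/example_annotated.tsv"),
#       skip_call=True, skip_annotate=True,
#   )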
35
src/genomic_consultant/orchestration/workflows.py
Normal file
@@ -0,0 +1,35 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import List

from genomic_consultant.acmg.tagger import ACMGConfig, tag_variant
from genomic_consultant.panels.panels import GenePanel
from genomic_consultant.store.query import GenomicStore
from genomic_consultant.utils.models import FilterConfig, SuggestedClassification, Variant


@dataclass
class PanelVariantResult:
    variant: Variant
    acmg: SuggestedClassification


@dataclass
class PanelAnalysisResult:
    individual_id: str
    panel: GenePanel
    variants: List[PanelVariantResult]


def run_panel_variant_review(
    individual_id: str,
    panel: GenePanel,
    store: GenomicStore,
    acmg_config: ACMGConfig,
    filters: FilterConfig | None = None,
) -> PanelAnalysisResult:
    """Query variants for a panel and attach ACMG evidence suggestions."""
    variants = store.get_variants_by_gene(individual_id=individual_id, genes=panel.genes, filters=filters)
    enriched = [PanelVariantResult(variant=v, acmg=tag_variant(v, acmg_config)) for v in variants]
    return PanelAnalysisResult(individual_id=individual_id, panel=panel, variants=enriched)
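# Sketch of the intended call pattern (panel/store/config objects come from the
# panels, store, and acmg layers):
#   result = run_panel_variant_review("proband", panel, store, acmg_config,
#                                     filters=FilterConfig(max_af=0.01))
#   for pv in result.variants:
#       print(pv.variant.id, pv.acmg.suggested_class)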
1
src/genomic_consultant/panels/__init__.py
Normal file
@@ -0,0 +1 @@

31
src/genomic_consultant/panels/aggregate.py
Normal file
@@ -0,0 +1,31 @@
from __future__ import annotations

import json
from pathlib import Path
from typing import Dict, Iterable, List, Set


def merge_mappings(inputs: Iterable[Path], output: Path, version: str = "merged", sources: List[str] | None = None) -> Path:
    """
    Merge multiple phenotype→gene mapping JSON files into one.
    Input schema: {"phenotype_to_genes": {"HP:xxxx": ["GENE1", ...]}, "version": "...", "source": "..."}
    """
    # Materialize once so a generator input survives the second pass below.
    input_paths = [Path(p) for p in inputs]
    merged: Dict[str, Set[str]] = {}
    # Copy so a caller-supplied list is not mutated.
    source_list: List[str] = list(sources) if sources else []
    for path in input_paths:
        data = json.loads(path.read_text())
        phenos = data.get("phenotype_to_genes", {})
        for pid, genes in phenos.items():
            merged.setdefault(pid, set()).update(genes)
        src_label = data.get("source") or path.name
        source_list.append(src_label)

    out = {
        "version": version,
        "source": ",".join(source_list),
        "phenotype_to_genes": {pid: sorted(genes) for pid, genes in merged.items()},
        "metadata": {"merged_from": [str(p) for p in input_paths]},
    }
    output.parent.mkdir(parents=True, exist_ok=True)
    output.write_text(json.dumps(out, indent=2))
    return output
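# Example: merging two mapping files (hypothetical paths); gene lists are
# unioned per phenotype ID and sorted for stable output:
#   merge_mappings([Path("configs/a.json"), Path("configs/b.json")],
#                  Path("configs/merged.json"))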
38
src/genomic_consultant/panels/panels.py
Normal file
@@ -0,0 +1,38 @@
from __future__ import annotations

import json
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Optional

from genomic_consultant.utils.models import GenePanel


@dataclass
class PanelRepository:
    """Loads curated gene panels stored as JSON files."""

    panels: Dict[str, GenePanel]

    @classmethod
    def from_directory(cls, path: Path) -> "PanelRepository":
        panels: Dict[str, GenePanel] = {}
        for json_file in Path(path).glob("*.json"):
            panel = load_panel(json_file)
            panels[panel.name] = panel
        return cls(panels=panels)

    def get(self, name: str) -> Optional[GenePanel]:
        return self.panels.get(name)


def load_panel(path: Path) -> GenePanel:
    data = json.loads(Path(path).read_text())
    return GenePanel(
        name=data["name"],
        genes=data["genes"],
        source=data.get("source", "unknown"),
        version=data.get("version", "unknown"),
        last_updated=data.get("last_updated", ""),
        metadata=data.get("metadata", {}),
    )
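# Illustrative panel JSON consumed by load_panel (field names match the keys
# read above; values are placeholders, not the shipped example config):
#   {"name": "demo_panel", "genes": ["GENE1", "GENE2"], "source": "curated",
#    "version": "0.1", "last_updated": "2024-01-01"}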
42
src/genomic_consultant/panels/resolver.py
Normal file
@@ -0,0 +1,42 @@
from __future__ import annotations

import json
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional

from genomic_consultant.utils.models import GenePanel


@dataclass
class PhenotypeGeneResolver:
    """Resolves phenotype/HPO terms to gene lists using a curated mapping file."""

    mapping: Dict[str, List[str]]
    version: str
    source: str

    @classmethod
    def from_json(cls, path: Path) -> "PhenotypeGeneResolver":
        data = json.loads(Path(path).read_text())
        mapping = data.get("phenotype_to_genes", {})
        version = data.get("version", "unknown")
        source = data.get("source", "unknown")
        return cls(mapping=mapping, version=version, source=source)

    def resolve(self, phenotype_id: str) -> Optional[List[str]]:
        """Return gene list for a phenotype/HPO ID if present."""
        return self.mapping.get(phenotype_id)

    def build_panel(self, phenotype_id: str) -> Optional[GenePanel]:
        genes = self.resolve(phenotype_id)
        if not genes:
            return None
        return GenePanel(
            name=f"Phenotype:{phenotype_id}",
            genes=genes,
            source=self.source,
            version=self.version,
            last_updated="",
            metadata={"phenotype_id": phenotype_id},
        )
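# Expected mapping JSON shape (illustrative values, matching the test fixtures):
#   {"version": "test", "source": "example",
#    "phenotype_to_genes": {"HP:0001": ["GENE1", "GENE2"]}}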
1
src/genomic_consultant/pipelines/__init__.py
Normal file
@@ -0,0 +1 @@

72
src/genomic_consultant/pipelines/annotation.py
Normal file
@@ -0,0 +1,72 @@
from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import List, Sequence


@dataclass
class AnnotationPlan:
    """Command plan for annotating a VCF with VEP (or similar)."""

    annotated_vcf: Path
    flat_table: Path
    commands: List[str]
    workdir: Path


def build_vep_plan(
    vcf_path: Path,
    workdir: Path,
    reference_fasta: Path | None = None,
    output_prefix: str = "annotated",
    plugins: Sequence[str] | None = None,
    extra_flags: Sequence[str] | None = None,
) -> AnnotationPlan:
    """
    Build shell commands for running VEP on a VCF. Produces compressed VCF and a flattened TSV.
    This is a plan only; execution is left to a runner.
    """
    workdir = Path(workdir)
    workdir.mkdir(parents=True, exist_ok=True)

    annotated_vcf = workdir / f"{output_prefix}.vep.vcf.gz"
    flat_table = workdir / f"{output_prefix}.vep.tsv"

    plugin_arg = ""
    if plugins:
        plugin_arg = " ".join(f"--plugin {p}" for p in plugins)

    extra_arg = " ".join(extra_flags) if extra_flags else ""

    ref_arg = f"--fasta {reference_fasta}" if reference_fasta else ""

    commands: List[str] = [
        (
            "vep "
            f"-i {vcf_path} "
            f"-o {annotated_vcf} "
            "--vcf --compress_output bgzip "
            # --everything already implies --symbol, --canonical, --af,
            # --af_gnomad, --polyphen b and --sift b, so those flags are
            # not repeated here.
            "--everything "
            f"{plugin_arg} "
            f"{extra_arg} "
            f"{ref_arg}"
        ),
        f"tabix -p vcf {annotated_vcf}",
        (
            # SYMBOL, Consequence, etc. live inside the VEP CSQ annotation and
            # are not plain INFO tags, so extract them with bcftools'
            # split-vep plugin (one row per consequence via -d) rather than
            # bcftools query.
            "bcftools +split-vep "
            "-f '%CHROM\\t%POS\\t%REF\\t%ALT\\t%SYMBOL\\t%Consequence\\t%Protein_position\\t"
            "%PolyPhen\\t%SIFT\\t%CLIN_SIG\\t%AF\\t%gnomAD_AF\\t%SpliceAI\\t%CADD_PHRED\\n' "
            "-d "
            f"{annotated_vcf} > {flat_table}"
        ),
    ]

    return AnnotationPlan(
        annotated_vcf=annotated_vcf,
        flat_table=flat_table,
        commands=commands,
        workdir=workdir,
    )
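# Sketch of the resulting plan (paths hypothetical):
#   plan = build_vep_plan(Path("trio.joint.vcf.gz"), Path("/tmp/ann"))
#   plan.commands -> ["vep -i trio.joint.vcf.gz ...",
#                     "tabix -p vcf ...",
#                     "bcftools +split-vep ... > /tmp/ann/annotated.vep.tsv"]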
62
src/genomic_consultant/pipelines/runner.py
Normal file
@@ -0,0 +1,62 @@
from __future__ import annotations

import subprocess
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Protocol

from genomic_consultant.audit.run_log import write_run_log
from genomic_consultant.utils.models import RunLog


class HasCommands(Protocol):
    commands: List[str]
    workdir: Path


def execute_plan(plan: HasCommands, automation_level: str, log_path: Path | None = None) -> RunLog:
    """
    Execute shell commands defined in a plan sequentially. Captures stdout/stderr and exit codes.
    Suitable for VariantCallingPlan or AnnotationPlan.
    """
    run_id = str(uuid.uuid4())
    # Timezone-aware timestamp, consistent with phase1_pipeline.
    started_at = datetime.now(timezone.utc)
    outputs: Dict[str, Any] = {}
    # The plan object may expose outputs such as joint_vcf/annotated_vcf we can introspect.
    for attr in ("joint_vcf", "per_sample_gvcf", "annotated_vcf", "flat_table"):
        if hasattr(plan, attr):
            outputs[attr] = getattr(plan, attr)

    results: List[Dict[str, Any]] = []
    for idx, cmd in enumerate(plan.commands):
        proc = subprocess.run(cmd, shell=True, cwd=plan.workdir, capture_output=True, text=True)
        results.append(
            {
                "step": idx,
                "command": cmd,
                "returncode": proc.returncode,
                "stdout": proc.stdout,
                "stderr": proc.stderr,
            }
        )
        if proc.returncode != 0:
            # Stop at the first failing step; the partial log is still written.
            break

    run_log = RunLog(
        run_id=run_id,
        started_at=started_at,
        inputs={"workdir": str(plan.workdir)},
        parameters={},
        tool_versions={},  # left for caller to fill if known
        database_versions={},
        config_hashes={},
        automation_levels={"pipeline": automation_level},
        overrides=[],
        outputs=outputs | {"command_results": results},
        notes=None,
    )

    if log_path:
        write_run_log(run_log, log_path)
    return run_log
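# Sketch: execute an annotation plan and inspect the audit log (paths
# hypothetical; build_vep_plan comes from pipelines.annotation):
#   plan = build_vep_plan(Path("trio.joint.vcf.gz"), Path("/tmp/ann"))
#   log = execute_plan(plan, automation_level="Auto", log_path=Path("/tmp/ann/runlog.json"))
#   failed = [r for r in log.outputs["command_results"] if r["returncode"] != 0]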
65
src/genomic_consultant/pipelines/variant_calling.py
Normal file
@@ -0,0 +1,65 @@
from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Sequence


@dataclass
class VariantCallingPlan:
    """Command plan for running a trio variant calling pipeline."""

    per_sample_gvcf: Dict[str, Path]
    joint_vcf: Path
    commands: List[str]
    workdir: Path


def build_gatk_trio_plan(
    samples: Dict[str, Path],
    reference_fasta: Path,
    workdir: Path,
    output_prefix: str = "trio",
    intervals: Sequence[Path] | None = None,
) -> VariantCallingPlan:
    """
    Build shell commands for a GATK-based trio pipeline (HaplotypeCaller gVCF + joint genotyping).
    Does not execute commands; returns a plan for orchestration layers to run.
    """
    workdir = Path(workdir)
    workdir.mkdir(parents=True, exist_ok=True)

    per_sample_gvcf: Dict[str, Path] = {}
    commands: List[str] = []
    interval_args = ""
    if intervals:
        interval_args = " ".join(f"-L {i}" for i in intervals)

    for sample_id, bam_path in samples.items():
        gvcf_path = workdir / f"{sample_id}.g.vcf.gz"
        per_sample_gvcf[sample_id] = gvcf_path
        cmd = (
            "gatk HaplotypeCaller "
            f"-R {reference_fasta} "
            f"-I {bam_path} "
            f"-O {gvcf_path} "
            "-ERC GVCF "
            f"{interval_args}"
        )
        commands.append(cmd)

    # GATK4's GenotypeGVCFs accepts a single gVCF (or GenomicsDB workspace),
    # so combine the per-sample gVCFs first.
    combined_gvcf = workdir / f"{output_prefix}.combined.g.vcf.gz"
    combine_cmd = (
        "gatk CombineGVCFs "
        f"-R {reference_fasta} "
        + " ".join(f"--variant {p}" for p in per_sample_gvcf.values())
        + f" -O {combined_gvcf}"
    )
    commands.append(combine_cmd)

    joint_vcf = workdir / f"{output_prefix}.joint.vcf.gz"
    joint_cmd = (
        "gatk GenotypeGVCFs "
        f"-R {reference_fasta} "
        f"-V {combined_gvcf} "
        f"-O {joint_vcf}"
    )
    commands.append(joint_cmd)

    return VariantCallingPlan(
        per_sample_gvcf=per_sample_gvcf,
        joint_vcf=joint_vcf,
        commands=commands,
        workdir=workdir,
    )
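# Sketch of a trio plan (BAM/reference paths hypothetical):
#   plan = build_gatk_trio_plan(
#       samples={"proband": Path("p.bam"), "father": Path("f.bam"), "mother": Path("m.bam")},
#       reference_fasta=Path("GRCh38.fa"), workdir=Path("/tmp/trio"))
# yields one HaplotypeCaller command per sample, then CombineGVCFs + GenotypeGVCFs.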
1
src/genomic_consultant/reporting/__init__.py
Normal file
@@ -0,0 +1 @@

75
src/genomic_consultant/reporting/report.py
Normal file
@@ -0,0 +1,75 @@
from __future__ import annotations

import json
from datetime import datetime, timezone
from typing import List

from genomic_consultant.orchestration.workflows import PanelAnalysisResult


def panel_report_markdown(result: PanelAnalysisResult) -> str:
    lines: List[str] = []
    lines.append(f"# Panel Report: {result.panel.name}")
    lines.append("")
    lines.append(f"- Individual: `{result.individual_id}`")
    lines.append(f"- Panel version: `{result.panel.version}` (source: {result.panel.source})")
    lines.append(f"- Generated: {datetime.now(timezone.utc).isoformat()}")
    lines.append("")
    lines.append("## Variants")
    if not result.variants:
        lines.append("No variants found for this panel with current filters.")
        return "\n".join(lines)

    header = "| Variant | Consequence | ClinVar | AF | ACMG suggestion | Evidence |"
    lines.append(header)
    lines.append("|---|---|---|---|---|---|")
    for pv in result.variants:
        v = pv.variant
        ev_summary = "; ".join(f"{e.tag}({e.strength})" for e in pv.acmg.evidence) or "None"
        lines.append(
            "|"
            + f"{v.id} ({v.gene or 'NA'})"
            + f"|{v.consequence or 'NA'}"
            + f"|{v.clinvar_significance or 'NA'}"
            + f"|{v.allele_frequency if v.allele_frequency is not None else 'NA'}"
            + f"|{pv.acmg.suggested_class}"
            + f"|{ev_summary}"
            + "|"
        )
    lines.append("")
    lines.append("> Note: ACMG suggestions here are auto-generated and require human review for clinical decisions.")
    return "\n".join(lines)


def panel_report_json(result: PanelAnalysisResult) -> str:
    payload = {
        "individual_id": result.individual_id,
        "panel": {
            "name": result.panel.name,
            "version": result.panel.version,
            "source": result.panel.source,
        },
        # Timezone-aware, matching the markdown report.
        "generated": datetime.now(timezone.utc).isoformat(),
        "variants": [
            {
                "id": pv.variant.id,
                "gene": pv.variant.gene,
                "consequence": pv.variant.consequence,
                "clinvar_significance": pv.variant.clinvar_significance,
                "allele_frequency": pv.variant.allele_frequency,
                "acmg": {
                    "suggested_class": pv.acmg.suggested_class,
                    "evidence": [
                        {"tag": e.tag, "strength": e.strength, "rationale": e.rationale}
                        for e in pv.acmg.evidence
                    ],
                    "human_classification": pv.acmg.human_classification,
                    "human_reviewer": pv.acmg.human_reviewer,
                    "human_notes": pv.acmg.human_notes,
                },
            }
            for pv in result.variants
        ],
        "disclaimer": "Auto-generated suggestions; human review required.",
    }
    return json.dumps(payload, indent=2)
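# Illustrative (abridged) markdown output for a single variant; the ACMG cell
# depends on the configured tagger:
#   # Panel Report: demo_panel
#   - Individual: `demo`
#   ## Variants
#   | Variant | Consequence | ClinVar | AF | ACMG suggestion | Evidence |
#   |---|---|---|---|---|---|
#   |1-100-A-T (GENE1)|missense_variant|Pathogenic|0.0001|...|PM2(Moderate)|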
1
src/genomic_consultant/store/__init__.py
Normal file
@@ -0,0 +1 @@

83
src/genomic_consultant/store/parquet_store.py
Normal file
@@ -0,0 +1,83 @@
from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import List, Sequence

from genomic_consultant.store.query import _matches_any
from genomic_consultant.utils.models import FilterConfig, Variant

try:
    import pandas as pd
except ImportError as exc:  # pragma: no cover - import guard
    raise ImportError("ParquetGenomicStore requires pandas. Install with `pip install pandas`.") from exc


@dataclass
class ParquetGenomicStore:
    """Parquet-backed store for larger datasets (optional dependency: pandas)."""

    df: "pd.DataFrame"

    @classmethod
    def from_parquet(cls, path: Path) -> "ParquetGenomicStore":
        df = pd.read_parquet(path)
        return cls(df=df)

    def get_variants_by_gene(
        self, individual_id: str, genes: Sequence[str], filters: FilterConfig | None = None
    ) -> List[Variant]:
        filters = filters or FilterConfig()
        subset = self.df[self.df["SYMBOL"].str.upper().isin([g.upper() for g in genes])]
        return self._apply_filters(subset, filters)

    def get_variants_by_region(
        self, individual_id: str, chrom: str, start: int, end: int, filters: FilterConfig | None = None
    ) -> List[Variant]:
        filters = filters or FilterConfig()
        subset = self.df[(self.df["CHROM"] == chrom) & (self.df["POS"] >= start) & (self.df["POS"] <= end)]
        return self._apply_filters(subset, filters)

    def _apply_filters(self, df: "pd.DataFrame", filters: FilterConfig) -> List[Variant]:
        # Prefer AF, fall back to gnomAD_AF when that column exists, then treat
        # missing values as 0 (guards against fillna(None) when the column is absent).
        af = df["AF"]
        if "gnomAD_AF" in df.columns:
            af = af.fillna(df["gnomAD_AF"])
        af = af.fillna(0)
        mask = pd.Series(True, index=df.index)
        if filters.max_af is not None:
            mask &= af <= filters.max_af
        if filters.min_af is not None:
            mask &= af >= filters.min_af
        if filters.clinvar_significance:
            mask &= df["CLIN_SIG"].str.lower().isin([s.lower() for s in filters.clinvar_significance])
        if filters.consequence_includes:
            mask &= df["Consequence"].str.lower().apply(
                lambda v: _matches_any(v, [c.lower() for c in filters.consequence_includes])
            )
        if filters.consequence_excludes:
            mask &= ~df["Consequence"].str.lower().apply(
                lambda v: _matches_any(v, [c.lower() for c in filters.consequence_excludes])
            )
        filtered = df[mask]
        variants: List[Variant] = []
        for _, row in filtered.iterrows():
            variants.append(
                Variant(
                    chrom=row["CHROM"],
                    pos=int(row["POS"]),
                    ref=row["REF"],
                    alt=row["ALT"],
                    gene=row.get("SYMBOL"),
                    consequence=row.get("Consequence"),
                    protein_change=row.get("Protein_position"),
                    clinvar_significance=row.get("CLIN_SIG"),
                    allele_frequency=_maybe_float(row.get("AF")) or _maybe_float(row.get("gnomAD_AF")),
                    annotations={"gnomad_af": _maybe_float(row.get("gnomAD_AF"))},
                )
            )
        return variants


def _maybe_float(value) -> float | None:
    try:
        if value is None:
            return None
        return float(value)
    except (TypeError, ValueError):
        return None
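# Sketch (requires the optional pandas dependency; path hypothetical):
#   store = ParquetGenomicStore.from_parquet(Path("variants.parquet"))
#   rare = store.get_variants_by_gene("proband", ["BRCA1"], FilterConfig(max_af=0.001))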
109
src/genomic_consultant/store/query.py
Normal file
@@ -0,0 +1,109 @@
from __future__ import annotations

import csv
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, List, Sequence

from genomic_consultant.utils.models import FilterConfig, Variant


@dataclass
class GenomicStore:
    """Lightweight wrapper around annotated variants."""

    variants: List[Variant]

    @classmethod
    def from_tsv(cls, path: Path) -> "GenomicStore":
        """
        Load variants from a flattened TSV generated by the annotation plan.
        Expected columns (flexible, missing columns are tolerated):
        CHROM POS REF ALT SYMBOL Consequence Protein_position PolyPhen SIFT CLIN_SIG AF gnomAD_AF SpliceAI CADD_PHRED
        """
        variants: List[Variant] = []
        with Path(path).open() as fh:
            reader = csv.DictReader(fh, delimiter="\t")
            for row in reader:
                if not row:
                    continue
                variants.append(_row_to_variant(row))
        return cls(variants=variants)

    def get_variants_by_gene(
        self, individual_id: str, genes: Sequence[str], filters: FilterConfig | None = None
    ) -> List[Variant]:
        filters = filters or FilterConfig()
        gene_set = {g.upper() for g in genes}
        return self._apply_filters((v for v in self.variants if (v.gene or "").upper() in gene_set), filters)

    def get_variants_by_region(
        self, individual_id: str, chrom: str, start: int, end: int, filters: FilterConfig | None = None
    ) -> List[Variant]:
        filters = filters or FilterConfig()
        return self._apply_filters(
            (v for v in self.variants if v.chrom == chrom and start <= v.pos <= end),
            filters,
        )

    def _apply_filters(self, variants: Iterable[Variant], filters: FilterConfig) -> List[Variant]:
        out: List[Variant] = []
        for v in variants:
            if filters.max_af is not None and v.allele_frequency is not None and v.allele_frequency > filters.max_af:
                continue
            if filters.min_af is not None and v.allele_frequency is not None and v.allele_frequency < filters.min_af:
                continue
            if filters.clinvar_significance and (v.clinvar_significance or "").lower() not in {
                sig.lower() for sig in filters.clinvar_significance
            }:
                continue
            if filters.consequence_includes and not _matches_any(v.consequence, filters.consequence_includes):
                continue
            if filters.consequence_excludes and _matches_any(v.consequence, filters.consequence_excludes):
                continue
            out.append(v)
        return out


def _matches_any(value: str | None, patterns: Sequence[str]) -> bool:
    if value is None:
        return False
    v = value.lower()
    return any(pat.lower() in v for pat in patterns)


def _parse_float(val: str | None) -> float | None:
    if val in (None, "", "."):
        return None
    try:
        return float(val)
    except ValueError:
        return None


def _row_to_variant(row: Dict[str, str]) -> Variant:
    chrom = row.get("CHROM") or row.get("#CHROM")
    pos = int(row["POS"])
    af = _parse_float(row.get("AF"))
    gnomad_af = _parse_float(row.get("gnomAD_AF"))
    splice_ai = _parse_float(row.get("SpliceAI"))
    cadd = _parse_float(row.get("CADD_PHRED"))
    return Variant(
        chrom=chrom,
        pos=pos,
        ref=row.get("REF"),
        alt=row.get("ALT"),
        gene=row.get("SYMBOL") or None,
        consequence=row.get("Consequence") or None,
        protein_change=row.get("Protein_position") or None,
        clinvar_significance=row.get("CLIN_SIG") or None,
        allele_frequency=af if af is not None else gnomad_af,
        annotations={
            "polyphen": row.get("PolyPhen"),
            "sift": row.get("SIFT"),
            "gnomad_af": gnomad_af,
            "splice_ai_delta_score": splice_ai,
            "cadd_phred": cadd,
        },
    )
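# Sketch of typical use (TSV path from the repo's sample data):
#   store = GenomicStore.from_tsv(Path("sample_data/example_annotated.tsv"))
#   hits = store.get_variants_by_gene("demo", ["GENE1"], FilterConfig(max_af=0.01))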
1
src/genomic_consultant/utils/__init__.py
Normal file
@@ -0,0 +1 @@

15
src/genomic_consultant/utils/hashing.py
Normal file
@@ -0,0 +1,15 @@
from __future__ import annotations

import hashlib
from pathlib import Path
from typing import Optional


def sha256sum(path: Path) -> Optional[str]:
    if not Path(path).exists():
        return None
    h = hashlib.sha256()
    with Path(path).open("rb") as fh:
        for chunk in iter(lambda: fh.read(8192), b""):
            h.update(chunk)
    return h.hexdigest()
81
src/genomic_consultant/utils/models.py
Normal file
@@ -0,0 +1,81 @@
from __future__ import annotations

from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Dict, List, Optional


@dataclass
class Variant:
    """Lightweight variant representation used by the query layer."""

    chrom: str
    pos: int
    ref: str
    alt: str
    gene: Optional[str] = None
    transcript: Optional[str] = None
    consequence: Optional[str] = None
    protein_change: Optional[str] = None
    clinvar_significance: Optional[str] = None
    allele_frequency: Optional[float] = None
    annotations: Dict[str, Any] = field(default_factory=dict)

    @property
    def id(self) -> str:
        return f"{self.chrom}-{self.pos}-{self.ref}-{self.alt}"


@dataclass
class FilterConfig:
    """Common filters for variant queries."""

    max_af: Optional[float] = None
    min_af: Optional[float] = None
    clinvar_significance: Optional[List[str]] = None
    consequence_includes: Optional[List[str]] = None
    consequence_excludes: Optional[List[str]] = None
    inheritances: Optional[List[str]] = None


@dataclass
class EvidenceTag:
    tag: str
    strength: str
    rationale: str


@dataclass
class SuggestedClassification:
    suggested_class: str
    evidence: List[EvidenceTag] = field(default_factory=list)
    human_classification: Optional[str] = None
    human_reviewer: Optional[str] = None
    human_notes: Optional[str] = None


@dataclass
class GenePanel:
    name: str
    genes: List[str]
    source: str
    version: str
    last_updated: str
    metadata: Dict[str, Any] = field(default_factory=dict)


@dataclass
class RunLog:
    """Machine-readable record of a pipeline or analysis run."""

    run_id: str
    started_at: datetime
    inputs: Dict[str, Any]
    parameters: Dict[str, Any]
    tool_versions: Dict[str, str]
    database_versions: Dict[str, str]
    config_hashes: Dict[str, str]
    automation_levels: Dict[str, str]
    overrides: List[Dict[str, Any]] = field(default_factory=list)
    outputs: Dict[str, Any] = field(default_factory=dict)
    notes: Optional[str] = None
22
src/genomic_consultant/utils/tooling.py
Normal file
@@ -0,0 +1,22 @@
from __future__ import annotations

import subprocess
from typing import Dict


def probe_tool_versions(commands: Dict[str, str]) -> Dict[str, str]:
    """
    Attempt to get tool versions by running the provided commands with --version.
    Returns best-effort outputs; missing tools are skipped.
    """
    results: Dict[str, str] = {}
    for name, cmd in commands.items():
        try:
            proc = subprocess.run(f"{cmd} --version", shell=True, capture_output=True, text=True, timeout=10)
            if proc.returncode == 0 and proc.stdout:
                results[name] = proc.stdout.strip().splitlines()[0]
            elif proc.returncode == 0 and proc.stderr:
                results[name] = proc.stderr.strip().splitlines()[0]
        except Exception:
            continue
    return results
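# Example: probe_tool_versions({"gatk": "gatk", "vep": "vep"}) returns the first
# line of each tool's `--version` output for tools found on PATH; anything
# missing or timing out is silently omitted.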
41
tests/test_acmg.py
Normal file
@@ -0,0 +1,41 @@
from genomic_consultant.acmg.tagger import ACMGConfig, tag_variant
from genomic_consultant.utils.models import Variant


def test_ba1_trumps():
    cfg = ACMGConfig(ba1_af=0.05, bs1_af=0.01, pm2_af=0.0005, lof_genes=set())
    v = Variant(chrom="1", pos=1, ref="A", alt="T", allele_frequency=0.2)
    result = tag_variant(v, cfg)
    assert result.suggested_class == "Benign"
    assert any(e.tag == "BA1" for e in result.evidence)


def test_pvs1_pm2_likely_pathogenic():
    cfg = ACMGConfig(lof_genes={"GENE1"}, pm2_af=0.0005, ba1_af=0.05, bs1_af=0.01)
    v = Variant(
        chrom="1",
        pos=1,
        ref="A",
        alt="T",
        gene="GENE1",
        consequence="stop_gained",
        allele_frequency=0.0001,
    )
    result = tag_variant(v, cfg)
    assert result.suggested_class == "Likely pathogenic"
    tags = {e.tag for e in result.evidence}
    assert {"PVS1", "PM2"} <= tags


def test_bp7_supporting():
    cfg = ACMGConfig(bp7_splice_ai_max=0.1)
    v = Variant(
        chrom="1",
        pos=1,
        ref="A",
        alt="T",
        consequence="synonymous_variant",
        annotations={"splice_ai_delta_score": 0.05},
    )
    result = tag_variant(v, cfg)
    assert any(e.tag == "BP7" for e in result.evidence)
16
tests/test_aggregate.py
Normal file
@@ -0,0 +1,16 @@
import json
from pathlib import Path

from genomic_consultant.panels.aggregate import merge_mappings


def test_merge_mappings(tmp_path: Path):
    a = tmp_path / "a.json"
    b = tmp_path / "b.json"
    a.write_text('{"source":"A","phenotype_to_genes":{"HP:1":["G1","G2"]}}')
    b.write_text('{"source":"B","phenotype_to_genes":{"HP:1":["G2","G3"],"HP:2":["G4"]}}')
    out = tmp_path / "out.json"
    merge_mappings([a, b], out)
    data = json.loads(out.read_text())
    assert sorted(data["phenotype_to_genes"]["HP:1"]) == ["G1", "G2", "G3"]
    assert data["phenotype_to_genes"]["HP:2"] == ["G4"]
27
tests/test_phase1_pipeline.py
Normal file
@@ -0,0 +1,27 @@
from pathlib import Path

from genomic_consultant.orchestration.phase1_pipeline import run_phase1_pipeline


def test_phase1_pipeline_with_existing_tsv():
    root = Path(__file__).resolve().parents[1]
    tsv = root / "sample_data/example_annotated.tsv"
    panel = root / "configs/panel.example.json"
    acmg = root / "configs/acmg_config.example.yaml"
    result = run_phase1_pipeline(
        samples=None,
        reference_fasta=None,
        workdir=root / "runtime",
        output_prefix="demo",
        acmg_config_path=acmg,
        max_af=0.05,
        panel_path=panel,
        phenotype_id=None,
        phenotype_mapping=None,
        report_format="markdown",
        existing_tsv=tsv,
        skip_call=True,
        skip_annotate=True,
    )
    assert "Panel Report" in result.panel_report
    assert result.tsv_path == tsv
18
tests/test_resolver.py
Normal file
@@ -0,0 +1,18 @@
import json
from pathlib import Path

from genomic_consultant.panels.resolver import PhenotypeGeneResolver


def test_resolver_build_panel(tmp_path: Path):
    data = {
        "version": "test",
        "source": "example",
        "phenotype_to_genes": {"HP:0001": ["GENE1", "GENE2"]},
    }
    path = tmp_path / "map.json"
    path.write_text(json.dumps(data))
    resolver = PhenotypeGeneResolver.from_json(path)
    panel = resolver.build_panel("HP:0001")
    assert panel is not None
    assert panel.genes == ["GENE1", "GENE2"]
    assert "phenotype_id" in panel.metadata
28
tests/test_store.py
Normal file
@@ -0,0 +1,28 @@
from genomic_consultant.store.query import GenomicStore
from genomic_consultant.utils.models import FilterConfig, Variant


def test_filter_by_gene_and_af():
    store = GenomicStore(
        variants=[
            Variant(chrom="1", pos=1, ref="A", alt="T", gene="GENE1", allele_frequency=0.02),
            Variant(chrom="1", pos=2, ref="G", alt="C", gene="GENE2", allele_frequency=0.0001),
        ]
    )
    res = store.get_variants_by_gene("ind", ["GENE2"], filters=FilterConfig(max_af=0.001))
    assert len(res) == 1
    assert res[0].gene == "GENE2"


def test_consequence_include_exclude():
    store = GenomicStore(
        variants=[
            Variant(chrom="1", pos=1, ref="A", alt="T", gene="GENE1", consequence="missense_variant"),
            Variant(chrom="1", pos=2, ref="G", alt="C", gene="GENE1", consequence="synonymous_variant"),
        ]
    )
    res = store.get_variants_by_gene(
        "ind", ["GENE1"], filters=FilterConfig(consequence_includes=["missense"], consequence_excludes=["synonymous"])
    )
    assert len(res) == 1
    assert res[0].consequence == "missense_variant"
33
tests/test_store_tsv.py
Normal file
@@ -0,0 +1,33 @@
from pathlib import Path

from genomic_consultant.store.query import GenomicStore


def test_from_tsv_with_extra_columns(tmp_path: Path):
    content = "\t".join(
        [
            "#CHROM",
            "POS",
            "REF",
            "ALT",
            "SYMBOL",
            "Consequence",
            "Protein_position",
            "PolyPhen",
            "SIFT",
            "CLIN_SIG",
            "AF",
            "gnomAD_AF",
            "SpliceAI",
            "CADD_PHRED",
        ]
    ) + "\n"
    content += "1\t100\tA\tT\tGENE1\tmissense_variant\tp.X\t.\t.\tPathogenic\t0.0001\t0.0002\t0.05\t20.1\n"
    path = tmp_path / "v.tsv"
    path.write_text(content)

    store = GenomicStore.from_tsv(path)
    assert len(store.variants) == 1
    v = store.variants[0]
    assert v.annotations["splice_ai_delta_score"] == 0.05
    assert v.annotations["cadd_phred"] == 20.1