From 6ab7fd1378f61250ea953d3088c52002320fa742 Mon Sep 17 00:00:00 2001 From: gbanyan Date: Wed, 11 Feb 2026 21:14:37 +0800 Subject: [PATCH] docs(05-output-cli): create phase plan --- .planning/ROADMAP.md | 19 +- .planning/phases/05-output-cli/05-01-PLAN.md | 189 ++++++++++++++++ .planning/phases/05-output-cli/05-02-PLAN.md | 219 +++++++++++++++++++ .planning/phases/05-output-cli/05-03-PLAN.md | 200 +++++++++++++++++ 4 files changed, 619 insertions(+), 8 deletions(-) create mode 100644 .planning/phases/05-output-cli/05-01-PLAN.md create mode 100644 .planning/phases/05-output-cli/05-02-PLAN.md create mode 100644 .planning/phases/05-output-cli/05-03-PLAN.md diff --git a/.planning/ROADMAP.md b/.planning/ROADMAP.md index f61af31..fcdac6e 100644 --- a/.planning/ROADMAP.md +++ b/.planning/ROADMAP.md @@ -103,9 +103,12 @@ Plans: 4. Pipeline generates visualizations: score distribution, evidence layer contribution, tier breakdown 5. Unified CLI provides subcommands for running layers, integration, and reporting with progress logging 6. Reproducibility report documents all parameters, data versions, gene counts at filtering steps, and validation metrics -**Plans**: TBD +**Plans**: 3 plans -Plans: (to be created during plan-phase) +Plans: +- [ ] 05-01-PLAN.md -- Tiered candidate output with evidence summary and dual-format writer (TSV+Parquet) +- [ ] 05-02-PLAN.md -- Visualizations (score distribution, layer contributions, tier breakdown) and reproducibility report +- [ ] 05-03-PLAN.md -- CLI report command wiring all output modules with integration tests ### Phase 6: Validation **Goal**: Benchmark scoring system against positive and negative controls @@ -123,13 +126,13 @@ Plans: (to be created during plan-phase) ## Progress **Execution Order:** -Phases execute in numeric order: 1 → 2 → 3 → 4 → 5 → 6 +Phases execute in numeric order: 1 -> 2 -> 3 -> 4 -> 5 -> 6 | Phase | Plans Complete | Status | Completed | |-------|----------------|--------|-----------| -| 1. Data Infrastructure | 4/4 | ✓ Complete | 2026-02-11 | -| 2. Prototype Evidence Layer | 2/2 | ✓ Complete | 2026-02-11 | -| 3. Core Evidence Layers | 6/6 | ✓ Complete | 2026-02-11 | -| 4. Scoring & Integration | 3/3 | ✓ Complete | 2026-02-11 | -| 5. Output & CLI | 0/TBD | Not started | - | +| 1. Data Infrastructure | 4/4 | Complete | 2026-02-11 | +| 2. Prototype Evidence Layer | 2/2 | Complete | 2026-02-11 | +| 3. Core Evidence Layers | 6/6 | Complete | 2026-02-11 | +| 4. Scoring & Integration | 3/3 | Complete | 2026-02-11 | +| 5. Output & CLI | 0/3 | In Progress | - | | 6. 
Validation | 0/TBD | Not started | - | diff --git a/.planning/phases/05-output-cli/05-01-PLAN.md b/.planning/phases/05-output-cli/05-01-PLAN.md new file mode 100644 index 0000000..4f26a8c --- /dev/null +++ b/.planning/phases/05-output-cli/05-01-PLAN.md @@ -0,0 +1,189 @@ +--- +phase: 05-output-cli +plan: 01 +type: execute +wave: 1 +depends_on: [] +files_modified: + - src/usher_pipeline/output/__init__.py + - src/usher_pipeline/output/tiers.py + - src/usher_pipeline/output/evidence_summary.py + - src/usher_pipeline/output/writers.py + - tests/test_output.py +autonomous: true + +must_haves: + truths: + - "scored_genes DataFrame is classified into HIGH/MEDIUM/LOW/EXCLUDED tiers based on composite_score and evidence_count" + - "Each candidate gene has a supporting_layers field listing which evidence layers contributed and an evidence_gaps field listing which are NULL" + - "Output is written in both TSV and Parquet formats with identical data" + - "Provenance YAML sidecar is generated alongside output files" + artifacts: + - path: "src/usher_pipeline/output/tiers.py" + provides: "Confidence tiering logic" + exports: ["assign_tiers", "TIER_THRESHOLDS"] + - path: "src/usher_pipeline/output/evidence_summary.py" + provides: "Per-gene evidence summary columns" + exports: ["add_evidence_summary"] + - path: "src/usher_pipeline/output/writers.py" + provides: "Dual-format TSV+Parquet writer with provenance sidecar" + exports: ["write_candidate_output"] + - path: "tests/test_output.py" + provides: "Unit tests for tiering, evidence summary, and writers" + key_links: + - from: "src/usher_pipeline/output/tiers.py" + to: "scored_genes DuckDB table" + via: "polars DataFrame with composite_score and evidence_count columns" + pattern: "pl\\.when.*composite_score.*evidence_count" + - from: "src/usher_pipeline/output/writers.py" + to: "output files" + via: "polars write_csv (separator=tab) and write_parquet" + pattern: "write_csv.*separator.*write_parquet" +--- + + +Create the output generation module: tiered candidate classification, per-gene evidence summary, and dual-format (TSV+Parquet) file writer with provenance sidecars. + +Purpose: This is the core data transformation that converts raw scored_genes into the pipeline's primary deliverable -- a tiered, annotated candidate list. All downstream reporting and visualization depend on this module. +Output: `src/usher_pipeline/output/` package with tiers.py, evidence_summary.py, writers.py and unit tests. + + + +@/Users/gbanyan/.claude/get-shit-done/workflows/execute-plan.md +@/Users/gbanyan/.claude/get-shit-done/templates/summary.md + + + +@.planning/PROJECT.md +@.planning/ROADMAP.md +@.planning/STATE.md +@src/usher_pipeline/scoring/integration.py +@src/usher_pipeline/scoring/quality_control.py +@src/usher_pipeline/persistence/provenance.py +@src/usher_pipeline/persistence/duckdb_store.py +@src/usher_pipeline/config/schema.py + + + + + + Task 1: Tiering logic and evidence summary module + + src/usher_pipeline/output/__init__.py + src/usher_pipeline/output/tiers.py + src/usher_pipeline/output/evidence_summary.py + + +Create `src/usher_pipeline/output/` package directory. + +**tiers.py**: Create tiering module with configurable thresholds. 
+ +Define `TIER_THRESHOLDS` as a dict with defaults from research: +- HIGH: composite_score >= 0.7 AND evidence_count >= 3 +- MEDIUM: composite_score >= 0.4 AND evidence_count >= 2 +- LOW: composite_score >= 0.2 (any evidence_count) +- Everything else: EXCLUDED (filtered out) + +Implement `assign_tiers(scored_df: pl.DataFrame, thresholds: dict | None = None) -> pl.DataFrame`: +- Accepts polars DataFrame with columns: gene_id, gene_symbol, composite_score, evidence_count, quality_flag, all 6 layer score columns, all 6 contribution columns +- Uses pl.when/then/otherwise chain (vectorized, not row-by-row) to add `confidence_tier` column +- Filters OUT rows where confidence_tier == "EXCLUDED" +- Sorts by composite_score DESC (deterministic: break ties by gene_id ASC) +- Returns DataFrame with confidence_tier column added + +Allow thresholds parameter to override defaults (for CLI configurability later). + +**evidence_summary.py**: Create evidence summary module. + +Define the 6 evidence layer names as a constant list: EVIDENCE_LAYERS = ["gnomad", "expression", "annotation", "localization", "animal_model", "literature"] + +Implement `add_evidence_summary(df: pl.DataFrame) -> pl.DataFrame`: +- For each layer in EVIDENCE_LAYERS, checks if `{layer}_score` column is not null +- Adds `supporting_layers` column: comma-separated list of layer names where score is NOT NULL (e.g., "gnomad,expression,annotation") +- Adds `evidence_gaps` column: comma-separated list of layer names where score IS NULL (e.g., "localization,animal_model,literature") +- Uses polars expressions (pl.concat_str or equivalent) -- do NOT convert to pandas +- Handles edge case: gene with all NULLs -> supporting_layers="" and evidence_gaps="gnomad,expression,annotation,localization,animal_model,literature" + +**__init__.py**: Export assign_tiers, TIER_THRESHOLDS, add_evidence_summary, and write_candidate_output (from writers.py created in Task 2). + + +Run: `python -c "from usher_pipeline.output.tiers import assign_tiers, TIER_THRESHOLDS; print('tiers OK')"` and `python -c "from usher_pipeline.output.evidence_summary import add_evidence_summary, EVIDENCE_LAYERS; print('summary OK')"` + + +assign_tiers() classifies genes into HIGH/MEDIUM/LOW tiers using configurable thresholds; add_evidence_summary() adds supporting_layers and evidence_gaps columns; both functions operate on polars DataFrames without materialization issues. + + + + + Task 2: Dual-format writer with provenance sidecar and unit tests + + src/usher_pipeline/output/writers.py + src/usher_pipeline/output/__init__.py + tests/test_output.py + + +**writers.py**: Create dual-format output writer. 
+
+Implement `write_candidate_output(df: pl.DataFrame, output_dir: Path, filename_base: str = "candidates") -> dict`:
+- Collects LazyFrame if needed (handle both DataFrame and LazyFrame input)
+- Writes TSV: `{output_dir}/{filename_base}.tsv` using `df.write_csv(path, separator="\t", include_header=True)`
+- Writes Parquet: `{output_dir}/{filename_base}.parquet` using `df.write_parquet(path, compression="snappy", use_pyarrow=True)`
+- Writes provenance YAML sidecar: `{output_dir}/{filename_base}.provenance.yaml` containing:
+  - generated_at (ISO timestamp)
+  - output_files: [tsv filename, parquet filename]
+  - statistics: total_candidates, high_count, medium_count, low_count (counted from confidence_tier column)
+  - column_count and column_names for downstream tool compatibility verification
+- Uses pyyaml (already in dependencies) for YAML output
+- Creates output_dir if it doesn't exist
+- Returns dict with paths: {"tsv": Path, "parquet": Path, "provenance": Path}
+
+Ensure deterministic output: sort by composite_score DESC, gene_id ASC before writing (avoids the non-deterministic ordering pitfall from research).
+
+**Update __init__.py**: Add write_candidate_output to exports.
+
+**tests/test_output.py**: Create comprehensive unit tests.
+
+Use the tmp_path pytest fixture. Create a synthetic scored_genes DataFrame with ~20 rows spanning all tiers:
+- 3 genes with score >= 0.7, evidence_count >= 3 (HIGH)
+- 5 genes with score 0.4-0.69, evidence_count >= 2 (MEDIUM)
+- 5 genes with score 0.2-0.39 (LOW)
+- 3 genes with score < 0.2 (EXCLUDED -- should be filtered out)
+- 4 genes with NULL composite_score (no evidence)
+
+Tests:
+1. test_assign_tiers_default_thresholds: Verify correct tier assignment counts, EXCLUDED genes removed
+2. test_assign_tiers_custom_thresholds: Override thresholds, verify different classification
+3. test_assign_tiers_sorting: Verify output sorted by composite_score DESC
+4. test_add_evidence_summary_supporting_layers: Gene with 3 non-NULL scores has 3 layers listed
+5. test_add_evidence_summary_gaps: Gene with all NULL scores has all 6 layers as gaps
+6. test_write_candidate_output_creates_files: TSV, Parquet, and provenance.yaml all created
+7. test_write_candidate_output_tsv_readable: Read back TSV with polars, verify columns and row count match
+8. test_write_candidate_output_parquet_readable: Read back Parquet, verify schema matches
+9. test_write_candidate_output_provenance_yaml: Parse YAML, verify statistics match
+
+
+Run: `cd /Users/gbanyan/Project/usher-exploring && python -m pytest tests/test_output.py -v`
+
+
+All 9+ tests pass. TSV and Parquet outputs contain identical data. Provenance YAML contains accurate statistics. Tiering correctly classifies and filters genes. Evidence summary correctly identifies supporting layers and gaps.
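+
+For orientation, the modules specified above might reduce to roughly this shape (a minimal sketch, assuming polars and pyyaml; the task specs above are authoritative):
+
+```
+# Sketch only -- final implementation lives in tiers.py / evidence_summary.py / writers.py.
+from __future__ import annotations
+
+from datetime import datetime, timezone
+from pathlib import Path
+
+import polars as pl
+import yaml
+
+TIER_THRESHOLDS = {
+    "high": {"score": 0.7, "evidence_count": 3},
+    "medium": {"score": 0.4, "evidence_count": 2},
+    "low": {"score": 0.2},
+}
+
+EVIDENCE_LAYERS = ["gnomad", "expression", "annotation", "localization", "animal_model", "literature"]
+
+
+def assign_tiers(df: pl.DataFrame, thresholds: dict | None = None) -> pl.DataFrame:
+    t = thresholds or TIER_THRESHOLDS
+    tier = (
+        pl.when((pl.col("composite_score") >= t["high"]["score"])
+                & (pl.col("evidence_count") >= t["high"]["evidence_count"])).then(pl.lit("HIGH"))
+        .when((pl.col("composite_score") >= t["medium"]["score"])
+              & (pl.col("evidence_count") >= t["medium"]["evidence_count"])).then(pl.lit("MEDIUM"))
+        .when(pl.col("composite_score") >= t["low"]["score"]).then(pl.lit("LOW"))
+        .otherwise(pl.lit("EXCLUDED"))  # NULL composite_score also drops out at the filter below
+    )
+    return (
+        df.with_columns(confidence_tier=tier)
+        .filter(pl.col("confidence_tier") != "EXCLUDED")
+        .sort(["composite_score", "gene_id"], descending=[True, False])
+    )
+
+
+def add_evidence_summary(df: pl.DataFrame) -> pl.DataFrame:
+    # Build per-layer "name or null" expressions, then join the non-null names.
+    present = [pl.when(pl.col(f"{l}_score").is_not_null()).then(pl.lit(l)) for l in EVIDENCE_LAYERS]
+    absent = [pl.when(pl.col(f"{l}_score").is_null()).then(pl.lit(l)) for l in EVIDENCE_LAYERS]
+    return df.with_columns(
+        supporting_layers=pl.concat_list(present).list.drop_nulls().list.join(","),
+        evidence_gaps=pl.concat_list(absent).list.drop_nulls().list.join(","),
+    )
+
+
+def write_candidate_output(df: pl.DataFrame, output_dir: Path, filename_base: str = "candidates") -> dict:
+    # (A real implementation would also accept a LazyFrame and collect it first.)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    df = df.sort(["composite_score", "gene_id"], descending=[True, False])  # deterministic order
+    tsv = output_dir / f"{filename_base}.tsv"
+    parquet = output_dir / f"{filename_base}.parquet"
+    df.write_csv(tsv, separator="\t", include_header=True)
+    df.write_parquet(parquet, compression="snappy", use_pyarrow=True)
+    tiers = df.get_column("confidence_tier")
+    provenance = output_dir / f"{filename_base}.provenance.yaml"
+    provenance.write_text(yaml.safe_dump({
+        "generated_at": datetime.now(timezone.utc).isoformat(),
+        "output_files": [tsv.name, parquet.name],
+        "statistics": {
+            "total_candidates": df.height,
+            "high_count": int((tiers == "HIGH").sum()),
+            "medium_count": int((tiers == "MEDIUM").sum()),
+            "low_count": int((tiers == "LOW").sum()),
+        },
+        "column_count": df.width,
+        "column_names": df.columns,
+    }, sort_keys=False))
+    return {"tsv": tsv, "parquet": parquet, "provenance": provenance}
+```
+
+The when/then chain keeps tiering fully vectorized, and genes with NULL composite_score fall through to EXCLUDED and are dropped by the filter.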
+ + + + + + +- `python -c "from usher_pipeline.output import assign_tiers, add_evidence_summary, write_candidate_output; print('All exports OK')"` succeeds +- `python -m pytest tests/test_output.py -v` -- all tests pass +- Synthetic data round-trips through tier assignment, evidence summary, and dual-format writing without errors + + + +- Tiering logic classifies scored genes into HIGH/MEDIUM/LOW confidence tiers using composite_score and evidence_count thresholds +- Evidence summary adds supporting_layers and evidence_gaps columns per gene +- Writer produces identical data in TSV and Parquet formats with provenance YAML sidecar +- All unit tests pass + + + +After completion, create `.planning/phases/05-output-cli/05-01-SUMMARY.md` + diff --git a/.planning/phases/05-output-cli/05-02-PLAN.md b/.planning/phases/05-output-cli/05-02-PLAN.md new file mode 100644 index 0000000..ce2a70e --- /dev/null +++ b/.planning/phases/05-output-cli/05-02-PLAN.md @@ -0,0 +1,219 @@ +--- +phase: 05-output-cli +plan: 02 +type: execute +wave: 1 +depends_on: [] +files_modified: + - src/usher_pipeline/output/visualizations.py + - src/usher_pipeline/output/reproducibility.py + - pyproject.toml + - tests/test_visualizations.py + - tests/test_reproducibility.py +autonomous: true + +must_haves: + truths: + - "Pipeline generates score distribution histogram with tier color coding as PNG" + - "Pipeline generates evidence layer contribution bar chart as PNG" + - "Pipeline generates tier breakdown pie chart as PNG" + - "Reproducibility report documents scoring parameters, data versions, gene counts per filtering step, and validation metrics" + - "Reproducibility report is generated in both JSON (machine-readable) and Markdown (human-readable) formats" + artifacts: + - path: "src/usher_pipeline/output/visualizations.py" + provides: "matplotlib/seaborn visualization functions" + exports: ["plot_score_distribution", "plot_layer_contributions", "plot_tier_breakdown", "generate_all_plots"] + - path: "src/usher_pipeline/output/reproducibility.py" + provides: "Reproducibility report generation" + exports: ["generate_reproducibility_report", "ReproducibilityReport"] + - path: "tests/test_visualizations.py" + provides: "Tests for visualization file creation" + - path: "tests/test_reproducibility.py" + provides: "Tests for report content and formatting" + key_links: + - from: "src/usher_pipeline/output/visualizations.py" + to: "matplotlib/seaborn" + via: "to_pandas() conversion for seaborn compatibility" + pattern: "to_pandas.*sns\\." + - from: "src/usher_pipeline/output/reproducibility.py" + to: "provenance tracker and config" + via: "reads ProvenanceTracker metadata and PipelineConfig" + pattern: "provenance.*create_metadata|config.*model_dump" +--- + + +Create visualization and reproducibility report modules: score distribution plots, evidence layer contribution charts, tier breakdowns, and comprehensive reproducibility documentation in JSON+Markdown formats. + +Purpose: Provides the visual and textual reporting layer that makes pipeline results interpretable for researchers and satisfies reproducibility requirements for scientific pipelines. +Output: `src/usher_pipeline/output/visualizations.py`, `src/usher_pipeline/output/reproducibility.py`, and associated tests. 
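+
+For orientation before the task specs below, the two modules might look roughly like this (a minimal sketch, assuming matplotlib/seaborn and stdlib dataclasses; plot_layer_contributions, plot_tier_breakdown, generate_all_plots, and generate_reproducibility_report follow the same patterns and are omitted here):
+
+```
+# Sketch only -- Tasks 1-2 below specify the full behavior.
+from __future__ import annotations
+
+import json
+from dataclasses import asdict, dataclass, field
+from pathlib import Path
+
+import matplotlib
+matplotlib.use("Agg")  # headless-safe backend, set before importing pyplot
+import matplotlib.pyplot as plt
+import polars as pl
+import seaborn as sns
+
+TIER_PALETTE = {"HIGH": "#2ecc71", "MEDIUM": "#f39c12", "LOW": "#e74c3c"}
+
+
+def plot_score_distribution(df: pl.DataFrame, output_path: Path) -> Path:
+    sns.set_theme(style="whitegrid", context="paper")
+    fig, ax = plt.subplots()
+    sns.histplot(data=df.to_pandas(), x="composite_score", hue="confidence_tier",
+                 hue_order=["HIGH", "MEDIUM", "LOW"], palette=TIER_PALETTE,
+                 bins=30, multiple="stack", ax=ax)
+    ax.set(xlabel="Composite Score", ylabel="Candidate Count",
+           title="Score Distribution by Confidence Tier")
+    fig.savefig(output_path, dpi=300, bbox_inches="tight")
+    plt.close(fig)  # always close -- avoids the memory-leak pitfall
+    return output_path
+
+
+@dataclass
+class ReproducibilityReport:
+    run_id: str
+    timestamp: str
+    pipeline_version: str
+    parameters: dict
+    data_versions: dict
+    software_environment: dict
+    filtering_steps: list = field(default_factory=list)
+    validation_metrics: dict = field(default_factory=dict)
+    tier_statistics: dict = field(default_factory=dict)
+
+    def to_dict(self) -> dict:
+        return asdict(self)
+
+    def to_json(self, path: Path) -> Path:
+        path.write_text(json.dumps(self.to_dict(), indent=2))
+        return path
+
+    def to_markdown(self, path: Path) -> Path:
+        lines = ["# Pipeline Reproducibility Report", "", f"- run_id: {self.run_id}",
+                 f"- generated: {self.timestamp}", "", "## Tier Statistics"]
+        lines += [f"- {k}: {v}" for k, v in self.tier_statistics.items()]
+        path.write_text("\n".join(lines) + "\n")
+        return path
+```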
+ + + +@/Users/gbanyan/.claude/get-shit-done/workflows/execute-plan.md +@/Users/gbanyan/.claude/get-shit-done/templates/summary.md + + + +@.planning/PROJECT.md +@.planning/ROADMAP.md +@.planning/STATE.md +@.planning/phases/05-output-cli/05-RESEARCH.md +@src/usher_pipeline/config/schema.py +@src/usher_pipeline/persistence/provenance.py +@src/usher_pipeline/scoring/quality_control.py +@src/usher_pipeline/scoring/validation.py + + + + + + Task 1: Visualization module with matplotlib/seaborn plots + + src/usher_pipeline/output/visualizations.py + pyproject.toml + tests/test_visualizations.py + + +**pyproject.toml**: Add matplotlib and seaborn to dependencies list: +- "matplotlib>=3.8.0" +- "seaborn>=0.13.0" + +**visualizations.py**: Create visualization module with 3 plot functions and 1 orchestrator. + +Use matplotlib backend "Agg" (non-interactive, safe for headless/CLI use): call `matplotlib.use("Agg")` before importing pyplot. + +1. `plot_score_distribution(df: pl.DataFrame, output_path: Path) -> Path`: + - Converts to pandas via df.to_pandas() (small result set, acceptable overhead per research) + - Sets seaborn theme: `sns.set_theme(style="whitegrid", context="paper")` + - Creates histogram of composite_score colored by confidence_tier + - Uses `sns.histplot(data=pdf, x="composite_score", hue="confidence_tier", hue_order=["HIGH", "MEDIUM", "LOW"], palette={"HIGH": "#2ecc71", "MEDIUM": "#f39c12", "LOW": "#e74c3c"}, bins=30, multiple="stack")` + - Labels: x="Composite Score", y="Candidate Count", title="Score Distribution by Confidence Tier" + - Saves as PNG at 300 DPI with bbox_inches='tight' + - CRITICAL: Always call plt.close(fig) after savefig (memory leak pitfall from research) + - Returns output_path + +2. `plot_layer_contributions(df: pl.DataFrame, output_path: Path) -> Path`: + - Counts non-null values per layer score column: gnomad_score, expression_score, annotation_score, localization_score, animal_model_score, literature_score + - Creates bar chart using seaborn barplot with viridis palette + - X-axis labels cleaned (remove "_score" suffix), rotated 45 degrees + - Labels: x="Evidence Layer", y="Candidates with Evidence", title="Evidence Layer Coverage" + - Saves PNG at 300 DPI, closes figure + - Returns output_path + +3. `plot_tier_breakdown(df: pl.DataFrame, output_path: Path) -> Path`: + - Counts genes per confidence_tier + - Creates pie chart with percentage labels (autopct='%1.1f%%') + - Colors match score_distribution palette (green/orange/red for HIGH/MEDIUM/LOW) + - Title: "Candidate Tier Breakdown" + - Saves PNG at 300 DPI, closes figure + - Returns output_path + +4. `generate_all_plots(df: pl.DataFrame, output_dir: Path) -> dict[str, Path]`: + - Creates output_dir if not exists + - Calls all 3 plot functions with standard filenames: score_distribution.png, layer_contributions.png, tier_breakdown.png + - Returns dict mapping plot name to file path + - Wraps each plot in try/except so one failure doesn't block others (log warning on failure) + +**tests/test_visualizations.py**: Test file creation. + +Create synthetic DataFrame fixture with ~30 rows including confidence_tier and all 6 layer score columns (some NULL). + +Tests: +1. test_plot_score_distribution_creates_file: Verify PNG file created and size > 0 +2. test_plot_layer_contributions_creates_file: Verify PNG file created +3. test_plot_tier_breakdown_creates_file: Verify PNG file created +4. test_generate_all_plots_creates_all_files: Verify all 3 PNG files exist in output_dir +5. 
test_generate_all_plots_returns_paths: Verify returned dict has 3 entries +6. test_plots_handle_empty_dataframe: Empty DataFrame produces plots without crashing (edge case) + + +Run: `cd /Users/gbanyan/Project/usher-exploring && python -m pytest tests/test_visualizations.py -v` + + +All 6 visualization tests pass. PNG files are created at 300 DPI. Plots handle edge cases (empty data, all-NULL columns) without crashing. matplotlib figures are properly closed after saving. + + + + + Task 2: Reproducibility report module with JSON and Markdown output + + src/usher_pipeline/output/reproducibility.py + src/usher_pipeline/output/__init__.py + tests/test_reproducibility.py + + +**reproducibility.py**: Create reproducibility report generation module. + +Define `FilteringStep` dataclass: +- step_name: str +- input_count: int +- output_count: int +- criteria: str + +Define `ReproducibilityReport` dataclass: +- run_id: str (UUID4) +- timestamp: str (ISO format) +- pipeline_version: str +- parameters: dict (scoring weights, thresholds, etc.) +- data_versions: dict (ensembl_release, gnomad_version, gtex_version, hpa_version) +- software_environment: dict (python version, polars version, duckdb version, etc.) +- filtering_steps: list[FilteringStep] +- validation_metrics: dict (from validation.py output if available) +- tier_statistics: dict (total, high, medium, low counts) + +Methods on ReproducibilityReport: +- `to_json(path: Path) -> Path`: Write as indented JSON file +- `to_markdown(path: Path) -> Path`: Write as human-readable Markdown with tables for filtering steps, parameters section, software versions, tier statistics, validation metrics +- `to_dict() -> dict`: Return as plain dict for programmatic access + +Implement `generate_reproducibility_report(config: PipelineConfig, tiered_df: pl.DataFrame, provenance: ProvenanceTracker, validation_result: dict | None = None) -> ReproducibilityReport`: +- Extracts parameters from config (scoring weights via config.scoring.model_dump(), data_versions via config.versions.model_dump()) +- Computes tier_statistics from tiered_df confidence_tier column +- Builds filtering_steps from provenance.get_steps() -- each recorded step with gene counts +- Captures software versions: sys.version, polars.__version__, duckdb.__version__ +- Generates UUID4 run_id +- If validation_result provided, includes median_percentile, top_quartile_fraction, validation_passed +- Returns ReproducibilityReport instance + +**Update __init__.py**: Add generate_reproducibility_report, ReproducibilityReport, generate_all_plots, and individual plot functions to exports. Also add visualizations imports. + +**tests/test_reproducibility.py**: Test report content. + +Create mock config, mock provenance tracker, and synthetic tiered DataFrame. + +Tests: +1. test_generate_report_has_all_fields: Report contains run_id, timestamp, pipeline_version, parameters, data_versions, software_environment, tier_statistics +2. test_report_to_json_parseable: Write JSON, read back with json.load, verify it's valid JSON with expected keys +3. test_report_to_markdown_has_headers: Markdown output contains "# Pipeline Reproducibility Report", "## Parameters", "## Data Versions", "## Filtering Steps", "## Tier Statistics" +4. test_report_tier_statistics_match: tier_statistics.total == tiered_df.height, high + medium + low == total +5. test_report_includes_validation_when_provided: When validation_result dict is passed, report contains validation_metrics section +6. 
test_report_without_validation: When validation_result is None, report still generates without error +7. test_report_software_versions: software_environment contains python, polars, duckdb keys + + +Run: `cd /Users/gbanyan/Project/usher-exploring && python -m pytest tests/test_reproducibility.py -v` + + +All 7 reproducibility tests pass. Report generates in both JSON and Markdown formats. JSON is valid and parseable. Markdown contains all required sections with proper formatting. Tier statistics are accurate. Validation metrics are optional and handled gracefully. + + + + + + +- `python -c "from usher_pipeline.output.visualizations import generate_all_plots; print('viz OK')"` succeeds +- `python -c "from usher_pipeline.output.reproducibility import generate_reproducibility_report, ReproducibilityReport; print('report OK')"` succeeds +- `python -m pytest tests/test_visualizations.py tests/test_reproducibility.py -v` -- all tests pass +- matplotlib Agg backend used (no display required) + + + +- Visualization module produces 3 PNG plots (score distribution, layer contributions, tier breakdown) at 300 DPI +- Reproducibility report module generates both JSON and Markdown formats with parameters, data versions, filtering steps, tier statistics, and optional validation metrics +- All tests pass +- No matplotlib display window opened (Agg backend) + + + +After completion, create `.planning/phases/05-output-cli/05-02-SUMMARY.md` + diff --git a/.planning/phases/05-output-cli/05-03-PLAN.md b/.planning/phases/05-output-cli/05-03-PLAN.md new file mode 100644 index 0000000..e71738d --- /dev/null +++ b/.planning/phases/05-output-cli/05-03-PLAN.md @@ -0,0 +1,200 @@ +--- +phase: 05-output-cli +plan: 03 +type: execute +wave: 2 +depends_on: ["05-01", "05-02"] +files_modified: + - src/usher_pipeline/cli/report_cmd.py + - src/usher_pipeline/cli/main.py + - tests/test_report_cmd.py +autonomous: true + +must_haves: + truths: + - "usher-pipeline report command generates tiered candidate list, visualizations, and reproducibility report in one invocation" + - "Report command reads scored_genes from DuckDB, applies tiering, adds evidence summary, writes dual-format output, generates plots, and creates reproducibility report" + - "Report command supports --output-dir, --force, --skip-viz, and --skip-report flags" + - "Unified CLI provides subcommands for setup, evidence, score, and report with consistent --config and --verbose flags" + artifacts: + - path: "src/usher_pipeline/cli/report_cmd.py" + provides: "CLI report command wiring tiering + viz + reproducibility" + exports: ["report"] + - path: "src/usher_pipeline/cli/main.py" + provides: "Updated CLI entry point with report command registered" + contains: "report" + - path: "tests/test_report_cmd.py" + provides: "CliRunner integration tests for report command" + key_links: + - from: "src/usher_pipeline/cli/report_cmd.py" + to: "src/usher_pipeline/output/tiers.py" + via: "import assign_tiers" + pattern: "from usher_pipeline\\.output.*import.*assign_tiers" + - from: "src/usher_pipeline/cli/report_cmd.py" + to: "src/usher_pipeline/output/evidence_summary.py" + via: "import add_evidence_summary" + pattern: "from usher_pipeline\\.output.*import.*add_evidence_summary" + - from: "src/usher_pipeline/cli/report_cmd.py" + to: "src/usher_pipeline/output/writers.py" + via: "import write_candidate_output" + pattern: "from usher_pipeline\\.output.*import.*write_candidate_output" + - from: "src/usher_pipeline/cli/report_cmd.py" + to: 
"src/usher_pipeline/output/visualizations.py" + via: "import generate_all_plots" + pattern: "from usher_pipeline\\.output.*import.*generate_all_plots" + - from: "src/usher_pipeline/cli/report_cmd.py" + to: "src/usher_pipeline/output/reproducibility.py" + via: "import generate_reproducibility_report" + pattern: "from usher_pipeline\\.output.*import.*generate_reproducibility_report" + - from: "src/usher_pipeline/cli/main.py" + to: "src/usher_pipeline/cli/report_cmd.py" + via: "cli.add_command(report)" + pattern: "add_command.*report" +--- + + +Create the CLI `report` command that orchestrates the full output pipeline: reads scored_genes from DuckDB, applies tiering and evidence summary, writes TSV+Parquet output, generates visualizations, and creates the reproducibility report. + +Purpose: This is the user-facing entry point that ties together all output modules into a single invocation. After running `usher-pipeline score`, the user runs `usher-pipeline report` to get all deliverables. +Output: `src/usher_pipeline/cli/report_cmd.py` registered in main.py, with CliRunner integration tests. + + + +@/Users/gbanyan/.claude/get-shit-done/workflows/execute-plan.md +@/Users/gbanyan/.claude/get-shit-done/templates/summary.md + + + +@.planning/PROJECT.md +@.planning/ROADMAP.md +@.planning/STATE.md +@.planning/phases/05-output-cli/05-01-SUMMARY.md +@.planning/phases/05-output-cli/05-02-SUMMARY.md +@src/usher_pipeline/cli/main.py +@src/usher_pipeline/cli/score_cmd.py +@src/usher_pipeline/cli/evidence_cmd.py +@src/usher_pipeline/persistence/duckdb_store.py +@src/usher_pipeline/config/schema.py + + + + + + Task 1: Report CLI command + + src/usher_pipeline/cli/report_cmd.py + src/usher_pipeline/cli/main.py + + +**report_cmd.py**: Create CLI report command following the established pattern from score_cmd.py and evidence_cmd.py. + +``` +@click.command('report') +@click.option('--output-dir', type=click.Path(path_type=Path), default=None, help='Output directory (default: {data_dir}/report)') +@click.option('--force', is_flag=True, help='Overwrite existing report files') +@click.option('--skip-viz', is_flag=True, help='Skip visualization generation') +@click.option('--skip-report', is_flag=True, help='Skip reproducibility report generation') +@click.option('--high-threshold', type=float, default=0.7, help='Minimum score for HIGH tier (default: 0.7)') +@click.option('--medium-threshold', type=float, default=0.4, help='Minimum score for MEDIUM tier (default: 0.4)') +@click.option('--low-threshold', type=float, default=0.2, help='Minimum score for LOW tier (default: 0.2)') +@click.option('--min-evidence-high', type=int, default=3, help='Minimum evidence layers for HIGH tier (default: 3)') +@click.option('--min-evidence-medium', type=int, default=2, help='Minimum evidence layers for MEDIUM tier (default: 2)') +@click.pass_context +def report(ctx, output_dir, force, skip_viz, skip_report, high_threshold, medium_threshold, low_threshold, min_evidence_high, min_evidence_medium): +``` + +Follow the established CLI command pattern: load config -> init store/provenance -> check prerequisites -> execute steps -> display summary -> cleanup. + +Pipeline steps (echoed with click.style like score_cmd.py): +1. Load configuration and initialize storage (same pattern as score_cmd.py) +2. Check scored_genes table exists (error if not: "Run 'usher-pipeline score' first") +3. Load scored_genes DataFrame from DuckDB via store.load_dataframe('scored_genes') +4. 
Build tier thresholds from CLI options into a dict: {"high": {"score": high_threshold, "evidence_count": min_evidence_high}, "medium": {"score": medium_threshold, "evidence_count": min_evidence_medium}, "low": {"score": low_threshold}}
+5. Apply tiering: `tiered_df = assign_tiers(scored_df, thresholds=thresholds)`
+6. Add evidence summary: `tiered_df = add_evidence_summary(tiered_df)`
+7. Write dual-format output: `paths = write_candidate_output(tiered_df, output_dir, "candidates")`
+8. Echo tier counts: "HIGH: N, MEDIUM: N, LOW: N (total: N candidates from M scored genes)"
+9. If not --skip-viz: `plot_paths = generate_all_plots(tiered_df, output_dir / "plots")` -- echo each plot file created
+10. If not --skip-report: Load the validation result if available (try store.load_dataframe for validation metadata, or call validate_known_gene_ranking if scored_genes has known-gene data). Call `report = generate_reproducibility_report(config, tiered_df, provenance, validation_result)`. Write report.to_json() and report.to_markdown() to output_dir.
+11. Save a provenance sidecar for the report command itself
+12. Display final summary: output directory, file list, tier counts
+
+Default output_dir: `Path(config.data_dir) / "report"` if not specified via --output-dir.
+
+If output files already exist and --force is not set, echo a warning and skip (checkpoint pattern).
+
+Ensure store.close() runs in a finally block.
+
+**main.py**: Add report command registration.
+
+Add import: `from usher_pipeline.cli.report_cmd import report`
+Add registration: `cli.add_command(report)`
+
+With report registered, the CLI has five top-level commands: setup, evidence, score, report, and the pre-existing info.
+
+
+Run: `cd /Users/gbanyan/Project/usher-exploring && usher-pipeline report --help` -- should show all options including --output-dir, --force, --skip-viz, --skip-report, and the tier thresholds
+Run: `usher-pipeline --help` -- should list report among the available commands
+
+
+report command is registered and shows all expected options in --help output. CLI entry point lists setup, evidence, score, report, and info commands.
+
+
+
+ Task 2: CliRunner integration tests for report command
+
+ tests/test_report_cmd.py
+
+
+**tests/test_report_cmd.py**: Create CliRunner integration tests.
+
+Follow the established test pattern from test_scoring_integration.py: create synthetic data in a tmp_path DuckDB, invoke CLI commands via CliRunner.
+
+Create test fixtures:
+- `test_config` fixture: Write minimal config YAML to tmp_path, pointing duckdb_path and data_dir to tmp_path
+- `populated_db` fixture: Create DuckDB at tmp_path, populate with:
+  - gene_universe table (20 synthetic genes with gene_id and gene_symbol)
+  - scored_genes table with all required columns (gene_id, gene_symbol, composite_score, evidence_count, quality_flag, all 6 layer score columns + 6 contribution columns, available_weight, weighted_sum)
+  - Design data so: 3 genes HIGH tier (score 0.7-0.95, evidence_count 3-5), 5 MEDIUM, 5 LOW, 4 EXCLUDED (score < 0.2), 3 NULL composite_score
+  - Register in _checkpoints table so has_checkpoint('scored_genes') returns True
+
+Tests:
+1. test_report_help: Invoke `report --help`, assert exit_code 0, assert "--output-dir" in output
+2. test_report_generates_files: Invoke report with populated_db and test_config, assert exit_code 0, verify candidates.tsv exists, candidates.parquet exists, candidates.provenance.yaml exists
+3. test_report_tier_counts_in_output: Invoke report, assert "HIGH: 3" (or similar) appears in CLI output
+4. 
test_report_with_viz: Invoke report (no --skip-viz), verify plots/ directory contains score_distribution.png, layer_contributions.png, tier_breakdown.png +5. test_report_skip_viz: Invoke report with --skip-viz, verify no plots/ directory created +6. test_report_skip_report: Invoke report with --skip-report, verify no reproducibility .json/.md files +7. test_report_custom_thresholds: Invoke with --high-threshold 0.8 --medium-threshold 0.5, verify different tier counts +8. test_report_no_scored_genes_error: Invoke report with empty DuckDB (no scored_genes table), assert exit_code != 0, assert "Run 'usher-pipeline score' first" in output +9. test_report_output_dir_option: Invoke with --output-dir custom_path, verify files created in custom_path + + +Run: `cd /Users/gbanyan/Project/usher-exploring && python -m pytest tests/test_report_cmd.py -v` + + +All 9 CliRunner integration tests pass. Report command correctly generates tiered candidates in TSV+Parquet, visualizations (unless --skip-viz), and reproducibility report (unless --skip-report). Custom tier thresholds work. Missing scored_genes table produces clear error message. All file paths are verified. + + + + + + +- `usher-pipeline --help` lists setup, evidence, score, report, info commands +- `usher-pipeline report --help` shows all options +- `python -m pytest tests/test_report_cmd.py -v` -- all 9 tests pass +- End-to-end: scored_genes data -> tiered candidates.tsv + candidates.parquet + provenance.yaml + plots/ + reproducibility report + + + +- CLI `report` command orchestrates full output pipeline in one invocation +- Supports --output-dir, --force, --skip-viz, --skip-report, and configurable tier thresholds +- Follows established CLI patterns (config loading, store init, checkpoint, provenance, summary, cleanup) +- All CliRunner integration tests pass +- Unified CLI has all subcommands: setup, evidence, score, report, info + + + +After completion, create `.planning/phases/05-output-cli/05-03-SUMMARY.md` +
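+
+For reference, the Task 1 orchestration above might reduce to roughly this skeleton (a sketch; `open_store` and the `ctx.obj` config handoff are placeholders for whatever pattern score_cmd.py actually uses):
+
+```
+# Sketch only -- the Task 1 spec above is authoritative.
+from pathlib import Path
+
+import click
+
+from usher_pipeline.output import (add_evidence_summary, assign_tiers,
+                                   generate_all_plots, write_candidate_output)
+from usher_pipeline.output.reproducibility import generate_reproducibility_report
+
+
+@click.command("report")
+@click.option("--output-dir", type=click.Path(path_type=Path), default=None)
+@click.option("--force", is_flag=True)
+@click.option("--skip-viz", is_flag=True)
+@click.option("--skip-report", is_flag=True)
+@click.option("--high-threshold", type=float, default=0.7)
+@click.option("--medium-threshold", type=float, default=0.4)
+@click.option("--low-threshold", type=float, default=0.2)
+@click.option("--min-evidence-high", type=int, default=3)
+@click.option("--min-evidence-medium", type=int, default=2)
+@click.pass_context
+def report(ctx, output_dir, force, skip_viz, skip_report, high_threshold,
+           medium_threshold, low_threshold, min_evidence_high, min_evidence_medium):
+    config = ctx.obj["config"]  # placeholder: however main.py actually shares config
+    store, provenance = open_store(config)  # placeholder mirroring score_cmd.py init
+    try:
+        if not store.has_checkpoint("scored_genes"):
+            raise click.ClickException("No scored_genes table. Run 'usher-pipeline score' first")
+        out = output_dir or Path(config.data_dir) / "report"
+        if (out / "candidates.tsv").exists() and not force:
+            click.echo("Report files already exist; use --force to overwrite")
+            return
+        thresholds = {
+            "high": {"score": high_threshold, "evidence_count": min_evidence_high},
+            "medium": {"score": medium_threshold, "evidence_count": min_evidence_medium},
+            "low": {"score": low_threshold},
+        }
+        tiered = add_evidence_summary(
+            assign_tiers(store.load_dataframe("scored_genes"), thresholds=thresholds))
+        write_candidate_output(tiered, out, "candidates")
+        if not skip_viz:
+            generate_all_plots(tiered, out / "plots")
+        if not skip_report:
+            rep = generate_reproducibility_report(config, tiered, provenance, None)  # validation optional
+            rep.to_json(out / "reproducibility.json")
+            rep.to_markdown(out / "reproducibility.md")
+        click.echo(f"Report written to {out}")
+    finally:
+        store.close()
+```
+
+Task 2 then drives this through click.testing.CliRunner, e.g. `CliRunner().invoke(cli, ["report", "--skip-viz"])` against a tmp_path-backed config.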