- Create test_report_cmd.py with 9 comprehensive tests
- Test fixtures: test_config (minimal YAML), populated_db (synthetic scored_genes data)
- Test coverage: help output, file generation, tier counts, visualizations, skip flags, custom thresholds, error handling, custom output directory
- Synthetic data design: 3 HIGH, 5 MEDIUM, 5 LOW, 4 EXCLUDED, 3 NULL composite_score
- All tests pass with isolated tmp_path DuckDB instances
- Fixed report_cmd.py tier threshold format (uppercase keys: HIGH/MEDIUM/LOW, composite_score field)
- Fixed write_candidate_output parameter name (filename_base, not base_filename)
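For context, here is a rough sketch of the tier assignment the fixed threshold format implies. The uppercase tier keys and the composite_score field name come from the fix noted above; the default cutoff values and the assign_tier helper itself are illustrative assumptions (they match the synthetic data design below, but the real report_cmd.py may differ), and the --min-evidence-high / --min-evidence-medium checks are omitted for brevity.

```python
# Illustrative sketch only -- not the actual report_cmd.py implementation.
# Uppercase tier keys and the composite_score field are what the fix standardized on;
# the cutoff values below are assumed defaults inferred from the synthetic data design.
TIER_THRESHOLDS = {"HIGH": 0.7, "MEDIUM": 0.4, "LOW": 0.2}

def assign_tier(composite_score: float | None) -> str:
    """Map a composite_score to a tier (evidence-count minimums omitted)."""
    if composite_score is None or composite_score < TIER_THRESHOLDS["LOW"]:
        return "EXCLUDED"
    if composite_score >= TIER_THRESHOLDS["HIGH"]:
        return "HIGH"
    if composite_score >= TIER_THRESHOLDS["MEDIUM"]:
        return "MEDIUM"
    return "LOW"
```

Applied to the populated_db fixture below, these cutoffs would reproduce the 3/5/5/4 HIGH/MEDIUM/LOW/EXCLUDED split plus 3 NULL-score genes that the tests assert on.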
"""Integration tests for report CLI command using CliRunner.
|
|
|
|
Tests:
|
|
- report --help
|
|
- Full report generation with tiered candidates
|
|
- Tier counts in CLI output
|
|
- Visualization generation
|
|
- --skip-viz flag
|
|
- --skip-report flag
|
|
- Custom tier thresholds
|
|
- Missing scored_genes error handling
|
|
- Custom output directory
|
|
"""
|
|
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
import polars as pl
|
|
import pytest
|
|
import duckdb
|
|
from click.testing import CliRunner
|
|
|
|
from usher_pipeline.cli.main import cli
|
|
|
|
|
|
@pytest.fixture
def test_config(tmp_path):
    """Create minimal config YAML for testing."""
    config_path = tmp_path / "test_config.yaml"
    config_content = f"""
versions:
  ensembl_release: "111"
  gnomad_version: "v4.1"
  gtex_version: "v8"
  hpa_version: "v23"

data_dir: {tmp_path}/data
cache_dir: {tmp_path}/cache
duckdb_path: {tmp_path}/test.duckdb

api:
  rate_limit_per_second: 3
  max_retries: 3
  cache_ttl_seconds: 3600
  timeout_seconds: 30

scoring:
  gnomad: 0.20
  expression: 0.15
  annotation: 0.15
  localization: 0.15
  animal_model: 0.15
  literature: 0.20
"""
    config_path.write_text(config_content)
    return config_path


@pytest.fixture
def populated_db(tmp_path):
    """Create DuckDB with gene_universe and scored_genes tables."""
    db_path = tmp_path / "test.duckdb"
    conn = duckdb.connect(str(db_path))

    # Create gene_universe table (20 synthetic genes)
    gene_universe_df = pl.DataFrame({
        "gene_id": [f"ENSG{i:011d}" for i in range(1, 21)],
        "gene_symbol": [f"GENE{i}" for i in range(1, 21)],
    })
    conn.execute("CREATE TABLE gene_universe AS SELECT * FROM gene_universe_df")

    # Create scored_genes table
    # Design: 3 HIGH (score 0.7-0.95, evidence 3-5), 5 MEDIUM, 5 LOW,
    # 4 EXCLUDED (score < 0.2), 3 NULL composite_score
    scored_genes_df = pl.DataFrame({
        "gene_id": [f"ENSG{i:011d}" for i in range(1, 21)],
        "gene_symbol": [f"GENE{i}" for i in range(1, 21)],
        "composite_score": (
            [0.95, 0.85, 0.75]                # HIGH tier: genes 1-3
            + [0.65, 0.55, 0.45, 0.42, 0.40]  # MEDIUM tier: genes 4-8
            + [0.35, 0.30, 0.25, 0.22, 0.20]  # LOW tier: genes 9-13
            + [0.15, 0.10, 0.05, 0.02]        # EXCLUDED: genes 14-17
            + [None, None, None]              # NULL: genes 18-20
        ),
        "evidence_count": (
            [5, 4, 3]          # HIGH
            + [3, 2, 2, 2, 2]  # MEDIUM
            + [1, 1, 1, 1, 1]  # LOW
            + [1, 1, 0, 0]     # EXCLUDED
            + [0, 0, 0]        # NULL
        ),
        "quality_flag": (
            ["sufficient_evidence"] * 3
            + ["sufficient_evidence", "moderate_evidence", "moderate_evidence",
               "moderate_evidence", "moderate_evidence"]
            + ["sparse_evidence"] * 5
            + ["sparse_evidence", "sparse_evidence", "no_evidence", "no_evidence"]
            + ["no_evidence"] * 3
        ),
        # Layer scores (simplified)
        "gnomad_score": [0.9] * 20,
        "expression_score": [0.8] * 20,
        "annotation_score": [0.7] * 20,
        "localization_score": [0.6] * 20,
        "animal_model_score": [0.5] * 20,
        "literature_score": [0.4] * 20,
        # Contribution columns
        "gnomad_contribution": [0.18] * 20,
        "expression_contribution": [0.12] * 20,
        "annotation_contribution": [0.105] * 20,
        "localization_contribution": [0.09] * 20,
        "animal_model_contribution": [0.075] * 20,
        "literature_contribution": [0.08] * 20,
        # Weighted average metadata
        "available_weight": [1.0] * 20,
        "weighted_sum": [0.65] * 20,
    })
    conn.execute("CREATE TABLE scored_genes AS SELECT * FROM scored_genes_df")

    # Register checkpoint
    conn.execute("""
        CREATE TABLE IF NOT EXISTS _checkpoints (
            table_name VARCHAR PRIMARY KEY,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)
    conn.execute("INSERT INTO _checkpoints (table_name) VALUES ('scored_genes')")

    conn.close()
    return db_path


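# Notes on the populated_db fixture above:
# - The CREATE TABLE ... AS SELECT statements read gene_universe_df and scored_genes_df
#   directly via DuckDB's Python replacement scan, which resolves bare table names
#   against DataFrames in the caller's scope.
# - The contribution columns are consistent with test_config's scoring weights: each
#   *_contribution equals layer score x weight (e.g. gnomad 0.9 * 0.20 = 0.18,
#   literature 0.4 * 0.20 = 0.08), the six contributions sum to weighted_sum (0.65),
#   and available_weight (1.0) is the sum of the weights.
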
def test_report_help(test_config):
    """Test report --help shows all options."""
    runner = CliRunner()
    result = runner.invoke(cli, ['--config', str(test_config), 'report', '--help'])

    assert result.exit_code == 0
    assert '--output-dir' in result.output
    assert '--force' in result.output
    assert '--skip-viz' in result.output
    assert '--skip-report' in result.output
    assert '--high-threshold' in result.output
    assert '--medium-threshold' in result.output
    assert '--low-threshold' in result.output
    assert '--min-evidence-high' in result.output
    assert '--min-evidence-medium' in result.output


def test_report_generates_files(test_config, populated_db):
    """Test report generates candidates.tsv, candidates.parquet, and provenance."""
    runner = CliRunner()
    result = runner.invoke(cli, [
        '--config', str(test_config),
        'report'
    ])

    assert result.exit_code == 0

    # Check output files exist
    output_dir = test_config.parent / "data" / "report"
    assert (output_dir / "candidates.tsv").exists()
    assert (output_dir / "candidates.parquet").exists()
    assert (output_dir / "candidates.provenance.yaml").exists()


def test_report_tier_counts_in_output(test_config, populated_db):
    """Test report CLI output shows tier counts."""
    runner = CliRunner()
    result = runner.invoke(cli, [
        '--config', str(test_config),
        'report'
    ])

    assert result.exit_code == 0
    # Expected: 3 HIGH, 5 MEDIUM, 5 LOW (from synthetic data design)
    assert 'HIGH' in result.output
    assert 'MEDIUM' in result.output
    assert 'LOW' in result.output
    # Check for counts (loose substring checks since the exact output format may vary)
    assert '3' in result.output  # HIGH count
    assert '5' in result.output  # MEDIUM and LOW counts


def test_report_with_viz(test_config, populated_db):
    """Test report generates plots by default."""
    runner = CliRunner()
    result = runner.invoke(cli, [
        '--config', str(test_config),
        'report'
    ])

    assert result.exit_code == 0

    # Check plots directory and files exist
    plots_dir = test_config.parent / "data" / "report" / "plots"
    assert plots_dir.exists()
    assert (plots_dir / "score_distribution.png").exists()
    assert (plots_dir / "layer_contributions.png").exists()
    assert (plots_dir / "tier_breakdown.png").exists()


def test_report_skip_viz(test_config, populated_db, tmp_path):
    """Test --skip-viz flag skips visualization generation."""
    runner = CliRunner()

    # Use a dedicated output dir instead of the default data/report location
    custom_output = tmp_path / "output_no_viz"

    result = runner.invoke(cli, [
        '--config', str(test_config),
        'report',
        '--output-dir', str(custom_output),
        '--skip-viz'
    ])

    assert result.exit_code == 0
    assert 'Skipping visualizations' in result.output

    # Plots directory should not exist
    plots_dir = custom_output / "plots"
    assert not plots_dir.exists() or not any(plots_dir.iterdir())


def test_report_skip_report(test_config, populated_db, tmp_path):
    """Test --skip-report flag skips reproducibility report generation."""
    runner = CliRunner()

    custom_output = tmp_path / "output_no_report"

    result = runner.invoke(cli, [
        '--config', str(test_config),
        'report',
        '--output-dir', str(custom_output),
        '--skip-report'
    ])

    assert result.exit_code == 0
    assert 'Skipping reproducibility report' in result.output

    # Reproducibility files should not exist
    assert not (custom_output / "reproducibility.json").exists()
    assert not (custom_output / "reproducibility.md").exists()


def test_report_custom_thresholds(test_config, populated_db, tmp_path):
    """Test custom tier thresholds produce different tier counts."""
    runner = CliRunner()

    custom_output = tmp_path / "output_custom_thresholds"

    # Use higher thresholds: HIGH >= 0.8, MEDIUM >= 0.5, LOW >= 0.2
    result = runner.invoke(cli, [
        '--config', str(test_config),
        'report',
        '--output-dir', str(custom_output),
        '--high-threshold', '0.8',
        '--medium-threshold', '0.5',
        '--low-threshold', '0.2'
    ])

    assert result.exit_code == 0

    # With these thresholds:
    # HIGH: genes 1-2 (scores 0.95, 0.85)
    # MEDIUM: genes 3-5 (scores 0.75, 0.65, 0.55), subject to evidence minimums
    # LOW: remaining genes with scores >= 0.2
    # Counts should differ from the defaults

    # Load the output and verify tier distribution changed
    candidates_df = pl.read_parquet(custom_output / "candidates.parquet")

    high_count = candidates_df.filter(pl.col('confidence_tier') == 'HIGH').height
    assert high_count == 2  # Only genes with score >= 0.8 and evidence >= 3


def test_report_no_scored_genes_error(test_config, tmp_path):
    """Test report with missing scored_genes table produces clear error."""
    # Create an empty DuckDB (no scored_genes table) at the path configured as duckdb_path
    empty_db_path = tmp_path / "test.duckdb"
    conn = duckdb.connect(str(empty_db_path))
    conn.close()

    runner = CliRunner()
    result = runner.invoke(cli, [
        '--config', str(test_config),
        'report'
    ])

    assert result.exit_code != 0
    assert "Run 'usher-pipeline score' first" in result.output


def test_report_output_dir_option(test_config, populated_db, tmp_path):
    """Test --output-dir option creates files in custom location."""
    runner = CliRunner()

    custom_output = tmp_path / "custom_report_dir"

    result = runner.invoke(cli, [
        '--config', str(test_config),
        'report',
        '--output-dir', str(custom_output)
    ])

    assert result.exit_code == 0

    # Files should be in custom directory
    assert (custom_output / "candidates.tsv").exists()
    assert (custom_output / "candidates.parquet").exists()
    assert (custom_output / "candidates.provenance.yaml").exists()