"""Integration tests for report CLI command using CliRunner. Tests: - report --help - Full report generation with tiered candidates - Tier counts in CLI output - Visualization generation - --skip-viz flag - --skip-report flag - Custom tier thresholds - Missing scored_genes error handling - Custom output directory """ import tempfile from pathlib import Path import polars as pl import pytest import duckdb from click.testing import CliRunner from usher_pipeline.cli.main import cli @pytest.fixture def test_config(tmp_path): """Create minimal config YAML for testing.""" config_path = tmp_path / "test_config.yaml" config_content = f""" versions: ensembl_release: "111" gnomad_version: "v4.1" gtex_version: "v8" hpa_version: "v23" data_dir: {tmp_path}/data cache_dir: {tmp_path}/cache duckdb_path: {tmp_path}/test.duckdb api: rate_limit_per_second: 3 max_retries: 3 cache_ttl_seconds: 3600 timeout_seconds: 30 scoring: gnomad: 0.20 expression: 0.15 annotation: 0.15 localization: 0.15 animal_model: 0.15 literature: 0.20 """ config_path.write_text(config_content) return config_path @pytest.fixture def populated_db(tmp_path): """Create DuckDB with gene_universe and scored_genes tables.""" db_path = tmp_path / "test.duckdb" conn = duckdb.connect(str(db_path)) # Create gene_universe table (20 synthetic genes) gene_universe_df = pl.DataFrame({ "gene_id": [f"ENSG{i:011d}" for i in range(1, 21)], "gene_symbol": [f"GENE{i}" for i in range(1, 21)], }) conn.execute("CREATE TABLE gene_universe AS SELECT * FROM gene_universe_df") # Create scored_genes table # Design: 3 HIGH (score 0.7-0.95, evidence 3-5), 5 MEDIUM, 5 LOW, 4 EXCLUDED (score < 0.2), 3 NULL composite_score scored_genes_df = pl.DataFrame({ "gene_id": [f"ENSG{i:011d}" for i in range(1, 21)], "gene_symbol": [f"GENE{i}" for i in range(1, 21)], # HIGH tier: genes 1-3 "composite_score": [0.95, 0.85, 0.75] + # MEDIUM tier: genes 4-8 [0.65, 0.55, 0.45, 0.42, 0.40] + # LOW tier: genes 9-13 [0.35, 0.30, 0.25, 0.22, 0.20] + # EXCLUDED: genes 14-17 [0.15, 0.10, 0.05, 0.02] + # NULL: genes 18-20 [None, None, None], "evidence_count": [5, 4, 3] + # HIGH [3, 2, 2, 2, 2] + # MEDIUM [1, 1, 1, 1, 1] + # LOW [1, 1, 0, 0] + # EXCLUDED [0, 0, 0], # NULL "quality_flag": ["sufficient_evidence"] * 3 + ["sufficient_evidence", "moderate_evidence", "moderate_evidence", "moderate_evidence", "moderate_evidence"] + ["sparse_evidence"] * 5 + ["sparse_evidence", "sparse_evidence", "no_evidence", "no_evidence"] + ["no_evidence"] * 3, # Layer scores (simplified) "gnomad_score": [0.9] * 20, "expression_score": [0.8] * 20, "annotation_score": [0.7] * 20, "localization_score": [0.6] * 20, "animal_model_score": [0.5] * 20, "literature_score": [0.4] * 20, # Contribution columns "gnomad_contribution": [0.18] * 20, "expression_contribution": [0.12] * 20, "annotation_contribution": [0.105] * 20, "localization_contribution": [0.09] * 20, "animal_model_contribution": [0.075] * 20, "literature_contribution": [0.08] * 20, # Weighted average metadata "available_weight": [1.0] * 20, "weighted_sum": [0.65] * 20, }) conn.execute("CREATE TABLE scored_genes AS SELECT * FROM scored_genes_df") # Register checkpoint conn.execute(""" CREATE TABLE IF NOT EXISTS _checkpoints ( table_name VARCHAR PRIMARY KEY, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ) """) conn.execute("INSERT INTO _checkpoints (table_name) VALUES ('scored_genes')") conn.close() return db_path def test_report_help(test_config): """Test report --help shows all options.""" runner = CliRunner() result = runner.invoke(cli, ['--config', 

def test_report_help(test_config):
    """Test that report --help shows all options."""
    runner = CliRunner()
    result = runner.invoke(cli, ['--config', str(test_config), 'report', '--help'])
    assert result.exit_code == 0
    assert '--output-dir' in result.output
    assert '--force' in result.output
    assert '--skip-viz' in result.output
    assert '--skip-report' in result.output
    assert '--high-threshold' in result.output
    assert '--medium-threshold' in result.output
    assert '--low-threshold' in result.output
    assert '--min-evidence-high' in result.output
    assert '--min-evidence-medium' in result.output


def test_report_generates_files(test_config, populated_db):
    """Test that report generates candidates.tsv, candidates.parquet, and provenance."""
    runner = CliRunner()
    result = runner.invoke(cli, ['--config', str(test_config), 'report'])
    assert result.exit_code == 0

    # Check that the output files exist.
    output_dir = test_config.parent / "data" / "report"
    assert (output_dir / "candidates.tsv").exists()
    assert (output_dir / "candidates.parquet").exists()
    assert (output_dir / "candidates.provenance.yaml").exists()


def test_report_tier_counts_in_output(test_config, populated_db):
    """Test that the report CLI output shows tier counts."""
    runner = CliRunner()
    result = runner.invoke(cli, ['--config', str(test_config), 'report'])
    assert result.exit_code == 0

    # Expected: 3 HIGH, 5 MEDIUM, 5 LOW (from the synthetic data design).
    assert 'HIGH' in result.output
    assert 'MEDIUM' in result.output
    assert 'LOW' in result.output
    # Loose substring checks, since the exact output format may vary.
    assert '3' in result.output  # HIGH count
    assert '5' in result.output  # MEDIUM and LOW counts


def test_report_with_viz(test_config, populated_db):
    """Test that report generates plots by default."""
    runner = CliRunner()
    result = runner.invoke(cli, ['--config', str(test_config), 'report'])
    assert result.exit_code == 0

    # Check that the plots directory and files exist.
    plots_dir = test_config.parent / "data" / "report" / "plots"
    assert plots_dir.exists()
    assert (plots_dir / "score_distribution.png").exists()
    assert (plots_dir / "layer_contributions.png").exists()
    assert (plots_dir / "tier_breakdown.png").exists()


def test_report_skip_viz(test_config, populated_db, tmp_path):
    """Test that the --skip-viz flag skips visualization generation."""
    runner = CliRunner()
    # Use a custom output dir so we can assert directly on its contents.
    custom_output = tmp_path / "output_no_viz"
    result = runner.invoke(cli, [
        '--config', str(test_config), 'report',
        '--output-dir', str(custom_output),
        '--skip-viz',
    ])
    assert result.exit_code == 0
    assert 'Skipping visualizations' in result.output

    # The plots directory should not exist (or should be empty).
    plots_dir = custom_output / "plots"
    assert not plots_dir.exists() or not any(plots_dir.iterdir())


def test_report_skip_report(test_config, populated_db, tmp_path):
    """Test that the --skip-report flag skips reproducibility report generation."""
    runner = CliRunner()
    custom_output = tmp_path / "output_no_report"
    result = runner.invoke(cli, [
        '--config', str(test_config), 'report',
        '--output-dir', str(custom_output),
        '--skip-report',
    ])
    assert result.exit_code == 0
    assert 'Skipping reproducibility report' in result.output

    # The reproducibility files should not exist.
    assert not (custom_output / "reproducibility.json").exists()
    assert not (custom_output / "reproducibility.md").exists()
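
# --- Added sketch (not in the original suite) ---------------------------------
# test_report_help asserts that a '--force' option exists, but its semantics are
# not documented here; this sketch assumes --force permits regenerating outputs
# that already exist. Adjust the assertions if the flag behaves differently.
def test_report_force_rerun(test_config, populated_db):
    """Sketch: re-running report with --force succeeds over existing outputs."""
    runner = CliRunner()
    first = runner.invoke(cli, ['--config', str(test_config), 'report'])
    assert first.exit_code == 0

    # Second run with --force; assumed to overwrite rather than fail.
    second = runner.invoke(cli, ['--config', str(test_config), 'report', '--force'])
    assert second.exit_code == 0
    output_dir = test_config.parent / "data" / "report"
    assert (output_dir / "candidates.tsv").exists()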

def test_report_custom_thresholds(test_config, populated_db, tmp_path):
    """Test that custom tier thresholds produce different tier counts."""
    runner = CliRunner()
    custom_output = tmp_path / "output_custom_thresholds"

    # Use higher thresholds: HIGH >= 0.8, MEDIUM >= 0.5, LOW >= 0.2.
    result = runner.invoke(cli, [
        '--config', str(test_config), 'report',
        '--output-dir', str(custom_output),
        '--high-threshold', '0.8',
        '--medium-threshold', '0.5',
        '--low-threshold', '0.2',
    ])
    assert result.exit_code == 0

    # With these thresholds:
    #   HIGH: genes 1-2 (scores 0.95, 0.85)
    #   MEDIUM: genes 3-5 (scores 0.75, 0.65, 0.55; gene 6 at 0.45 falls below 0.5)
    #   LOW: the remaining genes scoring >= 0.2
    # Load the output and verify the HIGH tier shrank relative to the default.
    candidates_df = pl.read_parquet(custom_output / "candidates.parquet")
    high_count = candidates_df.filter(pl.col('confidence_tier') == 'HIGH').height
    # Only genes with score >= 0.8 that meet the HIGH evidence minimum.
    assert high_count == 2


def test_report_no_scored_genes_error(test_config, tmp_path):
    """Test that report fails with a clear error when scored_genes is missing."""
    # Create an empty DuckDB (no scored_genes table) at the path the config
    # points to, so the CLI opens a database with no scoring results.
    empty_db_path = tmp_path / "test.duckdb"
    conn = duckdb.connect(str(empty_db_path))
    conn.close()

    runner = CliRunner()
    result = runner.invoke(cli, ['--config', str(test_config), 'report'])
    assert result.exit_code != 0
    assert "Run 'usher-pipeline score' first" in result.output


def test_report_output_dir_option(test_config, populated_db, tmp_path):
    """Test that the --output-dir option creates files in a custom location."""
    runner = CliRunner()
    custom_output = tmp_path / "custom_report_dir"
    result = runner.invoke(cli, [
        '--config', str(test_config), 'report',
        '--output-dir', str(custom_output),
    ])
    assert result.exit_code == 0

    # Files should be in the custom directory.
    assert (custom_output / "candidates.tsv").exists()
    assert (custom_output / "candidates.parquet").exists()
    assert (custom_output / "candidates.provenance.yaml").exists()
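
# --- Added sketch (not in the original suite) ---------------------------------
# Only asserts that the provenance sidecar parses as a non-empty YAML mapping;
# the concrete schema is not documented here, so no specific keys are checked.
# Skips gracefully if PyYAML is not installed.
def test_report_provenance_parses_as_yaml(test_config, populated_db):
    """Sketch: candidates.provenance.yaml should be well-formed YAML."""
    yaml = pytest.importorskip("yaml")
    runner = CliRunner()
    result = runner.invoke(cli, ['--config', str(test_config), 'report'])
    assert result.exit_code == 0

    provenance_path = (
        test_config.parent / "data" / "report" / "candidates.provenance.yaml"
    )
    parsed = yaml.safe_load(provenance_path.read_text())
    assert isinstance(parsed, dict) and parsed  # non-empty mapping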