From 5879ae9768aeba2008265e996f3eb4841564c57b Mon Sep 17 00:00:00 2001
From: gbanyan <gbanyan.huang@gmail.com>
Date: Thu, 12 Feb 2026 04:50:17 +0800
Subject: [PATCH] test(06-03): unit tests for all validation modules

- Created test_validation.py with 13 comprehensive tests
- create_synthetic_scored_db helper: designs scores for known genes high, housekeeping low
- Tests for negative controls:
  - test_compile_housekeeping_genes_structure
  - test_compile_housekeeping_genes_known_genes_present
  - test_validate_negative_controls_with_synthetic_data
  - test_validate_negative_controls_inverted_logic
- Tests for recall@k:
  - test_compute_recall_at_k
- Tests for sensitivity/perturbation:
  - test_perturb_weight_renormalizes
  - test_perturb_weight_large_negative
  - test_perturb_weight_invalid_layer
- Tests for validation report:
  - test_generate_comprehensive_validation_report_format
  - test_recommend_weight_tuning_all_pass
  - test_recommend_weight_tuning_positive_fail
  - test_recommend_weight_tuning_negative_fail
  - test_recommend_weight_tuning_sensitivity_fail
- All tests pass using synthetic DuckDB with tmp_path isolation
---
 tests/test_validation.py | 478 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 478 insertions(+)
 create mode 100644 tests/test_validation.py

diff --git a/tests/test_validation.py b/tests/test_validation.py
new file mode 100644
index 0000000..641faef
--- /dev/null
+++ b/tests/test_validation.py
@@ -0,0 +1,478 @@
+"""Tests for validation modules (negative controls, recall@k, sensitivity, report)."""
+
+import polars as pl
+import pytest
+from pathlib import Path
+
+from usher_pipeline.persistence.duckdb_store import PipelineStore
+from usher_pipeline.config.schema import ScoringWeights
+from usher_pipeline.scoring import (
+    compile_housekeeping_genes,
+    validate_negative_controls,
+    compute_recall_at_k,
+    validate_positive_controls_extended,
+)
+from usher_pipeline.scoring.sensitivity import (
+    perturb_weight,
+    run_sensitivity_analysis,
+    summarize_sensitivity,
+)
+from usher_pipeline.scoring.validation_report import (
+    generate_comprehensive_validation_report,
+    recommend_weight_tuning,
+)
+
+
+def create_synthetic_scored_db(tmp_path: Path) -> PipelineStore:
+    """
+    Create DuckDB with synthetic scored_genes for testing.
+
+    Designs scores so:
+    - Known cilia genes (MYO7A, IFT88, BBS1) get high scores (0.8-0.95)
+    - Housekeeping genes (GAPDH, ACTB, RPL13A) get low scores (0.1-0.3)
+    - Other genes get mid-range scores (0.3-0.6)
+
+    Returns:
+        PipelineStore with synthetic scored_genes table
+    """
+    db_path = tmp_path / "test.duckdb"
+    store = PipelineStore(db_path)
+
+    # Create gene_universe
+    gene_symbols = [
+        # Known cilia genes (high scores)
+        "MYO7A", "IFT88", "BBS1",
+        # Housekeeping genes (low scores)
+        "GAPDH", "ACTB", "RPL13A",
+        # Filler genes (mid scores)
+        "GENE001", "GENE002", "GENE003", "GENE004", "GENE005",
+        "GENE006", "GENE007", "GENE008", "GENE009", "GENE010",
+        "GENE011", "GENE012", "GENE013", "GENE014",
+    ]
+
+    gene_universe = pl.DataFrame({
+        "gene_id": [f"ENSG{i:011d}" for i in range(len(gene_symbols))],
+        "gene_symbol": gene_symbols,
+        "hgnc_id": [f"HGNC:{i+1000}" for i in range(len(gene_symbols))],
+        "uniprot_primary": [f"P{i+10000}" for i in range(len(gene_symbols))],
+    })
+
+    store.conn.execute("CREATE TABLE gene_universe AS SELECT * FROM gene_universe")
+
+    # Create scored_genes with designed scores
+    scored_data = {
+        "gene_id": gene_universe["gene_id"].to_list(),
+        "gene_symbol": gene_symbols,
+        # Design composite_score to ensure known genes high, housekeeping low
+        "composite_score": [
+            # Known cilia genes (indices 0-2): high scores
+            0.90, 0.85, 0.92,
+            # Housekeeping genes (indices 3-5): low scores
+            0.15, 0.20, 0.12,
+            # Filler genes: mid scores
+            0.45, 0.50, 0.35, 0.55, 0.40,
+            0.48, 0.52, 0.38, 0.42, 0.58,
+            0.43, 0.47, 0.53, 0.41,
+        ],
+        # Layer scores (simplified - just use composite_score with small variations)
+        "gnomad_score": [
+            0.88, 0.83, 0.90,
+            0.18, 0.22, 0.15,
+            0.44, 0.49, 0.34, 0.54, 0.39,
+            0.47, 0.51, 0.37, 0.41, 0.57,
+            0.42, 0.46, 0.52, 0.40,
+        ],
+        "expression_score": [
+            0.92, 0.87, 0.94,
+            0.12, 0.18, 0.10,
+            0.46, 0.51, 0.36, 0.56, 0.41,
+            0.49, 0.53, 0.39, 0.43, 0.59,
+            0.44, 0.48, 0.54, 0.42,
+        ],
+        "annotation_score": [
+            0.90, 0.85, 0.92,
+            0.15, 0.20, 0.12,
+            0.45, 0.50, 0.35, 0.55, 0.40,
+            0.48, 0.52, 0.38, 0.42, 0.58,
+            0.43, 0.47, 0.53, 0.41,
+        ],
+        "localization_score": [
+            0.91, 0.86, 0.93,
+            0.14, 0.19, 0.11,
+            0.45, 0.50, 0.35, 0.55, 0.40,
+            0.48, 0.52, 0.38, 0.42, 0.58,
+            0.43, 0.47, 0.53, 0.41,
+        ],
+        "animal_model_score": [
+            0.89, 0.84, 0.91,
+            0.16, 0.21, 0.13,
+            0.45, 0.50, 0.35, 0.55, 0.40,
+            0.48, 0.52, 0.38, 0.42, 0.58,
+            0.43, 0.47, 0.53, 0.41,
+        ],
+        "literature_score": [
+            0.90, 0.85, 0.92,
+            0.15, 0.20, 0.12,
+            0.45, 0.50, 0.35, 0.55, 0.40,
+            0.48, 0.52, 0.38, 0.42, 0.58,
+            0.43, 0.47, 0.53, 0.41,
+        ],
+        "evidence_count": [6] * len(gene_symbols),
+        "quality_flag": ["sufficient_evidence"] * len(gene_symbols),
+    }
+
+    scored_df = pl.DataFrame(scored_data)
+    store.conn.execute("CREATE TABLE scored_genes AS SELECT * FROM scored_df")
+
+    # Create known_genes table
+    known_genes = pl.DataFrame({
+        "gene_symbol": ["MYO7A", "IFT88", "BBS1"],
+        "source": ["omim_usher", "syscilia_scgs_v2", "syscilia_scgs_v2"],
+        "confidence": ["HIGH", "HIGH", "HIGH"],
+    })
+
+    store.conn.execute("CREATE TABLE known_genes AS SELECT * FROM known_genes")
+
+    return store
+
+
+def test_compile_housekeeping_genes_structure():
+    """Test compile_housekeeping_genes returns correct structure."""
+    df = compile_housekeeping_genes()
+
+    # Check columns
+    assert "gene_symbol" in df.columns
+    assert "source" in df.columns
+    assert "confidence" in df.columns
+
+    # Check row count
+    assert df.height == 13, f"Expected 13 housekeeping genes, got {df.height}"
+
+    # Check all are literature_validated
+    assert df["source"].unique().to_list() == ["literature_validated"]
+
+    # Check all are HIGH confidence
+    assert df["confidence"].unique().to_list() == ["HIGH"]
+
+
+def test_compile_housekeeping_genes_known_genes_present():
+    """Test known housekeeping genes are present in compiled set."""
+    df = compile_housekeeping_genes()
+
+    gene_symbols = df["gene_symbol"].to_list()
+
+    # Check for well-known housekeeping genes
+    assert "GAPDH" in gene_symbols
+    assert "ACTB" in gene_symbols
+    assert "RPL13A" in gene_symbols
+    assert "TBP" in gene_symbols
+
+
+def test_validate_negative_controls_with_synthetic_data(tmp_path):
+    """Test negative control validation with synthetic data where housekeeping genes rank low."""
+    store = create_synthetic_scored_db(tmp_path)
+
+    try:
+        # Run negative control validation
+        metrics = validate_negative_controls(store)
+
+        # Should pass (housekeeping genes rank low in synthetic data)
+        assert metrics["validation_passed"] is True, \
+            f"Validation should pass with low housekeeping scores. Median: {metrics['median_percentile']}"
+
+        # Median percentile should be < 0.5
+        assert metrics["median_percentile"] < 0.5, \
+            f"Housekeeping median percentile should be < 0.5, got {metrics['median_percentile']}"
+
+        # Check found some housekeeping genes
+        assert metrics["total_in_dataset"] > 0, "Should find at least one housekeeping gene"
+
+    finally:
+        store.close()
+
+
+def test_validate_negative_controls_inverted_logic(tmp_path):
+    """Test negative control validation fails when housekeeping genes rank HIGH."""
+    db_path = tmp_path / "test_inverted.duckdb"
+    store = PipelineStore(db_path)
+
+    try:
+        # Create gene_universe
+        gene_symbols = ["GAPDH", "ACTB", "RPL13A", "GENE001", "GENE002"]
+        gene_universe = pl.DataFrame({
+            "gene_id": [f"ENSG{i:011d}" for i in range(len(gene_symbols))],
+            "gene_symbol": gene_symbols,
+            "hgnc_id": [f"HGNC:{i+1000}" for i in range(len(gene_symbols))],
+            "uniprot_primary": [f"P{i+10000}" for i in range(len(gene_symbols))],
+        })
+        store.conn.execute("CREATE TABLE gene_universe AS SELECT * FROM gene_universe")
+
+        # Create scored_genes where housekeeping genes rank HIGH (inverted)
+        scored_df = pl.DataFrame({
+            "gene_id": gene_universe["gene_id"].to_list(),
+            "gene_symbol": gene_symbols,
+            "composite_score": [0.90, 0.85, 0.88, 0.20, 0.15],  # Housekeeping HIGH
+            "gnomad_score": [0.90, 0.85, 0.88, 0.20, 0.15],
+            "expression_score": [0.90, 0.85, 0.88, 0.20, 0.15],
+            "annotation_score": [0.90, 0.85, 0.88, 0.20, 0.15],
+            "localization_score": [0.90, 0.85, 0.88, 0.20, 0.15],
+            "animal_model_score": [0.90, 0.85, 0.88, 0.20, 0.15],
+            "literature_score": [0.90, 0.85, 0.88, 0.20, 0.15],
+            "evidence_count": [6] * len(gene_symbols),
+            "quality_flag": ["sufficient_evidence"] * len(gene_symbols),
+        })
+        store.conn.execute("CREATE TABLE scored_genes AS SELECT * FROM scored_df")
+
+        # Run negative control validation
+        metrics = validate_negative_controls(store)
+
+        # Should FAIL (housekeeping genes rank high)
+        assert metrics["validation_passed"] is False, \
+            "Validation should fail when housekeeping genes rank high"
+
+        # Median percentile should be >= 0.5
+        assert metrics["median_percentile"] >= 0.5, \
+            f"Housekeeping median percentile should be >= 0.5, got {metrics['median_percentile']}"
+
+    finally:
+        store.close()
+
+
+def test_compute_recall_at_k(tmp_path):
+    """Test recall@k computation with synthetic data."""
+    store = create_synthetic_scored_db(tmp_path)
+
+    try:
+        # Run recall@k
+        metrics = compute_recall_at_k(store)
+
+        # Check structure
+        assert "recalls_absolute" in metrics
+        assert "recalls_percentage" in metrics
+        assert "total_known_unique" in metrics
+        assert "total_scored" in metrics
+
+        # Check absolute recalls
+        assert 100 in metrics["recalls_absolute"]
+        assert 500 in metrics["recalls_absolute"]
+
+        # Check percentage recalls
+        assert "5%" in metrics["recalls_percentage"]
+        assert "10%" in metrics["recalls_percentage"]
+
+        # Note: compile_known_genes() returns all 38 known genes (OMIM + SYSCILIA),
+        # but our synthetic DB only has 3 of them (MYO7A, IFT88, BBS1).
+        # So total_known_unique is 38, but only 3 are in dataset.
+        total_known = metrics["total_known_unique"]
+        assert total_known == 38, f"Expected 38 known genes from compile_known_genes(), got {total_known}"
+
+        # All 3 genes in dataset have high scores, so recall should be 3/38 = 0.0789
+        recall_at_100 = metrics["recalls_absolute"].get(100, 0.0)
+        expected_recall = 3.0 / 38.0
+        assert abs(recall_at_100 - expected_recall) < 0.01, \
+            f"Recall@100 should be {expected_recall:.4f} (3/38), got {recall_at_100:.4f}"
+
+    finally:
+        store.close()
+
+
+def test_perturb_weight_renormalizes():
+    """Test weight perturbation maintains sum=1.0."""
+    baseline = ScoringWeights()  # Default weights
+
+    # Perturb gnomad by +0.10
+    perturbed = perturb_weight(baseline, "gnomad", 0.10)
+
+    # Check weights sum to 1.0 (within tolerance)
+    weight_sum = (
+        perturbed.gnomad +
+        perturbed.expression +
+        perturbed.annotation +
+        perturbed.localization +
+        perturbed.animal_model +
+        perturbed.literature
+    )
+
+    assert abs(weight_sum - 1.0) < 1e-6, f"Weights should sum to 1.0, got {weight_sum}"
+
+    # Check gnomad increased relative to baseline
+    assert perturbed.gnomad > baseline.gnomad, \
+        f"Perturbed gnomad ({perturbed.gnomad}) should be > baseline ({baseline.gnomad})"
+
+
+def test_perturb_weight_large_negative():
+    """Test weight perturbation with large negative delta (edge case)."""
+    baseline = ScoringWeights()
+
+    # Perturb by -0.25 (more than most weight values)
+    perturbed = perturb_weight(baseline, "gnomad", -0.25)
+
+    # Check weight is >= 0.0 (clamped)
+    assert perturbed.gnomad >= 0.0, f"Weight should be >= 0.0, got {perturbed.gnomad}"
+
+    # Check weights still sum to 1.0
+    weight_sum = (
+        perturbed.gnomad +
+        perturbed.expression +
+        perturbed.annotation +
+        perturbed.localization +
+        perturbed.animal_model +
+        perturbed.literature
+    )
+
+    assert abs(weight_sum - 1.0) < 1e-6, f"Weights should sum to 1.0, got {weight_sum}"
+
+
+def test_perturb_weight_invalid_layer():
+    """Test perturb_weight raises ValueError for invalid layer."""
+    baseline = ScoringWeights()
+
+    with pytest.raises(ValueError, match="Invalid layer"):
+        perturb_weight(baseline, "nonexistent", 0.05)
+
+
+def test_generate_comprehensive_validation_report_format():
+    """Test comprehensive validation report contains expected sections."""
+    # Create mock metrics
+    positive_metrics = {
+        "validation_passed": True,
+        "total_known_expected": 38,
+        "total_known_in_dataset": 35,
+        "median_percentile": 0.85,
+        "top_quartile_count": 30,
+        "top_quartile_fraction": 0.86,
+        "recall_at_k": {
+            "recalls_absolute": {100: 0.90, 500: 0.95},
+            "recalls_percentage": {"5%": 0.85, "10%": 0.92},
+            "total_known_unique": 35,
+            "total_scored": 20000,
+        },
+        "per_source_breakdown": {
+            "omim_usher": {"median_percentile": 0.88, "count": 10, "top_quartile_count": 9},
+            "syscilia_scgs_v2": {"median_percentile": 0.83, "count": 25, "top_quartile_count": 21},
+        },
+    }
+
+    negative_metrics = {
+        "validation_passed": True,
+        "total_expected": 13,
+        "total_in_dataset": 13,
+        "median_percentile": 0.35,
+        "top_quartile_count": 1,
+        "in_high_tier_count": 0,
+    }
+
+    sensitivity_result = {
+        "baseline_weights": ScoringWeights().model_dump(),
+        "results": [
+            {
+                "layer": "gnomad",
+                "delta": 0.05,
+                "perturbed_weights": {},
+                "spearman_rho": 0.92,
+                "spearman_pval": 1e-10,
+                "overlap_count": 95,
+                "top_n": 100,
+            }
+        ],
+        "top_n": 100,
+        "total_perturbations": 1,
+    }
+
+    sensitivity_summary = {
+        "min_rho": 0.92,
+        "max_rho": 0.92,
+        "mean_rho": 0.92,
+        "stable_count": 1,
+        "unstable_count": 0,
+        "total_perturbations": 1,
+        "overall_stable": True,
+        "most_sensitive_layer": "gnomad",
+        "most_robust_layer": "literature",
+    }
+
+    # Generate report
+    report = generate_comprehensive_validation_report(
+        positive_metrics,
+        negative_metrics,
+        sensitivity_result,
+        sensitivity_summary,
+    )
+
+    # Check report contains expected sections
+    assert "Positive Control Validation" in report
+    assert "Negative Control Validation" in report
+    assert "Sensitivity Analysis" in report
+    assert "Overall Validation Summary" in report
+    assert "Weight Tuning Recommendations" in report
+
+    # Check status appears
+    assert "PASSED" in report
+
+
+def test_recommend_weight_tuning_all_pass():
+    """Test weight tuning recommendations when all validations pass."""
+    positive_metrics = {"validation_passed": True}
+    negative_metrics = {"validation_passed": True}
+    sensitivity_summary = {"overall_stable": True}
+
+    recommendation = recommend_weight_tuning(
+        positive_metrics,
+        negative_metrics,
+        sensitivity_summary,
+    )
+
+    # Should recommend no tuning
+    assert "No tuning recommended" in recommendation or "validated" in recommendation.lower()
+
+
+def test_recommend_weight_tuning_positive_fail():
+    """Test weight tuning recommendations when positive controls fail."""
+    positive_metrics = {"validation_passed": False}
+    negative_metrics = {"validation_passed": True}
+    sensitivity_summary = {"overall_stable": True}
+
+    recommendation = recommend_weight_tuning(
+        positive_metrics,
+        negative_metrics,
+        sensitivity_summary,
+    )
+
+    # Should suggest increasing weights for layers where known genes score high
+    assert "Known Gene Ranking Issue" in recommendation or "Positive Control" in recommendation
+
+
+def test_recommend_weight_tuning_negative_fail():
+    """Test weight tuning recommendations when negative controls fail."""
+    positive_metrics = {"validation_passed": True}
+    negative_metrics = {"validation_passed": False}
+    sensitivity_summary = {"overall_stable": True}
+
+    recommendation = recommend_weight_tuning(
+        positive_metrics,
+        negative_metrics,
+        sensitivity_summary,
+    )
+
+    # Should suggest examining layers boosting housekeeping genes
+    assert "Housekeeping" in recommendation or "Negative Control" in recommendation
+
+
+def test_recommend_weight_tuning_sensitivity_fail():
+    """Test weight tuning recommendations when sensitivity unstable."""
+    positive_metrics = {"validation_passed": True}
+    negative_metrics = {"validation_passed": True}
+    sensitivity_summary = {
+        "overall_stable": False,
+        "unstable_count": 5,
+        "most_sensitive_layer": "gnomad",
+    }
+
+    recommendation = recommend_weight_tuning(
+        positive_metrics,
+        negative_metrics,
+        sensitivity_summary,
+    )
+
+    # Should suggest reducing weight of most sensitive layer
+    assert "Sensitivity" in recommendation or "gnomad" in recommendation