usher-exploring/tests/test_validation.py
gbanyan 5879ae9768 test(06-03): unit tests for all validation modules
- Created test_validation.py with 13 comprehensive tests
- create_synthetic_scored_db helper designs scores so known genes rank high and housekeeping genes rank low
- Tests for negative controls:
  - test_compile_housekeeping_genes_structure
  - test_compile_housekeeping_genes_known_genes_present
  - test_validate_negative_controls_with_synthetic_data
  - test_validate_negative_controls_inverted_logic
- Tests for recall@k:
  - test_compute_recall_at_k
- Tests for sensitivity/perturbation:
  - test_perturb_weight_renormalizes
  - test_perturb_weight_large_negative
  - test_perturb_weight_invalid_layer
- Tests for validation report:
  - test_generate_comprehensive_validation_report_format
  - test_recommend_weight_tuning_all_pass
  - test_recommend_weight_tuning_positive_fail
  - test_recommend_weight_tuning_negative_fail
  - test_recommend_weight_tuning_sensitivity_fail
- All tests pass using synthetic DuckDB with tmp_path isolation
2026-02-12 04:50:17 +08:00


"""Tests for validation modules (negative controls, recall@k, sensitivity, report)."""
import polars as pl
import pytest
from pathlib import Path
from usher_pipeline.persistence.duckdb_store import PipelineStore
from usher_pipeline.config.schema import ScoringWeights
from usher_pipeline.scoring import (
compile_housekeeping_genes,
validate_negative_controls,
compute_recall_at_k,
validate_positive_controls_extended,
)
from usher_pipeline.scoring.sensitivity import (
perturb_weight,
run_sensitivity_analysis,
summarize_sensitivity,
)
from usher_pipeline.scoring.validation_report import (
generate_comprehensive_validation_report,
recommend_weight_tuning,
)


def create_synthetic_scored_db(tmp_path: Path) -> PipelineStore:
    """
    Create DuckDB with synthetic scored_genes for testing.

    Designs scores so:
    - Known cilia genes (MYO7A, IFT88, BBS1) get high scores (0.8-0.95)
    - Housekeeping genes (GAPDH, ACTB, RPL13A) get low scores (0.1-0.3)
    - Other genes get mid-range scores (0.3-0.6)

    Returns:
        PipelineStore with synthetic scored_genes table
    """
    db_path = tmp_path / "test.duckdb"
    store = PipelineStore(db_path)

    # Create gene_universe
    gene_symbols = [
        # Known cilia genes (high scores)
        "MYO7A", "IFT88", "BBS1",
        # Housekeeping genes (low scores)
        "GAPDH", "ACTB", "RPL13A",
        # Filler genes (mid scores)
        "GENE001", "GENE002", "GENE003", "GENE004", "GENE005",
        "GENE006", "GENE007", "GENE008", "GENE009", "GENE010",
        "GENE011", "GENE012", "GENE013", "GENE014",
    ]
    gene_universe = pl.DataFrame({
        "gene_id": [f"ENSG{i:011d}" for i in range(len(gene_symbols))],
        "gene_symbol": gene_symbols,
        "hgnc_id": [f"HGNC:{i+1000}" for i in range(len(gene_symbols))],
        "uniprot_primary": [f"P{i+10000}" for i in range(len(gene_symbols))],
    })
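    # The CREATE TABLE ... AS SELECT * FROM <name> statements below rely on
    # DuckDB's Python replacement scans, which resolve unknown table names
    # against DataFrame variables (including Polars frames) in the calling scope.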
store.conn.execute("CREATE TABLE gene_universe AS SELECT * FROM gene_universe")
# Create scored_genes with designed scores
scored_data = {
"gene_id": gene_universe["gene_id"].to_list(),
"gene_symbol": gene_symbols,
# Design composite_score to ensure known genes high, housekeeping low
"composite_score": [
# Known cilia genes (indices 0-2): high scores
0.90, 0.85, 0.92,
# Housekeeping genes (indices 3-5): low scores
0.15, 0.20, 0.12,
# Filler genes: mid scores
0.45, 0.50, 0.35, 0.55, 0.40,
0.48, 0.52, 0.38, 0.42, 0.58,
0.43, 0.47, 0.53, 0.41,
],
# Layer scores (simplified - just use composite_score with small variations)
"gnomad_score": [
0.88, 0.83, 0.90,
0.18, 0.22, 0.15,
0.44, 0.49, 0.34, 0.54, 0.39,
0.47, 0.51, 0.37, 0.41, 0.57,
0.42, 0.46, 0.52, 0.40,
],
"expression_score": [
0.92, 0.87, 0.94,
0.12, 0.18, 0.10,
0.46, 0.51, 0.36, 0.56, 0.41,
0.49, 0.53, 0.39, 0.43, 0.59,
0.44, 0.48, 0.54, 0.42,
],
"annotation_score": [
0.90, 0.85, 0.92,
0.15, 0.20, 0.12,
0.45, 0.50, 0.35, 0.55, 0.40,
0.48, 0.52, 0.38, 0.42, 0.58,
0.43, 0.47, 0.53, 0.41,
],
"localization_score": [
0.91, 0.86, 0.93,
0.14, 0.19, 0.11,
0.45, 0.50, 0.35, 0.55, 0.40,
0.48, 0.52, 0.38, 0.42, 0.58,
0.43, 0.47, 0.53, 0.41,
],
"animal_model_score": [
0.89, 0.84, 0.91,
0.16, 0.21, 0.13,
0.45, 0.50, 0.35, 0.55, 0.40,
0.48, 0.52, 0.38, 0.42, 0.58,
0.43, 0.47, 0.53, 0.41,
],
"literature_score": [
0.90, 0.85, 0.92,
0.15, 0.20, 0.12,
0.45, 0.50, 0.35, 0.55, 0.40,
0.48, 0.52, 0.38, 0.42, 0.58,
0.43, 0.47, 0.53, 0.41,
],
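        # Uniform evidence count and quality flag, so any downstream quality
        # filtering keeps every synthetic row.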
"evidence_count": [6] * len(gene_symbols),
"quality_flag": ["sufficient_evidence"] * len(gene_symbols),
}
scored_df = pl.DataFrame(scored_data)
store.conn.execute("CREATE TABLE scored_genes AS SELECT * FROM scored_df")
# Create known_genes table
known_genes = pl.DataFrame({
"gene_symbol": ["MYO7A", "IFT88", "BBS1"],
"source": ["omim_usher", "syscilia_scgs_v2", "syscilia_scgs_v2"],
"confidence": ["HIGH", "HIGH", "HIGH"],
})
store.conn.execute("CREATE TABLE known_genes AS SELECT * FROM known_genes")
return store


def test_compile_housekeeping_genes_structure():
    """Test compile_housekeeping_genes returns correct structure."""
    df = compile_housekeeping_genes()
    # Check columns
    assert "gene_symbol" in df.columns
    assert "source" in df.columns
    assert "confidence" in df.columns
    # Check row count
    assert df.height == 13, f"Expected 13 housekeeping genes, got {df.height}"
    # Check all are literature_validated
    assert df["source"].unique().to_list() == ["literature_validated"]
    # Check all are HIGH confidence
    assert df["confidence"].unique().to_list() == ["HIGH"]


def test_compile_housekeeping_genes_known_genes_present():
    """Test known housekeeping genes are present in the compiled set."""
    df = compile_housekeeping_genes()
    gene_symbols = df["gene_symbol"].to_list()
    # Check for well-known housekeeping genes
    assert "GAPDH" in gene_symbols
    assert "ACTB" in gene_symbols
    assert "RPL13A" in gene_symbols
    assert "TBP" in gene_symbols


def test_validate_negative_controls_with_synthetic_data(tmp_path):
    """Test negative control validation with synthetic data where housekeeping genes rank low."""
    store = create_synthetic_scored_db(tmp_path)
    try:
        # Run negative control validation
        metrics = validate_negative_controls(store)
        # Should pass (housekeeping genes rank low in synthetic data)
        assert metrics["validation_passed"] is True, \
            f"Validation should pass with low housekeeping scores. Median: {metrics['median_percentile']}"
        # Median percentile should be < 0.5
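        # (median_percentile is presumably the median percentile rank of the
        # housekeeping genes in the composite ranking, with 0.0 = lowest score.)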
assert metrics["median_percentile"] < 0.5, \
f"Housekeeping median percentile should be < 0.5, got {metrics['median_percentile']}"
# Check found some housekeeping genes
assert metrics["total_in_dataset"] > 0, "Should find at least one housekeeping gene"
finally:
store.close()


def test_validate_negative_controls_inverted_logic(tmp_path):
    """Test negative control validation fails when housekeeping genes rank HIGH."""
    db_path = tmp_path / "test_inverted.duckdb"
    store = PipelineStore(db_path)
    try:
        # Create gene_universe
        gene_symbols = ["GAPDH", "ACTB", "RPL13A", "GENE001", "GENE002"]
        gene_universe = pl.DataFrame({
            "gene_id": [f"ENSG{i:011d}" for i in range(len(gene_symbols))],
            "gene_symbol": gene_symbols,
            "hgnc_id": [f"HGNC:{i+1000}" for i in range(len(gene_symbols))],
            "uniprot_primary": [f"P{i+10000}" for i in range(len(gene_symbols))],
        })
        store.conn.execute("CREATE TABLE gene_universe AS SELECT * FROM gene_universe")
        # Create scored_genes where housekeeping genes rank HIGH (inverted)
        scored_df = pl.DataFrame({
            "gene_id": gene_universe["gene_id"].to_list(),
            "gene_symbol": gene_symbols,
            "composite_score": [0.90, 0.85, 0.88, 0.20, 0.15],  # Housekeeping HIGH
            "gnomad_score": [0.90, 0.85, 0.88, 0.20, 0.15],
            "expression_score": [0.90, 0.85, 0.88, 0.20, 0.15],
            "annotation_score": [0.90, 0.85, 0.88, 0.20, 0.15],
            "localization_score": [0.90, 0.85, 0.88, 0.20, 0.15],
            "animal_model_score": [0.90, 0.85, 0.88, 0.20, 0.15],
            "literature_score": [0.90, 0.85, 0.88, 0.20, 0.15],
            "evidence_count": [6] * len(gene_symbols),
            "quality_flag": ["sufficient_evidence"] * len(gene_symbols),
        })
        store.conn.execute("CREATE TABLE scored_genes AS SELECT * FROM scored_df")
        # Run negative control validation
        metrics = validate_negative_controls(store)
        # Should FAIL (housekeeping genes rank high)
        assert metrics["validation_passed"] is False, \
            "Validation should fail when housekeeping genes rank high"
        # Median percentile should be >= 0.5
        assert metrics["median_percentile"] >= 0.5, \
            f"Housekeeping median percentile should be >= 0.5, got {metrics['median_percentile']}"
    finally:
        store.close()


def test_compute_recall_at_k(tmp_path):
    """Test recall@k computation with synthetic data."""
    store = create_synthetic_scored_db(tmp_path)
    try:
        # Run recall@k
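        # (Recall@k here is presumably |known genes in top k| / |known genes overall|,
        # so known genes absent from the synthetic DB count against recall.)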
        metrics = compute_recall_at_k(store)
        # Check structure
        assert "recalls_absolute" in metrics
        assert "recalls_percentage" in metrics
        assert "total_known_unique" in metrics
        assert "total_scored" in metrics
        # Check absolute recalls
        assert 100 in metrics["recalls_absolute"]
        assert 500 in metrics["recalls_absolute"]
        # Check percentage recalls
        assert "5%" in metrics["recalls_percentage"]
        assert "10%" in metrics["recalls_percentage"]
        # Note: compile_known_genes() returns all 38 known genes (OMIM + SYSCILIA),
        # but our synthetic DB only has 3 of them (MYO7A, IFT88, BBS1).
        # So total_known_unique is 38, but only 3 are in the dataset.
        total_known = metrics["total_known_unique"]
        assert total_known == 38, f"Expected 38 known genes from compile_known_genes(), got {total_known}"
        # All 3 genes in the dataset have high scores, so recall should be 3/38 = 0.0789
        recall_at_100 = metrics["recalls_absolute"].get(100, 0.0)
        expected_recall = 3.0 / 38.0
        assert abs(recall_at_100 - expected_recall) < 0.01, \
            f"Recall@100 should be {expected_recall:.4f} (3/38), got {recall_at_100:.4f}"
    finally:
        store.close()


def test_perturb_weight_renormalizes():
    """Test weight perturbation maintains sum=1.0."""
    baseline = ScoringWeights()  # Default weights
    # Perturb gnomad by +0.10
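    # (perturb_weight presumably adds the delta to the named layer, then rescales
    # all six weights by their new sum so they total 1.0 again.)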
perturbed = perturb_weight(baseline, "gnomad", 0.10)
# Check weights sum to 1.0 (within tolerance)
weight_sum = (
perturbed.gnomad +
perturbed.expression +
perturbed.annotation +
perturbed.localization +
perturbed.animal_model +
perturbed.literature
)
assert abs(weight_sum - 1.0) < 1e-6, f"Weights should sum to 1.0, got {weight_sum}"
# Check gnomad increased relative to baseline
assert perturbed.gnomad > baseline.gnomad, \
f"Perturbed gnomad ({perturbed.gnomad}) should be > baseline ({baseline.gnomad})"


def test_perturb_weight_large_negative():
    """Test weight perturbation with large negative delta (edge case)."""
    baseline = ScoringWeights()
    # Perturb by -0.25 (more than most weight values)
    perturbed = perturb_weight(baseline, "gnomad", -0.25)
    # Check weight is >= 0.0 (clamped)
    assert perturbed.gnomad >= 0.0, f"Weight should be >= 0.0, got {perturbed.gnomad}"
    # Check weights still sum to 1.0
    weight_sum = (
        perturbed.gnomad +
        perturbed.expression +
        perturbed.annotation +
        perturbed.localization +
        perturbed.animal_model +
        perturbed.literature
    )
    assert abs(weight_sum - 1.0) < 1e-6, f"Weights should sum to 1.0, got {weight_sum}"


def test_perturb_weight_invalid_layer():
    """Test perturb_weight raises ValueError for invalid layer."""
    baseline = ScoringWeights()
    with pytest.raises(ValueError, match="Invalid layer"):
        perturb_weight(baseline, "nonexistent", 0.05)


def test_generate_comprehensive_validation_report_format():
    """Test comprehensive validation report contains expected sections."""
    # Create mock metrics
    positive_metrics = {
        "validation_passed": True,
        "total_known_expected": 38,
        "total_known_in_dataset": 35,
        "median_percentile": 0.85,
        "top_quartile_count": 30,
        "top_quartile_fraction": 0.86,
        "recall_at_k": {
            "recalls_absolute": {100: 0.90, 500: 0.95},
            "recalls_percentage": {"5%": 0.85, "10%": 0.92},
            "total_known_unique": 35,
            "total_scored": 20000,
        },
        "per_source_breakdown": {
            "omim_usher": {"median_percentile": 0.88, "count": 10, "top_quartile_count": 9},
            "syscilia_scgs_v2": {"median_percentile": 0.83, "count": 25, "top_quartile_count": 21},
        },
    }
    negative_metrics = {
        "validation_passed": True,
        "total_expected": 13,
        "total_in_dataset": 13,
        "median_percentile": 0.35,
        "top_quartile_count": 1,
        "in_high_tier_count": 0,
    }
    sensitivity_result = {
        "baseline_weights": ScoringWeights().model_dump(),
        "results": [
            {
                "layer": "gnomad",
                "delta": 0.05,
                "perturbed_weights": {},
                "spearman_rho": 0.92,
                "spearman_pval": 1e-10,
                "overlap_count": 95,
                "top_n": 100,
            }
        ],
        "top_n": 100,
        "total_perturbations": 1,
    }
    sensitivity_summary = {
        "min_rho": 0.92,
        "max_rho": 0.92,
        "mean_rho": 0.92,
        "stable_count": 1,
        "unstable_count": 0,
        "total_perturbations": 1,
        "overall_stable": True,
        "most_sensitive_layer": "gnomad",
        "most_robust_layer": "literature",
    }
    # Generate report
    report = generate_comprehensive_validation_report(
        positive_metrics,
        negative_metrics,
        sensitivity_result,
        sensitivity_summary,
    )
    # Check report contains expected sections
    assert "Positive Control Validation" in report
    assert "Negative Control Validation" in report
    assert "Sensitivity Analysis" in report
    assert "Overall Validation Summary" in report
    assert "Weight Tuning Recommendations" in report
    # Check status appears
    assert "PASSED" in report


def test_recommend_weight_tuning_all_pass():
    """Test weight tuning recommendations when all validations pass."""
    positive_metrics = {"validation_passed": True}
    negative_metrics = {"validation_passed": True}
    sensitivity_summary = {"overall_stable": True}
    recommendation = recommend_weight_tuning(
        positive_metrics,
        negative_metrics,
        sensitivity_summary,
    )
    # Should recommend no tuning
    assert "No tuning recommended" in recommendation or "validated" in recommendation.lower()


def test_recommend_weight_tuning_positive_fail():
    """Test weight tuning recommendations when positive controls fail."""
    positive_metrics = {"validation_passed": False}
    negative_metrics = {"validation_passed": True}
    sensitivity_summary = {"overall_stable": True}
    recommendation = recommend_weight_tuning(
        positive_metrics,
        negative_metrics,
        sensitivity_summary,
    )
    # Should suggest increasing weights for layers where known genes score high
    assert "Known Gene Ranking Issue" in recommendation or "Positive Control" in recommendation


def test_recommend_weight_tuning_negative_fail():
    """Test weight tuning recommendations when negative controls fail."""
    positive_metrics = {"validation_passed": True}
    negative_metrics = {"validation_passed": False}
    sensitivity_summary = {"overall_stable": True}
    recommendation = recommend_weight_tuning(
        positive_metrics,
        negative_metrics,
        sensitivity_summary,
    )
    # Should suggest examining layers boosting housekeeping genes
    assert "Housekeeping" in recommendation or "Negative Control" in recommendation


def test_recommend_weight_tuning_sensitivity_fail():
    """Test weight tuning recommendations when sensitivity is unstable."""
    positive_metrics = {"validation_passed": True}
    negative_metrics = {"validation_passed": True}
    sensitivity_summary = {
        "overall_stable": False,
        "unstable_count": 5,
        "most_sensitive_layer": "gnomad",
    }
    recommendation = recommend_weight_tuning(
        positive_metrics,
        negative_metrics,
        sensitivity_summary,
    )
    # Should suggest reducing the weight of the most sensitive layer
    assert "Sensitivity" in recommendation or "gnomad" in recommendation