usher-exploring/tests/test_validation.py
gbanyan 5879ae9768 test(06-03): unit tests for all validation modules
- Created test_validation.py with 13 comprehensive tests
- create_synthetic_scored_db helper designs scores so known genes rank high and housekeeping genes rank low
- Tests for negative controls:
  - test_compile_housekeeping_genes_structure
  - test_compile_housekeeping_genes_known_genes_present
  - test_validate_negative_controls_with_synthetic_data
  - test_validate_negative_controls_inverted_logic
- Tests for recall@k:
  - test_compute_recall_at_k
- Tests for sensitivity/perturbation:
  - test_perturb_weight_renormalizes
  - test_perturb_weight_large_negative
  - test_perturb_weight_invalid_layer
- Tests for validation report:
  - test_generate_comprehensive_validation_report_format
  - test_recommend_weight_tuning_all_pass
  - test_recommend_weight_tuning_positive_fail
  - test_recommend_weight_tuning_negative_fail
  - test_recommend_weight_tuning_sensitivity_fail
- All tests pass using synthetic DuckDB with tmp_path isolation
2026-02-12 04:50:17 +08:00


"""Tests for validation modules (negative controls, recall@k, sensitivity, report)."""
import polars as pl
import pytest
from pathlib import Path
from usher_pipeline.persistence.duckdb_store import PipelineStore
from usher_pipeline.config.schema import ScoringWeights
from usher_pipeline.scoring import (
compile_housekeeping_genes,
validate_negative_controls,
compute_recall_at_k,
validate_positive_controls_extended,
)
from usher_pipeline.scoring.sensitivity import (
perturb_weight,
run_sensitivity_analysis,
summarize_sensitivity,
)
from usher_pipeline.scoring.validation_report import (
generate_comprehensive_validation_report,
recommend_weight_tuning,
)


def create_synthetic_scored_db(tmp_path: Path) -> PipelineStore:
    """
    Create DuckDB with synthetic scored_genes for testing.

    Designs scores so:
    - Known cilia genes (MYO7A, IFT88, BBS1) get high scores (0.8-0.95)
    - Housekeeping genes (GAPDH, ACTB, RPL13A) get low scores (0.1-0.3)
    - Other genes get mid-range scores (0.3-0.6)

    Returns:
        PipelineStore with synthetic scored_genes table
    """
    db_path = tmp_path / "test.duckdb"
    store = PipelineStore(db_path)

    # Create gene_universe
    gene_symbols = [
        # Known cilia genes (high scores)
        "MYO7A", "IFT88", "BBS1",
        # Housekeeping genes (low scores)
        "GAPDH", "ACTB", "RPL13A",
        # Filler genes (mid scores)
        "GENE001", "GENE002", "GENE003", "GENE004", "GENE005",
        "GENE006", "GENE007", "GENE008", "GENE009", "GENE010",
        "GENE011", "GENE012", "GENE013", "GENE014",
    ]
    gene_universe = pl.DataFrame({
        "gene_id": [f"ENSG{i:011d}" for i in range(len(gene_symbols))],
        "gene_symbol": gene_symbols,
        "hgnc_id": [f"HGNC:{i+1000}" for i in range(len(gene_symbols))],
        "uniprot_primary": [f"P{i+10000}" for i in range(len(gene_symbols))],
    })
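    # The CREATE TABLE ... AS SELECT * FROM <name> statements below rely on
    # DuckDB's Python replacement scans, which resolve unknown table names
    # against DataFrame variables (including Polars frames) in the calling scope.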
store.conn.execute("CREATE TABLE gene_universe AS SELECT * FROM gene_universe")
# Create scored_genes with designed scores
scored_data = {
"gene_id": gene_universe["gene_id"].to_list(),
"gene_symbol": gene_symbols,
# Design composite_score to ensure known genes high, housekeeping low
"composite_score": [
# Known cilia genes (indices 0-2): high scores
0.90, 0.85, 0.92,
# Housekeeping genes (indices 3-5): low scores
0.15, 0.20, 0.12,
# Filler genes: mid scores
0.45, 0.50, 0.35, 0.55, 0.40,
0.48, 0.52, 0.38, 0.42, 0.58,
0.43, 0.47, 0.53, 0.41,
],
# Layer scores (simplified - just use composite_score with small variations)
"gnomad_score": [
0.88, 0.83, 0.90,
0.18, 0.22, 0.15,
0.44, 0.49, 0.34, 0.54, 0.39,
0.47, 0.51, 0.37, 0.41, 0.57,
0.42, 0.46, 0.52, 0.40,
],
"expression_score": [
0.92, 0.87, 0.94,
0.12, 0.18, 0.10,
0.46, 0.51, 0.36, 0.56, 0.41,
0.49, 0.53, 0.39, 0.43, 0.59,
0.44, 0.48, 0.54, 0.42,
],
"annotation_score": [
0.90, 0.85, 0.92,
0.15, 0.20, 0.12,
0.45, 0.50, 0.35, 0.55, 0.40,
0.48, 0.52, 0.38, 0.42, 0.58,
0.43, 0.47, 0.53, 0.41,
],
"localization_score": [
0.91, 0.86, 0.93,
0.14, 0.19, 0.11,
0.45, 0.50, 0.35, 0.55, 0.40,
0.48, 0.52, 0.38, 0.42, 0.58,
0.43, 0.47, 0.53, 0.41,
],
"animal_model_score": [
0.89, 0.84, 0.91,
0.16, 0.21, 0.13,
0.45, 0.50, 0.35, 0.55, 0.40,
0.48, 0.52, 0.38, 0.42, 0.58,
0.43, 0.47, 0.53, 0.41,
],
"literature_score": [
0.90, 0.85, 0.92,
0.15, 0.20, 0.12,
0.45, 0.50, 0.35, 0.55, 0.40,
0.48, 0.52, 0.38, 0.42, 0.58,
0.43, 0.47, 0.53, 0.41,
],
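        # Uniform evidence count and quality flag, so any downstream quality
        # filtering keeps every synthetic row.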
"evidence_count": [6] * len(gene_symbols),
"quality_flag": ["sufficient_evidence"] * len(gene_symbols),
}
scored_df = pl.DataFrame(scored_data)
store.conn.execute("CREATE TABLE scored_genes AS SELECT * FROM scored_df")
# Create known_genes table
known_genes = pl.DataFrame({
"gene_symbol": ["MYO7A", "IFT88", "BBS1"],
"source": ["omim_usher", "syscilia_scgs_v2", "syscilia_scgs_v2"],
"confidence": ["HIGH", "HIGH", "HIGH"],
})
store.conn.execute("CREATE TABLE known_genes AS SELECT * FROM known_genes")
return store


def test_compile_housekeeping_genes_structure():
    """Test compile_housekeeping_genes returns correct structure."""
    df = compile_housekeeping_genes()
    # Check columns
    assert "gene_symbol" in df.columns
    assert "source" in df.columns
    assert "confidence" in df.columns
    # Check row count
    assert df.height == 13, f"Expected 13 housekeeping genes, got {df.height}"
    # Check all are literature_validated
    assert df["source"].unique().to_list() == ["literature_validated"]
    # Check all are HIGH confidence
    assert df["confidence"].unique().to_list() == ["HIGH"]


def test_compile_housekeeping_genes_known_genes_present():
    """Test known housekeeping genes are present in the compiled set."""
    df = compile_housekeeping_genes()
    gene_symbols = df["gene_symbol"].to_list()
    # Check for well-known housekeeping genes
    assert "GAPDH" in gene_symbols
    assert "ACTB" in gene_symbols
    assert "RPL13A" in gene_symbols
    assert "TBP" in gene_symbols


def test_validate_negative_controls_with_synthetic_data(tmp_path):
    """Test negative control validation with synthetic data where housekeeping genes rank low."""
    store = create_synthetic_scored_db(tmp_path)
    try:
        # Run negative control validation
        metrics = validate_negative_controls(store)
        # Should pass (housekeeping genes rank low in synthetic data)
        assert metrics["validation_passed"] is True, \
            f"Validation should pass with low housekeeping scores. Median: {metrics['median_percentile']}"
        # Median percentile should be < 0.5
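        # (median_percentile is presumably the median percentile rank of the
        # housekeeping genes in the composite ranking, with 0.0 = lowest score.)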
assert metrics["median_percentile"] < 0.5, \
f"Housekeeping median percentile should be < 0.5, got {metrics['median_percentile']}"
# Check found some housekeeping genes
assert metrics["total_in_dataset"] > 0, "Should find at least one housekeeping gene"
finally:
store.close()


def test_validate_negative_controls_inverted_logic(tmp_path):
    """Test negative control validation fails when housekeeping genes rank HIGH."""
    db_path = tmp_path / "test_inverted.duckdb"
    store = PipelineStore(db_path)
    try:
        # Create gene_universe
        gene_symbols = ["GAPDH", "ACTB", "RPL13A", "GENE001", "GENE002"]
        gene_universe = pl.DataFrame({
            "gene_id": [f"ENSG{i:011d}" for i in range(len(gene_symbols))],
            "gene_symbol": gene_symbols,
            "hgnc_id": [f"HGNC:{i+1000}" for i in range(len(gene_symbols))],
            "uniprot_primary": [f"P{i+10000}" for i in range(len(gene_symbols))],
        })
        store.conn.execute("CREATE TABLE gene_universe AS SELECT * FROM gene_universe")
        # Create scored_genes where housekeeping genes rank HIGH (inverted)
        scored_df = pl.DataFrame({
            "gene_id": gene_universe["gene_id"].to_list(),
            "gene_symbol": gene_symbols,
            "composite_score": [0.90, 0.85, 0.88, 0.20, 0.15],  # Housekeeping HIGH
            "gnomad_score": [0.90, 0.85, 0.88, 0.20, 0.15],
            "expression_score": [0.90, 0.85, 0.88, 0.20, 0.15],
            "annotation_score": [0.90, 0.85, 0.88, 0.20, 0.15],
            "localization_score": [0.90, 0.85, 0.88, 0.20, 0.15],
            "animal_model_score": [0.90, 0.85, 0.88, 0.20, 0.15],
            "literature_score": [0.90, 0.85, 0.88, 0.20, 0.15],
            "evidence_count": [6] * len(gene_symbols),
            "quality_flag": ["sufficient_evidence"] * len(gene_symbols),
        })
        store.conn.execute("CREATE TABLE scored_genes AS SELECT * FROM scored_df")
        # Run negative control validation
        metrics = validate_negative_controls(store)
        # Should FAIL (housekeeping genes rank high)
        assert metrics["validation_passed"] is False, \
            "Validation should fail when housekeeping genes rank high"
        # Median percentile should be >= 0.5
        assert metrics["median_percentile"] >= 0.5, \
            f"Housekeeping median percentile should be >= 0.5, got {metrics['median_percentile']}"
    finally:
        store.close()


def test_compute_recall_at_k(tmp_path):
    """Test recall@k computation with synthetic data."""
    store = create_synthetic_scored_db(tmp_path)
    try:
        # Run recall@k
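        # (Recall@k here is presumably |known genes in top k| / |known genes overall|,
        # so known genes absent from the synthetic DB count against recall.)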
        metrics = compute_recall_at_k(store)
        # Check structure
        assert "recalls_absolute" in metrics
        assert "recalls_percentage" in metrics
        assert "total_known_unique" in metrics
        assert "total_scored" in metrics
        # Check absolute recalls
        assert 100 in metrics["recalls_absolute"]
        assert 500 in metrics["recalls_absolute"]
        # Check percentage recalls
        assert "5%" in metrics["recalls_percentage"]
        assert "10%" in metrics["recalls_percentage"]
        # Note: compile_known_genes() returns all 38 known genes (OMIM + SYSCILIA),
        # but our synthetic DB only has 3 of them (MYO7A, IFT88, BBS1).
        # So total_known_unique is 38, but only 3 are in the dataset.
        total_known = metrics["total_known_unique"]
        assert total_known == 38, f"Expected 38 known genes from compile_known_genes(), got {total_known}"
        # All 3 genes in the dataset have high scores, so recall should be 3/38 = 0.0789
        recall_at_100 = metrics["recalls_absolute"].get(100, 0.0)
        expected_recall = 3.0 / 38.0
        assert abs(recall_at_100 - expected_recall) < 0.01, \
            f"Recall@100 should be {expected_recall:.4f} (3/38), got {recall_at_100:.4f}"
    finally:
        store.close()


def test_perturb_weight_renormalizes():
    """Test weight perturbation maintains sum=1.0."""
    baseline = ScoringWeights()  # Default weights
    # Perturb gnomad by +0.10
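    # (perturb_weight presumably adds the delta to the named layer, then rescales
    # all six weights by their new sum so they total 1.0 again.)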
perturbed = perturb_weight(baseline, "gnomad", 0.10)
# Check weights sum to 1.0 (within tolerance)
weight_sum = (
perturbed.gnomad +
perturbed.expression +
perturbed.annotation +
perturbed.localization +
perturbed.animal_model +
perturbed.literature
)
assert abs(weight_sum - 1.0) < 1e-6, f"Weights should sum to 1.0, got {weight_sum}"
# Check gnomad increased relative to baseline
assert perturbed.gnomad > baseline.gnomad, \
f"Perturbed gnomad ({perturbed.gnomad}) should be > baseline ({baseline.gnomad})"


def test_perturb_weight_large_negative():
    """Test weight perturbation with large negative delta (edge case)."""
    baseline = ScoringWeights()
    # Perturb by -0.25 (more than most weight values)
    perturbed = perturb_weight(baseline, "gnomad", -0.25)
    # Check weight is >= 0.0 (clamped)
    assert perturbed.gnomad >= 0.0, f"Weight should be >= 0.0, got {perturbed.gnomad}"
    # Check weights still sum to 1.0
    weight_sum = (
        perturbed.gnomad +
        perturbed.expression +
        perturbed.annotation +
        perturbed.localization +
        perturbed.animal_model +
        perturbed.literature
    )
    assert abs(weight_sum - 1.0) < 1e-6, f"Weights should sum to 1.0, got {weight_sum}"


def test_perturb_weight_invalid_layer():
    """Test perturb_weight raises ValueError for invalid layer."""
    baseline = ScoringWeights()
    with pytest.raises(ValueError, match="Invalid layer"):
        perturb_weight(baseline, "nonexistent", 0.05)


def test_generate_comprehensive_validation_report_format():
    """Test comprehensive validation report contains expected sections."""
    # Create mock metrics
    positive_metrics = {
        "validation_passed": True,
        "total_known_expected": 38,
        "total_known_in_dataset": 35,
        "median_percentile": 0.85,
        "top_quartile_count": 30,
        "top_quartile_fraction": 0.86,
        "recall_at_k": {
            "recalls_absolute": {100: 0.90, 500: 0.95},
            "recalls_percentage": {"5%": 0.85, "10%": 0.92},
            "total_known_unique": 35,
            "total_scored": 20000,
        },
        "per_source_breakdown": {
            "omim_usher": {"median_percentile": 0.88, "count": 10, "top_quartile_count": 9},
            "syscilia_scgs_v2": {"median_percentile": 0.83, "count": 25, "top_quartile_count": 21},
        },
    }
    negative_metrics = {
        "validation_passed": True,
        "total_expected": 13,
        "total_in_dataset": 13,
        "median_percentile": 0.35,
        "top_quartile_count": 1,
        "in_high_tier_count": 0,
    }
    sensitivity_result = {
        "baseline_weights": ScoringWeights().model_dump(),
        "results": [
            {
                "layer": "gnomad",
                "delta": 0.05,
                "perturbed_weights": {},
                "spearman_rho": 0.92,
                "spearman_pval": 1e-10,
                "overlap_count": 95,
                "top_n": 100,
            }
        ],
        "top_n": 100,
        "total_perturbations": 1,
    }
    sensitivity_summary = {
        "min_rho": 0.92,
        "max_rho": 0.92,
        "mean_rho": 0.92,
        "stable_count": 1,
        "unstable_count": 0,
        "total_perturbations": 1,
        "overall_stable": True,
        "most_sensitive_layer": "gnomad",
        "most_robust_layer": "literature",
    }
    # Generate report
    report = generate_comprehensive_validation_report(
        positive_metrics,
        negative_metrics,
        sensitivity_result,
        sensitivity_summary,
    )
    # Check report contains expected sections
    assert "Positive Control Validation" in report
    assert "Negative Control Validation" in report
    assert "Sensitivity Analysis" in report
    assert "Overall Validation Summary" in report
    assert "Weight Tuning Recommendations" in report
    # Check status appears
    assert "PASSED" in report


def test_recommend_weight_tuning_all_pass():
    """Test weight tuning recommendations when all validations pass."""
    positive_metrics = {"validation_passed": True}
    negative_metrics = {"validation_passed": True}
    sensitivity_summary = {"overall_stable": True}
    recommendation = recommend_weight_tuning(
        positive_metrics,
        negative_metrics,
        sensitivity_summary,
    )
    # Should recommend no tuning
    assert "No tuning recommended" in recommendation or "validated" in recommendation.lower()


def test_recommend_weight_tuning_positive_fail():
    """Test weight tuning recommendations when positive controls fail."""
    positive_metrics = {"validation_passed": False}
    negative_metrics = {"validation_passed": True}
    sensitivity_summary = {"overall_stable": True}
    recommendation = recommend_weight_tuning(
        positive_metrics,
        negative_metrics,
        sensitivity_summary,
    )
    # Should suggest increasing weights for layers where known genes score high
    assert "Known Gene Ranking Issue" in recommendation or "Positive Control" in recommendation


def test_recommend_weight_tuning_negative_fail():
    """Test weight tuning recommendations when negative controls fail."""
    positive_metrics = {"validation_passed": True}
    negative_metrics = {"validation_passed": False}
    sensitivity_summary = {"overall_stable": True}
    recommendation = recommend_weight_tuning(
        positive_metrics,
        negative_metrics,
        sensitivity_summary,
    )
    # Should suggest examining layers boosting housekeeping genes
    assert "Housekeeping" in recommendation or "Negative Control" in recommendation


def test_recommend_weight_tuning_sensitivity_fail():
    """Test weight tuning recommendations when sensitivity is unstable."""
    positive_metrics = {"validation_passed": True}
    negative_metrics = {"validation_passed": True}
    sensitivity_summary = {
        "overall_stable": False,
        "unstable_count": 5,
        "most_sensitive_layer": "gnomad",
    }
    recommendation = recommend_weight_tuning(
        positive_metrics,
        negative_metrics,
        sensitivity_summary,
    )
    # Should suggest reducing the weight of the most sensitive layer
    assert "Sensitivity" in recommendation or "gnomad" in recommendation