- Created `test_validation.py` with 13 comprehensive tests
- `create_synthetic_scored_db` helper: designs scores so known cilia genes rank high and housekeeping genes rank low (rank arithmetic sketched below)
- Tests for negative controls:
  - test_compile_housekeeping_genes_structure
  - test_compile_housekeeping_genes_known_genes_present
  - test_validate_negative_controls_with_synthetic_data
  - test_validate_negative_controls_inverted_logic
- Tests for recall@k:
  - test_compute_recall_at_k
- Tests for sensitivity/perturbation:
  - test_perturb_weight_renormalizes
  - test_perturb_weight_large_negative
  - test_perturb_weight_invalid_layer
- Tests for the validation report:
  - test_generate_comprehensive_validation_report_format
  - test_recommend_weight_tuning_all_pass
  - test_recommend_weight_tuning_positive_fail
  - test_recommend_weight_tuning_negative_fail
  - test_recommend_weight_tuning_sensitivity_fail
- All 13 tests pass against a synthetic DuckDB, isolated per test via pytest's `tmp_path` fixture
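The score design can be sanity-checked by hand. A minimal standalone sketch of the rank arithmetic follows; it assumes the validators compute a gene's percentile as rank-from-bottom divided by total genes (an assumption about `validate_negative_controls`' internals, not something the tests pin down):

```python
# Standalone sanity check of the synthetic score design.
# ASSUMPTION: percentile = rank_from_bottom / n_genes (validator convention not confirmed).
composite = [
    0.90, 0.85, 0.92,              # known cilia genes (MYO7A, IFT88, BBS1)
    0.15, 0.20, 0.12,              # housekeeping genes (GAPDH, ACTB, RPL13A)
    0.45, 0.50, 0.35, 0.55, 0.40,  # filler genes
    0.48, 0.52, 0.38, 0.42, 0.58,
    0.43, 0.47, 0.53, 0.41,
]
ranked = sorted(composite)
percentile = {s: (ranked.index(s) + 1) / len(ranked) for s in composite}

housekeeping = sorted(percentile[s] for s in (0.12, 0.15, 0.20))
assert housekeeping[1] < 0.5  # median = 0.10, far below the 0.5 pass threshold

known = sorted(percentile[s] for s in (0.85, 0.90, 0.92))
assert known[0] > 0.75        # all three known genes land in the top quartile
```

This margin is why both the pass case and the inverted fail case can assert against a fixed 0.5 median-percentile threshold without flakiness.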
test_validation.py · 479 lines · 16 KiB · Python
"""Tests for validation modules (negative controls, recall@k, sensitivity, report)."""
|
|
|
|
import polars as pl
|
|
import pytest
|
|
from pathlib import Path
|
|
|
|
from usher_pipeline.persistence.duckdb_store import PipelineStore
|
|
from usher_pipeline.config.schema import ScoringWeights
|
|
from usher_pipeline.scoring import (
|
|
compile_housekeeping_genes,
|
|
validate_negative_controls,
|
|
compute_recall_at_k,
|
|
validate_positive_controls_extended,
|
|
)
|
|
from usher_pipeline.scoring.sensitivity import (
|
|
perturb_weight,
|
|
run_sensitivity_analysis,
|
|
summarize_sensitivity,
|
|
)
|
|
from usher_pipeline.scoring.validation_report import (
|
|
generate_comprehensive_validation_report,
|
|
recommend_weight_tuning,
|
|
)
|
|
|
|
|
|
def create_synthetic_scored_db(tmp_path: Path) -> PipelineStore:
    """
    Create DuckDB with synthetic scored_genes for testing.

    Designs scores so:
    - Known cilia genes (MYO7A, IFT88, BBS1) get high scores (0.8-0.95)
    - Housekeeping genes (GAPDH, ACTB, RPL13A) get low scores (0.1-0.3)
    - Other genes get mid-range scores (0.3-0.6)

    Returns:
        PipelineStore with synthetic scored_genes table
    """
    db_path = tmp_path / "test.duckdb"
    store = PipelineStore(db_path)

    # Create gene_universe
    gene_symbols = [
        # Known cilia genes (high scores)
        "MYO7A", "IFT88", "BBS1",
        # Housekeeping genes (low scores)
        "GAPDH", "ACTB", "RPL13A",
        # Filler genes (mid scores)
        "GENE001", "GENE002", "GENE003", "GENE004", "GENE005",
        "GENE006", "GENE007", "GENE008", "GENE009", "GENE010",
        "GENE011", "GENE012", "GENE013", "GENE014",
    ]

    gene_universe = pl.DataFrame({
        "gene_id": [f"ENSG{i:011d}" for i in range(len(gene_symbols))],
        "gene_symbol": gene_symbols,
        "hgnc_id": [f"HGNC:{i+1000}" for i in range(len(gene_symbols))],
        "uniprot_primary": [f"P{i+10000}" for i in range(len(gene_symbols))],
    })

    store.conn.execute("CREATE TABLE gene_universe AS SELECT * FROM gene_universe")

    # Create scored_genes with designed scores
    scored_data = {
        "gene_id": gene_universe["gene_id"].to_list(),
        "gene_symbol": gene_symbols,
        # Design composite_score to ensure known genes high, housekeeping low
        "composite_score": [
            # Known cilia genes (indices 0-2): high scores
            0.90, 0.85, 0.92,
            # Housekeeping genes (indices 3-5): low scores
            0.15, 0.20, 0.12,
            # Filler genes: mid scores
            0.45, 0.50, 0.35, 0.55, 0.40,
            0.48, 0.52, 0.38, 0.42, 0.58,
            0.43, 0.47, 0.53, 0.41,
        ],
        # Layer scores (simplified - just use composite_score with small variations)
        "gnomad_score": [
            0.88, 0.83, 0.90,
            0.18, 0.22, 0.15,
            0.44, 0.49, 0.34, 0.54, 0.39,
            0.47, 0.51, 0.37, 0.41, 0.57,
            0.42, 0.46, 0.52, 0.40,
        ],
        "expression_score": [
            0.92, 0.87, 0.94,
            0.12, 0.18, 0.10,
            0.46, 0.51, 0.36, 0.56, 0.41,
            0.49, 0.53, 0.39, 0.43, 0.59,
            0.44, 0.48, 0.54, 0.42,
        ],
        "annotation_score": [
            0.90, 0.85, 0.92,
            0.15, 0.20, 0.12,
            0.45, 0.50, 0.35, 0.55, 0.40,
            0.48, 0.52, 0.38, 0.42, 0.58,
            0.43, 0.47, 0.53, 0.41,
        ],
        "localization_score": [
            0.91, 0.86, 0.93,
            0.14, 0.19, 0.11,
            0.45, 0.50, 0.35, 0.55, 0.40,
            0.48, 0.52, 0.38, 0.42, 0.58,
            0.43, 0.47, 0.53, 0.41,
        ],
        "animal_model_score": [
            0.89, 0.84, 0.91,
            0.16, 0.21, 0.13,
            0.45, 0.50, 0.35, 0.55, 0.40,
            0.48, 0.52, 0.38, 0.42, 0.58,
            0.43, 0.47, 0.53, 0.41,
        ],
        "literature_score": [
            0.90, 0.85, 0.92,
            0.15, 0.20, 0.12,
            0.45, 0.50, 0.35, 0.55, 0.40,
            0.48, 0.52, 0.38, 0.42, 0.58,
            0.43, 0.47, 0.53, 0.41,
        ],
        "evidence_count": [6] * len(gene_symbols),
        "quality_flag": ["sufficient_evidence"] * len(gene_symbols),
    }

    scored_df = pl.DataFrame(scored_data)
    store.conn.execute("CREATE TABLE scored_genes AS SELECT * FROM scored_df")

    # Create known_genes table
    known_genes = pl.DataFrame({
        "gene_symbol": ["MYO7A", "IFT88", "BBS1"],
        "source": ["omim_usher", "syscilia_scgs_v2", "syscilia_scgs_v2"],
        "confidence": ["HIGH", "HIGH", "HIGH"],
    })

    store.conn.execute("CREATE TABLE known_genes AS SELECT * FROM known_genes")

    return store

def test_compile_housekeeping_genes_structure():
    """Test compile_housekeeping_genes returns correct structure."""
    df = compile_housekeeping_genes()

    # Check columns
    assert "gene_symbol" in df.columns
    assert "source" in df.columns
    assert "confidence" in df.columns

    # Check row count
    assert df.height == 13, f"Expected 13 housekeeping genes, got {df.height}"

    # Check all are literature_validated
    assert df["source"].unique().to_list() == ["literature_validated"]

    # Check all are HIGH confidence
    assert df["confidence"].unique().to_list() == ["HIGH"]

def test_compile_housekeeping_genes_known_genes_present():
    """Test known housekeeping genes are present in compiled set."""
    df = compile_housekeeping_genes()

    gene_symbols = df["gene_symbol"].to_list()

    # Check for well-known housekeeping genes
    assert "GAPDH" in gene_symbols
    assert "ACTB" in gene_symbols
    assert "RPL13A" in gene_symbols
    assert "TBP" in gene_symbols

def test_validate_negative_controls_with_synthetic_data(tmp_path):
    """Test negative control validation with synthetic data where housekeeping genes rank low."""
    store = create_synthetic_scored_db(tmp_path)

    try:
        # Run negative control validation
        metrics = validate_negative_controls(store)

        # Should pass (housekeeping genes rank low in synthetic data)
        assert metrics["validation_passed"] is True, \
            f"Validation should pass with low housekeeping scores. Median: {metrics['median_percentile']}"

        # Median percentile should be < 0.5
        assert metrics["median_percentile"] < 0.5, \
            f"Housekeeping median percentile should be < 0.5, got {metrics['median_percentile']}"

        # Check found some housekeeping genes
        assert metrics["total_in_dataset"] > 0, "Should find at least one housekeeping gene"

    finally:
        store.close()

def test_validate_negative_controls_inverted_logic(tmp_path):
    """Test negative control validation fails when housekeeping genes rank HIGH."""
    db_path = tmp_path / "test_inverted.duckdb"
    store = PipelineStore(db_path)

    try:
        # Create gene_universe
        gene_symbols = ["GAPDH", "ACTB", "RPL13A", "GENE001", "GENE002"]
        gene_universe = pl.DataFrame({
            "gene_id": [f"ENSG{i:011d}" for i in range(len(gene_symbols))],
            "gene_symbol": gene_symbols,
            "hgnc_id": [f"HGNC:{i+1000}" for i in range(len(gene_symbols))],
            "uniprot_primary": [f"P{i+10000}" for i in range(len(gene_symbols))],
        })
        store.conn.execute("CREATE TABLE gene_universe AS SELECT * FROM gene_universe")

        # Create scored_genes where housekeeping genes rank HIGH (inverted)
        scored_df = pl.DataFrame({
            "gene_id": gene_universe["gene_id"].to_list(),
            "gene_symbol": gene_symbols,
            "composite_score": [0.90, 0.85, 0.88, 0.20, 0.15],  # Housekeeping HIGH
            "gnomad_score": [0.90, 0.85, 0.88, 0.20, 0.15],
            "expression_score": [0.90, 0.85, 0.88, 0.20, 0.15],
            "annotation_score": [0.90, 0.85, 0.88, 0.20, 0.15],
            "localization_score": [0.90, 0.85, 0.88, 0.20, 0.15],
            "animal_model_score": [0.90, 0.85, 0.88, 0.20, 0.15],
            "literature_score": [0.90, 0.85, 0.88, 0.20, 0.15],
            "evidence_count": [6] * len(gene_symbols),
            "quality_flag": ["sufficient_evidence"] * len(gene_symbols),
        })
        store.conn.execute("CREATE TABLE scored_genes AS SELECT * FROM scored_df")

        # Run negative control validation
        metrics = validate_negative_controls(store)

        # Should FAIL (housekeeping genes rank high)
        assert metrics["validation_passed"] is False, \
            "Validation should fail when housekeeping genes rank high"

        # Median percentile should be >= 0.5
        assert metrics["median_percentile"] >= 0.5, \
            f"Housekeeping median percentile should be >= 0.5, got {metrics['median_percentile']}"

    finally:
        store.close()

def test_compute_recall_at_k(tmp_path):
    """Test recall@k computation with synthetic data."""
    store = create_synthetic_scored_db(tmp_path)

    try:
        # Run recall@k
        metrics = compute_recall_at_k(store)

        # Check structure
        assert "recalls_absolute" in metrics
        assert "recalls_percentage" in metrics
        assert "total_known_unique" in metrics
        assert "total_scored" in metrics

        # Check absolute recalls
        assert 100 in metrics["recalls_absolute"]
        assert 500 in metrics["recalls_absolute"]

        # Check percentage recalls
        assert "5%" in metrics["recalls_percentage"]
        assert "10%" in metrics["recalls_percentage"]

        # Note: compile_known_genes() returns all 38 known genes (OMIM + SYSCILIA),
        # but our synthetic DB only has 3 of them (MYO7A, IFT88, BBS1).
        # So total_known_unique is 38, but only 3 are in dataset.
        total_known = metrics["total_known_unique"]
        assert total_known == 38, f"Expected 38 known genes from compile_known_genes(), got {total_known}"

        # Only 20 genes are scored, so the top 100 covers the whole dataset;
        # the 3 known genes present all count as hits: recall = 3/38 ≈ 0.079
        recall_at_100 = metrics["recalls_absolute"].get(100, 0.0)
        expected_recall = 3.0 / 38.0
        assert abs(recall_at_100 - expected_recall) < 0.01, \
            f"Recall@100 should be {expected_recall:.4f} (3/38), got {recall_at_100:.4f}"

    finally:
        store.close()

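# Definition used above: recall@k = |known genes in top-k| / |known genes compiled|,
# i.e. the denominator is the full 38-gene compiled list, not just the 3 known
# genes present in this synthetic table. That is why even a perfect ranking of
# the synthetic data yields recall@100 of only 3/38.
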
def test_perturb_weight_renormalizes():
    """Test weight perturbation maintains sum=1.0."""
    baseline = ScoringWeights()  # Default weights

    # Perturb gnomad by +0.10
    perturbed = perturb_weight(baseline, "gnomad", 0.10)

    # Check weights sum to 1.0 (within tolerance)
    weight_sum = (
        perturbed.gnomad +
        perturbed.expression +
        perturbed.annotation +
        perturbed.localization +
        perturbed.animal_model +
        perturbed.literature
    )

    assert abs(weight_sum - 1.0) < 1e-6, f"Weights should sum to 1.0, got {weight_sum}"

    # Check gnomad increased relative to baseline
    assert perturbed.gnomad > baseline.gnomad, \
        f"Perturbed gnomad ({perturbed.gnomad}) should be > baseline ({baseline.gnomad})"

def test_perturb_weight_large_negative():
    """Test weight perturbation with large negative delta (edge case)."""
    baseline = ScoringWeights()

    # Perturb by -0.25 (more than most weight values)
    perturbed = perturb_weight(baseline, "gnomad", -0.25)

    # Check weight is >= 0.0 (clamped)
    assert perturbed.gnomad >= 0.0, f"Weight should be >= 0.0, got {perturbed.gnomad}"

    # Check weights still sum to 1.0
    weight_sum = (
        perturbed.gnomad +
        perturbed.expression +
        perturbed.annotation +
        perturbed.localization +
        perturbed.animal_model +
        perturbed.literature
    )

    assert abs(weight_sum - 1.0) < 1e-6, f"Weights should sum to 1.0, got {weight_sum}"

def test_perturb_weight_invalid_layer():
    """Test perturb_weight raises ValueError for invalid layer."""
    baseline = ScoringWeights()

    with pytest.raises(ValueError, match="Invalid layer"):
        perturb_weight(baseline, "nonexistent", 0.05)

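# Taken together, the three perturbation tests pin down a contract roughly like
# the sketch below. This is NOT the real implementation in
# usher_pipeline.scoring.sensitivity, only the behavior the assertions above
# encode (clamp the perturbed weight at zero, then renormalize to sum 1.0):
#
#     def perturb_weight(weights: ScoringWeights, layer: str, delta: float) -> ScoringWeights:
#         values = weights.model_dump()
#         if layer not in values:
#             raise ValueError(f"Invalid layer: {layer}")
#         values[layer] = max(0.0, values[layer] + delta)
#         total = sum(values.values())
#         return ScoringWeights(**{k: v / total for k, v in values.items()})
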
def test_generate_comprehensive_validation_report_format():
    """Test comprehensive validation report contains expected sections."""
    # Create mock metrics
    positive_metrics = {
        "validation_passed": True,
        "total_known_expected": 38,
        "total_known_in_dataset": 35,
        "median_percentile": 0.85,
        "top_quartile_count": 30,
        "top_quartile_fraction": 0.86,
        "recall_at_k": {
            "recalls_absolute": {100: 0.90, 500: 0.95},
            "recalls_percentage": {"5%": 0.85, "10%": 0.92},
            "total_known_unique": 35,
            "total_scored": 20000,
        },
        "per_source_breakdown": {
            "omim_usher": {"median_percentile": 0.88, "count": 10, "top_quartile_count": 9},
            "syscilia_scgs_v2": {"median_percentile": 0.83, "count": 25, "top_quartile_count": 21},
        },
    }

    negative_metrics = {
        "validation_passed": True,
        "total_expected": 13,
        "total_in_dataset": 13,
        "median_percentile": 0.35,
        "top_quartile_count": 1,
        "in_high_tier_count": 0,
    }

    sensitivity_result = {
        "baseline_weights": ScoringWeights().model_dump(),
        "results": [
            {
                "layer": "gnomad",
                "delta": 0.05,
                "perturbed_weights": {},
                "spearman_rho": 0.92,
                "spearman_pval": 1e-10,
                "overlap_count": 95,
                "top_n": 100,
            }
        ],
        "top_n": 100,
        "total_perturbations": 1,
    }

    sensitivity_summary = {
        "min_rho": 0.92,
        "max_rho": 0.92,
        "mean_rho": 0.92,
        "stable_count": 1,
        "unstable_count": 0,
        "total_perturbations": 1,
        "overall_stable": True,
        "most_sensitive_layer": "gnomad",
        "most_robust_layer": "literature",
    }

    # Generate report
    report = generate_comprehensive_validation_report(
        positive_metrics,
        negative_metrics,
        sensitivity_result,
        sensitivity_summary,
    )

    # Check report contains expected sections
    assert "Positive Control Validation" in report
    assert "Negative Control Validation" in report
    assert "Sensitivity Analysis" in report
    assert "Overall Validation Summary" in report
    assert "Weight Tuning Recommendations" in report

    # Check status appears
    assert "PASSED" in report

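# For orientation, the report pieces compose end to end roughly like this
# hedged sketch. The exact signatures of run_sensitivity_analysis and
# summarize_sensitivity are assumptions here, not pinned down by these tests:
#
#     store = create_synthetic_scored_db(tmp_path)  # or a production store
#     positive = validate_positive_controls_extended(store)
#     negative = validate_negative_controls(store)
#     sensitivity = run_sensitivity_analysis(store, ScoringWeights())
#     report = generate_comprehensive_validation_report(
#         positive, negative, sensitivity, summarize_sensitivity(sensitivity)
#     )
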
def test_recommend_weight_tuning_all_pass():
    """Test weight tuning recommendations when all validations pass."""
    positive_metrics = {"validation_passed": True}
    negative_metrics = {"validation_passed": True}
    sensitivity_summary = {"overall_stable": True}

    recommendation = recommend_weight_tuning(
        positive_metrics,
        negative_metrics,
        sensitivity_summary,
    )

    # Should recommend no tuning
    assert "No tuning recommended" in recommendation or "validated" in recommendation.lower()

def test_recommend_weight_tuning_positive_fail():
    """Test weight tuning recommendations when positive controls fail."""
    positive_metrics = {"validation_passed": False}
    negative_metrics = {"validation_passed": True}
    sensitivity_summary = {"overall_stable": True}

    recommendation = recommend_weight_tuning(
        positive_metrics,
        negative_metrics,
        sensitivity_summary,
    )

    # Should suggest increasing weights for layers where known genes score high
    assert "Known Gene Ranking Issue" in recommendation or "Positive Control" in recommendation

def test_recommend_weight_tuning_negative_fail():
    """Test weight tuning recommendations when negative controls fail."""
    positive_metrics = {"validation_passed": True}
    negative_metrics = {"validation_passed": False}
    sensitivity_summary = {"overall_stable": True}

    recommendation = recommend_weight_tuning(
        positive_metrics,
        negative_metrics,
        sensitivity_summary,
    )

    # Should suggest examining layers boosting housekeeping genes
    assert "Housekeeping" in recommendation or "Negative Control" in recommendation

def test_recommend_weight_tuning_sensitivity_fail():
    """Test weight tuning recommendations when sensitivity analysis is unstable."""
    positive_metrics = {"validation_passed": True}
    negative_metrics = {"validation_passed": True}
    sensitivity_summary = {
        "overall_stable": False,
        "unstable_count": 5,
        "most_sensitive_layer": "gnomad",
    }

    recommendation = recommend_weight_tuning(
        positive_metrics,
        negative_metrics,
        sensitivity_summary,
    )

    # Should suggest reducing weight of the most sensitive layer
    assert "Sensitivity" in recommendation or "gnomad" in recommendation