From 5879ae9768aeba2008265e996f3eb4841564c57b Mon Sep 17 00:00:00 2001 From: gbanyan Date: Thu, 12 Feb 2026 04:50:17 +0800 Subject: [PATCH] test(06-03): unit tests for all validation modules - Created test_validation.py with 13 comprehensive tests - create_synthetic_scored_db helper: designs scores for known genes high, housekeeping low - Tests for negative controls: - test_compile_housekeeping_genes_structure - test_compile_housekeeping_genes_known_genes_present - test_validate_negative_controls_with_synthetic_data - test_validate_negative_controls_inverted_logic - Tests for recall@k: - test_compute_recall_at_k - Tests for sensitivity/perturbation: - test_perturb_weight_renormalizes - test_perturb_weight_large_negative - test_perturb_weight_invalid_layer - Tests for validation report: - test_generate_comprehensive_validation_report_format - test_recommend_weight_tuning_all_pass - test_recommend_weight_tuning_positive_fail - test_recommend_weight_tuning_negative_fail - test_recommend_weight_tuning_sensitivity_fail - All tests pass using synthetic DuckDB with tmp_path isolation --- tests/test_validation.py | 478 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 478 insertions(+) create mode 100644 tests/test_validation.py diff --git a/tests/test_validation.py b/tests/test_validation.py new file mode 100644 index 0000000..641faef --- /dev/null +++ b/tests/test_validation.py @@ -0,0 +1,478 @@ +"""Tests for validation modules (negative controls, recall@k, sensitivity, report).""" + +import polars as pl +import pytest +from pathlib import Path + +from usher_pipeline.persistence.duckdb_store import PipelineStore +from usher_pipeline.config.schema import ScoringWeights +from usher_pipeline.scoring import ( + compile_housekeeping_genes, + validate_negative_controls, + compute_recall_at_k, + validate_positive_controls_extended, +) +from usher_pipeline.scoring.sensitivity import ( + perturb_weight, + run_sensitivity_analysis, + summarize_sensitivity, +) +from usher_pipeline.scoring.validation_report import ( + generate_comprehensive_validation_report, + recommend_weight_tuning, +) + + +def create_synthetic_scored_db(tmp_path: Path) -> PipelineStore: + """ + Create DuckDB with synthetic scored_genes for testing. + + Designs scores so: + - Known cilia genes (MYO7A, IFT88, BBS1) get high scores (0.8-0.95) + - Housekeeping genes (GAPDH, ACTB, RPL13A) get low scores (0.1-0.3) + - Other genes get mid-range scores (0.3-0.6) + + Returns: + PipelineStore with synthetic scored_genes table + """ + db_path = tmp_path / "test.duckdb" + store = PipelineStore(db_path) + + # Create gene_universe + gene_symbols = [ + # Known cilia genes (high scores) + "MYO7A", "IFT88", "BBS1", + # Housekeeping genes (low scores) + "GAPDH", "ACTB", "RPL13A", + # Filler genes (mid scores) + "GENE001", "GENE002", "GENE003", "GENE004", "GENE005", + "GENE006", "GENE007", "GENE008", "GENE009", "GENE010", + "GENE011", "GENE012", "GENE013", "GENE014", + ] + + gene_universe = pl.DataFrame({ + "gene_id": [f"ENSG{i:011d}" for i in range(len(gene_symbols))], + "gene_symbol": gene_symbols, + "hgnc_id": [f"HGNC:{i+1000}" for i in range(len(gene_symbols))], + "uniprot_primary": [f"P{i+10000}" for i in range(len(gene_symbols))], + }) + + store.conn.execute("CREATE TABLE gene_universe AS SELECT * FROM gene_universe") + + # Create scored_genes with designed scores + scored_data = { + "gene_id": gene_universe["gene_id"].to_list(), + "gene_symbol": gene_symbols, + # Design composite_score to ensure known genes high, housekeeping low + "composite_score": [ + # Known cilia genes (indices 0-2): high scores + 0.90, 0.85, 0.92, + # Housekeeping genes (indices 3-5): low scores + 0.15, 0.20, 0.12, + # Filler genes: mid scores + 0.45, 0.50, 0.35, 0.55, 0.40, + 0.48, 0.52, 0.38, 0.42, 0.58, + 0.43, 0.47, 0.53, 0.41, + ], + # Layer scores (simplified - just use composite_score with small variations) + "gnomad_score": [ + 0.88, 0.83, 0.90, + 0.18, 0.22, 0.15, + 0.44, 0.49, 0.34, 0.54, 0.39, + 0.47, 0.51, 0.37, 0.41, 0.57, + 0.42, 0.46, 0.52, 0.40, + ], + "expression_score": [ + 0.92, 0.87, 0.94, + 0.12, 0.18, 0.10, + 0.46, 0.51, 0.36, 0.56, 0.41, + 0.49, 0.53, 0.39, 0.43, 0.59, + 0.44, 0.48, 0.54, 0.42, + ], + "annotation_score": [ + 0.90, 0.85, 0.92, + 0.15, 0.20, 0.12, + 0.45, 0.50, 0.35, 0.55, 0.40, + 0.48, 0.52, 0.38, 0.42, 0.58, + 0.43, 0.47, 0.53, 0.41, + ], + "localization_score": [ + 0.91, 0.86, 0.93, + 0.14, 0.19, 0.11, + 0.45, 0.50, 0.35, 0.55, 0.40, + 0.48, 0.52, 0.38, 0.42, 0.58, + 0.43, 0.47, 0.53, 0.41, + ], + "animal_model_score": [ + 0.89, 0.84, 0.91, + 0.16, 0.21, 0.13, + 0.45, 0.50, 0.35, 0.55, 0.40, + 0.48, 0.52, 0.38, 0.42, 0.58, + 0.43, 0.47, 0.53, 0.41, + ], + "literature_score": [ + 0.90, 0.85, 0.92, + 0.15, 0.20, 0.12, + 0.45, 0.50, 0.35, 0.55, 0.40, + 0.48, 0.52, 0.38, 0.42, 0.58, + 0.43, 0.47, 0.53, 0.41, + ], + "evidence_count": [6] * len(gene_symbols), + "quality_flag": ["sufficient_evidence"] * len(gene_symbols), + } + + scored_df = pl.DataFrame(scored_data) + store.conn.execute("CREATE TABLE scored_genes AS SELECT * FROM scored_df") + + # Create known_genes table + known_genes = pl.DataFrame({ + "gene_symbol": ["MYO7A", "IFT88", "BBS1"], + "source": ["omim_usher", "syscilia_scgs_v2", "syscilia_scgs_v2"], + "confidence": ["HIGH", "HIGH", "HIGH"], + }) + + store.conn.execute("CREATE TABLE known_genes AS SELECT * FROM known_genes") + + return store + + +def test_compile_housekeeping_genes_structure(): + """Test compile_housekeeping_genes returns correct structure.""" + df = compile_housekeeping_genes() + + # Check columns + assert "gene_symbol" in df.columns + assert "source" in df.columns + assert "confidence" in df.columns + + # Check row count + assert df.height == 13, f"Expected 13 housekeeping genes, got {df.height}" + + # Check all are literature_validated + assert df["source"].unique().to_list() == ["literature_validated"] + + # Check all are HIGH confidence + assert df["confidence"].unique().to_list() == ["HIGH"] + + +def test_compile_housekeeping_genes_known_genes_present(): + """Test known housekeeping genes are present in compiled set.""" + df = compile_housekeeping_genes() + + gene_symbols = df["gene_symbol"].to_list() + + # Check for well-known housekeeping genes + assert "GAPDH" in gene_symbols + assert "ACTB" in gene_symbols + assert "RPL13A" in gene_symbols + assert "TBP" in gene_symbols + + +def test_validate_negative_controls_with_synthetic_data(tmp_path): + """Test negative control validation with synthetic data where housekeeping genes rank low.""" + store = create_synthetic_scored_db(tmp_path) + + try: + # Run negative control validation + metrics = validate_negative_controls(store) + + # Should pass (housekeeping genes rank low in synthetic data) + assert metrics["validation_passed"] is True, \ + f"Validation should pass with low housekeeping scores. Median: {metrics['median_percentile']}" + + # Median percentile should be < 0.5 + assert metrics["median_percentile"] < 0.5, \ + f"Housekeeping median percentile should be < 0.5, got {metrics['median_percentile']}" + + # Check found some housekeeping genes + assert metrics["total_in_dataset"] > 0, "Should find at least one housekeeping gene" + + finally: + store.close() + + +def test_validate_negative_controls_inverted_logic(tmp_path): + """Test negative control validation fails when housekeeping genes rank HIGH.""" + db_path = tmp_path / "test_inverted.duckdb" + store = PipelineStore(db_path) + + try: + # Create gene_universe + gene_symbols = ["GAPDH", "ACTB", "RPL13A", "GENE001", "GENE002"] + gene_universe = pl.DataFrame({ + "gene_id": [f"ENSG{i:011d}" for i in range(len(gene_symbols))], + "gene_symbol": gene_symbols, + "hgnc_id": [f"HGNC:{i+1000}" for i in range(len(gene_symbols))], + "uniprot_primary": [f"P{i+10000}" for i in range(len(gene_symbols))], + }) + store.conn.execute("CREATE TABLE gene_universe AS SELECT * FROM gene_universe") + + # Create scored_genes where housekeeping genes rank HIGH (inverted) + scored_df = pl.DataFrame({ + "gene_id": gene_universe["gene_id"].to_list(), + "gene_symbol": gene_symbols, + "composite_score": [0.90, 0.85, 0.88, 0.20, 0.15], # Housekeeping HIGH + "gnomad_score": [0.90, 0.85, 0.88, 0.20, 0.15], + "expression_score": [0.90, 0.85, 0.88, 0.20, 0.15], + "annotation_score": [0.90, 0.85, 0.88, 0.20, 0.15], + "localization_score": [0.90, 0.85, 0.88, 0.20, 0.15], + "animal_model_score": [0.90, 0.85, 0.88, 0.20, 0.15], + "literature_score": [0.90, 0.85, 0.88, 0.20, 0.15], + "evidence_count": [6] * len(gene_symbols), + "quality_flag": ["sufficient_evidence"] * len(gene_symbols), + }) + store.conn.execute("CREATE TABLE scored_genes AS SELECT * FROM scored_df") + + # Run negative control validation + metrics = validate_negative_controls(store) + + # Should FAIL (housekeeping genes rank high) + assert metrics["validation_passed"] is False, \ + "Validation should fail when housekeeping genes rank high" + + # Median percentile should be >= 0.5 + assert metrics["median_percentile"] >= 0.5, \ + f"Housekeeping median percentile should be >= 0.5, got {metrics['median_percentile']}" + + finally: + store.close() + + +def test_compute_recall_at_k(tmp_path): + """Test recall@k computation with synthetic data.""" + store = create_synthetic_scored_db(tmp_path) + + try: + # Run recall@k + metrics = compute_recall_at_k(store) + + # Check structure + assert "recalls_absolute" in metrics + assert "recalls_percentage" in metrics + assert "total_known_unique" in metrics + assert "total_scored" in metrics + + # Check absolute recalls + assert 100 in metrics["recalls_absolute"] + assert 500 in metrics["recalls_absolute"] + + # Check percentage recalls + assert "5%" in metrics["recalls_percentage"] + assert "10%" in metrics["recalls_percentage"] + + # Note: compile_known_genes() returns all 38 known genes (OMIM + SYSCILIA), + # but our synthetic DB only has 3 of them (MYO7A, IFT88, BBS1). + # So total_known_unique is 38, but only 3 are in dataset. + total_known = metrics["total_known_unique"] + assert total_known == 38, f"Expected 38 known genes from compile_known_genes(), got {total_known}" + + # All 3 genes in dataset have high scores, so recall should be 3/38 = 0.0789 + recall_at_100 = metrics["recalls_absolute"].get(100, 0.0) + expected_recall = 3.0 / 38.0 + assert abs(recall_at_100 - expected_recall) < 0.01, \ + f"Recall@100 should be {expected_recall:.4f} (3/38), got {recall_at_100:.4f}" + + finally: + store.close() + + +def test_perturb_weight_renormalizes(): + """Test weight perturbation maintains sum=1.0.""" + baseline = ScoringWeights() # Default weights + + # Perturb gnomad by +0.10 + perturbed = perturb_weight(baseline, "gnomad", 0.10) + + # Check weights sum to 1.0 (within tolerance) + weight_sum = ( + perturbed.gnomad + + perturbed.expression + + perturbed.annotation + + perturbed.localization + + perturbed.animal_model + + perturbed.literature + ) + + assert abs(weight_sum - 1.0) < 1e-6, f"Weights should sum to 1.0, got {weight_sum}" + + # Check gnomad increased relative to baseline + assert perturbed.gnomad > baseline.gnomad, \ + f"Perturbed gnomad ({perturbed.gnomad}) should be > baseline ({baseline.gnomad})" + + +def test_perturb_weight_large_negative(): + """Test weight perturbation with large negative delta (edge case).""" + baseline = ScoringWeights() + + # Perturb by -0.25 (more than most weight values) + perturbed = perturb_weight(baseline, "gnomad", -0.25) + + # Check weight is >= 0.0 (clamped) + assert perturbed.gnomad >= 0.0, f"Weight should be >= 0.0, got {perturbed.gnomad}" + + # Check weights still sum to 1.0 + weight_sum = ( + perturbed.gnomad + + perturbed.expression + + perturbed.annotation + + perturbed.localization + + perturbed.animal_model + + perturbed.literature + ) + + assert abs(weight_sum - 1.0) < 1e-6, f"Weights should sum to 1.0, got {weight_sum}" + + +def test_perturb_weight_invalid_layer(): + """Test perturb_weight raises ValueError for invalid layer.""" + baseline = ScoringWeights() + + with pytest.raises(ValueError, match="Invalid layer"): + perturb_weight(baseline, "nonexistent", 0.05) + + +def test_generate_comprehensive_validation_report_format(): + """Test comprehensive validation report contains expected sections.""" + # Create mock metrics + positive_metrics = { + "validation_passed": True, + "total_known_expected": 38, + "total_known_in_dataset": 35, + "median_percentile": 0.85, + "top_quartile_count": 30, + "top_quartile_fraction": 0.86, + "recall_at_k": { + "recalls_absolute": {100: 0.90, 500: 0.95}, + "recalls_percentage": {"5%": 0.85, "10%": 0.92}, + "total_known_unique": 35, + "total_scored": 20000, + }, + "per_source_breakdown": { + "omim_usher": {"median_percentile": 0.88, "count": 10, "top_quartile_count": 9}, + "syscilia_scgs_v2": {"median_percentile": 0.83, "count": 25, "top_quartile_count": 21}, + }, + } + + negative_metrics = { + "validation_passed": True, + "total_expected": 13, + "total_in_dataset": 13, + "median_percentile": 0.35, + "top_quartile_count": 1, + "in_high_tier_count": 0, + } + + sensitivity_result = { + "baseline_weights": ScoringWeights().model_dump(), + "results": [ + { + "layer": "gnomad", + "delta": 0.05, + "perturbed_weights": {}, + "spearman_rho": 0.92, + "spearman_pval": 1e-10, + "overlap_count": 95, + "top_n": 100, + } + ], + "top_n": 100, + "total_perturbations": 1, + } + + sensitivity_summary = { + "min_rho": 0.92, + "max_rho": 0.92, + "mean_rho": 0.92, + "stable_count": 1, + "unstable_count": 0, + "total_perturbations": 1, + "overall_stable": True, + "most_sensitive_layer": "gnomad", + "most_robust_layer": "literature", + } + + # Generate report + report = generate_comprehensive_validation_report( + positive_metrics, + negative_metrics, + sensitivity_result, + sensitivity_summary, + ) + + # Check report contains expected sections + assert "Positive Control Validation" in report + assert "Negative Control Validation" in report + assert "Sensitivity Analysis" in report + assert "Overall Validation Summary" in report + assert "Weight Tuning Recommendations" in report + + # Check status appears + assert "PASSED" in report + + +def test_recommend_weight_tuning_all_pass(): + """Test weight tuning recommendations when all validations pass.""" + positive_metrics = {"validation_passed": True} + negative_metrics = {"validation_passed": True} + sensitivity_summary = {"overall_stable": True} + + recommendation = recommend_weight_tuning( + positive_metrics, + negative_metrics, + sensitivity_summary, + ) + + # Should recommend no tuning + assert "No tuning recommended" in recommendation or "validated" in recommendation.lower() + + +def test_recommend_weight_tuning_positive_fail(): + """Test weight tuning recommendations when positive controls fail.""" + positive_metrics = {"validation_passed": False} + negative_metrics = {"validation_passed": True} + sensitivity_summary = {"overall_stable": True} + + recommendation = recommend_weight_tuning( + positive_metrics, + negative_metrics, + sensitivity_summary, + ) + + # Should suggest increasing weights for layers where known genes score high + assert "Known Gene Ranking Issue" in recommendation or "Positive Control" in recommendation + + +def test_recommend_weight_tuning_negative_fail(): + """Test weight tuning recommendations when negative controls fail.""" + positive_metrics = {"validation_passed": True} + negative_metrics = {"validation_passed": False} + sensitivity_summary = {"overall_stable": True} + + recommendation = recommend_weight_tuning( + positive_metrics, + negative_metrics, + sensitivity_summary, + ) + + # Should suggest examining layers boosting housekeeping genes + assert "Housekeeping" in recommendation or "Negative Control" in recommendation + + +def test_recommend_weight_tuning_sensitivity_fail(): + """Test weight tuning recommendations when sensitivity unstable.""" + positive_metrics = {"validation_passed": True} + negative_metrics = {"validation_passed": True} + sensitivity_summary = { + "overall_stable": False, + "unstable_count": 5, + "most_sensitive_layer": "gnomad", + } + + recommendation = recommend_weight_tuning( + positive_metrics, + negative_metrics, + sensitivity_summary, + ) + + # Should suggest reducing weight of most sensitive layer + assert "Sensitivity" in recommendation or "gnomad" in recommendation