"""Unit tests for expression evidence layer. Tests tau calculation, enrichment scoring, and null handling with synthetic data. NO external API calls - all data is mocked or synthetic. """ import polars as pl import pytest from usher_pipeline.evidence.expression.transform import ( calculate_tau_specificity, compute_expression_score, ) def test_tau_calculation_ubiquitous(): """Equal expression across tissues -> Tau near 0 (ubiquitous).""" # Create synthetic data with equal expression across tissues df = pl.DataFrame({ "gene_id": ["ENSG00000001", "ENSG00000002"], "tissue1": [10.0, 20.0], "tissue2": [10.0, 20.0], "tissue3": [10.0, 20.0], "tissue4": [10.0, 20.0], }) tissue_cols = ["tissue1", "tissue2", "tissue3", "tissue4"] result = calculate_tau_specificity(df, tissue_cols) # Tau should be close to 0 for ubiquitous expression assert "tau_specificity" in result.columns tau_values = result.select("tau_specificity").to_series().to_list() assert tau_values[0] == pytest.approx(0.0, abs=0.01) assert tau_values[1] == pytest.approx(0.0, abs=0.01) def test_tau_calculation_specific(): """Expression in one tissue only -> Tau near 1 (tissue-specific).""" # Gene expressed only in one tissue df = pl.DataFrame({ "gene_id": ["ENSG00000001"], "tissue1": [100.0], "tissue2": [0.0], "tissue3": [0.0], "tissue4": [0.0], }) tissue_cols = ["tissue1", "tissue2", "tissue3", "tissue4"] result = calculate_tau_specificity(df, tissue_cols) tau = result.select("tau_specificity").item() # Tau = sum(1 - xi/xmax) / (n-1) = (0 + 1 + 1 + 1) / 3 = 1.0 assert tau == pytest.approx(1.0, abs=0.01) def test_tau_null_handling(): """NULL tissue values -> NULL Tau (insufficient data).""" df = pl.DataFrame({ "gene_id": ["ENSG00000001", "ENSG00000002"], "tissue1": [10.0, 20.0], "tissue2": [None, 20.0], # NULL for gene 1 "tissue3": [10.0, 20.0], "tissue4": [10.0, 20.0], }) tissue_cols = ["tissue1", "tissue2", "tissue3", "tissue4"] result = calculate_tau_specificity(df, tissue_cols) tau_values = result.select("tau_specificity").to_series().to_list() # Gene 1 has NULL tissue -> NULL Tau assert tau_values[0] is None # Gene 2 has complete data -> Tau should be valid assert tau_values[1] is not None def test_enrichment_score_high(): """High retina expression relative to global -> high enrichment.""" df = pl.DataFrame({ "gene_id": ["ENSG00000001"], "hpa_retina_tpm": [50.0], "hpa_cerebellum_tpm": [40.0], "gtex_retina_tpm": [60.0], "hpa_testis_tpm": [5.0], "hpa_fallopian_tube_tpm": [5.0], "gtex_testis_tpm": [5.0], "cellxgene_photoreceptor_expr": [None], "cellxgene_hair_cell_expr": [None], "tau_specificity": [0.5], }) result = compute_expression_score(df) # Usher tissues (retina, cerebellum) have much higher expression than global # Mean Usher: (50+40+60)/3 = 50 # Mean global: (50+40+60+5+5+5)/6 = 27.5 # Enrichment: 50/27.5 ≈ 1.82 assert "usher_tissue_enrichment" in result.columns enrichment = result.select("usher_tissue_enrichment").item() assert enrichment > 1.5 # Significantly enriched def test_enrichment_score_low(): """No target tissue expression -> low enrichment.""" df = pl.DataFrame({ "gene_id": ["ENSG00000001"], "hpa_retina_tpm": [5.0], "hpa_cerebellum_tpm": [5.0], "gtex_retina_tpm": [5.0], "hpa_testis_tpm": [50.0], "hpa_fallopian_tube_tpm": [50.0], "gtex_testis_tpm": [50.0], "cellxgene_photoreceptor_expr": [None], "cellxgene_hair_cell_expr": [None], "tau_specificity": [0.8], }) result = compute_expression_score(df) enrichment = result.select("usher_tissue_enrichment").item() assert enrichment < 1.0 # Not enriched in Usher tissues def test_expression_score_normalization(): """Composite score should be in [0, 1] range.""" df = pl.DataFrame({ "gene_id": ["ENSG00000001", "ENSG00000002", "ENSG00000003"], "hpa_retina_tpm": [50.0, 10.0, 5.0], "hpa_cerebellum_tpm": [40.0, 10.0, 5.0], "gtex_retina_tpm": [60.0, 10.0, 5.0], "hpa_testis_tpm": [5.0, 50.0, 50.0], "hpa_fallopian_tube_tpm": [5.0, 50.0, 50.0], "gtex_testis_tpm": [5.0, 50.0, 50.0], "cellxgene_photoreceptor_expr": [None, None, None], "cellxgene_hair_cell_expr": [None, None, None], "tau_specificity": [0.5, 0.3, 0.2], }) result = compute_expression_score(df) scores = result.select("expression_score_normalized").to_series().to_list() for score in scores: if score is not None: assert 0.0 <= score <= 1.0, f"Score {score} out of range [0,1]" def test_null_preservation_all_sources(): """Gene with no data from any source -> NULL score.""" df = pl.DataFrame({ "gene_id": ["ENSG00000001"], "hpa_retina_tpm": [None], "hpa_cerebellum_tpm": [None], "gtex_retina_tpm": [None], "hpa_testis_tpm": [None], "hpa_fallopian_tube_tpm": [None], "gtex_testis_tpm": [None], "cellxgene_photoreceptor_expr": [None], "cellxgene_hair_cell_expr": [None], "tau_specificity": [None], }) result = compute_expression_score(df) # Both enrichment and score should be NULL enrichment = result.select("usher_tissue_enrichment").item() score = result.select("expression_score_normalized").item() assert enrichment is None assert score is None