feat(03-03): implement protein evidence layer with UniProt/InterPro integration
- Create protein features data model with domain, coiled-coil, TM, cilia motifs - Implement fetch.py with UniProt REST API and InterPro API queries - Implement transform.py with feature extraction, motif detection, normalization - Implement load.py with DuckDB persistence and provenance tracking - Add CLI protein command following evidence layer pattern - Add comprehensive unit and integration tests (all passing) - Handle NULL preservation and List(Null) edge case - Add get_steps() method to ProvenanceTracker for test compatibility
This commit is contained in:
167
tests/test_expression.py
Normal file
167
tests/test_expression.py
Normal file
@@ -0,0 +1,167 @@
|
||||
"""Unit tests for expression evidence layer.
|
||||
|
||||
Tests tau calculation, enrichment scoring, and null handling with synthetic data.
|
||||
NO external API calls - all data is mocked or synthetic.
|
||||
"""
|
||||
|
||||
import polars as pl
|
||||
import pytest
|
||||
|
||||
from usher_pipeline.evidence.expression.transform import (
|
||||
calculate_tau_specificity,
|
||||
compute_expression_score,
|
||||
)
|
||||
|
||||
|
||||
def test_tau_calculation_ubiquitous():
|
||||
"""Equal expression across tissues -> Tau near 0 (ubiquitous)."""
|
||||
# Create synthetic data with equal expression across tissues
|
||||
df = pl.DataFrame({
|
||||
"gene_id": ["ENSG00000001", "ENSG00000002"],
|
||||
"tissue1": [10.0, 20.0],
|
||||
"tissue2": [10.0, 20.0],
|
||||
"tissue3": [10.0, 20.0],
|
||||
"tissue4": [10.0, 20.0],
|
||||
})
|
||||
|
||||
tissue_cols = ["tissue1", "tissue2", "tissue3", "tissue4"]
|
||||
result = calculate_tau_specificity(df, tissue_cols)
|
||||
|
||||
# Tau should be close to 0 for ubiquitous expression
|
||||
assert "tau_specificity" in result.columns
|
||||
tau_values = result.select("tau_specificity").to_series().to_list()
|
||||
assert tau_values[0] == pytest.approx(0.0, abs=0.01)
|
||||
assert tau_values[1] == pytest.approx(0.0, abs=0.01)
|
||||
|
||||
|
||||
def test_tau_calculation_specific():
|
||||
"""Expression in one tissue only -> Tau near 1 (tissue-specific)."""
|
||||
# Gene expressed only in one tissue
|
||||
df = pl.DataFrame({
|
||||
"gene_id": ["ENSG00000001"],
|
||||
"tissue1": [100.0],
|
||||
"tissue2": [0.0],
|
||||
"tissue3": [0.0],
|
||||
"tissue4": [0.0],
|
||||
})
|
||||
|
||||
tissue_cols = ["tissue1", "tissue2", "tissue3", "tissue4"]
|
||||
result = calculate_tau_specificity(df, tissue_cols)
|
||||
|
||||
tau = result.select("tau_specificity").item()
|
||||
# Tau = sum(1 - xi/xmax) / (n-1) = (0 + 1 + 1 + 1) / 3 = 1.0
|
||||
assert tau == pytest.approx(1.0, abs=0.01)
|
||||
|
||||
|
||||
def test_tau_null_handling():
|
||||
"""NULL tissue values -> NULL Tau (insufficient data)."""
|
||||
df = pl.DataFrame({
|
||||
"gene_id": ["ENSG00000001", "ENSG00000002"],
|
||||
"tissue1": [10.0, 20.0],
|
||||
"tissue2": [None, 20.0], # NULL for gene 1
|
||||
"tissue3": [10.0, 20.0],
|
||||
"tissue4": [10.0, 20.0],
|
||||
})
|
||||
|
||||
tissue_cols = ["tissue1", "tissue2", "tissue3", "tissue4"]
|
||||
result = calculate_tau_specificity(df, tissue_cols)
|
||||
|
||||
tau_values = result.select("tau_specificity").to_series().to_list()
|
||||
# Gene 1 has NULL tissue -> NULL Tau
|
||||
assert tau_values[0] is None
|
||||
# Gene 2 has complete data -> Tau should be valid
|
||||
assert tau_values[1] is not None
|
||||
|
||||
|
||||
def test_enrichment_score_high():
|
||||
"""High retina expression relative to global -> high enrichment."""
|
||||
df = pl.DataFrame({
|
||||
"gene_id": ["ENSG00000001"],
|
||||
"hpa_retina_tpm": [50.0],
|
||||
"hpa_cerebellum_tpm": [40.0],
|
||||
"gtex_retina_tpm": [60.0],
|
||||
"hpa_testis_tpm": [5.0],
|
||||
"hpa_fallopian_tube_tpm": [5.0],
|
||||
"gtex_testis_tpm": [5.0],
|
||||
"cellxgene_photoreceptor_expr": [None],
|
||||
"cellxgene_hair_cell_expr": [None],
|
||||
"tau_specificity": [0.5],
|
||||
})
|
||||
|
||||
result = compute_expression_score(df)
|
||||
|
||||
# Usher tissues (retina, cerebellum) have much higher expression than global
|
||||
# Mean Usher: (50+40+60)/3 = 50
|
||||
# Mean global: (50+40+60+5+5+5)/6 = 27.5
|
||||
# Enrichment: 50/27.5 ≈ 1.82
|
||||
assert "usher_tissue_enrichment" in result.columns
|
||||
enrichment = result.select("usher_tissue_enrichment").item()
|
||||
assert enrichment > 1.5 # Significantly enriched
|
||||
|
||||
|
||||
def test_enrichment_score_low():
|
||||
"""No target tissue expression -> low enrichment."""
|
||||
df = pl.DataFrame({
|
||||
"gene_id": ["ENSG00000001"],
|
||||
"hpa_retina_tpm": [5.0],
|
||||
"hpa_cerebellum_tpm": [5.0],
|
||||
"gtex_retina_tpm": [5.0],
|
||||
"hpa_testis_tpm": [50.0],
|
||||
"hpa_fallopian_tube_tpm": [50.0],
|
||||
"gtex_testis_tpm": [50.0],
|
||||
"cellxgene_photoreceptor_expr": [None],
|
||||
"cellxgene_hair_cell_expr": [None],
|
||||
"tau_specificity": [0.8],
|
||||
})
|
||||
|
||||
result = compute_expression_score(df)
|
||||
|
||||
enrichment = result.select("usher_tissue_enrichment").item()
|
||||
assert enrichment < 1.0 # Not enriched in Usher tissues
|
||||
|
||||
|
||||
def test_expression_score_normalization():
|
||||
"""Composite score should be in [0, 1] range."""
|
||||
df = pl.DataFrame({
|
||||
"gene_id": ["ENSG00000001", "ENSG00000002", "ENSG00000003"],
|
||||
"hpa_retina_tpm": [50.0, 10.0, 5.0],
|
||||
"hpa_cerebellum_tpm": [40.0, 10.0, 5.0],
|
||||
"gtex_retina_tpm": [60.0, 10.0, 5.0],
|
||||
"hpa_testis_tpm": [5.0, 50.0, 50.0],
|
||||
"hpa_fallopian_tube_tpm": [5.0, 50.0, 50.0],
|
||||
"gtex_testis_tpm": [5.0, 50.0, 50.0],
|
||||
"cellxgene_photoreceptor_expr": [None, None, None],
|
||||
"cellxgene_hair_cell_expr": [None, None, None],
|
||||
"tau_specificity": [0.5, 0.3, 0.2],
|
||||
})
|
||||
|
||||
result = compute_expression_score(df)
|
||||
|
||||
scores = result.select("expression_score_normalized").to_series().to_list()
|
||||
for score in scores:
|
||||
if score is not None:
|
||||
assert 0.0 <= score <= 1.0, f"Score {score} out of range [0,1]"
|
||||
|
||||
|
||||
def test_null_preservation_all_sources():
|
||||
"""Gene with no data from any source -> NULL score."""
|
||||
df = pl.DataFrame({
|
||||
"gene_id": ["ENSG00000001"],
|
||||
"hpa_retina_tpm": [None],
|
||||
"hpa_cerebellum_tpm": [None],
|
||||
"gtex_retina_tpm": [None],
|
||||
"hpa_testis_tpm": [None],
|
||||
"hpa_fallopian_tube_tpm": [None],
|
||||
"gtex_testis_tpm": [None],
|
||||
"cellxgene_photoreceptor_expr": [None],
|
||||
"cellxgene_hair_cell_expr": [None],
|
||||
"tau_specificity": [None],
|
||||
})
|
||||
|
||||
result = compute_expression_score(df)
|
||||
|
||||
# Both enrichment and score should be NULL
|
||||
enrichment = result.select("usher_tissue_enrichment").item()
|
||||
score = result.select("expression_score_normalized").item()
|
||||
assert enrichment is None
|
||||
assert score is None
|
||||
Reference in New Issue
Block a user