feat(03-03): implement protein evidence layer with UniProt/InterPro integration
- Create protein features data model with domain, coiled-coil, TM, cilia motifs - Implement fetch.py with UniProt REST API and InterPro API queries - Implement transform.py with feature extraction, motif detection, normalization - Implement load.py with DuckDB persistence and provenance tracking - Add CLI protein command following evidence layer pattern - Add comprehensive unit and integration tests (all passing) - Handle NULL preservation and List(Null) edge case - Add get_steps() method to ProvenanceTracker for test compatibility
This commit is contained in:
170
tests/test_expression_integration.py
Normal file
170
tests/test_expression_integration.py
Normal file
@@ -0,0 +1,170 @@
|
||||
"""Integration tests for expression evidence layer.
|
||||
|
||||
Tests with mocked downloads and synthetic fixtures.
|
||||
NO actual external API calls to HPA/GTEx/CellxGene.
|
||||
"""
|
||||
|
||||
import polars as pl
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from unittest.mock import Mock, patch, MagicMock
|
||||
|
||||
from usher_pipeline.evidence.expression.transform import process_expression_evidence
|
||||
from usher_pipeline.evidence.expression.load import load_to_duckdb
|
||||
from usher_pipeline.persistence import PipelineStore, ProvenanceTracker
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def temp_cache_dir(tmp_path):
|
||||
"""Create temporary cache directory for downloads."""
|
||||
cache_dir = tmp_path / "expression"
|
||||
cache_dir.mkdir()
|
||||
return cache_dir
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_gene_ids():
|
||||
"""Sample gene IDs for testing."""
|
||||
return ["ENSG00000001", "ENSG00000002", "ENSG00000003"]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_hpa_data():
|
||||
"""Synthetic HPA expression data."""
|
||||
return pl.LazyFrame({
|
||||
"gene_symbol": ["GENE1", "GENE2", "GENE3"],
|
||||
"hpa_retina_tpm": [50.0, 10.0, None],
|
||||
"hpa_cerebellum_tpm": [40.0, 10.0, 5.0],
|
||||
"hpa_testis_tpm": [5.0, 50.0, 50.0],
|
||||
"hpa_fallopian_tube_tpm": [5.0, 50.0, None],
|
||||
})
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_gtex_data(mock_gene_ids):
|
||||
"""Synthetic GTEx expression data."""
|
||||
return pl.LazyFrame({
|
||||
"gene_id": mock_gene_ids,
|
||||
"gtex_retina_tpm": [60.0, 10.0, None],
|
||||
"gtex_cerebellum_tpm": [45.0, 10.0, 5.0],
|
||||
"gtex_testis_tpm": [5.0, 55.0, 55.0],
|
||||
"gtex_fallopian_tube_tpm": [None, None, None], # Often not available
|
||||
})
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_cellxgene_data(mock_gene_ids):
|
||||
"""Synthetic CellxGene data (NULLs as placeholder)."""
|
||||
return pl.LazyFrame({
|
||||
"gene_id": mock_gene_ids,
|
||||
"cellxgene_photoreceptor_expr": [None, None, None],
|
||||
"cellxgene_hair_cell_expr": [None, None, None],
|
||||
})
|
||||
|
||||
|
||||
def test_process_expression_pipeline_with_mocks(
|
||||
temp_cache_dir, mock_gene_ids, mock_hpa_data, mock_gtex_data, mock_cellxgene_data
|
||||
):
|
||||
"""Test full pipeline with mocked data sources."""
|
||||
# Mock all fetch functions to return synthetic data
|
||||
with patch('usher_pipeline.evidence.expression.transform.fetch_hpa_expression') as mock_hpa, \
|
||||
patch('usher_pipeline.evidence.expression.transform.fetch_gtex_expression') as mock_gtex, \
|
||||
patch('usher_pipeline.evidence.expression.transform.fetch_cellxgene_expression') as mock_cellxgene:
|
||||
|
||||
mock_hpa.return_value = mock_hpa_data
|
||||
mock_gtex.return_value = mock_gtex_data
|
||||
mock_cellxgene.return_value = mock_cellxgene_data
|
||||
|
||||
# Run pipeline (skip CellxGene for simplicity)
|
||||
df = process_expression_evidence(
|
||||
gene_ids=mock_gene_ids,
|
||||
cache_dir=temp_cache_dir,
|
||||
skip_cellxgene=True,
|
||||
)
|
||||
|
||||
# Verify output structure
|
||||
assert len(df) == len(mock_gene_ids)
|
||||
assert "gene_id" in df.columns
|
||||
assert "tau_specificity" in df.columns
|
||||
assert "usher_tissue_enrichment" in df.columns
|
||||
assert "expression_score_normalized" in df.columns
|
||||
|
||||
|
||||
def test_checkpoint_restart(temp_cache_dir, mock_gene_ids):
|
||||
"""Test checkpoint-restart: skip processing if table exists."""
|
||||
# Create mock store with existing checkpoint
|
||||
mock_store = Mock(spec=PipelineStore)
|
||||
mock_store.has_checkpoint.return_value = True
|
||||
|
||||
# Mock load_dataframe to return synthetic data
|
||||
existing_data = pl.DataFrame({
|
||||
"gene_id": mock_gene_ids,
|
||||
"tau_specificity": [0.5, 0.3, 0.2],
|
||||
"usher_tissue_enrichment": [2.0, 1.0, 0.5],
|
||||
"expression_score_normalized": [0.8, 0.5, 0.3],
|
||||
})
|
||||
mock_store.load_dataframe.return_value = existing_data
|
||||
|
||||
# Verify checkpoint works (would skip processing in real CLI)
|
||||
assert mock_store.has_checkpoint('tissue_expression')
|
||||
df = mock_store.load_dataframe('tissue_expression')
|
||||
assert len(df) == len(mock_gene_ids)
|
||||
|
||||
|
||||
def test_provenance_recording():
|
||||
"""Test provenance step recording during load."""
|
||||
# Create synthetic expression data
|
||||
df = pl.DataFrame({
|
||||
"gene_id": ["ENSG00000001", "ENSG00000002"],
|
||||
"hpa_retina_tpm": [50.0, None],
|
||||
"gtex_retina_tpm": [60.0, 10.0],
|
||||
"cellxgene_photoreceptor_expr": [None, None],
|
||||
"cellxgene_hair_cell_expr": [None, None],
|
||||
"tau_specificity": [0.5, None],
|
||||
"usher_tissue_enrichment": [2.0, 1.0],
|
||||
"expression_score_normalized": [0.8, 0.5],
|
||||
})
|
||||
|
||||
# Mock store and provenance tracker
|
||||
mock_store = Mock(spec=PipelineStore)
|
||||
mock_provenance = Mock(spec=ProvenanceTracker)
|
||||
|
||||
# Call load function
|
||||
load_to_duckdb(
|
||||
df=df,
|
||||
store=mock_store,
|
||||
provenance=mock_provenance,
|
||||
description="Test expression data"
|
||||
)
|
||||
|
||||
# Verify provenance step was recorded
|
||||
mock_provenance.record_step.assert_called_once()
|
||||
step_name, step_details = mock_provenance.record_step.call_args[0]
|
||||
assert step_name == "load_tissue_expression"
|
||||
assert "row_count" in step_details
|
||||
assert step_details["row_count"] == 2
|
||||
|
||||
|
||||
def test_null_expression_handling():
|
||||
"""Test that genes with all NULL expression data are handled gracefully."""
|
||||
df = pl.DataFrame({
|
||||
"gene_id": ["ENSG00000001", "ENSG00000002"],
|
||||
"hpa_retina_tpm": [None, 50.0],
|
||||
"hpa_cerebellum_tpm": [None, 40.0],
|
||||
"gtex_retina_tpm": [None, 60.0],
|
||||
"cellxgene_photoreceptor_expr": [None, None],
|
||||
"cellxgene_hair_cell_expr": [None, None],
|
||||
"tau_specificity": [None, 0.5],
|
||||
"usher_tissue_enrichment": [None, 2.0],
|
||||
"expression_score_normalized": [None, 0.8],
|
||||
})
|
||||
|
||||
# Mock store
|
||||
mock_store = Mock(spec=PipelineStore)
|
||||
mock_provenance = Mock(spec=ProvenanceTracker)
|
||||
|
||||
# Should not raise exception
|
||||
load_to_duckdb(df, mock_store, mock_provenance)
|
||||
|
||||
# Verify store was called
|
||||
mock_store.save_dataframe.assert_called_once()
|
||||
Reference in New Issue
Block a user