feat(03-04): add localization CLI command and comprehensive tests

- Add localization subcommand to evidence command group
- Implement checkpoint-restart pattern for HPA download
- Display summary with evidence type distribution
- Create 17 unit and integration tests (all pass)
- Test HPA parsing, evidence classification, scoring, and DuckDB persistence
- Fix evidence type terminology (computational vs predicted) for consistency
- Mock HTTP calls in integration tests for reproducibility
This commit is contained in:
2026-02-11 19:05:22 +08:00
parent d70239c4ce
commit 942aaf2ec3
4 changed files with 798 additions and 4 deletions

View File

@@ -0,0 +1,252 @@
"""Integration tests for localization evidence layer."""
import pytest
import polars as pl
from pathlib import Path
from unittest.mock import Mock, patch, MagicMock
import tempfile
import zipfile
import io
from usher_pipeline.evidence.localization import (
process_localization_evidence,
load_to_duckdb,
)
from usher_pipeline.evidence.localization.transform import classify_evidence_type
from usher_pipeline.persistence import PipelineStore, ProvenanceTracker
@pytest.fixture
def mock_hpa_data():
    """Return mock HPA subcellular-location data as tab-separated text.

    The table is assembled from explicit field lists so the tab delimiter is
    visible in the source and cannot be silently converted to spaces by an
    editor, formatter, or copy/paste. Several column names ("Gene name",
    "Main location", ...) contain spaces themselves, so a space-delimited
    rendering of this table cannot be split back into columns.

    Returns:
        str: One header line followed by five gene rows. Every line has six
        tab-separated fields and ends with a newline.
    """
    header = [
        "Gene",
        "Gene name",
        "Reliability",
        "Main location",
        "Additional location",
        "Extracellular location",
    ]
    # NOTE(review): values are assigned to columns left to right and unused
    # trailing columns are left as empty cells -- confirm against a real HPA
    # export if the exact cell placement matters.
    rows = [
        ["ENSG00000001", "BBS1", "Enhanced", "Centrosome", "Cilia", ""],
        ["ENSG00000002", "CEP290", "Supported", "Cilia;Basal body", "", ""],
        ["ENSG00000003", "ACTB", "Enhanced", "Actin filaments", "Cytosol", ""],
        ["ENSG00000004", "TUBB", "Supported", "Cytoskeleton", "Microtubules", ""],
        ["ENSG00000005", "TP53", "Uncertain", "Nucleus", "Cytosol", ""],
    ]
    return "".join("\t".join(fields) + "\n" for fields in [header, *rows])
@pytest.fixture
def gene_symbol_map():
    """Return a two-column DataFrame pairing each mock gene ID with its symbol.

    Columns are ``gene_id`` and ``gene_symbol``, in that order, with one row
    per (ID, symbol) pair listed below.
    """
    id_symbol_pairs = [
        ("ENSG00000001", "BBS1"),
        ("ENSG00000002", "CEP290"),
        ("ENSG00000003", "ACTB"),
        ("ENSG00000004", "TUBB"),
        ("ENSG00000005", "TP53"),
    ]
    # Transpose the row-wise pairs into the two column lists polars expects.
    ensembl_ids, symbols = (list(column) for column in zip(*id_symbol_pairs))
    return pl.DataFrame({"gene_id": ensembl_ids, "gene_symbol": symbols})
class TestFullPipeline:
    """Run the localization evidence pipeline end to end with HTTP mocked."""

    @staticmethod
    def _zip_payload(tsv_text):
        """Return the bytes of a zip archive containing ``tsv_text``.

        The text is stored under the member name ``subcellular_location.tsv``.
        """
        buffer = io.BytesIO()
        with zipfile.ZipFile(buffer, 'w', zipfile.ZIP_DEFLATED) as archive:
            archive.writestr("subcellular_location.tsv", tsv_text)
        return buffer.getvalue()

    @patch('usher_pipeline.evidence.localization.fetch.httpx.stream')
    def test_full_pipeline(self, mock_stream, mock_hpa_data, gene_symbol_map, tmp_path):
        """Check per-gene outputs of the complete pipeline, fetch to scoring.

        Args:
            mock_stream: Mock injected by ``patch`` in place of ``httpx.stream``.
            mock_hpa_data: Fixture providing the mock HPA table text.
            gene_symbol_map: Fixture providing the gene ID/symbol DataFrame.
            tmp_path: pytest temporary directory, used as the cache directory.
        """
        # Serve the zipped mock table from the patched httpx.stream context.
        payload = self._zip_payload(mock_hpa_data)
        mock_response = MagicMock()
        mock_response.read.return_value = payload
        mock_response.headers = {"content-length": str(len(payload))}
        mock_stream.return_value.__enter__.return_value = mock_response

        gene_ids = gene_symbol_map["gene_id"].to_list()
        result = process_localization_evidence(
            gene_ids=gene_ids,
            gene_symbol_map=gene_symbol_map,
            cache_dir=tmp_path,
            force=True,
        )

        # One output row per input gene, with the expected columns present.
        assert len(result) == 5
        for column in (
            "gene_id",
            "evidence_type",
            "cilia_proximity_score",
            "localization_score_normalized",
        ):
            assert column in result.columns

        # Boolean cells are checked by identity so a null or a merely truthy
        # value cannot pass; float scores are compared with a tolerance.

        # BBS1: HPA centrosome plus cilia proteomics membership.
        bbs1 = result.filter(pl.col("gene_id") == "ENSG00000001")
        assert bbs1["compartment_centrosome"][0] is True
        assert bbs1["in_cilia_proteomics"][0] is True
        assert bbs1["evidence_type"][0] == "experimental"
        assert bbs1["cilia_proximity_score"][0] == pytest.approx(1.0)

        # CEP290: HPA cilia plus cilia proteomics membership.
        cep290 = result.filter(pl.col("gene_id") == "ENSG00000002")
        assert cep290["compartment_cilia"][0] is True
        assert cep290["in_cilia_proteomics"][0] is True
        assert cep290["evidence_type"][0] == "experimental"

        # ACTB: not in cilia proteomics, no cilia proximity.
        actb = result.filter(pl.col("gene_id") == "ENSG00000003")
        assert actb["in_cilia_proteomics"][0] is False
        assert actb["cilia_proximity_score"][0] == pytest.approx(0.0)

        # TUBB: expected to score as a cilia-adjacent compartment.
        tubb = result.filter(pl.col("gene_id") == "ENSG00000004")
        assert tubb["cilia_proximity_score"][0] == pytest.approx(0.5)

        # TP53: "Uncertain" HPA reliability is classified as computational.
        tp53 = result.filter(pl.col("gene_id") == "ENSG00000005")
        assert tp53["hpa_reliability"][0] == "Uncertain"
        assert tp53["evidence_type"][0] == "computational"
class TestCheckpointRestart:
    """Verify that a second pipeline run does not download HPA data again."""

    @patch('usher_pipeline.evidence.localization.fetch.httpx.stream')
    def test_checkpoint_restart(self, mock_stream, mock_hpa_data, gene_symbol_map, tmp_path):
        """Run the pipeline twice and check the second run makes no HTTP call.

        The first run uses ``force=True``; the second uses ``force=False``
        against the same cache directory. Only the row counts of the two
        results are compared.
        """
        # Zip the mock table in memory and serve it via the patched stream.
        archive_bytes = io.BytesIO()
        with zipfile.ZipFile(archive_bytes, 'w', zipfile.ZIP_DEFLATED) as archive:
            archive.writestr("subcellular_location.tsv", mock_hpa_data)
        payload = archive_bytes.getvalue()

        fake_response = MagicMock()
        fake_response.headers = {"content-length": str(len(payload))}
        fake_response.read.return_value = payload
        mock_stream.return_value.__enter__.return_value = fake_response

        # Arguments shared by both runs; only ``force`` differs.
        shared_kwargs = {
            "gene_ids": gene_symbol_map["gene_id"].to_list(),
            "gene_symbol_map": gene_symbol_map,
            "cache_dir": tmp_path,
        }

        first_run = process_localization_evidence(force=True, **shared_kwargs)

        # Clear recorded calls so only the second run is inspected below.
        mock_stream.reset_mock()

        second_run = process_localization_evidence(force=False, **shared_kwargs)

        # No download may have been attempted during the second run.
        mock_stream.assert_not_called()

        # Both runs must yield the same number of rows.
        assert len(first_run) == len(second_run)
class TestProvenanceTracking:
    """Test provenance metadata recording."""

    def test_provenance_tracking(self, tmp_path):
        """Check the step name and statistics passed to the provenance tracker.

        Loads a three-row synthetic frame through ``load_to_duckdb`` with a
        mock tracker and inspects the single ``record_step`` call it receives.

        Args:
            tmp_path: pytest temporary directory holding the DuckDB file.
        """
        df = pl.DataFrame({
            "gene_id": ["ENSG001", "ENSG002", "ENSG003"],
            "gene_symbol": ["BBS1", "CEP290", "ACTB"],
            "evidence_type": ["experimental", "both", "experimental"],
            "compartment_cilia": [False, True, False],
            "compartment_centrosome": [True, False, False],
            "cilia_proximity_score": [1.0, 1.0, 0.0],
            "localization_score_normalized": [1.0, 1.0, 0.0],
        })

        db_path = tmp_path / "test.duckdb"
        store = PipelineStore(db_path)
        # Close the store even when an assertion fails, so a failing test
        # does not leave the database handle open.
        try:
            mock_provenance = Mock()

            load_to_duckdb(df, store, mock_provenance, "Test description")

            mock_provenance.record_step.assert_called_once()
            step_args = mock_provenance.record_step.call_args

            # Positional arguments: step name first, statistics dict second.
            assert step_args[0][0] == "load_subcellular_localization"
            provenance_data = step_args[0][1]
            assert provenance_data["row_count"] == 3
            assert provenance_data["experimental_count"] == 2
            assert provenance_data["both_count"] == 1
            # BBS1 (centrosome) and CEP290 (cilia).
            assert provenance_data["cilia_compartment_count"] == 2
            # The two rows scored 1.0; the 0.0 row is not expected to count.
            assert provenance_data["high_proximity_count"] == 2
        finally:
            store.close()
class TestDuckDBQuery:
    """Test DuckDB query helper functions."""

    def test_query_cilia_localized(self, tmp_path):
        """Check that ``query_cilia_localized`` returns only BBS1 and CEP290.

        Four synthetic genes are loaded; the query is run with
        ``proximity_threshold=0.5`` and is expected to return the two genes
        scored 1.0 and exclude those scored 0.0 and 0.2.

        Args:
            tmp_path: pytest temporary directory holding the DuckDB file.
        """
        from usher_pipeline.evidence.localization.load import query_cilia_localized

        # "computational" is the evidence label the full-pipeline test in this
        # module asserts on; the synthetic rows use the same vocabulary.
        df = pl.DataFrame({
            "gene_id": ["ENSG001", "ENSG002", "ENSG003", "ENSG004"],
            "gene_symbol": ["BBS1", "CEP290", "ACTB", "TP53"],
            "evidence_type": ["experimental", "experimental", "experimental", "computational"],
            "compartment_cilia": [False, True, False, False],
            "compartment_centrosome": [True, False, False, False],
            "compartment_basal_body": [None, None, None, None],
            "in_cilia_proteomics": [True, True, False, False],
            "in_centrosome_proteomics": [False, False, False, False],
            "cilia_proximity_score": [1.0, 1.0, 0.0, 0.2],
            "localization_score_normalized": [1.0, 1.0, 0.0, 0.12],
        })

        db_path = tmp_path / "test.duckdb"
        store = PipelineStore(db_path)
        # Close the store even when an assertion fails, so a failing test
        # does not leave the database handle open.
        try:
            mock_provenance = Mock()
            load_to_duckdb(df, store, mock_provenance)

            result = query_cilia_localized(store, proximity_threshold=0.5)

            assert len(result) == 2
            gene_symbols = result["gene_symbol"].to_list()
            assert "BBS1" in gene_symbols
            assert "CEP290" in gene_symbols
            assert "ACTB" not in gene_symbols
            assert "TP53" not in gene_symbols
        finally:
            store.close()
class TestErrorHandling:
    """Test edge-case handling in the localization pipeline."""

    def test_missing_gene_universe(self):
        """Check that ``classify_evidence_type`` accepts an empty frame.

        Despite the test name, no error path is exercised: the input is a
        zero-row frame and the result is expected to be a zero-row frame that
        still carries the ``gene_id``, ``evidence_type`` and
        ``hpa_evidence_type`` columns.
        """
        # Declare dtypes explicitly. Columns built from empty Python lists
        # carry no type information, so the classifier would otherwise be
        # run against placeholder dtypes rather than the string/boolean
        # columns it receives in practice.
        # NOTE(review): dtypes mirror how these columns are used elsewhere in
        # this module (string labels, boolean flags) -- confirm against the
        # transform's real input schema.
        empty_schema = {
            "gene_id": pl.Utf8,
            "gene_symbol": pl.Utf8,
            "hpa_reliability": pl.Utf8,
            "in_cilia_proteomics": pl.Boolean,
            "in_centrosome_proteomics": pl.Boolean,
        }
        df = pl.DataFrame(schema=empty_schema)

        result = classify_evidence_type(df)

        assert len(result) == 0
        assert "gene_id" in result.columns
        assert "evidence_type" in result.columns
        assert "hpa_evidence_type" in result.columns