feat(03-04): add localization CLI command and comprehensive tests
- Add localization subcommand to evidence command group - Implement checkpoint-restart pattern for HPA download - Display summary with evidence type distribution - Create 17 unit and integration tests (all pass) - Test HPA parsing, evidence classification, scoring, and DuckDB persistence - Fix evidence type terminology (computational vs predicted) for consistency - Mock HTTP calls in integration tests for reproducibility
This commit is contained in:
330
tests/test_localization.py
Normal file
330
tests/test_localization.py
Normal file
@@ -0,0 +1,330 @@
|
||||
"""Unit tests for localization evidence layer."""
|
||||
|
||||
import pytest
|
||||
import polars as pl
|
||||
from unittest.mock import Mock, patch, MagicMock
|
||||
from pathlib import Path
|
||||
|
||||
from usher_pipeline.evidence.localization.models import (
|
||||
LocalizationRecord,
|
||||
CILIA_COMPARTMENTS,
|
||||
CILIA_ADJACENT_COMPARTMENTS,
|
||||
)
|
||||
from usher_pipeline.evidence.localization.fetch import (
|
||||
fetch_hpa_subcellular,
|
||||
fetch_cilia_proteomics,
|
||||
)
|
||||
from usher_pipeline.evidence.localization.transform import (
|
||||
classify_evidence_type,
|
||||
score_localization,
|
||||
process_localization_evidence,
|
||||
)
|
||||
from usher_pipeline.evidence.localization.load import (
|
||||
load_to_duckdb,
|
||||
query_cilia_localized,
|
||||
)
|
||||
|
||||
|
||||
class TestHPALocationParsing:
|
||||
"""Test HPA location string parsing."""
|
||||
|
||||
def test_hpa_location_parsing(self):
|
||||
"""Test correct extraction of locations from semicolon-separated string."""
|
||||
# Create mock DataFrame with semicolon-separated locations
|
||||
df = pl.DataFrame({
|
||||
"gene_id": ["ENSG001", "ENSG002", "ENSG003"],
|
||||
"gene_symbol": ["GENE1", "GENE2", "GENE3"],
|
||||
"hpa_main_location": [
|
||||
"Centrosome;Cilia",
|
||||
"Cytosol;Nucleus",
|
||||
"Microtubules;Cell Junctions",
|
||||
],
|
||||
"hpa_reliability": ["Enhanced", "Supported", "Uncertain"],
|
||||
"in_cilia_proteomics": [False, False, False],
|
||||
"in_centrosome_proteomics": [False, False, False],
|
||||
})
|
||||
|
||||
# Classify evidence type first (required by score_localization)
|
||||
df = classify_evidence_type(df)
|
||||
|
||||
# Score localization should parse the semicolon-separated string
|
||||
result = score_localization(df)
|
||||
|
||||
# GENE1 should have both cilia and centrosome compartments detected
|
||||
gene1 = result.filter(pl.col("gene_id") == "ENSG001")
|
||||
assert gene1["compartment_cilia"][0] == True
|
||||
assert gene1["compartment_centrosome"][0] == True
|
||||
|
||||
# GENE3 should have adjacent compartment detected
|
||||
gene3 = result.filter(pl.col("gene_id") == "ENSG003")
|
||||
assert gene3["cilia_proximity_score"][0] == 0.5 # Adjacent compartment
|
||||
|
||||
|
||||
class TestCiliaCompartmentDetection:
|
||||
"""Test cilia compartment flag setting."""
|
||||
|
||||
def test_cilia_compartment_detection(self):
|
||||
"""Test that 'Centrosome' in location sets compartment_centrosome=True."""
|
||||
df = pl.DataFrame({
|
||||
"gene_id": ["ENSG001", "ENSG002"],
|
||||
"gene_symbol": ["PCNT", "ACTB"],
|
||||
"hpa_main_location": ["Centrosome;Centriole", "Actin filaments"],
|
||||
"hpa_reliability": ["Enhanced", "Enhanced"],
|
||||
"in_cilia_proteomics": [False, False],
|
||||
"in_centrosome_proteomics": [False, False],
|
||||
"evidence_type": ["experimental", "experimental"],
|
||||
})
|
||||
|
||||
result = score_localization(df)
|
||||
|
||||
# PCNT should have centrosome compartment
|
||||
pcnt = result.filter(pl.col("gene_id") == "ENSG001")
|
||||
assert pcnt["compartment_centrosome"][0] == True
|
||||
assert pcnt["cilia_proximity_score"][0] == 1.0 # Direct match
|
||||
|
||||
# ACTB should not have cilia compartments
|
||||
actb = result.filter(pl.col("gene_id") == "ENSG002")
|
||||
assert actb["compartment_centrosome"][0] == False or actb["compartment_centrosome"][0] is None
|
||||
|
||||
|
||||
class TestAdjacentCompartmentScoring:
|
||||
"""Test adjacent compartment scoring logic."""
|
||||
|
||||
def test_adjacent_compartment_scoring(self):
|
||||
"""Test that 'Cytoskeleton' only gives proximity score of 0.5."""
|
||||
df = pl.DataFrame({
|
||||
"gene_id": ["ENSG001"],
|
||||
"gene_symbol": ["TUBB"],
|
||||
"hpa_main_location": ["Cytoskeleton;Microtubules"],
|
||||
"hpa_reliability": ["Supported"],
|
||||
"in_cilia_proteomics": [False],
|
||||
"in_centrosome_proteomics": [False],
|
||||
"evidence_type": ["experimental"],
|
||||
})
|
||||
|
||||
result = score_localization(df)
|
||||
|
||||
# Should get 0.5 for adjacent compartment
|
||||
assert result["cilia_proximity_score"][0] == 0.5
|
||||
|
||||
|
||||
class TestEvidenceTypeExperimental:
|
||||
"""Test evidence type classification for experimental data."""
|
||||
|
||||
def test_evidence_type_experimental(self):
|
||||
"""Test HPA Enhanced reliability classifies as experimental."""
|
||||
df = pl.DataFrame({
|
||||
"gene_id": ["ENSG001", "ENSG002"],
|
||||
"gene_symbol": ["GENE1", "GENE2"],
|
||||
"hpa_reliability": ["Enhanced", "Supported"],
|
||||
"in_cilia_proteomics": [False, False],
|
||||
"in_centrosome_proteomics": [False, False],
|
||||
})
|
||||
|
||||
result = classify_evidence_type(df)
|
||||
|
||||
# Both should be experimental
|
||||
assert result["hpa_evidence_type"][0] == "experimental"
|
||||
assert result["hpa_evidence_type"][1] == "experimental"
|
||||
assert result["evidence_type"][0] == "experimental"
|
||||
assert result["evidence_type"][1] == "experimental"
|
||||
|
||||
|
||||
class TestEvidenceTypeComputational:
|
||||
"""Test evidence type classification for computational predictions."""
|
||||
|
||||
def test_evidence_type_computational(self):
|
||||
"""Test HPA Uncertain reliability classifies as computational."""
|
||||
df = pl.DataFrame({
|
||||
"gene_id": ["ENSG001", "ENSG002"],
|
||||
"gene_symbol": ["GENE1", "GENE2"],
|
||||
"hpa_reliability": ["Uncertain", "Approved"],
|
||||
"in_cilia_proteomics": [False, False],
|
||||
"in_centrosome_proteomics": [False, False],
|
||||
})
|
||||
|
||||
result = classify_evidence_type(df)
|
||||
|
||||
# Both should be computational
|
||||
assert result["hpa_evidence_type"][0] == "computational"
|
||||
assert result["hpa_evidence_type"][1] == "computational"
|
||||
assert result["evidence_type"][0] == "computational"
|
||||
assert result["evidence_type"][1] == "computational"
|
||||
|
||||
|
||||
class TestProteomicsOverride:
|
||||
"""Test proteomics evidence overrides HPA computational classification."""
|
||||
|
||||
def test_proteomics_override(self):
|
||||
"""Test gene in proteomics but HPA uncertain has evidence_type='both'."""
|
||||
df = pl.DataFrame({
|
||||
"gene_id": ["ENSG001"],
|
||||
"gene_symbol": ["BBS1"],
|
||||
"hpa_reliability": ["Uncertain"], # Computational
|
||||
"in_cilia_proteomics": [True], # Experimental
|
||||
"in_centrosome_proteomics": [False],
|
||||
})
|
||||
|
||||
result = classify_evidence_type(df)
|
||||
|
||||
# Should have both experimental (proteomics) and computational (HPA)
|
||||
assert result["hpa_evidence_type"][0] == "computational"
|
||||
assert result["evidence_type"][0] == "both"
|
||||
|
||||
|
||||
class TestNullHandlingNoHPA:
|
||||
"""Test NULL handling for genes not in HPA."""
|
||||
|
||||
def test_null_handling_no_hpa(self):
|
||||
"""Test gene not in HPA has HPA columns as NULL."""
|
||||
df = pl.DataFrame({
|
||||
"gene_id": ["ENSG001"],
|
||||
"gene_symbol": ["GENE1"],
|
||||
"hpa_main_location": [None],
|
||||
"hpa_reliability": [None],
|
||||
"in_cilia_proteomics": [False],
|
||||
"in_centrosome_proteomics": [False],
|
||||
})
|
||||
|
||||
result = classify_evidence_type(df)
|
||||
|
||||
# HPA fields should be NULL
|
||||
assert result["hpa_reliability"][0] is None
|
||||
assert result["hpa_evidence_type"][0] is None
|
||||
# Overall evidence type should be "none"
|
||||
assert result["evidence_type"][0] == "none"
|
||||
|
||||
|
||||
class TestProteomicsAbsenceIsFalse:
|
||||
"""Test proteomics absence is False not NULL."""
|
||||
|
||||
def test_proteomics_absence_is_false(self):
|
||||
"""Test gene not in proteomics has in_cilia_proteomics=False (not NULL)."""
|
||||
df = pl.DataFrame({
|
||||
"gene_id": ["ENSG001"],
|
||||
"gene_symbol": ["GENE1"],
|
||||
"hpa_main_location": ["Nucleus"],
|
||||
"hpa_reliability": ["Enhanced"],
|
||||
"in_cilia_proteomics": [False], # Explicitly False, not NULL
|
||||
"in_centrosome_proteomics": [False],
|
||||
})
|
||||
|
||||
# Check that False is preserved (not NULL)
|
||||
assert df["in_cilia_proteomics"][0] == False
|
||||
assert df["in_centrosome_proteomics"][0] == False
|
||||
|
||||
|
||||
class TestScoreNormalization:
|
||||
"""Test localization score is in [0, 1] range."""
|
||||
|
||||
def test_score_normalization(self):
|
||||
"""Test localization_score_normalized is in [0, 1]."""
|
||||
df = pl.DataFrame({
|
||||
"gene_id": ["ENSG001", "ENSG002", "ENSG003"],
|
||||
"gene_symbol": ["G1", "G2", "G3"],
|
||||
"hpa_main_location": ["Centrosome", "Cytoskeleton", "Nucleus"],
|
||||
"hpa_reliability": ["Enhanced", "Supported", "Enhanced"],
|
||||
"in_cilia_proteomics": [False, False, False],
|
||||
"in_centrosome_proteomics": [False, False, False],
|
||||
})
|
||||
|
||||
df = classify_evidence_type(df)
|
||||
result = score_localization(df)
|
||||
|
||||
# All non-null scores should be in [0, 1]
|
||||
scores = result["localization_score_normalized"].drop_nulls()
|
||||
assert all(score >= 0.0 and score <= 1.0 for score in scores)
|
||||
|
||||
|
||||
class TestEvidenceWeightApplied:
|
||||
"""Test experimental evidence scores higher than computational for same compartment."""
|
||||
|
||||
def test_evidence_weight_applied(self):
|
||||
"""Test experimental evidence gets full weight, computational gets 0.6x."""
|
||||
df = pl.DataFrame({
|
||||
"gene_id": ["ENSG001", "ENSG002"],
|
||||
"gene_symbol": ["GENE1", "GENE2"],
|
||||
"hpa_main_location": ["Centrosome", "Centrosome"],
|
||||
"hpa_reliability": ["Enhanced", "Uncertain"],
|
||||
"in_cilia_proteomics": [False, False],
|
||||
"in_centrosome_proteomics": [False, False],
|
||||
})
|
||||
|
||||
df = classify_evidence_type(df)
|
||||
result = score_localization(df)
|
||||
|
||||
# Both have same cilia_proximity_score
|
||||
assert result["cilia_proximity_score"][0] == 1.0
|
||||
assert result["cilia_proximity_score"][1] == 1.0
|
||||
|
||||
# But normalized scores differ by evidence weight
|
||||
experimental_score = result["localization_score_normalized"][0]
|
||||
computational_score = result["localization_score_normalized"][1]
|
||||
|
||||
assert experimental_score == 1.0 # Enhanced = experimental = 1.0x
|
||||
assert computational_score == pytest.approx(0.6) # Uncertain = computational = 0.6x
|
||||
|
||||
|
||||
class TestFetchCiliaProteomics:
|
||||
"""Test cilia proteomics cross-reference."""
|
||||
|
||||
def test_fetch_cilia_proteomics(self):
|
||||
"""Test cross-referencing against curated proteomics gene sets."""
|
||||
gene_symbol_map = pl.DataFrame({
|
||||
"gene_id": ["ENSG001", "ENSG002", "ENSG003"],
|
||||
"gene_symbol": ["BBS1", "ACTB", "CEP290"], # BBS1 and CEP290 in cilia proteomics
|
||||
})
|
||||
|
||||
result = fetch_cilia_proteomics(
|
||||
gene_ids=["ENSG001", "ENSG002", "ENSG003"],
|
||||
gene_symbol_map=gene_symbol_map,
|
||||
)
|
||||
|
||||
# BBS1 and CEP290 should be in cilia proteomics
|
||||
bbs1 = result.filter(pl.col("gene_id") == "ENSG001")
|
||||
assert bbs1["in_cilia_proteomics"][0] == True
|
||||
|
||||
cep290 = result.filter(pl.col("gene_id") == "ENSG003")
|
||||
assert cep290["in_cilia_proteomics"][0] == True
|
||||
|
||||
# ACTB should not be in cilia proteomics
|
||||
actb = result.filter(pl.col("gene_id") == "ENSG002")
|
||||
assert actb["in_cilia_proteomics"][0] == False
|
||||
|
||||
|
||||
class TestLoadToDuckDB:
|
||||
"""Test DuckDB loading with provenance."""
|
||||
|
||||
def test_load_to_duckdb(self):
|
||||
"""Test loading localization data to DuckDB."""
|
||||
# Create synthetic data
|
||||
df = pl.DataFrame({
|
||||
"gene_id": ["ENSG001", "ENSG002"],
|
||||
"gene_symbol": ["BBS1", "ACTB"],
|
||||
"hpa_main_location": ["Centrosome", "Actin filaments"],
|
||||
"hpa_reliability": ["Enhanced", "Enhanced"],
|
||||
"evidence_type": ["experimental", "experimental"],
|
||||
"compartment_cilia": [False, False],
|
||||
"compartment_centrosome": [True, False],
|
||||
"cilia_proximity_score": [1.0, 0.0],
|
||||
"localization_score_normalized": [1.0, 0.0],
|
||||
})
|
||||
|
||||
# Mock store and provenance
|
||||
mock_store = Mock()
|
||||
mock_provenance = Mock()
|
||||
|
||||
# Call load function
|
||||
load_to_duckdb(df, mock_store, mock_provenance, "Test description")
|
||||
|
||||
# Verify save_dataframe was called
|
||||
mock_store.save_dataframe.assert_called_once()
|
||||
call_args = mock_store.save_dataframe.call_args
|
||||
assert call_args.kwargs["table_name"] == "subcellular_localization"
|
||||
assert call_args.kwargs["replace"] == True
|
||||
|
||||
# Verify provenance recorded
|
||||
mock_provenance.record_step.assert_called_once()
|
||||
step_args = mock_provenance.record_step.call_args
|
||||
assert step_args[0][0] == "load_subcellular_localization"
|
||||
assert step_args[0][1]["row_count"] == 2
|
||||
252
tests/test_localization_integration.py
Normal file
252
tests/test_localization_integration.py
Normal file
@@ -0,0 +1,252 @@
|
||||
"""Integration tests for localization evidence layer."""
|
||||
|
||||
import pytest
|
||||
import polars as pl
|
||||
from pathlib import Path
|
||||
from unittest.mock import Mock, patch, MagicMock
|
||||
import tempfile
|
||||
import zipfile
|
||||
import io
|
||||
|
||||
from usher_pipeline.evidence.localization import (
|
||||
process_localization_evidence,
|
||||
load_to_duckdb,
|
||||
)
|
||||
from usher_pipeline.evidence.localization.transform import classify_evidence_type
|
||||
from usher_pipeline.persistence import PipelineStore, ProvenanceTracker
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_hpa_data():
|
||||
"""Create mock HPA subcellular location TSV data."""
|
||||
tsv_content = """Gene Gene name Reliability Main location Additional location Extracellular location
|
||||
ENSG00000001 BBS1 Enhanced Centrosome Cilia
|
||||
ENSG00000002 CEP290 Supported Cilia;Basal body
|
||||
ENSG00000003 ACTB Enhanced Actin filaments Cytosol
|
||||
ENSG00000004 TUBB Supported Cytoskeleton Microtubules
|
||||
ENSG00000005 TP53 Uncertain Nucleus Cytosol
|
||||
"""
|
||||
return tsv_content
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def gene_symbol_map():
|
||||
"""Create gene symbol mapping DataFrame."""
|
||||
return pl.DataFrame({
|
||||
"gene_id": ["ENSG00000001", "ENSG00000002", "ENSG00000003", "ENSG00000004", "ENSG00000005"],
|
||||
"gene_symbol": ["BBS1", "CEP290", "ACTB", "TUBB", "TP53"],
|
||||
})
|
||||
|
||||
|
||||
class TestFullPipeline:
|
||||
"""Test full localization evidence pipeline."""
|
||||
|
||||
@patch('usher_pipeline.evidence.localization.fetch.httpx.stream')
|
||||
def test_full_pipeline(self, mock_stream, mock_hpa_data, gene_symbol_map, tmp_path):
|
||||
"""Test complete pipeline from fetch to scoring."""
|
||||
# Mock HPA download
|
||||
# Create a mock zip file containing the TSV
|
||||
zip_buffer = io.BytesIO()
|
||||
with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zf:
|
||||
zf.writestr("subcellular_location.tsv", mock_hpa_data)
|
||||
zip_buffer.seek(0)
|
||||
|
||||
# Mock httpx stream response
|
||||
mock_response = MagicMock()
|
||||
mock_response.read.return_value = zip_buffer.getvalue()
|
||||
mock_response.headers = {"content-length": str(len(zip_buffer.getvalue()))}
|
||||
mock_stream.return_value.__enter__.return_value = mock_response
|
||||
|
||||
# Run full pipeline
|
||||
gene_ids = gene_symbol_map["gene_id"].to_list()
|
||||
result = process_localization_evidence(
|
||||
gene_ids=gene_ids,
|
||||
gene_symbol_map=gene_symbol_map,
|
||||
cache_dir=tmp_path,
|
||||
force=True,
|
||||
)
|
||||
|
||||
# Verify results
|
||||
assert len(result) == 5
|
||||
assert "gene_id" in result.columns
|
||||
assert "evidence_type" in result.columns
|
||||
assert "cilia_proximity_score" in result.columns
|
||||
assert "localization_score_normalized" in result.columns
|
||||
|
||||
# Check BBS1 (in HPA centrosome, in proteomics)
|
||||
bbs1 = result.filter(pl.col("gene_id") == "ENSG00000001")
|
||||
assert bbs1["compartment_centrosome"][0] == True
|
||||
assert bbs1["in_cilia_proteomics"][0] == True # BBS1 is in curated list
|
||||
assert bbs1["evidence_type"][0] == "experimental"
|
||||
assert bbs1["cilia_proximity_score"][0] == 1.0 # Direct cilia compartment
|
||||
|
||||
# Check CEP290 (in HPA cilia, in proteomics)
|
||||
cep290 = result.filter(pl.col("gene_id") == "ENSG00000002")
|
||||
assert cep290["compartment_cilia"][0] == True
|
||||
assert cep290["in_cilia_proteomics"][0] == True
|
||||
assert cep290["evidence_type"][0] == "experimental"
|
||||
|
||||
# Check ACTB (not in cilia compartments, not in proteomics)
|
||||
actb = result.filter(pl.col("gene_id") == "ENSG00000003")
|
||||
assert actb["in_cilia_proteomics"][0] == False
|
||||
assert actb["cilia_proximity_score"][0] == 0.0 # No cilia proximity
|
||||
|
||||
# Check TUBB (adjacent compartment)
|
||||
tubb = result.filter(pl.col("gene_id") == "ENSG00000004")
|
||||
assert tubb["cilia_proximity_score"][0] == 0.5 # Adjacent compartment
|
||||
|
||||
# Check TP53 (computational evidence only)
|
||||
tp53 = result.filter(pl.col("gene_id") == "ENSG00000005")
|
||||
assert tp53["hpa_reliability"][0] == "Uncertain"
|
||||
assert tp53["evidence_type"][0] == "computational"
|
||||
|
||||
|
||||
class TestCheckpointRestart:
|
||||
"""Test checkpoint-restart functionality."""
|
||||
|
||||
@patch('usher_pipeline.evidence.localization.fetch.httpx.stream')
|
||||
def test_checkpoint_restart(self, mock_stream, mock_hpa_data, gene_symbol_map, tmp_path):
|
||||
"""Test that cached HPA data is reused on second run."""
|
||||
# Mock HPA download for first run
|
||||
zip_buffer = io.BytesIO()
|
||||
with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zf:
|
||||
zf.writestr("subcellular_location.tsv", mock_hpa_data)
|
||||
zip_buffer.seek(0)
|
||||
|
||||
mock_response = MagicMock()
|
||||
mock_response.read.return_value = zip_buffer.getvalue()
|
||||
mock_response.headers = {"content-length": str(len(zip_buffer.getvalue()))}
|
||||
mock_stream.return_value.__enter__.return_value = mock_response
|
||||
|
||||
# First run
|
||||
gene_ids = gene_symbol_map["gene_id"].to_list()
|
||||
result1 = process_localization_evidence(
|
||||
gene_ids=gene_ids,
|
||||
gene_symbol_map=gene_symbol_map,
|
||||
cache_dir=tmp_path,
|
||||
force=True,
|
||||
)
|
||||
|
||||
# Reset mock
|
||||
mock_stream.reset_mock()
|
||||
|
||||
# Second run (should use cached data)
|
||||
result2 = process_localization_evidence(
|
||||
gene_ids=gene_ids,
|
||||
gene_symbol_map=gene_symbol_map,
|
||||
cache_dir=tmp_path,
|
||||
force=False, # Don't force re-download
|
||||
)
|
||||
|
||||
# Verify httpx.stream was NOT called on second run
|
||||
mock_stream.assert_not_called()
|
||||
|
||||
# Results should be identical
|
||||
assert len(result1) == len(result2)
|
||||
|
||||
|
||||
class TestProvenanceTracking:
|
||||
"""Test provenance metadata recording."""
|
||||
|
||||
def test_provenance_tracking(self, tmp_path):
|
||||
"""Test provenance step recording with statistics."""
|
||||
# Create synthetic data
|
||||
df = pl.DataFrame({
|
||||
"gene_id": ["ENSG001", "ENSG002", "ENSG003"],
|
||||
"gene_symbol": ["BBS1", "CEP290", "ACTB"],
|
||||
"evidence_type": ["experimental", "both", "experimental"],
|
||||
"compartment_cilia": [False, True, False],
|
||||
"compartment_centrosome": [True, False, False],
|
||||
"cilia_proximity_score": [1.0, 1.0, 0.0],
|
||||
"localization_score_normalized": [1.0, 1.0, 0.0],
|
||||
})
|
||||
|
||||
# Create temporary DuckDB
|
||||
db_path = tmp_path / "test.duckdb"
|
||||
store = PipelineStore(db_path)
|
||||
|
||||
# Mock provenance tracker
|
||||
mock_provenance = Mock()
|
||||
|
||||
# Load data
|
||||
load_to_duckdb(df, store, mock_provenance, "Test description")
|
||||
|
||||
# Verify provenance recorded
|
||||
mock_provenance.record_step.assert_called_once()
|
||||
step_args = mock_provenance.record_step.call_args
|
||||
|
||||
# Check provenance details
|
||||
assert step_args[0][0] == "load_subcellular_localization"
|
||||
provenance_data = step_args[0][1]
|
||||
assert provenance_data["row_count"] == 3
|
||||
assert provenance_data["experimental_count"] == 2
|
||||
assert provenance_data["both_count"] == 1
|
||||
assert provenance_data["cilia_compartment_count"] == 2 # BBS1 centrosome, CEP290 cilia
|
||||
assert provenance_data["high_proximity_count"] == 2 # Score > 0.5
|
||||
|
||||
store.close()
|
||||
|
||||
|
||||
class TestDuckDBQuery:
|
||||
"""Test DuckDB query helper functions."""
|
||||
|
||||
def test_query_cilia_localized(self, tmp_path):
|
||||
"""Test querying cilia-localized genes from DuckDB."""
|
||||
from usher_pipeline.evidence.localization.load import query_cilia_localized
|
||||
|
||||
# Create synthetic data
|
||||
df = pl.DataFrame({
|
||||
"gene_id": ["ENSG001", "ENSG002", "ENSG003", "ENSG004"],
|
||||
"gene_symbol": ["BBS1", "CEP290", "ACTB", "TP53"],
|
||||
"evidence_type": ["experimental", "experimental", "experimental", "predicted"],
|
||||
"compartment_cilia": [False, True, False, False],
|
||||
"compartment_centrosome": [True, False, False, False],
|
||||
"compartment_basal_body": [None, None, None, None],
|
||||
"in_cilia_proteomics": [True, True, False, False],
|
||||
"in_centrosome_proteomics": [False, False, False, False],
|
||||
"cilia_proximity_score": [1.0, 1.0, 0.0, 0.2],
|
||||
"localization_score_normalized": [1.0, 1.0, 0.0, 0.12],
|
||||
})
|
||||
|
||||
# Create DuckDB and load data
|
||||
db_path = tmp_path / "test.duckdb"
|
||||
store = PipelineStore(db_path)
|
||||
mock_provenance = Mock()
|
||||
load_to_duckdb(df, store, mock_provenance)
|
||||
|
||||
# Query cilia-localized genes (proximity > 0.5)
|
||||
result = query_cilia_localized(store, proximity_threshold=0.5)
|
||||
|
||||
# Should return BBS1 and CEP290 only
|
||||
assert len(result) == 2
|
||||
gene_symbols = result["gene_symbol"].to_list()
|
||||
assert "BBS1" in gene_symbols
|
||||
assert "CEP290" in gene_symbols
|
||||
assert "ACTB" not in gene_symbols
|
||||
assert "TP53" not in gene_symbols
|
||||
|
||||
store.close()
|
||||
|
||||
|
||||
class TestErrorHandling:
|
||||
"""Test error handling in localization pipeline."""
|
||||
|
||||
def test_missing_gene_universe(self):
|
||||
"""Test error handling when gene universe is missing."""
|
||||
# Test with minimal valid data - empty gene list should work
|
||||
# Just verify classify_evidence_type handles edge cases
|
||||
df = pl.DataFrame({
|
||||
"gene_id": [],
|
||||
"gene_symbol": [],
|
||||
"hpa_reliability": [],
|
||||
"in_cilia_proteomics": [],
|
||||
"in_centrosome_proteomics": [],
|
||||
})
|
||||
|
||||
result = classify_evidence_type(df)
|
||||
|
||||
# Should return empty DataFrame with correct schema
|
||||
assert len(result) == 0
|
||||
assert "gene_id" in result.columns
|
||||
assert "evidence_type" in result.columns
|
||||
assert "hpa_evidence_type" in result.columns
|
||||
Reference in New Issue
Block a user