Files
usher-exploring/tests/test_localization.py
gbanyan 942aaf2ec3 feat(03-04): add localization CLI command and comprehensive tests
- Add localization subcommand to evidence command group
- Implement checkpoint-restart pattern for HPA download
- Display summary with evidence type distribution
- Create 17 unit and integration tests (all pass)
- Test HPA parsing, evidence classification, scoring, and DuckDB persistence
- Fix evidence type terminology (computational vs predicted) for consistency
- Mock HTTP calls in integration tests for reproducibility
2026-02-11 19:05:22 +08:00

331 lines
12 KiB
Python

"""Unit tests for localization evidence layer."""
import pytest
import polars as pl
from unittest.mock import Mock, patch, MagicMock
from pathlib import Path
from usher_pipeline.evidence.localization.models import (
LocalizationRecord,
CILIA_COMPARTMENTS,
CILIA_ADJACENT_COMPARTMENTS,
)
from usher_pipeline.evidence.localization.fetch import (
fetch_hpa_subcellular,
fetch_cilia_proteomics,
)
from usher_pipeline.evidence.localization.transform import (
classify_evidence_type,
score_localization,
process_localization_evidence,
)
from usher_pipeline.evidence.localization.load import (
load_to_duckdb,
query_cilia_localized,
)
class TestHPALocationParsing:
    """Test HPA location string parsing."""

    def test_hpa_location_parsing(self):
        """Test correct extraction of locations from semicolon-separated string.

        The frame is passed through ``classify_evidence_type`` and then
        ``score_localization``; the assertions inspect the compartment flags
        and the proximity score of the scored frame.
        """
        # Create mock DataFrame with semicolon-separated locations
        df = pl.DataFrame({
            "gene_id": ["ENSG001", "ENSG002", "ENSG003"],
            "gene_symbol": ["GENE1", "GENE2", "GENE3"],
            "hpa_main_location": [
                "Centrosome;Cilia",
                "Cytosol;Nucleus",
                "Microtubules;Cell Junctions",
            ],
            "hpa_reliability": ["Enhanced", "Supported", "Uncertain"],
            "in_cilia_proteomics": [False, False, False],
            "in_centrosome_proteomics": [False, False, False],
        })

        # Classify evidence type first (required by score_localization)
        df = classify_evidence_type(df)

        # Score localization should parse the semicolon-separated string
        result = score_localization(df)

        # GENE1 should have both cilia and centrosome compartments detected.
        # Identity checks (rather than ``== True``) reject truthy non-bool
        # values such as 1 or 1.0.
        gene1 = result.filter(pl.col("gene_id") == "ENSG001")
        assert gene1["compartment_cilia"][0] is True
        assert gene1["compartment_centrosome"][0] is True

        # GENE3 should have adjacent compartment detected
        gene3 = result.filter(pl.col("gene_id") == "ENSG003")
        assert gene3["cilia_proximity_score"][0] == 0.5  # Adjacent compartment
class TestCiliaCompartmentDetection:
    """Test cilia compartment flag setting."""

    def test_cilia_compartment_detection(self):
        """Test that 'Centrosome' in location sets compartment_centrosome=True.

        ``evidence_type`` is supplied directly in the fixture, so only
        ``score_localization`` is exercised here.
        """
        df = pl.DataFrame({
            "gene_id": ["ENSG001", "ENSG002"],
            "gene_symbol": ["PCNT", "ACTB"],
            "hpa_main_location": ["Centrosome;Centriole", "Actin filaments"],
            "hpa_reliability": ["Enhanced", "Enhanced"],
            "in_cilia_proteomics": [False, False],
            "in_centrosome_proteomics": [False, False],
            "evidence_type": ["experimental", "experimental"],
        })

        result = score_localization(df)

        # PCNT should have centrosome compartment
        pcnt = result.filter(pl.col("gene_id") == "ENSG001")
        assert pcnt["compartment_centrosome"][0] is True
        assert pcnt["cilia_proximity_score"][0] == 1.0  # Direct match

        # ACTB should not have cilia compartments: the flag may be either an
        # explicit False or NULL. Read the cell once and use identity checks.
        actb = result.filter(pl.col("gene_id") == "ENSG002")
        actb_centrosome = actb["compartment_centrosome"][0]
        assert actb_centrosome is False or actb_centrosome is None
class TestAdjacentCompartmentScoring:
    """Scoring of a gene whose HPA location lists only cilia-adjacent compartments."""

    def test_adjacent_compartment_scoring(self):
        """A 'Cytoskeleton;Microtubules' location is expected to score 0.5."""
        # Single-gene fixture; evidence_type is pre-filled so the frame can be
        # handed straight to score_localization.
        columns = {
            "gene_id": ["ENSG001"],
            "gene_symbol": ["TUBB"],
            "hpa_main_location": ["Cytoskeleton;Microtubules"],
            "hpa_reliability": ["Supported"],
            "in_cilia_proteomics": [False],
            "in_centrosome_proteomics": [False],
            "evidence_type": ["experimental"],
        }
        scored = score_localization(pl.DataFrame(columns))

        # Adjacent compartment -> proximity score of 0.5
        proximity = scored["cilia_proximity_score"][0]
        assert proximity == 0.5
class TestEvidenceTypeExperimental:
    """Classification of HPA reliability levels expected to count as experimental."""

    def test_evidence_type_experimental(self):
        """'Enhanced' and 'Supported' reliability should both classify as experimental."""
        fixture = pl.DataFrame({
            "gene_id": ["ENSG001", "ENSG002"],
            "gene_symbol": ["GENE1", "GENE2"],
            "hpa_reliability": ["Enhanced", "Supported"],
            "in_cilia_proteomics": [False, False],
            "in_centrosome_proteomics": [False, False],
        })
        classified = classify_evidence_type(fixture)

        # Both genes must be experimental, in the HPA-specific column and in
        # the overall evidence column.
        for column in ("hpa_evidence_type", "evidence_type"):
            for row in range(2):
                assert classified[column][row] == "experimental"
class TestEvidenceTypeComputational:
    """Classification of HPA reliability levels expected to count as computational."""

    def test_evidence_type_computational(self):
        """'Uncertain' and 'Approved' reliability should both classify as computational."""
        fixture = pl.DataFrame({
            "gene_id": ["ENSG001", "ENSG002"],
            "gene_symbol": ["GENE1", "GENE2"],
            "hpa_reliability": ["Uncertain", "Approved"],
            "in_cilia_proteomics": [False, False],
            "in_centrosome_proteomics": [False, False],
        })
        classified = classify_evidence_type(fixture)

        # Both genes must be computational, in the HPA-specific column and in
        # the overall evidence column.
        for column in ("hpa_evidence_type", "evidence_type"):
            for row in range(2):
                assert classified[column][row] == "computational"
class TestProteomicsOverride:
    """Overall evidence type when a proteomics hit accompanies a computational HPA call."""

    def test_proteomics_override(self):
        """A cilia-proteomics hit plus 'Uncertain' HPA reliability yields 'both'."""
        single_gene = {
            "gene_id": ["ENSG001"],
            "gene_symbol": ["BBS1"],
            "hpa_reliability": ["Uncertain"],  # computational side
            "in_cilia_proteomics": [True],  # experimental side
            "in_centrosome_proteomics": [False],
        }
        classified = classify_evidence_type(pl.DataFrame(single_gene))

        # The HPA-specific call stays computational ...
        hpa_kind = classified["hpa_evidence_type"][0]
        assert hpa_kind == "computational"

        # ... while the overall call reflects both evidence kinds.
        overall_kind = classified["evidence_type"][0]
        assert overall_kind == "both"
class TestNullHandlingNoHPA:
    """NULL propagation for a gene that has no HPA entry."""

    def test_null_handling_no_hpa(self):
        """Missing HPA inputs stay NULL and the overall evidence type is 'none'."""
        no_hpa_gene = {
            "gene_id": ["ENSG001"],
            "gene_symbol": ["GENE1"],
            "hpa_main_location": [None],
            "hpa_reliability": [None],
            "in_cilia_proteomics": [False],
            "in_centrosome_proteomics": [False],
        }
        classified = classify_evidence_type(pl.DataFrame(no_hpa_gene))

        # HPA-derived columns remain NULL
        for hpa_column in ("hpa_reliability", "hpa_evidence_type"):
            assert classified[hpa_column][0] is None

        # With no HPA and no proteomics evidence the overall type is "none"
        overall_kind = classified["evidence_type"][0]
        assert overall_kind == "none"
class TestProteomicsAbsenceIsFalse:
    """Test proteomics absence is False not NULL."""

    def test_proteomics_absence_is_false(self):
        """Test gene not in proteomics has in_cilia_proteomics=False (not NULL).

        The fixture is run through ``classify_evidence_type`` so the
        assertions exercise pipeline code instead of merely re-reading the
        values the fixture was built with.
        """
        df = pl.DataFrame({
            "gene_id": ["ENSG001"],
            "gene_symbol": ["GENE1"],
            "hpa_main_location": ["Nucleus"],
            "hpa_reliability": ["Enhanced"],
            "in_cilia_proteomics": [False],  # Explicitly False, not NULL
            "in_centrosome_proteomics": [False],
        })

        # NOTE(review): assumes classify_evidence_type keeps its input columns
        # in the returned frame — confirm against the transform module.
        result = classify_evidence_type(df)

        # Check that False is preserved (not NULL) after classification.
        for column in ("in_cilia_proteomics", "in_centrosome_proteomics"):
            flag = result[column]
            assert flag.null_count() == 0
            # Identity check: None or any other falsy value must not pass.
            assert flag[0] is False
class TestScoreNormalization:
    """Test localization score is in [0, 1] range."""

    def test_score_normalization(self):
        """Test localization_score_normalized is in [0, 1].

        NULL scores are ignored, but at least one score must be present so
        the range check cannot pass on an empty series.
        """
        df = pl.DataFrame({
            "gene_id": ["ENSG001", "ENSG002", "ENSG003"],
            "gene_symbol": ["G1", "G2", "G3"],
            "hpa_main_location": ["Centrosome", "Cytoskeleton", "Nucleus"],
            "hpa_reliability": ["Enhanced", "Supported", "Enhanced"],
            "in_cilia_proteomics": [False, False, False],
            "in_centrosome_proteomics": [False, False, False],
        })

        df = classify_evidence_type(df)
        result = score_localization(df)

        scores = result["localization_score_normalized"].drop_nulls()

        # Guard against a vacuous pass: all() over an empty series is True,
        # and the 'Centrosome' / 'Enhanced' gene is expected to be scored.
        assert len(scores) > 0

        # All non-null scores should be in [0, 1]
        assert all(0.0 <= score <= 1.0 for score in scores)
class TestEvidenceWeightApplied:
    """Test experimental evidence scores higher than computational for same compartment."""

    def test_evidence_weight_applied(self):
        """Test experimental evidence gets full weight, computational gets 0.6x.

        Both genes share the same location ('Centrosome') and differ only in
        HPA reliability, so any difference in the normalized score comes from
        the evidence type.
        """
        df = pl.DataFrame({
            "gene_id": ["ENSG001", "ENSG002"],
            "gene_symbol": ["GENE1", "GENE2"],
            "hpa_main_location": ["Centrosome", "Centrosome"],
            "hpa_reliability": ["Enhanced", "Uncertain"],
            "in_cilia_proteomics": [False, False],
            "in_centrosome_proteomics": [False, False],
        })

        df = classify_evidence_type(df)
        result = score_localization(df)

        # Both have same cilia_proximity_score
        assert result["cilia_proximity_score"][0] == 1.0
        assert result["cilia_proximity_score"][1] == 1.0

        # But normalized scores differ by evidence weight
        experimental_score = result["localization_score_normalized"][0]
        computational_score = result["localization_score_normalized"][1]

        # The normalized score is a computed float, so both values are
        # compared with a tolerance rather than mixing exact and approximate
        # checks.
        assert experimental_score == pytest.approx(1.0)  # Enhanced = experimental = 1.0x
        assert computational_score == pytest.approx(0.6)  # Uncertain = computational = 0.6x

        # State the ordering directly as well.
        assert experimental_score > computational_score
class TestFetchCiliaProteomics:
    """Test cilia proteomics cross-reference."""

    def test_fetch_cilia_proteomics(self):
        """Test cross-referencing against curated proteomics gene sets.

        ``fetch_cilia_proteomics`` is called with three gene IDs and a
        gene_id -> gene_symbol map; the ``in_cilia_proteomics`` flag of each
        returned row is then checked.
        """
        gene_symbol_map = pl.DataFrame({
            "gene_id": ["ENSG001", "ENSG002", "ENSG003"],
            "gene_symbol": ["BBS1", "ACTB", "CEP290"],  # BBS1 and CEP290 in cilia proteomics
        })

        result = fetch_cilia_proteomics(
            gene_ids=["ENSG001", "ENSG002", "ENSG003"],
            gene_symbol_map=gene_symbol_map,
        )

        # BBS1 and CEP290 should be in cilia proteomics.
        # Identity checks require real booleans, not merely truthy values.
        bbs1 = result.filter(pl.col("gene_id") == "ENSG001")
        assert bbs1["in_cilia_proteomics"][0] is True

        cep290 = result.filter(pl.col("gene_id") == "ENSG003")
        assert cep290["in_cilia_proteomics"][0] is True

        # ACTB should not be in cilia proteomics: an explicit False, not NULL.
        actb = result.filter(pl.col("gene_id") == "ENSG002")
        assert actb["in_cilia_proteomics"][0] is False
class TestLoadToDuckDB:
    """Test DuckDB loading with provenance."""

    def test_load_to_duckdb(self):
        """Test loading localization data to DuckDB.

        The store and the provenance tracker are ``Mock`` objects, so nothing
        is written anywhere; the test only inspects how ``load_to_duckdb``
        called them.
        """
        # Create synthetic data
        df = pl.DataFrame({
            "gene_id": ["ENSG001", "ENSG002"],
            "gene_symbol": ["BBS1", "ACTB"],
            "hpa_main_location": ["Centrosome", "Actin filaments"],
            "hpa_reliability": ["Enhanced", "Enhanced"],
            "evidence_type": ["experimental", "experimental"],
            "compartment_cilia": [False, False],
            "compartment_centrosome": [True, False],
            "cilia_proximity_score": [1.0, 0.0],
            "localization_score_normalized": [1.0, 0.0],
        })

        # Mock store and provenance
        mock_store = Mock()
        mock_provenance = Mock()

        # Call load function
        load_to_duckdb(df, mock_store, mock_provenance, "Test description")

        # Verify save_dataframe was called exactly once, with the expected
        # keyword arguments.
        mock_store.save_dataframe.assert_called_once()
        call_args = mock_store.save_dataframe.call_args
        assert call_args.kwargs["table_name"] == "subcellular_localization"
        assert call_args.kwargs["replace"] is True

        # Verify provenance recorded: first positional argument is the step
        # name, second is a mapping that carries the row count.
        mock_provenance.record_step.assert_called_once()
        step_args = mock_provenance.record_step.call_args
        assert step_args.args[0] == "load_subcellular_localization"
        assert step_args.args[1]["row_count"] == 2