Files
usher-exploring/tests/test_protein.py
gbanyan 46059874f2 feat(03-03): implement protein evidence layer with UniProt/InterPro integration
- Create protein features data model with domain, coiled-coil, TM, cilia motifs
- Implement fetch.py with UniProt REST API and InterPro API queries
- Implement transform.py with feature extraction, motif detection, normalization
- Implement load.py with DuckDB persistence and provenance tracking
- Add CLI protein command following evidence layer pattern
- Add comprehensive unit and integration tests (all passing)
- Handle NULL preservation and List(Null) edge case
- Add get_steps() method to ProvenanceTracker for test compatibility
2026-02-11 19:07:30 +08:00

311 lines
9.7 KiB
Python

"""Unit tests for protein features evidence layer."""
from unittest.mock import Mock, patch, MagicMock
import polars as pl
import pytest
from polars.testing import assert_frame_equal
from usher_pipeline.evidence.protein.models import (
ProteinFeatureRecord,
CILIA_DOMAIN_KEYWORDS,
SCAFFOLD_DOMAIN_TYPES,
)
from usher_pipeline.evidence.protein.transform import (
extract_protein_features,
detect_cilia_motifs,
normalize_protein_features,
)
@pytest.fixture
def sample_uniprot_df():
    """Provide a four-row UniProt-style feature table for the transform tests.

    The last row (``B22222``) models an accession that was not found: its
    length and feature counts are null and its domain list is empty.
    """
    accessions = ["P12345", "Q67890", "A11111", "B22222"]
    lengths = [500, 1200, 300, None]  # None = not found
    domains_per_protein = [
        ["PDZ domain", "Kinase domain"],
        ["IFT complex subunit", "WD40 repeat"],
        ["Transmembrane region"],
        [],
    ]
    coiled_coil_counts = [2, 0, 0, None]
    transmembrane_counts = [0, 5, 10, None]

    # Column order is kept identical to the original literal.
    return pl.DataFrame(
        {
            "uniprot_id": accessions,
            "protein_length": lengths,
            "domain_names": domains_per_protein,
            "coiled_coil_count": coiled_coil_counts,
            "transmembrane_count": transmembrane_counts,
        }
    )
@pytest.fixture
def sample_interpro_df():
    """Provide a four-row InterPro-style domain table for the transform tests.

    Uses the same accessions as ``sample_uniprot_df``; the last two rows
    carry empty domain-name and InterPro-ID lists.
    """
    accessions = ["P12345", "Q67890", "A11111", "B22222"]
    domains_per_protein = [
        ["SH3 domain"],
        ["Ciliary targeting signal", "Ankyrin repeat"],
        [],
        [],
    ]
    ids_per_protein = [
        ["IPR001452"],
        ["IPR005598", "IPR002110"],
        [],
        [],
    ]

    # Column order is kept identical to the original literal.
    return pl.DataFrame(
        {
            "uniprot_id": accessions,
            "domain_names": domains_per_protein,
            "interpro_ids": ids_per_protein,
        }
    )
def test_uniprot_feature_extraction(sample_uniprot_df, sample_interpro_df):
    """Check length, domain, coiled-coil and TM features after extraction.

    Runs ``extract_protein_features`` on the two fixture tables and inspects
    one fully annotated row (``P12345``) and one row whose UniProt values are
    null (``B22222``).
    """
    df = extract_protein_features(sample_uniprot_df, sample_interpro_df)

    # Check P12345
    p12345 = df.filter(pl.col("uniprot_id") == "P12345")
    assert p12345["protein_length"][0] == 500
    # Identity check: the flag must be a real boolean True, not just truthy.
    assert p12345["coiled_coil"][0] is True  # count=2 > 0
    assert p12345["coiled_coil_count"][0] == 2
    assert p12345["transmembrane_count"][0] == 0
    # Domain count should include both UniProt and InterPro (deduplicated)
    assert p12345["domain_count"][0] == 3  # PDZ, Kinase, SH3

    # Check B22222 (not found in UniProt): nulls must stay null.
    b22222 = df.filter(pl.col("uniprot_id") == "B22222")
    assert b22222["protein_length"][0] is None
    assert b22222["coiled_coil"][0] is None
    assert b22222["transmembrane_count"][0] is None
def test_cilia_motif_detection_positive():
    """A domain name containing a cilia keyword sets has_cilia_domain=True.

    The single input row lists "IFT complex subunit" next to a plain kinase
    domain; presumably the IFT entry is the one that matches a cilia keyword.
    """
    df = pl.DataFrame(
        {
            "uniprot_id": ["P12345"],
            "protein_length": [500],
            "domain_count": [2],
            "coiled_coil": [False],
            "coiled_coil_count": [0],
            "transmembrane_count": [0],
            "domain_names": [["IFT complex subunit", "Kinase domain"]],
        }
    )

    result = detect_cilia_motifs(df)

    # Identity check so a merely truthy value (e.g. 1) cannot pass.
    assert result["has_cilia_domain"][0] is True
def test_cilia_motif_detection_negative():
    """A standard domain (e.g., Kinase) does not trigger has_cilia_domain."""
    df = pl.DataFrame(
        {
            "uniprot_id": ["P12345"],
            "protein_length": [500],
            "domain_count": [1],
            "coiled_coil": [False],
            "coiled_coil_count": [0],
            "transmembrane_count": [0],
            "domain_names": [["Kinase domain"]],
        }
    )

    result = detect_cilia_motifs(df)

    # Identity check: the flag must be a real False, not 0 and not NULL.
    assert result["has_cilia_domain"][0] is False
def test_scaffold_detection():
    """A PDZ domain triggers scaffold_adaptor_domain=True."""
    df = pl.DataFrame(
        {
            "uniprot_id": ["P12345"],
            "protein_length": [500],
            "domain_count": [1],
            "coiled_coil": [False],
            "coiled_coil_count": [0],
            "transmembrane_count": [0],
            "domain_names": [["PDZ domain"]],
        }
    )

    result = detect_cilia_motifs(df)

    # Identity check so a merely truthy value (e.g. 1) cannot pass.
    assert result["scaffold_adaptor_domain"][0] is True
def test_null_uniprot():
    """A gene without a UniProt entry keeps NULL flags and a NULL score.

    The single input row has null length and counts and an empty domain list.
    After motif detection and normalization, the three boolean flags and the
    composite score are all expected to be None.
    """
    missing_entry = pl.DataFrame(
        {
            "uniprot_id": ["B22222"],
            "protein_length": [None],
            "domain_count": [0],
            "coiled_coil": [None],
            "coiled_coil_count": [None],
            "transmembrane_count": [None],
            "domain_names": [[]],
        }
    )

    annotated = normalize_protein_features(detect_cilia_motifs(missing_entry))

    # Boolean flags must be NULL, not False.
    for flag_column in (
        "has_cilia_domain",
        "scaffold_adaptor_domain",
        "has_sensory_domain",
    ):
        assert annotated[flag_column][0] is None

    # The composite score must be NULL as well.
    assert annotated["protein_score_normalized"][0] is None
def test_normalization_bounds():
    """Every composite score is non-null and lies within [0, 1].

    Three rows span small, medium and large feature values. The original
    author notes that a transmembrane count of 25 is capped at 20; that cap
    is not visible from this file.
    """
    proteins = pl.DataFrame(
        {
            "uniprot_id": ["P1", "P2", "P3"],
            "protein_length": [100, 500, 2000],
            "domain_count": [0, 5, 20],
            "coiled_coil": [False, True, True],
            "coiled_coil_count": [0, 2, 5],
            "transmembrane_count": [0, 5, 25],  # 25 gets capped at 20
            "domain_names": [[], ["PDZ"], ["IFT", "Ciliary"]],
        }
    )

    scored = normalize_protein_features(detect_cilia_motifs(proteins))

    # Each row's composite score must exist and fall within the unit interval.
    for value in scored["protein_score_normalized"]:
        assert value is not None
        assert 0.0 <= value <= 1.0
def test_composite_score_cilia_gene():
    """A gene with a cilia domain scores higher than one without.

    The two rows are identical except for their domain names, so any score
    difference comes from the domain-derived flags.
    """
    pair = pl.DataFrame(
        {
            "uniprot_id": ["P_CILIA", "P_NOCILIA"],
            "protein_length": [500, 500],
            "domain_count": [5, 5],
            "coiled_coil": [True, True],
            "coiled_coil_count": [2, 2],
            "transmembrane_count": [5, 5],
            "domain_names": [
                ["IFT complex", "PDZ domain"],  # Has cilia + scaffold
                ["Kinase domain", "PDZ domain"],  # Only scaffold
            ],
        }
    )

    scored = normalize_protein_features(detect_cilia_motifs(pair))

    def score_of(accession):
        # Composite score of the row matching the given accession.
        row = scored.filter(pl.col("uniprot_id") == accession)
        return row["protein_score_normalized"][0]

    # Original author's note: has_cilia_domain carries a 15% weight
    # (not verifiable from this file).
    assert score_of("P_CILIA") > score_of("P_NOCILIA")
def test_composite_score_null_handling():
    """NULL UniProt features give a NULL composite score, not 0.0.

    One row is fully populated and one has null length and counts; only the
    populated row is expected to receive a score.
    """
    mixed = pl.DataFrame(
        {
            "uniprot_id": ["P_VALID", "P_NULL"],
            "protein_length": [500, None],
            "domain_count": [5, 0],
            "coiled_coil": [True, None],
            "coiled_coil_count": [2, None],
            "transmembrane_count": [5, None],
            "domain_names": [["PDZ"], []],
        }
    )

    scored = normalize_protein_features(detect_cilia_motifs(mixed))

    def score_of(accession):
        # Composite score of the row matching the given accession.
        row = scored.filter(pl.col("uniprot_id") == accession)
        return row["protein_score_normalized"][0]

    assert score_of("P_VALID") is not None
    assert score_of("P_NULL") is None  # NOT 0.0
def test_domain_keyword_case_insensitive():
    """Cilia keyword matching is case-insensitive.

    Each of the three rows carries one domain name in a different casing
    (lowercase, uppercase, mixed case); all are expected to be flagged.
    """
    df = pl.DataFrame(
        {
            "uniprot_id": ["P1", "P2", "P3"],
            "protein_length": [500, 500, 500],
            "domain_count": [1, 1, 1],
            "coiled_coil": [False, False, False],
            "coiled_coil_count": [0, 0, 0],
            "transmembrane_count": [0, 0, 0],
            "domain_names": [
                ["intraflagellar transport"],  # lowercase
                ["CILIARY targeting signal"],  # uppercase
                ["Basal Body protein"],  # mixed case
            ],
        }
    )

    result = detect_cilia_motifs(df)

    # All should match. Identity checks require a real boolean True, so a
    # truthy non-boolean value cannot pass.
    assert result["has_cilia_domain"][0] is True
    assert result["has_cilia_domain"][1] is True
    assert result["has_cilia_domain"][2] is True
@patch("usher_pipeline.evidence.protein.fetch.httpx.Client")
def test_fetch_uniprot_features_with_mock(mock_client_class):
    """Exercise the UniProt fetch against a canned HTTP response.

    ``httpx.Client`` is patched inside the fetch module. The object yielded
    when the patched client is entered as a context manager returns one
    UniProt result with a domain, a coiled-coil and a transmembrane feature.
    """
    from usher_pipeline.evidence.protein.fetch import fetch_uniprot_features

    # Canned UniProt payload: one accession with three annotated features.
    uniprot_payload = {
        "results": [
            {
                "primaryAccession": "P12345",
                "sequence": {"length": 500},
                "features": [
                    {"type": "Domain", "description": "PDZ domain"},
                    {"type": "Coiled coil"},
                    {"type": "Transmembrane"},
                ],
            }
        ]
    }
    fake_response = Mock()
    fake_response.json.return_value = uniprot_payload

    # The client obtained from the `with` block answers every GET with the
    # canned response.
    fake_client = MagicMock()
    fake_client.get.return_value = fake_response
    mock_client_class.return_value.__enter__.return_value = fake_client

    fetched = fetch_uniprot_features(["P12345"])

    # One row back, with each feature type counted once.
    assert len(fetched) == 1
    assert fetched["uniprot_id"][0] == "P12345"
    assert fetched["protein_length"][0] == 500
    assert fetched["coiled_coil_count"][0] == 1
    assert fetched["transmembrane_count"][0] == 1
@patch("usher_pipeline.evidence.protein.fetch.httpx.Client")
def test_fetch_interpro_domains_with_mock(mock_client_class):
    """Exercise the InterPro fetch against a canned HTTP response.

    ``httpx.Client`` is patched inside the fetch module. The object yielded
    when the patched client is entered as a context manager returns a single
    InterPro entry (``IPR001452``, "SH3 domain").
    """
    from usher_pipeline.evidence.protein.fetch import fetch_interpro_domains

    # Canned InterPro payload: one entry with accession and nested name.
    interpro_payload = {
        "results": [
            {
                "metadata": {
                    "accession": "IPR001452",
                    "name": {"name": "SH3 domain"},
                }
            }
        ]
    }
    fake_response = Mock()
    fake_response.json.return_value = interpro_payload

    # The client obtained from the `with` block answers every GET with the
    # canned response.
    fake_client = MagicMock()
    fake_client.get.return_value = fake_response
    mock_client_class.return_value.__enter__.return_value = fake_client

    fetched = fetch_interpro_domains(["P12345"])

    # One row back, carrying the domain name and its InterPro accession.
    assert len(fetched) == 1
    assert fetched["uniprot_id"][0] == "P12345"
    assert "SH3 domain" in fetched["domain_names"][0]
    assert "IPR001452" in fetched["interpro_ids"][0]