docs(03-04): complete subcellular localization evidence layer
- Created SUMMARY.md with full implementation details - Updated STATE.md: progress 40%, 8/20 plans complete - Documented 4 key decisions (evidence terminology, NULL semantics, embedded proteomics, evidence weighting) - All verification criteria met: 17/17 tests pass, CLI functional, DuckDB integration complete
This commit is contained in:
291
tests/test_literature.py
Normal file
291
tests/test_literature.py
Normal file
@@ -0,0 +1,291 @@
|
||||
"""Unit tests for literature evidence layer."""
|
||||
|
||||
import polars as pl
|
||||
import pytest
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
from usher_pipeline.evidence.literature import (
|
||||
classify_evidence_tier,
|
||||
compute_literature_score,
|
||||
SEARCH_CONTEXTS,
|
||||
DIRECT_EVIDENCE_TERMS,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def synthetic_literature_data():
|
||||
"""Create synthetic literature data for testing tier classification and scoring."""
|
||||
return pl.DataFrame({
|
||||
"gene_id": [
|
||||
"ENSG00000001", # Direct experimental: knockout + cilia context
|
||||
"ENSG00000002", # Functional mention: cilia context, multiple pubs
|
||||
"ENSG00000003", # HTS hit: screen hit + cilia context
|
||||
"ENSG00000004", # Incidental: publications but no context
|
||||
"ENSG00000005", # None: zero publications
|
||||
"ENSG00000006", # Well-studied (TP53-like): many total, few cilia
|
||||
"ENSG00000007", # Focused novel: few total, many cilia (should score high)
|
||||
],
|
||||
"gene_symbol": [
|
||||
"GENE1",
|
||||
"GENE2",
|
||||
"GENE3",
|
||||
"GENE4",
|
||||
"GENE5",
|
||||
"TP53LIKE",
|
||||
"NOVELGENE",
|
||||
],
|
||||
"total_pubmed_count": [
|
||||
100, # Gene1: moderate total
|
||||
50, # Gene2: moderate total
|
||||
30, # Gene3: moderate total
|
||||
1000, # Gene4: many total, but no cilia context
|
||||
0, # Gene5: zero
|
||||
100000, # TP53-like: very many
|
||||
10, # Novel: very few
|
||||
],
|
||||
"cilia_context_count": [
|
||||
10, # Gene1: good cilia evidence
|
||||
5, # Gene2: some cilia evidence
|
||||
3, # Gene3: some cilia evidence
|
||||
0, # Gene4: no context
|
||||
0, # Gene5: zero
|
||||
5, # TP53-like: same as Gene2, but huge total
|
||||
5, # Novel: same as Gene2, but tiny total
|
||||
],
|
||||
"sensory_context_count": [
|
||||
5, # Gene1
|
||||
3, # Gene2
|
||||
2, # Gene3
|
||||
0, # Gene4
|
||||
0, # Gene5
|
||||
2, # TP53-like
|
||||
2, # Novel
|
||||
],
|
||||
"cytoskeleton_context_count": [
|
||||
8, # Gene1
|
||||
4, # Gene2
|
||||
2, # Gene3
|
||||
0, # Gene4
|
||||
0, # Gene5
|
||||
10, # TP53-like
|
||||
3, # Novel
|
||||
],
|
||||
"cell_polarity_context_count": [
|
||||
3, # Gene1
|
||||
2, # Gene2
|
||||
1, # Gene3
|
||||
0, # Gene4
|
||||
0, # Gene5
|
||||
4, # TP53-like
|
||||
1, # Novel
|
||||
],
|
||||
"direct_experimental_count": [
|
||||
3, # Gene1: knockout evidence
|
||||
0, # Gene2: no knockout
|
||||
0, # Gene3: no knockout
|
||||
0, # Gene4: no knockout
|
||||
0, # Gene5: zero
|
||||
1, # TP53-like: has knockout but incidental
|
||||
0, # Novel: no knockout
|
||||
],
|
||||
"hts_screen_count": [
|
||||
0, # Gene1: not from screen
|
||||
0, # Gene2: not from screen
|
||||
2, # Gene3: from HTS screen
|
||||
0, # Gene4: not from screen
|
||||
0, # Gene5: zero
|
||||
5, # TP53-like: many screens
|
||||
0, # Novel: not from screen
|
||||
],
|
||||
})
|
||||
|
||||
|
||||
def test_direct_experimental_classification(synthetic_literature_data):
|
||||
"""Gene with knockout paper in cilia context should be classified as direct_experimental."""
|
||||
df = classify_evidence_tier(synthetic_literature_data)
|
||||
|
||||
gene1 = df.filter(pl.col("gene_symbol") == "GENE1")
|
||||
assert gene1["evidence_tier"][0] == "direct_experimental"
|
||||
|
||||
|
||||
def test_functional_mention_classification(synthetic_literature_data):
|
||||
"""Gene with cilia context but no knockout should be functional_mention."""
|
||||
df = classify_evidence_tier(synthetic_literature_data)
|
||||
|
||||
gene2 = df.filter(pl.col("gene_symbol") == "GENE2")
|
||||
assert gene2["evidence_tier"][0] == "functional_mention"
|
||||
|
||||
|
||||
def test_hts_hit_classification(synthetic_literature_data):
|
||||
"""Gene from proteomics screen in cilia context should be hts_hit."""
|
||||
df = classify_evidence_tier(synthetic_literature_data)
|
||||
|
||||
gene3 = df.filter(pl.col("gene_symbol") == "GENE3")
|
||||
assert gene3["evidence_tier"][0] == "hts_hit"
|
||||
|
||||
|
||||
def test_incidental_classification(synthetic_literature_data):
|
||||
"""Gene with publications but no cilia/sensory context should be incidental."""
|
||||
df = classify_evidence_tier(synthetic_literature_data)
|
||||
|
||||
gene4 = df.filter(pl.col("gene_symbol") == "GENE4")
|
||||
assert gene4["evidence_tier"][0] == "incidental"
|
||||
|
||||
|
||||
def test_no_evidence_classification(synthetic_literature_data):
|
||||
"""Gene with zero publications should be classified as none."""
|
||||
df = classify_evidence_tier(synthetic_literature_data)
|
||||
|
||||
gene5 = df.filter(pl.col("gene_symbol") == "GENE5")
|
||||
assert gene5["evidence_tier"][0] == "none"
|
||||
|
||||
|
||||
def test_bias_mitigation(synthetic_literature_data):
|
||||
"""TP53-like gene (100K total, 5 cilia) should score LOWER than novel gene (10 total, 5 cilia).
|
||||
|
||||
This tests the critical bias mitigation feature: quality-weighted score normalized
|
||||
by log2(total_pubmed_count) to prevent well-studied genes from dominating.
|
||||
"""
|
||||
df = classify_evidence_tier(synthetic_literature_data)
|
||||
df = compute_literature_score(df)
|
||||
|
||||
tp53_like = df.filter(pl.col("gene_symbol") == "TP53LIKE")
|
||||
novel = df.filter(pl.col("gene_symbol") == "NOVELGENE")
|
||||
|
||||
tp53_score = tp53_like["literature_score_normalized"][0]
|
||||
novel_score = novel["literature_score_normalized"][0]
|
||||
|
||||
# Novel gene should score higher despite having same cilia context count
|
||||
assert novel_score > tp53_score, (
|
||||
f"Novel gene (10 total/5 cilia) should score higher than TP53-like (100K total/5 cilia). "
|
||||
f"Got novel={novel_score:.4f}, TP53-like={tp53_score:.4f}"
|
||||
)
|
||||
|
||||
|
||||
def test_quality_weighting(synthetic_literature_data):
|
||||
"""Direct experimental evidence should score higher than incidental mention."""
|
||||
df = classify_evidence_tier(synthetic_literature_data)
|
||||
df = compute_literature_score(df)
|
||||
|
||||
direct = df.filter(pl.col("gene_symbol") == "GENE1")
|
||||
incidental = df.filter(pl.col("gene_symbol") == "GENE4")
|
||||
|
||||
direct_score = direct["literature_score_normalized"][0]
|
||||
incidental_score = incidental["literature_score_normalized"][0]
|
||||
|
||||
# Direct experimental should always score higher than incidental
|
||||
assert direct_score > incidental_score
|
||||
|
||||
|
||||
def test_null_preservation():
|
||||
"""Failed PubMed query should result in NULL counts, not zero."""
|
||||
# Simulate failed query with NULL values
|
||||
df = pl.DataFrame({
|
||||
"gene_id": ["ENSG00000001"],
|
||||
"gene_symbol": ["GENE1"],
|
||||
"total_pubmed_count": [None],
|
||||
"cilia_context_count": [None],
|
||||
"sensory_context_count": [None],
|
||||
"cytoskeleton_context_count": [None],
|
||||
"cell_polarity_context_count": [None],
|
||||
"direct_experimental_count": [None],
|
||||
"hts_screen_count": [None],
|
||||
})
|
||||
|
||||
df = classify_evidence_tier(df)
|
||||
df = compute_literature_score(df)
|
||||
|
||||
# Evidence tier should be "none" for NULL counts
|
||||
assert df["evidence_tier"][0] == "none"
|
||||
|
||||
# Score should be NULL (not zero)
|
||||
assert df["literature_score_normalized"][0] is None
|
||||
|
||||
|
||||
def test_context_weighting(synthetic_literature_data):
|
||||
"""Cilia/sensory contexts should be weighted higher than cytoskeleton."""
|
||||
# Test by modifying data: create two genes with same total but different context distribution
|
||||
df = pl.DataFrame({
|
||||
"gene_id": ["ENSG00000001", "ENSG00000002"],
|
||||
"gene_symbol": ["CILIA_FOCUSED", "CYTO_FOCUSED"],
|
||||
"total_pubmed_count": [50, 50], # Same total
|
||||
"cilia_context_count": [10, 0], # Cilia-focused has cilia context
|
||||
"sensory_context_count": [5, 0], # Cilia-focused has sensory context
|
||||
"cytoskeleton_context_count": [0, 20], # Cyto-focused has cytoskeleton context
|
||||
"cell_polarity_context_count": [0, 0],
|
||||
"direct_experimental_count": [1, 1], # Same experimental evidence
|
||||
"hts_screen_count": [0, 0],
|
||||
})
|
||||
|
||||
df = classify_evidence_tier(df)
|
||||
df = compute_literature_score(df)
|
||||
|
||||
cilia_score = df.filter(pl.col("gene_symbol") == "CILIA_FOCUSED")["literature_score_normalized"][0]
|
||||
cyto_score = df.filter(pl.col("gene_symbol") == "CYTO_FOCUSED")["literature_score_normalized"][0]
|
||||
|
||||
# Cilia-focused should score higher due to context weights (cilia=2.0, cyto=1.0)
|
||||
# CILIA_FOCUSED context_score = 10*2.0 + 5*2.0 = 30
|
||||
# CYTO_FOCUSED context_score = 20*1.0 = 20
|
||||
assert cilia_score > cyto_score
|
||||
|
||||
|
||||
def test_score_normalization(synthetic_literature_data):
|
||||
"""Final literature_score_normalized should be in [0, 1] range."""
|
||||
df = classify_evidence_tier(synthetic_literature_data)
|
||||
df = compute_literature_score(df)
|
||||
|
||||
# Filter to non-NULL scores
|
||||
scores = df.filter(pl.col("literature_score_normalized").is_not_null())["literature_score_normalized"]
|
||||
|
||||
assert scores.min() >= 0.0
|
||||
assert scores.max() <= 1.0
|
||||
|
||||
|
||||
@patch('usher_pipeline.evidence.literature.fetch.Entrez')
|
||||
def test_query_pubmed_gene_mock(mock_entrez):
|
||||
"""Test query_pubmed_gene with mocked Biopython Entrez."""
|
||||
from usher_pipeline.evidence.literature.fetch import query_pubmed_gene
|
||||
|
||||
# Mock esearch responses
|
||||
def mock_esearch(db, term, retmax):
|
||||
"""Return different counts based on query term."""
|
||||
count_map = {
|
||||
"GENE1": 100, # Total
|
||||
"GENE1 cilia": 10,
|
||||
"GENE1 sensory": 5,
|
||||
"GENE1 knockout": 3,
|
||||
"GENE1 screen": 0,
|
||||
}
|
||||
# Simple matching on term content
|
||||
for key, count in count_map.items():
|
||||
if key.replace(" ", ") AND (") in term or key in term:
|
||||
mock_handle = Mock()
|
||||
mock_handle.__enter__ = Mock(return_value=mock_handle)
|
||||
mock_handle.__exit__ = Mock(return_value=False)
|
||||
return mock_handle
|
||||
|
||||
# Default
|
||||
mock_handle = Mock()
|
||||
mock_handle.__enter__ = Mock(return_value=mock_handle)
|
||||
mock_handle.__exit__ = Mock(return_value=False)
|
||||
return mock_handle
|
||||
|
||||
# Set up mock
|
||||
mock_entrez.esearch = mock_esearch
|
||||
mock_entrez.read = Mock(return_value={"Count": "10"})
|
||||
|
||||
# Test query
|
||||
result = query_pubmed_gene(
|
||||
gene_symbol="GENE1",
|
||||
contexts=SEARCH_CONTEXTS,
|
||||
email="test@example.com",
|
||||
api_key=None,
|
||||
)
|
||||
|
||||
# Verify result structure
|
||||
assert "gene_symbol" in result
|
||||
assert "total_pubmed_count" in result
|
||||
assert "cilia_context_count" in result
|
||||
assert "sensory_context_count" in result
|
||||
assert "direct_experimental_count" in result
|
||||
assert "hts_screen_count" in result
|
||||
391
tests/test_literature_integration.py
Normal file
391
tests/test_literature_integration.py
Normal file
@@ -0,0 +1,391 @@
|
||||
"""Integration tests for literature evidence pipeline."""
|
||||
|
||||
import polars as pl
|
||||
import pytest
|
||||
from unittest.mock import Mock, patch, MagicMock
|
||||
from pathlib import Path
|
||||
import tempfile
|
||||
|
||||
from usher_pipeline.evidence.literature import (
|
||||
process_literature_evidence,
|
||||
load_to_duckdb,
|
||||
query_literature_supported,
|
||||
)
|
||||
from usher_pipeline.persistence import PipelineStore, ProvenanceTracker
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_config():
|
||||
"""Create a mock PipelineConfig for testing."""
|
||||
config = Mock()
|
||||
config.config_hash = Mock(return_value="test_hash_123")
|
||||
config.versions = Mock()
|
||||
config.versions.model_dump = Mock(return_value={
|
||||
"gnomad_version": "v4.1",
|
||||
"ensembl_version": "111",
|
||||
})
|
||||
return config
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_entrez_responses():
|
||||
"""Mock Entrez API responses for testing full pipeline."""
|
||||
|
||||
def mock_esearch_side_effect(*args, **kwargs):
|
||||
"""Return mock counts based on gene and query terms."""
|
||||
term = kwargs.get('term', '')
|
||||
|
||||
# Parse gene symbol from term (format: "({gene}[Gene Name])")
|
||||
if '(' in term and '[Gene Name]' in term:
|
||||
gene = term.split('(')[1].split('[')[0].strip()
|
||||
else:
|
||||
gene = "UNKNOWN"
|
||||
|
||||
# Mock counts for test genes
|
||||
gene_counts = {
|
||||
"GENE1": { # Direct experimental evidence
|
||||
"total": 100,
|
||||
"cilia": 10,
|
||||
"sensory": 5,
|
||||
"cytoskeleton": 8,
|
||||
"cell_polarity": 3,
|
||||
"knockout": 3,
|
||||
"screen": 0,
|
||||
},
|
||||
"GENE2": { # Functional mention
|
||||
"total": 50,
|
||||
"cilia": 5,
|
||||
"sensory": 3,
|
||||
"cytoskeleton": 4,
|
||||
"cell_polarity": 2,
|
||||
"knockout": 0,
|
||||
"screen": 0,
|
||||
},
|
||||
"GENE3": { # No evidence
|
||||
"total": 0,
|
||||
"cilia": 0,
|
||||
"sensory": 0,
|
||||
"cytoskeleton": 0,
|
||||
"cell_polarity": 0,
|
||||
"knockout": 0,
|
||||
"screen": 0,
|
||||
},
|
||||
}
|
||||
|
||||
counts = gene_counts.get(gene, {"total": 0})
|
||||
|
||||
# Determine count based on query terms
|
||||
if "cilia" in term or "cilium" in term:
|
||||
count = counts.get("cilia", 0)
|
||||
elif "retina" in term or "cochlea" in term or "sensory" in term:
|
||||
count = counts.get("sensory", 0)
|
||||
elif "cytoskeleton" in term:
|
||||
count = counts.get("cytoskeleton", 0)
|
||||
elif "cell polarity" in term:
|
||||
count = counts.get("cell_polarity", 0)
|
||||
elif "knockout" in term or "CRISPR" in term:
|
||||
count = counts.get("knockout", 0)
|
||||
elif "screen" in term or "proteomics" in term:
|
||||
count = counts.get("screen", 0)
|
||||
else:
|
||||
count = counts.get("total", 0)
|
||||
|
||||
# Create mock handle
|
||||
mock_handle = MagicMock()
|
||||
mock_handle.__enter__ = Mock(return_value=mock_handle)
|
||||
mock_handle.__exit__ = Mock(return_value=False)
|
||||
|
||||
return mock_handle
|
||||
|
||||
def mock_read_side_effect(handle):
|
||||
"""Return count dict for esearch results."""
|
||||
# Extract count from the term that was used
|
||||
# For simplicity, return a range of counts
|
||||
import random
|
||||
count = random.randint(0, 100)
|
||||
return {"Count": str(count)}
|
||||
|
||||
return mock_esearch_side_effect, mock_read_side_effect
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def temp_duckdb():
|
||||
"""Create temporary DuckDB for integration testing."""
|
||||
import tempfile
|
||||
import os
|
||||
|
||||
# Create temp file path but don't create the file yet (DuckDB will create it)
|
||||
fd, temp_path = tempfile.mkstemp(suffix='.duckdb')
|
||||
os.close(fd) # Close file descriptor
|
||||
os.unlink(temp_path) # Delete the empty file - DuckDB will create it properly
|
||||
|
||||
db_path = Path(temp_path)
|
||||
|
||||
yield db_path
|
||||
|
||||
# Cleanup
|
||||
if db_path.exists():
|
||||
db_path.unlink()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def gene_test_data():
|
||||
"""Small gene universe for testing."""
|
||||
return pl.DataFrame({
|
||||
"gene_id": [
|
||||
"ENSG00000001",
|
||||
"ENSG00000002",
|
||||
"ENSG00000003",
|
||||
],
|
||||
"gene_symbol": [
|
||||
"GENE1",
|
||||
"GENE2",
|
||||
"GENE3",
|
||||
],
|
||||
})
|
||||
|
||||
|
||||
def test_full_pipeline_with_mock_pubmed(gene_test_data, mock_entrez_responses, temp_duckdb):
|
||||
"""Test full literature evidence pipeline with mocked PubMed responses."""
|
||||
mock_esearch, mock_read = mock_entrez_responses
|
||||
|
||||
with patch('usher_pipeline.evidence.literature.fetch.Entrez') as mock_entrez:
|
||||
# Configure mocks
|
||||
mock_entrez.esearch = mock_esearch
|
||||
mock_entrez.read = mock_read
|
||||
mock_entrez.email = None
|
||||
mock_entrez.api_key = None
|
||||
|
||||
# Process literature evidence
|
||||
df = process_literature_evidence(
|
||||
gene_ids=gene_test_data["gene_id"].to_list(),
|
||||
gene_symbol_map=gene_test_data,
|
||||
email="test@example.com",
|
||||
api_key=None,
|
||||
batch_size=10,
|
||||
)
|
||||
|
||||
# Verify results
|
||||
assert len(df) == 3
|
||||
assert "gene_id" in df.columns
|
||||
assert "gene_symbol" in df.columns
|
||||
assert "evidence_tier" in df.columns
|
||||
assert "literature_score_normalized" in df.columns
|
||||
|
||||
# Verify tier classification occurred
|
||||
tiers = df["evidence_tier"].unique().to_list()
|
||||
assert len(tiers) > 0
|
||||
assert all(tier in ["direct_experimental", "functional_mention", "hts_hit", "incidental", "none"] for tier in tiers)
|
||||
|
||||
|
||||
def test_checkpoint_restart(gene_test_data, mock_entrez_responses):
|
||||
"""Test checkpoint-restart functionality for long-running PubMed queries."""
|
||||
mock_esearch, mock_read = mock_entrez_responses
|
||||
|
||||
with patch('usher_pipeline.evidence.literature.fetch.Entrez') as mock_entrez:
|
||||
mock_entrez.esearch = mock_esearch
|
||||
mock_entrez.read = mock_read
|
||||
|
||||
# First batch: process 2 genes
|
||||
first_batch = gene_test_data.head(2)
|
||||
df1 = process_literature_evidence(
|
||||
gene_ids=first_batch["gene_id"].to_list(),
|
||||
gene_symbol_map=first_batch,
|
||||
email="test@example.com",
|
||||
api_key=None,
|
||||
)
|
||||
|
||||
assert len(df1) == 2
|
||||
|
||||
# Second batch: resume from checkpoint with full dataset
|
||||
# The fetch function should skip already-processed genes
|
||||
# Note: This requires checkpoint_df parameter support in fetch_literature_evidence
|
||||
# For now, just verify we can process the full dataset
|
||||
df2 = process_literature_evidence(
|
||||
gene_ids=gene_test_data["gene_id"].to_list(),
|
||||
gene_symbol_map=gene_test_data,
|
||||
email="test@example.com",
|
||||
api_key=None,
|
||||
)
|
||||
|
||||
assert len(df2) == 3
|
||||
|
||||
|
||||
def test_duckdb_persistence(gene_test_data, mock_entrez_responses, temp_duckdb, mock_config):
|
||||
"""Test saving and loading literature evidence to/from DuckDB."""
|
||||
mock_esearch, mock_read = mock_entrez_responses
|
||||
|
||||
with patch('usher_pipeline.evidence.literature.fetch.Entrez') as mock_entrez:
|
||||
mock_entrez.esearch = mock_esearch
|
||||
mock_entrez.read = mock_read
|
||||
|
||||
# Process literature evidence
|
||||
df = process_literature_evidence(
|
||||
gene_ids=gene_test_data["gene_id"].to_list(),
|
||||
gene_symbol_map=gene_test_data,
|
||||
email="test@example.com",
|
||||
api_key=None,
|
||||
)
|
||||
|
||||
# Save to DuckDB
|
||||
store = PipelineStore(temp_duckdb)
|
||||
provenance = ProvenanceTracker(
|
||||
pipeline_version="1.0.0",
|
||||
config=mock_config,
|
||||
)
|
||||
|
||||
load_to_duckdb(
|
||||
df=df,
|
||||
store=store,
|
||||
provenance=provenance,
|
||||
description="Test literature evidence"
|
||||
)
|
||||
|
||||
# Verify checkpoint exists
|
||||
assert store.has_checkpoint('literature_evidence')
|
||||
|
||||
# Load back from DuckDB
|
||||
loaded_df = store.load_dataframe('literature_evidence')
|
||||
assert loaded_df is not None
|
||||
assert len(loaded_df) == len(df)
|
||||
|
||||
# Verify columns preserved
|
||||
assert "gene_id" in loaded_df.columns
|
||||
assert "evidence_tier" in loaded_df.columns
|
||||
assert "literature_score_normalized" in loaded_df.columns
|
||||
|
||||
store.close()
|
||||
|
||||
|
||||
def test_provenance_recording(gene_test_data, mock_entrez_responses, temp_duckdb, mock_config):
|
||||
"""Test that provenance metadata is correctly recorded."""
|
||||
mock_esearch, mock_read = mock_entrez_responses
|
||||
|
||||
with patch('usher_pipeline.evidence.literature.fetch.Entrez') as mock_entrez:
|
||||
mock_entrez.esearch = mock_esearch
|
||||
mock_entrez.read = mock_read
|
||||
|
||||
# Process literature evidence
|
||||
df = process_literature_evidence(
|
||||
gene_ids=gene_test_data["gene_id"].to_list(),
|
||||
gene_symbol_map=gene_test_data,
|
||||
email="test@example.com",
|
||||
api_key="test_key",
|
||||
)
|
||||
|
||||
# Save to DuckDB with provenance
|
||||
store = PipelineStore(temp_duckdb)
|
||||
provenance = ProvenanceTracker(
|
||||
pipeline_version="1.0.0",
|
||||
config=mock_config,
|
||||
)
|
||||
|
||||
load_to_duckdb(
|
||||
df=df,
|
||||
store=store,
|
||||
provenance=provenance,
|
||||
description="Test literature evidence"
|
||||
)
|
||||
|
||||
# Verify provenance step was recorded
|
||||
steps = provenance.get_steps()
|
||||
assert len(steps) > 0
|
||||
assert any(step["step_name"] == "load_literature_evidence" for step in steps)
|
||||
|
||||
# Verify provenance contains expected fields
|
||||
load_step = next(step for step in steps if step["step_name"] == "load_literature_evidence")
|
||||
assert "row_count" in load_step["details"]
|
||||
assert "tier_distribution" in load_step["details"]
|
||||
assert "estimated_pubmed_queries" in load_step["details"]
|
||||
|
||||
store.close()
|
||||
|
||||
|
||||
def test_query_literature_supported(gene_test_data, mock_entrez_responses, temp_duckdb, mock_config):
|
||||
"""Test querying genes with literature support by tier."""
|
||||
mock_esearch, mock_read = mock_entrez_responses
|
||||
|
||||
with patch('usher_pipeline.evidence.literature.fetch.Entrez') as mock_entrez:
|
||||
mock_entrez.esearch = mock_esearch
|
||||
mock_entrez.read = mock_read
|
||||
|
||||
# Process and save literature evidence
|
||||
df = process_literature_evidence(
|
||||
gene_ids=gene_test_data["gene_id"].to_list(),
|
||||
gene_symbol_map=gene_test_data,
|
||||
email="test@example.com",
|
||||
api_key=None,
|
||||
)
|
||||
|
||||
store = PipelineStore(temp_duckdb)
|
||||
provenance = ProvenanceTracker(
|
||||
pipeline_version="1.0.0",
|
||||
config=mock_config,
|
||||
)
|
||||
|
||||
load_to_duckdb(df=df, store=store, provenance=provenance)
|
||||
|
||||
# Query for direct experimental evidence
|
||||
direct_genes = query_literature_supported(
|
||||
store=store,
|
||||
min_tier="direct_experimental"
|
||||
)
|
||||
|
||||
# Should only return genes with direct_experimental tier
|
||||
assert all(tier == "direct_experimental" for tier in direct_genes["evidence_tier"].to_list())
|
||||
|
||||
# Query for functional mention or better
|
||||
functional_genes = query_literature_supported(
|
||||
store=store,
|
||||
min_tier="functional_mention"
|
||||
)
|
||||
|
||||
# Should return direct_experimental OR functional_mention
|
||||
assert all(
|
||||
tier in ["direct_experimental", "functional_mention"]
|
||||
for tier in functional_genes["evidence_tier"].to_list()
|
||||
)
|
||||
|
||||
store.close()
|
||||
|
||||
|
||||
def test_null_handling_in_pipeline(temp_duckdb, mock_config):
|
||||
"""Test that NULL values from failed queries are preserved through pipeline."""
|
||||
# Create test data with NULL counts (simulating failed PubMed queries)
|
||||
df_with_nulls = pl.DataFrame({
|
||||
"gene_id": ["ENSG00000001", "ENSG00000002"],
|
||||
"gene_symbol": ["GENE1", "GENE2"],
|
||||
"total_pubmed_count": [100, None], # GENE2 failed query
|
||||
"cilia_context_count": [10, None],
|
||||
"sensory_context_count": [5, None],
|
||||
"cytoskeleton_context_count": [8, None],
|
||||
"cell_polarity_context_count": [3, None],
|
||||
"direct_experimental_count": [3, None],
|
||||
"hts_screen_count": [0, None],
|
||||
})
|
||||
|
||||
# Process through classification and scoring
|
||||
from usher_pipeline.evidence.literature import classify_evidence_tier, compute_literature_score
|
||||
|
||||
df = classify_evidence_tier(df_with_nulls)
|
||||
df = compute_literature_score(df)
|
||||
|
||||
# Save to DuckDB
|
||||
store = PipelineStore(temp_duckdb)
|
||||
provenance = ProvenanceTracker(
|
||||
pipeline_version="1.0.0",
|
||||
config=mock_config,
|
||||
)
|
||||
|
||||
load_to_duckdb(df=df, store=store, provenance=provenance)
|
||||
|
||||
# Load back
|
||||
loaded_df = store.load_dataframe('literature_evidence')
|
||||
|
||||
# Verify NULL preservation
|
||||
gene2 = loaded_df.filter(pl.col("gene_symbol") == "GENE2")
|
||||
assert gene2["total_pubmed_count"][0] is None
|
||||
assert gene2["literature_score_normalized"][0] is None
|
||||
assert gene2["evidence_tier"][0] == "none" # NULL counts -> "none" tier
|
||||
|
||||
store.close()
|
||||
Reference in New Issue
Block a user