- load.py: DuckDB persistence with provenance tracking, ortholog confidence distribution stats - CLI animal-models command: checkpoint-restart pattern, top scoring genes display - 10 unit tests: ortholog confidence scoring, keyword filtering, multi-organism bonus, NULL preservation - 4 integration tests: full pipeline, checkpoint-restart, provenance tracking, empty phenotype handling - All tests pass (14/14): validates fetch->transform->load->CLI flow - Fixed polars deprecations: str.join replaces str.concat, pl.len replaces pl.count
270 lines
9.9 KiB
Python
270 lines
9.9 KiB
Python
"""Integration tests for animal model evidence layer."""
|
|
|
|
import tempfile
|
|
from pathlib import Path
|
|
from unittest.mock import patch, Mock
|
|
|
|
import polars as pl
|
|
import pytest
|
|
|
|
from usher_pipeline.evidence.animal_models import (
|
|
process_animal_model_evidence,
|
|
load_to_duckdb,
|
|
)
|
|
from usher_pipeline.persistence import PipelineStore, ProvenanceTracker
|
|
|
|
|
|
@pytest.fixture
|
|
def mock_hcop_data():
|
|
"""Mock HCOP ortholog mapping data."""
|
|
mouse_data = """human_entrez_gene\thuman_ensembl_gene\thgnc_id\thuman_name\thuman_symbol\thuman_chr\thuman_assert_ids\tmouse_entrez_gene\tmouse_ensembl_gene\tmgi_id\tmouse_name\tmouse_symbol\tmouse_chr\tmouse_assert_ids\tsupport
|
|
123\tENSG00000001\tHGNC:1\tUSH2A\tUSH2A\t1\t\t456\tENSMUSG001\tMGI:1\tUsh2a\tUsh2a\t1\t\tdb1,db2,db3,db4,db5,db6,db7,db8
|
|
456\tENSG00000002\tHGNC:2\tMYO7A\tMYO7A\t11\t\t789\tENSMUSG002\tMGI:2\tMyo7a\tMyo7a\t7\t\tdb1,db2,db3,db4,db5"""
|
|
|
|
zebrafish_data = """human_entrez_gene\thuman_ensembl_gene\thgnc_id\thuman_name\thuman_symbol\thuman_chr\thuman_assert_ids\tzebrafish_entrez_gene\tzebrafish_ensembl_gene\tzfin_id\tzebrafish_name\tzebrafish_symbol\tzebrafish_chr\tzebrafish_assert_ids\tsupport
|
|
123\tENSG00000001\tHGNC:1\tUSH2A\tUSH2A\t1\t\t111\tENSDART001\tZDB-GENE-1\tush2a\tush2a\t1\t\tdb1,db2,db3,db4,db5,db6"""
|
|
|
|
return {'mouse': mouse_data, 'zebrafish': zebrafish_data}
|
|
|
|
|
|
@pytest.fixture
|
|
def mock_phenotype_data():
|
|
"""Mock MGI, ZFIN, and IMPC phenotype data."""
|
|
mgi_data = """Marker Symbol\tMammalian Phenotype ID
|
|
Ush2a\tMP:0001967
|
|
Ush2a\tMP:0005377
|
|
Myo7a\tMP:0001968"""
|
|
|
|
zfin_data = """Gene Symbol\tAffected Structure or Process 1
|
|
ush2a\tabnormal ear morphology
|
|
ush2a\tabnormal retina morphology"""
|
|
|
|
impc_responses = {
|
|
'Ush2a': {
|
|
'response': {
|
|
'docs': [
|
|
{
|
|
'marker_symbol': 'Ush2a',
|
|
'mp_term_id': 'MP:0001967',
|
|
'mp_term_name': 'deafness',
|
|
'p_value': 0.001
|
|
}
|
|
]
|
|
}
|
|
},
|
|
'Myo7a': {
|
|
'response': {
|
|
'docs': [
|
|
{
|
|
'marker_symbol': 'Myo7a',
|
|
'mp_term_id': 'MP:0001968',
|
|
'mp_term_name': 'abnormal cochlea morphology',
|
|
'p_value': 0.0005
|
|
}
|
|
]
|
|
}
|
|
}
|
|
}
|
|
|
|
return {'mgi': mgi_data, 'zfin': zfin_data, 'impc': impc_responses}
|
|
|
|
|
|
def test_full_pipeline(mock_hcop_data, mock_phenotype_data):
|
|
"""Test full animal model evidence pipeline with mocked data sources."""
|
|
gene_ids = ['ENSG00000001', 'ENSG00000002']
|
|
|
|
with patch('usher_pipeline.evidence.animal_models.fetch._download_gzipped') as mock_hcop, \
|
|
patch('usher_pipeline.evidence.animal_models.fetch._download_text') as mock_text, \
|
|
patch('httpx.get') as mock_http:
|
|
|
|
# Mock HCOP downloads
|
|
mock_hcop.side_effect = [
|
|
mock_hcop_data['mouse'].encode('utf-8'),
|
|
mock_hcop_data['zebrafish'].encode('utf-8'),
|
|
]
|
|
|
|
# Mock MGI and ZFIN downloads
|
|
mock_text.side_effect = [
|
|
mock_phenotype_data['mgi'],
|
|
mock_phenotype_data['zfin'],
|
|
]
|
|
|
|
# Mock IMPC API responses
|
|
def mock_impc_response(url, **kwargs):
|
|
response = Mock()
|
|
response.raise_for_status = Mock()
|
|
|
|
# Extract gene symbol from query
|
|
query = kwargs.get('params', {}).get('q', '')
|
|
if 'Ush2a' in query:
|
|
response.json = Mock(return_value=mock_phenotype_data['impc']['Ush2a'])
|
|
elif 'Myo7a' in query:
|
|
response.json = Mock(return_value=mock_phenotype_data['impc']['Myo7a'])
|
|
else:
|
|
response.json = Mock(return_value={'response': {'docs': []}})
|
|
|
|
return response
|
|
|
|
mock_http.side_effect = mock_impc_response
|
|
|
|
# Run pipeline
|
|
result = process_animal_model_evidence(gene_ids)
|
|
|
|
# Verify results
|
|
assert len(result) == 2
|
|
|
|
# Check USH2A (ENSG00000001)
|
|
ush2a = result.filter(pl.col('gene_id') == 'ENSG00000001')
|
|
assert len(ush2a) == 1
|
|
assert ush2a['mouse_ortholog'][0] == 'Ush2a'
|
|
assert ush2a['mouse_ortholog_confidence'][0] == 'HIGH' # 8 sources
|
|
assert ush2a['zebrafish_ortholog'][0] == 'ush2a'
|
|
assert ush2a['zebrafish_ortholog_confidence'][0] == 'MEDIUM' # 6 sources
|
|
assert ush2a['sensory_phenotype_count'][0] is not None
|
|
assert ush2a['animal_model_score_normalized'][0] is not None
|
|
assert ush2a['animal_model_score_normalized'][0] > 0
|
|
|
|
# Check MYO7A (ENSG00000002)
|
|
myo7a = result.filter(pl.col('gene_id') == 'ENSG00000002')
|
|
assert len(myo7a) == 1
|
|
assert myo7a['mouse_ortholog'][0] == 'Myo7a'
|
|
assert myo7a['mouse_ortholog_confidence'][0] == 'MEDIUM' # 5 sources
|
|
|
|
|
|
def test_checkpoint_restart(mock_hcop_data, mock_phenotype_data):
|
|
"""Test checkpoint-restart pattern: load from DuckDB if exists, skip reprocessing."""
|
|
with tempfile.TemporaryDirectory() as tmpdir:
|
|
db_path = Path(tmpdir) / "test.duckdb"
|
|
store = PipelineStore(db_path)
|
|
|
|
# Initial load
|
|
gene_ids = ['ENSG00000001', 'ENSG00000002']
|
|
|
|
with patch('usher_pipeline.evidence.animal_models.fetch._download_gzipped') as mock_hcop, \
|
|
patch('usher_pipeline.evidence.animal_models.fetch._download_text') as mock_text, \
|
|
patch('httpx.get') as mock_http:
|
|
|
|
mock_hcop.side_effect = [
|
|
mock_hcop_data['mouse'].encode('utf-8'),
|
|
mock_hcop_data['zebrafish'].encode('utf-8'),
|
|
]
|
|
mock_text.side_effect = [
|
|
mock_phenotype_data['mgi'],
|
|
mock_phenotype_data['zfin'],
|
|
]
|
|
|
|
def mock_impc_response(url, **kwargs):
|
|
response = Mock()
|
|
response.raise_for_status = Mock()
|
|
response.json = Mock(return_value={'response': {'docs': []}})
|
|
return response
|
|
|
|
mock_http.side_effect = mock_impc_response
|
|
|
|
df = process_animal_model_evidence(gene_ids)
|
|
|
|
# Save to DuckDB (use mock provenance tracker)
|
|
provenance = Mock()
|
|
provenance.record_step = Mock()
|
|
load_to_duckdb(df, store, provenance)
|
|
|
|
# Check checkpoint exists
|
|
assert store.has_checkpoint('animal_model_phenotypes')
|
|
|
|
# Load from checkpoint
|
|
loaded_df = store.load_dataframe('animal_model_phenotypes')
|
|
assert loaded_df is not None
|
|
assert len(loaded_df) == 2
|
|
|
|
store.close()
|
|
|
|
|
|
def test_provenance_tracking(mock_hcop_data, mock_phenotype_data):
|
|
"""Test that provenance metadata is correctly recorded."""
|
|
with tempfile.TemporaryDirectory() as tmpdir:
|
|
db_path = Path(tmpdir) / "test.duckdb"
|
|
store = PipelineStore(db_path)
|
|
|
|
gene_ids = ['ENSG00000001', 'ENSG00000002']
|
|
|
|
with patch('usher_pipeline.evidence.animal_models.fetch._download_gzipped') as mock_hcop, \
|
|
patch('usher_pipeline.evidence.animal_models.fetch._download_text') as mock_text, \
|
|
patch('httpx.get') as mock_http:
|
|
|
|
mock_hcop.side_effect = [
|
|
mock_hcop_data['mouse'].encode('utf-8'),
|
|
mock_hcop_data['zebrafish'].encode('utf-8'),
|
|
]
|
|
mock_text.side_effect = [
|
|
mock_phenotype_data['mgi'],
|
|
mock_phenotype_data['zfin'],
|
|
]
|
|
|
|
def mock_impc_response(url, **kwargs):
|
|
response = Mock()
|
|
response.raise_for_status = Mock()
|
|
response.json = Mock(return_value={'response': {'docs': []}})
|
|
return response
|
|
|
|
mock_http.side_effect = mock_impc_response
|
|
|
|
df = process_animal_model_evidence(gene_ids)
|
|
|
|
# Track provenance (use mock)
|
|
provenance = Mock()
|
|
provenance.record_step = Mock()
|
|
provenance.get_steps = Mock(return_value=[
|
|
{'step': 'load_animal_model_phenotypes', 'row_count': 2}
|
|
])
|
|
|
|
load_to_duckdb(df, store, provenance, description="Test animal model data")
|
|
|
|
# Check provenance was recorded
|
|
steps = provenance.get_steps()
|
|
assert len(steps) > 0
|
|
load_step = next((s for s in steps if s['step'] == 'load_animal_model_phenotypes'), None)
|
|
assert load_step is not None
|
|
assert 'row_count' in load_step
|
|
assert load_step['row_count'] == 2
|
|
|
|
store.close()
|
|
|
|
|
|
def test_empty_phenotype_handling(mock_hcop_data):
|
|
"""Test handling of genes with orthologs but no phenotypes."""
|
|
gene_ids = ['ENSG00000001']
|
|
|
|
with patch('usher_pipeline.evidence.animal_models.fetch._download_gzipped') as mock_hcop, \
|
|
patch('usher_pipeline.evidence.animal_models.fetch._download_text') as mock_text, \
|
|
patch('httpx.get') as mock_http:
|
|
|
|
mock_hcop.side_effect = [
|
|
mock_hcop_data['mouse'].encode('utf-8'),
|
|
mock_hcop_data['zebrafish'].encode('utf-8'),
|
|
]
|
|
|
|
# Empty phenotype data
|
|
empty_mgi = """Marker Symbol\tMammalian Phenotype ID
|
|
"""
|
|
empty_zfin = """Gene Symbol\tAffected Structure or Process 1
|
|
"""
|
|
|
|
mock_text.side_effect = [empty_mgi, empty_zfin]
|
|
|
|
def mock_impc_response(url, **kwargs):
|
|
response = Mock()
|
|
response.raise_for_status = Mock()
|
|
response.json = Mock(return_value={'response': {'docs': []}})
|
|
return response
|
|
|
|
mock_http.side_effect = mock_impc_response
|
|
|
|
result = process_animal_model_evidence(gene_ids)
|
|
|
|
# Should have ortholog mapping but NULL sensory phenotype count
|
|
assert len(result) == 1
|
|
assert result['mouse_ortholog'][0] == 'Ush2a'
|
|
assert result['sensory_phenotype_count'][0] is None
|
|
# Score should still be calculated (but low since no phenotypes)
|
|
assert result['animal_model_score_normalized'][0] is not None
|