feat(03-05): add animal model DuckDB loader, CLI, and comprehensive tests

- load.py: DuckDB persistence with provenance tracking, ortholog confidence distribution stats
- CLI animal-models command: checkpoint-restart pattern, top scoring genes display
- 10 unit tests: ortholog confidence scoring, keyword filtering, multi-organism bonus, NULL preservation
- 4 integration tests: full pipeline, checkpoint-restart, provenance tracking, empty phenotype handling
- All tests pass (14/14): validates fetch->transform->load->CLI flow
- Fixed polars deprecations: str.join replaces str.concat, pl.len replaces pl.count
This commit is contained in:
2026-02-11 19:06:49 +08:00
parent 99bc975a2c
commit bcd3c4ffbe
4 changed files with 681 additions and 0 deletions

280
tests/test_animal_models.py Normal file
View File

@@ -0,0 +1,280 @@
"""Unit tests for animal model evidence layer."""
import io
from unittest.mock import Mock, patch, MagicMock
import polars as pl
import pytest
from usher_pipeline.evidence.animal_models import (
fetch_ortholog_mapping,
filter_sensory_phenotypes,
score_animal_evidence,
SENSORY_MP_KEYWORDS,
)
def test_ortholog_confidence_high():
    """Check that eight supporting HCOP sources give a HIGH-confidence mouse ortholog.

    ``_download_gzipped`` is patched with a two-item ``side_effect`` (mouse
    table first, then a header-only zebrafish table), the same arrangement
    the other ortholog tests in this module use. A single ``return_value``
    would hand the mouse-format table to every download, including the
    zebrafish one.
    """
    # One human gene mapped to one mouse gene, asserted by 8 databases.
    hcop_mouse = (
        "human_entrez_gene\thuman_ensembl_gene\thgnc_id\thuman_name\thuman_symbol\thuman_chr\thuman_assert_ids\t"
        "mouse_entrez_gene\tmouse_ensembl_gene\tmgi_id\tmouse_name\tmouse_symbol\tmouse_chr\tmouse_assert_ids\tsupport\n"
        "123\tENSG00000001\tHGNC:1\tGene 1\tGENE1\t1\t\t456\tENSMUSG001\tMGI:1\tGene1\tGene1\t1\t\tdb1,db2,db3,db4,db5,db6,db7,db8"
    )
    # Header only: no zebrafish orthologs are needed for this check.
    hcop_zebrafish = (
        "human_entrez_gene\thuman_ensembl_gene\thgnc_id\thuman_name\thuman_symbol\thuman_chr\thuman_assert_ids\t"
        "zebrafish_entrez_gene\tzebrafish_ensembl_gene\tzfin_id\tzebrafish_name\tzebrafish_symbol\tzebrafish_chr\tzebrafish_assert_ids\tsupport\n"
    )

    with patch('usher_pipeline.evidence.animal_models.fetch._download_gzipped') as mock_download:
        # Return mouse data first, then zebrafish data.
        mock_download.side_effect = [
            hcop_mouse.encode('utf-8'),
            hcop_zebrafish.encode('utf-8'),
        ]
        result = fetch_ortholog_mapping(['ENSG00000001'])

    assert len(result) == 1
    assert result['mouse_ortholog_confidence'][0] == 'HIGH'
def test_ortholog_confidence_low():
    """Check that two supporting HCOP sources give a LOW-confidence mouse ortholog."""
    # Mouse table: a single mapping asserted by only two databases.
    mouse_table = (
        "human_entrez_gene\thuman_ensembl_gene\thgnc_id\thuman_name\thuman_symbol\thuman_chr\thuman_assert_ids\t"
        "mouse_entrez_gene\tmouse_ensembl_gene\tmgi_id\tmouse_name\tmouse_symbol\tmouse_chr\tmouse_assert_ids\tsupport\n"
        "123\tENSG00000001\tHGNC:1\tGene 1\tGENE1\t1\t\t456\tENSMUSG001\tMGI:1\tGene1\tGene1\t1\t\tdb1,db2"
    )
    # Zebrafish table: header row only.
    zebrafish_table = (
        "human_entrez_gene\thuman_ensembl_gene\thgnc_id\thuman_name\thuman_symbol\thuman_chr\thuman_assert_ids\t"
        "zebrafish_entrez_gene\tzebrafish_ensembl_gene\tzfin_id\tzebrafish_name\tzebrafish_symbol\tzebrafish_chr\tzebrafish_assert_ids\tsupport\n"
    )
    # Successive downloads are served in order: mouse first, zebrafish second.
    payloads = [table.encode('utf-8') for table in (mouse_table, zebrafish_table)]

    with patch(
        'usher_pipeline.evidence.animal_models.fetch._download_gzipped',
        side_effect=payloads,
    ):
        orthologs = fetch_ortholog_mapping(['ENSG00000001'])

    assert len(orthologs) == 1
    assert orthologs['mouse_ortholog_confidence'][0] == 'LOW'
def test_one_to_many_best_selected():
    """Check that a one-to-many mapping collapses to the best-supported ortholog.

    The human gene has two mouse candidates: Gene1a (2 sources) and
    Gene1b (8 sources). Exactly one row is expected back, naming Gene1b
    with HIGH confidence.
    """
    mouse_header = (
        "human_entrez_gene\thuman_ensembl_gene\thgnc_id\thuman_name\thuman_symbol\thuman_chr\thuman_assert_ids\t"
        "mouse_entrez_gene\tmouse_ensembl_gene\tmgi_id\tmouse_name\tmouse_symbol\tmouse_chr\tmouse_assert_ids\tsupport\n"
    )
    weak_candidate = (
        "123\tENSG00000001\tHGNC:1\tGene 1\tGENE1\t1\t\t456\tENSMUSG001\tMGI:1\tGene1a\tGene1a\t1\t\tdb1,db2\n"
    )
    strong_candidate = (
        "123\tENSG00000001\tHGNC:1\tGene 1\tGENE1\t1\t\t789\tENSMUSG002\tMGI:2\tGene1b\tGene1b\t2\t\tdb1,db2,db3,db4,db5,db6,db7,db8"
    )
    mouse_table = mouse_header + weak_candidate + strong_candidate
    # Zebrafish table: header row only.
    zebrafish_table = (
        "human_entrez_gene\thuman_ensembl_gene\thgnc_id\thuman_name\thuman_symbol\thuman_chr\thuman_assert_ids\t"
        "zebrafish_entrez_gene\tzebrafish_ensembl_gene\tzfin_id\tzebrafish_name\tzebrafish_symbol\tzebrafish_chr\tzebrafish_assert_ids\tsupport\n"
    )

    with patch(
        'usher_pipeline.evidence.animal_models.fetch._download_gzipped',
        side_effect=[mouse_table.encode('utf-8'), zebrafish_table.encode('utf-8')],
    ):
        orthologs = fetch_ortholog_mapping(['ENSG00000001'])

    # The 8-source candidate must win over the 2-source one.
    assert len(orthologs) == 1
    assert orthologs['mouse_ortholog'][0] == 'Gene1b'
    assert orthologs['mouse_ortholog_confidence'][0] == 'HIGH'
def test_sensory_keyword_match():
    """Check that rows whose term name matches SENSORY_MP_KEYWORDS survive filtering."""
    genes = ['Gene1', 'Gene1', 'Gene2']
    term_ids = ['MP:0001', 'MP:0002', 'MP:0003']
    term_names = ['hearing loss', 'abnormal cochlea morphology', 'irrelevant phenotype']
    phenotypes = pl.DataFrame({
        'mouse_gene': genes,
        'mp_term_id': term_ids,
        'mp_term_name': term_names,
    })

    result = filter_sensory_phenotypes(phenotypes, SENSORY_MP_KEYWORDS, 'mp_term_name')

    # Only the two sensory rows are expected; the third is unrelated.
    assert len(result) == 2
    for expected_term in ('hearing loss', 'abnormal cochlea morphology'):
        assert expected_term in result['mp_term_name'].to_list()
def test_non_sensory_filtered():
    """Check that a frame with no sensory term names is filtered down to nothing."""
    unrelated = {
        'mouse_gene': ['Gene1', 'Gene2'],
        'mp_term_id': ['MP:0001', 'MP:0002'],
        'mp_term_name': ['increased body weight', 'abnormal coat color'],
    }
    phenotypes = pl.DataFrame(unrelated)

    result = filter_sensory_phenotypes(phenotypes, SENSORY_MP_KEYWORDS, 'mp_term_name')

    # Neither row should be retained.
    assert len(result) == 0
def test_score_with_confidence_weighting():
    """Check that a HIGH-confidence ortholog outscores an otherwise equal LOW one."""
    # Everything except gene, ortholog name and confidence is held constant.
    shared_columns = {
        'zebrafish_ortholog': [None],
        'zebrafish_ortholog_confidence': [None],
        'has_mouse_phenotype': [True],
        'has_zebrafish_phenotype': [False],
        'has_impc_phenotype': [False],
        'sensory_phenotype_count': [5],
    }
    high_conf = pl.DataFrame({
        'gene_id': ['ENSG00000001'],
        'mouse_ortholog': ['Gene1'],
        'mouse_ortholog_confidence': ['HIGH'],
        **shared_columns,
    })
    low_conf = pl.DataFrame({
        'gene_id': ['ENSG00000002'],
        'mouse_ortholog': ['Gene2'],
        'mouse_ortholog_confidence': ['LOW'],
        **shared_columns,
    })

    high_score = score_animal_evidence(high_conf)['animal_model_score_normalized'][0]
    low_score = score_animal_evidence(low_conf)['animal_model_score_normalized'][0]

    # NOTE(review): the original author cites weights 0.4 * 1.0 vs 0.4 * 0.4;
    # the scorer is not visible here, so only the ordering is checked.
    assert high_score > low_score
def test_score_null_no_ortholog():
    """Check that a gene with no ortholog at all scores NULL rather than zero."""
    no_ortholog_row = {
        'gene_id': ['ENSG00000001'],
        'mouse_ortholog': [None],
        'mouse_ortholog_confidence': [None],
        'zebrafish_ortholog': [None],
        'zebrafish_ortholog_confidence': [None],
        'has_mouse_phenotype': [False],
        'has_zebrafish_phenotype': [False],
        'has_impc_phenotype': [False],
        'sensory_phenotype_count': [None],
    }

    scored = score_animal_evidence(pl.DataFrame(no_ortholog_row))
    score = scored['animal_model_score_normalized'][0]

    # Missing evidence must stay missing (None), not be coerced to 0.0.
    assert score is None
def test_multi_organism_bonus():
    """Check that mouse plus zebrafish phenotype evidence beats mouse evidence alone."""

    def make_row(gene_id, mouse_symbol, fish_symbol, fish_confidence, has_fish_phenotype):
        """Build a one-row frame; only the zebrafish-related fields vary between cases."""
        return pl.DataFrame({
            'gene_id': [gene_id],
            'mouse_ortholog': [mouse_symbol],
            'mouse_ortholog_confidence': ['HIGH'],
            'zebrafish_ortholog': [fish_symbol],
            'zebrafish_ortholog_confidence': [fish_confidence],
            'has_mouse_phenotype': [True],
            'has_zebrafish_phenotype': [has_fish_phenotype],
            'has_impc_phenotype': [False],
            'sensory_phenotype_count': [3],
        })

    # Mouse-only evidence versus evidence in both organisms.
    mouse_only = make_row('ENSG00000001', 'Gene1', None, None, False)
    both = make_row('ENSG00000002', 'Gene2', 'gene2', 'HIGH', True)

    mouse_score = score_animal_evidence(mouse_only)['animal_model_score_normalized'][0]
    both_score = score_animal_evidence(both)['animal_model_score_normalized'][0]

    # NOTE(review): the original author cites 0.4 + 0.3 vs 0.4; the scorer is
    # not visible here, so only the ordering is checked.
    assert both_score > mouse_score
def test_phenotype_count_scaling():
    """Check that more sensory phenotypes raise the score, but less than proportionally."""

    def make_row(gene_id, mouse_symbol, phenotype_count):
        """Build a one-row frame that differs only in identity and phenotype count."""
        return pl.DataFrame({
            'gene_id': [gene_id],
            'mouse_ortholog': [mouse_symbol],
            'mouse_ortholog_confidence': ['HIGH'],
            'zebrafish_ortholog': [None],
            'zebrafish_ortholog_confidence': [None],
            'has_mouse_phenotype': [True],
            'has_zebrafish_phenotype': [False],
            'has_impc_phenotype': [False],
            'sensory_phenotype_count': [phenotype_count],
        })

    few = make_row('ENSG00000001', 'Gene1', 1)
    many = make_row('ENSG00000002', 'Gene2', 10)

    few_score = score_animal_evidence(few)['animal_model_score_normalized'][0]
    many_score = score_animal_evidence(many)['animal_model_score_normalized'][0]

    # Ten phenotypes must beat one...
    assert many_score > few_score
    # ...but by less than a factor of ten (diminishing returns).
    # NOTE(review): the original author describes log2 scaling
    # (log2(2) / log2(11) = 0.29 vs 1.0); confirm against the scorer.
    assert many_score < few_score * 10
def test_impc_integration():
    """Check that an IMPC phenotype flag raises the score of an otherwise equal gene."""

    def make_row(gene_id, mouse_symbol, has_impc):
        """Build a one-row frame that differs only in identity and the IMPC flag."""
        return pl.DataFrame({
            'gene_id': [gene_id],
            'mouse_ortholog': [mouse_symbol],
            'mouse_ortholog_confidence': ['HIGH'],
            'zebrafish_ortholog': [None],
            'zebrafish_ortholog_confidence': [None],
            'has_mouse_phenotype': [True],
            'has_zebrafish_phenotype': [False],
            'has_impc_phenotype': [has_impc],
            'sensory_phenotype_count': [3],
        })

    no_impc = make_row('ENSG00000001', 'Gene1', False)
    with_impc = make_row('ENSG00000002', 'Gene2', True)

    no_impc_score = score_animal_evidence(no_impc)['animal_model_score_normalized'][0]
    with_impc_score = score_animal_evidence(with_impc)['animal_model_score_normalized'][0]

    # NOTE(review): the original author cites a +0.3 contribution; the scorer
    # is not visible here, so only the ordering is checked.
    assert with_impc_score > no_impc_score

View File

@@ -0,0 +1,269 @@
"""Integration tests for animal model evidence layer."""
import tempfile
from pathlib import Path
from unittest.mock import patch, Mock
import polars as pl
import pytest
from usher_pipeline.evidence.animal_models import (
process_animal_model_evidence,
load_to_duckdb,
)
from usher_pipeline.persistence import PipelineStore, ProvenanceTracker
@pytest.fixture
def mock_hcop_data():
    """Provide HCOP-style ortholog tables as TSV text, keyed by organism.

    Returns a dict with ``'mouse'`` (USH2A with 8 sources, MYO7A with 5)
    and ``'zebrafish'`` (USH2A with 6 sources).
    """
    human_columns = (
        "human_entrez_gene\thuman_ensembl_gene\thgnc_id\thuman_name\thuman_symbol\thuman_chr\thuman_assert_ids\t"
    )
    mouse_table = (
        human_columns
        + "mouse_entrez_gene\tmouse_ensembl_gene\tmgi_id\tmouse_name\tmouse_symbol\tmouse_chr\tmouse_assert_ids\tsupport\n"
        + "123\tENSG00000001\tHGNC:1\tUSH2A\tUSH2A\t1\t\t456\tENSMUSG001\tMGI:1\tUsh2a\tUsh2a\t1\t\tdb1,db2,db3,db4,db5,db6,db7,db8\n"
        + "456\tENSG00000002\tHGNC:2\tMYO7A\tMYO7A\t11\t\t789\tENSMUSG002\tMGI:2\tMyo7a\tMyo7a\t7\t\tdb1,db2,db3,db4,db5"
    )
    zebrafish_table = (
        human_columns
        + "zebrafish_entrez_gene\tzebrafish_ensembl_gene\tzfin_id\tzebrafish_name\tzebrafish_symbol\tzebrafish_chr\tzebrafish_assert_ids\tsupport\n"
        + "123\tENSG00000001\tHGNC:1\tUSH2A\tUSH2A\t1\t\t111\tENSDART001\tZDB-GENE-1\tush2a\tush2a\t1\t\tdb1,db2,db3,db4,db5,db6"
    )
    return dict(mouse=mouse_table, zebrafish=zebrafish_table)
@pytest.fixture
def mock_phenotype_data():
    """Provide canned MGI, ZFIN and IMPC phenotype payloads.

    Returns a dict with ``'mgi'`` and ``'zfin'`` as TSV text and ``'impc'``
    as a mapping from mouse gene symbol to a nested ``response``/``docs``
    dictionary.
    """

    def impc_payload(symbol, term_id, term_name, p_value):
        """Wrap a single phenotype hit in the nested response/docs layout."""
        hit = {
            'marker_symbol': symbol,
            'mp_term_id': term_id,
            'mp_term_name': term_name,
            'p_value': p_value,
        }
        return {'response': {'docs': [hit]}}

    mgi_lines = [
        "Marker Symbol\tMammalian Phenotype ID",
        "Ush2a\tMP:0001967",
        "Ush2a\tMP:0005377",
        "Myo7a\tMP:0001968",
    ]
    zfin_lines = [
        "Gene Symbol\tAffected Structure or Process 1",
        "ush2a\tabnormal ear morphology",
        "ush2a\tabnormal retina morphology",
    ]
    impc_by_symbol = {
        'Ush2a': impc_payload('Ush2a', 'MP:0001967', 'deafness', 0.001),
        'Myo7a': impc_payload('Myo7a', 'MP:0001968', 'abnormal cochlea morphology', 0.0005),
    }
    return {
        'mgi': "\n".join(mgi_lines),
        'zfin': "\n".join(zfin_lines),
        'impc': impc_by_symbol,
    }
def test_full_pipeline(mock_hcop_data, mock_phenotype_data):
    """Run the evidence pipeline end to end against mocked data sources.

    The HCOP downloads, the MGI/ZFIN text downloads and ``httpx.get`` are
    all patched. The result is then checked for ortholog names, confidence
    levels and a positive normalized score for USH2A.
    """
    impc_payloads = mock_phenotype_data['impc']

    def fake_impc_get(url, **kwargs):
        """Stand in for ``httpx.get``: pick the payload by the symbol found in ``params['q']``."""
        query = kwargs.get('params', {}).get('q', '')
        # Default to an empty result set for any unrecognised gene.
        payload = {'response': {'docs': []}}
        for symbol in ('Ush2a', 'Myo7a'):
            if symbol in query:
                payload = impc_payloads[symbol]
                break
        reply = Mock()
        reply.raise_for_status = Mock()
        reply.json = Mock(return_value=payload)
        return reply

    # Downloads are served in call order: mouse then zebrafish, MGI then ZFIN.
    hcop_downloads = [
        mock_hcop_data['mouse'].encode('utf-8'),
        mock_hcop_data['zebrafish'].encode('utf-8'),
    ]
    text_downloads = [mock_phenotype_data['mgi'], mock_phenotype_data['zfin']]

    with patch('usher_pipeline.evidence.animal_models.fetch._download_gzipped',
               side_effect=hcop_downloads), \
         patch('usher_pipeline.evidence.animal_models.fetch._download_text',
               side_effect=text_downloads), \
         patch('httpx.get', side_effect=fake_impc_get):
        result = process_animal_model_evidence(['ENSG00000001', 'ENSG00000002'])

    assert len(result) == 2

    # USH2A (ENSG00000001): mouse ortholog with 8 sources, zebrafish with 6.
    ush2a = result.filter(pl.col('gene_id') == 'ENSG00000001')
    assert len(ush2a) == 1
    assert ush2a['mouse_ortholog'][0] == 'Ush2a'
    assert ush2a['mouse_ortholog_confidence'][0] == 'HIGH'
    assert ush2a['zebrafish_ortholog'][0] == 'ush2a'
    assert ush2a['zebrafish_ortholog_confidence'][0] == 'MEDIUM'
    assert ush2a['sensory_phenotype_count'][0] is not None
    ush2a_score = ush2a['animal_model_score_normalized'][0]
    assert ush2a_score is not None
    assert ush2a_score > 0

    # MYO7A (ENSG00000002): mouse ortholog with 5 sources.
    myo7a = result.filter(pl.col('gene_id') == 'ENSG00000002')
    assert len(myo7a) == 1
    assert myo7a['mouse_ortholog'][0] == 'Myo7a'
    assert myo7a['mouse_ortholog_confidence'][0] == 'MEDIUM'
def test_checkpoint_restart(mock_hcop_data, mock_phenotype_data):
    """Check that a loaded result is visible as a checkpoint and can be read back.

    The pipeline runs once against mocked sources, the frame is written
    with ``load_to_duckdb``, and the store is then asked whether the
    ``animal_model_phenotypes`` checkpoint exists and returns two rows.

    The store is closed in a ``finally`` block so a failing assertion does
    not leave it open while the temporary directory is being removed.
    """
    gene_ids = ['ENSG00000001', 'ENSG00000002']

    def mock_impc_response(url, **kwargs):
        """Stand in for ``httpx.get``: always report zero IMPC hits."""
        response = Mock()
        response.raise_for_status = Mock()
        response.json = Mock(return_value={'response': {'docs': []}})
        return response

    with tempfile.TemporaryDirectory() as tmpdir:
        db_path = Path(tmpdir) / "test.duckdb"
        store = PipelineStore(db_path)
        try:
            # Initial load
            with patch('usher_pipeline.evidence.animal_models.fetch._download_gzipped') as mock_hcop, \
                 patch('usher_pipeline.evidence.animal_models.fetch._download_text') as mock_text, \
                 patch('httpx.get') as mock_http:
                mock_hcop.side_effect = [
                    mock_hcop_data['mouse'].encode('utf-8'),
                    mock_hcop_data['zebrafish'].encode('utf-8'),
                ]
                mock_text.side_effect = [
                    mock_phenotype_data['mgi'],
                    mock_phenotype_data['zfin'],
                ]
                mock_http.side_effect = mock_impc_response
                df = process_animal_model_evidence(gene_ids)

            # Save to DuckDB (use mock provenance tracker)
            provenance = Mock()
            provenance.record_step = Mock()
            load_to_duckdb(df, store, provenance)

            # Check checkpoint exists
            assert store.has_checkpoint('animal_model_phenotypes')

            # Load from checkpoint
            loaded_df = store.load_dataframe('animal_model_phenotypes')
            assert loaded_df is not None
            assert len(loaded_df) == 2
        finally:
            # Always release the store, even when a step above fails.
            store.close()
def test_provenance_tracking(mock_hcop_data, mock_phenotype_data):
    """Check that loading the frame reports a step to the provenance tracker.

    The earlier version asserted on a canned ``get_steps`` return value
    configured on the mock itself, so it passed regardless of what
    ``load_to_duckdb`` did. This version inspects the call recorded on the
    mock instead.

    NOTE(review): assumes ``load_to_duckdb`` reports through
    ``provenance.record_step``, as the explicit mock setup in this module
    suggests; confirm against load.py.

    The store is closed in a ``finally`` block so a failing assertion does
    not leave it open while the temporary directory is being removed.
    """
    gene_ids = ['ENSG00000001', 'ENSG00000002']

    def mock_impc_response(url, **kwargs):
        """Stand in for ``httpx.get``: always report zero IMPC hits."""
        response = Mock()
        response.raise_for_status = Mock()
        response.json = Mock(return_value={'response': {'docs': []}})
        return response

    with tempfile.TemporaryDirectory() as tmpdir:
        db_path = Path(tmpdir) / "test.duckdb"
        store = PipelineStore(db_path)
        try:
            with patch('usher_pipeline.evidence.animal_models.fetch._download_gzipped') as mock_hcop, \
                 patch('usher_pipeline.evidence.animal_models.fetch._download_text') as mock_text, \
                 patch('httpx.get') as mock_http:
                mock_hcop.side_effect = [
                    mock_hcop_data['mouse'].encode('utf-8'),
                    mock_hcop_data['zebrafish'].encode('utf-8'),
                ]
                mock_text.side_effect = [
                    mock_phenotype_data['mgi'],
                    mock_phenotype_data['zfin'],
                ]
                mock_http.side_effect = mock_impc_response
                df = process_animal_model_evidence(gene_ids)

            # Track provenance (use mock)
            provenance = Mock()
            provenance.record_step = Mock()
            load_to_duckdb(df, store, provenance, description="Test animal model data")

            # Check provenance was recorded by the loader itself.
            assert provenance.record_step.called
        finally:
            # Always release the store, even when a step above fails.
            store.close()
def test_empty_phenotype_handling(mock_hcop_data):
    """Check the result for a gene that has an ortholog but no phenotype records.

    MGI and ZFIN return header-only tables and IMPC returns no docs. The
    ortholog must still be reported, the sensory phenotype count must be
    NULL, and a score must still be produced.
    """
    # Header-only phenotype tables.
    empty_mgi = "Marker Symbol\tMammalian Phenotype ID\n"
    empty_zfin = "Gene Symbol\tAffected Structure or Process 1\n"

    def no_impc_hits(url, **kwargs):
        """Stand in for ``httpx.get``: always report zero IMPC hits."""
        reply = Mock()
        reply.raise_for_status = Mock()
        reply.json = Mock(return_value={'response': {'docs': []}})
        return reply

    hcop_downloads = [
        mock_hcop_data['mouse'].encode('utf-8'),
        mock_hcop_data['zebrafish'].encode('utf-8'),
    ]

    with patch('usher_pipeline.evidence.animal_models.fetch._download_gzipped',
               side_effect=hcop_downloads), \
         patch('usher_pipeline.evidence.animal_models.fetch._download_text',
               side_effect=[empty_mgi, empty_zfin]), \
         patch('httpx.get', side_effect=no_impc_hits):
        result = process_animal_model_evidence(['ENSG00000001'])

    # Ortholog mapping is present but the phenotype count stays NULL.
    assert len(result) == 1
    assert result['mouse_ortholog'][0] == 'Ush2a'
    assert result['sensory_phenotype_count'][0] is None
    # A score is still expected even without phenotypes.
    assert result['animal_model_score_normalized'][0] is not None