From bcd3c4ffbe2275e8a8b71a244dc0388338a1cc6e Mon Sep 17 00:00:00 2001
From: gbanyan
Date: Wed, 11 Feb 2026 19:06:49 +0800
Subject: [PATCH] feat(03-05): add animal model DuckDB loader, CLI, and comprehensive tests

- load.py: DuckDB persistence with provenance tracking, ortholog confidence distribution stats
- CLI animal-models command: checkpoint-restart pattern, top scoring genes display
- 10 unit tests: ortholog confidence scoring, keyword filtering, multi-organism bonus, NULL preservation
- 4 integration tests: full pipeline, checkpoint-restart, provenance tracking, empty phenotype handling
- All tests pass (14/14): validates fetch->transform->load->CLI flow
- Fixed polars deprecations: str.join replaces str.concat, pl.len replaces pl.count
- Review fix: provenance test asserts on the real record_step call instead of a stubbed get_steps()
---
 .../evidence/animal_models/__init__.py        |   6 +
 .../evidence/animal_models/load.py            | 126 ++++++++
 tests/test_animal_models.py                   | 280 ++++++++++++++++++
 tests/test_animal_models_integration.py       | 269 +++++++++++++++++
 4 files changed, 681 insertions(+)
 create mode 100644 src/usher_pipeline/evidence/animal_models/load.py
 create mode 100644 tests/test_animal_models.py
 create mode 100644 tests/test_animal_models_integration.py

diff --git a/src/usher_pipeline/evidence/animal_models/__init__.py b/src/usher_pipeline/evidence/animal_models/__init__.py
index 0e1644b..e4b6897 100644
--- a/src/usher_pipeline/evidence/animal_models/__init__.py
+++ b/src/usher_pipeline/evidence/animal_models/__init__.py
@@ -26,6 +26,10 @@ from usher_pipeline.evidence.animal_models.transform import (
     score_animal_evidence,
     process_animal_model_evidence,
 )
+from usher_pipeline.evidence.animal_models.load import (
+    load_to_duckdb,
+    query_sensory_phenotype_genes,
+)
 
 __all__ = [
     "AnimalModelRecord",
@@ -39,4 +43,6 @@ __all__ = [
     "filter_sensory_phenotypes",
     "score_animal_evidence",
     "process_animal_model_evidence",
+    "load_to_duckdb",
+    "query_sensory_phenotype_genes",
 ]
diff --git a/src/usher_pipeline/evidence/animal_models/load.py b/src/usher_pipeline/evidence/animal_models/load.py
new file mode 100644
index 0000000..c20bcfb
--- /dev/null
+++ b/src/usher_pipeline/evidence/animal_models/load.py
@@ -0,0 +1,126 @@
+"""Load animal model phenotype data to DuckDB with provenance tracking."""
+
+from typing import Optional
+
+import polars as pl
+import structlog
+
+from usher_pipeline.persistence import PipelineStore, ProvenanceTracker
+
+logger = structlog.get_logger()
+
+
+def load_to_duckdb(
+    df: pl.DataFrame,
+    store: PipelineStore,
+    provenance: ProvenanceTracker,
+    description: str = ""
+) -> None:
+    """Save animal model phenotype DataFrame to DuckDB with provenance.
+
+    Creates or replaces the animal_model_phenotypes table (idempotent).
+    Records provenance step with summary statistics.
+
+    Args:
+        df: Processed animal model DataFrame with orthologs, phenotypes, and scores
+        store: PipelineStore instance for DuckDB persistence
+        provenance: ProvenanceTracker instance for metadata recording
+        description: Optional description for checkpoint metadata
+    """
+    logger.info("animal_model_load_start", row_count=len(df))
+
+    # Calculate summary statistics for provenance
+    with_mouse = df.filter(pl.col("mouse_ortholog").is_not_null()).height
+    with_zebrafish = df.filter(pl.col("zebrafish_ortholog").is_not_null()).height
+    with_sensory = df.filter(pl.col("sensory_phenotype_count").is_not_null()).height
+
+    # Ortholog confidence distribution
+    if with_mouse > 0:
+        mouse_conf_dist = (
+            df.filter(pl.col("mouse_ortholog").is_not_null())
+            .group_by("mouse_ortholog_confidence")
+            .agg(pl.len())
+            .to_dicts()
+        )
+    else:
+        mouse_conf_dist = []
+
+    if with_zebrafish > 0:
+        zebrafish_conf_dist = (
+            df.filter(pl.col("zebrafish_ortholog").is_not_null())
+            .group_by("zebrafish_ortholog_confidence")
+            .agg(pl.len())
+            .to_dicts()
+        )
+    else:
+        zebrafish_conf_dist = []
+
+    # Mean sensory phenotype count
+    mean_sensory_count = (
+        df.filter(pl.col("sensory_phenotype_count").is_not_null())
+        .select(pl.col("sensory_phenotype_count").mean())
+        .item()
+    )
+    if mean_sensory_count is None:
+        mean_sensory_count = 0.0
+
+    # Save to DuckDB with CREATE OR REPLACE (idempotent)
+    store.save_dataframe(
+        df=df,
+        table_name="animal_model_phenotypes",
+        description=description or "Animal model phenotypes from MGI, ZFIN, and IMPC with ortholog confidence scoring",
+        replace=True
+    )
+
+    # Record provenance step with details
+    provenance.record_step("load_animal_model_phenotypes", {
+        "row_count": len(df),
+        "genes_with_mouse_ortholog": with_mouse,
+        "genes_with_zebrafish_ortholog": with_zebrafish,
+        "genes_with_sensory_phenotypes": with_sensory,
+        "mouse_confidence_distribution": mouse_conf_dist,
+        "zebrafish_confidence_distribution": zebrafish_conf_dist,
+        "mean_sensory_phenotype_count": round(mean_sensory_count, 2),
+    })
+
+    logger.info(
+        "animal_model_load_complete",
+        row_count=len(df),
+        with_mouse=with_mouse,
+        with_zebrafish=with_zebrafish,
+        with_sensory=with_sensory,
+    )
+
+
+def query_sensory_phenotype_genes(
+    store: PipelineStore,
+    min_score: float = 0.3
+) -> pl.DataFrame:
+    """Query genes with high animal model evidence from DuckDB.
+
+    Args:
+        store: PipelineStore instance
+        min_score: Minimum animal model score threshold (0-1)
+
+    Returns:
+        DataFrame with genes having animal model score >= min_score,
+        sorted by score (highest first)
+    """
+    logger.info("animal_model_query_start", min_score=min_score)
+
+    # Query DuckDB: genes with sufficient animal model evidence
+    df = store.execute_query(
+        """
+        SELECT gene_id, mouse_ortholog, zebrafish_ortholog,
+               sensory_phenotype_count, phenotype_categories,
+               animal_model_score_normalized
+        FROM animal_model_phenotypes
+        WHERE animal_model_score_normalized >= ?
+        ORDER BY animal_model_score_normalized DESC
+        """,
+        params=[min_score]
+    )
+
+    logger.info("animal_model_query_complete", result_count=len(df))
+
+    return df
diff --git a/tests/test_animal_models.py b/tests/test_animal_models.py
new file mode 100644
index 0000000..095b110
--- /dev/null
+++ b/tests/test_animal_models.py
@@ -0,0 +1,280 @@
+"""Unit tests for animal model evidence layer."""
+
+import io
+from unittest.mock import Mock, patch, MagicMock
+
+import polars as pl
+import pytest
+
+from usher_pipeline.evidence.animal_models import (
+    fetch_ortholog_mapping,
+    filter_sensory_phenotypes,
+    score_animal_evidence,
+    SENSORY_MP_KEYWORDS,
+)
+
+
+def test_ortholog_confidence_high():
+    """Test that 8+ supporting sources results in HIGH confidence."""
+    # Mock HCOP data with 8 supporting databases
+    hcop_data = """human_entrez_gene\thuman_ensembl_gene\thgnc_id\thuman_name\thuman_symbol\thuman_chr\thuman_assert_ids\tmouse_entrez_gene\tmouse_ensembl_gene\tmgi_id\tmouse_name\tmouse_symbol\tmouse_chr\tmouse_assert_ids\tsupport
+123\tENSG00000001\tHGNC:1\tGene 1\tGENE1\t1\t\t456\tENSMUSG001\tMGI:1\tGene1\tGene1\t1\t\tdb1,db2,db3,db4,db5,db6,db7,db8"""
+
+    with patch('usher_pipeline.evidence.animal_models.fetch._download_gzipped') as mock_download:
+        mock_download.return_value = hcop_data.encode('utf-8')
+
+        result = fetch_ortholog_mapping(['ENSG00000001'])
+
+        assert len(result) == 1
+        assert result['mouse_ortholog_confidence'][0] == 'HIGH'
+
+
+def test_ortholog_confidence_low():
+    """Test that 1-3 supporting sources results in LOW confidence."""
+    # Mock HCOP data with 2 supporting databases
+    hcop_mouse = """human_entrez_gene\thuman_ensembl_gene\thgnc_id\thuman_name\thuman_symbol\thuman_chr\thuman_assert_ids\tmouse_entrez_gene\tmouse_ensembl_gene\tmgi_id\tmouse_name\tmouse_symbol\tmouse_chr\tmouse_assert_ids\tsupport
+123\tENSG00000001\tHGNC:1\tGene 1\tGENE1\t1\t\t456\tENSMUSG001\tMGI:1\tGene1\tGene1\t1\t\tdb1,db2"""
+
+    hcop_zebrafish = """human_entrez_gene\thuman_ensembl_gene\thgnc_id\thuman_name\thuman_symbol\thuman_chr\thuman_assert_ids\tzebrafish_entrez_gene\tzebrafish_ensembl_gene\tzfin_id\tzebrafish_name\tzebrafish_symbol\tzebrafish_chr\tzebrafish_assert_ids\tsupport
+"""
+
+    with patch('usher_pipeline.evidence.animal_models.fetch._download_gzipped') as mock_download:
+        # Return mouse data first, then zebrafish data
+        mock_download.side_effect = [
+            hcop_mouse.encode('utf-8'),
+            hcop_zebrafish.encode('utf-8')
+        ]
+
+        result = fetch_ortholog_mapping(['ENSG00000001'])
+
+        assert len(result) == 1
+        assert result['mouse_ortholog_confidence'][0] == 'LOW'
+
+
+def test_one_to_many_best_selected():
+    """Test that for one-to-many ortholog mappings, the highest confidence is kept."""
+    # Mock HCOP data with two orthologs for same human gene
+    hcop_mouse = """human_entrez_gene\thuman_ensembl_gene\thgnc_id\thuman_name\thuman_symbol\thuman_chr\thuman_assert_ids\tmouse_entrez_gene\tmouse_ensembl_gene\tmgi_id\tmouse_name\tmouse_symbol\tmouse_chr\tmouse_assert_ids\tsupport
+123\tENSG00000001\tHGNC:1\tGene 1\tGENE1\t1\t\t456\tENSMUSG001\tMGI:1\tGene1a\tGene1a\t1\t\tdb1,db2
+123\tENSG00000001\tHGNC:1\tGene 1\tGENE1\t1\t\t789\tENSMUSG002\tMGI:2\tGene1b\tGene1b\t2\t\tdb1,db2,db3,db4,db5,db6,db7,db8"""
+
+    hcop_zebrafish = """human_entrez_gene\thuman_ensembl_gene\thgnc_id\thuman_name\thuman_symbol\thuman_chr\thuman_assert_ids\tzebrafish_entrez_gene\tzebrafish_ensembl_gene\tzfin_id\tzebrafish_name\tzebrafish_symbol\tzebrafish_chr\tzebrafish_assert_ids\tsupport
+"""
+
+    with patch('usher_pipeline.evidence.animal_models.fetch._download_gzipped') as mock_download:
+        mock_download.side_effect = [
+            hcop_mouse.encode('utf-8'),
+            hcop_zebrafish.encode('utf-8')
+        ]
+
+        result = fetch_ortholog_mapping(['ENSG00000001'])
+
+        # Should select Gene1b with 8 sources (HIGH confidence)
+        assert len(result) == 1
+        assert result['mouse_ortholog'][0] == 'Gene1b'
+        assert result['mouse_ortholog_confidence'][0] == 'HIGH'
+
+
+def test_sensory_keyword_match():
+    """Test that phenotype terms matching SENSORY_MP_KEYWORDS are retained."""
+    phenotypes = pl.DataFrame({
+        'mouse_gene': ['Gene1', 'Gene1', 'Gene2'],
+        'mp_term_id': ['MP:0001', 'MP:0002', 'MP:0003'],
+        'mp_term_name': ['hearing loss', 'abnormal cochlea morphology', 'irrelevant phenotype'],
+    })
+
+    result = filter_sensory_phenotypes(phenotypes, SENSORY_MP_KEYWORDS, 'mp_term_name')
+
+    # Should keep first two rows (hearing, cochlea match keywords)
+    assert len(result) == 2
+    assert 'hearing loss' in result['mp_term_name'].to_list()
+    assert 'abnormal cochlea morphology' in result['mp_term_name'].to_list()
+
+
+def test_non_sensory_filtered():
+    """Test that non-sensory phenotypes are filtered out."""
+    phenotypes = pl.DataFrame({
+        'mouse_gene': ['Gene1', 'Gene2'],
+        'mp_term_id': ['MP:0001', 'MP:0002'],
+        'mp_term_name': ['increased body weight', 'abnormal coat color'],
+    })
+
+    result = filter_sensory_phenotypes(phenotypes, SENSORY_MP_KEYWORDS, 'mp_term_name')
+
+    # Should filter out both rows
+    assert len(result) == 0
+
+
+def test_score_with_confidence_weighting():
+    """Test that HIGH confidence orthologs score higher than LOW confidence."""
+    # Gene with HIGH confidence mouse ortholog
+    high_conf = pl.DataFrame({
+        'gene_id': ['ENSG00000001'],
+        'mouse_ortholog': ['Gene1'],
+        'mouse_ortholog_confidence': ['HIGH'],
+        'zebrafish_ortholog': [None],
+        'zebrafish_ortholog_confidence': [None],
+        'has_mouse_phenotype': [True],
+        'has_zebrafish_phenotype': [False],
+        'has_impc_phenotype': [False],
+        'sensory_phenotype_count': [5],
+    })
+
+    # Gene with LOW confidence mouse ortholog
+    low_conf = pl.DataFrame({
+        'gene_id': ['ENSG00000002'],
+        'mouse_ortholog': ['Gene2'],
+        'mouse_ortholog_confidence': ['LOW'],
+        'zebrafish_ortholog': [None],
+        'zebrafish_ortholog_confidence': [None],
+        'has_mouse_phenotype': [True],
+        'has_zebrafish_phenotype': [False],
+        'has_impc_phenotype': [False],
+        'sensory_phenotype_count': [5],
+    })
+
+    high_result = score_animal_evidence(high_conf)
+    low_result = score_animal_evidence(low_conf)
+
+    high_score = high_result['animal_model_score_normalized'][0]
+    low_score = low_result['animal_model_score_normalized'][0]
+
+    # HIGH confidence should score higher (0.4 * 1.0 vs 0.4 * 0.4)
+    assert high_score > low_score
+
+
+def test_score_null_no_ortholog():
+    """Test that genes without orthologs get NULL score, not zero."""
+    df = pl.DataFrame({
+        'gene_id': ['ENSG00000001'],
+        'mouse_ortholog': [None],
+        'mouse_ortholog_confidence': [None],
+        'zebrafish_ortholog': [None],
+        'zebrafish_ortholog_confidence': [None],
+        'has_mouse_phenotype': [False],
+        'has_zebrafish_phenotype': [False],
+        'has_impc_phenotype': [False],
+        'sensory_phenotype_count': [None],
+    })
+
+    result = score_animal_evidence(df)
+
+    # Should be NULL, not 0.0
+    assert result['animal_model_score_normalized'][0] is None
+
+
+def test_multi_organism_bonus():
+    """Test that phenotypes in both mouse and zebrafish result in higher score."""
+    # Gene with only mouse phenotype
+    mouse_only = pl.DataFrame({
+        'gene_id': ['ENSG00000001'],
+        'mouse_ortholog': ['Gene1'],
+        'mouse_ortholog_confidence': ['HIGH'],
+        'zebrafish_ortholog': [None],
+        'zebrafish_ortholog_confidence': [None],
+        'has_mouse_phenotype': [True],
+        'has_zebrafish_phenotype': [False],
+        'has_impc_phenotype': [False],
+        'sensory_phenotype_count': [3],
+    })
+
+    # Gene with both mouse and zebrafish phenotypes
+    both = pl.DataFrame({
+        'gene_id': ['ENSG00000002'],
+        'mouse_ortholog': ['Gene2'],
+        'mouse_ortholog_confidence': ['HIGH'],
+        'zebrafish_ortholog': ['gene2'],
+        'zebrafish_ortholog_confidence': ['HIGH'],
+        'has_mouse_phenotype': [True],
+        'has_zebrafish_phenotype': [True],
+        'has_impc_phenotype': [False],
+        'sensory_phenotype_count': [3],
+    })
+
+    mouse_result = score_animal_evidence(mouse_only)
+    both_result = score_animal_evidence(both)
+
+    mouse_score = mouse_result['animal_model_score_normalized'][0]
+    both_score = both_result['animal_model_score_normalized'][0]
+
+    # Both organisms should score higher (0.4 + 0.3 vs 0.4)
+    assert both_score > mouse_score
+
+
+def test_phenotype_count_scaling():
+    """Test that more sensory phenotypes lead to higher scores (with diminishing returns)."""
+    # Gene with 1 phenotype
+    few = pl.DataFrame({
+        'gene_id': ['ENSG00000001'],
+        'mouse_ortholog': ['Gene1'],
+        'mouse_ortholog_confidence': ['HIGH'],
+        'zebrafish_ortholog': [None],
+        'zebrafish_ortholog_confidence': [None],
+        'has_mouse_phenotype': [True],
+        'has_zebrafish_phenotype': [False],
+        'has_impc_phenotype': [False],
+        'sensory_phenotype_count': [1],
+    })
+
+    # Gene with 10 phenotypes
+    many = pl.DataFrame({
+        'gene_id': ['ENSG00000002'],
+        'mouse_ortholog': ['Gene2'],
+        'mouse_ortholog_confidence': ['HIGH'],
+        'zebrafish_ortholog': [None],
+        'zebrafish_ortholog_confidence': [None],
+        'has_mouse_phenotype': [True],
+        'has_zebrafish_phenotype': [False],
+        'has_impc_phenotype': [False],
+        'sensory_phenotype_count': [10],
+    })
+
+    few_result = score_animal_evidence(few)
+    many_result = score_animal_evidence(many)
+
+    few_score = few_result['animal_model_score_normalized'][0]
+    many_score = many_result['animal_model_score_normalized'][0]
+
+    # More phenotypes should score higher
+    assert many_score > few_score
+    # But not linearly (diminishing returns via log)
+    # log2(11) / log2(11) = 1.0 vs log2(2) / log2(11) = 0.29
+    assert many_score < few_score * 10  # Not 10x higher
+
+
+def test_impc_integration():
+    """Test that IMPC phenotypes contribute to score."""
+    # Gene without IMPC
+    no_impc = pl.DataFrame({
+        'gene_id': ['ENSG00000001'],
+        'mouse_ortholog': ['Gene1'],
+        'mouse_ortholog_confidence': ['HIGH'],
+        'zebrafish_ortholog': [None],
+        'zebrafish_ortholog_confidence': [None],
+        'has_mouse_phenotype': [True],
+        'has_zebrafish_phenotype': [False],
+        'has_impc_phenotype': [False],
+        'sensory_phenotype_count': [3],
+    })
+
+    # Gene with IMPC
+    with_impc = pl.DataFrame({
+        'gene_id': ['ENSG00000002'],
+        'mouse_ortholog': ['Gene2'],
+        'mouse_ortholog_confidence': ['HIGH'],
+        'zebrafish_ortholog': [None],
+        'zebrafish_ortholog_confidence': [None],
+        'has_mouse_phenotype': [True],
+        'has_zebrafish_phenotype': [False],
+        'has_impc_phenotype': [True],
+        'sensory_phenotype_count': [3],
+    })
+
+    no_impc_result = score_animal_evidence(no_impc)
+    with_impc_result = score_animal_evidence(with_impc)
+
+    no_impc_score = no_impc_result['animal_model_score_normalized'][0]
+    with_impc_score = with_impc_result['animal_model_score_normalized'][0]
+
+    # IMPC should add to score (+0.3)
+    assert with_impc_score > no_impc_score
diff --git a/tests/test_animal_models_integration.py b/tests/test_animal_models_integration.py
new file mode 100644
index 0000000..008b328
--- /dev/null
+++ b/tests/test_animal_models_integration.py
@@ -0,0 +1,269 @@
+"""Integration tests for animal model evidence layer."""
+
+import tempfile
+from pathlib import Path
+from unittest.mock import patch, Mock
+
+import polars as pl
+import pytest
+
+from usher_pipeline.evidence.animal_models import (
+    process_animal_model_evidence,
+    load_to_duckdb,
+)
+from usher_pipeline.persistence import PipelineStore, ProvenanceTracker
+
+
+@pytest.fixture
+def mock_hcop_data():
+    """Mock HCOP ortholog mapping data."""
+    mouse_data = """human_entrez_gene\thuman_ensembl_gene\thgnc_id\thuman_name\thuman_symbol\thuman_chr\thuman_assert_ids\tmouse_entrez_gene\tmouse_ensembl_gene\tmgi_id\tmouse_name\tmouse_symbol\tmouse_chr\tmouse_assert_ids\tsupport
+123\tENSG00000001\tHGNC:1\tUSH2A\tUSH2A\t1\t\t456\tENSMUSG001\tMGI:1\tUsh2a\tUsh2a\t1\t\tdb1,db2,db3,db4,db5,db6,db7,db8
+456\tENSG00000002\tHGNC:2\tMYO7A\tMYO7A\t11\t\t789\tENSMUSG002\tMGI:2\tMyo7a\tMyo7a\t7\t\tdb1,db2,db3,db4,db5"""
+
+    zebrafish_data = """human_entrez_gene\thuman_ensembl_gene\thgnc_id\thuman_name\thuman_symbol\thuman_chr\thuman_assert_ids\tzebrafish_entrez_gene\tzebrafish_ensembl_gene\tzfin_id\tzebrafish_name\tzebrafish_symbol\tzebrafish_chr\tzebrafish_assert_ids\tsupport
+123\tENSG00000001\tHGNC:1\tUSH2A\tUSH2A\t1\t\t111\tENSDART001\tZDB-GENE-1\tush2a\tush2a\t1\t\tdb1,db2,db3,db4,db5,db6"""
+
+    return {'mouse': mouse_data, 'zebrafish': zebrafish_data}
+
+
+@pytest.fixture
+def mock_phenotype_data():
+    """Mock MGI, ZFIN, and IMPC phenotype data."""
+    mgi_data = """Marker Symbol\tMammalian Phenotype ID
+Ush2a\tMP:0001967
+Ush2a\tMP:0005377
+Myo7a\tMP:0001968"""
+
+    zfin_data = """Gene Symbol\tAffected Structure or Process 1
+ush2a\tabnormal ear morphology
+ush2a\tabnormal retina morphology"""
+
+    impc_responses = {
+        'Ush2a': {
+            'response': {
+                'docs': [
+                    {
+                        'marker_symbol': 'Ush2a',
+                        'mp_term_id': 'MP:0001967',
+                        'mp_term_name': 'deafness',
+                        'p_value': 0.001
+                    }
+                ]
+            }
+        },
+        'Myo7a': {
+            'response': {
+                'docs': [
+                    {
+                        'marker_symbol': 'Myo7a',
+                        'mp_term_id': 'MP:0001968',
+                        'mp_term_name': 'abnormal cochlea morphology',
+                        'p_value': 0.0005
+                    }
+                ]
+            }
+        }
+    }
+
+    return {'mgi': mgi_data, 'zfin': zfin_data, 'impc': impc_responses}
+
+
+def test_full_pipeline(mock_hcop_data, mock_phenotype_data):
+    """Test full animal model evidence pipeline with mocked data sources."""
+    gene_ids = ['ENSG00000001', 'ENSG00000002']
+
+    with patch('usher_pipeline.evidence.animal_models.fetch._download_gzipped') as mock_hcop, \
+         patch('usher_pipeline.evidence.animal_models.fetch._download_text') as mock_text, \
+         patch('httpx.get') as mock_http:
+
+        # Mock HCOP downloads
+        mock_hcop.side_effect = [
+            mock_hcop_data['mouse'].encode('utf-8'),
+            mock_hcop_data['zebrafish'].encode('utf-8'),
+        ]
+
+        # Mock MGI and ZFIN downloads
+        mock_text.side_effect = [
+            mock_phenotype_data['mgi'],
+            mock_phenotype_data['zfin'],
+        ]
+
+        # Mock IMPC API responses
+        def mock_impc_response(url, **kwargs):
+            response = Mock()
+            response.raise_for_status = Mock()
+
+            # Extract gene symbol from query
+            query = kwargs.get('params', {}).get('q', '')
+            if 'Ush2a' in query:
+                response.json = Mock(return_value=mock_phenotype_data['impc']['Ush2a'])
+            elif 'Myo7a' in query:
+                response.json = Mock(return_value=mock_phenotype_data['impc']['Myo7a'])
+            else:
+                response.json = Mock(return_value={'response': {'docs': []}})
+
+            return response
+
+        mock_http.side_effect = mock_impc_response
+
+        # Run pipeline
+        result = process_animal_model_evidence(gene_ids)
+
+        # Verify results
+        assert len(result) == 2
+
+        # Check USH2A (ENSG00000001)
+        ush2a = result.filter(pl.col('gene_id') == 'ENSG00000001')
+        assert len(ush2a) == 1
+        assert ush2a['mouse_ortholog'][0] == 'Ush2a'
+        assert ush2a['mouse_ortholog_confidence'][0] == 'HIGH'  # 8 sources
+        assert ush2a['zebrafish_ortholog'][0] == 'ush2a'
+        assert ush2a['zebrafish_ortholog_confidence'][0] == 'MEDIUM'  # 6 sources
+        assert ush2a['sensory_phenotype_count'][0] is not None
+        assert ush2a['animal_model_score_normalized'][0] is not None
+        assert ush2a['animal_model_score_normalized'][0] > 0
+
+        # Check MYO7A (ENSG00000002)
+        myo7a = result.filter(pl.col('gene_id') == 'ENSG00000002')
+        assert len(myo7a) == 1
+        assert myo7a['mouse_ortholog'][0] == 'Myo7a'
+        assert myo7a['mouse_ortholog_confidence'][0] == 'MEDIUM'  # 5 sources
+
+
+def test_checkpoint_restart(mock_hcop_data, mock_phenotype_data):
+    """Test checkpoint-restart pattern: load from DuckDB if exists, skip reprocessing."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        db_path = Path(tmpdir) / "test.duckdb"
+        store = PipelineStore(db_path)
+
+        # Initial load
+        gene_ids = ['ENSG00000001', 'ENSG00000002']
+
+        with patch('usher_pipeline.evidence.animal_models.fetch._download_gzipped') as mock_hcop, \
+             patch('usher_pipeline.evidence.animal_models.fetch._download_text') as mock_text, \
+             patch('httpx.get') as mock_http:
+
+            mock_hcop.side_effect = [
+                mock_hcop_data['mouse'].encode('utf-8'),
+                mock_hcop_data['zebrafish'].encode('utf-8'),
+            ]
+            mock_text.side_effect = [
+                mock_phenotype_data['mgi'],
+                mock_phenotype_data['zfin'],
+            ]
+
+            def mock_impc_response(url, **kwargs):
+                response = Mock()
+                response.raise_for_status = Mock()
+                response.json = Mock(return_value={'response': {'docs': []}})
+                return response
+
+            mock_http.side_effect = mock_impc_response
+
+            df = process_animal_model_evidence(gene_ids)
+
+        # Save to DuckDB (use mock provenance tracker)
+        provenance = Mock()
+        provenance.record_step = Mock()
+        load_to_duckdb(df, store, provenance)
+
+        # Check checkpoint exists
+        assert store.has_checkpoint('animal_model_phenotypes')
+
+        # Load from checkpoint
+        loaded_df = store.load_dataframe('animal_model_phenotypes')
+        assert loaded_df is not None
+        assert len(loaded_df) == 2
+
+        store.close()
+
+
+def test_provenance_tracking(mock_hcop_data, mock_phenotype_data):
+    """Test that provenance metadata is correctly recorded."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        db_path = Path(tmpdir) / "test.duckdb"
+        store = PipelineStore(db_path)
+
+        gene_ids = ['ENSG00000001', 'ENSG00000002']
+
+        with patch('usher_pipeline.evidence.animal_models.fetch._download_gzipped') as mock_hcop, \
+             patch('usher_pipeline.evidence.animal_models.fetch._download_text') as mock_text, \
+             patch('httpx.get') as mock_http:
+
+            mock_hcop.side_effect = [
+                mock_hcop_data['mouse'].encode('utf-8'),
+                mock_hcop_data['zebrafish'].encode('utf-8'),
+            ]
+            mock_text.side_effect = [
+                mock_phenotype_data['mgi'],
+                mock_phenotype_data['zfin'],
+            ]
+
+            def mock_impc_response(url, **kwargs):
+                response = Mock()
+                response.raise_for_status = Mock()
+                response.json = Mock(return_value={'response': {'docs': []}})
+                return response
+
+            mock_http.side_effect = mock_impc_response
+
+            df = process_animal_model_evidence(gene_ids)
+
+        # Track provenance with a bare mock so the recorded call can be inspected;
+        # stubbing get_steps() and asserting on the stub would only test the mock.
+        provenance = Mock()
+        provenance.record_step = Mock()
+
+        load_to_duckdb(df, store, provenance, description="Test animal model data")
+
+        # Check that load_to_duckdb itself recorded the provenance step
+        provenance.record_step.assert_called_once()
+        step_name, details = provenance.record_step.call_args[0]
+        assert step_name == 'load_animal_model_phenotypes'
+        assert 'row_count' in details
+        assert details['row_count'] == 2
+        assert 'genes_with_mouse_ortholog' in details
+        assert 'mean_sensory_phenotype_count' in details
+        assert isinstance(details['mouse_confidence_distribution'], list)
+
+        store.close()
+
+
+def test_empty_phenotype_handling(mock_hcop_data):
+    """Test handling of genes with orthologs but no phenotypes."""
+    gene_ids = ['ENSG00000001']
+
+    with patch('usher_pipeline.evidence.animal_models.fetch._download_gzipped') as mock_hcop, \
+         patch('usher_pipeline.evidence.animal_models.fetch._download_text') as mock_text, \
+         patch('httpx.get') as mock_http:
+
+        mock_hcop.side_effect = [
+            mock_hcop_data['mouse'].encode('utf-8'),
+            mock_hcop_data['zebrafish'].encode('utf-8'),
+        ]
+
+        # Empty phenotype data
+        empty_mgi = """Marker Symbol\tMammalian Phenotype ID
+"""
+        empty_zfin = """Gene Symbol\tAffected Structure or Process 1
+"""
+
+        mock_text.side_effect = [empty_mgi, empty_zfin]
+
+        def mock_impc_response(url, **kwargs):
+            response = Mock()
+            response.raise_for_status = Mock()
+            response.json = Mock(return_value={'response': {'docs': []}})
+            return response
+
+        mock_http.side_effect = mock_impc_response
+
+        result = process_animal_model_evidence(gene_ids)
+
+        # Should have ortholog mapping but NULL sensory phenotype count
+        assert len(result) == 1
+        assert result['mouse_ortholog'][0] == 'Ush2a'
+        assert result['sensory_phenotype_count'][0] is None
+        # Score should still be calculated (but low since no phenotypes)
+        assert result['animal_model_score_normalized'][0] is not None