feat(03-05): add animal model DuckDB loader, CLI, and comprehensive tests

- load.py: DuckDB persistence with provenance tracking, ortholog confidence distribution stats
- CLI animal-models command: checkpoint-restart pattern, top scoring genes display
- 10 unit tests: ortholog confidence scoring, keyword filtering, multi-organism bonus, NULL preservation
- 4 integration tests: full pipeline, checkpoint-restart, provenance tracking, empty phenotype handling
- All tests pass (14/14): validates fetch->transform->load->CLI flow
- Fixed polars deprecations: str.join replaces str.concat, pl.len replaces pl.count
2026-02-11 19:06:49 +08:00
parent 99bc975a2c
commit bcd3c4ffbe
4 changed files with 681 additions and 0 deletions
--- a/src/usher_pipeline/evidence/animal_models/__init__.py
+++ b/src/usher_pipeline/evidence/animal_models/__init__.py
@@ -26,6 +26,10 @@ from usher_pipeline.evidence.animal_models.transform import (
    score_animal_evidence,
    process_animal_model_evidence,
 )
 from usher_pipeline.evidence.animal_models.load import (
    load_to_duckdb,
    query_sensory_phenotype_genes,
 )
 __all__ = [
    "AnimalModelRecord",
@@ -39,4 +43,6 @@ __all__ = [
    "filter_sensory_phenotypes",
    "score_animal_evidence",
    "process_animal_model_evidence",
    "load_to_duckdb",
    "query_sensory_phenotype_genes",
 ]
--- a/src/usher_pipeline/evidence/animal_models/load.py
+++ b/src/usher_pipeline/evidence/animal_models/load.py
@@ -0,0 +1,126 @@
 """Load animal model phenotype data to DuckDB with provenance tracking."""
 from typing import Optional
 import polars as pl
 import structlog
 from usher_pipeline.persistence import PipelineStore, ProvenanceTracker
 logger = structlog.get_logger()
 def _confidence_distribution(rows: pl.DataFrame, confidence_col: str) -> list:
    """Count rows per ortholog-confidence label, in a deterministic order.
    Args:
        rows: Rows already restricted to genes that have the ortholog
        confidence_col: Name of the confidence column to group by
    Returns:
        List of dicts, one per confidence label, each holding the label and
        its row count (column "len"); empty list when rows is empty.
    """
    # Do not touch the confidence column at all when there is nothing to count.
    if rows.height == 0:
        return []
    return (
        rows.group_by(confidence_col)
        .agg(pl.len())
        # group_by does not promise a stable group order; sort so identical
        # inputs always yield identical provenance records.
        .sort(confidence_col)
        .to_dicts()
    )
 def load_to_duckdb(
    df: pl.DataFrame,
    store: PipelineStore,
    provenance: ProvenanceTracker,
    description: str = ""
 ) -> None:
    """Save animal model phenotype DataFrame to DuckDB with provenance.
    Writes df to the animal_model_phenotypes table with replace=True, then
    records a "load_animal_model_phenotypes" provenance step carrying summary
    statistics (row counts, ortholog confidence distributions, mean sensory
    phenotype count).
    Args:
        df: Processed animal model DataFrame. Must contain the columns
            mouse_ortholog, zebrafish_ortholog and sensory_phenotype_count;
            the *_ortholog_confidence columns are read only when at least one
            gene has the corresponding ortholog.
        store: PipelineStore instance for DuckDB persistence
        provenance: ProvenanceTracker instance for metadata recording
        description: Optional description for checkpoint metadata; a default
            description is used when empty
    """
    logger.info("animal_model_load_start", row_count=len(df))
    # Filter once per category and reuse the result for counts and distributions.
    mouse_rows = df.filter(pl.col("mouse_ortholog").is_not_null())
    zebrafish_rows = df.filter(pl.col("zebrafish_ortholog").is_not_null())
    sensory_rows = df.filter(pl.col("sensory_phenotype_count").is_not_null())
    with_mouse = mouse_rows.height
    with_zebrafish = zebrafish_rows.height
    with_sensory = sensory_rows.height
    # Ortholog confidence distribution
    mouse_conf_dist = _confidence_distribution(mouse_rows, "mouse_ortholog_confidence")
    zebrafish_conf_dist = _confidence_distribution(
        zebrafish_rows, "zebrafish_ortholog_confidence"
    )
    # Mean sensory phenotype count (None when no gene has a count)
    mean_sensory_count = (
        sensory_rows.select(pl.col("sensory_phenotype_count").mean()).item()
    )
    if mean_sensory_count is None:
        mean_sensory_count = 0.0
    # Save to DuckDB with CREATE OR REPLACE (idempotent)
    store.save_dataframe(
        df=df,
        table_name="animal_model_phenotypes",
        description=description or "Animal model phenotypes from MGI, ZFIN, and IMPC with ortholog confidence scoring",
        replace=True
    )
    # Record provenance only after the table has been written successfully.
    provenance.record_step("load_animal_model_phenotypes", {
        "row_count": len(df),
        "genes_with_mouse_ortholog": with_mouse,
        "genes_with_zebrafish_ortholog": with_zebrafish,
        "genes_with_sensory_phenotypes": with_sensory,
        "mouse_confidence_distribution": mouse_conf_dist,
        "zebrafish_confidence_distribution": zebrafish_conf_dist,
        "mean_sensory_phenotype_count": round(mean_sensory_count, 2),
    })
    logger.info(
        "animal_model_load_complete",
        row_count=len(df),
        with_mouse=with_mouse,
        with_zebrafish=with_zebrafish,
        with_sensory=with_sensory,
    )
 def query_sensory_phenotype_genes(
    store: PipelineStore,
    min_score: float = 0.3
 ) -> pl.DataFrame:
    """Fetch genes whose normalized animal model score meets a threshold.
    Args:
        store: PipelineStore instance used to run the query
        min_score: Lowest animal_model_score_normalized value to include
    Returns:
        DataFrame of matching rows from animal_model_phenotypes, ordered by
        animal_model_score_normalized descending
    """
    logger.info("animal_model_query_start", min_score=min_score)
    # The threshold is bound as a query parameter rather than formatted into the SQL.
    sql = """
        SELECT gene_id, mouse_ortholog, zebrafish_ortholog,
               sensory_phenotype_count, phenotype_categories,
               animal_model_score_normalized
        FROM animal_model_phenotypes
        WHERE animal_model_score_normalized >= ?
        ORDER BY animal_model_score_normalized DESC
        """
    matches = store.execute_query(sql, params=[min_score])
    logger.info("animal_model_query_complete", result_count=len(matches))
    return matches
--- a/tests/test_animal_models.py
+++ b/tests/test_animal_models.py
@@ -0,0 +1,280 @@
 """Unit tests for animal model evidence layer."""
 import io
 from unittest.mock import Mock, patch, MagicMock
 import polars as pl
 import pytest
 from usher_pipeline.evidence.animal_models import (
    fetch_ortholog_mapping,
    filter_sensory_phenotypes,
    score_animal_evidence,
    SENSORY_MP_KEYWORDS,
 )
 def test_ortholog_confidence_high():
    """Test that 8+ supporting sources results in HIGH confidence."""
    # Mock HCOP data with 8 supporting databases
    hcop_mouse = """human_entrez_gene\thuman_ensembl_gene\thgnc_id\thuman_name\thuman_symbol\thuman_chr\thuman_assert_ids\tmouse_entrez_gene\tmouse_ensembl_gene\tmgi_id\tmouse_name\tmouse_symbol\tmouse_chr\tmouse_assert_ids\tsupport
 123\tENSG00000001\tHGNC:1\tGene 1\tGENE1\t1\t\t456\tENSMUSG001\tMGI:1\tGene1\tGene1\t1\t\tdb1,db2,db3,db4,db5,db6,db7,db8"""
    # Header-only zebrafish table, matching the other ortholog tests
    hcop_zebrafish = """human_entrez_gene\thuman_ensembl_gene\thgnc_id\thuman_name\thuman_symbol\thuman_chr\thuman_assert_ids\tzebrafish_entrez_gene\tzebrafish_ensembl_gene\tzfin_id\tzebrafish_name\tzebrafish_symbol\tzebrafish_chr\tzebrafish_assert_ids\tsupport
 """
    with patch('usher_pipeline.evidence.animal_models.fetch._download_gzipped') as mock_download:
        # Return mouse data first, then zebrafish data. A single return_value
        # would hand the mouse table to the zebrafish download as well.
        mock_download.side_effect = [
            hcop_mouse.encode('utf-8'),
            hcop_zebrafish.encode('utf-8')
        ]
        result = fetch_ortholog_mapping(['ENSG00000001'])
        assert len(result) == 1
        assert result['mouse_ortholog_confidence'][0] == 'HIGH'
 def test_ortholog_confidence_low():
    """A mouse ortholog backed by only two databases is labelled LOW."""
    # Mouse table: one ortholog supported by db1 and db2 only
    mouse_table = """human_entrez_gene\thuman_ensembl_gene\thgnc_id\thuman_name\thuman_symbol\thuman_chr\thuman_assert_ids\tmouse_entrez_gene\tmouse_ensembl_gene\tmgi_id\tmouse_name\tmouse_symbol\tmouse_chr\tmouse_assert_ids\tsupport
 123\tENSG00000001\tHGNC:1\tGene 1\tGENE1\t1\t\t456\tENSMUSG001\tMGI:1\tGene1\tGene1\t1\t\tdb1,db2"""
    # Zebrafish table: header only
    zebrafish_table = """human_entrez_gene\thuman_ensembl_gene\thgnc_id\thuman_name\thuman_symbol\thuman_chr\thuman_assert_ids\tzebrafish_entrez_gene\tzebrafish_ensembl_gene\tzfin_id\tzebrafish_name\tzebrafish_symbol\tzebrafish_chr\tzebrafish_assert_ids\tsupport
 """
    # Downloads are served in order: mouse first, zebrafish second
    payloads = [
        mouse_table.encode('utf-8'),
        zebrafish_table.encode('utf-8'),
    ]
    with patch(
        'usher_pipeline.evidence.animal_models.fetch._download_gzipped',
        side_effect=payloads,
    ):
        orthologs = fetch_ortholog_mapping(['ENSG00000001'])
    assert orthologs.height == 1
    assert orthologs['mouse_ortholog_confidence'][0] == 'LOW'
 def test_one_to_many_best_selected():
    """With two mouse candidates for one human gene, the better-supported one wins."""
    # Gene1a has 2 supporting sources, Gene1b has 8
    mouse_table = """human_entrez_gene\thuman_ensembl_gene\thgnc_id\thuman_name\thuman_symbol\thuman_chr\thuman_assert_ids\tmouse_entrez_gene\tmouse_ensembl_gene\tmgi_id\tmouse_name\tmouse_symbol\tmouse_chr\tmouse_assert_ids\tsupport
 123\tENSG00000001\tHGNC:1\tGene 1\tGENE1\t1\t\t456\tENSMUSG001\tMGI:1\tGene1a\tGene1a\t1\t\tdb1,db2
 123\tENSG00000001\tHGNC:1\tGene 1\tGENE1\t1\t\t789\tENSMUSG002\tMGI:2\tGene1b\tGene1b\t2\t\tdb1,db2,db3,db4,db5,db6,db7,db8"""
    zebrafish_table = """human_entrez_gene\thuman_ensembl_gene\thgnc_id\thuman_name\thuman_symbol\thuman_chr\thuman_assert_ids\tzebrafish_entrez_gene\tzebrafish_ensembl_gene\tzfin_id\tzebrafish_name\tzebrafish_symbol\tzebrafish_chr\tzebrafish_assert_ids\tsupport
 """
    payloads = [
        mouse_table.encode('utf-8'),
        zebrafish_table.encode('utf-8'),
    ]
    with patch(
        'usher_pipeline.evidence.animal_models.fetch._download_gzipped',
        side_effect=payloads,
    ):
        orthologs = fetch_ortholog_mapping(['ENSG00000001'])
    # Exactly one row survives, and it is the 8-source candidate
    assert orthologs.height == 1
    best = orthologs.row(0, named=True)
    assert best['mouse_ortholog'] == 'Gene1b'
    assert best['mouse_ortholog_confidence'] == 'HIGH'
 def test_sensory_keyword_match():
    """Rows whose term name matches SENSORY_MP_KEYWORDS survive the filter."""
    term_names = ['hearing loss', 'abnormal cochlea morphology', 'irrelevant phenotype']
    phenotypes = pl.DataFrame({
        'mouse_gene': ['Gene1', 'Gene1', 'Gene2'],
        'mp_term_id': ['MP:0001', 'MP:0002', 'MP:0003'],
        'mp_term_name': term_names,
    })
    kept = filter_sensory_phenotypes(phenotypes, SENSORY_MP_KEYWORDS, 'mp_term_name')
    kept_names = kept['mp_term_name'].to_list()
    # Only the hearing and cochlea rows are expected to remain
    assert kept.height == 2
    for expected in ('hearing loss', 'abnormal cochlea morphology'):
        assert expected in kept_names
 def test_non_sensory_filtered():
    """Phenotypes with no sensory keyword are all dropped by the filter."""
    columns = {
        'mouse_gene': ['Gene1', 'Gene2'],
        'mp_term_id': ['MP:0001', 'MP:0002'],
        'mp_term_name': ['increased body weight', 'abnormal coat color'],
    }
    kept = filter_sensory_phenotypes(
        pl.DataFrame(columns), SENSORY_MP_KEYWORDS, 'mp_term_name'
    )
    # Neither term is sensory, so nothing should remain
    assert kept.height == 0
 def test_score_with_confidence_weighting():
    """A HIGH-confidence ortholog outscores an otherwise identical LOW-confidence one."""
    def scored(gene_id, symbol, confidence):
        # Single-gene frame: mouse phenotype only, five sensory phenotypes
        frame = pl.DataFrame({
            'gene_id': [gene_id],
            'mouse_ortholog': [symbol],
            'mouse_ortholog_confidence': [confidence],
            'zebrafish_ortholog': [None],
            'zebrafish_ortholog_confidence': [None],
            'has_mouse_phenotype': [True],
            'has_zebrafish_phenotype': [False],
            'has_impc_phenotype': [False],
            'sensory_phenotype_count': [5],
        })
        return score_animal_evidence(frame)['animal_model_score_normalized'][0]
    high_score = scored('ENSG00000001', 'Gene1', 'HIGH')
    low_score = scored('ENSG00000002', 'Gene2', 'LOW')
    # HIGH confidence should score higher (0.4 * 1.0 vs 0.4 * 0.4)
    assert high_score > low_score
 def test_score_null_no_ortholog():
    """A gene with no ortholog in either organism keeps a NULL score (not 0.0)."""
    null_columns = (
        'mouse_ortholog',
        'mouse_ortholog_confidence',
        'zebrafish_ortholog',
        'zebrafish_ortholog_confidence',
    )
    flag_columns = (
        'has_mouse_phenotype',
        'has_zebrafish_phenotype',
        'has_impc_phenotype',
    )
    # Same column order as the scoring input used elsewhere in this module
    columns = {
        'gene_id': ['ENSG00000001'],
        **{name: [None] for name in null_columns},
        **{name: [False] for name in flag_columns},
        'sensory_phenotype_count': [None],
    }
    scored = score_animal_evidence(pl.DataFrame(columns))
    # Missing evidence must stay distinguishable from a zero score
    assert scored['animal_model_score_normalized'][0] is None
 def test_multi_organism_bonus():
    """Evidence from both mouse and zebrafish outscores mouse-only evidence."""
    def scored(gene_id, mouse_symbol, fish_symbol, fish_confidence, fish_phenotype):
        # HIGH-confidence mouse ortholog with three sensory phenotypes; only
        # the zebrafish columns vary between the two cases.
        frame = pl.DataFrame({
            'gene_id': [gene_id],
            'mouse_ortholog': [mouse_symbol],
            'mouse_ortholog_confidence': ['HIGH'],
            'zebrafish_ortholog': [fish_symbol],
            'zebrafish_ortholog_confidence': [fish_confidence],
            'has_mouse_phenotype': [True],
            'has_zebrafish_phenotype': [fish_phenotype],
            'has_impc_phenotype': [False],
            'sensory_phenotype_count': [3],
        })
        return score_animal_evidence(frame)['animal_model_score_normalized'][0]
    mouse_score = scored('ENSG00000001', 'Gene1', None, None, False)
    both_score = scored('ENSG00000002', 'Gene2', 'gene2', 'HIGH', True)
    # Both organisms should score higher (0.4 + 0.3 vs 0.4)
    assert both_score > mouse_score
 def test_phenotype_count_scaling():
    """More sensory phenotypes raise the score, but by less than a linear factor."""
    def scored(gene_id, symbol, phenotype_count):
        # Mouse-only, HIGH-confidence gene; only the phenotype count varies
        frame = pl.DataFrame({
            'gene_id': [gene_id],
            'mouse_ortholog': [symbol],
            'mouse_ortholog_confidence': ['HIGH'],
            'zebrafish_ortholog': [None],
            'zebrafish_ortholog_confidence': [None],
            'has_mouse_phenotype': [True],
            'has_zebrafish_phenotype': [False],
            'has_impc_phenotype': [False],
            'sensory_phenotype_count': [phenotype_count],
        })
        return score_animal_evidence(frame)['animal_model_score_normalized'][0]
    few_score = scored('ENSG00000001', 'Gene1', 1)
    many_score = scored('ENSG00000002', 'Gene2', 10)
    # Ten phenotypes beat one...
    assert many_score > few_score
    # ...but with diminishing returns (log scaling):
    # log2(11) / log2(11) = 1.0 vs log2(2) / log2(11) = 0.29
    assert many_score < few_score * 10  # Not 10x higher
 def test_impc_integration():
    """An IMPC phenotype flag adds to the score of an otherwise identical gene."""
    def scored(gene_id, symbol, has_impc):
        # Mouse-only, HIGH-confidence gene with three sensory phenotypes
        frame = pl.DataFrame({
            'gene_id': [gene_id],
            'mouse_ortholog': [symbol],
            'mouse_ortholog_confidence': ['HIGH'],
            'zebrafish_ortholog': [None],
            'zebrafish_ortholog_confidence': [None],
            'has_mouse_phenotype': [True],
            'has_zebrafish_phenotype': [False],
            'has_impc_phenotype': [has_impc],
            'sensory_phenotype_count': [3],
        })
        return score_animal_evidence(frame)['animal_model_score_normalized'][0]
    no_impc_score = scored('ENSG00000001', 'Gene1', False)
    with_impc_score = scored('ENSG00000002', 'Gene2', True)
    # IMPC should add to score (+0.3)
    assert with_impc_score > no_impc_score
--- a/tests/test_animal_models_integration.py
+++ b/tests/test_animal_models_integration.py
@@ -0,0 +1,269 @@
 """Integration tests for animal model evidence layer."""
 import tempfile
 from pathlib import Path
 from unittest.mock import patch, Mock
 import polars as pl
 import pytest
 from usher_pipeline.evidence.animal_models import (
    process_animal_model_evidence,
    load_to_duckdb,
 )
 from usher_pipeline.persistence import PipelineStore, ProvenanceTracker
@pytest.fixture
 def mock_hcop_data():
    """Mock HCOP ortholog mapping data."""
    mouse_data = """human_entrez_gene\thuman_ensembl_gene\thgnc_id\thuman_name\thuman_symbol\thuman_chr\thuman_assert_ids\tmouse_entrez_gene\tmouse_ensembl_gene\tmgi_id\tmouse_name\tmouse_symbol\tmouse_chr\tmouse_assert_ids\tsupport
 123\tENSG00000001\tHGNC:1\tUSH2A\tUSH2A\t1\t\t456\tENSMUSG001\tMGI:1\tUsh2a\tUsh2a\t1\t\tdb1,db2,db3,db4,db5,db6,db7,db8
 456\tENSG00000002\tHGNC:2\tMYO7A\tMYO7A\t11\t\t789\tENSMUSG002\tMGI:2\tMyo7a\tMyo7a\t7\t\tdb1,db2,db3,db4,db5"""
    zebrafish_data = """human_entrez_gene\thuman_ensembl_gene\thgnc_id\thuman_name\thuman_symbol\thuman_chr\thuman_assert_ids\tzebrafish_entrez_gene\tzebrafish_ensembl_gene\tzfin_id\tzebrafish_name\tzebrafish_symbol\tzebrafish_chr\tzebrafish_assert_ids\tsupport
 123\tENSG00000001\tHGNC:1\tUSH2A\tUSH2A\t1\t\t111\tENSDART001\tZDB-GENE-1\tush2a\tush2a\t1\t\tdb1,db2,db3,db4,db5,db6"""
    return {'mouse': mouse_data, 'zebrafish': zebrafish_data}
@pytest.fixture
 def mock_phenotype_data():
    """Mock MGI, ZFIN, and IMPC phenotype data."""
    mgi_data = """Marker Symbol\tMammalian Phenotype ID
 Ush2a\tMP:0001967
 Ush2a\tMP:0005377
 Myo7a\tMP:0001968"""
    zfin_data = """Gene Symbol\tAffected Structure or Process 1
 ush2a\tabnormal ear morphology
 ush2a\tabnormal retina morphology"""
    impc_responses = {
        'Ush2a': {
            'response': {
                'docs': [
                    {
                        'marker_symbol': 'Ush2a',
                        'mp_term_id': 'MP:0001967',
                        'mp_term_name': 'deafness',
                        'p_value': 0.001
                    }
                ]
            }
        },
        'Myo7a': {
            'response': {
                'docs': [
                    {
                        'marker_symbol': 'Myo7a',
                        'mp_term_id': 'MP:0001968',
                        'mp_term_name': 'abnormal cochlea morphology',
                        'p_value': 0.0005
                    }
                ]
            }
        }
    }
    return {'mgi': mgi_data, 'zfin': zfin_data, 'impc': impc_responses}
 def test_full_pipeline(mock_hcop_data, mock_phenotype_data):
    """Run the whole evidence pipeline against mocked HCOP, MGI, ZFIN and IMPC sources."""
    impc_payloads = mock_phenotype_data['impc']
    def fake_impc_get(url, **kwargs):
        # Choose the payload by the gene symbol found in the 'q' parameter;
        # Ush2a is checked before Myo7a, anything else gets an empty result.
        query = kwargs.get('params', {}).get('q', '')
        payload = {'response': {'docs': []}}
        for symbol in ('Ush2a', 'Myo7a'):
            if symbol in query:
                payload = impc_payloads[symbol]
                break
        reply = Mock()
        reply.raise_for_status = Mock()
        reply.json = Mock(return_value=payload)
        return reply
    with patch('usher_pipeline.evidence.animal_models.fetch._download_gzipped') as fake_hcop, \
         patch('usher_pipeline.evidence.animal_models.fetch._download_text') as fake_text, \
         patch('httpx.get') as fake_http:
        # HCOP: mouse table first, zebrafish second
        fake_hcop.side_effect = [
            mock_hcop_data[organism].encode('utf-8')
            for organism in ('mouse', 'zebrafish')
        ]
        # Phenotype text files: MGI first, ZFIN second
        fake_text.side_effect = [mock_phenotype_data[source] for source in ('mgi', 'zfin')]
        fake_http.side_effect = fake_impc_get
        evidence = process_animal_model_evidence(['ENSG00000001', 'ENSG00000002'])
        assert evidence.height == 2
        # USH2A (ENSG00000001): orthologs in both organisms
        ush2a_rows = evidence.filter(pl.col('gene_id') == 'ENSG00000001')
        assert ush2a_rows.height == 1
        ush2a = ush2a_rows.row(0, named=True)
        assert ush2a['mouse_ortholog'] == 'Ush2a'
        assert ush2a['mouse_ortholog_confidence'] == 'HIGH'  # 8 sources
        assert ush2a['zebrafish_ortholog'] == 'ush2a'
        assert ush2a['zebrafish_ortholog_confidence'] == 'MEDIUM'  # 6 sources
        assert ush2a['sensory_phenotype_count'] is not None
        assert ush2a['animal_model_score_normalized'] is not None
        assert ush2a['animal_model_score_normalized'] > 0
        # MYO7A (ENSG00000002): mouse ortholog only
        myo7a_rows = evidence.filter(pl.col('gene_id') == 'ENSG00000002')
        assert myo7a_rows.height == 1
        myo7a = myo7a_rows.row(0, named=True)
        assert myo7a['mouse_ortholog'] == 'Myo7a'
        assert myo7a['mouse_ortholog_confidence'] == 'MEDIUM'  # 5 sources
 def test_checkpoint_restart(mock_hcop_data, mock_phenotype_data):
    """Test checkpoint-restart pattern: data saved by load_to_duckdb can be found and reloaded.
    Runs the pipeline once against mocked sources, saves the result, then
    checks that the store reports the checkpoint and returns both rows.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        db_path = Path(tmpdir) / "test.duckdb"
        store = PipelineStore(db_path)
        # Close the store even when the pipeline or an assertion fails, so the
        # database is not left open while the temporary directory is removed.
        try:
            # Initial load
            gene_ids = ['ENSG00000001', 'ENSG00000002']
            with patch('usher_pipeline.evidence.animal_models.fetch._download_gzipped') as mock_hcop, \
                 patch('usher_pipeline.evidence.animal_models.fetch._download_text') as mock_text, \
                 patch('httpx.get') as mock_http:
                mock_hcop.side_effect = [
                    mock_hcop_data['mouse'].encode('utf-8'),
                    mock_hcop_data['zebrafish'].encode('utf-8'),
                ]
                mock_text.side_effect = [
                    mock_phenotype_data['mgi'],
                    mock_phenotype_data['zfin'],
                ]
                def mock_impc_response(url, **kwargs):
                    # IMPC returns no documents for any gene in this test
                    response = Mock()
                    response.raise_for_status = Mock()
                    response.json = Mock(return_value={'response': {'docs': []}})
                    return response
                mock_http.side_effect = mock_impc_response
                df = process_animal_model_evidence(gene_ids)
                # Save to DuckDB (use mock provenance tracker)
                provenance = Mock()
                provenance.record_step = Mock()
                load_to_duckdb(df, store, provenance)
            # Check checkpoint exists
            assert store.has_checkpoint('animal_model_phenotypes')
            # Load from checkpoint
            loaded_df = store.load_dataframe('animal_model_phenotypes')
            assert loaded_df is not None
            assert len(loaded_df) == 2
        finally:
            store.close()
 def test_provenance_tracking(mock_hcop_data, mock_phenotype_data):
    """Test that load_to_duckdb records a provenance step describing the load.
    The assertions inspect the arguments actually passed to
    provenance.record_step, so the test fails if the loader stops recording
    the step or records the wrong row count.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        db_path = Path(tmpdir) / "test.duckdb"
        store = PipelineStore(db_path)
        # Close the store even when the pipeline or an assertion fails.
        try:
            gene_ids = ['ENSG00000001', 'ENSG00000002']
            with patch('usher_pipeline.evidence.animal_models.fetch._download_gzipped') as mock_hcop, \
                 patch('usher_pipeline.evidence.animal_models.fetch._download_text') as mock_text, \
                 patch('httpx.get') as mock_http:
                mock_hcop.side_effect = [
                    mock_hcop_data['mouse'].encode('utf-8'),
                    mock_hcop_data['zebrafish'].encode('utf-8'),
                ]
                mock_text.side_effect = [
                    mock_phenotype_data['mgi'],
                    mock_phenotype_data['zfin'],
                ]
                def mock_impc_response(url, **kwargs):
                    # IMPC returns no documents for any gene in this test
                    response = Mock()
                    response.raise_for_status = Mock()
                    response.json = Mock(return_value={'response': {'docs': []}})
                    return response
                mock_http.side_effect = mock_impc_response
                df = process_animal_model_evidence(gene_ids)
                # Mock tracker: record_step calls are captured for inspection
                provenance = Mock()
                load_to_duckdb(df, store, provenance, description="Test animal model data")
                # Check provenance was recorded exactly once, with the real
                # arguments rather than a canned return value.
                provenance.record_step.assert_called_once()
                step_name, details = provenance.record_step.call_args[0]
                assert step_name == 'load_animal_model_phenotypes'
                assert 'row_count' in details
                assert details['row_count'] == 2
        finally:
            store.close()
 def test_empty_phenotype_handling(mock_hcop_data):
    """A gene with orthologs but no phenotype records keeps a NULL count and still gets a score."""
    # Header-only phenotype files: no MGI or ZFIN annotations at all
    empty_mgi = """Marker Symbol\tMammalian Phenotype ID
 """
    empty_zfin = """Gene Symbol\tAffected Structure or Process 1
 """
    def fake_impc_get(url, **kwargs):
        # IMPC returns no documents for any gene
        reply = Mock()
        reply.raise_for_status = Mock()
        reply.json = Mock(return_value={'response': {'docs': []}})
        return reply
    with patch('usher_pipeline.evidence.animal_models.fetch._download_gzipped') as fake_hcop, \
         patch('usher_pipeline.evidence.animal_models.fetch._download_text') as fake_text, \
         patch('httpx.get') as fake_http:
        fake_hcop.side_effect = [
            mock_hcop_data[organism].encode('utf-8')
            for organism in ('mouse', 'zebrafish')
        ]
        fake_text.side_effect = [empty_mgi, empty_zfin]
        fake_http.side_effect = fake_impc_get
        evidence = process_animal_model_evidence(['ENSG00000001'])
        # Ortholog mapping is present but the sensory phenotype count is NULL
        assert evidence.height == 1
        gene = evidence.row(0, named=True)
        assert gene['mouse_ortholog'] == 'Ush2a'
        assert gene['sensory_phenotype_count'] is None
        # Score should still be calculated (but low since no phenotypes)
        assert gene['animal_model_score_normalized'] is not None