usher-exploring/tests/test_expression_integration.py

"""Integration tests for expression evidence layer.

Tests with mocked downloads and synthetic fixtures.
NO actual external API calls to HPA/GTEx/CellxGene.
"""

import polars as pl
import pytest
from pathlib import Path
from unittest.mock import Mock, patch, MagicMock

from usher_pipeline.evidence.expression.transform import process_expression_evidence
from usher_pipeline.evidence.expression.load import load_to_duckdb
from usher_pipeline.persistence import PipelineStore, ProvenanceTracker


@pytest.fixture
def temp_cache_dir(tmp_path):
    """Create temporary cache directory for downloads."""
    cache_dir = tmp_path / "expression"
    cache_dir.mkdir()
    return cache_dir


@pytest.fixture
def mock_gene_ids():
    """Sample gene IDs for testing."""
    return ["ENSG00000001", "ENSG00000002", "ENSG00000003"]


@pytest.fixture
def mock_hpa_data():
    """Synthetic HPA expression data."""
    return pl.LazyFrame({
        "gene_symbol": ["GENE1", "GENE2", "GENE3"],
        "hpa_retina_tpm": [50.0, 10.0, None],
        "hpa_cerebellum_tpm": [40.0, 10.0, 5.0],
        "hpa_testis_tpm": [5.0, 50.0, 50.0],
        "hpa_fallopian_tube_tpm": [5.0, 50.0, None],
    })


@pytest.fixture
def mock_gtex_data(mock_gene_ids):
    """Synthetic GTEx expression data."""
    return pl.LazyFrame({
        "gene_id": mock_gene_ids,
        "gtex_retina_tpm": [60.0, 10.0, None],
        "gtex_cerebellum_tpm": [45.0, 10.0, 5.0],
        "gtex_testis_tpm": [5.0, 55.0, 55.0],
        "gtex_fallopian_tube_tpm": [None, None, None],  # Often not available
    })


@pytest.fixture
def mock_cellxgene_data(mock_gene_ids):
    """Synthetic CellxGene data (NULLs as placeholder)."""
    return pl.LazyFrame({
        "gene_id": mock_gene_ids,
        "cellxgene_photoreceptor_expr": [None, None, None],
        "cellxgene_hair_cell_expr": [None, None, None],
    })


def test_process_expression_pipeline_with_mocks(
    temp_cache_dir, mock_gene_ids, mock_hpa_data, mock_gtex_data, mock_cellxgene_data
):
    """Test full pipeline with mocked data sources."""
    # Mock all fetch functions to return synthetic data
    with patch('usher_pipeline.evidence.expression.transform.fetch_hpa_expression') as mock_hpa, \
         patch('usher_pipeline.evidence.expression.transform.fetch_gtex_expression') as mock_gtex, \
         patch('usher_pipeline.evidence.expression.transform.fetch_cellxgene_expression') as mock_cellxgene:

        mock_hpa.return_value = mock_hpa_data
        mock_gtex.return_value = mock_gtex_data
        mock_cellxgene.return_value = mock_cellxgene_data

        # Run pipeline (skip CellxGene for simplicity)
        df = process_expression_evidence(
            gene_ids=mock_gene_ids,
            cache_dir=temp_cache_dir,
            skip_cellxgene=True,
        )

        # Verify output structure
        assert len(df) == len(mock_gene_ids)
        assert "gene_id" in df.columns
        assert "tau_specificity" in df.columns
        assert "usher_tissue_enrichment" in df.columns
        assert "expression_score_normalized" in df.columns


def test_checkpoint_restart(temp_cache_dir, mock_gene_ids):
    """Test checkpoint-restart: skip processing if table exists."""
    # Create mock store with existing checkpoint
    mock_store = Mock(spec=PipelineStore)
    mock_store.has_checkpoint.return_value = True

    # Mock load_dataframe to return synthetic data
    existing_data = pl.DataFrame({
        "gene_id": mock_gene_ids,
        "tau_specificity": [0.5, 0.3, 0.2],
        "usher_tissue_enrichment": [2.0, 1.0, 0.5],
        "expression_score_normalized": [0.8, 0.5, 0.3],
    })
    mock_store.load_dataframe.return_value = existing_data

    # Verify checkpoint works (would skip processing in real CLI)
    assert mock_store.has_checkpoint('tissue_expression')
    df = mock_store.load_dataframe('tissue_expression')
    assert len(df) == len(mock_gene_ids)


def test_provenance_recording():
    """Test provenance step recording during load."""
    # Create synthetic expression data
    df = pl.DataFrame({
        "gene_id": ["ENSG00000001", "ENSG00000002"],
        "hpa_retina_tpm": [50.0, None],
        "gtex_retina_tpm": [60.0, 10.0],
        "cellxgene_photoreceptor_expr": [None, None],
        "cellxgene_hair_cell_expr": [None, None],
        "tau_specificity": [0.5, None],
        "usher_tissue_enrichment": [2.0, 1.0],
        "expression_score_normalized": [0.8, 0.5],
    })

    # Mock store and provenance tracker
    mock_store = Mock(spec=PipelineStore)
    mock_provenance = Mock(spec=ProvenanceTracker)

    # Call load function
    load_to_duckdb(
        df=df,
        store=mock_store,
        provenance=mock_provenance,
        description="Test expression data"
    )

    # Verify provenance step was recorded
    mock_provenance.record_step.assert_called_once()
    step_name, step_details = mock_provenance.record_step.call_args[0]
    assert step_name == "load_tissue_expression"
    assert "row_count" in step_details
    assert step_details["row_count"] == 2


def test_null_expression_handling():
    """Test that genes with all NULL expression data are handled gracefully."""
    df = pl.DataFrame({
        "gene_id": ["ENSG00000001", "ENSG00000002"],
        "hpa_retina_tpm": [None, 50.0],
        "hpa_cerebellum_tpm": [None, 40.0],
        "gtex_retina_tpm": [None, 60.0],
        "cellxgene_photoreceptor_expr": [None, None],
        "cellxgene_hair_cell_expr": [None, None],
        "tau_specificity": [None, 0.5],
        "usher_tissue_enrichment": [None, 2.0],
        "expression_score_normalized": [None, 0.8],
    })

    # Mock store
    mock_store = Mock(spec=PipelineStore)
    mock_provenance = Mock(spec=ProvenanceTracker)

    # Should not raise exception
    load_to_duckdb(df, mock_store, mock_provenance)

    # Verify store was called
    mock_store.save_dataframe.assert_called_once()