"""Integration tests for annotation evidence layer.""" import polars as pl import pytest from pathlib import Path from unittest.mock import Mock, patch, MagicMock from usher_pipeline.config.loader import load_config from usher_pipeline.persistence import PipelineStore, ProvenanceTracker from usher_pipeline.evidence.annotation import ( process_annotation_evidence, load_to_duckdb, query_poorly_annotated, ) @pytest.fixture def test_config(tmp_path): """Create test configuration.""" config_dir = tmp_path / "config" config_dir.mkdir() data_dir = tmp_path / "data" data_dir.mkdir() config_yaml = f""" project_name: "usher-pipeline-test" data_dir: "{data_dir}" cache_dir: "{tmp_path / 'cache'}" duckdb_path: "{tmp_path / 'test.duckdb'}" versions: ensembl_release: 112 gnomad_version: "4.1" api: rate_limit_per_second: 5 max_retries: 3 cache_ttl_seconds: 86400 timeout_seconds: 30 scoring: gnomad: 0.20 expression: 0.20 annotation: 0.15 localization: 0.15 animal_model: 0.15 literature: 0.15 """ config_file = config_dir / "pipeline.yaml" config_file.write_text(config_yaml) return load_config(config_file) @pytest.fixture def mock_gene_ids(): """Sample gene IDs for testing.""" return ["ENSG001", "ENSG002", "ENSG003", "ENSG004", "ENSG005"] @pytest.fixture def mock_uniprot_mapping(): """Mock UniProt mapping DataFrame.""" return pl.DataFrame({ "gene_id": ["ENSG001", "ENSG002", "ENSG003"], "uniprot_accession": ["P12345", "Q67890", "A11111"], }) @pytest.fixture def synthetic_annotation_data(): """Create synthetic annotation data for testing.""" return pl.DataFrame({ "gene_id": ["ENSG001", "ENSG002", "ENSG003", "ENSG004", "ENSG005"], "gene_symbol": ["GENE1", "GENE2", "GENE3", "GENE4", "GENE5"], "go_term_count": [50, 15, 5, None, 2], "go_biological_process_count": [30, 10, 3, None, 1], "go_molecular_function_count": [15, 3, 2, None, 1], "go_cellular_component_count": [5, 2, 0, None, 0], "uniprot_annotation_score": [5, 4, 3, None, 1], "has_pathway_membership": [True, True, False, None, False], "annotation_tier": ["well_annotated", "well_annotated", "partially_annotated", "poorly_annotated", "poorly_annotated"], "annotation_score_normalized": [0.9, 0.75, 0.45, None, 0.15], }) def mock_mygene_querymany(gene_ids, **kwargs): """Mock mygene.querymany response.""" # Simulate different annotation levels return [ { "query": "ENSG001", "symbol": "GENE1", "go": { "BP": [{"id": f"GO:000{i}"} for i in range(30)], "MF": [{"id": f"GO:100{i}"} for i in range(15)], "CC": [{"id": f"GO:200{i}"} for i in range(5)], }, "pathway": { "kegg": [{"id": "hsa00001"}], "reactome": [{"id": "R-HSA-00001"}], }, }, { "query": "ENSG002", "symbol": "GENE2", "go": { "BP": [{"id": f"GO:000{i}"} for i in range(10)], "MF": [{"id": f"GO:100{i}"} for i in range(3)], "CC": [{"id": f"GO:200{i}"} for i in range(2)], }, "pathway": {"kegg": [{"id": "hsa00002"}]}, }, { "query": "ENSG003", "symbol": "GENE3", "go": { "BP": [{"id": "GO:0001"}, {"id": "GO:0002"}], }, "pathway": {}, }, { "query": "ENSG004", "symbol": "GENE4", # No GO or pathway data }, { "query": "ENSG005", "symbol": "GENE5", "go": { "BP": [{"id": "GO:0001"}], }, }, ] def mock_uniprot_api_response(): """Mock UniProt API response.""" return { "results": [ {"primaryAccession": "P12345", "annotationScore": 5}, {"primaryAccession": "Q67890", "annotationScore": 4}, {"primaryAccession": "A11111", "annotationScore": 3}, ] } @patch("usher_pipeline.evidence.annotation.fetch._get_mygene_client") @patch("usher_pipeline.evidence.annotation.fetch._query_uniprot_batch") def test_process_annotation_evidence_pipeline( mock_uniprot, mock_mygene_client, mock_gene_ids, mock_uniprot_mapping ): """Test full annotation evidence processing pipeline.""" # Setup mocks mock_mg = Mock() mock_mg.querymany.return_value = mock_mygene_querymany(mock_gene_ids) mock_mygene_client.return_value = mock_mg mock_uniprot.return_value = { "P12345": 5, "Q67890": 4, "A11111": 3, } # Run pipeline result = process_annotation_evidence(mock_gene_ids, mock_uniprot_mapping) # Verify results assert result.height == len(mock_gene_ids) assert "annotation_tier" in result.columns assert "annotation_score_normalized" in result.columns # Check that tiers are classified tiers = result["annotation_tier"].unique().to_list() assert "well_annotated" in tiers or "partially_annotated" in tiers or "poorly_annotated" in tiers # Verify mygene was called mock_mg.querymany.assert_called_once() # Verify UniProt was queried mock_uniprot.assert_called() def test_load_to_duckdb_idempotent(test_config, synthetic_annotation_data): """Test that load_to_duckdb is idempotent (CREATE OR REPLACE).""" store = PipelineStore.from_config(test_config) provenance = ProvenanceTracker.from_config(test_config) # First load load_to_duckdb(synthetic_annotation_data, store, provenance, "First load") # Verify data exists df1 = store.load_dataframe("annotation_completeness") assert df1 is not None assert df1.height == synthetic_annotation_data.height # Second load (should replace) modified_data = synthetic_annotation_data.with_columns( pl.lit("test_modified").alias("gene_symbol") ) load_to_duckdb(modified_data, store, provenance, "Second load") # Verify data was replaced df2 = store.load_dataframe("annotation_completeness") assert df2 is not None assert df2.height == modified_data.height assert all(df2["gene_symbol"] == "test_modified") store.close() def test_checkpoint_restart(test_config, synthetic_annotation_data): """Test checkpoint-restart pattern.""" store = PipelineStore.from_config(test_config) provenance = ProvenanceTracker.from_config(test_config) # Initially no checkpoint assert not store.has_checkpoint("annotation_completeness") # Load creates checkpoint load_to_duckdb(synthetic_annotation_data, store, provenance) # Now checkpoint exists assert store.has_checkpoint("annotation_completeness") # Can load existing data df = store.load_dataframe("annotation_completeness") assert df is not None assert df.height == synthetic_annotation_data.height store.close() def test_provenance_recording(test_config, synthetic_annotation_data): """Test that provenance metadata is recorded correctly.""" store = PipelineStore.from_config(test_config) provenance = ProvenanceTracker.from_config(test_config) load_to_duckdb(synthetic_annotation_data, store, provenance) # Verify provenance step was recorded steps = provenance.processing_steps assert len(steps) > 0 step = steps[-1] assert step["step_name"] == "load_annotation_completeness" assert "row_count" in step["details"] assert step["details"]["row_count"] == synthetic_annotation_data.height assert "well_annotated_count" in step["details"] assert "poorly_annotated_count" in step["details"] store.close() def test_query_poorly_annotated(test_config, synthetic_annotation_data): """Test querying poorly annotated genes.""" store = PipelineStore.from_config(test_config) provenance = ProvenanceTracker.from_config(test_config) # Load data load_to_duckdb(synthetic_annotation_data, store, provenance) # Query poorly annotated genes (score <= 0.3) result = query_poorly_annotated(store, max_score=0.3) # Should return genes with low scores assert result.height > 0 assert all(result["annotation_score_normalized"] <= 0.3) # Results should be sorted by score (lowest first) scores = result["annotation_score_normalized"].to_list() assert scores == sorted(scores) store.close() def test_null_handling_throughout_pipeline(test_config, mock_gene_ids, mock_uniprot_mapping): """Test that NULL values are preserved throughout the pipeline.""" # Create data with NULLs data_with_nulls = pl.DataFrame({ "gene_id": ["ENSG001", "ENSG002"], "gene_symbol": ["GENE1", "GENE2"], "go_term_count": [10, None], "go_biological_process_count": [7, None], "go_molecular_function_count": [2, None], "go_cellular_component_count": [1, None], "uniprot_annotation_score": [3, None], "has_pathway_membership": [True, None], "annotation_tier": ["partially_annotated", "poorly_annotated"], "annotation_score_normalized": [0.5, None], }) store = PipelineStore.from_config(test_config) provenance = ProvenanceTracker.from_config(test_config) # Load to DuckDB load_to_duckdb(data_with_nulls, store, provenance) # Load back and verify NULLs preserved result = store.load_dataframe("annotation_completeness") # Gene with NULL GO should have NULL in result gene2 = result.filter(pl.col("gene_id") == "ENSG002") assert gene2["go_term_count"][0] is None assert gene2["annotation_score_normalized"][0] is None store.close()