feat(03-01): add annotation DuckDB loader, CLI command, and tests

- Create load_to_duckdb with provenance tracking and tier distribution stats - Add query_poorly_annotated helper to find under-studied genes - Register `evidence annotation` CLI command with checkpoint-restart pattern - Add comprehensive unit tests (9 tests) covering GO extraction, NULL handling, tier classification, score normalization, weighting - Add integration tests (6 tests) for pipeline, idempotency, checkpoint-restart, provenance, queries - All 15 tests pass with proper NULL preservation and schema validation
2026-02-11 19:03:10 +08:00
parent 0e389c7e41
commit d70239c4ce
5 changed files with 1625 additions and 2 deletions
--- a/tests/test_annotation.py
+++ b/tests/test_annotation.py
@@ -0,0 +1,227 @@
+"""Unit tests for annotation evidence layer."""
+
+import polars as pl
+import pytest
+from unittest.mock import Mock, patch
+
+from usher_pipeline.evidence.annotation import (
+    classify_annotation_tier,
+    normalize_annotation_score,
+)
+
+
+def test_go_count_extraction():
+    """Test correct GO term counting by category."""
+    # Create synthetic data with different GO counts per category
+    df = pl.DataFrame({
+        "gene_id": ["ENSG001", "ENSG002", "ENSG003"],
+        "gene_symbol": ["GENE1", "GENE2", "GENE3"],
+        "go_term_count": [50, 15, 3],
+        "go_biological_process_count": [30, 10, 2],
+        "go_molecular_function_count": [15, 3, 1],
+        "go_cellular_component_count": [5, 2, 0],
+        "uniprot_annotation_score": [5, 4, 2],
+        "has_pathway_membership": [True, True, False],
+    })
+
+    # Verify counts sum correctly (BP + MF + CC should equal total)
+    for row in df.iter_rows(named=True):
+        expected_total = (
+            row["go_biological_process_count"]
+            + row["go_molecular_function_count"]
+            + row["go_cellular_component_count"]
+        )
+        assert row["go_term_count"] == expected_total
+
+
+def test_null_go_handling():
+    """Test that genes with no GO data get NULL counts."""
+    # Create data with NULL GO counts
+    df = pl.DataFrame({
+        "gene_id": ["ENSG001", "ENSG002"],
+        "gene_symbol": ["GENE1", "GENE2"],
+        "go_term_count": [20, None],
+        "go_biological_process_count": [15, None],
+        "go_molecular_function_count": [3, None],
+        "go_cellular_component_count": [2, None],
+        "uniprot_annotation_score": [4, 3],
+        "has_pathway_membership": [True, False],
+    })
+
+    # Verify NULL is preserved (not converted to 0)
+    assert df["go_term_count"][1] is None
+    assert df["go_biological_process_count"][1] is None
+
+
+def test_tier_classification_well_annotated():
+    """Test well_annotated tier: high GO + high UniProt."""
+    df = pl.DataFrame({
+        "gene_id": ["ENSG001", "ENSG002", "ENSG003"],
+        "gene_symbol": ["GENE1", "GENE2", "GENE3"],
+        "go_term_count": [25, 20, 22],
+        "go_biological_process_count": [15, 12, 13],
+        "go_molecular_function_count": [7, 6, 7],
+        "go_cellular_component_count": [3, 2, 2],
+        "uniprot_annotation_score": [5, 4, 4],
+        "has_pathway_membership": [True, True, False],
+    })
+
+    result = classify_annotation_tier(df)
+
+    # All should be well_annotated (GO >= 20 AND UniProt >= 4)
+    assert all(result["annotation_tier"] == "well_annotated")
+
+
+def test_tier_classification_poorly_annotated():
+    """Test poorly_annotated tier: low/NULL GO + low UniProt."""
+    df = pl.DataFrame({
+        "gene_id": ["ENSG001", "ENSG002", "ENSG003"],
+        "gene_symbol": ["GENE1", "GENE2", "GENE3"],
+        "go_term_count": [2, None, 0],
+        "go_biological_process_count": [1, None, 0],
+        "go_molecular_function_count": [1, None, 0],
+        "go_cellular_component_count": [0, None, 0],
+        "uniprot_annotation_score": [2, None, 1],
+        "has_pathway_membership": [False, None, False],
+    })
+
+    result = classify_annotation_tier(df)
+
+    # All should be poorly_annotated
+    assert all(result["annotation_tier"] == "poorly_annotated")
+
+
+def test_tier_classification_partial():
+    """Test partially_annotated tier: medium annotations."""
+    df = pl.DataFrame({
+        "gene_id": ["ENSG001", "ENSG002", "ENSG003"],
+        "gene_symbol": ["GENE1", "GENE2", "GENE3"],
+        "go_term_count": [10, 3, 15],
+        "go_biological_process_count": [7, 2, 10],
+        "go_molecular_function_count": [2, 1, 4],
+        "go_cellular_component_count": [1, 0, 1],
+        "uniprot_annotation_score": [3, 3, 2],
+        "has_pathway_membership": [True, False, True],
+    })
+
+    result = classify_annotation_tier(df)
+
+    # All should be partially_annotated (GO >= 5 OR UniProt >= 3)
+    assert all(result["annotation_tier"] == "partially_annotated")
+
+
+def test_normalization_bounds():
+    """Test that normalized scores are always in [0, 1] range."""
+    df = pl.DataFrame({
+        "gene_id": ["ENSG001", "ENSG002", "ENSG003", "ENSG004"],
+        "gene_symbol": ["GENE1", "GENE2", "GENE3", "GENE4"],
+        "go_term_count": [100, 50, 10, 1],
+        "go_biological_process_count": [60, 30, 7, 1],
+        "go_molecular_function_count": [30, 15, 2, 0],
+        "go_cellular_component_count": [10, 5, 1, 0],
+        "uniprot_annotation_score": [5, 4, 3, 1],
+        "has_pathway_membership": [True, True, False, False],
+    })
+
+    result = normalize_annotation_score(df)
+
+    # All non-NULL scores should be in [0, 1]
+    scores = result.filter(pl.col("annotation_score_normalized").is_not_null())["annotation_score_normalized"]
+    assert all(scores >= 0.0)
+    assert all(scores <= 1.0)
+
+
+def test_normalization_null_preservation():
+    """Test that all-NULL inputs produce NULL score."""
+    df = pl.DataFrame({
+        "gene_id": ["ENSG001"],
+        "gene_symbol": ["GENE1"],
+        "go_term_count": pl.Series([None], dtype=pl.Int64),
+        "go_biological_process_count": pl.Series([None], dtype=pl.Int64),
+        "go_molecular_function_count": pl.Series([None], dtype=pl.Int64),
+        "go_cellular_component_count": pl.Series([None], dtype=pl.Int64),
+        "uniprot_annotation_score": pl.Series([None], dtype=pl.Int64),
+        "has_pathway_membership": pl.Series([None], dtype=pl.Boolean),
+    })
+
+    result = normalize_annotation_score(df)
+
+    # Should get NULL score (not 0.0)
+    assert result["annotation_score_normalized"][0] is None
+
+
+def test_normalization_with_pathway():
+    """Test that pathway membership contributes to score."""
+    # Two genes with identical GO/UniProt, different pathway membership
+    df = pl.DataFrame({
+        "gene_id": ["ENSG001", "ENSG002"],
+        "gene_symbol": ["GENE1", "GENE2"],
+        "go_term_count": [10, 10],
+        "go_biological_process_count": [7, 7],
+        "go_molecular_function_count": [2, 2],
+        "go_cellular_component_count": [1, 1],
+        "uniprot_annotation_score": [3, 3],
+        "has_pathway_membership": [True, False],
+    })
+
+    result = normalize_annotation_score(df)
+
+    # Gene with pathway should have higher score
+    assert result["annotation_score_normalized"][0] > result["annotation_score_normalized"][1]
+
+
+def test_composite_weighting():
+    """Test that composite score follows 0.5/0.3/0.2 weight distribution."""
+    # Create gene with only GO data (should contribute 50% weight)
+    df_go_only = pl.DataFrame({
+        "gene_id": ["ENSG001"],
+        "gene_symbol": ["GENE1"],
+        "go_term_count": [100],  # Max GO to get full GO component
+        "go_biological_process_count": [60],
+        "go_molecular_function_count": [30],
+        "go_cellular_component_count": [10],
+        "uniprot_annotation_score": pl.Series([None], dtype=pl.Int64),
+        "has_pathway_membership": pl.Series([None], dtype=pl.Boolean),
+    })
+
+    # Create gene with only UniProt data (should contribute 30% weight)
+    df_uniprot_only = pl.DataFrame({
+        "gene_id": ["ENSG002"],
+        "gene_symbol": ["GENE2"],
+        "go_term_count": pl.Series([None], dtype=pl.Int64),
+        "go_biological_process_count": pl.Series([None], dtype=pl.Int64),
+        "go_molecular_function_count": pl.Series([None], dtype=pl.Int64),
+        "go_cellular_component_count": pl.Series([None], dtype=pl.Int64),
+        "uniprot_annotation_score": [5],  # Max UniProt score
+        "has_pathway_membership": pl.Series([None], dtype=pl.Boolean),
+    })
+
+    # Create gene with only pathway data (should contribute 20% weight)
+    df_pathway_only = pl.DataFrame({
+        "gene_id": ["ENSG003"],
+        "gene_symbol": ["GENE3"],
+        "go_term_count": pl.Series([None], dtype=pl.Int64),
+        "go_biological_process_count": pl.Series([None], dtype=pl.Int64),
+        "go_molecular_function_count": pl.Series([None], dtype=pl.Int64),
+        "go_cellular_component_count": pl.Series([None], dtype=pl.Int64),
+        "uniprot_annotation_score": pl.Series([None], dtype=pl.Int64),
+        "has_pathway_membership": [True],
+    })
+
+    # Normalize each separately (need same GO max, so combine first)
+    df_combined = pl.concat([df_go_only, df_uniprot_only, df_pathway_only])
+    result = normalize_annotation_score(df_combined)
+
+    # Check approximate weights (allowing for small rounding)
+    go_score = result["annotation_score_normalized"][0]
+    uniprot_score = result["annotation_score_normalized"][1]
+    pathway_score = result["annotation_score_normalized"][2]
+
+    # GO component should be ~0.5 (full weight)
+    assert abs(go_score - 0.5) < 0.01
+
+    # UniProt component should be 0.3 (full score * weight)
+    assert abs(uniprot_score - 0.3) < 0.01
+
+    # Pathway component should be 0.2 (full weight)
+    assert abs(pathway_score - 0.2) < 0.01
--- a/tests/test_annotation_integration.py
+++ b/tests/test_annotation_integration.py
@@ -0,0 +1,307 @@
+"""Integration tests for annotation evidence layer."""
+
+import polars as pl
+import pytest
+from pathlib import Path
+from unittest.mock import Mock, patch, MagicMock
+
+from usher_pipeline.config.loader import load_config
+from usher_pipeline.persistence import PipelineStore, ProvenanceTracker
+from usher_pipeline.evidence.annotation import (
+    process_annotation_evidence,
+    load_to_duckdb,
+    query_poorly_annotated,
+)
+
+
+@pytest.fixture
+def test_config(tmp_path):
+    """Create test configuration."""
+    config_dir = tmp_path / "config"
+    config_dir.mkdir()
+    data_dir = tmp_path / "data"
+    data_dir.mkdir()
+
+    config_yaml = f"""
+project_name: "usher-pipeline-test"
+data_dir: "{data_dir}"
+cache_dir: "{tmp_path / 'cache'}"
+duckdb_path: "{tmp_path / 'test.duckdb'}"
+
+versions:
+  ensembl_release: 112
+  gnomad_version: "4.1"
+
+api:
+  rate_limit_per_second: 5
+  max_retries: 3
+  cache_ttl_seconds: 86400
+  timeout_seconds: 30
+
+scoring:
+  gnomad: 0.20
+  expression: 0.20
+  annotation: 0.15
+  localization: 0.15
+  animal_model: 0.15
+  literature: 0.15
+"""
+    config_file = config_dir / "pipeline.yaml"
+    config_file.write_text(config_yaml)
+
+    return load_config(config_file)
+
+
+@pytest.fixture
+def mock_gene_ids():
+    """Sample gene IDs for testing."""
+    return ["ENSG001", "ENSG002", "ENSG003", "ENSG004", "ENSG005"]
+
+
+@pytest.fixture
+def mock_uniprot_mapping():
+    """Mock UniProt mapping DataFrame."""
+    return pl.DataFrame({
+        "gene_id": ["ENSG001", "ENSG002", "ENSG003"],
+        "uniprot_accession": ["P12345", "Q67890", "A11111"],
+    })
+
+
+@pytest.fixture
+def synthetic_annotation_data():
+    """Create synthetic annotation data for testing."""
+    return pl.DataFrame({
+        "gene_id": ["ENSG001", "ENSG002", "ENSG003", "ENSG004", "ENSG005"],
+        "gene_symbol": ["GENE1", "GENE2", "GENE3", "GENE4", "GENE5"],
+        "go_term_count": [50, 15, 5, None, 2],
+        "go_biological_process_count": [30, 10, 3, None, 1],
+        "go_molecular_function_count": [15, 3, 2, None, 1],
+        "go_cellular_component_count": [5, 2, 0, None, 0],
+        "uniprot_annotation_score": [5, 4, 3, None, 1],
+        "has_pathway_membership": [True, True, False, None, False],
+        "annotation_tier": ["well_annotated", "well_annotated", "partially_annotated", "poorly_annotated", "poorly_annotated"],
+        "annotation_score_normalized": [0.9, 0.75, 0.45, None, 0.15],
+    })
+
+
+def mock_mygene_querymany(gene_ids, **kwargs):
+    """Mock mygene.querymany response."""
+    # Simulate different annotation levels
+    return [
+        {
+            "query": "ENSG001",
+            "symbol": "GENE1",
+            "go": {
+                "BP": [{"id": f"GO:000{i}"} for i in range(30)],
+                "MF": [{"id": f"GO:100{i}"} for i in range(15)],
+                "CC": [{"id": f"GO:200{i}"} for i in range(5)],
+            },
+            "pathway": {
+                "kegg": [{"id": "hsa00001"}],
+                "reactome": [{"id": "R-HSA-00001"}],
+            },
+        },
+        {
+            "query": "ENSG002",
+            "symbol": "GENE2",
+            "go": {
+                "BP": [{"id": f"GO:000{i}"} for i in range(10)],
+                "MF": [{"id": f"GO:100{i}"} for i in range(3)],
+                "CC": [{"id": f"GO:200{i}"} for i in range(2)],
+            },
+            "pathway": {"kegg": [{"id": "hsa00002"}]},
+        },
+        {
+            "query": "ENSG003",
+            "symbol": "GENE3",
+            "go": {
+                "BP": [{"id": "GO:0001"}, {"id": "GO:0002"}],
+            },
+            "pathway": {},
+        },
+        {
+            "query": "ENSG004",
+            "symbol": "GENE4",
+            # No GO or pathway data
+        },
+        {
+            "query": "ENSG005",
+            "symbol": "GENE5",
+            "go": {
+                "BP": [{"id": "GO:0001"}],
+            },
+        },
+    ]
+
+
+def mock_uniprot_api_response():
+    """Mock UniProt API response."""
+    return {
+        "results": [
+            {"primaryAccession": "P12345", "annotationScore": 5},
+            {"primaryAccession": "Q67890", "annotationScore": 4},
+            {"primaryAccession": "A11111", "annotationScore": 3},
+        ]
+    }
+
+
+@patch("usher_pipeline.evidence.annotation.fetch._get_mygene_client")
+@patch("usher_pipeline.evidence.annotation.fetch._query_uniprot_batch")
+def test_process_annotation_evidence_pipeline(
+    mock_uniprot, mock_mygene_client, mock_gene_ids, mock_uniprot_mapping
+):
+    """Test full annotation evidence processing pipeline."""
+    # Setup mocks
+    mock_mg = Mock()
+    mock_mg.querymany.return_value = mock_mygene_querymany(mock_gene_ids)
+    mock_mygene_client.return_value = mock_mg
+
+    mock_uniprot.return_value = {
+        "P12345": 5,
+        "Q67890": 4,
+        "A11111": 3,
+    }
+
+    # Run pipeline
+    result = process_annotation_evidence(mock_gene_ids, mock_uniprot_mapping)
+
+    # Verify results
+    assert result.height == len(mock_gene_ids)
+    assert "annotation_tier" in result.columns
+    assert "annotation_score_normalized" in result.columns
+
+    # Check that tiers are classified
+    tiers = result["annotation_tier"].unique().to_list()
+    assert "well_annotated" in tiers or "partially_annotated" in tiers or "poorly_annotated" in tiers
+
+    # Verify mygene was called
+    mock_mg.querymany.assert_called_once()
+
+    # Verify UniProt was queried
+    mock_uniprot.assert_called()
+
+
+def test_load_to_duckdb_idempotent(test_config, synthetic_annotation_data):
+    """Test that load_to_duckdb is idempotent (CREATE OR REPLACE)."""
+    store = PipelineStore.from_config(test_config)
+    provenance = ProvenanceTracker.from_config(test_config)
+
+    # First load
+    load_to_duckdb(synthetic_annotation_data, store, provenance, "First load")
+
+    # Verify data exists
+    df1 = store.load_dataframe("annotation_completeness")
+    assert df1 is not None
+    assert df1.height == synthetic_annotation_data.height
+
+    # Second load (should replace)
+    modified_data = synthetic_annotation_data.with_columns(
+        pl.lit("test_modified").alias("gene_symbol")
+    )
+    load_to_duckdb(modified_data, store, provenance, "Second load")
+
+    # Verify data was replaced
+    df2 = store.load_dataframe("annotation_completeness")
+    assert df2 is not None
+    assert df2.height == modified_data.height
+    assert all(df2["gene_symbol"] == "test_modified")
+
+    store.close()
+
+
+def test_checkpoint_restart(test_config, synthetic_annotation_data):
+    """Test checkpoint-restart pattern."""
+    store = PipelineStore.from_config(test_config)
+    provenance = ProvenanceTracker.from_config(test_config)
+
+    # Initially no checkpoint
+    assert not store.has_checkpoint("annotation_completeness")
+
+    # Load creates checkpoint
+    load_to_duckdb(synthetic_annotation_data, store, provenance)
+
+    # Now checkpoint exists
+    assert store.has_checkpoint("annotation_completeness")
+
+    # Can load existing data
+    df = store.load_dataframe("annotation_completeness")
+    assert df is not None
+    assert df.height == synthetic_annotation_data.height
+
+    store.close()
+
+
+def test_provenance_recording(test_config, synthetic_annotation_data):
+    """Test that provenance metadata is recorded correctly."""
+    store = PipelineStore.from_config(test_config)
+    provenance = ProvenanceTracker.from_config(test_config)
+
+    load_to_duckdb(synthetic_annotation_data, store, provenance)
+
+    # Verify provenance step was recorded
+    steps = provenance.processing_steps
+    assert len(steps) > 0
+
+    step = steps[-1]
+    assert step["step_name"] == "load_annotation_completeness"
+    assert "row_count" in step["details"]
+    assert step["details"]["row_count"] == synthetic_annotation_data.height
+    assert "well_annotated_count" in step["details"]
+    assert "poorly_annotated_count" in step["details"]
+
+    store.close()
+
+
+def test_query_poorly_annotated(test_config, synthetic_annotation_data):
+    """Test querying poorly annotated genes."""
+    store = PipelineStore.from_config(test_config)
+    provenance = ProvenanceTracker.from_config(test_config)
+
+    # Load data
+    load_to_duckdb(synthetic_annotation_data, store, provenance)
+
+    # Query poorly annotated genes (score <= 0.3)
+    result = query_poorly_annotated(store, max_score=0.3)
+
+    # Should return genes with low scores
+    assert result.height > 0
+    assert all(result["annotation_score_normalized"] <= 0.3)
+
+    # Results should be sorted by score (lowest first)
+    scores = result["annotation_score_normalized"].to_list()
+    assert scores == sorted(scores)
+
+    store.close()
+
+
+def test_null_handling_throughout_pipeline(test_config, mock_gene_ids, mock_uniprot_mapping):
+    """Test that NULL values are preserved throughout the pipeline."""
+    # Create data with NULLs
+    data_with_nulls = pl.DataFrame({
+        "gene_id": ["ENSG001", "ENSG002"],
+        "gene_symbol": ["GENE1", "GENE2"],
+        "go_term_count": [10, None],
+        "go_biological_process_count": [7, None],
+        "go_molecular_function_count": [2, None],
+        "go_cellular_component_count": [1, None],
+        "uniprot_annotation_score": [3, None],
+        "has_pathway_membership": [True, None],
+        "annotation_tier": ["partially_annotated", "poorly_annotated"],
+        "annotation_score_normalized": [0.5, None],
+    })
+
+    store = PipelineStore.from_config(test_config)
+    provenance = ProvenanceTracker.from_config(test_config)
+
+    # Load to DuckDB
+    load_to_duckdb(data_with_nulls, store, provenance)
+
+    # Load back and verify NULLs preserved
+    result = store.load_dataframe("annotation_completeness")
+
+    # Gene with NULL GO should have NULL in result
+    gene2 = result.filter(pl.col("gene_id") == "ENSG002")
+    assert gene2["go_term_count"][0] is None
+    assert gene2["annotation_score_normalized"][0] is None
+
+    store.close()