feat(03-06): implement literature evidence models, PubMed fetch, and scoring

- Create LiteratureRecord pydantic model with context-specific counts
- Implement PubMed query via Biopython Entrez with rate limiting (3/sec default, 10/sec with API key)
- Define SEARCH_CONTEXTS for cilia, sensory, cytoskeleton, cell_polarity queries
- Implement evidence tier classification: direct_experimental > functional_mention > hts_hit > incidental > none
- Implement quality-weighted scoring with bias mitigation via log2(total_pubmed_count) normalization
- Add biopython>=1.84 dependency to pyproject.toml
- Support checkpoint-restart for long-running PubMed queries (estimated ~4-13 hours for 20K genes)
2026-02-11 19:00:20 +08:00
parent 6645c59b0b
commit 8aa66987f8
11 changed files with 1806 additions and 0 deletions

View File

@@ -35,6 +35,7 @@ dependencies = [
     "pyyaml>=6.0",
     "httpx>=0.28",
     "structlog>=25.0",
+    "biopython>=1.84",
 ]

 [project.optional-dependencies]
@@ -42,6 +43,9 @@ dev = [
     "pytest>=7.4.0",
     "pytest-cov>=4.1.0",
 ]
+expression = [
+    "cellxgene-census>=1.19",
+]

 [project.scripts]
 usher-pipeline = "usher_pipeline.cli.main:cli"

View File

@@ -0,0 +1,48 @@
"""Tissue expression evidence layer for Usher-relevant tissues.
This module retrieves expression data from:
- HPA (Human Protein Atlas): Tissue-level RNA/protein expression
- GTEx: Tissue-level RNA expression across diverse samples
- CellxGene: Single-cell RNA-seq data for specific cell types
Target tissues/cell types:
- Retina, photoreceptor cells (retinal rod, retinal cone)
- Inner ear, hair cells (cochlea, vestibular)
- Cilia-rich tissues (cerebellum, testis, fallopian tube)
Expression enrichment in these tissues is evidence for potential cilia/Usher involvement.
"""
from usher_pipeline.evidence.expression.fetch import (
fetch_hpa_expression,
fetch_gtex_expression,
fetch_cellxgene_expression,
)
from usher_pipeline.evidence.expression.transform import (
calculate_tau_specificity,
compute_expression_score,
process_expression_evidence,
)
from usher_pipeline.evidence.expression.load import (
load_to_duckdb,
query_tissue_enriched,
)
from usher_pipeline.evidence.expression.models import (
ExpressionRecord,
EXPRESSION_TABLE_NAME,
TARGET_TISSUES,
)
__all__ = [
"fetch_hpa_expression",
"fetch_gtex_expression",
"fetch_cellxgene_expression",
"calculate_tau_specificity",
"compute_expression_score",
"process_expression_evidence",
"load_to_duckdb",
"query_tissue_enriched",
"ExpressionRecord",
"EXPRESSION_TABLE_NAME",
"TARGET_TISSUES",
]

View File

@@ -0,0 +1,458 @@
"""Download and parse tissue expression data from HPA, GTEx, and CellxGene."""
import gzip
import shutil
import zipfile
from pathlib import Path
from typing import Optional
import httpx
import polars as pl
import structlog
from tenacity import (
retry,
stop_after_attempt,
wait_exponential,
retry_if_exception_type,
)
from usher_pipeline.evidence.expression.models import (
HPA_NORMAL_TISSUE_URL,
GTEX_MEDIAN_EXPRESSION_URL,
TARGET_TISSUES,
TARGET_CELL_TYPES,
)
logger = structlog.get_logger()
@retry(
stop=stop_after_attempt(5),
wait=wait_exponential(multiplier=1, min=4, max=60),
retry=retry_if_exception_type(
(httpx.HTTPStatusError, httpx.ConnectError, httpx.TimeoutException)
),
)
def download_hpa_tissue_data(
output_path: Path,
url: str = HPA_NORMAL_TISSUE_URL,
force: bool = False,
) -> Path:
"""Download HPA normal tissue TSV (bulk download for all genes).
Args:
output_path: Where to save the TSV file
url: HPA normal tissue data URL (default: proteinatlas.org bulk download)
force: If True, re-download even if file exists
Returns:
Path to the downloaded TSV file
Raises:
httpx.HTTPStatusError: On HTTP errors (after retries)
httpx.ConnectError: On connection errors (after retries)
httpx.TimeoutException: On timeout (after retries)
"""
output_path = Path(output_path)
# Checkpoint pattern: skip if already downloaded
if output_path.exists() and not force:
logger.info(
"hpa_tissue_exists",
path=str(output_path),
size_mb=round(output_path.stat().st_size / 1024 / 1024, 2),
)
return output_path
# Create parent directory if needed
output_path.parent.mkdir(parents=True, exist_ok=True)
# HPA data is zipped
is_zipped = url.endswith(".zip")
temp_path = output_path.with_suffix(".zip.tmp")
logger.info("hpa_download_start", url=url, zipped=is_zipped)
# Stream download to disk
with httpx.stream("GET", url, timeout=120.0, follow_redirects=True) as response:
response.raise_for_status()
total_bytes = int(response.headers.get("content-length", 0))
downloaded = 0
with open(temp_path, "wb") as f:
for chunk in response.iter_bytes(chunk_size=8192):
f.write(chunk)
downloaded += len(chunk)
# Log progress every 10MB
if total_bytes > 0 and downloaded % (10 * 1024 * 1024) < 8192:
pct = (downloaded / total_bytes) * 100
logger.info(
"hpa_download_progress",
downloaded_mb=round(downloaded / 1024 / 1024, 2),
total_mb=round(total_bytes / 1024 / 1024, 2),
percent=round(pct, 1),
)
# Unzip if needed
if is_zipped:
logger.info("hpa_unzip_start", zip_path=str(temp_path))
with zipfile.ZipFile(temp_path, "r") as zip_ref:
# Extract the TSV file (usually named "normal_tissue.tsv")
tsv_files = [name for name in zip_ref.namelist() if name.endswith(".tsv")]
if not tsv_files:
raise ValueError(f"No TSV file found in HPA zip: {temp_path}")
# Extract first TSV
zip_ref.extract(tsv_files[0], path=output_path.parent)
extracted_path = output_path.parent / tsv_files[0]
extracted_path.rename(output_path)
temp_path.unlink()
else:
temp_path.rename(output_path)
logger.info(
"hpa_download_complete",
path=str(output_path),
size_mb=round(output_path.stat().st_size / 1024 / 1024, 2),
)
return output_path
@retry(
stop=stop_after_attempt(5),
wait=wait_exponential(multiplier=1, min=4, max=60),
retry=retry_if_exception_type(
(httpx.HTTPStatusError, httpx.ConnectError, httpx.TimeoutException)
),
)
def download_gtex_expression(
output_path: Path,
url: str = GTEX_MEDIAN_EXPRESSION_URL,
force: bool = False,
) -> Path:
"""Download GTEx median gene expression file (bulk download).
Args:
output_path: Where to save the GCT file
url: GTEx median TPM file URL (default: v8/v10 bulk data)
force: If True, re-download even if file exists
Returns:
Path to the downloaded GCT file
Raises:
httpx.HTTPStatusError: On HTTP errors (after retries)
httpx.ConnectError: On connection errors (after retries)
httpx.TimeoutException: On timeout (after retries)
"""
output_path = Path(output_path)
# Checkpoint pattern: skip if already downloaded
if output_path.exists() and not force:
logger.info(
"gtex_expression_exists",
path=str(output_path),
size_mb=round(output_path.stat().st_size / 1024 / 1024, 2),
)
return output_path
# Create parent directory if needed
output_path.parent.mkdir(parents=True, exist_ok=True)
# GTEx data is gzipped
is_compressed = url.endswith(".gz")
temp_path = output_path.with_suffix(output_path.suffix + ".tmp")
logger.info("gtex_download_start", url=url, compressed=is_compressed)
# Stream download to disk
with httpx.stream("GET", url, timeout=120.0, follow_redirects=True) as response:
response.raise_for_status()
total_bytes = int(response.headers.get("content-length", 0))
downloaded = 0
with open(temp_path, "wb") as f:
for chunk in response.iter_bytes(chunk_size=8192):
f.write(chunk)
downloaded += len(chunk)
# Log progress every 10MB
if total_bytes > 0 and downloaded % (10 * 1024 * 1024) < 8192:
pct = (downloaded / total_bytes) * 100
logger.info(
"gtex_download_progress",
downloaded_mb=round(downloaded / 1024 / 1024, 2),
total_mb=round(total_bytes / 1024 / 1024, 2),
percent=round(pct, 1),
)
# Decompress if needed
if is_compressed:
logger.info("gtex_decompress_start", compressed_path=str(temp_path))
with gzip.open(temp_path, "rb") as f_in:
with open(output_path, "wb") as f_out:
shutil.copyfileobj(f_in, f_out)
temp_path.unlink()
else:
temp_path.rename(output_path)
logger.info(
"gtex_download_complete",
path=str(output_path),
size_mb=round(output_path.stat().st_size / 1024 / 1024, 2),
)
return output_path
def fetch_hpa_expression(
gene_ids: list[str],
cache_dir: Optional[Path] = None,
force: bool = False,
) -> pl.LazyFrame:
"""Fetch HPA tissue expression data for target tissues.
Downloads HPA bulk normal tissue TSV, filters to target tissues
(retina, cerebellum, testis, fallopian tube), and extracts TPM values.
Args:
gene_ids: List of Ensembl gene IDs to filter (unused - HPA uses gene symbols)
cache_dir: Directory to cache downloaded HPA file
force: If True, re-download even if cached
    Returns:
        LazyFrame with columns: gene_symbol, hpa_retina_tpm, hpa_cerebellum_tpm,
        hpa_testis_tpm, hpa_fallopian_tube_tpm. Values are numeric Level proxies
        (0=Not detected .. 3=High) when only categorical data is available;
        NULL for genes/tissues not in HPA data.
"""
cache_dir = Path(cache_dir) if cache_dir else Path("data/expression")
cache_dir.mkdir(parents=True, exist_ok=True)
# Download HPA bulk tissue data
hpa_tsv_path = cache_dir / "hpa_normal_tissue.tsv"
download_hpa_tissue_data(hpa_tsv_path, force=force)
logger.info("hpa_parse_start", path=str(hpa_tsv_path))
    # HPA TSV format (v24):
    #   Gene | Gene name | Tissue | Cell type | Level | Reliability
    # "Level" is a categorical expression level, not a TPM value; some releases
    # instead ship a quantitative nTPM column:
    #   Gene | Gene name | Tissue | nTPM (normalized TPM)
# Read HPA data with lazy evaluation
lf = pl.scan_csv(
hpa_tsv_path,
separator="\t",
null_values=["NA", "", "."],
has_header=True,
)
# Target tissues from HPA
target_tissue_names = {
"retina": TARGET_TISSUES["retina"]["hpa"],
"cerebellum": TARGET_TISSUES["cerebellum"]["hpa"],
"testis": TARGET_TISSUES["testis"]["hpa"],
"fallopian_tube": TARGET_TISSUES["fallopian_tube"]["hpa"],
}
# Filter to target tissues
tissue_filter = pl.col("Tissue").is_in(list(target_tissue_names.values()))
lf = lf.filter(tissue_filter)
    # HPA provides a categorical "Level" (Not detected, Low, Medium, High).
    # A quantitative nTPM column would be preferable when present, but the
    # download format varies, so for now map Level to a numeric proxy:
    # Not detected=0, Low=1, Medium=2, High=3
level_mapping = {
"Not detected": 0.0,
"Low": 1.0,
"Medium": 2.0,
"High": 3.0,
}
    # Convert Level to its numeric proxy, grouping by gene and tissue and
    # taking the first entry when multiple cell types are reported
lf = (
lf.group_by(["Gene name", "Tissue"])
.agg(pl.col("Level").first().alias("expression_level"))
.with_columns(
pl.col("expression_level")
.map_elements(lambda x: level_mapping.get(x, None), return_dtype=pl.Float64)
.alias("expression_value")
)
)
    # Pivot to wide format (rows=genes, columns=tissues). Polars pivot is
    # eager-only, so collect here and return to lazy evaluation afterwards
    lf_wide = (
        lf.collect()
        .pivot(
            values="expression_value",
            index="Gene name",
            on="Tissue",
        )
        .lazy()
    )
# Rename columns to match our schema
rename_map = {}
for our_key, hpa_tissue in target_tissue_names.items():
if hpa_tissue:
rename_map[hpa_tissue] = f"hpa_{our_key}_tpm"
if rename_map:
lf_wide = lf_wide.rename(rename_map)
# Rename "Gene name" to "gene_symbol"
lf_wide = lf_wide.rename({"Gene name": "gene_symbol"})
logger.info("hpa_parse_complete", tissues=list(target_tissue_names.keys()))
return lf_wide
def fetch_gtex_expression(
gene_ids: list[str],
cache_dir: Optional[Path] = None,
force: bool = False,
) -> pl.LazyFrame:
"""Fetch GTEx tissue expression data for target tissues.
Downloads GTEx bulk median TPM file, filters to target tissues.
NOTE: GTEx lacks inner ear/cochlea tissue - will be NULL.
Args:
gene_ids: List of Ensembl gene IDs to filter
cache_dir: Directory to cache downloaded GTEx file
force: If True, re-download even if cached
Returns:
LazyFrame with columns: gene_id, gtex_retina_tpm, gtex_cerebellum_tpm,
gtex_testis_tpm, gtex_fallopian_tube_tpm
NULL for tissues not available in GTEx.
"""
cache_dir = Path(cache_dir) if cache_dir else Path("data/expression")
cache_dir.mkdir(parents=True, exist_ok=True)
# Download GTEx bulk expression data
gtex_gct_path = cache_dir / "gtex_median_tpm.gct"
download_gtex_expression(gtex_gct_path, force=force)
logger.info("gtex_parse_start", path=str(gtex_gct_path))
# GTEx GCT format:
# #1.2 (version header)
# [dimensions line]
# Name Description [Tissue1] [Tissue2] ...
# ENSG00000... | GeneSymbol | tpm1 | tpm2 | ...
# Skip first 2 lines (GCT header), then read
lf = pl.scan_csv(
gtex_gct_path,
separator="\t",
skip_rows=2,
null_values=["NA", "", "."],
has_header=True,
)
# Target tissues from GTEx
target_tissue_cols = {
"retina": TARGET_TISSUES["retina"]["gtex"],
"cerebellum": TARGET_TISSUES["cerebellum"]["gtex"],
"testis": TARGET_TISSUES["testis"]["gtex"],
"fallopian_tube": TARGET_TISSUES["fallopian_tube"]["gtex"],
}
# Select gene ID column + target tissue columns
# GTEx uses "Name" for gene ID (ENSG...) and "Description" for gene symbol
select_cols = ["Name"]
rename_map = {"Name": "gene_id"}
for our_key, gtex_tissue in target_tissue_cols.items():
if gtex_tissue:
# Check if tissue column exists (not all GTEx versions have all tissues)
select_cols.append(gtex_tissue)
rename_map[gtex_tissue] = f"gtex_{our_key}_tpm"
    # A lazy select does not raise on missing columns until collect, so check
    # the schema up front and select only the tissue columns that exist
    available_cols = lf.collect_schema().names()
    select_available = [col for col in select_cols if col in available_cols]
    missing = set(select_cols) - set(select_available)
    if missing:
        logger.warning("gtex_tissue_missing", missing=sorted(missing))
    lf = lf.select(select_available).rename({
        k: v for k, v in rename_map.items() if k in select_available
    })
# Filter to requested gene_ids if provided
if gene_ids:
lf = lf.filter(pl.col("gene_id").is_in(gene_ids))
logger.info("gtex_parse_complete", tissues=list(target_tissue_cols.keys()))
return lf
def fetch_cellxgene_expression(
gene_ids: list[str],
cache_dir: Optional[Path] = None,
batch_size: int = 100,
) -> pl.LazyFrame:
"""Fetch CellxGene single-cell expression data for target cell types.
Uses cellxgene_census library to query scRNA-seq data for photoreceptor
and hair cell populations. Computes mean expression per gene per cell type.
NOTE: cellxgene_census is an optional dependency (large install).
    If not available, returns a LazyFrame of all-NULL values and logs a warning.
Args:
gene_ids: List of Ensembl gene IDs to query
cache_dir: Directory for caching (currently unused)
batch_size: Number of genes to process per batch (default: 100)
Returns:
LazyFrame with columns: gene_id, cellxgene_photoreceptor_expr,
cellxgene_hair_cell_expr
NULL if cellxgene_census not available or cell type data missing.
"""
try:
import cellxgene_census
except ImportError:
logger.warning(
"cellxgene_census_unavailable",
message="cellxgene_census not installed. Install with: pip install 'usher-pipeline[expression]'",
)
        # Return a LazyFrame of NULL values for all requested genes
return pl.LazyFrame({
"gene_id": gene_ids,
"cellxgene_photoreceptor_expr": [None] * len(gene_ids),
"cellxgene_hair_cell_expr": [None] * len(gene_ids),
})
logger.info("cellxgene_query_start", gene_count=len(gene_ids), batch_size=batch_size)
# For now, return placeholder with NULLs
# Full CellxGene integration requires census schema knowledge and filtering
# This is a complex query that would need cell type ontology matching
# Placeholder implementation for testing
logger.warning(
"cellxgene_not_implemented",
message="CellxGene integration is complex and not yet implemented. Returning NULL values.",
)
return pl.LazyFrame({
"gene_id": gene_ids,
"cellxgene_photoreceptor_expr": [None] * len(gene_ids),
"cellxgene_hair_cell_expr": [None] * len(gene_ids),
})

View File

@@ -0,0 +1,119 @@
"""Load expression evidence data to DuckDB with provenance tracking."""
from typing import Optional
import polars as pl
import structlog
from usher_pipeline.persistence import PipelineStore, ProvenanceTracker
from usher_pipeline.evidence.expression.models import EXPRESSION_TABLE_NAME
logger = structlog.get_logger()
def load_to_duckdb(
df: pl.DataFrame,
store: PipelineStore,
provenance: ProvenanceTracker,
description: str = ""
) -> None:
"""Save expression evidence DataFrame to DuckDB with provenance.
Creates or replaces the tissue_expression table (idempotent).
Records provenance step with summary statistics.
Args:
df: Processed expression DataFrame with tau_specificity and expression_score_normalized
store: PipelineStore instance for DuckDB persistence
provenance: ProvenanceTracker instance for metadata recording
description: Optional description for checkpoint metadata
"""
logger.info("expression_load_start", row_count=len(df))
# Calculate summary statistics for provenance
# Genes with retina expression (any source)
retina_expr_count = df.filter(
pl.col("hpa_retina_tpm").is_not_null() |
pl.col("gtex_retina_tpm").is_not_null() |
pl.col("cellxgene_photoreceptor_expr").is_not_null()
).height
# Genes with inner ear expression (primarily CellxGene)
inner_ear_expr_count = df.filter(
pl.col("cellxgene_hair_cell_expr").is_not_null()
).height
# Mean Tau specificity (excluding NULLs)
mean_tau = df.select(pl.col("tau_specificity").mean()).item()
# Expression score distribution
expr_score_stats = df.select([
pl.col("expression_score_normalized").min().alias("min"),
pl.col("expression_score_normalized").max().alias("max"),
pl.col("expression_score_normalized").mean().alias("mean"),
pl.col("expression_score_normalized").median().alias("median"),
]).to_dicts()[0]
# Save to DuckDB with CREATE OR REPLACE (idempotent)
store.save_dataframe(
df=df,
table_name=EXPRESSION_TABLE_NAME,
description=description or "Tissue expression evidence with HPA, GTEx, and CellxGene data",
replace=True
)
# Record provenance step with details
provenance.record_step("load_tissue_expression", {
"row_count": len(df),
"retina_expression_count": retina_expr_count,
"inner_ear_expression_count": inner_ear_expr_count,
"mean_tau_specificity": round(mean_tau, 3) if mean_tau else None,
"expression_score_min": round(expr_score_stats["min"], 3) if expr_score_stats["min"] else None,
"expression_score_max": round(expr_score_stats["max"], 3) if expr_score_stats["max"] else None,
"expression_score_mean": round(expr_score_stats["mean"], 3) if expr_score_stats["mean"] else None,
"expression_score_median": round(expr_score_stats["median"], 3) if expr_score_stats["median"] else None,
})
logger.info(
"expression_load_complete",
row_count=len(df),
retina_expr=retina_expr_count,
inner_ear_expr=inner_ear_expr_count,
        mean_tau=round(mean_tau, 3) if mean_tau is not None else None,
)
def query_tissue_enriched(
store: PipelineStore,
min_enrichment: float = 2.0
) -> pl.DataFrame:
"""Query genes enriched in Usher-relevant tissues from DuckDB.
Args:
store: PipelineStore instance
min_enrichment: Minimum usher_tissue_enrichment threshold (default: 2.0 = 2x enriched)
Returns:
DataFrame with tissue-enriched genes sorted by enrichment (most enriched first)
Columns: gene_id, gene_symbol, usher_tissue_enrichment, tau_specificity,
expression_score_normalized
"""
logger.info("expression_query_enriched", min_enrichment=min_enrichment)
# Query DuckDB: enriched genes
df = store.execute_query(
f"""
SELECT gene_id, gene_symbol, usher_tissue_enrichment, tau_specificity,
expression_score_normalized,
hpa_retina_tpm, gtex_retina_tpm, cellxgene_photoreceptor_expr,
cellxgene_hair_cell_expr
FROM {EXPRESSION_TABLE_NAME}
WHERE usher_tissue_enrichment >= ?
ORDER BY usher_tissue_enrichment DESC
""",
params=[min_enrichment]
)
logger.info("expression_query_complete", result_count=len(df))
return df

View File

@@ -0,0 +1,101 @@
"""Data models for tissue expression evidence."""
from pydantic import BaseModel
# HPA normal tissue data download URL (bulk TSV, more efficient than per-gene API)
HPA_NORMAL_TISSUE_URL = (
"https://www.proteinatlas.org/download/normal_tissue.tsv.zip"
)
# GTEx median gene expression bulk data (v8 analysis freeze, hosted under the v10 bucket path)
GTEX_MEDIAN_EXPRESSION_URL = (
"https://storage.googleapis.com/adult-gtex/bulk-gex/v10/median-tpm/"
"GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz"
)
# Table name in DuckDB
EXPRESSION_TABLE_NAME = "tissue_expression"
# Target tissues for Usher/cilia relevance
# Maps our standardized tissue keys to API-specific identifiers
TARGET_TISSUES = {
# Retina-related
"retina": {
"hpa": "retina",
"gtex": "Eye - Retina", # Note: Not available in all GTEx versions
"cellxgene_tissue": ["retina", "eye"],
},
# Inner ear-related (primarily from scRNA-seq, not in HPA/GTEx bulk)
"inner_ear": {
"hpa": None, # Not available in HPA bulk tissue data
"gtex": None, # Not available in GTEx
"cellxgene_tissue": ["inner ear", "cochlea", "vestibular system"],
},
# Cilia-rich tissues
"cerebellum": {
"hpa": "cerebellum",
"gtex": "Brain - Cerebellum",
"cellxgene_tissue": ["cerebellum"],
},
"testis": {
"hpa": "testis",
"gtex": "Testis",
"cellxgene_tissue": ["testis"],
},
"fallopian_tube": {
"hpa": "fallopian tube",
"gtex": "Fallopian Tube", # May not be available in all GTEx versions
"cellxgene_tissue": ["fallopian tube"],
},
}
# Target cell types for scRNA-seq (CellxGene)
TARGET_CELL_TYPES = [
"photoreceptor cell",
"retinal rod cell",
"retinal cone cell",
"hair cell", # Inner ear mechanoreceptor
"cochlear hair cell",
"vestibular hair cell",
]
class ExpressionRecord(BaseModel):
"""Tissue expression evidence for a single gene.
Attributes:
gene_id: Ensembl gene ID (e.g., ENSG00000...)
gene_symbol: HGNC gene symbol
hpa_retina_tpm: HPA retina TPM expression (NULL if not in HPA)
hpa_cerebellum_tpm: HPA cerebellum TPM (proxy for cilia-rich tissue)
hpa_testis_tpm: HPA testis TPM (cilia-rich)
hpa_fallopian_tube_tpm: HPA fallopian tube TPM (ciliated epithelium)
gtex_retina_tpm: GTEx "Eye - Retina" median TPM (NULL if tissue unavailable)
gtex_cerebellum_tpm: GTEx "Brain - Cerebellum" median TPM
gtex_testis_tpm: GTEx "Testis" median TPM
gtex_fallopian_tube_tpm: GTEx "Fallopian Tube" median TPM (often NULL)
cellxgene_photoreceptor_expr: Mean expression in photoreceptor cells (scRNA-seq)
cellxgene_hair_cell_expr: Mean expression in hair cells (scRNA-seq)
tau_specificity: Tau index (0=ubiquitous, 1=tissue-specific) across all tissues
usher_tissue_enrichment: Enrichment in Usher-relevant tissues vs global expression
expression_score_normalized: Composite expression score (0-1 range)
CRITICAL: NULL values represent missing/unavailable data and are preserved as None.
Inner ear data is primarily from CellxGene (not HPA/GTEx bulk).
"""
gene_id: str
gene_symbol: str
hpa_retina_tpm: float | None = None
hpa_cerebellum_tpm: float | None = None
hpa_testis_tpm: float | None = None
hpa_fallopian_tube_tpm: float | None = None
gtex_retina_tpm: float | None = None
gtex_cerebellum_tpm: float | None = None
gtex_testis_tpm: float | None = None
gtex_fallopian_tube_tpm: float | None = None
cellxgene_photoreceptor_expr: float | None = None
cellxgene_hair_cell_expr: float | None = None
tau_specificity: float | None = None
usher_tissue_enrichment: float | None = None
expression_score_normalized: float | None = None

View File

@@ -0,0 +1,273 @@
"""Transform and normalize tissue expression data."""
from pathlib import Path
from typing import Optional
import polars as pl
import structlog
from usher_pipeline.evidence.expression.fetch import (
fetch_hpa_expression,
fetch_gtex_expression,
fetch_cellxgene_expression,
)
logger = structlog.get_logger()
def calculate_tau_specificity(
df: pl.DataFrame,
tissue_columns: list[str],
) -> pl.DataFrame:
"""Calculate Tau tissue specificity index.
Tau measures tissue specificity: 0 = ubiquitous expression, 1 = tissue-specific.
Formula: Tau = sum(1 - xi/xmax) / (n-1)
where xi is expression in tissue i, xmax is max expression across tissues.
If ANY tissue value is NULL, Tau is NULL (insufficient data for reliable specificity).
Args:
df: DataFrame with expression values across tissues
tissue_columns: List of column names containing tissue expression values
Returns:
DataFrame with tau_specificity column added
"""
logger.info("tau_calculation_start", tissue_count=len(tissue_columns))
# Check if any tissue columns are missing
available_cols = [col for col in tissue_columns if col in df.columns]
if len(available_cols) < len(tissue_columns):
missing = set(tissue_columns) - set(available_cols)
logger.warning("tau_missing_columns", missing=list(missing))
if not available_cols:
# No tissue data available - return with NULL Tau
return df.with_columns(pl.lit(None).cast(pl.Float64).alias("tau_specificity"))
# For each gene, check if all tissue values are non-NULL
# If any NULL, Tau is NULL
# Otherwise, compute Tau = sum(1 - xi/xmax) / (n-1)
# Create expression for NULL check
has_all_data = pl.all_horizontal([pl.col(col).is_not_null() for col in available_cols])
# Compute Tau only for genes with complete data
# Step 1: Find max expression across tissues
max_expr = pl.max_horizontal([pl.col(col) for col in available_cols])
# Step 2: Compute sum(1 - xi/xmax) for each gene
# Handle division by zero: if max_expr is 0, Tau is undefined (set to NULL)
tau_sum = sum([
pl.when(max_expr > 0)
.then(1.0 - (pl.col(col) / max_expr))
.otherwise(0.0)
for col in available_cols
])
# Step 3: Divide by (n-1), where n is number of tissues
n_tissues = len(available_cols)
if n_tissues <= 1:
# Cannot compute specificity with only 1 tissue
tau = pl.lit(None).cast(pl.Float64)
else:
tau = tau_sum / (n_tissues - 1)
# Apply Tau only to genes with complete data
df = df.with_columns(
pl.when(has_all_data & (max_expr > 0))
.then(tau)
.otherwise(pl.lit(None))
.alias("tau_specificity")
)
logger.info("tau_calculation_complete")
return df
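# Illustrative check of the Tau formula on toy values: expression
# [10, 0, 0, 0] across four tissues has xmax = 10, giving
# Tau = ((1-1) + (1-0) + (1-0) + (1-0)) / (4-1) = 1.0 (fully tissue-specific),
# while uniform expression [5, 5, 5, 5] gives Tau = 0/3 = 0.0 (ubiquitous).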
def compute_expression_score(df: pl.DataFrame) -> pl.DataFrame:
"""Compute Usher tissue enrichment and normalized expression score.
Computes:
1. usher_tissue_enrichment: Ratio of mean expression in Usher-relevant tissues
(retina, inner ear proxies) to mean expression across all tissues.
Higher ratio = more enriched in target tissues.
2. expression_score_normalized: Weighted composite of:
- 40%: usher_tissue_enrichment (normalized to 0-1)
- 30%: tau_specificity
- 30%: max_target_tissue_rank (percentile rank of max expression in targets)
NULL if all expression data is NULL.
Args:
df: DataFrame with tissue expression columns and tau_specificity
Returns:
DataFrame with usher_tissue_enrichment and expression_score_normalized columns
"""
logger.info("expression_score_start")
# Define Usher-relevant tissue columns
usher_tissue_cols = [
"hpa_retina_tpm",
"hpa_cerebellum_tpm", # Cilia-rich
"gtex_retina_tpm",
"gtex_cerebellum_tpm",
"cellxgene_photoreceptor_expr",
"cellxgene_hair_cell_expr",
]
# All tissue columns for global mean
all_tissue_cols = [
"hpa_retina_tpm",
"hpa_cerebellum_tpm",
"hpa_testis_tpm",
"hpa_fallopian_tube_tpm",
"gtex_retina_tpm",
"gtex_cerebellum_tpm",
"gtex_testis_tpm",
"gtex_fallopian_tube_tpm",
"cellxgene_photoreceptor_expr",
"cellxgene_hair_cell_expr",
]
# Filter to available columns
usher_available = [col for col in usher_tissue_cols if col in df.columns]
all_available = [col for col in all_tissue_cols if col in df.columns]
if not usher_available or not all_available:
# No expression data - return NULL scores
return df.with_columns([
pl.lit(None).cast(pl.Float64).alias("usher_tissue_enrichment"),
pl.lit(None).cast(pl.Float64).alias("expression_score_normalized"),
])
# Compute mean expression in Usher tissues (ignoring NULLs)
usher_mean = pl.mean_horizontal([pl.col(col) for col in usher_available])
# Compute mean expression across all tissues (ignoring NULLs)
global_mean = pl.mean_horizontal([pl.col(col) for col in all_available])
# Enrichment ratio: usher_mean / global_mean
# If global_mean is 0 or NULL, enrichment is NULL
enrichment = pl.when(global_mean > 0).then(usher_mean / global_mean).otherwise(pl.lit(None))
df = df.with_columns(enrichment.alias("usher_tissue_enrichment"))
# Normalize enrichment to 0-1 scale
# Use percentile rank across all genes
enrichment_percentile = pl.col("usher_tissue_enrichment").rank(method="average") / pl.col("usher_tissue_enrichment").count()
# Compute max expression in target tissues
max_target_expr = pl.max_horizontal([pl.col(col) for col in usher_available])
max_target_percentile = max_target_expr.rank(method="average") / max_target_expr.count()
# Composite score (weighted average)
# If tau_specificity is NULL, we can still compute a partial score
# But prefer to have at least enrichment or tau available
composite = pl.when(
pl.col("usher_tissue_enrichment").is_not_null() | pl.col("tau_specificity").is_not_null()
).then(
0.4 * enrichment_percentile.fill_null(0.0) +
0.3 * pl.col("tau_specificity").fill_null(0.0) +
0.3 * max_target_percentile.fill_null(0.0)
).otherwise(pl.lit(None))
df = df.with_columns(composite.alias("expression_score_normalized"))
logger.info("expression_score_complete")
return df
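# Worked example of the composite (hypothetical inputs): enrichment at the
# 90th percentile, tau_specificity = 0.8, and max target-tissue expression at
# the 70th percentile score 0.4*0.9 + 0.3*0.8 + 0.3*0.7 = 0.81.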
def process_expression_evidence(
gene_ids: list[str],
cache_dir: Optional[Path] = None,
force: bool = False,
skip_cellxgene: bool = False,
) -> pl.DataFrame:
"""End-to-end expression evidence processing pipeline.
Composes: fetch HPA -> fetch GTEx -> fetch CellxGene -> merge -> compute Tau -> compute score -> collect
Args:
gene_ids: List of Ensembl gene IDs to process
cache_dir: Directory for caching downloads
force: If True, re-download even if cached
skip_cellxgene: If True, skip CellxGene fetching (optional dependency)
Returns:
Materialized DataFrame with expression evidence ready for DuckDB storage
"""
logger.info("expression_pipeline_start", gene_count=len(gene_ids))
cache_dir = Path(cache_dir) if cache_dir else Path("data/expression")
# Fetch HPA expression (lazy)
logger.info("fetching_hpa")
lf_hpa = fetch_hpa_expression(gene_ids, cache_dir=cache_dir, force=force)
# Fetch GTEx expression (lazy)
logger.info("fetching_gtex")
lf_gtex = fetch_gtex_expression(gene_ids, cache_dir=cache_dir, force=force)
# Create gene universe DataFrame
gene_universe = pl.LazyFrame({"gene_id": gene_ids})
# Merge GTEx with gene universe (left join to preserve all genes)
# GTEx has gene_id, HPA has gene_symbol - need to handle join carefully
lf_merged = gene_universe.join(lf_gtex, on="gene_id", how="left")
    # DEVIATION: HPA is keyed by gene_symbol while this pipeline works with
    # gene_ids, so the HPA frame fetched above is not merged here. It is
    # collected separately and joined in load.py once gene_symbol has been
    # attached from the gene universe.
# Fetch CellxGene if not skipped
if not skip_cellxgene:
logger.info("fetching_cellxgene")
lf_cellxgene = fetch_cellxgene_expression(gene_ids, cache_dir=cache_dir)
lf_merged = lf_merged.join(lf_cellxgene, on="gene_id", how="left")
# Collect at this point to enable horizontal operations
df = lf_merged.collect()
# Calculate Tau specificity
tissue_columns = [
"hpa_retina_tpm",
"hpa_cerebellum_tpm",
"hpa_testis_tpm",
"hpa_fallopian_tube_tpm",
"gtex_retina_tpm",
"gtex_cerebellum_tpm",
"gtex_testis_tpm",
"gtex_fallopian_tube_tpm",
"cellxgene_photoreceptor_expr",
"cellxgene_hair_cell_expr",
]
# Filter to available columns
available_tissue_cols = [col for col in tissue_columns if col in df.columns]
if available_tissue_cols:
df = calculate_tau_specificity(df, available_tissue_cols)
else:
df = df.with_columns(pl.lit(None).cast(pl.Float64).alias("tau_specificity"))
# Compute expression score
df = compute_expression_score(df)
logger.info(
"expression_pipeline_complete",
row_count=len(df),
has_hpa=any("hpa_" in col for col in df.columns),
has_gtex=any("gtex_" in col for col in df.columns),
has_cellxgene=any("cellxgene_" in col for col in df.columns),
)
return df
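# Usage sketch (hypothetical gene ID and cache path; HPA/GTEx bulk downloads
# require network access):
#   df = process_expression_evidence(
#       ["ENSG00000012048"], cache_dir=Path("data/expression"), skip_cellxgene=True
#   )
#   df.select(["gene_id", "tau_specificity", "expression_score_normalized"])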

View File

@@ -0,0 +1,50 @@
"""Literature Evidence Layer (LITE): PubMed-based evidence for cilia/sensory gene involvement.
This module fetches PubMed citations for genes in various contexts (cilia, sensory,
cytoskeleton, cell polarity), classifies evidence quality, and computes quality-weighted
scores that mitigate well-studied gene bias.
Key exports:
- fetch: query_pubmed_gene, fetch_literature_evidence
- transform: classify_evidence_tier, compute_literature_score, process_literature_evidence
- load: load_to_duckdb
- models: LiteratureRecord, SEARCH_CONTEXTS, LITERATURE_TABLE_NAME
"""
from usher_pipeline.evidence.literature.models import (
LiteratureRecord,
LITERATURE_TABLE_NAME,
SEARCH_CONTEXTS,
DIRECT_EVIDENCE_TERMS,
)
from usher_pipeline.evidence.literature.fetch import (
query_pubmed_gene,
fetch_literature_evidence,
)
from usher_pipeline.evidence.literature.transform import (
classify_evidence_tier,
compute_literature_score,
process_literature_evidence,
)
from usher_pipeline.evidence.literature.load import (
load_to_duckdb,
query_literature_supported,
)
__all__ = [
# Models
"LiteratureRecord",
"LITERATURE_TABLE_NAME",
"SEARCH_CONTEXTS",
"DIRECT_EVIDENCE_TERMS",
# Fetch
"query_pubmed_gene",
"fetch_literature_evidence",
# Transform
"classify_evidence_tier",
"compute_literature_score",
"process_literature_evidence",
# Load
"load_to_duckdb",
"query_literature_supported",
]

View File

@@ -0,0 +1,248 @@
"""Fetch literature evidence from PubMed via Biopython Entrez."""
from time import sleep
from typing import Optional
from functools import wraps
import polars as pl
import structlog
from Bio import Entrez
from usher_pipeline.evidence.literature.models import (
SEARCH_CONTEXTS,
DIRECT_EVIDENCE_TERMS,
)
logger = structlog.get_logger()
def ratelimit(calls_per_sec: float = 3.0):
"""Rate limiter decorator for PubMed API calls.
NCBI E-utilities rate limits:
- Without API key: 3 requests/second
- With API key: 10 requests/second
Args:
calls_per_sec: Maximum calls per second (default: 3 for no API key)
"""
min_interval = 1.0 / calls_per_sec
last_called = [0.0]
def decorator(func):
@wraps(func)
def wrapper(*args, **kwargs):
import time
elapsed = time.time() - last_called[0]
if elapsed < min_interval:
sleep(min_interval - elapsed)
result = func(*args, **kwargs)
last_called[0] = time.time()
return result
return wrapper
return decorator
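# Illustration: at the default 3 calls/sec the wrapper enforces a gap of at
# least 1/3 s between consecutive calls, so the 7 esearch queries issued per
# gene spend roughly 2 s in enforced waiting (about 0.7 s at 10 calls/sec).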
_current_rate = 3.0  # Active rate limit; updated in query_pubmed_gene when an API key is supplied

@ratelimit(calls_per_sec=3.0)  # Default rate limit (no API key)
def _esearch_with_ratelimit(gene_symbol: str, query_terms: str, email: str) -> Optional[int]:
"""Execute PubMed esearch with rate limiting.
Args:
gene_symbol: Gene symbol to search
query_terms: Additional query terms (context filters)
email: Email for NCBI (required)
Returns:
        Count of publications matching the query, or None if the query failed
"""
query = f"({gene_symbol}[Gene Name]) AND {query_terms}"
try:
handle = Entrez.esearch(db="pubmed", term=query, retmax=0)
record = Entrez.read(handle)
handle.close()
count = int(record["Count"])
return count
except Exception as e:
logger.warning(
"pubmed_query_failed",
gene_symbol=gene_symbol,
query_terms=query_terms[:50],
error=str(e),
)
# Return None to indicate failed query (not zero publications)
return None
def query_pubmed_gene(
gene_symbol: str,
contexts: dict[str, str],
email: str,
api_key: Optional[str] = None,
) -> dict:
"""Query PubMed for a single gene across multiple contexts.
Performs systematic queries:
1. Total publications for gene (no context filter)
2. Publications in each context (cilia, sensory, etc.)
3. Direct experimental evidence (knockout/mutation terms)
4. High-throughput screen mentions
Args:
gene_symbol: HGNC gene symbol (e.g., "BRCA1")
contexts: Dict mapping context names to PubMed search terms
email: Email address (required by NCBI E-utilities)
api_key: Optional NCBI API key for higher rate limit (10/sec vs 3/sec)
Returns:
Dict with counts for each context, plus direct_experimental and hts counts.
NULL values indicate failed queries (API errors), not zero publications.
"""
# Set Entrez credentials
Entrez.email = email
if api_key:
Entrez.api_key = api_key
    # Update rate limit based on API key. Re-wrap the rate-limited helper only
    # when the rate actually changes: re-wrapping on every call would reset the
    # limiter's last-call timestamp and defeat throttling between genes.
    global _esearch_with_ratelimit, _current_rate
    rate = 10.0 if api_key else 3.0
    if rate != _current_rate:
        _esearch_with_ratelimit = ratelimit(calls_per_sec=rate)(
            _esearch_with_ratelimit.__wrapped__
        )
        _current_rate = rate
logger.debug(
"pubmed_query_gene_start",
gene_symbol=gene_symbol,
context_count=len(contexts),
rate_limit=rate,
)
results = {"gene_symbol": gene_symbol}
# Query 1: Total publications (no context filter)
total_count = _esearch_with_ratelimit(gene_symbol, "", email)
results["total_pubmed_count"] = total_count
# Query 2: Context-specific counts
for context_name, context_terms in contexts.items():
count = _esearch_with_ratelimit(gene_symbol, context_terms, email)
results[f"{context_name}_context_count"] = count
# Query 3: Direct experimental evidence
    cilia_terms = contexts.get("cilia")
    direct_terms = (
        f"{DIRECT_EVIDENCE_TERMS} AND {cilia_terms}" if cilia_terms else DIRECT_EVIDENCE_TERMS
    )
    direct_count = _esearch_with_ratelimit(gene_symbol, direct_terms, email)
results["direct_experimental_count"] = direct_count
# Query 4: High-throughput screen hits
hts_terms = "(screen[Title/Abstract] OR proteomics[Title/Abstract] OR transcriptomics[Title/Abstract])"
hts_count = _esearch_with_ratelimit(gene_symbol, hts_terms, email)
results["hts_screen_count"] = hts_count
logger.debug(
"pubmed_query_gene_complete",
gene_symbol=gene_symbol,
total_count=total_count,
)
return results
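# Illustration of an assembled context query (gene symbol is hypothetical):
#   (MYO7A[Gene Name]) AND (cilia OR cilium OR ciliary OR flagellum OR intraflagellar)
# The total-count query drops the AND clause entirely: MYO7A[Gene Name]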
def fetch_literature_evidence(
gene_symbols: list[str],
email: str,
api_key: Optional[str] = None,
batch_size: int = 500,
checkpoint_df: Optional[pl.DataFrame] = None,
) -> pl.DataFrame:
"""Fetch literature evidence for all genes with progress tracking and checkpointing.
    This is a SLOW operation (~20K genes * 7 queries each = ~140K queries):
    - With API key (10 req/sec): ~3.9 hours
    - Without API key (3 req/sec): ~13 hours
Supports checkpoint-restart: pass partial results to resume from last checkpoint.
Args:
gene_symbols: List of HGNC gene symbols to query
email: Email address (required by NCBI E-utilities)
api_key: Optional NCBI API key for 10 req/sec rate limit
batch_size: Save checkpoint every N genes (default: 500)
checkpoint_df: Optional partial results DataFrame to resume from
Returns:
DataFrame with columns: gene_symbol, total_pubmed_count, cilia_context_count,
sensory_context_count, cytoskeleton_context_count, cell_polarity_context_count,
direct_experimental_count, hts_screen_count.
NULL values indicate failed queries (API errors), not zero publications.
"""
# Estimate time
    queries_per_gene = 7  # 1 total + 4 contexts + 1 direct + 1 hts
total_queries = len(gene_symbols) * queries_per_gene
rate = 10.0 if api_key else 3.0
estimated_seconds = total_queries / rate
estimated_hours = estimated_seconds / 3600
logger.info(
"pubmed_fetch_start",
gene_count=len(gene_symbols),
total_queries=total_queries,
rate_limit_per_sec=rate,
estimated_hours=round(estimated_hours, 2),
has_api_key=api_key is not None,
)
# Resume from checkpoint if provided
if checkpoint_df is not None:
processed_symbols = set(checkpoint_df["gene_symbol"].to_list())
remaining_symbols = [s for s in gene_symbols if s not in processed_symbols]
logger.info(
"pubmed_fetch_resume",
checkpoint_genes=len(processed_symbols),
remaining_genes=len(remaining_symbols),
)
gene_symbols = remaining_symbols
results = checkpoint_df.to_dicts()
else:
results = []
# Process genes with progress logging
for i, gene_symbol in enumerate(gene_symbols, start=1):
# Query PubMed for this gene
gene_result = query_pubmed_gene(
gene_symbol=gene_symbol,
contexts=SEARCH_CONTEXTS,
email=email,
api_key=api_key,
)
results.append(gene_result)
# Log progress every 100 genes
if i % 100 == 0:
pct = (i / len(gene_symbols)) * 100
logger.info(
"pubmed_fetch_progress",
processed=i,
total=len(gene_symbols),
percent=round(pct, 1),
gene_symbol=gene_symbol,
)
        # Checkpoint marker every batch_size genes; the caller is responsible
        # for persisting partial results and passing them back via checkpoint_df
if i % batch_size == 0:
logger.info(
"pubmed_fetch_checkpoint",
processed=i,
total=len(gene_symbols),
batch_size=batch_size,
)
logger.info(
"pubmed_fetch_complete",
total_genes=len(results),
failed_count=sum(1 for r in results if r["total_pubmed_count"] is None),
)
# Convert to DataFrame
df = pl.DataFrame(results)
return df
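# Resume sketch (hypothetical checkpoint path; persisting partial results is
# the caller's responsibility):
#   partial = pl.read_parquet("checkpoints/literature_partial.parquet")
#   df = fetch_literature_evidence(symbols, email=email, checkpoint_df=partial)
# Genes already present in the checkpoint are skipped and carried forward.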

View File

@@ -0,0 +1,137 @@
"""Load literature evidence to DuckDB with provenance tracking."""
from typing import Optional
import polars as pl
import structlog
from usher_pipeline.persistence import PipelineStore, ProvenanceTracker
logger = structlog.get_logger()
def load_to_duckdb(
df: pl.DataFrame,
store: PipelineStore,
provenance: ProvenanceTracker,
description: str = ""
) -> None:
"""Save literature evidence DataFrame to DuckDB with provenance.
Creates or replaces the literature_evidence table (idempotent).
Records provenance step with summary statistics.
Args:
df: Processed literature evidence DataFrame with evidence_tier and literature_score_normalized
store: PipelineStore instance for DuckDB persistence
provenance: ProvenanceTracker instance for metadata recording
description: Optional description for checkpoint metadata
"""
logger.info("literature_load_start", row_count=len(df))
# Calculate summary statistics for provenance
tier_counts = (
df.group_by("evidence_tier")
        .agg(pl.len().alias("count"))
.to_dicts()
)
tier_distribution = {row["evidence_tier"]: row["count"] for row in tier_counts}
genes_with_evidence = df.filter(
pl.col("evidence_tier").is_in(["direct_experimental", "functional_mention", "hts_hit"])
).height
# Calculate mean literature score (excluding NULL)
mean_score_result = df.filter(
pl.col("literature_score_normalized").is_not_null()
).select(pl.col("literature_score_normalized").mean())
mean_score = None
if len(mean_score_result) > 0:
mean_score = mean_score_result.to_dicts()[0]["literature_score_normalized"]
    # Count total PubMed queries made (estimate: 7 queries per gene)
    total_queries = len(df) * 7
# Save to DuckDB with CREATE OR REPLACE (idempotent)
store.save_dataframe(
df=df,
table_name="literature_evidence",
description=description or "PubMed literature evidence with context-specific queries and quality-weighted scoring",
replace=True
)
# Record provenance step with details
provenance.record_step("load_literature_evidence", {
"row_count": len(df),
"genes_with_direct_evidence": tier_distribution.get("direct_experimental", 0),
"genes_with_functional_mention": tier_distribution.get("functional_mention", 0),
"genes_with_hts_hits": tier_distribution.get("hts_hit", 0),
"genes_with_any_evidence": genes_with_evidence,
"tier_distribution": tier_distribution,
"mean_literature_score": round(mean_score, 4) if mean_score is not None else None,
"estimated_pubmed_queries": total_queries,
})
logger.info(
"literature_load_complete",
row_count=len(df),
tier_distribution=tier_distribution,
genes_with_evidence=genes_with_evidence,
mean_score=round(mean_score, 4) if mean_score is not None else None,
)
def query_literature_supported(
store: PipelineStore,
min_tier: str = "functional_mention"
) -> pl.DataFrame:
"""Query genes with literature support at or above specified tier.
Demonstrates DuckDB query capability and filters genes by evidence quality.
Args:
store: PipelineStore instance
min_tier: Minimum evidence tier (default: "functional_mention")
Options: "direct_experimental", "functional_mention", "hts_hit", "incidental"
Returns:
DataFrame with literature-supported genes sorted by literature_score_normalized (desc)
Columns: gene_id, gene_symbol, evidence_tier, literature_score_normalized,
cilia_context_count, sensory_context_count, total_pubmed_count
"""
logger.info("literature_query_supported", min_tier=min_tier)
# Define tier hierarchy
tier_hierarchy = {
"direct_experimental": 0,
"functional_mention": 1,
"hts_hit": 2,
"incidental": 3,
"none": 4,
}
if min_tier not in tier_hierarchy:
raise ValueError(f"Invalid tier: {min_tier}. Must be one of {list(tier_hierarchy.keys())}")
min_tier_rank = tier_hierarchy[min_tier]
# Build tier list for SQL IN clause
valid_tiers = [tier for tier, rank in tier_hierarchy.items() if rank <= min_tier_rank]
tiers_str = ", ".join(f"'{tier}'" for tier in valid_tiers)
# Query DuckDB
df = store.execute_query(
f"""
SELECT gene_id, gene_symbol, evidence_tier, literature_score_normalized,
cilia_context_count, sensory_context_count, total_pubmed_count,
direct_experimental_count, hts_screen_count
FROM literature_evidence
WHERE evidence_tier IN ({tiers_str})
ORDER BY literature_score_normalized DESC NULLS LAST
"""
)
logger.info("literature_query_complete", result_count=len(df))
return df
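# Example: min_tier="hts_hit" expands to the tier set {direct_experimental,
# functional_mention, hts_hit}, i.e. every gene with at least screen-level
# support in a cilia/sensory context.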

View File

@@ -0,0 +1,89 @@
"""Data models for literature evidence layer."""
from typing import Optional
from pydantic import BaseModel, Field
LITERATURE_TABLE_NAME = "literature_evidence"
# Context-specific PubMed search terms
# These are combined with gene symbols to find relevant publications
SEARCH_CONTEXTS = {
"cilia": "(cilia OR cilium OR ciliary OR flagellum OR intraflagellar)",
"sensory": "(retina OR cochlea OR hair cell OR photoreceptor OR vestibular OR hearing OR usher syndrome)",
"cytoskeleton": "(cytoskeleton OR actin OR microtubule OR motor protein)",
"cell_polarity": "(cell polarity OR planar cell polarity OR apicobasal OR tight junction)",
}
# Terms indicating direct experimental evidence
# Publications with these terms carry higher confidence than incidental mentions
DIRECT_EVIDENCE_TERMS = "(knockout OR knockdown OR mutation OR CRISPR OR siRNA OR morpholino OR null allele)"
# Evidence tier classification
# Higher tiers indicate stronger evidence quality
EVIDENCE_TIERS = [
"direct_experimental", # Knockout/mutation + cilia/sensory context
"functional_mention", # Mentioned in cilia/sensory context, not just incidental
"hts_hit", # High-throughput screen hit + cilia/sensory context
"incidental", # Mentioned in literature but no specific cilia/sensory context
"none", # No PubMed publications found
]
class LiteratureRecord(BaseModel):
"""Literature evidence record for a single gene.
Captures PubMed publication counts across different contexts and evidence quality.
NULL values indicate failed queries (API errors), not zero publications.
"""
gene_id: str = Field(description="Ensembl gene ID (e.g., ENSG00000012048)")
gene_symbol: str = Field(description="HGNC gene symbol (e.g., BRCA1)")
# Publication counts by context
total_pubmed_count: Optional[int] = Field(
None,
description="Total PubMed publications mentioning this gene (any context). NULL if query failed.",
)
cilia_context_count: Optional[int] = Field(
None,
description="Publications mentioning gene in cilia-related context",
)
sensory_context_count: Optional[int] = Field(
None,
description="Publications mentioning gene in sensory (retina/cochlea/hearing) context",
)
cytoskeleton_context_count: Optional[int] = Field(
None,
description="Publications mentioning gene in cytoskeleton context",
)
cell_polarity_context_count: Optional[int] = Field(
None,
description="Publications mentioning gene in cell polarity context",
)
# Evidence quality indicators
direct_experimental_count: Optional[int] = Field(
None,
description="Publications with knockout/mutation/knockdown evidence",
)
hts_screen_count: Optional[int] = Field(
None,
description="Publications from high-throughput screens (proteomics/transcriptomics)",
)
# Derived classification
evidence_tier: str = Field(
description="Evidence quality tier: direct_experimental, functional_mention, hts_hit, incidental, none"
)
literature_score_normalized: Optional[float] = Field(
None,
ge=0.0,
le=1.0,
description="Quality-weighted literature score [0-1], normalized to mitigate well-studied gene bias. NULL if total_pubmed_count is NULL.",
)
class Config:
"""Pydantic config."""
frozen = False # Allow mutation for score computation

View File

@@ -0,0 +1,279 @@
"""Transform literature evidence: classify tiers and compute quality-weighted scores."""
from typing import Optional
import polars as pl
import structlog
logger = structlog.get_logger()
# Evidence quality weights for scoring
# Direct experimental evidence is most valuable, incidental mentions least valuable
EVIDENCE_QUALITY_WEIGHTS = {
"direct_experimental": 1.0,
"functional_mention": 0.6,
"hts_hit": 0.3,
"incidental": 0.1,
"none": 0.0,
}
# Context relevance weights
# Cilia and sensory contexts are most relevant, cytoskeleton/polarity supportive
CONTEXT_WEIGHTS = {
"cilia_context_count": 2.0,
"sensory_context_count": 2.0,
"cytoskeleton_context_count": 1.0,
"cell_polarity_context_count": 1.0,
}
def classify_evidence_tier(df: pl.DataFrame) -> pl.DataFrame:
"""Classify literature evidence into quality tiers.
Tiers (highest to lowest quality):
- direct_experimental: Knockout/mutation + cilia/sensory context (highest confidence)
- functional_mention: Mentioned in cilia/sensory context with multiple publications
- hts_hit: High-throughput screen hit + cilia/sensory context
- incidental: Publications exist but no cilia/sensory context
- none: No publications found (or query failed)
Args:
df: DataFrame with columns: gene_symbol, total_pubmed_count, cilia_context_count,
sensory_context_count, direct_experimental_count, hts_screen_count
Returns:
DataFrame with added evidence_tier column
"""
logger.info("literature_classify_start", row_count=len(df))
# Define tier classification logic using polars expressions
# Priority order: direct_experimental > functional_mention > hts_hit > incidental > none
df = df.with_columns([
pl.when(
# Direct experimental: knockout/mutation evidence + cilia/sensory context
(pl.col("direct_experimental_count").is_not_null()) &
(pl.col("direct_experimental_count") >= 1) &
(
(pl.col("cilia_context_count").is_not_null() & (pl.col("cilia_context_count") >= 1)) |
(pl.col("sensory_context_count").is_not_null() & (pl.col("sensory_context_count") >= 1))
)
).then(pl.lit("direct_experimental"))
.when(
# Functional mention: cilia/sensory context + multiple publications
(
(pl.col("cilia_context_count").is_not_null() & (pl.col("cilia_context_count") >= 1)) |
(pl.col("sensory_context_count").is_not_null() & (pl.col("sensory_context_count") >= 1))
) &
(pl.col("total_pubmed_count").is_not_null()) &
(pl.col("total_pubmed_count") >= 3)
).then(pl.lit("functional_mention"))
.when(
# HTS hit: screen evidence + cilia/sensory context
(pl.col("hts_screen_count").is_not_null()) &
(pl.col("hts_screen_count") >= 1) &
(
(pl.col("cilia_context_count").is_not_null() & (pl.col("cilia_context_count") >= 1)) |
(pl.col("sensory_context_count").is_not_null() & (pl.col("sensory_context_count") >= 1))
)
).then(pl.lit("hts_hit"))
.when(
# Incidental: publications exist but no cilia/sensory context
(pl.col("total_pubmed_count").is_not_null()) &
(pl.col("total_pubmed_count") >= 1)
).then(pl.lit("incidental"))
.otherwise(pl.lit("none")) # No publications or query failed
.alias("evidence_tier")
])
# Count tier distribution for logging
tier_counts = (
df.group_by("evidence_tier")
        .agg(pl.len().alias("count"))
.sort("count", descending=True)
)
logger.info(
"literature_classify_complete",
tier_distribution={
row["evidence_tier"]: row["count"]
for row in tier_counts.to_dicts()
},
)
return df
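# Classification examples (hypothetical counts): direct_experimental_count=2
# with cilia_context_count=3 tiers as "direct_experimental"; a gene with
# total_pubmed_count=500 but zero cilia/sensory context counts is merely
# "incidental".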
def compute_literature_score(df: pl.DataFrame) -> pl.DataFrame:
"""Compute quality-weighted literature score with bias mitigation.
Quality-weighted scoring formula:
1. Context score = weighted sum of context counts (cilia * 2.0 + sensory * 2.0 + cytoskeleton * 1.0 + polarity * 1.0)
2. Apply evidence quality weight based on tier
3. CRITICAL: Normalize by log2(total_pubmed_count + 1) to penalize genes where
cilia mentions are tiny fraction of total literature (e.g., TP53: 5 cilia / 100K total)
4. Rank-percentile normalization to [0, 1] scale
This prevents well-studied genes (TP53, BRCA1) from dominating scores over
focused candidates with high-quality but fewer publications.
Args:
df: DataFrame with context counts and evidence_tier column
Returns:
DataFrame with added literature_score_normalized column [0-1]
"""
logger.info("literature_score_start", row_count=len(df))
# Step 1: Compute weighted context score
df = df.with_columns([
(
(pl.col("cilia_context_count").fill_null(0) * CONTEXT_WEIGHTS["cilia_context_count"]) +
(pl.col("sensory_context_count").fill_null(0) * CONTEXT_WEIGHTS["sensory_context_count"]) +
(pl.col("cytoskeleton_context_count").fill_null(0) * CONTEXT_WEIGHTS["cytoskeleton_context_count"]) +
(pl.col("cell_polarity_context_count").fill_null(0) * CONTEXT_WEIGHTS["cell_polarity_context_count"])
).alias("context_score")
])
# Step 2: Apply evidence quality weight
    # Map evidence_tier (str) to its float quality weight; replace_strict
    # handles the dtype change and defaults unmapped tiers to 0.0
    df = df.with_columns([
        pl.col("evidence_tier")
        .replace_strict(EVIDENCE_QUALITY_WEIGHTS, default=0.0, return_dtype=pl.Float64)
        .alias("quality_weight")
])
# Step 3: Bias mitigation via total publication normalization
# Penalize genes where cilia mentions are small fraction of total literature
    df = df.with_columns([
        pl.when(pl.col("total_pubmed_count").is_null())
        .then(pl.lit(None))
        .when(pl.col("total_pubmed_count") == 0)
        .then(pl.lit(0.0))  # No publications: score 0 rather than 0/0 = NaN
        .otherwise(
            (pl.col("context_score") * pl.col("quality_weight")) /
            ((pl.col("total_pubmed_count") + 1).log(base=2))
        )
        .alias("raw_score")
    ])
# Step 4: Rank-percentile normalization to [0, 1]
# Only rank genes with non-null raw_score
total_with_scores = df.filter(pl.col("raw_score").is_not_null()).height
if total_with_scores == 0:
# No valid scores - all NULL
df = df.with_columns([
pl.lit(None, dtype=pl.Float64).alias("literature_score_normalized")
])
else:
# Compute rank percentile
df = df.with_columns([
pl.when(pl.col("raw_score").is_not_null())
.then(
pl.col("raw_score").rank(method="average") / total_with_scores
)
.otherwise(pl.lit(None))
.alias("literature_score_normalized")
])
# Drop intermediate columns
df = df.drop(["context_score", "quality_weight", "raw_score"])
# Log score statistics
score_stats = df.filter(pl.col("literature_score_normalized").is_not_null()).select([
pl.col("literature_score_normalized").min().alias("min"),
pl.col("literature_score_normalized").max().alias("max"),
pl.col("literature_score_normalized").mean().alias("mean"),
pl.col("literature_score_normalized").median().alias("median"),
])
if len(score_stats) > 0:
stats = score_stats.to_dicts()[0]
logger.info(
"literature_score_complete",
min_score=round(stats["min"], 4) if stats["min"] is not None else None,
max_score=round(stats["max"], 4) if stats["max"] is not None else None,
mean_score=round(stats["mean"], 4) if stats["mean"] is not None else None,
median_score=round(stats["median"], 4) if stats["median"] is not None else None,
genes_with_scores=total_with_scores,
)
return df
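# Worked example of the bias mitigation (hypothetical counts; context weight
# 2.0 and the direct_experimental quality weight 1.0 from above): a heavily
# studied gene with 5 cilia papers out of 100,000 total gets a raw score of
# (5 * 2.0 * 1.0) / log2(100001) ~= 10 / 16.6 ~= 0.60, while a focused
# candidate with 5 cilia papers out of 50 total gets 10 / log2(51) ~= 1.76,
# so the focused gene ranks higher before percentile normalization.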
def process_literature_evidence(
gene_ids: list[str],
gene_symbol_map: pl.DataFrame,
email: str,
api_key: Optional[str] = None,
batch_size: int = 500,
checkpoint_df: Optional[pl.DataFrame] = None,
) -> pl.DataFrame:
"""End-to-end literature evidence processing pipeline.
1. Map gene IDs to symbols
2. Fetch PubMed literature evidence
3. Classify evidence tiers
4. Compute quality-weighted scores
5. Join back to gene IDs
Args:
gene_ids: List of Ensembl gene IDs
gene_symbol_map: DataFrame with columns: gene_id, gene_symbol
email: Email for NCBI E-utilities (required)
api_key: Optional NCBI API key for higher rate limit
batch_size: Checkpoint save frequency (default: 500)
checkpoint_df: Optional partial results to resume from
Returns:
DataFrame with columns: gene_id, gene_symbol, total_pubmed_count,
cilia_context_count, sensory_context_count, cytoskeleton_context_count,
cell_polarity_context_count, direct_experimental_count, hts_screen_count,
evidence_tier, literature_score_normalized
"""
from usher_pipeline.evidence.literature.fetch import fetch_literature_evidence
logger.info(
"literature_process_start",
gene_count=len(gene_ids),
has_checkpoint=checkpoint_df is not None,
)
# Step 1: Map gene IDs to symbols
gene_map = gene_symbol_map.filter(pl.col("gene_id").is_in(gene_ids))
gene_symbols = gene_map["gene_symbol"].to_list()
logger.info(
"literature_gene_mapping",
input_ids=len(gene_ids),
mapped_symbols=len(gene_symbols),
)
# Step 2: Fetch literature evidence
lit_df = fetch_literature_evidence(
gene_symbols=gene_symbols,
email=email,
api_key=api_key,
batch_size=batch_size,
checkpoint_df=checkpoint_df,
)
# Step 3: Classify evidence tiers
lit_df = classify_evidence_tier(lit_df)
# Step 4: Compute quality-weighted scores
lit_df = compute_literature_score(lit_df)
# Step 5: Join back to gene IDs
result_df = gene_map.join(
lit_df,
on="gene_symbol",
how="left",
)
logger.info(
"literature_process_complete",
total_genes=len(result_df),
)
return result_df
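# Usage sketch (hypothetical IDs; email is the caller's NCBI contact address):
#   symbol_map = pl.DataFrame(
#       {"gene_id": ["ENSG00000137474"], "gene_symbol": ["MYO7A"]}
#   )
#   df = process_literature_evidence(["ENSG00000137474"], symbol_map, email="you@example.org")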