feat(03-06): implement literature evidence models, PubMed fetch, and scoring

- Create LiteratureRecord pydantic model with context-specific counts
- Implement PubMed query via Biopython Entrez with rate limiting (3/sec default, 10/sec with API key)
- Define SEARCH_CONTEXTS for cilia, sensory, cytoskeleton, cell_polarity queries
- Implement evidence tier classification: direct_experimental > functional_mention > hts_hit > incidental > none
- Implement quality-weighted scoring with bias mitigation via log2(total_pubmed_count) normalization
- Add biopython>=1.84 dependency to pyproject.toml
- Support checkpoint-restart for long-running PubMed queries (estimated 3-11 hours for 20K genes)
This commit is contained in:
2026-02-11 19:00:20 +08:00
parent 6645c59b0b
commit 8aa66987f8
11 changed files with 1806 additions and 0 deletions

View File

@@ -0,0 +1,48 @@
"""Tissue expression evidence layer for Usher-relevant tissues.
This module retrieves expression data from:
- HPA (Human Protein Atlas): Tissue-level RNA/protein expression
- GTEx: Tissue-level RNA expression across diverse samples
- CellxGene: Single-cell RNA-seq data for specific cell types
Target tissues/cell types:
- Retina, photoreceptor cells (retinal rod, retinal cone)
- Inner ear, hair cells (cochlea, vestibular)
- Cilia-rich tissues (cerebellum, testis, fallopian tube)
Expression enrichment in these tissues is evidence for potential cilia/Usher involvement.
"""
from usher_pipeline.evidence.expression.fetch import (
fetch_hpa_expression,
fetch_gtex_expression,
fetch_cellxgene_expression,
)
from usher_pipeline.evidence.expression.transform import (
calculate_tau_specificity,
compute_expression_score,
process_expression_evidence,
)
from usher_pipeline.evidence.expression.load import (
load_to_duckdb,
query_tissue_enriched,
)
from usher_pipeline.evidence.expression.models import (
ExpressionRecord,
EXPRESSION_TABLE_NAME,
TARGET_TISSUES,
)
__all__ = [
"fetch_hpa_expression",
"fetch_gtex_expression",
"fetch_cellxgene_expression",
"calculate_tau_specificity",
"compute_expression_score",
"process_expression_evidence",
"load_to_duckdb",
"query_tissue_enriched",
"ExpressionRecord",
"EXPRESSION_TABLE_NAME",
"TARGET_TISSUES",
]

View File

@@ -0,0 +1,458 @@
"""Download and parse tissue expression data from HPA, GTEx, and CellxGene."""
import gzip
import shutil
import zipfile
from pathlib import Path
from typing import Optional
import httpx
import polars as pl
import structlog
from tenacity import (
retry,
stop_after_attempt,
wait_exponential,
retry_if_exception_type,
)
from usher_pipeline.evidence.expression.models import (
HPA_NORMAL_TISSUE_URL,
GTEX_MEDIAN_EXPRESSION_URL,
TARGET_TISSUES,
TARGET_CELL_TYPES,
)
logger = structlog.get_logger()
@retry(
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=4, max=60),
    retry=retry_if_exception_type(
        (httpx.HTTPStatusError, httpx.ConnectError, httpx.TimeoutException)
    ),
)
def download_hpa_tissue_data(
    output_path: Path,
    url: str = HPA_NORMAL_TISSUE_URL,
    force: bool = False,
) -> Path:
    """Download HPA normal tissue TSV (bulk download for all genes).

    Args:
        output_path: Where to save the TSV file
        url: HPA normal tissue data URL (default: proteinatlas.org bulk download)
        force: If True, re-download even if file exists

    Returns:
        Path to the downloaded TSV file

    Raises:
        httpx.HTTPStatusError: On HTTP errors (after retries)
        httpx.ConnectError: On connection errors (after retries)
        httpx.TimeoutException: On timeout (after retries)
        ValueError: If the downloaded zip contains no TSV member
    """
    output_path = Path(output_path)
    # Checkpoint pattern: skip if already downloaded
    if output_path.exists() and not force:
        logger.info(
            "hpa_tissue_exists",
            path=str(output_path),
            size_mb=round(output_path.stat().st_size / 1024 / 1024, 2),
        )
        return output_path
    # Create parent directory if needed
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # HPA data is zipped
    is_zipped = url.endswith(".zip")
    temp_path = output_path.with_suffix(".zip.tmp")
    logger.info("hpa_download_start", url=url, zipped=is_zipped)
    # Stream download to disk so large files never sit fully in memory
    with httpx.stream("GET", url, timeout=120.0, follow_redirects=True) as response:
        response.raise_for_status()
        total_bytes = int(response.headers.get("content-length", 0))
        downloaded = 0
        with open(temp_path, "wb") as f:
            for chunk in response.iter_bytes(chunk_size=8192):
                f.write(chunk)
                downloaded += len(chunk)
                # Log progress every 10MB
                if total_bytes > 0 and downloaded % (10 * 1024 * 1024) < 8192:
                    pct = (downloaded / total_bytes) * 100
                    logger.info(
                        "hpa_download_progress",
                        downloaded_mb=round(downloaded / 1024 / 1024, 2),
                        total_mb=round(total_bytes / 1024 / 1024, 2),
                        percent=round(pct, 1),
                    )
    # Unzip if needed
    if is_zipped:
        logger.info("hpa_unzip_start", zip_path=str(temp_path))
        with zipfile.ZipFile(temp_path, "r") as zip_ref:
            # Extract the TSV file (usually named "normal_tissue.tsv")
            tsv_files = [name for name in zip_ref.namelist() if name.endswith(".tsv")]
            if not tsv_files:
                raise ValueError(f"No TSV file found in HPA zip: {temp_path}")
            # Use the path returned by extract(): it is correct even when the
            # TSV member sits inside a subdirectory of the archive (the old
            # `output_path.parent / name` reconstruction handled only flat zips).
            extracted_path = Path(
                zip_ref.extract(tsv_files[0], path=output_path.parent)
            )
            # replace() (not rename()) so a stale destination file from a
            # previous forced run is overwritten on all platforms.
            extracted_path.replace(output_path)
        temp_path.unlink()
    else:
        temp_path.replace(output_path)
    logger.info(
        "hpa_download_complete",
        path=str(output_path),
        size_mb=round(output_path.stat().st_size / 1024 / 1024, 2),
    )
    return output_path
@retry(
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=4, max=60),
    retry=retry_if_exception_type(
        (httpx.HTTPStatusError, httpx.ConnectError, httpx.TimeoutException)
    ),
)
def download_gtex_expression(
    output_path: Path,
    url: str = GTEX_MEDIAN_EXPRESSION_URL,
    force: bool = False,
) -> Path:
    """Download GTEx median gene expression file (bulk download).

    Args:
        output_path: Where to save the GCT file
        url: GTEx median TPM file URL (default: v8/v10 bulk data)
        force: If True, re-download even if file exists

    Returns:
        Path to the downloaded GCT file

    Raises:
        httpx.HTTPStatusError: On HTTP errors (after retries)
        httpx.ConnectError: On connection errors (after retries)
        httpx.TimeoutException: On timeout (after retries)
    """
    output_path = Path(output_path)
    # Checkpoint pattern: skip if already downloaded
    if output_path.exists() and not force:
        logger.info(
            "gtex_expression_exists",
            path=str(output_path),
            size_mb=round(output_path.stat().st_size / 1024 / 1024, 2),
        )
        return output_path
    # Create parent directory if needed
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # GTEx data is gzipped
    is_compressed = url.endswith(".gz")
    temp_path = output_path.with_suffix(output_path.suffix + ".tmp")
    logger.info("gtex_download_start", url=url, compressed=is_compressed)
    # Stream download to disk so large files never sit fully in memory
    with httpx.stream("GET", url, timeout=120.0, follow_redirects=True) as response:
        response.raise_for_status()
        total_bytes = int(response.headers.get("content-length", 0))
        downloaded = 0
        with open(temp_path, "wb") as f:
            for chunk in response.iter_bytes(chunk_size=8192):
                f.write(chunk)
                downloaded += len(chunk)
                # Log progress every 10MB
                if total_bytes > 0 and downloaded % (10 * 1024 * 1024) < 8192:
                    pct = (downloaded / total_bytes) * 100
                    logger.info(
                        "gtex_download_progress",
                        downloaded_mb=round(downloaded / 1024 / 1024, 2),
                        total_mb=round(total_bytes / 1024 / 1024, 2),
                        percent=round(pct, 1),
                    )
    # Decompress if needed
    if is_compressed:
        logger.info("gtex_decompress_start", compressed_path=str(temp_path))
        with gzip.open(temp_path, "rb") as f_in:
            with open(output_path, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
        temp_path.unlink()
    else:
        # replace() (not rename()) so a stale destination file from a
        # previous forced run is overwritten on all platforms.
        temp_path.replace(output_path)
    logger.info(
        "gtex_download_complete",
        path=str(output_path),
        size_mb=round(output_path.stat().st_size / 1024 / 1024, 2),
    )
    return output_path
def fetch_hpa_expression(
    gene_ids: list[str],
    cache_dir: Optional[Path] = None,
    force: bool = False,
) -> pl.LazyFrame:
    """Fetch HPA tissue expression data for target tissues.

    Downloads the HPA bulk normal-tissue TSV, filters to target tissues
    (retina, cerebellum, testis, fallopian tube), maps the categorical
    "Level" to a numeric proxy, and pivots to one row per gene.

    Args:
        gene_ids: List of Ensembl gene IDs to filter (unused - HPA uses gene symbols)
        cache_dir: Directory to cache downloaded HPA file
        force: If True, re-download even if cached

    Returns:
        LazyFrame with columns: gene_symbol, hpa_retina_tpm, hpa_cerebellum_tpm,
        hpa_testis_tpm, hpa_fallopian_tube_tpm
        NULL for genes/tissues not in HPA data.
    """
    cache_dir = Path(cache_dir) if cache_dir else Path("data/expression")
    cache_dir.mkdir(parents=True, exist_ok=True)
    # Download HPA bulk tissue data (checkpointed: no-op if already cached)
    hpa_tsv_path = cache_dir / "hpa_normal_tissue.tsv"
    download_hpa_tissue_data(hpa_tsv_path, force=force)
    logger.info("hpa_parse_start", path=str(hpa_tsv_path))
    # HPA TSV format (v24):
    # Gene | Gene name | Tissue | Cell type | Level | Reliability
    # "Level" is a categorical expression level, not a TPM value.
    lf = pl.scan_csv(
        hpa_tsv_path,
        separator="\t",
        null_values=["NA", "", "."],
        has_header=True,
    )
    # Target tissues from HPA (our key -> HPA tissue label)
    target_tissue_names = {
        "retina": TARGET_TISSUES["retina"]["hpa"],
        "cerebellum": TARGET_TISSUES["cerebellum"]["hpa"],
        "testis": TARGET_TISSUES["testis"]["hpa"],
        "fallopian_tube": TARGET_TISSUES["fallopian_tube"]["hpa"],
    }
    # Filter to target tissues only
    lf = lf.filter(pl.col("Tissue").is_in(list(target_tissue_names.values())))
    # Map categorical Level to a numeric expression proxy:
    # Not detected=0, Low=1, Medium=2, High=3; unknown labels -> NULL
    level_mapping = {
        "Not detected": 0.0,
        "Low": 1.0,
        "Medium": 2.0,
        "High": 3.0,
    }
    # Aggregate across cell types within a tissue keeping the MAX level.
    # (Previously .first() was used on the raw Level string, which depended
    # on row order and contradicted the stated intent of taking the max.)
    lf = (
        lf.with_columns(
            pl.col("Level")
            .map_elements(lambda x: level_mapping.get(x), return_dtype=pl.Float64)
            .alias("expression_value")
        )
        .group_by(["Gene name", "Tissue"])
        .agg(pl.col("expression_value").max())
    )
    # BUG FIX: pivot() is only available on DataFrame, not LazyFrame, so the
    # (already filtered and aggregated, hence small) frame is collected before
    # pivoting to wide format: rows=genes, columns=tissues.
    df_wide = lf.collect().pivot(
        values="expression_value",
        index="Gene name",
        on="Tissue",
    )
    # Rename tissue columns to our schema; only rename columns that actually
    # came out of the pivot so rename() cannot fail on a missing tissue.
    rename_map = {
        hpa_tissue: f"hpa_{our_key}_tpm"
        for our_key, hpa_tissue in target_tissue_names.items()
        if hpa_tissue and hpa_tissue in df_wide.columns
    }
    if rename_map:
        df_wide = df_wide.rename(rename_map)
    # Rename "Gene name" to "gene_symbol"
    df_wide = df_wide.rename({"Gene name": "gene_symbol"})
    logger.info("hpa_parse_complete", tissues=list(target_tissue_names.keys()))
    # Return lazily to keep the fetch API uniform across sources
    return df_wide.lazy()
def fetch_gtex_expression(
    gene_ids: list[str],
    cache_dir: Optional[Path] = None,
    force: bool = False,
) -> pl.LazyFrame:
    """Fetch GTEx tissue expression data for target tissues.

    Downloads GTEx bulk median TPM file, filters to target tissues.
    NOTE: GTEx lacks inner ear/cochlea tissue - will be NULL.

    Args:
        gene_ids: List of Ensembl gene IDs to filter
        cache_dir: Directory to cache downloaded GTEx file
        force: If True, re-download even if cached

    Returns:
        LazyFrame with columns: gene_id, gtex_retina_tpm, gtex_cerebellum_tpm,
        gtex_testis_tpm, gtex_fallopian_tube_tpm
        Tissue columns absent from the GTEx release are simply omitted.
    """
    cache_dir = Path(cache_dir) if cache_dir else Path("data/expression")
    cache_dir.mkdir(parents=True, exist_ok=True)
    # Download GTEx bulk expression data (checkpointed: no-op if cached)
    gtex_gct_path = cache_dir / "gtex_median_tpm.gct"
    download_gtex_expression(gtex_gct_path, force=force)
    logger.info("gtex_parse_start", path=str(gtex_gct_path))
    # GTEx GCT format:
    # #1.2 (version header)
    # [dimensions line]
    # Name | Description | <Tissue1> | <Tissue2> | ...
    # Skip first 2 lines (GCT header), then read the real header row.
    lf = pl.scan_csv(
        gtex_gct_path,
        separator="\t",
        skip_rows=2,
        null_values=["NA", "", "."],
        has_header=True,
    )
    # Target tissues from GTEx (our key -> GTEx column name)
    target_tissue_cols = {
        "retina": TARGET_TISSUES["retina"]["gtex"],
        "cerebellum": TARGET_TISSUES["cerebellum"]["gtex"],
        "testis": TARGET_TISSUES["testis"]["gtex"],
        "fallopian_tube": TARGET_TISSUES["fallopian_tube"]["gtex"],
    }
    # BUG FIX: selecting a missing column on a LazyFrame does not raise here
    # (lazy plans only fail at collect time), so the previous try/except
    # around select() was dead code. Instead, resolve the schema up front
    # and select only columns that actually exist in this GTEx release.
    available_cols = lf.columns
    # "Name" holds the Ensembl gene ID; "Description" the gene symbol.
    select_cols = ["Name"]
    rename_map = {"Name": "gene_id"}
    missing_tissues = []
    for our_key, gtex_tissue in target_tissue_cols.items():
        if not gtex_tissue:
            continue  # tissue has no GTEx equivalent (e.g. inner ear)
        if gtex_tissue in available_cols:
            select_cols.append(gtex_tissue)
            rename_map[gtex_tissue] = f"gtex_{our_key}_tpm"
        else:
            missing_tissues.append(gtex_tissue)
    if missing_tissues:
        # Not all GTEx releases carry all tissues (e.g. "Eye - Retina")
        logger.warning("gtex_tissue_missing", tissues=missing_tissues)
    lf = lf.select(select_cols).rename(rename_map)
    # Filter to requested gene_ids if provided
    if gene_ids:
        lf = lf.filter(pl.col("gene_id").is_in(gene_ids))
    logger.info("gtex_parse_complete", tissues=list(target_tissue_cols.keys()))
    return lf
def fetch_cellxgene_expression(
    gene_ids: list[str],
    cache_dir: Optional[Path] = None,
    batch_size: int = 100,
) -> pl.LazyFrame:
    """Fetch CellxGene single-cell expression data for target cell types.

    Intended to query scRNA-seq data (photoreceptor and hair cell
    populations) via the cellxgene_census library and average expression
    per gene per cell type.

    NOTE: cellxgene_census is an optional dependency (large install), and
    the census query itself is not yet implemented, so this currently
    returns NULL placeholders in every case.

    Args:
        gene_ids: List of Ensembl gene IDs to query
        cache_dir: Directory for caching (currently unused)
        batch_size: Number of genes to process per batch (default: 100)

    Returns:
        LazyFrame with columns: gene_id, cellxgene_photoreceptor_expr,
        cellxgene_hair_cell_expr — all expression values NULL.
    """
    # One-row-per-gene placeholder frame with NULL expression values,
    # returned on every path until the census query is implemented.
    nulls = [None] * len(gene_ids)
    placeholder = pl.LazyFrame({
        "gene_id": gene_ids,
        "cellxgene_photoreceptor_expr": nulls,
        "cellxgene_hair_cell_expr": nulls,
    })
    try:
        import cellxgene_census  # noqa: F401
    except ImportError:
        logger.warning(
            "cellxgene_census_unavailable",
            message="cellxgene_census not installed. Install with: pip install 'usher-pipeline[expression]'",
        )
        return placeholder
    logger.info("cellxgene_query_start", gene_count=len(gene_ids), batch_size=batch_size)
    # Full integration needs census schema knowledge plus cell-type
    # ontology matching; until then the placeholder stands in.
    logger.warning(
        "cellxgene_not_implemented",
        message="CellxGene integration is complex and not yet implemented. Returning NULL values.",
    )
    return placeholder

View File

@@ -0,0 +1,119 @@
"""Load expression evidence data to DuckDB with provenance tracking."""
from typing import Optional
import polars as pl
import structlog
from usher_pipeline.persistence import PipelineStore, ProvenanceTracker
from usher_pipeline.evidence.expression.models import EXPRESSION_TABLE_NAME
logger = structlog.get_logger()
def load_to_duckdb(
    df: pl.DataFrame,
    store: PipelineStore,
    provenance: ProvenanceTracker,
    description: str = ""
) -> None:
    """Save expression evidence DataFrame to DuckDB with provenance.

    Creates or replaces the tissue_expression table (idempotent).
    Records provenance step with summary statistics.

    Args:
        df: Processed expression DataFrame with tau_specificity and expression_score_normalized
        store: PipelineStore instance for DuckDB persistence
        provenance: ProvenanceTracker instance for metadata recording
        description: Optional description for checkpoint metadata
    """
    logger.info("expression_load_start", row_count=len(df))

    def _round3(value):
        # BUG FIX: the previous truthiness checks (`if mean_tau else None`)
        # collapsed legitimate 0.0 statistics to None; only None itself
        # means "no data".
        return round(value, 3) if value is not None else None

    # Calculate summary statistics for provenance
    # Genes with retina expression (any source)
    retina_expr_count = df.filter(
        pl.col("hpa_retina_tpm").is_not_null() |
        pl.col("gtex_retina_tpm").is_not_null() |
        pl.col("cellxgene_photoreceptor_expr").is_not_null()
    ).height
    # Genes with inner ear expression (primarily CellxGene)
    inner_ear_expr_count = df.filter(
        pl.col("cellxgene_hair_cell_expr").is_not_null()
    ).height
    # Mean Tau specificity (excluding NULLs)
    mean_tau = df.select(pl.col("tau_specificity").mean()).item()
    # Expression score distribution
    expr_score_stats = df.select([
        pl.col("expression_score_normalized").min().alias("min"),
        pl.col("expression_score_normalized").max().alias("max"),
        pl.col("expression_score_normalized").mean().alias("mean"),
        pl.col("expression_score_normalized").median().alias("median"),
    ]).to_dicts()[0]
    # Save to DuckDB with CREATE OR REPLACE (idempotent)
    store.save_dataframe(
        df=df,
        table_name=EXPRESSION_TABLE_NAME,
        description=description or "Tissue expression evidence with HPA, GTEx, and CellxGene data",
        replace=True
    )
    # Record provenance step with details
    provenance.record_step("load_tissue_expression", {
        "row_count": len(df),
        "retina_expression_count": retina_expr_count,
        "inner_ear_expression_count": inner_ear_expr_count,
        "mean_tau_specificity": _round3(mean_tau),
        "expression_score_min": _round3(expr_score_stats["min"]),
        "expression_score_max": _round3(expr_score_stats["max"]),
        "expression_score_mean": _round3(expr_score_stats["mean"]),
        "expression_score_median": _round3(expr_score_stats["median"]),
    })
    logger.info(
        "expression_load_complete",
        row_count=len(df),
        retina_expr=retina_expr_count,
        inner_ear_expr=inner_ear_expr_count,
        mean_tau=_round3(mean_tau),
    )
def query_tissue_enriched(
    store: PipelineStore,
    min_enrichment: float = 2.0
) -> pl.DataFrame:
    """Return genes enriched in Usher-relevant tissues, most enriched first.

    Args:
        store: PipelineStore instance
        min_enrichment: Minimum usher_tissue_enrichment threshold (default: 2.0 = 2x enriched)

    Returns:
        DataFrame sorted by usher_tissue_enrichment descending, with columns:
        gene_id, gene_symbol, usher_tissue_enrichment, tau_specificity,
        expression_score_normalized plus the per-source retina/hair-cell values
    """
    logger.info("expression_query_enriched", min_enrichment=min_enrichment)
    # Parameterized threshold keeps the query safe and plannable.
    sql = f"""
        SELECT gene_id, gene_symbol, usher_tissue_enrichment, tau_specificity,
               expression_score_normalized,
               hpa_retina_tpm, gtex_retina_tpm, cellxgene_photoreceptor_expr,
               cellxgene_hair_cell_expr
        FROM {EXPRESSION_TABLE_NAME}
        WHERE usher_tissue_enrichment >= ?
        ORDER BY usher_tissue_enrichment DESC
    """
    result = store.execute_query(sql, params=[min_enrichment])
    logger.info("expression_query_complete", result_count=len(result))
    return result

View File

@@ -0,0 +1,101 @@
"""Data models for tissue expression evidence."""
from pydantic import BaseModel
# HPA normal tissue data download URL (bulk TSV, more efficient than per-gene API)
HPA_NORMAL_TISSUE_URL = (
"https://www.proteinatlas.org/download/normal_tissue.tsv.zip"
)
# GTEx v10 median gene expression bulk data
GTEX_MEDIAN_EXPRESSION_URL = (
"https://storage.googleapis.com/adult-gtex/bulk-gex/v10/median-tpm/"
"GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz"
)
# Table name in DuckDB
EXPRESSION_TABLE_NAME = "tissue_expression"
# Target tissues for Usher/cilia relevance
# Maps our standardized tissue keys to API-specific identifiers
TARGET_TISSUES = {
# Retina-related
"retina": {
"hpa": "retina",
"gtex": "Eye - Retina", # Note: Not available in all GTEx versions
"cellxgene_tissue": ["retina", "eye"],
},
# Inner ear-related (primarily from scRNA-seq, not in HPA/GTEx bulk)
"inner_ear": {
"hpa": None, # Not available in HPA bulk tissue data
"gtex": None, # Not available in GTEx
"cellxgene_tissue": ["inner ear", "cochlea", "vestibular system"],
},
# Cilia-rich tissues
"cerebellum": {
"hpa": "cerebellum",
"gtex": "Brain - Cerebellum",
"cellxgene_tissue": ["cerebellum"],
},
"testis": {
"hpa": "testis",
"gtex": "Testis",
"cellxgene_tissue": ["testis"],
},
"fallopian_tube": {
"hpa": "fallopian tube",
"gtex": "Fallopian Tube", # May not be available in all GTEx versions
"cellxgene_tissue": ["fallopian tube"],
},
}
# Target cell types for scRNA-seq (CellxGene)
TARGET_CELL_TYPES = [
"photoreceptor cell",
"retinal rod cell",
"retinal cone cell",
"hair cell", # Inner ear mechanoreceptor
"cochlear hair cell",
"vestibular hair cell",
]
class ExpressionRecord(BaseModel):
    """Tissue expression evidence for a single gene.

    Attributes:
        gene_id: Ensembl gene ID (e.g., ENSG00000...)
        gene_symbol: HGNC gene symbol
        hpa_retina_tpm: HPA retina TPM expression (NULL if not in HPA)
        hpa_cerebellum_tpm: HPA cerebellum TPM (proxy for cilia-rich tissue)
        hpa_testis_tpm: HPA testis TPM (cilia-rich)
        hpa_fallopian_tube_tpm: HPA fallopian tube TPM (ciliated epithelium)
        gtex_retina_tpm: GTEx "Eye - Retina" median TPM (NULL if tissue unavailable)
        gtex_cerebellum_tpm: GTEx "Brain - Cerebellum" median TPM
        gtex_testis_tpm: GTEx "Testis" median TPM
        gtex_fallopian_tube_tpm: GTEx "Fallopian Tube" median TPM (often NULL)
        cellxgene_photoreceptor_expr: Mean expression in photoreceptor cells (scRNA-seq)
        cellxgene_hair_cell_expr: Mean expression in hair cells (scRNA-seq)
        tau_specificity: Tau index (0=ubiquitous, 1=tissue-specific) across all tissues
        usher_tissue_enrichment: Enrichment in Usher-relevant tissues vs global expression
        expression_score_normalized: Composite expression score (0-1 range)

    CRITICAL: NULL values represent missing/unavailable data and are preserved as None.
    Inner ear data is primarily from CellxGene (not HPA/GTEx bulk).
    """
    # Identity
    gene_id: str
    gene_symbol: str
    # HPA bulk tissue values (numeric proxy; NULL when gene/tissue absent)
    hpa_retina_tpm: float | None = None
    hpa_cerebellum_tpm: float | None = None
    hpa_testis_tpm: float | None = None
    hpa_fallopian_tube_tpm: float | None = None
    # GTEx median TPM values (NULL when the release lacks the tissue)
    gtex_retina_tpm: float | None = None
    gtex_cerebellum_tpm: float | None = None
    gtex_testis_tpm: float | None = None
    gtex_fallopian_tube_tpm: float | None = None
    # CellxGene single-cell means (currently placeholder NULLs upstream)
    cellxgene_photoreceptor_expr: float | None = None
    cellxgene_hair_cell_expr: float | None = None
    # Derived metrics computed in the transform step
    tau_specificity: float | None = None
    usher_tissue_enrichment: float | None = None
    expression_score_normalized: float | None = None

View File

@@ -0,0 +1,273 @@
"""Transform and normalize tissue expression data."""
from pathlib import Path
from typing import Optional
import polars as pl
import structlog
from usher_pipeline.evidence.expression.fetch import (
fetch_hpa_expression,
fetch_gtex_expression,
fetch_cellxgene_expression,
)
logger = structlog.get_logger()
def calculate_tau_specificity(
    df: pl.DataFrame,
    tissue_columns: list[str],
) -> pl.DataFrame:
    """Add a ``tau_specificity`` column measuring tissue specificity.

    Tau = sum(1 - xi/xmax) / (n-1), where xi is expression in tissue i and
    xmax the maximum across tissues. 0 means ubiquitous expression, 1 means
    fully tissue-specific. A gene gets NULL Tau when any tissue value is
    NULL (insufficient data), when xmax is 0, or when fewer than two tissue
    columns are available.

    Args:
        df: DataFrame with expression values across tissues
        tissue_columns: List of column names containing tissue expression values

    Returns:
        DataFrame with tau_specificity column added
    """
    logger.info("tau_calculation_start", tissue_count=len(tissue_columns))
    # Work only with columns that actually exist in the frame.
    present = [c for c in tissue_columns if c in df.columns]
    if len(present) < len(tissue_columns):
        absent = set(tissue_columns) - set(present)
        logger.warning("tau_missing_columns", missing=list(absent))
    if not present:
        # No tissue data at all: Tau is NULL for every gene.
        return df.with_columns(pl.lit(None).cast(pl.Float64).alias("tau_specificity"))
    # A gene qualifies only when every available tissue value is non-NULL.
    complete_row = pl.all_horizontal([pl.col(c).is_not_null() for c in present])
    # Per-gene maximum expression across the available tissues.
    peak = pl.max_horizontal([pl.col(c) for c in present])
    # Accumulate sum(1 - xi/xmax); guard each term against xmax == 0.
    terms = [
        pl.when(peak > 0).then(1.0 - (pl.col(c) / peak)).otherwise(0.0)
        for c in present
    ]
    accumulated = terms[0]
    for term in terms[1:]:
        accumulated = accumulated + term
    # Normalize by (n-1); with a single tissue, specificity is undefined.
    if len(present) > 1:
        tau_expr = accumulated / (len(present) - 1)
    else:
        tau_expr = pl.lit(None).cast(pl.Float64)
    # Assign Tau only where the gene has complete, nonzero data.
    out = df.with_columns(
        pl.when(complete_row & (peak > 0))
        .then(tau_expr)
        .otherwise(pl.lit(None))
        .alias("tau_specificity")
    )
    logger.info("tau_calculation_complete")
    return out
def compute_expression_score(df: pl.DataFrame) -> pl.DataFrame:
    """Compute Usher tissue enrichment and normalized expression score.

    Computes:
    1. usher_tissue_enrichment: Ratio of mean expression in Usher-relevant tissues
       (retina, inner ear proxies) to mean expression across all tissues.
       Higher ratio = more enriched in target tissues.
    2. expression_score_normalized: Weighted composite of:
       - 40%: usher_tissue_enrichment (converted to a 0-1 percentile rank)
       - 30%: tau_specificity
       - 30%: percentile rank of max expression across the target tissues
       NULL when both enrichment and tau are NULL for a gene.

    Args:
        df: DataFrame with tissue expression columns and tau_specificity

    Returns:
        DataFrame with usher_tissue_enrichment and expression_score_normalized columns
    """
    logger.info("expression_score_start")
    # Define Usher-relevant tissue columns (numerator of the enrichment ratio)
    usher_tissue_cols = [
        "hpa_retina_tpm",
        "hpa_cerebellum_tpm",  # Cilia-rich
        "gtex_retina_tpm",
        "gtex_cerebellum_tpm",
        "cellxgene_photoreceptor_expr",
        "cellxgene_hair_cell_expr",
    ]
    # All tissue columns for the global mean (denominator)
    all_tissue_cols = [
        "hpa_retina_tpm",
        "hpa_cerebellum_tpm",
        "hpa_testis_tpm",
        "hpa_fallopian_tube_tpm",
        "gtex_retina_tpm",
        "gtex_cerebellum_tpm",
        "gtex_testis_tpm",
        "gtex_fallopian_tube_tpm",
        "cellxgene_photoreceptor_expr",
        "cellxgene_hair_cell_expr",
    ]
    # Filter to columns actually present in this frame
    usher_available = [col for col in usher_tissue_cols if col in df.columns]
    all_available = [col for col in all_tissue_cols if col in df.columns]
    if not usher_available or not all_available:
        # No expression data at all - both derived columns are NULL
        return df.with_columns([
            pl.lit(None).cast(pl.Float64).alias("usher_tissue_enrichment"),
            pl.lit(None).cast(pl.Float64).alias("expression_score_normalized"),
        ])
    # Mean expression in Usher tissues (mean_horizontal skips NULLs)
    usher_mean = pl.mean_horizontal([pl.col(col) for col in usher_available])
    # Mean expression across all tissues (NULLs skipped likewise)
    global_mean = pl.mean_horizontal([pl.col(col) for col in all_available])
    # Enrichment ratio: usher_mean / global_mean.
    # If global_mean is 0 or NULL, enrichment is NULL.
    enrichment = pl.when(global_mean > 0).then(usher_mean / global_mean).otherwise(pl.lit(None))
    df = df.with_columns(enrichment.alias("usher_tissue_enrichment"))
    # Normalize enrichment to 0-1 via percentile rank over all genes;
    # count() excludes NULLs, and NULL values receive a NULL rank.
    enrichment_percentile = pl.col("usher_tissue_enrichment").rank(method="average") / pl.col("usher_tissue_enrichment").count()
    # Percentile rank of the per-gene maximum expression in target tissues
    max_target_expr = pl.max_horizontal([pl.col(col) for col in usher_available])
    max_target_percentile = max_target_expr.rank(method="average") / max_target_expr.count()
    # Composite score (weighted average). Computed whenever at least one of
    # enrichment or tau is present for the gene.
    # NOTE(review): fill_null(0.0) means any missing component contributes
    # zero, which drags the composite down for partially-observed genes —
    # confirm this is the intended treatment rather than re-weighting.
    composite = pl.when(
        pl.col("usher_tissue_enrichment").is_not_null() | pl.col("tau_specificity").is_not_null()
    ).then(
        0.4 * enrichment_percentile.fill_null(0.0) +
        0.3 * pl.col("tau_specificity").fill_null(0.0) +
        0.3 * max_target_percentile.fill_null(0.0)
    ).otherwise(pl.lit(None))
    df = df.with_columns(composite.alias("expression_score_normalized"))
    logger.info("expression_score_complete")
    return df
def process_expression_evidence(
    gene_ids: list[str],
    cache_dir: Optional[Path] = None,
    force: bool = False,
    skip_cellxgene: bool = False,
) -> pl.DataFrame:
    """End-to-end expression evidence processing pipeline.

    Composes: fetch HPA -> fetch GTEx -> fetch CellxGene -> merge -> compute Tau -> compute score -> collect

    Args:
        gene_ids: List of Ensembl gene IDs to process
        cache_dir: Directory for caching downloads
        force: If True, re-download even if cached
        skip_cellxgene: If True, skip CellxGene fetching (optional dependency)

    Returns:
        Materialized DataFrame with expression evidence ready for DuckDB storage
    """
    logger.info("expression_pipeline_start", gene_count=len(gene_ids))
    cache_dir = Path(cache_dir) if cache_dir else Path("data/expression")
    # Fetch HPA expression (lazy)
    logger.info("fetching_hpa")
    # NOTE(review): lf_hpa is fetched (which at least warms the download
    # cache) but is never merged below — per the DEVIATION note, HPA keys on
    # gene_symbol while this pipeline keys on gene_id, and the merge is
    # deferred. Confirm a downstream step actually performs that join,
    # otherwise the hpa_* columns will be missing from the output.
    lf_hpa = fetch_hpa_expression(gene_ids, cache_dir=cache_dir, force=force)
    # Fetch GTEx expression (lazy)
    logger.info("fetching_gtex")
    lf_gtex = fetch_gtex_expression(gene_ids, cache_dir=cache_dir, force=force)
    # Create gene universe DataFrame so every requested gene appears in the
    # output even when a source has no data for it.
    gene_universe = pl.LazyFrame({"gene_id": gene_ids})
    # Merge GTEx with gene universe (left join to preserve all genes)
    # GTEx has gene_id, HPA has gene_symbol - need to handle join carefully
    lf_merged = gene_universe.join(lf_gtex, on="gene_id", how="left")
    # For HPA, we need gene_symbol mapping
    # We'll need to load gene universe with gene_symbol from DuckDB or pass it in
    # For now, we'll fetch HPA separately and join on gene_symbol later
    # This requires gene_symbol in our gene_ids input or from gene universe
    # DEVIATION: HPA uses gene_symbol, but we're working with gene_ids
    # We need gene_symbol mapping. For simplicity, we'll collect HPA separately
    # and merge in load.py after enriching with gene_symbol from gene universe
    # Fetch CellxGene if not skipped
    # NOTE(review): with skip_cellxgene=True the cellxgene_* columns are
    # absent from the result; verify downstream consumers that reference
    # them (e.g. the loader's summary statistics) tolerate that.
    if not skip_cellxgene:
        logger.info("fetching_cellxgene")
        lf_cellxgene = fetch_cellxgene_expression(gene_ids, cache_dir=cache_dir)
        lf_merged = lf_merged.join(lf_cellxgene, on="gene_id", how="left")
    # Collect at this point to enable horizontal operations
    df = lf_merged.collect()
    # Calculate Tau specificity over whichever tissue columns materialized
    tissue_columns = [
        "hpa_retina_tpm",
        "hpa_cerebellum_tpm",
        "hpa_testis_tpm",
        "hpa_fallopian_tube_tpm",
        "gtex_retina_tpm",
        "gtex_cerebellum_tpm",
        "gtex_testis_tpm",
        "gtex_fallopian_tube_tpm",
        "cellxgene_photoreceptor_expr",
        "cellxgene_hair_cell_expr",
    ]
    # Filter to available columns
    available_tissue_cols = [col for col in tissue_columns if col in df.columns]
    if available_tissue_cols:
        df = calculate_tau_specificity(df, available_tissue_cols)
    else:
        # No tissue data at all: Tau is NULL for every gene
        df = df.with_columns(pl.lit(None).cast(pl.Float64).alias("tau_specificity"))
    # Compute expression score
    df = compute_expression_score(df)
    logger.info(
        "expression_pipeline_complete",
        row_count=len(df),
        has_hpa=any("hpa_" in col for col in df.columns),
        has_gtex=any("gtex_" in col for col in df.columns),
        has_cellxgene=any("cellxgene_" in col for col in df.columns),
    )
    return df