feat(03-06): implement literature evidence models, PubMed fetch, and scoring

- Create LiteratureRecord pydantic model with context-specific counts
- Implement PubMed query via Biopython Entrez with rate limiting (3/sec default, 10/sec with API key)
- Define SEARCH_CONTEXTS for cilia, sensory, cytoskeleton, cell_polarity queries
- Implement evidence tier classification: direct_experimental > functional_mention > hts_hit > incidental > none
- Implement quality-weighted scoring with bias mitigation via log2(total_pubmed_count) normalization
- Add biopython>=1.84 dependency to pyproject.toml
- Support checkpoint-restart for long-running PubMed queries (estimated 3-11 hours for 20K genes)
This commit is contained in:
2026-02-11 19:00:20 +08:00
parent 6645c59b0b
commit 8aa66987f8
11 changed files with 1806 additions and 0 deletions

View File

@@ -0,0 +1,273 @@
"""Transform and normalize tissue expression data."""
from pathlib import Path
from typing import Optional
import polars as pl
import structlog
from usher_pipeline.evidence.expression.fetch import (
fetch_hpa_expression,
fetch_gtex_expression,
fetch_cellxgene_expression,
)
logger = structlog.get_logger()
def calculate_tau_specificity(
    df: pl.DataFrame,
    tissue_columns: list[str],
) -> pl.DataFrame:
    """Add a ``tau_specificity`` column measuring tissue specificity.

    Tau ranges from 0 (ubiquitous expression) to 1 (tissue-specific) and is
    defined as ``Tau = sum(1 - xi/xmax) / (n - 1)`` over the n tissue columns,
    where xmax is the per-gene maximum across tissues. A gene receives a NULL
    Tau when any tissue value is NULL (insufficient data for a reliable
    specificity estimate), when fewer than two tissue columns are available,
    or when the per-gene maximum is not positive (division undefined).

    Args:
        df: DataFrame with expression values across tissues
        tissue_columns: List of column names containing tissue expression values

    Returns:
        DataFrame with tau_specificity column added
    """
    logger.info("tau_calculation_start", tissue_count=len(tissue_columns))
    # Only operate on columns actually present in the frame.
    cols_present = [c for c in tissue_columns if c in df.columns]
    if len(cols_present) != len(tissue_columns):
        absent = set(tissue_columns) - set(cols_present)
        logger.warning("tau_missing_columns", missing=list(absent))
    if not cols_present:
        # Nothing to compute from: emit an all-NULL Tau column.
        null_tau = pl.lit(None).cast(pl.Float64).alias("tau_specificity")
        return df.with_columns(null_tau)
    # A gene only gets a Tau value when every available tissue column is
    # populated for that row.
    complete_row = pl.all_horizontal([pl.col(c).is_not_null() for c in cols_present])
    # Per-gene maximum across tissues: the xmax denominator of each term.
    row_max = pl.max_horizontal([pl.col(c) for c in cols_present])
    # Accumulate sum(1 - xi/xmax). The zero-max guard keeps each term
    # well-defined; rows with row_max == 0 are masked to NULL below anyway.
    contribution = pl.lit(0.0)
    for c in cols_present:
        term = (
            pl.when(row_max > 0)
            .then(1.0 - (pl.col(c) / row_max))
            .otherwise(0.0)
        )
        contribution = contribution + term
    tissue_count = len(cols_present)
    if tissue_count <= 1:
        # Specificity is meaningless with a single tissue.
        tau_expr = pl.lit(None).cast(pl.Float64)
    else:
        tau_expr = contribution / (tissue_count - 1)
    # Mask Tau to NULL for incomplete rows or a non-positive row maximum.
    df = df.with_columns(
        pl.when(complete_row & (row_max > 0))
        .then(tau_expr)
        .otherwise(pl.lit(None))
        .alias("tau_specificity")
    )
    logger.info("tau_calculation_complete")
    return df
def compute_expression_score(df: pl.DataFrame) -> pl.DataFrame:
    """Compute Usher tissue enrichment and normalized expression score.

    Adds two columns:

    1. ``usher_tissue_enrichment``: ratio of mean expression in Usher-relevant
       tissues (retina, inner-ear proxies) to mean expression across all
       tissues; higher means more enriched in the target tissues. NULL when
       the global mean is 0 or NULL.
    2. ``expression_score_normalized``: weighted composite of
       - 40%: percentile rank of usher_tissue_enrichment (0-1)
       - 30%: tau_specificity
       - 30%: percentile rank of the max expression in target tissues
       NULL when both enrichment and tau_specificity are NULL.

    Args:
        df: DataFrame with tissue expression columns and tau_specificity

    Returns:
        DataFrame with usher_tissue_enrichment and expression_score_normalized columns
    """
    logger.info("expression_score_start")
    # Usher-relevant tissue columns (retina plus cilia-rich / inner-ear proxies).
    target_cols = [
        "hpa_retina_tpm",
        "hpa_cerebellum_tpm",  # Cilia-rich
        "gtex_retina_tpm",
        "gtex_cerebellum_tpm",
        "cellxgene_photoreceptor_expr",
        "cellxgene_hair_cell_expr",
    ]
    # Every tissue column, used for the global mean denominator.
    universe_cols = [
        "hpa_retina_tpm",
        "hpa_cerebellum_tpm",
        "hpa_testis_tpm",
        "hpa_fallopian_tube_tpm",
        "gtex_retina_tpm",
        "gtex_cerebellum_tpm",
        "gtex_testis_tpm",
        "gtex_fallopian_tube_tpm",
        "cellxgene_photoreceptor_expr",
        "cellxgene_hair_cell_expr",
    ]
    target_present = [c for c in target_cols if c in df.columns]
    universe_present = [c for c in universe_cols if c in df.columns]
    if not target_present or not universe_present:
        # No usable expression data: both scores are NULL.
        return df.with_columns([
            pl.lit(None).cast(pl.Float64).alias("usher_tissue_enrichment"),
            pl.lit(None).cast(pl.Float64).alias("expression_score_normalized"),
        ])
    # Row-wise means ignore NULLs by construction of mean_horizontal.
    target_mean = pl.mean_horizontal([pl.col(c) for c in target_present])
    overall_mean = pl.mean_horizontal([pl.col(c) for c in universe_present])
    # Enrichment ratio, NULL when the global mean is 0 or NULL.
    ratio = (
        pl.when(overall_mean > 0)
        .then(target_mean / overall_mean)
        .otherwise(pl.lit(None))
    )
    df = df.with_columns(ratio.alias("usher_tissue_enrichment"))
    # Normalize enrichment to 0-1 via average-rank percentile across genes.
    enrich_col = pl.col("usher_tissue_enrichment")
    enrich_pct = enrich_col.rank(method="average") / enrich_col.count()
    # Percentile rank of the per-gene peak expression in target tissues.
    peak_target = pl.max_horizontal([pl.col(c) for c in target_present])
    peak_pct = peak_target.rank(method="average") / peak_target.count()
    # Weighted composite; a partial score is still computed when only one of
    # enrichment / tau is available (the missing parts contribute 0).
    score = (
        pl.when(enrich_col.is_not_null() | pl.col("tau_specificity").is_not_null())
        .then(
            0.4 * enrich_pct.fill_null(0.0)
            + 0.3 * pl.col("tau_specificity").fill_null(0.0)
            + 0.3 * peak_pct.fill_null(0.0)
        )
        .otherwise(pl.lit(None))
    )
    df = df.with_columns(score.alias("expression_score_normalized"))
    logger.info("expression_score_complete")
    return df
def process_expression_evidence(
    gene_ids: list[str],
    cache_dir: Optional[Path] = None,
    force: bool = False,
    skip_cellxgene: bool = False,
) -> pl.DataFrame:
    """End-to-end expression evidence processing pipeline.

    Composes: fetch HPA -> fetch GTEx -> fetch CellxGene -> merge -> compute
    Tau -> compute score -> collect.

    Args:
        gene_ids: List of Ensembl gene IDs to process
        cache_dir: Directory for caching downloads (defaults to data/expression)
        force: If True, re-download even if cached
        skip_cellxgene: If True, skip CellxGene fetching (optional dependency)

    Returns:
        Materialized DataFrame with expression evidence ready for DuckDB storage
    """
    logger.info("expression_pipeline_start", gene_count=len(gene_ids))
    cache_dir = Path(cache_dir) if cache_dir else Path("data/expression")
    # Fetch HPA expression. DEVIATION: HPA is keyed by gene_symbol while this
    # pipeline works with gene_ids, so the HPA frame is NOT merged here — the
    # call is kept for its download/caching side effect and the merge happens
    # in load.py once gene_symbol is available from the gene universe.
    # (Fix: the previous binding `lf_hpa = ...` was never used.)
    logger.info("fetching_hpa")
    fetch_hpa_expression(gene_ids, cache_dir=cache_dir, force=force)
    # Fetch GTEx expression (lazy).
    logger.info("fetching_gtex")
    lf_gtex = fetch_gtex_expression(gene_ids, cache_dir=cache_dir, force=force)
    # Left-join against the full gene universe so genes without GTEx data are
    # preserved with NULL expression values.
    gene_universe = pl.LazyFrame({"gene_id": gene_ids})
    lf_merged = gene_universe.join(lf_gtex, on="gene_id", how="left")
    # Fetch CellxGene unless skipped (optional dependency).
    if not skip_cellxgene:
        logger.info("fetching_cellxgene")
        lf_cellxgene = fetch_cellxgene_expression(gene_ids, cache_dir=cache_dir)
        lf_merged = lf_merged.join(lf_cellxgene, on="gene_id", how="left")
    # Materialize here: Tau and scoring need horizontal (row-wise) operations.
    df = lf_merged.collect()
    # Tissue columns considered for the Tau specificity index; must stay in
    # sync with the column names produced by the fetch_* helpers.
    tissue_columns = [
        "hpa_retina_tpm",
        "hpa_cerebellum_tpm",
        "hpa_testis_tpm",
        "hpa_fallopian_tube_tpm",
        "gtex_retina_tpm",
        "gtex_cerebellum_tpm",
        "gtex_testis_tpm",
        "gtex_fallopian_tube_tpm",
        "cellxgene_photoreceptor_expr",
        "cellxgene_hair_cell_expr",
    ]
    available_tissue_cols = [col for col in tissue_columns if col in df.columns]
    if available_tissue_cols:
        df = calculate_tau_specificity(df, available_tissue_cols)
    else:
        # No recognized tissue columns: Tau cannot be computed.
        df = df.with_columns(pl.lit(None).cast(pl.Float64).alias("tau_specificity"))
    # Composite enrichment/specificity score.
    df = compute_expression_score(df)
    logger.info(
        "expression_pipeline_complete",
        row_count=len(df),
        has_hpa=any("hpa_" in col for col in df.columns),
        has_gtex=any("gtex_" in col for col in df.columns),
        has_cellxgene=any("cellxgene_" in col for col in df.columns),
    )
    return df