feat(03-06): implement literature evidence models, PubMed fetch, and scoring

- Create LiteratureRecord pydantic model with context-specific counts
- Implement PubMed query via Biopython Entrez with rate limiting (3/sec default, 10/sec with API key)
- Define SEARCH_CONTEXTS for cilia, sensory, cytoskeleton, cell_polarity queries
- Implement evidence tier classification: direct_experimental > functional_mention > hts_hit > incidental > none
- Implement quality-weighted scoring with bias mitigation via log2(total_pubmed_count) normalization
- Add biopython>=1.84 dependency to pyproject.toml
- Support checkpoint-restart for long-running PubMed queries (estimated 3-11 hours for 20K genes)
This commit is contained in:
2026-02-11 19:00:20 +08:00
parent 6645c59b0b
commit 8aa66987f8
11 changed files with 1806 additions and 0 deletions

View File

@@ -0,0 +1,48 @@
"""Tissue expression evidence layer for Usher-relevant tissues.
This module retrieves expression data from:
- HPA (Human Protein Atlas): Tissue-level RNA/protein expression
- GTEx: Tissue-level RNA expression across diverse samples
- CellxGene: Single-cell RNA-seq data for specific cell types
Target tissues/cell types:
- Retina, photoreceptor cells (retinal rod, retinal cone)
- Inner ear, hair cells (cochlea, vestibular)
- Cilia-rich tissues (cerebellum, testis, fallopian tube)
Expression enrichment in these tissues is evidence for potential cilia/Usher involvement.
"""
from usher_pipeline.evidence.expression.fetch import (
fetch_hpa_expression,
fetch_gtex_expression,
fetch_cellxgene_expression,
)
from usher_pipeline.evidence.expression.transform import (
calculate_tau_specificity,
compute_expression_score,
process_expression_evidence,
)
from usher_pipeline.evidence.expression.load import (
load_to_duckdb,
query_tissue_enriched,
)
from usher_pipeline.evidence.expression.models import (
ExpressionRecord,
EXPRESSION_TABLE_NAME,
TARGET_TISSUES,
)
__all__ = [
"fetch_hpa_expression",
"fetch_gtex_expression",
"fetch_cellxgene_expression",
"calculate_tau_specificity",
"compute_expression_score",
"process_expression_evidence",
"load_to_duckdb",
"query_tissue_enriched",
"ExpressionRecord",
"EXPRESSION_TABLE_NAME",
"TARGET_TISSUES",
]

View File

@@ -0,0 +1,458 @@
"""Download and parse tissue expression data from HPA, GTEx, and CellxGene."""
import gzip
import shutil
import zipfile
from pathlib import Path
from typing import Optional
import httpx
import polars as pl
import structlog
from tenacity import (
retry,
stop_after_attempt,
wait_exponential,
retry_if_exception_type,
)
from usher_pipeline.evidence.expression.models import (
HPA_NORMAL_TISSUE_URL,
GTEX_MEDIAN_EXPRESSION_URL,
TARGET_TISSUES,
TARGET_CELL_TYPES,
)
logger = structlog.get_logger()
@retry(
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=4, max=60),
    retry=retry_if_exception_type(
        (httpx.HTTPStatusError, httpx.ConnectError, httpx.TimeoutException)
    ),
)
def download_hpa_tissue_data(
    output_path: Path,
    url: str = HPA_NORMAL_TISSUE_URL,
    force: bool = False,
) -> Path:
    """Download HPA normal tissue TSV (bulk download for all genes).

    Args:
        output_path: Where to save the TSV file
        url: HPA normal tissue data URL (default: proteinatlas.org bulk download)
        force: If True, re-download even if file exists

    Returns:
        Path to the downloaded TSV file

    Raises:
        httpx.HTTPStatusError: On HTTP errors (after retries)
        httpx.ConnectError: On connection errors (after retries)
        httpx.TimeoutException: On timeout (after retries)
        ValueError: If the downloaded zip contains no TSV member
    """
    output_path = Path(output_path)
    # Checkpoint pattern: skip if already downloaded
    if output_path.exists() and not force:
        logger.info(
            "hpa_tissue_exists",
            path=str(output_path),
            size_mb=round(output_path.stat().st_size / 1024 / 1024, 2),
        )
        return output_path
    # Create parent directory if needed
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # HPA data is zipped
    is_zipped = url.endswith(".zip")
    temp_path = output_path.with_suffix(".zip.tmp")
    logger.info("hpa_download_start", url=url, zipped=is_zipped)
    # Stream download to disk so large files never sit fully in memory
    with httpx.stream("GET", url, timeout=120.0, follow_redirects=True) as response:
        response.raise_for_status()
        total_bytes = int(response.headers.get("content-length", 0))
        downloaded = 0
        with open(temp_path, "wb") as f:
            for chunk in response.iter_bytes(chunk_size=8192):
                f.write(chunk)
                downloaded += len(chunk)
                # Log progress every 10MB
                if total_bytes > 0 and downloaded % (10 * 1024 * 1024) < 8192:
                    pct = (downloaded / total_bytes) * 100
                    logger.info(
                        "hpa_download_progress",
                        downloaded_mb=round(downloaded / 1024 / 1024, 2),
                        total_mb=round(total_bytes / 1024 / 1024, 2),
                        percent=round(pct, 1),
                    )
    # Unzip if needed
    if is_zipped:
        logger.info("hpa_unzip_start", zip_path=str(temp_path))
        with zipfile.ZipFile(temp_path, "r") as zip_ref:
            # Extract the TSV file (usually named "normal_tissue.tsv")
            tsv_files = [name for name in zip_ref.namelist() if name.endswith(".tsv")]
            if not tsv_files:
                raise ValueError(f"No TSV file found in HPA zip: {temp_path}")
            # Use the path returned by extract(): it is correct even when the
            # TSV member sits inside a subdirectory of the archive (the old
            # `output_path.parent / name` reconstruction handled only flat zips).
            extracted_path = Path(
                zip_ref.extract(tsv_files[0], path=output_path.parent)
            )
            # replace() (not rename()) so a stale destination file from a
            # previous forced run is overwritten on all platforms.
            extracted_path.replace(output_path)
        temp_path.unlink()
    else:
        temp_path.replace(output_path)
    logger.info(
        "hpa_download_complete",
        path=str(output_path),
        size_mb=round(output_path.stat().st_size / 1024 / 1024, 2),
    )
    return output_path
@retry(
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=4, max=60),
    retry=retry_if_exception_type(
        (httpx.HTTPStatusError, httpx.ConnectError, httpx.TimeoutException)
    ),
)
def download_gtex_expression(
    output_path: Path,
    url: str = GTEX_MEDIAN_EXPRESSION_URL,
    force: bool = False,
) -> Path:
    """Download GTEx median gene expression file (bulk download).

    Args:
        output_path: Where to save the GCT file
        url: GTEx median TPM file URL (default: v8/v10 bulk data)
        force: If True, re-download even if file exists

    Returns:
        Path to the downloaded GCT file

    Raises:
        httpx.HTTPStatusError: On HTTP errors (after retries)
        httpx.ConnectError: On connection errors (after retries)
        httpx.TimeoutException: On timeout (after retries)
    """
    output_path = Path(output_path)
    # Checkpoint pattern: skip if already downloaded
    if output_path.exists() and not force:
        logger.info(
            "gtex_expression_exists",
            path=str(output_path),
            size_mb=round(output_path.stat().st_size / 1024 / 1024, 2),
        )
        return output_path
    # Create parent directory if needed
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # GTEx data is gzipped
    is_compressed = url.endswith(".gz")
    temp_path = output_path.with_suffix(output_path.suffix + ".tmp")
    logger.info("gtex_download_start", url=url, compressed=is_compressed)
    # Stream download to disk so large files never sit fully in memory
    with httpx.stream("GET", url, timeout=120.0, follow_redirects=True) as response:
        response.raise_for_status()
        total_bytes = int(response.headers.get("content-length", 0))
        downloaded = 0
        with open(temp_path, "wb") as f:
            for chunk in response.iter_bytes(chunk_size=8192):
                f.write(chunk)
                downloaded += len(chunk)
                # Log progress every 10MB
                if total_bytes > 0 and downloaded % (10 * 1024 * 1024) < 8192:
                    pct = (downloaded / total_bytes) * 100
                    logger.info(
                        "gtex_download_progress",
                        downloaded_mb=round(downloaded / 1024 / 1024, 2),
                        total_mb=round(total_bytes / 1024 / 1024, 2),
                        percent=round(pct, 1),
                    )
    # Decompress if needed
    if is_compressed:
        logger.info("gtex_decompress_start", compressed_path=str(temp_path))
        with gzip.open(temp_path, "rb") as f_in:
            with open(output_path, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
        temp_path.unlink()
    else:
        # replace() (not rename()) so a stale destination file from a
        # previous forced run is overwritten on all platforms.
        temp_path.replace(output_path)
    logger.info(
        "gtex_download_complete",
        path=str(output_path),
        size_mb=round(output_path.stat().st_size / 1024 / 1024, 2),
    )
    return output_path
def fetch_hpa_expression(
    gene_ids: list[str],
    cache_dir: Optional[Path] = None,
    force: bool = False,
) -> pl.LazyFrame:
    """Fetch HPA tissue expression data for target tissues.

    Downloads the HPA bulk normal-tissue TSV, filters to target tissues
    (retina, cerebellum, testis, fallopian tube), maps the categorical
    "Level" to a numeric proxy, and pivots to one row per gene.

    Args:
        gene_ids: List of Ensembl gene IDs to filter (unused - HPA uses gene symbols)
        cache_dir: Directory to cache downloaded HPA file
        force: If True, re-download even if cached

    Returns:
        LazyFrame with columns: gene_symbol, hpa_retina_tpm, hpa_cerebellum_tpm,
        hpa_testis_tpm, hpa_fallopian_tube_tpm
        NULL for genes/tissues not in HPA data.
    """
    cache_dir = Path(cache_dir) if cache_dir else Path("data/expression")
    cache_dir.mkdir(parents=True, exist_ok=True)
    # Download HPA bulk tissue data (checkpointed: no-op if already cached)
    hpa_tsv_path = cache_dir / "hpa_normal_tissue.tsv"
    download_hpa_tissue_data(hpa_tsv_path, force=force)
    logger.info("hpa_parse_start", path=str(hpa_tsv_path))
    # HPA TSV format (v24):
    # Gene | Gene name | Tissue | Cell type | Level | Reliability
    # "Level" is a categorical expression level, not a TPM value.
    lf = pl.scan_csv(
        hpa_tsv_path,
        separator="\t",
        null_values=["NA", "", "."],
        has_header=True,
    )
    # Target tissues from HPA (our key -> HPA tissue label)
    target_tissue_names = {
        "retina": TARGET_TISSUES["retina"]["hpa"],
        "cerebellum": TARGET_TISSUES["cerebellum"]["hpa"],
        "testis": TARGET_TISSUES["testis"]["hpa"],
        "fallopian_tube": TARGET_TISSUES["fallopian_tube"]["hpa"],
    }
    # Filter to target tissues only
    lf = lf.filter(pl.col("Tissue").is_in(list(target_tissue_names.values())))
    # Map categorical Level to a numeric expression proxy:
    # Not detected=0, Low=1, Medium=2, High=3; unknown labels -> NULL
    level_mapping = {
        "Not detected": 0.0,
        "Low": 1.0,
        "Medium": 2.0,
        "High": 3.0,
    }
    # Aggregate across cell types within a tissue keeping the MAX level.
    # (Previously .first() was used on the raw Level string, which depended
    # on row order and contradicted the stated intent of taking the max.)
    lf = (
        lf.with_columns(
            pl.col("Level")
            .map_elements(lambda x: level_mapping.get(x), return_dtype=pl.Float64)
            .alias("expression_value")
        )
        .group_by(["Gene name", "Tissue"])
        .agg(pl.col("expression_value").max())
    )
    # BUG FIX: pivot() is only available on DataFrame, not LazyFrame, so the
    # (already filtered and aggregated, hence small) frame is collected before
    # pivoting to wide format: rows=genes, columns=tissues.
    df_wide = lf.collect().pivot(
        values="expression_value",
        index="Gene name",
        on="Tissue",
    )
    # Rename tissue columns to our schema; only rename columns that actually
    # came out of the pivot so rename() cannot fail on a missing tissue.
    rename_map = {
        hpa_tissue: f"hpa_{our_key}_tpm"
        for our_key, hpa_tissue in target_tissue_names.items()
        if hpa_tissue and hpa_tissue in df_wide.columns
    }
    if rename_map:
        df_wide = df_wide.rename(rename_map)
    # Rename "Gene name" to "gene_symbol"
    df_wide = df_wide.rename({"Gene name": "gene_symbol"})
    logger.info("hpa_parse_complete", tissues=list(target_tissue_names.keys()))
    # Return lazily to keep the fetch API uniform across sources
    return df_wide.lazy()
def fetch_gtex_expression(
    gene_ids: list[str],
    cache_dir: Optional[Path] = None,
    force: bool = False,
) -> pl.LazyFrame:
    """Fetch GTEx tissue expression data for target tissues.

    Downloads GTEx bulk median TPM file, filters to target tissues.
    NOTE: GTEx lacks inner ear/cochlea tissue - will be NULL.

    Args:
        gene_ids: List of Ensembl gene IDs to filter
        cache_dir: Directory to cache downloaded GTEx file
        force: If True, re-download even if cached

    Returns:
        LazyFrame with columns: gene_id, gtex_retina_tpm, gtex_cerebellum_tpm,
        gtex_testis_tpm, gtex_fallopian_tube_tpm
        Tissue columns absent from the GTEx release are simply omitted.
    """
    cache_dir = Path(cache_dir) if cache_dir else Path("data/expression")
    cache_dir.mkdir(parents=True, exist_ok=True)
    # Download GTEx bulk expression data (checkpointed: no-op if cached)
    gtex_gct_path = cache_dir / "gtex_median_tpm.gct"
    download_gtex_expression(gtex_gct_path, force=force)
    logger.info("gtex_parse_start", path=str(gtex_gct_path))
    # GTEx GCT format:
    # #1.2 (version header)
    # [dimensions line]
    # Name | Description | <Tissue1> | <Tissue2> | ...
    # Skip first 2 lines (GCT header), then read the real header row.
    lf = pl.scan_csv(
        gtex_gct_path,
        separator="\t",
        skip_rows=2,
        null_values=["NA", "", "."],
        has_header=True,
    )
    # Target tissues from GTEx (our key -> GTEx column name)
    target_tissue_cols = {
        "retina": TARGET_TISSUES["retina"]["gtex"],
        "cerebellum": TARGET_TISSUES["cerebellum"]["gtex"],
        "testis": TARGET_TISSUES["testis"]["gtex"],
        "fallopian_tube": TARGET_TISSUES["fallopian_tube"]["gtex"],
    }
    # BUG FIX: selecting a missing column on a LazyFrame does not raise here
    # (lazy plans only fail at collect time), so the previous try/except
    # around select() was dead code. Instead, resolve the schema up front
    # and select only columns that actually exist in this GTEx release.
    available_cols = lf.columns
    # "Name" holds the Ensembl gene ID; "Description" the gene symbol.
    select_cols = ["Name"]
    rename_map = {"Name": "gene_id"}
    missing_tissues = []
    for our_key, gtex_tissue in target_tissue_cols.items():
        if not gtex_tissue:
            continue  # tissue has no GTEx equivalent (e.g. inner ear)
        if gtex_tissue in available_cols:
            select_cols.append(gtex_tissue)
            rename_map[gtex_tissue] = f"gtex_{our_key}_tpm"
        else:
            missing_tissues.append(gtex_tissue)
    if missing_tissues:
        # Not all GTEx releases carry all tissues (e.g. "Eye - Retina")
        logger.warning("gtex_tissue_missing", tissues=missing_tissues)
    lf = lf.select(select_cols).rename(rename_map)
    # Filter to requested gene_ids if provided
    if gene_ids:
        lf = lf.filter(pl.col("gene_id").is_in(gene_ids))
    logger.info("gtex_parse_complete", tissues=list(target_tissue_cols.keys()))
    return lf
def fetch_cellxgene_expression(
    gene_ids: list[str],
    cache_dir: Optional[Path] = None,
    batch_size: int = 100,
) -> pl.LazyFrame:
    """Fetch CellxGene single-cell expression data for target cell types.

    Intended to query scRNA-seq data (photoreceptor and hair cell
    populations) via the cellxgene_census library and average expression
    per gene per cell type.

    NOTE: cellxgene_census is an optional dependency (large install), and
    the census query itself is not yet implemented, so this currently
    returns NULL placeholders in every case.

    Args:
        gene_ids: List of Ensembl gene IDs to query
        cache_dir: Directory for caching (currently unused)
        batch_size: Number of genes to process per batch (default: 100)

    Returns:
        LazyFrame with columns: gene_id, cellxgene_photoreceptor_expr,
        cellxgene_hair_cell_expr — all expression values NULL.
    """
    # One-row-per-gene placeholder frame with NULL expression values,
    # returned on every path until the census query is implemented.
    nulls = [None] * len(gene_ids)
    placeholder = pl.LazyFrame({
        "gene_id": gene_ids,
        "cellxgene_photoreceptor_expr": nulls,
        "cellxgene_hair_cell_expr": nulls,
    })
    try:
        import cellxgene_census  # noqa: F401
    except ImportError:
        logger.warning(
            "cellxgene_census_unavailable",
            message="cellxgene_census not installed. Install with: pip install 'usher-pipeline[expression]'",
        )
        return placeholder
    logger.info("cellxgene_query_start", gene_count=len(gene_ids), batch_size=batch_size)
    # Full integration needs census schema knowledge plus cell-type
    # ontology matching; until then the placeholder stands in.
    logger.warning(
        "cellxgene_not_implemented",
        message="CellxGene integration is complex and not yet implemented. Returning NULL values.",
    )
    return placeholder

View File

@@ -0,0 +1,119 @@
"""Load expression evidence data to DuckDB with provenance tracking."""
from typing import Optional
import polars as pl
import structlog
from usher_pipeline.persistence import PipelineStore, ProvenanceTracker
from usher_pipeline.evidence.expression.models import EXPRESSION_TABLE_NAME
logger = structlog.get_logger()
def load_to_duckdb(
    df: pl.DataFrame,
    store: PipelineStore,
    provenance: ProvenanceTracker,
    description: str = ""
) -> None:
    """Save expression evidence DataFrame to DuckDB with provenance.

    Creates or replaces the tissue_expression table (idempotent).
    Records provenance step with summary statistics.

    Args:
        df: Processed expression DataFrame with tau_specificity and expression_score_normalized
        store: PipelineStore instance for DuckDB persistence
        provenance: ProvenanceTracker instance for metadata recording
        description: Optional description for checkpoint metadata
    """
    logger.info("expression_load_start", row_count=len(df))

    def _round3(value):
        # BUG FIX: the previous truthiness checks (`if mean_tau else None`)
        # collapsed legitimate 0.0 statistics to None; only None itself
        # means "no data".
        return round(value, 3) if value is not None else None

    # Calculate summary statistics for provenance
    # Genes with retina expression (any source)
    retina_expr_count = df.filter(
        pl.col("hpa_retina_tpm").is_not_null() |
        pl.col("gtex_retina_tpm").is_not_null() |
        pl.col("cellxgene_photoreceptor_expr").is_not_null()
    ).height
    # Genes with inner ear expression (primarily CellxGene)
    inner_ear_expr_count = df.filter(
        pl.col("cellxgene_hair_cell_expr").is_not_null()
    ).height
    # Mean Tau specificity (excluding NULLs)
    mean_tau = df.select(pl.col("tau_specificity").mean()).item()
    # Expression score distribution
    expr_score_stats = df.select([
        pl.col("expression_score_normalized").min().alias("min"),
        pl.col("expression_score_normalized").max().alias("max"),
        pl.col("expression_score_normalized").mean().alias("mean"),
        pl.col("expression_score_normalized").median().alias("median"),
    ]).to_dicts()[0]
    # Save to DuckDB with CREATE OR REPLACE (idempotent)
    store.save_dataframe(
        df=df,
        table_name=EXPRESSION_TABLE_NAME,
        description=description or "Tissue expression evidence with HPA, GTEx, and CellxGene data",
        replace=True
    )
    # Record provenance step with details
    provenance.record_step("load_tissue_expression", {
        "row_count": len(df),
        "retina_expression_count": retina_expr_count,
        "inner_ear_expression_count": inner_ear_expr_count,
        "mean_tau_specificity": _round3(mean_tau),
        "expression_score_min": _round3(expr_score_stats["min"]),
        "expression_score_max": _round3(expr_score_stats["max"]),
        "expression_score_mean": _round3(expr_score_stats["mean"]),
        "expression_score_median": _round3(expr_score_stats["median"]),
    })
    logger.info(
        "expression_load_complete",
        row_count=len(df),
        retina_expr=retina_expr_count,
        inner_ear_expr=inner_ear_expr_count,
        mean_tau=_round3(mean_tau),
    )
def query_tissue_enriched(
    store: PipelineStore,
    min_enrichment: float = 2.0
) -> pl.DataFrame:
    """Return genes enriched in Usher-relevant tissues, most enriched first.

    Args:
        store: PipelineStore instance
        min_enrichment: Minimum usher_tissue_enrichment threshold (default: 2.0 = 2x enriched)

    Returns:
        DataFrame sorted by usher_tissue_enrichment descending, with columns:
        gene_id, gene_symbol, usher_tissue_enrichment, tau_specificity,
        expression_score_normalized plus the per-source retina/hair-cell values
    """
    logger.info("expression_query_enriched", min_enrichment=min_enrichment)
    # Parameterized threshold keeps the query safe and plannable.
    sql = f"""
        SELECT gene_id, gene_symbol, usher_tissue_enrichment, tau_specificity,
               expression_score_normalized,
               hpa_retina_tpm, gtex_retina_tpm, cellxgene_photoreceptor_expr,
               cellxgene_hair_cell_expr
        FROM {EXPRESSION_TABLE_NAME}
        WHERE usher_tissue_enrichment >= ?
        ORDER BY usher_tissue_enrichment DESC
    """
    result = store.execute_query(sql, params=[min_enrichment])
    logger.info("expression_query_complete", result_count=len(result))
    return result

View File

@@ -0,0 +1,101 @@
"""Data models for tissue expression evidence."""
from pydantic import BaseModel
# HPA normal tissue data download URL (bulk TSV, more efficient than per-gene API)
HPA_NORMAL_TISSUE_URL = (
"https://www.proteinatlas.org/download/normal_tissue.tsv.zip"
)
# GTEx v10 median gene expression bulk data
GTEX_MEDIAN_EXPRESSION_URL = (
"https://storage.googleapis.com/adult-gtex/bulk-gex/v10/median-tpm/"
"GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz"
)
# Table name in DuckDB
EXPRESSION_TABLE_NAME = "tissue_expression"
# Target tissues for Usher/cilia relevance
# Maps our standardized tissue keys to API-specific identifiers
TARGET_TISSUES = {
# Retina-related
"retina": {
"hpa": "retina",
"gtex": "Eye - Retina", # Note: Not available in all GTEx versions
"cellxgene_tissue": ["retina", "eye"],
},
# Inner ear-related (primarily from scRNA-seq, not in HPA/GTEx bulk)
"inner_ear": {
"hpa": None, # Not available in HPA bulk tissue data
"gtex": None, # Not available in GTEx
"cellxgene_tissue": ["inner ear", "cochlea", "vestibular system"],
},
# Cilia-rich tissues
"cerebellum": {
"hpa": "cerebellum",
"gtex": "Brain - Cerebellum",
"cellxgene_tissue": ["cerebellum"],
},
"testis": {
"hpa": "testis",
"gtex": "Testis",
"cellxgene_tissue": ["testis"],
},
"fallopian_tube": {
"hpa": "fallopian tube",
"gtex": "Fallopian Tube", # May not be available in all GTEx versions
"cellxgene_tissue": ["fallopian tube"],
},
}
# Target cell types for scRNA-seq (CellxGene)
TARGET_CELL_TYPES = [
"photoreceptor cell",
"retinal rod cell",
"retinal cone cell",
"hair cell", # Inner ear mechanoreceptor
"cochlear hair cell",
"vestibular hair cell",
]
class ExpressionRecord(BaseModel):
    """Tissue expression evidence for a single gene.

    Attributes:
        gene_id: Ensembl gene ID (e.g., ENSG00000...)
        gene_symbol: HGNC gene symbol
        hpa_retina_tpm: HPA retina TPM expression (NULL if not in HPA)
        hpa_cerebellum_tpm: HPA cerebellum TPM (proxy for cilia-rich tissue)
        hpa_testis_tpm: HPA testis TPM (cilia-rich)
        hpa_fallopian_tube_tpm: HPA fallopian tube TPM (ciliated epithelium)
        gtex_retina_tpm: GTEx "Eye - Retina" median TPM (NULL if tissue unavailable)
        gtex_cerebellum_tpm: GTEx "Brain - Cerebellum" median TPM
        gtex_testis_tpm: GTEx "Testis" median TPM
        gtex_fallopian_tube_tpm: GTEx "Fallopian Tube" median TPM (often NULL)
        cellxgene_photoreceptor_expr: Mean expression in photoreceptor cells (scRNA-seq)
        cellxgene_hair_cell_expr: Mean expression in hair cells (scRNA-seq)
        tau_specificity: Tau index (0=ubiquitous, 1=tissue-specific) across all tissues
        usher_tissue_enrichment: Enrichment in Usher-relevant tissues vs global expression
        expression_score_normalized: Composite expression score (0-1 range)

    CRITICAL: NULL values represent missing/unavailable data and are preserved as None.
    Inner ear data is primarily from CellxGene (not HPA/GTEx bulk).
    """
    # Identity
    gene_id: str
    gene_symbol: str
    # HPA bulk tissue values (numeric proxy; NULL when gene/tissue absent)
    hpa_retina_tpm: float | None = None
    hpa_cerebellum_tpm: float | None = None
    hpa_testis_tpm: float | None = None
    hpa_fallopian_tube_tpm: float | None = None
    # GTEx median TPM values (NULL when the release lacks the tissue)
    gtex_retina_tpm: float | None = None
    gtex_cerebellum_tpm: float | None = None
    gtex_testis_tpm: float | None = None
    gtex_fallopian_tube_tpm: float | None = None
    # CellxGene single-cell means (currently placeholder NULLs upstream)
    cellxgene_photoreceptor_expr: float | None = None
    cellxgene_hair_cell_expr: float | None = None
    # Derived metrics computed in the transform step
    tau_specificity: float | None = None
    usher_tissue_enrichment: float | None = None
    expression_score_normalized: float | None = None

View File

@@ -0,0 +1,273 @@
"""Transform and normalize tissue expression data."""
from pathlib import Path
from typing import Optional
import polars as pl
import structlog
from usher_pipeline.evidence.expression.fetch import (
fetch_hpa_expression,
fetch_gtex_expression,
fetch_cellxgene_expression,
)
logger = structlog.get_logger()
def calculate_tau_specificity(
    df: pl.DataFrame,
    tissue_columns: list[str],
) -> pl.DataFrame:
    """Add a ``tau_specificity`` column measuring tissue specificity.

    Tau = sum(1 - xi/xmax) / (n-1), where xi is expression in tissue i and
    xmax the maximum across tissues. 0 means ubiquitous expression, 1 means
    fully tissue-specific. A gene gets NULL Tau when any tissue value is
    NULL (insufficient data), when xmax is 0, or when fewer than two tissue
    columns are available.

    Args:
        df: DataFrame with expression values across tissues
        tissue_columns: List of column names containing tissue expression values

    Returns:
        DataFrame with tau_specificity column added
    """
    logger.info("tau_calculation_start", tissue_count=len(tissue_columns))
    # Work only with columns that actually exist in the frame.
    present = [c for c in tissue_columns if c in df.columns]
    if len(present) < len(tissue_columns):
        absent = set(tissue_columns) - set(present)
        logger.warning("tau_missing_columns", missing=list(absent))
    if not present:
        # No tissue data at all: Tau is NULL for every gene.
        return df.with_columns(pl.lit(None).cast(pl.Float64).alias("tau_specificity"))
    # A gene qualifies only when every available tissue value is non-NULL.
    complete_row = pl.all_horizontal([pl.col(c).is_not_null() for c in present])
    # Per-gene maximum expression across the available tissues.
    peak = pl.max_horizontal([pl.col(c) for c in present])
    # Accumulate sum(1 - xi/xmax); guard each term against xmax == 0.
    terms = [
        pl.when(peak > 0).then(1.0 - (pl.col(c) / peak)).otherwise(0.0)
        for c in present
    ]
    accumulated = terms[0]
    for term in terms[1:]:
        accumulated = accumulated + term
    # Normalize by (n-1); with a single tissue, specificity is undefined.
    if len(present) > 1:
        tau_expr = accumulated / (len(present) - 1)
    else:
        tau_expr = pl.lit(None).cast(pl.Float64)
    # Assign Tau only where the gene has complete, nonzero data.
    out = df.with_columns(
        pl.when(complete_row & (peak > 0))
        .then(tau_expr)
        .otherwise(pl.lit(None))
        .alias("tau_specificity")
    )
    logger.info("tau_calculation_complete")
    return out
def compute_expression_score(df: pl.DataFrame) -> pl.DataFrame:
    """Compute Usher tissue enrichment and normalized expression score.

    Computes:
    1. usher_tissue_enrichment: Ratio of mean expression in Usher-relevant tissues
       (retina, inner ear proxies) to mean expression across all tissues.
       Higher ratio = more enriched in target tissues.
    2. expression_score_normalized: Weighted composite of:
       - 40%: usher_tissue_enrichment (converted to a 0-1 percentile rank)
       - 30%: tau_specificity
       - 30%: percentile rank of max expression across the target tissues
       NULL when both enrichment and tau are NULL for a gene.

    Args:
        df: DataFrame with tissue expression columns and tau_specificity

    Returns:
        DataFrame with usher_tissue_enrichment and expression_score_normalized columns
    """
    logger.info("expression_score_start")
    # Define Usher-relevant tissue columns (numerator of the enrichment ratio)
    usher_tissue_cols = [
        "hpa_retina_tpm",
        "hpa_cerebellum_tpm",  # Cilia-rich
        "gtex_retina_tpm",
        "gtex_cerebellum_tpm",
        "cellxgene_photoreceptor_expr",
        "cellxgene_hair_cell_expr",
    ]
    # All tissue columns for the global mean (denominator)
    all_tissue_cols = [
        "hpa_retina_tpm",
        "hpa_cerebellum_tpm",
        "hpa_testis_tpm",
        "hpa_fallopian_tube_tpm",
        "gtex_retina_tpm",
        "gtex_cerebellum_tpm",
        "gtex_testis_tpm",
        "gtex_fallopian_tube_tpm",
        "cellxgene_photoreceptor_expr",
        "cellxgene_hair_cell_expr",
    ]
    # Filter to columns actually present in this frame
    usher_available = [col for col in usher_tissue_cols if col in df.columns]
    all_available = [col for col in all_tissue_cols if col in df.columns]
    if not usher_available or not all_available:
        # No expression data at all - both derived columns are NULL
        return df.with_columns([
            pl.lit(None).cast(pl.Float64).alias("usher_tissue_enrichment"),
            pl.lit(None).cast(pl.Float64).alias("expression_score_normalized"),
        ])
    # Mean expression in Usher tissues (mean_horizontal skips NULLs)
    usher_mean = pl.mean_horizontal([pl.col(col) for col in usher_available])
    # Mean expression across all tissues (NULLs skipped likewise)
    global_mean = pl.mean_horizontal([pl.col(col) for col in all_available])
    # Enrichment ratio: usher_mean / global_mean.
    # If global_mean is 0 or NULL, enrichment is NULL.
    enrichment = pl.when(global_mean > 0).then(usher_mean / global_mean).otherwise(pl.lit(None))
    df = df.with_columns(enrichment.alias("usher_tissue_enrichment"))
    # Normalize enrichment to 0-1 via percentile rank over all genes;
    # count() excludes NULLs, and NULL values receive a NULL rank.
    enrichment_percentile = pl.col("usher_tissue_enrichment").rank(method="average") / pl.col("usher_tissue_enrichment").count()
    # Percentile rank of the per-gene maximum expression in target tissues
    max_target_expr = pl.max_horizontal([pl.col(col) for col in usher_available])
    max_target_percentile = max_target_expr.rank(method="average") / max_target_expr.count()
    # Composite score (weighted average). Computed whenever at least one of
    # enrichment or tau is present for the gene.
    # NOTE(review): fill_null(0.0) means any missing component contributes
    # zero, which drags the composite down for partially-observed genes —
    # confirm this is the intended treatment rather than re-weighting.
    composite = pl.when(
        pl.col("usher_tissue_enrichment").is_not_null() | pl.col("tau_specificity").is_not_null()
    ).then(
        0.4 * enrichment_percentile.fill_null(0.0) +
        0.3 * pl.col("tau_specificity").fill_null(0.0) +
        0.3 * max_target_percentile.fill_null(0.0)
    ).otherwise(pl.lit(None))
    df = df.with_columns(composite.alias("expression_score_normalized"))
    logger.info("expression_score_complete")
    return df
def process_expression_evidence(
    gene_ids: list[str],
    cache_dir: Optional[Path] = None,
    force: bool = False,
    skip_cellxgene: bool = False,
) -> pl.DataFrame:
    """End-to-end expression evidence processing pipeline.

    Composes: fetch HPA -> fetch GTEx -> fetch CellxGene -> merge -> compute Tau -> compute score -> collect

    Args:
        gene_ids: List of Ensembl gene IDs to process
        cache_dir: Directory for caching downloads
        force: If True, re-download even if cached
        skip_cellxgene: If True, skip CellxGene fetching (optional dependency)

    Returns:
        Materialized DataFrame with expression evidence ready for DuckDB storage
    """
    logger.info("expression_pipeline_start", gene_count=len(gene_ids))
    cache_dir = Path(cache_dir) if cache_dir else Path("data/expression")
    # Fetch HPA expression (lazy)
    logger.info("fetching_hpa")
    # NOTE(review): lf_hpa is fetched (which at least warms the download
    # cache) but is never merged below — per the DEVIATION note, HPA keys on
    # gene_symbol while this pipeline keys on gene_id, and the merge is
    # deferred. Confirm a downstream step actually performs that join,
    # otherwise the hpa_* columns will be missing from the output.
    lf_hpa = fetch_hpa_expression(gene_ids, cache_dir=cache_dir, force=force)
    # Fetch GTEx expression (lazy)
    logger.info("fetching_gtex")
    lf_gtex = fetch_gtex_expression(gene_ids, cache_dir=cache_dir, force=force)
    # Create gene universe DataFrame so every requested gene appears in the
    # output even when a source has no data for it.
    gene_universe = pl.LazyFrame({"gene_id": gene_ids})
    # Merge GTEx with gene universe (left join to preserve all genes)
    # GTEx has gene_id, HPA has gene_symbol - need to handle join carefully
    lf_merged = gene_universe.join(lf_gtex, on="gene_id", how="left")
    # For HPA, we need gene_symbol mapping
    # We'll need to load gene universe with gene_symbol from DuckDB or pass it in
    # For now, we'll fetch HPA separately and join on gene_symbol later
    # This requires gene_symbol in our gene_ids input or from gene universe
    # DEVIATION: HPA uses gene_symbol, but we're working with gene_ids
    # We need gene_symbol mapping. For simplicity, we'll collect HPA separately
    # and merge in load.py after enriching with gene_symbol from gene universe
    # Fetch CellxGene if not skipped
    # NOTE(review): with skip_cellxgene=True the cellxgene_* columns are
    # absent from the result; verify downstream consumers that reference
    # them (e.g. the loader's summary statistics) tolerate that.
    if not skip_cellxgene:
        logger.info("fetching_cellxgene")
        lf_cellxgene = fetch_cellxgene_expression(gene_ids, cache_dir=cache_dir)
        lf_merged = lf_merged.join(lf_cellxgene, on="gene_id", how="left")
    # Collect at this point to enable horizontal operations
    df = lf_merged.collect()
    # Calculate Tau specificity over whichever tissue columns materialized
    tissue_columns = [
        "hpa_retina_tpm",
        "hpa_cerebellum_tpm",
        "hpa_testis_tpm",
        "hpa_fallopian_tube_tpm",
        "gtex_retina_tpm",
        "gtex_cerebellum_tpm",
        "gtex_testis_tpm",
        "gtex_fallopian_tube_tpm",
        "cellxgene_photoreceptor_expr",
        "cellxgene_hair_cell_expr",
    ]
    # Filter to available columns
    available_tissue_cols = [col for col in tissue_columns if col in df.columns]
    if available_tissue_cols:
        df = calculate_tau_specificity(df, available_tissue_cols)
    else:
        # No tissue data at all: Tau is NULL for every gene
        df = df.with_columns(pl.lit(None).cast(pl.Float64).alias("tau_specificity"))
    # Compute expression score
    df = compute_expression_score(df)
    logger.info(
        "expression_pipeline_complete",
        row_count=len(df),
        has_hpa=any("hpa_" in col for col in df.columns),
        has_gtex=any("gtex_" in col for col in df.columns),
        has_cellxgene=any("cellxgene_" in col for col in df.columns),
    )
    return df