feat(03-06): implement literature evidence models, PubMed fetch, and scoring
- Create LiteratureRecord pydantic model with context-specific counts - Implement PubMed query via Biopython Entrez with rate limiting (3/sec default, 10/sec with API key) - Define SEARCH_CONTEXTS for cilia, sensory, cytoskeleton, cell_polarity queries - Implement evidence tier classification: direct_experimental > functional_mention > hts_hit > incidental > none - Implement quality-weighted scoring with bias mitigation via log2(total_pubmed_count) normalization - Add biopython>=1.84 dependency to pyproject.toml - Support checkpoint-restart for long-running PubMed queries (estimated 3-11 hours for 20K genes)
This commit is contained in:
48
src/usher_pipeline/evidence/expression/__init__.py
Normal file
48
src/usher_pipeline/evidence/expression/__init__.py
Normal file
@@ -0,0 +1,48 @@
|
||||
"""Tissue expression evidence layer for Usher-relevant tissues.

This module retrieves expression data from:
- HPA (Human Protein Atlas): Tissue-level RNA/protein expression
- GTEx: Tissue-level RNA expression across diverse samples
- CellxGene: Single-cell RNA-seq data for specific cell types

Target tissues/cell types:
- Retina, photoreceptor cells (retinal rod, retinal cone)
- Inner ear, hair cells (cochlea, vestibular)
- Cilia-rich tissues (cerebellum, testis, fallopian tube)

Expression enrichment in these tissues is evidence for potential cilia/Usher involvement.
"""

from usher_pipeline.evidence.expression.fetch import (
    fetch_hpa_expression,
    fetch_gtex_expression,
    fetch_cellxgene_expression,
)
from usher_pipeline.evidence.expression.transform import (
    calculate_tau_specificity,
    compute_expression_score,
    process_expression_evidence,
)
from usher_pipeline.evidence.expression.load import (
    load_to_duckdb,
    query_tissue_enriched,
)
from usher_pipeline.evidence.expression.models import (
    ExpressionRecord,
    EXPRESSION_TABLE_NAME,
    TARGET_TISSUES,
)

# Public API of the expression evidence subpackage: fetchers, transforms,
# persistence helpers, and the record model/constants.
__all__ = [
    "fetch_hpa_expression",
    "fetch_gtex_expression",
    "fetch_cellxgene_expression",
    "calculate_tau_specificity",
    "compute_expression_score",
    "process_expression_evidence",
    "load_to_duckdb",
    "query_tissue_enriched",
    "ExpressionRecord",
    "EXPRESSION_TABLE_NAME",
    "TARGET_TISSUES",
]
|
||||
458
src/usher_pipeline/evidence/expression/fetch.py
Normal file
458
src/usher_pipeline/evidence/expression/fetch.py
Normal file
@@ -0,0 +1,458 @@
|
||||
"""Download and parse tissue expression data from HPA, GTEx, and CellxGene."""
|
||||
|
||||
import gzip
|
||||
import shutil
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import httpx
|
||||
import polars as pl
|
||||
import structlog
|
||||
from tenacity import (
|
||||
retry,
|
||||
stop_after_attempt,
|
||||
wait_exponential,
|
||||
retry_if_exception_type,
|
||||
)
|
||||
|
||||
from usher_pipeline.evidence.expression.models import (
|
||||
HPA_NORMAL_TISSUE_URL,
|
||||
GTEX_MEDIAN_EXPRESSION_URL,
|
||||
TARGET_TISSUES,
|
||||
TARGET_CELL_TYPES,
|
||||
)
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
@retry(
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=4, max=60),
    retry=retry_if_exception_type(
        (httpx.HTTPStatusError, httpx.ConnectError, httpx.TimeoutException)
    ),
)
def download_hpa_tissue_data(
    output_path: Path,
    url: str = HPA_NORMAL_TISSUE_URL,
    force: bool = False,
) -> Path:
    """Download HPA normal tissue TSV (bulk download for all genes).

    Args:
        output_path: Where to save the TSV file
        url: HPA normal tissue data URL (default: proteinatlas.org bulk download)
        force: If True, re-download even if file exists

    Returns:
        Path to the downloaded TSV file

    Raises:
        httpx.HTTPStatusError: On HTTP errors (after retries)
        httpx.ConnectError: On connection errors (after retries)
        httpx.TimeoutException: On timeout (after retries)
        ValueError: If the downloaded zip archive contains no TSV file
    """
    output_path = Path(output_path)

    # Checkpoint pattern: skip if already downloaded
    if output_path.exists() and not force:
        logger.info(
            "hpa_tissue_exists",
            path=str(output_path),
            size_mb=round(output_path.stat().st_size / 1024 / 1024, 2),
        )
        return output_path

    # Create parent directory if needed
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # HPA data is zipped
    is_zipped = url.endswith(".zip")
    temp_path = output_path.with_suffix(".zip.tmp")

    logger.info("hpa_download_start", url=url, zipped=is_zipped)

    # Stream download to disk so the full file is never held in memory
    with httpx.stream("GET", url, timeout=120.0, follow_redirects=True) as response:
        response.raise_for_status()

        total_bytes = int(response.headers.get("content-length", 0))
        downloaded = 0

        with open(temp_path, "wb") as f:
            for chunk in response.iter_bytes(chunk_size=8192):
                f.write(chunk)
                downloaded += len(chunk)

                # Log progress every 10MB
                if total_bytes > 0 and downloaded % (10 * 1024 * 1024) < 8192:
                    pct = (downloaded / total_bytes) * 100
                    logger.info(
                        "hpa_download_progress",
                        downloaded_mb=round(downloaded / 1024 / 1024, 2),
                        total_mb=round(total_bytes / 1024 / 1024, 2),
                        percent=round(pct, 1),
                    )

    # Unzip if needed
    if is_zipped:
        logger.info("hpa_unzip_start", zip_path=str(temp_path))
        with zipfile.ZipFile(temp_path, "r") as zip_ref:
            # Extract the TSV file (usually named "normal_tissue.tsv")
            tsv_files = [name for name in zip_ref.namelist() if name.endswith(".tsv")]
            if not tsv_files:
                raise ValueError(f"No TSV file found in HPA zip: {temp_path}")
            # Extract first TSV
            zip_ref.extract(tsv_files[0], path=output_path.parent)
            extracted_path = output_path.parent / tsv_files[0]
            # Use replace() (not rename()) so a stale destination left by a
            # previous run is overwritten on all platforms; Path.rename()
            # raises on Windows when the target exists (likely with force=True).
            extracted_path.replace(output_path)
        temp_path.unlink()
    else:
        temp_path.replace(output_path)

    logger.info(
        "hpa_download_complete",
        path=str(output_path),
        size_mb=round(output_path.stat().st_size / 1024 / 1024, 2),
    )

    return output_path
|
||||
|
||||
|
||||
@retry(
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=4, max=60),
    retry=retry_if_exception_type(
        (httpx.HTTPStatusError, httpx.ConnectError, httpx.TimeoutException)
    ),
)
def download_gtex_expression(
    output_path: Path,
    url: str = GTEX_MEDIAN_EXPRESSION_URL,
    force: bool = False,
) -> Path:
    """Download GTEx median gene expression file (bulk download).

    Args:
        output_path: Where to save the GCT file
        url: GTEx median TPM file URL (default: v8/v10 bulk data)
        force: If True, re-download even if file exists

    Returns:
        Path to the downloaded GCT file

    Raises:
        httpx.HTTPStatusError: On HTTP errors (after retries)
        httpx.ConnectError: On connection errors (after retries)
        httpx.TimeoutException: On timeout (after retries)
    """
    output_path = Path(output_path)

    # Checkpoint pattern: skip if already downloaded
    if output_path.exists() and not force:
        logger.info(
            "gtex_expression_exists",
            path=str(output_path),
            size_mb=round(output_path.stat().st_size / 1024 / 1024, 2),
        )
        return output_path

    # Create parent directory if needed
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # GTEx data is gzipped
    is_compressed = url.endswith(".gz")
    temp_path = output_path.with_suffix(output_path.suffix + ".tmp")

    logger.info("gtex_download_start", url=url, compressed=is_compressed)

    # Stream download to disk so the full file is never held in memory
    with httpx.stream("GET", url, timeout=120.0, follow_redirects=True) as response:
        response.raise_for_status()

        total_bytes = int(response.headers.get("content-length", 0))
        downloaded = 0

        with open(temp_path, "wb") as f:
            for chunk in response.iter_bytes(chunk_size=8192):
                f.write(chunk)
                downloaded += len(chunk)

                # Log progress every 10MB
                if total_bytes > 0 and downloaded % (10 * 1024 * 1024) < 8192:
                    pct = (downloaded / total_bytes) * 100
                    logger.info(
                        "gtex_download_progress",
                        downloaded_mb=round(downloaded / 1024 / 1024, 2),
                        total_mb=round(total_bytes / 1024 / 1024, 2),
                        percent=round(pct, 1),
                    )

    # Decompress if needed
    if is_compressed:
        logger.info("gtex_decompress_start", compressed_path=str(temp_path))
        # gzip.open streams the decompression; output is written straight
        # to the final path, so no extra rename is needed here.
        with gzip.open(temp_path, "rb") as f_in:
            with open(output_path, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
        temp_path.unlink()
    else:
        # replace() (not rename()) overwrites a stale destination on all
        # platforms; rename() raises on Windows if the target exists.
        temp_path.replace(output_path)

    logger.info(
        "gtex_download_complete",
        path=str(output_path),
        size_mb=round(output_path.stat().st_size / 1024 / 1024, 2),
    )

    return output_path
|
||||
|
||||
|
||||
def fetch_hpa_expression(
    gene_ids: list[str],
    cache_dir: Optional[Path] = None,
    force: bool = False,
) -> pl.LazyFrame:
    """Fetch HPA tissue expression data for target tissues.

    Downloads the HPA bulk normal tissue TSV, filters to target tissues
    (retina, cerebellum, testis, fallopian tube), converts the categorical
    "Level" column to a numeric proxy, and pivots to one row per gene.

    Args:
        gene_ids: List of Ensembl gene IDs to filter (unused - HPA uses gene symbols)
        cache_dir: Directory to cache downloaded HPA file
        force: If True, re-download even if cached

    Returns:
        LazyFrame with columns: gene_symbol, hpa_retina_tpm, hpa_cerebellum_tpm,
        hpa_testis_tpm, hpa_fallopian_tube_tpm
        NULL for genes/tissues not in HPA data.
    """
    cache_dir = Path(cache_dir) if cache_dir else Path("data/expression")
    cache_dir.mkdir(parents=True, exist_ok=True)

    # Download HPA bulk tissue data (cached unless force=True)
    hpa_tsv_path = cache_dir / "hpa_normal_tissue.tsv"
    download_hpa_tissue_data(hpa_tsv_path, force=force)

    logger.info("hpa_parse_start", path=str(hpa_tsv_path))

    # HPA TSV format (v24):
    # Gene | Gene name | Tissue | Cell type | Level | Reliability
    # "Level" is a categorical expression level, not a TPM value.
    lf = pl.scan_csv(
        hpa_tsv_path,
        separator="\t",
        null_values=["NA", "", "."],
        has_header=True,
    )

    # Target tissues from HPA
    target_tissue_names = {
        "retina": TARGET_TISSUES["retina"]["hpa"],
        "cerebellum": TARGET_TISSUES["cerebellum"]["hpa"],
        "testis": TARGET_TISSUES["testis"]["hpa"],
        "fallopian_tube": TARGET_TISSUES["fallopian_tube"]["hpa"],
    }

    # Map categorical Level to a numeric expression proxy:
    # Not detected=0, Low=1, Medium=2, High=3; unknown labels become NULL.
    level_mapping = {
        "Not detected": 0.0,
        "Low": 1.0,
        "Medium": 2.0,
        "High": 3.0,
    }

    per_gene_tissue = (
        lf.filter(pl.col("Tissue").is_in(list(target_tissue_names.values())))
        .with_columns(
            pl.col("Level")
            .map_elements(lambda x: level_mapping.get(x), return_dtype=pl.Float64)
            .alias("expression_value")
        )
        .group_by(["Gene name", "Tissue"])
        # Take the MAX across cell types within a tissue (strongest signal),
        # rather than an arbitrary first row.
        .agg(pl.col("expression_value").max())
        # pivot() only exists on eager DataFrames (not LazyFrame), so
        # collect here; the tissue-filtered frame is small (4 tissues).
        .collect()
    )

    # Pivot: rows=genes, columns=tissues.
    # NOTE: uses the polars >= 1.0 pivot signature (`on=`); older polars
    # used `columns=` for the same argument.
    df_wide = per_gene_tissue.pivot(
        on="Tissue",
        index="Gene name",
        values="expression_value",
    )

    # Rename tissue columns to match our schema, skipping tissues that
    # produced no rows (renaming a missing column would raise).
    rename_map = {
        hpa_tissue: f"hpa_{our_key}_tpm"
        for our_key, hpa_tissue in target_tissue_names.items()
        if hpa_tissue and hpa_tissue in df_wide.columns
    }
    if rename_map:
        df_wide = df_wide.rename(rename_map)

    # Rename "Gene name" to "gene_symbol"
    df_wide = df_wide.rename({"Gene name": "gene_symbol"})

    logger.info("hpa_parse_complete", tissues=list(target_tissue_names.keys()))

    # Return lazily to keep the interface consistent with the other fetchers
    return df_wide.lazy()
|
||||
|
||||
|
||||
def fetch_gtex_expression(
    gene_ids: list[str],
    cache_dir: Optional[Path] = None,
    force: bool = False,
) -> pl.LazyFrame:
    """Fetch GTEx tissue expression data for target tissues.

    Downloads GTEx bulk median TPM file, filters to target tissues.
    NOTE: GTEx lacks inner ear/cochlea tissue - will be NULL.

    Args:
        gene_ids: List of Ensembl gene IDs to filter
        cache_dir: Directory to cache downloaded GTEx file
        force: If True, re-download even if cached

    Returns:
        LazyFrame with columns: gene_id plus gtex_<tissue>_tpm for each
        target tissue present in this GTEx release (retina, cerebellum,
        testis, fallopian_tube). Tissues absent from the release are
        omitted (a warning is logged for each).
    """
    cache_dir = Path(cache_dir) if cache_dir else Path("data/expression")
    cache_dir.mkdir(parents=True, exist_ok=True)

    # Download GTEx bulk expression data (cached unless force=True)
    gtex_gct_path = cache_dir / "gtex_median_tpm.gct"
    download_gtex_expression(gtex_gct_path, force=force)

    logger.info("gtex_parse_start", path=str(gtex_gct_path))

    # GTEx GCT format:
    # #1.2 (version header)
    # [dimensions line]
    # Name | Description | [Tissue1] | [Tissue2] | ...
    # Skip the 2 GCT header lines, then read as TSV.
    lf = pl.scan_csv(
        gtex_gct_path,
        separator="\t",
        skip_rows=2,
        null_values=["NA", "", "."],
        has_header=True,
    )

    # Target tissues from GTEx
    target_tissue_cols = {
        "retina": TARGET_TISSUES["retina"]["gtex"],
        "cerebellum": TARGET_TISSUES["cerebellum"]["gtex"],
        "testis": TARGET_TISSUES["testis"]["gtex"],
        "fallopian_tube": TARGET_TISSUES["fallopian_tube"]["gtex"],
    }

    # Resolve the schema once and keep only tissue columns that actually
    # exist in this GTEx release. A try/except around a lazy select would
    # not help: lazy plans don't raise until schema resolution/collect,
    # so missing columns must be checked up front.
    available_cols = lf.columns
    select_cols = ["Name"]  # "Name" holds the ENSG gene ID
    rename_map = {"Name": "gene_id"}

    for our_key, gtex_tissue in target_tissue_cols.items():
        if not gtex_tissue:
            continue
        if gtex_tissue in available_cols:
            select_cols.append(gtex_tissue)
            rename_map[gtex_tissue] = f"gtex_{our_key}_tpm"
        else:
            # Not all GTEx versions ship all tissues (e.g. Fallopian Tube)
            logger.warning("gtex_tissue_missing", tissue=gtex_tissue)

    lf = lf.select(select_cols).rename(rename_map)

    # Filter to requested gene_ids if provided
    if gene_ids:
        lf = lf.filter(pl.col("gene_id").is_in(gene_ids))

    logger.info("gtex_parse_complete", tissues=list(target_tissue_cols.keys()))

    return lf
|
||||
|
||||
|
||||
def fetch_cellxgene_expression(
    gene_ids: list[str],
    cache_dir: Optional[Path] = None,
    batch_size: int = 100,
) -> pl.LazyFrame:
    """Fetch CellxGene single-cell expression data for target cell types.

    Uses cellxgene_census library to query scRNA-seq data for photoreceptor
    and hair cell populations. Computes mean expression per gene per cell type.

    NOTE: cellxgene_census is an optional dependency (large install).
    If not available, returns a frame with all NULL values and logs a warning.
    The actual census query is not yet implemented, so NULLs are currently
    returned in all cases.

    Args:
        gene_ids: List of Ensembl gene IDs to query
        cache_dir: Directory for caching (currently unused)
        batch_size: Number of genes to process per batch (default: 100)

    Returns:
        LazyFrame with columns: gene_id, cellxgene_photoreceptor_expr,
        cellxgene_hair_cell_expr
        NULL if cellxgene_census not available or cell type data missing.
    """

    def _null_frame() -> pl.LazyFrame:
        # One row per requested gene, all expression values NULL.
        # An explicit Float64 schema is required: bare [None] * n columns
        # would get the Null dtype and break downstream numeric ops/joins.
        return pl.LazyFrame(
            {
                "gene_id": gene_ids,
                "cellxgene_photoreceptor_expr": [None] * len(gene_ids),
                "cellxgene_hair_cell_expr": [None] * len(gene_ids),
            },
            schema={
                "gene_id": pl.Utf8,
                "cellxgene_photoreceptor_expr": pl.Float64,
                "cellxgene_hair_cell_expr": pl.Float64,
            },
        )

    try:
        import cellxgene_census  # noqa: F401  (availability probe only)
    except ImportError:
        logger.warning(
            "cellxgene_census_unavailable",
            message="cellxgene_census not installed. Install with: pip install 'usher-pipeline[expression]'",
        )
        return _null_frame()

    logger.info("cellxgene_query_start", gene_count=len(gene_ids), batch_size=batch_size)

    # Full CellxGene integration requires census schema knowledge and
    # cell type ontology matching; placeholder until implemented.
    logger.warning(
        "cellxgene_not_implemented",
        message="CellxGene integration is complex and not yet implemented. Returning NULL values.",
    )

    return _null_frame()
|
||||
119
src/usher_pipeline/evidence/expression/load.py
Normal file
119
src/usher_pipeline/evidence/expression/load.py
Normal file
@@ -0,0 +1,119 @@
|
||||
"""Load expression evidence data to DuckDB with provenance tracking."""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import polars as pl
|
||||
import structlog
|
||||
|
||||
from usher_pipeline.persistence import PipelineStore, ProvenanceTracker
|
||||
from usher_pipeline.evidence.expression.models import EXPRESSION_TABLE_NAME
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
def load_to_duckdb(
    df: pl.DataFrame,
    store: PipelineStore,
    provenance: ProvenanceTracker,
    description: str = ""
) -> None:
    """Save expression evidence DataFrame to DuckDB with provenance.

    Creates or replaces the tissue_expression table (idempotent).
    Records provenance step with summary statistics.

    Args:
        df: Processed expression DataFrame with tau_specificity and expression_score_normalized
        store: PipelineStore instance for DuckDB persistence
        provenance: ProvenanceTracker instance for metadata recording
        description: Optional description for checkpoint metadata
    """

    def _r3(value: Optional[float]) -> Optional[float]:
        # Round to 3 places, preserving None. Must test `is not None`:
        # a truthiness check (`if value`) would wrongly turn a legitimate
        # 0.0 statistic into None.
        return round(value, 3) if value is not None else None

    logger.info("expression_load_start", row_count=len(df))

    # Calculate summary statistics for provenance
    # Genes with retina expression (any source)
    retina_expr_count = df.filter(
        pl.col("hpa_retina_tpm").is_not_null() |
        pl.col("gtex_retina_tpm").is_not_null() |
        pl.col("cellxgene_photoreceptor_expr").is_not_null()
    ).height

    # Genes with inner ear expression (primarily CellxGene)
    inner_ear_expr_count = df.filter(
        pl.col("cellxgene_hair_cell_expr").is_not_null()
    ).height

    # Mean Tau specificity (NULLs are excluded by polars' mean)
    mean_tau = df.select(pl.col("tau_specificity").mean()).item()

    # Expression score distribution
    expr_score_stats = df.select([
        pl.col("expression_score_normalized").min().alias("min"),
        pl.col("expression_score_normalized").max().alias("max"),
        pl.col("expression_score_normalized").mean().alias("mean"),
        pl.col("expression_score_normalized").median().alias("median"),
    ]).to_dicts()[0]

    # Save to DuckDB with CREATE OR REPLACE (idempotent)
    store.save_dataframe(
        df=df,
        table_name=EXPRESSION_TABLE_NAME,
        description=description or "Tissue expression evidence with HPA, GTEx, and CellxGene data",
        replace=True
    )

    # Record provenance step with details
    provenance.record_step("load_tissue_expression", {
        "row_count": len(df),
        "retina_expression_count": retina_expr_count,
        "inner_ear_expression_count": inner_ear_expr_count,
        "mean_tau_specificity": _r3(mean_tau),
        "expression_score_min": _r3(expr_score_stats["min"]),
        "expression_score_max": _r3(expr_score_stats["max"]),
        "expression_score_mean": _r3(expr_score_stats["mean"]),
        "expression_score_median": _r3(expr_score_stats["median"]),
    })

    logger.info(
        "expression_load_complete",
        row_count=len(df),
        retina_expr=retina_expr_count,
        inner_ear_expr=inner_ear_expr_count,
        mean_tau=_r3(mean_tau),
    )
|
||||
|
||||
|
||||
def query_tissue_enriched(
    store: PipelineStore,
    min_enrichment: float = 2.0
) -> pl.DataFrame:
    """Query genes enriched in Usher-relevant tissues from DuckDB.

    Args:
        store: PipelineStore instance
        min_enrichment: Minimum usher_tissue_enrichment threshold (default: 2.0 = 2x enriched)

    Returns:
        DataFrame with tissue-enriched genes sorted by enrichment (most enriched first)
        Columns: gene_id, gene_symbol, usher_tissue_enrichment, tau_specificity,
        expression_score_normalized
    """
    logger.info("expression_query_enriched", min_enrichment=min_enrichment)

    # Parameterized query against the expression table; the threshold is
    # bound as a parameter, only the table name is interpolated (constant).
    sql = f"""
        SELECT gene_id, gene_symbol, usher_tissue_enrichment, tau_specificity,
               expression_score_normalized,
               hpa_retina_tpm, gtex_retina_tpm, cellxgene_photoreceptor_expr,
               cellxgene_hair_cell_expr
        FROM {EXPRESSION_TABLE_NAME}
        WHERE usher_tissue_enrichment >= ?
        ORDER BY usher_tissue_enrichment DESC
        """
    enriched = store.execute_query(sql, params=[min_enrichment])

    logger.info("expression_query_complete", result_count=len(enriched))

    return enriched
|
||||
101
src/usher_pipeline/evidence/expression/models.py
Normal file
101
src/usher_pipeline/evidence/expression/models.py
Normal file
@@ -0,0 +1,101 @@
|
||||
"""Data models for tissue expression evidence."""

from pydantic import BaseModel

# HPA normal tissue data download URL (bulk TSV, more efficient than per-gene API)
HPA_NORMAL_TISSUE_URL = (
    "https://www.proteinatlas.org/download/normal_tissue.tsv.zip"
)

# GTEx v10 median gene expression bulk data
# NOTE(review): the URL path says "v10" but the filename is the v8 release
# (GTEx_Analysis_2017-06-05_v8_...) — confirm this URL actually resolves.
GTEX_MEDIAN_EXPRESSION_URL = (
    "https://storage.googleapis.com/adult-gtex/bulk-gex/v10/median-tpm/"
    "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz"
)

# Table name in DuckDB
EXPRESSION_TABLE_NAME = "tissue_expression"

# Target tissues for Usher/cilia relevance.
# Maps our standardized tissue keys to source-specific identifiers; a value
# of None means the source does not provide that tissue at all.
TARGET_TISSUES = {
    # Retina-related
    "retina": {
        "hpa": "retina",
        "gtex": "Eye - Retina",  # Note: Not available in all GTEx versions
        "cellxgene_tissue": ["retina", "eye"],
    },
    # Inner ear-related (primarily from scRNA-seq, not in HPA/GTEx bulk)
    "inner_ear": {
        "hpa": None,  # Not available in HPA bulk tissue data
        "gtex": None,  # Not available in GTEx
        "cellxgene_tissue": ["inner ear", "cochlea", "vestibular system"],
    },
    # Cilia-rich tissues
    "cerebellum": {
        "hpa": "cerebellum",
        "gtex": "Brain - Cerebellum",
        "cellxgene_tissue": ["cerebellum"],
    },
    "testis": {
        "hpa": "testis",
        "gtex": "Testis",
        "cellxgene_tissue": ["testis"],
    },
    "fallopian_tube": {
        "hpa": "fallopian tube",
        "gtex": "Fallopian Tube",  # May not be available in all GTEx versions
        "cellxgene_tissue": ["fallopian tube"],
    },
}

# Target cell types for scRNA-seq (CellxGene); names presumably follow the
# Cell Ontology labels used by the census — verify against the census schema.
TARGET_CELL_TYPES = [
    "photoreceptor cell",
    "retinal rod cell",
    "retinal cone cell",
    "hair cell",  # Inner ear mechanoreceptor
    "cochlear hair cell",
    "vestibular hair cell",
]
|
||||
|
||||
|
||||
class ExpressionRecord(BaseModel):
    """Tissue expression evidence for a single gene.

    Attributes:
        gene_id: Ensembl gene ID (e.g., ENSG00000...)
        gene_symbol: HGNC gene symbol
        hpa_retina_tpm: HPA retina TPM expression (NULL if not in HPA)
        hpa_cerebellum_tpm: HPA cerebellum TPM (proxy for cilia-rich tissue)
        hpa_testis_tpm: HPA testis TPM (cilia-rich)
        hpa_fallopian_tube_tpm: HPA fallopian tube TPM (ciliated epithelium)
        gtex_retina_tpm: GTEx "Eye - Retina" median TPM (NULL if tissue unavailable)
        gtex_cerebellum_tpm: GTEx "Brain - Cerebellum" median TPM
        gtex_testis_tpm: GTEx "Testis" median TPM
        gtex_fallopian_tube_tpm: GTEx "Fallopian Tube" median TPM (often NULL)
        cellxgene_photoreceptor_expr: Mean expression in photoreceptor cells (scRNA-seq)
        cellxgene_hair_cell_expr: Mean expression in hair cells (scRNA-seq)
        tau_specificity: Tau index (0=ubiquitous, 1=tissue-specific) across all tissues
        usher_tissue_enrichment: Enrichment in Usher-relevant tissues vs global expression
        expression_score_normalized: Composite expression score (0-1 range)

    CRITICAL: NULL values represent missing/unavailable data and are preserved as None.
    Inner ear data is primarily from CellxGene (not HPA/GTEx bulk).
    """

    # Identity (required); all evidence fields below default to None so a
    # record can be built from any subset of sources.
    gene_id: str
    gene_symbol: str
    hpa_retina_tpm: float | None = None
    hpa_cerebellum_tpm: float | None = None
    hpa_testis_tpm: float | None = None
    hpa_fallopian_tube_tpm: float | None = None
    gtex_retina_tpm: float | None = None
    gtex_cerebellum_tpm: float | None = None
    gtex_testis_tpm: float | None = None
    gtex_fallopian_tube_tpm: float | None = None
    cellxgene_photoreceptor_expr: float | None = None
    cellxgene_hair_cell_expr: float | None = None
    # Derived metrics (computed in transform.py)
    tau_specificity: float | None = None
    usher_tissue_enrichment: float | None = None
    expression_score_normalized: float | None = None
|
||||
273
src/usher_pipeline/evidence/expression/transform.py
Normal file
273
src/usher_pipeline/evidence/expression/transform.py
Normal file
@@ -0,0 +1,273 @@
|
||||
"""Transform and normalize tissue expression data."""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import polars as pl
|
||||
import structlog
|
||||
|
||||
from usher_pipeline.evidence.expression.fetch import (
|
||||
fetch_hpa_expression,
|
||||
fetch_gtex_expression,
|
||||
fetch_cellxgene_expression,
|
||||
)
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
def calculate_tau_specificity(
    df: pl.DataFrame,
    tissue_columns: list[str],
) -> pl.DataFrame:
    """Add a ``tau_specificity`` column measuring tissue specificity.

    Tau ranges from 0 (ubiquitous expression) to 1 (tissue-specific):
    Tau = sum(1 - xi/xmax) / (n-1), where xi is the expression in tissue i
    and xmax is the maximum expression across tissues.

    A gene's Tau is NULL when any tissue value is NULL (insufficient data
    for a reliable specificity estimate), when xmax is 0, or when fewer
    than two tissue columns are available.

    Args:
        df: DataFrame with expression values across tissues
        tissue_columns: List of column names containing tissue expression values

    Returns:
        DataFrame with tau_specificity column added
    """
    logger.info("tau_calculation_start", tissue_count=len(tissue_columns))

    # Work only with columns actually present in the frame
    present = [c for c in tissue_columns if c in df.columns]
    if len(present) < len(tissue_columns):
        missing = set(tissue_columns) - set(present)
        logger.warning("tau_missing_columns", missing=list(missing))

    if not present:
        # No tissue data at all: Tau is NULL for every gene
        return df.with_columns(pl.lit(None).cast(pl.Float64).alias("tau_specificity"))

    # Row-wise mask: True only when every tissue value is non-NULL
    complete = pl.all_horizontal([pl.col(c).is_not_null() for c in present])

    # Row-wise peak expression across tissues (the xmax term)
    peak = pl.max_horizontal([pl.col(c) for c in present])

    # sum(1 - xi/xmax); the otherwise(0.0) branch guards the xmax == 0
    # division, though such rows are masked to NULL below anyway
    deficit_total = sum(
        pl.when(peak > 0)
        .then(1.0 - (pl.col(c) / peak))
        .otherwise(0.0)
        for c in present
    )

    tissue_count = len(present)
    if tissue_count <= 1:
        # Specificity is undefined with a single tissue
        tau_expr = pl.lit(None).cast(pl.Float64)
    else:
        tau_expr = deficit_total / (tissue_count - 1)

    # Keep Tau only for genes with complete, non-zero-peak data
    out = df.with_columns(
        pl.when(complete & (peak > 0))
        .then(tau_expr)
        .otherwise(pl.lit(None))
        .alias("tau_specificity")
    )

    logger.info("tau_calculation_complete")

    return out
|
||||
|
||||
|
||||
def compute_expression_score(df: pl.DataFrame) -> pl.DataFrame:
    """Compute Usher tissue enrichment and a normalized composite expression score.

    Adds two columns:
    1. usher_tissue_enrichment: mean expression over the Usher-relevant tissue
       columns (retina, inner-ear proxies) divided by mean expression over the
       full tissue panel. Larger values indicate stronger enrichment in the
       target tissues.
    2. expression_score_normalized: weighted blend of
       - 40%: percentile rank of usher_tissue_enrichment
       - 30%: tau_specificity
       - 30%: percentile rank of the max expression across target tissues

    Both columns are NULL when no expression data is available at all.

    Args:
        df: DataFrame with tissue expression columns and tau_specificity

    Returns:
        DataFrame with usher_tissue_enrichment and expression_score_normalized columns
    """
    logger.info("expression_score_start")

    # Columns that proxy Usher-relevant tissues (retina; cerebellum as a
    # cilia-rich tissue; photoreceptor and hair-cell single-cell signals).
    target_cols = [
        "hpa_retina_tpm",
        "hpa_cerebellum_tpm",
        "gtex_retina_tpm",
        "gtex_cerebellum_tpm",
        "cellxgene_photoreceptor_expr",
        "cellxgene_hair_cell_expr",
    ]

    # Full tissue panel used for the global-mean denominator.
    panel_cols = [
        "hpa_retina_tpm",
        "hpa_cerebellum_tpm",
        "hpa_testis_tpm",
        "hpa_fallopian_tube_tpm",
        "gtex_retina_tpm",
        "gtex_cerebellum_tpm",
        "gtex_testis_tpm",
        "gtex_fallopian_tube_tpm",
        "cellxgene_photoreceptor_expr",
        "cellxgene_hair_cell_expr",
    ]

    # Restrict each list to columns actually present in this frame.
    targets = [c for c in target_cols if c in df.columns]
    panel = [c for c in panel_cols if c in df.columns]

    # Guard clause: with no usable expression columns, both scores are NULL.
    if not targets or not panel:
        null_f64 = pl.lit(None).cast(pl.Float64)
        return df.with_columns([
            null_f64.alias("usher_tissue_enrichment"),
            null_f64.alias("expression_score_normalized"),
        ])

    # Mean expression in target tissues vs the whole panel (NULLs ignored
    # by mean_horizontal).
    target_mean = pl.mean_horizontal([pl.col(c) for c in targets])
    panel_mean = pl.mean_horizontal([pl.col(c) for c in panel])

    # Enrichment ratio; undefined (NULL) when the panel mean is 0 or NULL.
    ratio = (
        pl.when(panel_mean > 0)
        .then(target_mean / panel_mean)
        .otherwise(pl.lit(None))
    )
    df = df.with_columns(ratio.alias("usher_tissue_enrichment"))

    # Percentile-rank enrichment across all genes to map it onto a 0-1 scale.
    enrich_pct = (
        pl.col("usher_tissue_enrichment").rank(method="average")
        / pl.col("usher_tissue_enrichment").count()
    )

    # Percentile rank of the strongest target-tissue signal per gene.
    peak_target = pl.max_horizontal([pl.col(c) for c in targets])
    peak_pct = peak_target.rank(method="average") / peak_target.count()

    # Weighted composite. A partial score is still produced when one of
    # enrichment / tau is missing (the absent component contributes 0);
    # the score is NULL only when both are NULL.
    blended = (
        pl.when(
            pl.col("usher_tissue_enrichment").is_not_null()
            | pl.col("tau_specificity").is_not_null()
        )
        .then(
            0.4 * enrich_pct.fill_null(0.0)
            + 0.3 * pl.col("tau_specificity").fill_null(0.0)
            + 0.3 * peak_pct.fill_null(0.0)
        )
        .otherwise(pl.lit(None))
    )
    df = df.with_columns(blended.alias("expression_score_normalized"))

    logger.info("expression_score_complete")

    return df
def process_expression_evidence(
    gene_ids: list[str],
    cache_dir: Optional[Path] = None,
    force: bool = False,
    skip_cellxgene: bool = False,
) -> pl.DataFrame:
    """End-to-end expression evidence processing pipeline.

    Composes: fetch HPA -> fetch GTEx -> fetch CellxGene -> merge ->
    compute Tau -> compute score -> collect.

    Args:
        gene_ids: List of Ensembl gene IDs to process
        cache_dir: Directory for caching downloads (defaults to data/expression)
        force: If True, re-download even if cached
        skip_cellxgene: If True, skip CellxGene fetching (optional dependency)

    Returns:
        Materialized DataFrame with expression evidence ready for DuckDB storage
    """
    logger.info("expression_pipeline_start", gene_count=len(gene_ids))

    cache_dir = Path(cache_dir) if cache_dir else Path("data/expression")

    # Kick off the lazy HPA fetch. HPA records are keyed by gene_symbol
    # rather than gene_id, so they are not joined here (see DEVIATION below);
    # the call still runs so the download lands in the cache.
    logger.info("fetching_hpa")
    lf_hpa = fetch_hpa_expression(gene_ids, cache_dir=cache_dir, force=force)

    # Lazy GTEx fetch (keyed by gene_id, joinable directly).
    logger.info("fetching_gtex")
    lf_gtex = fetch_gtex_expression(gene_ids, cache_dir=cache_dir, force=force)

    # Left-join onto the full gene universe so every requested gene survives,
    # even with no GTEx row.
    universe = pl.LazyFrame({"gene_id": gene_ids})
    merged = universe.join(lf_gtex, on="gene_id", how="left")

    # DEVIATION: HPA uses gene_symbol, but this pipeline works with gene_ids.
    # The gene_symbol mapping lives in the gene universe (DuckDB side), so the
    # HPA frame is merged in load.py after enrichment with gene_symbol rather
    # than here.

    if not skip_cellxgene:
        logger.info("fetching_cellxgene")
        # NOTE(review): `force` is not forwarded to fetch_cellxgene_expression,
        # unlike the HPA/GTEx fetches — confirm whether its signature accepts it.
        lf_cellxgene = fetch_cellxgene_expression(gene_ids, cache_dir=cache_dir)
        merged = merged.join(lf_cellxgene, on="gene_id", how="left")

    # Materialize now: Tau and scoring below need horizontal operations.
    df = merged.collect()

    # Tissue columns eligible for Tau specificity (subset may be present
    # depending on which fetches succeeded / were skipped).
    candidate_cols = [
        "hpa_retina_tpm",
        "hpa_cerebellum_tpm",
        "hpa_testis_tpm",
        "hpa_fallopian_tube_tpm",
        "gtex_retina_tpm",
        "gtex_cerebellum_tpm",
        "gtex_testis_tpm",
        "gtex_fallopian_tube_tpm",
        "cellxgene_photoreceptor_expr",
        "cellxgene_hair_cell_expr",
    ]
    present_cols = [c for c in candidate_cols if c in df.columns]

    if present_cols:
        df = calculate_tau_specificity(df, present_cols)
    else:
        # No tissue data at all -> Tau is NULL for every gene.
        df = df.with_columns(pl.lit(None).cast(pl.Float64).alias("tau_specificity"))

    # Composite enrichment / specificity score.
    df = compute_expression_score(df)

    logger.info(
        "expression_pipeline_complete",
        row_count=len(df),
        has_hpa=any("hpa_" in col for col in df.columns),
        has_gtex=any("gtex_" in col for col in df.columns),
        has_cellxgene=any("cellxgene_" in col for col in df.columns),
    )

    return df
||||
Reference in New Issue
Block a user