feat(03-04): create localization evidence data model and processing
- Define LocalizationRecord model with HPA and proteomics fields - Implement fetch_hpa_subcellular to download HPA bulk data - Implement fetch_cilia_proteomics with curated reference gene sets - Implement classify_evidence_type (experimental vs computational) - Implement score_localization with cilia proximity scoring - Implement process_localization_evidence end-to-end pipeline - Create load_to_duckdb for persistence with provenance
This commit is contained in:
33
src/usher_pipeline/evidence/localization/__init__.py
Normal file
33
src/usher_pipeline/evidence/localization/__init__.py
Normal file
@@ -0,0 +1,33 @@
|
||||
"""Subcellular localization evidence layer.
|
||||
|
||||
Integrates HPA subcellular localization and published cilia/centrosome
|
||||
proteomics data to score genes by proximity to cilia-related compartments.
|
||||
"""
|
||||
|
||||
from usher_pipeline.evidence.localization.fetch import (
|
||||
fetch_hpa_subcellular,
|
||||
fetch_cilia_proteomics,
|
||||
)
|
||||
from usher_pipeline.evidence.localization.transform import (
|
||||
classify_evidence_type,
|
||||
score_localization,
|
||||
process_localization_evidence,
|
||||
)
|
||||
from usher_pipeline.evidence.localization.load import (
|
||||
load_to_duckdb,
|
||||
)
|
||||
from usher_pipeline.evidence.localization.models import (
|
||||
LocalizationRecord,
|
||||
LOCALIZATION_TABLE_NAME,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"fetch_hpa_subcellular",
|
||||
"fetch_cilia_proteomics",
|
||||
"classify_evidence_type",
|
||||
"score_localization",
|
||||
"process_localization_evidence",
|
||||
"load_to_duckdb",
|
||||
"LocalizationRecord",
|
||||
"LOCALIZATION_TABLE_NAME",
|
||||
]
|
||||
285
src/usher_pipeline/evidence/localization/fetch.py
Normal file
285
src/usher_pipeline/evidence/localization/fetch.py
Normal file
@@ -0,0 +1,285 @@
|
||||
"""Download and parse HPA subcellular localization and cilia proteomics data."""
|
||||
|
||||
import io
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import httpx
|
||||
import polars as pl
|
||||
import structlog
|
||||
from tenacity import (
|
||||
retry,
|
||||
stop_after_attempt,
|
||||
wait_exponential,
|
||||
retry_if_exception_type,
|
||||
)
|
||||
|
||||
from usher_pipeline.evidence.localization.models import HPA_SUBCELLULAR_URL
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
# Curated reference gene sets from published proteomics studies
|
||||
# CiliaCarta: comprehensive cilium proteome database
|
||||
# Source: van Dam et al. (2019) PLoS ONE, Arnaiz et al. (2014) PNAS
|
||||
CILIA_PROTEOMICS_GENES = {
|
||||
"BBS1", "BBS2", "BBS4", "BBS5", "BBS7", "BBS9", "BBS10", "BBS12",
|
||||
"CEP290", "CEP164", "CEP120", "RPGRIP1L", "TCTN1", "TCTN2", "TCTN3",
|
||||
"IFT88", "IFT81", "IFT140", "IFT172", "IFT80", "IFT52", "IFT57",
|
||||
"DYNC2H1", "DYNC2LI1", "WDR34", "WDR60", "TCTEX1D2",
|
||||
"NPHP1", "NPHP4", "INVS", "ANKS6", "NEK8",
|
||||
"ARL13B", "INPP5E", "CEP41", "TMEM67", "TMEM216", "TMEM231", "TMEM237",
|
||||
"MKS1", "TMEM17", "CC2D2A", "AHI1", "RPGRIP1", "NPHP3",
|
||||
"OFD1", "C5orf42", "CSPP1", "C2CD3", "CEP83", "SCLT1",
|
||||
"KIF7", "GLI2", "GLI3", "SUFU", "GPR161",
|
||||
"TALPID3", "B9D1", "B9D2", "MKS1", "TCTN2",
|
||||
"KIAA0586", "NEK1", "DZIP1", "DZIP1L", "FUZ",
|
||||
"POC5", "POC1B", "CEP135", "CEP152", "CEP192",
|
||||
"ALMS1", "TTC21B", "IFT122", "IFT144", "WDR19", "WDR35",
|
||||
"SPAG1", "RSPH1", "RSPH4A", "RSPH9", "DNAH5", "DNAH11", "DNAI1", "DNAI2",
|
||||
"C21orf59", "CCDC39", "CCDC40", "CCDC65", "CCDC103", "CCDC114",
|
||||
"DRC1", "ARMC4", "TTC25", "ZMYND10", "LRRC6", "PIH1D3",
|
||||
"HYDIN", "SPEF2", "CFAP43", "CFAP44", "CFAP53", "CFAP54",
|
||||
}
|
||||
|
||||
# Centrosome proteome database genes
|
||||
# Source: Firat-Karalar & Stearns (2014), Andersen et al. (2003) MCP
|
||||
CENTROSOME_PROTEOMICS_GENES = {
|
||||
"PCNT", "CDK5RAP2", "CEP192", "CEP152", "CEP135", "CEP120",
|
||||
"TUBG1", "TUBG2", "TUBGCP2", "TUBGCP3", "TUBGCP4", "TUBGCP5", "TUBGCP6",
|
||||
"NEDD1", "AKAP9", "NINL", "NIN",
|
||||
"CEP170", "CEP170B", "CEP131", "CEP63", "CEP72", "CEP97",
|
||||
"PLK1", "PLK4", "AURKA", "AURKB",
|
||||
"SASS6", "CENPJ", "STIL", "SAS6", "CEP152",
|
||||
"POC5", "POC1A", "POC1B", "CPAP", "CEP135",
|
||||
"CEP295", "OFD1", "C2CD3", "CCDC14", "CCDC67", "CCDC120",
|
||||
"KIAA0753", "SSX2IP", "CEP89", "CEP104", "CEP112", "CEP128",
|
||||
"CP110", "CCDC67", "CEP97", "CNTROB", "CETN2", "CETN3",
|
||||
}
|
||||
|
||||
|
||||
@retry(
|
||||
stop=stop_after_attempt(5),
|
||||
wait=wait_exponential(multiplier=1, min=4, max=60),
|
||||
retry=retry_if_exception_type(
|
||||
(httpx.HTTPStatusError, httpx.ConnectError, httpx.TimeoutException)
|
||||
),
|
||||
)
|
||||
def download_hpa_subcellular(
|
||||
output_path: Path,
|
||||
url: str = HPA_SUBCELLULAR_URL,
|
||||
force: bool = False,
|
||||
) -> Path:
|
||||
"""Download HPA subcellular location data with retry and streaming.
|
||||
|
||||
Downloads the HPA subcellular_location.tsv.zip file, extracts the TSV,
|
||||
and saves it to the output path.
|
||||
|
||||
Args:
|
||||
output_path: Where to save the TSV file
|
||||
url: HPA subcellular location URL (default: official bulk download)
|
||||
force: If True, re-download even if file exists
|
||||
|
||||
Returns:
|
||||
Path to the downloaded TSV file
|
||||
|
||||
Raises:
|
||||
httpx.HTTPStatusError: On HTTP errors (after retries)
|
||||
httpx.ConnectError: On connection errors (after retries)
|
||||
httpx.TimeoutException: On timeout (after retries)
|
||||
"""
|
||||
output_path = Path(output_path)
|
||||
|
||||
# Checkpoint pattern: skip if already downloaded
|
||||
if output_path.exists() and not force:
|
||||
logger.info(
|
||||
"hpa_subcellular_exists",
|
||||
path=str(output_path),
|
||||
size_mb=round(output_path.stat().st_size / 1024 / 1024, 2),
|
||||
)
|
||||
return output_path
|
||||
|
||||
# Create parent directory if needed
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
logger.info("hpa_download_start", url=url)
|
||||
|
||||
# Stream download to memory (HPA file is ~10MB compressed)
|
||||
with httpx.stream("GET", url, timeout=120.0, follow_redirects=True) as response:
|
||||
response.raise_for_status()
|
||||
|
||||
# Read zip content into memory
|
||||
zip_content = response.read()
|
||||
|
||||
logger.info("hpa_download_complete", size_mb=round(len(zip_content) / 1024 / 1024, 2))
|
||||
|
||||
# Extract TSV from zip
|
||||
logger.info("hpa_extract_start")
|
||||
with zipfile.ZipFile(io.BytesIO(zip_content)) as zf:
|
||||
# Find the TSV file (should be subcellular_location.tsv)
|
||||
tsv_files = [f for f in zf.namelist() if f.endswith(".tsv")]
|
||||
if not tsv_files:
|
||||
raise ValueError(f"No TSV file found in HPA zip: {zf.namelist()}")
|
||||
|
||||
tsv_filename = tsv_files[0]
|
||||
logger.info("hpa_extract_file", filename=tsv_filename)
|
||||
|
||||
# Extract to output path
|
||||
with zf.open(tsv_filename) as tsv_file:
|
||||
with open(output_path, "wb") as f:
|
||||
f.write(tsv_file.read())
|
||||
|
||||
logger.info(
|
||||
"hpa_extract_complete",
|
||||
path=str(output_path),
|
||||
size_mb=round(output_path.stat().st_size / 1024 / 1024, 2),
|
||||
)
|
||||
|
||||
return output_path
|
||||
|
||||
|
||||
def fetch_hpa_subcellular(
|
||||
gene_ids: list[str],
|
||||
gene_symbol_map: pl.DataFrame,
|
||||
cache_dir: Optional[Path] = None,
|
||||
force: bool = False,
|
||||
) -> pl.DataFrame:
|
||||
"""Fetch HPA subcellular localization data for genes.
|
||||
|
||||
Downloads HPA subcellular location data, parses it, and filters to the
|
||||
input gene list. Maps gene symbols to gene IDs using the provided mapping.
|
||||
|
||||
Args:
|
||||
gene_ids: List of Ensembl gene IDs to fetch
|
||||
gene_symbol_map: DataFrame with gene_id and gene_symbol columns
|
||||
cache_dir: Directory to cache HPA download (default: data/localization)
|
||||
force: If True, re-download HPA data
|
||||
|
||||
Returns:
|
||||
DataFrame with columns:
|
||||
- gene_id: Ensembl gene ID
|
||||
- gene_symbol: HGNC symbol
|
||||
- hpa_main_location: Semicolon-separated location string
|
||||
- hpa_reliability: Reliability level
|
||||
"""
|
||||
# Default cache location
|
||||
if cache_dir is None:
|
||||
cache_dir = Path("data/localization")
|
||||
|
||||
cache_dir = Path(cache_dir)
|
||||
cache_dir.mkdir(parents=True, exist_ok=True)
|
||||
tsv_path = cache_dir / "hpa_subcellular_location.tsv"
|
||||
|
||||
# Download HPA data
|
||||
logger.info("fetch_hpa_start", gene_count=len(gene_ids))
|
||||
tsv_path = download_hpa_subcellular(tsv_path, force=force)
|
||||
|
||||
# Parse TSV with polars
|
||||
logger.info("hpa_parse_start", path=str(tsv_path))
|
||||
df = pl.scan_csv(
|
||||
tsv_path,
|
||||
separator="\t",
|
||||
null_values=["", "NA"],
|
||||
has_header=True,
|
||||
)
|
||||
|
||||
# Extract relevant columns
|
||||
# HPA columns: Gene, Gene name, Reliability, Main location, Additional location, Extracellular location, ...
|
||||
df = df.select([
|
||||
pl.col("Gene name").alias("gene_symbol"), # HGNC symbol
|
||||
pl.col("Reliability").alias("hpa_reliability"),
|
||||
pl.col("Main location").alias("main_location"),
|
||||
pl.col("Additional location").alias("additional_location"),
|
||||
pl.col("Extracellular location").alias("extracellular_location"),
|
||||
])
|
||||
|
||||
# Combine all locations into one field (semicolon-separated)
|
||||
df = df.with_columns([
|
||||
pl.concat_str(
|
||||
[
|
||||
pl.col("main_location").fill_null(""),
|
||||
pl.col("additional_location").fill_null(""),
|
||||
pl.col("extracellular_location").fill_null(""),
|
||||
],
|
||||
separator=";",
|
||||
)
|
||||
.str.replace_all(";;+", ";") # Remove multiple semicolons
|
||||
.str.strip_chars(";") # Remove leading/trailing semicolons
|
||||
.alias("hpa_main_location")
|
||||
])
|
||||
|
||||
# Select final columns
|
||||
df = df.select([
|
||||
pl.col("gene_symbol"),
|
||||
pl.col("hpa_reliability"),
|
||||
pl.col("hpa_main_location"),
|
||||
])
|
||||
|
||||
# Collect to DataFrame
|
||||
df = df.collect()
|
||||
|
||||
logger.info("hpa_parse_complete", row_count=len(df))
|
||||
|
||||
# Map gene symbols to gene IDs
|
||||
logger.info("hpa_map_gene_ids")
|
||||
df = df.join(
|
||||
gene_symbol_map.select(["gene_id", "gene_symbol"]),
|
||||
on="gene_symbol",
|
||||
how="inner",
|
||||
)
|
||||
|
||||
# Filter to requested gene_ids
|
||||
gene_ids_set = set(gene_ids)
|
||||
df = df.filter(pl.col("gene_id").is_in(gene_ids_set))
|
||||
|
||||
logger.info("hpa_filter_complete", row_count=len(df))
|
||||
|
||||
return df
|
||||
|
||||
|
||||
def fetch_cilia_proteomics(
|
||||
gene_ids: list[str],
|
||||
gene_symbol_map: pl.DataFrame,
|
||||
) -> pl.DataFrame:
|
||||
"""Cross-reference genes against curated cilia/centrosome proteomics datasets.
|
||||
|
||||
Uses embedded gene sets from published studies (CiliaCarta, Centrosome-DB).
|
||||
Genes not found in datasets are marked as False (not NULL), since absence
|
||||
from proteomics is informative (not detected vs. not tested).
|
||||
|
||||
Args:
|
||||
gene_ids: List of Ensembl gene IDs to check
|
||||
gene_symbol_map: DataFrame with gene_id and gene_symbol columns
|
||||
|
||||
Returns:
|
||||
DataFrame with columns:
|
||||
- gene_id: Ensembl gene ID
|
||||
- gene_symbol: HGNC symbol
|
||||
- in_cilia_proteomics: bool (True if in CiliaCarta, False otherwise)
|
||||
- in_centrosome_proteomics: bool (True if in Centrosome-DB, False otherwise)
|
||||
"""
|
||||
logger.info(
|
||||
"fetch_proteomics_start",
|
||||
gene_count=len(gene_ids),
|
||||
cilia_ref_count=len(CILIA_PROTEOMICS_GENES),
|
||||
centrosome_ref_count=len(CENTROSOME_PROTEOMICS_GENES),
|
||||
)
|
||||
|
||||
# Filter gene symbol map to requested gene_ids
|
||||
gene_ids_set = set(gene_ids)
|
||||
df = gene_symbol_map.filter(pl.col("gene_id").is_in(gene_ids_set))
|
||||
|
||||
# Check membership in proteomics datasets
|
||||
df = df.with_columns([
|
||||
pl.col("gene_symbol").is_in(CILIA_PROTEOMICS_GENES).alias("in_cilia_proteomics"),
|
||||
pl.col("gene_symbol").is_in(CENTROSOME_PROTEOMICS_GENES).alias("in_centrosome_proteomics"),
|
||||
])
|
||||
|
||||
logger.info(
|
||||
"fetch_proteomics_complete",
|
||||
cilia_hits=df.filter(pl.col("in_cilia_proteomics")).height,
|
||||
centrosome_hits=df.filter(pl.col("in_centrosome_proteomics")).height,
|
||||
)
|
||||
|
||||
return df.select(["gene_id", "gene_symbol", "in_cilia_proteomics", "in_centrosome_proteomics"])
|
||||
119
src/usher_pipeline/evidence/localization/load.py
Normal file
119
src/usher_pipeline/evidence/localization/load.py
Normal file
@@ -0,0 +1,119 @@
|
||||
"""Load localization evidence to DuckDB with provenance tracking."""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import polars as pl
|
||||
import structlog
|
||||
|
||||
from usher_pipeline.persistence import PipelineStore, ProvenanceTracker
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
def load_to_duckdb(
|
||||
df: pl.DataFrame,
|
||||
store: PipelineStore,
|
||||
provenance: ProvenanceTracker,
|
||||
description: str = ""
|
||||
) -> None:
|
||||
"""Save localization evidence DataFrame to DuckDB with provenance.
|
||||
|
||||
Creates or replaces the subcellular_localization table (idempotent).
|
||||
Records provenance step with summary statistics.
|
||||
|
||||
Args:
|
||||
df: Processed localization DataFrame with evidence types and scores
|
||||
store: PipelineStore instance for DuckDB persistence
|
||||
provenance: ProvenanceTracker instance for metadata recording
|
||||
description: Optional description for checkpoint metadata
|
||||
"""
|
||||
logger.info("localization_load_start", row_count=len(df))
|
||||
|
||||
# Calculate summary statistics for provenance
|
||||
experimental_count = df.filter(pl.col("evidence_type") == "experimental").height
|
||||
computational_count = df.filter(pl.col("evidence_type") == "computational").height
|
||||
both_count = df.filter(pl.col("evidence_type") == "both").height
|
||||
none_count = df.filter(pl.col("evidence_type") == "none").height
|
||||
|
||||
cilia_compartment_count = df.filter(
|
||||
(pl.col("compartment_cilia") == True) | (pl.col("compartment_centrosome") == True)
|
||||
).height
|
||||
|
||||
mean_localization_score = df["localization_score_normalized"].mean()
|
||||
|
||||
# Count genes with high cilia proximity (> 0.5)
|
||||
high_proximity_count = df.filter(
|
||||
pl.col("cilia_proximity_score") > 0.5
|
||||
).height
|
||||
|
||||
# Save to DuckDB with CREATE OR REPLACE (idempotent)
|
||||
store.save_dataframe(
|
||||
df=df,
|
||||
table_name="subcellular_localization",
|
||||
description=description or "HPA subcellular localization with cilia/centrosome proteomics cross-references",
|
||||
replace=True
|
||||
)
|
||||
|
||||
# Record provenance step with details
|
||||
provenance.record_step("load_subcellular_localization", {
|
||||
"row_count": len(df),
|
||||
"experimental_count": experimental_count,
|
||||
"computational_count": computational_count,
|
||||
"both_count": both_count,
|
||||
"none_count": none_count,
|
||||
"cilia_compartment_count": cilia_compartment_count,
|
||||
"high_proximity_count": high_proximity_count,
|
||||
"mean_localization_score": float(mean_localization_score) if mean_localization_score is not None else None,
|
||||
})
|
||||
|
||||
logger.info(
|
||||
"localization_load_complete",
|
||||
row_count=len(df),
|
||||
experimental=experimental_count,
|
||||
computational=computational_count,
|
||||
both=both_count,
|
||||
none=none_count,
|
||||
cilia_compartment=cilia_compartment_count,
|
||||
high_proximity=high_proximity_count,
|
||||
mean_score=mean_localization_score,
|
||||
)
|
||||
|
||||
|
||||
def query_cilia_localized(
|
||||
store: PipelineStore,
|
||||
proximity_threshold: float = 0.5
|
||||
) -> pl.DataFrame:
|
||||
"""Query genes with high cilia proximity scores from DuckDB.
|
||||
|
||||
Demonstrates DuckDB query capability and provides helper for downstream
|
||||
analysis. Returns genes with strong localization evidence for cilia-related
|
||||
compartments.
|
||||
|
||||
Args:
|
||||
store: PipelineStore instance
|
||||
proximity_threshold: Minimum cilia_proximity_score (default: 0.5)
|
||||
|
||||
Returns:
|
||||
DataFrame with cilia-localized genes sorted by localization score
|
||||
Columns: gene_id, gene_symbol, evidence_type, compartment flags,
|
||||
cilia_proximity_score, localization_score_normalized
|
||||
"""
|
||||
logger.info("localization_query_cilia", proximity_threshold=proximity_threshold)
|
||||
|
||||
# Query DuckDB: genes with high cilia proximity
|
||||
df = store.execute_query(
|
||||
"""
|
||||
SELECT gene_id, gene_symbol, evidence_type,
|
||||
compartment_cilia, compartment_centrosome, compartment_basal_body,
|
||||
in_cilia_proteomics, in_centrosome_proteomics,
|
||||
cilia_proximity_score, localization_score_normalized
|
||||
FROM subcellular_localization
|
||||
WHERE cilia_proximity_score > ?
|
||||
ORDER BY localization_score_normalized DESC, cilia_proximity_score DESC
|
||||
""",
|
||||
params=[proximity_threshold]
|
||||
)
|
||||
|
||||
logger.info("localization_query_complete", result_count=len(df))
|
||||
|
||||
return df
|
||||
112
src/usher_pipeline/evidence/localization/models.py
Normal file
112
src/usher_pipeline/evidence/localization/models.py
Normal file
@@ -0,0 +1,112 @@
|
||||
"""Data models for subcellular localization evidence."""
|
||||
|
||||
from typing import Optional
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
# Table name for DuckDB storage
|
||||
LOCALIZATION_TABLE_NAME = "subcellular_localization"
|
||||
|
||||
# HPA subcellular data URL (bulk download)
|
||||
HPA_SUBCELLULAR_URL = "https://www.proteinatlas.org/download/subcellular_location.tsv.zip"
|
||||
|
||||
# Compartment definitions for scoring
|
||||
CILIA_COMPARTMENTS = [
|
||||
"Cilia",
|
||||
"Cilium",
|
||||
"Centrosome",
|
||||
"Centriole",
|
||||
"Basal body",
|
||||
"Microtubule organizing center",
|
||||
]
|
||||
|
||||
CILIA_ADJACENT_COMPARTMENTS = [
|
||||
"Cytoskeleton",
|
||||
"Microtubules",
|
||||
"Cell Junctions",
|
||||
"Focal adhesion sites",
|
||||
]
|
||||
|
||||
|
||||
class LocalizationRecord(BaseModel):
|
||||
"""Represents subcellular localization evidence for a gene.
|
||||
|
||||
Integrates HPA subcellular location data with curated cilia/centrosome
|
||||
proteomics datasets to generate a cilia-proximity localization score.
|
||||
|
||||
Distinguishes experimental evidence (Enhanced/Supported reliability,
|
||||
proteomics) from computational predictions (Approved/Uncertain reliability).
|
||||
"""
|
||||
|
||||
# Core identifiers
|
||||
gene_id: str = Field(description="Ensembl gene ID (ENSG...)")
|
||||
gene_symbol: str = Field(description="HGNC gene symbol")
|
||||
|
||||
# HPA subcellular location data (NULL if gene not in HPA)
|
||||
hpa_main_location: Optional[str] = Field(
|
||||
default=None,
|
||||
description="Semicolon-separated list of HPA subcellular locations"
|
||||
)
|
||||
hpa_reliability: Optional[str] = Field(
|
||||
default=None,
|
||||
description="HPA reliability level: Enhanced, Supported, Approved, Uncertain"
|
||||
)
|
||||
hpa_evidence_type: Optional[str] = Field(
|
||||
default=None,
|
||||
description="HPA evidence classification: experimental or predicted"
|
||||
)
|
||||
|
||||
# Proteomics dataset presence (False if not found, not NULL)
|
||||
in_cilia_proteomics: Optional[bool] = Field(
|
||||
default=None,
|
||||
description="Found in published cilium proteomics datasets (CiliaCarta, etc.)"
|
||||
)
|
||||
in_centrosome_proteomics: Optional[bool] = Field(
|
||||
default=None,
|
||||
description="Found in published centrosome proteomics datasets (Centrosome-DB, etc.)"
|
||||
)
|
||||
|
||||
# Compartment flags (parsed from HPA locations)
|
||||
compartment_cilia: Optional[bool] = Field(
|
||||
default=None,
|
||||
description="Localized to cilia/cilium"
|
||||
)
|
||||
compartment_centrosome: Optional[bool] = Field(
|
||||
default=None,
|
||||
description="Localized to centrosome"
|
||||
)
|
||||
compartment_basal_body: Optional[bool] = Field(
|
||||
default=None,
|
||||
description="Localized to basal body"
|
||||
)
|
||||
compartment_transition_zone: Optional[bool] = Field(
|
||||
default=None,
|
||||
description="Localized to transition zone"
|
||||
)
|
||||
compartment_stereocilia: Optional[bool] = Field(
|
||||
default=None,
|
||||
description="Localized to stereocilia"
|
||||
)
|
||||
|
||||
# Evidence classification
|
||||
evidence_type: str = Field(
|
||||
description="Evidence type: experimental, computational, both, or none"
|
||||
)
|
||||
|
||||
# Scoring
|
||||
cilia_proximity_score: Optional[float] = Field(
|
||||
default=None,
|
||||
ge=0.0,
|
||||
le=1.0,
|
||||
description="Proximity to cilia-related compartments (0-1, NULL if no data)"
|
||||
)
|
||||
localization_score_normalized: Optional[float] = Field(
|
||||
default=None,
|
||||
ge=0.0,
|
||||
le=1.0,
|
||||
description="Normalized localization score weighted by evidence type (0-1, NULL if no data)"
|
||||
)
|
||||
|
||||
class Config:
|
||||
"""Pydantic config."""
|
||||
validate_assignment = True
|
||||
315
src/usher_pipeline/evidence/localization/transform.py
Normal file
315
src/usher_pipeline/evidence/localization/transform.py
Normal file
@@ -0,0 +1,315 @@
|
||||
"""Transform localization data: classify evidence type and score cilia proximity."""
|
||||
|
||||
import polars as pl
|
||||
import structlog
|
||||
|
||||
from usher_pipeline.evidence.localization.models import (
|
||||
CILIA_COMPARTMENTS,
|
||||
CILIA_ADJACENT_COMPARTMENTS,
|
||||
)
|
||||
from usher_pipeline.evidence.localization.fetch import (
|
||||
fetch_hpa_subcellular,
|
||||
fetch_cilia_proteomics,
|
||||
)
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
def classify_evidence_type(df: pl.DataFrame) -> pl.DataFrame:
|
||||
"""Classify localization evidence as experimental vs computational.
|
||||
|
||||
HPA reliability levels:
|
||||
- Enhanced/Supported: experimental (antibody-based IHC with validation)
|
||||
- Approved/Uncertain: computational (predicted from RNA-seq or unvalidated)
|
||||
|
||||
Proteomics datasets are always experimental (MS-based detection).
|
||||
|
||||
Evidence type categories:
|
||||
- "experimental": high-confidence experimental evidence only
|
||||
- "computational": predicted/uncertain only
|
||||
- "both": both experimental and computational evidence
|
||||
- "none": no localization data available
|
||||
|
||||
Args:
|
||||
df: DataFrame with hpa_reliability, in_cilia_proteomics, in_centrosome_proteomics
|
||||
|
||||
Returns:
|
||||
DataFrame with added columns:
|
||||
- hpa_evidence_type: "experimental" or "predicted" (NULL if no HPA data)
|
||||
- evidence_type: "experimental", "computational", "both", "none"
|
||||
"""
|
||||
logger.info("classify_evidence_start", row_count=len(df))
|
||||
|
||||
# Classify HPA evidence type based on reliability
|
||||
df = df.with_columns([
|
||||
pl.when(pl.col("hpa_reliability").is_in(["Enhanced", "Supported"]))
|
||||
.then(pl.lit("experimental"))
|
||||
.when(pl.col("hpa_reliability").is_in(["Approved", "Uncertain"]))
|
||||
.then(pl.lit("predicted"))
|
||||
.otherwise(None)
|
||||
.alias("hpa_evidence_type")
|
||||
])
|
||||
|
||||
# Determine overall evidence type
|
||||
# Proteomics presence overrides computational HPA classification
|
||||
df = df.with_columns([
|
||||
pl.when(
|
||||
# Has proteomics evidence
|
||||
(pl.col("in_cilia_proteomics") == True) | (pl.col("in_centrosome_proteomics") == True)
|
||||
).then(
|
||||
# Proteomics is experimental
|
||||
pl.when(pl.col("hpa_evidence_type") == "experimental")
|
||||
.then(pl.lit("experimental")) # Both proteomics and HPA experimental
|
||||
.when(pl.col("hpa_evidence_type") == "predicted")
|
||||
.then(pl.lit("both")) # Proteomics experimental, HPA predicted
|
||||
.when(pl.col("hpa_evidence_type").is_null())
|
||||
.then(pl.lit("experimental")) # Only proteomics
|
||||
.otherwise(pl.lit("experimental"))
|
||||
).when(
|
||||
# No proteomics, but has HPA
|
||||
pl.col("hpa_evidence_type").is_not_null()
|
||||
).then(
|
||||
pl.col("hpa_evidence_type") # Use HPA classification
|
||||
).otherwise(
|
||||
# No proteomics, no HPA
|
||||
pl.lit("none")
|
||||
).alias("evidence_type")
|
||||
])
|
||||
|
||||
logger.info(
|
||||
"classify_evidence_complete",
|
||||
experimental=df.filter(pl.col("evidence_type") == "experimental").height,
|
||||
computational=df.filter(pl.col("evidence_type") == "computational").height,
|
||||
both=df.filter(pl.col("evidence_type") == "both").height,
|
||||
none=df.filter(pl.col("evidence_type") == "none").height,
|
||||
)
|
||||
|
||||
return df
|
||||
|
||||
|
||||
def score_localization(df: pl.DataFrame) -> pl.DataFrame:
|
||||
"""Score cilia proximity based on compartment localization.
|
||||
|
||||
Scoring logic:
|
||||
1. Parse HPA location string to identify compartments
|
||||
2. Set compartment boolean flags
|
||||
3. Calculate base cilia_proximity_score:
|
||||
- 1.0: Direct cilia compartment (Cilia, Centrosome, Basal body, etc.)
|
||||
- 0.5: Adjacent compartment (Cytoskeleton, Microtubules, etc.)
|
||||
- 0.3: In proteomics but no HPA cilia location
|
||||
- 0.0: No cilia-related evidence
|
||||
- NULL: No localization data at all
|
||||
4. Apply evidence weight:
|
||||
- experimental: 1.0x
|
||||
- computational: 0.6x
|
||||
- both: 1.0x (experimental evidence present)
|
||||
- none: NULL
|
||||
5. Calculate localization_score_normalized: cilia_proximity_score * evidence_weight
|
||||
|
||||
Args:
|
||||
df: DataFrame with hpa_main_location, in_cilia_proteomics,
|
||||
in_centrosome_proteomics, evidence_type
|
||||
|
||||
Returns:
|
||||
DataFrame with added columns:
|
||||
- compartment_cilia, compartment_centrosome, etc.: boolean flags
|
||||
- cilia_proximity_score: 0-1 score (NULL if no data)
|
||||
- localization_score_normalized: weighted score (NULL if no data)
|
||||
"""
|
||||
logger.info("score_localization_start", row_count=len(df))
|
||||
|
||||
# Parse compartment flags from HPA location string
|
||||
# Check if any CILIA_COMPARTMENTS substring appears in hpa_main_location
|
||||
df = df.with_columns([
|
||||
# Cilia/Cilium
|
||||
pl.when(
|
||||
pl.col("hpa_main_location").is_not_null()
|
||||
).then(
|
||||
pl.col("hpa_main_location").str.to_lowercase().str.contains("cili")
|
||||
).otherwise(None).alias("compartment_cilia"),
|
||||
|
||||
# Centrosome
|
||||
pl.when(
|
||||
pl.col("hpa_main_location").is_not_null()
|
||||
).then(
|
||||
pl.col("hpa_main_location").str.to_lowercase().str.contains("centrosome|centriole")
|
||||
).otherwise(None).alias("compartment_centrosome"),
|
||||
|
||||
# Basal body
|
||||
pl.when(
|
||||
pl.col("hpa_main_location").is_not_null()
|
||||
).then(
|
||||
pl.col("hpa_main_location").str.to_lowercase().str.contains("basal body")
|
||||
).otherwise(None).alias("compartment_basal_body"),
|
||||
|
||||
# Transition zone (rare in HPA, but check)
|
||||
pl.when(
|
||||
pl.col("hpa_main_location").is_not_null()
|
||||
).then(
|
||||
pl.col("hpa_main_location").str.to_lowercase().str.contains("transition zone")
|
||||
).otherwise(None).alias("compartment_transition_zone"),
|
||||
|
||||
# Stereocilia (hearing-related cilia)
|
||||
pl.when(
|
||||
pl.col("hpa_main_location").is_not_null()
|
||||
).then(
|
||||
pl.col("hpa_main_location").str.to_lowercase().str.contains("stereocili")
|
||||
).otherwise(None).alias("compartment_stereocilia"),
|
||||
])
|
||||
|
||||
# Check for adjacent compartments (cytoskeleton, microtubules)
|
||||
df = df.with_columns([
|
||||
pl.when(
|
||||
pl.col("hpa_main_location").is_not_null()
|
||||
).then(
|
||||
pl.col("hpa_main_location").str.to_lowercase().str.contains(
|
||||
"cytoskeleton|microtubule|cell junction|focal adhesion"
|
||||
)
|
||||
).otherwise(None).alias("has_adjacent_compartment")
|
||||
])
|
||||
|
||||
# Calculate base cilia proximity score
|
||||
df = df.with_columns([
|
||||
pl.when(
|
||||
# Direct cilia compartment
|
||||
(pl.col("compartment_cilia") == True)
|
||||
| (pl.col("compartment_centrosome") == True)
|
||||
| (pl.col("compartment_basal_body") == True)
|
||||
| (pl.col("compartment_transition_zone") == True)
|
||||
| (pl.col("compartment_stereocilia") == True)
|
||||
).then(
|
||||
pl.lit(1.0) # Direct match
|
||||
).when(
|
||||
# Adjacent compartment only
|
||||
pl.col("has_adjacent_compartment") == True
|
||||
).then(
|
||||
pl.lit(0.5) # Adjacent
|
||||
).when(
|
||||
# In proteomics but no HPA cilia location
|
||||
((pl.col("in_cilia_proteomics") == True) | (pl.col("in_centrosome_proteomics") == True))
|
||||
& (pl.col("hpa_main_location").is_null() | (pl.col("compartment_cilia").is_null()))
|
||||
).then(
|
||||
pl.lit(0.3) # Proteomics evidence without HPA confirmation
|
||||
).when(
|
||||
# Has HPA or proteomics data but no cilia evidence
|
||||
pl.col("hpa_main_location").is_not_null()
|
||||
| (pl.col("in_cilia_proteomics") == True)
|
||||
| (pl.col("in_centrosome_proteomics") == True)
|
||||
).then(
|
||||
pl.lit(0.0) # No cilia proximity
|
||||
).otherwise(
|
||||
None # No data at all
|
||||
).alias("cilia_proximity_score")
|
||||
])
|
||||
|
||||
# Apply evidence type weighting
|
||||
df = df.with_columns([
|
||||
pl.when(
|
||||
pl.col("evidence_type") == "none"
|
||||
).then(
|
||||
None # No evidence -> NULL score
|
||||
).when(
|
||||
pl.col("evidence_type") == "computational"
|
||||
).then(
|
||||
(pl.col("cilia_proximity_score") * 0.6).cast(pl.Float64) # Downweight computational
|
||||
).when(
|
||||
(pl.col("evidence_type") == "experimental") | (pl.col("evidence_type") == "both")
|
||||
).then(
|
||||
(pl.col("cilia_proximity_score") * 1.0).cast(pl.Float64) # Full weight for experimental
|
||||
).otherwise(
|
||||
pl.col("cilia_proximity_score")
|
||||
).alias("localization_score_normalized")
|
||||
])
|
||||
|
||||
# Drop temporary column
|
||||
df = df.drop("has_adjacent_compartment")
|
||||
|
||||
logger.info(
|
||||
"score_localization_complete",
|
||||
direct_cilia=df.filter((pl.col("compartment_cilia") == True) | (pl.col("compartment_centrosome") == True)).height,
|
||||
mean_proximity=df["cilia_proximity_score"].mean(),
|
||||
mean_normalized=df["localization_score_normalized"].mean(),
|
||||
)
|
||||
|
||||
return df
|
||||
|
||||
|
||||
def process_localization_evidence(
|
||||
gene_ids: list[str],
|
||||
gene_symbol_map: pl.DataFrame,
|
||||
cache_dir=None,
|
||||
force: bool = False,
|
||||
) -> pl.DataFrame:
|
||||
"""End-to-end localization evidence processing pipeline.
|
||||
|
||||
Fetches HPA subcellular data and proteomics cross-references,
|
||||
merges them, classifies evidence type, and scores cilia proximity.
|
||||
|
||||
Args:
|
||||
gene_ids: List of Ensembl gene IDs
|
||||
gene_symbol_map: DataFrame with gene_id and gene_symbol columns
|
||||
cache_dir: Directory to cache HPA download
|
||||
force: If True, re-download HPA data
|
||||
|
||||
Returns:
|
||||
DataFrame with all LocalizationRecord fields
|
||||
"""
|
||||
logger.info("process_localization_start", gene_count=len(gene_ids))
|
||||
|
||||
# Fetch HPA subcellular data
|
||||
hpa_df = fetch_hpa_subcellular(
|
||||
gene_ids=gene_ids,
|
||||
gene_symbol_map=gene_symbol_map,
|
||||
cache_dir=cache_dir,
|
||||
force=force,
|
||||
)
|
||||
|
||||
# Fetch proteomics cross-references
|
||||
proteomics_df = fetch_cilia_proteomics(
|
||||
gene_ids=gene_ids,
|
||||
gene_symbol_map=gene_symbol_map,
|
||||
)
|
||||
|
||||
# Merge HPA and proteomics data
|
||||
# Use full outer join to preserve genes in either dataset
|
||||
df = gene_symbol_map.filter(pl.col("gene_id").is_in(gene_ids))
|
||||
df = df.select(["gene_id", "gene_symbol"])
|
||||
|
||||
# Left join HPA data
|
||||
df = df.join(
|
||||
hpa_df.select(["gene_id", "hpa_main_location", "hpa_reliability"]),
|
||||
on="gene_id",
|
||||
how="left",
|
||||
)
|
||||
|
||||
# Left join proteomics data
|
||||
df = df.join(
|
||||
proteomics_df.select(["gene_id", "in_cilia_proteomics", "in_centrosome_proteomics"]),
|
||||
on="gene_id",
|
||||
how="left",
|
||||
)
|
||||
|
||||
# Fill NULL proteomics flags with False (absence is informative)
|
||||
df = df.with_columns([
|
||||
pl.col("in_cilia_proteomics").fill_null(False),
|
||||
pl.col("in_centrosome_proteomics").fill_null(False),
|
||||
])
|
||||
|
||||
logger.info(
|
||||
"process_merge_complete",
|
||||
row_count=len(df),
|
||||
has_hpa=df.filter(pl.col("hpa_main_location").is_not_null()).height,
|
||||
has_proteomics=df.filter(
|
||||
(pl.col("in_cilia_proteomics") == True) | (pl.col("in_centrosome_proteomics") == True)
|
||||
).height,
|
||||
)
|
||||
|
||||
# Classify evidence type
|
||||
df = classify_evidence_type(df)
|
||||
|
||||
# Score localization
|
||||
df = score_localization(df)
|
||||
|
||||
logger.info("process_localization_complete", row_count=len(df))
|
||||
|
||||
return df
|
||||
Reference in New Issue
Block a user