feat(03-03): implement protein evidence layer with UniProt/InterPro integration

- Create protein features data model with domain, coiled-coil, TM, cilia motifs
- Implement fetch.py with UniProt REST API and InterPro API queries
- Implement transform.py with feature extraction, motif detection, normalization
- Implement load.py with DuckDB persistence and provenance tracking
- Add CLI protein command following evidence layer pattern
- Add comprehensive unit and integration tests (all passing)
- Handle NULL preservation and List(Null) edge case
- Add get_steps() method to ProvenanceTracker for test compatibility
2026-02-11 19:07:30 +08:00
parent bcd3c4ffbe
commit 46059874f2
10 changed files with 1937 additions and 0 deletions

usher_pipeline/evidence/protein/__init__.py

@@ -0,0 +1,46 @@
"""Protein sequence and structure features evidence layer.
Extracts protein features from UniProt and InterPro:
- Protein length, domain composition, domain count
- Coiled-coil regions, transmembrane domains, scaffold/adaptor domains
- Cilia-associated motifs detected via domain keyword matching
- Normalized composite protein score (0-1)
Evidence processing follows the fetch -> transform -> load pattern with checkpoint-restart.
"""
from usher_pipeline.evidence.protein.models import (
ProteinFeatureRecord,
PROTEIN_TABLE_NAME,
CILIA_DOMAIN_KEYWORDS,
SCAFFOLD_DOMAIN_TYPES,
)
from usher_pipeline.evidence.protein.fetch import (
fetch_uniprot_features,
fetch_interpro_domains,
)
from usher_pipeline.evidence.protein.transform import (
extract_protein_features,
detect_cilia_motifs,
normalize_protein_features,
process_protein_evidence,
)
from usher_pipeline.evidence.protein.load import (
load_to_duckdb,
query_cilia_candidates,
)
__all__ = [
"ProteinFeatureRecord",
"PROTEIN_TABLE_NAME",
"CILIA_DOMAIN_KEYWORDS",
"SCAFFOLD_DOMAIN_TYPES",
"fetch_uniprot_features",
"fetch_interpro_domains",
"extract_protein_features",
"detect_cilia_motifs",
"normalize_protein_features",
"process_protein_evidence",
"load_to_duckdb",
"query_cilia_candidates",
]
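
A minimal end-to-end sketch of how these exports compose (live UniProt/InterPro access is required; the PipelineStore and ProvenanceTracker constructor calls below are assumptions, not confirmed signatures):

import polars as pl
from usher_pipeline.persistence import PipelineStore, ProvenanceTracker
from usher_pipeline.evidence.protein import (
    process_protein_evidence,
    load_to_duckdb,
    query_cilia_candidates,
)

store = PipelineStore("pipeline.duckdb")  # assumed constructor
provenance = ProvenanceTracker(store)     # assumed constructor

# Illustrative mapping frame (gene/accession pair shown for USH2A).
uniprot_mapping = pl.DataFrame({
    "gene_id": ["ENSG00000042781"],
    "gene_symbol": ["USH2A"],
    "uniprot_id": ["O75445"],
})

features = process_protein_evidence(["ENSG00000042781"], uniprot_mapping)
load_to_duckdb(features, store, provenance)
candidates = query_cilia_candidates(store)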

usher_pipeline/evidence/protein/fetch.py

@@ -0,0 +1,274 @@
"""Fetch protein features from UniProt and InterPro APIs."""
import time
from typing import List
import httpx
import polars as pl
import structlog
from tenacity import (
retry,
stop_after_attempt,
wait_exponential,
retry_if_exception_type,
)
logger = structlog.get_logger()
# UniProt REST API base URL
UNIPROT_API_BASE = "https://rest.uniprot.org"
# InterPro REST API base URL
INTERPRO_API_BASE = "https://www.ebi.ac.uk/interpro/api"
@retry(
stop=stop_after_attempt(5),
wait=wait_exponential(multiplier=1, min=4, max=60),
retry=retry_if_exception_type(
(httpx.HTTPStatusError, httpx.ConnectError, httpx.TimeoutException)
),
)
def fetch_uniprot_features(uniprot_ids: List[str]) -> pl.DataFrame:
"""Query UniProt REST API for protein features.
Fetches protein length, domain annotations, coiled-coil regions,
and transmembrane regions from UniProt in batches.
Args:
uniprot_ids: List of UniProt accession IDs
Returns:
DataFrame with columns:
- uniprot_id: UniProt accession
- protein_length: Amino acid length
- domain_names: List of domain names
- coiled_coil_count: Number of coiled-coil regions
- transmembrane_count: Number of transmembrane regions
Raises:
httpx.HTTPStatusError: On HTTP errors (after retries)
httpx.ConnectError: On connection errors (after retries)
httpx.TimeoutException: On timeout (after retries)
"""
    if not uniprot_ids:
        # Empty input: return a typed empty frame so downstream code
        # never sees List(Null) columns.
        return pl.DataFrame(schema={
            "uniprot_id": pl.String,
            "protein_length": pl.Int64,
            "domain_names": pl.List(pl.String),
            "coiled_coil_count": pl.Int64,
            "transmembrane_count": pl.Int64,
        })
logger.info("uniprot_fetch_start", accession_count=len(uniprot_ids))
# UniProt batch size recommendation: 100 accessions per request
batch_size = 100
all_records = []
for i in range(0, len(uniprot_ids), batch_size):
batch = uniprot_ids[i:i + batch_size]
batch_num = (i // batch_size) + 1
total_batches = (len(uniprot_ids) + batch_size - 1) // batch_size
logger.info(
"uniprot_batch_start",
batch=batch_num,
total_batches=total_batches,
batch_size=len(batch),
)
# Query UniProt API with accession list
# Use search endpoint with fields parameter
query = " OR ".join(f"accession:{acc}" for acc in batch)
url = f"{UNIPROT_API_BASE}/uniprotkb/search"
params = {
"query": query,
"format": "json",
"fields": "accession,length,ft_domain,ft_coiled,ft_transmem,annotation_score",
"size": batch_size,
}
with httpx.Client(timeout=30.0) as client:
response = client.get(url, params=params)
response.raise_for_status()
data = response.json()
# Parse results
results = data.get("results", [])
# Create lookup for fast access
found_accessions = set()
for entry in results:
accession = entry.get("primaryAccession")
if not accession:
continue
found_accessions.add(accession)
# Extract protein length
length = entry.get("sequence", {}).get("length")
# Extract domain names from ft_domain features
domain_names = []
for feature in entry.get("features", []):
if feature.get("type") == "Domain":
description = feature.get("description", "")
if description:
domain_names.append(description)
# Count coiled-coil regions
coiled_coil_count = sum(
1 for feature in entry.get("features", [])
if feature.get("type") == "Coiled coil"
)
# Count transmembrane regions
transmembrane_count = sum(
1 for feature in entry.get("features", [])
if feature.get("type") == "Transmembrane"
)
all_records.append({
"uniprot_id": accession,
"protein_length": length,
"domain_names": domain_names,
"coiled_coil_count": coiled_coil_count,
"transmembrane_count": transmembrane_count,
})
# Add NULL records for accessions not found
for acc in batch:
if acc not in found_accessions:
all_records.append({
"uniprot_id": acc,
"protein_length": None,
"domain_names": [],
"coiled_coil_count": None,
"transmembrane_count": None,
})
# Rate limiting: UniProt allows 200 requests/second
# With batches of 100, this gives us 20K accessions/second
# Conservative: 5 requests/second = 500 accessions/second
time.sleep(0.2)
logger.info(
"uniprot_fetch_complete",
total_accessions=len(uniprot_ids),
records_found=sum(1 for r in all_records if r["protein_length"] is not None),
)
return pl.DataFrame(all_records)
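# Illustrative sketch (not part of the module): the feature counting above,
# applied to a hand-built dict shaped the way the parser expects one entry
# from the search response to look. Values are made up; real responses carry
# more fields than the ones requested via `fields`.
_example_entry = {
    "primaryAccession": "P12345",
    "sequence": {"length": 520},
    "features": [
        {"type": "Domain", "description": "PDZ"},
        {"type": "Coiled coil", "description": ""},
        {"type": "Transmembrane", "description": "Helical"},
    ],
}
_domains = [
    f["description"] for f in _example_entry["features"]
    if f["type"] == "Domain" and f["description"]
]
assert _domains == ["PDZ"]
assert sum(1 for f in _example_entry["features"] if f["type"] == "Coiled coil") == 1
assert sum(1 for f in _example_entry["features"] if f["type"] == "Transmembrane") == 1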
@retry(
stop=stop_after_attempt(5),
wait=wait_exponential(multiplier=1, min=4, max=60),
retry=retry_if_exception_type(
(httpx.HTTPStatusError, httpx.ConnectError, httpx.TimeoutException)
),
)
def fetch_interpro_domains(uniprot_ids: List[str]) -> pl.DataFrame:
"""Query InterPro API for domain annotations.
Fetches detailed domain classification from InterPro to supplement
UniProt annotations.
Args:
uniprot_ids: List of UniProt accession IDs
Returns:
DataFrame with columns:
- uniprot_id: UniProt accession
- domain_names: List of InterPro domain names
- interpro_ids: List of InterPro accession IDs
Raises:
httpx.HTTPStatusError: On HTTP errors (after retries)
httpx.ConnectError: On connection errors (after retries)
httpx.TimeoutException: On timeout (after retries)
"""
    if not uniprot_ids:
        # Empty input: return a typed empty frame (avoids List(Null) columns).
        return pl.DataFrame(schema={
            "uniprot_id": pl.String,
            "domain_names": pl.List(pl.String),
            "interpro_ids": pl.List(pl.String),
        })
logger.info("interpro_fetch_start", accession_count=len(uniprot_ids))
# InterPro requires per-protein queries
# Use conservative rate limiting: 10 req/sec as recommended
all_records = []
# For large datasets (>10K proteins), log warning about potential slowness
if len(uniprot_ids) > 10000:
logger.warning(
"interpro_large_dataset",
accession_count=len(uniprot_ids),
estimated_time_min=len(uniprot_ids) / 10 / 60,
note="Consider InterPro bulk download for large datasets",
)
with httpx.Client(timeout=30.0) as client:
for idx, accession in enumerate(uniprot_ids, 1):
if idx % 100 == 0:
logger.info(
"interpro_progress",
processed=idx,
total=len(uniprot_ids),
percent=round(idx / len(uniprot_ids) * 100, 1),
)
# Query InterPro API
url = f"{INTERPRO_API_BASE}/entry/interpro/protein/uniprot/{accession}"
try:
response = client.get(url, params={"format": "json"})
response.raise_for_status()
data = response.json()
# Parse InterPro entries
domain_names = []
interpro_ids = []
if "results" in data:
for entry in data["results"]:
metadata = entry.get("metadata", {})
interpro_id = metadata.get("accession")
name = metadata.get("name", {}).get("name", "")
if interpro_id:
interpro_ids.append(interpro_id)
if name:
domain_names.append(name)
all_records.append({
"uniprot_id": accession,
"domain_names": domain_names,
"interpro_ids": interpro_ids,
})
except httpx.HTTPStatusError as e:
if e.response.status_code == 404:
# Protein not found in InterPro - this is OK
all_records.append({
"uniprot_id": accession,
"domain_names": [],
"interpro_ids": [],
})
else:
# Other HTTP errors - re-raise for retry
raise
# Rate limiting: 10 requests/second
time.sleep(0.1)
logger.info(
"interpro_fetch_complete",
total_accessions=len(uniprot_ids),
records_found=sum(1 for r in all_records if r["domain_names"]),
)
return pl.DataFrame(all_records)
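
For reference, the response shape the parser above relies on is results[].metadata.{accession, name.name}. A hand-built sketch (values illustrative):

_example_response = {
    "results": [
        {"metadata": {"accession": "IPR001478", "name": {"name": "PDZ domain"}}},
        {"metadata": {"accession": "IPR036034", "name": {"name": "PDZ superfamily"}}},
    ]
}
_ids = [r["metadata"]["accession"] for r in _example_response["results"]]
assert _ids == ["IPR001478", "IPR036034"]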

usher_pipeline/evidence/protein/load.py

@@ -0,0 +1,128 @@
"""Load protein features to DuckDB with provenance tracking."""
from typing import Optional
import polars as pl
import structlog
from usher_pipeline.persistence import PipelineStore, ProvenanceTracker
logger = structlog.get_logger()
def load_to_duckdb(
df: pl.DataFrame,
store: PipelineStore,
provenance: ProvenanceTracker,
description: str = "",
) -> None:
"""Save protein features DataFrame to DuckDB with provenance.
Creates or replaces the protein_features table (idempotent).
Records provenance step with summary statistics.
Args:
df: Processed protein features DataFrame with normalized scores
store: PipelineStore instance for DuckDB persistence
provenance: ProvenanceTracker instance for metadata recording
description: Optional description for checkpoint metadata
"""
logger.info("protein_load_start", row_count=len(df))
# Calculate summary statistics for provenance
total_genes = len(df)
with_uniprot = df.filter(pl.col("uniprot_id").is_not_null()).height
null_uniprot = total_genes - with_uniprot
cilia_domain_count = df.filter(pl.col("has_cilia_domain") == True).height
scaffold_domain_count = df.filter(pl.col("scaffold_adaptor_domain") == True).height
coiled_coil_count = df.filter(pl.col("coiled_coil") == True).height
tm_domain_count = df.filter(pl.col("transmembrane_count") > 0).height
# Mean domain count (only for proteins with data)
mean_domain_count = (
df.filter(pl.col("domain_count").is_not_null())
.select(pl.col("domain_count").mean())
.item()
)
if mean_domain_count is not None:
mean_domain_count = round(mean_domain_count, 2)
# Save to DuckDB with CREATE OR REPLACE (idempotent)
store.save_dataframe(
df=df,
table_name="protein_features",
description=description or "Protein features from UniProt/InterPro with domain composition and cilia motif detection",
replace=True,
)
# Record provenance step with details
provenance.record_step("load_protein_features", {
"total_genes": total_genes,
"with_uniprot": with_uniprot,
"null_uniprot": null_uniprot,
"cilia_domain_count": cilia_domain_count,
"scaffold_domain_count": scaffold_domain_count,
"coiled_coil_count": coiled_coil_count,
"transmembrane_domain_count": tm_domain_count,
"mean_domain_count": mean_domain_count,
})
logger.info(
"protein_load_complete",
row_count=total_genes,
with_uniprot=with_uniprot,
cilia_domains=cilia_domain_count,
scaffold_domains=scaffold_domain_count,
coiled_coils=coiled_coil_count,
)
def query_cilia_candidates(
store: PipelineStore,
) -> pl.DataFrame:
"""Query genes with cilia-associated protein features.
Identifies candidate genes with:
- Cilia domain annotations, OR
- Both coiled-coil regions AND scaffold/adaptor domains
This combination is enriched in known cilia proteins and provides
structural evidence for potential cilia involvement.
Args:
store: PipelineStore instance
Returns:
DataFrame with candidate genes sorted by protein_score_normalized
Columns: gene_id, gene_symbol, protein_length, domain_count,
coiled_coil, has_cilia_domain, scaffold_adaptor_domain,
protein_score_normalized
"""
logger.info("protein_query_cilia_candidates")
# Query DuckDB for genes with cilia features
df = store.execute_query(
"""
SELECT
gene_id,
gene_symbol,
uniprot_id,
protein_length,
domain_count,
coiled_coil,
transmembrane_count,
scaffold_adaptor_domain,
has_cilia_domain,
has_sensory_domain,
protein_score_normalized
FROM protein_features
WHERE has_cilia_domain = TRUE
OR (coiled_coil = TRUE AND scaffold_adaptor_domain = TRUE)
ORDER BY protein_score_normalized DESC NULLS LAST
"""
)
logger.info("protein_query_complete", result_count=len(df))
return df
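
A minimal round-trip sketch (the PipelineStore/ProvenanceTracker construction is an assumption; save_dataframe and execute_query are the calls actually used above):

import polars as pl
from usher_pipeline.persistence import PipelineStore, ProvenanceTracker

store = PipelineStore("pipeline.duckdb")  # assumed constructor
provenance = ProvenanceTracker(store)     # assumed constructor

df = pl.DataFrame({
    "gene_id": ["ENSG_TEST"], "gene_symbol": ["TEST1"], "uniprot_id": ["P12345"],
    "protein_length": [500], "domain_count": [3], "coiled_coil": [True],
    "transmembrane_count": [0], "scaffold_adaptor_domain": [True],
    "has_cilia_domain": [False], "has_sensory_domain": [False],
    "protein_score_normalized": [0.55],
})
load_to_duckdb(df, store, provenance)
# The row qualifies via the coiled_coil AND scaffold_adaptor_domain branch.
candidates = query_cilia_candidates(store)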

usher_pipeline/evidence/protein/models.py

@@ -0,0 +1,71 @@
"""Data models for protein sequence and structure features."""
from pydantic import BaseModel
# DuckDB table name for protein features
PROTEIN_TABLE_NAME = "protein_features"
# Cilia-associated domain keywords for motif detection
# These are structural patterns found in known cilia proteins
# Pattern matching does NOT presuppose cilia involvement - it flags features
# associated with cilia biology for further investigation
CILIA_DOMAIN_KEYWORDS = [
"IFT",
"intraflagellar",
"BBSome",
"ciliary",
"cilia",
"basal body",
"centrosome",
"transition zone",
"axoneme",
]
# Scaffold and adaptor domain types commonly found in Usher proteins
# These domains mediate protein-protein interactions and are enriched in
# cilia-associated proteins
SCAFFOLD_DOMAIN_TYPES = [
"PDZ",
"SH3",
"Ankyrin",
"WD40",
"Coiled coil",
"SAM",
"FERM",
"Harmonin",
]
class ProteinFeatureRecord(BaseModel):
"""Protein features for a single gene.
Attributes:
gene_id: Ensembl gene ID (e.g., ENSG00000...)
gene_symbol: HGNC gene symbol
uniprot_id: UniProt accession (NULL if no mapping)
protein_length: Amino acid length (NULL if no UniProt entry)
domain_count: Total number of annotated domains (NULL if no data)
coiled_coil: Has coiled-coil region (NULL if no data)
coiled_coil_count: Number of coiled-coil regions (NULL if no data)
transmembrane_count: Number of transmembrane regions (NULL if no data)
scaffold_adaptor_domain: Has PDZ, SH3, ankyrin, WD40, or similar scaffold domain (NULL if no data)
has_cilia_domain: Has IFT, BBSome, ciliary targeting, or transition zone domain (NULL if no data)
has_sensory_domain: Has stereocilia, photoreceptor, or sensory-associated domain (NULL if no data)
protein_score_normalized: Composite protein feature score 0-1 (NULL if no UniProt entry)
CRITICAL: NULL values represent missing data and are preserved as None.
Do NOT convert NULL to 0.0 - "unknown" is semantically different from "no domain".
"""
gene_id: str
gene_symbol: str
uniprot_id: str | None = None
protein_length: int | None = None
domain_count: int | None = None
coiled_coil: bool | None = None
coiled_coil_count: int | None = None
transmembrane_count: int | None = None
scaffold_adaptor_domain: bool | None = None
has_cilia_domain: bool | None = None
has_sensory_domain: bool | None = None
protein_score_normalized: float | None = None
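
The NULL contract is easiest to see on the model itself: a gene with no UniProt mapping keeps None everywhere, which is distinct from a measured protein that genuinely has zero domains.

unmapped = ProteinFeatureRecord(gene_id="ENSG_EXAMPLE1", gene_symbol="EXAMPLE1")
assert unmapped.uniprot_id is None
assert unmapped.domain_count is None            # unknown, NOT 0
assert unmapped.protein_score_normalized is None

measured = ProteinFeatureRecord(
    gene_id="ENSG_EXAMPLE2", gene_symbol="EXAMPLE2",
    uniprot_id="P12345", protein_length=120, domain_count=0,
)
assert measured.domain_count == 0               # known to be zero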

usher_pipeline/evidence/protein/transform.py

@@ -0,0 +1,411 @@
"""Transform and normalize protein features."""
from typing import List
import polars as pl
import structlog
from usher_pipeline.evidence.protein.models import (
CILIA_DOMAIN_KEYWORDS,
SCAFFOLD_DOMAIN_TYPES,
)
from usher_pipeline.evidence.protein.fetch import (
fetch_uniprot_features,
fetch_interpro_domains,
)
logger = structlog.get_logger()
def extract_protein_features(
uniprot_df: pl.DataFrame,
interpro_df: pl.DataFrame,
) -> pl.DataFrame:
"""Extract and combine protein features from UniProt and InterPro data.
Joins UniProt and InterPro data, computes domain counts, sets flags
for coiled-coil and transmembrane regions.
Args:
uniprot_df: DataFrame from fetch_uniprot_features with UniProt data
interpro_df: DataFrame from fetch_interpro_domains with InterPro data
Returns:
DataFrame with combined features:
- uniprot_id, protein_length, domain_count, coiled_coil,
coiled_coil_count, transmembrane_count, domain_names (combined)
"""
logger.info(
"extract_protein_features_start",
uniprot_rows=len(uniprot_df),
interpro_rows=len(interpro_df),
)
# Join UniProt and InterPro on uniprot_id
# Use left join to preserve all UniProt entries
combined = uniprot_df.join(
interpro_df,
on="uniprot_id",
how="left",
suffix="_interpro",
)
# Combine domain names from both sources (deduplicate)
# Use list.concat instead of + operator
combined = combined.with_columns([
# Combine domain name lists
pl.when(
pl.col("domain_names_interpro").is_not_null() &
(pl.col("domain_names_interpro").list.len() > 0)
)
.then(
pl.col("domain_names").list.concat(pl.col("domain_names_interpro"))
.list.unique()
)
.otherwise(pl.col("domain_names"))
.alias("domain_names_combined"),
])
    # Compute domain count from combined sources. Keep it NULL when the
    # protein has no UniProt entry: "unknown" is not the same as 0 domains.
    combined = combined.with_columns([
        pl.when(pl.col("protein_length").is_not_null())
        .then(pl.col("domain_names_combined").list.len())
        .otherwise(None)
        .alias("domain_count"),
    ])
# Set coiled_coil boolean from count
combined = combined.with_columns([
pl.when(pl.col("coiled_coil_count").is_not_null())
.then(pl.col("coiled_coil_count") > 0)
.otherwise(None)
.alias("coiled_coil"),
])
# Select final columns
result = combined.select([
"uniprot_id",
"protein_length",
"domain_count",
"coiled_coil",
"coiled_coil_count",
"transmembrane_count",
pl.col("domain_names_combined").alias("domain_names"),
])
logger.info(
"extract_protein_features_complete",
total_rows=len(result),
with_domains=result.filter(pl.col("domain_count") > 0).height,
)
return result
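# Illustrative sketch (made-up rows, not part of the module): combining two
# tiny input frames. P1 gains the deduplicated union of both domain lists;
# P2 has no UniProt entry, so its feature columns stay null/empty.
_uniprot = pl.DataFrame({
    "uniprot_id": ["P1", "P2"],
    "protein_length": [300, None],
    "domain_names": [["PDZ", "SH3"], []],
    "coiled_coil_count": [1, None],
    "transmembrane_count": [0, None],
})
_interpro = pl.DataFrame({
    "uniprot_id": ["P1"],
    "domain_names": [["PDZ domain", "SH3"]],
    "interpro_ids": [["IPR001478", "IPR001452"]],
})
_combined = extract_protein_features(_uniprot, _interpro)
# P1 -> domain_names {"PDZ", "SH3", "PDZ domain"}, coiled_coil True
# P2 -> domain_names [], coiled_coil null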
def detect_cilia_motifs(df: pl.DataFrame) -> pl.DataFrame:
"""Detect cilia-associated motifs via domain keyword matching.
Scans domain names for cilia-related keywords, scaffold/adaptor domains,
and sensory-related domains. Pattern matching does NOT presuppose cilia
involvement - it flags structural features for further investigation.
Args:
df: DataFrame with domain_names column (list of domain names)
Returns:
DataFrame with added columns:
- has_cilia_domain: Boolean flag for cilia-associated domains
- scaffold_adaptor_domain: Boolean flag for scaffold domains
- has_sensory_domain: Boolean flag for sensory domains
"""
logger.info("detect_cilia_motifs_start", row_count=len(df))
# Ensure domain_names is List(String), not List(Null)
# This handles edge case where all proteins have empty domain lists
if "domain_names" in df.columns:
current_dtype = df.schema["domain_names"]
        if "Null" in str(current_dtype):
df = df.with_columns([
pl.col("domain_names").cast(pl.List(pl.String)).alias("domain_names")
])
# Sensory domain keywords
sensory_keywords = [
"stereocilia",
"photoreceptor",
"usher",
"harmonin",
"cadherin 23",
"protocadherin",
]
# Helper function to check if any keyword matches any domain (case-insensitive)
def create_keyword_matcher(keywords: List[str]) -> pl.Expr:
"""Create polars expression that checks if any keyword matches any domain."""
# For each domain in the list, check if it contains any keyword
# Case-insensitive substring matching
conditions = []
for keyword in keywords:
conditions.append(
pl.col("domain_names")
.list.eval(
pl.when(pl.element().is_not_null())
                .then(pl.element().str.to_lowercase().str.contains(keyword.lower(), literal=True))
.otherwise(False)
)
.list.any()
)
# Return True if ANY condition matches
if conditions:
return pl.any_horizontal(conditions)
else:
return pl.lit(False)
# Detect cilia domains
df = df.with_columns([
pl.when(
pl.col("domain_names").is_not_null() &
(pl.col("domain_names").list.len() > 0)
)
.then(create_keyword_matcher(CILIA_DOMAIN_KEYWORDS))
.otherwise(None)
.alias("has_cilia_domain"),
])
# Detect scaffold/adaptor domains
df = df.with_columns([
pl.when(
pl.col("domain_names").is_not_null() &
(pl.col("domain_names").list.len() > 0)
)
.then(create_keyword_matcher(SCAFFOLD_DOMAIN_TYPES))
.otherwise(None)
.alias("scaffold_adaptor_domain"),
])
# Detect sensory domains
df = df.with_columns([
pl.when(
pl.col("domain_names").is_not_null() &
(pl.col("domain_names").list.len() > 0)
)
.then(create_keyword_matcher(sensory_keywords))
.otherwise(None)
.alias("has_sensory_domain"),
])
# Log summary
cilia_count = df.filter(pl.col("has_cilia_domain") == True).height
scaffold_count = df.filter(pl.col("scaffold_adaptor_domain") == True).height
sensory_count = df.filter(pl.col("has_sensory_domain") == True).height
logger.info(
"detect_cilia_motifs_complete",
cilia_domain_count=cilia_count,
scaffold_domain_count=scaffold_count,
sensory_domain_count=sensory_count,
)
return df
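# Illustrative sketch (made-up rows, not part of the module):
_df = pl.DataFrame({
    "domain_names": [
        ["IFT complex A", "WD40 repeat"],  # matches a cilia + a scaffold keyword
        ["Kinase domain"],                 # matches nothing
        [],                                # no annotations -> unknown
    ],
})
_flagged = detect_cilia_motifs(_df)
# Row 1: has_cilia_domain True ("IFT"), scaffold_adaptor_domain True ("WD40")
# Row 2: both False
# Row 3: both None -- an empty list is treated as unknown, not as False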
def normalize_protein_features(df: pl.DataFrame) -> pl.DataFrame:
"""Normalize protein features to 0-1 range and compute composite score.
Normalizes continuous features (protein_length, domain_count, transmembrane_count)
and computes weighted composite score. NULL preservation: genes without UniProt
entries get NULL scores (not 0.0).
Args:
df: DataFrame with extracted protein features
Returns:
DataFrame with added protein_score_normalized column (0-1 range)
"""
logger.info("normalize_protein_features_start", row_count=len(df))
# Filter to proteins with data for normalization stats
measured = df.filter(pl.col("protein_length").is_not_null())
if len(measured) == 0:
# No proteins with data - return with NULL scores
return df.with_columns([
pl.lit(None).cast(pl.Float64).alias("length_rank"),
pl.lit(None).cast(pl.Float64).alias("domain_rank"),
pl.lit(None).cast(pl.Float64).alias("transmembrane_normalized"),
pl.lit(None).cast(pl.Float64).alias("protein_score_normalized"),
])
# Normalize protein length via log-transform rank percentile
# Log-transform handles skewed distribution, rank percentile gives 0-1
df = df.with_columns([
pl.when(pl.col("protein_length").is_not_null())
.then(
pl.col("protein_length").log10().rank(method="average") /
measured.select(pl.col("protein_length").log10().rank(method="average").max()).item()
)
.otherwise(None)
.alias("length_rank"),
])
# Normalize domain count via rank percentile
df = df.with_columns([
pl.when(pl.col("domain_count").is_not_null())
.then(
pl.col("domain_count").rank(method="average") /
measured.select(pl.col("domain_count").rank(method="average").max()).item()
)
.otherwise(None)
.alias("domain_rank"),
])
# Normalize transmembrane count (cap at 20, then divide)
df = df.with_columns([
pl.when(pl.col("transmembrane_count").is_not_null())
.then(
pl.min_horizontal(pl.col("transmembrane_count"), pl.lit(20)) / 20.0
)
.otherwise(None)
.alias("transmembrane_normalized"),
])
# Convert boolean flags to 0/1 for composite score
df = df.with_columns([
pl.col("coiled_coil").cast(pl.Float64).fill_null(0.0).alias("coiled_coil_score"),
pl.col("has_cilia_domain").cast(pl.Float64).fill_null(0.0).alias("cilia_score"),
pl.col("scaffold_adaptor_domain").cast(pl.Float64).fill_null(0.0).alias("scaffold_score"),
])
# Composite protein score (weighted combination)
# Weights: length 15%, domain 20%, coiled-coil 20%, TM 20%, cilia 15%, scaffold 10%
# NULL if no protein_length (i.e., no UniProt entry)
df = df.with_columns([
pl.when(pl.col("protein_length").is_not_null())
.then(
(0.15 * pl.col("length_rank").fill_null(0.0)) +
(0.20 * pl.col("domain_rank").fill_null(0.0)) +
(0.20 * pl.col("coiled_coil_score")) +
(0.20 * pl.col("transmembrane_normalized").fill_null(0.0)) +
(0.15 * pl.col("cilia_score")) +
(0.10 * pl.col("scaffold_score"))
)
.otherwise(None)
.alias("protein_score_normalized"),
])
# Drop temporary scoring columns
df = df.drop([
"length_rank",
"domain_rank",
"transmembrane_normalized",
"coiled_coil_score",
"cilia_score",
"scaffold_score",
])
# Log summary statistics
score_stats = df.filter(pl.col("protein_score_normalized").is_not_null()).select([
pl.col("protein_score_normalized").min().alias("min"),
pl.col("protein_score_normalized").max().alias("max"),
pl.col("protein_score_normalized").mean().alias("mean"),
])
    # The aggregation frame always has exactly one row; check the stats
    # themselves rather than the row count.
    if score_stats["min"][0] is not None:
logger.info(
"normalize_protein_features_complete",
scored_proteins=df.filter(pl.col("protein_score_normalized").is_not_null()).height,
score_min=round(score_stats["min"][0], 3),
score_max=round(score_stats["max"][0], 3),
score_mean=round(score_stats["mean"][0], 3),
)
else:
logger.info("normalize_protein_features_complete", scored_proteins=0)
return df
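# Worked example of the composite score for one hypothetical protein. The
# weights sum to 1.0, which bounds the score to the 0-1 range:
#   length_rank 0.80, domain_rank 0.60, coiled_coil True (1.0),
#   transmembrane_count 4 -> min(4, 20) / 20 = 0.20,
#   has_cilia_domain True (1.0), scaffold_adaptor_domain False (0.0)
_score = (
    0.15 * 0.80    # length        -> 0.12
    + 0.20 * 0.60  # domains       -> 0.12
    + 0.20 * 1.0   # coiled-coil   -> 0.20
    + 0.20 * 0.20  # transmembrane -> 0.04
    + 0.15 * 1.0   # cilia domain  -> 0.15
    + 0.10 * 0.0   # scaffold      -> 0.00
)
assert abs(_score - 0.63) < 1e-9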
def process_protein_evidence(
gene_ids: List[str],
uniprot_mapping: pl.DataFrame,
) -> pl.DataFrame:
"""End-to-end protein evidence processing pipeline.
Composes: map gene IDs -> fetch UniProt -> fetch InterPro ->
extract features -> detect motifs -> normalize -> collect
Args:
gene_ids: List of Ensembl gene IDs
uniprot_mapping: DataFrame with gene_id, gene_symbol, uniprot_id columns
Returns:
Materialized DataFrame ready for DuckDB storage with columns:
- gene_id, gene_symbol, uniprot_id, protein_length, domain_count,
coiled_coil, coiled_coil_count, transmembrane_count,
scaffold_adaptor_domain, has_cilia_domain, has_sensory_domain,
protein_score_normalized
"""
logger.info("process_protein_evidence_start", gene_count=len(gene_ids))
# Filter mapping to requested genes
gene_map = uniprot_mapping.filter(pl.col("gene_id").is_in(gene_ids))
# Extract UniProt IDs (filter out NULLs)
uniprot_ids = (
gene_map
.filter(pl.col("uniprot_id").is_not_null())
.select("uniprot_id")
.unique()
.to_series()
.to_list()
)
logger.info(
"uniprot_mapping_complete",
total_genes=len(gene_ids),
mapped_genes=len(uniprot_ids),
)
# Fetch UniProt features
uniprot_df = fetch_uniprot_features(uniprot_ids)
# Fetch InterPro domains
interpro_df = fetch_interpro_domains(uniprot_ids)
# Extract features
features_df = extract_protein_features(uniprot_df, interpro_df)
# Detect cilia motifs
features_df = detect_cilia_motifs(features_df)
# Normalize features
features_df = normalize_protein_features(features_df)
# Join back to gene mapping to get gene_id and gene_symbol
# Use right join to preserve all genes (including those without UniProt mapping)
result = features_df.join(
gene_map.select(["gene_id", "gene_symbol", "uniprot_id"]),
on="uniprot_id",
how="right",
)
# Select final columns in order
result = result.select([
"gene_id",
"gene_symbol",
"uniprot_id",
"protein_length",
"domain_count",
"coiled_coil",
"coiled_coil_count",
"transmembrane_count",
"scaffold_adaptor_domain",
"has_cilia_domain",
"has_sensory_domain",
"protein_score_normalized",
])
logger.info(
"process_protein_evidence_complete",
total_genes=len(result),
with_uniprot=result.filter(pl.col("uniprot_id").is_not_null()).height,
with_scores=result.filter(pl.col("protein_score_normalized").is_not_null()).height,
)
return result
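
A sketch of the expected uniprot_mapping input and the NULL propagation through the right join (accessions illustrative; the fetch steps require live API access):

import polars as pl

uniprot_mapping = pl.DataFrame({
    "gene_id": ["ENSG00000042781", "ENSG_UNMAPPED"],
    "gene_symbol": ["USH2A", "EXAMPLE1"],
    "uniprot_id": ["O75445", None],  # second gene has no UniProt mapping
})
features = process_protein_evidence(uniprot_mapping["gene_id"].to_list(), uniprot_mapping)
# Both genes are returned; the unmapped one carries NULL protein features
# and a NULL protein_score_normalized rather than zeros.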