feat(03-03): implement protein evidence layer with UniProt/InterPro integration
- Create protein features data model with domain, coiled-coil, TM, and cilia motifs
- Implement fetch.py with UniProt REST API and InterPro API queries
- Implement transform.py with feature extraction, motif detection, and normalization
- Implement load.py with DuckDB persistence and provenance tracking
- Add CLI protein command following the evidence layer pattern
- Add comprehensive unit and integration tests (all passing)
- Handle NULL preservation and the List(Null) edge case
- Add get_steps() method to ProvenanceTracker for test compatibility
src/usher_pipeline/evidence/protein/__init__.py (new file, 46 lines)
@@ -0,0 +1,46 @@
"""Protein sequence and structure features evidence layer.

Extracts protein features from UniProt and InterPro:
- Protein length, domain composition, domain count
- Coiled-coil regions, transmembrane domains, scaffold/adaptor domains
- Cilia-associated motifs detected via domain keyword matching
- Normalized composite protein score (0-1)

Evidence follows fetch -> transform -> load pattern with checkpoint-restart.
"""

from usher_pipeline.evidence.protein.models import (
    ProteinFeatureRecord,
    PROTEIN_TABLE_NAME,
    CILIA_DOMAIN_KEYWORDS,
    SCAFFOLD_DOMAIN_TYPES,
)
from usher_pipeline.evidence.protein.fetch import (
    fetch_uniprot_features,
    fetch_interpro_domains,
)
from usher_pipeline.evidence.protein.transform import (
    extract_protein_features,
    detect_cilia_motifs,
    normalize_protein_features,
    process_protein_evidence,
)
from usher_pipeline.evidence.protein.load import (
    load_to_duckdb,
    query_cilia_candidates,
)

__all__ = [
    "ProteinFeatureRecord",
    "PROTEIN_TABLE_NAME",
    "CILIA_DOMAIN_KEYWORDS",
    "SCAFFOLD_DOMAIN_TYPES",
    "fetch_uniprot_features",
    "fetch_interpro_domains",
    "extract_protein_features",
    "detect_cilia_motifs",
    "normalize_protein_features",
    "process_protein_evidence",
    "load_to_duckdb",
    "query_cilia_candidates",
]
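
Taken together, the exports compose into the layer's fetch -> transform -> load flow. A minimal sketch with illustrative placeholder IDs; the store and provenance objects are assumed to come from the persistence layer, whose constructors are not part of this diff:

    import polars as pl
    from usher_pipeline.evidence.protein import (
        process_protein_evidence,
        load_to_duckdb,
        query_cilia_candidates,
    )

    # Hypothetical gene -> UniProt mapping (real rows come from an upstream layer)
    mapping = pl.DataFrame({
        "gene_id": ["ENSG00000000001"],
        "gene_symbol": ["GENE1"],
        "uniprot_id": ["P00001"],
    })

    df = process_protein_evidence(mapping["gene_id"].to_list(), mapping)
    load_to_duckdb(df, store, provenance)  # store/provenance assumed from persistence layer
    candidates = query_cilia_candidates(store)
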
src/usher_pipeline/evidence/protein/fetch.py (new file, 274 lines)
@@ -0,0 +1,274 @@
"""Fetch protein features from UniProt and InterPro APIs."""

import time
from typing import List

import httpx
import polars as pl
import structlog
from tenacity import (
    retry,
    stop_after_attempt,
    wait_exponential,
    retry_if_exception_type,
)

logger = structlog.get_logger()

# UniProt REST API base URL
UNIPROT_API_BASE = "https://rest.uniprot.org"

# InterPro REST API base URL
INTERPRO_API_BASE = "https://www.ebi.ac.uk/interpro/api"


@retry(
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=4, max=60),
    retry=retry_if_exception_type(
        (httpx.HTTPStatusError, httpx.ConnectError, httpx.TimeoutException)
    ),
)
def fetch_uniprot_features(uniprot_ids: List[str]) -> pl.DataFrame:
    """Query UniProt REST API for protein features.

    Fetches protein length, domain annotations, coiled-coil regions,
    and transmembrane regions from UniProt in batches.

    Args:
        uniprot_ids: List of UniProt accession IDs

    Returns:
        DataFrame with columns:
        - uniprot_id: UniProt accession
        - protein_length: Amino acid length
        - domain_names: List of domain names
        - coiled_coil_count: Number of coiled-coil regions
        - transmembrane_count: Number of transmembrane regions

    Raises:
        httpx.HTTPStatusError: On HTTP errors (after retries)
        httpx.ConnectError: On connection errors (after retries)
        httpx.TimeoutException: On timeout (after retries)
    """
    if not uniprot_ids:
        return pl.DataFrame({
            "uniprot_id": [],
            "protein_length": [],
            "domain_names": [],
            "coiled_coil_count": [],
            "transmembrane_count": [],
        })

    logger.info("uniprot_fetch_start", accession_count=len(uniprot_ids))

    # UniProt batch size recommendation: 100 accessions per request
    batch_size = 100
    all_records = []

    for i in range(0, len(uniprot_ids), batch_size):
        batch = uniprot_ids[i:i + batch_size]
        batch_num = (i // batch_size) + 1
        total_batches = (len(uniprot_ids) + batch_size - 1) // batch_size

        logger.info(
            "uniprot_batch_start",
            batch=batch_num,
            total_batches=total_batches,
            batch_size=len(batch),
        )

        # Query the UniProt search endpoint with an accession list,
        # using the fields parameter to restrict the response payload
        query = " OR ".join(f"accession:{acc}" for acc in batch)
        url = f"{UNIPROT_API_BASE}/uniprotkb/search"
        params = {
            "query": query,
            "format": "json",
            "fields": "accession,length,ft_domain,ft_coiled,ft_transmem,annotation_score",
            "size": batch_size,
        }

        with httpx.Client(timeout=30.0) as client:
            response = client.get(url, params=params)
            response.raise_for_status()
            data = response.json()

        # Parse results
        results = data.get("results", [])

        # Track which accessions were found so missing ones get NULL rows
        found_accessions = set()
        for entry in results:
            accession = entry.get("primaryAccession")
            if not accession:
                continue

            found_accessions.add(accession)

            # Extract protein length
            length = entry.get("sequence", {}).get("length")

            # Extract domain names from ft_domain features
            domain_names = []
            for feature in entry.get("features", []):
                if feature.get("type") == "Domain":
                    description = feature.get("description", "")
                    if description:
                        domain_names.append(description)

            # Count coiled-coil regions
            coiled_coil_count = sum(
                1 for feature in entry.get("features", [])
                if feature.get("type") == "Coiled coil"
            )

            # Count transmembrane regions
            transmembrane_count = sum(
                1 for feature in entry.get("features", [])
                if feature.get("type") == "Transmembrane"
            )

            all_records.append({
                "uniprot_id": accession,
                "protein_length": length,
                "domain_names": domain_names,
                "coiled_coil_count": coiled_coil_count,
                "transmembrane_count": transmembrane_count,
            })

        # Add NULL records for accessions not found
        for acc in batch:
            if acc not in found_accessions:
                all_records.append({
                    "uniprot_id": acc,
                    "protein_length": None,
                    "domain_names": [],
                    "coiled_coil_count": None,
                    "transmembrane_count": None,
                })

        # Rate limiting: UniProt allows 200 requests/second.
        # With batches of 100, that would be 20K accessions/second.
        # Conservative: 5 requests/second = 500 accessions/second.
        time.sleep(0.2)

    logger.info(
        "uniprot_fetch_complete",
        total_accessions=len(uniprot_ids),
        records_found=sum(1 for r in all_records if r["protein_length"] is not None),
    )

    return pl.DataFrame(all_records)

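For illustration, a call with hypothetical accessions returns one row per input; accessions UniProt does not recognize come back as NULL-filled rows rather than being dropped:

    df = fetch_uniprot_features(["P00001", "P00002"])  # hypothetical accessions
    # columns: uniprot_id, protein_length, domain_names,
    #          coiled_coil_count, transmembrane_count
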

@retry(
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=4, max=60),
    retry=retry_if_exception_type(
        (httpx.HTTPStatusError, httpx.ConnectError, httpx.TimeoutException)
    ),
)
def fetch_interpro_domains(uniprot_ids: List[str]) -> pl.DataFrame:
    """Query InterPro API for domain annotations.

    Fetches detailed domain classification from InterPro to supplement
    UniProt annotations.

    Args:
        uniprot_ids: List of UniProt accession IDs

    Returns:
        DataFrame with columns:
        - uniprot_id: UniProt accession
        - domain_names: List of InterPro domain names
        - interpro_ids: List of InterPro accession IDs

    Raises:
        httpx.HTTPStatusError: On HTTP errors (after retries)
        httpx.ConnectError: On connection errors (after retries)
        httpx.TimeoutException: On timeout (after retries)
    """
    if not uniprot_ids:
        return pl.DataFrame({
            "uniprot_id": [],
            "domain_names": [],
            "interpro_ids": [],
        })

    logger.info("interpro_fetch_start", accession_count=len(uniprot_ids))

    # InterPro requires per-protein queries.
    # Use conservative rate limiting: 10 req/sec as recommended.
    all_records = []

    # For large datasets (>10K proteins), log a warning about potential slowness
    if len(uniprot_ids) > 10000:
        logger.warning(
            "interpro_large_dataset",
            accession_count=len(uniprot_ids),
            estimated_time_min=len(uniprot_ids) / 10 / 60,
            note="Consider InterPro bulk download for large datasets",
        )

    with httpx.Client(timeout=30.0) as client:
        for idx, accession in enumerate(uniprot_ids, 1):
            if idx % 100 == 0:
                logger.info(
                    "interpro_progress",
                    processed=idx,
                    total=len(uniprot_ids),
                    percent=round(idx / len(uniprot_ids) * 100, 1),
                )

            # Query InterPro API
            url = f"{INTERPRO_API_BASE}/entry/interpro/protein/uniprot/{accession}"

            try:
                response = client.get(url, params={"format": "json"})
                response.raise_for_status()
                data = response.json()

                # Parse InterPro entries
                domain_names = []
                interpro_ids = []

                if "results" in data:
                    for entry in data["results"]:
                        metadata = entry.get("metadata", {})
                        interpro_id = metadata.get("accession")
                        name = metadata.get("name", {}).get("name", "")

                        if interpro_id:
                            interpro_ids.append(interpro_id)
                        if name:
                            domain_names.append(name)

                all_records.append({
                    "uniprot_id": accession,
                    "domain_names": domain_names,
                    "interpro_ids": interpro_ids,
                })

            except httpx.HTTPStatusError as e:
                if e.response.status_code == 404:
                    # Protein not found in InterPro - this is OK
                    all_records.append({
                        "uniprot_id": accession,
                        "domain_names": [],
                        "interpro_ids": [],
                    })
                else:
                    # Other HTTP errors - re-raise for retry
                    raise

            # Rate limiting: 10 requests/second
            time.sleep(0.1)

    logger.info(
        "interpro_fetch_complete",
        total_accessions=len(uniprot_ids),
        records_found=sum(1 for r in all_records if r["domain_names"]),
    )

    return pl.DataFrame(all_records)

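The InterPro fetcher honors the same contract: one row per accession, and a 404 (protein unknown to InterPro) yields empty lists rather than an error:

    df = fetch_interpro_domains(["P00001"])  # hypothetical accession
    # unknown protein -> {"uniprot_id": "P00001", "domain_names": [], "interpro_ids": []}
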
src/usher_pipeline/evidence/protein/load.py (new file, 128 lines)
@@ -0,0 +1,128 @@
"""Load protein features to DuckDB with provenance tracking."""

from typing import Optional

import polars as pl
import structlog

from usher_pipeline.persistence import PipelineStore, ProvenanceTracker

logger = structlog.get_logger()


def load_to_duckdb(
    df: pl.DataFrame,
    store: PipelineStore,
    provenance: ProvenanceTracker,
    description: str = "",
) -> None:
    """Save protein features DataFrame to DuckDB with provenance.

    Creates or replaces the protein_features table (idempotent).
    Records a provenance step with summary statistics.

    Args:
        df: Processed protein features DataFrame with normalized scores
        store: PipelineStore instance for DuckDB persistence
        provenance: ProvenanceTracker instance for metadata recording
        description: Optional description for checkpoint metadata
    """
    logger.info("protein_load_start", row_count=len(df))

    # Calculate summary statistics for provenance
    total_genes = len(df)
    with_uniprot = df.filter(pl.col("uniprot_id").is_not_null()).height
    null_uniprot = total_genes - with_uniprot

    cilia_domain_count = df.filter(pl.col("has_cilia_domain") == True).height
    scaffold_domain_count = df.filter(pl.col("scaffold_adaptor_domain") == True).height
    coiled_coil_count = df.filter(pl.col("coiled_coil") == True).height
    tm_domain_count = df.filter(pl.col("transmembrane_count") > 0).height

    # Mean domain count (only for proteins with data)
    mean_domain_count = (
        df.filter(pl.col("domain_count").is_not_null())
        .select(pl.col("domain_count").mean())
        .item()
    )
    if mean_domain_count is not None:
        mean_domain_count = round(mean_domain_count, 2)

    # Save to DuckDB with CREATE OR REPLACE (idempotent)
    store.save_dataframe(
        df=df,
        table_name="protein_features",
        description=description or "Protein features from UniProt/InterPro with domain composition and cilia motif detection",
        replace=True,
    )

    # Record provenance step with details
    provenance.record_step("load_protein_features", {
        "total_genes": total_genes,
        "with_uniprot": with_uniprot,
        "null_uniprot": null_uniprot,
        "cilia_domain_count": cilia_domain_count,
        "scaffold_domain_count": scaffold_domain_count,
        "coiled_coil_count": coiled_coil_count,
        "transmembrane_domain_count": tm_domain_count,
        "mean_domain_count": mean_domain_count,
    })

    logger.info(
        "protein_load_complete",
        row_count=total_genes,
        with_uniprot=with_uniprot,
        cilia_domains=cilia_domain_count,
        scaffold_domains=scaffold_domain_count,
        coiled_coils=coiled_coil_count,
    )


def query_cilia_candidates(
    store: PipelineStore,
) -> pl.DataFrame:
    """Query genes with cilia-associated protein features.

    Identifies candidate genes with:
    - Cilia domain annotations, OR
    - Both coiled-coil regions AND scaffold/adaptor domains

    This combination is enriched in known cilia proteins and provides
    structural evidence for potential cilia involvement.

    Args:
        store: PipelineStore instance

    Returns:
        DataFrame with candidate genes sorted by protein_score_normalized
        Columns: gene_id, gene_symbol, protein_length, domain_count,
        coiled_coil, has_cilia_domain, scaffold_adaptor_domain,
        protein_score_normalized
    """
    logger.info("protein_query_cilia_candidates")

    # Query DuckDB for genes with cilia features
    df = store.execute_query(
        """
        SELECT
            gene_id,
            gene_symbol,
            uniprot_id,
            protein_length,
            domain_count,
            coiled_coil,
            transmembrane_count,
            scaffold_adaptor_domain,
            has_cilia_domain,
            has_sensory_domain,
            protein_score_normalized
        FROM protein_features
        WHERE has_cilia_domain = TRUE
           OR (coiled_coil = TRUE AND scaffold_adaptor_domain = TRUE)
        ORDER BY protein_score_normalized DESC NULLS LAST
        """
    )

    logger.info("protein_query_complete", result_count=len(df))

    return df
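
Because save_dataframe is called with replace=True, re-running the load simply overwrites protein_features, and the candidate query can be issued at any point afterwards. A minimal sketch, assuming store and provenance objects from the persistence layer:

    load_to_duckdb(df, store, provenance, description="protein evidence, re-run")  # idempotent
    candidates = query_cilia_candidates(store)  # highest protein_score_normalized first
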
src/usher_pipeline/evidence/protein/models.py (new file, 71 lines)
@@ -0,0 +1,71 @@
"""Data models for protein sequence and structure features."""

from pydantic import BaseModel

# DuckDB table name for protein features
PROTEIN_TABLE_NAME = "protein_features"

# Cilia-associated domain keywords for motif detection.
# These are structural patterns found in known cilia proteins.
# Pattern matching does NOT presuppose cilia involvement - it flags features
# associated with cilia biology for further investigation.
CILIA_DOMAIN_KEYWORDS = [
    "IFT",
    "intraflagellar",
    "BBSome",
    "ciliary",
    "cilia",
    "basal body",
    "centrosome",
    "transition zone",
    "axoneme",
]

# Scaffold and adaptor domain types commonly found in Usher proteins.
# These domains mediate protein-protein interactions and are enriched in
# cilia-associated proteins.
SCAFFOLD_DOMAIN_TYPES = [
    "PDZ",
    "SH3",
    "Ankyrin",
    "WD40",
    "Coiled coil",
    "SAM",
    "FERM",
    "Harmonin",
]


class ProteinFeatureRecord(BaseModel):
    """Protein features for a single gene.

    Attributes:
        gene_id: Ensembl gene ID (e.g., ENSG00000...)
        gene_symbol: HGNC gene symbol
        uniprot_id: UniProt accession (NULL if no mapping)
        protein_length: Amino acid length (NULL if no UniProt entry)
        domain_count: Total number of annotated domains (NULL if no data)
        coiled_coil: Has coiled-coil region (NULL if no data)
        coiled_coil_count: Number of coiled-coil regions (NULL if no data)
        transmembrane_count: Number of transmembrane regions (NULL if no data)
        scaffold_adaptor_domain: Has PDZ, SH3, ankyrin, WD40, or similar scaffold domain (NULL if no data)
        has_cilia_domain: Has IFT, BBSome, ciliary targeting, or transition zone domain (NULL if no data)
        has_sensory_domain: Has stereocilia, photoreceptor, or sensory-associated domain (NULL if no data)
        protein_score_normalized: Composite protein feature score 0-1 (NULL if no UniProt entry)

    CRITICAL: NULL values represent missing data and are preserved as None.
    Do NOT convert NULL to 0.0 - "unknown" is semantically different from "no domain".
    """

    gene_id: str
    gene_symbol: str
    uniprot_id: str | None = None
    protein_length: int | None = None
    domain_count: int | None = None
    coiled_coil: bool | None = None
    coiled_coil_count: int | None = None
    transmembrane_count: int | None = None
    scaffold_adaptor_domain: bool | None = None
    has_cilia_domain: bool | None = None
    has_sensory_domain: bool | None = None
    protein_score_normalized: float | None = None
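
The NULL-preservation contract is easiest to see on the model itself. A gene with no UniProt mapping validates cleanly with every feature left as None (placeholder IDs):

    record = ProteinFeatureRecord(gene_id="ENSG00000000001", gene_symbol="GENE1")
    assert record.uniprot_id is None
    assert record.protein_score_normalized is None  # unknown, not 0.0
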
src/usher_pipeline/evidence/protein/transform.py (new file, 411 lines)
@@ -0,0 +1,411 @@
"""Transform and normalize protein features."""

from typing import List

import polars as pl
import structlog

from usher_pipeline.evidence.protein.models import (
    CILIA_DOMAIN_KEYWORDS,
    SCAFFOLD_DOMAIN_TYPES,
)
from usher_pipeline.evidence.protein.fetch import (
    fetch_uniprot_features,
    fetch_interpro_domains,
)

logger = structlog.get_logger()


def extract_protein_features(
    uniprot_df: pl.DataFrame,
    interpro_df: pl.DataFrame,
) -> pl.DataFrame:
    """Extract and combine protein features from UniProt and InterPro data.

    Joins UniProt and InterPro data, computes domain counts, sets flags
    for coiled-coil and transmembrane regions.

    Args:
        uniprot_df: DataFrame from fetch_uniprot_features with UniProt data
        interpro_df: DataFrame from fetch_interpro_domains with InterPro data

    Returns:
        DataFrame with combined features:
        - uniprot_id, protein_length, domain_count, coiled_coil,
          coiled_coil_count, transmembrane_count, domain_names (combined)
    """
    logger.info(
        "extract_protein_features_start",
        uniprot_rows=len(uniprot_df),
        interpro_rows=len(interpro_df),
    )

    # Join UniProt and InterPro on uniprot_id.
    # Use left join to preserve all UniProt entries.
    combined = uniprot_df.join(
        interpro_df,
        on="uniprot_id",
        how="left",
        suffix="_interpro",
    )

    # Combine domain names from both sources (deduplicate).
    # Use list.concat instead of the + operator.
    combined = combined.with_columns([
        # Combine domain name lists
        pl.when(
            pl.col("domain_names_interpro").is_not_null() &
            (pl.col("domain_names_interpro").list.len() > 0)
        )
        .then(
            pl.col("domain_names").list.concat(pl.col("domain_names_interpro"))
            .list.unique()
        )
        .otherwise(pl.col("domain_names"))
        .alias("domain_names_combined"),
    ])

    # Compute domain count from combined sources
    combined = combined.with_columns([
        pl.col("domain_names_combined").list.len().alias("domain_count"),
    ])

    # Set coiled_coil boolean from count
    combined = combined.with_columns([
        pl.when(pl.col("coiled_coil_count").is_not_null())
        .then(pl.col("coiled_coil_count") > 0)
        .otherwise(None)
        .alias("coiled_coil"),
    ])

    # Select final columns
    result = combined.select([
        "uniprot_id",
        "protein_length",
        "domain_count",
        "coiled_coil",
        "coiled_coil_count",
        "transmembrane_count",
        pl.col("domain_names_combined").alias("domain_names"),
    ])

    logger.info(
        "extract_protein_features_complete",
        total_rows=len(result),
        with_domains=result.filter(pl.col("domain_count") > 0).height,
    )

    return result

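The list-combining step can be checked in isolation. A small sketch of the same list.concat + list.unique pattern (element order after list.unique is not guaranteed):

    import polars as pl

    pairs = pl.DataFrame({
        "domain_names": [["PDZ", "SH3"]],
        "domain_names_interpro": [["PDZ", "WD40"]],
    })
    combined = pairs.select(
        pl.col("domain_names")
        .list.concat(pl.col("domain_names_interpro"))
        .list.unique()
        .alias("domain_names_combined")
    )
    # -> one row containing ["PDZ", "SH3", "WD40"] (in some order)
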

def detect_cilia_motifs(df: pl.DataFrame) -> pl.DataFrame:
    """Detect cilia-associated motifs via domain keyword matching.

    Scans domain names for cilia-related keywords, scaffold/adaptor domains,
    and sensory-related domains. Pattern matching does NOT presuppose cilia
    involvement - it flags structural features for further investigation.

    Args:
        df: DataFrame with domain_names column (list of domain names)

    Returns:
        DataFrame with added columns:
        - has_cilia_domain: Boolean flag for cilia-associated domains
        - scaffold_adaptor_domain: Boolean flag for scaffold domains
        - has_sensory_domain: Boolean flag for sensory domains
    """
    logger.info("detect_cilia_motifs_start", row_count=len(df))

    # Ensure domain_names is List(String), not List(Null).
    # This handles the edge case where all proteins have empty domain lists.
    if "domain_names" in df.columns:
        current_dtype = df.schema["domain_names"]
        if str(current_dtype) == "List(Null)" or "Null" in str(current_dtype):
            df = df.with_columns([
                pl.col("domain_names").cast(pl.List(pl.String)).alias("domain_names")
            ])

    # Sensory domain keywords
    sensory_keywords = [
        "stereocilia",
        "photoreceptor",
        "usher",
        "harmonin",
        "cadherin 23",
        "protocadherin",
    ]

    # Helper to check if any keyword matches any domain (case-insensitive)
    def create_keyword_matcher(keywords: List[str]) -> pl.Expr:
        """Create a polars expression that checks if any keyword matches any domain."""
        # For each domain in the list, check if it contains any keyword.
        # Case-insensitive substring matching.
        conditions = []
        for keyword in keywords:
            conditions.append(
                pl.col("domain_names")
                .list.eval(
                    pl.when(pl.element().is_not_null())
                    .then(pl.element().str.to_lowercase().str.contains(keyword.lower()))
                    .otherwise(False)
                )
                .list.any()
            )
        # Return True if ANY condition matches
        if conditions:
            return pl.any_horizontal(conditions)
        else:
            return pl.lit(False)

    # Detect cilia domains
    df = df.with_columns([
        pl.when(
            pl.col("domain_names").is_not_null() &
            (pl.col("domain_names").list.len() > 0)
        )
        .then(create_keyword_matcher(CILIA_DOMAIN_KEYWORDS))
        .otherwise(None)
        .alias("has_cilia_domain"),
    ])

    # Detect scaffold/adaptor domains
    df = df.with_columns([
        pl.when(
            pl.col("domain_names").is_not_null() &
            (pl.col("domain_names").list.len() > 0)
        )
        .then(create_keyword_matcher(SCAFFOLD_DOMAIN_TYPES))
        .otherwise(None)
        .alias("scaffold_adaptor_domain"),
    ])

    # Detect sensory domains
    df = df.with_columns([
        pl.when(
            pl.col("domain_names").is_not_null() &
            (pl.col("domain_names").list.len() > 0)
        )
        .then(create_keyword_matcher(sensory_keywords))
        .otherwise(None)
        .alias("has_sensory_domain"),
    ])

    # Log summary
    cilia_count = df.filter(pl.col("has_cilia_domain") == True).height
    scaffold_count = df.filter(pl.col("scaffold_adaptor_domain") == True).height
    sensory_count = df.filter(pl.col("has_sensory_domain") == True).height

    logger.info(
        "detect_cilia_motifs_complete",
        cilia_domain_count=cilia_count,
        scaffold_domain_count=scaffold_count,
        sensory_domain_count=sensory_count,
    )

    return df

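An illustrative round-trip through the detector: a domain list containing "Ciliary targeting signal" sets has_cilia_domain (substring match on "ciliary"), while a bare "PDZ domain 1" sets only the scaffold flag:

    sample = pl.DataFrame({
        "domain_names": [["Ciliary targeting signal"], ["PDZ domain 1"]],
    })
    flagged = detect_cilia_motifs(sample)
    # row 0: has_cilia_domain=True,  scaffold_adaptor_domain=False
    # row 1: has_cilia_domain=False, scaffold_adaptor_domain=True
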

def normalize_protein_features(df: pl.DataFrame) -> pl.DataFrame:
    """Normalize protein features to 0-1 range and compute composite score.

    Normalizes continuous features (protein_length, domain_count, transmembrane_count)
    and computes a weighted composite score. NULL preservation: genes without UniProt
    entries get NULL scores (not 0.0).

    Args:
        df: DataFrame with extracted protein features

    Returns:
        DataFrame with added protein_score_normalized column (0-1 range)
    """
    logger.info("normalize_protein_features_start", row_count=len(df))

    # Filter to proteins with data for normalization stats
    measured = df.filter(pl.col("protein_length").is_not_null())

    if len(measured) == 0:
        # No proteins with data - return with NULL scores
        return df.with_columns([
            pl.lit(None).cast(pl.Float64).alias("length_rank"),
            pl.lit(None).cast(pl.Float64).alias("domain_rank"),
            pl.lit(None).cast(pl.Float64).alias("transmembrane_normalized"),
            pl.lit(None).cast(pl.Float64).alias("protein_score_normalized"),
        ])

    # Normalize protein length via log-transform rank percentile.
    # Log-transform handles the skewed distribution; rank percentile gives 0-1.
    df = df.with_columns([
        pl.when(pl.col("protein_length").is_not_null())
        .then(
            pl.col("protein_length").log10().rank(method="average") /
            measured.select(pl.col("protein_length").log10().rank(method="average").max()).item()
        )
        .otherwise(None)
        .alias("length_rank"),
    ])

    # Normalize domain count via rank percentile
    df = df.with_columns([
        pl.when(pl.col("domain_count").is_not_null())
        .then(
            pl.col("domain_count").rank(method="average") /
            measured.select(pl.col("domain_count").rank(method="average").max()).item()
        )
        .otherwise(None)
        .alias("domain_rank"),
    ])

    # Normalize transmembrane count (cap at 20, then divide)
    df = df.with_columns([
        pl.when(pl.col("transmembrane_count").is_not_null())
        .then(
            pl.min_horizontal(pl.col("transmembrane_count"), pl.lit(20)) / 20.0
        )
        .otherwise(None)
        .alias("transmembrane_normalized"),
    ])

    # Convert boolean flags to 0/1 for the composite score
    df = df.with_columns([
        pl.col("coiled_coil").cast(pl.Float64).fill_null(0.0).alias("coiled_coil_score"),
        pl.col("has_cilia_domain").cast(pl.Float64).fill_null(0.0).alias("cilia_score"),
        pl.col("scaffold_adaptor_domain").cast(pl.Float64).fill_null(0.0).alias("scaffold_score"),
    ])

    # Composite protein score (weighted combination).
    # Weights: length 15%, domain 20%, coiled-coil 20%, TM 20%, cilia 15%, scaffold 10%.
    # NULL if no protein_length (i.e., no UniProt entry).
    df = df.with_columns([
        pl.when(pl.col("protein_length").is_not_null())
        .then(
            (0.15 * pl.col("length_rank").fill_null(0.0)) +
            (0.20 * pl.col("domain_rank").fill_null(0.0)) +
            (0.20 * pl.col("coiled_coil_score")) +
            (0.20 * pl.col("transmembrane_normalized").fill_null(0.0)) +
            (0.15 * pl.col("cilia_score")) +
            (0.10 * pl.col("scaffold_score"))
        )
        .otherwise(None)
        .alias("protein_score_normalized"),
    ])

    # Drop temporary scoring columns
    df = df.drop([
        "length_rank",
        "domain_rank",
        "transmembrane_normalized",
        "coiled_coil_score",
        "cilia_score",
        "scaffold_score",
    ])

    # Log summary statistics
    score_stats = df.filter(pl.col("protein_score_normalized").is_not_null()).select([
        pl.col("protein_score_normalized").min().alias("min"),
        pl.col("protein_score_normalized").max().alias("max"),
        pl.col("protein_score_normalized").mean().alias("mean"),
    ])

    if len(score_stats) > 0:
        logger.info(
            "normalize_protein_features_complete",
            scored_proteins=df.filter(pl.col("protein_score_normalized").is_not_null()).height,
            score_min=round(score_stats["min"][0], 3),
            score_max=round(score_stats["max"][0], 3),
            score_mean=round(score_stats["mean"][0], 3),
        )
    else:
        logger.info("normalize_protein_features_complete", scored_proteins=0)

    return df

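As a worked example of the weighting (illustrative inputs): a protein at the 0.8 length-rank percentile and 0.9 domain-rank percentile, with a coiled-coil (1.0), four transmembrane helices (4/20 = 0.2), a cilia-keyword hit (1.0), and a scaffold domain (1.0) scores 0.15*0.8 + 0.20*0.9 + 0.20*1.0 + 0.20*0.2 + 0.15*1.0 + 0.10*1.0 = 0.79.
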

def process_protein_evidence(
    gene_ids: List[str],
    uniprot_mapping: pl.DataFrame,
) -> pl.DataFrame:
    """End-to-end protein evidence processing pipeline.

    Composes: map gene IDs -> fetch UniProt -> fetch InterPro ->
    extract features -> detect motifs -> normalize -> collect

    Args:
        gene_ids: List of Ensembl gene IDs
        uniprot_mapping: DataFrame with gene_id, gene_symbol, uniprot_id columns

    Returns:
        Materialized DataFrame ready for DuckDB storage with columns:
        - gene_id, gene_symbol, uniprot_id, protein_length, domain_count,
          coiled_coil, coiled_coil_count, transmembrane_count,
          scaffold_adaptor_domain, has_cilia_domain, has_sensory_domain,
          protein_score_normalized
    """
    logger.info("process_protein_evidence_start", gene_count=len(gene_ids))

    # Filter mapping to requested genes
    gene_map = uniprot_mapping.filter(pl.col("gene_id").is_in(gene_ids))

    # Extract UniProt IDs (filter out NULLs)
    uniprot_ids = (
        gene_map
        .filter(pl.col("uniprot_id").is_not_null())
        .select("uniprot_id")
        .unique()
        .to_series()
        .to_list()
    )

    logger.info(
        "uniprot_mapping_complete",
        total_genes=len(gene_ids),
        mapped_genes=len(uniprot_ids),
    )

    # Fetch UniProt features
    uniprot_df = fetch_uniprot_features(uniprot_ids)

    # Fetch InterPro domains
    interpro_df = fetch_interpro_domains(uniprot_ids)

    # Extract features
    features_df = extract_protein_features(uniprot_df, interpro_df)

    # Detect cilia motifs
    features_df = detect_cilia_motifs(features_df)

    # Normalize features
    features_df = normalize_protein_features(features_df)

    # Join back to the gene mapping to get gene_id and gene_symbol.
    # Use a right join to preserve all genes (including those without a UniProt mapping).
    result = features_df.join(
        gene_map.select(["gene_id", "gene_symbol", "uniprot_id"]),
        on="uniprot_id",
        how="right",
    )

    # Select final columns in order
    result = result.select([
        "gene_id",
        "gene_symbol",
        "uniprot_id",
        "protein_length",
        "domain_count",
        "coiled_coil",
        "coiled_coil_count",
        "transmembrane_count",
        "scaffold_adaptor_domain",
        "has_cilia_domain",
        "has_sensory_domain",
        "protein_score_normalized",
    ])

    logger.info(
        "process_protein_evidence_complete",
        total_genes=len(result),
        with_uniprot=result.filter(pl.col("uniprot_id").is_not_null()).height,
        with_scores=result.filter(pl.col("protein_score_normalized").is_not_null()).height,
    )

    return result

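Note the join direction at the end of process_protein_evidence: the right join keeps every row of gene_map, so a gene whose uniprot_id is NULL still appears in the output with NULL features, while a gene absent from uniprot_mapping altogether produces no output row.
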
@@ -37,6 +37,7 @@ class ProvenanceTracker:
             details: Optional dictionary of additional details
         """
         step = {
             "name": step_name,
+            "step_name": step_name,
             "timestamp": datetime.now(timezone.utc).isoformat(),
         }
@@ -44,6 +45,15 @@
             step["details"] = details
         self.processing_steps.append(step)

+    def get_steps(self) -> list[dict]:
+        """
+        Get all recorded processing steps.
+
+        Returns:
+            List of processing step dictionaries
+        """
+        return self.processing_steps
+
     def create_metadata(self) -> dict:
         """
         Create full provenance metadata dictionary.
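
The get_steps() accessor gives tests a stable way to assert on recorded provenance. A minimal sketch, assuming the tracker is default-constructible:

    tracker = ProvenanceTracker()  # assumed constructor
    tracker.record_step("load_protein_features", {"total_genes": 3})
    step = tracker.get_steps()[-1]
    assert step["name"] == step["step_name"] == "load_protein_features"
    assert step["details"] == {"total_genes": 3}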