feat(03-01): implement annotation evidence fetch and transform modules
- Create AnnotationRecord model with GO counts, UniProt scores, tier classification
- Implement fetch_go_annotations using mygene.info batch queries
- Implement fetch_uniprot_scores using UniProt REST API
- Add classify_annotation_tier with 3-tier system (well/partial/poor)
- Add normalize_annotation_score with weighted composite (GO 50%, UniProt 30%, Pathway 20%)
- Implement process_annotation_evidence end-to-end pipeline
- Follow NULL preservation pattern from gnomAD (unknown != zero)
- Use lazy polars evaluation where applicable
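
A minimal usage sketch of the new entry point (illustrative only; the gene IDs, accessions, and mapping below are example inputs, not part of this commit):

import polars as pl
from usher_pipeline.evidence.annotation import process_annotation_evidence

# BRCA2 and BRCA1 with their canonical UniProt accessions (example mapping)
gene_ids = ["ENSG00000139618", "ENSG00000012048"]
uniprot_mapping = pl.DataFrame({
    "gene_id": gene_ids,
    "uniprot_accession": ["P51587", "P38398"],
})

df = process_annotation_evidence(gene_ids, uniprot_mapping)
# df carries go_term_count, uniprot_annotation_score, has_pathway_membership,
# annotation_tier, and annotation_score_normalized per gene.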

src/usher_pipeline/evidence/annotation/__init__.py (new file, 22 lines)
@@ -0,0 +1,22 @@
"""Gene annotation completeness evidence layer."""

from usher_pipeline.evidence.annotation.models import AnnotationRecord, ANNOTATION_TABLE_NAME
from usher_pipeline.evidence.annotation.fetch import (
    fetch_go_annotations,
    fetch_uniprot_scores,
)
from usher_pipeline.evidence.annotation.transform import (
    classify_annotation_tier,
    normalize_annotation_score,
    process_annotation_evidence,
)

__all__ = [
    "AnnotationRecord",
    "ANNOTATION_TABLE_NAME",
    "fetch_go_annotations",
    "fetch_uniprot_scores",
    "classify_annotation_tier",
    "normalize_annotation_score",
    "process_annotation_evidence",
]

src/usher_pipeline/evidence/annotation/fetch.py (new file, 290 lines)
@@ -0,0 +1,290 @@
"""Fetch gene annotation data from mygene.info and UniProt APIs."""

from typing import Optional
import math

import httpx
import mygene
import polars as pl
import structlog
from tenacity import (
    retry,
    stop_after_attempt,
    wait_exponential,
    retry_if_exception_type,
)

logger = structlog.get_logger()

# Initialize mygene client (singleton pattern - reuse across calls)
_mg_client = None


def _get_mygene_client() -> mygene.MyGeneInfo:
    """Get or create mygene client singleton."""
    global _mg_client
    if _mg_client is None:
        _mg_client = mygene.MyGeneInfo()
    return _mg_client


def fetch_go_annotations(gene_ids: list[str], batch_size: int = 1000) -> pl.DataFrame:
    """Fetch GO annotations and pathway memberships from mygene.info.

    Uses mygene.querymany to batch query GO terms and pathway data.
    Processes in batches to avoid API timeout.

    Args:
        gene_ids: List of Ensembl gene IDs
        batch_size: Number of genes per batch query (default: 1000)

    Returns:
        DataFrame with columns:
        - gene_id: Ensembl gene ID
        - gene_symbol: HGNC symbol (NULL if not found)
        - go_term_count: Total GO term count across all ontologies (NULL if no GO data)
        - go_biological_process_count: GO BP term count (NULL if no GO data)
        - go_molecular_function_count: GO MF term count (NULL if no GO data)
        - go_cellular_component_count: GO CC term count (NULL if no GO data)
        - has_pathway_membership: Boolean indicating presence in KEGG/Reactome (NULL if no pathway data)

    Note: Genes with no GO annotations get NULL counts (not zero).
    """
    logger.info("fetch_go_annotations_start", gene_count=len(gene_ids))

    mg = _get_mygene_client()
    all_results = []

    # Process in batches to avoid mygene timeout
    num_batches = math.ceil(len(gene_ids) / batch_size)

    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(gene_ids))
        batch = gene_ids[start_idx:end_idx]

        logger.info(
            "fetch_go_batch",
            batch_num=i + 1,
            total_batches=num_batches,
            batch_size=len(batch),
        )

        # Query mygene for GO terms, pathways, and symbol
        try:
            results = mg.querymany(
                batch,
                scopes="ensembl.gene",
                fields="go,pathway.kegg,pathway.reactome,symbol",
                species="human",
                returnall=False,
            )

            # Process each gene's result
            for result in results:
                gene_id = result.get("query")
                gene_symbol = result.get("symbol", None)

                # Extract GO term counts by category
                go_data = result.get("go", {})
                if isinstance(go_data, dict):
                    # Count GO terms by ontology
                    bp_terms = go_data.get("BP", [])
                    mf_terms = go_data.get("MF", [])
                    cc_terms = go_data.get("CC", [])

                    # Convert to list if single dict (mygene sometimes returns dict for single term)
                    bp_list = bp_terms if isinstance(bp_terms, list) else ([bp_terms] if bp_terms else [])
                    mf_list = mf_terms if isinstance(mf_terms, list) else ([mf_terms] if mf_terms else [])
                    cc_list = cc_terms if isinstance(cc_terms, list) else ([cc_terms] if cc_terms else [])

                    bp_count = len(bp_list) if bp_list else None
                    mf_count = len(mf_list) if mf_list else None
                    cc_count = len(cc_list) if cc_list else None

                    # Total GO count (sum of non-NULL counts, or NULL if all NULL)
                    counts = [c for c in [bp_count, mf_count, cc_count] if c is not None]
                    total_count = sum(counts) if counts else None
                else:
                    # No GO data
                    bp_count = None
                    mf_count = None
                    cc_count = None
                    total_count = None

                # Check pathway membership
                pathway_data = result.get("pathway", {})
                has_kegg = bool(pathway_data.get("kegg"))
                has_reactome = bool(pathway_data.get("reactome"))
                has_pathway = (has_kegg or has_reactome) if (has_kegg or has_reactome or pathway_data) else None

                all_results.append({
                    "gene_id": gene_id,
                    "gene_symbol": gene_symbol,
                    "go_term_count": total_count,
                    "go_biological_process_count": bp_count,
                    "go_molecular_function_count": mf_count,
                    "go_cellular_component_count": cc_count,
                    "has_pathway_membership": has_pathway,
                })

        except Exception as e:
            logger.warning(
                "fetch_go_batch_error",
                batch_num=i + 1,
                error=str(e),
            )
            # Add NULL entries for failed batch
            for gene_id in batch:
                all_results.append({
                    "gene_id": gene_id,
                    "gene_symbol": None,
                    "go_term_count": None,
                    "go_biological_process_count": None,
                    "go_molecular_function_count": None,
                    "go_cellular_component_count": None,
                    "has_pathway_membership": None,
                })

    logger.info("fetch_go_annotations_complete", result_count=len(all_results))

    return pl.DataFrame(all_results)


@retry(
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=2, max=30),
    retry=retry_if_exception_type(
        (httpx.HTTPStatusError, httpx.ConnectError, httpx.TimeoutException)
    ),
)
def _query_uniprot_batch(accessions: list[str]) -> dict:
    """Query UniProt REST API for annotation scores (with retry).

    Args:
        accessions: List of UniProt accession IDs (max 100)

    Returns:
        Dict mapping accession -> annotation_score
    """
    if not accessions:
        return {}

    # Build OR query for batch lookup
    query = " OR ".join([f"accession:{acc}" for acc in accessions])
    url = "https://rest.uniprot.org/uniprotkb/search"

    params = {
        "query": query,
        "fields": "accession,annotation_score",
        "format": "json",
        "size": len(accessions),
    }

    with httpx.Client(timeout=30.0) as client:
        response = client.get(url, params=params)
        response.raise_for_status()
        data = response.json()

    # Parse results into mapping
    score_map = {}
    for entry in data.get("results", []):
        accession = entry.get("primaryAccession")
        score = entry.get("annotationScore")
        if accession and score is not None:
            score_map[accession] = int(score)

    return score_map


def fetch_uniprot_scores(
    gene_ids: list[str],
    uniprot_mapping: pl.DataFrame,
    batch_size: int = 100,
) -> pl.DataFrame:
    """Fetch UniProt annotation scores for genes.

    Uses UniProt REST API to query annotation scores in batches.
    Rate-limited to avoid overwhelming the API (built-in via tenacity retry).

    Args:
        gene_ids: List of Ensembl gene IDs
        uniprot_mapping: DataFrame with gene_id and uniprot_accession columns
        batch_size: Number of UniProt accessions per batch (default: 100)

    Returns:
        DataFrame with columns:
        - gene_id: Ensembl gene ID
        - uniprot_annotation_score: UniProt annotation score 1-5 (NULL if no mapping/score)

    Note: Genes without UniProt mapping get NULL (not zero).
    """
    logger.info("fetch_uniprot_scores_start", gene_count=len(gene_ids))

    # Filter mapping to requested genes
    mapping_filtered = uniprot_mapping.filter(pl.col("gene_id").is_in(gene_ids))

    if mapping_filtered.height == 0:
        logger.warning("fetch_uniprot_no_mappings")
        # Return all genes with NULL scores
        return pl.DataFrame({
            "gene_id": gene_ids,
            "uniprot_annotation_score": [None] * len(gene_ids),
        })

    # Get unique accessions
    accessions = mapping_filtered.select("uniprot_accession").unique().to_series().to_list()
    logger.info("fetch_uniprot_accessions", accession_count=len(accessions))

    # Batch query UniProt API
    all_scores = {}
    num_batches = math.ceil(len(accessions) / batch_size)

    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(accessions))
        batch = accessions[start_idx:end_idx]

        logger.info(
            "fetch_uniprot_batch",
            batch_num=i + 1,
            total_batches=num_batches,
            batch_size=len(batch),
        )

        try:
            batch_scores = _query_uniprot_batch(batch)
            all_scores.update(batch_scores)
        except Exception as e:
            logger.warning(
                "fetch_uniprot_batch_error",
                batch_num=i + 1,
                error=str(e),
            )
            # Continue with other batches - failed batch will have NULL scores

    # Create accession -> score mapping
    score_df = pl.DataFrame({
        "uniprot_accession": list(all_scores.keys()),
        "uniprot_annotation_score": list(all_scores.values()),
    })

    # Join back to gene IDs
    result = (
        mapping_filtered
        .select(["gene_id", "uniprot_accession"])
        .join(score_df, on="uniprot_accession", how="left")
        .group_by("gene_id")
        .agg(
            # Take first score if multiple accessions (consistent with gene universe pattern)
            pl.col("uniprot_annotation_score").first()
        )
    )

    # Ensure all requested genes are present (add NULL for missing)
    all_genes = pl.DataFrame({"gene_id": gene_ids})
    result = all_genes.join(result, on="gene_id", how="left")

    logger.info("fetch_uniprot_scores_complete", result_count=result.height)

    return result
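
For reference, the two parsers above consume roughly the following response shapes. These are abbreviated illustrations inferred from the parsing code, not captured API output; the specific IDs and term names are invented:

# mygene.info querymany hit, as consumed by fetch_go_annotations;
# each GO category may be a list of terms or a single dict, and absent keys become NULL counts
mygene_hit = {
    "query": "ENSG00000139618",
    "symbol": "BRCA2",
    "go": {
        "BP": [{"id": "GO:0006281", "term": "DNA repair"}],
        "MF": {"id": "GO:0005515", "term": "protein binding"},  # single-term dict case
        # "CC" absent -> go_cellular_component_count stays NULL
    },
    "pathway": {"kegg": {"id": "hsa03440"}},  # any KEGG/Reactome entry -> has_pathway_membership=True
}

# UniProt search response, as consumed by _query_uniprot_batch
uniprot_response = {
    "results": [
        {"primaryAccession": "P51587", "annotationScore": 5.0},
    ]
}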

src/usher_pipeline/evidence/annotation/models.py (new file, 38 lines)
@@ -0,0 +1,38 @@
"""Data models for gene annotation completeness evidence."""

from pydantic import BaseModel

# Table name for DuckDB storage
ANNOTATION_TABLE_NAME = "annotation_completeness"


class AnnotationRecord(BaseModel):
    """Gene annotation completeness metrics for a single gene.

    Attributes:
        gene_id: Ensembl gene ID (e.g., ENSG00000...)
        gene_symbol: HGNC gene symbol
        go_term_count: Total number of GO terms (all ontologies) - NULL if no data
        go_biological_process_count: Number of GO Biological Process terms - NULL if no data
        go_molecular_function_count: Number of GO Molecular Function terms - NULL if no data
        go_cellular_component_count: Number of GO Cellular Component terms - NULL if no data
        uniprot_annotation_score: UniProt annotation score 1-5 - NULL if no mapping or score
        has_pathway_membership: Present in any KEGG/Reactome pathway - NULL if no data
        annotation_tier: Classification: "well_annotated", "partially_annotated", "poorly_annotated"
        annotation_score_normalized: Composite annotation score 0-1 (higher = better annotated) - NULL if all inputs NULL

    CRITICAL: NULL values represent missing data and are preserved as None.
    Do NOT convert NULL to 0 - "unknown annotation" is semantically different from "zero annotation".
    Conservative approach: NULL GO counts treated as zero for tier classification (assume unannotated).
    """

    gene_id: str
    gene_symbol: str
    go_term_count: int | None = None
    go_biological_process_count: int | None = None
    go_molecular_function_count: int | None = None
    go_cellular_component_count: int | None = None
    uniprot_annotation_score: int | None = None
    has_pathway_membership: bool | None = None
    annotation_tier: str = "poorly_annotated"
    annotation_score_normalized: float | None = None
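
A sketch of the model in use (field values invented for illustration):

from usher_pipeline.evidence.annotation.models import AnnotationRecord

record = AnnotationRecord(
    gene_id="ENSG00000139618",
    gene_symbol="BRCA2",
    go_term_count=31,
    go_biological_process_count=20,
    go_molecular_function_count=8,
    go_cellular_component_count=3,
    uniprot_annotation_score=5,
    has_pathway_membership=True,
    annotation_tier="well_annotated",
    annotation_score_normalized=0.81,
)

# With no evidence supplied, NULLs are preserved and the defaults apply:
sparse = AnnotationRecord(gene_id="ENSG00000000000", gene_symbol="EXAMPLE")  # placeholder IDs
assert sparse.go_term_count is None
assert sparse.annotation_tier == "poorly_annotated"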

src/usher_pipeline/evidence/annotation/transform.py (new file, 213 lines)
@@ -0,0 +1,213 @@
"""Transform and normalize gene annotation completeness metrics."""

import math
from pathlib import Path

import polars as pl
import structlog

from usher_pipeline.evidence.annotation.fetch import (
    fetch_go_annotations,
    fetch_uniprot_scores,
)

logger = structlog.get_logger()


def classify_annotation_tier(df: pl.DataFrame) -> pl.DataFrame:
    """Classify genes into annotation tiers based on composite metrics.

    Tier definitions:
    - "well_annotated": go_term_count >= 20 AND uniprot_annotation_score >= 4
    - "partially_annotated": go_term_count >= 5 OR uniprot_annotation_score >= 3
    - "poorly_annotated": Everything else (including NULLs)

    Conservative approach: NULL GO counts treated as zero for tier classification
    (assume unannotated until proven otherwise).

    Args:
        df: DataFrame with go_term_count and uniprot_annotation_score columns

    Returns:
        DataFrame with annotation_tier column added
    """
    logger.info("classify_annotation_tier_start", row_count=df.height)

    # Fill NULL GO counts with 0 for tier classification (conservative)
    # But preserve original NULL for downstream NULL handling
    df = df.with_columns([
        pl.col("go_term_count").fill_null(0).alias("_go_count_filled"),
        pl.col("uniprot_annotation_score").fill_null(0).alias("_uniprot_score_filled"),
    ])

    # Apply tier classification logic
    df = df.with_columns(
        pl.when(
            (pl.col("_go_count_filled") >= 20) & (pl.col("_uniprot_score_filled") >= 4)
        )
        .then(pl.lit("well_annotated"))
        .when(
            (pl.col("_go_count_filled") >= 5) | (pl.col("_uniprot_score_filled") >= 3)
        )
        .then(pl.lit("partially_annotated"))
        .otherwise(pl.lit("poorly_annotated"))
        .alias("annotation_tier")
    )

    # Drop temporary filled columns
    df = df.drop(["_go_count_filled", "_uniprot_score_filled"])

    # Log tier distribution
    tier_counts = df.group_by("annotation_tier").len().sort("annotation_tier")
    logger.info("classify_annotation_tier_complete", tier_distribution=tier_counts.to_dicts())

    return df


def normalize_annotation_score(df: pl.DataFrame) -> pl.DataFrame:
    """Compute normalized composite annotation score (0-1 range).

    Formula: Weighted average of three components:
    - GO component (50%): log2(go_term_count + 1) normalized by max across dataset
    - UniProt component (30%): uniprot_annotation_score / 5.0
    - Pathway component (20%): has_pathway_membership as 0/1

    Result clamped to [0, 1]. NULL if ALL three inputs are NULL.

    Args:
        df: DataFrame with go_term_count, uniprot_annotation_score, has_pathway_membership

    Returns:
        DataFrame with annotation_score_normalized column added
    """
    logger.info("normalize_annotation_score_start", row_count=df.height)

    # Component weights
    WEIGHT_GO = 0.5
    WEIGHT_UNIPROT = 0.3
    WEIGHT_PATHWAY = 0.2

    # Compute GO component: log2(count + 1) normalized by max
    df = df.with_columns(
        pl.when(pl.col("go_term_count").is_not_null())
        .then((pl.col("go_term_count") + 1).log(base=2))
        .otherwise(None)
        .alias("_go_log")
    )

    # Get max for normalization (from non-NULL values)
    go_max = df.filter(pl.col("_go_log").is_not_null()).select(pl.col("_go_log").max()).item()

    if go_max is None or go_max == 0:
        # No GO data in dataset - all get NULL for GO component
        df = df.with_columns(pl.lit(None).cast(pl.Float64).alias("_go_component"))
    else:
        df = df.with_columns(
            pl.when(pl.col("_go_log").is_not_null())
            .then((pl.col("_go_log") / go_max) * WEIGHT_GO)
            .otherwise(None)
            .alias("_go_component")
        )

    # Compute UniProt component: score / 5.0
    df = df.with_columns(
        pl.when(pl.col("uniprot_annotation_score").is_not_null())
        .then((pl.col("uniprot_annotation_score") / 5.0) * WEIGHT_UNIPROT)
        .otherwise(None)
        .alias("_uniprot_component")
    )

    # Compute pathway component: boolean as 0/1
    df = df.with_columns(
        pl.when(pl.col("has_pathway_membership").is_not_null())
        .then(
            pl.when(pl.col("has_pathway_membership"))
            .then(WEIGHT_PATHWAY)
            .otherwise(0.0)
        )
        .otherwise(None)
        .alias("_pathway_component")
    )

    # Composite score: sum of non-NULL components, NULL if all NULL
    # Need to handle NULL properly: only compute if at least one component is non-NULL
    df = df.with_columns(
        pl.when(
            pl.col("_go_component").is_not_null()
            | pl.col("_uniprot_component").is_not_null()
            | pl.col("_pathway_component").is_not_null()
        )
        .then(
            # Sum components, treating NULL as 0 for the sum
            pl.col("_go_component").fill_null(0.0)
            + pl.col("_uniprot_component").fill_null(0.0)
            + pl.col("_pathway_component").fill_null(0.0)
        )
        .otherwise(None)
        .alias("annotation_score_normalized")
    )

    # Clamp to [0, 1] range (shouldn't exceed but defensive)
    df = df.with_columns(
        pl.when(pl.col("annotation_score_normalized").is_not_null())
        .then(
            pl.col("annotation_score_normalized").clip(0.0, 1.0)
        )
        .otherwise(None)
        .alias("annotation_score_normalized")
    )

    # Drop temporary columns
    df = df.drop(["_go_log", "_go_component", "_uniprot_component", "_pathway_component"])

    # Log score statistics
    stats = df.filter(pl.col("annotation_score_normalized").is_not_null()).select([
        pl.col("annotation_score_normalized").mean().alias("mean"),
        pl.col("annotation_score_normalized").median().alias("median"),
        pl.col("annotation_score_normalized").min().alias("min"),
        pl.col("annotation_score_normalized").max().alias("max"),
    ])

    if stats.height > 0:
        logger.info("normalize_annotation_score_complete", stats=stats.to_dicts()[0])
    else:
        logger.warning("normalize_annotation_score_complete", message="No valid scores computed")

    return df


def process_annotation_evidence(
    gene_ids: list[str],
    uniprot_mapping: pl.DataFrame,
) -> pl.DataFrame:
    """End-to-end annotation evidence processing pipeline.

    Composes: fetch GO -> fetch UniProt -> join -> classify tier -> normalize -> collect.

    Args:
        gene_ids: List of Ensembl gene IDs to process
        uniprot_mapping: DataFrame with gene_id and uniprot_accession columns

    Returns:
        Materialized DataFrame with all annotation completeness metrics ready for DuckDB
    """
    logger.info("process_annotation_evidence_start", gene_count=len(gene_ids))

    # Fetch GO annotations and pathway memberships
    go_df = fetch_go_annotations(gene_ids)

    # Fetch UniProt annotation scores
    uniprot_df = fetch_uniprot_scores(gene_ids, uniprot_mapping)

    # Join GO and UniProt data
    df = go_df.join(uniprot_df, on="gene_id", how="left")

    # Classify annotation tiers
    df = classify_annotation_tier(df)

    # Normalize composite score
    df = normalize_annotation_score(df)

    logger.info("process_annotation_evidence_complete", result_count=df.height)

    return df
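
To make the weighting concrete, a worked example plus a small self-contained demo (all input numbers invented; the GO normalizer go_max depends on the dataset):

# For a gene with go_term_count=31, uniprot_annotation_score=5, has_pathway_membership=True,
# in a dataset whose max log2(go_term_count + 1) is 8.0 (i.e., some gene has 255 GO terms):
#   GO component:      (log2(32) / 8.0) * 0.5 = (5.0 / 8.0) * 0.5 = 0.3125
#   UniProt component: (5 / 5.0) * 0.3                            = 0.3
#   Pathway component: True                                       = 0.2
#   annotation_score_normalized = 0.3125 + 0.3 + 0.2 = 0.8125
#   annotation_tier = "well_annotated"  (31 >= 20 AND 5 >= 4)

import polars as pl
from usher_pipeline.evidence.annotation.transform import (
    classify_annotation_tier,
    normalize_annotation_score,
)

demo = pl.DataFrame({
    "go_term_count": [31, 255, 6, None],
    "uniprot_annotation_score": [5, 4, None, None],
    "has_pathway_membership": [True, True, False, None],
})
demo = normalize_annotation_score(classify_annotation_tier(demo))
# Tiers: well_annotated, well_annotated, partially_annotated, poorly_annotated;
# the all-NULL row keeps a NULL annotation_score_normalized rather than 0.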