feat(03-05): implement animal model evidence fetch and transform

- models.py: AnimalModelRecord with ortholog confidence, phenotype flags, and normalized scoring - fetch.py: Retrieve orthologs from HCOP, phenotypes from MGI/ZFIN/IMPC with retry - transform.py: Filter sensory/cilia-relevant phenotypes, score with confidence weighting - Ortholog confidence: HIGH (8+ sources), MEDIUM (4-7), LOW (1-3) - Scoring: mouse +0.4, zebrafish +0.3, IMPC +0.3, weighted by confidence - NULL preservation: no ortholog = NULL score (not zero)
2026-02-11 19:00:24 +08:00
parent 8aa66987f8
commit 0e389c7e41
4 changed files with 1019 additions and 0 deletions
@@ -0,0 +1,42 @@
 """Animal model phenotype evidence layer.
 Retrieves knockout/perturbation phenotypes from:
 - MGI (Mouse Genome Informatics) - mouse phenotypes
 - ZFIN (Zebrafish Information Network) - zebrafish phenotypes
 - IMPC (International Mouse Phenotyping Consortium) - mouse phenotypes
 Maps human genes to model organism orthologs with confidence scoring,
 filters for sensory/cilia-relevant phenotypes, and scores evidence.
 """
 from usher_pipeline.evidence.animal_models.models import (
    AnimalModelRecord,
    ANIMAL_TABLE_NAME,
    SENSORY_MP_KEYWORDS,
    SENSORY_ZP_KEYWORDS,
 )
 from usher_pipeline.evidence.animal_models.fetch import (
    fetch_ortholog_mapping,
    fetch_mgi_phenotypes,
    fetch_zfin_phenotypes,
    fetch_impc_phenotypes,
 )
 from usher_pipeline.evidence.animal_models.transform import (
    filter_sensory_phenotypes,
    score_animal_evidence,
    process_animal_model_evidence,
 )
 __all__ = [
    "AnimalModelRecord",
    "ANIMAL_TABLE_NAME",
    "SENSORY_MP_KEYWORDS",
    "SENSORY_ZP_KEYWORDS",
    "fetch_ortholog_mapping",
    "fetch_mgi_phenotypes",
    "fetch_zfin_phenotypes",
    "fetch_impc_phenotypes",
    "filter_sensory_phenotypes",
    "score_animal_evidence",
    "process_animal_model_evidence",
 ]
@@ -0,0 +1,497 @@
 """Fetch animal model phenotype data and ortholog mappings."""
 import gzip
 import io
 from pathlib import Path
 from typing import Optional
 import httpx
 import polars as pl
 import structlog
 from tenacity import (
    retry,
    stop_after_attempt,
    wait_exponential,
    retry_if_exception_type,
 )
 logger = structlog.get_logger()
 # HCOP ortholog database URLs
 HCOP_HUMAN_MOUSE_URL = "https://ftp.ebi.ac.uk/pub/databases/genenames/hcop/human_mouse_hcop_fifteen_column.txt.gz"
 HCOP_HUMAN_ZEBRAFISH_URL = "https://ftp.ebi.ac.uk/pub/databases/genenames/hcop/human_zebrafish_hcop_fifteen_column.txt.gz"
 # MGI phenotype report URL
 MGI_GENE_PHENO_URL = "https://www.informatics.jax.org/downloads/reports/MGI_GenePheno.rpt"
 # ZFIN phenotype data URL
 ZFIN_PHENO_URL = "https://zfin.org/downloads/phenoGeneCleanData_fish.txt"
 # IMPC API base URL
 IMPC_API_BASE = "https://www.ebi.ac.uk/mi/impc/solr/genotype-phenotype/select"
@retry(
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=4, max=60),
    retry=retry_if_exception_type(
        (httpx.HTTPStatusError, httpx.ConnectError, httpx.TimeoutException)
    ),
 )
 def _download_gzipped(url: str) -> bytes:
    """Download and decompress a gzipped file.
    Args:
        url: URL to download
    Returns:
        Decompressed file content as bytes
    """
    logger.info("download_start", url=url)
    with httpx.stream("GET", url, timeout=120.0, follow_redirects=True) as response:
        response.raise_for_status()
        # Read compressed data
        compressed_data = b""
        for chunk in response.iter_bytes(chunk_size=8192):
            compressed_data += chunk
    # Decompress
    logger.info("decompress_start", compressed_size_mb=round(len(compressed_data) / 1024 / 1024, 2))
    decompressed = gzip.decompress(compressed_data)
    logger.info("decompress_complete", decompressed_size_mb=round(len(decompressed) / 1024 / 1024, 2))
    return decompressed
@retry(
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=4, max=60),
    retry=retry_if_exception_type(
        (httpx.HTTPStatusError, httpx.ConnectError, httpx.TimeoutException)
    ),
 )
 def _download_text(url: str) -> str:
    """Download a text file with retry.
    Args:
        url: URL to download
    Returns:
        File content as string
    """
    logger.info("download_text_start", url=url)
    with httpx.stream("GET", url, timeout=120.0, follow_redirects=True) as response:
        response.raise_for_status()
        content = response.text
    logger.info("download_text_complete", size_mb=round(len(content) / 1024 / 1024, 2))
    return content
 def fetch_ortholog_mapping(gene_ids: list[str]) -> pl.DataFrame:
    """Fetch human-to-mouse and human-to-zebrafish ortholog mappings from HCOP.
    Downloads HCOP ortholog data, assigns confidence scores based on number of
    supporting databases, and handles one-to-many mappings by selecting the
    ortholog with highest confidence.
    Confidence scoring:
    - HIGH: 8+ supporting databases
    - MEDIUM: 4-7 supporting databases
    - LOW: 1-3 supporting databases
    Args:
        gene_ids: List of human gene IDs (ENSG format)
    Returns:
        DataFrame with columns:
        - gene_id: Human gene ID
        - mouse_ortholog: Mouse gene symbol
        - mouse_ortholog_confidence: HIGH/MEDIUM/LOW
        - zebrafish_ortholog: Zebrafish gene symbol
        - zebrafish_ortholog_confidence: HIGH/MEDIUM/LOW
    """
    logger.info("fetch_ortholog_mapping_start", gene_count=len(gene_ids))
    # Download human-mouse HCOP data
    logger.info("fetch_hcop_mouse")
    mouse_data = _download_gzipped(HCOP_HUMAN_MOUSE_URL)
    mouse_df = pl.read_csv(
        io.BytesIO(mouse_data),
        separator="\t",
        null_values=["", "NA"],
    )
    logger.info("hcop_mouse_columns", columns=mouse_df.columns)
    # Parse mouse ortholog data
    # HCOP columns: human_entrez_gene, human_ensembl_gene, hgnc_id, human_name, human_symbol,
    #               human_chr, human_assert_ids, mouse_entrez_gene, mouse_ensembl_gene,
    #               mgi_id, mouse_name, mouse_symbol, mouse_chr, mouse_assert_ids, support
    mouse_orthologs = (
        mouse_df
        .filter(pl.col("human_ensembl_gene").is_in(gene_ids))
        .select([
            pl.col("human_ensembl_gene").alias("gene_id"),
            pl.col("mouse_symbol").alias("mouse_ortholog"),
            pl.col("support").str.split(",").list.len().alias("support_count"),
        ])
        .with_columns([
            pl.when(pl.col("support_count") >= 8)
            .then(pl.lit("HIGH"))
            .when(pl.col("support_count") >= 4)
            .then(pl.lit("MEDIUM"))
            .otherwise(pl.lit("LOW"))
            .alias("mouse_ortholog_confidence")
        ])
        .sort(["gene_id", "support_count"], descending=[False, True])
        .group_by("gene_id")
        .first()
        .select(["gene_id", "mouse_ortholog", "mouse_ortholog_confidence"])
    )
    logger.info("mouse_orthologs_mapped", count=len(mouse_orthologs))
    # Download human-zebrafish HCOP data
    logger.info("fetch_hcop_zebrafish")
    zebrafish_data = _download_gzipped(HCOP_HUMAN_ZEBRAFISH_URL)
    zebrafish_df = pl.read_csv(
        io.BytesIO(zebrafish_data),
        separator="\t",
        null_values=["", "NA"],
    )
    logger.info("hcop_zebrafish_columns", columns=zebrafish_df.columns)
    # Parse zebrafish ortholog data
    zebrafish_orthologs = (
        zebrafish_df
        .filter(pl.col("human_ensembl_gene").is_in(gene_ids))
        .select([
            pl.col("human_ensembl_gene").alias("gene_id"),
            pl.col("zebrafish_symbol").alias("zebrafish_ortholog"),
            pl.col("support").str.split(",").list.len().alias("support_count"),
        ])
        .with_columns([
            pl.when(pl.col("support_count") >= 8)
            .then(pl.lit("HIGH"))
            .when(pl.col("support_count") >= 4)
            .then(pl.lit("MEDIUM"))
            .otherwise(pl.lit("LOW"))
            .alias("zebrafish_ortholog_confidence")
        ])
        .sort(["gene_id", "support_count"], descending=[False, True])
        .group_by("gene_id")
        .first()
        .select(["gene_id", "zebrafish_ortholog", "zebrafish_ortholog_confidence"])
    )
    logger.info("zebrafish_orthologs_mapped", count=len(zebrafish_orthologs))
    # Create base DataFrame with all gene IDs
    base_df = pl.DataFrame({"gene_id": gene_ids})
    # Left join ortholog mappings
    result = (
        base_df
        .join(mouse_orthologs, on="gene_id", how="left")
        .join(zebrafish_orthologs, on="gene_id", how="left")
    )
    logger.info(
        "fetch_ortholog_mapping_complete",
        total_genes=len(result),
        mouse_mapped=result.filter(pl.col("mouse_ortholog").is_not_null()).height,
        zebrafish_mapped=result.filter(pl.col("zebrafish_ortholog").is_not_null()).height,
    )
    return result
 def fetch_mgi_phenotypes(mouse_gene_symbols: list[str]) -> pl.DataFrame:
    """Fetch mouse phenotype data from MGI (Mouse Genome Informatics).
    Downloads the MGI gene-phenotype report and extracts phenotype terms
    for the specified mouse genes.
    Args:
        mouse_gene_symbols: List of mouse gene symbols
    Returns:
        DataFrame with columns:
        - mouse_gene: Mouse gene symbol
        - mp_term_id: Mammalian Phenotype term ID
        - mp_term_name: Mammalian Phenotype term name
    """
    if not mouse_gene_symbols:
        logger.info("fetch_mgi_phenotypes_skip", reason="no_mouse_genes")
        return pl.DataFrame({
            "mouse_gene": [],
            "mp_term_id": [],
            "mp_term_name": [],
        })
    logger.info("fetch_mgi_phenotypes_start", gene_count=len(mouse_gene_symbols))
    # Download MGI phenotype report
    content = _download_text(MGI_GENE_PHENO_URL)
    # Parse TSV (skip first line if it's a comment)
    lines = content.strip().split("\n")
    if lines[0].startswith("#"):
        lines = lines[1:]
    # Read as DataFrame
    df = pl.read_csv(
        io.StringIO("\n".join(lines)),
        separator="\t",
        null_values=["", "NA"],
        has_header=True,
    )
    logger.info("mgi_raw_columns", columns=df.columns)
    # MGI_GenePheno.rpt columns vary, but typically include:
    # Allelic Composition, Allele Symbol(s), Genetic Background, Mammalian Phenotype ID, PubMed ID, MGI Marker Accession ID
    # We need to identify the right columns
    # Expected columns: marker symbol, MP ID, MP term
    # Common column names: "Marker Symbol", "Mammalian Phenotype ID"
    # Try to find the right columns
    marker_col = None
    mp_id_col = None
    for col in df.columns:
        col_lower = col.lower()
        if "marker" in col_lower and "symbol" in col_lower:
            marker_col = col
        elif "mammalian phenotype id" in col_lower or "mp id" in col_lower:
            mp_id_col = col
    if marker_col is None or mp_id_col is None:
        logger.warning("mgi_column_detection_failed", columns=df.columns)
        # Return empty result
        return pl.DataFrame({
            "mouse_gene": [],
            "mp_term_id": [],
            "mp_term_name": [],
        })
    # Filter for genes of interest and extract phenotypes
    # Note: MGI report may have one row per allele-phenotype combination
    # We'll aggregate unique phenotypes per gene
    result = (
        df
        .filter(pl.col(marker_col).is_in(mouse_gene_symbols))
        .select([
            pl.col(marker_col).alias("mouse_gene"),
            pl.col(mp_id_col).alias("mp_term_id"),
            pl.lit(None).alias("mp_term_name"),  # Term name not in this report
        ])
        .unique()
    )
    logger.info("fetch_mgi_phenotypes_complete", phenotype_count=len(result))
    return result
 def fetch_zfin_phenotypes(zebrafish_gene_symbols: list[str]) -> pl.DataFrame:
    """Fetch zebrafish phenotype data from ZFIN.
    Downloads ZFIN phenotype data and extracts phenotype terms for the
    specified zebrafish genes.
    Args:
        zebrafish_gene_symbols: List of zebrafish gene symbols
    Returns:
        DataFrame with columns:
        - zebrafish_gene: Zebrafish gene symbol
        - zp_term_id: Zebrafish Phenotype term ID (or descriptor)
        - zp_term_name: Zebrafish Phenotype term name
    """
    if not zebrafish_gene_symbols:
        logger.info("fetch_zfin_phenotypes_skip", reason="no_zebrafish_genes")
        return pl.DataFrame({
            "zebrafish_gene": [],
            "zp_term_id": [],
            "zp_term_name": [],
        })
    logger.info("fetch_zfin_phenotypes_start", gene_count=len(zebrafish_gene_symbols))
    # Download ZFIN phenotype data
    content = _download_text(ZFIN_PHENO_URL)
    # Parse TSV
    df = pl.read_csv(
        io.StringIO(content),
        separator="\t",
        null_values=["", "NA"],
        has_header=True,
    )
    logger.info("zfin_raw_columns", columns=df.columns)
    # ZFIN phenoGeneCleanData_fish.txt columns (typical):
    # Gene Symbol, Gene ID, Affected Structure or Process 1 subterm ID, etc.
    # Look for gene symbol and phenotype columns
    gene_col = None
    pheno_col = None
    for col in df.columns:
        col_lower = col.lower()
        if "gene" in col_lower and ("symbol" in col_lower or "name" in col_lower):
            gene_col = col
        elif "phenotype" in col_lower or "structure" in col_lower or "process" in col_lower:
            if pheno_col is None:  # Take first phenotype-related column
                pheno_col = col
    if gene_col is None or pheno_col is None:
        logger.warning("zfin_column_detection_failed", columns=df.columns)
        return pl.DataFrame({
            "zebrafish_gene": [],
            "zp_term_id": [],
            "zp_term_name": [],
        })
    # Filter and extract
    result = (
        df
        .filter(pl.col(gene_col).is_in(zebrafish_gene_symbols))
        .select([
            pl.col(gene_col).alias("zebrafish_gene"),
            pl.lit(None).alias("zp_term_id"),
            pl.col(pheno_col).alias("zp_term_name"),
        ])
        .unique()
    )
    logger.info("fetch_zfin_phenotypes_complete", phenotype_count=len(result))
    return result
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=2, max=30),
    retry=retry_if_exception_type(
        (httpx.HTTPStatusError, httpx.ConnectError, httpx.TimeoutException)
    ),
 )
 def _query_impc_batch(gene_symbols: list[str]) -> pl.DataFrame:
    """Query IMPC API for a batch of genes.
    Args:
        gene_symbols: List of mouse gene symbols (batch)
    Returns:
        DataFrame with IMPC phenotype data
    """
    # Build query: marker_symbol:(gene1 OR gene2 OR ...)
    query = "marker_symbol:(" + " OR ".join(gene_symbols) + ")"
    params = {
        "q": query,
        "rows": 10000,
        "wt": "json",
    }
    logger.info("impc_query_batch", gene_count=len(gene_symbols))
    response = httpx.get(IMPC_API_BASE, params=params, timeout=60.0)
    response.raise_for_status()
    data = response.json()
    docs = data.get("response", {}).get("docs", [])
    if not docs:
        return pl.DataFrame({
            "mouse_gene": [],
            "mp_term_id": [],
            "mp_term_name": [],
            "p_value": [],
        })
    # Extract relevant fields
    records = []
    for doc in docs:
        gene = doc.get("marker_symbol")
        mp_id = doc.get("mp_term_id")
        mp_name = doc.get("mp_term_name")
        p_value = doc.get("p_value")
        if gene and mp_id:
            records.append({
                "mouse_gene": gene,
                "mp_term_id": mp_id,
                "mp_term_name": mp_name,
                "p_value": p_value,
            })
    df = pl.DataFrame(records)
    logger.info("impc_batch_complete", phenotype_count=len(df))
    return df
 def fetch_impc_phenotypes(mouse_gene_symbols: list[str]) -> pl.DataFrame:
    """Fetch mouse phenotype data from IMPC (International Mouse Phenotyping Consortium).
    Queries the IMPC SOLR API in batches to get phenotype data for mouse genes.
    Includes statistical significance (p-value) for each phenotype.
    Args:
        mouse_gene_symbols: List of mouse gene symbols
    Returns:
        DataFrame with columns:
        - mouse_gene: Mouse gene symbol
        - mp_term_id: Mammalian Phenotype term ID
        - mp_term_name: Mammalian Phenotype term name
        - p_value: Statistical significance of phenotype
    """
    if not mouse_gene_symbols:
        logger.info("fetch_impc_phenotypes_skip", reason="no_mouse_genes")
        return pl.DataFrame({
            "mouse_gene": [],
            "mp_term_id": [],
            "mp_term_name": [],
            "p_value": [],
        })
    logger.info("fetch_impc_phenotypes_start", gene_count=len(mouse_gene_symbols))
    # Query in batches of 50 to avoid overloading API
    batch_size = 50
    all_results = []
    for i in range(0, len(mouse_gene_symbols), batch_size):
        batch = mouse_gene_symbols[i:i + batch_size]
        try:
            batch_df = _query_impc_batch(batch)
            all_results.append(batch_df)
        except Exception as e:
            logger.warning("impc_batch_failed", batch_index=i // batch_size, error=str(e))
            # Continue with other batches
    if not all_results:
        logger.warning("fetch_impc_phenotypes_no_results")
        return pl.DataFrame({
            "mouse_gene": [],
            "mp_term_id": [],
            "mp_term_name": [],
            "p_value": [],
        })
    # Combine all batches
    result = pl.concat(all_results, how="vertical_relaxed").unique()
    logger.info("fetch_impc_phenotypes_complete", total_phenotypes=len(result))
    return result
@@ -0,0 +1,119 @@
 """Data models for animal model phenotype evidence."""
 from typing import Optional
 from pydantic import BaseModel, Field
 # Table name for DuckDB storage
 ANIMAL_TABLE_NAME = "animal_model_phenotypes"
 # Mammalian Phenotype (MP) ontology keywords for sensory/cilia relevance
 SENSORY_MP_KEYWORDS = [
    "hearing",
    "deaf",
    "vestibular",
    "balance",
    "retina",
    "photoreceptor",
    "vision",
    "blind",
    "cochlea",
    "stereocilia",
    "cilia",
    "cilium",
    "flagellum",
    "situs inversus",
    "laterality",
    "hydrocephalus",
    "kidney cyst",
    "polycystic",
 ]
 # Zebrafish Phenotype (ZP) ontology keywords for sensory/cilia relevance
 SENSORY_ZP_KEYWORDS = [
    "hearing",
    "deaf",
    "vestibular",
    "balance",
    "retina",
    "photoreceptor",
    "vision",
    "blind",
    "eye",
    "ear",
    "otolith",
    "lateral line",
    "cilia",
    "cilium",
    "flagellum",
    "situs",
    "laterality",
    "hydrocephalus",
    "kidney cyst",
    "pronephros",
 ]
 class AnimalModelRecord(BaseModel):
    """Record representing animal model phenotype evidence for a gene.
    Attributes:
        gene_id: Human gene ID (ENSG)
        gene_symbol: Human gene symbol
        mouse_ortholog: Mouse gene symbol (MGI ID or symbol)
        mouse_ortholog_confidence: Ortholog confidence (HIGH/MEDIUM/LOW based on HCOP support)
        zebrafish_ortholog: Zebrafish gene symbol (ZFIN ID or symbol)
        zebrafish_ortholog_confidence: Ortholog confidence (HIGH/MEDIUM/LOW)
        has_mouse_phenotype: Whether mouse ortholog has phenotypes in MGI
        has_zebrafish_phenotype: Whether zebrafish ortholog has phenotypes in ZFIN
        has_impc_phenotype: Whether mouse ortholog has phenotypes in IMPC
        sensory_phenotype_count: Number of sensory-relevant phenotypes across all sources
        phenotype_categories: Semicolon-separated list of matched phenotype terms
        animal_model_score_normalized: Composite animal model evidence score (0-1 range)
    """
    gene_id: str = Field(..., description="Human gene ID (ENSG)")
    gene_symbol: str = Field(..., description="Human gene symbol")
    mouse_ortholog: Optional[str] = Field(None, description="Mouse gene symbol/ID")
    mouse_ortholog_confidence: Optional[str] = Field(
        None,
        description="Ortholog confidence: HIGH/MEDIUM/LOW"
    )
    zebrafish_ortholog: Optional[str] = Field(None, description="Zebrafish gene symbol/ID")
    zebrafish_ortholog_confidence: Optional[str] = Field(
        None,
        description="Ortholog confidence: HIGH/MEDIUM/LOW"
    )
    has_mouse_phenotype: Optional[bool] = Field(
        None,
        description="Mouse ortholog has phenotypes in MGI"
    )
    has_zebrafish_phenotype: Optional[bool] = Field(
        None,
        description="Zebrafish ortholog has phenotypes in ZFIN"
    )
    has_impc_phenotype: Optional[bool] = Field(
        None,
        description="Mouse ortholog has phenotypes in IMPC"
    )
    sensory_phenotype_count: Optional[int] = Field(
        None,
        description="Number of sensory-relevant phenotypes"
    )
    phenotype_categories: Optional[str] = Field(
        None,
        description="Semicolon-separated matched phenotype terms"
    )
    animal_model_score_normalized: Optional[float] = Field(
        None,
        ge=0.0,
        le=1.0,
        description="Composite animal model evidence score (0-1)"
    )
@@ -0,0 +1,361 @@
 """Transform animal model phenotype data: filter and score."""
 import polars as pl
 import structlog
 from usher_pipeline.evidence.animal_models.models import (
    SENSORY_MP_KEYWORDS,
    SENSORY_ZP_KEYWORDS,
 )
 from usher_pipeline.evidence.animal_models.fetch import (
    fetch_ortholog_mapping,
    fetch_mgi_phenotypes,
    fetch_zfin_phenotypes,
    fetch_impc_phenotypes,
 )
 logger = structlog.get_logger()
 def filter_sensory_phenotypes(
    phenotype_df: pl.DataFrame,
    keywords: list[str],
    term_column: str = "mp_term_name"
 ) -> pl.DataFrame:
    """Filter phenotypes for sensory/cilia relevance using keyword matching.
    Performs case-insensitive substring matching against phenotype terms.
    Returns only rows where the phenotype term matches at least one keyword.
    Args:
        phenotype_df: DataFrame with phenotype terms
        keywords: List of keywords to match (e.g., SENSORY_MP_KEYWORDS)
        term_column: Name of column containing phenotype term names
    Returns:
        Filtered DataFrame with only sensory-relevant phenotypes
    """
    if phenotype_df.is_empty():
        return phenotype_df
    logger.info("filter_sensory_phenotypes_start", row_count=len(phenotype_df))
    # Create case-insensitive match condition
    # Match if ANY keyword appears as substring in term
    match_condition = pl.lit(False)
    for keyword in keywords:
        match_condition = match_condition | pl.col(term_column).str.to_lowercase().str.contains(keyword.lower())
    # Filter phenotypes
    filtered = phenotype_df.filter(match_condition)
    logger.info(
        "filter_sensory_phenotypes_complete",
        input_count=len(phenotype_df),
        output_count=len(filtered),
        filtered_pct=round(100 * len(filtered) / len(phenotype_df), 1) if len(phenotype_df) > 0 else 0,
    )
    return filtered
 def score_animal_evidence(df: pl.DataFrame) -> pl.DataFrame:
    """Compute animal model evidence scores with ortholog confidence weighting.
    Scoring formula:
    - Base score = 0 if no phenotypes
    - For each organism with sensory phenotypes:
      * Mouse (MGI): +0.4 weighted by ortholog confidence
      * Zebrafish (ZFIN): +0.3 weighted by ortholog confidence
      * IMPC: +0.3 (independent confirmation bonus)
    - Confidence weighting: HIGH=1.0, MEDIUM=0.7, LOW=0.4
    - Multiply by log2(sensory_phenotype_count + 1) / log2(max_count + 1) to reward multiple phenotypes
    - Clamp to [0, 1]
    - NULL if no ortholog mapping exists
    Args:
        df: DataFrame with ortholog mappings and phenotype flags
    Returns:
        DataFrame with added animal_model_score_normalized column
    """
    logger.info("score_animal_evidence_start", gene_count=len(df))
    # Define confidence weights
    confidence_weight = pl.when(pl.col("confidence") == "HIGH").then(1.0)\
        .when(pl.col("confidence") == "MEDIUM").then(0.7)\
        .when(pl.col("confidence") == "LOW").then(0.4)\
        .otherwise(0.0)
    # Score for mouse phenotypes (MGI)
    mouse_score = (
        pl.when(pl.col("has_mouse_phenotype") == True)
        .then(
            0.4 * pl.when(pl.col("mouse_ortholog_confidence") == "HIGH").then(1.0)
            .when(pl.col("mouse_ortholog_confidence") == "MEDIUM").then(0.7)
            .when(pl.col("mouse_ortholog_confidence") == "LOW").then(0.4)
            .otherwise(0.0)
        )
        .otherwise(0.0)
    )
    # Score for zebrafish phenotypes (ZFIN)
    zebrafish_score = (
        pl.when(pl.col("has_zebrafish_phenotype") == True)
        .then(
            0.3 * pl.when(pl.col("zebrafish_ortholog_confidence") == "HIGH").then(1.0)
            .when(pl.col("zebrafish_ortholog_confidence") == "MEDIUM").then(0.7)
            .when(pl.col("zebrafish_ortholog_confidence") == "LOW").then(0.4)
            .otherwise(0.0)
        )
        .otherwise(0.0)
    )
    # Score for IMPC phenotypes (independent confirmation)
    impc_score = (
        pl.when(pl.col("has_impc_phenotype") == True)
        .then(0.3)
        .otherwise(0.0)
    )
    # Combine scores
    base_score = mouse_score + zebrafish_score + impc_score
    # Get max sensory phenotype count for normalization
    max_count = df.select(pl.col("sensory_phenotype_count").max()).item()
    if max_count is None or max_count == 0:
        max_count = 1  # Avoid division by zero
    # Apply phenotype count scaling (diminishing returns via log)
    # log2(count + 1) / log2(max_count + 1)
    import math
    max_log = math.log2(max_count + 1)
    phenotype_scaling = (
        pl.when(pl.col("sensory_phenotype_count").is_not_null())
        .then((pl.col("sensory_phenotype_count") + 1).log(2) / max_log)
        .otherwise(0.0)
    )
    # Final score: base_score * phenotype_scaling, clamped to [0, 1]
    # NULL if no ortholog mapping
    animal_model_score = (
        pl.when(
            pl.col("mouse_ortholog").is_null() & pl.col("zebrafish_ortholog").is_null()
        )
        .then(None)
        .otherwise(
            (base_score * phenotype_scaling).clip(0.0, 1.0)
        )
        .alias("animal_model_score_normalized")
    )
    result = df.with_columns([animal_model_score])
    logger.info(
        "score_animal_evidence_complete",
        scored_genes=result.filter(pl.col("animal_model_score_normalized").is_not_null()).height,
        null_genes=result.filter(pl.col("animal_model_score_normalized").is_null()).height,
    )
    return result
 def process_animal_model_evidence(gene_ids: list[str]) -> pl.DataFrame:
    """End-to-end processing of animal model phenotype evidence.
    Executes the full pipeline:
    1. Fetch ortholog mappings (mouse and zebrafish)
    2. Fetch phenotypes from MGI, ZFIN, and IMPC
    3. Filter for sensory/cilia-relevant phenotypes
    4. Aggregate phenotypes by gene
    5. Score evidence with confidence weighting
    Args:
        gene_ids: List of human gene IDs (ENSG format)
    Returns:
        DataFrame with animal model evidence for each gene
    """
    logger.info("process_animal_model_evidence_start", gene_count=len(gene_ids))
    # Step 1: Fetch ortholog mappings
    logger.info("step_1_fetch_orthologs")
    orthologs = fetch_ortholog_mapping(gene_ids)
    # Extract lists of orthologs to query
    mouse_genes = orthologs.filter(pl.col("mouse_ortholog").is_not_null())["mouse_ortholog"].to_list()
    zebrafish_genes = orthologs.filter(pl.col("zebrafish_ortholog").is_not_null())["zebrafish_ortholog"].to_list()
    logger.info(
        "orthologs_extracted",
        mouse_count=len(mouse_genes),
        zebrafish_count=len(zebrafish_genes),
    )
    # Step 2: Fetch phenotypes
    logger.info("step_2_fetch_phenotypes")
    # MGI phenotypes
    mgi_pheno = fetch_mgi_phenotypes(mouse_genes)
    mgi_sensory = filter_sensory_phenotypes(mgi_pheno, SENSORY_MP_KEYWORDS, "mp_term_name")
    # ZFIN phenotypes
    zfin_pheno = fetch_zfin_phenotypes(zebrafish_genes)
    zfin_sensory = filter_sensory_phenotypes(zfin_pheno, SENSORY_ZP_KEYWORDS, "zp_term_name")
    # IMPC phenotypes
    impc_pheno = fetch_impc_phenotypes(mouse_genes)
    impc_sensory = filter_sensory_phenotypes(impc_pheno, SENSORY_MP_KEYWORDS, "mp_term_name")
    logger.info(
        "phenotypes_filtered",
        mgi_total=len(mgi_pheno),
        mgi_sensory=len(mgi_sensory),
        zfin_total=len(zfin_pheno),
        zfin_sensory=len(zfin_sensory),
        impc_total=len(impc_pheno),
        impc_sensory=len(impc_sensory),
    )
    # Step 3: Aggregate phenotypes by gene
    logger.info("step_3_aggregate_phenotypes")
    # Count sensory phenotypes per mouse gene
    if not mgi_sensory.is_empty():
        mgi_counts = (
            mgi_sensory
            .group_by("mouse_gene")
            .agg([
                pl.col("mp_term_name").count().alias("mgi_phenotype_count"),
                pl.col("mp_term_name").str.concat("; ").alias("mgi_terms"),
            ])
        )
    else:
        mgi_counts = pl.DataFrame({
            "mouse_gene": [],
            "mgi_phenotype_count": [],
            "mgi_terms": [],
        })
    # Count sensory phenotypes per zebrafish gene
    if not zfin_sensory.is_empty():
        zfin_counts = (
            zfin_sensory
            .group_by("zebrafish_gene")
            .agg([
                pl.col("zp_term_name").count().alias("zfin_phenotype_count"),
                pl.col("zp_term_name").str.concat("; ").alias("zfin_terms"),
            ])
        )
    else:
        zfin_counts = pl.DataFrame({
            "zebrafish_gene": [],
            "zfin_phenotype_count": [],
            "zfin_terms": [],
        })
    # Count sensory phenotypes per mouse gene from IMPC
    if not impc_sensory.is_empty():
        impc_counts = (
            impc_sensory
            .group_by("mouse_gene")
            .agg([
                pl.col("mp_term_name").count().alias("impc_phenotype_count"),
                pl.col("mp_term_name").str.concat("; ").alias("impc_terms"),
            ])
        )
    else:
        impc_counts = pl.DataFrame({
            "mouse_gene": [],
            "impc_phenotype_count": [],
            "impc_terms": [],
        })
    # Step 4: Join phenotype data with ortholog mappings
    logger.info("step_4_join_data")
    result = (
        orthologs
        # Join MGI phenotypes
        .join(
            mgi_counts,
            left_on="mouse_ortholog",
            right_on="mouse_gene",
            how="left"
        )
        # Join ZFIN phenotypes
        .join(
            zfin_counts,
            left_on="zebrafish_ortholog",
            right_on="zebrafish_gene",
            how="left"
        )
        # Join IMPC phenotypes
        .join(
            impc_counts,
            left_on="mouse_ortholog",
            right_on="mouse_gene",
            how="left"
        )
        # Add flags
        .with_columns([
            (pl.col("mgi_phenotype_count") > 0).alias("has_mouse_phenotype"),
            (pl.col("zfin_phenotype_count") > 0).alias("has_zebrafish_phenotype"),
            (pl.col("impc_phenotype_count") > 0).alias("has_impc_phenotype"),
        ])
        # Calculate total sensory phenotype count
        .with_columns([
            (
                pl.col("mgi_phenotype_count").fill_null(0) +
                pl.col("zfin_phenotype_count").fill_null(0) +
                pl.col("impc_phenotype_count").fill_null(0)
            ).alias("sensory_phenotype_count")
        ])
        # Combine phenotype terms
        .with_columns([
            pl.concat_str([
                pl.col("mgi_terms").fill_null(""),
                pl.col("zfin_terms").fill_null(""),
                pl.col("impc_terms").fill_null(""),
            ], separator="; ").str.replace_all("; ; ", "; ").str.strip_chars("; ").alias("phenotype_categories")
        ])
        # Set sensory_phenotype_count to NULL if zero (preserve NULL pattern)
        .with_columns([
            pl.when(pl.col("sensory_phenotype_count") == 0)
            .then(None)
            .otherwise(pl.col("sensory_phenotype_count"))
            .alias("sensory_phenotype_count")
        ])
        # Select final columns
        .select([
            "gene_id",
            "mouse_ortholog",
            "mouse_ortholog_confidence",
            "zebrafish_ortholog",
            "zebrafish_ortholog_confidence",
            "has_mouse_phenotype",
            "has_zebrafish_phenotype",
            "has_impc_phenotype",
            "sensory_phenotype_count",
            "phenotype_categories",
        ])
    )
    # Step 5: Score evidence
    logger.info("step_5_score_evidence")
    result = score_animal_evidence(result)
    logger.info(
        "process_animal_model_evidence_complete",
        total_genes=len(result),
        with_orthologs=result.filter(
            pl.col("mouse_ortholog").is_not_null() | pl.col("zebrafish_ortholog").is_not_null()
        ).height,
        with_sensory_phenotypes=result.filter(
            pl.col("sensory_phenotype_count").is_not_null()
        ).height,
    )
    return result