feat(03-05): implement animal model evidence fetch and transform
- models.py: AnimalModelRecord with ortholog confidence, phenotype flags, and normalized scoring
- fetch.py: Retrieve orthologs from HCOP, phenotypes from MGI/ZFIN/IMPC with retry
- transform.py: Filter sensory/cilia-relevant phenotypes, score with confidence weighting
- Ortholog confidence: HIGH (8+ sources), MEDIUM (4-7), LOW (1-3)
- Scoring: mouse +0.4 and zebrafish +0.3 (each weighted by ortholog confidence), IMPC +0.3 (unweighted)
- NULL preservation: no ortholog = NULL score (not zero)
This commit is contained in:
42
src/usher_pipeline/evidence/animal_models/__init__.py
Normal file
42
src/usher_pipeline/evidence/animal_models/__init__.py
Normal file
@@ -0,0 +1,42 @@
|
||||
"""Animal model phenotype evidence layer.
|
||||
|
||||
Retrieves knockout/perturbation phenotypes from:
|
||||
- MGI (Mouse Genome Informatics) - mouse phenotypes
|
||||
- ZFIN (Zebrafish Information Network) - zebrafish phenotypes
|
||||
- IMPC (International Mouse Phenotyping Consortium) - mouse phenotypes
|
||||
|
||||
Maps human genes to model organism orthologs with confidence scoring,
|
||||
filters for sensory/cilia-relevant phenotypes, and scores evidence.
|
||||
"""
|
||||
|
||||
from usher_pipeline.evidence.animal_models.models import (
|
||||
AnimalModelRecord,
|
||||
ANIMAL_TABLE_NAME,
|
||||
SENSORY_MP_KEYWORDS,
|
||||
SENSORY_ZP_KEYWORDS,
|
||||
)
|
||||
from usher_pipeline.evidence.animal_models.fetch import (
|
||||
fetch_ortholog_mapping,
|
||||
fetch_mgi_phenotypes,
|
||||
fetch_zfin_phenotypes,
|
||||
fetch_impc_phenotypes,
|
||||
)
|
||||
from usher_pipeline.evidence.animal_models.transform import (
|
||||
filter_sensory_phenotypes,
|
||||
score_animal_evidence,
|
||||
process_animal_model_evidence,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"AnimalModelRecord",
|
||||
"ANIMAL_TABLE_NAME",
|
||||
"SENSORY_MP_KEYWORDS",
|
||||
"SENSORY_ZP_KEYWORDS",
|
||||
"fetch_ortholog_mapping",
|
||||
"fetch_mgi_phenotypes",
|
||||
"fetch_zfin_phenotypes",
|
||||
"fetch_impc_phenotypes",
|
||||
"filter_sensory_phenotypes",
|
||||
"score_animal_evidence",
|
||||
"process_animal_model_evidence",
|
||||
]
|
||||
497
src/usher_pipeline/evidence/animal_models/fetch.py
Normal file
497
src/usher_pipeline/evidence/animal_models/fetch.py
Normal file
@@ -0,0 +1,497 @@
|
||||
"""Fetch animal model phenotype data and ortholog mappings."""
|
||||
|
||||
import gzip
|
||||
import io
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import httpx
|
||||
import polars as pl
|
||||
import structlog
|
||||
from tenacity import (
|
||||
retry,
|
||||
stop_after_attempt,
|
||||
wait_exponential,
|
||||
retry_if_exception_type,
|
||||
)
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
# HCOP ortholog database URLs
|
||||
HCOP_HUMAN_MOUSE_URL = "https://ftp.ebi.ac.uk/pub/databases/genenames/hcop/human_mouse_hcop_fifteen_column.txt.gz"
|
||||
HCOP_HUMAN_ZEBRAFISH_URL = "https://ftp.ebi.ac.uk/pub/databases/genenames/hcop/human_zebrafish_hcop_fifteen_column.txt.gz"
|
||||
|
||||
# MGI phenotype report URL
|
||||
MGI_GENE_PHENO_URL = "https://www.informatics.jax.org/downloads/reports/MGI_GenePheno.rpt"
|
||||
|
||||
# ZFIN phenotype data URL
|
||||
ZFIN_PHENO_URL = "https://zfin.org/downloads/phenoGeneCleanData_fish.txt"
|
||||
|
||||
# IMPC API base URL
|
||||
IMPC_API_BASE = "https://www.ebi.ac.uk/mi/impc/solr/genotype-phenotype/select"
|
||||
|
||||
|
||||
@retry(
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=4, max=60),
    retry=retry_if_exception_type(
        (httpx.HTTPStatusError, httpx.ConnectError, httpx.TimeoutException)
    ),
)
def _download_gzipped(url: str) -> bytes:
    """Download a gzip-compressed file and return its decompressed content.

    The request is attempted up to 5 times with exponential back-off when it
    fails with an HTTP status error, a connection error, or a timeout.
    Errors raised by ``gzip.decompress`` are not in the retry list.

    Args:
        url: URL to download

    Returns:
        Decompressed file content as bytes
    """
    logger.info("download_start", url=url)

    with httpx.stream("GET", url, timeout=120.0, follow_redirects=True) as response:
        response.raise_for_status()

        # Join the chunks once: repeated ``bytes +=`` re-copies the whole
        # buffer on every chunk, which is quadratic for large downloads.
        compressed_data = b"".join(response.iter_bytes(chunk_size=8192))

    logger.info(
        "decompress_start",
        compressed_size_mb=round(len(compressed_data) / 1024 / 1024, 2),
    )
    decompressed = gzip.decompress(compressed_data)
    logger.info(
        "decompress_complete",
        decompressed_size_mb=round(len(decompressed) / 1024 / 1024, 2),
    )

    return decompressed
|
||||
|
||||
|
||||
@retry(
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=4, max=60),
    retry=retry_if_exception_type(
        (httpx.HTTPStatusError, httpx.ConnectError, httpx.TimeoutException)
    ),
)
def _download_text(url: str) -> str:
    """Download a text file with retry.

    The request is attempted up to 5 times with exponential back-off when it
    fails with an HTTP status error, a connection error, or a timeout.

    Args:
        url: URL to download

    Returns:
        File content as string
    """
    logger.info("download_text_start", url=url)

    with httpx.stream("GET", url, timeout=120.0, follow_redirects=True) as response:
        response.raise_for_status()
        # A streamed response has no body loaded yet; accessing ``.text``
        # before ``.read()`` raises ``httpx.ResponseNotRead``.
        response.read()
        content = response.text

    logger.info("download_text_complete", size_mb=round(len(content) / 1024 / 1024, 2))
    return content
|
||||
|
||||
|
||||
def _select_best_hcop_orthologs(
    hcop_df: pl.DataFrame,
    gene_ids: list[str],
    species: str,
) -> pl.DataFrame:
    """Pick the best-supported ortholog per human gene from one HCOP table.

    Args:
        hcop_df: Parsed HCOP table; must contain ``human_ensembl_gene``,
            ``support`` and ``<species>_symbol`` columns
        gene_ids: Human gene IDs to keep
        species: Column prefix of the target organism (``"mouse"`` or
            ``"zebrafish"``)

    Returns:
        DataFrame with columns ``gene_id``, ``<species>_ortholog`` and
        ``<species>_ortholog_confidence`` (one row per mapped gene)
    """
    ortholog_col = f"{species}_ortholog"
    confidence_col = f"{species}_ortholog_confidence"

    return (
        hcop_df
        .filter(pl.col("human_ensembl_gene").is_in(gene_ids))
        .select([
            pl.col("human_ensembl_gene").alias("gene_id"),
            pl.col(f"{species}_symbol").alias(ortholog_col),
            # A missing support list counts as zero sources; left as null it
            # would sort first in a descending sort and win the selection.
            pl.col("support").str.split(",").list.len().fill_null(0).alias("support_count"),
        ])
        .with_columns([
            pl.when(pl.col("support_count") >= 8)
            .then(pl.lit("HIGH"))
            .when(pl.col("support_count") >= 4)
            .then(pl.lit("MEDIUM"))
            .otherwise(pl.lit("LOW"))
            .alias(confidence_col)
        ])
        # The ortholog symbol is a secondary key so that ties on support
        # count are resolved the same way on every run.
        .sort(
            ["gene_id", "support_count", ortholog_col],
            descending=[False, True, False],
        )
        .group_by("gene_id", maintain_order=True)
        .first()
        .select(["gene_id", ortholog_col, confidence_col])
    )


def fetch_ortholog_mapping(gene_ids: list[str]) -> pl.DataFrame:
    """Fetch human-to-mouse and human-to-zebrafish ortholog mappings from HCOP.

    Downloads HCOP ortholog data, assigns confidence levels based on the
    number of supporting databases, and handles one-to-many mappings by
    selecting the ortholog with the highest support count.

    Confidence scoring:
    - HIGH: 8+ supporting databases
    - MEDIUM: 4-7 supporting databases
    - LOW: fewer than 4 supporting databases

    Args:
        gene_ids: List of human gene IDs (ENSG format)

    Returns:
        DataFrame with one row per entry of ``gene_ids`` and columns:
        - gene_id: Human gene ID
        - mouse_ortholog: Mouse gene symbol (null if unmapped)
        - mouse_ortholog_confidence: HIGH/MEDIUM/LOW (null if unmapped)
        - zebrafish_ortholog: Zebrafish gene symbol (null if unmapped)
        - zebrafish_ortholog_confidence: HIGH/MEDIUM/LOW (null if unmapped)
    """
    logger.info("fetch_ortholog_mapping_start", gene_count=len(gene_ids))

    # Human-mouse HCOP data
    # HCOP columns: human_entrez_gene, human_ensembl_gene, hgnc_id, human_name, human_symbol,
    #               human_chr, human_assert_ids, mouse_entrez_gene, mouse_ensembl_gene,
    #               mgi_id, mouse_name, mouse_symbol, mouse_chr, mouse_assert_ids, support
    logger.info("fetch_hcop_mouse")
    mouse_df = pl.read_csv(
        io.BytesIO(_download_gzipped(HCOP_HUMAN_MOUSE_URL)),
        separator="\t",
        null_values=["", "NA"],
    )
    logger.info("hcop_mouse_columns", columns=mouse_df.columns)
    mouse_orthologs = _select_best_hcop_orthologs(mouse_df, gene_ids, "mouse")
    logger.info("mouse_orthologs_mapped", count=len(mouse_orthologs))

    # Human-zebrafish HCOP data
    logger.info("fetch_hcop_zebrafish")
    zebrafish_df = pl.read_csv(
        io.BytesIO(_download_gzipped(HCOP_HUMAN_ZEBRAFISH_URL)),
        separator="\t",
        null_values=["", "NA"],
    )
    logger.info("hcop_zebrafish_columns", columns=zebrafish_df.columns)
    zebrafish_orthologs = _select_best_hcop_orthologs(zebrafish_df, gene_ids, "zebrafish")
    logger.info("zebrafish_orthologs_mapped", count=len(zebrafish_orthologs))

    # Base frame with every requested gene so unmapped genes survive as nulls.
    # The explicit schema keeps the join key a string column even when
    # ``gene_ids`` is empty.
    base_df = pl.DataFrame({"gene_id": gene_ids}, schema={"gene_id": pl.Utf8})

    result = (
        base_df
        .join(mouse_orthologs, on="gene_id", how="left")
        .join(zebrafish_orthologs, on="gene_id", how="left")
    )

    logger.info(
        "fetch_ortholog_mapping_complete",
        total_genes=len(result),
        mouse_mapped=result.filter(pl.col("mouse_ortholog").is_not_null()).height,
        zebrafish_mapped=result.filter(pl.col("zebrafish_ortholog").is_not_null()).height,
    )

    return result
|
||||
|
||||
|
||||
def fetch_mgi_phenotypes(mouse_gene_symbols: list[str]) -> pl.DataFrame:
    """Fetch mouse phenotype data from MGI (Mouse Genome Informatics).

    Downloads the MGI gene-phenotype report and extracts phenotype term IDs
    for the specified mouse genes.

    Args:
        mouse_gene_symbols: List of mouse gene symbols

    Returns:
        DataFrame with string columns:
        - mouse_gene: Mouse gene symbol
        - mp_term_id: Mammalian Phenotype term ID
        - mp_term_name: Always null (the term name is not taken from this report)

        The frame is empty (but still typed) when no symbols are given or the
        expected columns cannot be located in the report.
    """
    # Explicit schema: untyped empty/null columns get the Null dtype, which
    # breaks ``.str`` operations and join-key matching downstream.
    output_schema = {
        "mouse_gene": pl.Utf8,
        "mp_term_id": pl.Utf8,
        "mp_term_name": pl.Utf8,
    }

    if not mouse_gene_symbols:
        logger.info("fetch_mgi_phenotypes_skip", reason="no_mouse_genes")
        return pl.DataFrame(schema=output_schema)

    logger.info("fetch_mgi_phenotypes_start", gene_count=len(mouse_gene_symbols))

    # Download MGI phenotype report
    content = _download_text(MGI_GENE_PHENO_URL)

    # Parse TSV (skip first line if it's a comment)
    lines = content.strip().split("\n")
    if lines[0].startswith("#"):
        lines = lines[1:]

    # NOTE(review): this assumes the report ships a header row. If it does
    # not, the first data row becomes the header and the column detection
    # below will fail -- confirm against the current MGI download.
    df = pl.read_csv(
        io.StringIO("\n".join(lines)),
        separator="\t",
        null_values=["", "NA"],
        has_header=True,
    )

    logger.info("mgi_raw_columns", columns=df.columns)

    # Locate the marker-symbol and MP-ID columns by (case-insensitive) name.
    # When several columns match, the last match wins.
    marker_col = None
    mp_id_col = None

    for col in df.columns:
        col_lower = col.lower()
        if "marker" in col_lower and "symbol" in col_lower:
            marker_col = col
        elif "mammalian phenotype id" in col_lower or "mp id" in col_lower:
            mp_id_col = col

    if marker_col is None or mp_id_col is None:
        logger.warning("mgi_column_detection_failed", columns=df.columns)
        return pl.DataFrame(schema=output_schema)

    # The report may have one row per allele-phenotype combination, so keep
    # only the unique (gene, phenotype) pairs for the genes of interest.
    result = (
        df
        .filter(pl.col(marker_col).is_in(mouse_gene_symbols))
        .select([
            pl.col(marker_col).cast(pl.Utf8).alias("mouse_gene"),
            pl.col(mp_id_col).cast(pl.Utf8).alias("mp_term_id"),
            # Term name not in this report; typed null instead of Null dtype
            pl.lit(None).cast(pl.Utf8).alias("mp_term_name"),
        ])
        .unique()
    )

    logger.info("fetch_mgi_phenotypes_complete", phenotype_count=len(result))

    return result
|
||||
|
||||
|
||||
def fetch_zfin_phenotypes(zebrafish_gene_symbols: list[str]) -> pl.DataFrame:
    """Fetch zebrafish phenotype data from ZFIN.

    Downloads ZFIN phenotype data and extracts phenotype terms for the
    specified zebrafish genes.

    Args:
        zebrafish_gene_symbols: List of zebrafish gene symbols

    Returns:
        DataFrame with string columns:
        - zebrafish_gene: Zebrafish gene symbol
        - zp_term_id: Always null (no term ID is extracted here)
        - zp_term_name: Value of the first phenotype-related column

        The frame is empty (but still typed) when no symbols are given or the
        expected columns cannot be located in the download.
    """
    # Explicit schema: untyped empty/null columns get the Null dtype, which
    # breaks ``.str`` operations and join-key matching downstream.
    output_schema = {
        "zebrafish_gene": pl.Utf8,
        "zp_term_id": pl.Utf8,
        "zp_term_name": pl.Utf8,
    }

    if not zebrafish_gene_symbols:
        logger.info("fetch_zfin_phenotypes_skip", reason="no_zebrafish_genes")
        return pl.DataFrame(schema=output_schema)

    logger.info("fetch_zfin_phenotypes_start", gene_count=len(zebrafish_gene_symbols))

    # Download ZFIN phenotype data
    content = _download_text(ZFIN_PHENO_URL)

    # NOTE(review): assumes the download has a header row -- confirm against
    # the current ZFIN file, otherwise column detection below always fails.
    df = pl.read_csv(
        io.StringIO(content),
        separator="\t",
        null_values=["", "NA"],
        has_header=True,
    )

    logger.info("zfin_raw_columns", columns=df.columns)

    # Locate the gene and phenotype columns by (case-insensitive) name.
    # Gene column: last column mentioning "gene" plus "symbol" or "name".
    # Phenotype column: first other column mentioning phenotype/structure/process.
    gene_col = None
    pheno_col = None

    for col in df.columns:
        col_lower = col.lower()
        if "gene" in col_lower and ("symbol" in col_lower or "name" in col_lower):
            gene_col = col
        elif "phenotype" in col_lower or "structure" in col_lower or "process" in col_lower:
            if pheno_col is None:  # Take first phenotype-related column
                pheno_col = col

    if gene_col is None or pheno_col is None:
        logger.warning("zfin_column_detection_failed", columns=df.columns)
        return pl.DataFrame(schema=output_schema)

    # Keep the unique (gene, phenotype) pairs for the genes of interest
    result = (
        df
        .filter(pl.col(gene_col).is_in(zebrafish_gene_symbols))
        .select([
            pl.col(gene_col).cast(pl.Utf8).alias("zebrafish_gene"),
            # Typed null instead of a Null-dtype column
            pl.lit(None).cast(pl.Utf8).alias("zp_term_id"),
            pl.col(pheno_col).cast(pl.Utf8).alias("zp_term_name"),
        ])
        .unique()
    )

    logger.info("fetch_zfin_phenotypes_complete", phenotype_count=len(result))

    return result
|
||||
|
||||
|
||||
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=2, max=30),
    retry=retry_if_exception_type(
        (httpx.HTTPStatusError, httpx.ConnectError, httpx.TimeoutException)
    ),
)
def _query_impc_batch(gene_symbols: list[str]) -> pl.DataFrame:
    """Query IMPC API for a batch of genes.

    The request is attempted up to 3 times with exponential back-off when it
    fails with an HTTP status error, a connection error, or a timeout.

    Args:
        gene_symbols: List of mouse gene symbols (batch)

    Returns:
        DataFrame with columns mouse_gene, mp_term_id, mp_term_name (strings)
        and p_value (float). Documents lacking a marker symbol or MP term ID
        are skipped. The frame is empty but typed when nothing matches.
    """
    # Fixed schema so every batch has identical dtypes, independent of which
    # values happen to be null in the first rows.
    output_schema = {
        "mouse_gene": pl.Utf8,
        "mp_term_id": pl.Utf8,
        "mp_term_name": pl.Utf8,
        "p_value": pl.Float64,
    }

    if not gene_symbols:
        # "marker_symbol:()" is not a valid query; nothing to ask for.
        return pl.DataFrame(schema=output_schema)

    # Build query: marker_symbol:("gene1" OR "gene2" OR ...)
    # Symbols are quoted (with backslash and quote escaped) so characters that
    # are special in the query syntax cannot change the query's meaning.
    quoted_symbols = [
        '"' + symbol.replace("\\", "\\\\").replace('"', '\\"') + '"'
        for symbol in gene_symbols
    ]
    query = "marker_symbol:(" + " OR ".join(quoted_symbols) + ")"

    params = {
        "q": query,
        # NOTE(review): results beyond 10000 rows per batch are not fetched.
        "rows": 10000,
        "wt": "json",
    }

    logger.info("impc_query_batch", gene_count=len(gene_symbols))

    response = httpx.get(IMPC_API_BASE, params=params, timeout=60.0)
    response.raise_for_status()

    data = response.json()
    docs = data.get("response", {}).get("docs", [])

    if not docs:
        return pl.DataFrame(schema=output_schema)

    # Extract relevant fields, coercing to the schema's Python types
    records = []
    for doc in docs:
        gene = doc.get("marker_symbol")
        mp_id = doc.get("mp_term_id")
        mp_name = doc.get("mp_term_name")
        p_value = doc.get("p_value")

        if gene and mp_id:
            records.append({
                "mouse_gene": str(gene),
                "mp_term_id": str(mp_id),
                "mp_term_name": None if mp_name is None else str(mp_name),
                "p_value": None if p_value is None else float(p_value),
            })

    # With an explicit schema an empty ``records`` list still yields the
    # four expected columns.
    df = pl.DataFrame(records, schema=output_schema)
    logger.info("impc_batch_complete", phenotype_count=len(df))

    return df
|
||||
|
||||
|
||||
def fetch_impc_phenotypes(mouse_gene_symbols: list[str]) -> pl.DataFrame:
    """Fetch mouse phenotype data from IMPC (International Mouse Phenotyping Consortium).

    Queries the IMPC SOLR API in batches to get phenotype data for mouse genes.
    Includes statistical significance (p-value) for each phenotype. A batch
    that fails is logged and skipped so the remaining batches still run.

    Args:
        mouse_gene_symbols: List of mouse gene symbols

    Returns:
        DataFrame with columns:
        - mouse_gene: Mouse gene symbol
        - mp_term_id: Mammalian Phenotype term ID
        - mp_term_name: Mammalian Phenotype term name
        - p_value: Statistical significance of phenotype

        The frame is empty (but still typed) when no symbols are given or no
        batch produced any rows.
    """
    # Explicit schema so empty results still have usable column dtypes
    output_schema = {
        "mouse_gene": pl.Utf8,
        "mp_term_id": pl.Utf8,
        "mp_term_name": pl.Utf8,
        "p_value": pl.Float64,
    }

    if not mouse_gene_symbols:
        logger.info("fetch_impc_phenotypes_skip", reason="no_mouse_genes")
        return pl.DataFrame(schema=output_schema)

    logger.info("fetch_impc_phenotypes_start", gene_count=len(mouse_gene_symbols))

    # Several human genes can share one mouse ortholog; query each symbol
    # once (order preserved). The final ``unique()`` made duplicates moot anyway.
    symbols = list(dict.fromkeys(mouse_gene_symbols))

    # Query in batches of 50 to avoid overloading API
    batch_size = 50
    all_results = []

    for i in range(0, len(symbols), batch_size):
        batch = symbols[i:i + batch_size]
        try:
            batch_df = _query_impc_batch(batch)
        except Exception as e:  # deliberate best-effort: one bad batch must not abort the rest
            logger.warning("impc_batch_failed", batch_index=i // batch_size, error=str(e))
            continue

        # Empty frames carry no rows and may carry untyped columns; skip them
        # and align the rest to one schema before concatenation.
        if not batch_df.is_empty():
            all_results.append(
                batch_df.select(
                    [pl.col(name).cast(dtype) for name, dtype in output_schema.items()]
                )
            )

    if not all_results:
        logger.warning("fetch_impc_phenotypes_no_results")
        return pl.DataFrame(schema=output_schema)

    # Combine all batches
    result = pl.concat(all_results, how="vertical_relaxed").unique()

    logger.info("fetch_impc_phenotypes_complete", total_phenotypes=len(result))

    return result
|
||||
119
src/usher_pipeline/evidence/animal_models/models.py
Normal file
119
src/usher_pipeline/evidence/animal_models/models.py
Normal file
@@ -0,0 +1,119 @@
|
||||
"""Data models for animal model phenotype evidence."""
|
||||
|
||||
from typing import Optional
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
# Name of the DuckDB table that stores this evidence layer
ANIMAL_TABLE_NAME = "animal_model_phenotypes"


# Keywords used to flag Mammalian Phenotype (MP) terms as sensory/cilia-relevant.
SENSORY_MP_KEYWORDS = [
    # hearing and balance
    "hearing", "deaf", "vestibular", "balance",
    # vision
    "retina", "photoreceptor", "vision", "blind",
    # inner-ear structures
    "cochlea", "stereocilia",
    # cilia / flagella
    "cilia", "cilium", "flagellum",
    # ciliopathy-associated findings
    "situs inversus", "laterality", "hydrocephalus", "kidney cyst", "polycystic",
]


# Keywords used to flag Zebrafish Phenotype (ZP) terms as sensory/cilia-relevant.
# NOTE(review): if these are matched as plain substrings, short entries such
# as "ear" and "eye" also hit unrelated words (e.g. "heart", "early") --
# confirm that this is intended.
SENSORY_ZP_KEYWORDS = [
    # hearing and balance
    "hearing", "deaf", "vestibular", "balance",
    # vision
    "retina", "photoreceptor", "vision", "blind",
    # zebrafish sensory organs
    "eye", "ear", "otolith", "lateral line",
    # cilia / flagella
    "cilia", "cilium", "flagellum",
    # ciliopathy-associated findings
    "situs", "laterality", "hydrocephalus", "kidney cyst", "pronephros",
]
|
||||
|
||||
|
||||
class AnimalModelRecord(BaseModel):
    """Animal model phenotype evidence for a single human gene.

    Only ``gene_id`` and ``gene_symbol`` are required; every other field
    defaults to ``None``.

    Attributes:
        gene_id: Human gene ID (ENSG)
        gene_symbol: Human gene symbol
        mouse_ortholog: Mouse gene symbol/ID
        mouse_ortholog_confidence: Ortholog confidence (HIGH/MEDIUM/LOW)
        zebrafish_ortholog: Zebrafish gene symbol/ID
        zebrafish_ortholog_confidence: Ortholog confidence (HIGH/MEDIUM/LOW)
        has_mouse_phenotype: Mouse ortholog has phenotypes in MGI
        has_zebrafish_phenotype: Zebrafish ortholog has phenotypes in ZFIN
        has_impc_phenotype: Mouse ortholog has phenotypes in IMPC
        sensory_phenotype_count: Number of sensory-relevant phenotypes
        phenotype_categories: Semicolon-separated matched phenotype terms
        animal_model_score_normalized: Composite evidence score, validated to
            lie within [0, 1]
    """

    # --- Human gene identity (required) ---
    gene_id: str = Field(..., description="Human gene ID (ENSG)")
    gene_symbol: str = Field(..., description="Human gene symbol")

    # --- Ortholog mapping ---
    mouse_ortholog: Optional[str] = Field(
        default=None, description="Mouse gene symbol/ID"
    )
    mouse_ortholog_confidence: Optional[str] = Field(
        default=None, description="Ortholog confidence: HIGH/MEDIUM/LOW"
    )
    zebrafish_ortholog: Optional[str] = Field(
        default=None, description="Zebrafish gene symbol/ID"
    )
    zebrafish_ortholog_confidence: Optional[str] = Field(
        default=None, description="Ortholog confidence: HIGH/MEDIUM/LOW"
    )

    # --- Per-source phenotype flags ---
    has_mouse_phenotype: Optional[bool] = Field(
        default=None, description="Mouse ortholog has phenotypes in MGI"
    )
    has_zebrafish_phenotype: Optional[bool] = Field(
        default=None, description="Zebrafish ortholog has phenotypes in ZFIN"
    )
    has_impc_phenotype: Optional[bool] = Field(
        default=None, description="Mouse ortholog has phenotypes in IMPC"
    )

    # --- Aggregated phenotype summary ---
    sensory_phenotype_count: Optional[int] = Field(
        default=None, description="Number of sensory-relevant phenotypes"
    )
    phenotype_categories: Optional[str] = Field(
        default=None, description="Semicolon-separated matched phenotype terms"
    )

    # --- Final score ---
    animal_model_score_normalized: Optional[float] = Field(
        default=None,
        ge=0.0,
        le=1.0,
        description="Composite animal model evidence score (0-1)",
    )
|
||||
361
src/usher_pipeline/evidence/animal_models/transform.py
Normal file
361
src/usher_pipeline/evidence/animal_models/transform.py
Normal file
@@ -0,0 +1,361 @@
|
||||
"""Transform animal model phenotype data: filter and score."""
|
||||
|
||||
import polars as pl
|
||||
import structlog
|
||||
|
||||
from usher_pipeline.evidence.animal_models.models import (
|
||||
SENSORY_MP_KEYWORDS,
|
||||
SENSORY_ZP_KEYWORDS,
|
||||
)
|
||||
from usher_pipeline.evidence.animal_models.fetch import (
|
||||
fetch_ortholog_mapping,
|
||||
fetch_mgi_phenotypes,
|
||||
fetch_zfin_phenotypes,
|
||||
fetch_impc_phenotypes,
|
||||
)
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
def filter_sensory_phenotypes(
    phenotype_df: pl.DataFrame,
    keywords: list[str],
    term_column: str = "mp_term_name"
) -> pl.DataFrame:
    """Filter phenotypes for sensory/cilia relevance using keyword matching.

    Performs case-insensitive literal substring matching against phenotype
    terms. Returns only rows where the phenotype term matches at least one
    keyword; rows with a null term never match.

    Args:
        phenotype_df: DataFrame with phenotype terms
        keywords: List of keywords to match (e.g., SENSORY_MP_KEYWORDS)
        term_column: Name of column containing phenotype term names

    Returns:
        Filtered DataFrame with only sensory-relevant phenotypes. An empty
        input is returned unchanged.
    """
    if phenotype_df.is_empty():
        return phenotype_df

    logger.info("filter_sensory_phenotypes_start", row_count=len(phenotype_df))

    # Lowercase the term once. The cast lets an all-null (Null dtype) column
    # go through the string namespace instead of raising.
    term_lower = pl.col(term_column).cast(pl.Utf8).str.to_lowercase()

    # Match if ANY keyword appears as substring in term. ``literal=True``
    # keeps keywords from being interpreted as regular expressions.
    match_condition = pl.lit(False)
    for keyword in keywords:
        match_condition = match_condition | term_lower.str.contains(
            keyword.lower(), literal=True
        )

    # Filter phenotypes
    filtered = phenotype_df.filter(match_condition)

    logger.info(
        "filter_sensory_phenotypes_complete",
        input_count=len(phenotype_df),
        output_count=len(filtered),
        # phenotype_df is non-empty here (see the early return above)
        filtered_pct=round(100 * len(filtered) / len(phenotype_df), 1),
    )

    return filtered
|
||||
|
||||
|
||||
def _confidence_weight(confidence_column: str) -> pl.Expr:
    """Map an ortholog-confidence column to its numeric weight.

    HIGH -> 1.0, MEDIUM -> 0.7, LOW -> 0.4, anything else (including null) -> 0.0.
    """
    confidence = pl.col(confidence_column)
    return (
        pl.when(confidence == "HIGH").then(1.0)
        .when(confidence == "MEDIUM").then(0.7)
        .when(confidence == "LOW").then(0.4)
        .otherwise(0.0)
    )


def score_animal_evidence(df: pl.DataFrame) -> pl.DataFrame:
    """Compute animal model evidence scores with ortholog confidence weighting.

    Scoring formula:
    - Base score = 0 if no phenotypes
    - For each source with sensory phenotypes:
        * Mouse (MGI): +0.4 weighted by mouse ortholog confidence
        * Zebrafish (ZFIN): +0.3 weighted by zebrafish ortholog confidence
        * IMPC: +0.3 (independent confirmation bonus, not weighted)
    - Confidence weighting: HIGH=1.0, MEDIUM=0.7, LOW=0.4
    - Multiply by log2(sensory_phenotype_count + 1) / log2(max_count + 1) to
      reward multiple phenotypes; a null count scales the score to 0
    - Clamp to [0, 1]
    - NULL if neither a mouse nor a zebrafish ortholog exists

    Args:
        df: DataFrame with the columns mouse_ortholog, zebrafish_ortholog,
            mouse_ortholog_confidence, zebrafish_ortholog_confidence,
            has_mouse_phenotype, has_zebrafish_phenotype, has_impc_phenotype
            and sensory_phenotype_count

    Returns:
        DataFrame with added animal_model_score_normalized column
    """
    import math

    logger.info("score_animal_evidence_start", gene_count=len(df))

    # Per-source contributions. A null flag compares as null and therefore
    # falls through to the 0.0 branch.
    mouse_score = (
        pl.when(pl.col("has_mouse_phenotype") == True)  # noqa: E712
        .then(0.4 * _confidence_weight("mouse_ortholog_confidence"))
        .otherwise(0.0)
    )
    zebrafish_score = (
        pl.when(pl.col("has_zebrafish_phenotype") == True)  # noqa: E712
        .then(0.3 * _confidence_weight("zebrafish_ortholog_confidence"))
        .otherwise(0.0)
    )
    # IMPC acts as independent confirmation and is not confidence-weighted
    impc_score = (
        pl.when(pl.col("has_impc_phenotype") == True)  # noqa: E712
        .then(0.3)
        .otherwise(0.0)
    )

    base_score = mouse_score + zebrafish_score + impc_score

    # Largest phenotype count in this frame, used to normalize the scaling
    max_count = df.select(pl.col("sensory_phenotype_count").max()).item()
    if max_count is None or max_count == 0:
        max_count = 1  # Avoid division by zero

    # Diminishing returns via log: log2(count + 1) / log2(max_count + 1)
    max_log = math.log2(max_count + 1)
    phenotype_count = pl.col("sensory_phenotype_count").cast(pl.Float64)
    phenotype_scaling = (
        pl.when(phenotype_count.is_not_null())
        .then((phenotype_count + 1).log(2) / max_log)
        .otherwise(0.0)
    )

    # Final score: base_score * phenotype_scaling, clamped to [0, 1];
    # NULL (not zero) when there is no ortholog mapping at all.
    animal_model_score = (
        pl.when(
            pl.col("mouse_ortholog").is_null() & pl.col("zebrafish_ortholog").is_null()
        )
        .then(None)
        .otherwise((base_score * phenotype_scaling).clip(0.0, 1.0))
        .alias("animal_model_score_normalized")
    )

    result = df.with_columns([animal_model_score])

    logger.info(
        "score_animal_evidence_complete",
        scored_genes=result.filter(pl.col("animal_model_score_normalized").is_not_null()).height,
        null_genes=result.filter(pl.col("animal_model_score_normalized").is_null()).height,
    )

    return result
|
||||
|
||||
|
||||
def _summarize_sensory_terms(
    sensory_df: pl.DataFrame,
    gene_column: str,
    term_column: str,
    source: str,
) -> pl.DataFrame:
    """Count and concatenate the sensory phenotype terms of each gene.

    Args:
        sensory_df: Filtered phenotype rows of one source
        gene_column: Column holding the model-organism gene symbol
        term_column: Column holding the phenotype term name
        source: Prefix for the output columns (``mgi``, ``zfin`` or ``impc``)

    Returns:
        DataFrame with ``gene_column``, ``<source>_phenotype_count`` and
        ``<source>_terms`` ("; "-joined). An empty input gives an empty frame
        with the same, explicitly typed columns.
    """
    count_column = f"{source}_phenotype_count"
    terms_column = f"{source}_terms"

    if sensory_df.is_empty():
        # Typed columns: an untyped empty frame has Null-dtype columns, and a
        # Null-dtype join key does not match the string ortholog column.
        return pl.DataFrame(
            schema={
                gene_column: pl.Utf8,
                count_column: pl.UInt32,
                terms_column: pl.Utf8,
            }
        )

    return sensory_df.group_by(gene_column).agg([
        pl.col(term_column).count().alias(count_column),
        pl.col(term_column).str.concat("; ").alias(terms_column),
    ])


def process_animal_model_evidence(gene_ids: list[str]) -> pl.DataFrame:
    """End-to-end processing of animal model phenotype evidence.

    Executes the full pipeline:
    1. Fetch ortholog mappings (mouse and zebrafish)
    2. Fetch phenotypes from MGI, ZFIN, and IMPC and filter them for
       sensory/cilia-relevant terms
    3. Aggregate phenotypes by model-organism gene
    4. Join the aggregates onto the ortholog mappings
    5. Score evidence with confidence weighting

    Args:
        gene_ids: List of human gene IDs (ENSG format)

    Returns:
        DataFrame with one row per input gene and the columns gene_id,
        mouse_ortholog, mouse_ortholog_confidence, zebrafish_ortholog,
        zebrafish_ortholog_confidence, has_mouse_phenotype,
        has_zebrafish_phenotype, has_impc_phenotype, sensory_phenotype_count,
        phenotype_categories and animal_model_score_normalized
    """
    logger.info("process_animal_model_evidence_start", gene_count=len(gene_ids))

    # Step 1: Fetch ortholog mappings
    logger.info("step_1_fetch_orthologs")
    orthologs = fetch_ortholog_mapping(gene_ids)

    # Extract lists of orthologs to query
    mouse_genes = orthologs.filter(pl.col("mouse_ortholog").is_not_null())["mouse_ortholog"].to_list()
    zebrafish_genes = orthologs.filter(pl.col("zebrafish_ortholog").is_not_null())["zebrafish_ortholog"].to_list()

    logger.info(
        "orthologs_extracted",
        mouse_count=len(mouse_genes),
        zebrafish_count=len(zebrafish_genes),
    )

    # Step 2: Fetch phenotypes and keep the sensory/cilia-relevant ones
    logger.info("step_2_fetch_phenotypes")

    mgi_pheno = fetch_mgi_phenotypes(mouse_genes)
    mgi_sensory = filter_sensory_phenotypes(mgi_pheno, SENSORY_MP_KEYWORDS, "mp_term_name")

    zfin_pheno = fetch_zfin_phenotypes(zebrafish_genes)
    zfin_sensory = filter_sensory_phenotypes(zfin_pheno, SENSORY_ZP_KEYWORDS, "zp_term_name")

    impc_pheno = fetch_impc_phenotypes(mouse_genes)
    impc_sensory = filter_sensory_phenotypes(impc_pheno, SENSORY_MP_KEYWORDS, "mp_term_name")

    logger.info(
        "phenotypes_filtered",
        mgi_total=len(mgi_pheno),
        mgi_sensory=len(mgi_sensory),
        zfin_total=len(zfin_pheno),
        zfin_sensory=len(zfin_sensory),
        impc_total=len(impc_pheno),
        impc_sensory=len(impc_sensory),
    )

    # Step 3: Aggregate phenotypes by gene
    logger.info("step_3_aggregate_phenotypes")

    mgi_counts = _summarize_sensory_terms(mgi_sensory, "mouse_gene", "mp_term_name", "mgi")
    zfin_counts = _summarize_sensory_terms(zfin_sensory, "zebrafish_gene", "zp_term_name", "zfin")
    impc_counts = _summarize_sensory_terms(impc_sensory, "mouse_gene", "mp_term_name", "impc")

    # Step 4: Join phenotype data with ortholog mappings
    logger.info("step_4_join_data")

    result = (
        orthologs
        # Left joins keep genes without phenotypes (their counts stay null)
        .join(mgi_counts, left_on="mouse_ortholog", right_on="mouse_gene", how="left")
        .join(zfin_counts, left_on="zebrafish_ortholog", right_on="zebrafish_gene", how="left")
        .join(impc_counts, left_on="mouse_ortholog", right_on="mouse_gene", how="left")
        # Add flags (null when the source contributed no row for the gene)
        .with_columns([
            (pl.col("mgi_phenotype_count") > 0).alias("has_mouse_phenotype"),
            (pl.col("zfin_phenotype_count") > 0).alias("has_zebrafish_phenotype"),
            (pl.col("impc_phenotype_count") > 0).alias("has_impc_phenotype"),
        ])
        # Calculate total sensory phenotype count
        .with_columns([
            (
                pl.col("mgi_phenotype_count").fill_null(0) +
                pl.col("zfin_phenotype_count").fill_null(0) +
                pl.col("impc_phenotype_count").fill_null(0)
            ).alias("sensory_phenotype_count")
        ])
        # Combine phenotype terms; the replace/strip removes the separators
        # left behind by sources without terms.
        # NOTE(review): a gene without any terms ends up with "" here, not
        # NULL -- confirm whether that matches the NULL-preservation pattern.
        .with_columns([
            pl.concat_str([
                pl.col("mgi_terms").fill_null(""),
                pl.col("zfin_terms").fill_null(""),
                pl.col("impc_terms").fill_null(""),
            ], separator="; ").str.replace_all("; ; ", "; ").str.strip_chars("; ").alias("phenotype_categories")
        ])
        # Set sensory_phenotype_count to NULL if zero (preserve NULL pattern)
        .with_columns([
            pl.when(pl.col("sensory_phenotype_count") == 0)
            .then(None)
            .otherwise(pl.col("sensory_phenotype_count"))
            .alias("sensory_phenotype_count")
        ])
        # Select final columns
        .select([
            "gene_id",
            "mouse_ortholog",
            "mouse_ortholog_confidence",
            "zebrafish_ortholog",
            "zebrafish_ortholog_confidence",
            "has_mouse_phenotype",
            "has_zebrafish_phenotype",
            "has_impc_phenotype",
            "sensory_phenotype_count",
            "phenotype_categories",
        ])
    )

    # Step 5: Score evidence
    logger.info("step_5_score_evidence")
    result = score_animal_evidence(result)

    logger.info(
        "process_animal_model_evidence_complete",
        total_genes=len(result),
        with_orthologs=result.filter(
            pl.col("mouse_ortholog").is_not_null() | pl.col("zebrafish_ortholog").is_not_null()
        ).height,
        with_sensory_phenotypes=result.filter(
            pl.col("sensory_phenotype_count").is_not_null()
        ).height,
    )

    return result
|
||||
Reference in New Issue
Block a user