feat(03-05): add animal model DuckDB loader, CLI, and comprehensive tests

- load.py: DuckDB persistence with provenance tracking, ortholog confidence distribution stats
- CLI animal-models command: checkpoint-restart pattern, top scoring genes display
- 10 unit tests: ortholog confidence scoring, keyword filtering, multi-organism bonus, NULL preservation
- 4 integration tests: full pipeline, checkpoint-restart, provenance tracking, empty phenotype handling
- All tests pass (14/14): validates fetch->transform->load->CLI flow
- Fixed polars deprecations: str.join replaces str.concat, pl.len replaces pl.count
This commit is contained in:
2026-02-11 19:06:49 +08:00
parent 99bc975a2c
commit bcd3c4ffbe
4 changed files with 681 additions and 0 deletions

View File

@@ -26,6 +26,10 @@ from usher_pipeline.evidence.animal_models.transform import (
score_animal_evidence,
process_animal_model_evidence,
)
from usher_pipeline.evidence.animal_models.load import (
load_to_duckdb,
query_sensory_phenotype_genes,
)
__all__ = [
"AnimalModelRecord",
@@ -39,4 +43,6 @@ __all__ = [
"filter_sensory_phenotypes",
"score_animal_evidence",
"process_animal_model_evidence",
"load_to_duckdb",
"query_sensory_phenotype_genes",
]

View File

@@ -0,0 +1,126 @@
"""Load animal model phenotype data to DuckDB with provenance tracking."""
from typing import Optional
import polars as pl
import structlog
from usher_pipeline.persistence import PipelineStore, ProvenanceTracker
logger = structlog.get_logger()
def _confidence_distribution(rows: pl.DataFrame, confidence_col: str) -> list:
    """Count rows per distinct value of ``confidence_col``.

    Args:
        rows: Rows already restricted to genes that have the ortholog.
        confidence_col: Name of the confidence column to group on.

    Returns:
        A list of dicts (one per confidence value, with its row count),
        sorted by the confidence value. Empty list when ``rows`` is empty.
    """
    if rows.is_empty():
        return []
    return (
        rows.group_by(confidence_col)
        .agg(pl.len())
        # group_by does not guarantee group order; sort so the provenance
        # record is reproducible from one run to the next.
        .sort(confidence_col)
        .to_dicts()
    )


def load_to_duckdb(
    df: pl.DataFrame,
    store: PipelineStore,
    provenance: ProvenanceTracker,
    description: str = ""
) -> None:
    """Save animal model phenotype DataFrame to DuckDB with provenance.

    Writes ``df`` to the ``animal_model_phenotypes`` table with
    ``replace=True`` and then records a provenance step carrying summary
    statistics computed from ``df``.

    Args:
        df: Processed animal model DataFrame. Must contain the columns
            ``mouse_ortholog``, ``zebrafish_ortholog``,
            ``mouse_ortholog_confidence``, ``zebrafish_ortholog_confidence``
            and ``sensory_phenotype_count``.
        store: PipelineStore instance for DuckDB persistence.
        provenance: ProvenanceTracker instance for metadata recording.
        description: Optional description for checkpoint metadata; a default
            description is used when empty.
    """
    logger.info("animal_model_load_start", row_count=len(df))

    # Filter once per column and reuse the result for both the counts and
    # the derived statistics below.
    mouse_rows = df.filter(pl.col("mouse_ortholog").is_not_null())
    zebrafish_rows = df.filter(pl.col("zebrafish_ortholog").is_not_null())
    sensory_rows = df.filter(pl.col("sensory_phenotype_count").is_not_null())

    with_mouse = mouse_rows.height
    with_zebrafish = zebrafish_rows.height
    with_sensory = sensory_rows.height

    # Ortholog confidence distribution (deterministically ordered)
    mouse_conf_dist = _confidence_distribution(
        mouse_rows, "mouse_ortholog_confidence"
    )
    zebrafish_conf_dist = _confidence_distribution(
        zebrafish_rows, "zebrafish_ortholog_confidence"
    )

    # Mean sensory phenotype count; the aggregate yields None when there are
    # no non-null values, which is reported as 0.0.
    mean_sensory_count = sensory_rows.select(
        pl.col("sensory_phenotype_count").mean()
    ).item()
    if mean_sensory_count is None:
        mean_sensory_count = 0.0

    # Save to DuckDB, replacing any existing table of the same name
    store.save_dataframe(
        df=df,
        table_name="animal_model_phenotypes",
        description=description or "Animal model phenotypes from MGI, ZFIN, and IMPC with ortholog confidence scoring",
        replace=True
    )

    # Record provenance step with details
    provenance.record_step("load_animal_model_phenotypes", {
        "row_count": len(df),
        "genes_with_mouse_ortholog": with_mouse,
        "genes_with_zebrafish_ortholog": with_zebrafish,
        "genes_with_sensory_phenotypes": with_sensory,
        "mouse_confidence_distribution": mouse_conf_dist,
        "zebrafish_confidence_distribution": zebrafish_conf_dist,
        "mean_sensory_phenotype_count": round(mean_sensory_count, 2),
    })

    logger.info(
        "animal_model_load_complete",
        row_count=len(df),
        with_mouse=with_mouse,
        with_zebrafish=with_zebrafish,
        with_sensory=with_sensory,
    )
def query_sensory_phenotype_genes(
    store: PipelineStore,
    min_score: float = 0.3
) -> pl.DataFrame:
    """Fetch genes whose normalized animal model score meets a threshold.

    Runs a parameterized SELECT against the ``animal_model_phenotypes``
    table through ``store.execute_query``.

    Args:
        store: PipelineStore instance used to run the query.
        min_score: Lower bound (inclusive) applied to
            ``animal_model_score_normalized``.

    Returns:
        The query result: rows with ``animal_model_score_normalized`` at or
        above ``min_score``, ordered by that score descending.
    """
    logger.info("animal_model_query_start", min_score=min_score)

    # The threshold is bound through the "?" placeholder rather than being
    # interpolated into the SQL text.
    sql = (
        "\n"
        "SELECT gene_id, mouse_ortholog, zebrafish_ortholog,\n"
        "sensory_phenotype_count, phenotype_categories,\n"
        "animal_model_score_normalized\n"
        "FROM animal_model_phenotypes\n"
        "WHERE animal_model_score_normalized >= ?\n"
        "ORDER BY animal_model_score_normalized DESC\n"
    )
    matches = store.execute_query(sql, params=[min_score])

    logger.info("animal_model_query_complete", result_count=len(matches))
    return matches