feat(03-05): add animal model DuckDB loader, CLI, and comprehensive tests
- load.py: DuckDB persistence with provenance tracking, ortholog confidence distribution stats
- CLI animal-models command: checkpoint-restart pattern, top scoring genes display
- 10 unit tests: ortholog confidence scoring, keyword filtering, multi-organism bonus, NULL preservation
- 4 integration tests: full pipeline, checkpoint-restart, provenance tracking, empty phenotype handling
- All tests pass (14/14): validates fetch->transform->load->CLI flow
- Fixed polars deprecations: str.join replaces str.concat, pl.len replaces pl.count
This commit is contained in:
@@ -26,6 +26,10 @@ from usher_pipeline.evidence.animal_models.transform import (
|
||||
score_animal_evidence,
|
||||
process_animal_model_evidence,
|
||||
)
|
||||
from usher_pipeline.evidence.animal_models.load import (
|
||||
load_to_duckdb,
|
||||
query_sensory_phenotype_genes,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"AnimalModelRecord",
|
||||
@@ -39,4 +43,6 @@ __all__ = [
|
||||
"filter_sensory_phenotypes",
|
||||
"score_animal_evidence",
|
||||
"process_animal_model_evidence",
|
||||
"load_to_duckdb",
|
||||
"query_sensory_phenotype_genes",
|
||||
]
|
||||
|
||||
126
src/usher_pipeline/evidence/animal_models/load.py
Normal file
126
src/usher_pipeline/evidence/animal_models/load.py
Normal file
@@ -0,0 +1,126 @@
|
||||
"""Load animal model phenotype data to DuckDB with provenance tracking."""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import polars as pl
|
||||
import structlog
|
||||
|
||||
from usher_pipeline.persistence import PipelineStore, ProvenanceTracker
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
def _confidence_distribution(
    df: pl.DataFrame,
    ortholog_col: str,
    confidence_col: str,
) -> list:
    """Count rows per confidence value among rows that have an ortholog.

    Args:
        df: DataFrame containing both ``ortholog_col`` and ``confidence_col``.
        ortholog_col: Column whose non-null rows are included in the count.
        confidence_col: Column to group by.

    Returns:
        List of dicts, one per distinct confidence value, each holding the
        confidence value and its row count (under the ``len`` key produced by
        ``pl.len()``). Empty list when no row has an ortholog.
    """
    with_ortholog = df.filter(pl.col(ortholog_col).is_not_null())
    if with_ortholog.height == 0:
        return []

    return (
        with_ortholog
        .group_by(confidence_col)
        .agg(pl.len())
        # group_by does not guarantee output order; sort so the provenance
        # record is identical across runs on identical input.
        .sort(confidence_col)
        .to_dicts()
    )


def load_to_duckdb(
    df: pl.DataFrame,
    store: PipelineStore,
    provenance: ProvenanceTracker,
    description: str = ""
) -> None:
    """Save animal model phenotype DataFrame to DuckDB with provenance.

    Writes ``df`` to the ``animal_model_phenotypes`` table via
    ``store.save_dataframe(..., replace=True)`` and then records a
    ``load_animal_model_phenotypes`` provenance step with summary statistics.

    Summary statistics are computed before the write, so a DataFrame that is
    missing one of the expected columns fails before the table is touched.

    Args:
        df: Processed animal model DataFrame. Must contain the columns
            ``mouse_ortholog``, ``mouse_ortholog_confidence``,
            ``zebrafish_ortholog``, ``zebrafish_ortholog_confidence`` and
            ``sensory_phenotype_count``.
        store: PipelineStore instance for DuckDB persistence.
        provenance: ProvenanceTracker instance for metadata recording.
        description: Optional table description; a default description is
            used when empty.
    """
    row_count = len(df)
    logger.info("animal_model_load_start", row_count=row_count)

    # Coverage counts: rows with a non-null value in each column.
    with_mouse = df.filter(pl.col("mouse_ortholog").is_not_null()).height
    with_zebrafish = df.filter(pl.col("zebrafish_ortholog").is_not_null()).height
    with_sensory = df.filter(pl.col("sensory_phenotype_count").is_not_null()).height

    # Ortholog confidence distributions (deterministically ordered).
    mouse_conf_dist = _confidence_distribution(
        df, "mouse_ortholog", "mouse_ortholog_confidence"
    )
    zebrafish_conf_dist = _confidence_distribution(
        df, "zebrafish_ortholog", "zebrafish_ortholog_confidence"
    )

    # mean() already ignores nulls; it yields None when there is no non-null
    # value (empty frame or all-null column), which is reported as 0.0.
    mean_sensory_count = df.select(pl.col("sensory_phenotype_count").mean()).item()
    if mean_sensory_count is None:
        mean_sensory_count = 0.0

    # replace=True makes repeated loads overwrite the table rather than append.
    store.save_dataframe(
        df=df,
        table_name="animal_model_phenotypes",
        description=description or "Animal model phenotypes from MGI, ZFIN, and IMPC with ortholog confidence scoring",
        replace=True
    )

    provenance.record_step("load_animal_model_phenotypes", {
        "row_count": row_count,
        "genes_with_mouse_ortholog": with_mouse,
        "genes_with_zebrafish_ortholog": with_zebrafish,
        "genes_with_sensory_phenotypes": with_sensory,
        "mouse_confidence_distribution": mouse_conf_dist,
        "zebrafish_confidence_distribution": zebrafish_conf_dist,
        "mean_sensory_phenotype_count": round(mean_sensory_count, 2),
    })

    logger.info(
        "animal_model_load_complete",
        row_count=row_count,
        with_mouse=with_mouse,
        with_zebrafish=with_zebrafish,
        with_sensory=with_sensory,
    )
|
||||
|
||||
|
||||
def query_sensory_phenotype_genes(
    store: PipelineStore,
    min_score: float = 0.3
) -> pl.DataFrame:
    """Fetch genes whose normalized animal model score meets a threshold.

    Runs a parameterized SELECT against the ``animal_model_phenotypes`` table
    through ``store.execute_query``.

    Args:
        store: PipelineStore instance used to run the query.
        min_score: Lower bound (inclusive) applied to
            ``animal_model_score_normalized``.

    Returns:
        Whatever ``store.execute_query`` returns for the query: rows with
        ``animal_model_score_normalized >= min_score``, ordered by that score
        descending, restricted to the gene, ortholog, phenotype and score
        columns listed in the SELECT.
    """
    logger.info("animal_model_query_start", min_score=min_score)

    # The threshold is bound through the `?` placeholder, never interpolated.
    sql = """
        SELECT gene_id, mouse_ortholog, zebrafish_ortholog,
               sensory_phenotype_count, phenotype_categories,
               animal_model_score_normalized
        FROM animal_model_phenotypes
        WHERE animal_model_score_normalized >= ?
        ORDER BY animal_model_score_normalized DESC
        """
    matches = store.execute_query(sql, params=[min_score])

    logger.info("animal_model_query_complete", result_count=len(matches))
    return matches
|
||||
Reference in New Issue
Block a user