From ed21f18a98bfc5e3c753db757726d4beac7e7cb2 Mon Sep 17 00:00:00 2001 From: gbanyan Date: Wed, 11 Feb 2026 20:38:36 +0800 Subject: [PATCH] fix(03-05): handle NULL columns and deprecated polars API in animal models - Add NULL/empty column checks in fetch_ortholog_mapping - Fix NULL handling in filter_sensory_phenotypes with is_not_null guard - Replace deprecated str.concat with str.join - Add explicit schema to empty DataFrames for consistency --- .../evidence/animal_models/fetch.py | 51 +++++++++++-------- .../evidence/animal_models/transform.py | 28 +++++++--- 2 files changed, 50 insertions(+), 29 deletions(-) diff --git a/src/usher_pipeline/evidence/animal_models/fetch.py b/src/usher_pipeline/evidence/animal_models/fetch.py index 9fef004..dd97b18 100644 --- a/src/usher_pipeline/evidence/animal_models/fetch.py +++ b/src/usher_pipeline/evidence/animal_models/fetch.py @@ -168,27 +168,36 @@ def fetch_ortholog_mapping(gene_ids: list[str]) -> pl.DataFrame: logger.info("hcop_zebrafish_columns", columns=zebrafish_df.columns) # Parse zebrafish ortholog data - zebrafish_orthologs = ( - zebrafish_df - .filter(pl.col("human_ensembl_gene").is_in(gene_ids)) - .select([ - pl.col("human_ensembl_gene").alias("gene_id"), - pl.col("zebrafish_symbol").alias("zebrafish_ortholog"), - pl.col("support").str.split(",").list.len().alias("support_count"), - ]) - .with_columns([ - pl.when(pl.col("support_count") >= 8) - .then(pl.lit("HIGH")) - .when(pl.col("support_count") >= 4) - .then(pl.lit("MEDIUM")) - .otherwise(pl.lit("LOW")) - .alias("zebrafish_ortholog_confidence") - ]) - .sort(["gene_id", "support_count"], descending=[False, True]) - .group_by("gene_id") - .first() - .select(["gene_id", "zebrafish_ortholog", "zebrafish_ortholog_confidence"]) - ) + # Handle case where zebrafish_df might be empty or missing expected columns + if "zebrafish_symbol" in zebrafish_df.columns and len(zebrafish_df) > 0: + zebrafish_orthologs = ( + zebrafish_df + .filter(pl.col("human_ensembl_gene").is_in(gene_ids)) + .select([ + pl.col("human_ensembl_gene").alias("gene_id"), + pl.col("zebrafish_symbol").alias("zebrafish_ortholog"), + pl.col("support").str.split(",").list.len().alias("support_count"), + ]) + .with_columns([ + pl.when(pl.col("support_count") >= 8) + .then(pl.lit("HIGH")) + .when(pl.col("support_count") >= 4) + .then(pl.lit("MEDIUM")) + .otherwise(pl.lit("LOW")) + .alias("zebrafish_ortholog_confidence") + ]) + .sort(["gene_id", "support_count"], descending=[False, True]) + .group_by("gene_id") + .first() + .select(["gene_id", "zebrafish_ortholog", "zebrafish_ortholog_confidence"]) + ) + else: + # Return empty DataFrame with correct schema + zebrafish_orthologs = pl.DataFrame({ + "gene_id": [], + "zebrafish_ortholog": [], + "zebrafish_ortholog_confidence": [], + }, schema={"gene_id": pl.String, "zebrafish_ortholog": pl.String, "zebrafish_ortholog_confidence": pl.String}) logger.info("zebrafish_orthologs_mapped", count=len(zebrafish_orthologs)) diff --git a/src/usher_pipeline/evidence/animal_models/transform.py b/src/usher_pipeline/evidence/animal_models/transform.py index 9f5890d..2df82c5 100644 --- a/src/usher_pipeline/evidence/animal_models/transform.py +++ b/src/usher_pipeline/evidence/animal_models/transform.py @@ -38,14 +38,26 @@ def filter_sensory_phenotypes( if phenotype_df.is_empty(): return phenotype_df + # Skip filtering if term column is missing or all NULL + if term_column not in phenotype_df.columns: + logger.warning("filter_sensory_phenotypes_skip", reason=f"column_{term_column}_missing") + return pl.DataFrame(schema=phenotype_df.schema).clear() + + if phenotype_df[term_column].null_count() == len(phenotype_df): + logger.warning("filter_sensory_phenotypes_skip", reason=f"all_{term_column}_null") + return pl.DataFrame(schema=phenotype_df.schema).clear() + logger.info("filter_sensory_phenotypes_start", row_count=len(phenotype_df)) # Create case-insensitive match condition - # Match if ANY keyword appears as substring in term + # Match if ANY keyword appears as substring in term (handles NULL by checking is_not_null first) match_condition = pl.lit(False) for keyword in keywords: - match_condition = match_condition | pl.col(term_column).str.to_lowercase().str.contains(keyword.lower()) + match_condition = match_condition | ( + pl.col(term_column).is_not_null() & + pl.col(term_column).str.to_lowercase().str.contains(keyword.lower()) + ) # Filter phenotypes filtered = phenotype_df.filter(match_condition) @@ -229,7 +241,7 @@ def process_animal_model_evidence(gene_ids: list[str]) -> pl.DataFrame: .group_by("mouse_gene") .agg([ pl.col("mp_term_name").count().alias("mgi_phenotype_count"), - pl.col("mp_term_name").str.concat("; ").alias("mgi_terms"), + pl.col("mp_term_name").str.join("; ").alias("mgi_terms"), ]) ) else: @@ -237,7 +249,7 @@ def process_animal_model_evidence(gene_ids: list[str]) -> pl.DataFrame: "mouse_gene": [], "mgi_phenotype_count": [], "mgi_terms": [], - }) + }, schema={"mouse_gene": pl.String, "mgi_phenotype_count": pl.Int64, "mgi_terms": pl.String}) # Count sensory phenotypes per zebrafish gene if not zfin_sensory.is_empty(): @@ -246,7 +258,7 @@ def process_animal_model_evidence(gene_ids: list[str]) -> pl.DataFrame: .group_by("zebrafish_gene") .agg([ pl.col("zp_term_name").count().alias("zfin_phenotype_count"), - pl.col("zp_term_name").str.concat("; ").alias("zfin_terms"), + pl.col("zp_term_name").str.join("; ").alias("zfin_terms"), ]) ) else: @@ -254,7 +266,7 @@ def process_animal_model_evidence(gene_ids: list[str]) -> pl.DataFrame: "zebrafish_gene": [], "zfin_phenotype_count": [], "zfin_terms": [], - }) + }, schema={"zebrafish_gene": pl.String, "zfin_phenotype_count": pl.Int64, "zfin_terms": pl.String}) # Count sensory phenotypes per mouse gene from IMPC if not impc_sensory.is_empty(): @@ -263,7 +275,7 @@ def process_animal_model_evidence(gene_ids: list[str]) -> pl.DataFrame: .group_by("mouse_gene") .agg([ pl.col("mp_term_name").count().alias("impc_phenotype_count"), - pl.col("mp_term_name").str.concat("; ").alias("impc_terms"), + pl.col("mp_term_name").str.join("; ").alias("impc_terms"), ]) ) else: @@ -271,7 +283,7 @@ def process_animal_model_evidence(gene_ids: list[str]) -> pl.DataFrame: "mouse_gene": [], "impc_phenotype_count": [], "impc_terms": [], - }) + }, schema={"mouse_gene": pl.String, "impc_phenotype_count": pl.Int64, "impc_terms": pl.String}) # Step 4: Join phenotype data with ortholog mappings logger.info("step_4_join_data")