From ed21f18a98bfc5e3c753db757726d4beac7e7cb2 Mon Sep 17 00:00:00 2001
From: gbanyan <gbanyan.huang@gmail.com>
Date: Wed, 11 Feb 2026 20:38:36 +0800
Subject: [PATCH] fix(03-05): handle NULL columns and deprecated polars API in
 animal models

- Guard fetch_ortholog_mapping against an empty frame or a missing zebrafish_symbol column
- Fix NULL handling in filter_sensory_phenotypes: add is_not_null guard and return an empty frame when the term column is missing or all NULL
- Replace deprecated str.concat with str.join
- Add explicit schema to empty DataFrames for consistency
---
 .../evidence/animal_models/fetch.py           | 51 +++++++++++--------
 .../evidence/animal_models/transform.py       | 28 +++++++---
 2 files changed, 50 insertions(+), 29 deletions(-)

diff --git a/src/usher_pipeline/evidence/animal_models/fetch.py b/src/usher_pipeline/evidence/animal_models/fetch.py
index 9fef004..dd97b18 100644
--- a/src/usher_pipeline/evidence/animal_models/fetch.py
+++ b/src/usher_pipeline/evidence/animal_models/fetch.py
@@ -168,27 +168,36 @@ def fetch_ortholog_mapping(gene_ids: list[str]) -> pl.DataFrame:
     logger.info("hcop_zebrafish_columns", columns=zebrafish_df.columns)
 
     # Parse zebrafish ortholog data
-    zebrafish_orthologs = (
-        zebrafish_df
-        .filter(pl.col("human_ensembl_gene").is_in(gene_ids))
-        .select([
-            pl.col("human_ensembl_gene").alias("gene_id"),
-            pl.col("zebrafish_symbol").alias("zebrafish_ortholog"),
-            pl.col("support").str.split(",").list.len().alias("support_count"),
-        ])
-        .with_columns([
-            pl.when(pl.col("support_count") >= 8)
-            .then(pl.lit("HIGH"))
-            .when(pl.col("support_count") >= 4)
-            .then(pl.lit("MEDIUM"))
-            .otherwise(pl.lit("LOW"))
-            .alias("zebrafish_ortholog_confidence")
-        ])
-        .sort(["gene_id", "support_count"], descending=[False, True])
-        .group_by("gene_id")
-        .first()
-        .select(["gene_id", "zebrafish_ortholog", "zebrafish_ortholog_confidence"])
-    )
+    # Handle case where zebrafish_df might be empty or missing expected columns
+    if "zebrafish_symbol" in zebrafish_df.columns and len(zebrafish_df) > 0:
+        zebrafish_orthologs = (
+            zebrafish_df
+            .filter(pl.col("human_ensembl_gene").is_in(gene_ids))
+            .select([
+                pl.col("human_ensembl_gene").alias("gene_id"),
+                pl.col("zebrafish_symbol").alias("zebrafish_ortholog"),
+                pl.col("support").str.split(",").list.len().alias("support_count"),
+            ])
+            .with_columns([
+                pl.when(pl.col("support_count") >= 8)
+                .then(pl.lit("HIGH"))
+                .when(pl.col("support_count") >= 4)
+                .then(pl.lit("MEDIUM"))
+                .otherwise(pl.lit("LOW"))
+                .alias("zebrafish_ortholog_confidence")
+            ])
+            .sort(["gene_id", "support_count"], descending=[False, True])
+            .group_by("gene_id")
+            .first()
+            .select(["gene_id", "zebrafish_ortholog", "zebrafish_ortholog_confidence"])
+        )
+    else:
+        # Return empty DataFrame with correct schema
+        zebrafish_orthologs = pl.DataFrame({
+            "gene_id": [],
+            "zebrafish_ortholog": [],
+            "zebrafish_ortholog_confidence": [],
+        }, schema={"gene_id": pl.String, "zebrafish_ortholog": pl.String, "zebrafish_ortholog_confidence": pl.String})
 
     logger.info("zebrafish_orthologs_mapped", count=len(zebrafish_orthologs))
 
diff --git a/src/usher_pipeline/evidence/animal_models/transform.py b/src/usher_pipeline/evidence/animal_models/transform.py
index 9f5890d..2df82c5 100644
--- a/src/usher_pipeline/evidence/animal_models/transform.py
+++ b/src/usher_pipeline/evidence/animal_models/transform.py
@@ -38,14 +38,26 @@ def filter_sensory_phenotypes(
     if phenotype_df.is_empty():
         return phenotype_df
 
+    # Return an empty frame (same schema) if the term column is missing or all NULL
+    if term_column not in phenotype_df.columns:
+        logger.warning("filter_sensory_phenotypes_skip", reason=f"column_{term_column}_missing")
+        return pl.DataFrame(schema=phenotype_df.schema).clear()
+
+    if phenotype_df[term_column].null_count() == len(phenotype_df):
+        logger.warning("filter_sensory_phenotypes_skip", reason=f"all_{term_column}_null")
+        return pl.DataFrame(schema=phenotype_df.schema).clear()
+
     logger.info("filter_sensory_phenotypes_start", row_count=len(phenotype_df))
 
     # Create case-insensitive match condition
-    # Match if ANY keyword appears as substring in term
+    # Match if ANY keyword appears as substring in term (handles NULL by checking is_not_null first)
     match_condition = pl.lit(False)
 
     for keyword in keywords:
-        match_condition = match_condition | pl.col(term_column).str.to_lowercase().str.contains(keyword.lower())
+        match_condition = match_condition | (
+            pl.col(term_column).is_not_null() &
+            pl.col(term_column).str.to_lowercase().str.contains(keyword.lower())
+        )
 
     # Filter phenotypes
     filtered = phenotype_df.filter(match_condition)
@@ -229,7 +241,7 @@ def process_animal_model_evidence(gene_ids: list[str]) -> pl.DataFrame:
             .group_by("mouse_gene")
             .agg([
                 pl.col("mp_term_name").count().alias("mgi_phenotype_count"),
-                pl.col("mp_term_name").str.concat("; ").alias("mgi_terms"),
+                pl.col("mp_term_name").str.join("; ").alias("mgi_terms"),
             ])
         )
     else:
@@ -237,7 +249,7 @@ def process_animal_model_evidence(gene_ids: list[str]) -> pl.DataFrame:
             "mouse_gene": [],
             "mgi_phenotype_count": [],
             "mgi_terms": [],
-        })
+        }, schema={"mouse_gene": pl.String, "mgi_phenotype_count": pl.Int64, "mgi_terms": pl.String})
 
     # Count sensory phenotypes per zebrafish gene
     if not zfin_sensory.is_empty():
@@ -246,7 +258,7 @@ def process_animal_model_evidence(gene_ids: list[str]) -> pl.DataFrame:
             .group_by("zebrafish_gene")
             .agg([
                 pl.col("zp_term_name").count().alias("zfin_phenotype_count"),
-                pl.col("zp_term_name").str.concat("; ").alias("zfin_terms"),
+                pl.col("zp_term_name").str.join("; ").alias("zfin_terms"),
             ])
         )
     else:
@@ -254,7 +266,7 @@ def process_animal_model_evidence(gene_ids: list[str]) -> pl.DataFrame:
             "zebrafish_gene": [],
             "zfin_phenotype_count": [],
             "zfin_terms": [],
-        })
+        }, schema={"zebrafish_gene": pl.String, "zfin_phenotype_count": pl.Int64, "zfin_terms": pl.String})
 
     # Count sensory phenotypes per mouse gene from IMPC
     if not impc_sensory.is_empty():
@@ -263,7 +275,7 @@ def process_animal_model_evidence(gene_ids: list[str]) -> pl.DataFrame:
             .group_by("mouse_gene")
             .agg([
                 pl.col("mp_term_name").count().alias("impc_phenotype_count"),
-                pl.col("mp_term_name").str.concat("; ").alias("impc_terms"),
+                pl.col("mp_term_name").str.join("; ").alias("impc_terms"),
             ])
         )
     else:
@@ -271,7 +283,7 @@ def process_animal_model_evidence(gene_ids: list[str]) -> pl.DataFrame:
             "mouse_gene": [],
             "impc_phenotype_count": [],
             "impc_terms": [],
-        })
+        }, schema={"mouse_gene": pl.String, "impc_phenotype_count": pl.Int64, "impc_terms": pl.String})
 
     # Step 4: Join phenotype data with ortholog mappings
     logger.info("step_4_join_data")