fix(03-05): handle NULL columns and deprecated polars API in animal models
- Add NULL/empty column checks in fetch_ortholog_mapping
- Fix NULL handling in filter_sensory_phenotypes with is_not_null guard
- Replace deprecated str.concat with str.join
- Add explicit schema to empty DataFrames for consistency
This commit is contained in:
@@ -168,27 +168,36 @@ def fetch_ortholog_mapping(gene_ids: list[str]) -> pl.DataFrame:
|
|||||||
logger.info("hcop_zebrafish_columns", columns=zebrafish_df.columns)
|
logger.info("hcop_zebrafish_columns", columns=zebrafish_df.columns)
|
||||||
|
|
||||||
# Parse zebrafish ortholog data
|
# Parse zebrafish ortholog data
|
||||||
zebrafish_orthologs = (
|
# Handle case where zebrafish_df might be empty or missing expected columns
|
||||||
zebrafish_df
|
if "zebrafish_symbol" in zebrafish_df.columns and len(zebrafish_df) > 0:
|
||||||
.filter(pl.col("human_ensembl_gene").is_in(gene_ids))
|
zebrafish_orthologs = (
|
||||||
.select([
|
zebrafish_df
|
||||||
pl.col("human_ensembl_gene").alias("gene_id"),
|
.filter(pl.col("human_ensembl_gene").is_in(gene_ids))
|
||||||
pl.col("zebrafish_symbol").alias("zebrafish_ortholog"),
|
.select([
|
||||||
pl.col("support").str.split(",").list.len().alias("support_count"),
|
pl.col("human_ensembl_gene").alias("gene_id"),
|
||||||
])
|
pl.col("zebrafish_symbol").alias("zebrafish_ortholog"),
|
||||||
.with_columns([
|
pl.col("support").str.split(",").list.len().alias("support_count"),
|
||||||
pl.when(pl.col("support_count") >= 8)
|
])
|
||||||
.then(pl.lit("HIGH"))
|
.with_columns([
|
||||||
.when(pl.col("support_count") >= 4)
|
pl.when(pl.col("support_count") >= 8)
|
||||||
.then(pl.lit("MEDIUM"))
|
.then(pl.lit("HIGH"))
|
||||||
.otherwise(pl.lit("LOW"))
|
.when(pl.col("support_count") >= 4)
|
||||||
.alias("zebrafish_ortholog_confidence")
|
.then(pl.lit("MEDIUM"))
|
||||||
])
|
.otherwise(pl.lit("LOW"))
|
||||||
.sort(["gene_id", "support_count"], descending=[False, True])
|
.alias("zebrafish_ortholog_confidence")
|
||||||
.group_by("gene_id")
|
])
|
||||||
.first()
|
.sort(["gene_id", "support_count"], descending=[False, True])
|
||||||
.select(["gene_id", "zebrafish_ortholog", "zebrafish_ortholog_confidence"])
|
.group_by("gene_id")
|
||||||
)
|
.first()
|
||||||
|
.select(["gene_id", "zebrafish_ortholog", "zebrafish_ortholog_confidence"])
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Return empty DataFrame with correct schema
|
||||||
|
zebrafish_orthologs = pl.DataFrame({
|
||||||
|
"gene_id": [],
|
||||||
|
"zebrafish_ortholog": [],
|
||||||
|
"zebrafish_ortholog_confidence": [],
|
||||||
|
}, schema={"gene_id": pl.String, "zebrafish_ortholog": pl.String, "zebrafish_ortholog_confidence": pl.String})
|
||||||
|
|
||||||
logger.info("zebrafish_orthologs_mapped", count=len(zebrafish_orthologs))
|
logger.info("zebrafish_orthologs_mapped", count=len(zebrafish_orthologs))
|
||||||
|
|
||||||
|
|||||||
@@ -38,14 +38,26 @@ def filter_sensory_phenotypes(
|
|||||||
if phenotype_df.is_empty():
|
if phenotype_df.is_empty():
|
||||||
return phenotype_df
|
return phenotype_df
|
||||||
|
|
||||||
|
# Skip filtering if term column is missing or all NULL
|
||||||
|
if term_column not in phenotype_df.columns:
|
||||||
|
logger.warning("filter_sensory_phenotypes_skip", reason=f"column_{term_column}_missing")
|
||||||
|
return pl.DataFrame(schema=phenotype_df.schema).clear()
|
||||||
|
|
||||||
|
if phenotype_df[term_column].null_count() == len(phenotype_df):
|
||||||
|
logger.warning("filter_sensory_phenotypes_skip", reason=f"all_{term_column}_null")
|
||||||
|
return pl.DataFrame(schema=phenotype_df.schema).clear()
|
||||||
|
|
||||||
logger.info("filter_sensory_phenotypes_start", row_count=len(phenotype_df))
|
logger.info("filter_sensory_phenotypes_start", row_count=len(phenotype_df))
|
||||||
|
|
||||||
# Create case-insensitive match condition
|
# Create case-insensitive match condition
|
||||||
# Match if ANY keyword appears as substring in term
|
# Match if ANY keyword appears as substring in term (handles NULL by checking is_not_null first)
|
||||||
match_condition = pl.lit(False)
|
match_condition = pl.lit(False)
|
||||||
|
|
||||||
for keyword in keywords:
|
for keyword in keywords:
|
||||||
match_condition = match_condition | pl.col(term_column).str.to_lowercase().str.contains(keyword.lower())
|
match_condition = match_condition | (
|
||||||
|
pl.col(term_column).is_not_null() &
|
||||||
|
pl.col(term_column).str.to_lowercase().str.contains(keyword.lower())
|
||||||
|
)
|
||||||
|
|
||||||
# Filter phenotypes
|
# Filter phenotypes
|
||||||
filtered = phenotype_df.filter(match_condition)
|
filtered = phenotype_df.filter(match_condition)
|
||||||
@@ -229,7 +241,7 @@ def process_animal_model_evidence(gene_ids: list[str]) -> pl.DataFrame:
|
|||||||
.group_by("mouse_gene")
|
.group_by("mouse_gene")
|
||||||
.agg([
|
.agg([
|
||||||
pl.col("mp_term_name").count().alias("mgi_phenotype_count"),
|
pl.col("mp_term_name").count().alias("mgi_phenotype_count"),
|
||||||
pl.col("mp_term_name").str.concat("; ").alias("mgi_terms"),
|
pl.col("mp_term_name").str.join("; ").alias("mgi_terms"),
|
||||||
])
|
])
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
@@ -237,7 +249,7 @@ def process_animal_model_evidence(gene_ids: list[str]) -> pl.DataFrame:
|
|||||||
"mouse_gene": [],
|
"mouse_gene": [],
|
||||||
"mgi_phenotype_count": [],
|
"mgi_phenotype_count": [],
|
||||||
"mgi_terms": [],
|
"mgi_terms": [],
|
||||||
})
|
}, schema={"mouse_gene": pl.String, "mgi_phenotype_count": pl.Int64, "mgi_terms": pl.String})
|
||||||
|
|
||||||
# Count sensory phenotypes per zebrafish gene
|
# Count sensory phenotypes per zebrafish gene
|
||||||
if not zfin_sensory.is_empty():
|
if not zfin_sensory.is_empty():
|
||||||
@@ -246,7 +258,7 @@ def process_animal_model_evidence(gene_ids: list[str]) -> pl.DataFrame:
|
|||||||
.group_by("zebrafish_gene")
|
.group_by("zebrafish_gene")
|
||||||
.agg([
|
.agg([
|
||||||
pl.col("zp_term_name").count().alias("zfin_phenotype_count"),
|
pl.col("zp_term_name").count().alias("zfin_phenotype_count"),
|
||||||
pl.col("zp_term_name").str.concat("; ").alias("zfin_terms"),
|
pl.col("zp_term_name").str.join("; ").alias("zfin_terms"),
|
||||||
])
|
])
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
@@ -254,7 +266,7 @@ def process_animal_model_evidence(gene_ids: list[str]) -> pl.DataFrame:
|
|||||||
"zebrafish_gene": [],
|
"zebrafish_gene": [],
|
||||||
"zfin_phenotype_count": [],
|
"zfin_phenotype_count": [],
|
||||||
"zfin_terms": [],
|
"zfin_terms": [],
|
||||||
})
|
}, schema={"zebrafish_gene": pl.String, "zfin_phenotype_count": pl.Int64, "zfin_terms": pl.String})
|
||||||
|
|
||||||
# Count sensory phenotypes per mouse gene from IMPC
|
# Count sensory phenotypes per mouse gene from IMPC
|
||||||
if not impc_sensory.is_empty():
|
if not impc_sensory.is_empty():
|
||||||
@@ -263,7 +275,7 @@ def process_animal_model_evidence(gene_ids: list[str]) -> pl.DataFrame:
|
|||||||
.group_by("mouse_gene")
|
.group_by("mouse_gene")
|
||||||
.agg([
|
.agg([
|
||||||
pl.col("mp_term_name").count().alias("impc_phenotype_count"),
|
pl.col("mp_term_name").count().alias("impc_phenotype_count"),
|
||||||
pl.col("mp_term_name").str.concat("; ").alias("impc_terms"),
|
pl.col("mp_term_name").str.join("; ").alias("impc_terms"),
|
||||||
])
|
])
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
@@ -271,7 +283,7 @@ def process_animal_model_evidence(gene_ids: list[str]) -> pl.DataFrame:
|
|||||||
"mouse_gene": [],
|
"mouse_gene": [],
|
||||||
"impc_phenotype_count": [],
|
"impc_phenotype_count": [],
|
||||||
"impc_terms": [],
|
"impc_terms": [],
|
||||||
})
|
}, schema={"mouse_gene": pl.String, "impc_phenotype_count": pl.Int64, "impc_terms": pl.String})
|
||||||
|
|
||||||
# Step 4: Join phenotype data with ortholog mappings
|
# Step 4: Join phenotype data with ortholog mappings
|
||||||
logger.info("step_4_join_data")
|
logger.info("step_4_join_data")
|
||||||
|
|||||||
Reference in New Issue
Block a user