fix: resolve runtime bugs for pipeline execution on Python 3.14 + latest deps
- gene_mapping: wrap mygene fetch_all generator in list() to fix len() error
- gene_mapping: raise MAX_EXPECTED_GENES to 23000 (mygene DB growth)
- setup_cmd: rename gene_universe columns to gene_id/gene_symbol for consistency with all downstream evidence layer code
- gnomad: handle missing coverage columns in v4.1 constraint TSV
- expression: fix HPA URL (v23.proteinatlas.org) and GTEx URL (v8 path)
- expression: fix Polars pivot() API change (columns -> on), collect first
- expression: handle missing GTEx tissues (Eye - Retina not in v8)
- expression: ensure all expected columns exist even when sources unavailable
- expression/load: safely check column existence before filtering
- localization: fix HPA subcellular URL to v23
- animal_models: fix httpx stream response.read() before .text access
- animal_models: increase infer_schema_length for HCOP and MGI TSV parsing

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -262,12 +262,32 @@ def process_expression_evidence(
|
||||
# Compute expression score
|
||||
df = compute_expression_score(df)
|
||||
|
||||
# Ensure all expected columns exist (NULL if source unavailable)
|
||||
expected_cols = {
|
||||
"hpa_retina_tpm": pl.Float64,
|
||||
"hpa_cerebellum_tpm": pl.Float64,
|
||||
"hpa_testis_tpm": pl.Float64,
|
||||
"hpa_fallopian_tube_tpm": pl.Float64,
|
||||
"gtex_retina_tpm": pl.Float64,
|
||||
"gtex_cerebellum_tpm": pl.Float64,
|
||||
"gtex_testis_tpm": pl.Float64,
|
||||
"gtex_fallopian_tube_tpm": pl.Float64,
|
||||
"cellxgene_photoreceptor_expr": pl.Float64,
|
||||
"cellxgene_hair_cell_expr": pl.Float64,
|
||||
"tau_specificity": pl.Float64,
|
||||
"usher_tissue_enrichment": pl.Float64,
|
||||
"expression_score_normalized": pl.Float64,
|
||||
}
|
||||
for col_name, dtype in expected_cols.items():
|
||||
if col_name not in df.columns:
|
||||
df = df.with_columns(pl.lit(None).cast(dtype).alias(col_name))
|
||||
|
||||
logger.info(
|
||||
"expression_pipeline_complete",
|
||||
row_count=len(df),
|
||||
has_hpa=any("hpa_" in col for col in df.columns),
|
||||
has_gtex=any("gtex_" in col for col in df.columns),
|
||||
has_cellxgene=any("cellxgene_" in col for col in df.columns),
|
||||
has_hpa=any("hpa_" in col and df[col].null_count() < len(df) for col in df.columns if "hpa_" in col),
|
||||
has_gtex=any("gtex_" in col and df[col].null_count() < len(df) for col in df.columns if "gtex_" in col),
|
||||
has_cellxgene=any("cellxgene_" in col and df[col].null_count() < len(df) for col in df.columns if "cellxgene_" in col),
|
||||
)
|
||||
|
||||
return df
|
||||
|
||||
Reference in New Issue
Block a user