fix: resolve runtime bugs for pipeline execution on Python 3.14 + latest deps

- gene_mapping: wrap mygene fetch_all generator in list() to fix len() error
- gene_mapping: raise MAX_EXPECTED_GENES to 23000 (mygene DB growth)
- setup_cmd: rename gene_universe columns to gene_id/gene_symbol for consistency with all downstream evidence layer code
- gnomad: handle missing coverage columns in v4.1 constraint TSV
- expression: fix HPA URL (v23.proteinatlas.org) and GTEx URL (v8 path)
- expression: fix Polars pivot() API change (columns -> on), collect first
- expression: handle missing GTEx tissues (Eye - Retina not in v8)
- expression: ensure all expected columns exist even when sources unavailable
- expression/load: safely check column existence before filtering
- localization: fix HPA subcellular URL to v23
- animal_models: fix httpx stream response.read() before .text access
- animal_models: increase infer_schema_length for HCOP and MGI TSV parsing

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 03:44:01 +08:00
parent a2ef2125ba
commit 6605ff0f2b
10 changed files with 112 additions and 65 deletions
--- a/src/usher_pipeline/evidence/expression/transform.py
+++ b/src/usher_pipeline/evidence/expression/transform.py
@@ -262,12 +262,32 @@ def process_expression_evidence(
    # Compute expression score
    df = compute_expression_score(df)

+    # Ensure all expected columns exist (NULL if source unavailable)
+    expected_cols = {
+        "hpa_retina_tpm": pl.Float64,
+        "hpa_cerebellum_tpm": pl.Float64,
+        "hpa_testis_tpm": pl.Float64,
+        "hpa_fallopian_tube_tpm": pl.Float64,
+        "gtex_retina_tpm": pl.Float64,
+        "gtex_cerebellum_tpm": pl.Float64,
+        "gtex_testis_tpm": pl.Float64,
+        "gtex_fallopian_tube_tpm": pl.Float64,
+        "cellxgene_photoreceptor_expr": pl.Float64,
+        "cellxgene_hair_cell_expr": pl.Float64,
+        "tau_specificity": pl.Float64,
+        "usher_tissue_enrichment": pl.Float64,
+        "expression_score_normalized": pl.Float64,
+    }
+    for col_name, dtype in expected_cols.items():
+        if col_name not in df.columns:
+            df = df.with_columns(pl.lit(None).cast(dtype).alias(col_name))
+
    logger.info(
        "expression_pipeline_complete",
        row_count=len(df),
-        has_hpa=any("hpa_" in col for col in df.columns),
-        has_gtex=any("gtex_" in col for col in df.columns),
-        has_cellxgene=any("cellxgene_" in col for col in df.columns),
+        has_hpa=any("hpa_" in col and df[col].null_count() < len(df) for col in df.columns if "hpa_" in col),
+        has_gtex=any("gtex_" in col and df[col].null_count() < len(df) for col in df.columns if "gtex_" in col),
+        has_cellxgene=any("cellxgene_" in col and df[col].null_count() < len(df) for col in df.columns if "cellxgene_" in col),
    )

    return df