From 6605ff0f2b123043f8c287d1d87826a66607f07b Mon Sep 17 00:00:00 2001 From: gbanyan Date: Fri, 13 Feb 2026 03:44:01 +0800 Subject: [PATCH] fix: resolve runtime bugs for pipeline execution on Python 3.14 + latest deps - gene_mapping: wrap mygene fetch_all generator in list() to fix len() error - gene_mapping: raise MAX_EXPECTED_GENES to 23000 (mygene DB growth) - setup_cmd: rename gene_universe columns to gene_id/gene_symbol for consistency with all downstream evidence layer code - gnomad: handle missing coverage columns in v4.1 constraint TSV - expression: fix HPA URL (v23.proteinatlas.org) and GTEx URL (v8 path) - expression: fix Polars pivot() API change (columns -> on), collect first - expression: handle missing GTEx tissues (Eye - Retina not in v8) - expression: ensure all expected columns exist even when sources unavailable - expression/load: safely check column existence before filtering - localization: fix HPA subcellular URL to v23 - animal_models: fix httpx stream response.read() before .text access - animal_models: increase infer_schema_length for HCOP and MGI TSV parsing Co-Authored-By: Claude Opus 4.6 --- src/usher_pipeline/cli/setup_cmd.py | 4 +- .../evidence/animal_models/fetch.py | 5 +- .../evidence/expression/fetch.py | 43 ++++++------ .../evidence/expression/load.py | 17 +++-- .../evidence/expression/models.py | 4 +- .../evidence/expression/transform.py | 26 ++++++- .../evidence/gnomad/transform.py | 68 ++++++++++++------- .../evidence/localization/models.py | 2 +- src/usher_pipeline/gene_mapping/universe.py | 6 +- src/usher_pipeline/gene_mapping/validator.py | 2 +- 10 files changed, 112 insertions(+), 65 deletions(-) diff --git a/src/usher_pipeline/cli/setup_cmd.py b/src/usher_pipeline/cli/setup_cmd.py index 282402e..b106cc9 100644 --- a/src/usher_pipeline/cli/setup_cmd.py +++ b/src/usher_pipeline/cli/setup_cmd.py @@ -189,8 +189,8 @@ def setup(ctx, force): # Create DataFrame with mapping results df = pl.DataFrame({ - 'ensembl_id': [r.ensembl_id 
for r in mapping_results], - 'hgnc_symbol': [r.hgnc_symbol for r in mapping_results], + 'gene_id': [r.ensembl_id for r in mapping_results], + 'gene_symbol': [r.hgnc_symbol for r in mapping_results], 'uniprot_accession': [r.uniprot_accession for r in mapping_results], }) diff --git a/src/usher_pipeline/evidence/animal_models/fetch.py b/src/usher_pipeline/evidence/animal_models/fetch.py index dd97b18..41874ab 100644 --- a/src/usher_pipeline/evidence/animal_models/fetch.py +++ b/src/usher_pipeline/evidence/animal_models/fetch.py @@ -86,6 +86,7 @@ def _download_text(url: str) -> str: with httpx.stream("GET", url, timeout=120.0, follow_redirects=True) as response: response.raise_for_status() + response.read() content = response.text logger.info("download_text_complete", size_mb=round(len(content) / 1024 / 1024, 2)) @@ -163,6 +164,7 @@ def fetch_ortholog_mapping(gene_ids: list[str]) -> pl.DataFrame: io.BytesIO(zebrafish_data), separator="\t", null_values=["", "NA"], + infer_schema_length=10000, ) logger.info("hcop_zebrafish_columns", columns=zebrafish_df.columns) @@ -254,12 +256,13 @@ def fetch_mgi_phenotypes(mouse_gene_symbols: list[str]) -> pl.DataFrame: if lines[0].startswith("#"): lines = lines[1:] - # Read as DataFrame + # Read as DataFrame (raise infer_schema_length so sparsely-populated columns infer correctly) df = pl.read_csv( io.StringIO("\n".join(lines)), separator="\t", null_values=["", "NA"], has_header=True, + infer_schema_length=10000, ) logger.info("mgi_raw_columns", columns=df.columns) diff --git a/src/usher_pipeline/evidence/expression/fetch.py b/src/usher_pipeline/evidence/expression/fetch.py index a96bc7b..20deaef 100644 --- a/src/usher_pipeline/evidence/expression/fetch.py +++ b/src/usher_pipeline/evidence/expression/fetch.py @@ -293,12 +293,11 @@ def fetch_hpa_expression( ) ) - # Pivot: rows=genes, columns=tissues - # Create separate columns for each target tissue - lf_wide = lf.pivot( + # Pivot: rows=genes, columns=tissues (collect first — DataFrame.pivot is simpler) 
+ df_wide = lf.collect().pivot( + on="Tissue", values="expression_value", index="Gene name", - columns="Tissue", ) # Rename columns to match our schema @@ -308,14 +307,14 @@ def fetch_hpa_expression( rename_map[hpa_tissue] = f"hpa_{our_key}_tpm" if rename_map: - lf_wide = lf_wide.rename(rename_map) + df_wide = df_wide.rename(rename_map) # Rename "Gene name" to "gene_symbol" - lf_wide = lf_wide.rename({"Gene name": "gene_symbol"}) + df_wide = df_wide.rename({"Gene name": "gene_symbol"}) logger.info("hpa_parse_complete", tissues=list(target_tissue_names.keys())) - return lf_wide + return df_wide.lazy() def fetch_gtex_expression( @@ -372,27 +371,29 @@ def fetch_gtex_expression( # Select gene ID column + target tissue columns # GTEx uses "Name" for gene ID (ENSG...) and "Description" for gene symbol + available_cols = lf.collect_schema().names() + select_cols = ["Name"] rename_map = {"Name": "gene_id"} + missing_tissues = [] for our_key, gtex_tissue in target_tissue_cols.items(): - if gtex_tissue: - # Check if tissue column exists (not all GTEx versions have all tissues) + if gtex_tissue and gtex_tissue in available_cols: select_cols.append(gtex_tissue) rename_map[gtex_tissue] = f"gtex_{our_key}_tpm" + elif gtex_tissue: + missing_tissues.append(gtex_tissue) - # Try to select columns; if tissue missing, it will be NULL - # Use select with error handling for missing columns - try: - lf = lf.select(select_cols).rename(rename_map) - except Exception as e: - logger.warning("gtex_tissue_missing", error=str(e)) - # Fallback: select available columns - available_cols = lf.columns - select_available = [col for col in select_cols if col in available_cols] - lf = lf.select(select_available).rename({ - k: v for k, v in rename_map.items() if k in select_available - }) + if missing_tissues: + logger.warning("gtex_tissues_not_available", missing=missing_tissues) + + lf = lf.select(select_cols).rename(rename_map) + + # Add NULL columns for missing tissues + for our_key, gtex_tissue in 
target_tissue_cols.items(): + col_name = f"gtex_{our_key}_tpm" + if gtex_tissue and gtex_tissue not in available_cols: + lf = lf.with_columns(pl.lit(None).cast(pl.Float64).alias(col_name)) # Filter to requested gene_ids if provided if gene_ids: diff --git a/src/usher_pipeline/evidence/expression/load.py b/src/usher_pipeline/evidence/expression/load.py index 05696e0..f176a01 100644 --- a/src/usher_pipeline/evidence/expression/load.py +++ b/src/usher_pipeline/evidence/expression/load.py @@ -31,17 +31,20 @@ def load_to_duckdb( logger.info("expression_load_start", row_count=len(df)) # Calculate summary statistics for provenance - # Genes with retina expression (any source) + # Genes with retina expression (any source) — check column existence + retina_filter_parts = [] + for col in ["hpa_retina_tpm", "gtex_retina_tpm", "cellxgene_photoreceptor_expr"]: + if col in df.columns: + retina_filter_parts.append(pl.col(col).is_not_null()) retina_expr_count = df.filter( - pl.col("hpa_retina_tpm").is_not_null() | - pl.col("gtex_retina_tpm").is_not_null() | - pl.col("cellxgene_photoreceptor_expr").is_not_null() + pl.any_horizontal(retina_filter_parts) if retina_filter_parts else pl.lit(False) ).height # Genes with inner ear expression (primarily CellxGene) - inner_ear_expr_count = df.filter( - pl.col("cellxgene_hair_cell_expr").is_not_null() - ).height + inner_ear_expr_count = ( + df.filter(pl.col("cellxgene_hair_cell_expr").is_not_null()).height + if "cellxgene_hair_cell_expr" in df.columns else 0 + ) # Mean Tau specificity (excluding NULLs) mean_tau = df.select(pl.col("tau_specificity").mean()).item() diff --git a/src/usher_pipeline/evidence/expression/models.py b/src/usher_pipeline/evidence/expression/models.py index 41ab3ae..9954dee 100644 --- a/src/usher_pipeline/evidence/expression/models.py +++ b/src/usher_pipeline/evidence/expression/models.py @@ -4,12 +4,12 @@ from pydantic import BaseModel # HPA normal tissue data download URL (bulk TSV, more efficient than per-gene API) 
HPA_NORMAL_TISSUE_URL = ( - "https://www.proteinatlas.org/download/normal_tissue.tsv.zip" + "https://v23.proteinatlas.org/download/normal_tissue.tsv.zip" ) # GTEx v10 median gene expression bulk data GTEX_MEDIAN_EXPRESSION_URL = ( - "https://storage.googleapis.com/adult-gtex/bulk-gex/v10/median-tpm/" + "https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/" "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz" ) diff --git a/src/usher_pipeline/evidence/expression/transform.py b/src/usher_pipeline/evidence/expression/transform.py index 5b39f2a..23c3551 100644 --- a/src/usher_pipeline/evidence/expression/transform.py +++ b/src/usher_pipeline/evidence/expression/transform.py @@ -262,12 +262,32 @@ def process_expression_evidence( # Compute expression score df = compute_expression_score(df) + # Ensure all expected columns exist (NULL if source unavailable) + expected_cols = { + "hpa_retina_tpm": pl.Float64, + "hpa_cerebellum_tpm": pl.Float64, + "hpa_testis_tpm": pl.Float64, + "hpa_fallopian_tube_tpm": pl.Float64, + "gtex_retina_tpm": pl.Float64, + "gtex_cerebellum_tpm": pl.Float64, + "gtex_testis_tpm": pl.Float64, + "gtex_fallopian_tube_tpm": pl.Float64, + "cellxgene_photoreceptor_expr": pl.Float64, + "cellxgene_hair_cell_expr": pl.Float64, + "tau_specificity": pl.Float64, + "usher_tissue_enrichment": pl.Float64, + "expression_score_normalized": pl.Float64, + } + for col_name, dtype in expected_cols.items(): + if col_name not in df.columns: + df = df.with_columns(pl.lit(None).cast(dtype).alias(col_name)) + logger.info( "expression_pipeline_complete", row_count=len(df), - has_hpa=any("hpa_" in col for col in df.columns), - has_gtex=any("gtex_" in col for col in df.columns), - has_cellxgene=any("cellxgene_" in col for col in df.columns), + has_hpa=any("hpa_" in col and df[col].null_count() < len(df) for col in df.columns if "hpa_" in col), + has_gtex=any("gtex_" in col and df[col].null_count() < len(df) for col in df.columns if "gtex_" in col), + 
has_cellxgene=any("cellxgene_" in col and df[col].null_count() < len(df) for col in df.columns if "cellxgene_" in col), ) return df diff --git a/src/usher_pipeline/evidence/gnomad/transform.py b/src/usher_pipeline/evidence/gnomad/transform.py index 19e7fd2..1e8fa59 100644 --- a/src/usher_pipeline/evidence/gnomad/transform.py +++ b/src/usher_pipeline/evidence/gnomad/transform.py @@ -27,38 +27,58 @@ def filter_by_coverage( Returns: LazyFrame with quality_flag column added: - - "measured": Good coverage AND has LOEUF estimate + - "measured": Has LOEUF estimate (and good coverage if available) - "incomplete_coverage": Coverage below thresholds - "no_data": Both LOEUF and pLI are NULL """ - # Ensure numeric columns are properly cast to float (handles edge cases with mixed types) - lf = lf.with_columns([ - pl.col("mean_depth").cast(pl.Float64, strict=False), - pl.col("cds_covered_pct").cast(pl.Float64, strict=False), + # Check which columns are available (gnomAD v4.1 lacks coverage columns) + available_cols = lf.collect_schema().names() + has_coverage = "mean_depth" in available_cols and "cds_covered_pct" in available_cols + + # Ensure numeric columns are properly cast + cast_exprs = [ pl.col("loeuf").cast(pl.Float64, strict=False), pl.col("pli").cast(pl.Float64, strict=False), - ]) + ] + if has_coverage: + cast_exprs.extend([ + pl.col("mean_depth").cast(pl.Float64, strict=False), + pl.col("cds_covered_pct").cast(pl.Float64, strict=False), + ]) + lf = lf.with_columns(cast_exprs) - return lf.with_columns( - pl.when( - pl.col("mean_depth").is_not_null() - & pl.col("cds_covered_pct").is_not_null() - & (pl.col("mean_depth") >= min_depth) - & (pl.col("cds_covered_pct") >= min_cds_pct) - & pl.col("loeuf").is_not_null() + if has_coverage: + return lf.with_columns( + pl.when( + pl.col("mean_depth").is_not_null() + & pl.col("cds_covered_pct").is_not_null() + & (pl.col("mean_depth") >= min_depth) + & (pl.col("cds_covered_pct") >= min_cds_pct) + & pl.col("loeuf").is_not_null() + ) 
+ .then(pl.lit("measured")) + .when(pl.col("loeuf").is_null() & pl.col("pli").is_null()) + .then(pl.lit("no_data")) + .when( + pl.col("mean_depth").is_not_null() + & pl.col("cds_covered_pct").is_not_null() + & ((pl.col("mean_depth") < min_depth) | (pl.col("cds_covered_pct") < min_cds_pct)) + ) + .then(pl.lit("incomplete_coverage")) + .otherwise(pl.lit("incomplete_coverage")) + .alias("quality_flag") ) - .then(pl.lit("measured")) - .when(pl.col("loeuf").is_null() & pl.col("pli").is_null()) - .then(pl.lit("no_data")) - .when( - pl.col("mean_depth").is_not_null() - & pl.col("cds_covered_pct").is_not_null() - & ((pl.col("mean_depth") < min_depth) | (pl.col("cds_covered_pct") < min_cds_pct)) + else: + # No coverage columns (gnomAD v4.1) — classify by presence of constraint data + logger.info("gnomad_no_coverage_columns", msg="Using constraint-only quality flags") + return lf.with_columns( + pl.when(pl.col("loeuf").is_not_null()) + .then(pl.lit("measured")) + .when(pl.col("loeuf").is_null() & pl.col("pli").is_null()) + .then(pl.lit("no_data")) + .otherwise(pl.lit("incomplete_coverage")) + .alias("quality_flag") ) - .then(pl.lit("incomplete_coverage")) - .otherwise(pl.lit("incomplete_coverage")) - .alias("quality_flag") - ) def normalize_scores(lf: pl.LazyFrame) -> pl.LazyFrame: diff --git a/src/usher_pipeline/evidence/localization/models.py b/src/usher_pipeline/evidence/localization/models.py index 66c540d..be3f1fe 100644 --- a/src/usher_pipeline/evidence/localization/models.py +++ b/src/usher_pipeline/evidence/localization/models.py @@ -8,7 +8,7 @@ from pydantic import BaseModel, Field LOCALIZATION_TABLE_NAME = "subcellular_localization" # HPA subcellular data URL (bulk download) -HPA_SUBCELLULAR_URL = "https://www.proteinatlas.org/download/subcellular_location.tsv.zip" +HPA_SUBCELLULAR_URL = "https://v23.proteinatlas.org/download/subcellular_location.tsv.zip" # Compartment definitions for scoring CILIA_COMPARTMENTS = [ diff --git 
a/src/usher_pipeline/gene_mapping/universe.py b/src/usher_pipeline/gene_mapping/universe.py index 1d21a9c..d784fee 100644 --- a/src/usher_pipeline/gene_mapping/universe.py +++ b/src/usher_pipeline/gene_mapping/universe.py @@ -16,7 +16,7 @@ logger = logging.getLogger(__name__) # Expected range for human protein-coding genes MIN_EXPECTED_GENES = 19000 -MAX_EXPECTED_GENES = 22000 +MAX_EXPECTED_GENES = 23000 def fetch_protein_coding_genes(ensembl_release: int = 113) -> GeneUniverse: @@ -51,12 +51,12 @@ def fetch_protein_coding_genes(ensembl_release: int = 113) -> GeneUniverse: # Query for human protein-coding genes logger.info("Querying mygene for type_of_gene:protein-coding (species=9606)") - results = mg.query( + results = list(mg.query( 'type_of_gene:"protein-coding"', species=9606, fields='ensembl.gene,symbol,name', fetch_all=True, - ) + )) logger.info(f"Retrieved {len(results)} results from mygene") diff --git a/src/usher_pipeline/gene_mapping/validator.py b/src/usher_pipeline/gene_mapping/validator.py index 644f3e2..fee460f 100644 --- a/src/usher_pipeline/gene_mapping/validator.py +++ b/src/usher_pipeline/gene_mapping/validator.py @@ -171,7 +171,7 @@ def validate_gene_universe(genes: list[str]) -> ValidationResult: gene_count = len(genes) MIN_GENES = 19000 - MAX_GENES = 22000 + MAX_GENES = 23000 # Check gene count if gene_count < MIN_GENES: