fix: resolve runtime bugs for pipeline execution on Python 3.14 + latest deps

- gene_mapping: wrap mygene fetch_all generator in list() to fix len() error
- gene_mapping: raise MAX_EXPECTED_GENES to 23000 (mygene DB growth)
- setup_cmd: rename gene_universe columns to gene_id/gene_symbol for
  consistency with all downstream evidence layer code
- gnomad: handle missing coverage columns in v4.1 constraint TSV
- expression: fix HPA URL (v23.proteinatlas.org) and GTEx URL (v8 path)
- expression: fix Polars pivot() API change (columns -> on), collect first
- expression: handle missing GTEx tissues (Eye - Retina not in v8)
- expression: ensure all expected columns exist even when sources unavailable
- expression/load: safely check column existence before filtering
- localization: fix HPA subcellular URL to v23
- animal_models: fix httpx stream response.read() before .text access
- animal_models: increase infer_schema_length for HCOP and MGI TSV parsing

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-13 03:44:01 +08:00
parent a2ef2125ba
commit 6605ff0f2b
10 changed files with 112 additions and 65 deletions

View File

@@ -189,8 +189,8 @@ def setup(ctx, force):
# Create DataFrame with mapping results
df = pl.DataFrame({
'ensembl_id': [r.ensembl_id for r in mapping_results],
'hgnc_symbol': [r.hgnc_symbol for r in mapping_results],
'gene_id': [r.ensembl_id for r in mapping_results],
'gene_symbol': [r.hgnc_symbol for r in mapping_results],
'uniprot_accession': [r.uniprot_accession for r in mapping_results],
})

View File

@@ -86,6 +86,7 @@ def _download_text(url: str) -> str:
with httpx.stream("GET", url, timeout=120.0, follow_redirects=True) as response:
response.raise_for_status()
response.read()
content = response.text
logger.info("download_text_complete", size_mb=round(len(content) / 1024 / 1024, 2))
@@ -163,6 +164,7 @@ def fetch_ortholog_mapping(gene_ids: list[str]) -> pl.DataFrame:
io.BytesIO(zebrafish_data),
separator="\t",
null_values=["", "NA"],
infer_schema_length=10000,
)
logger.info("hcop_zebrafish_columns", columns=zebrafish_df.columns)
@@ -254,12 +256,13 @@ def fetch_mgi_phenotypes(mouse_gene_symbols: list[str]) -> pl.DataFrame:
if lines[0].startswith("#"):
lines = lines[1:]
# Read as DataFrame
# Read as DataFrame (larger infer_schema_length avoids type inference failures on late-row values)
df = pl.read_csv(
io.StringIO("\n".join(lines)),
separator="\t",
null_values=["", "NA"],
has_header=True,
infer_schema_length=10000,
)
logger.info("mgi_raw_columns", columns=df.columns)

View File

@@ -293,12 +293,11 @@ def fetch_hpa_expression(
)
)
# Pivot: rows=genes, columns=tissues
# Create separate columns for each target tissue
lf_wide = lf.pivot(
# Pivot: rows=genes, columns=tissues (collect first — DataFrame.pivot is simpler)
df_wide = lf.collect().pivot(
on="Tissue",
values="expression_value",
index="Gene name",
columns="Tissue",
)
# Rename columns to match our schema
@@ -308,14 +307,14 @@ def fetch_hpa_expression(
rename_map[hpa_tissue] = f"hpa_{our_key}_tpm"
if rename_map:
lf_wide = lf_wide.rename(rename_map)
df_wide = df_wide.rename(rename_map)
# Rename "Gene name" to "gene_symbol"
lf_wide = lf_wide.rename({"Gene name": "gene_symbol"})
df_wide = df_wide.rename({"Gene name": "gene_symbol"})
logger.info("hpa_parse_complete", tissues=list(target_tissue_names.keys()))
return lf_wide
return df_wide.lazy()
def fetch_gtex_expression(
@@ -372,27 +371,29 @@ def fetch_gtex_expression(
# Select gene ID column + target tissue columns
# GTEx uses "Name" for gene ID (ENSG...) and "Description" for gene symbol
available_cols = lf.collect_schema().names()
select_cols = ["Name"]
rename_map = {"Name": "gene_id"}
missing_tissues = []
for our_key, gtex_tissue in target_tissue_cols.items():
if gtex_tissue:
# Check if tissue column exists (not all GTEx versions have all tissues)
if gtex_tissue and gtex_tissue in available_cols:
select_cols.append(gtex_tissue)
rename_map[gtex_tissue] = f"gtex_{our_key}_tpm"
elif gtex_tissue:
missing_tissues.append(gtex_tissue)
if missing_tissues:
logger.warning("gtex_tissues_not_available", missing=missing_tissues)
# Try to select columns; if tissue missing, it will be NULL
# Use select with error handling for missing columns
try:
lf = lf.select(select_cols).rename(rename_map)
except Exception as e:
logger.warning("gtex_tissue_missing", error=str(e))
# Fallback: select available columns
available_cols = lf.columns
select_available = [col for col in select_cols if col in available_cols]
lf = lf.select(select_available).rename({
k: v for k, v in rename_map.items() if k in select_available
})
# Add NULL columns for missing tissues
for our_key, gtex_tissue in target_tissue_cols.items():
col_name = f"gtex_{our_key}_tpm"
if gtex_tissue and gtex_tissue not in available_cols:
lf = lf.with_columns(pl.lit(None).cast(pl.Float64).alias(col_name))
# Filter to requested gene_ids if provided
if gene_ids:

View File

@@ -31,17 +31,20 @@ def load_to_duckdb(
logger.info("expression_load_start", row_count=len(df))
# Calculate summary statistics for provenance
# Genes with retina expression (any source)
# Genes with retina expression (any source) — check column existence
retina_filter_parts = []
for col in ["hpa_retina_tpm", "gtex_retina_tpm", "cellxgene_photoreceptor_expr"]:
if col in df.columns:
retina_filter_parts.append(pl.col(col).is_not_null())
retina_expr_count = df.filter(
pl.col("hpa_retina_tpm").is_not_null() |
pl.col("gtex_retina_tpm").is_not_null() |
pl.col("cellxgene_photoreceptor_expr").is_not_null()
pl.any_horizontal(retina_filter_parts) if retina_filter_parts else pl.lit(False)
).height
# Genes with inner ear expression (primarily CellxGene)
inner_ear_expr_count = df.filter(
pl.col("cellxgene_hair_cell_expr").is_not_null()
).height
inner_ear_expr_count = (
df.filter(pl.col("cellxgene_hair_cell_expr").is_not_null()).height
if "cellxgene_hair_cell_expr" in df.columns else 0
)
# Mean Tau specificity (excluding NULLs)
mean_tau = df.select(pl.col("tau_specificity").mean()).item()

View File

@@ -4,12 +4,12 @@ from pydantic import BaseModel
# HPA normal tissue data download URL (bulk TSV, more efficient than per-gene API)
HPA_NORMAL_TISSUE_URL = (
"https://www.proteinatlas.org/download/normal_tissue.tsv.zip"
"https://v23.proteinatlas.org/download/normal_tissue.tsv.zip"
)
# GTEx v8 median gene expression bulk data (v10 path unavailable; reverted to v8 release)
GTEX_MEDIAN_EXPRESSION_URL = (
"https://storage.googleapis.com/adult-gtex/bulk-gex/v10/median-tpm/"
"https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/"
"GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz"
)

View File

@@ -262,12 +262,32 @@ def process_expression_evidence(
# Compute expression score
df = compute_expression_score(df)
# Ensure all expected columns exist (NULL if source unavailable)
expected_cols = {
"hpa_retina_tpm": pl.Float64,
"hpa_cerebellum_tpm": pl.Float64,
"hpa_testis_tpm": pl.Float64,
"hpa_fallopian_tube_tpm": pl.Float64,
"gtex_retina_tpm": pl.Float64,
"gtex_cerebellum_tpm": pl.Float64,
"gtex_testis_tpm": pl.Float64,
"gtex_fallopian_tube_tpm": pl.Float64,
"cellxgene_photoreceptor_expr": pl.Float64,
"cellxgene_hair_cell_expr": pl.Float64,
"tau_specificity": pl.Float64,
"usher_tissue_enrichment": pl.Float64,
"expression_score_normalized": pl.Float64,
}
for col_name, dtype in expected_cols.items():
if col_name not in df.columns:
df = df.with_columns(pl.lit(None).cast(dtype).alias(col_name))
logger.info(
"expression_pipeline_complete",
row_count=len(df),
has_hpa=any("hpa_" in col for col in df.columns),
has_gtex=any("gtex_" in col for col in df.columns),
has_cellxgene=any("cellxgene_" in col for col in df.columns),
has_hpa=any("hpa_" in col and df[col].null_count() < len(df) for col in df.columns if "hpa_" in col),
has_gtex=any("gtex_" in col and df[col].null_count() < len(df) for col in df.columns if "gtex_" in col),
has_cellxgene=any("cellxgene_" in col and df[col].null_count() < len(df) for col in df.columns if "cellxgene_" in col),
)
return df

View File

@@ -27,18 +27,27 @@ def filter_by_coverage(
Returns:
LazyFrame with quality_flag column added:
- "measured": Good coverage AND has LOEUF estimate
- "measured": Has LOEUF estimate (and good coverage if available)
- "incomplete_coverage": Coverage below thresholds
- "no_data": Both LOEUF and pLI are NULL
"""
# Ensure numeric columns are properly cast to float (handles edge cases with mixed types)
lf = lf.with_columns([
pl.col("mean_depth").cast(pl.Float64, strict=False),
pl.col("cds_covered_pct").cast(pl.Float64, strict=False),
# Check which columns are available (gnomAD v4.1 lacks coverage columns)
available_cols = lf.collect_schema().names()
has_coverage = "mean_depth" in available_cols and "cds_covered_pct" in available_cols
# Ensure numeric columns are properly cast
cast_exprs = [
pl.col("loeuf").cast(pl.Float64, strict=False),
pl.col("pli").cast(pl.Float64, strict=False),
]
if has_coverage:
cast_exprs.extend([
pl.col("mean_depth").cast(pl.Float64, strict=False),
pl.col("cds_covered_pct").cast(pl.Float64, strict=False),
])
lf = lf.with_columns(cast_exprs)
if has_coverage:
return lf.with_columns(
pl.when(
pl.col("mean_depth").is_not_null()
@@ -59,6 +68,17 @@ def filter_by_coverage(
.otherwise(pl.lit("incomplete_coverage"))
.alias("quality_flag")
)
else:
# No coverage columns (gnomAD v4.1) — classify by presence of constraint data
logger.info("gnomad_no_coverage_columns", msg="Using constraint-only quality flags")
return lf.with_columns(
pl.when(pl.col("loeuf").is_not_null())
.then(pl.lit("measured"))
.when(pl.col("loeuf").is_null() & pl.col("pli").is_null())
.then(pl.lit("no_data"))
.otherwise(pl.lit("incomplete_coverage"))
.alias("quality_flag")
)
def normalize_scores(lf: pl.LazyFrame) -> pl.LazyFrame:

View File

@@ -8,7 +8,7 @@ from pydantic import BaseModel, Field
LOCALIZATION_TABLE_NAME = "subcellular_localization"
# HPA subcellular data URL (bulk download)
HPA_SUBCELLULAR_URL = "https://www.proteinatlas.org/download/subcellular_location.tsv.zip"
HPA_SUBCELLULAR_URL = "https://v23.proteinatlas.org/download/subcellular_location.tsv.zip"
# Compartment definitions for scoring
CILIA_COMPARTMENTS = [

View File

@@ -16,7 +16,7 @@ logger = logging.getLogger(__name__)
# Expected range for human protein-coding genes
MIN_EXPECTED_GENES = 19000
MAX_EXPECTED_GENES = 22000
MAX_EXPECTED_GENES = 23000
def fetch_protein_coding_genes(ensembl_release: int = 113) -> GeneUniverse:
@@ -51,12 +51,12 @@ def fetch_protein_coding_genes(ensembl_release: int = 113) -> GeneUniverse:
# Query for human protein-coding genes
logger.info("Querying mygene for type_of_gene:protein-coding (species=9606)")
results = mg.query(
results = list(mg.query(
'type_of_gene:"protein-coding"',
species=9606,
fields='ensembl.gene,symbol,name',
fetch_all=True,
)
))
logger.info(f"Retrieved {len(results)} results from mygene")

View File

@@ -171,7 +171,7 @@ def validate_gene_universe(genes: list[str]) -> ValidationResult:
gene_count = len(genes)
MIN_GENES = 19000
MAX_GENES = 22000
MAX_GENES = 23000
# Check gene count
if gene_count < MIN_GENES: