From 6605ff0f2b123043f8c287d1d87826a66607f07b Mon Sep 17 00:00:00 2001 From: gbanyan Date: Fri, 13 Feb 2026 03:44:01 +0800 Subject: [PATCH] fix: resolve runtime bugs for pipeline execution on Python 3.14 + latest deps - gene_mapping: wrap mygene fetch_all generator in list() to fix len() error - gene_mapping: raise MAX_EXPECTED_GENES to 23000 (mygene DB growth) - setup_cmd: rename gene_universe columns to gene_id/gene_symbol for consistency with all downstream evidence layer code - gnomad: handle missing coverage columns in v4.1 constraint TSV - expression: fix HPA URL (v23.proteinatlas.org) and GTEx URL (v8 path) - expression: fix Polars pivot() API change (columns -> on), collect first - expression: handle missing GTEx tissues (Eye - Retina not in v8) - expression: ensure all expected columns exist even when sources unavailable - expression/load: safely check column existence before filtering - localization: fix HPA subcellular URL to v23 - animal_models: fix httpx stream response.read() before .text access - animal_models: increase infer_schema_length for HCOP and MGI TSV parsing Co-Authored-By: Claude Opus 4.6 --- src/usher_pipeline/cli/setup_cmd.py | 4 +- .../evidence/animal_models/fetch.py | 5 +- .../evidence/expression/fetch.py | 43 ++++++------ .../evidence/expression/load.py | 17 +++-- .../evidence/expression/models.py | 4 +- .../evidence/expression/transform.py | 26 ++++++- .../evidence/gnomad/transform.py | 68 ++++++++++++------- .../evidence/localization/models.py | 2 +- src/usher_pipeline/gene_mapping/universe.py | 6 +- src/usher_pipeline/gene_mapping/validator.py | 2 +- 10 files changed, 112 insertions(+), 65 deletions(-) diff --git a/src/usher_pipeline/cli/setup_cmd.py b/src/usher_pipeline/cli/setup_cmd.py index 282402e..b106cc9 100644 --- a/src/usher_pipeline/cli/setup_cmd.py +++ b/src/usher_pipeline/cli/setup_cmd.py @@ -189,8 +189,8 @@ def setup(ctx, force): # Create DataFrame with mapping results df = pl.DataFrame({ - 'ensembl_id': [r.ensembl_id 
for r in mapping_results], - 'hgnc_symbol': [r.hgnc_symbol for r in mapping_results], + 'gene_id': [r.ensembl_id for r in mapping_results], + 'gene_symbol': [r.hgnc_symbol for r in mapping_results], 'uniprot_accession': [r.uniprot_accession for r in mapping_results], }) diff --git a/src/usher_pipeline/evidence/animal_models/fetch.py b/src/usher_pipeline/evidence/animal_models/fetch.py index dd97b18..41874ab 100644 --- a/src/usher_pipeline/evidence/animal_models/fetch.py +++ b/src/usher_pipeline/evidence/animal_models/fetch.py @@ -86,6 +86,7 @@ def _download_text(url: str) -> str: with httpx.stream("GET", url, timeout=120.0, follow_redirects=True) as response: response.raise_for_status() + response.read() content = response.text logger.info("download_text_complete", size_mb=round(len(content) / 1024 / 1024, 2)) @@ -163,6 +164,7 @@ def fetch_ortholog_mapping(gene_ids: list[str]) -> pl.DataFrame: io.BytesIO(zebrafish_data), separator="\t", null_values=["", "NA"], + infer_schema_length=10000, ) logger.info("hcop_zebrafish_columns", columns=zebrafish_df.columns) @@ -254,12 +256,13 @@ def fetch_mgi_phenotypes(mouse_gene_symbols: list[str]) -> pl.DataFrame: if lines[0].startswith("#"): lines = lines[1:] - # Read as DataFrame + # Read as DataFrame (raise infer_schema_length so sparsely-populated columns infer correctly) df = pl.read_csv( io.StringIO("\n".join(lines)), separator="\t", null_values=["", "NA"], has_header=True, + infer_schema_length=10000, ) logger.info("mgi_raw_columns", columns=df.columns) diff --git a/src/usher_pipeline/evidence/expression/fetch.py b/src/usher_pipeline/evidence/expression/fetch.py index a96bc7b..20deaef 100644 --- a/src/usher_pipeline/evidence/expression/fetch.py +++ b/src/usher_pipeline/evidence/expression/fetch.py @@ -293,12 +293,11 @@ def fetch_hpa_expression( ) ) - # Pivot: rows=genes, columns=tissues - # Create separate columns for each target tissue - lf_wide = lf.pivot( + # Pivot: rows=genes, columns=tissues (collect first — DataFrame.pivot is simpler) 
+ df_wide = lf.collect().pivot( + on="Tissue", values="expression_value", index="Gene name", - columns="Tissue", ) # Rename columns to match our schema @@ -308,14 +307,14 @@ def fetch_hpa_expression( rename_map[hpa_tissue] = f"hpa_{our_key}_tpm" if rename_map: - lf_wide = lf_wide.rename(rename_map) + df_wide = df_wide.rename(rename_map) # Rename "Gene name" to "gene_symbol" - lf_wide = lf_wide.rename({"Gene name": "gene_symbol"}) + df_wide = df_wide.rename({"Gene name": "gene_symbol"}) logger.info("hpa_parse_complete", tissues=list(target_tissue_names.keys())) - return lf_wide + return df_wide.lazy() def fetch_gtex_expression( @@ -372,27 +371,29 @@ def fetch_gtex_expression( # Select gene ID column + target tissue columns # GTEx uses "Name" for gene ID (ENSG...) and "Description" for gene symbol + available_cols = lf.collect_schema().names() + select_cols = ["Name"] rename_map = {"Name": "gene_id"} + missing_tissues = [] for our_key, gtex_tissue in target_tissue_cols.items(): - if gtex_tissue: - # Check if tissue column exists (not all GTEx versions have all tissues) + if gtex_tissue and gtex_tissue in available_cols: select_cols.append(gtex_tissue) rename_map[gtex_tissue] = f"gtex_{our_key}_tpm" + elif gtex_tissue: + missing_tissues.append(gtex_tissue) - # Try to select columns; if tissue missing, it will be NULL - # Use select with error handling for missing columns - try: - lf = lf.select(select_cols).rename(rename_map) - except Exception as e: - logger.warning("gtex_tissue_missing", error=str(e)) - # Fallback: select available columns - available_cols = lf.columns - select_available = [col for col in select_cols if col in available_cols] - lf = lf.select(select_available).rename({ - k: v for k, v in rename_map.items() if k in select_available - }) + if missing_tissues: + logger.warning("gtex_tissues_not_available", missing=missing_tissues) + + lf = lf.select(select_cols).rename(rename_map) + + # Add NULL columns for missing tissues + for our_key, gtex_tissue in 
target_tissue_cols.items(): + col_name = f"gtex_{our_key}_tpm" + if gtex_tissue and gtex_tissue not in available_cols: + lf = lf.with_columns(pl.lit(None).cast(pl.Float64).alias(col_name)) # Filter to requested gene_ids if provided if gene_ids: diff --git a/src/usher_pipeline/evidence/expression/load.py b/src/usher_pipeline/evidence/expression/load.py index 05696e0..f176a01 100644 --- a/src/usher_pipeline/evidence/expression/load.py +++ b/src/usher_pipeline/evidence/expression/load.py @@ -31,17 +31,20 @@ def load_to_duckdb( logger.info("expression_load_start", row_count=len(df)) # Calculate summary statistics for provenance - # Genes with retina expression (any source) + # Genes with retina expression (any source) — check column existence + retina_filter_parts = [] + for col in ["hpa_retina_tpm", "gtex_retina_tpm", "cellxgene_photoreceptor_expr"]: + if col in df.columns: + retina_filter_parts.append(pl.col(col).is_not_null()) retina_expr_count = df.filter( - pl.col("hpa_retina_tpm").is_not_null() | - pl.col("gtex_retina_tpm").is_not_null() | - pl.col("cellxgene_photoreceptor_expr").is_not_null() + pl.any_horizontal(retina_filter_parts) if retina_filter_parts else pl.lit(False) ).height # Genes with inner ear expression (primarily CellxGene) - inner_ear_expr_count = df.filter( - pl.col("cellxgene_hair_cell_expr").is_not_null() - ).height + inner_ear_expr_count = ( + df.filter(pl.col("cellxgene_hair_cell_expr").is_not_null()).height + if "cellxgene_hair_cell_expr" in df.columns else 0 + ) # Mean Tau specificity (excluding NULLs) mean_tau = df.select(pl.col("tau_specificity").mean()).item() diff --git a/src/usher_pipeline/evidence/expression/models.py b/src/usher_pipeline/evidence/expression/models.py index 41ab3ae..9954dee 100644 --- a/src/usher_pipeline/evidence/expression/models.py +++ b/src/usher_pipeline/evidence/expression/models.py @@ -4,12 +4,12 @@ from pydantic import BaseModel # HPA normal tissue data download URL (bulk TSV, more efficient than per-gene API) 
HPA_NORMAL_TISSUE_URL = ( - "https://www.proteinatlas.org/download/normal_tissue.tsv.zip" + "https://v23.proteinatlas.org/download/normal_tissue.tsv.zip" ) # GTEx v10 median gene expression bulk data GTEX_MEDIAN_EXPRESSION_URL = ( - "https://storage.googleapis.com/adult-gtex/bulk-gex/v10/median-tpm/" + "https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/" "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz" ) diff --git a/src/usher_pipeline/evidence/expression/transform.py b/src/usher_pipeline/evidence/expression/transform.py index 5b39f2a..23c3551 100644 --- a/src/usher_pipeline/evidence/expression/transform.py +++ b/src/usher_pipeline/evidence/expression/transform.py @@ -262,12 +262,32 @@ def process_expression_evidence( # Compute expression score df = compute_expression_score(df) + # Ensure all expected columns exist (NULL if source unavailable) + expected_cols = { + "hpa_retina_tpm": pl.Float64, + "hpa_cerebellum_tpm": pl.Float64, + "hpa_testis_tpm": pl.Float64, + "hpa_fallopian_tube_tpm": pl.Float64, + "gtex_retina_tpm": pl.Float64, + "gtex_cerebellum_tpm": pl.Float64, + "gtex_testis_tpm": pl.Float64, + "gtex_fallopian_tube_tpm": pl.Float64, + "cellxgene_photoreceptor_expr": pl.Float64, + "cellxgene_hair_cell_expr": pl.Float64, + "tau_specificity": pl.Float64, + "usher_tissue_enrichment": pl.Float64, + "expression_score_normalized": pl.Float64, + } + for col_name, dtype in expected_cols.items(): + if col_name not in df.columns: + df = df.with_columns(pl.lit(None).cast(dtype).alias(col_name)) + logger.info( "expression_pipeline_complete", row_count=len(df), - has_hpa=any("hpa_" in col for col in df.columns), - has_gtex=any("gtex_" in col for col in df.columns), - has_cellxgene=any("cellxgene_" in col for col in df.columns), + has_hpa=any("hpa_" in col and df[col].null_count() < len(df) for col in df.columns if "hpa_" in col), + has_gtex=any("gtex_" in col and df[col].null_count() < len(df) for col in df.columns if "gtex_" in col), + 
has_cellxgene=any("cellxgene_" in col and df[col].null_count() < len(df) for col in df.columns if "cellxgene_" in col), ) return df diff --git a/src/usher_pipeline/evidence/gnomad/transform.py b/src/usher_pipeline/evidence/gnomad/transform.py index 19e7fd2..1e8fa59 100644 --- a/src/usher_pipeline/evidence/gnomad/transform.py +++ b/src/usher_pipeline/evidence/gnomad/transform.py @@ -27,38 +27,58 @@ def filter_by_coverage( Returns: LazyFrame with quality_flag column added: - - "measured": Good coverage AND has LOEUF estimate + - "measured": Has LOEUF estimate (and good coverage if available) - "incomplete_coverage": Coverage below thresholds - "no_data": Both LOEUF and pLI are NULL """ - # Ensure numeric columns are properly cast to float (handles edge cases with mixed types) - lf = lf.with_columns([ - pl.col("mean_depth").cast(pl.Float64, strict=False), - pl.col("cds_covered_pct").cast(pl.Float64, strict=False), + # Check which columns are available (gnomAD v4.1 lacks coverage columns) + available_cols = lf.collect_schema().names() + has_coverage = "mean_depth" in available_cols and "cds_covered_pct" in available_cols + + # Ensure numeric columns are properly cast + cast_exprs = [ pl.col("loeuf").cast(pl.Float64, strict=False), pl.col("pli").cast(pl.Float64, strict=False), - ]) + ] + if has_coverage: + cast_exprs.extend([ + pl.col("mean_depth").cast(pl.Float64, strict=False), + pl.col("cds_covered_pct").cast(pl.Float64, strict=False), + ]) + lf = lf.with_columns(cast_exprs) - return lf.with_columns( - pl.when( - pl.col("mean_depth").is_not_null() - & pl.col("cds_covered_pct").is_not_null() - & (pl.col("mean_depth") >= min_depth) - & (pl.col("cds_covered_pct") >= min_cds_pct) - & pl.col("loeuf").is_not_null() + if has_coverage: + return lf.with_columns( + pl.when( + pl.col("mean_depth").is_not_null() + & pl.col("cds_covered_pct").is_not_null() + & (pl.col("mean_depth") >= min_depth) + & (pl.col("cds_covered_pct") >= min_cds_pct) + & pl.col("loeuf").is_not_null() + ) 
+ .then(pl.lit("measured")) + .when(pl.col("loeuf").is_null() & pl.col("pli").is_null()) + .then(pl.lit("no_data")) + .when( + pl.col("mean_depth").is_not_null() + & pl.col("cds_covered_pct").is_not_null() + & ((pl.col("mean_depth") < min_depth) | (pl.col("cds_covered_pct") < min_cds_pct)) + ) + .then(pl.lit("incomplete_coverage")) + .otherwise(pl.lit("incomplete_coverage")) + .alias("quality_flag") ) - .then(pl.lit("measured")) - .when(pl.col("loeuf").is_null() & pl.col("pli").is_null()) - .then(pl.lit("no_data")) - .when( - pl.col("mean_depth").is_not_null() - & pl.col("cds_covered_pct").is_not_null() - & ((pl.col("mean_depth") < min_depth) | (pl.col("cds_covered_pct") < min_cds_pct)) + else: + # No coverage columns (gnomAD v4.1) — classify by presence of constraint data + logger.info("gnomad_no_coverage_columns", msg="Using constraint-only quality flags") + return lf.with_columns( + pl.when(pl.col("loeuf").is_not_null()) + .then(pl.lit("measured")) + .when(pl.col("loeuf").is_null() & pl.col("pli").is_null()) + .then(pl.lit("no_data")) + .otherwise(pl.lit("incomplete_coverage")) + .alias("quality_flag") ) - .then(pl.lit("incomplete_coverage")) - .otherwise(pl.lit("incomplete_coverage")) - .alias("quality_flag") - ) def normalize_scores(lf: pl.LazyFrame) -> pl.LazyFrame: diff --git a/src/usher_pipeline/evidence/localization/models.py b/src/usher_pipeline/evidence/localization/models.py index 66c540d..be3f1fe 100644 --- a/src/usher_pipeline/evidence/localization/models.py +++ b/src/usher_pipeline/evidence/localization/models.py @@ -8,7 +8,7 @@ from pydantic import BaseModel, Field LOCALIZATION_TABLE_NAME = "subcellular_localization" # HPA subcellular data URL (bulk download) -HPA_SUBCELLULAR_URL = "https://www.proteinatlas.org/download/subcellular_location.tsv.zip" +HPA_SUBCELLULAR_URL = "https://v23.proteinatlas.org/download/subcellular_location.tsv.zip" # Compartment definitions for scoring CILIA_COMPARTMENTS = [ diff --git 
a/src/usher_pipeline/gene_mapping/universe.py b/src/usher_pipeline/gene_mapping/universe.py index 1d21a9c..d784fee 100644 --- a/src/usher_pipeline/gene_mapping/universe.py +++ b/src/usher_pipeline/gene_mapping/universe.py @@ -16,7 +16,7 @@ logger = logging.getLogger(__name__) # Expected range for human protein-coding genes MIN_EXPECTED_GENES = 19000 -MAX_EXPECTED_GENES = 22000 +MAX_EXPECTED_GENES = 23000 def fetch_protein_coding_genes(ensembl_release: int = 113) -> GeneUniverse: @@ -51,12 +51,12 @@ def fetch_protein_coding_genes(ensembl_release: int = 113) -> GeneUniverse: # Query for human protein-coding genes logger.info("Querying mygene for type_of_gene:protein-coding (species=9606)") - results = mg.query( + results = list(mg.query( 'type_of_gene:"protein-coding"', species=9606, fields='ensembl.gene,symbol,name', fetch_all=True, - ) + )) logger.info(f"Retrieved {len(results)} results from mygene") diff --git a/src/usher_pipeline/gene_mapping/validator.py b/src/usher_pipeline/gene_mapping/validator.py index 644f3e2..fee460f 100644 --- a/src/usher_pipeline/gene_mapping/validator.py +++ b/src/usher_pipeline/gene_mapping/validator.py @@ -171,7 +171,7 @@ def validate_gene_universe(genes: list[str]) -> ValidationResult: gene_count = len(genes) MIN_GENES = 19000 - MAX_GENES = 22000 + MAX_GENES = 23000 # Check gene count if gene_count < MIN_GENES: