fix: resolve runtime bugs for pipeline execution on Python 3.14 + latest deps

- gene_mapping: wrap mygene fetch_all generator in list() to fix len() error
- gene_mapping: raise MAX_EXPECTED_GENES to 23000 (mygene DB growth)
- setup_cmd: rename gene_universe columns to gene_id/gene_symbol for
  consistency with all downstream evidence layer code
- gnomad: handle missing coverage columns in v4.1 constraint TSV
- expression: fix HPA URL (v23.proteinatlas.org) and GTEx URL (v8 path)
- expression: fix Polars pivot() API change (columns -> on), collect first
- expression: handle missing GTEx tissues (Eye - Retina not in v8)
- expression: ensure all expected columns exist even when sources unavailable
- expression/load: safely check column existence before filtering
- localization: fix HPA subcellular URL to v23
- animal_models: fix httpx stream response.read() before .text access
- animal_models: increase infer_schema_length for HCOP and MGI TSV parsing

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-13 03:44:01 +08:00
parent a2ef2125ba
commit 6605ff0f2b
10 changed files with 112 additions and 65 deletions

View File

@@ -189,8 +189,8 @@ def setup(ctx, force):
# Create DataFrame with mapping results
df = pl.DataFrame({
'ensembl_id': [r.ensembl_id for r in mapping_results],
'hgnc_symbol': [r.hgnc_symbol for r in mapping_results],
'gene_id': [r.ensembl_id for r in mapping_results],
'gene_symbol': [r.hgnc_symbol for r in mapping_results],
'uniprot_accession': [r.uniprot_accession for r in mapping_results],
})

View File

@@ -86,6 +86,7 @@ def _download_text(url: str) -> str:
with httpx.stream("GET", url, timeout=120.0, follow_redirects=True) as response:
response.raise_for_status()
response.read()
content = response.text
logger.info("download_text_complete", size_mb=round(len(content) / 1024 / 1024, 2))
@@ -163,6 +164,7 @@ def fetch_ortholog_mapping(gene_ids: list[str]) -> pl.DataFrame:
io.BytesIO(zebrafish_data),
separator="\t",
null_values=["", "NA"],
infer_schema_length=10000,
)
logger.info("hcop_zebrafish_columns", columns=zebrafish_df.columns)
@@ -254,12 +256,13 @@ def fetch_mgi_phenotypes(mouse_gene_symbols: list[str]) -> pl.DataFrame:
if lines[0].startswith("#"):
lines = lines[1:]
# Read as DataFrame
# Read as DataFrame (larger infer_schema_length avoids type inference failures on late-row values)
df = pl.read_csv(
io.StringIO("\n".join(lines)),
separator="\t",
null_values=["", "NA"],
has_header=True,
infer_schema_length=10000,
)
logger.info("mgi_raw_columns", columns=df.columns)

View File

@@ -293,12 +293,11 @@ def fetch_hpa_expression(
)
)
# Pivot: rows=genes, columns=tissues
# Create separate columns for each target tissue
lf_wide = lf.pivot(
# Pivot: rows=genes, columns=tissues (collect first — DataFrame.pivot is simpler)
df_wide = lf.collect().pivot(
on="Tissue",
values="expression_value",
index="Gene name",
columns="Tissue",
)
# Rename columns to match our schema
@@ -308,14 +307,14 @@ def fetch_hpa_expression(
rename_map[hpa_tissue] = f"hpa_{our_key}_tpm"
if rename_map:
lf_wide = lf_wide.rename(rename_map)
df_wide = df_wide.rename(rename_map)
# Rename "Gene name" to "gene_symbol"
lf_wide = lf_wide.rename({"Gene name": "gene_symbol"})
df_wide = df_wide.rename({"Gene name": "gene_symbol"})
logger.info("hpa_parse_complete", tissues=list(target_tissue_names.keys()))
return lf_wide
return df_wide.lazy()
def fetch_gtex_expression(
@@ -372,27 +371,29 @@ def fetch_gtex_expression(
# Select gene ID column + target tissue columns
# GTEx uses "Name" for gene ID (ENSG...) and "Description" for gene symbol
available_cols = lf.collect_schema().names()
select_cols = ["Name"]
rename_map = {"Name": "gene_id"}
missing_tissues = []
for our_key, gtex_tissue in target_tissue_cols.items():
if gtex_tissue:
# Check if tissue column exists (not all GTEx versions have all tissues)
if gtex_tissue and gtex_tissue in available_cols:
select_cols.append(gtex_tissue)
rename_map[gtex_tissue] = f"gtex_{our_key}_tpm"
elif gtex_tissue:
missing_tissues.append(gtex_tissue)
if missing_tissues:
logger.warning("gtex_tissues_not_available", missing=missing_tissues)
# Try to select columns; if tissue missing, it will be NULL
# Use select with error handling for missing columns
try:
lf = lf.select(select_cols).rename(rename_map)
except Exception as e:
logger.warning("gtex_tissue_missing", error=str(e))
# Fallback: select available columns
available_cols = lf.columns
select_available = [col for col in select_cols if col in available_cols]
lf = lf.select(select_available).rename({
k: v for k, v in rename_map.items() if k in select_available
})
# Add NULL columns for missing tissues
for our_key, gtex_tissue in target_tissue_cols.items():
col_name = f"gtex_{our_key}_tpm"
if gtex_tissue and gtex_tissue not in available_cols:
lf = lf.with_columns(pl.lit(None).cast(pl.Float64).alias(col_name))
# Filter to requested gene_ids if provided
if gene_ids:

View File

@@ -31,17 +31,20 @@ def load_to_duckdb(
logger.info("expression_load_start", row_count=len(df))
# Calculate summary statistics for provenance
# Genes with retina expression (any source)
# Genes with retina expression (any source) — check column existence
retina_filter_parts = []
for col in ["hpa_retina_tpm", "gtex_retina_tpm", "cellxgene_photoreceptor_expr"]:
if col in df.columns:
retina_filter_parts.append(pl.col(col).is_not_null())
retina_expr_count = df.filter(
pl.col("hpa_retina_tpm").is_not_null() |
pl.col("gtex_retina_tpm").is_not_null() |
pl.col("cellxgene_photoreceptor_expr").is_not_null()
pl.any_horizontal(retina_filter_parts) if retina_filter_parts else pl.lit(False)
).height
# Genes with inner ear expression (primarily CellxGene)
inner_ear_expr_count = df.filter(
pl.col("cellxgene_hair_cell_expr").is_not_null()
).height
inner_ear_expr_count = (
df.filter(pl.col("cellxgene_hair_cell_expr").is_not_null()).height
if "cellxgene_hair_cell_expr" in df.columns else 0
)
# Mean Tau specificity (excluding NULLs)
mean_tau = df.select(pl.col("tau_specificity").mean()).item()

View File

@@ -4,12 +4,12 @@ from pydantic import BaseModel
# HPA normal tissue data download URL (bulk TSV, more efficient than per-gene API)
HPA_NORMAL_TISSUE_URL = (
"https://www.proteinatlas.org/download/normal_tissue.tsv.zip"
"https://v23.proteinatlas.org/download/normal_tissue.tsv.zip"
)
# GTEx v8 median gene expression bulk data (v10 path unavailable; reverted to v8 release)
GTEX_MEDIAN_EXPRESSION_URL = (
"https://storage.googleapis.com/adult-gtex/bulk-gex/v10/median-tpm/"
"https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/"
"GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz"
)

View File

@@ -262,12 +262,32 @@ def process_expression_evidence(
# Compute expression score
df = compute_expression_score(df)
# Ensure all expected columns exist (NULL if source unavailable)
expected_cols = {
"hpa_retina_tpm": pl.Float64,
"hpa_cerebellum_tpm": pl.Float64,
"hpa_testis_tpm": pl.Float64,
"hpa_fallopian_tube_tpm": pl.Float64,
"gtex_retina_tpm": pl.Float64,
"gtex_cerebellum_tpm": pl.Float64,
"gtex_testis_tpm": pl.Float64,
"gtex_fallopian_tube_tpm": pl.Float64,
"cellxgene_photoreceptor_expr": pl.Float64,
"cellxgene_hair_cell_expr": pl.Float64,
"tau_specificity": pl.Float64,
"usher_tissue_enrichment": pl.Float64,
"expression_score_normalized": pl.Float64,
}
for col_name, dtype in expected_cols.items():
if col_name not in df.columns:
df = df.with_columns(pl.lit(None).cast(dtype).alias(col_name))
logger.info(
"expression_pipeline_complete",
row_count=len(df),
has_hpa=any("hpa_" in col for col in df.columns),
has_gtex=any("gtex_" in col for col in df.columns),
has_cellxgene=any("cellxgene_" in col for col in df.columns),
has_hpa=any("hpa_" in col and df[col].null_count() < len(df) for col in df.columns if "hpa_" in col),
has_gtex=any("gtex_" in col and df[col].null_count() < len(df) for col in df.columns if "gtex_" in col),
has_cellxgene=any("cellxgene_" in col and df[col].null_count() < len(df) for col in df.columns if "cellxgene_" in col),
)
return df

View File

@@ -27,18 +27,27 @@ def filter_by_coverage(
Returns:
LazyFrame with quality_flag column added:
- "measured": Good coverage AND has LOEUF estimate
- "measured": Has LOEUF estimate (and good coverage if available)
- "incomplete_coverage": Coverage below thresholds
- "no_data": Both LOEUF and pLI are NULL
"""
# Ensure numeric columns are properly cast to float (handles edge cases with mixed types)
lf = lf.with_columns([
pl.col("mean_depth").cast(pl.Float64, strict=False),
pl.col("cds_covered_pct").cast(pl.Float64, strict=False),
# Check which columns are available (gnomAD v4.1 lacks coverage columns)
available_cols = lf.collect_schema().names()
has_coverage = "mean_depth" in available_cols and "cds_covered_pct" in available_cols
# Ensure numeric columns are properly cast
cast_exprs = [
pl.col("loeuf").cast(pl.Float64, strict=False),
pl.col("pli").cast(pl.Float64, strict=False),
]
if has_coverage:
cast_exprs.extend([
pl.col("mean_depth").cast(pl.Float64, strict=False),
pl.col("cds_covered_pct").cast(pl.Float64, strict=False),
])
lf = lf.with_columns(cast_exprs)
if has_coverage:
return lf.with_columns(
pl.when(
pl.col("mean_depth").is_not_null()
@@ -59,6 +68,17 @@ def filter_by_coverage(
.otherwise(pl.lit("incomplete_coverage"))
.alias("quality_flag")
)
else:
# No coverage columns (gnomAD v4.1) — classify by presence of constraint data
logger.info("gnomad_no_coverage_columns", msg="Using constraint-only quality flags")
return lf.with_columns(
pl.when(pl.col("loeuf").is_not_null())
.then(pl.lit("measured"))
.when(pl.col("loeuf").is_null() & pl.col("pli").is_null())
.then(pl.lit("no_data"))
.otherwise(pl.lit("incomplete_coverage"))
.alias("quality_flag")
)
def normalize_scores(lf: pl.LazyFrame) -> pl.LazyFrame:

View File

@@ -8,7 +8,7 @@ from pydantic import BaseModel, Field
LOCALIZATION_TABLE_NAME = "subcellular_localization"
# HPA subcellular data URL (bulk download)
HPA_SUBCELLULAR_URL = "https://www.proteinatlas.org/download/subcellular_location.tsv.zip"
HPA_SUBCELLULAR_URL = "https://v23.proteinatlas.org/download/subcellular_location.tsv.zip"
# Compartment definitions for scoring
CILIA_COMPARTMENTS = [

View File

@@ -16,7 +16,7 @@ logger = logging.getLogger(__name__)
# Expected range for human protein-coding genes
MIN_EXPECTED_GENES = 19000
MAX_EXPECTED_GENES = 22000
MAX_EXPECTED_GENES = 23000
def fetch_protein_coding_genes(ensembl_release: int = 113) -> GeneUniverse:
@@ -51,12 +51,12 @@ def fetch_protein_coding_genes(ensembl_release: int = 113) -> GeneUniverse:
# Query for human protein-coding genes
logger.info("Querying mygene for type_of_gene:protein-coding (species=9606)")
results = mg.query(
results = list(mg.query(
'type_of_gene:"protein-coding"',
species=9606,
fields='ensembl.gene,symbol,name',
fetch_all=True,
)
))
logger.info(f"Retrieved {len(results)} results from mygene")

View File

@@ -171,7 +171,7 @@ def validate_gene_universe(genes: list[str]) -> ValidationResult:
gene_count = len(genes)
MIN_GENES = 19000
MAX_GENES = 22000
MAX_GENES = 23000
# Check gene count
if gene_count < MIN_GENES: