fix: resolve runtime bugs for pipeline execution on Python 3.14 + latest deps
- gene_mapping: wrap mygene fetch_all generator in list() to fix len() error
- gene_mapping: raise MAX_EXPECTED_GENES to 23000 (mygene DB growth)
- setup_cmd: rename gene_universe columns to gene_id/gene_symbol for consistency with all downstream evidence layer code
- gnomad: handle missing coverage columns in v4.1 constraint TSV
- expression: fix HPA URL (v23.proteinatlas.org) and GTEx URL (v8 path)
- expression: fix Polars pivot() API change (columns -> on), collect first
- expression: handle missing GTEx tissues (Eye - Retina not in v8)
- expression: ensure all expected columns exist even when sources unavailable
- expression/load: safely check column existence before filtering
- localization: fix HPA subcellular URL to v23
- animal_models: fix httpx stream response.read() before .text access
- animal_models: increase infer_schema_length for HCOP and MGI TSV parsing

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -189,8 +189,8 @@ def setup(ctx, force):
|
||||
|
||||
# Create DataFrame with mapping results
|
||||
df = pl.DataFrame({
|
||||
'ensembl_id': [r.ensembl_id for r in mapping_results],
|
||||
'hgnc_symbol': [r.hgnc_symbol for r in mapping_results],
|
||||
'gene_id': [r.ensembl_id for r in mapping_results],
|
||||
'gene_symbol': [r.hgnc_symbol for r in mapping_results],
|
||||
'uniprot_accession': [r.uniprot_accession for r in mapping_results],
|
||||
})
|
||||
|
||||
|
||||
@@ -86,6 +86,7 @@ def _download_text(url: str) -> str:
|
||||
|
||||
with httpx.stream("GET", url, timeout=120.0, follow_redirects=True) as response:
|
||||
response.raise_for_status()
|
||||
response.read()
|
||||
content = response.text
|
||||
|
||||
logger.info("download_text_complete", size_mb=round(len(content) / 1024 / 1024, 2))
|
||||
@@ -163,6 +164,7 @@ def fetch_ortholog_mapping(gene_ids: list[str]) -> pl.DataFrame:
|
||||
io.BytesIO(zebrafish_data),
|
||||
separator="\t",
|
||||
null_values=["", "NA"],
|
||||
infer_schema_length=10000,
|
||||
)
|
||||
|
||||
logger.info("hcop_zebrafish_columns", columns=zebrafish_df.columns)
|
||||
@@ -254,12 +256,13 @@ def fetch_mgi_phenotypes(mouse_gene_symbols: list[str]) -> pl.DataFrame:
|
||||
if lines[0].startswith("#"):
|
||||
lines = lines[1:]
|
||||
|
||||
# Read as DataFrame
|
||||
# Read as DataFrame (all columns as string to avoid type inference issues)
|
||||
df = pl.read_csv(
|
||||
io.StringIO("\n".join(lines)),
|
||||
separator="\t",
|
||||
null_values=["", "NA"],
|
||||
has_header=True,
|
||||
infer_schema_length=10000,
|
||||
)
|
||||
|
||||
logger.info("mgi_raw_columns", columns=df.columns)
|
||||
|
||||
@@ -293,12 +293,11 @@ def fetch_hpa_expression(
|
||||
)
|
||||
)
|
||||
|
||||
# Pivot: rows=genes, columns=tissues
|
||||
# Create separate columns for each target tissue
|
||||
lf_wide = lf.pivot(
|
||||
# Pivot: rows=genes, columns=tissues (collect first — DataFrame.pivot is simpler)
|
||||
df_wide = lf.collect().pivot(
|
||||
on="Tissue",
|
||||
values="expression_value",
|
||||
index="Gene name",
|
||||
columns="Tissue",
|
||||
)
|
||||
|
||||
# Rename columns to match our schema
|
||||
@@ -308,14 +307,14 @@ def fetch_hpa_expression(
|
||||
rename_map[hpa_tissue] = f"hpa_{our_key}_tpm"
|
||||
|
||||
if rename_map:
|
||||
lf_wide = lf_wide.rename(rename_map)
|
||||
df_wide = df_wide.rename(rename_map)
|
||||
|
||||
# Rename "Gene name" to "gene_symbol"
|
||||
lf_wide = lf_wide.rename({"Gene name": "gene_symbol"})
|
||||
df_wide = df_wide.rename({"Gene name": "gene_symbol"})
|
||||
|
||||
logger.info("hpa_parse_complete", tissues=list(target_tissue_names.keys()))
|
||||
|
||||
return lf_wide
|
||||
return df_wide.lazy()
|
||||
|
||||
|
||||
def fetch_gtex_expression(
|
||||
@@ -372,27 +371,29 @@ def fetch_gtex_expression(
|
||||
|
||||
# Select gene ID column + target tissue columns
|
||||
# GTEx uses "Name" for gene ID (ENSG...) and "Description" for gene symbol
|
||||
available_cols = lf.collect_schema().names()
|
||||
|
||||
select_cols = ["Name"]
|
||||
rename_map = {"Name": "gene_id"}
|
||||
missing_tissues = []
|
||||
|
||||
for our_key, gtex_tissue in target_tissue_cols.items():
|
||||
if gtex_tissue:
|
||||
# Check if tissue column exists (not all GTEx versions have all tissues)
|
||||
if gtex_tissue and gtex_tissue in available_cols:
|
||||
select_cols.append(gtex_tissue)
|
||||
rename_map[gtex_tissue] = f"gtex_{our_key}_tpm"
|
||||
elif gtex_tissue:
|
||||
missing_tissues.append(gtex_tissue)
|
||||
|
||||
if missing_tissues:
|
||||
logger.warning("gtex_tissues_not_available", missing=missing_tissues)
|
||||
|
||||
# Try to select columns; if tissue missing, it will be NULL
|
||||
# Use select with error handling for missing columns
|
||||
try:
|
||||
lf = lf.select(select_cols).rename(rename_map)
|
||||
except Exception as e:
|
||||
logger.warning("gtex_tissue_missing", error=str(e))
|
||||
# Fallback: select available columns
|
||||
available_cols = lf.columns
|
||||
select_available = [col for col in select_cols if col in available_cols]
|
||||
lf = lf.select(select_available).rename({
|
||||
k: v for k, v in rename_map.items() if k in select_available
|
||||
})
|
||||
|
||||
# Add NULL columns for missing tissues
|
||||
for our_key, gtex_tissue in target_tissue_cols.items():
|
||||
col_name = f"gtex_{our_key}_tpm"
|
||||
if gtex_tissue and gtex_tissue not in available_cols:
|
||||
lf = lf.with_columns(pl.lit(None).cast(pl.Float64).alias(col_name))
|
||||
|
||||
# Filter to requested gene_ids if provided
|
||||
if gene_ids:
|
||||
|
||||
@@ -31,17 +31,20 @@ def load_to_duckdb(
|
||||
logger.info("expression_load_start", row_count=len(df))
|
||||
|
||||
# Calculate summary statistics for provenance
|
||||
# Genes with retina expression (any source)
|
||||
# Genes with retina expression (any source) — check column existence
|
||||
retina_filter_parts = []
|
||||
for col in ["hpa_retina_tpm", "gtex_retina_tpm", "cellxgene_photoreceptor_expr"]:
|
||||
if col in df.columns:
|
||||
retina_filter_parts.append(pl.col(col).is_not_null())
|
||||
retina_expr_count = df.filter(
|
||||
pl.col("hpa_retina_tpm").is_not_null() |
|
||||
pl.col("gtex_retina_tpm").is_not_null() |
|
||||
pl.col("cellxgene_photoreceptor_expr").is_not_null()
|
||||
pl.any_horizontal(retina_filter_parts) if retina_filter_parts else pl.lit(False)
|
||||
).height
|
||||
|
||||
# Genes with inner ear expression (primarily CellxGene)
|
||||
inner_ear_expr_count = df.filter(
|
||||
pl.col("cellxgene_hair_cell_expr").is_not_null()
|
||||
).height
|
||||
inner_ear_expr_count = (
|
||||
df.filter(pl.col("cellxgene_hair_cell_expr").is_not_null()).height
|
||||
if "cellxgene_hair_cell_expr" in df.columns else 0
|
||||
)
|
||||
|
||||
# Mean Tau specificity (excluding NULLs)
|
||||
mean_tau = df.select(pl.col("tau_specificity").mean()).item()
|
||||
|
||||
@@ -4,12 +4,12 @@ from pydantic import BaseModel
|
||||
|
||||
# HPA normal tissue data download URL (bulk TSV, more efficient than per-gene API)
|
||||
HPA_NORMAL_TISSUE_URL = (
|
||||
"https://www.proteinatlas.org/download/normal_tissue.tsv.zip"
|
||||
"https://v23.proteinatlas.org/download/normal_tissue.tsv.zip"
|
||||
)
|
||||
|
||||
# GTEx v8 median gene expression bulk data
|
||||
GTEX_MEDIAN_EXPRESSION_URL = (
|
||||
"https://storage.googleapis.com/adult-gtex/bulk-gex/v10/median-tpm/"
|
||||
"https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/"
|
||||
"GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz"
|
||||
)
|
||||
|
||||
|
||||
@@ -262,12 +262,32 @@ def process_expression_evidence(
|
||||
# Compute expression score
|
||||
df = compute_expression_score(df)
|
||||
|
||||
# Ensure all expected columns exist (NULL if source unavailable)
|
||||
expected_cols = {
|
||||
"hpa_retina_tpm": pl.Float64,
|
||||
"hpa_cerebellum_tpm": pl.Float64,
|
||||
"hpa_testis_tpm": pl.Float64,
|
||||
"hpa_fallopian_tube_tpm": pl.Float64,
|
||||
"gtex_retina_tpm": pl.Float64,
|
||||
"gtex_cerebellum_tpm": pl.Float64,
|
||||
"gtex_testis_tpm": pl.Float64,
|
||||
"gtex_fallopian_tube_tpm": pl.Float64,
|
||||
"cellxgene_photoreceptor_expr": pl.Float64,
|
||||
"cellxgene_hair_cell_expr": pl.Float64,
|
||||
"tau_specificity": pl.Float64,
|
||||
"usher_tissue_enrichment": pl.Float64,
|
||||
"expression_score_normalized": pl.Float64,
|
||||
}
|
||||
for col_name, dtype in expected_cols.items():
|
||||
if col_name not in df.columns:
|
||||
df = df.with_columns(pl.lit(None).cast(dtype).alias(col_name))
|
||||
|
||||
logger.info(
|
||||
"expression_pipeline_complete",
|
||||
row_count=len(df),
|
||||
has_hpa=any("hpa_" in col for col in df.columns),
|
||||
has_gtex=any("gtex_" in col for col in df.columns),
|
||||
has_cellxgene=any("cellxgene_" in col for col in df.columns),
|
||||
has_hpa=any("hpa_" in col and df[col].null_count() < len(df) for col in df.columns if "hpa_" in col),
|
||||
has_gtex=any("gtex_" in col and df[col].null_count() < len(df) for col in df.columns if "gtex_" in col),
|
||||
has_cellxgene=any("cellxgene_" in col and df[col].null_count() < len(df) for col in df.columns if "cellxgene_" in col),
|
||||
)
|
||||
|
||||
return df
|
||||
|
||||
@@ -27,18 +27,27 @@ def filter_by_coverage(
|
||||
|
||||
Returns:
|
||||
LazyFrame with quality_flag column added:
|
||||
- "measured": Good coverage AND has LOEUF estimate
|
||||
- "measured": Has LOEUF estimate (and good coverage if available)
|
||||
- "incomplete_coverage": Coverage below thresholds
|
||||
- "no_data": Both LOEUF and pLI are NULL
|
||||
"""
|
||||
# Ensure numeric columns are properly cast to float (handles edge cases with mixed types)
|
||||
lf = lf.with_columns([
|
||||
pl.col("mean_depth").cast(pl.Float64, strict=False),
|
||||
pl.col("cds_covered_pct").cast(pl.Float64, strict=False),
|
||||
# Check which columns are available (gnomAD v4.1 lacks coverage columns)
|
||||
available_cols = lf.collect_schema().names()
|
||||
has_coverage = "mean_depth" in available_cols and "cds_covered_pct" in available_cols
|
||||
|
||||
# Ensure numeric columns are properly cast
|
||||
cast_exprs = [
|
||||
pl.col("loeuf").cast(pl.Float64, strict=False),
|
||||
pl.col("pli").cast(pl.Float64, strict=False),
|
||||
]
|
||||
if has_coverage:
|
||||
cast_exprs.extend([
|
||||
pl.col("mean_depth").cast(pl.Float64, strict=False),
|
||||
pl.col("cds_covered_pct").cast(pl.Float64, strict=False),
|
||||
])
|
||||
lf = lf.with_columns(cast_exprs)
|
||||
|
||||
if has_coverage:
|
||||
return lf.with_columns(
|
||||
pl.when(
|
||||
pl.col("mean_depth").is_not_null()
|
||||
@@ -59,6 +68,17 @@ def filter_by_coverage(
|
||||
.otherwise(pl.lit("incomplete_coverage"))
|
||||
.alias("quality_flag")
|
||||
)
|
||||
else:
|
||||
# No coverage columns (gnomAD v4.1) — classify by presence of constraint data
|
||||
logger.info("gnomad_no_coverage_columns", msg="Using constraint-only quality flags")
|
||||
return lf.with_columns(
|
||||
pl.when(pl.col("loeuf").is_not_null())
|
||||
.then(pl.lit("measured"))
|
||||
.when(pl.col("loeuf").is_null() & pl.col("pli").is_null())
|
||||
.then(pl.lit("no_data"))
|
||||
.otherwise(pl.lit("incomplete_coverage"))
|
||||
.alias("quality_flag")
|
||||
)
|
||||
|
||||
|
||||
def normalize_scores(lf: pl.LazyFrame) -> pl.LazyFrame:
|
||||
|
||||
@@ -8,7 +8,7 @@ from pydantic import BaseModel, Field
|
||||
LOCALIZATION_TABLE_NAME = "subcellular_localization"
|
||||
|
||||
# HPA subcellular data URL (bulk download)
|
||||
HPA_SUBCELLULAR_URL = "https://www.proteinatlas.org/download/subcellular_location.tsv.zip"
|
||||
HPA_SUBCELLULAR_URL = "https://v23.proteinatlas.org/download/subcellular_location.tsv.zip"
|
||||
|
||||
# Compartment definitions for scoring
|
||||
CILIA_COMPARTMENTS = [
|
||||
|
||||
@@ -16,7 +16,7 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
# Expected range for human protein-coding genes
|
||||
MIN_EXPECTED_GENES = 19000
|
||||
MAX_EXPECTED_GENES = 22000
|
||||
MAX_EXPECTED_GENES = 23000
|
||||
|
||||
|
||||
def fetch_protein_coding_genes(ensembl_release: int = 113) -> GeneUniverse:
|
||||
@@ -51,12 +51,12 @@ def fetch_protein_coding_genes(ensembl_release: int = 113) -> GeneUniverse:
|
||||
|
||||
# Query for human protein-coding genes
|
||||
logger.info("Querying mygene for type_of_gene:protein-coding (species=9606)")
|
||||
results = mg.query(
|
||||
results = list(mg.query(
|
||||
'type_of_gene:"protein-coding"',
|
||||
species=9606,
|
||||
fields='ensembl.gene,symbol,name',
|
||||
fetch_all=True,
|
||||
)
|
||||
))
|
||||
|
||||
logger.info(f"Retrieved {len(results)} results from mygene")
|
||||
|
||||
|
||||
@@ -171,7 +171,7 @@ def validate_gene_universe(genes: list[str]) -> ValidationResult:
|
||||
|
||||
gene_count = len(genes)
|
||||
MIN_GENES = 19000
|
||||
MAX_GENES = 22000
|
||||
MAX_GENES = 23000
|
||||
|
||||
# Check gene count
|
||||
if gene_count < MIN_GENES:
|
||||
|
||||
Reference in New Issue
Block a user