fix: resolve runtime bugs for pipeline execution on Python 3.14 + latest deps

- gene_mapping: wrap mygene fetch_all generator in list() to fix len() error
- gene_mapping: raise MAX_EXPECTED_GENES to 23000 (mygene DB growth)
- setup_cmd: rename gene_universe columns to gene_id/gene_symbol for
  consistency with all downstream evidence layer code
- gnomad: handle missing coverage columns in v4.1 constraint TSV
- expression: fix HPA URL (v23.proteinatlas.org) and GTEx URL (v8 path)
- expression: fix Polars pivot() API change (columns -> on); collect before pivoting
- expression: handle missing GTEx tissues (Eye - Retina not in v8)
- expression: ensure all expected columns exist even when sources unavailable
- expression/load: safely check column existence before filtering
- localization: fix HPA subcellular URL to v23
- animal_models: call response.read() on httpx streamed responses before accessing .text
- animal_models: increase infer_schema_length for HCOP and MGI TSV parsing

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-13 03:44:01 +08:00
parent a2ef2125ba
commit 6605ff0f2b
10 changed files with 112 additions and 65 deletions

View File

@@ -16,7 +16,7 @@ logger = logging.getLogger(__name__)
# Expected range for human protein-coding genes
MIN_EXPECTED_GENES = 19000
MAX_EXPECTED_GENES = 22000
MAX_EXPECTED_GENES = 23000
def fetch_protein_coding_genes(ensembl_release: int = 113) -> GeneUniverse:
@@ -51,12 +51,12 @@ def fetch_protein_coding_genes(ensembl_release: int = 113) -> GeneUniverse:
# Query for human protein-coding genes
logger.info("Querying mygene for type_of_gene:protein-coding (species=9606)")
results = mg.query(
results = list(mg.query(
'type_of_gene:"protein-coding"',
species=9606,
fields='ensembl.gene,symbol,name',
fetch_all=True,
)
))
logger.info(f"Retrieved {len(results)} results from mygene")

View File

@@ -171,7 +171,7 @@ def validate_gene_universe(genes: list[str]) -> ValidationResult:
gene_count = len(genes)
MIN_GENES = 19000
MAX_GENES = 22000
MAX_GENES = 23000
# Check gene count
if gene_count < MIN_GENES: