from __future__ import annotations

import json
from pathlib import Path
from typing import Dict, Iterable, List, Set


def merge_mappings(
    inputs: Iterable[Path],
    output: Path,
    version: str = "merged",
    sources: List[str] | None = None,
) -> Path:
    """Merge multiple phenotype→gene mapping JSON files into one.

    Input schema (per file)::

        {"phenotype_to_genes": {"HP:xxxx": ["GENE1", ...]},
         "version": "...", "source": "..."}

    Gene lists for the same phenotype ID are unioned across files and
    written out sorted. Phenotype IDs keep the order in which they were
    first encountered.

    Args:
        inputs: Paths of the mapping files to merge. Any iterable is
            accepted, including one-shot generators.
        output: Destination file; missing parent directories are created.
        version: Value stored under the ``"version"`` key of the result.
        sources: Optional labels placed ahead of the per-file source
            labels. The caller's list is never modified.

    Returns:
        The path the merged mapping was written to.

    Raises:
        OSError: If an input cannot be read or the output cannot be written.
        json.JSONDecodeError: If an input file is not valid JSON.
    """
    # Materialise once: ``inputs`` is walked twice (reading + metadata) and
    # a generator would be exhausted after the first pass.
    input_paths = list(inputs)
    output = Path(output)

    merged: Dict[str, Set[str]] = {}
    # Copy so that appending labels does not mutate the caller's list.
    source_list: List[str] = list(sources) if sources else []

    for raw_path in input_paths:
        path = Path(raw_path)
        data = json.loads(path.read_text(encoding="utf-8"))
        # ``or {}`` also covers an explicit ``"phenotype_to_genes": null``.
        phenos = data.get("phenotype_to_genes") or {}
        for pid, genes in phenos.items():
            merged.setdefault(pid, set()).update(genes)
        # Fall back to the file name when the file carries no source label.
        source_list.append(data.get("source") or path.name)

    out = {
        "version": version,
        "source": ",".join(source_list),
        "phenotype_to_genes": {
            pid: sorted(genes) for pid, genes in merged.items()
        },
        # Record the inputs exactly as the caller supplied them.
        "metadata": {"merged_from": [str(p) for p in input_paths]},
    }

    output.parent.mkdir(parents=True, exist_ok=True)
    output.write_text(json.dumps(out, indent=2), encoding="utf-8")
    return output