32 lines
1.2 KiB
Python
32 lines
1.2 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
from pathlib import Path
|
|
from typing import Dict, Iterable, List, Set
|
|
|
|
|
|
def merge_mappings(inputs: Iterable[Path], output: Path, version: str = "merged", sources: List[str] | None = None) -> Path:
    """Merge multiple phenotype-to-gene mapping JSON files into one.

    Each input file is expected to look like::

        {"phenotype_to_genes": {"HP:xxxx": ["GENE1", ...]},
         "version": "...", "source": "..."}

    Gene lists for the same phenotype ID are unioned across files and
    written out sorted. Phenotype IDs keep first-seen order.

    Args:
        inputs: Paths of the mapping files to merge. Any iterable is
            accepted, including a one-shot generator.
        output: Destination file; missing parent directories are created.
        version: Value stored under the ``"version"`` key of the result.
        sources: Optional labels placed before the per-file source labels.
            The caller's list is not modified.

    Returns:
        The path the merged mapping was written to.
    """
    # Materialise once: `inputs` is walked twice (merge loop + metadata),
    # which would silently yield an empty "merged_from" for a generator.
    input_paths = [Path(p) for p in inputs]
    output = Path(output)

    # Copy so that appending labels never mutates the caller's list.
    source_list: List[str] = list(sources) if sources else []
    merged: Dict[str, Set[str]] = {}

    for path in input_paths:
        data = json.loads(path.read_text(encoding="utf-8"))
        # `or {}` also covers an explicit null for the mapping key.
        phenos = data.get("phenotype_to_genes") or {}
        for pid, genes in phenos.items():
            merged.setdefault(pid, set()).update(genes)
        # Fall back to the file name when the file declares no source.
        source_list.append(data.get("source") or path.name)

    out = {
        "version": version,
        "source": ",".join(source_list),
        "phenotype_to_genes": {pid: sorted(genes) for pid, genes in merged.items()},
        "metadata": {"merged_from": [str(p) for p in input_paths]},
    }
    output.parent.mkdir(parents=True, exist_ok=True)
    output.write_text(json.dumps(out, indent=2), encoding="utf-8")
    return output
|