#!/usr/bin/env python3 """ 職業資料抓取腳本 從 Wikidata SPARQL 和 ConceptNet API 抓取職業資料, 儲存為本地 JSON 檔案供應用程式使用。 使用方式: cd backend python scripts/fetch_occupations.py """ import json import sys from datetime import datetime, timezone from pathlib import Path from typing import List import httpx # 輸出目錄 DATA_DIR = Path(__file__).parent.parent / "app" / "data" def fetch_wikidata_occupations(language: str) -> List[dict]: """ 從 Wikidata SPARQL 端點抓取所有職業(使用分頁) Args: language: 語言代碼 (zh/en) Returns: 職業列表 [{"name": "...", "domain": "..."}, ...] """ print(f"[Wikidata] 正在抓取 {language} 職業資料(分頁模式)...") endpoint = "https://query.wikidata.org/sparql" page_size = 500 # 每頁筆數 all_bindings = [] offset = 0 try: with httpx.Client(timeout=120.0) as client: while True: # SPARQL 查詢 - 使用 SERVICE wikibase:label (更高效) query = f""" SELECT DISTINCT ?occupation ?occupationLabel ?fieldLabel WHERE {{ ?occupation wdt:P31 wd:Q28640. OPTIONAL {{ ?occupation wdt:P425 ?field. }} SERVICE wikibase:label {{ bd:serviceParam wikibase:language "{language},en". }} }} LIMIT {page_size} OFFSET {offset} """ print(f"[Wikidata] 抓取第 {offset // page_size + 1} 頁 (offset={offset})...") response = client.get( endpoint, params={"query": query, "format": "json"}, headers={ "Accept": "application/sparql-results+json", "User-Agent": "NoveltySeeking/1.0", }, ) response.raise_for_status() data = response.json() bindings = data.get("results", {}).get("bindings", []) print(f"[Wikidata] 取得 {len(bindings)} 筆") if not bindings: # 沒有更多資料了 break all_bindings.extend(bindings) offset += page_size # 如果取得的筆數少於 page_size,表示已經是最後一頁 if len(bindings) < page_size: break print(f"[Wikidata] 總共取得 {len(all_bindings)} 筆原始資料") # 解析回應 occupations = [] for item in all_bindings: name = item.get("occupationLabel", {}).get("value", "") field = item.get("fieldLabel", {}).get("value", "") if name and len(name) >= 2: occupations.append({ "name": name, "domain": field if field else infer_domain(name, language), }) # 去重 seen = set() unique = [] for occ in occupations: if occ["name"] not in seen: seen.add(occ["name"]) unique.append(occ) print(f"[Wikidata] 去重後: {len(unique)} 筆職業") return unique except Exception as e: print(f"[Wikidata] 錯誤: {e}") raise def fetch_conceptnet_occupations(language: str) -> List[dict]: """ 從 ConceptNet API 抓取職業相關概念(使用分頁) Args: language: 語言代碼 (zh/en) Returns: 職業列表 [{"name": "...", "domain": "..."}, ...] """ print(f"[ConceptNet] 正在抓取 {language} 職業資料(分頁模式)...") endpoint = "https://api.conceptnet.io" lang_code = language page_size = 100 # ConceptNet 建議的 limit # 起始概念 start_concepts = { "zh": ["/c/zh/職業", "/c/zh/專業", "/c/zh/工作", "/c/zh/職務"], "en": ["/c/en/occupation", "/c/en/profession", "/c/en/job", "/c/en/career"], } # 要查詢的關係類型 relations = ["/r/IsA", "/r/RelatedTo", "/r/HasA", "/r/AtLocation"] all_occupations = [] try: with httpx.Client(timeout=60.0) as client: for concept in start_concepts.get(lang_code, start_concepts["zh"]): for rel in relations: offset = 0 max_pages = 5 # 每個組合最多抓 5 頁 for page in range(max_pages): try: print(f"[ConceptNet] 查詢 {concept} {rel} (offset={offset})...") # 查詢 start 參數 response = client.get( f"{endpoint}/query", params={ "start": concept, "rel": rel, "limit": page_size, "offset": offset, }, ) if response.status_code != 200: print(f"[ConceptNet] HTTP {response.status_code}, 跳過") break data = response.json() edges = data.get("edges", []) if not edges: break parsed = parse_conceptnet_response(data, lang_code) all_occupations.extend(parsed) print(f"[ConceptNet] 取得 {len(parsed)} 筆") if len(edges) < page_size: break offset += page_size except Exception as e: print(f"[ConceptNet] 錯誤: {e}") break # 去重 seen = set() unique = [] for occ in all_occupations: if occ["name"] not in seen: seen.add(occ["name"]) unique.append(occ) print(f"[ConceptNet] 去重後: {len(unique)} 筆概念") return unique except Exception as e: print(f"[ConceptNet] 錯誤: {e}") raise def parse_conceptnet_response(data: dict, lang_code: str) -> List[dict]: """解析 ConceptNet API 回應""" results = [] edges = data.get("edges", []) for edge in edges: start = edge.get("start", {}) end = edge.get("end", {}) # 嘗試從兩端取得有意義的概念 for node in [start, end]: node_id = node.get("@id", "") label = node.get("label", "") # 過濾:確保是目標語言且有意義 if f"/c/{lang_code}/" in node_id and label and len(label) >= 2: # 排除過於泛用的詞 if label not in ["職業", "工作", "專業", "occupation", "job", "profession"]: results.append({ "name": label, "domain": infer_domain(label, lang_code), }) return results def infer_domain(occupation_name: str, language: str) -> str: """根據職業名稱推斷領域""" if language == "zh": domain_keywords = { "醫": "醫療健康", "護": "醫療健康", "藥": "醫療健康", "師": "專業服務", "工程": "工程技術", "技術": "工程技術", "設計": "設計創意", "藝術": "藝術文化", "音樂": "藝術文化", "運動": "體育運動", "農": "農業", "漁": "漁業", "商": "商業貿易", "銷": "商業貿易", "法": "法律", "律": "法律", "教": "教育", "研究": "學術研究", "科學": "學術研究", "廚": "餐飲服務", "烹": "餐飲服務", "建築": "建築營造", "軍": "軍事國防", "警": "公共安全", "消防": "公共安全", "記者": "媒體傳播", "編輯": "媒體傳播", "作家": "文學創作", "程式": "資訊科技", "軟體": "資訊科技", "電腦": "資訊科技", } else: domain_keywords = { "doctor": "Healthcare", "nurse": "Healthcare", "medical": "Healthcare", "engineer": "Engineering", "technical": "Engineering", "design": "Design & Creative", "artist": "Arts & Culture", "music": "Arts & Culture", "sport": "Sports", "athletic": "Sports", "farm": "Agriculture", "fish": "Fishery", "business": "Business", "sales": "Business", "law": "Legal", "attorney": "Legal", "teach": "Education", "professor": "Education", "research": "Academic Research", "scien": "Academic Research", "chef": "Culinary", "cook": "Culinary", "architect": "Architecture", "military": "Military", "police": "Public Safety", "fire": "Public Safety", "journal": "Media", "editor": "Media", "writer": "Literature", "author": "Literature", "program": "Information Technology", "software": "Information Technology", "computer": "Information Technology", "develop": "Information Technology", } name_lower = occupation_name.lower() for keyword, domain in domain_keywords.items(): if keyword in name_lower: return domain return "專業領域" if language == "zh" else "Professional Field" def save_json(data: List[dict], source: str, language: str) -> None: """儲存資料到 JSON 檔案""" filename = f"{source}_occupations_{language}.json" filepath = DATA_DIR / filename output = { "metadata": { "source": source, "language": language, "fetched_at": datetime.now(timezone.utc).isoformat(), "total_count": len(data), }, "occupations": data, } with open(filepath, "w", encoding="utf-8") as f: json.dump(output, f, ensure_ascii=False, indent=2) print(f"[儲存] {filepath} ({len(data)} 筆)") def main(): """主程式""" print("=" * 60) print("職業資料抓取腳本") print(f"輸出目錄: {DATA_DIR}") print("=" * 60) print() # 確保輸出目錄存在 DATA_DIR.mkdir(parents=True, exist_ok=True) # 抓取 Wikidata print("--- Wikidata ---") try: wikidata_zh = fetch_wikidata_occupations("zh") save_json(wikidata_zh, "wikidata", "zh") except Exception as e: print(f"Wikidata 中文抓取失敗: {e}") wikidata_zh = [] try: wikidata_en = fetch_wikidata_occupations("en") save_json(wikidata_en, "wikidata", "en") except Exception as e: print(f"Wikidata 英文抓取失敗: {e}") wikidata_en = [] print() # 抓取 ConceptNet print("--- ConceptNet ---") try: conceptnet_zh = fetch_conceptnet_occupations("zh") save_json(conceptnet_zh, "conceptnet", "zh") except Exception as e: print(f"ConceptNet 中文抓取失敗: {e}") conceptnet_zh = [] try: conceptnet_en = fetch_conceptnet_occupations("en") save_json(conceptnet_en, "conceptnet", "en") except Exception as e: print(f"ConceptNet 英文抓取失敗: {e}") conceptnet_en = [] print() print("=" * 60) print("抓取完成!") print(f" Wikidata 中文: {len(wikidata_zh)} 筆") print(f" Wikidata 英文: {len(wikidata_en)} 筆") print(f" ConceptNet 中文: {len(conceptnet_zh)} 筆") print(f" ConceptNet 英文: {len(conceptnet_en)} 筆") print("=" * 60) if __name__ == "__main__": main()