feat: Add curated expert occupations with local data sources
feat: Add curated expert occupations with local data sources

- Add curated occupations seed files (210 entries in zh/en) with specific domains
- Add DBpedia occupations data (2164 entries) for external source option
- Refactor expert_source_service to read from local JSON files
- Improve keyword generation prompts to leverage expert domain context
- Add architecture analysis documentation (ARCHITECTURE_ANALYSIS.md)
- Fix expert source selection bug (proper handling of empty custom_experts)
- Update frontend to support curated/dbpedia/wikidata expert sources

Key changes:
- backend/app/data/: Local occupation data files
- backend/app/services/expert_source_service.py: Simplified local file reading
- backend/app/prompts/expert_transformation_prompt.py: Better domain-aware prompts
- Removed expert_cache.py (no longer needed with local files)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
386
backend/scripts/fetch_occupations.py
Normal file
386
backend/scripts/fetch_occupations.py
Normal file
@@ -0,0 +1,386 @@
|
||||
#!/usr/bin/env python3
"""
Occupation data fetching script.

Fetches occupation data from the Wikidata SPARQL endpoint and the
ConceptNet API, and stores it as local JSON files for use by the
application.

Usage:
    cd backend
    python scripts/fetch_occupations.py
"""

import json
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import List

import httpx

# Output directory for the generated JSON data files
# (resolves to backend/app/data relative to this script).
DATA_DIR = Path(__file__).parent.parent / "app" / "data"
||||
def fetch_wikidata_occupations(language: str) -> List[dict]:
    """Fetch all occupations from the Wikidata SPARQL endpoint (paginated).

    Args:
        language: Language code (zh/en).

    Returns:
        List of occupations: [{"name": "...", "domain": "..."}, ...]

    Raises:
        Exception: Any HTTP or parsing error is logged and re-raised.
    """
    print(f"[Wikidata] 正在抓取 {language} 職業資料(分頁模式)...")

    endpoint = "https://query.wikidata.org/sparql"
    page_size = 500  # rows per page
    all_bindings = []
    offset = 0

    try:
        with httpx.Client(timeout=120.0) as client:
            while True:
                # SPARQL query - uses SERVICE wikibase:label (more efficient).
                # ORDER BY establishes a stable total order: without it, SPARQL
                # result order is undefined, so LIMIT/OFFSET pages could skip
                # or duplicate rows between requests.
                query = f"""
                SELECT DISTINCT ?occupation ?occupationLabel ?fieldLabel WHERE {{
                  ?occupation wdt:P31 wd:Q28640.
                  OPTIONAL {{ ?occupation wdt:P425 ?field. }}
                  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "{language},en". }}
                }}
                ORDER BY ?occupation
                LIMIT {page_size}
                OFFSET {offset}
                """

                print(f"[Wikidata] 抓取第 {offset // page_size + 1} 頁 (offset={offset})...")

                response = client.get(
                    endpoint,
                    params={"query": query, "format": "json"},
                    headers={
                        "Accept": "application/sparql-results+json",
                        "User-Agent": "NoveltySeeking/1.0",
                    },
                )
                response.raise_for_status()
                data = response.json()

                bindings = data.get("results", {}).get("bindings", [])
                print(f"[Wikidata] 取得 {len(bindings)} 筆")

                if not bindings:
                    # No more data.
                    break

                all_bindings.extend(bindings)
                offset += page_size

                # A short page means this was the last one.
                if len(bindings) < page_size:
                    break

        print(f"[Wikidata] 總共取得 {len(all_bindings)} 筆原始資料")

        # Parse the response bindings into name/domain entries.
        occupations = []
        for item in all_bindings:
            name = item.get("occupationLabel", {}).get("value", "")
            field = item.get("fieldLabel", {}).get("value", "")

            # Skip empty or single-character labels.
            if name and len(name) >= 2:
                occupations.append({
                    "name": name,
                    # Prefer the explicit P425 "field of this occupation";
                    # otherwise infer a domain from the name.
                    "domain": field if field else infer_domain(name, language),
                })

        # Deduplicate by name, keeping the first occurrence.
        seen = set()
        unique = []
        for occ in occupations:
            if occ["name"] not in seen:
                seen.add(occ["name"])
                unique.append(occ)

        print(f"[Wikidata] 去重後: {len(unique)} 筆職業")
        return unique

    except Exception as e:
        print(f"[Wikidata] 錯誤: {e}")
        raise
||||
def fetch_conceptnet_occupations(language: str) -> List[dict]:
    """Fetch occupation-related concepts from the ConceptNet API (paginated).

    Args:
        language: Language code (zh/en).

    Returns:
        List of occupations: [{"name": "...", "domain": "..."}, ...]
    """
    print(f"[ConceptNet] 正在抓取 {language} 職業資料(分頁模式)...")

    endpoint = "https://api.conceptnet.io"
    lang_code = language
    page_size = 100  # limit size recommended by ConceptNet

    # Seed concepts to expand from, per language.
    start_concepts = {
        "zh": ["/c/zh/職業", "/c/zh/專業", "/c/zh/工作", "/c/zh/職務"],
        "en": ["/c/en/occupation", "/c/en/profession", "/c/en/job", "/c/en/career"],
    }

    # Relation types to follow from each seed concept.
    relations = ["/r/IsA", "/r/RelatedTo", "/r/HasA", "/r/AtLocation"]

    collected: List[dict] = []

    try:
        with httpx.Client(timeout=60.0) as client:
            seeds = start_concepts.get(lang_code, start_concepts["zh"])
            for concept in seeds:
                for rel in relations:
                    offset = 0
                    max_pages = 5  # cap pages per (concept, relation) pair

                    for _ in range(max_pages):
                        try:
                            print(f"[ConceptNet] 查詢 {concept} {rel} (offset={offset})...")

                            # Query edges starting from the seed concept.
                            response = client.get(
                                f"{endpoint}/query",
                                params={
                                    "start": concept,
                                    "rel": rel,
                                    "limit": page_size,
                                    "offset": offset,
                                },
                            )

                            if response.status_code != 200:
                                print(f"[ConceptNet] HTTP {response.status_code}, 跳過")
                                break

                            data = response.json()
                            edges = data.get("edges", [])
                            if not edges:
                                break

                            batch = parse_conceptnet_response(data, lang_code)
                            collected.extend(batch)
                            print(f"[ConceptNet] 取得 {len(batch)} 筆")

                            # A short page means there is nothing further.
                            if len(edges) < page_size:
                                break

                            offset += page_size

                        except Exception as e:
                            # Best-effort: log and move on to the next pair.
                            print(f"[ConceptNet] 錯誤: {e}")
                            break

        # Deduplicate by name, keeping the first occurrence
        # (dict preserves insertion order).
        by_name: dict = {}
        for occ in collected:
            by_name.setdefault(occ["name"], occ)
        unique = list(by_name.values())

        print(f"[ConceptNet] 去重後: {len(unique)} 筆概念")
        return unique

    except Exception as e:
        print(f"[ConceptNet] 錯誤: {e}")
        raise
def parse_conceptnet_response(data: dict, lang_code: str) -> List[dict]:
    """Parse a ConceptNet API response into occupation entries."""
    lang_prefix = f"/c/{lang_code}/"
    # Overly generic terms to exclude from the results.
    generic_terms = ("職業", "工作", "專業", "occupation", "job", "profession")
    results: List[dict] = []

    for edge in data.get("edges", []):
        # A meaningful concept may sit at either end of the edge.
        for node in (edge.get("start", {}), edge.get("end", {})):
            node_id = node.get("@id", "")
            label = node.get("label", "")

            # Keep only target-language nodes with a usable label.
            if lang_prefix not in node_id or not label or len(label) < 2:
                continue
            if label in generic_terms:
                continue

            results.append({
                "name": label,
                "domain": infer_domain(label, lang_code),
            })

    return results
||||
def infer_domain(occupation_name: str, language: str) -> str:
    """Infer a domain/field for an occupation from keywords in its name.

    Keywords are matched as substrings against the lowercased name; the
    first match wins.

    Args:
        occupation_name: Occupation name to classify.
        language: Language code (zh/en) selecting the keyword table.

    Returns:
        The matched domain label, or a generic fallback
        ("專業領域" / "Professional Field") when nothing matches.
    """
    if language == "zh":
        domain_keywords = {
            "醫": "醫療健康",
            "護": "醫療健康",
            "藥": "醫療健康",
            "師": "專業服務",
            "工程": "工程技術",
            "技術": "工程技術",
            "設計": "設計創意",
            "藝術": "藝術文化",
            "音樂": "藝術文化",
            "運動": "體育運動",
            "農": "農業",
            "漁": "漁業",
            "商": "商業貿易",
            "銷": "商業貿易",
            "法": "法律",
            "律": "法律",
            "教": "教育",
            "研究": "學術研究",
            "科學": "學術研究",
            "廚": "餐飲服務",
            "烹": "餐飲服務",
            "建築": "建築營造",
            "軍": "軍事國防",
            "警": "公共安全",
            "消防": "公共安全",
            "記者": "媒體傳播",
            "編輯": "媒體傳播",
            "作家": "文學創作",
            "程式": "資訊科技",
            "軟體": "資訊科技",
            "電腦": "資訊科技",
        }
        # The generic occupational suffix "師" (≈ "-ist/-er") must be tried
        # LAST, otherwise it shadows every specific keyword: e.g. "工程師"
        # (engineer) and "律師" (lawyer) would both fall into the catch-all
        # "專業服務" instead of 工程技術 / 法律. sorted() is stable, so all
        # other keywords keep their original priority order.
        ordered = sorted(domain_keywords.items(), key=lambda kv: kv[0] == "師")
    else:
        domain_keywords = {
            "doctor": "Healthcare",
            "nurse": "Healthcare",
            "medical": "Healthcare",
            "engineer": "Engineering",
            "technical": "Engineering",
            "design": "Design & Creative",
            "artist": "Arts & Culture",
            "music": "Arts & Culture",
            "sport": "Sports",
            "athletic": "Sports",
            "farm": "Agriculture",
            "fish": "Fishery",
            "business": "Business",
            "sales": "Business",
            "law": "Legal",
            "attorney": "Legal",
            "teach": "Education",
            "professor": "Education",
            "research": "Academic Research",
            "scien": "Academic Research",
            "chef": "Culinary",
            "cook": "Culinary",
            "architect": "Architecture",
            "military": "Military",
            "police": "Public Safety",
            "fire": "Public Safety",
            "journal": "Media",
            "editor": "Media",
            "writer": "Literature",
            "author": "Literature",
            "program": "Information Technology",
            "software": "Information Technology",
            "computer": "Information Technology",
            "develop": "Information Technology",
        }
        ordered = list(domain_keywords.items())

    name_lower = occupation_name.lower()
    for keyword, domain in ordered:
        if keyword in name_lower:
            return domain

    return "專業領域" if language == "zh" else "Professional Field"
||||
def save_json(data: List[dict], source: str, language: str) -> None:
    """Write occupation data (plus fetch metadata) to a JSON file in DATA_DIR."""
    filepath = DATA_DIR / f"{source}_occupations_{language}.json"

    payload = {
        "metadata": {
            "source": source,
            "language": language,
            # Timezone-aware UTC timestamp of this fetch.
            "fetched_at": datetime.now(timezone.utc).isoformat(),
            "total_count": len(data),
        },
        "occupations": data,
    }

    filepath.write_text(
        json.dumps(payload, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )

    print(f"[儲存] {filepath} ({len(data)} 筆)")
||||
def main():
    """Entry point: fetch occupations from every source/language and save JSON."""
    banner = "=" * 60
    print(banner)
    print("職業資料抓取腳本")
    print(f"輸出目錄: {DATA_DIR}")
    print(banner)
    print()

    # Make sure the output directory exists before writing any files.
    DATA_DIR.mkdir(parents=True, exist_ok=True)

    lang_labels = {"zh": "中文", "en": "英文"}
    # (display name, file prefix, fetch function) for each data source.
    sources = [
        ("Wikidata", "wikidata", fetch_wikidata_occupations),
        ("ConceptNet", "conceptnet", fetch_conceptnet_occupations),
    ]

    counts = {}
    for display, prefix, fetch in sources:
        print(f"--- {display} ---")
        for lang in ("zh", "en"):
            # Failures in one source/language must not stop the others.
            try:
                rows = fetch(lang)
                save_json(rows, prefix, lang)
            except Exception as e:
                print(f"{display} {lang_labels[lang]}抓取失敗: {e}")
                rows = []
            counts[(display, lang)] = len(rows)
        print()

    print(banner)
    print("抓取完成!")
    for display, _prefix, _fetch in sources:
        for lang in ("zh", "en"):
            print(f" {display} {lang_labels[lang]}: {counts[(display, lang)]} 筆")
    print(banner)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user