feat: Add curated expert occupations with local data sources

- Add curated occupations seed files (210 entries in zh/en) with specific domains
- Add DBpedia occupations data (2164 entries) for external source option
- Refactor expert_source_service to read from local JSON files
- Improve keyword generation prompts to leverage expert domain context
- Add architecture analysis documentation (ARCHITECTURE_ANALYSIS.md)
- Fix expert source selection bug (proper handling of empty custom_experts)
- Update frontend to support curated/dbpedia/wikidata expert sources

Key changes:
- backend/app/data/: Local occupation data files
- backend/app/services/expert_source_service.py: Simplified local file reading
- backend/app/prompts/expert_transformation_prompt.py: Better domain-aware prompts
- Removed expert_cache.py (no longer needed with local files)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2025-12-04 16:34:35 +08:00
parent 8777e27cbb
commit 5571076406
15 changed files with 9970 additions and 380 deletions

View File

@@ -0,0 +1,386 @@
#!/usr/bin/env python3
"""
Occupation data fetching script.

Fetches occupation data from the Wikidata SPARQL endpoint and the
ConceptNet API, and stores it as local JSON files for the application
to consume.

Usage:
    cd backend
    python scripts/fetch_occupations.py
"""
import json
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import List
import httpx
# Output directory for the generated JSON files (backend/app/data).
DATA_DIR = Path(__file__).parent.parent / "app" / "data"
def fetch_wikidata_occupations(language: str) -> List[dict]:
    """
    Fetch all occupations from the Wikidata SPARQL endpoint (paginated).

    Args:
        language: Language code (zh/en).

    Returns:
        Occupation list, deduplicated by name:
        [{"name": "...", "domain": "..."}, ...]

    Raises:
        httpx.HTTPError: If a SPARQL request fails.
    """
    print(f"[Wikidata] 正在抓取 {language} 職業資料(分頁模式)...")
    endpoint = "https://query.wikidata.org/sparql"
    page_size = 500  # rows per page
    all_bindings = []
    offset = 0
    try:
        with httpx.Client(timeout=120.0) as client:
            while True:
                # SPARQL query using SERVICE wikibase:label (more efficient).
                # ORDER BY is required for correct pagination: without it the
                # endpoint gives no ordering guarantee, so consecutive
                # LIMIT/OFFSET pages may overlap or skip rows.
                query = f"""
                SELECT DISTINCT ?occupation ?occupationLabel ?fieldLabel WHERE {{
                  ?occupation wdt:P31 wd:Q28640.
                  OPTIONAL {{ ?occupation wdt:P425 ?field. }}
                  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "{language},en". }}
                }}
                ORDER BY ?occupation
                LIMIT {page_size}
                OFFSET {offset}
                """
                print(f"[Wikidata] 抓取第 {offset // page_size + 1} 頁 (offset={offset})...")
                response = client.get(
                    endpoint,
                    params={"query": query, "format": "json"},
                    headers={
                        "Accept": "application/sparql-results+json",
                        "User-Agent": "NoveltySeeking/1.0",
                    },
                )
                response.raise_for_status()
                data = response.json()
                bindings = data.get("results", {}).get("bindings", [])
                print(f"[Wikidata] 取得 {len(bindings)}")
                if not bindings:
                    # No more data.
                    break
                all_bindings.extend(bindings)
                offset += page_size
                # A short page means this was the last one.
                if len(bindings) < page_size:
                    break
        print(f"[Wikidata] 總共取得 {len(all_bindings)} 筆原始資料")
        # Parse the response rows.
        occupations = []
        for item in all_bindings:
            name = item.get("occupationLabel", {}).get("value", "")
            field = item.get("fieldLabel", {}).get("value", "")
            # Skip empty / single-character names.
            if not name or len(name) < 2:
                continue
            # When no label exists in the requested languages, the label
            # service falls back to the raw entity ID (e.g. "Q12345");
            # such entries are noise and are dropped.
            if name[0] == "Q" and name[1:].isdigit():
                continue
            occupations.append({
                "name": name,
                "domain": field if field else infer_domain(name, language),
            })
        # Deduplicate by name, keeping the first occurrence.
        seen = set()
        unique = []
        for occ in occupations:
            if occ["name"] not in seen:
                seen.add(occ["name"])
                unique.append(occ)
        print(f"[Wikidata] 去重後: {len(unique)} 筆職業")
        return unique
    except Exception as e:
        print(f"[Wikidata] 錯誤: {e}")
        raise
def fetch_conceptnet_occupations(language: str) -> List[dict]:
    """
    Fetch occupation-related concepts from the ConceptNet API (paginated).

    Args:
        language: Language code (zh/en).

    Returns:
        Concept list, deduplicated by name:
        [{"name": "...", "domain": "..."}, ...]
    """
    print(f"[ConceptNet] 正在抓取 {language} 職業資料(分頁模式)...")
    endpoint = "https://api.conceptnet.io"
    lang_code = language
    page_size = 100  # limit recommended by ConceptNet
    # Seed concepts to expand from, per language.
    start_concepts = {
        "zh": ["/c/zh/職業", "/c/zh/專業", "/c/zh/工作", "/c/zh/職務"],
        "en": ["/c/en/occupation", "/c/en/profession", "/c/en/job", "/c/en/career"],
    }
    # Relation types to query.
    relations = ["/r/IsA", "/r/RelatedTo", "/r/HasA", "/r/AtLocation"]
    collected = []
    try:
        with httpx.Client(timeout=60.0) as client:
            seeds = start_concepts.get(lang_code, start_concepts["zh"])
            for seed in seeds:
                for relation in relations:
                    cursor = 0
                    pages_left = 5  # cap: at most 5 pages per (seed, relation)
                    while pages_left > 0:
                        pages_left -= 1
                        try:
                            print(f"[ConceptNet] 查詢 {seed} {relation} (offset={cursor})...")
                            # Query via the "start" parameter.
                            resp = client.get(
                                f"{endpoint}/query",
                                params={
                                    "start": seed,
                                    "rel": relation,
                                    "limit": page_size,
                                    "offset": cursor,
                                },
                            )
                            if resp.status_code != 200:
                                print(f"[ConceptNet] HTTP {resp.status_code}, 跳過")
                                break
                            payload = resp.json()
                            edge_list = payload.get("edges", [])
                            if not edge_list:
                                break
                            found = parse_conceptnet_response(payload, lang_code)
                            collected.extend(found)
                            print(f"[ConceptNet] 取得 {len(found)}")
                            # A short page means there is nothing further.
                            if len(edge_list) < page_size:
                                break
                            cursor += page_size
                        except Exception as e:
                            print(f"[ConceptNet] 錯誤: {e}")
                            break
        # Deduplicate by name, keeping the first occurrence.
        seen_names = set()
        deduped = []
        for entry in collected:
            if entry["name"] in seen_names:
                continue
            seen_names.add(entry["name"])
            deduped.append(entry)
        print(f"[ConceptNet] 去重後: {len(deduped)} 筆概念")
        return deduped
    except Exception as e:
        print(f"[ConceptNet] 錯誤: {e}")
        raise
def parse_conceptnet_response(data: dict, lang_code: str) -> List[dict]:
    """Parse a ConceptNet API response into occupation entries."""
    # Overly generic terms that should not count as occupations.
    generic_terms = {"職業", "工作", "專業", "occupation", "job", "profession"}
    lang_prefix = f"/c/{lang_code}/"
    parsed = []
    for edge in data.get("edges", []):
        # Either endpoint of an edge may hold a meaningful concept.
        for node in (edge.get("start", {}), edge.get("end", {})):
            node_id = node.get("@id", "")
            label = node.get("label", "")
            # Keep only labels in the target language that are long
            # enough to be meaningful.
            if lang_prefix not in node_id:
                continue
            if not label or len(label) < 2:
                continue
            if label in generic_terms:
                continue
            parsed.append({
                "name": label,
                "domain": infer_domain(label, lang_code),
            })
    return parsed
def infer_domain(occupation_name: str, language: str) -> str:
    """
    Infer a domain label for an occupation from keywords in its name.

    Args:
        occupation_name: Occupation name to classify.
        language: Language code; "zh" uses the Chinese keyword table,
            anything else uses the English table.

    Returns:
        The first matching domain label, or a generic "professional
        field" label when no keyword matches.
    """
    # NOTE(review): the zh table originally contained a number of
    # single-character keywords (healthcare, legal, farming, etc.) that
    # were corrupted into empty strings. An empty key matches every name
    # (`"" in s` is always True) and, being first in insertion order,
    # made EVERY zh occupation map to one bogus domain before any real
    # keyword was checked. The empty keys have been removed; restore the
    # lost single-character keywords from version control if available.
    if language == "zh":
        domain_keywords = {
            "工程": "工程技術",
            "技術": "工程技術",
            "設計": "設計創意",
            "藝術": "藝術文化",
            "音樂": "藝術文化",
            "運動": "體育運動",
            "研究": "學術研究",
            "科學": "學術研究",
            "建築": "建築營造",
            "消防": "公共安全",
            "記者": "媒體傳播",
            "編輯": "媒體傳播",
            "作家": "文學創作",
            "程式": "資訊科技",
            "軟體": "資訊科技",
            "電腦": "資訊科技",
        }
    else:
        domain_keywords = {
            "doctor": "Healthcare",
            "nurse": "Healthcare",
            "medical": "Healthcare",
            "engineer": "Engineering",
            "technical": "Engineering",
            "design": "Design & Creative",
            "artist": "Arts & Culture",
            "music": "Arts & Culture",
            "sport": "Sports",
            "athletic": "Sports",
            "farm": "Agriculture",
            "fish": "Fishery",
            "business": "Business",
            "sales": "Business",
            "law": "Legal",
            "attorney": "Legal",
            "teach": "Education",
            "professor": "Education",
            "research": "Academic Research",
            "scien": "Academic Research",
            "chef": "Culinary",
            "cook": "Culinary",
            "architect": "Architecture",
            "military": "Military",
            "police": "Public Safety",
            "fire": "Public Safety",
            "journal": "Media",
            "editor": "Media",
            "writer": "Literature",
            "author": "Literature",
            "program": "Information Technology",
            "software": "Information Technology",
            "computer": "Information Technology",
            "develop": "Information Technology",
        }
    name_lower = occupation_name.lower()
    for keyword, domain in domain_keywords.items():
        # Guard against empty keywords, which would match every name.
        if keyword and keyword in name_lower:
            return domain
    return "專業領域" if language == "zh" else "Professional Field"
def save_json(data: List[dict], source: str, language: str, data_dir=None) -> None:
    """
    Save occupation data to a JSON file with a metadata envelope.

    Args:
        data: Occupation entries to store.
        source: Data source name (used in the filename and metadata).
        language: Language code (used in the filename and metadata).
        data_dir: Optional output directory (str or Path); defaults to
            the module-level DATA_DIR. Parameterized so callers/tests can
            redirect output without touching the global constant.
    """
    target_dir = DATA_DIR if data_dir is None else Path(data_dir)
    filename = f"{source}_occupations_{language}.json"
    filepath = target_dir / filename
    output = {
        "metadata": {
            "source": source,
            "language": language,
            # Timezone-aware UTC timestamp so runs are comparable
            # across machines.
            "fetched_at": datetime.now(timezone.utc).isoformat(),
            "total_count": len(data),
        },
        "occupations": data,
    }
    # ensure_ascii=False keeps CJK text human-readable in the file.
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    print(f"[儲存] {filepath} ({len(data)} 筆)")
def main():
    """Entry point: fetch occupation data from all sources and save JSON files."""
    line = "=" * 60
    print(line)
    print("職業資料抓取腳本")
    print(f"輸出目錄: {DATA_DIR}")
    print(line)
    print()
    # Create the output directory if it does not exist yet.
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    counts = {}
    # (section title, source key, fetcher, per-language failure prefix)
    sources = (
        ("Wikidata", "wikidata", fetch_wikidata_occupations,
         {"zh": "Wikidata 中文抓取失敗: ", "en": "Wikidata 英文抓取失敗: "}),
        ("ConceptNet", "conceptnet", fetch_conceptnet_occupations,
         {"zh": "ConceptNet 中文抓取失敗: ", "en": "ConceptNet 英文抓取失敗: "}),
    )
    for title, key, fetcher, fail_prefixes in sources:
        print(f"--- {title} ---")
        for lang in ("zh", "en"):
            # A failure in one fetch must not abort the rest of the run.
            try:
                fetched = fetcher(lang)
                save_json(fetched, key, lang)
            except Exception as e:
                print(f"{fail_prefixes[lang]}{e}")
                fetched = []
            counts[(key, lang)] = len(fetched)
        print()
    print(line)
    print("抓取完成!")
    print(f" Wikidata 中文: {counts[('wikidata', 'zh')]}")
    print(f" Wikidata 英文: {counts[('wikidata', 'en')]}")
    print(f" ConceptNet 中文: {counts[('conceptnet', 'zh')]}")
    print(f" ConceptNet 英文: {counts[('conceptnet', 'en')]}")
    print(line)
if __name__ == "__main__":
    main()