feat: Add curated expert occupations with local data sources
feat: Add curated expert occupations with local data sources

- Add curated occupations seed files (210 entries in zh/en) with specific domains
- Add DBpedia occupations data (2164 entries) for external source option
- Refactor expert_source_service to read from local JSON files
- Improve keyword generation prompts to leverage expert domain context
- Add architecture analysis documentation (ARCHITECTURE_ANALYSIS.md)
- Fix expert source selection bug (proper handling of empty custom_experts)
- Update frontend to support curated/dbpedia/wikidata expert sources

Key changes:
- backend/app/data/: Local occupation data files
- backend/app/services/expert_source_service.py: Simplified local file reading
- backend/app/prompts/expert_transformation_prompt.py: Better domain-aware prompts
- Removed expert_cache.py (no longer needed with local files)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
386
backend/scripts/fetch_occupations.py
Normal file
386
backend/scripts/fetch_occupations.py
Normal file
@@ -0,0 +1,386 @@
|
||||
#!/usr/bin/env python3
"""
Occupation data fetching script.

Fetches occupation data from the Wikidata SPARQL endpoint and the
ConceptNet API, and stores it as local JSON files for use by the
application.

Usage:
    cd backend
    python scripts/fetch_occupations.py
"""

import json
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import List

import httpx

# Output directory for the generated JSON data files
# (resolves to backend/app/data relative to this script).
DATA_DIR = Path(__file__).parent.parent / "app" / "data"
||||
def fetch_wikidata_occupations(language: str) -> List[dict]:
    """Fetch all occupations from the Wikidata SPARQL endpoint (paginated).

    Args:
        language: Language code (zh/en).

    Returns:
        List of occupations: [{"name": "...", "domain": "..."}, ...]

    Raises:
        Exception: Any HTTP or parsing error is logged and re-raised.
    """
    print(f"[Wikidata] 正在抓取 {language} 職業資料(分頁模式)...")

    endpoint = "https://query.wikidata.org/sparql"
    page_size = 500  # rows per page
    all_bindings = []
    offset = 0

    try:
        with httpx.Client(timeout=120.0) as client:
            while True:
                # SPARQL query - uses SERVICE wikibase:label (more efficient).
                # ORDER BY establishes a stable total order: without it, SPARQL
                # result order is undefined, so LIMIT/OFFSET pages could skip
                # or duplicate rows between requests.
                query = f"""
                SELECT DISTINCT ?occupation ?occupationLabel ?fieldLabel WHERE {{
                  ?occupation wdt:P31 wd:Q28640.
                  OPTIONAL {{ ?occupation wdt:P425 ?field. }}
                  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "{language},en". }}
                }}
                ORDER BY ?occupation
                LIMIT {page_size}
                OFFSET {offset}
                """

                print(f"[Wikidata] 抓取第 {offset // page_size + 1} 頁 (offset={offset})...")

                response = client.get(
                    endpoint,
                    params={"query": query, "format": "json"},
                    headers={
                        "Accept": "application/sparql-results+json",
                        "User-Agent": "NoveltySeeking/1.0",
                    },
                )
                response.raise_for_status()
                data = response.json()

                bindings = data.get("results", {}).get("bindings", [])
                print(f"[Wikidata] 取得 {len(bindings)} 筆")

                if not bindings:
                    # No more data.
                    break

                all_bindings.extend(bindings)
                offset += page_size

                # A short page means this was the last one.
                if len(bindings) < page_size:
                    break

        print(f"[Wikidata] 總共取得 {len(all_bindings)} 筆原始資料")

        # Parse the response bindings into name/domain entries.
        occupations = []
        for item in all_bindings:
            name = item.get("occupationLabel", {}).get("value", "")
            field = item.get("fieldLabel", {}).get("value", "")

            # Skip empty or single-character labels.
            if name and len(name) >= 2:
                occupations.append({
                    "name": name,
                    # Prefer the explicit P425 "field of this occupation";
                    # otherwise infer a domain from the name.
                    "domain": field if field else infer_domain(name, language),
                })

        # Deduplicate by name, keeping the first occurrence.
        seen = set()
        unique = []
        for occ in occupations:
            if occ["name"] not in seen:
                seen.add(occ["name"])
                unique.append(occ)

        print(f"[Wikidata] 去重後: {len(unique)} 筆職業")
        return unique

    except Exception as e:
        print(f"[Wikidata] 錯誤: {e}")
        raise
||||
def fetch_conceptnet_occupations(language: str) -> List[dict]:
    """Fetch occupation-related concepts from the ConceptNet API (paginated).

    Args:
        language: Language code (zh/en).

    Returns:
        List of occupations: [{"name": "...", "domain": "..."}, ...]
    """
    print(f"[ConceptNet] 正在抓取 {language} 職業資料(分頁模式)...")

    endpoint = "https://api.conceptnet.io"
    lang_code = language
    page_size = 100  # limit size recommended by ConceptNet

    # Seed concepts to expand from, per language.
    start_concepts = {
        "zh": ["/c/zh/職業", "/c/zh/專業", "/c/zh/工作", "/c/zh/職務"],
        "en": ["/c/en/occupation", "/c/en/profession", "/c/en/job", "/c/en/career"],
    }

    # Relation types to follow from each seed concept.
    relations = ["/r/IsA", "/r/RelatedTo", "/r/HasA", "/r/AtLocation"]

    collected: List[dict] = []

    try:
        with httpx.Client(timeout=60.0) as client:
            seeds = start_concepts.get(lang_code, start_concepts["zh"])
            for concept in seeds:
                for rel in relations:
                    offset = 0
                    max_pages = 5  # cap pages per (concept, relation) pair

                    for _ in range(max_pages):
                        try:
                            print(f"[ConceptNet] 查詢 {concept} {rel} (offset={offset})...")

                            # Query edges starting from the seed concept.
                            response = client.get(
                                f"{endpoint}/query",
                                params={
                                    "start": concept,
                                    "rel": rel,
                                    "limit": page_size,
                                    "offset": offset,
                                },
                            )

                            if response.status_code != 200:
                                print(f"[ConceptNet] HTTP {response.status_code}, 跳過")
                                break

                            data = response.json()
                            edges = data.get("edges", [])
                            if not edges:
                                break

                            batch = parse_conceptnet_response(data, lang_code)
                            collected.extend(batch)
                            print(f"[ConceptNet] 取得 {len(batch)} 筆")

                            # A short page means there is nothing further.
                            if len(edges) < page_size:
                                break

                            offset += page_size

                        except Exception as e:
                            # Best-effort: log and move on to the next pair.
                            print(f"[ConceptNet] 錯誤: {e}")
                            break

        # Deduplicate by name, keeping the first occurrence
        # (dict preserves insertion order).
        by_name: dict = {}
        for occ in collected:
            by_name.setdefault(occ["name"], occ)
        unique = list(by_name.values())

        print(f"[ConceptNet] 去重後: {len(unique)} 筆概念")
        return unique

    except Exception as e:
        print(f"[ConceptNet] 錯誤: {e}")
        raise
def parse_conceptnet_response(data: dict, lang_code: str) -> List[dict]:
    """Parse a ConceptNet API response into occupation entries."""
    lang_prefix = f"/c/{lang_code}/"
    # Overly generic terms to exclude from the results.
    generic_terms = ("職業", "工作", "專業", "occupation", "job", "profession")
    results: List[dict] = []

    for edge in data.get("edges", []):
        # A meaningful concept may sit at either end of the edge.
        for node in (edge.get("start", {}), edge.get("end", {})):
            node_id = node.get("@id", "")
            label = node.get("label", "")

            # Keep only target-language nodes with a usable label.
            if lang_prefix not in node_id or not label or len(label) < 2:
                continue
            if label in generic_terms:
                continue

            results.append({
                "name": label,
                "domain": infer_domain(label, lang_code),
            })

    return results
||||
def infer_domain(occupation_name: str, language: str) -> str:
    """Infer a domain/field for an occupation from keywords in its name.

    Keywords are matched as substrings against the lowercased name; the
    first match wins.

    Args:
        occupation_name: Occupation name to classify.
        language: Language code (zh/en) selecting the keyword table.

    Returns:
        The matched domain label, or a generic fallback
        ("專業領域" / "Professional Field") when nothing matches.
    """
    if language == "zh":
        domain_keywords = {
            "醫": "醫療健康",
            "護": "醫療健康",
            "藥": "醫療健康",
            "師": "專業服務",
            "工程": "工程技術",
            "技術": "工程技術",
            "設計": "設計創意",
            "藝術": "藝術文化",
            "音樂": "藝術文化",
            "運動": "體育運動",
            "農": "農業",
            "漁": "漁業",
            "商": "商業貿易",
            "銷": "商業貿易",
            "法": "法律",
            "律": "法律",
            "教": "教育",
            "研究": "學術研究",
            "科學": "學術研究",
            "廚": "餐飲服務",
            "烹": "餐飲服務",
            "建築": "建築營造",
            "軍": "軍事國防",
            "警": "公共安全",
            "消防": "公共安全",
            "記者": "媒體傳播",
            "編輯": "媒體傳播",
            "作家": "文學創作",
            "程式": "資訊科技",
            "軟體": "資訊科技",
            "電腦": "資訊科技",
        }
        # The generic occupational suffix "師" (≈ "-ist/-er") must be tried
        # LAST, otherwise it shadows every specific keyword: e.g. "工程師"
        # (engineer) and "律師" (lawyer) would both fall into the catch-all
        # "專業服務" instead of 工程技術 / 法律. sorted() is stable, so all
        # other keywords keep their original priority order.
        ordered = sorted(domain_keywords.items(), key=lambda kv: kv[0] == "師")
    else:
        domain_keywords = {
            "doctor": "Healthcare",
            "nurse": "Healthcare",
            "medical": "Healthcare",
            "engineer": "Engineering",
            "technical": "Engineering",
            "design": "Design & Creative",
            "artist": "Arts & Culture",
            "music": "Arts & Culture",
            "sport": "Sports",
            "athletic": "Sports",
            "farm": "Agriculture",
            "fish": "Fishery",
            "business": "Business",
            "sales": "Business",
            "law": "Legal",
            "attorney": "Legal",
            "teach": "Education",
            "professor": "Education",
            "research": "Academic Research",
            "scien": "Academic Research",
            "chef": "Culinary",
            "cook": "Culinary",
            "architect": "Architecture",
            "military": "Military",
            "police": "Public Safety",
            "fire": "Public Safety",
            "journal": "Media",
            "editor": "Media",
            "writer": "Literature",
            "author": "Literature",
            "program": "Information Technology",
            "software": "Information Technology",
            "computer": "Information Technology",
            "develop": "Information Technology",
        }
        ordered = list(domain_keywords.items())

    name_lower = occupation_name.lower()
    for keyword, domain in ordered:
        if keyword in name_lower:
            return domain

    return "專業領域" if language == "zh" else "Professional Field"
||||
def save_json(data: List[dict], source: str, language: str) -> None:
    """Write occupation data (plus fetch metadata) to a JSON file in DATA_DIR."""
    filepath = DATA_DIR / f"{source}_occupations_{language}.json"

    payload = {
        "metadata": {
            "source": source,
            "language": language,
            # Timezone-aware UTC timestamp of this fetch.
            "fetched_at": datetime.now(timezone.utc).isoformat(),
            "total_count": len(data),
        },
        "occupations": data,
    }

    filepath.write_text(
        json.dumps(payload, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )

    print(f"[儲存] {filepath} ({len(data)} 筆)")
||||
def main():
    """Entry point: fetch occupations from every source/language and save JSON."""
    banner = "=" * 60
    print(banner)
    print("職業資料抓取腳本")
    print(f"輸出目錄: {DATA_DIR}")
    print(banner)
    print()

    # Make sure the output directory exists before writing any files.
    DATA_DIR.mkdir(parents=True, exist_ok=True)

    lang_labels = {"zh": "中文", "en": "英文"}
    # (display name, file prefix, fetch function) for each data source.
    sources = [
        ("Wikidata", "wikidata", fetch_wikidata_occupations),
        ("ConceptNet", "conceptnet", fetch_conceptnet_occupations),
    ]

    counts = {}
    for display, prefix, fetch in sources:
        print(f"--- {display} ---")
        for lang in ("zh", "en"):
            # Failures in one source/language must not stop the others.
            try:
                rows = fetch(lang)
                save_json(rows, prefix, lang)
            except Exception as e:
                print(f"{display} {lang_labels[lang]}抓取失敗: {e}")
                rows = []
            counts[(display, lang)] = len(rows)
        print()

    print(banner)
    print("抓取完成!")
    for display, _prefix, _fetch in sources:
        for lang in ("zh", "en"):
            print(f" {display} {lang_labels[lang]}: {counts[(display, lang)]} 筆")
    print(banner)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user