novelty-seeking/backend/scripts/fetch_occupations.py
gbanyan 5571076406 feat: Add curated expert occupations with local data sources
- Add curated occupations seed files (210 entries in zh/en) with specific domains
- Add DBpedia occupations data (2164 entries) for external source option
- Refactor expert_source_service to read from local JSON files
- Improve keyword generation prompts to leverage expert domain context
- Add architecture analysis documentation (ARCHITECTURE_ANALYSIS.md)
- Fix expert source selection bug (proper handling of empty custom_experts)
- Update frontend to support curated/dbpedia/wikidata expert sources

Key changes:
- backend/app/data/: Local occupation data files
- backend/app/services/expert_source_service.py: Simplified local file reading
- backend/app/prompts/expert_transformation_prompt.py: Better domain-aware prompts
- Removed expert_cache.py (no longer needed with local files)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-04 16:34:35 +08:00

387 lines
12 KiB
Python

#!/usr/bin/env python3
"""
職業資料抓取腳本
從 Wikidata SPARQL 和 ConceptNet API 抓取職業資料,
儲存為本地 JSON 檔案供應用程式使用。
使用方式:
cd backend
python scripts/fetch_occupations.py
"""
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import List

import httpx

# Output directory
DATA_DIR = Path(__file__).parent.parent / "app" / "data"
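# (Path(__file__).parent.parent is backend/, so this resolves to backend/app/data
# regardless of the current working directory.)

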
def fetch_wikidata_occupations(language: str) -> List[dict]:
    """
    Fetch all occupations from the Wikidata SPARQL endpoint (paginated).

    Args:
        language: Language code (zh/en)

    Returns:
        List of occupations: [{"name": "...", "domain": "..."}, ...]
    """
    print(f"[Wikidata] Fetching {language} occupation data (paginated)...")
    endpoint = "https://query.wikidata.org/sparql"
    page_size = 500  # Rows per page
    all_bindings = []
    offset = 0
    try:
        with httpx.Client(timeout=120.0) as client:
            while True:
                # SPARQL query - uses SERVICE wikibase:label (more efficient)
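                # wd:Q28640 is the Wikidata item "profession" and wdt:P31 selects
                # its instances; wdt:P425 ("field of this occupation") supplies a
                # domain label when present, with infer_domain() as the fallback.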
                query = f"""
                SELECT DISTINCT ?occupation ?occupationLabel ?fieldLabel WHERE {{
                  ?occupation wdt:P31 wd:Q28640.
                  OPTIONAL {{ ?occupation wdt:P425 ?field. }}
                  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "{language},en". }}
                }}
                LIMIT {page_size}
                OFFSET {offset}
                """
                print(f"[Wikidata] Fetching page {offset // page_size + 1} (offset={offset})...")
                response = client.get(
                    endpoint,
                    params={"query": query, "format": "json"},
                    headers={
                        "Accept": "application/sparql-results+json",
                        "User-Agent": "NoveltySeeking/1.0",
                    },
                )
                response.raise_for_status()
                data = response.json()
                bindings = data.get("results", {}).get("bindings", [])
                print(f"[Wikidata] Retrieved {len(bindings)} rows")
                if not bindings:
                    # No more data
                    break
                all_bindings.extend(bindings)
                offset += page_size
                # Fewer rows than page_size means this was the last page
                if len(bindings) < page_size:
                    break
        print(f"[Wikidata] Retrieved {len(all_bindings)} raw rows in total")
        # Parse the response
        occupations = []
        for item in all_bindings:
            name = item.get("occupationLabel", {}).get("value", "")
            field = item.get("fieldLabel", {}).get("value", "")
            if name and len(name) >= 2:
                occupations.append({
                    "name": name,
                    "domain": field if field else infer_domain(name, language),
                })
        # Deduplicate by name
        seen = set()
        unique = []
        for occ in occupations:
            if occ["name"] not in seen:
                seen.add(occ["name"])
                unique.append(occ)
        print(f"[Wikidata] After deduplication: {len(unique)} occupations")
        return unique
    except Exception as e:
        print(f"[Wikidata] Error: {e}")
        raise
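

# For reference, each element of "bindings" above is one SPARQL JSON result row.
# A rough sketch (illustrative values, not actual response data):
#   {
#     "occupation": {"type": "uri", "value": "http://www.wikidata.org/entity/Q..."},
#     "occupationLabel": {"type": "literal", "value": "architect"},
#     "fieldLabel": {"type": "literal", "value": "architecture"}
#   }

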
def fetch_conceptnet_occupations(language: str) -> List[dict]:
    """
    Fetch occupation-related concepts from the ConceptNet API (paginated).

    Args:
        language: Language code (zh/en)

    Returns:
        List of occupations: [{"name": "...", "domain": "..."}, ...]
    """
    print(f"[ConceptNet] Fetching {language} occupation data (paginated)...")
    endpoint = "https://api.conceptnet.io"
    lang_code = language
    page_size = 100  # Limit recommended by ConceptNet

    # Seed concepts to expand from
    start_concepts = {
        "zh": ["/c/zh/職業", "/c/zh/專業", "/c/zh/工作", "/c/zh/職務"],
        "en": ["/c/en/occupation", "/c/en/profession", "/c/en/job", "/c/en/career"],
    }
    # Relation types to query
    relations = ["/r/IsA", "/r/RelatedTo", "/r/HasA", "/r/AtLocation"]
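    # ConceptNet URIs follow a fixed scheme: concept nodes are "/c/<lang>/<term>"
    # and relations are "/r/<Relation>"; the /query endpoint returns the edges
    # whose "start" node and "rel" match the given parameters.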
    all_occupations = []
    try:
        with httpx.Client(timeout=60.0) as client:
            for concept in start_concepts.get(lang_code, start_concepts["zh"]):
                for rel in relations:
                    offset = 0
                    max_pages = 5  # Fetch at most 5 pages per combination
                    for _ in range(max_pages):
                        try:
                            print(f"[ConceptNet] Querying {concept} {rel} (offset={offset})...")
                            # Query edges by their start node
                            response = client.get(
                                f"{endpoint}/query",
                                params={
                                    "start": concept,
                                    "rel": rel,
                                    "limit": page_size,
                                    "offset": offset,
                                },
                            )
                            if response.status_code != 200:
                                print(f"[ConceptNet] HTTP {response.status_code}, skipping")
                                break
                            data = response.json()
                            edges = data.get("edges", [])
                            if not edges:
                                break
                            parsed = parse_conceptnet_response(data, lang_code)
                            all_occupations.extend(parsed)
                            print(f"[ConceptNet] Retrieved {len(parsed)} concepts")
                            if len(edges) < page_size:
                                break
                            offset += page_size
                        except Exception as e:
                            print(f"[ConceptNet] Error: {e}")
                            break
        # Deduplicate by name
        seen = set()
        unique = []
        for occ in all_occupations:
            if occ["name"] not in seen:
                seen.add(occ["name"])
                unique.append(occ)
        print(f"[ConceptNet] After deduplication: {len(unique)} concepts")
        return unique
    except Exception as e:
        print(f"[ConceptNet] Error: {e}")
        raise


def parse_conceptnet_response(data: dict, lang_code: str) -> List[dict]:
    """Parse a ConceptNet API response."""
    results = []
    edges = data.get("edges", [])
    for edge in edges:
        start = edge.get("start", {})
        end = edge.get("end", {})
        # Try to extract a meaningful concept from either end of the edge
        for node in [start, end]:
            node_id = node.get("@id", "")
            label = node.get("label", "")
            # Filter: keep only labels in the target language that look meaningful
            if f"/c/{lang_code}/" in node_id and label and len(label) >= 2:
                # Exclude overly generic terms
                if label not in ["職業", "工作", "專業", "occupation", "job", "profession"]:
                    results.append({
                        "name": label,
                        "domain": infer_domain(label, lang_code),
                    })
    return results
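

# For reference, each edge returned by /query is a JSON object; a rough sketch
# (illustrative values only):
#   {
#     "start": {"@id": "/c/en/surgeon", "label": "surgeon", "language": "en"},
#     "rel": {"@id": "/r/IsA"},
#     "end": {"@id": "/c/en/doctor", "label": "doctor", "language": "en"},
#     "weight": 2.0
#   }

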
def infer_domain(occupation_name: str, language: str) -> str:
    """Infer a domain from an occupation name."""
    if language == "zh":
        # Checked in insertion order: specific keywords must precede generic ones,
        # so the catch-all professional suffix "師" is kept last.
        domain_keywords = {
            "醫": "醫療健康",
            "護": "醫療健康",
            "藥": "醫療健康",
            "工程": "工程技術",
            "技術": "工程技術",
            "設計": "設計創意",
            "藝術": "藝術文化",
            "音樂": "藝術文化",
            "運動": "體育運動",
            "農": "農業",
            "漁": "漁業",
            "商": "商業貿易",
            "貿": "商業貿易",
            "律": "法律",
            "法": "法律",
            "教": "教育",
            "研究": "學術研究",
            "科學": "學術研究",
            "廚": "餐飲服務",
            "餐": "餐飲服務",
            "建築": "建築營造",
            "軍": "軍事國防",
            "警": "公共安全",
            "消防": "公共安全",
            "記者": "媒體傳播",
            "編輯": "媒體傳播",
            "作家": "文學創作",
            "程式": "資訊科技",
            "軟體": "資訊科技",
            "電腦": "資訊科技",
            "師": "專業服務",
        }
    else:
        domain_keywords = {
            "doctor": "Healthcare",
            "nurse": "Healthcare",
            "medical": "Healthcare",
            "engineer": "Engineering",
            "technical": "Engineering",
            "design": "Design & Creative",
            "artist": "Arts & Culture",
            "music": "Arts & Culture",
            "sport": "Sports",
            "athletic": "Sports",
            "farm": "Agriculture",
            "fish": "Fishery",
            "business": "Business",
            "sales": "Business",
            "law": "Legal",
            "attorney": "Legal",
            "teach": "Education",
            "professor": "Education",
            "research": "Academic Research",
            "scien": "Academic Research",
            "chef": "Culinary",
            "cook": "Culinary",
            "architect": "Architecture",
            "military": "Military",
            "police": "Public Safety",
            "fire": "Public Safety",
            "journal": "Media",
            "editor": "Media",
            "writer": "Literature",
            "author": "Literature",
            "program": "Information Technology",
            "software": "Information Technology",
            "computer": "Information Technology",
            "develop": "Information Technology",
        }
    name_lower = occupation_name.lower()
    for keyword, domain in domain_keywords.items():
        if keyword in name_lower:
            return domain
    return "專業領域" if language == "zh" else "Professional Field"
def save_json(data: List[dict], source: str, language: str) -> None:
    """Save the data to a JSON file."""
    filename = f"{source}_occupations_{language}.json"
    filepath = DATA_DIR / filename
    output = {
        "metadata": {
            "source": source,
            "language": language,
            "fetched_at": datetime.now(timezone.utc).isoformat(),
            "total_count": len(data),
        },
        "occupations": data,
    }
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    print(f"[Saved] {filepath} ({len(data)} entries)")
def main():
    """Entry point."""
    print("=" * 60)
    print("Occupation data fetching script")
    print(f"Output directory: {DATA_DIR}")
    print("=" * 60)
    print()
    # Make sure the output directory exists
    DATA_DIR.mkdir(parents=True, exist_ok=True)

    # Fetch from Wikidata
    print("--- Wikidata ---")
    try:
        wikidata_zh = fetch_wikidata_occupations("zh")
        save_json(wikidata_zh, "wikidata", "zh")
    except Exception as e:
        print(f"Wikidata zh fetch failed: {e}")
        wikidata_zh = []
    try:
        wikidata_en = fetch_wikidata_occupations("en")
        save_json(wikidata_en, "wikidata", "en")
    except Exception as e:
        print(f"Wikidata en fetch failed: {e}")
        wikidata_en = []
    print()

    # Fetch from ConceptNet
    print("--- ConceptNet ---")
    try:
        conceptnet_zh = fetch_conceptnet_occupations("zh")
        save_json(conceptnet_zh, "conceptnet", "zh")
    except Exception as e:
        print(f"ConceptNet zh fetch failed: {e}")
        conceptnet_zh = []
    try:
        conceptnet_en = fetch_conceptnet_occupations("en")
        save_json(conceptnet_en, "conceptnet", "en")
    except Exception as e:
        print(f"ConceptNet en fetch failed: {e}")
        conceptnet_en = []
    print()

    print("=" * 60)
    print("Fetching complete!")
    print(f"  Wikidata zh: {len(wikidata_zh)}")
    print(f"  Wikidata en: {len(wikidata_en)}")
    print(f"  ConceptNet zh: {len(conceptnet_zh)}")
    print(f"  ConceptNet en: {len(conceptnet_en)}")
    print("=" * 60)


if __name__ == "__main__":
main()