Files
novelty-seeking/backend/app/services/expert_source_service.py
gbanyan 5571076406 feat: Add curated expert occupations with local data sources
- Add curated occupations seed files (210 entries in zh/en) with specific domains
- Add DBpedia occupations data (2164 entries) for external source option
- Refactor expert_source_service to read from local JSON files
- Improve keyword generation prompts to leverage expert domain context
- Add architecture analysis documentation (ARCHITECTURE_ANALYSIS.md)
- Fix expert source selection bug (proper handling of empty custom_experts)
- Update frontend to support curated/dbpedia/wikidata expert sources

Key changes:
- backend/app/data/: Local occupation data files
- backend/app/services/expert_source_service.py: Simplified local file reading
- backend/app/prompts/expert_transformation_prompt.py: Better domain-aware prompts
- Removed expert_cache.py (no longer needed with local files)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-04 16:34:35 +08:00

202 lines
5.9 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Expert 本地資料來源服務
從本地 JSON 檔案讀取職業資料,提供隨機選取功能。
"""
import json
import logging
import random
from pathlib import Path
from typing import List, Tuple
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Data directory: the "data" folder that sits beside the directory
# containing this file.
DATA_DIR = Path(__file__).parent.parent.joinpath("data")
class LocalDataProvider:
    """Read occupation records for one data source from local JSON files.

    Files are looked up as ``DATA_DIR/<source>_occupations_<language>.json``
    and are expected to contain a JSON object whose ``"occupations"`` key
    holds a list of occupation dicts. Successfully loaded lists are cached
    in memory on the instance, keyed by source and language.
    """

    def __init__(self, source: str):
        """
        Args:
            source: Data source name, used as the file-name prefix
                (e.g. "curated", "dbpedia", "wikidata").
        """
        self.source = source
        # In-memory cache: "<source>:<language>" -> list of occupations.
        self._cache: dict = {}

    def load_occupations(self, language: str = "en") -> List[dict]:
        """Load every occupation available for ``language``.

        Args:
            language: Language code (en/zh), used as the file-name suffix.

        Returns:
            The occupation list, e.g. ``[{"name": "...", "domain": "..."}]``.
            An empty list is returned (and nothing is cached) when the file
            is missing, unreadable, not valid JSON, or not shaped as
            expected. The returned list is the cached object itself, so
            callers must treat it as read-only.
        """
        cache_key = f"{self.source}:{language}"
        # Serve from the in-memory cache when this language was loaded before.
        if cache_key in self._cache:
            return self._cache[cache_key]

        file_path = DATA_DIR / f"{self.source}_occupations_{language}.json"
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except FileNotFoundError:
            logger.warning(f"資料檔案不存在: {file_path}")
            return []
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            logger.error(f"讀取職業資料失敗: {e}")
            return []

        # Validate the shape explicitly instead of relying on a blanket
        # ``except`` to catch attribute/type errors further down.
        occupations = data.get("occupations", []) if isinstance(data, dict) else None
        if not isinstance(occupations, list):
            logger.error(
                f"讀取職業資料失敗: unexpected JSON structure in {file_path}"
            )
            return []

        logger.info(f"載入 {len(occupations)} {self.source} {language} 職業")
        self._cache[cache_key] = occupations
        return occupations

    def random_select(self, count: int, language: str = "en") -> List[dict]:
        """Pick up to ``count`` occupations at random.

        Args:
            count: Number of occupations wanted. Zero or a negative number
                yields an empty list.
            language: Language code.

        Returns:
            A new list. When fewer than (or exactly) ``count`` occupations
            are available, all of them are returned in file order; otherwise
            ``count`` distinct entries are sampled without replacement.
        """
        # random.sample() raises ValueError for a negative sample size.
        if count <= 0:
            return []
        all_occupations = self.load_occupations(language)
        if len(all_occupations) <= count:
            # Copy so that callers mutating the result cannot corrupt the
            # cached list (random.sample already returns a fresh list).
            return list(all_occupations)
        return random.sample(all_occupations, count)
class ExpertSourceService:
    """Single entry point for the local expert (occupation) data sources."""

    # (source, language, metadata timestamp key) for each data file reported
    # by get_available_sources(), in output order.
    _SOURCE_FILES = (
        ("curated", "zh", "created_at"),
        ("curated", "en", "created_at"),
        ("dbpedia", "en", "fetched_at"),
        ("wikidata", "zh", "fetched_at"),
    )

    def __init__(self):
        self.curated = LocalDataProvider("curated")  # curated occupations
        self.dbpedia = LocalDataProvider("dbpedia")
        self.wikidata = LocalDataProvider("wikidata")

    def get_experts(
        self,
        source: str,
        count: int,
        language: str = "en",
        fallback_to_llm: bool = True
    ) -> Tuple[List[dict], str]:
        """Fetch randomly selected experts from the requested source.

        Args:
            source: Source type: "curated", "wikidata" or "dbpedia". Any
                other value is treated as "dbpedia".
            count: Number of experts wanted.
            language: Language code. For the curated source, anything other
                than "zh"/"en" is replaced by "zh".
            fallback_to_llm: Whether a fallback is allowed on failure. This
                method does not read it; the fallback is left to the caller.

        Returns:
            A tuple ``(experts, source_used)`` where ``source_used`` is the
            source name that was actually queried.

        Raises:
            ValueError: If the selected provider returns no occupations.
        """
        if source == "curated":
            provider = self.curated
            # Curated data exists for zh and en only; default to zh.
            if language not in ("zh", "en"):
                language = "zh"
        elif source == "wikidata":
            provider = self.wikidata
        else:
            # Unknown sources fall back to dbpedia.
            provider = self.dbpedia
            source = "dbpedia"

        experts = provider.random_select(count, language)
        if not experts:
            raise ValueError(f"No occupations found from {source} ({language})")

        logger.info(f"{source} 取得 {len(experts)} 位專家")
        return experts, source

    def get_available_sources(self) -> List[dict]:
        """Describe the local data files that are currently usable.

        Returns:
            One dict per readable data file, in the order curated/zh,
            curated/en, dbpedia/en, wikidata/zh. Each dict has the keys
            "source", "language", "count" and a timestamp key ("created_at"
            for curated files, "fetched_at" for the others). Files that are
            missing are skipped; files that cannot be read or lack the
            expected metadata are logged and skipped.
        """
        sources = []
        for source, language, stamp_key in self._SOURCE_FILES:
            info = self._read_source_info(source, language, stamp_key)
            if info is not None:
                sources.append(info)
        return sources

    @staticmethod
    def _read_source_info(source: str, language: str, stamp_key: str):
        """Return the summary dict for one data file, or None if unusable."""
        path = DATA_DIR / f"{source}_occupations_{language}.json"
        try:
            with open(path, "r", encoding="utf-8") as f:
                metadata = json.load(f)["metadata"]
            return {
                "source": source,
                "language": language,
                "count": metadata["total_count"],
                stamp_key: metadata[stamp_key],
            }
        except FileNotFoundError:
            # A missing file simply means the source is not available.
            return None
        except (OSError, ValueError, KeyError, TypeError) as e:
            # One broken file must not take down the whole listing.
            logger.error("Failed to read source metadata from %s: %r", path, e)
            return None
# Global service instance, created once when this module is imported.
expert_source_service = ExpertSourceService()