feat: Add Deduplication Agent with embedding and LLM methods

Implement a new Deduplication Agent that identifies and groups similar
transformation descriptions. Supports two deduplication methods:
- Embedding: Fast vector similarity comparison using cosine similarity
- LLM: Accurate pairwise semantic comparison (slower but more precise)

Backend changes:
- Add deduplication router with /deduplicate endpoint
- Add embedding_service for vector-based similarity
- Add llm_deduplication_service for LLM-based comparison
- Improve expert_transformation error handling and progress reporting

Frontend changes:
- Add DeduplicationPanel with interactive group visualization
- Add useDeduplication hook for state management
- Integrate deduplication tab in main App
- Add threshold slider and method selector in sidebar

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-22 20:26:17 +08:00
parent 5571076406
commit bc281b8e0a
18 changed files with 1397 additions and 25 deletions

View File

@@ -0,0 +1,93 @@
"""
Deduplication Router - 使用 Embedding 或 LLM 去重描述
提供 API 端點將相似的創新描述分組,幫助識別重複的想法。
支援兩種方法:
- Embedding: 快速向量相似度比較
- LLM: 精準語意判斷(較慢但更準確)
"""
import logging
from fastapi import APIRouter, HTTPException
from ..models.schemas import DeduplicationRequest, DeduplicationResult, DeduplicationMethod
from ..services.embedding_service import embedding_service
from ..services.llm_deduplication_service import llm_deduplication_service
# Module-level logger named after the package path (standard logging convention).
logger = logging.getLogger(__name__)
# All endpoints below are mounted under /api/deduplication and grouped in the
# OpenAPI docs under the "deduplication" tag.
router = APIRouter(prefix="/api/deduplication", tags=["deduplication"])
@router.post("/deduplicate", response_model=DeduplicationResult)
async def deduplicate_descriptions(request: DeduplicationRequest) -> DeduplicationResult:
    """
    Group similar descriptions and report the deduplicated result.

    Two methods are supported:
    - embedding: vector cosine-similarity comparison (fast)
    - llm: pairwise LLM comparison (accurate but slower)

    Args:
        request: Deduplication request containing the description list,
            the method selection and related parameters.

    Returns:
        DeduplicationResult: deduplication outcome including group info.

    Raises:
        HTTPException: status 500 if deduplication processing fails.
    """
    method = request.method
    # Lazy %-style args avoid formatting work when INFO logging is disabled.
    logger.info(
        "Deduplication request: %d descriptions, method=%s, threshold=%s",
        len(request.descriptions), method.value, request.similarity_threshold,
    )
    # Fast path: nothing to deduplicate. Still report which model *would*
    # have been used so the response shape stays consistent for clients.
    if not request.descriptions:
        default_model = (
            "nomic-embed-text" if method == DeduplicationMethod.EMBEDDING else "qwen3:4b"
        )
        return DeduplicationResult(
            total_input=0,
            total_groups=0,
            total_duplicates=0,
            groups=[],
            threshold_used=request.similarity_threshold,
            method_used=method,
            model_used=request.model or default_model,
        )
    try:
        if method == DeduplicationMethod.EMBEDDING:
            # Embedding similarity: threshold-based grouping.
            result = await embedding_service.deduplicate(
                descriptions=request.descriptions,
                threshold=request.similarity_threshold,
                model=request.model,
            )
        else:
            # LLM pairwise comparison: no threshold parameter.
            result = await llm_deduplication_service.deduplicate(
                descriptions=request.descriptions,
                model=request.model,
            )
        return result
    except ValueError as e:
        # Known failure mode raised by the services; logger.exception records
        # the traceback (logger.error with an f-string would drop it).
        logger.exception("Deduplication failed")
        raise HTTPException(status_code=500, detail=str(e)) from e
    except Exception as e:
        # Boundary catch-all: chain the cause so debugging keeps the origin.
        logger.exception("Unexpected error during deduplication")
        raise HTTPException(status_code=500, detail=f"Deduplication failed: {str(e)}") from e
@router.get("/models")
async def list_embedding_models():
    """
    List the embedding models this service knows about.

    Returns:
        dict: the recommended default model, the available choices, and a
        note on how to install a model locally.
    """
    # Keep the catalog as (name, description) pairs; expand to dicts below.
    catalog = [
        ("nomic-embed-text", "Fast and efficient embedding model"),
        ("mxbai-embed-large", "High quality embeddings"),
        ("all-minilm", "Lightweight embedding model"),
    ]
    return {
        "default": "nomic-embed-text",
        "available": [
            {"name": model_name, "description": blurb}
            for model_name, blurb in catalog
        ],
        "note": "Run 'ollama pull <model>' to install a model",
    }

View File

@@ -221,8 +221,27 @@ async def generate_expert_transformation_events(
desc_prompt, model=model, temperature=temperature
)
desc_data = extract_json_from_response(desc_response)
desc_text = desc_data.get("description", "")
# 嘗試解析 JSON若失敗則使用原始回應作為描述
desc_text = ""
try:
desc_data = extract_json_from_response(desc_response)
# 支援多種可能的 key: description, content, text, desc
desc_text = (
desc_data.get("description") or
desc_data.get("content") or
desc_data.get("text") or
desc_data.get("desc") or
""
)
except ValueError:
# JSON 解析失敗,嘗試清理原始回應作為描述
cleaned = desc_response.strip()
# 移除可能的 markdown 和多餘符號
if cleaned.startswith('"') and cleaned.endswith('"'):
cleaned = cleaned[1:-1]
if len(cleaned) > 5 and len(cleaned) < 100:
desc_text = cleaned
logger.info(f"[DESC] 使用 fallback 描述 for '{kw.keyword}': {desc_text[:50]}")
if desc_text:
descriptions.append(ExpertTransformationDescription(
@@ -231,15 +250,22 @@ async def generate_expert_transformation_events(
expert_name=kw.expert_name,
description=desc_text
))
else:
logger.warning(f"[DESC] Empty description for keyword='{kw.keyword}', parsed_data={desc_data}")
# Send progress update
yield f"event: description_progress\ndata: {json.dumps({'current': idx + 1, 'total': len(all_expert_keywords), 'keyword': kw.keyword}, ensure_ascii=False)}\n\n"
# Send progress update with success/fail status
yield f"event: description_progress\ndata: {json.dumps({'current': idx + 1, 'total': len(all_expert_keywords), 'keyword': kw.keyword, 'success': bool(desc_text)}, ensure_ascii=False)}\n\n"
except Exception as e:
logger.warning(f"Failed to generate description for '{kw.keyword}': {e}")
logger.warning(f"[DESC] Failed to generate description for '{kw.keyword}': {e}")
yield f"event: description_progress\ndata: {json.dumps({'current': idx + 1, 'total': len(all_expert_keywords), 'keyword': kw.keyword, 'success': False, 'error': str(e)}, ensure_ascii=False)}\n\n"
# Continue with next keyword
yield f"event: description_complete\ndata: {json.dumps({'count': len(descriptions)}, ensure_ascii=False)}\n\n"
# 統計成功率
success_rate = len(descriptions) / len(all_expert_keywords) * 100 if all_expert_keywords else 0
logger.info(f"[DESC] 描述生成完成: {len(descriptions)}/{len(all_expert_keywords)} 成功 ({success_rate:.1f}%)")
yield f"event: description_complete\ndata: {json.dumps({'count': len(descriptions), 'total': len(all_expert_keywords), 'success_rate': success_rate}, ensure_ascii=False)}\n\n"
# ========== Build final result ==========
result = ExpertTransformationCategoryResult(