feat: Add Deduplication Agent with embedding and LLM methods

Implement a new Deduplication Agent that identifies and groups similar transformation descriptions.

Supports two deduplication methods:
- Embedding: Fast vector similarity comparison using cosine similarity
- LLM: Accurate pairwise semantic comparison (slower but more precise)

Backend changes:
- Add deduplication router with /deduplicate endpoint
- Add embedding_service for vector-based similarity
- Add llm_deduplication_service for LLM-based comparison
- Improve expert_transformation error handling and progress reporting

Frontend changes:
- Add DeduplicationPanel with interactive group visualization
- Add useDeduplication hook for state management
- Integrate deduplication tab in main App
- Add threshold slider and method selector in sidebar

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-22 20:26:17 +08:00
parent 5571076406
commit bc281b8e0a
18 changed files with 1397 additions and 25 deletions
--- a/backend/app/routers/deduplication.py
+++ b/backend/app/routers/deduplication.py
@@ -0,0 +1,93 @@
+"""
+Deduplication Router - deduplicate descriptions using Embedding or LLM
+
+Provides API endpoints that group similar innovation descriptions to help identify duplicate ideas.
+Two methods are supported:
+- Embedding: fast vector similarity comparison
+- LLM: precise semantic judgment (slower but more accurate)
+"""
+
+import logging
+from fastapi import APIRouter, HTTPException
+
+from ..models.schemas import DeduplicationRequest, DeduplicationResult, DeduplicationMethod
+from ..services.embedding_service import embedding_service
+from ..services.llm_deduplication_service import llm_deduplication_service
+
+logger = logging.getLogger(__name__)
+
+router = APIRouter(prefix="/api/deduplication", tags=["deduplication"])
+
+
+@router.post("/deduplicate", response_model=DeduplicationResult)
+async def deduplicate_descriptions(request: DeduplicationRequest) -> DeduplicationResult:
+    """
+    Group similar descriptions so duplicate ideas can be identified.
+
+    Two methods are supported, selected by ``request.method``:
+    - embedding: vector similarity against ``similarity_threshold`` (fast)
+    - llm: pairwise LLM comparison (more precise but slower)
+
+    Args:
+        request: descriptions to deduplicate, plus method, threshold and model.
+
+    Returns:
+        DeduplicationResult: the grouping; an empty result for empty input.
+
+    Raises:
+        HTTPException: status 500 when the selected deduplication service raises.
+    """
+    method = request.method
+    logger.info(f"Deduplication request: {len(request.descriptions)} descriptions, method={method.value}, threshold={request.similarity_threshold}")
+
+    if not request.descriptions:
+        return DeduplicationResult(
+            total_input=0,
+            total_groups=0,
+            total_duplicates=0,
+            groups=[],
+            threshold_used=request.similarity_threshold,
+            method_used=method,
+            model_used=request.model or ("nomic-embed-text" if method == DeduplicationMethod.EMBEDDING else "qwen3:4b")
+        )
+
+    try:
+        if method == DeduplicationMethod.EMBEDDING:
+            # Deduplicate by embedding similarity
+            result = await embedding_service.deduplicate(
+                descriptions=request.descriptions,
+                threshold=request.similarity_threshold,
+                model=request.model
+            )
+        else:
+            # Deduplicate by pairwise LLM comparison (no threshold is passed)
+            result = await llm_deduplication_service.deduplicate(
+                descriptions=request.descriptions,
+                model=request.model
+            )
+        return result
+    except ValueError as e:
+        logger.exception(f"Deduplication failed: {e}")
+        raise HTTPException(status_code=500, detail=str(e)) from e
+    except Exception as e:
+        logger.exception(f"Unexpected error during deduplication: {e}")
+        raise HTTPException(status_code=500, detail=f"Deduplication failed: {str(e)}") from e
+
+
+@router.get("/models")
+async def list_embedding_models():
+    """
+    List the embedding models advertised by this router.
+
+    Returns:
+        dict: ``default`` model name, ``available`` entries, and an install ``note``.
+    """
+    catalog = (
+        ("nomic-embed-text", "Fast and efficient embedding model"),
+        ("mxbai-embed-large", "High quality embeddings"),
+        ("all-minilm", "Lightweight embedding model"),
+    )
+    default_model = catalog[0][0]
+    models = [{"name": name, "description": text} for name, text in catalog]
+    install_hint = "Run 'ollama pull <model>' to install a model"
+    return {"default": default_model, "available": models, "note": install_hint}