feat: Add Deduplication Agent with embedding and LLM methods

Implement a new Deduplication Agent that identifies and groups similar transformation descriptions. Supports two deduplication methods: - Embedding: Fast vector similarity comparison using cosine similarity - LLM: Accurate pairwise semantic comparison (slower but more precise) Backend changes: - Add deduplication router with /deduplicate endpoint - Add embedding_service for vector-based similarity - Add llm_deduplication_service for LLM-based comparison - Improve expert_transformation error handling and progress reporting Frontend changes: - Add DeduplicationPanel with interactive group visualization - Add useDeduplication hook for state management - Integrate deduplication tab in main App - Add threshold slider and method selector in sidebar 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-22 20:26:17 +08:00
parent 5571076406
commit bc281b8e0a
18 changed files with 1397 additions and 25 deletions
--- a/backend/app/services/embedding_service.py
+++ b/backend/app/services/embedding_service.py
@@ -0,0 +1,250 @@
+"""
+Embedding Service - generates embeddings and performs similarity-based deduplication
+
+使用 Ollama 的 embedding 端點生成向量，並透過餘弦相似度進行去重分組。
+"""
+
+import logging
+from typing import List, Optional
+
+import httpx
+import numpy as np
+
+from ..config import settings
+from ..models.schemas import (
+    ExpertTransformationDescription,
+    DeduplicationResult,
+    DeduplicationMethod,
+    DescriptionGroup,
+)
+
+# Module-level logger, named after this module via __name__.
+logger = logging.getLogger(__name__)
+
+
+class EmbeddingService:
+    """Generate embeddings via Ollama and group near-duplicate descriptions.
+
+    Vectors are requested from the ``/api/embed`` endpoint below
+    ``settings.ollama_base_url``. Descriptions are then clustered greedily
+    using pairwise cosine similarity against a threshold.
+    """
+
+    def __init__(self):
+        self.base_url = settings.ollama_base_url
+        self.default_model = "nomic-embed-text"  # default embedding model used with Ollama
+        # A single async client is reused for every request; release it with close().
+        self.client = httpx.AsyncClient(timeout=120.0)
+
+    async def _request_embeddings(self, model: str, payload) -> List[List[float]]:
+        """POST ``payload`` as the ``input`` field and return the ``embeddings`` entry.
+
+        ``payload`` is either one string or a list of strings. Errors are not
+        handled here: ``raise_for_status()`` failures and malformed responses
+        propagate to the caller, which decides how to log or recover.
+        """
+        response = await self.client.post(
+            f"{self.base_url}/api/embed",
+            json={"model": model, "input": payload},
+        )
+        response.raise_for_status()
+        return response.json()["embeddings"]
+
+    async def get_embedding(self, text: str, model: Optional[str] = None) -> List[float]:
+        """Return the embedding vector for a single text.
+
+        Args:
+            text: Text to embed.
+            model: Embedding model name; falls back to ``self.default_model``.
+
+        Returns:
+            The first vector of the response's ``embeddings`` list.
+
+        Raises:
+            httpx.HTTPStatusError: When ``raise_for_status()`` rejects the response.
+            Exception: Any other failure is logged and re-raised unchanged.
+        """
+        model = model or self.default_model
+
+        try:
+            vectors = await self._request_embeddings(model, text)
+            return vectors[0]
+        except httpx.HTTPStatusError as e:
+            logger.error(f"Embedding API error: {e.response.status_code} - {e.response.text}")
+            raise
+        except Exception as e:
+            logger.error(f"Embedding error: {e}")
+            raise
+
+    async def get_embeddings_batch(
+        self,
+        texts: List[str],
+        model: Optional[str] = None
+    ) -> List[List[float]]:
+        """Return one embedding vector per input text, in input order.
+
+        The whole list is sent in a single request first. If that request is
+        rejected with an HTTP error status, every text is embedded
+        individually through ``get_embedding`` instead.
+
+        Args:
+            texts: Texts to embed; an empty list yields an empty result
+                without any request being made.
+            model: Embedding model name; falls back to ``self.default_model``.
+
+        Raises:
+            ValueError: When the batch response does not contain exactly one
+                vector per input text.
+            Exception: Any non-HTTP-status failure of the batch request, or
+                any failure of the per-text fallback, is re-raised.
+        """
+        if not texts:
+            return []
+
+        model = model or self.default_model
+
+        try:
+            embeddings = await self._request_embeddings(model, texts)
+        except httpx.HTTPStatusError as e:
+            logger.error(f"Batch embedding API error: {e.response.status_code} - {e.response.text}")
+            # The batch call failed, so retry one text at a time.
+            logger.info("Falling back to single embedding requests...")
+            return [await self.get_embedding(text, model) for text in texts]
+        except Exception as e:
+            logger.error(f"Batch embedding error: {e}")
+            raise
+
+        # Callers pair vectors with inputs by position, so a count mismatch
+        # would silently misalign (or drop) descriptions further down.
+        if len(embeddings) != len(texts):
+            raise ValueError(
+                f"Expected {len(texts)} embeddings but received {len(embeddings)}"
+            )
+        return embeddings
+
+    def cosine_similarity(self, a: List[float], b: List[float]) -> float:
+        """Return the cosine similarity of two vectors.
+
+        Returns ``0.0`` when either vector has zero norm, so the division is
+        never performed with a zero denominator.
+        """
+        a_np = np.asarray(a, dtype=float)
+        b_np = np.asarray(b, dtype=float)
+        norm_a = np.linalg.norm(a_np)
+        norm_b = np.linalg.norm(b_np)
+        if norm_a == 0 or norm_b == 0:
+            return 0.0
+        return float(np.dot(a_np, b_np) / (norm_a * norm_b))
+
+    def build_similarity_matrix(
+        self,
+        embeddings: List[List[float]]
+    ) -> np.ndarray:
+        """Build the symmetric pairwise cosine-similarity matrix.
+
+        Args:
+            embeddings: Equal-length vectors, one per item.
+
+        Returns:
+            An ``(n, n)`` float array. The diagonal is always ``1.0``; a
+            zero-norm vector has similarity ``0.0`` with every other item.
+        """
+        n = len(embeddings)
+        if n == 0:
+            return np.zeros((0, 0))
+
+        vectors = np.asarray(embeddings, dtype=float)
+        norms = np.linalg.norm(vectors, axis=1)
+        # Dividing zero-norm rows by 1 keeps them all-zero, which makes their
+        # dot product with any other row 0.0 instead of NaN.
+        safe_norms = np.where(norms == 0, 1.0, norms)
+        unit_vectors = vectors / safe_norms[:, np.newaxis]
+
+        matrix = unit_vectors @ unit_vectors.T
+        # Rounding can push values marginally outside the valid cosine range.
+        np.clip(matrix, -1.0, 1.0, out=matrix)
+        np.fill_diagonal(matrix, 1.0)  # an item is fully similar to itself
+        return matrix
+
+    def cluster_by_similarity(
+        self,
+        similarity_matrix: np.ndarray,
+        threshold: float
+    ) -> List[List[int]]:
+        """Greedily group items whose similarity to a representative is >= threshold.
+
+        Algorithm:
+        1. Take the first item that has no group yet as the representative.
+        2. Collect every later, still ungrouped item whose similarity to the
+           representative is >= ``threshold``.
+        3. Put them in one group with the representative first.
+        4. Repeat until every item belongs to a group.
+
+        Members are compared with the representative only, never with each
+        other.
+
+        Returns:
+            List[List[int]]: One list of item indices per group; the first
+            index in each list is the representative.
+        """
+        n = len(similarity_matrix)
+        assigned = [False] * n
+        groups = []
+
+        for rep in range(n):
+            if assigned[rep]:
+                continue
+            assigned[rep] = True
+
+            members = [
+                other
+                for other in range(rep + 1, n)
+                if not assigned[other] and similarity_matrix[rep][other] >= threshold
+            ]
+            for other in members:
+                assigned[other] = True
+
+            groups.append([rep, *members])
+
+        return groups
+
+    async def deduplicate(
+        self,
+        descriptions: List[ExpertTransformationDescription],
+        threshold: float = 0.85,
+        model: Optional[str] = None
+    ) -> DeduplicationResult:
+        """Group similar descriptions using embedding cosine similarity.
+
+        Args:
+            descriptions: Descriptions to deduplicate; their ``description``
+                text is what gets embedded.
+            threshold: Similarity threshold (0.0-1.0), 0.85 by default.
+            model: Embedding model name; falls back to ``self.default_model``.
+
+        Returns:
+            DeduplicationResult: Groups (representative, duplicates and the
+            duplicates' similarity to the representative) plus summary counts.
+
+        Raises:
+            ValueError: When embeddings cannot be generated; the underlying
+                exception is attached as ``__cause__``.
+        """
+        model = model or self.default_model
+
+        # Nothing to embed: return an empty result without calling the API.
+        if not descriptions:
+            return DeduplicationResult(
+                total_input=0,
+                total_groups=0,
+                total_duplicates=0,
+                groups=[],
+                threshold_used=threshold,
+                method_used=DeduplicationMethod.EMBEDDING,
+                model_used=model
+            )
+
+        texts = [d.description for d in descriptions]
+        logger.info(f"Generating embeddings for {len(texts)} descriptions using model '{model}'...")
+
+        try:
+            embeddings = await self.get_embeddings_batch(texts, model)
+        except Exception as e:
+            logger.error(f"Failed to generate embeddings: {e}")
+            raise ValueError(f"Embedding generation failed: {e}. Make sure the model '{model}' is installed (run: ollama pull {model})") from e
+
+        logger.info("Building similarity matrix...")
+        sim_matrix = self.build_similarity_matrix(embeddings)
+
+        logger.info(f"Clustering with threshold {threshold}...")
+        clusters = self.cluster_by_similarity(sim_matrix, threshold)
+
+        # The first index of each cluster is its representative. A singleton
+        # cluster simply produces empty duplicate and score lists.
+        result_groups = []
+        for group_idx, indices in enumerate(clusters):
+            rep_idx, *dup_indices = indices
+            result_groups.append(DescriptionGroup(
+                group_id=f"group-{group_idx}",
+                representative=descriptions[rep_idx],
+                duplicates=[descriptions[idx] for idx in dup_indices],
+                similarity_scores=[float(sim_matrix[rep_idx][idx]) for idx in dup_indices]
+            ))
+        total_duplicates = sum(len(indices) - 1 for indices in clusters)
+
+        logger.info(f"Deduplication complete: {len(descriptions)} -> {len(result_groups)} groups, {total_duplicates} duplicates found")
+
+        return DeduplicationResult(
+            total_input=len(descriptions),
+            total_groups=len(result_groups),
+            total_duplicates=total_duplicates,
+            groups=result_groups,
+            threshold_used=threshold,
+            method_used=DeduplicationMethod.EMBEDDING,
+            model_used=model
+        )
+
+    async def close(self):
+        """Close the underlying HTTP client."""
+        await self.client.aclose()
+
+
+# Global instance: created once when this module is imported, so importers
+# of `embedding_service` share the same EmbeddingService object.
+embedding_service = EmbeddingService()