From bc281b8e0a0848e62ea9d08fc2ae2de162b13169 Mon Sep 17 00:00:00 2001 From: gbanyan Date: Mon, 22 Dec 2025 20:26:17 +0800 Subject: [PATCH] feat: Add Deduplication Agent with embedding and LLM methods MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement a new Deduplication Agent that identifies and groups similar transformation descriptions. Supports two deduplication methods: - Embedding: Fast vector similarity comparison using cosine similarity - LLM: Accurate pairwise semantic comparison (slower but more precise) Backend changes: - Add deduplication router with /deduplicate endpoint - Add embedding_service for vector-based similarity - Add llm_deduplication_service for LLM-based comparison - Improve expert_transformation error handling and progress reporting Frontend changes: - Add DeduplicationPanel with interactive group visualization - Add useDeduplication hook for state management - Integrate deduplication tab in main App - Add threshold slider and method selector in sidebar 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- .gitignore | 3 + backend/app/main.py | 7 +- backend/app/models/schemas.py | 35 +++ .../prompts/expert_transformation_prompt.py | 15 +- backend/app/routers/deduplication.py | 93 ++++++ backend/app/routers/expert_transformation.py | 38 ++- backend/app/services/embedding_service.py | 250 ++++++++++++++++ .../app/services/llm_deduplication_service.py | 252 ++++++++++++++++ backend/requirements.txt | 1 + frontend/src/App.tsx | 112 +++++++- .../src/components/DeduplicationPanel.tsx | 271 ++++++++++++++++++ .../components/TransformationInputPanel.tsx | 22 ++ .../src/components/TransformationPanel.tsx | 13 +- .../components/deduplication/GroupCard.tsx | 147 ++++++++++ frontend/src/hooks/useDeduplication.ts | 100 +++++++ frontend/src/hooks/useExpertTransformation.ts | 4 +- frontend/src/services/api.ts | 25 +- frontend/src/types/index.ts | 34 +++ 18 
files changed, 1397 insertions(+), 25 deletions(-) create mode 100644 backend/app/routers/deduplication.py create mode 100644 backend/app/services/embedding_service.py create mode 100644 backend/app/services/llm_deduplication_service.py create mode 100644 frontend/src/components/DeduplicationPanel.tsx create mode 100644 frontend/src/components/deduplication/GroupCard.tsx create mode 100644 frontend/src/hooks/useDeduplication.ts diff --git a/.gitignore b/.gitignore index 8c01a56..ab102f8 100644 --- a/.gitignore +++ b/.gitignore @@ -36,3 +36,6 @@ env/ .DS_Store .idea/ .vscode/ + +# Serena (MCP tools) +.serena/ diff --git a/backend/app/main.py b/backend/app/main.py index db3224b..a607f36 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -3,14 +3,18 @@ from contextlib import asynccontextmanager from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware -from .routers import attributes, transformation, expert_transformation +from .routers import attributes, transformation, expert_transformation, deduplication from .services.llm_service import ollama_provider +from .services.embedding_service import embedding_service +from .services.llm_deduplication_service import llm_deduplication_service @asynccontextmanager async def lifespan(app: FastAPI): yield await ollama_provider.close() + await embedding_service.close() + await llm_deduplication_service.close() app = FastAPI( @@ -31,6 +35,7 @@ app.add_middleware( app.include_router(attributes.router) app.include_router(transformation.router) app.include_router(expert_transformation.router) +app.include_router(deduplication.router) @app.get("/") diff --git a/backend/app/models/schemas.py b/backend/app/models/schemas.py index 57730b4..d277fae 100644 --- a/backend/app/models/schemas.py +++ b/backend/app/models/schemas.py @@ -232,3 +232,38 @@ class ExpertTransformationRequest(BaseModel): # LLM parameters model: Optional[str] = None temperature: Optional[float] = 0.7 + + +# ===== Deduplication Agent 
schemas ===== + +class DeduplicationMethod(str, Enum): + """去重方法""" + EMBEDDING = "embedding" # 向量相似度 + LLM = "llm" # LLM 成對判斷 + + +class DeduplicationRequest(BaseModel): + """去重請求""" + descriptions: List[ExpertTransformationDescription] + method: DeduplicationMethod = DeduplicationMethod.EMBEDDING # 去重方法 + similarity_threshold: float = 0.85 # 餘弦相似度閾值 (0.0-1.0),僅 Embedding 使用 + model: Optional[str] = None # Embedding/LLM 模型 + + +class DescriptionGroup(BaseModel): + """相似描述分組""" + group_id: str # "group-0", "group-1"... + representative: ExpertTransformationDescription # 代表描述 + duplicates: List[ExpertTransformationDescription] # 相似描述 + similarity_scores: List[float] # 每個重複項的相似度分數 + + +class DeduplicationResult(BaseModel): + """去重結果""" + total_input: int # 輸入描述總數 + total_groups: int # 分組數量 + total_duplicates: int # 重複項數量 + groups: List[DescriptionGroup] + threshold_used: float + method_used: DeduplicationMethod # 使用的去重方法 + model_used: str # 使用的模型 diff --git a/backend/app/prompts/expert_transformation_prompt.py b/backend/app/prompts/expert_transformation_prompt.py index 6d989b7..f40481c 100644 --- a/backend/app/prompts/expert_transformation_prompt.py +++ b/backend/app/prompts/expert_transformation_prompt.py @@ -90,16 +90,15 @@ def get_single_description_prompt( ) -> str: """Step 2: 為單一關鍵字生成描述""" # 如果 domain 是通用的,就只用職業名稱 - domain_text = f"({expert_domain})" if expert_domain and expert_domain != "Professional Field" else "" + domain_text = f"({expert_domain}領域)" if expert_domain and expert_domain != "Professional Field" else "" return f"""/no_think -物件:「{query}」 -專家:{expert_name}{domain_text} +你是一位{expert_name}{domain_text}。 + +任務:為「{query}」生成一段創新應用描述。 關鍵字:{keyword} -你是一位{expert_name}。從你的專業視角,生成一段創新應用描述(15-30字),說明如何將「{keyword}」的概念應用到「{query}」上。 +從你的專業視角,說明如何將「{keyword}」的概念應用到「{query}」上。描述要具體、有創意,15-30字。 -描述要體現{expert_name}的專業思維和獨特觀點。 - -回傳 JSON: -{{"description": "應用描述"}}""" +只回傳 JSON,不要其他文字: +{{"description": "你的創新應用描述"}}""" diff --git 
logger = logging.getLogger(__name__)

router = APIRouter(prefix="/api/deduplication", tags=["deduplication"])


@router.post("/deduplicate", response_model=DeduplicationResult)
async def deduplicate_descriptions(request: DeduplicationRequest) -> DeduplicationResult:
    """Group similar descriptions using the method selected in the request.

    Two methods are supported:
    - embedding: cosine similarity between embedding vectors (fast)
    - llm: pairwise YES/NO judgement by an LLM (slower)

    Args:
        request: Descriptions to deduplicate, the method to use, the
            similarity threshold (only forwarded to the embedding method)
            and an optional model name.

    Returns:
        DeduplicationResult: The grouping produced by the selected service,
        or an empty result when the request carries no descriptions.

    Raises:
        HTTPException: Status 500 when the selected service fails.
    """
    method = request.method
    logger.info(f"Deduplication request: {len(request.descriptions)} descriptions, method={method.value}, threshold={request.similarity_threshold}")

    if not request.descriptions:
        # Read the default model from the service that would have handled the
        # request, so the name reported here cannot drift from the service's.
        default_model = (
            embedding_service.default_model
            if method == DeduplicationMethod.EMBEDDING
            else llm_deduplication_service.default_model
        )
        return DeduplicationResult(
            total_input=0,
            total_groups=0,
            total_duplicates=0,
            groups=[],
            threshold_used=request.similarity_threshold,
            method_used=method,
            model_used=request.model or default_model,
        )

    try:
        if method == DeduplicationMethod.EMBEDDING:
            # Embedding similarity deduplication
            return await embedding_service.deduplicate(
                descriptions=request.descriptions,
                threshold=request.similarity_threshold,
                model=request.model,
            )
        # Pairwise LLM comparison; the threshold is not used by this method
        return await llm_deduplication_service.deduplicate(
            descriptions=request.descriptions,
            model=request.model,
        )
    except ValueError as e:
        logger.error(f"Deduplication failed: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error dropped
        logger.exception(f"Unexpected error during deduplication: {e}")
        raise HTTPException(status_code=500, detail=f"Deduplication failed: {str(e)}") from e


@router.get("/models")
async def list_embedding_models() -> dict:
    """List the embedding models suggested for deduplication.

    Returns:
        dict: The default model name, a static list of suggested models and
        a hint on how to install a model.
    """
    return {
        "default": embedding_service.default_model,
        "available": [
            {"name": "nomic-embed-text", "description": "Fast and efficient embedding model"},
            {"name": "mxbai-embed-large", "description": "High quality embeddings"},
            {"name": "all-minilm", "description": "Lightweight embedding model"},
        ],
        "note": "Run 'ollama pull <model>' to install a model",
    }
generate_expert_transformation_events( expert_name=kw.expert_name, description=desc_text )) + else: + logger.warning(f"[DESC] Empty description for keyword='{kw.keyword}', parsed_data={desc_data}") - # Send progress update - yield f"event: description_progress\ndata: {json.dumps({'current': idx + 1, 'total': len(all_expert_keywords), 'keyword': kw.keyword}, ensure_ascii=False)}\n\n" + # Send progress update with success/fail status + yield f"event: description_progress\ndata: {json.dumps({'current': idx + 1, 'total': len(all_expert_keywords), 'keyword': kw.keyword, 'success': bool(desc_text)}, ensure_ascii=False)}\n\n" except Exception as e: - logger.warning(f"Failed to generate description for '{kw.keyword}': {e}") + logger.warning(f"[DESC] Failed to generate description for '{kw.keyword}': {e}") + yield f"event: description_progress\ndata: {json.dumps({'current': idx + 1, 'total': len(all_expert_keywords), 'keyword': kw.keyword, 'success': False, 'error': str(e)}, ensure_ascii=False)}\n\n" # Continue with next keyword - yield f"event: description_complete\ndata: {json.dumps({'count': len(descriptions)}, ensure_ascii=False)}\n\n" + # 統計成功率 + success_rate = len(descriptions) / len(all_expert_keywords) * 100 if all_expert_keywords else 0 + logger.info(f"[DESC] 描述生成完成: {len(descriptions)}/{len(all_expert_keywords)} 成功 ({success_rate:.1f}%)") + + yield f"event: description_complete\ndata: {json.dumps({'count': len(descriptions), 'total': len(all_expert_keywords), 'success_rate': success_rate}, ensure_ascii=False)}\n\n" # ========== Build final result ========== result = ExpertTransformationCategoryResult( diff --git a/backend/app/services/embedding_service.py b/backend/app/services/embedding_service.py new file mode 100644 index 0000000..90908ad --- /dev/null +++ b/backend/app/services/embedding_service.py @@ -0,0 +1,250 @@ +""" +Embedding Service - generates embeddings and performs similarity-based deduplication + +使用 Ollama 的 embedding 端點生成向量,並透過餘弦相似度進行去重分組。 +""" + 
logger = logging.getLogger(__name__)


class EmbeddingService:
    """Generate embeddings through Ollama and group descriptions by cosine similarity."""

    def __init__(self):
        self.base_url = settings.ollama_base_url
        self.default_model = "nomic-embed-text"  # embedding model used when none is requested
        self.client = httpx.AsyncClient(timeout=120.0)

    async def get_embedding(self, text: str, model: Optional[str] = None) -> List[float]:
        """Return the embedding vector for a single text.

        Args:
            text: Text to embed.
            model: Embedding model name; falls back to ``self.default_model``.

        Raises:
            httpx.HTTPStatusError: When the embed endpoint answers with an
                error status. Any other failure is logged and re-raised.
        """
        model = model or self.default_model
        url = f"{self.base_url}/api/embed"

        try:
            response = await self.client.post(url, json={
                "model": model,
                "input": text
            })
            response.raise_for_status()
            result = response.json()
            return result["embeddings"][0]
        except httpx.HTTPStatusError as e:
            logger.error(f"Embedding API error: {e.response.status_code} - {e.response.text}")
            raise
        except Exception as e:
            logger.error(f"Embedding error: {e}")
            raise

    async def get_embeddings_batch(
        self,
        texts: List[str],
        model: Optional[str] = None
    ) -> List[List[float]]:
        """Return one embedding vector per text, in input order.

        A single batched request is tried first. If the endpoint answers with
        an HTTP error status, the texts are embedded one by one instead.

        Args:
            texts: Texts to embed; an empty list returns an empty list.
            model: Embedding model name; falls back to ``self.default_model``.
        """
        if not texts:
            return []

        model = model or self.default_model
        url = f"{self.base_url}/api/embed"

        try:
            # The embed endpoint accepts a list as "input"
            response = await self.client.post(url, json={
                "model": model,
                "input": texts
            })
            response.raise_for_status()
            result = response.json()
            return result["embeddings"]
        except httpx.HTTPStatusError as e:
            logger.error(f"Batch embedding API error: {e.response.status_code} - {e.response.text}")
            # Batch request rejected: retry each text individually
            logger.info("Falling back to single embedding requests...")
            return [await self.get_embedding(text, model) for text in texts]
        except Exception as e:
            logger.error(f"Batch embedding error: {e}")
            raise

    def cosine_similarity(self, a: List[float], b: List[float]) -> float:
        """Return the cosine similarity of two vectors (0.0 if either has zero norm)."""
        a_np = np.array(a)
        b_np = np.array(b)
        norm_a = np.linalg.norm(a_np)
        norm_b = np.linalg.norm(b_np)
        if norm_a == 0 or norm_b == 0:
            return 0.0
        return float(np.dot(a_np, b_np) / (norm_a * norm_b))

    def build_similarity_matrix(
        self,
        embeddings: List[List[float]]
    ) -> np.ndarray:
        """Build the symmetric pairwise cosine-similarity matrix.

        Computed with one matrix product instead of a Python loop over all
        pairs. The conventions match ``cosine_similarity``: a zero-norm vector
        has similarity 0.0 with every other vector, and the diagonal is 1.0.

        Args:
            embeddings: One vector per item; all vectors must share a length.

        Returns:
            An ``(n, n)`` float array; ``(0, 0)`` for empty input.
        """
        n = len(embeddings)
        if n == 0:
            return np.zeros((0, 0))

        vectors = np.asarray(embeddings, dtype=float)
        norms = np.linalg.norm(vectors, axis=1)
        # Divide zero-norm rows by 1 so they stay all-zero (similarity 0.0)
        # instead of producing NaN.
        safe_norms = np.where(norms == 0, 1.0, norms)
        unit = vectors / safe_norms[:, None]

        matrix = unit @ unit.T
        # Rounding can push values marginally outside [-1, 1]
        np.clip(matrix, -1.0, 1.0, out=matrix)
        np.fill_diagonal(matrix, 1.0)  # every item is identical to itself
        return matrix

    def cluster_by_similarity(
        self,
        similarity_matrix: np.ndarray,
        threshold: float
    ) -> List[List[int]]:
        """Greedily group items whose similarity to a representative is >= threshold.

        Algorithm:
        1. Take the first unassigned item as the representative of a new group.
        2. Add every later unassigned item whose similarity to it is >= threshold.
        3. Repeat until every item is assigned.

        Items are compared with the representative only, never with other
        members of the group.

        Returns:
            List[List[int]]: Index lists, one per group; the first index of
            each list is the representative.
        """
        n = len(similarity_matrix)
        assigned = [False] * n
        groups = []

        for i in range(n):
            if assigned[i]:
                continue

            # Start a new group with item i as its representative
            group = [i]
            assigned[i] = True

            for j in range(i + 1, n):
                if not assigned[j] and similarity_matrix[i][j] >= threshold:
                    group.append(j)
                    assigned[j] = True

            groups.append(group)

        return groups

    async def deduplicate(
        self,
        descriptions: "List[ExpertTransformationDescription]",
        threshold: float = 0.85,
        model: Optional[str] = None
    ) -> "DeduplicationResult":
        """Group similar descriptions by embedding similarity.

        Args:
            descriptions: Descriptions to deduplicate.
            threshold: Cosine-similarity threshold, default 0.85.
            model: Embedding model name; falls back to ``self.default_model``.

        Returns:
            DeduplicationResult: One group per cluster. The first member is
            the representative; the others are listed as duplicates together
            with their similarity to the representative.

        Raises:
            ValueError: When embeddings cannot be generated, or when the
                backend returns a different number of embeddings than texts.
        """
        model = model or self.default_model

        # Empty input: nothing to embed
        if not descriptions:
            return DeduplicationResult(
                total_input=0,
                total_groups=0,
                total_duplicates=0,
                groups=[],
                threshold_used=threshold,
                method_used=DeduplicationMethod.EMBEDDING,
                model_used=model
            )

        texts = [d.description for d in descriptions]
        logger.info(f"Generating embeddings for {len(texts)} descriptions using model '{model}'...")

        try:
            embeddings = await self.get_embeddings_batch(texts, model)
        except Exception as e:
            logger.error(f"Failed to generate embeddings: {e}")
            raise ValueError(f"Embedding generation failed: {e}. Make sure the model '{model}' is installed (run: ollama pull {model})") from e

        # A short or long reply would misalign matrix indices with descriptions
        if len(embeddings) != len(texts):
            raise ValueError(
                f"Embedding count mismatch: expected {len(texts)}, got {len(embeddings)}"
            )

        logger.info("Building similarity matrix...")
        sim_matrix = self.build_similarity_matrix(embeddings)

        logger.info(f"Clustering with threshold {threshold}...")
        clusters = self.cluster_by_similarity(sim_matrix, threshold)

        result_groups = []
        total_duplicates = 0

        for group_idx, indices in enumerate(clusters):
            # The first index is the representative; a singleton cluster
            # simply yields empty duplicate and score lists.
            rep_idx, dup_indices = indices[0], indices[1:]
            result_groups.append(DescriptionGroup(
                group_id=f"group-{group_idx}",
                representative=descriptions[rep_idx],
                duplicates=[descriptions[idx] for idx in dup_indices],
                similarity_scores=[float(sim_matrix[rep_idx][idx]) for idx in dup_indices]
            ))
            total_duplicates += len(dup_indices)

        logger.info(f"Deduplication complete: {len(descriptions)} -> {len(result_groups)} groups, {total_duplicates} duplicates found")

        return DeduplicationResult(
            total_input=len(descriptions),
            total_groups=len(result_groups),
            total_duplicates=total_duplicates,
            groups=result_groups,
            threshold_used=threshold,
            method_used=DeduplicationMethod.EMBEDDING,
            model_used=model
        )

    async def close(self):
        """Close the underlying HTTP client."""
        await self.client.aclose()
logger = logging.getLogger(__name__)


class LLMDeduplicationService:
    """Deduplicate descriptions by asking an LLM to judge each pair."""

    # LLM verdicts are binary (1.0 / 0.0), so any cut between the two works.
    _BINARY_THRESHOLD = 0.5

    def __init__(self):
        self.base_url = settings.ollama_base_url
        self.default_model = "qwen3:4b"  # model used when none is requested
        self.client = httpx.AsyncClient(timeout=60.0)
        self.max_concurrent = 5  # cap on simultaneous comparison requests

    @staticmethod
    def _is_affirmative(answer: str) -> bool:
        """Return True when the model's reply is a YES verdict.

        Tolerates a completed ``<think>...</think>`` block before the answer
        and leading quote or markdown characters around it. The prompt shows
        the expected word in quotes, so a quoted reply is plausible.
        """
        text = answer
        if "</think>" in text:
            text = text.split("</think>", 1)[1]
        text = text.strip().lstrip("\"'`*「『“").upper()
        return text.startswith("YES")

    async def compare_pair(
        self,
        desc1: str,
        desc2: str,
        model: str,
        semaphore: asyncio.Semaphore
    ) -> bool:
        """Ask the LLM whether two descriptions express the same concept.

        Args:
            desc1: First description.
            desc2: Second description.
            model: LLM model name.
            semaphore: Limits how many comparisons run at once.

        Returns:
            bool: True when the model answers YES. Any failure (request,
            HTTP status or response parsing) is logged and treated as
            "not similar".
        """
        async with semaphore:
            prompt = f"""判斷以下兩個創新描述是否表達相同或非常相似的概念:

描述1: {desc1}

描述2: {desc2}

如果兩者描述的創新概念本質相同或非常相似,回答 "YES"
如果兩者描述不同的創新概念,回答 "NO"
只回答 YES 或 NO,不要其他文字"""

            try:
                response = await self.client.post(
                    f"{self.base_url}/api/generate",
                    json={
                        "model": model,
                        "prompt": prompt,
                        "stream": False,
                        "options": {
                            "temperature": 0.1,  # low temperature for consistent verdicts
                            "num_predict": 10,  # only a short answer is needed
                        }
                    }
                )
                response.raise_for_status()
                result = response.json()["response"].strip().upper()
                is_similar = self._is_affirmative(result)
                logger.debug(f"LLM comparison: '{desc1[:30]}...' vs '{desc2[:30]}...' -> {result} ({is_similar})")
                return is_similar
            except Exception as e:
                # Best effort: one failed comparison must not abort the batch
                logger.error(f"LLM comparison failed: {e}")
                return False

    async def compare_batch(
        self,
        pairs: List[Tuple[int, int, str, str]],
        model: str
    ) -> List[Tuple[int, int, bool]]:
        """Compare many description pairs concurrently.

        Args:
            pairs: Pairs to compare, as ``(i, j, desc1, desc2)`` tuples.
            model: LLM model name.

        Returns:
            ``(i, j, is_similar)`` tuples in the same order as ``pairs``.
        """
        semaphore = asyncio.Semaphore(self.max_concurrent)

        async def compare_one(pair: Tuple[int, int, str, str]) -> Tuple[int, int, bool]:
            i, j, desc1, desc2 = pair
            is_similar = await self.compare_pair(desc1, desc2, model, semaphore)
            return (i, j, is_similar)

        # All comparisons are scheduled at once; the semaphore throttles them
        results = await asyncio.gather(*[compare_one(p) for p in pairs])
        return list(results)

    def cluster_by_similarity(
        self,
        similarity_matrix: np.ndarray,
        threshold: float
    ) -> List[List[int]]:
        """Greedily group items whose similarity to a representative is >= threshold.

        The first unassigned item becomes the representative of a new group,
        and every later unassigned item at or above the threshold joins it.
        Items are compared with the representative only.

        Returns:
            Index lists, one per group; the first index is the representative.
        """
        n = len(similarity_matrix)
        assigned = [False] * n
        groups = []

        for i in range(n):
            if assigned[i]:
                continue

            # Start a new group with item i as its representative
            group = [i]
            assigned[i] = True

            for j in range(i + 1, n):
                if not assigned[j] and similarity_matrix[i][j] >= threshold:
                    group.append(j)
                    assigned[j] = True

            groups.append(group)

        return groups

    async def deduplicate(
        self,
        descriptions: "List[ExpertTransformationDescription]",
        model: Optional[str] = None
    ) -> "DeduplicationResult":
        """Group descriptions the LLM judges to be duplicates of each other.

        Every unordered pair is compared, which is n*(n-1)/2 LLM calls.

        Args:
            descriptions: Descriptions to deduplicate.
            model: LLM model name; falls back to ``self.default_model``.

        Returns:
            DeduplicationResult: One group per cluster. Duplicates carry a
            similarity score of 1.0 because verdicts are binary.
        """
        model = model or self.default_model

        # Empty input: nothing to compare
        if not descriptions:
            return DeduplicationResult(
                total_input=0,
                total_groups=0,
                total_duplicates=0,
                groups=[],
                threshold_used=self._BINARY_THRESHOLD,
                method_used=DeduplicationMethod.LLM,
                model_used=model
            )

        n = len(descriptions)
        # Identity matrix: every item is similar to itself
        similarity_matrix = np.eye(n)

        # Every unordered pair (i < j)
        pairs = [
            (i, j, descriptions[i].description, descriptions[j].description)
            for i in range(n)
            for j in range(i + 1, n)
        ]

        total_pairs = len(pairs)
        logger.info(f"LLM deduplication: {total_pairs} pairs to compare (parallel={self.max_concurrent}, model={model})")

        results = await self.compare_batch(pairs, model)

        # Mirror each verdict so the matrix stays symmetric
        for i, j, is_similar in results:
            similarity_value = 1.0 if is_similar else 0.0
            similarity_matrix[i][j] = similarity_value
            similarity_matrix[j][i] = similarity_value

        logger.info("Clustering results...")
        clusters = self.cluster_by_similarity(similarity_matrix, self._BINARY_THRESHOLD)

        result_groups = []
        total_duplicates = 0

        for group_idx, indices in enumerate(clusters):
            # The first index is the representative; a singleton cluster
            # simply yields empty duplicate and score lists.
            rep_idx, dup_indices = indices[0], indices[1:]
            result_groups.append(DescriptionGroup(
                group_id=f"group-{group_idx}",
                representative=descriptions[rep_idx],
                duplicates=[descriptions[idx] for idx in dup_indices],
                similarity_scores=[1.0 for _ in dup_indices]
            ))
            total_duplicates += len(dup_indices)

        logger.info(f"LLM deduplication complete: {n} -> {len(result_groups)} groups, {total_duplicates} duplicates found")

        return DeduplicationResult(
            total_input=n,
            total_groups=len(result_groups),
            total_duplicates=total_duplicates,
            groups=result_groups,
            threshold_used=self._BINARY_THRESHOLD,
            method_used=DeduplicationMethod.LLM,
            model_used=model
        )

    async def close(self):
        """Close the underlying HTTP client."""
        await self.client.aclose()
diff --git a/backend/requirements.txt b/backend/requirements.txt index accbfcd..9d81be7 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -4,3 +4,4 @@ httpx>=0.26.0 pydantic>=2.5.0 pydantic-settings>=2.1.0 python-dotenv>=1.0.0 +numpy>=1.26.0 diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx index bb78088..379c940 100644 --- a/frontend/src/App.tsx +++ b/frontend/src/App.tsx @@ -1,16 +1,17 @@ import { useState, useRef, useCallback, useEffect } from 'react'; -import { ConfigProvider, Layout, theme, Typography, Space, Tabs } from 'antd'; -import { ApartmentOutlined, ThunderboltOutlined } from '@ant-design/icons'; +import { ConfigProvider, Layout, theme, Typography, Space, Tabs, Slider, Radio } from 'antd'; +import { ApartmentOutlined, ThunderboltOutlined, FilterOutlined } from '@ant-design/icons'; import { ThemeToggle } from './components/ThemeToggle'; import { InputPanel } from './components/InputPanel'; import { TransformationInputPanel } from './components/TransformationInputPanel'; import { MindmapPanel } from './components/MindmapPanel'; import { TransformationPanel } from './components/TransformationPanel'; +import { DeduplicationPanel } from './components/DeduplicationPanel'; import { useAttribute } from './hooks/useAttribute'; import { getModels } from './services/api'; import type { MindmapDAGRef } from './components/MindmapDAG'; import type { TransformationDAGRef } from './components/TransformationDAG'; -import type { CategoryMode, ExpertSource } from './types'; +import type { CategoryMode, ExpertSource, ExpertTransformationDAGResult, DeduplicationMethod } from './types'; const { Header, Sider, Content } = Layout; const { Title } = Typography; @@ -45,8 +46,14 @@ function App() { }); const [customExpertsInput, setCustomExpertsInput] = useState(''); const [expertSource, setExpertSource] = useState('llm'); + const [expertLanguage, setExpertLanguage] = useState<'en' | 'zh'>('en'); const [shouldStartTransform, setShouldStartTransform] 
= useState(false); const [transformLoading, setTransformLoading] = useState(false); + const [transformationResult, setTransformationResult] = useState(null); + + // Deduplication settings + const [deduplicationThreshold, setDeduplicationThreshold] = useState(0.85); + const [deduplicationMethod, setDeduplicationMethod] = useState('embedding'); // Available models from API const [availableModels, setAvailableModels] = useState([]); @@ -188,9 +195,32 @@ function App() { temperature={transformTemperature} expertConfig={expertConfig} expertSource={expertSource} + expertLanguage={expertLanguage} shouldStartTransform={shouldStartTransform} onTransformComplete={() => setShouldStartTransform(false)} onLoadingChange={setTransformLoading} + onResultsChange={setTransformationResult} + /> + + ), + }, + { + key: 'deduplication', + label: ( + + + Deduplication + + ), + children: ( +
+
), @@ -206,7 +236,7 @@ function App() { overflow: 'auto', }} > - {activeTab === 'attribute' ? ( + {activeTab === 'attribute' && ( - ) : ( + )} + {activeTab === 'transformation' && ( )} + {activeTab === 'deduplication' && ( +
+ + + Deduplication Settings + + + {/* Method Selection */} +
+ + Method + + setDeduplicationMethod(e.target.value)} + buttonStyle="solid" + style={{ width: '100%' }} + > + + Embedding + + + LLM Judge + + + + {deduplicationMethod === 'embedding' + ? 'Fast vector similarity comparison' + : 'Accurate but slower pairwise LLM comparison'} + +
+ + {/* Threshold Slider - Only for Embedding method */} + {deduplicationMethod === 'embedding' && ( +
+ + Similarity Threshold + + + Higher = stricter matching, fewer groups + + `${((val ?? 0) * 100).toFixed(0)}%` }} + /> + + Current: {(deduplicationThreshold * 100).toFixed(0)}% similarity required + +
+ )} + + {/* LLM Warning */} + {deduplicationMethod === 'llm' && ( + + Note: LLM method requires N*(N-1)/2 comparisons. May take longer for many descriptions. + + )} +
+ )} diff --git a/frontend/src/components/DeduplicationPanel.tsx b/frontend/src/components/DeduplicationPanel.tsx new file mode 100644 index 0000000..3397a54 --- /dev/null +++ b/frontend/src/components/DeduplicationPanel.tsx @@ -0,0 +1,271 @@ +import React, { useEffect, useMemo } from 'react'; +import { + Card, + Button, + Slider, + Statistic, + Row, + Col, + Empty, + Spin, + Alert, + Typography, + Space, + Divider, +} from 'antd'; +import { + FilterOutlined, + ReloadOutlined, + CheckCircleOutlined, + ClusterOutlined, + CopyOutlined, +} from '@ant-design/icons'; +import { useDeduplication } from '../hooks/useDeduplication'; +import { GroupCard } from './deduplication/GroupCard'; +import type { + ExpertTransformationDAGResult, + ExpertTransformationDescription, + DeduplicationMethod, +} from '../types'; + +const { Title, Text } = Typography; + +interface DeduplicationPanelProps { + transformationResult: ExpertTransformationDAGResult | null; + isDark: boolean; + threshold: number; + onThresholdChange: (value: number) => void; + method: DeduplicationMethod; + onMethodChange?: (method: DeduplicationMethod) => void; // Optional, handled in App.tsx sidebar +} + +/** + * Panel for deduplicating transformation descriptions + */ +export const DeduplicationPanel: React.FC = ({ + transformationResult, + isDark, + threshold, + onThresholdChange, + method, + // onMethodChange is handled in App.tsx sidebar +}) => { + const { loading, result, error, progress, deduplicate, clearResult } = useDeduplication(); + + // Extract all descriptions from transformation result + const allDescriptions = useMemo(() => { + if (!transformationResult) return []; + + const descriptions: ExpertTransformationDescription[] = []; + for (const categoryResult of transformationResult.results) { + descriptions.push(...categoryResult.descriptions); + } + return descriptions; + }, [transformationResult]); + + // Clear result when transformation result or method changes + useEffect(() => { + clearResult(); + 
}, [transformationResult, method, clearResult]); + + const handleDeduplicate = () => { + if (allDescriptions.length > 0) { + deduplicate(allDescriptions, threshold, method); + } + }; + + const containerStyle: React.CSSProperties = { + height: '100%', + display: 'flex', + flexDirection: 'column', + padding: 16, + overflow: 'hidden', + }; + + const headerCardStyle: React.CSSProperties = { + marginBottom: 16, + background: isDark ? '#1f1f1f' : '#fff', + borderRadius: 8, + }; + + const resultsContainerStyle: React.CSSProperties = { + flex: 1, + overflow: 'auto', + paddingRight: 8, + }; + + // No transformation data + if (!transformationResult) { + return ( +
+ + + No transformation data available + + + Please run the Transformation Agent first + + + } + /> +
+ ); + } + + // No descriptions found + if (allDescriptions.length === 0) { + return ( +
+ +
+ ); + } + + return ( +
+ {/* Header Card with Controls */} + + + + } + /> + + + } + valueStyle={{ color: result ? '#52c41a' : undefined }} + /> + + + } + valueStyle={{ color: result?.total_duplicates ? '#fa8c16' : undefined }} + /> + + + + + Similarity Threshold: {(threshold * 100).toFixed(0)}% + + `${((val ?? 0) * 100).toFixed(0)}%` }} + /> + + + + + + + + + + {progress.message || 'Ready to analyze'} + + + + + {result && ( + + )} + + + + + + + {/* Error Alert */} + {error && ( + + )} + + {/* Loading State */} + {loading && ( +
+ +
+ {progress.message} +
+
+ )} + + {/* Results */} + {!loading && result && ( +
+ + <ClusterOutlined style={{ marginRight: 8 }} /> + {result.total_groups} Groups + {result.total_duplicates > 0 && ( + <Text type="secondary" style={{ fontSize: 14, fontWeight: 'normal', marginLeft: 8 }}> + ({result.total_duplicates} duplicates removed) + </Text> + )} + + + {result.groups.map((group, index) => ( + + ))} + + {result.total_groups === 0 && ( + + )} +
+ )} + + {/* Initial State - show prompt */} + {!loading && !result && !error && ( +
+ + + Ready to Deduplicate + + + Click the "Deduplicate" button to analyze {allDescriptions.length} descriptions + and group similar ones together. + +
+ )} +
+ ); +}; + +export default DeduplicationPanel; diff --git a/frontend/src/components/TransformationInputPanel.tsx b/frontend/src/components/TransformationInputPanel.tsx index 4ffbc11..e7333b3 100644 --- a/frontend/src/components/TransformationInputPanel.tsx +++ b/frontend/src/components/TransformationInputPanel.tsx @@ -12,6 +12,11 @@ const EXPERT_SOURCE_OPTIONS = [ { label: 'Wikidata', value: 'wikidata' as ExpertSource, description: '從 Wikidata 查詢職業 (需等待 API)' }, ]; +const EXPERT_LANGUAGE_OPTIONS = [ + { label: 'English', value: 'en' as const }, + { label: '中文', value: 'zh' as const }, +]; + interface TransformationInputPanelProps { onTransform: () => void; loading: boolean; @@ -26,6 +31,7 @@ interface TransformationInputPanelProps { }; customExpertsInput: string; expertSource: ExpertSource; + expertLanguage: 'en' | 'zh'; onModelChange: (model: string) => void; onTemperatureChange: (temperature: number) => void; onExpertConfigChange: (config: { @@ -35,6 +41,7 @@ interface TransformationInputPanelProps { }) => void; onCustomExpertsInputChange: (value: string) => void; onExpertSourceChange: (source: ExpertSource) => void; + onExpertLanguageChange: (language: 'en' | 'zh') => void; availableModels: string[]; } @@ -48,11 +55,13 @@ export const TransformationInputPanel: React.FC = expertConfig, customExpertsInput, expertSource, + expertLanguage, onModelChange, onTemperatureChange, onExpertConfigChange, onCustomExpertsInputChange, onExpertSourceChange, + onExpertLanguageChange, availableModels, }) => { return ( @@ -142,6 +151,19 @@ export const TransformationInputPanel: React.FC = {EXPERT_SOURCE_OPTIONS.find((opt) => opt.value === expertSource)?.description} + + {/* Language selector - only for curated source */} + {expertSource === 'curated' && ( +
+ 職業名稱語言 +