feat: Add Deduplication Agent with embedding and LLM methods

Implement a new Deduplication Agent that identifies and groups similar transformation descriptions.

Supports two deduplication methods:
- Embedding: Fast vector similarity comparison using cosine similarity
- LLM: Accurate pairwise semantic comparison (slower but more precise)

Backend changes:
- Add deduplication router with /deduplicate endpoint
- Add embedding_service for vector-based similarity
- Add llm_deduplication_service for LLM-based comparison
- Improve expert_transformation error handling and progress reporting

Frontend changes:
- Add DeduplicationPanel with interactive group visualization
- Add useDeduplication hook for state management
- Integrate deduplication tab in main App
- Add threshold slider and method selector in sidebar

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-22 20:26:17 +08:00
parent 5571076406
commit bc281b8e0a
18 changed files with 1397 additions and 25 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -36,3 +36,6 @@ env/
 .DS_Store
 .idea/
 .vscode/
 # Serena (MCP tools)
 .serena/
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -3,14 +3,18 @@ from contextlib import asynccontextmanager
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
-from .routers import attributes, transformation, expert_transformation
+from .routers import attributes, transformation, expert_transformation, deduplication
 from .services.llm_service import ollama_provider
 from .services.embedding_service import embedding_service
 from .services.llm_deduplication_service import llm_deduplication_service
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Lifespan context manager that releases the shared service clients.

    Control is handed to the application at ``yield``; once the context is
    left, the ``close()`` coroutine of every shared service is awaited.

    The clean-up runs in a ``finally`` block so it also happens when an
    exception is thrown into the generator, and each ``close()`` is guarded
    individually so one failing service cannot prevent the others from being
    closed.

    Args:
        app: The FastAPI application this lifespan is attached to (unused).
    """
    import logging

    try:
        yield
    finally:
        for service in (ollama_provider, embedding_service, llm_deduplication_service):
            try:
                await service.close()
            except Exception:
                # Best-effort shutdown: record the failure and keep closing
                # the remaining services instead of leaking their clients.
                logging.getLogger(__name__).exception(
                    "Failed to close %s during shutdown", type(service).__name__
                )
 app = FastAPI(
@@ -31,6 +35,7 @@ app.add_middleware(
 app.include_router(attributes.router)
 app.include_router(transformation.router)
 app.include_router(expert_transformation.router)
 app.include_router(deduplication.router)
@app.get("/")
--- a/backend/app/models/schemas.py
+++ b/backend/app/models/schemas.py
@@ -232,3 +232,38 @@ class ExpertTransformationRequest(BaseModel):
    # LLM parameters
    model: Optional[str] = None
    temperature: Optional[float] = 0.7
 # ===== Deduplication Agent schemas =====
 class DeduplicationMethod(str, Enum):
    """Deduplication method selectable by the client."""
    EMBEDDING = "embedding"  # vector (embedding) similarity
    LLM = "llm"              # pairwise LLM judgement
 class DeduplicationRequest(BaseModel):
    """Deduplication request payload."""
    descriptions: List[ExpertTransformationDescription]
    method: DeduplicationMethod = DeduplicationMethod.EMBEDDING  # deduplication method
    similarity_threshold: float = 0.85  # cosine-similarity threshold, intended range 0.0-1.0 (not validated here); used by the embedding method only
    model: Optional[str] = None  # embedding / LLM model name
 class DescriptionGroup(BaseModel):
    """A group of similar descriptions."""
    group_id: str                          # "group-0", "group-1"...
    representative: ExpertTransformationDescription  # description chosen to represent the group
    duplicates: List[ExpertTransformationDescription]  # descriptions judged similar to the representative
    similarity_scores: List[float]         # similarity score of each duplicate
 class DeduplicationResult(BaseModel):
    """Deduplication result."""
    total_input: int           # number of input descriptions
    total_groups: int          # number of groups
    total_duplicates: int      # number of duplicate items
    groups: List[DescriptionGroup]
    threshold_used: float
    method_used: DeduplicationMethod       # deduplication method that was used
    model_used: str                        # model that was used
--- a/backend/app/prompts/expert_transformation_prompt.py
+++ b/backend/app/prompts/expert_transformation_prompt.py
@@ -90,16 +90,15 @@ def get_single_description_prompt(
 ) -> str:
    """Step 2: 為單一關鍵字生成描述"""
    # 如果 domain 是通用的，就只用職業名稱
-    domain_text = f"（{expert_domain}）" if expert_domain and expert_domain != "Professional Field" else ""
+    domain_text = f"（{expert_domain}領域）" if expert_domain and expert_domain != "Professional Field" else ""
    return f"""/no_think
-物件：「{query}」
+你是一位{expert_name}{domain_text}。
-專家：{expert_name}{domain_text}
+
 任務：為「{query}」生成一段創新應用描述。
 關鍵字：{keyword}
-你是一位{expert_name}。從你的專業視角，生成一段創新應用描述（15-30字），說明如何將「{keyword}」的概念應用到「{query}」上。
+從你的專業視角，說明如何將「{keyword}」的概念應用到「{query}」上。描述要具體、有創意，15-30字。
-描述要體現{expert_name}的專業思維和獨特觀點。
+只回傳 JSON，不要其他文字：
-
+{{"description": "你的創新應用描述"}}"""
 回傳 JSON：
 {{"description": "應用描述"}}"""
--- a/backend/app/routers/deduplication.py
+++ b/backend/app/routers/deduplication.py
@@ -0,0 +1,93 @@
 """
 Deduplication Router - 使用 Embedding 或 LLM 去重描述
 提供 API 端點將相似的創新描述分組，幫助識別重複的想法。
 支援兩種方法：
 - Embedding: 快速向量相似度比較
 - LLM: 精準語意判斷（較慢但更準確）
 """
 import logging
 from fastapi import APIRouter, HTTPException
 from ..models.schemas import DeduplicationRequest, DeduplicationResult, DeduplicationMethod
 from ..services.embedding_service import embedding_service
 from ..services.llm_deduplication_service import llm_deduplication_service
 logger = logging.getLogger(__name__)
 router = APIRouter(prefix="/api/deduplication", tags=["deduplication"])
@router.post("/deduplicate", response_model=DeduplicationResult)
async def deduplicate_descriptions(request: DeduplicationRequest) -> DeduplicationResult:
    """
    Group similar descriptions so duplicated ideas can be identified.

    Two methods are supported:
    - embedding: vector similarity (fast)
    - llm: pairwise LLM comparison (more precise but slower)

    Args:
        request: Deduplication request holding the descriptions, the chosen
            method and related parameters. ``similarity_threshold`` is only
            forwarded to the embedding method.

    Returns:
        DeduplicationResult: The grouping result. An empty ``descriptions``
        list yields an empty result without calling any service.

    Raises:
        HTTPException: Status 500 if deduplication fails.
    """
    method = request.method
    logger.info(f"Deduplication request: {len(request.descriptions)} descriptions, method={method.value}, threshold={request.similarity_threshold}")

    if not request.descriptions:
        # Take the default model name from the service that would have handled
        # the request, so it cannot drift from the services' own defaults.
        if method == DeduplicationMethod.EMBEDDING:
            fallback_model = embedding_service.default_model
        else:
            fallback_model = llm_deduplication_service.default_model
        return DeduplicationResult(
            total_input=0,
            total_groups=0,
            total_duplicates=0,
            groups=[],
            threshold_used=request.similarity_threshold,
            method_used=method,
            model_used=request.model or fallback_model
        )

    try:
        if method == DeduplicationMethod.EMBEDDING:
            # Deduplicate via embedding similarity.
            return await embedding_service.deduplicate(
                descriptions=request.descriptions,
                threshold=request.similarity_threshold,
                model=request.model
            )
        # Deduplicate via pairwise LLM comparison.
        return await llm_deduplication_service.deduplicate(
            descriptions=request.descriptions,
            model=request.model
        )
    except ValueError as e:
        logger.error(f"Deduplication failed: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
    except Exception as e:
        # logger.exception keeps the traceback of the unexpected failure.
        logger.exception(f"Unexpected error during deduplication: {e}")
        raise HTTPException(status_code=500, detail=f"Deduplication failed: {str(e)}") from e
@router.get("/models")
async def list_embedding_models():
    """
    List the embedding models offered for deduplication.

    Returns:
        dict: The suggested default model, a static list of model
        name/description entries and an installation hint.
    """
    default_name = "nomic-embed-text"
    catalogue = (
        (default_name, "Fast and efficient embedding model"),
        ("mxbai-embed-large", "High quality embeddings"),
        ("all-minilm", "Lightweight embedding model"),
    )
    return {
        "default": default_name,
        "available": [
            {"name": name, "description": summary}
            for name, summary in catalogue
        ],
        "note": "Run 'ollama pull <model>' to install a model",
    }
--- a/backend/app/routers/expert_transformation.py
+++ b/backend/app/routers/expert_transformation.py
@@ -221,8 +221,27 @@ async def generate_expert_transformation_events(
                    desc_prompt, model=model, temperature=temperature
                )
-                desc_data = extract_json_from_response(desc_response)
+                # 嘗試解析 JSON，若失敗則使用原始回應作為描述
-                desc_text = desc_data.get("description", "")
+                desc_text = ""
                try:
                    desc_data = extract_json_from_response(desc_response)
                    # 支援多種可能的 key: description, content, text, desc
                    desc_text = (
                        desc_data.get("description") or
                        desc_data.get("content") or
                        desc_data.get("text") or
                        desc_data.get("desc") or
                        ""
                    )
                except ValueError:
                    # JSON 解析失敗，嘗試清理原始回應作為描述
                    cleaned = desc_response.strip()
                    # 移除可能的 markdown 和多餘符號
                    if cleaned.startswith('"') and cleaned.endswith('"'):
                        cleaned = cleaned[1:-1]
                    if len(cleaned) > 5 and len(cleaned) < 100:
                        desc_text = cleaned
                        logger.info(f"[DESC] 使用 fallback 描述 for '{kw.keyword}': {desc_text[:50]}")
                if desc_text:
                    descriptions.append(ExpertTransformationDescription(
@@ -231,15 +250,22 @@ async def generate_expert_transformation_events(
                        expert_name=kw.expert_name,
                        description=desc_text
                    ))
                else:
                    logger.warning(f"[DESC] Empty description for keyword='{kw.keyword}', parsed_data={desc_data}")
-                # Send progress update
+                # Send progress update with success/fail status
-                yield f"event: description_progress\ndata: {json.dumps({'current': idx + 1, 'total': len(all_expert_keywords), 'keyword': kw.keyword}, ensure_ascii=False)}\n\n"
+                yield f"event: description_progress\ndata: {json.dumps({'current': idx + 1, 'total': len(all_expert_keywords), 'keyword': kw.keyword, 'success': bool(desc_text)}, ensure_ascii=False)}\n\n"
            except Exception as e:
-                logger.warning(f"Failed to generate description for '{kw.keyword}': {e}")
+                logger.warning(f"[DESC] Failed to generate description for '{kw.keyword}': {e}")
                yield f"event: description_progress\ndata: {json.dumps({'current': idx + 1, 'total': len(all_expert_keywords), 'keyword': kw.keyword, 'success': False, 'error': str(e)}, ensure_ascii=False)}\n\n"
                # Continue with next keyword
-        yield f"event: description_complete\ndata: {json.dumps({'count': len(descriptions)}, ensure_ascii=False)}\n\n"
+        # 統計成功率
        success_rate = len(descriptions) / len(all_expert_keywords) * 100 if all_expert_keywords else 0
        logger.info(f"[DESC] 描述生成完成: {len(descriptions)}/{len(all_expert_keywords)} 成功 ({success_rate:.1f}%)")
        yield f"event: description_complete\ndata: {json.dumps({'count': len(descriptions), 'total': len(all_expert_keywords), 'success_rate': success_rate}, ensure_ascii=False)}\n\n"
        # ========== Build final result ==========
        result = ExpertTransformationCategoryResult(
--- a/backend/app/services/embedding_service.py
+++ b/backend/app/services/embedding_service.py
@@ -0,0 +1,250 @@
 """
 Embedding Service - generates embeddings and performs similarity-based deduplication
 使用 Ollama 的 embedding 端點生成向量，並透過餘弦相似度進行去重分組。
 """
 import logging
 from typing import List, Optional
 import httpx
 import numpy as np
 from ..config import settings
 from ..models.schemas import (
    ExpertTransformationDescription,
    DeduplicationResult,
    DeduplicationMethod,
    DescriptionGroup,
 )
 logger = logging.getLogger(__name__)
 class EmbeddingService:
    """Embedding 服務：生成向量並執行相似度去重"""
    def __init__(self):
        # Base URL of the Ollama server, read from the application settings.
        self.base_url = settings.ollama_base_url
        self.default_model = "nomic-embed-text"  # embedding model used when the caller does not pass one
        # Single async HTTP client reused for every request; released by close().
        self.client = httpx.AsyncClient(timeout=120.0)
    async def get_embedding(self, text: str, model: Optional[str] = None) -> List[float]:
        """Return the embedding vector for a single text.

        Posts the text to ``{base_url}/api/embed`` and returns the first entry
        of the ``embeddings`` field of the JSON reply. Any failure is logged
        and re-raised unchanged.
        """
        payload = {
            "model": model or self.default_model,
            "input": text
        }
        endpoint = f"{self.base_url}/api/embed"

        try:
            reply = await self.client.post(endpoint, json=payload)
            reply.raise_for_status()
            return reply.json()["embeddings"][0]
        except httpx.HTTPStatusError as http_err:
            logger.error(f"Embedding API error: {http_err.response.status_code} - {http_err.response.text}")
            raise
        except Exception as err:
            logger.error(f"Embedding error: {err}")
            raise
    async def get_embeddings_batch(
        self,
        texts: List[str],
        model: Optional[str] = None
    ) -> List[List[float]]:
        """Return the embedding vectors for several texts.

        All texts are sent in one request to ``{base_url}/api/embed``. If that
        request fails with an HTTP status error, the texts are embedded one at
        a time through ``get_embedding`` instead. Any other failure is logged
        and re-raised. An empty input returns an empty list without any
        request being made.
        """
        if not texts:
            return []

        chosen_model = model or self.default_model
        endpoint = f"{self.base_url}/api/embed"

        try:
            # One request carrying the whole list of inputs.
            reply = await self.client.post(
                endpoint,
                json={"model": chosen_model, "input": texts},
            )
            reply.raise_for_status()
            return reply.json()["embeddings"]
        except httpx.HTTPStatusError as http_err:
            logger.error(f"Batch embedding API error: {http_err.response.status_code} - {http_err.response.text}")
            # The batch call was rejected: retry sequentially, one text per request.
            logger.info("Falling back to single embedding requests...")
            return [await self.get_embedding(item, chosen_model) for item in texts]
        except Exception as err:
            logger.error(f"Batch embedding error: {err}")
            raise
    def cosine_similarity(self, a: List[float], b: List[float]) -> float:
        """Return the cosine similarity of two vectors.

        If either vector has zero length (norm of 0) the similarity is
        reported as 0.0 instead of dividing by zero.
        """
        vec_a, vec_b = np.array(a), np.array(b)
        lengths = (np.linalg.norm(vec_a), np.linalg.norm(vec_b))
        if not all(lengths):
            # At least one norm is exactly zero.
            return 0.0
        return float(np.dot(vec_a, vec_b) / (lengths[0] * lengths[1]))
    def build_similarity_matrix(
        self,
        embeddings: List[List[float]]
    ) -> np.ndarray:
        """Build the pairwise cosine-similarity matrix for the embeddings.

        The vectors are normalised once and the whole matrix is obtained from
        a single matrix product, rather than converting and re-norming each
        vector for every pair in a Python loop.

        Args:
            embeddings: One vector per item; all vectors must have the same length.

        Returns:
            A symmetric ``(n, n)`` float array. The diagonal is 1.0, and any
            pair involving a zero-length vector has similarity 0.0.
        """
        n = len(embeddings)
        if n == 0:
            return np.zeros((0, 0))

        vectors = np.asarray(embeddings, dtype=float)
        norms = np.linalg.norm(vectors, axis=1)
        # Divide zero vectors by 1 instead of 0: their rows stay all-zero, so
        # every similarity involving them comes out as 0.0.
        safe_norms = np.where(norms == 0, 1.0, norms)
        unit_vectors = vectors / safe_norms[:, np.newaxis]

        matrix = unit_vectors @ unit_vectors.T
        # Guard against rounding pushing values marginally outside [-1, 1].
        np.clip(matrix, -1.0, 1.0, out=matrix)
        np.fill_diagonal(matrix, 1.0)  # an item is always identical to itself
        return matrix
    def cluster_by_similarity(
        self,
        similarity_matrix: np.ndarray,
        threshold: float
    ) -> List[List[int]]:
        """
        Greedy clustering: group items whose similarity is >= threshold.

        Items are visited in index order. Each item that has not been claimed
        yet opens a new group as its representative and claims every later,
        still-unclaimed item whose similarity to that representative reaches
        the threshold.

        Returns:
            List[List[int]]: One list of item indices per group, with the
            representative first.
        """
        size = len(similarity_matrix)
        claimed = set()
        clusters = []

        for seed in range(size):
            if seed in claimed:
                continue
            claimed.add(seed)

            # Later items still free and close enough to the representative.
            followers = [
                other
                for other in range(seed + 1, size)
                if other not in claimed and similarity_matrix[seed][other] >= threshold
            ]
            claimed.update(followers)
            clusters.append([seed, *followers])

        return clusters
    async def deduplicate(
        self,
        descriptions: List[ExpertTransformationDescription],
        threshold: float = 0.85,
        model: Optional[str] = None
    ) -> DeduplicationResult:
        """
        Deduplicate descriptions by embedding similarity.

        The description texts are embedded in one batch, a pairwise
        cosine-similarity matrix is built, and the items are greedily
        clustered. The first item of each cluster is its representative; the
        rest are reported as duplicates together with their similarity to
        that representative.

        Args:
            descriptions: Descriptions to deduplicate.
            threshold: Similarity threshold for joining a group, default 0.85.
            model: Embedding model name; falls back to ``self.default_model``.

        Returns:
            DeduplicationResult: Grouping result. Empty input yields an empty
            result without any embedding request.

        Raises:
            ValueError: If embedding generation fails, or if the number of
                embeddings returned does not match the number of descriptions.
        """
        model = model or self.default_model

        # Empty input: nothing to embed.
        if not descriptions:
            return DeduplicationResult(
                total_input=0,
                total_groups=0,
                total_duplicates=0,
                groups=[],
                threshold_used=threshold,
                method_used=DeduplicationMethod.EMBEDDING,
                model_used=model
            )

        texts = [d.description for d in descriptions]
        logger.info(f"Generating embeddings for {len(texts)} descriptions using model '{model}'...")

        try:
            embeddings = await self.get_embeddings_batch(texts, model)
        except Exception as e:
            logger.error(f"Failed to generate embeddings: {e}")
            # Chain the original error so the root cause stays in the traceback.
            raise ValueError(f"Embedding generation failed: {e}. Make sure the model '{model}' is installed (run: ollama pull {model})") from e

        # Matrix indices are used to index `descriptions` below, so a count
        # mismatch would silently drop items or raise IndexError.
        if len(embeddings) != len(texts):
            raise ValueError(
                f"Embedding generation returned {len(embeddings)} vectors for {len(texts)} descriptions"
            )

        logger.info("Building similarity matrix...")
        sim_matrix = self.build_similarity_matrix(embeddings)

        logger.info(f"Clustering with threshold {threshold}...")
        clusters = self.cluster_by_similarity(sim_matrix, threshold)

        result_groups = []
        total_duplicates = 0

        for group_idx, indices in enumerate(clusters):
            # The first index is the representative; a singleton cluster has
            # no duplicates, so both lists below come out empty.
            rep_idx, *dup_indices = indices
            result_groups.append(DescriptionGroup(
                group_id=f"group-{group_idx}",
                representative=descriptions[rep_idx],
                duplicates=[descriptions[idx] for idx in dup_indices],
                similarity_scores=[float(sim_matrix[rep_idx][idx]) for idx in dup_indices]
            ))
            total_duplicates += len(dup_indices)

        logger.info(f"Deduplication complete: {len(descriptions)} -> {len(result_groups)} groups, {total_duplicates} duplicates found")

        return DeduplicationResult(
            total_input=len(descriptions),
            total_groups=len(result_groups),
            total_duplicates=total_duplicates,
            groups=result_groups,
            threshold_used=threshold,
            method_used=DeduplicationMethod.EMBEDDING,
            model_used=model
        )
    async def close(self):
        """Close the underlying HTTP client."""
        await self.client.aclose()
 # Module-level shared instance
 embedding_service = EmbeddingService()
--- a/backend/app/services/llm_deduplication_service.py
+++ b/backend/app/services/llm_deduplication_service.py
@@ -0,0 +1,252 @@
 """
 LLM Deduplication Service - 使用 LLM 成對比較進行去重
 讓 LLM 判斷兩個描述是否語意重複，透過並行處理加速。
 """
 import asyncio
 import logging
 from typing import List, Tuple, Optional
 import httpx
 import numpy as np
 from ..config import settings
 from ..models.schemas import (
    ExpertTransformationDescription,
    DeduplicationResult,
    DeduplicationMethod,
    DescriptionGroup,
 )
 logger = logging.getLogger(__name__)
 class LLMDeduplicationService:
    """LLM 去重服務：使用 LLM 成對比較判斷語意相似度"""
    def __init__(self):
        # Base URL of the Ollama server, read from the application settings.
        self.base_url = settings.ollama_base_url
        self.default_model = "qwen3:4b"  # fast model, suited to simple yes/no judgements
        # Single async HTTP client reused for every comparison; released by close().
        self.client = httpx.AsyncClient(timeout=60.0)
        self.max_concurrent = 5  # maximum number of concurrent comparisons, to avoid overloading Ollama
    async def compare_pair(
        self,
        desc1: str,
        desc2: str,
        model: str,
        semaphore: asyncio.Semaphore
    ) -> bool:
        """
        Ask the LLM whether two descriptions are semantically duplicated.

        Sends a non-streaming request to ``{base_url}/api/generate`` and
        treats the pair as duplicates when the stripped, upper-cased reply
        starts with "YES".

        Args:
            desc1: First description.
            desc2: Second description.
            model: LLM model name.
            semaphore: Semaphore limiting how many comparisons run at once.

        Returns:
            bool: True if the reply starts with "YES". False otherwise,
            including when the request or reply parsing raises (the error is
            logged, not propagated).
        """
        async with semaphore:  # cap the number of concurrent requests
            # NOTE(review): the prompt text is runtime data; its line-leading
            # whitespace is part of what is sent to the model.
            prompt = f"""判斷以下兩個創新描述是否表達相同或非常相似的概念：
 描述1: {desc1}
 描述2: {desc2}
 如果兩者描述的創新概念本質相同或非常相似，回答 "YES"
 如果兩者描述不同的創新概念，回答 "NO"
 只回答 YES 或 NO，不要其他文字"""
            try:
                response = await self.client.post(
                    f"{self.base_url}/api/generate",
                    json={
                        "model": model,
                        "prompt": prompt,
                        "stream": False,
                        "options": {
                            "temperature": 0.1,  # low temperature for consistent judgements
                            "num_predict": 10,   # only a short answer is needed
                        }
                    }
                )
                response.raise_for_status()
                result = response.json()["response"].strip().upper()
                is_similar = result.startswith("YES")
                logger.debug(f"LLM comparison: '{desc1[:30]}...' vs '{desc2[:30]}...' -> {result} ({is_similar})")
                return is_similar
            except Exception as e:
                logger.error(f"LLM comparison failed: {e}")
                return False  # on failure, assume the pair is not similar
    async def compare_batch(
        self,
        pairs: List[Tuple[int, int, str, str]],
        model: str
    ) -> List[Tuple[int, int, bool]]:
        """
        並行批次比較多個描述對
        Args:
            pairs: 待比較的配對列表 [(i, j, desc1, desc2), ...]
            model: LLM 模型名稱
        Returns:
            比較結果列表 [(i, j, is_similar), ...]
        """
        semaphore = asyncio.Semaphore(self.max_concurrent)
        async def compare_one(pair: Tuple[int, int, str, str]) -> Tuple[int, int, bool]:
            i, j, desc1, desc2 = pair
            is_similar = await self.compare_pair(desc1, desc2, model, semaphore)
            return (i, j, is_similar)
        # 使用 asyncio.gather 並行執行所有比較
        results = await asyncio.gather(*[compare_one(p) for p in pairs])
        return results
    def cluster_by_similarity(
        self,
        similarity_matrix: np.ndarray,
        threshold: float
    ) -> List[List[int]]:
        """
        Greedy clustering: group items whose similarity is >= threshold.

        Walks the items in index order; every item not yet placed starts a
        group as its representative and pulls in each later unplaced item
        whose similarity to the representative reaches the threshold.
        """
        count = len(similarity_matrix)
        placed = [False] * count
        result = []

        for leader in range(count):
            if placed[leader]:
                continue
            placed[leader] = True

            # Later items that are still free and similar enough to the leader.
            members = [
                idx
                for idx in range(leader + 1, count)
                if not placed[idx] and similarity_matrix[leader][idx] >= threshold
            ]
            for idx in members:
                placed[idx] = True

            result.append([leader] + members)

        return result
    async def deduplicate(
        self,
        descriptions: List[ExpertTransformationDescription],
        model: Optional[str] = None
    ) -> DeduplicationResult:
        """
        Deduplicate descriptions through pairwise LLM comparison.

        Every unordered pair of descriptions is judged by the LLM; the yes/no
        answers fill a 0/1 similarity matrix that is then greedily clustered
        with a fixed threshold of 0.5.

        Args:
            descriptions: Descriptions to deduplicate.
            model: LLM model name; falls back to ``self.default_model``.

        Returns:
            DeduplicationResult: Grouping result; every duplicate carries a
            similarity score of 1.0.
        """
        model = model or self.default_model

        # Nothing to compare.
        if not descriptions:
            return DeduplicationResult(
                total_input=0,
                total_groups=0,
                total_duplicates=0,
                groups=[],
                threshold_used=0.5,  # the LLM method always uses a 0.5 threshold
                method_used=DeduplicationMethod.LLM,
                model_used=model
            )

        n = len(descriptions)
        # Identity matrix: each item is similar to itself, all other entries start at 0.
        similarity_matrix = np.eye(n)

        # Every unordered pair (first < second) together with both texts.
        pairs = [
            (first, second, descriptions[first].description, descriptions[second].description)
            for first in range(n)
            for second in range(first + 1, n)
        ]
        logger.info(f"LLM deduplication: {len(pairs)} pairs to compare (parallel={self.max_concurrent}, model={model})")

        outcomes = await self.compare_batch(pairs, model)

        # Mirror each yes/no verdict into both halves of the matrix.
        for first, second, is_similar in outcomes:
            score = 1.0 if is_similar else 0.0
            similarity_matrix[first][second] = score
            similarity_matrix[second][first] = score

        # Threshold 0.5 separates the only two possible values, 0 and 1.
        logger.info("Clustering results...")
        clusters = self.cluster_by_similarity(similarity_matrix, 0.5)

        result_groups = []
        total_duplicates = 0
        for group_idx, indices in enumerate(clusters):
            # First index is the representative; a singleton cluster simply
            # yields empty duplicate and score lists.
            rep_idx, dup_indices = indices[0], indices[1:]
            result_groups.append(DescriptionGroup(
                group_id=f"group-{group_idx}",
                representative=descriptions[rep_idx],
                duplicates=[descriptions[idx] for idx in dup_indices],
                # Yes/no judgement, so every duplicate scores exactly 1.0.
                similarity_scores=[1.0] * len(dup_indices)
            ))
            total_duplicates += len(dup_indices)

        logger.info(f"LLM deduplication complete: {n} -> {len(result_groups)} groups, {total_duplicates} duplicates found")

        return DeduplicationResult(
            total_input=n,
            total_groups=len(result_groups),
            total_duplicates=total_duplicates,
            groups=result_groups,
            threshold_used=0.5,  # the LLM method always uses a 0.5 threshold
            method_used=DeduplicationMethod.LLM,
            model_used=model
        )
    async def close(self):
        """Close the underlying HTTP client."""
        await self.client.aclose()
 # Module-level shared instance
 llm_deduplication_service = LLMDeduplicationService()
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -4,3 +4,4 @@ httpx>=0.26.0
 pydantic>=2.5.0
 pydantic-settings>=2.1.0
 python-dotenv>=1.0.0
 numpy>=1.26.0
--- a/frontend/src/App.tsx
+++ b/frontend/src/App.tsx
@@ -1,16 +1,17 @@
 import { useState, useRef, useCallback, useEffect } from 'react';
-import { ConfigProvider, Layout, theme, Typography, Space, Tabs } from 'antd';
+import { ConfigProvider, Layout, theme, Typography, Space, Tabs, Slider, Radio } from 'antd';
-import { ApartmentOutlined, ThunderboltOutlined } from '@ant-design/icons';
+import { ApartmentOutlined, ThunderboltOutlined, FilterOutlined } from '@ant-design/icons';
 import { ThemeToggle } from './components/ThemeToggle';
 import { InputPanel } from './components/InputPanel';
 import { TransformationInputPanel } from './components/TransformationInputPanel';
 import { MindmapPanel } from './components/MindmapPanel';
 import { TransformationPanel } from './components/TransformationPanel';
 import { DeduplicationPanel } from './components/DeduplicationPanel';
 import { useAttribute } from './hooks/useAttribute';
 import { getModels } from './services/api';
 import type { MindmapDAGRef } from './components/MindmapDAG';
 import type { TransformationDAGRef } from './components/TransformationDAG';
-import type { CategoryMode, ExpertSource } from './types';
+import type { CategoryMode, ExpertSource, ExpertTransformationDAGResult, DeduplicationMethod } from './types';
 const { Header, Sider, Content } = Layout;
 const { Title } = Typography;
@@ -45,8 +46,14 @@ function App() {
  });
  const [customExpertsInput, setCustomExpertsInput] = useState('');
  const [expertSource, setExpertSource] = useState<ExpertSource>('llm');
  const [expertLanguage, setExpertLanguage] = useState<'en' | 'zh'>('en');
  const [shouldStartTransform, setShouldStartTransform] = useState(false);
  const [transformLoading, setTransformLoading] = useState(false);
  const [transformationResult, setTransformationResult] = useState<ExpertTransformationDAGResult | null>(null);
  // Deduplication settings
  const [deduplicationThreshold, setDeduplicationThreshold] = useState(0.85);
  const [deduplicationMethod, setDeduplicationMethod] = useState<DeduplicationMethod>('embedding');
  // Available models from API
  const [availableModels, setAvailableModels] = useState<string[]>([]);
@@ -188,9 +195,32 @@ function App() {
                        temperature={transformTemperature}
                        expertConfig={expertConfig}
                        expertSource={expertSource}
                        expertLanguage={expertLanguage}
                        shouldStartTransform={shouldStartTransform}
                        onTransformComplete={() => setShouldStartTransform(false)}
                        onLoadingChange={setTransformLoading}
                        onResultsChange={setTransformationResult}
                      />
                    </div>
                  ),
                },
                {
                  key: 'deduplication',
                  label: (
                    <span>
                      <FilterOutlined style={{ marginRight: 8 }} />
                      Deduplication
                    </span>
                  ),
                  children: (
                    <div style={{ height: 'calc(100vh - 140px)' }}>
                      <DeduplicationPanel
                        transformationResult={transformationResult}
                        isDark={isDark}
                        threshold={deduplicationThreshold}
                        onThresholdChange={setDeduplicationThreshold}
                        method={deduplicationMethod}
                        onMethodChange={setDeduplicationMethod}
                      />
                    </div>
                  ),
@@ -206,7 +236,7 @@ function App() {
              overflow: 'auto',
            }}
          >
-            {activeTab === 'attribute' ? (
+            {activeTab === 'attribute' && (
              <InputPanel
                loading={loading}
                progress={progress}
@@ -218,7 +248,8 @@ function App() {
                visualSettings={visualSettings}
                onVisualSettingsChange={setVisualSettings}
              />
-            ) : (
+            )}
            {activeTab === 'transformation' && (
              <TransformationInputPanel
                onTransform={handleTransform}
                loading={transformLoading}
@@ -229,14 +260,85 @@ function App() {
                expertConfig={expertConfig}
                customExpertsInput={customExpertsInput}
                expertSource={expertSource}
                expertLanguage={expertLanguage}
                onModelChange={setTransformModel}
                onTemperatureChange={setTransformTemperature}
                onExpertConfigChange={setExpertConfig}
                onCustomExpertsInputChange={setCustomExpertsInput}
                onExpertSourceChange={setExpertSource}
                onExpertLanguageChange={setExpertLanguage}
                availableModels={availableModels}
              />
            )}
            {activeTab === 'deduplication' && (
              <div style={{ padding: 16 }}>
                <Typography.Title level={5} style={{ marginBottom: 16 }}>
                  <FilterOutlined style={{ marginRight: 8 }} />
                  Deduplication Settings
                </Typography.Title>
                {/* Method Selection */}
                <div style={{ marginBottom: 20 }}>
                  <Typography.Text strong style={{ display: 'block', marginBottom: 8 }}>
                    Method
                  </Typography.Text>
                  <Radio.Group
                    value={deduplicationMethod}
                    onChange={(e) => setDeduplicationMethod(e.target.value)}
                    buttonStyle="solid"
                    style={{ width: '100%' }}
                  >
                    <Radio.Button value="embedding" style={{ width: '50%', textAlign: 'center' }}>
                      Embedding
                    </Radio.Button>
                    <Radio.Button value="llm" style={{ width: '50%', textAlign: 'center' }}>
                      LLM Judge
                    </Radio.Button>
                  </Radio.Group>
                  <Typography.Text type="secondary" style={{ display: 'block', marginTop: 8, fontSize: 12 }}>
                    {deduplicationMethod === 'embedding'
                      ? 'Fast vector similarity comparison'
                      : 'Accurate but slower pairwise LLM comparison'}
                  </Typography.Text>
                </div>
                {/* Threshold Slider - Only for Embedding method */}
                {deduplicationMethod === 'embedding' && (
                  <div style={{ marginBottom: 20 }}>
                    <Typography.Text strong style={{ display: 'block', marginBottom: 8 }}>
                      Similarity Threshold
                    </Typography.Text>
                    <Typography.Text type="secondary" style={{ display: 'block', marginBottom: 12, fontSize: 12 }}>
                      Higher = stricter matching, fewer groups
                    </Typography.Text>
                    <Slider
                      min={0.5}
                      max={1.0}
                      step={0.05}
                      value={deduplicationThreshold}
                      onChange={setDeduplicationThreshold}
                      marks={{
                        0.5: '50%',
                        0.7: '70%',
                        0.85: '85%',
                        1.0: '100%',
                      }}
                      tooltip={{ formatter: (val) => `${((val ?? 0) * 100).toFixed(0)}%` }}
                    />
                    <Typography.Text type="secondary" style={{ fontSize: 12 }}>
                      Current: {(deduplicationThreshold * 100).toFixed(0)}% similarity required
                    </Typography.Text>
                  </div>
                )}
                {/* LLM Warning */}
                {deduplicationMethod === 'llm' && (
                  <Typography.Text type="warning" style={{ display: 'block', fontSize: 12 }}>
                    Note: LLM method requires N*(N-1)/2 comparisons. May take longer for many descriptions.
                  </Typography.Text>
                )}
              </div>
            )}
          </Sider>
        </Layout>
      </Layout>
--- a/frontend/src/components/DeduplicationPanel.tsx
+++ b/frontend/src/components/DeduplicationPanel.tsx
@@ -0,0 +1,271 @@
 import React, { useEffect, useMemo } from 'react';
 import {
  Card,
  Button,
  Slider,
  Statistic,
  Row,
  Col,
  Empty,
  Spin,
  Alert,
  Typography,
  Space,
  Divider,
 } from 'antd';
 import {
  FilterOutlined,
  ReloadOutlined,
  CheckCircleOutlined,
  ClusterOutlined,
  CopyOutlined,
 } from '@ant-design/icons';
 import { useDeduplication } from '../hooks/useDeduplication';
 import { GroupCard } from './deduplication/GroupCard';
 import type {
  ExpertTransformationDAGResult,
  ExpertTransformationDescription,
  DeduplicationMethod,
 } from '../types';
 const { Title, Text } = Typography;
 /**
 * Props accepted by DeduplicationPanel.
 */
 interface DeduplicationPanelProps {
  // Transformation output; each entry of `results` contributes its `descriptions`.
  // `null` makes the panel render its "No transformation data available" placeholder.
  transformationResult: ExpertTransformationDAGResult | null;
  // Selects the dark or light colors used by the panel's inline styles.
  isDark: boolean;
  // Similarity threshold bound to the slider (0.5-1.0) and forwarded to `deduplicate`.
  threshold: number;
  // Called with the slider's new value when the user moves it.
  onThresholdChange: (value: number) => void;
  // Deduplication method forwarded to `deduplicate`; a change clears the current result.
  method: DeduplicationMethod;
  // Optional and not read by the panel: method selection is handled in the App.tsx sidebar.
  onMethodChange?: (method: DeduplicationMethod) => void;  // Optional, handled in App.tsx sidebar
 }
 /**
 * Panel for deduplicating transformation descriptions.
 *
 * Flattens the descriptions of every category in `transformationResult`,
 * runs them through the `useDeduplication` hook on demand, and renders the
 * returned groups as `GroupCard`s together with summary statistics.
 *
 * Rendering states, in order of precedence:
 * - no transformation result -> "run the Transformation Agent first" placeholder
 * - result without descriptions -> "No descriptions found" placeholder
 * - otherwise -> header card plus error / loading / results / initial prompt
 *
 * The similarity threshold control is shown only for the `embedding` method,
 * because the threshold is only used by that method.
 */
 export const DeduplicationPanel: React.FC<DeduplicationPanelProps> = ({
  transformationResult,
  isDark,
  threshold,
  onThresholdChange,
  method,
  // onMethodChange is handled in App.tsx sidebar
 }) => {
  const { loading, result, error, progress, deduplicate, clearResult } = useDeduplication();
  // The threshold only applies to the embedding method, so its control is hidden otherwise.
  const isEmbedding = method === 'embedding';
  // Extract all descriptions from transformation result
  const allDescriptions = useMemo<ExpertTransformationDescription[]>(() => {
    if (!transformationResult) return [];
    const descriptions: ExpertTransformationDescription[] = [];
    for (const categoryResult of transformationResult.results) {
      descriptions.push(...categoryResult.descriptions);
    }
    return descriptions;
  }, [transformationResult]);
  // Clear result when transformation result or method changes
  useEffect(() => {
    clearResult();
  }, [transformationResult, method, clearResult]);
  // Start a deduplication run with the current threshold and method.
  const handleDeduplicate = () => {
    if (allDescriptions.length > 0) {
      deduplicate(allDescriptions, threshold, method);
    }
  };
  const containerStyle: React.CSSProperties = {
    height: '100%',
    display: 'flex',
    flexDirection: 'column',
    padding: 16,
    overflow: 'hidden',
  };
  const headerCardStyle: React.CSSProperties = {
    marginBottom: 16,
    background: isDark ? '#1f1f1f' : '#fff',
    borderRadius: 8,
  };
  const resultsContainerStyle: React.CSSProperties = {
    flex: 1,
    overflow: 'auto',
    paddingRight: 8,
  };
  // No transformation data
  if (!transformationResult) {
    return (
      <div style={{ ...containerStyle, justifyContent: 'center', alignItems: 'center' }}>
        <Empty
          description={
            <Space direction="vertical" size={4}>
              <Text style={{ color: isDark ? '#999' : '#666' }}>
                No transformation data available
              </Text>
              <Text type="secondary" style={{ fontSize: 12 }}>
                Please run the Transformation Agent first
              </Text>
            </Space>
          }
        />
      </div>
    );
  }
  // No descriptions found
  if (allDescriptions.length === 0) {
    return (
      <div style={{ ...containerStyle, justifyContent: 'center', alignItems: 'center' }}>
        <Empty description="No descriptions found in transformation result" />
      </div>
    );
  }
  return (
    <div style={containerStyle}>
      {/* Header Card with Controls */}
      <Card size="small" style={headerCardStyle}>
        <Row gutter={[16, 16]} align="middle">
          <Col span={6}>
            <Statistic
              title="Total Descriptions"
              value={allDescriptions.length}
              prefix={<FilterOutlined />}
            />
          </Col>
          <Col span={6}>
            <Statistic
              title="Unique Groups"
              value={result?.total_groups ?? '-'}
              prefix={<ClusterOutlined />}
              valueStyle={{ color: result ? '#52c41a' : undefined }}
            />
          </Col>
          <Col span={6}>
            <Statistic
              title="Duplicates Found"
              value={result?.total_duplicates ?? '-'}
              prefix={<CopyOutlined />}
              valueStyle={{ color: result?.total_duplicates ? '#fa8c16' : undefined }}
            />
          </Col>
          <Col span={6}>
            {/* Threshold control - only meaningful for the embedding method */}
            {isEmbedding ? (
              <Space direction="vertical" size={4} style={{ width: '100%' }}>
                <Text type="secondary" style={{ fontSize: 12 }}>
                  Similarity Threshold: {(threshold * 100).toFixed(0)}%
                </Text>
                <Slider
                  min={0.5}
                  max={1.0}
                  step={0.05}
                  value={threshold}
                  onChange={onThresholdChange}
                  disabled={loading}
                  tooltip={{ formatter: (val) => `${((val ?? 0) * 100).toFixed(0)}%` }}
                />
              </Space>
            ) : (
              <Text type="secondary" style={{ fontSize: 12 }}>
                Similarity threshold is not used by the LLM method
              </Text>
            )}
          </Col>
        </Row>
        <Divider style={{ margin: '12px 0' }} />
        <Row justify="space-between" align="middle">
          <Col>
            <Text type="secondary" style={{ fontSize: 12 }}>
              {progress.message || 'Ready to analyze'}
            </Text>
          </Col>
          <Col>
            <Space>
              {result && (
                <Button
                  icon={<ReloadOutlined />}
                  onClick={clearResult}
                  disabled={loading}
                >
                  Clear
                </Button>
              )}
              <Button
                type="primary"
                icon={<CheckCircleOutlined />}
                onClick={handleDeduplicate}
                loading={loading}
              >
                {loading ? 'Processing...' : 'Deduplicate'}
              </Button>
            </Space>
          </Col>
        </Row>
      </Card>
      {/* Error Alert */}
      {error && (
        <Alert
          message="Deduplication Error"
          description={error}
          type="error"
          showIcon
          closable
          // Reset the hook state once the alert has closed; otherwise `error`
          // stays set and the initial prompt below never comes back.
          afterClose={clearResult}
          style={{ marginBottom: 16 }}
        />
      )}
      {/* Loading State */}
      {loading && (
        <div style={{ textAlign: 'center', padding: 40 }}>
          <Spin size="large" />
          <div style={{ marginTop: 16 }}>
            <Text type="secondary">{progress.message}</Text>
          </div>
        </div>
      )}
      {/* Results */}
      {!loading && result && (
        <div style={resultsContainerStyle}>
          <Title level={5} style={{ marginBottom: 16, color: isDark ? '#fff' : '#000' }}>
            <ClusterOutlined style={{ marginRight: 8 }} />
            {result.total_groups} Groups
            {result.total_duplicates > 0 && (
              <Text type="secondary" style={{ fontSize: 14, fontWeight: 'normal', marginLeft: 8 }}>
                ({result.total_duplicates} duplicates removed)
              </Text>
            )}
          </Title>
          {result.groups.map((group, index) => (
            <GroupCard
              key={group.group_id}
              group={group}
              isDark={isDark}
              index={index}
            />
          ))}
          {result.total_groups === 0 && (
            <Empty description="No groups found" />
          )}
        </div>
      )}
      {/* Initial State - show prompt */}
      {!loading && !result && !error && (
        <div style={{ textAlign: 'center', padding: 40 }}>
          <FilterOutlined style={{ fontSize: 48, color: '#1890ff', marginBottom: 16 }} />
          <Title level={4} style={{ color: isDark ? '#fff' : '#000' }}>
            Ready to Deduplicate
          </Title>
          <Text type="secondary">
            Click the "Deduplicate" button to analyze {allDescriptions.length} descriptions
            and group similar ones together.
          </Text>
        </div>
      )}
    </div>
  );
 };
 export default DeduplicationPanel;
--- a/frontend/src/components/TransformationInputPanel.tsx
+++ b/frontend/src/components/TransformationInputPanel.tsx
@@ -12,6 +12,11 @@ const EXPERT_SOURCE_OPTIONS = [
  { label: 'Wikidata', value: 'wikidata' as ExpertSource, description: '從 Wikidata 查詢職業 (需等待 API)' },
 ];
 // Choices offered by the expert-name language <Select>, which is rendered
 // only when the expert source is 'curated'. Values are the literal codes
 // accepted by the `expertLanguage` prop ('en' | 'zh').
 const EXPERT_LANGUAGE_OPTIONS = [
  { label: 'English', value: 'en' as const },
  { label: '中文', value: 'zh' as const },
 ];
 interface TransformationInputPanelProps {
  onTransform: () => void;
  loading: boolean;
@@ -26,6 +31,7 @@ interface TransformationInputPanelProps {
  };
  customExpertsInput: string;
  expertSource: ExpertSource;
  expertLanguage: 'en' | 'zh';
  onModelChange: (model: string) => void;
  onTemperatureChange: (temperature: number) => void;
  onExpertConfigChange: (config: {
@@ -35,6 +41,7 @@ interface TransformationInputPanelProps {
  }) => void;
  onCustomExpertsInputChange: (value: string) => void;
  onExpertSourceChange: (source: ExpertSource) => void;
  onExpertLanguageChange: (language: 'en' | 'zh') => void;
  availableModels: string[];
 }
@@ -48,11 +55,13 @@ export const TransformationInputPanel: React.FC<TransformationInputPanelProps> =
  expertConfig,
  customExpertsInput,
  expertSource,
  expertLanguage,
  onModelChange,
  onTemperatureChange,
  onExpertConfigChange,
  onCustomExpertsInputChange,
  onExpertSourceChange,
  onExpertLanguageChange,
  availableModels,
 }) => {
  return (
@@ -142,6 +151,19 @@ export const TransformationInputPanel: React.FC<TransformationInputPanelProps> =
            <Text type="secondary" style={{ fontSize: 11 }}>
              {EXPERT_SOURCE_OPTIONS.find((opt) => opt.value === expertSource)?.description}
            </Text>
            {/* Language selector - only for curated source */}
            {expertSource === 'curated' && (
              <div style={{ marginTop: 8 }}>
                <Text style={{ fontSize: 12 }}>職業名稱語言</Text>
                <Select
                  value={expertLanguage}
                  onChange={onExpertLanguageChange}
                  style={{ width: '100%', marginTop: 4 }}
                  options={EXPERT_LANGUAGE_OPTIONS}
                />
              </div>
            )}
          </Space>
        </Card>
--- a/frontend/src/components/TransformationPanel.tsx
+++ b/frontend/src/components/TransformationPanel.tsx
@@ -1,7 +1,7 @@
 import { forwardRef, useMemo, useCallback, useEffect } from 'react';
 import { Empty, Spin, Button, Progress, Card, Space, Typography, Tag } from 'antd';
 import { ReloadOutlined } from '@ant-design/icons';
-import type { AttributeDAG, ExpertTransformationInput, ExpertSource } from '../types';
+import type { AttributeDAG, ExpertTransformationInput, ExpertSource, ExpertTransformationDAGResult } from '../types';
 import { TransformationDAG } from './TransformationDAG';
 import type { TransformationDAGRef } from './TransformationDAG';
 import { useExpertTransformation } from '../hooks/useExpertTransformation';
@@ -19,26 +19,33 @@ interface TransformationPanelProps {
    custom_experts?: string[];
  };
  expertSource: ExpertSource;
  expertLanguage: 'en' | 'zh';
  shouldStartTransform: boolean;
  onTransformComplete: () => void;
  onLoadingChange: (loading: boolean) => void;
  onResultsChange?: (results: ExpertTransformationDAGResult | null) => void;
 }
 export const TransformationPanel = forwardRef<TransformationDAGRef, TransformationPanelProps>(
-  ({ attributeData, isDark, model, temperature, expertConfig, expertSource, shouldStartTransform, onTransformComplete, onLoadingChange }, ref) => {
+  ({ attributeData, isDark, model, temperature, expertConfig, expertSource, expertLanguage, shouldStartTransform, onTransformComplete, onLoadingChange, onResultsChange }, ref) => {
    const {
      loading,
      progress,
      results,
      transformAll,
      clearResults,
-    } = useExpertTransformation({ model, temperature, expertSource });
+    } = useExpertTransformation({ model, temperature, expertSource, expertLanguage });
    // Notify parent of loading state changes
    useEffect(() => {
      onLoadingChange(loading);
    }, [loading, onLoadingChange]);
    // Notify parent of results changes
    useEffect(() => {
      onResultsChange?.(results);
    }, [results, onResultsChange]);
    // Build expert transformation input from attribute data
    const transformationInput = useMemo((): ExpertTransformationInput | null => {
      if (!attributeData) return null;
--- a/frontend/src/components/deduplication/GroupCard.tsx
+++ b/frontend/src/components/deduplication/GroupCard.tsx
@@ -0,0 +1,147 @@
 import React, { useState } from 'react';
 import { Card, Tag, Collapse, Typography, Space, Badge } from 'antd';
 import { StarFilled, CopyOutlined, UserOutlined } from '@ant-design/icons';
 import type { DescriptionGroup } from '../../types';
 const { Text, Paragraph } = Typography;
 const { Panel } = Collapse;
 /**
 * Props accepted by GroupCard.
 */
 interface GroupCardProps {
  // Group to display: one representative description plus its duplicates.
  group: DescriptionGroup;
  // Selects the dark or light colors used by the card's inline styles.
  isDark: boolean;
  // Zero-based position of the group in the list; shown as `index + 1` in the badge.
  index: number;
 }
 /**
 * Card component for displaying a group of similar descriptions.
 *
 * The card title shows the 1-based group number, the representative's keyword
 * and, when the group has duplicates, how many. The body shows the
 * representative description and a collapsible list of the duplicates, each
 * with its keyword, expert name and similarity score.
 */
 export const GroupCard: React.FC<GroupCardProps> = ({ group, isDark, index }) => {
  // Whether the duplicates list is currently expanded.
  const [expanded, setExpanded] = useState(false);
  const hasDuplicates = group.duplicates.length > 0;
  const cardStyle: React.CSSProperties = {
    marginBottom: 16,
    borderRadius: 8,
    border: isDark ? '1px solid #303030' : '1px solid #f0f0f0',
    background: isDark ? '#1f1f1f' : '#fff',
  };
  const representativeStyle: React.CSSProperties = {
    background: isDark
      ? 'linear-gradient(135deg, #1a472a 0%, #2d5a3d 100%)'
      : 'linear-gradient(135deg, #f6ffed 0%, #d9f7be 100%)',
    padding: 12,
    borderRadius: 6,
    marginBottom: hasDuplicates ? 12 : 0,
  };
  const duplicateItemStyle: React.CSSProperties = {
    background: isDark ? '#2a2a2a' : '#fafafa',
    padding: 10,
    borderRadius: 4,
    marginBottom: 8,
    borderLeft: `3px solid ${isDark ? '#faad14' : '#fa8c16'}`,
  };
  return (
    <Card
      size="small"
      style={cardStyle}
      title={
        <Space>
          <Badge
            count={index + 1}
            // Badge caps its display at `overflowCount` (99 by default), which
            // would label every group from the 100th onwards as "99+".
            overflowCount={Number.MAX_SAFE_INTEGER}
            style={{
              backgroundColor: hasDuplicates ? '#52c41a' : '#1890ff',
            }}
          />
          <Text strong style={{ color: isDark ? '#fff' : '#000' }}>
            {group.representative.keyword}
          </Text>
          {hasDuplicates && (
            <Tag color="orange" icon={<CopyOutlined />}>
              {group.duplicates.length} similar
            </Tag>
          )}
        </Space>
      }
      extra={
        <Tag color={isDark ? 'geekblue' : 'blue'}>
          <UserOutlined style={{ marginRight: 4 }} />
          {group.representative.expert_name}
        </Tag>
      }
    >
      {/* Representative description */}
      <div style={representativeStyle}>
        <Space direction="vertical" size={4} style={{ width: '100%' }}>
          <Space>
            <StarFilled style={{ color: '#52c41a' }} />
            <Text type="secondary" style={{ fontSize: 12 }}>
              Representative
            </Text>
          </Space>
          <Paragraph
            style={{
              margin: 0,
              color: isDark ? '#e0e0e0' : '#333',
              fontSize: 14,
            }}
          >
            {group.representative.description}
          </Paragraph>
        </Space>
      </div>
      {/* Duplicates section */}
      {hasDuplicates && (
        <Collapse
          ghost
          activeKey={expanded ? ['duplicates'] : []}
          onChange={() => setExpanded(!expanded)}
          style={{ marginTop: 8 }}
        >
          <Panel
            key="duplicates"
            header={
              <Text type="secondary" style={{ fontSize: 12 }}>
                View {group.duplicates.length} similar description(s)
              </Text>
            }
            style={{ padding: 0 }}
          >
            <Space direction="vertical" size={0} style={{ width: '100%' }}>
              {group.duplicates.map((dup, dupIndex) => {
                // Scores are read in parallel with `duplicates`. The entry may be
                // missing at runtime, so only render the tag for a finite number
                // instead of showing "NaN% similar".
                const score: number | undefined = group.similarity_scores[dupIndex];
                const hasScore = typeof score === 'number' && Number.isFinite(score);
                return (
                  <div key={`${dup.expert_id}-${dupIndex}`} style={duplicateItemStyle}>
                    <Space direction="vertical" size={2} style={{ width: '100%' }}>
                      <Space size="small">
                        <Tag color="default" style={{ fontSize: 11 }}>
                          {dup.keyword}
                        </Tag>
                        <Tag color="cyan" style={{ fontSize: 11 }}>
                          {dup.expert_name}
                        </Tag>
                        {hasScore && (
                          <Tag color="orange" style={{ fontSize: 11 }}>
                            {(score * 100).toFixed(0)}% similar
                          </Tag>
                        )}
                      </Space>
                      <Text
                        style={{
                          fontSize: 13,
                          color: isDark ? '#b0b0b0' : '#666',
                        }}
                      >
                        {dup.description}
                      </Text>
                    </Space>
                  </div>
                );
              })}
            </Space>
          </Panel>
        </Collapse>
      )}
    </Card>
  );
 };
 export default GroupCard;
--- a/frontend/src/hooks/useDeduplication.ts
+++ b/frontend/src/hooks/useDeduplication.ts
@@ -0,0 +1,100 @@
 import { useState, useCallback, useRef } from 'react';
 import { deduplicateDescriptions } from '../services/api';
 import type {
  ExpertTransformationDescription,
  DeduplicationResult,
  DeduplicationProgress,
  DeduplicationMethod,
 } from '../types';
 /**
 * Hook for managing deduplication state and operations.
 *
 * Returns the current `loading` flag, the last `result`, the last `error`
 * message, a `progress` descriptor, and the `deduplicate` / `clearResult`
 * actions. Both actions are stable across renders.
 *
 * Only the most recent request may update state: starting a new request or
 * calling `clearResult` invalidates any request still in flight, so a late
 * response cannot overwrite state that was cleared or superseded.
 */
 export function useDeduplication() {
  const [loading, setLoading] = useState(false);
  const [result, setResult] = useState<DeduplicationResult | null>(null);
  const [error, setError] = useState<string | null>(null);
  const [progress, setProgress] = useState<DeduplicationProgress>({
    step: 'idle',
    message: '',
  });
  // Id of the latest request; bumped by every new request and by clearResult.
  const latestRequestRef = useRef(0);
  /**
   * Execute deduplication on a list of descriptions
   *
   * @param descriptions - List of descriptions to deduplicate
   * @param threshold - Similarity threshold (only used for embedding method)
   * @param method - Deduplication method: 'embedding' (fast) or 'llm' (accurate but slow)
   */
  const deduplicate = useCallback(async (
    descriptions: ExpertTransformationDescription[],
    threshold: number = 0.85,
    method: DeduplicationMethod = 'embedding'
  ) => {
    if (!descriptions || descriptions.length === 0) {
      setError('No descriptions to deduplicate');
      return;
    }
    // Claim a fresh id; any older request becomes stale from this point on.
    const requestId = ++latestRequestRef.current;
    const isLatest = () => latestRequestRef.current === requestId;
    setLoading(true);
    setError(null);
    setResult(null);
    // Show a different progress message depending on the method
    const methodLabel = method === 'embedding' ? 'Embedding' : 'LLM';
    const pairCount = (descriptions.length * (descriptions.length - 1)) / 2;
    const progressMessage = method === 'llm'
      ? `Processing ${descriptions.length} descriptions with LLM (${pairCount} comparisons)...`
      : `Processing ${descriptions.length} descriptions with ${methodLabel}...`;
    setProgress({
      step: 'processing',
      message: progressMessage,
    });
    try {
      const deduplicationResult = await deduplicateDescriptions({
        descriptions,
        similarity_threshold: threshold,
        method,
      });
      // Drop the response if the state was cleared or a newer request started.
      if (!isLatest()) return;
      setResult(deduplicationResult);
      setProgress({
        step: 'done',
        message: `Found ${deduplicationResult.total_groups} unique groups, ${deduplicationResult.total_duplicates} duplicates (${methodLabel})`,
      });
    } catch (err) {
      // A failure of a stale request must not surface as the current error.
      if (!isLatest()) return;
      const errorMessage = err instanceof Error ? err.message : 'Unknown error';
      setError(errorMessage);
      setProgress({
        step: 'error',
        message: 'Deduplication failed',
        error: errorMessage,
      });
    } finally {
      // Only the latest request owns the loading flag; a stale one leaves it alone.
      if (isLatest()) {
        setLoading(false);
      }
    }
  }, []);
  /**
   * Clear results and reset state
   *
   * Also invalidates any in-flight request and drops the loading flag, so the
   * hook returns to its idle state immediately.
   */
  const clearResult = useCallback(() => {
    latestRequestRef.current += 1;
    setLoading(false);
    setResult(null);
    setError(null);
    setProgress({
      step: 'idle',
      message: '',
    });
  }, []);
  return {
    loading,
    result,
    error,
    progress,
    deduplicate,
    clearResult,
  };
 }
--- a/frontend/src/hooks/useExpertTransformation.ts
+++ b/frontend/src/hooks/useExpertTransformation.ts
@@ -14,6 +14,7 @@ interface UseExpertTransformationOptions {
  model?: string;
  temperature?: number;
  expertSource?: ExpertSource;
  expertLanguage?: 'en' | 'zh';
 }
 export function useExpertTransformation(options: UseExpertTransformationOptions = {}) {
@@ -63,6 +64,7 @@ export function useExpertTransformation(options: UseExpertTransformationOptions
            keywords_per_expert: expertConfig.keywords_per_expert,
            custom_experts: expertConfig.custom_experts,
            expert_source: options.expertSource,
            expert_language: options.expertLanguage,
            model: options.model,
            temperature: options.temperature,
          },
@@ -155,7 +157,7 @@ export function useExpertTransformation(options: UseExpertTransformationOptions
        });
      });
    },
-    [options.model, options.temperature, options.expertSource]
+    [options.model, options.temperature, options.expertSource, options.expertLanguage]
  );
  const transformAll = useCallback(
--- a/frontend/src/services/api.ts
+++ b/frontend/src/services/api.ts
@@ -10,7 +10,9 @@ import type {
  TransformationCategoryResult,
  ExpertTransformationRequest,
  ExpertTransformationCategoryResult,
-  ExpertProfile
+  ExpertProfile,
  DeduplicationRequest,
  DeduplicationResult
 } from '../types';
 // 自動使用當前瀏覽器的 hostname，支援遠端存取
@@ -299,3 +301,24 @@ export async function expertTransformCategoryStream(
    }
  }
 }
 // ===== Deduplication Agent API =====
 /**
 * Send a deduplication request to the backend and return its parsed JSON body.
 *
 * @param request - Payload serialized as the JSON body of the POST
 * @returns The response body parsed as a DeduplicationResult
 * @throws Error carrying the HTTP status and response text when the response is not ok
 */
 export async function deduplicateDescriptions(
  request: DeduplicationRequest
 ): Promise<DeduplicationResult> {
  const endpoint = `${API_BASE_URL}/deduplication/deduplicate`;
  const init: RequestInit = {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(request),
  };
  const res = await fetch(endpoint, init);
  if (res.ok) {
    return res.json();
  }
  // Non-2xx: surface the status together with whatever text the server sent.
  const detail = await res.text();
  throw new Error(`API error: ${res.status} - ${detail}`);
 }
--- a/frontend/src/types/index.ts
+++ b/frontend/src/types/index.ts
@@ -265,3 +265,37 @@ export interface ExpertTransformationInput {
    custom_experts?: string[];
  };
 }
 // ===== Deduplication Agent types =====
 // Supported deduplication strategies: vector-embedding similarity or LLM comparison.
 export type DeduplicationMethod = 'embedding' | 'llm';
 /**
 * Request payload sent to the deduplication endpoint.
 */
 export interface DeduplicationRequest {
  // Descriptions to deduplicate.
  descriptions: ExpertTransformationDescription[];
  method?: DeduplicationMethod;  // Deduplication method, default: 'embedding'
  similarity_threshold?: number;  // 0.0-1.0, default 0.85; only used by the embedding method
  model?: string;  // Embedding/LLM model
 }
 /**
 * One group of similar descriptions in a deduplication result.
 */
 export interface DescriptionGroup {
  // Identifier of the group; used as the React key when groups are listed.
  group_id: string;
  // Description shown as the group's representative.
  representative: ExpertTransformationDescription;
  // Other descriptions placed in the same group; empty when there are none.
  duplicates: ExpertTransformationDescription[];
  // Scores read in parallel with `duplicates` (score i belongs to duplicate i)
  // and displayed as a percentage, i.e. treated as a 0-1 fraction.
  similarity_scores: number[];
 }
 /**
 * Response body returned by the deduplication endpoint.
 */
 export interface DeduplicationResult {
  // Presumably the number of descriptions received - confirm against the backend.
  total_input: number;
  // Number of groups; displayed as "Unique Groups".
  total_groups: number;
  // Number of duplicates; displayed as "Duplicates Found".
  total_duplicates: number;
  // The groups themselves, rendered one card each.
  groups: DescriptionGroup[];
  // Threshold reported back by the backend.
  threshold_used: number;
  method_used: DeduplicationMethod;  // Method that was used
  model_used: string;                // Model that was used
 }
 /**
 * Client-side progress descriptor maintained by the useDeduplication hook.
 */
 export interface DeduplicationProgress {
  // Lifecycle stage: idle -> processing -> done or error.
  step: 'idle' | 'processing' | 'done' | 'error';
  // Human-readable status line; empty while idle.
  message: string;
  // Error message, set together with step 'error'.
  error?: string;
 }