feat: Add Deduplication Agent with embedding and LLM methods
Implement a new Deduplication Agent that identifies and groups similar transformation descriptions.

Supports two deduplication methods:
- Embedding: Fast vector similarity comparison using cosine similarity
- LLM: Accurate pairwise semantic comparison (slower but more precise)

Backend changes:
- Add deduplication router with /deduplicate endpoint
- Add embedding_service for vector-based similarity
- Add llm_deduplication_service for LLM-based comparison
- Improve expert_transformation error handling and progress reporting

Frontend changes:
- Add DeduplicationPanel with interactive group visualization
- Add useDeduplication hook for state management
- Integrate deduplication tab in main App
- Add threshold slider and method selector in sidebar

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@@ -36,3 +36,6 @@ env/
|
|||||||
.DS_Store
|
.DS_Store
|
||||||
.idea/
|
.idea/
|
||||||
.vscode/
|
.vscode/
|
||||||
|
|
||||||
|
# Serena (MCP tools)
|
||||||
|
.serena/
|
||||||
|
|||||||
@@ -3,14 +3,18 @@ from contextlib import asynccontextmanager
|
|||||||
from fastapi import FastAPI
|
from fastapi import FastAPI
|
||||||
from fastapi.middleware.cors import CORSMiddleware
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
|
|
||||||
from .routers import attributes, transformation, expert_transformation
|
from .routers import attributes, transformation, expert_transformation, deduplication
|
||||||
from .services.llm_service import ollama_provider
|
from .services.llm_service import ollama_provider
|
||||||
|
from .services.embedding_service import embedding_service
|
||||||
|
from .services.llm_deduplication_service import llm_deduplication_service
|
||||||
|
|
||||||
|
|
||||||
@asynccontextmanager
|
@asynccontextmanager
|
||||||
async def lifespan(app: FastAPI):
|
async def lifespan(app: FastAPI):
|
||||||
yield
|
yield
|
||||||
await ollama_provider.close()
|
await ollama_provider.close()
|
||||||
|
await embedding_service.close()
|
||||||
|
await llm_deduplication_service.close()
|
||||||
|
|
||||||
|
|
||||||
app = FastAPI(
|
app = FastAPI(
|
||||||
@@ -31,6 +35,7 @@ app.add_middleware(
|
|||||||
app.include_router(attributes.router)
|
app.include_router(attributes.router)
|
||||||
app.include_router(transformation.router)
|
app.include_router(transformation.router)
|
||||||
app.include_router(expert_transformation.router)
|
app.include_router(expert_transformation.router)
|
||||||
|
app.include_router(deduplication.router)
|
||||||
|
|
||||||
|
|
||||||
@app.get("/")
|
@app.get("/")
|
||||||
|
|||||||
@@ -232,3 +232,38 @@ class ExpertTransformationRequest(BaseModel):
|
|||||||
# LLM parameters
|
# LLM parameters
|
||||||
model: Optional[str] = None
|
model: Optional[str] = None
|
||||||
temperature: Optional[float] = 0.7
|
temperature: Optional[float] = 0.7
|
||||||
|
|
||||||
|
|
||||||
|
# ===== Deduplication Agent schemas =====
|
||||||
|
|
||||||
|
class DeduplicationMethod(str, Enum):
    """Strategy used to decide whether two descriptions are duplicates."""

    # Vector (embedding) similarity
    EMBEDDING = "embedding"

    # Pairwise judgement made by an LLM
    LLM = "llm"
|
||||||
|
|
||||||
|
|
||||||
|
class DeduplicationRequest(BaseModel):
    """Request payload for a deduplication run."""

    # Descriptions to be grouped
    descriptions: List[ExpertTransformationDescription]

    # Which deduplication method to apply
    method: DeduplicationMethod = DeduplicationMethod.EMBEDDING

    # Cosine-similarity threshold (0.0-1.0); only used by the embedding method
    similarity_threshold: float = 0.85

    # Embedding or LLM model name
    model: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
class DescriptionGroup(BaseModel):
    """One group of similar descriptions."""

    # Identifier such as "group-0", "group-1", ...
    group_id: str

    # Description chosen to represent the group
    representative: ExpertTransformationDescription

    # Descriptions considered similar to the representative
    duplicates: List[ExpertTransformationDescription]

    # Similarity score of each entry in `duplicates`
    similarity_scores: List[float]
|
||||||
|
|
||||||
|
|
||||||
|
class DeduplicationResult(BaseModel):
    """Outcome of a deduplication run."""

    # Number of descriptions received
    total_input: int

    # Number of groups produced
    total_groups: int

    # Number of descriptions classified as duplicates
    total_duplicates: int

    groups: List[DescriptionGroup]
    threshold_used: float

    # Deduplication method that produced this result
    method_used: DeduplicationMethod

    # Model that produced this result
    model_used: str
|
||||||
|
|||||||
@@ -90,16 +90,15 @@ def get_single_description_prompt(
|
|||||||
) -> str:
|
) -> str:
|
||||||
"""Step 2: 為單一關鍵字生成描述"""
|
"""Step 2: 為單一關鍵字生成描述"""
|
||||||
# 如果 domain 是通用的,就只用職業名稱
|
# 如果 domain 是通用的,就只用職業名稱
|
||||||
domain_text = f"({expert_domain})" if expert_domain and expert_domain != "Professional Field" else ""
|
domain_text = f"({expert_domain}領域)" if expert_domain and expert_domain != "Professional Field" else ""
|
||||||
|
|
||||||
return f"""/no_think
|
return f"""/no_think
|
||||||
物件:「{query}」
|
你是一位{expert_name}{domain_text}。
|
||||||
專家:{expert_name}{domain_text}
|
|
||||||
|
任務:為「{query}」生成一段創新應用描述。
|
||||||
關鍵字:{keyword}
|
關鍵字:{keyword}
|
||||||
|
|
||||||
你是一位{expert_name}。從你的專業視角,生成一段創新應用描述(15-30字),說明如何將「{keyword}」的概念應用到「{query}」上。
|
從你的專業視角,說明如何將「{keyword}」的概念應用到「{query}」上。描述要具體、有創意,15-30字。
|
||||||
|
|
||||||
描述要體現{expert_name}的專業思維和獨特觀點。
|
只回傳 JSON,不要其他文字:
|
||||||
|
{{"description": "你的創新應用描述"}}"""
|
||||||
回傳 JSON:
|
|
||||||
{{"description": "應用描述"}}"""
|
|
||||||
|
|||||||
93
backend/app/routers/deduplication.py
Normal file
93
backend/app/routers/deduplication.py
Normal file
@@ -0,0 +1,93 @@
|
|||||||
|
"""
|
||||||
|
Deduplication Router - 使用 Embedding 或 LLM 去重描述
|
||||||
|
|
||||||
|
提供 API 端點將相似的創新描述分組,幫助識別重複的想法。
|
||||||
|
支援兩種方法:
|
||||||
|
- Embedding: 快速向量相似度比較
|
||||||
|
- LLM: 精準語意判斷(較慢但更準確)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from fastapi import APIRouter, HTTPException
|
||||||
|
|
||||||
|
from ..models.schemas import DeduplicationRequest, DeduplicationResult, DeduplicationMethod
|
||||||
|
from ..services.embedding_service import embedding_service
|
||||||
|
from ..services.llm_deduplication_service import llm_deduplication_service
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
router = APIRouter(prefix="/api/deduplication", tags=["deduplication"])
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/deduplicate", response_model=DeduplicationResult)
async def deduplicate_descriptions(request: DeduplicationRequest) -> DeduplicationResult:
    """
    Group similar descriptions so duplicated ideas can be identified.

    Two methods are supported:
    - embedding: vector similarity (fast)
    - llm: pairwise LLM comparison (more precise but slower)

    Empty input is passed to the selected service as well. Both services
    return an empty result themselves, so the reported default model and
    threshold always come from the service that owns them.

    Args:
        request: Deduplication request holding the descriptions, the method
            and the related parameters.

    Returns:
        DeduplicationResult: Grouping information for the descriptions.

    Raises:
        HTTPException: With status 500 if deduplication fails.
    """
    method = request.method
    logger.info(
        "Deduplication request: %d descriptions, method=%s, threshold=%s",
        len(request.descriptions),
        method.value,
        request.similarity_threshold,
    )

    try:
        if method == DeduplicationMethod.EMBEDDING:
            # Embedding similarity; the only method that uses the threshold
            return await embedding_service.deduplicate(
                descriptions=request.descriptions,
                threshold=request.similarity_threshold,
                model=request.model,
            )

        # Pairwise LLM comparison
        return await llm_deduplication_service.deduplicate(
            descriptions=request.descriptions,
            model=request.model,
        )
    except ValueError as e:
        logger.error(f"Deduplication failed: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
    except Exception as e:
        # logger.exception keeps the traceback for unexpected failures
        logger.exception(f"Unexpected error during deduplication: {e}")
        raise HTTPException(status_code=500, detail=f"Deduplication failed: {str(e)}") from e
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/models")
async def list_embedding_models():
    """
    List the embedding models offered for deduplication.

    Returns:
        dict: The suggested default model, the selectable models and an
        installation hint.
    """
    catalogue = (
        ("nomic-embed-text", "Fast and efficient embedding model"),
        ("mxbai-embed-large", "High quality embeddings"),
        ("all-minilm", "Lightweight embedding model"),
    )
    available = [
        {"name": model_name, "description": summary}
        for model_name, summary in catalogue
    ]

    payload = {"default": "nomic-embed-text"}
    payload["available"] = available
    payload["note"] = "Run 'ollama pull <model>' to install a model"
    return payload
|
||||||
@@ -221,8 +221,27 @@ async def generate_expert_transformation_events(
|
|||||||
desc_prompt, model=model, temperature=temperature
|
desc_prompt, model=model, temperature=temperature
|
||||||
)
|
)
|
||||||
|
|
||||||
desc_data = extract_json_from_response(desc_response)
|
# 嘗試解析 JSON,若失敗則使用原始回應作為描述
|
||||||
desc_text = desc_data.get("description", "")
|
desc_text = ""
|
||||||
|
try:
|
||||||
|
desc_data = extract_json_from_response(desc_response)
|
||||||
|
# 支援多種可能的 key: description, content, text, desc
|
||||||
|
desc_text = (
|
||||||
|
desc_data.get("description") or
|
||||||
|
desc_data.get("content") or
|
||||||
|
desc_data.get("text") or
|
||||||
|
desc_data.get("desc") or
|
||||||
|
""
|
||||||
|
)
|
||||||
|
except ValueError:
|
||||||
|
# JSON 解析失敗,嘗試清理原始回應作為描述
|
||||||
|
cleaned = desc_response.strip()
|
||||||
|
# 移除可能的 markdown 和多餘符號
|
||||||
|
if cleaned.startswith('"') and cleaned.endswith('"'):
|
||||||
|
cleaned = cleaned[1:-1]
|
||||||
|
if len(cleaned) > 5 and len(cleaned) < 100:
|
||||||
|
desc_text = cleaned
|
||||||
|
logger.info(f"[DESC] 使用 fallback 描述 for '{kw.keyword}': {desc_text[:50]}")
|
||||||
|
|
||||||
if desc_text:
|
if desc_text:
|
||||||
descriptions.append(ExpertTransformationDescription(
|
descriptions.append(ExpertTransformationDescription(
|
||||||
@@ -231,15 +250,22 @@ async def generate_expert_transformation_events(
|
|||||||
expert_name=kw.expert_name,
|
expert_name=kw.expert_name,
|
||||||
description=desc_text
|
description=desc_text
|
||||||
))
|
))
|
||||||
|
else:
|
||||||
|
logger.warning(f"[DESC] Empty description for keyword='{kw.keyword}', parsed_data={desc_data}")
|
||||||
|
|
||||||
# Send progress update
|
# Send progress update with success/fail status
|
||||||
yield f"event: description_progress\ndata: {json.dumps({'current': idx + 1, 'total': len(all_expert_keywords), 'keyword': kw.keyword}, ensure_ascii=False)}\n\n"
|
yield f"event: description_progress\ndata: {json.dumps({'current': idx + 1, 'total': len(all_expert_keywords), 'keyword': kw.keyword, 'success': bool(desc_text)}, ensure_ascii=False)}\n\n"
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Failed to generate description for '{kw.keyword}': {e}")
|
logger.warning(f"[DESC] Failed to generate description for '{kw.keyword}': {e}")
|
||||||
|
yield f"event: description_progress\ndata: {json.dumps({'current': idx + 1, 'total': len(all_expert_keywords), 'keyword': kw.keyword, 'success': False, 'error': str(e)}, ensure_ascii=False)}\n\n"
|
||||||
# Continue with next keyword
|
# Continue with next keyword
|
||||||
|
|
||||||
yield f"event: description_complete\ndata: {json.dumps({'count': len(descriptions)}, ensure_ascii=False)}\n\n"
|
# 統計成功率
|
||||||
|
success_rate = len(descriptions) / len(all_expert_keywords) * 100 if all_expert_keywords else 0
|
||||||
|
logger.info(f"[DESC] 描述生成完成: {len(descriptions)}/{len(all_expert_keywords)} 成功 ({success_rate:.1f}%)")
|
||||||
|
|
||||||
|
yield f"event: description_complete\ndata: {json.dumps({'count': len(descriptions), 'total': len(all_expert_keywords), 'success_rate': success_rate}, ensure_ascii=False)}\n\n"
|
||||||
|
|
||||||
# ========== Build final result ==========
|
# ========== Build final result ==========
|
||||||
result = ExpertTransformationCategoryResult(
|
result = ExpertTransformationCategoryResult(
|
||||||
|
|||||||
250
backend/app/services/embedding_service.py
Normal file
250
backend/app/services/embedding_service.py
Normal file
@@ -0,0 +1,250 @@
|
|||||||
|
"""
|
||||||
|
Embedding Service - generates embeddings and performs similarity-based deduplication
|
||||||
|
|
||||||
|
使用 Ollama 的 embedding 端點生成向量,並透過餘弦相似度進行去重分組。
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from ..config import settings
|
||||||
|
from ..models.schemas import (
|
||||||
|
ExpertTransformationDescription,
|
||||||
|
DeduplicationResult,
|
||||||
|
DeduplicationMethod,
|
||||||
|
DescriptionGroup,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class EmbeddingService:
    """Embedding service: builds vectors via Ollama and groups similar descriptions."""

    def __init__(self):
        self.base_url = settings.ollama_base_url
        # Embedding model used when the caller does not name one
        self.default_model = "nomic-embed-text"
        self.client = httpx.AsyncClient(timeout=120.0)

    async def get_embedding(self, text: str, model: Optional[str] = None) -> List[float]:
        """Return the embedding vector of a single text.

        Args:
            text: Text to embed.
            model: Embedding model name; falls back to ``default_model``.

        Returns:
            The first vector of the ``embeddings`` field in the reply.

        Raises:
            httpx.HTTPStatusError: If the endpoint answers with an error status.
            Exception: Any other failure is logged and re-raised unchanged.
        """
        model = model or self.default_model
        url = f"{self.base_url}/api/embed"

        try:
            response = await self.client.post(url, json={
                "model": model,
                "input": text,
            })
            response.raise_for_status()
            result = response.json()
            return result["embeddings"][0]
        except httpx.HTTPStatusError as e:
            logger.error(f"Embedding API error: {e.response.status_code} - {e.response.text}")
            raise
        except Exception as e:
            logger.error(f"Embedding error: {e}")
            raise

    async def get_embeddings_batch(
        self,
        texts: List[str],
        model: Optional[str] = None
    ) -> List[List[float]]:
        """Return the embedding vectors of several texts.

        All texts are sent in one request. If that request is rejected with
        an HTTP error status, each text is embedded with its own request.

        Args:
            texts: Texts to embed; an empty list returns ``[]`` without a request.
            model: Embedding model name; falls back to ``default_model``.

        Returns:
            The ``embeddings`` field of the reply, or the vectors collected
            by the per-text fallback.
        """
        if not texts:
            return []

        model = model or self.default_model
        url = f"{self.base_url}/api/embed"

        try:
            # The embed endpoint accepts a list as "input"
            response = await self.client.post(url, json={
                "model": model,
                "input": texts,
            })
            response.raise_for_status()
            result = response.json()
            return result["embeddings"]
        except httpx.HTTPStatusError as e:
            logger.error(f"Batch embedding API error: {e.response.status_code} - {e.response.text}")
            # The batch call failed; retry one text at a time
            logger.info("Falling back to single embedding requests...")
            return [await self.get_embedding(text, model) for text in texts]
        except Exception as e:
            logger.error(f"Batch embedding error: {e}")
            raise

    def cosine_similarity(self, a: List[float], b: List[float]) -> float:
        """Return the cosine similarity of two vectors (0.0 if either has zero norm)."""
        a_np = np.array(a)
        b_np = np.array(b)
        norm_a = np.linalg.norm(a_np)
        norm_b = np.linalg.norm(b_np)
        if norm_a == 0 or norm_b == 0:
            return 0.0
        return float(np.dot(a_np, b_np) / (norm_a * norm_b))

    def build_similarity_matrix(
        self,
        embeddings: List[List[float]]
    ) -> np.ndarray:
        """Build the symmetric pairwise cosine-similarity matrix.

        The rows are normalised once and multiplied as a whole, which
        replaces a Python-level loop over every pair. Zero-norm vectors get
        similarity 0.0 to every other vector, and the diagonal is always 1.0.

        Args:
            embeddings: One vector per item, all of the same length.

        Returns:
            An ``n x n`` float array; ``0 x 0`` for empty input.
        """
        n = len(embeddings)
        if n == 0:
            return np.zeros((0, 0))

        vectors = np.asarray(embeddings, dtype=float)
        norms = np.linalg.norm(vectors, axis=1)
        # Divide zero-norm rows by 1 so they stay all-zero instead of NaN
        safe_norms = np.where(norms == 0, 1.0, norms)
        unit = vectors / safe_norms[:, np.newaxis]

        matrix = unit @ unit.T
        # Rounding can push values marginally outside [-1, 1]
        np.clip(matrix, -1.0, 1.0, out=matrix)
        # An item is fully similar to itself
        np.fill_diagonal(matrix, 1.0)
        return matrix

    def cluster_by_similarity(
        self,
        similarity_matrix: np.ndarray,
        threshold: float
    ) -> List[List[int]]:
        """
        Greedy clustering: group items whose similarity is >= threshold.

        Algorithm:
        1. Start from the first item that has no group yet.
        2. Collect every later ungrouped item whose similarity to it is >= threshold.
        3. Put them into one group, led by the starting item.
        4. Repeat until every item has a group.

        Returns:
            List[List[int]]: One list of item indices per group; the first
            index of each list is the group's representative.
        """
        n = len(similarity_matrix)
        assigned = [False] * n
        groups = []

        for i in range(n):
            if assigned[i]:
                continue

            # Open a new group with item i as its representative
            group = [i]
            assigned[i] = True

            # Members are compared with the representative only
            for j in range(i + 1, n):
                if not assigned[j] and similarity_matrix[i][j] >= threshold:
                    group.append(j)
                    assigned[j] = True

            groups.append(group)

        return groups

    async def deduplicate(
        self,
        descriptions: List[ExpertTransformationDescription],
        threshold: float = 0.85,
        model: Optional[str] = None
    ) -> DeduplicationResult:
        """
        Main deduplication entry point.

        Args:
            descriptions: Descriptions to deduplicate.
            threshold: Similarity threshold (0.0-1.0), default 0.85.
            model: Embedding model name; falls back to ``default_model``.

        Returns:
            DeduplicationResult: The groups found, with one similarity score
            per duplicate.

        Raises:
            ValueError: If embeddings cannot be generated, or if the number
                of returned vectors differs from the number of descriptions.
        """
        model = model or self.default_model

        # Nothing to compare
        if not descriptions:
            return DeduplicationResult(
                total_input=0,
                total_groups=0,
                total_duplicates=0,
                groups=[],
                threshold_used=threshold,
                method_used=DeduplicationMethod.EMBEDDING,
                model_used=model
            )

        texts = [d.description for d in descriptions]
        logger.info(f"Generating embeddings for {len(texts)} descriptions using model '{model}'...")

        try:
            embeddings = await self.get_embeddings_batch(texts, model)
        except Exception as e:
            logger.error(f"Failed to generate embeddings: {e}")
            raise ValueError(
                f"Embedding generation failed: {e}. Make sure the model '{model}' is installed (run: ollama pull {model})"
            ) from e

        # Cluster indices are used to index `descriptions`, so the two
        # sequences must line up one-to-one.
        if len(embeddings) != len(texts):
            raise ValueError(
                f"Embedding service returned {len(embeddings)} vectors for {len(texts)} descriptions"
            )

        logger.info("Building similarity matrix...")
        sim_matrix = self.build_similarity_matrix(embeddings)

        logger.info(f"Clustering with threshold {threshold}...")
        clusters = self.cluster_by_similarity(sim_matrix, threshold)

        result_groups = []
        total_duplicates = 0

        for group_idx, indices in enumerate(clusters):
            # The first index is the representative; a standalone item has
            # no further indices and yields empty duplicate and score lists.
            rep_idx, *dup_indices = indices
            result_groups.append(DescriptionGroup(
                group_id=f"group-{group_idx}",
                representative=descriptions[rep_idx],
                duplicates=[descriptions[idx] for idx in dup_indices],
                similarity_scores=[float(sim_matrix[rep_idx][idx]) for idx in dup_indices]
            ))
            total_duplicates += len(dup_indices)

        logger.info(f"Deduplication complete: {len(descriptions)} -> {len(result_groups)} groups, {total_duplicates} duplicates found")

        return DeduplicationResult(
            total_input=len(descriptions),
            total_groups=len(result_groups),
            total_duplicates=total_duplicates,
            groups=result_groups,
            threshold_used=threshold,
            method_used=DeduplicationMethod.EMBEDDING,
            model_used=model
        )

    async def close(self):
        """Close the HTTP client."""
        await self.client.aclose()
|
||||||
|
|
||||||
|
|
||||||
|
# 全域實例
|
||||||
|
embedding_service = EmbeddingService()
|
||||||
252
backend/app/services/llm_deduplication_service.py
Normal file
252
backend/app/services/llm_deduplication_service.py
Normal file
@@ -0,0 +1,252 @@
|
|||||||
|
"""
|
||||||
|
LLM Deduplication Service - 使用 LLM 成對比較進行去重
|
||||||
|
|
||||||
|
讓 LLM 判斷兩個描述是否語意重複,透過並行處理加速。
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
from typing import List, Tuple, Optional
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from ..config import settings
|
||||||
|
from ..models.schemas import (
|
||||||
|
ExpertTransformationDescription,
|
||||||
|
DeduplicationResult,
|
||||||
|
DeduplicationMethod,
|
||||||
|
DescriptionGroup,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class LLMDeduplicationService:
    """LLM deduplication service: pairwise LLM judgement of semantic similarity."""

    def __init__(self):
        self.base_url = settings.ollama_base_url
        # Small, fast model; the task is a simple yes/no judgement
        self.default_model = "qwen3:4b"
        self.client = httpx.AsyncClient(timeout=60.0)
        # Upper bound on simultaneous requests, to avoid overloading Ollama
        self.max_concurrent = 5

    @staticmethod
    def _build_prompt(desc1: str, desc2: str) -> str:
        """Return the yes/no comparison prompt for two descriptions.

        The prompt starts with ``/no_think``, the same switch the project's
        description prompt uses, so that the short answer budget is not
        spent on a reasoning preamble.
        """
        return (
            "/no_think\n"
            "判斷以下兩個創新描述是否表達相同或非常相似的概念:\n"
            "\n"
            f"描述1: {desc1}\n"
            "\n"
            f"描述2: {desc2}\n"
            "\n"
            "如果兩者描述的創新概念本質相同或非常相似,回答 \"YES\"\n"
            "如果兩者描述不同的創新概念,回答 \"NO\"\n"
            "只回答 YES 或 NO,不要其他文字"
        )

    @staticmethod
    def _parse_verdict(raw: str) -> bool:
        """Return True when the model's answer starts with YES.

        Anything up to and including a closing ``</think>`` tag is ignored,
        so an empty or short think block in front of the answer does not
        turn a YES into "not similar".
        """
        answer = raw.rsplit("</think>", 1)[-1]
        return answer.strip().upper().startswith("YES")

    async def compare_pair(
        self,
        desc1: str,
        desc2: str,
        model: str,
        semaphore: asyncio.Semaphore
    ) -> bool:
        """
        Ask the LLM whether two descriptions are semantic duplicates.

        Args:
            desc1: First description.
            desc2: Second description.
            model: LLM model name.
            semaphore: Semaphore limiting the number of concurrent requests.

        Returns:
            bool: True if the LLM judged the descriptions to be duplicates.
            Any failure is logged and reported as False (best effort).
        """
        async with semaphore:
            prompt = self._build_prompt(desc1, desc2)

            try:
                response = await self.client.post(
                    f"{self.base_url}/api/generate",
                    json={
                        "model": model,
                        "prompt": prompt,
                        "stream": False,
                        "options": {
                            "temperature": 0.1,  # low temperature for consistent verdicts
                            "num_predict": 10,  # only a short answer is needed
                        }
                    }
                )
                response.raise_for_status()
                result = response.json()["response"].strip().upper()
                is_similar = self._parse_verdict(result)
                logger.debug(f"LLM comparison: '{desc1[:30]}...' vs '{desc2[:30]}...' -> {result} ({is_similar})")
                return is_similar
            except Exception as e:
                logger.error(f"LLM comparison failed: {e}")
                # Deliberate best effort: a failed comparison counts as "not similar"
                return False

    async def compare_batch(
        self,
        pairs: List[Tuple[int, int, str, str]],
        model: str
    ) -> List[Tuple[int, int, bool]]:
        """
        Compare many description pairs concurrently.

        Args:
            pairs: Pairs to compare, as [(i, j, desc1, desc2), ...].
            model: LLM model name.

        Returns:
            Comparison results as [(i, j, is_similar), ...], in input order.
        """
        semaphore = asyncio.Semaphore(self.max_concurrent)

        async def compare_one(pair: Tuple[int, int, str, str]) -> Tuple[int, int, bool]:
            i, j, desc1, desc2 = pair
            is_similar = await self.compare_pair(desc1, desc2, model, semaphore)
            return (i, j, is_similar)

        # All comparisons are scheduled at once; the semaphore bounds how
        # many run at the same time.
        results = await asyncio.gather(*[compare_one(p) for p in pairs])
        return list(results)

    def cluster_by_similarity(
        self,
        similarity_matrix: np.ndarray,
        threshold: float
    ) -> List[List[int]]:
        """
        Greedy clustering: group items whose similarity is >= threshold.

        Each ungrouped item opens a new group as its representative and
        takes every later ungrouped item whose similarity to it reaches the
        threshold.

        Returns:
            One list of item indices per group, representative first.
        """
        n = len(similarity_matrix)
        assigned = [False] * n
        groups = []

        for i in range(n):
            if assigned[i]:
                continue

            # Open a new group with item i as its representative
            group = [i]
            assigned[i] = True

            # Members are compared with the representative only
            for j in range(i + 1, n):
                if not assigned[j] and similarity_matrix[i][j] >= threshold:
                    group.append(j)
                    assigned[j] = True

            groups.append(group)

        return groups

    async def deduplicate(
        self,
        descriptions: List[ExpertTransformationDescription],
        model: Optional[str] = None
    ) -> DeduplicationResult:
        """
        Deduplicate descriptions using pairwise LLM comparison.

        Args:
            descriptions: Descriptions to deduplicate.
            model: LLM model name; falls back to ``default_model``.

        Returns:
            DeduplicationResult: The groups found. ``threshold_used`` is
            always 0.5 and every similarity score is 1.0, because the LLM
            verdict is binary.
        """
        model = model or self.default_model

        # Nothing to compare
        if not descriptions:
            return DeduplicationResult(
                total_input=0,
                total_groups=0,
                total_duplicates=0,
                groups=[],
                threshold_used=0.5,  # the LLM method always uses 0.5
                method_used=DeduplicationMethod.LLM,
                model_used=model
            )

        n = len(descriptions)
        # Identity matrix: every item is similar to itself
        similarity_matrix = np.eye(n)

        # Every unordered pair (i < j) is compared once
        pairs = [
            (i, j, descriptions[i].description, descriptions[j].description)
            for i in range(n)
            for j in range(i + 1, n)
        ]

        total_pairs = len(pairs)
        logger.info(f"LLM deduplication: {total_pairs} pairs to compare (parallel={self.max_concurrent}, model={model})")

        results = await self.compare_batch(pairs, model)

        # Record each verdict symmetrically as 1.0 / 0.0
        for i, j, is_similar in results:
            similarity_value = 1.0 if is_similar else 0.0
            similarity_matrix[i][j] = similarity_value
            similarity_matrix[j][i] = similarity_value

        # Values are only 0 or 1, so 0.5 separates them
        logger.info("Clustering results...")
        clusters = self.cluster_by_similarity(similarity_matrix, 0.5)

        result_groups = []
        total_duplicates = 0

        for group_idx, indices in enumerate(clusters):
            # The first index is the representative; a standalone item has
            # no further indices and yields empty duplicate and score lists.
            rep_idx, *dup_indices = indices
            result_groups.append(DescriptionGroup(
                group_id=f"group-{group_idx}",
                representative=descriptions[rep_idx],
                duplicates=[descriptions[idx] for idx in dup_indices],
                # A YES/NO verdict carries no graded score
                similarity_scores=[1.0 for _ in dup_indices]
            ))
            total_duplicates += len(dup_indices)

        logger.info(f"LLM deduplication complete: {n} -> {len(result_groups)} groups, {total_duplicates} duplicates found")

        return DeduplicationResult(
            total_input=n,
            total_groups=len(result_groups),
            total_duplicates=total_duplicates,
            groups=result_groups,
            threshold_used=0.5,  # the LLM method always uses 0.5
            method_used=DeduplicationMethod.LLM,
            model_used=model
        )

    async def close(self):
        """Close the HTTP client."""
        await self.client.aclose()
|
||||||
|
|
||||||
|
|
||||||
|
# 全域實例
|
||||||
|
llm_deduplication_service = LLMDeduplicationService()
|
||||||
@@ -4,3 +4,4 @@ httpx>=0.26.0
|
|||||||
pydantic>=2.5.0
|
pydantic>=2.5.0
|
||||||
pydantic-settings>=2.1.0
|
pydantic-settings>=2.1.0
|
||||||
python-dotenv>=1.0.0
|
python-dotenv>=1.0.0
|
||||||
|
numpy>=1.26.0
|
||||||
|
|||||||
@@ -1,16 +1,17 @@
|
|||||||
import { useState, useRef, useCallback, useEffect } from 'react';
|
import { useState, useRef, useCallback, useEffect } from 'react';
|
||||||
import { ConfigProvider, Layout, theme, Typography, Space, Tabs } from 'antd';
|
import { ConfigProvider, Layout, theme, Typography, Space, Tabs, Slider, Radio } from 'antd';
|
||||||
import { ApartmentOutlined, ThunderboltOutlined } from '@ant-design/icons';
|
import { ApartmentOutlined, ThunderboltOutlined, FilterOutlined } from '@ant-design/icons';
|
||||||
import { ThemeToggle } from './components/ThemeToggle';
|
import { ThemeToggle } from './components/ThemeToggle';
|
||||||
import { InputPanel } from './components/InputPanel';
|
import { InputPanel } from './components/InputPanel';
|
||||||
import { TransformationInputPanel } from './components/TransformationInputPanel';
|
import { TransformationInputPanel } from './components/TransformationInputPanel';
|
||||||
import { MindmapPanel } from './components/MindmapPanel';
|
import { MindmapPanel } from './components/MindmapPanel';
|
||||||
import { TransformationPanel } from './components/TransformationPanel';
|
import { TransformationPanel } from './components/TransformationPanel';
|
||||||
|
import { DeduplicationPanel } from './components/DeduplicationPanel';
|
||||||
import { useAttribute } from './hooks/useAttribute';
|
import { useAttribute } from './hooks/useAttribute';
|
||||||
import { getModels } from './services/api';
|
import { getModels } from './services/api';
|
||||||
import type { MindmapDAGRef } from './components/MindmapDAG';
|
import type { MindmapDAGRef } from './components/MindmapDAG';
|
||||||
import type { TransformationDAGRef } from './components/TransformationDAG';
|
import type { TransformationDAGRef } from './components/TransformationDAG';
|
||||||
import type { CategoryMode, ExpertSource } from './types';
|
import type { CategoryMode, ExpertSource, ExpertTransformationDAGResult, DeduplicationMethod } from './types';
|
||||||
|
|
||||||
const { Header, Sider, Content } = Layout;
|
const { Header, Sider, Content } = Layout;
|
||||||
const { Title } = Typography;
|
const { Title } = Typography;
|
||||||
@@ -45,8 +46,14 @@ function App() {
|
|||||||
});
|
});
|
||||||
const [customExpertsInput, setCustomExpertsInput] = useState('');
|
const [customExpertsInput, setCustomExpertsInput] = useState('');
|
||||||
const [expertSource, setExpertSource] = useState<ExpertSource>('llm');
|
const [expertSource, setExpertSource] = useState<ExpertSource>('llm');
|
||||||
|
const [expertLanguage, setExpertLanguage] = useState<'en' | 'zh'>('en');
|
||||||
const [shouldStartTransform, setShouldStartTransform] = useState(false);
|
const [shouldStartTransform, setShouldStartTransform] = useState(false);
|
||||||
const [transformLoading, setTransformLoading] = useState(false);
|
const [transformLoading, setTransformLoading] = useState(false);
|
||||||
|
const [transformationResult, setTransformationResult] = useState<ExpertTransformationDAGResult | null>(null);
|
||||||
|
|
||||||
|
// Deduplication settings
|
||||||
|
const [deduplicationThreshold, setDeduplicationThreshold] = useState(0.85);
|
||||||
|
const [deduplicationMethod, setDeduplicationMethod] = useState<DeduplicationMethod>('embedding');
|
||||||
|
|
||||||
// Available models from API
|
// Available models from API
|
||||||
const [availableModels, setAvailableModels] = useState<string[]>([]);
|
const [availableModels, setAvailableModels] = useState<string[]>([]);
|
||||||
@@ -188,9 +195,32 @@ function App() {
|
|||||||
temperature={transformTemperature}
|
temperature={transformTemperature}
|
||||||
expertConfig={expertConfig}
|
expertConfig={expertConfig}
|
||||||
expertSource={expertSource}
|
expertSource={expertSource}
|
||||||
|
expertLanguage={expertLanguage}
|
||||||
shouldStartTransform={shouldStartTransform}
|
shouldStartTransform={shouldStartTransform}
|
||||||
onTransformComplete={() => setShouldStartTransform(false)}
|
onTransformComplete={() => setShouldStartTransform(false)}
|
||||||
onLoadingChange={setTransformLoading}
|
onLoadingChange={setTransformLoading}
|
||||||
|
onResultsChange={setTransformationResult}
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
key: 'deduplication',
|
||||||
|
label: (
|
||||||
|
<span>
|
||||||
|
<FilterOutlined style={{ marginRight: 8 }} />
|
||||||
|
Deduplication
|
||||||
|
</span>
|
||||||
|
),
|
||||||
|
children: (
|
||||||
|
<div style={{ height: 'calc(100vh - 140px)' }}>
|
||||||
|
<DeduplicationPanel
|
||||||
|
transformationResult={transformationResult}
|
||||||
|
isDark={isDark}
|
||||||
|
threshold={deduplicationThreshold}
|
||||||
|
onThresholdChange={setDeduplicationThreshold}
|
||||||
|
method={deduplicationMethod}
|
||||||
|
onMethodChange={setDeduplicationMethod}
|
||||||
/>
|
/>
|
||||||
</div>
|
</div>
|
||||||
),
|
),
|
||||||
@@ -206,7 +236,7 @@ function App() {
|
|||||||
overflow: 'auto',
|
overflow: 'auto',
|
||||||
}}
|
}}
|
||||||
>
|
>
|
||||||
{activeTab === 'attribute' ? (
|
{activeTab === 'attribute' && (
|
||||||
<InputPanel
|
<InputPanel
|
||||||
loading={loading}
|
loading={loading}
|
||||||
progress={progress}
|
progress={progress}
|
||||||
@@ -218,7 +248,8 @@ function App() {
|
|||||||
visualSettings={visualSettings}
|
visualSettings={visualSettings}
|
||||||
onVisualSettingsChange={setVisualSettings}
|
onVisualSettingsChange={setVisualSettings}
|
||||||
/>
|
/>
|
||||||
) : (
|
)}
|
||||||
|
{activeTab === 'transformation' && (
|
||||||
<TransformationInputPanel
|
<TransformationInputPanel
|
||||||
onTransform={handleTransform}
|
onTransform={handleTransform}
|
||||||
loading={transformLoading}
|
loading={transformLoading}
|
||||||
@@ -229,14 +260,85 @@ function App() {
|
|||||||
expertConfig={expertConfig}
|
expertConfig={expertConfig}
|
||||||
customExpertsInput={customExpertsInput}
|
customExpertsInput={customExpertsInput}
|
||||||
expertSource={expertSource}
|
expertSource={expertSource}
|
||||||
|
expertLanguage={expertLanguage}
|
||||||
onModelChange={setTransformModel}
|
onModelChange={setTransformModel}
|
||||||
onTemperatureChange={setTransformTemperature}
|
onTemperatureChange={setTransformTemperature}
|
||||||
onExpertConfigChange={setExpertConfig}
|
onExpertConfigChange={setExpertConfig}
|
||||||
onCustomExpertsInputChange={setCustomExpertsInput}
|
onCustomExpertsInputChange={setCustomExpertsInput}
|
||||||
onExpertSourceChange={setExpertSource}
|
onExpertSourceChange={setExpertSource}
|
||||||
|
onExpertLanguageChange={setExpertLanguage}
|
||||||
availableModels={availableModels}
|
availableModels={availableModels}
|
||||||
/>
|
/>
|
||||||
)}
|
)}
|
||||||
|
{activeTab === 'deduplication' && (
|
||||||
|
<div style={{ padding: 16 }}>
|
||||||
|
<Typography.Title level={5} style={{ marginBottom: 16 }}>
|
||||||
|
<FilterOutlined style={{ marginRight: 8 }} />
|
||||||
|
Deduplication Settings
|
||||||
|
</Typography.Title>
|
||||||
|
|
||||||
|
{/* Method Selection */}
|
||||||
|
<div style={{ marginBottom: 20 }}>
|
||||||
|
<Typography.Text strong style={{ display: 'block', marginBottom: 8 }}>
|
||||||
|
Method
|
||||||
|
</Typography.Text>
|
||||||
|
<Radio.Group
|
||||||
|
value={deduplicationMethod}
|
||||||
|
onChange={(e) => setDeduplicationMethod(e.target.value)}
|
||||||
|
buttonStyle="solid"
|
||||||
|
style={{ width: '100%' }}
|
||||||
|
>
|
||||||
|
<Radio.Button value="embedding" style={{ width: '50%', textAlign: 'center' }}>
|
||||||
|
Embedding
|
||||||
|
</Radio.Button>
|
||||||
|
<Radio.Button value="llm" style={{ width: '50%', textAlign: 'center' }}>
|
||||||
|
LLM Judge
|
||||||
|
</Radio.Button>
|
||||||
|
</Radio.Group>
|
||||||
|
<Typography.Text type="secondary" style={{ display: 'block', marginTop: 8, fontSize: 12 }}>
|
||||||
|
{deduplicationMethod === 'embedding'
|
||||||
|
? 'Fast vector similarity comparison'
|
||||||
|
: 'Accurate but slower pairwise LLM comparison'}
|
||||||
|
</Typography.Text>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{/* Threshold Slider - Only for Embedding method */}
|
||||||
|
{deduplicationMethod === 'embedding' && (
|
||||||
|
<div style={{ marginBottom: 20 }}>
|
||||||
|
<Typography.Text strong style={{ display: 'block', marginBottom: 8 }}>
|
||||||
|
Similarity Threshold
|
||||||
|
</Typography.Text>
|
||||||
|
<Typography.Text type="secondary" style={{ display: 'block', marginBottom: 12, fontSize: 12 }}>
|
||||||
|
Higher = stricter matching, fewer groups
|
||||||
|
</Typography.Text>
|
||||||
|
<Slider
|
||||||
|
min={0.5}
|
||||||
|
max={1.0}
|
||||||
|
step={0.05}
|
||||||
|
value={deduplicationThreshold}
|
||||||
|
onChange={setDeduplicationThreshold}
|
||||||
|
marks={{
|
||||||
|
0.5: '50%',
|
||||||
|
0.7: '70%',
|
||||||
|
0.85: '85%',
|
||||||
|
1.0: '100%',
|
||||||
|
}}
|
||||||
|
tooltip={{ formatter: (val) => `${((val ?? 0) * 100).toFixed(0)}%` }}
|
||||||
|
/>
|
||||||
|
<Typography.Text type="secondary" style={{ fontSize: 12 }}>
|
||||||
|
Current: {(deduplicationThreshold * 100).toFixed(0)}% similarity required
|
||||||
|
</Typography.Text>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
|
{/* LLM Warning */}
|
||||||
|
{deduplicationMethod === 'llm' && (
|
||||||
|
<Typography.Text type="warning" style={{ display: 'block', fontSize: 12 }}>
|
||||||
|
Note: LLM method requires N*(N-1)/2 comparisons. May take longer for many descriptions.
|
||||||
|
</Typography.Text>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
</Sider>
|
</Sider>
|
||||||
</Layout>
|
</Layout>
|
||||||
</Layout>
|
</Layout>
|
||||||
|
|||||||
271
frontend/src/components/DeduplicationPanel.tsx
Normal file
271
frontend/src/components/DeduplicationPanel.tsx
Normal file
@@ -0,0 +1,271 @@
|
|||||||
|
import React, { useEffect, useMemo } from 'react';
|
||||||
|
import {
|
||||||
|
Card,
|
||||||
|
Button,
|
||||||
|
Slider,
|
||||||
|
Statistic,
|
||||||
|
Row,
|
||||||
|
Col,
|
||||||
|
Empty,
|
||||||
|
Spin,
|
||||||
|
Alert,
|
||||||
|
Typography,
|
||||||
|
Space,
|
||||||
|
Divider,
|
||||||
|
} from 'antd';
|
||||||
|
import {
|
||||||
|
FilterOutlined,
|
||||||
|
ReloadOutlined,
|
||||||
|
CheckCircleOutlined,
|
||||||
|
ClusterOutlined,
|
||||||
|
CopyOutlined,
|
||||||
|
} from '@ant-design/icons';
|
||||||
|
import { useDeduplication } from '../hooks/useDeduplication';
|
||||||
|
import { GroupCard } from './deduplication/GroupCard';
|
||||||
|
import type {
|
||||||
|
ExpertTransformationDAGResult,
|
||||||
|
ExpertTransformationDescription,
|
||||||
|
DeduplicationMethod,
|
||||||
|
} from '../types';
|
||||||
|
|
||||||
|
const { Title, Text } = Typography;
|
||||||
|
|
||||||
|
/** Props accepted by {@link DeduplicationPanel}. */
interface DeduplicationPanelProps {
  /** Transformation output whose descriptions are deduplicated; `null` renders the "no data" placeholder. */
  transformationResult: ExpertTransformationDAGResult | null;
  /** Selects the dark colour palette for the inline styles. */
  isDark: boolean;
  /** Similarity threshold forwarded to `deduplicate` (the slider spans 0.5 - 1.0). */
  threshold: number;
  /** Called with the new value whenever the threshold slider moves. */
  onThresholdChange: (value: number) => void;
  /** Deduplication method forwarded to `deduplicate`. */
  method: DeduplicationMethod;
  /** Optional and never invoked by this panel; method selection is handled in the App.tsx sidebar. */
  onMethodChange?: (method: DeduplicationMethod) => void;
}

/**
 * Panel for deduplicating transformation descriptions.
 *
 * Flattens every category's descriptions out of `transformationResult`, lets the
 * user trigger deduplication through the `useDeduplication` hook, and renders the
 * summary statistics, loading/error feedback and one `GroupCard` per group.
 *
 * The similarity threshold is only meaningful for the `'embedding'` method, so the
 * threshold slider is disabled (and labelled accordingly) for any other method,
 * matching the sidebar, which hides its own slider in that case.
 */
export const DeduplicationPanel: React.FC<DeduplicationPanelProps> = ({
  transformationResult,
  isDark,
  threshold,
  onThresholdChange,
  method,
  // onMethodChange is handled in App.tsx sidebar
}) => {
  const { loading, result, error, progress, deduplicate, clearResult } = useDeduplication();

  // Extract all descriptions from the transformation result (one flat list across categories).
  const allDescriptions = useMemo<ExpertTransformationDescription[]>(() => {
    if (!transformationResult) return [];

    const descriptions: ExpertTransformationDescription[] = [];
    for (const categoryResult of transformationResult.results) {
      descriptions.push(...categoryResult.descriptions);
    }
    return descriptions;
  }, [transformationResult]);

  // Clear the previous result whenever the input data or the method changes,
  // so the panel never shows groups computed from different inputs.
  useEffect(() => {
    clearResult();
  }, [transformationResult, method, clearResult]);

  // The threshold only influences the embedding method.
  const thresholdApplies = method === 'embedding';

  const handleDeduplicate = () => {
    if (allDescriptions.length > 0) {
      deduplicate(allDescriptions, threshold, method);
    }
  };

  const containerStyle: React.CSSProperties = {
    height: '100%',
    display: 'flex',
    flexDirection: 'column',
    padding: 16,
    overflow: 'hidden',
  };

  const headerCardStyle: React.CSSProperties = {
    marginBottom: 16,
    background: isDark ? '#1f1f1f' : '#fff',
    borderRadius: 8,
  };

  const resultsContainerStyle: React.CSSProperties = {
    flex: 1,
    overflow: 'auto',
    paddingRight: 8,
  };

  // No transformation data
  if (!transformationResult) {
    return (
      <div style={{ ...containerStyle, justifyContent: 'center', alignItems: 'center' }}>
        <Empty
          description={
            <Space direction="vertical" size={4}>
              <Text style={{ color: isDark ? '#999' : '#666' }}>
                No transformation data available
              </Text>
              <Text type="secondary" style={{ fontSize: 12 }}>
                Please run the Transformation Agent first
              </Text>
            </Space>
          }
        />
      </div>
    );
  }

  // No descriptions found
  if (allDescriptions.length === 0) {
    return (
      <div style={{ ...containerStyle, justifyContent: 'center', alignItems: 'center' }}>
        <Empty description="No descriptions found in transformation result" />
      </div>
    );
  }

  return (
    <div style={containerStyle}>
      {/* Header Card with Controls */}
      <Card size="small" style={headerCardStyle}>
        <Row gutter={[16, 16]} align="middle">
          <Col span={6}>
            <Statistic
              title="Total Descriptions"
              value={allDescriptions.length}
              prefix={<FilterOutlined />}
            />
          </Col>
          <Col span={6}>
            <Statistic
              title="Unique Groups"
              value={result?.total_groups ?? '-'}
              prefix={<ClusterOutlined />}
              valueStyle={{ color: result ? '#52c41a' : undefined }}
            />
          </Col>
          <Col span={6}>
            <Statistic
              title="Duplicates Found"
              value={result?.total_duplicates ?? '-'}
              prefix={<CopyOutlined />}
              valueStyle={{ color: result?.total_duplicates ? '#fa8c16' : undefined }}
            />
          </Col>
          <Col span={6}>
            <Space direction="vertical" size={4} style={{ width: '100%' }}>
              <Text type="secondary" style={{ fontSize: 12 }}>
                {thresholdApplies
                  ? `Similarity Threshold: ${(threshold * 100).toFixed(0)}%`
                  : 'Similarity Threshold: not used by LLM method'}
              </Text>
              <Slider
                min={0.5}
                max={1.0}
                step={0.05}
                value={threshold}
                onChange={onThresholdChange}
                // Disabled while a request runs, and whenever the method ignores the threshold.
                disabled={loading || !thresholdApplies}
                tooltip={{ formatter: (val) => `${((val ?? 0) * 100).toFixed(0)}%` }}
              />
            </Space>
          </Col>
        </Row>

        <Divider style={{ margin: '12px 0' }} />

        <Row justify="space-between" align="middle">
          <Col>
            <Text type="secondary" style={{ fontSize: 12 }}>
              {progress.message || 'Ready to analyze'}
            </Text>
          </Col>
          <Col>
            <Space>
              {result && (
                <Button
                  icon={<ReloadOutlined />}
                  onClick={clearResult}
                  disabled={loading}
                >
                  Clear
                </Button>
              )}
              <Button
                type="primary"
                icon={<CheckCircleOutlined />}
                onClick={handleDeduplicate}
                loading={loading}
              >
                {loading ? 'Processing...' : 'Deduplicate'}
              </Button>
            </Space>
          </Col>
        </Row>
      </Card>

      {/* Error Alert */}
      {error && (
        <Alert
          message="Deduplication Error"
          description={error}
          type="error"
          showIcon
          closable
          style={{ marginBottom: 16 }}
        />
      )}

      {/* Loading State */}
      {loading && (
        <div style={{ textAlign: 'center', padding: 40 }}>
          <Spin size="large" />
          <div style={{ marginTop: 16 }}>
            <Text type="secondary">{progress.message}</Text>
          </div>
        </div>
      )}

      {/* Results */}
      {!loading && result && (
        <div style={resultsContainerStyle}>
          <Title level={5} style={{ marginBottom: 16, color: isDark ? '#fff' : '#000' }}>
            <ClusterOutlined style={{ marginRight: 8 }} />
            {result.total_groups} Groups
            {result.total_duplicates > 0 && (
              <Text type="secondary" style={{ fontSize: 14, fontWeight: 'normal', marginLeft: 8 }}>
                ({result.total_duplicates} duplicates removed)
              </Text>
            )}
          </Title>

          {result.groups.map((group, index) => (
            <GroupCard
              key={group.group_id}
              group={group}
              isDark={isDark}
              index={index}
            />
          ))}

          {result.total_groups === 0 && (
            <Empty description="No groups found" />
          )}
        </div>
      )}

      {/* Initial State - show prompt */}
      {!loading && !result && !error && (
        <div style={{ textAlign: 'center', padding: 40 }}>
          <FilterOutlined style={{ fontSize: 48, color: '#1890ff', marginBottom: 16 }} />
          <Title level={4} style={{ color: isDark ? '#fff' : '#000' }}>
            Ready to Deduplicate
          </Title>
          <Text type="secondary">
            Click the "Deduplicate" button to analyze {allDescriptions.length} descriptions
            and group similar ones together.
          </Text>
        </div>
      )}
    </div>
  );
};
|
||||||
|
|
||||||
|
export default DeduplicationPanel;
|
||||||
@@ -12,6 +12,11 @@ const EXPERT_SOURCE_OPTIONS = [
|
|||||||
{ label: 'Wikidata', value: 'wikidata' as ExpertSource, description: '從 Wikidata 查詢職業 (需等待 API)' },
|
{ label: 'Wikidata', value: 'wikidata' as ExpertSource, description: '從 Wikidata 查詢職業 (需等待 API)' },
|
||||||
];
|
];
|
||||||
|
|
||||||
|
const EXPERT_LANGUAGE_OPTIONS = [
|
||||||
|
{ label: 'English', value: 'en' as const },
|
||||||
|
{ label: '中文', value: 'zh' as const },
|
||||||
|
];
|
||||||
|
|
||||||
interface TransformationInputPanelProps {
|
interface TransformationInputPanelProps {
|
||||||
onTransform: () => void;
|
onTransform: () => void;
|
||||||
loading: boolean;
|
loading: boolean;
|
||||||
@@ -26,6 +31,7 @@ interface TransformationInputPanelProps {
|
|||||||
};
|
};
|
||||||
customExpertsInput: string;
|
customExpertsInput: string;
|
||||||
expertSource: ExpertSource;
|
expertSource: ExpertSource;
|
||||||
|
expertLanguage: 'en' | 'zh';
|
||||||
onModelChange: (model: string) => void;
|
onModelChange: (model: string) => void;
|
||||||
onTemperatureChange: (temperature: number) => void;
|
onTemperatureChange: (temperature: number) => void;
|
||||||
onExpertConfigChange: (config: {
|
onExpertConfigChange: (config: {
|
||||||
@@ -35,6 +41,7 @@ interface TransformationInputPanelProps {
|
|||||||
}) => void;
|
}) => void;
|
||||||
onCustomExpertsInputChange: (value: string) => void;
|
onCustomExpertsInputChange: (value: string) => void;
|
||||||
onExpertSourceChange: (source: ExpertSource) => void;
|
onExpertSourceChange: (source: ExpertSource) => void;
|
||||||
|
onExpertLanguageChange: (language: 'en' | 'zh') => void;
|
||||||
availableModels: string[];
|
availableModels: string[];
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -48,11 +55,13 @@ export const TransformationInputPanel: React.FC<TransformationInputPanelProps> =
|
|||||||
expertConfig,
|
expertConfig,
|
||||||
customExpertsInput,
|
customExpertsInput,
|
||||||
expertSource,
|
expertSource,
|
||||||
|
expertLanguage,
|
||||||
onModelChange,
|
onModelChange,
|
||||||
onTemperatureChange,
|
onTemperatureChange,
|
||||||
onExpertConfigChange,
|
onExpertConfigChange,
|
||||||
onCustomExpertsInputChange,
|
onCustomExpertsInputChange,
|
||||||
onExpertSourceChange,
|
onExpertSourceChange,
|
||||||
|
onExpertLanguageChange,
|
||||||
availableModels,
|
availableModels,
|
||||||
}) => {
|
}) => {
|
||||||
return (
|
return (
|
||||||
@@ -142,6 +151,19 @@ export const TransformationInputPanel: React.FC<TransformationInputPanelProps> =
|
|||||||
<Text type="secondary" style={{ fontSize: 11 }}>
|
<Text type="secondary" style={{ fontSize: 11 }}>
|
||||||
{EXPERT_SOURCE_OPTIONS.find((opt) => opt.value === expertSource)?.description}
|
{EXPERT_SOURCE_OPTIONS.find((opt) => opt.value === expertSource)?.description}
|
||||||
</Text>
|
</Text>
|
||||||
|
|
||||||
|
{/* Language selector - only for curated source */}
|
||||||
|
{expertSource === 'curated' && (
|
||||||
|
<div style={{ marginTop: 8 }}>
|
||||||
|
<Text style={{ fontSize: 12 }}>職業名稱語言</Text>
|
||||||
|
<Select
|
||||||
|
value={expertLanguage}
|
||||||
|
onChange={onExpertLanguageChange}
|
||||||
|
style={{ width: '100%', marginTop: 4 }}
|
||||||
|
options={EXPERT_LANGUAGE_OPTIONS}
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
</Space>
|
</Space>
|
||||||
</Card>
|
</Card>
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import { forwardRef, useMemo, useCallback, useEffect } from 'react';
|
import { forwardRef, useMemo, useCallback, useEffect } from 'react';
|
||||||
import { Empty, Spin, Button, Progress, Card, Space, Typography, Tag } from 'antd';
|
import { Empty, Spin, Button, Progress, Card, Space, Typography, Tag } from 'antd';
|
||||||
import { ReloadOutlined } from '@ant-design/icons';
|
import { ReloadOutlined } from '@ant-design/icons';
|
||||||
import type { AttributeDAG, ExpertTransformationInput, ExpertSource } from '../types';
|
import type { AttributeDAG, ExpertTransformationInput, ExpertSource, ExpertTransformationDAGResult } from '../types';
|
||||||
import { TransformationDAG } from './TransformationDAG';
|
import { TransformationDAG } from './TransformationDAG';
|
||||||
import type { TransformationDAGRef } from './TransformationDAG';
|
import type { TransformationDAGRef } from './TransformationDAG';
|
||||||
import { useExpertTransformation } from '../hooks/useExpertTransformation';
|
import { useExpertTransformation } from '../hooks/useExpertTransformation';
|
||||||
@@ -19,26 +19,33 @@ interface TransformationPanelProps {
|
|||||||
custom_experts?: string[];
|
custom_experts?: string[];
|
||||||
};
|
};
|
||||||
expertSource: ExpertSource;
|
expertSource: ExpertSource;
|
||||||
|
expertLanguage: 'en' | 'zh';
|
||||||
shouldStartTransform: boolean;
|
shouldStartTransform: boolean;
|
||||||
onTransformComplete: () => void;
|
onTransformComplete: () => void;
|
||||||
onLoadingChange: (loading: boolean) => void;
|
onLoadingChange: (loading: boolean) => void;
|
||||||
|
onResultsChange?: (results: ExpertTransformationDAGResult | null) => void;
|
||||||
}
|
}
|
||||||
|
|
||||||
export const TransformationPanel = forwardRef<TransformationDAGRef, TransformationPanelProps>(
|
export const TransformationPanel = forwardRef<TransformationDAGRef, TransformationPanelProps>(
|
||||||
({ attributeData, isDark, model, temperature, expertConfig, expertSource, shouldStartTransform, onTransformComplete, onLoadingChange }, ref) => {
|
({ attributeData, isDark, model, temperature, expertConfig, expertSource, expertLanguage, shouldStartTransform, onTransformComplete, onLoadingChange, onResultsChange }, ref) => {
|
||||||
const {
|
const {
|
||||||
loading,
|
loading,
|
||||||
progress,
|
progress,
|
||||||
results,
|
results,
|
||||||
transformAll,
|
transformAll,
|
||||||
clearResults,
|
clearResults,
|
||||||
} = useExpertTransformation({ model, temperature, expertSource });
|
} = useExpertTransformation({ model, temperature, expertSource, expertLanguage });
|
||||||
|
|
||||||
// Notify parent of loading state changes
|
// Notify parent of loading state changes
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
onLoadingChange(loading);
|
onLoadingChange(loading);
|
||||||
}, [loading, onLoadingChange]);
|
}, [loading, onLoadingChange]);
|
||||||
|
|
||||||
|
// Notify parent of results changes
|
||||||
|
useEffect(() => {
|
||||||
|
onResultsChange?.(results);
|
||||||
|
}, [results, onResultsChange]);
|
||||||
|
|
||||||
// Build expert transformation input from attribute data
|
// Build expert transformation input from attribute data
|
||||||
const transformationInput = useMemo((): ExpertTransformationInput | null => {
|
const transformationInput = useMemo((): ExpertTransformationInput | null => {
|
||||||
if (!attributeData) return null;
|
if (!attributeData) return null;
|
||||||
|
|||||||
147
frontend/src/components/deduplication/GroupCard.tsx
Normal file
147
frontend/src/components/deduplication/GroupCard.tsx
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
import React, { useState } from 'react';
|
||||||
|
import { Card, Tag, Collapse, Typography, Space, Badge } from 'antd';
|
||||||
|
import { StarFilled, CopyOutlined, UserOutlined } from '@ant-design/icons';
|
||||||
|
import type { DescriptionGroup } from '../../types';
|
||||||
|
|
||||||
|
const { Text, Paragraph } = Typography;
|
||||||
|
const { Panel } = Collapse;
|
||||||
|
|
||||||
|
/** Props accepted by {@link GroupCard}. */
interface GroupCardProps {
  /** The group to render: one representative plus its duplicates and their scores. */
  group: DescriptionGroup;
  /** Selects the dark colour palette for the inline styles. */
  isDark: boolean;
  /** Zero-based position of the group in the list; shown one-based in the badge. */
  index: number;
}

/**
 * Card component for displaying a group of similar descriptions.
 *
 * The representative description is always visible; the duplicates, if any,
 * sit in a collapsible section that starts collapsed. Each duplicate shows its
 * keyword, expert name and, when a numeric score is available at the same
 * index of `group.similarity_scores`, its similarity percentage.
 */
export const GroupCard: React.FC<GroupCardProps> = ({ group, isDark, index }) => {
  const [expanded, setExpanded] = useState(false);
  const hasDuplicates = group.duplicates.length > 0;
  const ordinal = index + 1;

  const cardStyle: React.CSSProperties = {
    marginBottom: 16,
    borderRadius: 8,
    border: isDark ? '1px solid #303030' : '1px solid #f0f0f0',
    background: isDark ? '#1f1f1f' : '#fff',
  };

  const representativeStyle: React.CSSProperties = {
    background: isDark
      ? 'linear-gradient(135deg, #1a472a 0%, #2d5a3d 100%)'
      : 'linear-gradient(135deg, #f6ffed 0%, #d9f7be 100%)',
    padding: 12,
    borderRadius: 6,
    marginBottom: hasDuplicates ? 12 : 0,
  };

  const duplicateItemStyle: React.CSSProperties = {
    background: isDark ? '#2a2a2a' : '#fafafa',
    padding: 10,
    borderRadius: 4,
    marginBottom: 8,
    borderLeft: `3px solid ${isDark ? '#faad14' : '#fa8c16'}`,
  };

  return (
    <Card
      size="small"
      style={cardStyle}
      title={
        <Space>
          <Badge
            count={ordinal}
            // Badge caps the displayed count at overflowCount (default 99) and
            // would show "99+" for every later group; raising the cap to the
            // ordinal itself keeps the exact number visible.
            overflowCount={ordinal}
            style={{
              backgroundColor: hasDuplicates ? '#52c41a' : '#1890ff',
            }}
          />
          <Text strong style={{ color: isDark ? '#fff' : '#000' }}>
            {group.representative.keyword}
          </Text>
          {hasDuplicates && (
            <Tag color="orange" icon={<CopyOutlined />}>
              {group.duplicates.length} similar
            </Tag>
          )}
        </Space>
      }
      extra={
        <Tag color={isDark ? 'geekblue' : 'blue'}>
          <UserOutlined style={{ marginRight: 4 }} />
          {group.representative.expert_name}
        </Tag>
      }
    >
      {/* Representative description */}
      <div style={representativeStyle}>
        <Space direction="vertical" size={4} style={{ width: '100%' }}>
          <Space>
            <StarFilled style={{ color: '#52c41a' }} />
            <Text type="secondary" style={{ fontSize: 12 }}>
              Representative
            </Text>
          </Space>
          <Paragraph
            style={{
              margin: 0,
              color: isDark ? '#e0e0e0' : '#333',
              fontSize: 14,
            }}
          >
            {group.representative.description}
          </Paragraph>
        </Space>
      </div>

      {/* Duplicates section */}
      {hasDuplicates && (
        <Collapse
          ghost
          activeKey={expanded ? ['duplicates'] : []}
          // Functional update so rapid toggles never act on a stale value.
          onChange={() => setExpanded((previous) => !previous)}
          style={{ marginTop: 8 }}
        >
          <Panel
            key="duplicates"
            header={
              <Text type="secondary" style={{ fontSize: 12 }}>
                View {group.duplicates.length} similar description(s)
              </Text>
            }
            style={{ padding: 0 }}
          >
            <Space direction="vertical" size={0} style={{ width: '100%' }}>
              {group.duplicates.map((dup, dupIndex) => {
                // Scores are matched to duplicates by position; a missing or
                // non-numeric entry is omitted rather than rendered as "NaN%".
                const score = group.similarity_scores?.[dupIndex];
                const hasScore = typeof score === 'number' && Number.isFinite(score);

                return (
                  <div key={`${dup.expert_id}-${dupIndex}`} style={duplicateItemStyle}>
                    <Space direction="vertical" size={2} style={{ width: '100%' }}>
                      <Space size="small">
                        <Tag color="default" style={{ fontSize: 11 }}>
                          {dup.keyword}
                        </Tag>
                        <Tag color="cyan" style={{ fontSize: 11 }}>
                          {dup.expert_name}
                        </Tag>
                        {hasScore && (
                          <Tag color="orange" style={{ fontSize: 11 }}>
                            {(score * 100).toFixed(0)}% similar
                          </Tag>
                        )}
                      </Space>
                      <Text
                        style={{
                          fontSize: 13,
                          color: isDark ? '#b0b0b0' : '#666',
                        }}
                      >
                        {dup.description}
                      </Text>
                    </Space>
                  </div>
                );
              })}
            </Space>
          </Panel>
        </Collapse>
      )}
    </Card>
  );
};
|
||||||
|
|
||||||
|
export default GroupCard;
|
||||||
100
frontend/src/hooks/useDeduplication.ts
Normal file
100
frontend/src/hooks/useDeduplication.ts
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
import { useState, useCallback, useRef } from 'react';
|
||||||
|
import { deduplicateDescriptions } from '../services/api';
|
||||||
|
import type {
|
||||||
|
ExpertTransformationDescription,
|
||||||
|
DeduplicationResult,
|
||||||
|
DeduplicationProgress,
|
||||||
|
DeduplicationMethod,
|
||||||
|
} from '../types';
|
||||||
|
|
||||||
|
/**
 * Hook for managing deduplication state and operations.
 *
 * Exposes the current `loading` flag, the last `result`, the last `error`
 * message, a `progress` descriptor, and two stable callbacks:
 * `deduplicate` (runs a request) and `clearResult` (resets everything).
 *
 * Every request is tagged with a token. A response only updates state when its
 * token is still the latest one, so a reply that arrives after `clearResult()`
 * or after a newer `deduplicate()` call is dropped instead of overwriting the
 * current state with stale data.
 */
export function useDeduplication() {
  const [loading, setLoading] = useState(false);
  const [result, setResult] = useState<DeduplicationResult | null>(null);
  const [error, setError] = useState<string | null>(null);
  const [progress, setProgress] = useState<DeduplicationProgress>({
    step: 'idle',
    message: '',
  });

  // Token of the most recent request; bumping it invalidates anything in flight.
  const latestRequestRef = useRef(0);

  /**
   * Execute deduplication on a list of descriptions.
   *
   * @param descriptions - List of descriptions to deduplicate
   * @param threshold - Similarity threshold (only used for embedding method)
   * @param method - Deduplication method: 'embedding' (fast) or 'llm' (accurate but slow)
   */
  const deduplicate = useCallback(async (
    descriptions: ExpertTransformationDescription[],
    threshold: number = 0.85,
    method: DeduplicationMethod = 'embedding'
  ) => {
    if (!descriptions || descriptions.length === 0) {
      setError('No descriptions to deduplicate');
      return;
    }

    latestRequestRef.current += 1;
    const requestId = latestRequestRef.current;
    const isCurrent = () => latestRequestRef.current === requestId;

    setLoading(true);
    setError(null);
    setResult(null);

    // Show a different progress message depending on the method.
    const methodLabel = method === 'embedding' ? 'Embedding' : 'LLM';
    const pairCount = (descriptions.length * (descriptions.length - 1)) / 2;
    const progressMessage = method === 'llm'
      ? `Processing ${descriptions.length} descriptions with LLM (${pairCount} comparisons)...`
      : `Processing ${descriptions.length} descriptions with ${methodLabel}...`;

    setProgress({
      step: 'processing',
      message: progressMessage,
    });

    try {
      const deduplicationResult = await deduplicateDescriptions({
        descriptions,
        similarity_threshold: threshold,
        method,
      });

      // Superseded or cleared while waiting: drop the stale response.
      if (!isCurrent()) return;

      setResult(deduplicationResult);
      setProgress({
        step: 'done',
        message: `Found ${deduplicationResult.total_groups} unique groups, ${deduplicationResult.total_duplicates} duplicates (${methodLabel})`,
      });
    } catch (err) {
      // A failure of a superseded request must not clobber newer state either.
      if (!isCurrent()) return;

      const errorMessage = err instanceof Error ? err.message : 'Unknown error';
      setError(errorMessage);
      setProgress({
        step: 'error',
        message: 'Deduplication failed',
        error: errorMessage,
      });
    } finally {
      // Only the latest request owns the loading flag.
      if (isCurrent()) {
        setLoading(false);
      }
    }
  }, []);

  /**
   * Clear results and reset state.
   *
   * Also invalidates any request still in flight, so its eventual response is
   * ignored, and therefore resets `loading` here as well.
   */
  const clearResult = useCallback(() => {
    latestRequestRef.current += 1;
    setLoading(false);
    setResult(null);
    setError(null);
    setProgress({
      step: 'idle',
      message: '',
    });
  }, []);

  return {
    loading,
    result,
    error,
    progress,
    deduplicate,
    clearResult,
  };
}
|
||||||
@@ -14,6 +14,7 @@ interface UseExpertTransformationOptions {
|
|||||||
model?: string;
|
model?: string;
|
||||||
temperature?: number;
|
temperature?: number;
|
||||||
expertSource?: ExpertSource;
|
expertSource?: ExpertSource;
|
||||||
|
expertLanguage?: 'en' | 'zh';
|
||||||
}
|
}
|
||||||
|
|
||||||
export function useExpertTransformation(options: UseExpertTransformationOptions = {}) {
|
export function useExpertTransformation(options: UseExpertTransformationOptions = {}) {
|
||||||
@@ -63,6 +64,7 @@ export function useExpertTransformation(options: UseExpertTransformationOptions
|
|||||||
keywords_per_expert: expertConfig.keywords_per_expert,
|
keywords_per_expert: expertConfig.keywords_per_expert,
|
||||||
custom_experts: expertConfig.custom_experts,
|
custom_experts: expertConfig.custom_experts,
|
||||||
expert_source: options.expertSource,
|
expert_source: options.expertSource,
|
||||||
|
expert_language: options.expertLanguage,
|
||||||
model: options.model,
|
model: options.model,
|
||||||
temperature: options.temperature,
|
temperature: options.temperature,
|
||||||
},
|
},
|
||||||
@@ -155,7 +157,7 @@ export function useExpertTransformation(options: UseExpertTransformationOptions
|
|||||||
});
|
});
|
||||||
});
|
});
|
||||||
},
|
},
|
||||||
[options.model, options.temperature, options.expertSource]
|
[options.model, options.temperature, options.expertSource, options.expertLanguage]
|
||||||
);
|
);
|
||||||
|
|
||||||
const transformAll = useCallback(
|
const transformAll = useCallback(
|
||||||
|
|||||||
@@ -10,7 +10,9 @@ import type {
|
|||||||
TransformationCategoryResult,
|
TransformationCategoryResult,
|
||||||
ExpertTransformationRequest,
|
ExpertTransformationRequest,
|
||||||
ExpertTransformationCategoryResult,
|
ExpertTransformationCategoryResult,
|
||||||
ExpertProfile
|
ExpertProfile,
|
||||||
|
DeduplicationRequest,
|
||||||
|
DeduplicationResult
|
||||||
} from '../types';
|
} from '../types';
|
||||||
|
|
||||||
// 自動使用當前瀏覽器的 hostname,支援遠端存取
|
// 自動使用當前瀏覽器的 hostname,支援遠端存取
|
||||||
@@ -299,3 +301,24 @@ export async function expertTransformCategoryStream(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ===== Deduplication Agent API =====
|
||||||
|
|
||||||
|
/**
 * Submit a deduplication request to the backend and return its parsed reply.
 *
 * The request is serialized as JSON and POSTed to
 * `${API_BASE_URL}/deduplication/deduplicate`.
 *
 * @param request - Payload sent as the JSON request body.
 * @returns The JSON-decoded response body.
 * @throws Error carrying the HTTP status code and the raw response text
 *         when the response status is not OK.
 */
export async function deduplicateDescriptions(
  request: DeduplicationRequest
): Promise<DeduplicationResult> {
  const endpoint = `${API_BASE_URL}/deduplication/deduplicate`;
  const payload = JSON.stringify(request);

  const res = await fetch(endpoint, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: payload,
  });

  // Success path first: hand back the decoded body.
  if (res.ok) {
    return res.json();
  }

  // Failure path: surface the status and whatever text the server sent.
  const detail = await res.text();
  throw new Error(`API error: ${res.status} - ${detail}`);
}
|
||||||
|
|||||||
@@ -265,3 +265,37 @@ export interface ExpertTransformationInput {
|
|||||||
custom_experts?: string[];
|
custom_experts?: string[];
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ===== Deduplication Agent types =====
|
||||||
|
|
||||||
|
// Deduplication strategies; used for DeduplicationRequest.method and
// reported back in DeduplicationResult.method_used.
export type DeduplicationMethod = 'embedding' | 'llm';
|
||||||
|
|
||||||
|
// Request payload accepted by deduplicateDescriptions (sent as the JSON body).
export interface DeduplicationRequest {
|
||||||
|
// Descriptions to be deduplicated.
descriptions: ExpertTransformationDescription[];
|
||||||
|
method?: DeduplicationMethod; // Deduplication method, default: 'embedding'
|
||||||
|
similarity_threshold?: number; // 0.0-1.0, default 0.85; used only by the embedding method
|
||||||
|
model?: string; // Embedding/LLM model
|
||||||
|
}
|
||||||
|
|
||||||
|
// One group of similar descriptions; DeduplicationResult.groups is a list of these.
export interface DescriptionGroup {
|
||||||
|
group_id: string;
|
||||||
|
// Description chosen to stand for the group.
representative: ExpertTransformationDescription;
|
||||||
|
// Other descriptions placed in the same group as the representative.
duplicates: ExpertTransformationDescription[];
|
||||||
|
// NOTE(review): presumably one score per entry in `duplicates`, in the same order — confirm against the backend.
similarity_scores: number[];
|
||||||
|
}
|
||||||
|
|
||||||
|
// Response shape returned by deduplicateDescriptions.
export interface DeduplicationResult {
|
||||||
|
total_input: number;
|
||||||
|
// Shown to the user as the number of unique groups found.
total_groups: number;
|
||||||
|
// Shown to the user as the number of duplicates found.
total_duplicates: number;
|
||||||
|
groups: DescriptionGroup[];
|
||||||
|
threshold_used: number;
|
||||||
|
method_used: DeduplicationMethod; // Method that was used
|
||||||
|
model_used: string; // Model that was used
|
||||||
|
}
|
||||||
|
|
||||||
|
// Progress state tracked by the deduplication hook.
export interface DeduplicationProgress {
|
||||||
|
// Current phase: 'idle' after a reset, 'processing' while the request is
// in flight, then 'done' on success or 'error' on failure.
step: 'idle' | 'processing' | 'done' | 'error';
|
||||||
|
// Human-readable status text; empty when idle.
message: string;
|
||||||
|
// Error message text; the hook sets it only together with step 'error'.
error?: string;
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user