feat: Add Deduplication Agent with embedding and LLM methods

Implement a new Deduplication Agent that identifies and groups similar
transformation descriptions. Supports two deduplication methods:
- Embedding: Fast vector similarity comparison using cosine similarity
- LLM: Accurate pairwise semantic comparison (slower but more precise)

Backend changes:
- Add deduplication router with /deduplicate endpoint
- Add embedding_service for vector-based similarity
- Add llm_deduplication_service for LLM-based comparison
- Improve expert_transformation error handling and progress reporting

Frontend changes:
- Add DeduplicationPanel with interactive group visualization
- Add useDeduplication hook for state management
- Integrate deduplication tab in main App
- Add threshold slider and method selector in sidebar

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-22 20:26:17 +08:00
parent 5571076406
commit bc281b8e0a
18 changed files with 1397 additions and 25 deletions


@@ -3,14 +3,18 @@ from contextlib import asynccontextmanager
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
-from .routers import attributes, transformation, expert_transformation
+from .routers import attributes, transformation, expert_transformation, deduplication
from .services.llm_service import ollama_provider
+from .services.embedding_service import embedding_service
+from .services.llm_deduplication_service import llm_deduplication_service

@asynccontextmanager
async def lifespan(app: FastAPI):
    yield
    await ollama_provider.close()
+    await embedding_service.close()
+    await llm_deduplication_service.close()

app = FastAPI(
@@ -31,6 +35,7 @@ app.add_middleware(
app.include_router(attributes.router)
app.include_router(transformation.router)
app.include_router(expert_transformation.router)
+app.include_router(deduplication.router)

@app.get("/")


@@ -232,3 +232,38 @@ class ExpertTransformationRequest(BaseModel):
    # LLM parameters
    model: Optional[str] = None
    temperature: Optional[float] = 0.7
+
+# ===== Deduplication Agent schemas =====
+
+class DeduplicationMethod(str, Enum):
+    """Deduplication method"""
+    EMBEDDING = "embedding"  # vector similarity
+    LLM = "llm"              # pairwise LLM judgment
+
+class DeduplicationRequest(BaseModel):
+    """Deduplication request"""
+    descriptions: List[ExpertTransformationDescription]
+    method: DeduplicationMethod = DeduplicationMethod.EMBEDDING  # deduplication method
+    similarity_threshold: float = 0.85  # cosine-similarity threshold (0.0-1.0), embedding method only
+    model: Optional[str] = None  # embedding/LLM model
+
+class DescriptionGroup(BaseModel):
+    """A group of similar descriptions"""
+    group_id: str  # "group-0", "group-1", ...
+    representative: ExpertTransformationDescription  # representative description
+    duplicates: List[ExpertTransformationDescription]  # similar descriptions
+    similarity_scores: List[float]  # similarity score for each duplicate
+
+class DeduplicationResult(BaseModel):
+    """Deduplication result"""
+    total_input: int       # total number of input descriptions
+    total_groups: int      # number of groups
+    total_duplicates: int  # number of duplicates
+    groups: List[DescriptionGroup]
+    threshold_used: float
+    method_used: DeduplicationMethod  # deduplication method used
+    model_used: str                   # model used
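
For orientation, a request under these schemas looks roughly like the following sketch. The fields shown for ExpertTransformationDescription (keyword, expert_name, description) are inferred from how this commit constructs that model in expert_transformation, so treat them as an assumption rather than the full definition:

# Sketch only: an example payload for DeduplicationRequest and the result shape.
# ExpertTransformationDescription field names are assumptions inferred from
# elsewhere in this commit.
request_payload = {
    "descriptions": [
        {"keyword": "modularity", "expert_name": "Designer",
         "description": "Apply modular construction ideas to the umbrella frame."},
        {"keyword": "modularity", "expert_name": "Engineer",
         "description": "Use modular building concepts for umbrella ribs."},
    ],
    "method": "embedding",         # or "llm"
    "similarity_threshold": 0.85,  # embedding method only
    "model": "nomic-embed-text",   # optional override
}
# The matching DeduplicationResult would report total_input=2 and, if the two
# texts clear the threshold, a single group with one duplicate.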


@@ -90,16 +90,15 @@ def get_single_description_prompt(
) -> str:
    """Step 2: generate a description for a single keyword"""
    # If the domain is generic, use only the profession name
-    domain_text = f"{expert_domain}" if expert_domain and expert_domain != "Professional Field" else ""
+    domain_text = f" in the field of {expert_domain}" if expert_domain and expert_domain != "Professional Field" else ""
    return f"""/no_think
Object: "{query}"
-Expert: {expert_name}{domain_text}
+You are a {expert_name}{domain_text}.
+Task: generate an innovative application description for "{query}".
Keyword: {keyword}
-You are a {expert_name}. From your professional perspective, generate an innovative application description (15-30 characters) explaining how to apply the concept of "{keyword}" to "{query}".
+From your professional perspective, explain how the concept of "{keyword}" can be applied to "{query}". The description should be concrete and creative (15-30 characters).
+The description should reflect the professional thinking and distinctive viewpoint of a {expert_name}.
-Return JSON:
-{{"description": "application description"}}"""
+Return only JSON, with no other text:
+{{"description": "your innovative application description"}}"""


@@ -0,0 +1,93 @@
"""
Deduplication Router - 使用 Embedding 或 LLM 去重描述
提供 API 端點將相似的創新描述分組,幫助識別重複的想法。
支援兩種方法:
- Embedding: 快速向量相似度比較
- LLM: 精準語意判斷(較慢但更準確)
"""
import logging
from fastapi import APIRouter, HTTPException
from ..models.schemas import DeduplicationRequest, DeduplicationResult, DeduplicationMethod
from ..services.embedding_service import embedding_service
from ..services.llm_deduplication_service import llm_deduplication_service
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/deduplication", tags=["deduplication"])
@router.post("/deduplicate", response_model=DeduplicationResult)
async def deduplicate_descriptions(request: DeduplicationRequest) -> DeduplicationResult:
    """
    Deduplicate descriptions.

    Two methods are supported:
    - embedding: vector similarity (fast)
    - llm: pairwise LLM comparison (precise but slower)

    Args:
        request: deduplication request with the description list, method choice, and related parameters

    Returns:
        DeduplicationResult: deduplication result with group info

    Raises:
        HTTPException: if deduplication fails
    """
    method = request.method
    logger.info(f"Deduplication request: {len(request.descriptions)} descriptions, method={method.value}, threshold={request.similarity_threshold}")
    if not request.descriptions:
        return DeduplicationResult(
            total_input=0,
            total_groups=0,
            total_duplicates=0,
            groups=[],
            threshold_used=request.similarity_threshold,
            method_used=method,
            model_used=request.model or ("nomic-embed-text" if method == DeduplicationMethod.EMBEDDING else "qwen3:4b")
        )
    try:
        if method == DeduplicationMethod.EMBEDDING:
            # Deduplicate via embedding similarity
            result = await embedding_service.deduplicate(
                descriptions=request.descriptions,
                threshold=request.similarity_threshold,
                model=request.model
            )
        else:
            # Deduplicate via pairwise LLM comparison
            result = await llm_deduplication_service.deduplicate(
                descriptions=request.descriptions,
                model=request.model
            )
        return result
    except ValueError as e:
        logger.error(f"Deduplication failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
    except Exception as e:
        logger.error(f"Unexpected error during deduplication: {e}")
        raise HTTPException(status_code=500, detail=f"Deduplication failed: {str(e)}")
@router.get("/models")
async def list_embedding_models():
"""
列出可用的 Embedding 模型
Returns:
dict: 可用模型列表和建議的預設模型
"""
return {
"default": "nomic-embed-text",
"available": [
{"name": "nomic-embed-text", "description": "Fast and efficient embedding model"},
{"name": "mxbai-embed-large", "description": "High quality embeddings"},
{"name": "all-minilm", "description": "Lightweight embedding model"},
],
"note": "Run 'ollama pull <model>' to install a model"
}
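
As a usage sketch, the endpoint can be exercised with httpx. The host and port below are assumptions (whatever the app is served on), and the description fields follow the hedged payload example shown after the schema section:

# Sketch: calling the new endpoint. Host/port are assumptions, not part of
# this commit.
import asyncio
import httpx

async def demo():
    payload = {
        "descriptions": [
            {"keyword": "modularity", "expert_name": "Designer",
             "description": "Apply modular construction ideas to the umbrella frame."},
            {"keyword": "modularity", "expert_name": "Engineer",
             "description": "Use modular building concepts for umbrella ribs."},
        ],
        "method": "embedding",
        "similarity_threshold": 0.85,
    }
    async with httpx.AsyncClient(timeout=120.0) as client:
        resp = await client.post(
            "http://localhost:8000/api/deduplication/deduplicate", json=payload
        )
        resp.raise_for_status()
        result = resp.json()
        print(f"{result['total_groups']} groups, {result['total_duplicates']} duplicates")

asyncio.run(demo())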


@@ -221,8 +221,27 @@ async def generate_expert_transformation_events(
desc_prompt, model=model, temperature=temperature
)
-            desc_data = extract_json_from_response(desc_response)
-            desc_text = desc_data.get("description", "")
+            # Try to parse JSON; fall back to the raw response if parsing fails
+            desc_text = ""
+            desc_data = None  # defined up front so the warning below can reference it
+            try:
+                desc_data = extract_json_from_response(desc_response)
+                # Accept several possible keys: description, content, text, desc
+                desc_text = (
+                    desc_data.get("description") or
+                    desc_data.get("content") or
+                    desc_data.get("text") or
+                    desc_data.get("desc") or
+                    ""
+                )
+            except ValueError:
+                # JSON parsing failed; try cleaning the raw response and using it as the description
+                cleaned = desc_response.strip()
+                # Strip possible markdown wrappers and stray quotes
+                if cleaned.startswith('"') and cleaned.endswith('"'):
+                    cleaned = cleaned[1:-1]
+                if len(cleaned) > 5 and len(cleaned) < 100:
+                    desc_text = cleaned
+                    logger.info(f"[DESC] Using fallback description for '{kw.keyword}': {desc_text[:50]}")
            if desc_text:
                descriptions.append(ExpertTransformationDescription(
@@ -231,15 +250,22 @@
                    expert_name=kw.expert_name,
                    description=desc_text
                ))
+            else:
+                logger.warning(f"[DESC] Empty description for keyword='{kw.keyword}', parsed_data={desc_data}")
-            # Send progress update
-            yield f"event: description_progress\ndata: {json.dumps({'current': idx + 1, 'total': len(all_expert_keywords), 'keyword': kw.keyword}, ensure_ascii=False)}\n\n"
+            # Send progress update with success/fail status
+            yield f"event: description_progress\ndata: {json.dumps({'current': idx + 1, 'total': len(all_expert_keywords), 'keyword': kw.keyword, 'success': bool(desc_text)}, ensure_ascii=False)}\n\n"
        except Exception as e:
-            logger.warning(f"Failed to generate description for '{kw.keyword}': {e}")
+            logger.warning(f"[DESC] Failed to generate description for '{kw.keyword}': {e}")
+            yield f"event: description_progress\ndata: {json.dumps({'current': idx + 1, 'total': len(all_expert_keywords), 'keyword': kw.keyword, 'success': False, 'error': str(e)}, ensure_ascii=False)}\n\n"
            # Continue with next keyword

-    yield f"event: description_complete\ndata: {json.dumps({'count': len(descriptions)}, ensure_ascii=False)}\n\n"
+    # Log the overall success rate
+    success_rate = len(descriptions) / len(all_expert_keywords) * 100 if all_expert_keywords else 0
+    logger.info(f"[DESC] Description generation complete: {len(descriptions)}/{len(all_expert_keywords)} succeeded ({success_rate:.1f}%)")
+    yield f"event: description_complete\ndata: {json.dumps({'count': len(descriptions), 'total': len(all_expert_keywords), 'success_rate': success_rate}, ensure_ascii=False)}\n\n"
# ========== Build final result ==========
result = ExpertTransformationCategoryResult(
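
The fallback path above is easy to sanity-check in isolation. Here is a standalone re-implementation of just the cleanup logic, for illustration only (not the actual helper):

# Sketch: the non-JSON fallback in isolation.
def fallback_description(raw: str) -> str:
    cleaned = raw.strip()
    # Strip surrounding quotes left over from a bare string reply
    if cleaned.startswith('"') and cleaned.endswith('"'):
        cleaned = cleaned[1:-1]
    # Accept only plausibly sized descriptions
    return cleaned if 5 < len(cleaned) < 100 else ""

print(fallback_description('"Umbrella ribs borrow modular joint design."'))
# -> Umbrella ribs borrow modular joint design.
print(fallback_description("ok"))
# -> "" (too short, counted as a failed generation)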


@@ -0,0 +1,250 @@
"""
Embedding Service - generates embeddings and performs similarity-based deduplication
使用 Ollama 的 embedding 端點生成向量,並透過餘弦相似度進行去重分組。
"""
import logging
from typing import List, Optional
import httpx
import numpy as np
from ..config import settings
from ..models.schemas import (
ExpertTransformationDescription,
DeduplicationResult,
DeduplicationMethod,
DescriptionGroup,
)
logger = logging.getLogger(__name__)
class EmbeddingService:
"""Embedding 服務:生成向量並執行相似度去重"""
def __init__(self):
self.base_url = settings.ollama_base_url
self.default_model = "nomic-embed-text" # Ollama 預設的 embedding 模型
self.client = httpx.AsyncClient(timeout=120.0)

    async def get_embedding(self, text: str, model: Optional[str] = None) -> List[float]:
        """Get the embedding vector for a single text"""
model = model or self.default_model
url = f"{self.base_url}/api/embed"
try:
response = await self.client.post(url, json={
"model": model,
"input": text
})
response.raise_for_status()
result = response.json()
return result["embeddings"][0]
except httpx.HTTPStatusError as e:
logger.error(f"Embedding API error: {e.response.status_code} - {e.response.text}")
raise
except Exception as e:
logger.error(f"Embedding error: {e}")
raise
async def get_embeddings_batch(
self,
texts: List[str],
model: Optional[str] = None
) -> List[List[float]]:
"""批次取得多個文字的 embedding 向量"""
if not texts:
return []
model = model or self.default_model
url = f"{self.base_url}/api/embed"
try:
            # Ollama supports batch embedding
response = await self.client.post(url, json={
"model": model,
"input": texts
})
response.raise_for_status()
result = response.json()
return result["embeddings"]
except httpx.HTTPStatusError as e:
logger.error(f"Batch embedding API error: {e.response.status_code} - {e.response.text}")
            # If the batch call fails, fall back to per-text requests
logger.info("Falling back to single embedding requests...")
embeddings = []
for text in texts:
emb = await self.get_embedding(text, model)
embeddings.append(emb)
return embeddings
except Exception as e:
logger.error(f"Batch embedding error: {e}")
raise
def cosine_similarity(self, a: List[float], b: List[float]) -> float:
"""計算兩個向量的餘弦相似度"""
a_np = np.array(a)
b_np = np.array(b)
norm_a = np.linalg.norm(a_np)
norm_b = np.linalg.norm(b_np)
if norm_a == 0 or norm_b == 0:
return 0.0
return float(np.dot(a_np, b_np) / (norm_a * norm_b))
def build_similarity_matrix(
self,
embeddings: List[List[float]]
) -> np.ndarray:
"""建立成對相似度矩陣"""
n = len(embeddings)
matrix = np.zeros((n, n))
for i in range(n):
            matrix[i][i] = 1.0  # an item's similarity with itself is 1
for j in range(i + 1, n):
sim = self.cosine_similarity(embeddings[i], embeddings[j])
matrix[i][j] = sim
matrix[j][i] = sim
return matrix
def cluster_by_similarity(
self,
similarity_matrix: np.ndarray,
threshold: float
) -> List[List[int]]:
"""
貪婪聚類:將相似度 >= threshold 的項目分組
演算法:
1. 從第一個未分配的項目開始
2. 找出所有與該項目相似度 >= threshold 的項目
3. 歸入同一組
4. 重複直到所有項目都已分配
Returns:
List[List[int]]: 每個子列表包含同組項目的索引
"""
n = len(similarity_matrix)
assigned = [False] * n
groups = []
for i in range(n):
if assigned[i]:
continue
            # Start a new group with item i as its representative
group = [i]
assigned[i] = True
            # Collect every item similar to i
for j in range(i + 1, n):
if not assigned[j] and similarity_matrix[i][j] >= threshold:
group.append(j)
assigned[j] = True
groups.append(group)
return groups
async def deduplicate(
self,
descriptions: List[ExpertTransformationDescription],
threshold: float = 0.85,
model: Optional[str] = None
) -> DeduplicationResult:
"""
主要去重方法
Args:
descriptions: 要去重的描述列表
threshold: 相似度閾值 (0.0-1.0),預設 0.85
model: Embedding 模型名稱
Returns:
DeduplicationResult: 去重結果,包含分組資訊
"""
model = model or self.default_model
# 空輸入處理
if not descriptions:
return DeduplicationResult(
total_input=0,
total_groups=0,
total_duplicates=0,
groups=[],
threshold_used=threshold,
method_used=DeduplicationMethod.EMBEDDING,
model_used=model
)
        # Extract the description texts
        texts = [d.description for d in descriptions]
        logger.info(f"Generating embeddings for {len(texts)} descriptions using model '{model}'...")
        # Fetch embeddings in one batch
        try:
            embeddings = await self.get_embeddings_batch(texts, model)
        except Exception as e:
            logger.error(f"Failed to generate embeddings: {e}")
            raise ValueError(f"Embedding generation failed: {e}. Make sure the model '{model}' is installed (run: ollama pull {model})")
        # Build the similarity matrix
        logger.info("Building similarity matrix...")
        sim_matrix = self.build_similarity_matrix(embeddings)
        # Cluster
        logger.info(f"Clustering with threshold {threshold}...")
        clusters = self.cluster_by_similarity(sim_matrix, threshold)
        # Build the result groups
result_groups = []
total_duplicates = 0
for group_idx, indices in enumerate(clusters):
if len(indices) == 1:
                # Standalone item - no duplicates
result_groups.append(DescriptionGroup(
group_id=f"group-{group_idx}",
representative=descriptions[indices[0]],
duplicates=[],
similarity_scores=[]
))
else:
                # Group with duplicates - the first item is the representative
rep_idx = indices[0]
dup_indices = indices[1:]
dup_scores = [
float(sim_matrix[rep_idx][idx])
for idx in dup_indices
]
result_groups.append(DescriptionGroup(
group_id=f"group-{group_idx}",
representative=descriptions[rep_idx],
duplicates=[descriptions[idx] for idx in dup_indices],
similarity_scores=dup_scores
))
total_duplicates += len(dup_indices)
logger.info(f"Deduplication complete: {len(descriptions)} -> {len(result_groups)} groups, {total_duplicates} duplicates found")
return DeduplicationResult(
total_input=len(descriptions),
total_groups=len(result_groups),
total_duplicates=total_duplicates,
groups=result_groups,
threshold_used=threshold,
method_used=DeduplicationMethod.EMBEDDING,
model_used=model
)

    async def close(self):
        """Close the HTTP client"""
        await self.client.aclose()

# Global instance
embedding_service = EmbeddingService()
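
To see the greedy pass in action, here is a self-contained sketch that mirrors cluster_by_similarity on a hand-built matrix:

# Sketch: the single-pass greedy clustering in isolation (mirrors
# cluster_by_similarity above; standalone for illustration).
import numpy as np

def cluster(sim: np.ndarray, threshold: float) -> list:
    n = len(sim)
    assigned = [False] * n
    groups = []
    for i in range(n):
        if assigned[i]:
            continue
        group = [i]  # item i becomes the group representative
        assigned[i] = True
        for j in range(i + 1, n):
            if not assigned[j] and sim[i][j] >= threshold:
                group.append(j)
                assigned[j] = True
        groups.append(group)
    return groups

sim = np.array([
    [1.00, 0.91, 0.20],
    [0.91, 1.00, 0.15],
    [0.20, 0.15, 1.00],
])
print(cluster(sim, threshold=0.85))  # -> [[0, 1], [2]]

Note that membership is decided only against each group's representative, so an item can join a group without being similar to the other members; that is the usual trade-off of single-pass greedy clustering.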


@@ -0,0 +1,252 @@
"""
LLM Deduplication Service - 使用 LLM 成對比較進行去重
讓 LLM 判斷兩個描述是否語意重複,透過並行處理加速。
"""
import asyncio
import logging
from typing import List, Tuple, Optional
import httpx
import numpy as np
from ..config import settings
from ..models.schemas import (
ExpertTransformationDescription,
DeduplicationResult,
DeduplicationMethod,
DescriptionGroup,
)
logger = logging.getLogger(__name__)
class LLMDeduplicationService:
"""LLM 去重服務:使用 LLM 成對比較判斷語意相似度"""
def __init__(self):
self.base_url = settings.ollama_base_url
self.default_model = "qwen3:4b" # 快速模型,適合簡單判斷
self.client = httpx.AsyncClient(timeout=60.0)
self.max_concurrent = 5 # 最大並行數,避免 Ollama 過載
async def compare_pair(
self,
desc1: str,
desc2: str,
model: str,
semaphore: asyncio.Semaphore
) -> bool:
"""
讓 LLM 判斷兩個描述是否語意重複
Args:
desc1: 第一個描述
desc2: 第二個描述
model: LLM 模型名稱
semaphore: 並行控制信號量
Returns:
bool: 是否為重複描述
"""
async with semaphore: # 控制並行數
prompt = f"""判斷以下兩個創新描述是否表達相同或非常相似的概念:
描述1: {desc1}
描述2: {desc2}
如果兩者描述的創新概念本質相同或非常相似,回答 "YES"
如果兩者描述不同的創新概念,回答 "NO"
只回答 YES 或 NO不要其他文字"""
try:
response = await self.client.post(
f"{self.base_url}/api/generate",
json={
"model": model,
"prompt": prompt,
"stream": False,
"options": {
"temperature": 0.1, # 低溫度以獲得一致的判斷
"num_predict": 10, # 只需要短回答
}
}
)
response.raise_for_status()
result = response.json()["response"].strip().upper()
is_similar = result.startswith("YES")
logger.debug(f"LLM comparison: '{desc1[:30]}...' vs '{desc2[:30]}...' -> {result} ({is_similar})")
return is_similar
except Exception as e:
logger.error(f"LLM comparison failed: {e}")
return False # 失敗時假設不相似
async def compare_batch(
self,
pairs: List[Tuple[int, int, str, str]],
model: str
) -> List[Tuple[int, int, bool]]:
"""
並行批次比較多個描述對
Args:
pairs: 待比較的配對列表 [(i, j, desc1, desc2), ...]
model: LLM 模型名稱
Returns:
比較結果列表 [(i, j, is_similar), ...]
"""
semaphore = asyncio.Semaphore(self.max_concurrent)
async def compare_one(pair: Tuple[int, int, str, str]) -> Tuple[int, int, bool]:
i, j, desc1, desc2 = pair
is_similar = await self.compare_pair(desc1, desc2, model, semaphore)
return (i, j, is_similar)
# 使用 asyncio.gather 並行執行所有比較
results = await asyncio.gather(*[compare_one(p) for p in pairs])
return results
def cluster_by_similarity(
self,
similarity_matrix: np.ndarray,
threshold: float
) -> List[List[int]]:
"""
貪婪聚類:將相似度 >= threshold 的項目分組
與 embedding_service 使用相同的演算法
"""
n = len(similarity_matrix)
assigned = [False] * n
groups = []
for i in range(n):
if assigned[i]:
continue
            # Start a new group with item i as its representative
group = [i]
assigned[i] = True
            # Collect every item similar to i
for j in range(i + 1, n):
if not assigned[j] and similarity_matrix[i][j] >= threshold:
group.append(j)
assigned[j] = True
groups.append(group)
return groups
async def deduplicate(
self,
descriptions: List[ExpertTransformationDescription],
model: Optional[str] = None
) -> DeduplicationResult:
"""
使用 LLM 成對比較進行去重
Args:
descriptions: 要去重的描述列表
model: LLM 模型名稱
Returns:
DeduplicationResult: 去重結果
"""
model = model or self.default_model
# 空輸入處理
if not descriptions:
return DeduplicationResult(
total_input=0,
total_groups=0,
total_duplicates=0,
groups=[],
                threshold_used=0.5,  # the LLM method always uses a 0.5 threshold
method_used=DeduplicationMethod.LLM,
model_used=model
)
n = len(descriptions)
similarity_matrix = np.zeros((n, n))
        # Diagonal is 1 (each item is similar to itself)
for i in range(n):
similarity_matrix[i][i] = 1.0
        # Build every pair that needs comparison
pairs = []
for i in range(n):
for j in range(i + 1, n):
pairs.append((
i, j,
descriptions[i].description,
descriptions[j].description
))
total_pairs = len(pairs)
logger.info(f"LLM deduplication: {total_pairs} pairs to compare (parallel={self.max_concurrent}, model={model})")
        # Compare the pairs concurrently in batches
results = await self.compare_batch(pairs, model)
        # Fill in the similarity matrix
for i, j, is_similar in results:
similarity_value = 1.0 if is_similar else 0.0
similarity_matrix[i][j] = similarity_value
similarity_matrix[j][i] = similarity_value
        # Cluster with threshold 0.5 (the LLM output is only 0/1)
logger.info("Clustering results...")
clusters = self.cluster_by_similarity(similarity_matrix, 0.5)
        # Build the result groups
result_groups = []
total_duplicates = 0
for group_idx, indices in enumerate(clusters):
if len(indices) == 1:
                # Standalone item - no duplicates
result_groups.append(DescriptionGroup(
group_id=f"group-{group_idx}",
representative=descriptions[indices[0]],
duplicates=[],
similarity_scores=[]
))
else:
                # Group with duplicates - the first item is the representative
                rep_idx = indices[0]
                dup_indices = indices[1:]
                # Similarity scores are all 1.0 for the LLM method (YES/NO judgment)
dup_scores = [1.0 for _ in dup_indices]
result_groups.append(DescriptionGroup(
group_id=f"group-{group_idx}",
representative=descriptions[rep_idx],
duplicates=[descriptions[idx] for idx in dup_indices],
similarity_scores=dup_scores
))
total_duplicates += len(dup_indices)
logger.info(f"LLM deduplication complete: {n} -> {len(result_groups)} groups, {total_duplicates} duplicates found")
return DeduplicationResult(
total_input=n,
total_groups=len(result_groups),
total_duplicates=total_duplicates,
groups=result_groups,
            threshold_used=0.5,  # the LLM method always uses a 0.5 threshold
method_used=DeduplicationMethod.LLM,
model_used=model
)

    async def close(self):
        """Close the HTTP client"""
        await self.client.aclose()

# Global instance
llm_deduplication_service = LLMDeduplicationService()
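
One practical note on cost: the LLM method issues one call per unordered pair, so the number of comparisons grows as n(n-1)/2. The per-call timing below is an assumption for illustration, not a measurement:

# Sketch: pairwise call counts, with a rough wall-time estimate assuming
# ~1 s per call (assumed) and the service's max_concurrent = 5.
for n in (10, 30, 100):
    pairs = n * (n - 1) // 2
    print(f"{n:>3} descriptions -> {pairs:>5} comparisons "
          f"(~{pairs / 5:.0f} s at 1 s/call, 5-way concurrency)")
# 10 -> 45, 30 -> 435, 100 -> 4950 comparisons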