feat: Add Deduplication Agent with embedding and LLM methods
Implement a new Deduplication Agent that identifies and groups similar transformation descriptions. Supports two deduplication methods:

- Embedding: fast vector-similarity comparison using cosine similarity
- LLM: accurate pairwise semantic comparison (slower but more precise)

Backend changes:
- Add deduplication router with /deduplicate endpoint
- Add embedding_service for vector-based similarity
- Add llm_deduplication_service for LLM-based comparison
- Improve expert_transformation error handling and progress reporting

Frontend changes:
- Add DeduplicationPanel with interactive group visualization
- Add useDeduplication hook for state management
- Integrate deduplication tab in main App
- Add threshold slider and method selector in sidebar

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
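For reviewers who want to exercise the new endpoint, here is a minimal sketch, assuming the router is mounted at /deduplicate on a local dev server. The payload field names (descriptions, method, threshold) are assumptions inferred from the services below and the sidebar controls; the actual request schema lives in backend/app/models/schemas.py, which is not part of this diff. The response fields total_groups and total_duplicates do appear in DeduplicationResult below.

import httpx

# Hypothetical payload shape - field names are assumptions, not confirmed by this diff.
payload = {
    "descriptions": [
        {"description": "Use drone swarms for crop monitoring"},
        {"description": "Monitor crops with swarms of drones"},
    ],
    "method": "embedding",  # or "llm"
    "threshold": 0.85,      # ignored by the LLM method, which is fixed at 0.5
}

resp = httpx.post("http://localhost:8000/deduplicate", json=payload, timeout=300.0)
resp.raise_for_status()
result = resp.json()
print(result["total_groups"], "groups,", result["total_duplicates"], "duplicates")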
backend/app/services/embedding_service.py (new file, 250 lines)
@@ -0,0 +1,250 @@
"""
Embedding Service - generates embeddings and performs similarity-based deduplication

Uses Ollama's embedding endpoint to generate vectors, then groups duplicates via cosine similarity.
"""

import logging
from typing import List, Optional

import httpx
import numpy as np

from ..config import settings
from ..models.schemas import (
    ExpertTransformationDescription,
    DeduplicationResult,
    DeduplicationMethod,
    DescriptionGroup,
)

logger = logging.getLogger(__name__)


class EmbeddingService:
    """Embedding service: generates vectors and performs similarity-based deduplication."""

    def __init__(self):
        self.base_url = settings.ollama_base_url
        self.default_model = "nomic-embed-text"  # Ollama's default embedding model
        self.client = httpx.AsyncClient(timeout=120.0)

    async def get_embedding(self, text: str, model: Optional[str] = None) -> List[float]:
        """Get the embedding vector for a single text."""
        model = model or self.default_model
        url = f"{self.base_url}/api/embed"

        try:
            response = await self.client.post(url, json={
                "model": model,
                "input": text
            })
            response.raise_for_status()
            result = response.json()
            return result["embeddings"][0]
        except httpx.HTTPStatusError as e:
            logger.error(f"Embedding API error: {e.response.status_code} - {e.response.text}")
            raise
        except Exception as e:
            logger.error(f"Embedding error: {e}")
            raise

    async def get_embeddings_batch(
        self,
        texts: List[str],
        model: Optional[str] = None
    ) -> List[List[float]]:
        """Get embedding vectors for multiple texts in one batch."""
        if not texts:
            return []

        model = model or self.default_model
        url = f"{self.base_url}/api/embed"

        try:
            # Ollama supports batch embedding
            response = await self.client.post(url, json={
                "model": model,
                "input": texts
            })
            response.raise_for_status()
            result = response.json()
            return result["embeddings"]
        except httpx.HTTPStatusError as e:
            logger.error(f"Batch embedding API error: {e.response.status_code} - {e.response.text}")
            # If the batch request fails, fall back to one request per text
            logger.info("Falling back to single embedding requests...")
            embeddings = []
            for text in texts:
                emb = await self.get_embedding(text, model)
                embeddings.append(emb)
            return embeddings
        except Exception as e:
            logger.error(f"Batch embedding error: {e}")
            raise

    def cosine_similarity(self, a: List[float], b: List[float]) -> float:
        """Compute the cosine similarity of two vectors."""
        a_np = np.array(a)
        b_np = np.array(b)
        norm_a = np.linalg.norm(a_np)
        norm_b = np.linalg.norm(b_np)
        if norm_a == 0 or norm_b == 0:
            return 0.0
        return float(np.dot(a_np, b_np) / (norm_a * norm_b))

    def build_similarity_matrix(
        self,
        embeddings: List[List[float]]
    ) -> np.ndarray:
        """Build a pairwise similarity matrix."""
        n = len(embeddings)
        matrix = np.zeros((n, n))

        for i in range(n):
            matrix[i][i] = 1.0  # similarity of an item with itself is 1
            for j in range(i + 1, n):
                sim = self.cosine_similarity(embeddings[i], embeddings[j])
                matrix[i][j] = sim
                matrix[j][i] = sim

        return matrix

    def cluster_by_similarity(
        self,
        similarity_matrix: np.ndarray,
        threshold: float
    ) -> List[List[int]]:
        """
        Greedy clustering: group items whose similarity >= threshold.

        Algorithm:
        1. Start from the first unassigned item
        2. Find all items with similarity >= threshold to that item
        3. Put them in the same group
        4. Repeat until every item is assigned

        Returns:
            List[List[int]]: each sublist contains the indices of one group
        """
        n = len(similarity_matrix)
        assigned = [False] * n
        groups = []

        for i in range(n):
            if assigned[i]:
                continue

            # Start a new group with item i as the representative
            group = [i]
            assigned[i] = True

            # Find all items similar to i
            for j in range(i + 1, n):
                if not assigned[j] and similarity_matrix[i][j] >= threshold:
                    group.append(j)
                    assigned[j] = True

            groups.append(group)

        return groups

    async def deduplicate(
        self,
        descriptions: List[ExpertTransformationDescription],
        threshold: float = 0.85,
        model: Optional[str] = None
    ) -> DeduplicationResult:
        """
        Main deduplication entry point.

        Args:
            descriptions: the descriptions to deduplicate
            threshold: similarity threshold (0.0-1.0), default 0.85
            model: embedding model name

        Returns:
            DeduplicationResult: deduplication result with grouping info
        """
        model = model or self.default_model

        # Handle empty input
        if not descriptions:
            return DeduplicationResult(
                total_input=0,
                total_groups=0,
                total_duplicates=0,
                groups=[],
                threshold_used=threshold,
                method_used=DeduplicationMethod.EMBEDDING,
                model_used=model
            )

        # Extract the description texts
        texts = [d.description for d in descriptions]
        logger.info(f"Generating embeddings for {len(texts)} descriptions using model '{model}'...")

        # Fetch embeddings in batch
        try:
            embeddings = await self.get_embeddings_batch(texts, model)
        except Exception as e:
            logger.error(f"Failed to generate embeddings: {e}")
            raise ValueError(f"Embedding generation failed: {e}. Make sure the model '{model}' is installed (run: ollama pull {model})")

        # Build the similarity matrix
        logger.info("Building similarity matrix...")
        sim_matrix = self.build_similarity_matrix(embeddings)

        # Cluster
        logger.info(f"Clustering with threshold {threshold}...")
        clusters = self.cluster_by_similarity(sim_matrix, threshold)

        # Build the result groups
        result_groups = []
        total_duplicates = 0

        for group_idx, indices in enumerate(clusters):
            if len(indices) == 1:
                # Standalone item - no duplicates
                result_groups.append(DescriptionGroup(
                    group_id=f"group-{group_idx}",
                    representative=descriptions[indices[0]],
                    duplicates=[],
                    similarity_scores=[]
                ))
            else:
                # Group with duplicates - the first item is the representative
                rep_idx = indices[0]
                dup_indices = indices[1:]
                dup_scores = [
                    float(sim_matrix[rep_idx][idx])
                    for idx in dup_indices
                ]

                result_groups.append(DescriptionGroup(
                    group_id=f"group-{group_idx}",
                    representative=descriptions[rep_idx],
                    duplicates=[descriptions[idx] for idx in dup_indices],
                    similarity_scores=dup_scores
                ))
                total_duplicates += len(dup_indices)

        logger.info(f"Deduplication complete: {len(descriptions)} -> {len(result_groups)} groups, {total_duplicates} duplicates found")

        return DeduplicationResult(
            total_input=len(descriptions),
            total_groups=len(result_groups),
            total_duplicates=total_duplicates,
            groups=result_groups,
            threshold_used=threshold,
            method_used=DeduplicationMethod.EMBEDDING,
            model_used=model
        )

    async def close(self):
        """Close the HTTP client."""
        await self.client.aclose()


# Global instance
embedding_service = EmbeddingService()
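A quick usage sketch of the embedding path, runnable against a local Ollama with nomic-embed-text pulled, assuming the backend package is importable as app. The ExpertTransformationDescription constructor call is hypothetical: description is the only field this service reads, and any other required fields are defined in schemas.py outside this diff.

import asyncio

from app.models.schemas import ExpertTransformationDescription
from app.services.embedding_service import embedding_service

async def main():
    descriptions = [
        # Hypothetical construction - schemas.py may require more fields.
        ExpertTransformationDescription(description="Solar-powered irrigation pumps"),
        ExpertTransformationDescription(description="Irrigation pumps running on solar power"),
        ExpertTransformationDescription(description="Blockchain-based seed provenance"),
    ]
    result = await embedding_service.deduplicate(descriptions, threshold=0.85)
    for group in result.groups:
        print(group.group_id, "->", len(group.duplicates), "duplicates")
    await embedding_service.close()

asyncio.run(main())

With a 0.85 threshold the first two descriptions would typically land in one group and the third in its own; moving the slider toward 1.0 makes grouping stricter.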
backend/app/services/llm_deduplication_service.py (new file, 252 lines)
@@ -0,0 +1,252 @@
"""
LLM Deduplication Service - deduplication via pairwise LLM comparison

Asks an LLM whether two descriptions are semantic duplicates, with parallel requests for speed.
"""

import asyncio
import logging
from typing import List, Tuple, Optional

import httpx
import numpy as np

from ..config import settings
from ..models.schemas import (
    ExpertTransformationDescription,
    DeduplicationResult,
    DeduplicationMethod,
    DescriptionGroup,
)

logger = logging.getLogger(__name__)


class LLMDeduplicationService:
    """LLM deduplication service: judges semantic similarity via pairwise LLM comparison."""

    def __init__(self):
        self.base_url = settings.ollama_base_url
        self.default_model = "qwen3:4b"  # fast model, suited to simple judgments
        self.client = httpx.AsyncClient(timeout=60.0)
        self.max_concurrent = 5  # max parallel requests, to avoid overloading Ollama

    async def compare_pair(
        self,
        desc1: str,
        desc2: str,
        model: str,
        semaphore: asyncio.Semaphore
    ) -> bool:
        """
        Ask the LLM whether two descriptions are semantic duplicates.

        Args:
            desc1: the first description
            desc2: the second description
            model: LLM model name
            semaphore: semaphore controlling concurrency

        Returns:
            bool: whether the descriptions are duplicates
        """
        async with semaphore:  # limit concurrency
            prompt = f"""Judge whether the following two innovation descriptions express the same or very similar concepts:

Description 1: {desc1}

Description 2: {desc2}

If both describe essentially the same or very similar innovation concept, answer "YES"
If they describe different innovation concepts, answer "NO"
Answer only YES or NO, with no other text"""

            try:
                response = await self.client.post(
                    f"{self.base_url}/api/generate",
                    json={
                        "model": model,
                        "prompt": prompt,
                        "stream": False,
                        "options": {
                            "temperature": 0.1,  # low temperature for consistent judgments
                            "num_predict": 10,  # only a short answer is needed
                        }
                    }
                )
                response.raise_for_status()
                result = response.json()["response"].strip().upper()
                is_similar = result.startswith("YES")
                logger.debug(f"LLM comparison: '{desc1[:30]}...' vs '{desc2[:30]}...' -> {result} ({is_similar})")
                return is_similar
            except Exception as e:
                logger.error(f"LLM comparison failed: {e}")
                return False  # assume not similar on failure

    async def compare_batch(
        self,
        pairs: List[Tuple[int, int, str, str]],
        model: str
    ) -> List[Tuple[int, int, bool]]:
        """
        Compare multiple description pairs in parallel.

        Args:
            pairs: pairs to compare, as [(i, j, desc1, desc2), ...]
            model: LLM model name

        Returns:
            comparison results as [(i, j, is_similar), ...]
        """
        semaphore = asyncio.Semaphore(self.max_concurrent)

        async def compare_one(pair: Tuple[int, int, str, str]) -> Tuple[int, int, bool]:
            i, j, desc1, desc2 = pair
            is_similar = await self.compare_pair(desc1, desc2, model, semaphore)
            return (i, j, is_similar)

        # Run all comparisons concurrently with asyncio.gather
        results = await asyncio.gather(*[compare_one(p) for p in pairs])
        return results

    def cluster_by_similarity(
        self,
        similarity_matrix: np.ndarray,
        threshold: float
    ) -> List[List[int]]:
        """
        Greedy clustering: group items whose similarity >= threshold.

        Same algorithm as embedding_service.
        """
        n = len(similarity_matrix)
        assigned = [False] * n
        groups = []

        for i in range(n):
            if assigned[i]:
                continue

            # Start a new group with item i as the representative
            group = [i]
            assigned[i] = True

            # Find all items similar to i
            for j in range(i + 1, n):
                if not assigned[j] and similarity_matrix[i][j] >= threshold:
                    group.append(j)
                    assigned[j] = True

            groups.append(group)

        return groups

    async def deduplicate(
        self,
        descriptions: List[ExpertTransformationDescription],
        model: Optional[str] = None
    ) -> DeduplicationResult:
        """
        Deduplicate via pairwise LLM comparison.

        Args:
            descriptions: the descriptions to deduplicate
            model: LLM model name

        Returns:
            DeduplicationResult: deduplication result
        """
        model = model or self.default_model

        # Handle empty input
        if not descriptions:
            return DeduplicationResult(
                total_input=0,
                total_groups=0,
                total_duplicates=0,
                groups=[],
                threshold_used=0.5,  # the LLM method always uses a fixed 0.5 threshold
                method_used=DeduplicationMethod.LLM,
                model_used=model
            )

        n = len(descriptions)
        similarity_matrix = np.zeros((n, n))

        # Diagonal is 1 (every item is similar to itself)
        for i in range(n):
            similarity_matrix[i][i] = 1.0

        # Build all pairs that need comparison
        pairs = []
        for i in range(n):
            for j in range(i + 1, n):
                pairs.append((
                    i, j,
                    descriptions[i].description,
                    descriptions[j].description
                ))

        total_pairs = len(pairs)
        logger.info(f"LLM deduplication: {total_pairs} pairs to compare (parallel={self.max_concurrent}, model={model})")

        # Compare all pairs in parallel
        results = await self.compare_batch(pairs, model)

        # Fill in the similarity matrix
        for i, j, is_similar in results:
            similarity_value = 1.0 if is_similar else 0.0
            similarity_matrix[i][j] = similarity_value
            similarity_matrix[j][i] = similarity_value

        # Cluster with threshold 0.5 (LLM output is only 0/1)
        logger.info("Clustering results...")
        clusters = self.cluster_by_similarity(similarity_matrix, 0.5)

        # Build the result groups
        result_groups = []
        total_duplicates = 0

        for group_idx, indices in enumerate(clusters):
            if len(indices) == 1:
                # Standalone item - no duplicates
                result_groups.append(DescriptionGroup(
                    group_id=f"group-{group_idx}",
                    representative=descriptions[indices[0]],
                    duplicates=[],
                    similarity_scores=[]
                ))
            else:
                # Group with duplicates - the first item is the representative
                rep_idx = indices[0]
                dup_indices = indices[1:]
                # Similarity scores for the LLM method are all 1.0 (YES/NO judgments)
                dup_scores = [1.0 for _ in dup_indices]

                result_groups.append(DescriptionGroup(
                    group_id=f"group-{group_idx}",
                    representative=descriptions[rep_idx],
                    duplicates=[descriptions[idx] for idx in dup_indices],
                    similarity_scores=dup_scores
                ))
                total_duplicates += len(dup_indices)

        logger.info(f"LLM deduplication complete: {n} -> {len(result_groups)} groups, {total_duplicates} duplicates found")

        return DeduplicationResult(
            total_input=n,
            total_groups=len(result_groups),
            total_duplicates=total_duplicates,
            groups=result_groups,
            threshold_used=0.5,  # the LLM method always uses a fixed 0.5 threshold
            method_used=DeduplicationMethod.LLM,
            model_used=model
        )

    async def close(self):
        """Close the HTTP client."""
        await self.client.aclose()


# Global instance
llm_deduplication_service = LLMDeduplicationService()
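One practical note on cost: the LLM method issues one generate call per pair, so n descriptions cost n(n-1)/2 calls (50 descriptions already mean 1,225 comparisons at max_concurrent=5), while the embedding method needs a single batch /api/embed request plus cheap in-memory cosine computations. A minimal sketch of picking a method by input size, assuming both global service instances are importable as defined above; the cutoff value is a hypothetical tuning knob, not something this diff prescribes:

from app.services.embedding_service import embedding_service
from app.services.llm_deduplication_service import llm_deduplication_service

# Hypothetical cutoff - tune for your hardware and model speed.
LLM_MAX_ITEMS = 30

async def deduplicate_auto(descriptions):
    """Use the precise LLM method for small inputs, embeddings otherwise."""
    if len(descriptions) <= LLM_MAX_ITEMS:
        return await llm_deduplication_service.deduplicate(descriptions)
    return await embedding_service.deduplicate(descriptions, threshold=0.85)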