feat: Add Deduplication Agent with embedding and LLM methods
Implement a new Deduplication Agent that identifies and groups similar transformation descriptions.

Supports two deduplication methods:
- Embedding: Fast vector similarity comparison using cosine similarity
- LLM: Accurate pairwise semantic comparison (slower but more precise)

Backend changes:
- Add deduplication router with /deduplicate endpoint
- Add embedding_service for vector-based similarity
- Add llm_deduplication_service for LLM-based comparison
- Improve expert_transformation error handling and progress reporting

Frontend changes:
- Add DeduplicationPanel with interactive group visualization
- Add useDeduplication hook for state management
- Integrate deduplication tab in main App
- Add threshold slider and method selector in sidebar

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
93
backend/app/routers/deduplication.py
Normal file
93
backend/app/routers/deduplication.py
Normal file
@@ -0,0 +1,93 @@
|
||||
"""
|
||||
Deduplication Router - 使用 Embedding 或 LLM 去重描述
|
||||
|
||||
提供 API 端點將相似的創新描述分組,幫助識別重複的想法。
|
||||
支援兩種方法:
|
||||
- Embedding: 快速向量相似度比較
|
||||
- LLM: 精準語意判斷(較慢但更準確)
|
||||
"""
|
||||
|
||||
import logging
|
||||
from fastapi import APIRouter, HTTPException
|
||||
|
||||
from ..models.schemas import DeduplicationRequest, DeduplicationResult, DeduplicationMethod
|
||||
from ..services.embedding_service import embedding_service
|
||||
from ..services.llm_deduplication_service import llm_deduplication_service
|
||||
|
||||
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
||||
|
||||
# Router for the deduplication endpoints; every route registered on it is
# served under the "/api/deduplication" prefix and tagged "deduplication".
router = APIRouter(prefix="/api/deduplication", tags=["deduplication"])
|
||||
|
||||
|
||||
@router.post("/deduplicate", response_model=DeduplicationResult)
async def deduplicate_descriptions(request: DeduplicationRequest) -> DeduplicationResult:
    """Group similar descriptions using the requested deduplication method.

    Two methods are supported:
    - embedding: vector-similarity comparison (fast)
    - llm: pairwise LLM comparison (more precise but slower)

    Args:
        request: Deduplication request carrying the description list, the
            method selection, the similarity threshold and an optional model.

    Returns:
        DeduplicationResult: The result returned by the selected service, or
        an empty result (zero groups) when no descriptions were supplied.

    Raises:
        HTTPException: With status 500 if the deduplication service fails.
            The original exception is chained as the cause.
    """
    method = request.method
    # Lazy %-style arguments: the message is only formatted if it is emitted.
    logger.info(
        "Deduplication request: %s descriptions, method=%s, threshold=%s",
        len(request.descriptions),
        method.value,
        request.similarity_threshold,
    )

    # Nothing to compare: answer immediately without calling any service.
    if not request.descriptions:
        # Fallback model name reported when the request did not specify one.
        fallback_model = (
            "nomic-embed-text"
            if method == DeduplicationMethod.EMBEDDING
            else "qwen3:4b"
        )
        return DeduplicationResult(
            total_input=0,
            total_groups=0,
            total_duplicates=0,
            groups=[],
            threshold_used=request.similarity_threshold,
            method_used=method,
            model_used=request.model or fallback_model,
        )

    try:
        if method == DeduplicationMethod.EMBEDDING:
            # Deduplicate by embedding similarity.
            return await embedding_service.deduplicate(
                descriptions=request.descriptions,
                threshold=request.similarity_threshold,
                model=request.model,
            )
        # Deduplicate by pairwise LLM comparison. The similarity threshold
        # is only forwarded on the embedding path above.
        return await llm_deduplication_service.deduplicate(
            descriptions=request.descriptions,
            model=request.model,
        )
    except ValueError as e:
        logger.error("Deduplication failed: %s", e)
        # Chain the cause so the original error is not lost.
        raise HTTPException(status_code=500, detail=str(e)) from e
    except Exception as e:
        # Unexpected failure: logger.exception records the traceback too.
        logger.exception("Unexpected error during deduplication: %s", e)
        raise HTTPException(
            status_code=500, detail=f"Deduplication failed: {str(e)}"
        ) from e
|
||||
|
||||
|
||||
@router.get("/models")
async def list_embedding_models():
    """List the embedding models offered for deduplication.

    Returns:
        dict: The suggested default model name, the list of available models
        (each with a name and a short description), and an installation note.
    """
    # (name, description) pairs, in the order they are returned.
    catalog = (
        ("nomic-embed-text", "Fast and efficient embedding model"),
        ("mxbai-embed-large", "High quality embeddings"),
        ("all-minilm", "Lightweight embedding model"),
    )
    available = [
        {"name": model_name, "description": summary}
        for model_name, summary in catalog
    ]
    payload = {"default": "nomic-embed-text"}
    payload["available"] = available
    payload["note"] = "Run 'ollama pull <model>' to install a model"
    return payload
|
||||
Reference in New Issue
Block a user