Files
novelty-seeking/backend/app/models/schemas.py
gbanyan bc281b8e0a feat: Add Deduplication Agent with embedding and LLM methods
Implement a new Deduplication Agent that identifies and groups similar
transformation descriptions. Supports two deduplication methods:
- Embedding: Fast vector similarity comparison using cosine similarity
- LLM: Accurate pairwise semantic comparison (slower but more precise)

Backend changes:
- Add deduplication router with /deduplicate endpoint
- Add embedding_service for vector-based similarity
- Add llm_deduplication_service for LLM-based comparison
- Improve expert_transformation error handling and progress reporting

Frontend changes:
- Add DeduplicationPanel with interactive group visualization
- Add useDeduplication hook for state management
- Integrate deduplication tab in main App
- Add threshold slider and method selector in sidebar

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-22 20:26:17 +08:00

270 lines
7.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from pydantic import BaseModel
from typing import Optional, List, Dict
from enum import Enum
class AttributeNode(BaseModel):
    """A named attribute node that may hold nested child nodes of the same type."""

    name: str
    # Category label; the original note lists 材料 (material), 功能 (function),
    # 用途 (usage) and 使用族群 (user group).
    category: Optional[str] = None
    # Child nodes; annotated with a string forward reference to this class.
    children: Optional[List["AttributeNode"]] = None


# Rebuild the model so the self-referencing "AttributeNode" annotation resolves.
AttributeNode.model_rebuild()
class AnalyzeRequest(BaseModel):
    """Analysis request: a query plus optional model, temperature and categories."""

    query: str
    model: Optional[str] = None
    temperature: Optional[float] = 0.7
    # When None, the default categories are used.
    categories: Optional[List[str]] = None
class AnalyzeResponse(BaseModel):
    """Analysis response: the query together with its root attribute node."""

    query: str
    attributes: AttributeNode
class ModelListResponse(BaseModel):
    """Response wrapping a list of model names."""

    models: List[str]
# ===== Multi-step streaming schemas =====
class Step1Result(BaseModel):
    """Step 1 result: the attribute list for each category."""

    materials: List[str]
    functions: List[str]
    usages: List[str]
    users: List[str]
class CausalChain(BaseModel):
    """A single causal chain: one value each for material, function, usage, user."""

    material: str
    function: str
    usage: str
    user: str
class StreamAnalyzeRequest(BaseModel):
    """Multi-step analysis request (updated to support dynamic categories)."""

    query: str
    model: Optional[str] = None
    temperature: Optional[float] = 0.7
    # User-configurable number of causal chains to generate.
    chain_count: int = 5

    # --- Added: dynamic category support ---
    # Holds a CategoryMode enum value; typed as a plain string here.
    category_mode: Optional[str] = "dynamic_auto"
    custom_categories: Optional[List[str]] = None
    # Suggested number of categories for the LLM to generate.
    suggested_category_count: int = 3
class StreamAnalyzeResponse(BaseModel):
    """Final, complete result of the multi-step analysis."""

    query: str
    step1_result: Step1Result
    causal_chains: List[CausalChain]
    attributes: AttributeNode
# ===== Dynamic category system schemas =====
class CategoryMode(str, Enum):
    """Category mode: selects which sources of categories are used."""

    FIXED_ONLY = "fixed_only"
    FIXED_PLUS_CUSTOM = "fixed_plus_custom"
    # Fixed categories plus LLM-suggested ones.
    FIXED_PLUS_DYNAMIC = "fixed_plus_dynamic"
    CUSTOM_ONLY = "custom_only"
    DYNAMIC_AUTO = "dynamic_auto"
class CategoryDefinition(BaseModel):
    """Definition of a single category."""

    name: str
    description: Optional[str] = None
    # False for categories generated by the LLM.
    is_fixed: bool = True
    order: int = 0
class Step0Result(BaseModel):
    """Step 0 result: the categories suggested by the LLM analysis."""

    categories: List[CategoryDefinition]
class DynamicStep1Result(BaseModel):
    """Dynamic-category version of the Step 1 result."""

    # Mapping of {category name: [attribute list]}.
    attributes: Dict[str, List[str]]
class DynamicCausalChain(BaseModel):
    """Dynamic-category version of a causal chain."""

    # Mapping of {category name: selected attribute}.
    chain: Dict[str, str]
# ===== DAG (Directed Acyclic Graph) schemas =====
class DAGNode(BaseModel):
    """DAG node; each attribute appears only once."""

    # Unique ID in the form "{category}_{index}".
    id: str
    # Display name.
    name: str
    # Category the node belongs to.
    category: str
    # Position within the column.
    order: int
class DAGEdge(BaseModel):
    """DAG edge: a connection between two nodes, identified by their IDs."""

    source_id: str
    target_id: str
class AttributeDAG(BaseModel):
    """Complete DAG structure: the query, its categories, nodes and edges."""

    query: str
    categories: List[CategoryDefinition]
    nodes: List[DAGNode]
    edges: List[DAGEdge]
class DAGRelationship(BaseModel):
    """Step 2 output: a single relationship between two attributes."""

    source_category: str
    # Name of the source attribute.
    source: str
    target_category: str
    # Name of the target attribute.
    target: str
# ===== Transformation Agent schemas =====
class TransformationRequest(BaseModel):
    """Request for the Transformation Agent."""

    # Original query, e.g. "腳踏車" (bicycle).
    query: str
    # Category name, e.g. "功能" (function).
    category: str
    # Attribute list of that category.
    attributes: List[str]
    model: Optional[str] = None
    temperature: Optional[float] = 0.7
    # Number of new keywords to generate.
    keyword_count: int = 3
class TransformationDescription(BaseModel):
    """A single transformation description."""

    # The new keyword.
    keyword: str
    # Description that combines the keyword with the query.
    description: str
class TransformationCategoryResult(BaseModel):
    """Transformation result for a single category."""

    category: str
    # The original attributes.
    original_attributes: List[str]
    # The newly generated keywords.
    new_keywords: List[str]
    descriptions: List[TransformationDescription]
class TransformationDAGResult(BaseModel):
    """Complete Transformation result: the query plus per-category results."""

    query: str
    results: List[TransformationCategoryResult]
# ===== Expert Transformation Agent schemas =====
class ExpertProfile(BaseModel):
    """Profile of an expert."""

    # e.g. "expert-0"
    id: str
    # e.g. "藥師" (pharmacist)
    name: str
    # e.g. "醫療與健康" (medicine and health)
    domain: str
    # e.g. "從藥物與健康管理角度思考"
    # (thinking from a medication and health-management angle)
    perspective: Optional[str] = None
class ExpertKeyword(BaseModel):
    """A keyword generated from an expert's perspective."""

    # The keyword itself.
    keyword: str
    # Which expert generated it.
    expert_id: str
    # Expert name (redundant; kept for the frontend's convenience).
    expert_name: str
    # Which original attribute the keyword came from.
    source_attribute: str
class ExpertTransformationDescription(BaseModel):
    """Description of an expert keyword."""

    keyword: str
    expert_id: str
    expert_name: str
    description: str
class ExpertTransformationCategoryResult(BaseModel):
    """Transformation result for a single category (expert version)."""

    category: str
    original_attributes: List[str]
    # Keywords generated by all experts.
    expert_keywords: List[ExpertKeyword]
    descriptions: List[ExpertTransformationDescription]
class ExpertTransformationDAGResult(BaseModel):
    """Complete transformation result (expert version)."""

    query: str
    # List of the experts that were used.
    experts: List[ExpertProfile]
    results: List[ExpertTransformationCategoryResult]
class ExpertSource(str, Enum):
    """Type of source that experts are drawn from."""

    LLM = "llm"
    # Curated occupations (210 entries, including specific domains).
    CURATED = "curated"
    DBPEDIA = "dbpedia"
    WIKIDATA = "wikidata"
class ExpertTransformationRequest(BaseModel):
    """Request for the Expert Transformation Agent."""

    query: str
    category: str
    attributes: List[str]

    # --- Expert parameters ---
    # Number of experts (documented range 2-8; no validator is declared here).
    expert_count: int = 3
    # Keywords each expert generates per attribute (documented range 1-3;
    # no validator is declared here).
    keywords_per_expert: int = 1
    # User-specified experts, e.g. ["藥師", "工程師"] (pharmacist, engineer).
    custom_experts: Optional[List[str]] = None

    # --- Expert source parameters ---
    # Where experts come from.
    expert_source: ExpertSource = ExpertSource.LLM
    # Language for external sources (currently only English data exists).
    expert_language: str = "en"

    # --- LLM parameters ---
    model: Optional[str] = None
    temperature: Optional[float] = 0.7
# ===== Deduplication Agent schemas =====
class DeduplicationMethod(str, Enum):
    """Deduplication method."""

    # Vector similarity.
    EMBEDDING = "embedding"
    # Pairwise judgment by an LLM.
    LLM = "llm"
class DeduplicationRequest(BaseModel):
    """Deduplication request."""

    descriptions: List[ExpertTransformationDescription]
    # Which deduplication method to use.
    method: DeduplicationMethod = DeduplicationMethod.EMBEDDING
    # Cosine-similarity threshold (documented range 0.0-1.0; no validator is
    # declared here). Used only by the Embedding method.
    similarity_threshold: float = 0.85
    # Embedding / LLM model.
    model: Optional[str] = None
class DescriptionGroup(BaseModel):
    """A group of similar descriptions."""

    # e.g. "group-0", "group-1", ...
    group_id: str
    # The description chosen to represent the group.
    representative: ExpertTransformationDescription
    # The similar descriptions.
    duplicates: List[ExpertTransformationDescription]
    # Similarity score of each duplicate.
    similarity_scores: List[float]
class DeduplicationResult(BaseModel):
    """Deduplication result."""

    # Total number of input descriptions.
    total_input: int
    # Number of groups.
    total_groups: int
    # Number of duplicates.
    total_duplicates: int
    groups: List[DescriptionGroup]
    threshold_used: float
    # The deduplication method that was used.
    method_used: DeduplicationMethod
    # The model that was used.
    model_used: str