"""
Embedding Service - generates embeddings and performs similarity-based deduplication

Uses Ollama's embedding endpoint to generate vectors and groups duplicate
descriptions by cosine similarity.
"""

import logging
from typing import List, Optional

import httpx
import numpy as np

from ..config import settings
from ..models.schemas import (
    ExpertTransformationDescription,
    DeduplicationResult,
    DeduplicationMethod,
    DescriptionGroup,
)

logger = logging.getLogger(__name__)


class EmbeddingService:
    """Embedding service: generates vectors and runs similarity-based deduplication."""

    def __init__(self):
        self.base_url = settings.ollama_base_url
        self.default_model = "qwen3-embedding:4b"  # Qwen3 embedding model for better semantic understanding
        self.client = httpx.AsyncClient(timeout=120.0)

    async def get_embedding(self, text: str, model: Optional[str] = None) -> List[float]:
        """Return the embedding vector for a single text.

        Args:
            text: The text to embed.
            model: Embedding model name; falls back to ``self.default_model``.

        Returns:
            The first entry of the ``"embeddings"`` field of the JSON response.

        Raises:
            httpx.HTTPStatusError: The endpoint answered with an error status.
            Exception: Any other failure (transport error, unexpected response
                shape) is logged and re-raised unchanged.
        """
        model = model or self.default_model
        url = f"{self.base_url}/api/embed"

        try:
            response = await self.client.post(url, json={
                "model": model,
                "input": text
            })
            response.raise_for_status()
            result = response.json()
            return result["embeddings"][0]
        except httpx.HTTPStatusError as e:
            logger.error(f"Embedding API error: {e.response.status_code} - {e.response.text}")
            raise
        except Exception as e:
            logger.error(f"Embedding error: {e}")
            raise

    async def get_embeddings_batch(
        self,
        texts: List[str],
        model: Optional[str] = None
    ) -> List[List[float]]:
        """Return embedding vectors for several texts with one batched request.

        If the batched request fails with an HTTP error status, every text is
        embedded individually through :meth:`get_embedding` instead.

        Args:
            texts: The texts to embed. An empty list returns ``[]`` without
                issuing a request.
            model: Embedding model name; falls back to ``self.default_model``.

        Returns:
            The ``"embeddings"`` field of the JSON response, or the list of
            per-text embeddings when the fallback path was taken.

        Raises:
            Exception: Non-HTTP-status failures of the batched request are
                logged and re-raised; errors raised by the per-text fallback
                propagate as well.
        """
        if not texts:
            return []

        model = model or self.default_model
        url = f"{self.base_url}/api/embed"

        try:
            # Ollama supports batch embedding
            response = await self.client.post(url, json={
                "model": model,
                "input": texts
            })
            response.raise_for_status()
            result = response.json()
            return result["embeddings"]
        except httpx.HTTPStatusError as e:
            logger.error(f"Batch embedding API error: {e.response.status_code} - {e.response.text}")
            # If the batch request fails, try the texts one at a time
            logger.info("Falling back to single embedding requests...")
            return [await self.get_embedding(text, model) for text in texts]
        except Exception as e:
            logger.error(f"Batch embedding error: {e}")
            raise

    def cosine_similarity(self, a: List[float], b: List[float]) -> float:
        """Compute the cosine similarity of two vectors.

        Returns ``0.0`` when either vector has zero norm, so the result is
        always a finite float for finite inputs.
        """
        a_np = np.array(a)
        b_np = np.array(b)

        norm_a = np.linalg.norm(a_np)
        norm_b = np.linalg.norm(b_np)

        if norm_a == 0 or norm_b == 0:
            return 0.0

        return float(np.dot(a_np, b_np) / (norm_a * norm_b))

    def build_similarity_matrix(
        self,
        embeddings: List[List[float]]
    ) -> np.ndarray:
        """Build the pairwise cosine-similarity matrix.

        All pairs are computed at once from the Gram matrix instead of a
        Python-level loop over pairs. The per-pair formula is the same as in
        :meth:`cosine_similarity`: ``dot / (norm_a * norm_b)``, with ``0.0``
        wherever that denominator is zero.

        Args:
            embeddings: Equal-length vectors, one per item.

        Returns:
            A symmetric ``(n, n)`` array whose diagonal is exactly ``1.0``.
            An empty input yields a ``(0, 0)`` array.
        """
        n = len(embeddings)
        if n == 0:
            return np.zeros((0, 0))

        vectors = np.asarray(embeddings, dtype=float)
        norms = np.linalg.norm(vectors, axis=1)

        dot_products = vectors @ vectors.T
        denominators = np.outer(norms, norms)

        # Entries with a zero denominator keep the 0.0 the buffer starts with.
        matrix = np.zeros((n, n))
        np.divide(dot_products, denominators, out=matrix, where=denominators != 0)

        # An item's similarity with itself is 1, even for a zero vector.
        np.fill_diagonal(matrix, 1.0)
        return matrix

    def cluster_by_similarity(
        self,
        similarity_matrix: np.ndarray,
        threshold: float
    ) -> List[List[int]]:
        """
        Greedy clustering: group items whose similarity is >= threshold.

        Algorithm:
        1. Start from the first unassigned item.
        2. Find every later unassigned item whose similarity to it is >= threshold.
        3. Put them all in the same group.
        4. Repeat until every item has been assigned.

        Members are compared only against the group's first item (its
        representative), not against each other.

        Returns:
            List[List[int]]: each sub-list holds the indices of one group,
            representative first.
        """
        n = len(similarity_matrix)
        assigned = [False] * n
        groups = []

        for i in range(n):
            if assigned[i]:
                continue

            # Start a new group with item i as its representative
            group = [i]
            assigned[i] = True

            # Collect every still-unassigned item similar to i
            for j in range(i + 1, n):
                if not assigned[j] and similarity_matrix[i][j] >= threshold:
                    group.append(j)
                    assigned[j] = True

            groups.append(group)

        return groups

    async def deduplicate(
        self,
        descriptions: List[ExpertTransformationDescription],
        threshold: float = 0.85,
        model: Optional[str] = None
    ) -> DeduplicationResult:
        """
        Main deduplication entry point.

        Embeds every description, builds the pairwise similarity matrix,
        clusters greedily and reports each cluster as a group whose first
        member is the representative.

        Args:
            descriptions: The descriptions to deduplicate.
            threshold: Similarity threshold (0.0-1.0), default 0.85.
            model: Embedding model name; falls back to ``self.default_model``.

        Returns:
            DeduplicationResult: the deduplication result, including the groups.

        Raises:
            ValueError: Embedding generation failed (the original error is
                chained as ``__cause__``), or the endpoint returned a different
                number of embeddings than texts were sent.
        """
        model = model or self.default_model

        # Empty input
        if not descriptions:
            return DeduplicationResult(
                total_input=0,
                total_groups=0,
                total_duplicates=0,
                groups=[],
                threshold_used=threshold,
                method_used=DeduplicationMethod.EMBEDDING,
                model_used=model
            )

        # Extract the description texts
        texts = [d.description for d in descriptions]
        logger.info(f"Generating embeddings for {len(texts)} descriptions using model '{model}'...")

        # Fetch the embeddings in one batch
        try:
            embeddings = await self.get_embeddings_batch(texts, model)
        except Exception as e:
            logger.error(f"Failed to generate embeddings: {e}")
            raise ValueError(f"Embedding generation failed: {e}. Make sure the model '{model}' is installed (run: ollama pull {model})") from e

        # Cluster indices are used to index `descriptions`, so a count mismatch
        # would either silently drop descriptions or fail with an IndexError.
        if len(embeddings) != len(texts):
            raise ValueError(
                f"Embedding count mismatch: expected {len(texts)}, got {len(embeddings)}"
            )

        # Build the similarity matrix
        logger.info("Building similarity matrix...")
        sim_matrix = self.build_similarity_matrix(embeddings)

        # Cluster
        logger.info(f"Clustering with threshold {threshold}...")
        clusters = self.cluster_by_similarity(sim_matrix, threshold)

        # Build the result groups; the first index of each cluster is the
        # representative, the rest (possibly none) are its duplicates.
        result_groups = []
        total_duplicates = 0

        for group_idx, indices in enumerate(clusters):
            rep_idx = indices[0]
            dup_indices = indices[1:]

            result_groups.append(DescriptionGroup(
                group_id=f"group-{group_idx}",
                representative=descriptions[rep_idx],
                duplicates=[descriptions[idx] for idx in dup_indices],
                similarity_scores=[
                    float(sim_matrix[rep_idx][idx])
                    for idx in dup_indices
                ]
            ))
            total_duplicates += len(dup_indices)

        logger.info(f"Deduplication complete: {len(descriptions)} -> {len(result_groups)} groups, {total_duplicates} duplicates found")

        return DeduplicationResult(
            total_input=len(descriptions),
            total_groups=len(result_groups),
            total_duplicates=total_duplicates,
            groups=result_groups,
            threshold_used=threshold,
            method_used=DeduplicationMethod.EMBEDDING,
            model_used=model
        )

    async def close(self):
        """Close the HTTP client."""
        await self.client.aclose()


# Global instance
embedding_service = EmbeddingService()