- Add complete experiments directory with pilot study infrastructure
  - 5 experimental conditions (direct, expert-only, attribute-only, full-pipeline, random-perspective)
  - Human assessment tool with React frontend and FastAPI backend
  - AUT flexibility analysis with jump signal detection
  - Result visualization and metrics computation
- Add novelty-driven agent loop module (experiments/novelty_loop/)
  - NoveltyDrivenTaskAgent with expert perspective perturbation
  - Three termination strategies: breakthrough, exhaust, coverage
  - Interactive CLI demo with colored output
  - Embedding-based novelty scoring
- Add DDC knowledge domain classification data (en/zh)
- Add CLAUDE.md project documentation
- Update research report with experiment findings

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
"""
|
|
Embedding Service - generates embeddings and performs similarity-based deduplication
|
|
|
|
使用 Ollama 的 embedding 端點生成向量,並透過餘弦相似度進行去重分組。
|
|
"""

import logging
from typing import List, Optional

import httpx
import numpy as np

from ..config import settings
from ..models.schemas import (
    ExpertTransformationDescription,
    DeduplicationResult,
    DeduplicationMethod,
    DescriptionGroup,
)

logger = logging.getLogger(__name__)


class EmbeddingService:
    """Embedding service: generates vectors and performs similarity-based deduplication."""

    def __init__(self):
        self.base_url = settings.ollama_base_url
        self.default_model = "qwen3-embedding:4b"  # Qwen3 embedding model for better semantic understanding
        self.client = httpx.AsyncClient(timeout=120.0)

    async def get_embedding(self, text: str, model: Optional[str] = None) -> List[float]:
        """Fetch the embedding vector for a single text."""
        model = model or self.default_model
        url = f"{self.base_url}/api/embed"

        try:
            response = await self.client.post(url, json={
                "model": model,
                "input": text
            })
            response.raise_for_status()
            result = response.json()
            return result["embeddings"][0]
        except httpx.HTTPStatusError as e:
            logger.error(f"Embedding API error: {e.response.status_code} - {e.response.text}")
            raise
        except Exception as e:
            logger.error(f"Embedding error: {e}")
            raise

    async def get_embeddings_batch(
        self,
        texts: List[str],
        model: Optional[str] = None
    ) -> List[List[float]]:
        """Fetch embedding vectors for multiple texts in a single batch."""
        if not texts:
            return []

        model = model or self.default_model
        url = f"{self.base_url}/api/embed"

        try:
            # Ollama's /api/embed endpoint accepts a list of inputs
            response = await self.client.post(url, json={
                "model": model,
                "input": texts
            })
            response.raise_for_status()
            result = response.json()
            return result["embeddings"]
        except httpx.HTTPStatusError as e:
            logger.error(f"Batch embedding API error: {e.response.status_code} - {e.response.text}")
            # If the batch request fails, fall back to one request per text
            logger.info("Falling back to single embedding requests...")
            embeddings = []
            for text in texts:
                emb = await self.get_embedding(text, model)
                embeddings.append(emb)
            return embeddings
        except Exception as e:
            logger.error(f"Batch embedding error: {e}")
            raise

    def cosine_similarity(self, a: List[float], b: List[float]) -> float:
        """Compute the cosine similarity of two vectors."""
        a_np = np.array(a)
        b_np = np.array(b)
        norm_a = np.linalg.norm(a_np)
        norm_b = np.linalg.norm(b_np)
        if norm_a == 0 or norm_b == 0:
            return 0.0
        return float(np.dot(a_np, b_np) / (norm_a * norm_b))
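
    # Worked example (illustrative, not part of the service API):
    #   cosine_similarity([1, 0], [1, 1]) = (1*1 + 0*1) / (1 * sqrt(2)) ≈ 0.7071,
    # i.e. the cosine of the 45° angle between the two vectors. Identical
    # directions give 1.0; orthogonal vectors give 0.0.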

    def build_similarity_matrix(
        self,
        embeddings: List[List[float]]
    ) -> np.ndarray:
        """Build the pairwise cosine-similarity matrix."""
        n = len(embeddings)
        matrix = np.zeros((n, n))

        for i in range(n):
            matrix[i][i] = 1.0  # an item's similarity with itself is 1
            for j in range(i + 1, n):
                sim = self.cosine_similarity(embeddings[i], embeddings[j])
                matrix[i][j] = sim
                matrix[j][i] = sim

        return matrix

    def cluster_by_similarity(
        self,
        similarity_matrix: np.ndarray,
        threshold: float
    ) -> List[List[int]]:
        """
        Greedy clustering: group items whose similarity >= threshold.

        Algorithm:
        1. Start from the first unassigned item
        2. Find every item whose similarity to it is >= threshold
        3. Put them in the same group
        4. Repeat until every item has been assigned

        Returns:
            List[List[int]]: each sub-list holds the indices of one group
        """
        n = len(similarity_matrix)
        assigned = [False] * n
        groups = []

        for i in range(n):
            if assigned[i]:
                continue

            # Start a new group with item i as its representative
            group = [i]
            assigned[i] = True

            # Collect every unassigned item similar enough to i
            for j in range(i + 1, n):
                if not assigned[j] and similarity_matrix[i][j] >= threshold:
                    group.append(j)
                    assigned[j] = True

            groups.append(group)

        return groups
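
    # Illustrative sketch (assumes a hypothetical instance
    # service = EmbeddingService()): with three items where 0 and 1 are
    # near-duplicates, a 0.85 threshold yields two groups:
    #
    #   sim = np.array([[1.0, 0.90, 0.20],
    #                   [0.90, 1.0, 0.30],
    #                   [0.20, 0.30, 1.0]])
    #   service.cluster_by_similarity(sim, threshold=0.85)  # -> [[0, 1], [2]]
    #
    # Note the greedy pass compares candidates only against each group's
    # representative, so group membership can depend on input order.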

    async def deduplicate(
        self,
        descriptions: List[ExpertTransformationDescription],
        threshold: float = 0.85,
        model: Optional[str] = None
    ) -> DeduplicationResult:
        """
        Main deduplication entry point.

        Args:
            descriptions: the descriptions to deduplicate
            threshold: similarity threshold (0.0-1.0), default 0.85
            model: embedding model name

        Returns:
            DeduplicationResult: the deduplication result, including group info
        """
        model = model or self.default_model

        # Handle empty input
        if not descriptions:
            return DeduplicationResult(
                total_input=0,
                total_groups=0,
                total_duplicates=0,
                groups=[],
                threshold_used=threshold,
                method_used=DeduplicationMethod.EMBEDDING,
                model_used=model
            )

        # Extract the description texts
        texts = [d.description for d in descriptions]
        logger.info(f"Generating embeddings for {len(texts)} descriptions using model '{model}'...")

        # Fetch embeddings in a single batch
        try:
            embeddings = await self.get_embeddings_batch(texts, model)
        except Exception as e:
            logger.error(f"Failed to generate embeddings: {e}")
            raise ValueError(f"Embedding generation failed: {e}. Make sure the model '{model}' is installed (run: ollama pull {model})")

        # Build the similarity matrix
        logger.info("Building similarity matrix...")
        sim_matrix = self.build_similarity_matrix(embeddings)

        # Cluster
        logger.info(f"Clustering with threshold {threshold}...")
        clusters = self.cluster_by_similarity(sim_matrix, threshold)

        # Assemble the result groups
        result_groups = []
        total_duplicates = 0

        for group_idx, indices in enumerate(clusters):
            if len(indices) == 1:
                # Standalone item - no duplicates
                result_groups.append(DescriptionGroup(
                    group_id=f"group-{group_idx}",
                    representative=descriptions[indices[0]],
                    duplicates=[],
                    similarity_scores=[]
                ))
            else:
                # Group with duplicates - the first item serves as representative
                rep_idx = indices[0]
                dup_indices = indices[1:]
                dup_scores = [
                    float(sim_matrix[rep_idx][idx])
                    for idx in dup_indices
                ]

                result_groups.append(DescriptionGroup(
                    group_id=f"group-{group_idx}",
                    representative=descriptions[rep_idx],
                    duplicates=[descriptions[idx] for idx in dup_indices],
                    similarity_scores=dup_scores
                ))
                total_duplicates += len(dup_indices)

        logger.info(f"Deduplication complete: {len(descriptions)} -> {len(result_groups)} groups, {total_duplicates} duplicates found")

        return DeduplicationResult(
            total_input=len(descriptions),
            total_groups=len(result_groups),
            total_duplicates=total_duplicates,
            groups=result_groups,
            threshold_used=threshold,
            method_used=DeduplicationMethod.EMBEDDING,
            model_used=model
        )

    async def close(self):
        """Close the underlying HTTP client."""
        await self.client.aclose()


# Global instance
embedding_service = EmbeddingService()
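

# Minimal usage sketch (illustrative; assumes a local Ollama server reachable
# at settings.ollama_base_url with the default model pulled via
# `ollama pull qwen3-embedding:4b`). It drives the lower-level pipeline with
# plain strings, so no schema objects are needed:
#
#   import asyncio
#
#   async def demo():
#       texts = ["a red apple", "an apple that is red", "a blue car"]
#       embs = await embedding_service.get_embeddings_batch(texts)
#       matrix = embedding_service.build_similarity_matrix(embs)
#       print(embedding_service.cluster_by_similarity(matrix, threshold=0.85))
#       await embedding_service.close()
#
#   asyncio.run(demo())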