chore: save local changes
@@ -1,12 +1,12 @@
 """
-LLM Deduplication Service - 使用 LLM 成對比較進行去重
+LLM Deduplication Service - Using LLM pairwise comparison for deduplication
 
-讓 LLM 判斷兩個描述是否語意重複,透過並行處理加速。
+Let LLM determine whether two descriptions are semantically duplicate, accelerated by parallel processing.
 """
 
 import asyncio
 import logging
-from typing import List, Tuple, Optional
+from typing import List, Tuple, Optional, Literal
 
 import httpx
 import numpy as np
@@ -18,6 +18,7 @@ from ..models.schemas import (
     DeduplicationMethod,
     DescriptionGroup,
 )
+from ..prompts.language_config import LanguageType
 
 logger = logging.getLogger(__name__)
 
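Side note on the new imports: the Literal added to the typing line together with the LanguageType import suggests a small literal alias. A hedged guess at what ..prompts.language_config defines, based only on how this diff uses it (defaults to "zh", branches on "en"); the real definition may differ:

from typing import Literal

# Assumed shape of LanguageType, inferred from lang: LanguageType = "zh"
# and the if lang == "en" branch below; not taken from the actual module.
LanguageType = Literal["zh", "en"]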
@@ -31,27 +32,20 @@ class LLMDeduplicationService:
         self.client = httpx.AsyncClient(timeout=60.0)
         self.max_concurrent = 5  # 最大並行數,避免 Ollama 過載
 
-    async def compare_pair(
-        self,
-        desc1: str,
-        desc2: str,
-        model: str,
-        semaphore: asyncio.Semaphore
-    ) -> bool:
-        """
-        讓 LLM 判斷兩個描述是否語意重複
-
-        Args:
-            desc1: 第一個描述
-            desc2: 第二個描述
-            model: LLM 模型名稱
-            semaphore: 並行控制信號量
-
-        Returns:
-            bool: 是否為重複描述
-        """
-        async with semaphore:  # 控制並行數
-            prompt = f"""判斷以下兩個創新描述是否表達相同或非常相似的概念:
+    def _get_comparison_prompt(self, desc1: str, desc2: str, lang: LanguageType = "zh") -> str:
+        """Get comparison prompt in the specified language"""
+        if lang == "en":
+            return f"""Determine whether the following two innovative descriptions express the same or very similar concepts:
+
+Description 1: {desc1}
+
+Description 2: {desc2}
+
+If both descriptions essentially express the same or very similar innovative concept, answer "YES"
+If the two descriptions express different innovative concepts, answer "NO"
+Only answer YES or NO, no other text"""
+        else:
+            return f"""判斷以下兩個創新描述是否表達相同或非常相似的概念:
 
 描述1: {desc1}
 
@@ -61,6 +55,30 @@ class LLMDeduplicationService:
 如果兩者描述不同的創新概念,回答 "NO"
 只回答 YES 或 NO,不要其他文字"""
 
+    async def compare_pair(
+        self,
+        desc1: str,
+        desc2: str,
+        model: str,
+        semaphore: asyncio.Semaphore,
+        lang: LanguageType = "zh"
+    ) -> bool:
+        """
+        Let LLM determine whether two descriptions are semantically duplicate
+
+        Args:
+            desc1: First description
+            desc2: Second description
+            model: LLM model name
+            semaphore: Concurrency control semaphore
+            lang: Language for the prompt
+
+        Returns:
+            bool: Whether the descriptions are duplicates
+        """
+        async with semaphore:  # Control concurrency
+            prompt = self._get_comparison_prompt(desc1, desc2, lang)
+
         try:
             response = await self.client.post(
                 f"{self.base_url}/api/generate",
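With prompt selection factored out, compare_pair now only picks a prompt via _get_comparison_prompt and defers to the Ollama /api/generate call kept as context above. A minimal usage sketch, assuming the service constructor takes no required arguments and an Ollama server is reachable; the model name and descriptions are placeholders, not taken from this diff:

import asyncio

async def demo_pair() -> None:
    service = LLMDeduplicationService()
    semaphore = asyncio.Semaphore(service.max_concurrent)
    is_dup = await service.compare_pair(
        desc1="An app that reminds users to drink water",
        desc2="A mobile tool that tracks daily water intake",
        model="llama3",      # placeholder model name
        semaphore=semaphore,
        lang="en",           # selects the English prompt from _get_comparison_prompt
    )
    print(is_dup)

# asyncio.run(demo_pair())  # requires a running Ollama instance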
@@ -86,26 +104,28 @@ class LLMDeduplicationService:
     async def compare_batch(
         self,
         pairs: List[Tuple[int, int, str, str]],
-        model: str
+        model: str,
+        lang: LanguageType = "zh"
     ) -> List[Tuple[int, int, bool]]:
         """
-        並行批次比較多個描述對
+        Parallel batch comparison of multiple description pairs
 
         Args:
-            pairs: 待比較的配對列表 [(i, j, desc1, desc2), ...]
-            model: LLM 模型名稱
+            pairs: List of pairs to compare [(i, j, desc1, desc2), ...]
+            model: LLM model name
+            lang: Language for the prompt
 
         Returns:
-            比較結果列表 [(i, j, is_similar), ...]
+            List of comparison results [(i, j, is_similar), ...]
         """
         semaphore = asyncio.Semaphore(self.max_concurrent)
 
         async def compare_one(pair: Tuple[int, int, str, str]) -> Tuple[int, int, bool]:
             i, j, desc1, desc2 = pair
-            is_similar = await self.compare_pair(desc1, desc2, model, semaphore)
+            is_similar = await self.compare_pair(desc1, desc2, model, semaphore, lang)
             return (i, j, is_similar)
 
-        # 使用 asyncio.gather 並行執行所有比較
+        # Use asyncio.gather to execute all comparisons in parallel
         results = await asyncio.gather(*[compare_one(p) for p in pairs])
         return results
 
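The concurrency pattern in compare_batch is worth calling out: every pair is fanned out at once with asyncio.gather, and the Semaphore built from max_concurrent is what actually limits how many requests hit Ollama at a time. A standalone sketch of that pattern, with a sleep standing in for the HTTP call:

import asyncio
from typing import List, Tuple

async def bounded_compare(pairs: List[Tuple[int, int]], limit: int = 5) -> List[Tuple[int, int, bool]]:
    semaphore = asyncio.Semaphore(limit)

    async def compare_one(pair: Tuple[int, int]) -> Tuple[int, int, bool]:
        async with semaphore:            # at most `limit` bodies run concurrently
            await asyncio.sleep(0.01)    # stand-in for the LLM request
            return (pair[0], pair[1], False)

    return await asyncio.gather(*[compare_one(p) for p in pairs])

print(asyncio.run(bounded_compare([(0, 1), (0, 2), (1, 2)])))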
@@ -144,17 +164,19 @@ class LLMDeduplicationService:
     async def deduplicate(
         self,
         descriptions: List[ExpertTransformationDescription],
-        model: Optional[str] = None
+        model: Optional[str] = None,
+        lang: LanguageType = "zh"
     ) -> DeduplicationResult:
         """
-        使用 LLM 成對比較進行去重
+        Use LLM pairwise comparison for deduplication
 
         Args:
-            descriptions: 要去重的描述列表
-            model: LLM 模型名稱
+            descriptions: List of descriptions to deduplicate
+            model: LLM model name
+            lang: Language for the prompt
 
         Returns:
-            DeduplicationResult: 去重結果
+            DeduplicationResult: Deduplication result
         """
         model = model or self.default_model
 
@@ -188,10 +210,10 @@ class LLMDeduplicationService:
         ))
 
         total_pairs = len(pairs)
-        logger.info(f"LLM deduplication: {total_pairs} pairs to compare (parallel={self.max_concurrent}, model={model})")
+        logger.info(f"LLM deduplication: {total_pairs} pairs to compare (parallel={self.max_concurrent}, model={model}, lang={lang})")
 
-        # 並行批次比較
-        results = await self.compare_batch(pairs, model)
+        # Parallel batch comparison
+        results = await self.compare_batch(pairs, model, lang)
 
         # 填入相似度矩陣
         for i, j, is_similar in results:
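One practical note on the deduplicate flow: the pair list is built in a part of the file not shown in this diff, but the similarity matrix and the "pairs to compare" log suggest it presumably covers every unordered pair, so the number of LLM calls grows roughly quadratically with the input size, which is likely why max_concurrent stays at 5. A quick check of the pair count; the helper below is illustrative, not part of this change:

from itertools import combinations

# Number of pairwise comparisons for n descriptions: n * (n - 1) / 2.
def pair_count(n: int) -> int:
    return len(list(combinations(range(n), 2)))

print(pair_count(10))   # 45
print(pair_count(50))   # 1225 -- the concurrency cap of 5 matters at this scale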