chore: save local changes

This commit is contained in:
2026-01-05 22:32:08 +08:00
parent bc281b8e0a
commit ec48709755
42 changed files with 5576 additions and 254 deletions

View File

@@ -1,12 +1,12 @@
"""
LLM Deduplication Service - 使用 LLM 成對比較進行去重
LLM Deduplication Service - Using LLM pairwise comparison for deduplication
LLM 判斷兩個描述是否語意重複,透過並行處理加速。
Let LLM determine whether two descriptions are semantically duplicate, accelerated by parallel processing.
"""
import asyncio
import logging
from typing import List, Tuple, Optional
from typing import List, Tuple, Optional, Literal
import httpx
import numpy as np
@@ -18,6 +18,7 @@ from ..models.schemas import (
DeduplicationMethod,
DescriptionGroup,
)
from ..prompts.language_config import LanguageType
logger = logging.getLogger(__name__)
@@ -31,27 +32,20 @@ class LLMDeduplicationService:
self.client = httpx.AsyncClient(timeout=60.0)
self.max_concurrent = 5 # 最大並行數,避免 Ollama 過載
async def compare_pair(
self,
desc1: str,
desc2: str,
model: str,
semaphore: asyncio.Semaphore
) -> bool:
"""
讓 LLM 判斷兩個描述是否語意重複
def _get_comparison_prompt(self, desc1: str, desc2: str, lang: LanguageType = "zh") -> str:
"""Get comparison prompt in the specified language"""
if lang == "en":
return f"""Determine whether the following two innovative descriptions express the same or very similar concepts:
Args:
desc1: 第一個描述
desc2: 第二個描述
model: LLM 模型名稱
semaphore: 並行控制信號量
Description 1: {desc1}
Returns:
bool: 是否為重複描述
"""
async with semaphore: # 控制並行數
prompt = f"""判斷以下兩個創新描述是否表達相同或非常相似的概念:
Description 2: {desc2}
If both descriptions essentially express the same or very similar innovative concept, answer "YES"
If the two descriptions express different innovative concepts, answer "NO"
Only answer YES or NO, no other text"""
else:
return f"""判斷以下兩個創新描述是否表達相同或非常相似的概念:
描述1: {desc1}
@@ -61,6 +55,30 @@ class LLMDeduplicationService:
如果兩者描述不同的創新概念,回答 "NO"
只回答 YES 或 NO不要其他文字"""
async def compare_pair(
self,
desc1: str,
desc2: str,
model: str,
semaphore: asyncio.Semaphore,
lang: LanguageType = "zh"
) -> bool:
"""
Let LLM determine whether two descriptions are semantically duplicate
Args:
desc1: First description
desc2: Second description
model: LLM model name
semaphore: Concurrency control semaphore
lang: Language for the prompt
Returns:
bool: Whether the descriptions are duplicates
"""
async with semaphore: # Control concurrency
prompt = self._get_comparison_prompt(desc1, desc2, lang)
try:
response = await self.client.post(
f"{self.base_url}/api/generate",
@@ -86,26 +104,28 @@ class LLMDeduplicationService:
async def compare_batch(
self,
pairs: List[Tuple[int, int, str, str]],
model: str
model: str,
lang: LanguageType = "zh"
) -> List[Tuple[int, int, bool]]:
"""
並行批次比較多個描述對
Parallel batch comparison of multiple description pairs
Args:
pairs: 待比較的配對列表 [(i, j, desc1, desc2), ...]
model: LLM 模型名稱
pairs: List of pairs to compare [(i, j, desc1, desc2), ...]
model: LLM model name
lang: Language for the prompt
Returns:
比較結果列表 [(i, j, is_similar), ...]
List of comparison results [(i, j, is_similar), ...]
"""
semaphore = asyncio.Semaphore(self.max_concurrent)
async def compare_one(pair: Tuple[int, int, str, str]) -> Tuple[int, int, bool]:
i, j, desc1, desc2 = pair
is_similar = await self.compare_pair(desc1, desc2, model, semaphore)
is_similar = await self.compare_pair(desc1, desc2, model, semaphore, lang)
return (i, j, is_similar)
# 使用 asyncio.gather 並行執行所有比較
# Use asyncio.gather to execute all comparisons in parallel
results = await asyncio.gather(*[compare_one(p) for p in pairs])
return results
@@ -144,17 +164,19 @@ class LLMDeduplicationService:
async def deduplicate(
self,
descriptions: List[ExpertTransformationDescription],
model: Optional[str] = None
model: Optional[str] = None,
lang: LanguageType = "zh"
) -> DeduplicationResult:
"""
使用 LLM 成對比較進行去重
Use LLM pairwise comparison for deduplication
Args:
descriptions: 要去重的描述列表
model: LLM 模型名稱
descriptions: List of descriptions to deduplicate
model: LLM model name
lang: Language for the prompt
Returns:
DeduplicationResult: 去重結果
DeduplicationResult: Deduplication result
"""
model = model or self.default_model
@@ -188,10 +210,10 @@ class LLMDeduplicationService:
))
total_pairs = len(pairs)
logger.info(f"LLM deduplication: {total_pairs} pairs to compare (parallel={self.max_concurrent}, model={model})")
logger.info(f"LLM deduplication: {total_pairs} pairs to compare (parallel={self.max_concurrent}, model={model}, lang={lang})")
# 並行批次比較
results = await self.compare_batch(pairs, model)
# Parallel batch comparison
results = await self.compare_batch(pairs, model, lang)
# 填入相似度矩陣
for i, j, is_similar in results:

View File

@@ -0,0 +1,195 @@
"""Patent Search Service using Google Patents XHR API"""
import httpx
import logging
from typing import List, Optional
from urllib.parse import quote_plus
logger = logging.getLogger(__name__)
class PatentSearchResult:
    """One patent hit returned by a Google Patents query.

    A thin value object: every constructor argument is stored verbatim as an
    attribute of the same name, and `to_dict` serializes them for API output.
    """

    # Attribute names, in the order they appear in `to_dict` output.
    _FIELDS = (
        "publication_number",
        "title",
        "snippet",
        "publication_date",
        "assignee",
        "inventor",
        "status",
        "pdf_url",
        "thumbnail_url",
    )

    def __init__(
        self,
        publication_number: str,
        title: str,
        snippet: str,
        publication_date: Optional[str],
        assignee: Optional[str],
        inventor: Optional[str],
        status: str,
        pdf_url: Optional[str] = None,
        thumbnail_url: Optional[str] = None,
    ):
        self.publication_number = publication_number
        self.title = title
        self.snippet = snippet
        self.publication_date = publication_date
        self.assignee = assignee
        self.inventor = inventor
        self.status = status
        self.pdf_url = pdf_url
        self.thumbnail_url = thumbnail_url

    def to_dict(self):
        """Return a plain dict of all fields, suitable for JSON responses."""
        return {name: getattr(self, name) for name in self._FIELDS}
class PatentSearchService:
    """Service for searching patents via Google Patents.

    Uses the XHR endpoint that backs the patents.google.com search UI, so
    results mirror what a user would see in the browser. A single shared
    ``httpx.AsyncClient`` is created lazily and reused across calls.
    """

    GOOGLE_PATENTS_XHR_URL = "https://patents.google.com/xhr/query"
    GOOGLE_PATENTS_PDF_BASE = "https://patentimages.storage.googleapis.com/"

    # Browser-like headers to avoid being blocked
    DEFAULT_HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://patents.google.com/",
        "Origin": "https://patents.google.com",
    }

    def __init__(self):
        # Lazily created shared HTTP client; see _get_client().
        self._client: Optional[httpx.AsyncClient] = None

    async def _get_client(self) -> httpx.AsyncClient:
        """Return the shared AsyncClient, (re)creating it if missing or closed."""
        if self._client is None or self._client.is_closed:
            self._client = httpx.AsyncClient(
                timeout=30.0,
                headers=self.DEFAULT_HEADERS,
                follow_redirects=True,
            )
        return self._client

    async def close(self):
        """Close the underlying HTTP client if one is open."""
        if self._client and not self._client.is_closed:
            await self._client.aclose()

    async def search(
        self,
        query: str,
        max_results: int = 10,
    ) -> dict:
        """
        Search Google Patents for relevant patents.

        Args:
            query: Search query (can be a description or keywords)
            max_results: Maximum number of results to return

        Returns:
            Dict with ``total_results`` count and ``patents`` list; on
            failure an additional ``error`` key describes the problem
            (the method never raises).
        """
        try:
            client = await self._get_client()

            # The XHR endpoint expects the whole UI query string URL-encoded
            # inside the `url` parameter, i.e. url=q%3D<encoded query>.
            encoded_query = quote_plus(query)
            url = f"{self.GOOGLE_PATENTS_XHR_URL}?url=q%3D{encoded_query}&exp=&tags="

            logger.info(f"Searching patents with query: {query[:100]}...")

            response = await client.get(url)

            if response.status_code != 200:
                logger.error(f"Google Patents API returned status {response.status_code}")
                return {
                    "total_results": 0,
                    "patents": [],
                    "error": f"API returned status {response.status_code}"
                }

            data = response.json()

            # Response shape (observed): results.total_num_results plus
            # results.cluster[0].result[*].patent with the per-patent fields.
            results = data.get("results", {})
            total_num = results.get("total_num_results", 0)
            clusters = results.get("cluster", [])

            patents: List[PatentSearchResult] = []
            if clusters and len(clusters) > 0:
                patent_results = clusters[0].get("result", [])
                for item in patent_results[:max_results]:
                    patent_data = item.get("patent", {})

                    # Legal status lives under family metadata; fall back to
                    # UNKNOWN when the structure is absent.
                    family_meta = patent_data.get("family_metadata", {})
                    aggregated = family_meta.get("aggregated", {})
                    country_status = aggregated.get("country_status", [])
                    status = "UNKNOWN"
                    if country_status and len(country_status) > 0:
                        best_stage = country_status[0].get("best_patent_stage", {})
                        status = best_stage.get("state", "UNKNOWN")

                    # PDF and thumbnail are relative paths on the patent
                    # images CDN; only build URLs when a path is present.
                    pdf_path = patent_data.get("pdf", "")
                    pdf_url = f"{self.GOOGLE_PATENTS_PDF_BASE}{pdf_path}" if pdf_path else None

                    thumbnail = patent_data.get("thumbnail", "")
                    thumbnail_url = f"{self.GOOGLE_PATENTS_PDF_BASE}{thumbnail}" if thumbnail else None

                    patent = PatentSearchResult(
                        publication_number=patent_data.get("publication_number", ""),
                        title=self._clean_html(patent_data.get("title", "")),
                        snippet=self._clean_html(patent_data.get("snippet", "")),
                        publication_date=patent_data.get("publication_date"),
                        assignee=patent_data.get("assignee"),
                        inventor=patent_data.get("inventor"),
                        status=status,
                        pdf_url=pdf_url,
                        thumbnail_url=thumbnail_url,
                    )
                    patents.append(patent)

            logger.info(f"Found {total_num} total patents, returning {len(patents)}")
            return {
                "total_results": total_num,
                "patents": [p.to_dict() for p in patents],
            }

        except httpx.HTTPError as e:
            logger.error(f"HTTP error searching patents: {e}")
            return {
                "total_results": 0,
                "patents": [],
                "error": str(e)
            }
        except Exception as e:
            logger.error(f"Error searching patents: {e}")
            return {
                "total_results": 0,
                "patents": [],
                "error": str(e)
            }

    def _clean_html(self, text: str) -> str:
        """Decode the HTML entities observed in Google Patents snippets/titles.

        BUG FIX: the previous version contained ``text.replace("&", "&")`` — a
        no-op (the first argument was almost certainly ``&amp;`` before being
        mangled), so ampersands were never decoded. ``&amp;`` is now decoded
        LAST so that e.g. ``&amp;lt;`` becomes ``&lt;`` rather than ``<``
        (no double-unescaping).
        """
        if not text:
            return ""
        replacements = (
            ("&hellip;", "..."),
            ("\u2026", "..."),  # literal ellipsis character
            ("&lt;", "<"),
            ("&gt;", ">"),
            ("&quot;", '"'),
            ("&#39;", "'"),
            ("&amp;", "&"),  # must be last — see docstring
        )
        for entity, plain in replacements:
            text = text.replace(entity, plain)
        return text.strip()
# Singleton instance: module-level so all importers share one service (and
# therefore one pooled HTTP client) instead of opening a new connection pool
# per use. Construction does no I/O — the client is created lazily on first
# search.
patent_search_service = PatentSearchService()