chore: save local changes

This commit is contained in:
2026-01-05 22:32:08 +08:00
parent bc281b8e0a
commit ec48709755
42 changed files with 5576 additions and 254 deletions

View File

@@ -1,12 +1,12 @@
"""
LLM Deduplication Service - 使用 LLM 成對比較進行去重
LLM Deduplication Service - Using LLM pairwise comparison for deduplication
LLM 判斷兩個描述是否語意重複,透過並行處理加速。
Let LLM determine whether two descriptions are semantically duplicate, accelerated by parallel processing.
"""
import asyncio
import logging
from typing import List, Tuple, Optional
from typing import List, Tuple, Optional, Literal
import httpx
import numpy as np
@@ -18,6 +18,7 @@ from ..models.schemas import (
DeduplicationMethod,
DescriptionGroup,
)
from ..prompts.language_config import LanguageType
logger = logging.getLogger(__name__)
@@ -31,27 +32,20 @@ class LLMDeduplicationService:
self.client = httpx.AsyncClient(timeout=60.0)
self.max_concurrent = 5 # 最大並行數,避免 Ollama 過載
async def compare_pair(
self,
desc1: str,
desc2: str,
model: str,
semaphore: asyncio.Semaphore
) -> bool:
"""
讓 LLM 判斷兩個描述是否語意重複
def _get_comparison_prompt(self, desc1: str, desc2: str, lang: LanguageType = "zh") -> str:
"""Get comparison prompt in the specified language"""
if lang == "en":
return f"""Determine whether the following two innovative descriptions express the same or very similar concepts:
Args:
desc1: 第一個描述
desc2: 第二個描述
model: LLM 模型名稱
semaphore: 並行控制信號量
Description 1: {desc1}
Returns:
bool: 是否為重複描述
"""
async with semaphore: # 控制並行數
prompt = f"""判斷以下兩個創新描述是否表達相同或非常相似的概念:
Description 2: {desc2}
If both descriptions essentially express the same or very similar innovative concept, answer "YES"
If the two descriptions express different innovative concepts, answer "NO"
Only answer YES or NO, no other text"""
else:
return f"""判斷以下兩個創新描述是否表達相同或非常相似的概念:
描述1: {desc1}
@@ -61,6 +55,30 @@ class LLMDeduplicationService:
如果兩者描述不同的創新概念,回答 "NO"
只回答 YES 或 NO不要其他文字"""
async def compare_pair(
self,
desc1: str,
desc2: str,
model: str,
semaphore: asyncio.Semaphore,
lang: LanguageType = "zh"
) -> bool:
"""
Let LLM determine whether two descriptions are semantically duplicate
Args:
desc1: First description
desc2: Second description
model: LLM model name
semaphore: Concurrency control semaphore
lang: Language for the prompt
Returns:
bool: Whether the descriptions are duplicates
"""
async with semaphore: # Control concurrency
prompt = self._get_comparison_prompt(desc1, desc2, lang)
try:
response = await self.client.post(
f"{self.base_url}/api/generate",
@@ -86,26 +104,28 @@ class LLMDeduplicationService:
async def compare_batch(
self,
pairs: List[Tuple[int, int, str, str]],
model: str
model: str,
lang: LanguageType = "zh"
) -> List[Tuple[int, int, bool]]:
"""
並行批次比較多個描述對
Parallel batch comparison of multiple description pairs
Args:
pairs: 待比較的配對列表 [(i, j, desc1, desc2), ...]
model: LLM 模型名稱
pairs: List of pairs to compare [(i, j, desc1, desc2), ...]
model: LLM model name
lang: Language for the prompt
Returns:
比較結果列表 [(i, j, is_similar), ...]
List of comparison results [(i, j, is_similar), ...]
"""
semaphore = asyncio.Semaphore(self.max_concurrent)
async def compare_one(pair: Tuple[int, int, str, str]) -> Tuple[int, int, bool]:
i, j, desc1, desc2 = pair
is_similar = await self.compare_pair(desc1, desc2, model, semaphore)
is_similar = await self.compare_pair(desc1, desc2, model, semaphore, lang)
return (i, j, is_similar)
# 使用 asyncio.gather 並行執行所有比較
# Use asyncio.gather to execute all comparisons in parallel
results = await asyncio.gather(*[compare_one(p) for p in pairs])
return results
@@ -144,17 +164,19 @@ class LLMDeduplicationService:
async def deduplicate(
self,
descriptions: List[ExpertTransformationDescription],
model: Optional[str] = None
model: Optional[str] = None,
lang: LanguageType = "zh"
) -> DeduplicationResult:
"""
使用 LLM 成對比較進行去重
Use LLM pairwise comparison for deduplication
Args:
descriptions: 要去重的描述列表
model: LLM 模型名稱
descriptions: List of descriptions to deduplicate
model: LLM model name
lang: Language for the prompt
Returns:
DeduplicationResult: 去重結果
DeduplicationResult: Deduplication result
"""
model = model or self.default_model
@@ -188,10 +210,10 @@ class LLMDeduplicationService:
))
total_pairs = len(pairs)
logger.info(f"LLM deduplication: {total_pairs} pairs to compare (parallel={self.max_concurrent}, model={model})")
logger.info(f"LLM deduplication: {total_pairs} pairs to compare (parallel={self.max_concurrent}, model={model}, lang={lang})")
# 並行批次比較
results = await self.compare_batch(pairs, model)
# Parallel batch comparison
results = await self.compare_batch(pairs, model, lang)
# 填入相似度矩陣
for i, j, is_similar in results:

View File

@@ -0,0 +1,195 @@
"""Patent Search Service using Google Patents XHR API"""
import httpx
import logging
from typing import List, Optional
from urllib.parse import quote_plus
logger = logging.getLogger(__name__)
class PatentSearchResult:
    """One patent hit returned by a Google Patents query.

    A thin value object: every constructor argument is stored verbatim as an
    attribute of the same name, and `to_dict` serializes them for API output.
    """

    # Attribute names, in the order they appear in `to_dict` output.
    _FIELDS = (
        "publication_number",
        "title",
        "snippet",
        "publication_date",
        "assignee",
        "inventor",
        "status",
        "pdf_url",
        "thumbnail_url",
    )

    def __init__(
        self,
        publication_number: str,
        title: str,
        snippet: str,
        publication_date: Optional[str],
        assignee: Optional[str],
        inventor: Optional[str],
        status: str,
        pdf_url: Optional[str] = None,
        thumbnail_url: Optional[str] = None,
    ):
        self.publication_number = publication_number
        self.title = title
        self.snippet = snippet
        self.publication_date = publication_date
        self.assignee = assignee
        self.inventor = inventor
        self.status = status
        self.pdf_url = pdf_url
        self.thumbnail_url = thumbnail_url

    def to_dict(self):
        """Return a plain dict of all fields, suitable for JSON responses."""
        return {name: getattr(self, name) for name in self._FIELDS}
class PatentSearchService:
    """Service for searching patents via Google Patents.

    Uses the XHR endpoint that backs the patents.google.com search UI, so
    results mirror what a user would see in the browser. A single shared
    ``httpx.AsyncClient`` is created lazily and reused across calls.
    """

    GOOGLE_PATENTS_XHR_URL = "https://patents.google.com/xhr/query"
    GOOGLE_PATENTS_PDF_BASE = "https://patentimages.storage.googleapis.com/"

    # Browser-like headers to avoid being blocked
    DEFAULT_HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://patents.google.com/",
        "Origin": "https://patents.google.com",
    }

    def __init__(self):
        # Lazily created shared HTTP client; see _get_client().
        self._client: Optional[httpx.AsyncClient] = None

    async def _get_client(self) -> httpx.AsyncClient:
        """Return the shared AsyncClient, (re)creating it if missing or closed."""
        if self._client is None or self._client.is_closed:
            self._client = httpx.AsyncClient(
                timeout=30.0,
                headers=self.DEFAULT_HEADERS,
                follow_redirects=True,
            )
        return self._client

    async def close(self):
        """Close the underlying HTTP client if one is open."""
        if self._client and not self._client.is_closed:
            await self._client.aclose()

    async def search(
        self,
        query: str,
        max_results: int = 10,
    ) -> dict:
        """
        Search Google Patents for relevant patents.

        Args:
            query: Search query (can be a description or keywords)
            max_results: Maximum number of results to return

        Returns:
            Dict with ``total_results`` count and ``patents`` list; on
            failure an additional ``error`` key describes the problem
            (the method never raises).
        """
        try:
            client = await self._get_client()

            # The XHR endpoint expects the whole UI query string URL-encoded
            # inside the `url` parameter, i.e. url=q%3D<encoded query>.
            encoded_query = quote_plus(query)
            url = f"{self.GOOGLE_PATENTS_XHR_URL}?url=q%3D{encoded_query}&exp=&tags="

            logger.info(f"Searching patents with query: {query[:100]}...")

            response = await client.get(url)

            if response.status_code != 200:
                logger.error(f"Google Patents API returned status {response.status_code}")
                return {
                    "total_results": 0,
                    "patents": [],
                    "error": f"API returned status {response.status_code}"
                }

            data = response.json()

            # Response shape (observed): results.total_num_results plus
            # results.cluster[0].result[*].patent with the per-patent fields.
            results = data.get("results", {})
            total_num = results.get("total_num_results", 0)
            clusters = results.get("cluster", [])

            patents: List[PatentSearchResult] = []
            if clusters and len(clusters) > 0:
                patent_results = clusters[0].get("result", [])
                for item in patent_results[:max_results]:
                    patent_data = item.get("patent", {})

                    # Legal status lives under family metadata; fall back to
                    # UNKNOWN when the structure is absent.
                    family_meta = patent_data.get("family_metadata", {})
                    aggregated = family_meta.get("aggregated", {})
                    country_status = aggregated.get("country_status", [])
                    status = "UNKNOWN"
                    if country_status and len(country_status) > 0:
                        best_stage = country_status[0].get("best_patent_stage", {})
                        status = best_stage.get("state", "UNKNOWN")

                    # PDF and thumbnail are relative paths on the patent
                    # images CDN; only build URLs when a path is present.
                    pdf_path = patent_data.get("pdf", "")
                    pdf_url = f"{self.GOOGLE_PATENTS_PDF_BASE}{pdf_path}" if pdf_path else None

                    thumbnail = patent_data.get("thumbnail", "")
                    thumbnail_url = f"{self.GOOGLE_PATENTS_PDF_BASE}{thumbnail}" if thumbnail else None

                    patent = PatentSearchResult(
                        publication_number=patent_data.get("publication_number", ""),
                        title=self._clean_html(patent_data.get("title", "")),
                        snippet=self._clean_html(patent_data.get("snippet", "")),
                        publication_date=patent_data.get("publication_date"),
                        assignee=patent_data.get("assignee"),
                        inventor=patent_data.get("inventor"),
                        status=status,
                        pdf_url=pdf_url,
                        thumbnail_url=thumbnail_url,
                    )
                    patents.append(patent)

            logger.info(f"Found {total_num} total patents, returning {len(patents)}")
            return {
                "total_results": total_num,
                "patents": [p.to_dict() for p in patents],
            }

        except httpx.HTTPError as e:
            logger.error(f"HTTP error searching patents: {e}")
            return {
                "total_results": 0,
                "patents": [],
                "error": str(e)
            }
        except Exception as e:
            logger.error(f"Error searching patents: {e}")
            return {
                "total_results": 0,
                "patents": [],
                "error": str(e)
            }

    def _clean_html(self, text: str) -> str:
        """Decode the HTML entities observed in Google Patents snippets/titles.

        BUG FIX: the previous version contained ``text.replace("&", "&")`` — a
        no-op (the first argument was almost certainly ``&amp;`` before being
        mangled), so ampersands were never decoded. ``&amp;`` is now decoded
        LAST so that e.g. ``&amp;lt;`` becomes ``&lt;`` rather than ``<``
        (no double-unescaping).
        """
        if not text:
            return ""
        replacements = (
            ("&hellip;", "..."),
            ("\u2026", "..."),  # literal ellipsis character
            ("&lt;", "<"),
            ("&gt;", ">"),
            ("&quot;", '"'),
            ("&#39;", "'"),
            ("&amp;", "&"),  # must be last — see docstring
        )
        for entity, plain in replacements:
            text = text.replace(entity, plain)
        return text.strip()
# Singleton instance: module-level so all importers share one service (and
# therefore one pooled HTTP client) instead of opening a new connection pool
# per use. Construction does no I/O — the client is created lazily on first
# search.
patent_search_service = PatentSearchService()