chore: save local changes
@@ -1,12 +1,12 @@
 """
-LLM Deduplication Service - 使用 LLM 成對比較進行去重
+LLM Deduplication Service - Using LLM pairwise comparison for deduplication

-讓 LLM 判斷兩個描述是否語意重複,透過並行處理加速。
+Let LLM determine whether two descriptions are semantically duplicate, accelerated by parallel processing.
 """

 import asyncio
 import logging
-from typing import List, Tuple, Optional
+from typing import List, Tuple, Optional, Literal

 import httpx
 import numpy as np
@@ -18,6 +18,7 @@ from ..models.schemas import (
     DeduplicationMethod,
     DescriptionGroup,
 )
+from ..prompts.language_config import LanguageType

 logger = logging.getLogger(__name__)

@@ -31,27 +32,20 @@ class LLMDeduplicationService:
         self.client = httpx.AsyncClient(timeout=60.0)
         self.max_concurrent = 5  # 最大並行數,避免 Ollama 過載

-    async def compare_pair(
-        self,
-        desc1: str,
-        desc2: str,
-        model: str,
-        semaphore: asyncio.Semaphore
-    ) -> bool:
-        """
-        讓 LLM 判斷兩個描述是否語意重複
+    def _get_comparison_prompt(self, desc1: str, desc2: str, lang: LanguageType = "zh") -> str:
+        """Get comparison prompt in the specified language"""
+        if lang == "en":
+            return f"""Determine whether the following two innovative descriptions express the same or very similar concepts:

-        Args:
-            desc1: 第一個描述
-            desc2: 第二個描述
-            model: LLM 模型名稱
-            semaphore: 並行控制信號量
+Description 1: {desc1}

-        Returns:
-            bool: 是否為重複描述
-        """
-        async with semaphore:  # 控制並行數
-            prompt = f"""判斷以下兩個創新描述是否表達相同或非常相似的概念:
+Description 2: {desc2}

+If both descriptions essentially express the same or very similar innovative concept, answer "YES"
+If the two descriptions express different innovative concepts, answer "NO"
+Only answer YES or NO, no other text"""
+        else:
+            return f"""判斷以下兩個創新描述是否表達相同或非常相似的概念:
+
 描述1: {desc1}

@@ -61,6 +55,30 @@ class LLMDeduplicationService:
 如果兩者描述不同的創新概念,回答 "NO"
 只回答 YES 或 NO,不要其他文字"""

+    async def compare_pair(
+        self,
+        desc1: str,
+        desc2: str,
+        model: str,
+        semaphore: asyncio.Semaphore,
+        lang: LanguageType = "zh"
+    ) -> bool:
+        """
+        Let LLM determine whether two descriptions are semantically duplicate
+
+        Args:
+            desc1: First description
+            desc2: Second description
+            model: LLM model name
+            semaphore: Concurrency control semaphore
+            lang: Language for the prompt
+
+        Returns:
+            bool: Whether the descriptions are duplicates
+        """
+        async with semaphore:  # Control concurrency
+            prompt = self._get_comparison_prompt(desc1, desc2, lang)
+
             try:
                 response = await self.client.post(
                     f"{self.base_url}/api/generate",
@@ -86,26 +104,28 @@ class LLMDeduplicationService:
     async def compare_batch(
         self,
         pairs: List[Tuple[int, int, str, str]],
-        model: str
+        model: str,
+        lang: LanguageType = "zh"
     ) -> List[Tuple[int, int, bool]]:
         """
-        並行批次比較多個描述對
+        Parallel batch comparison of multiple description pairs

         Args:
-            pairs: 待比較的配對列表 [(i, j, desc1, desc2), ...]
-            model: LLM 模型名稱
+            pairs: List of pairs to compare [(i, j, desc1, desc2), ...]
+            model: LLM model name
+            lang: Language for the prompt

         Returns:
-            比較結果列表 [(i, j, is_similar), ...]
+            List of comparison results [(i, j, is_similar), ...]
         """
         semaphore = asyncio.Semaphore(self.max_concurrent)

         async def compare_one(pair: Tuple[int, int, str, str]) -> Tuple[int, int, bool]:
             i, j, desc1, desc2 = pair
-            is_similar = await self.compare_pair(desc1, desc2, model, semaphore)
+            is_similar = await self.compare_pair(desc1, desc2, model, semaphore, lang)
             return (i, j, is_similar)

-        # 使用 asyncio.gather 並行執行所有比較
+        # Use asyncio.gather to execute all comparisons in parallel
         results = await asyncio.gather(*[compare_one(p) for p in pairs])
         return results

@@ -144,17 +164,19 @@ class LLMDeduplicationService:
     async def deduplicate(
         self,
         descriptions: List[ExpertTransformationDescription],
-        model: Optional[str] = None
+        model: Optional[str] = None,
+        lang: LanguageType = "zh"
     ) -> DeduplicationResult:
         """
-        使用 LLM 成對比較進行去重
+        Use LLM pairwise comparison for deduplication

         Args:
-            descriptions: 要去重的描述列表
-            model: LLM 模型名稱
+            descriptions: List of descriptions to deduplicate
+            model: LLM model name
+            lang: Language for the prompt

         Returns:
-            DeduplicationResult: 去重結果
+            DeduplicationResult: Deduplication result
         """
         model = model or self.default_model

@@ -188,10 +210,10 @@ class LLMDeduplicationService:
                 ))

         total_pairs = len(pairs)
-        logger.info(f"LLM deduplication: {total_pairs} pairs to compare (parallel={self.max_concurrent}, model={model})")
+        logger.info(f"LLM deduplication: {total_pairs} pairs to compare (parallel={self.max_concurrent}, model={model}, lang={lang})")

-        # 並行批次比較
-        results = await self.compare_batch(pairs, model)
+        # Parallel batch comparison
+        results = await self.compare_batch(pairs, model, lang)

         # 填入相似度矩陣
         for i, j, is_similar in results:
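For context, a minimal usage sketch of the updated deduplication API follows. The import path, model name, and sample pair are illustrative assumptions (the service's file path is not shown in this commit), and it presumes a reachable Ollama instance behind the service's base_url.

import asyncio

# Hypothetical module path for the deduplication service shown in the diff above
from app.services.llm_deduplication_service import LLMDeduplicationService


async def main():
    service = LLMDeduplicationService()
    # Each pair carries its indices so results can be mapped back: (i, j, desc1, desc2)
    pairs = [(0, 1, "A reusable booster stage", "A rocket stage designed for reuse")]
    # lang selects the comparison prompt language: "zh" (default) or "en"
    results = await service.compare_batch(pairs, model="llama3", lang="en")  # model name is illustrative
    for i, j, is_similar in results:
        print(i, j, is_similar)

asyncio.run(main())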
backend/app/services/patent_search_service.py (new file, 195 lines)
@@ -0,0 +1,195 @@
"""Patent Search Service using Google Patents XHR API"""

import httpx
import logging
from typing import List, Optional
from urllib.parse import quote_plus

logger = logging.getLogger(__name__)


class PatentSearchResult:
    """Single patent search result"""
    def __init__(
        self,
        publication_number: str,
        title: str,
        snippet: str,
        publication_date: Optional[str],
        assignee: Optional[str],
        inventor: Optional[str],
        status: str,
        pdf_url: Optional[str] = None,
        thumbnail_url: Optional[str] = None,
    ):
        self.publication_number = publication_number
        self.title = title
        self.snippet = snippet
        self.publication_date = publication_date
        self.assignee = assignee
        self.inventor = inventor
        self.status = status
        self.pdf_url = pdf_url
        self.thumbnail_url = thumbnail_url

    def to_dict(self):
        return {
            "publication_number": self.publication_number,
            "title": self.title,
            "snippet": self.snippet,
            "publication_date": self.publication_date,
            "assignee": self.assignee,
            "inventor": self.inventor,
            "status": self.status,
            "pdf_url": self.pdf_url,
            "thumbnail_url": self.thumbnail_url,
        }


class PatentSearchService:
    """Service for searching patents using Google Patents"""

    GOOGLE_PATENTS_XHR_URL = "https://patents.google.com/xhr/query"
    GOOGLE_PATENTS_PDF_BASE = "https://patentimages.storage.googleapis.com/"

    def __init__(self):
        self._client: Optional[httpx.AsyncClient] = None

    # Browser-like headers to avoid being blocked
    DEFAULT_HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://patents.google.com/",
        "Origin": "https://patents.google.com",
    }

    async def _get_client(self) -> httpx.AsyncClient:
        if self._client is None or self._client.is_closed:
            self._client = httpx.AsyncClient(
                timeout=30.0,
                headers=self.DEFAULT_HEADERS,
                follow_redirects=True,
            )
        return self._client

    async def close(self):
        if self._client and not self._client.is_closed:
            await self._client.aclose()

    async def search(
        self,
        query: str,
        max_results: int = 10,
    ) -> dict:
        """
        Search Google Patents for relevant patents

        Args:
            query: Search query (can be a description or keywords)
            max_results: Maximum number of results to return

        Returns:
            Dict with total_results count and list of patent results
        """
        try:
            client = await self._get_client()

            # URL encode the query
            encoded_query = quote_plus(query)
            url = f"{self.GOOGLE_PATENTS_XHR_URL}?url=q%3D{encoded_query}&exp=&tags="

            logger.info(f"Searching patents with query: {query[:100]}...")

            response = await client.get(url)

            if response.status_code != 200:
                logger.error(f"Google Patents API returned status {response.status_code}")
                return {
                    "total_results": 0,
                    "patents": [],
                    "error": f"API returned status {response.status_code}"
                }

            data = response.json()

            # Parse results
            results = data.get("results", {})
            total_num = results.get("total_num_results", 0)
            clusters = results.get("cluster", [])

            patents: List[PatentSearchResult] = []

            if clusters and len(clusters) > 0:
                patent_results = clusters[0].get("result", [])

                for item in patent_results[:max_results]:
                    patent_data = item.get("patent", {})
                    family_meta = patent_data.get("family_metadata", {})
                    aggregated = family_meta.get("aggregated", {})
                    country_status = aggregated.get("country_status", [])

                    status = "UNKNOWN"
                    if country_status and len(country_status) > 0:
                        best_stage = country_status[0].get("best_patent_stage", {})
                        status = best_stage.get("state", "UNKNOWN")

                    # Build PDF URL if available
                    pdf_path = patent_data.get("pdf", "")
                    pdf_url = f"{self.GOOGLE_PATENTS_PDF_BASE}{pdf_path}" if pdf_path else None

                    # Build thumbnail URL
                    thumbnail = patent_data.get("thumbnail", "")
                    thumbnail_url = f"{self.GOOGLE_PATENTS_PDF_BASE}{thumbnail}" if thumbnail else None

                    patent = PatentSearchResult(
                        publication_number=patent_data.get("publication_number", ""),
                        title=self._clean_html(patent_data.get("title", "")),
                        snippet=self._clean_html(patent_data.get("snippet", "")),
                        publication_date=patent_data.get("publication_date"),
                        assignee=patent_data.get("assignee"),
                        inventor=patent_data.get("inventor"),
                        status=status,
                        pdf_url=pdf_url,
                        thumbnail_url=thumbnail_url,
                    )
                    patents.append(patent)

            logger.info(f"Found {total_num} total patents, returning {len(patents)}")

            return {
                "total_results": total_num,
                "patents": [p.to_dict() for p in patents],
            }

        except httpx.HTTPError as e:
            logger.error(f"HTTP error searching patents: {e}")
            return {
                "total_results": 0,
                "patents": [],
                "error": str(e)
            }
        except Exception as e:
            logger.error(f"Error searching patents: {e}")
            return {
                "total_results": 0,
                "patents": [],
                "error": str(e)
            }

    def _clean_html(self, text: str) -> str:
        """Remove HTML entities and tags from text"""
        if not text:
            return ""
        # Replace common HTML entities
        text = text.replace("&hellip;", "...")
        text = text.replace("&amp;", "&")
        text = text.replace("&lt;", "<")
        text = text.replace("&gt;", ">")
        text = text.replace("&quot;", '"')
        text = text.replace("&#39;", "'")
        return text.strip()


# Singleton instance
patent_search_service = PatentSearchService()
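A quick way to exercise the new module is through its module-level singleton. The sketch below is illustrative: the query string is made up, and the import path assumes the backend package is importable as app.

import asyncio

from app.services.patent_search_service import patent_search_service  # path assumed from the new file location


async def main():
    # search() returns a dict with "total_results" and a list of patent dicts (see to_dict above)
    result = await patent_search_service.search("solar panel cleaning robot", max_results=5)
    print("total:", result["total_results"])
    for p in result["patents"]:
        print(p["publication_number"], p["status"], p["title"])
    await patent_search_service.close()

asyncio.run(main())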