Files
novelty-seeking/backend/app/services/patent_search_service.py
gbanyan 26a56a2a07 feat: Enhance patent search and update research documentation
- Improve patent search service with expanded functionality
- Update PatentSearchPanel UI component
- Add new research_report.md
- Update experimental protocol, literature review, paper outline, and theoretical framework

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-19 15:52:33 +08:00

265 lines
8.9 KiB
Python

"""Patent Search Service using Lens.org API"""
import httpx
import logging
from typing import List, Optional, Dict, Any
from dataclasses import dataclass, asdict
from app.config import settings
logger = logging.getLogger(__name__)
@dataclass
class PatentSearchResult:
    """One patent record, flattened from a Lens.org search hit.

    Field order is part of the positional constructor signature, so it is
    kept exactly as declared.
    """

    # Document identity
    lens_id: str
    doc_number: str
    jurisdiction: str
    kind: str

    # Bibliographic text
    title: str
    abstract: Optional[str]
    date_published: Optional[str]

    # Parties
    applicants: List[str]
    inventors: List[str]

    # Status, classification and family information
    legal_status: Optional[str]
    classifications_cpc: List[str]
    families_simple: List[str]

    # Link to the record on lens.org
    url: str

    def to_dict(self) -> Dict[str, Any]:
        """Return the record as a plain dictionary keyed by field name."""
        as_mapping = asdict(self)
        return as_mapping
class PatentSearchService:
    """Service for searching patents using Lens.org API.

    The HTTP client is created lazily on first use and reused afterwards.
    ``search`` never raises: every failure is reported through an ``"error"``
    key in the returned dictionary.
    """

    LENS_API_URL = "https://api.lens.org/patent/search"

    def __init__(self):
        # Created on demand by _get_client(); None until the first request.
        self._client: Optional[httpx.AsyncClient] = None

    async def _get_client(self) -> "httpx.AsyncClient":
        """Return the shared HTTP client, (re)creating it if absent or closed."""
        if self._client is None or self._client.is_closed:
            self._client = httpx.AsyncClient(
                timeout=30.0,
                follow_redirects=True,
            )
        return self._client

    async def close(self):
        """Close the HTTP client if one is open; safe to call repeatedly."""
        if self._client and not self._client.is_closed:
            await self._client.aclose()
        # Drop the reference so a later request starts from a fresh client.
        self._client = None

    def _get_headers(self) -> Dict[str, str]:
        """Get headers with authorization token.

        Raises:
            ValueError: if ``settings.lens_api_token`` is empty or unset.
        """
        token = settings.lens_api_token
        if not token:
            raise ValueError("LENS_API_TOKEN environment variable is not set")
        return {
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json",
            "Accept": "application/json",
        }

    @staticmethod
    def _error_result(message: str) -> Dict[str, Any]:
        """Build the empty result dictionary used for every failure path."""
        return {
            "total_results": 0,
            "patents": [],
            "error": message,
        }

    @staticmethod
    def _as_dict(value: Any) -> Dict[str, Any]:
        """Return *value* if it is a dict, otherwise an empty dict (covers JSON null)."""
        return value if isinstance(value, dict) else {}

    @staticmethod
    def _as_list(value: Any) -> List[Any]:
        """Return *value* if it is a list, otherwise an empty list (covers JSON null)."""
        return value if isinstance(value, list) else []

    async def search(
        self,
        query: str,
        max_results: int = 10,
    ) -> dict:
        """
        Search Lens.org for relevant patents

        Args:
            query: Search query string, sent as the ``query`` field of the request
            max_results: Maximum number of results to return (sent as ``size``)

        Returns:
            Dict with ``total_results`` and ``patents`` (a list of plain dicts).
            On any failure the dict additionally carries an ``error`` message,
            with ``total_results`` 0 and an empty ``patents`` list.
        """
        try:
            client = await self._get_client()

            # The query string is sent as-is; results are ordered by score.
            request_body = {
                "query": query,
                "size": max_results,
                "sort": [{"_score": "desc"}],
            }

            logger.info("Searching Lens.org patents with query: %s...", query[:100])

            response = await client.post(
                self.LENS_API_URL,
                json=request_body,
                headers=self._get_headers(),
            )

            status = response.status_code
            if status == 401:
                logger.error("Lens.org API authentication failed - check LENS_API_TOKEN")
                return self._error_result("Authentication failed - invalid API token")
            if status == 429:
                logger.warning("Lens.org API rate limit exceeded")
                return self._error_result("Rate limit exceeded - please try again later")
            if status != 200:
                logger.error("Lens.org API returned status %s: %s", status, response.text)
                return self._error_result(f"API returned status {status}")

            # A JSON decode failure is a ValueError; handle it here so it is not
            # mistaken for the missing-token ValueError handled further down.
            try:
                data = response.json()
            except ValueError as e:
                logger.error("Lens.org API returned a body that is not valid JSON: %s", e)
                return self._error_result("Invalid JSON in Lens.org API response")
            if not isinstance(data, dict):
                logger.error(
                    "Lens.org API returned unexpected JSON of type %s",
                    type(data).__name__,
                )
                return self._error_result("Unexpected response format from Lens.org API")

            total_results = data.get("total", 0)
            # Entries that are not objects cannot be parsed and are skipped.
            patents = [
                self._parse_patent(item).to_dict()
                for item in self._as_list(data.get("data"))
                if isinstance(item, dict)
            ]

            logger.info("Found %s total patents, returning %s", total_results, len(patents))
            return {
                "total_results": total_results,
                "patents": patents,
            }
        except ValueError as e:
            # Raised by _get_headers() when no API token is configured.
            logger.error("Configuration error: %s", e)
            return self._error_result(str(e))
        except httpx.HTTPError as e:
            logger.error("HTTP error searching patents: %s", e)
            return self._error_result(str(e))
        except Exception as e:
            # Boundary handler: keep the traceback, still return an error dict.
            logger.exception("Error searching patents: %s", e)
            return self._error_result(str(e))

    def _extract_party_names(self, party_data: Any) -> List[str]:
        """Collect non-empty ``extracted_name.value`` strings from a party list."""
        names = []
        for party in self._as_list(party_data):
            extracted = self._as_dict(self._as_dict(party).get("extracted_name"))
            name = extracted.get("value")
            if name:
                names.append(name)
        return names

    def _extract_cpc_symbols(self, cpc_data: Any) -> List[str]:
        """Collect non-empty ``symbol`` values from CPC classification data.

        Accepts a bare list of classification objects, or an object wrapping
        that list under ``classifications``.
        NOTE(review): the wrapped form is believed to be what Lens.org returns;
        confirm against the Lens patent response schema.
        """
        if isinstance(cpc_data, dict):
            cpc_data = cpc_data.get("classifications")
        symbols = []
        for entry in self._as_list(cpc_data):
            symbol = self._as_dict(entry).get("symbol")
            if symbol:
                symbols.append(symbol)
        return symbols

    def _extract_family_members(self, families_data: Any) -> List[str]:
        """Collect the lens_id of each simple-family member.

        Reads ``simple`` first and falls back to ``simple_family``.
        NOTE(review): ``simple_family`` is believed to be the key Lens.org
        uses; confirm against the Lens patent response schema.
        """
        families = self._as_dict(families_data)
        simple_family = self._as_dict(
            families.get("simple") or families.get("simple_family")
        )
        return [
            member["lens_id"]
            for member in self._as_list(simple_family.get("members"))
            if isinstance(member, dict) and member.get("lens_id")
        ]

    def _parse_patent(self, item: Dict[str, Any]) -> "PatentSearchResult":
        """Parse a single patent result from Lens.org response.

        Missing or null sections are treated as empty, so one incomplete
        record does not abort the whole search.
        """
        # Identity fields: a JSON null is normalised to "" to match the str type.
        lens_id = item.get("lens_id") or ""
        jurisdiction = item.get("jurisdiction") or ""
        doc_number = item.get("doc_number") or ""
        kind = item.get("kind") or ""

        # biblio holds the title, parties and classifications.
        biblio = self._as_dict(item.get("biblio"))
        parties = self._as_dict(biblio.get("parties"))

        title = self._extract_text_with_lang(biblio.get("invention_title"))
        # The abstract lives at the top level of the record, not under biblio.
        abstract = self._extract_text_with_lang(item.get("abstract"))

        legal_status = self._as_dict(item.get("legal_status")).get("patent_status")

        # Link to the Lens.org patent page; empty when there is no lens_id.
        url = f"https://www.lens.org/lens/patent/{lens_id}" if lens_id else ""

        return PatentSearchResult(
            lens_id=lens_id,
            doc_number=doc_number,
            jurisdiction=jurisdiction,
            kind=kind,
            title=title,
            abstract=abstract,
            date_published=item.get("date_published"),
            applicants=self._extract_party_names(parties.get("applicants")),
            inventors=self._extract_party_names(parties.get("inventors")),
            legal_status=legal_status,
            classifications_cpc=self._extract_cpc_symbols(
                biblio.get("classifications_cpc")
            ),
            families_simple=self._extract_family_members(item.get("families")),
            url=url,
        )

    def _extract_text_with_lang(self, data: Any, prefer_lang: str = "en") -> str:
        """Extract text from Lens.org language-tagged list, preferring specified language.

        Args:
            data: A plain string, or a list of ``{"lang": ..., "text": ...}``
                objects. Anything else yields an empty string.
            prefer_lang: Language code whose entry is returned when present.

        Returns:
            The chosen text; always a string, never None.
        """
        if not data:
            return ""
        if isinstance(data, str):
            return data
        if not isinstance(data, list):
            return ""

        # Prefer the requested language.
        for entry in data:
            if isinstance(entry, dict) and entry.get("lang") == prefer_lang:
                return entry.get("text") or ""

        # Otherwise fall back to the first entry, whatever its language.
        first = data[0]
        if isinstance(first, dict):
            return first.get("text") or ""
        return "" if first is None else str(first)
# Singleton instance, created at import time. Constructing the service opens
# no connection: the HTTP client is only created on the first request.
patent_search_service = PatentSearchService()