Files
pdf_signature_extraction/paddleocr_client.py
gbanyan 479d4e0019 Add PaddleOCR masking and region detection pipeline
- Created PaddleOCR client for remote server communication
- Implemented text masking + region detection pipeline
- Test results: 100% recall on sample PDF (found both signatures)
- Identified issues: split regions, printed text not fully masked
- Documented 5 solution options in PADDLEOCR_STATUS.md
- Next: Implement region merging and two-stage cleaning
2025-10-28 22:28:18 +08:00

170 lines
4.7 KiB
Python

#!/usr/bin/env python3
"""
PaddleOCR Client
Connects to remote PaddleOCR server for OCR inference
"""
import requests
import base64
import numpy as np
from typing import List, Dict, Tuple, Optional
from PIL import Image
from io import BytesIO
class PaddleOCRClient:
    """Client for a remote PaddleOCR server reached over HTTP.

    Two endpoints relative to ``server_url`` are used: ``GET /health`` and
    ``POST /ocr`` (JSON body with a base64-encoded PNG under ``"image"``).
    """

    def __init__(self, server_url: str = "http://192.168.30.36:5555", timeout: float = 30):
        """
        Initialize PaddleOCR client.

        Args:
            server_url: URL of the PaddleOCR server (trailing slashes are stripped)
            timeout: seconds to wait for an OCR request before giving up
        """
        self.server_url = server_url.rstrip('/')
        self.timeout = timeout  # seconds

    def health_check(self) -> bool:
        """
        Check if server is healthy.

        Returns:
            True if ``/health`` answers HTTP 200 with a JSON ``status`` of
            ``'ok'``, False otherwise (including any request or parsing error)
        """
        try:
            response = requests.get(
                f"{self.server_url}/health",
                timeout=5
            )
            return response.status_code == 200 and response.json().get('status') == 'ok'
        except Exception as e:
            # Deliberate best-effort probe: every failure means "not healthy".
            print(f"Health check failed: {e}")
            return False

    @staticmethod
    def _encode_image(image: np.ndarray) -> str:
        """Serialize an image array to a base64-encoded PNG string."""
        if len(image.shape) == 2:  # Grayscale: handed to PIL with its own dtype
            pil_image = Image.fromarray(image)
        else:  # RGB or RGBA
            pil_image = Image.fromarray(image.astype(np.uint8))
        buffered = BytesIO()
        pil_image.save(buffered, format="PNG")
        return base64.b64encode(buffered.getvalue()).decode('utf-8')

    def ocr(self, image: np.ndarray) -> List[Dict]:
        """
        Perform OCR on an image.

        Args:
            image: numpy array of the image (RGB format)

        Returns:
            List of detection results, each containing:
            - box: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
            - text: detected text string
            - confidence: confidence score (0-1)
            An empty list is returned when the server reports no results.

        Raises:
            Exception: if the request times out, the server cannot be
                reached, the HTTP/JSON exchange fails, or the server
                reports ``success`` as falsy. Transport errors are chained
                as ``__cause__``.
        """
        payload = {"image": self._encode_image(image)}

        # Only the HTTP exchange lives in the try block, so a server-reported
        # OCR failure below is not re-wrapped as a transport failure.
        try:
            response = requests.post(
                f"{self.server_url}/ocr",
                json=payload,
                timeout=self.timeout
            )
            response.raise_for_status()
            result = response.json()
        except requests.exceptions.Timeout as exc:
            raise Exception(f"OCR request timed out after {self.timeout} seconds") from exc
        except requests.exceptions.ConnectionError as exc:
            raise Exception(f"Could not connect to server at {self.server_url}") from exc
        except Exception as exc:
            raise Exception(f"OCR request failed: {exc}") from exc

        if not isinstance(result, dict):
            raise Exception(
                f"OCR request failed: unexpected response type {type(result).__name__}"
            )
        if not result.get('success'):
            error_msg = result.get('error', 'Unknown error')
            raise Exception(f"OCR failed: {error_msg}")
        # `or []` keeps the return type a list even if the server sends null.
        return result.get('results') or []

    def get_text_boxes(self, image: np.ndarray) -> List[Tuple[int, int, int, int]]:
        """
        Get bounding boxes of all detected text.

        Each polygon returned by ``ocr`` is reduced to an axis-aligned
        rectangle. The far edge is rounded up so that the rectangle still
        covers the polygon when coordinates are fractional. Detections with
        a missing or empty ``box`` are skipped.

        Args:
            image: numpy array of the image

        Returns:
            List of bounding boxes as (x, y, w, h) tuples
        """
        boxes = []
        for result in self.ocr(image):
            box = result.get('box')  # [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
            if not box:
                # A malformed detection should not abort the whole page.
                continue
            xs = [point[0] for point in box]
            ys = [point[1] for point in box]
            x = int(min(xs))
            y = int(min(ys))
            # Round the far edge up; plain truncation of (max - min) can
            # leave the last partial pixel column/row outside the box.
            w = int(np.ceil(max(xs))) - x
            h = int(np.ceil(max(ys))) - y
            boxes.append((x, y, w, h))
        return boxes

    def __repr__(self):
        return f"PaddleOCRClient(server_url='{self.server_url}')"
# Convenience function
def create_ocr_client(server_url: str = "http://192.168.30.36:5555") -> PaddleOCRClient:
    """
    Build a PaddleOCR client and verify the server answers its health check.

    Args:
        server_url: URL of the PaddleOCR server

    Returns:
        PaddleOCRClient instance whose health check succeeded

    Raises:
        Exception if the health check does not succeed
    """
    ocr_client = PaddleOCRClient(server_url)

    # Happy path first: hand back the client as soon as the probe passes.
    if ocr_client.health_check():
        return ocr_client

    message = (
        f"PaddleOCR server at {server_url} is not responding. "
        "Make sure the server is running on the Linux machine."
    )
    raise Exception(message)
if __name__ == "__main__":
    # Smoke test: connect to the default server and OCR a blank image.
    print("Testing PaddleOCR client...")
    try:
        client = create_ocr_client()
        print(f"✅ Connected to server: {client.server_url}")

        # 100x100 three-channel canvas with every value set to 255
        blank_canvas = np.full((100, 100, 3), 255, dtype=np.uint8)
        print("Running test OCR...")
        detections = client.ocr(blank_canvas)
        region_count = len(detections)
        print(f"✅ OCR test successful! Found {region_count} text regions")
    except Exception as e:
        print(f"❌ Error: {e}")