- Created PaddleOCR client for remote server communication
- Implemented text masking + region detection pipeline
- Test results: 100% recall on sample PDF (found both signatures)
- Identified issues: split regions, printed text not fully masked
- Documented 5 solution options in PADDLEOCR_STATUS.md
- Next: Implement region merging and two-stage cleaning
170 lines
4.7 KiB
Python
170 lines
4.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
PaddleOCR Client
|
|
Connects to remote PaddleOCR server for OCR inference
|
|
"""
|
|
|
|
import requests
|
|
import base64
|
|
import numpy as np
|
|
from typing import List, Dict, Tuple, Optional
|
|
from PIL import Image
|
|
from io import BytesIO
|
|
|
|
class PaddleOCRClient:
    """Client for remote PaddleOCR server.

    Talks to two endpoints relative to ``server_url``: ``GET /health`` for
    liveness and ``POST /ocr`` for inference on a base64-encoded PNG.
    """

    def __init__(self, server_url: str = "http://192.168.30.36:5555", timeout: float = 30):
        """
        Initialize PaddleOCR client.

        Args:
            server_url: URL of the PaddleOCR server (trailing slashes are stripped)
            timeout: timeout in seconds applied to OCR requests
        """
        self.server_url = server_url.rstrip('/')
        self.timeout = timeout  # seconds

    def health_check(self) -> bool:
        """
        Check if server is healthy.

        Never raises: any failure is printed and reported as ``False``.

        Returns:
            True if the server answers 200 with ``{"status": "ok"}``, False otherwise
        """
        try:
            response = requests.get(
                f"{self.server_url}/health",
                timeout=5
            )
            return response.status_code == 200 and response.json().get('status') == 'ok'
        except Exception as e:
            # Deliberately broad: this is a best-effort boolean probe.
            print(f"Health check failed: {e}")
            return False

    @staticmethod
    def _encode_image(image: np.ndarray) -> str:
        """Return *image* serialized as a base64-encoded PNG string."""
        if image.ndim == 2:  # Grayscale: handed to PIL with its own dtype
            pil_image = Image.fromarray(image)
        else:  # RGB or RGBA: cast to uint8 (values are not rescaled)
            pil_image = Image.fromarray(image.astype(np.uint8))

        buffered = BytesIO()
        pil_image.save(buffered, format="PNG")
        return base64.b64encode(buffered.getvalue()).decode('utf-8')

    @staticmethod
    def _polygon_to_xywh(box) -> Tuple[int, int, int, int]:
        """Convert a polygon ``[[x, y], ...]`` to its enclosing ``(x, y, w, h)`` box."""
        xs = [point[0] for point in box]
        ys = [point[1] for point in box]
        left, top = min(xs), min(ys)
        return (int(left), int(top), int(max(xs) - left), int(max(ys) - top))

    def ocr(self, image: np.ndarray) -> List[Dict]:
        """
        Perform OCR on an image.

        Args:
            image: numpy array of the image (RGB format)

        Returns:
            List of detection results, each containing:
            - box: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
            - text: detected text string
            - confidence: confidence score (0-1)

        Raises:
            RuntimeError: if the request times out, the server cannot be
                reached, the HTTP/JSON exchange fails, or the server reports
                ``success`` as false. The underlying error is chained as
                ``__cause__``.
        """
        image_base64 = self._encode_image(image)

        # Only the network exchange lives inside the try, so the
        # server-reported failure below is not re-wrapped a second time.
        try:
            response = requests.post(
                f"{self.server_url}/ocr",
                json={"image": image_base64},
                timeout=self.timeout
            )
            response.raise_for_status()
            result = response.json()
        except requests.exceptions.Timeout as e:
            raise RuntimeError(
                f"OCR request timed out after {self.timeout} seconds"
            ) from e
        except requests.exceptions.ConnectionError as e:
            raise RuntimeError(
                f"Could not connect to server at {self.server_url}"
            ) from e
        except (requests.exceptions.RequestException, ValueError) as e:
            # HTTP error status, or a body that is not valid JSON.
            raise RuntimeError(f"OCR request failed: {str(e)}") from e

        if not isinstance(result, dict):
            raise RuntimeError("OCR request failed: unexpected response format")

        if not result.get('success'):
            error_msg = result.get('error', 'Unknown error')
            raise RuntimeError(f"OCR failed: {error_msg}")

        # A null/missing "results" field is treated as "nothing detected".
        return result.get('results') or []

    def get_text_boxes(self, image: np.ndarray) -> List[Tuple[int, int, int, int]]:
        """
        Get bounding boxes of all detected text.

        Detections without a usable ``box`` polygon are skipped rather than
        aborting the whole page.

        Args:
            image: numpy array of the image

        Returns:
            List of bounding boxes as (x, y, w, h) tuples
        """
        boxes = []
        for result in self.ocr(image):
            box = result.get('box')  # [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
            if not box:
                continue
            boxes.append(self._polygon_to_xywh(box))
        return boxes

    def __repr__(self):
        return f"PaddleOCRClient(server_url={self.server_url!r})"
|
|
|
|
|
|
# Convenience function
|
|
def create_ocr_client(server_url: str = "http://192.168.30.36:5555") -> PaddleOCRClient:
    """
    Build a PaddleOCR client and verify the server responds.

    Args:
        server_url: URL of the PaddleOCR server

    Returns:
        PaddleOCRClient instance whose health check succeeded

    Raises:
        Exception if the server's health check does not pass
    """
    ocr_client = PaddleOCRClient(server_url)

    # Happy path first: hand back the client as soon as the probe passes.
    if ocr_client.health_check():
        return ocr_client

    raise Exception(
        f"PaddleOCR server at {server_url} is not responding. "
        "Make sure the server is running on the Linux machine."
    )
|
|
|
|
|
|
if __name__ == "__main__":
    # Smoke test: connect to the default server and OCR a blank image.
    print("Testing PaddleOCR client...")

    try:
        ocr_client = create_ocr_client()
        print(f"✅ Connected to server: {ocr_client.server_url}")

        # All-white 100x100 RGB image
        blank_page = np.full((100, 100, 3), 255, dtype=np.uint8)

        print("Running test OCR...")
        detections = ocr_client.ocr(blank_page)
        print(f"✅ OCR test successful! Found {len(detections)} text regions")

    except Exception as err:
        print(f"❌ Error: {err}")
|