Add PaddleOCR masking and region detection pipeline
- Created PaddleOCR client for remote server communication
- Implemented text masking + region detection pipeline
- Test results: 100% recall on sample PDF (found both signatures)
- Identified issues: split regions, printed text not fully masked
- Documented 5 solution options in PADDLEOCR_STATUS.md
- Next: implement region merging and two-stage cleaning
This commit is contained in:
169
paddleocr_client.py
Normal file
169
paddleocr_client.py
Normal file
@@ -0,0 +1,169 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
PaddleOCR Client
|
||||
Connects to remote PaddleOCR server for OCR inference
|
||||
"""
|
||||
|
||||
import requests
|
||||
import base64
|
||||
import numpy as np
|
||||
from typing import List, Dict, Tuple, Optional
|
||||
from PIL import Image
|
||||
from io import BytesIO
|
||||
|
||||
class PaddleOCRClient:
    """Client for a remote PaddleOCR server reached over HTTP.

    Two endpoints below ``server_url`` are used:

    - ``GET /health``: expected to answer with JSON whose ``status`` is ``'ok'``.
    - ``POST /ocr``: receives ``{"image": <base64-encoded PNG>}`` and is
      expected to answer with JSON carrying ``success``, ``results`` and,
      on failure, ``error``.
    """

    def __init__(self, server_url: str = "http://192.168.30.36:5555",
                 timeout: float = 30):
        """
        Initialize PaddleOCR client.

        Args:
            server_url: URL of the PaddleOCR server (a trailing slash is
                stripped so endpoint paths can be appended safely)
            timeout: timeout in seconds applied to OCR requests
        """
        self.server_url = server_url.rstrip('/')
        self.timeout = timeout  # seconds

    def health_check(self) -> bool:
        """
        Check if server is healthy.

        Returns:
            True if the server answers HTTP 200 with ``status == 'ok'``,
            False otherwise (including any network or decoding error)
        """
        try:
            response = requests.get(f"{self.server_url}/health", timeout=5)
            return response.status_code == 200 and response.json().get('status') == 'ok'
        except Exception as e:
            # Deliberately broad: a health probe reports False, never raises.
            print(f"Health check failed: {e}")
            return False

    @staticmethod
    def _encode_image(image: np.ndarray) -> str:
        """Serialize an image array to a base64-encoded PNG string."""
        if len(image.shape) == 2:  # Grayscale: keep the array's own dtype
            pil_image = Image.fromarray(image)
        else:  # RGB or RGBA
            pil_image = Image.fromarray(image.astype(np.uint8))

        buffered = BytesIO()
        pil_image.save(buffered, format="PNG")
        return base64.b64encode(buffered.getvalue()).decode('utf-8')

    @staticmethod
    def _polygon_to_bbox(box) -> Tuple[int, int, int, int]:
        """Convert a polygon ``[[x, y], ...]`` to an ``(x, y, w, h)`` tuple."""
        xs = [point[0] for point in box]
        ys = [point[1] for point in box]
        left, top = min(xs), min(ys)
        return (int(left), int(top), int(max(xs) - left), int(max(ys) - top))

    def ocr(self, image: np.ndarray) -> List[Dict]:
        """
        Perform OCR on an image.

        Args:
            image: numpy array of the image (RGB format)

        Returns:
            List of detection results, each containing:
            - box: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
            - text: detected text string
            - confidence: confidence score (0-1)

        Raises:
            Exception if the request fails (timeout, connection error, HTTP
            error, undecodable body) or the server reports a failure. The
            underlying error is attached as ``__cause__`` where one exists.
        """
        image_base64 = self._encode_image(image)

        # Only the transport and decoding steps live inside the try block, so
        # the server-reported failure below is not re-wrapped by the catch-all.
        try:
            response = requests.post(
                f"{self.server_url}/ocr",
                json={"image": image_base64},
                timeout=self.timeout,
            )
            response.raise_for_status()
            result = response.json()
        except requests.exceptions.Timeout as exc:
            raise Exception(
                f"OCR request timed out after {self.timeout} seconds"
            ) from exc
        except requests.exceptions.ConnectionError as exc:
            raise Exception(
                f"Could not connect to server at {self.server_url}"
            ) from exc
        except Exception as exc:
            raise Exception(f"OCR request failed: {str(exc)}") from exc

        if not isinstance(result, dict):
            raise Exception(
                "OCR request failed: unexpected response type "
                f"{type(result).__name__}"
            )

        if not result.get('success'):
            error_msg = result.get('error', 'Unknown error')
            raise Exception(f"OCR failed: {error_msg}")

        # A null 'results' value is treated the same as a missing one.
        return result.get('results') or []

    def get_text_boxes(self, image: np.ndarray) -> List[Tuple[int, int, int, int]]:
        """
        Get bounding boxes of all detected text.

        Args:
            image: numpy array of the image

        Returns:
            List of axis-aligned bounding boxes as (x, y, w, h) tuples.
            Results without a usable ``box`` polygon are skipped.
        """
        boxes = []
        for result in self.ocr(image):
            box = result.get('box')  # [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
            if not box:
                # No polygon means no region to report for this entry.
                continue
            boxes.append(self._polygon_to_bbox(box))
        return boxes

    def __repr__(self):
        return f"PaddleOCRClient(server_url='{self.server_url}')"
|
||||
|
||||
|
||||
# Convenience function
|
||||
def create_ocr_client(server_url: str = "http://192.168.30.36:5555") -> PaddleOCRClient:
    """
    Build a PaddleOCRClient and verify the server answers its health check.

    Args:
        server_url: URL of the PaddleOCR server

    Returns:
        A PaddleOCRClient whose health check succeeded

    Raises:
        Exception if the health check does not succeed
    """
    ocr_client = PaddleOCRClient(server_url)

    # Happy path first: hand back the client as soon as the probe succeeds.
    if ocr_client.health_check():
        return ocr_client

    message = (
        f"PaddleOCR server at {server_url} is not responding. "
        "Make sure the server is running on the Linux machine."
    )
    raise Exception(message)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: connect to the default server and OCR a blank page.
    print("Testing PaddleOCR client...")

    try:
        ocr_client = create_ocr_client()
        print(f"✅ Connected to server: {ocr_client.server_url}")

        # All-white 100x100 RGB image.
        blank_page = np.full((100, 100, 3), 255, dtype=np.uint8)

        print("Running test OCR...")
        detections = ocr_client.ocr(blank_page)
        print(f"✅ OCR test successful! Found {len(detections)} text regions")

    except Exception as err:
        print(f"❌ Error: {err}")
|
||||
Reference in New Issue
Block a user