Add PaddleOCR masking and region detection pipeline
- Created PaddleOCR client for remote server communication
- Implemented text masking + region detection pipeline
- Test results: 100% recall on sample PDF (found both signatures)
- Identified issues: split regions, printed text not fully masked
- Documented 5 solution options in PADDLEOCR_STATUS.md
- Next: Implement region merging and two-stage cleaning
This commit is contained in:
102
test_paddleocr.py
Normal file
102
test_paddleocr.py
Normal file
@@ -0,0 +1,102 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Test PaddleOCR on a sample PDF page."""
|
||||
|
||||
import fitz # PyMuPDF
|
||||
from paddleocr import PaddleOCR
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
import cv2
|
||||
from pathlib import Path
|
||||
|
||||
# Configuration
# Absolute path of the sample PDF whose first page is rendered and OCR'd.
TEST_PDF = "/Volumes/NV2/PDF-Processing/signature-image-output/201301_1324_AI1_page3.pdf"
# Target render resolution; used below as a zoom factor relative to 72 dpi.
DPI = 300

print("="*80)
print("Testing PaddleOCR on macOS Apple Silicon")
print("="*80)

# Step 1: Render PDF to image
# Produces `image`: an RGB uint8 array of shape (height, width, channels)
# built from the first page of TEST_PDF. Any failure aborts the script with
# exit status 1.
print("\n1. Rendering PDF to image...")
doc = None
try:
    doc = fitz.open(TEST_PDF)
    page = doc[0]
    # Scale both axes by DPI/72 so the pixmap is rendered at DPI resolution.
    mat = fitz.Matrix(DPI/72, DPI/72)
    pix = page.get_pixmap(matrix=mat)
    # Reinterpret the raw sample bytes as rows x columns x components.
    image = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)

    if pix.n == 4:  # RGBA
        # Drop the alpha channel so later steps always see 3-channel RGB.
        image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)

    print(f" ✅ Rendered: {image.shape[1]}x{image.shape[0]} pixels")
except Exception as e:
    print(f" ❌ Error: {e}")
    # SystemExit instead of the site-provided exit(), which is meant for the
    # interactive shell and is missing under `python -S` / frozen builds.
    raise SystemExit(1)
finally:
    # Close the document on both the success and the failure path; the
    # original skipped the close whenever rendering raised.
    if doc is not None:
        doc.close()
|
||||
|
||||
# Step 2: Initialize PaddleOCR
# Produces `ocr`, the PaddleOCR engine used by the following steps. Any
# failure prints the traceback plus a platform hint and aborts with status 1.
print("\n2. Initializing PaddleOCR...")
print(" (First run will download models, may take a few minutes...)")
try:
    # Use the correct syntax from official docs
    # The optional document pre-processing stages (orientation classification,
    # unwarping, text-line orientation) are all switched off.
    ocr = PaddleOCR(
        use_doc_orientation_classify=False,
        use_doc_unwarping=False,
        use_textline_orientation=False,
        lang='ch',  # Chinese language
    )
    print(" ✅ PaddleOCR initialized successfully")
except Exception as e:
    print(f" ❌ Error: {e}")
    import traceback
    traceback.print_exc()
    print("\n Note: PaddleOCR requires PaddlePaddle backend.")
    print(" If this is a module import error, PaddlePaddle may not support this platform.")
    # SystemExit instead of the site-provided exit(), which is intended for
    # interactive use and is not guaranteed to exist in every interpreter.
    raise SystemExit(1)
|
||||
|
||||
# Step 3: Run OCR
# Produces `result` in the legacy PaddleOCR layout, regardless of the
# installed PaddleOCR generation:
#     result[0] == [[box, (text, confidence)], ...]
# where `box` is a list of [x, y] corner points. Any failure prints the
# traceback and aborts with status 1.
print("\n3. Running OCR to detect printed text...")
try:
    if hasattr(ocr, "predict"):
        # PaddleOCR 3.x (the API the constructor arguments above belong to):
        # ocr() is only a deprecated alias of predict(), and predict() does
        # not accept the legacy `cls` keyword.
        raw_result = ocr.predict(image)
    else:
        # PaddleOCR 2.x: legacy entry point, angle classifier disabled.
        raw_result = ocr.ocr(image, cls=False)

    first_page = raw_result[0] if raw_result else None
    if isinstance(first_page, dict) and "rec_texts" in first_page:
        # 3.x returns one dict-like result per input image holding parallel
        # lists; zip them back into the per-detection legacy structure.
        polys = first_page.get("rec_polys")
        if polys is None:
            polys = first_page.get("dt_polys", [])
        detections = [
            [np.asarray(poly).tolist(), (text, float(score))]
            for poly, text, score in zip(
                polys, first_page["rec_texts"], first_page["rec_scores"]
            )
        ]
        result = [detections]
    else:
        # Already in the legacy layout (or empty / None).
        result = raw_result

    if result and result[0]:
        print(f" ✅ Detected {len(result[0])} text regions")

        # Show first few detections
        print("\n Sample detections:")
        for i, item in enumerate(result[0][:5]):
            box = item[0]  # Bounding box coordinates
            text, confidence = item[1]  # Detected text and its confidence score
            print(f" {i+1}. Text: '{text}' (confidence: {confidence:.2f})")
            print(f" Box: {box}")
    else:
        print(" ⚠️ No text detected")

except Exception as e:
    print(f" ❌ Error during OCR: {e}")
    import traceback
    traceback.print_exc()
    # SystemExit instead of the site-provided exit(), which is intended for
    # interactive use and is not guaranteed to exist in every interpreter.
    raise SystemExit(1)
|
||||
|
||||
# Step 4: Visualize detection
# Draws every detected text polygon in green on a copy of the rendered page
# and writes the overlay as a PNG. This step is best-effort: a failure is
# reported but does not change the exit status.
print("\n4. Creating visualization...")
try:
    # Work on a copy so the page image itself stays untouched.
    vis_image = image.copy()

    if result and result[0]:
        for item in result[0]:
            # item[0] holds the polygon corners; polylines needs int32 points.
            box = np.array(item[0], dtype=np.int32)
            cv2.polylines(vis_image, [box], True, (0, 255, 0), 2)

    # Save visualization
    output_path = "/Volumes/NV2/PDF-Processing/signature-image-output/paddleocr_test_detection.png"
    # The page is RGB while OpenCV writes BGR, hence the conversion.
    # imwrite signals failure (e.g. missing directory) by returning False
    # rather than raising, so check it instead of reporting a false success.
    if not cv2.imwrite(output_path, cv2.cvtColor(vis_image, cv2.COLOR_RGB2BGR)):
        raise OSError(f"cv2.imwrite could not write {output_path}")
    print(f" ✅ Saved visualization: {output_path}")

except Exception as e:
    print(f" ❌ Error during visualization: {e}")

print("\n" + "="*80)
print("PaddleOCR test completed!")
print("="*80)
|
||||
Reference in New Issue
Block a user