Files
pdf_signature_extraction/test_paddleocr.py
gbanyan 479d4e0019 Add PaddleOCR masking and region detection pipeline
- Created PaddleOCR client for remote server communication
- Implemented text masking + region detection pipeline
- Test results: 100% recall on sample PDF (found both signatures)
- Identified issues: split regions, printed text not fully masked
- Documented 5 solution options in PADDLEOCR_STATUS.md
- Next: Implement region merging and two-stage cleaning
2025-10-28 22:28:18 +08:00

103 lines
3.0 KiB
Python

#!/usr/bin/env python3
"""Test PaddleOCR on a sample PDF page."""
import fitz # PyMuPDF
from paddleocr import PaddleOCR
import numpy as np
from PIL import Image
import cv2
from pathlib import Path
# Configuration: the sample page to process and the rasterization resolution.
TEST_PDF = "/Volumes/NV2/PDF-Processing/signature-image-output/201301_1324_AI1_page3.pdf"
DPI = 300

# Opening banner: a rule, the title, and a closing rule, one per line.
print(
    "=" * 80,
    "Testing PaddleOCR on macOS Apple Silicon",
    "=" * 80,
    sep="\n",
)
# Step 1: Render PDF to image
# Rasterizes the first page of TEST_PDF and leaves the pixels in `image`
# (a numpy uint8 array of shape height x width x channels) for later steps.
# Any failure is reported and terminates the script with exit status 1.
print("\n1. Rendering PDF to image...")
try:
    # Opening the document as a context manager guarantees it is closed even
    # when page lookup or rendering raises.
    with fitz.open(TEST_PDF) as doc:
        page = doc[0]
        # Scale factor relative to the 72-dpi base resolution, so the page is
        # rendered at DPI dots per inch.
        mat = fitz.Matrix(DPI / 72, DPI / 72)
        pix = page.get_pixmap(matrix=mat)
        image = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
            pix.height, pix.width, pix.n
        )
        if pix.n == 4:  # RGBA
            # Drop the alpha channel so downstream code always sees 3 channels.
            image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
    print(f" ✅ Rendered: {image.shape[1]}x{image.shape[0]} pixels")
except Exception as e:
    print(f" ❌ Error: {e}")
    # `exit()` is a helper injected by the `site` module for interactive use;
    # raising SystemExit works in every interpreter configuration.
    raise SystemExit(1)
# Step 2: Initialize PaddleOCR
# Builds the OCR engine and binds it to `ocr`. On failure the traceback and a
# troubleshooting note are printed and the script exits with status 1.
print("\n2. Initializing PaddleOCR...")
print(" (First run will download models, may take a few minutes...)")
try:
    # Use the correct syntax from official docs
    # NOTE(review): the three use_* switches presumably turn off optional
    # preprocessing stages — confirm against the installed PaddleOCR version.
    ocr = PaddleOCR(
        use_doc_orientation_classify=False,
        use_doc_unwarping=False,
        use_textline_orientation=False,
        lang='ch',  # Chinese language
    )
    print(" ✅ PaddleOCR initialized successfully")
except Exception as e:
    print(f" ❌ Error: {e}")
    import traceback
    traceback.print_exc()
    print("\n Note: PaddleOCR requires PaddlePaddle backend.")
    print(" If this is a module import error, PaddlePaddle may not support this platform.")
    # `exit()` is a helper injected by the `site` module for interactive use;
    # raising SystemExit works in every interpreter configuration.
    raise SystemExit(1)
# Step 3: Run OCR
# Runs text detection/recognition on `image` and stores the outcome in
# `result` using the legacy layout `[[ [box, (text, score)], ... ]]`, which is
# what the visualization step indexes into.
#
# The engine above is constructed with PaddleOCR 3.x keyword arguments, but
# `ocr.ocr(image, cls=False)` and list-of-lists results belong to the 2.x API.
# To work with either release, prefer `predict()` when it exists and convert
# its dict-like per-page result into the legacy layout.
print("\n3. Running OCR to detect printed text...")
try:
    if hasattr(ocr, "predict"):
        raw_result = ocr.predict(image)
    else:
        raw_result = ocr.ocr(image, cls=False)

    first_page = raw_result[0] if raw_result else None
    if (
        first_page is not None
        and hasattr(first_page, "get")
        and "rec_texts" in first_page
    ):
        # 3.x layout: parallel sequences of polygons, texts and scores.
        polys = first_page.get("rec_polys")
        if polys is None:
            polys = first_page.get("dt_polys")
        if polys is None:
            polys = []
        detections = [
            [np.asarray(poly).tolist(), (text, float(score))]
            for poly, text, score in zip(
                polys, first_page["rec_texts"], first_page["rec_scores"]
            )
        ]
    else:
        # 2.x layout is already a list of [box, (text, score)]; the page entry
        # can be None or empty when nothing was detected.
        detections = list(first_page) if first_page else []
    result = [detections]

    if result[0]:
        print(f" ✅ Detected {len(result[0])} text regions")
        # Show first few detections
        print("\n Sample detections:")
        for i, item in enumerate(result[0][:5]):
            box = item[0]  # Bounding box coordinates
            text = item[1][0]  # Detected text
            confidence = item[1][1]  # Confidence score
            print(f" {i+1}. Text: '{text}' (confidence: {confidence:.2f})")
            print(f" Box: {box}")
    else:
        print(" ⚠️ No text detected")
except Exception as e:
    print(f" ❌ Error during OCR: {e}")
    import traceback
    traceback.print_exc()
    # `exit()` is a helper injected by the `site` module for interactive use;
    # raising SystemExit works in every interpreter configuration.
    raise SystemExit(1)
# Step 4: Visualize detection
# Draws every detected text polygon in green on a copy of `image` and writes
# the annotated picture to disk. Failures here are reported but do not abort
# the script.
print("\n4. Creating visualization...")
try:
    vis_image = image.copy()
    if result and result[0]:
        for item in result[0]:
            # polylines needs integer pixel coordinates.
            box = np.array(item[0], dtype=np.int32)
            cv2.polylines(vis_image, [box], True, (0, 255, 0), 2)
    # Save visualization
    output_path = "/Volumes/NV2/PDF-Processing/signature-image-output/paddleocr_test_detection.png"
    # imwrite signals most failures (missing directory, no permission) by
    # returning False rather than raising, so the result must be checked
    # before claiming success. OpenCV expects BGR channel order on disk.
    written = cv2.imwrite(output_path, cv2.cvtColor(vis_image, cv2.COLOR_RGB2BGR))
    if not written:
        raise OSError(f"cv2.imwrite could not write {output_path}")
    print(f" ✅ Saved visualization: {output_path}")
except Exception as e:
    print(f" ❌ Error during visualization: {e}")
# Closing banner: blank line, rule, completion message, rule.
print(
    "\n" + "=" * 80,
    "PaddleOCR test completed!",
    "=" * 80,
    sep="\n",
)