pdf_signature_extraction/test_mask_and_detect.py
gbanyan 479d4e0019 Add PaddleOCR masking and region detection pipeline
- Created PaddleOCR client for remote server communication
- Implemented text masking + region detection pipeline
- Test results: 100% recall on sample PDF (found both signatures)
- Identified issues: split regions, printed text not fully masked
- Documented 5 solution options in PADDLEOCR_STATUS.md
- Next: Implement region merging and two-stage cleaning
2025-10-28 22:28:18 +08:00


#!/usr/bin/env python3
"""
Test PaddleOCR Masking + Region Detection Pipeline
This script demonstrates:
1. PaddleOCR detects printed text bounding boxes
2. Mask out all printed text areas (fill with black)
3. Detect remaining non-white regions (potential handwriting)
4. Visualize the results
"""
import fitz # PyMuPDF
import numpy as np
import cv2
from pathlib import Path
from paddleocr_client import create_ocr_client
# Configuration
TEST_PDF = "/Volumes/NV2/PDF-Processing/signature-image-output/201301_1324_AI1_page3.pdf"
OUTPUT_DIR = "/Volumes/NV2/PDF-Processing/signature-image-output/mask_test"
DPI = 300
# Region detection parameters
MIN_REGION_AREA = 3000 # Minimum pixels for a region
MAX_REGION_AREA = 300000 # Maximum pixels for a region
MIN_ASPECT_RATIO = 0.3 # Minimum width/height ratio
MAX_ASPECT_RATIO = 15.0 # Maximum width/height ratio
print("="*80)
print("PaddleOCR Masking + Region Detection Test")
print("="*80)
# Create output directory
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
# Step 1: Connect to PaddleOCR server
print("\n1. Connecting to PaddleOCR server...")
try:
    ocr_client = create_ocr_client()
    print(f" ✅ Connected: {ocr_client.server_url}")
except Exception as e:
    print(f" ❌ Error: {e}")
    exit(1)
# Step 2: Render PDF to image
print("\n2. Rendering PDF to image...")
try:
    doc = fitz.open(TEST_PDF)
    page = doc[0]
    mat = fitz.Matrix(DPI/72, DPI/72)
    pix = page.get_pixmap(matrix=mat)
    original_image = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
    if pix.n == 4:  # RGBA
        original_image = cv2.cvtColor(original_image, cv2.COLOR_RGBA2RGB)
    print(f" ✅ Rendered: {original_image.shape[1]}x{original_image.shape[0]} pixels")
    doc.close()
except Exception as e:
    print(f" ❌ Error: {e}")
    exit(1)
# Step 3: Detect printed text with PaddleOCR
print("\n3. Detecting printed text with PaddleOCR...")
try:
    text_boxes = ocr_client.get_text_boxes(original_image)
    print(f" ✅ Detected {len(text_boxes)} text regions")
    # Show some sample boxes
    if text_boxes:
        print(" Sample text boxes (x, y, w, h):")
        for i, box in enumerate(text_boxes[:3]):
            print(f" {i+1}. {box}")
except Exception as e:
    print(f" ❌ Error: {e}")
    exit(1)
# Step 4: Mask out printed text areas
print("\n4. Masking printed text areas...")
try:
    masked_image = original_image.copy()
    # Fill each text box with black
    for (x, y, w, h) in text_boxes:
        cv2.rectangle(masked_image, (x, y), (x + w, y + h), (0, 0, 0), -1)
    print(f" ✅ Masked {len(text_boxes)} text regions")
    # Save masked image
    masked_path = Path(OUTPUT_DIR) / "01_masked_image.png"
    cv2.imwrite(str(masked_path), cv2.cvtColor(masked_image, cv2.COLOR_RGB2BGR))
    print(f" 📁 Saved: {masked_path}")
except Exception as e:
    print(f" ❌ Error: {e}")
    exit(1)
# Step 5: Detect remaining non-white regions
print("\n5. Detecting remaining non-white regions...")
try:
    # Convert to grayscale
    gray = cv2.cvtColor(masked_image, cv2.COLOR_RGB2GRAY)
    # Threshold to find non-white areas
    # Anything darker than 250 is considered "content"
    _, binary = cv2.threshold(gray, 250, 255, cv2.THRESH_BINARY_INV)
    # Apply morphological operations to connect nearby regions
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    morphed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel, iterations=2)
    # Find contours
    contours, _ = cv2.findContours(morphed, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    print(f" ✅ Found {len(contours)} contours")
    # Filter contours by size and aspect ratio
    potential_regions = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        area = w * h
        aspect_ratio = w / h if h > 0 else 0
        # Check constraints
        if (MIN_REGION_AREA <= area <= MAX_REGION_AREA and
                MIN_ASPECT_RATIO <= aspect_ratio <= MAX_ASPECT_RATIO):
            potential_regions.append({
                'box': (x, y, w, h),
                'area': area,
                'aspect_ratio': aspect_ratio
            })
    print(f" ✅ Filtered to {len(potential_regions)} potential handwriting regions")
    # Show region details
    if potential_regions:
        print("\n Detected regions:")
        for i, region in enumerate(potential_regions[:5]):
            x, y, w, h = region['box']
            print(f" {i+1}. Box: ({x}, {y}, {w}, {h}), "
                  f"Area: {region['area']}, "
                  f"Aspect: {region['aspect_ratio']:.2f}")
except Exception as e:
    print(f" ❌ Error: {e}")
    import traceback
    traceback.print_exc()
    exit(1)
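# --- Hedged sketch: region merging (not part of the tested pipeline above) ---
# The commit notes "split regions" as an open issue and "region merging" as a
# next step. One plausible approach, assuming the axis-aligned (x, y, w, h)
# boxes produced above: greedily merge any two boxes whose gap is below a pixel
# threshold, repeating until no further merges occur. The function name and the
# 40 px default are illustrative assumptions, not the project's confirmed design.
def merge_nearby_boxes(boxes, max_gap=40):
    """Greedily merge axis-aligned boxes that lie within max_gap pixels of each other."""
    merged = [list(b) for b in boxes]
    changed = True
    while changed:
        changed = False
        for i in range(len(merged)):
            for j in range(i + 1, len(merged)):
                x1, y1, w1, h1 = merged[i]
                x2, y2, w2, h2 = merged[j]
                # Gap between the boxes along each axis (0 when they overlap)
                gap_x = max(0, max(x1, x2) - min(x1 + w1, x2 + w2))
                gap_y = max(0, max(y1, y2) - min(y1 + h1, y2 + h2))
                if gap_x <= max_gap and gap_y <= max_gap:
                    # Replace box i with the union of the two, drop box j
                    nx, ny = min(x1, x2), min(y1, y2)
                    nw = max(x1 + w1, x2 + w2) - nx
                    nh = max(y1 + h1, y2 + h2) - ny
                    merged[i] = [nx, ny, nw, nh]
                    del merged[j]
                    changed = True
                    break
            if changed:
                break
    return [tuple(b) for b in merged]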
# Step 6: Visualize results
print("\n6. Creating visualizations...")
try:
    # Visualization 1: Original with text boxes
    vis_original = original_image.copy()
    for (x, y, w, h) in text_boxes:
        cv2.rectangle(vis_original, (x, y), (x + w, y + h), (0, 255, 0), 3)
    vis_original_path = Path(OUTPUT_DIR) / "02_original_with_text_boxes.png"
    cv2.imwrite(str(vis_original_path), cv2.cvtColor(vis_original, cv2.COLOR_RGB2BGR))
    print(f" 📁 Original + text boxes: {vis_original_path}")
    # Visualization 2: Masked image with detected regions
    vis_masked = masked_image.copy()
    for region in potential_regions:
        x, y, w, h = region['box']
        cv2.rectangle(vis_masked, (x, y), (x + w, y + h), (255, 0, 0), 3)
    vis_masked_path = Path(OUTPUT_DIR) / "03_masked_with_regions.png"
    cv2.imwrite(str(vis_masked_path), cv2.cvtColor(vis_masked, cv2.COLOR_RGB2BGR))
    print(f" 📁 Masked + regions: {vis_masked_path}")
    # Visualization 3: Binary threshold result
    binary_path = Path(OUTPUT_DIR) / "04_binary_threshold.png"
    cv2.imwrite(str(binary_path), binary)
    print(f" 📁 Binary threshold: {binary_path}")
    # Visualization 4: Morphed result
    morphed_path = Path(OUTPUT_DIR) / "05_morphed.png"
    cv2.imwrite(str(morphed_path), morphed)
    print(f" 📁 Morphed: {morphed_path}")
    # Extract and save each detected region
    print("\n7. Extracting detected regions...")
    for i, region in enumerate(potential_regions):
        x, y, w, h = region['box']
        # Add padding
        padding = 10
        x_pad = max(0, x - padding)
        y_pad = max(0, y - padding)
        w_pad = min(original_image.shape[1] - x_pad, w + 2*padding)
        h_pad = min(original_image.shape[0] - y_pad, h + 2*padding)
        # Extract region from original image
        region_img = original_image[y_pad:y_pad+h_pad, x_pad:x_pad+w_pad]
        # Save region
        region_path = Path(OUTPUT_DIR) / f"region_{i+1:02d}.png"
        cv2.imwrite(str(region_path), cv2.cvtColor(region_img, cv2.COLOR_RGB2BGR))
        print(f" 📁 Region {i+1}: {region_path}")
except Exception as e:
    print(f" ❌ Error: {e}")
    import traceback
    traceback.print_exc()
print("\n" + "="*80)
print("Test completed!")
print(f"Results saved to: {OUTPUT_DIR}")
print("="*80)
print("\nSummary:")
print(f" - Printed text regions detected: {len(text_boxes)}")
print(f" - Potential handwriting regions: {len(potential_regions)}")
print(f" - Expected signatures: 2 (楊智惠, 張志銘)")
print("="*80)