Add PaddleOCR masking and region detection pipeline
- Created PaddleOCR client for remote server communication
- Implemented text masking + region detection pipeline
- Test results: 100% recall on sample PDF (found both signatures)
- Identified issues: split regions, printed text not fully masked
- Documented 5 solution options in PADDLEOCR_STATUS.md
- Next: Implement region merging and two-stage cleaning
This commit is contained in:
216
test_mask_and_detect.py
Normal file
216
test_mask_and_detect.py
Normal file
@@ -0,0 +1,216 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test PaddleOCR Masking + Region Detection Pipeline
|
||||
|
||||
This script demonstrates:
|
||||
1. PaddleOCR detects printed text bounding boxes
|
||||
2. Mask out all printed text areas (fill with black)
|
||||
3. Detect remaining non-white regions (potential handwriting)
|
||||
4. Visualize the results
|
||||
"""
|
||||
|
||||
import fitz # PyMuPDF
|
||||
import numpy as np
|
||||
import cv2
|
||||
from pathlib import Path
|
||||
from paddleocr_client import create_ocr_client
|
||||
|
||||
# Configuration: input document, output location and render resolution.
TEST_PDF = "/Volumes/NV2/PDF-Processing/signature-image-output/201301_1324_AI1_page3.pdf"
OUTPUT_DIR = "/Volumes/NV2/PDF-Processing/signature-image-output/mask_test"
DPI = 300  # the page is rendered with a zoom factor of DPI / 72

# Region detection parameters. A contour is kept only when its bounding box
# satisfies every limit below (area = w * h in pixels, aspect = w / h).
MIN_REGION_AREA = 3_000     # smallest bounding-box area kept
MAX_REGION_AREA = 300_000   # largest bounding-box area kept
MIN_ASPECT_RATIO = 0.3      # narrowest width/height ratio kept
MAX_ASPECT_RATIO = 15.0     # widest width/height ratio kept
|
||||
|
||||
# Banner: rule, title, rule - one item per output line.
print(
    "=" * 80,
    "PaddleOCR Masking + Region Detection Test",
    "=" * 80,
    sep="\n",
)

# Make sure the results directory exists before any step writes into it;
# missing parents are created and an existing directory is not an error.
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Step 1: Connect to PaddleOCR server
print("\n1. Connecting to PaddleOCR server...")
try:
    # `create_ocr_client` comes from the project-local `paddleocr_client`
    # module; the returned object is used below for `server_url` and later
    # for `get_text_boxes`.
    ocr_client = create_ocr_client()
    print(f" ✅ Connected: {ocr_client.server_url}")
except Exception as e:
    # Every later step needs `ocr_client`, so report the problem and stop.
    print(f" ❌ Error: {e}")
    # Raise SystemExit directly instead of calling `exit()`: `exit` is a
    # helper the `site` module adds for interactive sessions and is not
    # guaranteed to exist (e.g. `python -S`, frozen executables).
    raise SystemExit(1) from e
|
||||
|
||||
# Step 2: Render PDF to image
print("\n2. Rendering PDF to image...")
try:
    # The context manager closes the document even when rendering raises,
    # which the previous explicit `doc.close()` at the end did not.
    with fitz.open(TEST_PDF) as doc:
        page = doc[0]  # only the first page is processed

        # Scale both axes by DPI / 72 so the pixmap is rendered at DPI
        # (presumably 72 is PyMuPDF's base resolution - confirm in its docs).
        mat = fitz.Matrix(DPI / 72, DPI / 72)
        pix = page.get_pixmap(matrix=mat)

        # View the raw sample bytes as a (height, width, channels) array.
        original_image = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
            pix.height, pix.width, pix.n
        )

        if pix.n == 4:  # RGBA
            # Drop the alpha channel so later steps always see 3-channel RGB.
            original_image = cv2.cvtColor(original_image, cv2.COLOR_RGBA2RGB)

        print(f" ✅ Rendered: {original_image.shape[1]}x{original_image.shape[0]} pixels")
except Exception as e:
    print(f" ❌ Error: {e}")
    # SystemExit instead of the interactive-only `exit()` helper.
    raise SystemExit(1) from e
|
||||
|
||||
# Step 3: Detect printed text with PaddleOCR
print("\n3. Detecting printed text with PaddleOCR...")
try:
    # The boxes are unpacked as (x, y, w, h) by the masking and
    # visualization steps further down.
    text_boxes = ocr_client.get_text_boxes(original_image)
    print(f" ✅ Detected {len(text_boxes)} text regions")

    # Show a few sample boxes as a quick sanity check of the OCR output.
    if text_boxes:
        print(" Sample text boxes (x, y, w, h):")
        for i, box in enumerate(text_boxes[:3], start=1):
            print(f" {i}. {box}")
except Exception as e:
    print(f" ❌ Error: {e}")
    # SystemExit instead of `exit()`, which only exists when the `site`
    # module has been loaded.
    raise SystemExit(1) from e
|
||||
|
||||
# Step 4: Mask out printed text areas
print("\n4. Masking printed text areas...")
try:
    # Work on a copy so `original_image` stays untouched for the
    # visualizations and region crops produced later.
    masked_image = original_image.copy()

    # Fill each text box with black (thickness -1 means "filled").
    for (x, y, w, h) in text_boxes:
        cv2.rectangle(masked_image, (x, y), (x + w, y + h), (0, 0, 0), -1)

    print(f" ✅ Masked {len(text_boxes)} text regions")

    # Save masked image. The array is RGB while OpenCV writes BGR, hence the
    # conversion. `cv2.imwrite` signals failure through its return value
    # rather than an exception, so check it before claiming success.
    masked_path = Path(OUTPUT_DIR) / "01_masked_image.png"
    if not cv2.imwrite(str(masked_path), cv2.cvtColor(masked_image, cv2.COLOR_RGB2BGR)):
        raise OSError(f"could not write {masked_path}")
    print(f" 📁 Saved: {masked_path}")

except Exception as e:
    print(f" ❌ Error: {e}")
    # SystemExit instead of the interactive-only `exit()` helper.
    raise SystemExit(1) from e
|
||||
|
||||
# Step 5: Detect remaining non-white regions
print("\n5. Detecting remaining non-white regions...")
try:
    # Convert to grayscale
    gray = cv2.cvtColor(masked_image, cv2.COLOR_RGB2GRAY)

    # Inverse threshold: every pixel at or below 250 becomes foreground
    # (255), anything brighter becomes background (0).
    _, binary = cv2.threshold(gray, 250, 255, cv2.THRESH_BINARY_INV)

    # The printed-text boxes were filled with black in `masked_image`, so the
    # threshold above marks each of them as solid foreground. Clear them from
    # the binary map (same rectangle geometry as the masking step), otherwise
    # the masks themselves are reported as "handwriting" and can fuse with
    # neighbouring strokes during the closing below.
    for (bx, by, bw, bh) in text_boxes:
        cv2.rectangle(binary, (bx, by), (bx + bw, by + bh), 0, -1)

    # Morphological closing with a 5x5 rectangle, applied twice, to connect
    # nearby fragments into one region.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    morphed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel, iterations=2)

    # Outer contours only; nested contours are ignored.
    contours, _ = cv2.findContours(morphed, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    print(f" ✅ Found {len(contours)} contours")

    # Keep contours whose bounding box passes both the area and the
    # width/height-ratio limits.
    potential_regions = []

    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        area = w * h  # bounding-box area, not the contour's own area
        aspect_ratio = w / h if h > 0 else 0

        if (MIN_REGION_AREA <= area <= MAX_REGION_AREA and
                MIN_ASPECT_RATIO <= aspect_ratio <= MAX_ASPECT_RATIO):
            potential_regions.append({
                'box': (x, y, w, h),
                'area': area,
                'aspect_ratio': aspect_ratio
            })

    print(f" ✅ Filtered to {len(potential_regions)} potential handwriting regions")

    # Show details for at most the first five regions.
    if potential_regions:
        print("\n Detected regions:")
        for i, region in enumerate(potential_regions[:5]):
            x, y, w, h = region['box']
            print(f" {i+1}. Box: ({x}, {y}, {w}, {h}), "
                  f"Area: {region['area']}, "
                  f"Aspect: {region['aspect_ratio']:.2f}")

except Exception as e:
    print(f" ❌ Error: {e}")
    import traceback
    traceback.print_exc()
    # SystemExit instead of `exit()`, which only exists when the `site`
    # module has been loaded.
    raise SystemExit(1) from e
|
||||
|
||||
# Step 6: Visualize results


def padded_bounds(box, image_shape, padding=10):
    """Return crop bounds ``(x0, y0, x1, y1)`` for *box* grown by *padding*.

    *box* is ``(x, y, w, h)`` and *image_shape* starts with
    ``(height, width)``. Each edge is moved outward by *padding* and then
    clamped to the image independently, so a box close to the top/left border
    does not receive extra padding on the opposite side.
    """
    x, y, w, h = box
    height, width = image_shape[:2]
    x0 = max(0, x - padding)
    y0 = max(0, y - padding)
    x1 = min(width, x + w + padding)
    y1 = min(height, y + h + padding)
    return x0, y0, x1, y1


def _imwrite_checked(path, image):
    """Write *image* to *path* with OpenCV, raising OSError on failure.

    ``cv2.imwrite`` reports failure through its return value instead of an
    exception, so an unchecked call would let the script announce files that
    were never written.
    """
    if not cv2.imwrite(str(path), image):
        raise OSError(f"could not write {path}")


print("\n6. Creating visualizations...")
try:
    # Visualization 1: original page with the OCR text boxes outlined.
    vis_original = original_image.copy()
    for (x, y, w, h) in text_boxes:
        cv2.rectangle(vis_original, (x, y), (x + w, y + h), (0, 255, 0), 3)

    vis_original_path = Path(OUTPUT_DIR) / "02_original_with_text_boxes.png"
    # Images are held as RGB; OpenCV writes BGR, hence the conversion.
    _imwrite_checked(vis_original_path, cv2.cvtColor(vis_original, cv2.COLOR_RGB2BGR))
    print(f" 📁 Original + text boxes: {vis_original_path}")

    # Visualization 2: masked image with the detected regions outlined.
    vis_masked = masked_image.copy()
    for region in potential_regions:
        x, y, w, h = region['box']
        cv2.rectangle(vis_masked, (x, y), (x + w, y + h), (255, 0, 0), 3)

    vis_masked_path = Path(OUTPUT_DIR) / "03_masked_with_regions.png"
    _imwrite_checked(vis_masked_path, cv2.cvtColor(vis_masked, cv2.COLOR_RGB2BGR))
    print(f" 📁 Masked + regions: {vis_masked_path}")

    # Visualization 3: binary threshold result (single channel, written as is).
    binary_path = Path(OUTPUT_DIR) / "04_binary_threshold.png"
    _imwrite_checked(binary_path, binary)
    print(f" 📁 Binary threshold: {binary_path}")

    # Visualization 4: result of the morphological closing.
    morphed_path = Path(OUTPUT_DIR) / "05_morphed.png"
    _imwrite_checked(morphed_path, morphed)
    print(f" 📁 Morphed: {morphed_path}")

    # Extract and save each detected region from the unmasked original.
    print("\n7. Extracting detected regions...")
    for i, region in enumerate(potential_regions):
        x0, y0, x1, y1 = padded_bounds(region['box'], original_image.shape, padding=10)
        region_img = original_image[y0:y1, x0:x1]

        region_path = Path(OUTPUT_DIR) / f"region_{i+1:02d}.png"
        _imwrite_checked(region_path, cv2.cvtColor(region_img, cv2.COLOR_RGB2BGR))
        print(f" 📁 Region {i+1}: {region_path}")

except Exception as e:
    # Unlike the earlier steps this one does not abort: the summary that
    # follows is still printed after a visualization failure.
    print(f" ❌ Error: {e}")
    import traceback
    traceback.print_exc()
|
||||
|
||||
# Final report: completion banner followed by the per-run counts, emitted as
# one print call with one item per output line.
print(
    "\n" + "=" * 80,
    "Test completed!",
    f"Results saved to: {OUTPUT_DIR}",
    "=" * 80,
    "\nSummary:",
    f" - Printed text regions detected: {len(text_boxes)}",
    f" - Potential handwriting regions: {len(potential_regions)}",
    " - Expected signatures: 2 (楊智惠, 張志銘)",
    "=" * 80,
    sep="\n",
)
|
||||
Reference in New Issue
Block a user