Add PaddleOCR masking and region detection pipeline

- Created PaddleOCR client for remote server communication
- Implemented text masking + region detection pipeline
- Test results: 100% recall on sample PDF (found both signatures)
- Identified issues: split regions, printed text not fully masked
- Documented 5 solution options in PADDLEOCR_STATUS.md
- Next: Implement region merging and two-stage cleaning
2025-10-28 22:28:18 +08:00
parent 52612e14ba
commit 479d4e0019
6 changed files with 1118 additions and 0 deletions
--- a/test_mask_and_detect.py
+++ b/test_mask_and_detect.py
@@ -0,0 +1,216 @@
+#!/usr/bin/env python3
+"""
+Test PaddleOCR Masking + Region Detection Pipeline
+
+This script demonstrates:
+1. PaddleOCR detects printed text bounding boxes
+2. Mask out all printed text areas (fill with black)
+3. Detect remaining non-white regions (potential handwriting)
+4. Visualize the results
+"""
+
+import fitz  # PyMuPDF
+import numpy as np
+import cv2
+from pathlib import Path
+from paddleocr_client import create_ocr_client
+
+# Configuration
+# TEST_PDF: input document; only page index 0 is rendered by this script.
+# OUTPUT_DIR: directory that receives every image the script writes.
+# DPI: render resolution; the page is scaled by DPI/72 when rasterized.
+TEST_PDF = "/Volumes/NV2/PDF-Processing/signature-image-output/201301_1324_AI1_page3.pdf"
+OUTPUT_DIR = "/Volumes/NV2/PDF-Processing/signature-image-output/mask_test"
+DPI = 300
+
+# Region detection parameters
+# The area limits are compared against the bounding-box area (w * h) of each
+# contour, not against the number of foreground pixels inside it.
+MIN_REGION_AREA = 3000      # Minimum bounding-box area (w * h) for a region
+MAX_REGION_AREA = 300000    # Maximum bounding-box area (w * h) for a region
+MIN_ASPECT_RATIO = 0.3      # Minimum width/height ratio
+MAX_ASPECT_RATIO = 15.0     # Maximum width/height ratio
+
+print("="*80)
+print("PaddleOCR Masking + Region Detection Test")
+print("="*80)
+
+# Create output directory (including missing parents; no error if it exists)
+Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
+
+# Step 1: Connect to PaddleOCR server
+# create_ocr_client() comes from the project-local paddleocr_client module.
+# Any exception it raises is reported and aborts the run, because Step 3
+# needs the client to obtain the text boxes.
+print("\n1. Connecting to PaddleOCR server...")
+try:
+    ocr_client = create_ocr_client()
+    print(f"   ✅ Connected: {ocr_client.server_url}")
+except Exception as e:
+    print(f"   ❌ Error: {e}")
+    # Raise SystemExit directly: the bare exit() helper is injected by the
+    # site module for interactive sessions and is not guaranteed to exist.
+    raise SystemExit(1)
+
+# Step 2: Render PDF to image
+# Rasterize page 0 of TEST_PDF and expose it as a uint8 array shaped
+# (height, width, channels) under the name original_image.
+print("\n2. Rendering PDF to image...")
+try:
+    # Open through the context manager so the document is closed even when
+    # one of the rendering calls below raises.
+    with fitz.open(TEST_PDF) as doc:
+        page = doc[0]
+        # Zoom factor relative to the 72-dpi base resolution of the page.
+        mat = fitz.Matrix(DPI/72, DPI/72)
+        pix = page.get_pixmap(matrix=mat)
+        # Reinterpret the raw sample bytes as rows x columns x components.
+        original_image = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
+
+        if pix.n == 4:  # RGBA
+            original_image = cv2.cvtColor(original_image, cv2.COLOR_RGBA2RGB)
+
+        print(f"   ✅ Rendered: {original_image.shape[1]}x{original_image.shape[0]} pixels")
+except Exception as e:
+    print(f"   ❌ Error: {e}")
+    # Raise SystemExit directly: the bare exit() helper is injected by the
+    # site module for interactive sessions and is not guaranteed to exist.
+    raise SystemExit(1)
+
+# Step 3: Detect printed text with PaddleOCR
+# get_text_boxes() is provided by the project-local client; later steps
+# unpack every returned box as an (x, y, w, h) tuple.
+print("\n3. Detecting printed text with PaddleOCR...")
+try:
+    text_boxes = ocr_client.get_text_boxes(original_image)
+    print(f"   ✅ Detected {len(text_boxes)} text regions")
+
+    # Show at most the first three boxes as a quick sanity check
+    if text_boxes:
+        print("   Sample text boxes (x, y, w, h):")
+        for i, box in enumerate(text_boxes[:3], start=1):
+            print(f"      {i}. {box}")
+except Exception as e:
+    print(f"   ❌ Error: {e}")
+    # Raise SystemExit directly: the bare exit() helper is injected by the
+    # site module for interactive sessions and is not guaranteed to exist.
+    raise SystemExit(1)
+
+# Step 4: Mask out printed text areas
+# Produces masked_image (a copy of original_image with every OCR text box
+# painted over) and writes it to OUTPUT_DIR as 01_masked_image.png.
+print("\n4. Masking printed text areas...")
+try:
+    # Work on a copy so original_image stays untouched for later steps
+    masked_image = original_image.copy()
+
+    # Fill each text box with black (thickness -1 draws a filled rectangle)
+    for (x, y, w, h) in text_boxes:
+        cv2.rectangle(masked_image, (x, y), (x + w, y + h), (0, 0, 0), -1)
+
+    print(f"   ✅ Masked {len(text_boxes)} text regions")
+
+    # Save masked image (the array is RGB, OpenCV writes BGR)
+    masked_path = Path(OUTPUT_DIR) / "01_masked_image.png"
+    # cv2.imwrite can signal failure through its return value instead of
+    # raising, so check it before claiming the file was saved.
+    if not cv2.imwrite(str(masked_path), cv2.cvtColor(masked_image, cv2.COLOR_RGB2BGR)):
+        raise OSError(f"could not write {masked_path}")
+    print(f"   📁 Saved: {masked_path}")
+
+except Exception as e:
+    print(f"   ❌ Error: {e}")
+    # Raise SystemExit directly: the bare exit() helper is injected by the
+    # site module for interactive sessions and is not guaranteed to exist.
+    raise SystemExit(1)
+
+# Step 5: Detect remaining non-white regions
+# Produces binary / morphed (uint8 masks) and potential_regions, a list of
+# dicts with keys 'box' (x, y, w, h), 'area' and 'aspect_ratio'.
+print("\n5. Detecting remaining non-white regions...")
+try:
+    # Convert to grayscale
+    gray = cv2.cvtColor(masked_image, cv2.COLOR_RGB2GRAY)
+
+    # Threshold to find non-white areas
+    # Any pixel with value <= 250 is considered "content"
+    _, binary = cv2.threshold(gray, 250, 255, cv2.THRESH_BINARY_INV)
+
+    # The text boxes are filled with black in masked_image, so the inverse
+    # threshold marks them as content as well. Clear the same rectangles
+    # here so that only ink outside the printed-text boxes remains.
+    for (bx, by, bw, bh) in text_boxes:
+        cv2.rectangle(binary, (bx, by), (bx + bw, by + bh), 0, -1)
+
+    # Apply morphological closing to connect nearby strokes into one blob
+    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
+    morphed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel, iterations=2)
+
+    # Find outer contours only
+    contours, _ = cv2.findContours(morphed, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+
+    print(f"   ✅ Found {len(contours)} contours")
+
+    # Filter contours by bounding-box area and aspect ratio
+    potential_regions = []
+
+    for contour in contours:
+        x, y, w, h = cv2.boundingRect(contour)
+        area = w * h
+        aspect_ratio = w / h if h > 0 else 0
+
+        # Skip anything outside the configured size / shape limits
+        if not (MIN_REGION_AREA <= area <= MAX_REGION_AREA):
+            continue
+        if not (MIN_ASPECT_RATIO <= aspect_ratio <= MAX_ASPECT_RATIO):
+            continue
+
+        potential_regions.append({
+            'box': (x, y, w, h),
+            'area': area,
+            'aspect_ratio': aspect_ratio
+        })
+
+    print(f"   ✅ Filtered to {len(potential_regions)} potential handwriting regions")
+
+    # Show details for at most the first five regions
+    if potential_regions:
+        print("\n   Detected regions:")
+        for i, region in enumerate(potential_regions[:5], start=1):
+            x, y, w, h = region['box']
+            print(f"      {i}. Box: ({x}, {y}, {w}, {h}), "
+                  f"Area: {region['area']}, "
+                  f"Aspect: {region['aspect_ratio']:.2f}")
+
+except Exception as e:
+    print(f"   ❌ Error: {e}")
+    import traceback
+    traceback.print_exc()
+    # Raise SystemExit directly: the bare exit() helper is injected by the
+    # site module for interactive sessions and is not guaranteed to exist.
+    raise SystemExit(1)
+
+# Step 6: Visualize results
+# Writes four diagnostic images plus one padded crop per detected region to
+# OUTPUT_DIR. All arrays are RGB in memory and converted to BGR for OpenCV.
+print("\n6. Creating visualizations...")
+try:
+    # Visualization 1: Original with text boxes (green outlines)
+    vis_original = original_image.copy()
+    for (x, y, w, h) in text_boxes:
+        cv2.rectangle(vis_original, (x, y), (x + w, y + h), (0, 255, 0), 3)
+
+    vis_original_path = Path(OUTPUT_DIR) / "02_original_with_text_boxes.png"
+    cv2.imwrite(str(vis_original_path), cv2.cvtColor(vis_original, cv2.COLOR_RGB2BGR))
+    print(f"   📁 Original + text boxes: {vis_original_path}")
+
+    # Visualization 2: Masked image with detected regions (red outlines)
+    vis_masked = masked_image.copy()
+    for region in potential_regions:
+        x, y, w, h = region['box']
+        cv2.rectangle(vis_masked, (x, y), (x + w, y + h), (255, 0, 0), 3)
+
+    vis_masked_path = Path(OUTPUT_DIR) / "03_masked_with_regions.png"
+    cv2.imwrite(str(vis_masked_path), cv2.cvtColor(vis_masked, cv2.COLOR_RGB2BGR))
+    print(f"   📁 Masked + regions: {vis_masked_path}")
+
+    # Visualization 3: Binary threshold result
+    binary_path = Path(OUTPUT_DIR) / "04_binary_threshold.png"
+    cv2.imwrite(str(binary_path), binary)
+    print(f"   📁 Binary threshold: {binary_path}")
+
+    # Visualization 4: Morphed result
+    morphed_path = Path(OUTPUT_DIR) / "05_morphed.png"
+    cv2.imwrite(str(morphed_path), morphed)
+    print(f"   📁 Morphed: {morphed_path}")
+
+    # Extract and save each detected region
+    print("\n7. Extracting detected regions...")
+    padding = 10
+    img_h, img_w = original_image.shape[:2]
+    for i, region in enumerate(potential_regions, start=1):
+        x, y, w, h = region['box']
+
+        # Add padding. Each edge is clamped to the image on its own, so a box
+        # close to the left/top border does not gain extra pixels on the
+        # right/bottom to make up for the padding that was cut off.
+        x0 = max(0, x - padding)
+        y0 = max(0, y - padding)
+        x1 = min(img_w, x + w + padding)
+        y1 = min(img_h, y + h + padding)
+
+        # Extract region from original image (unmasked pixels)
+        region_img = original_image[y0:y1, x0:x1]
+
+        # Save region
+        region_path = Path(OUTPUT_DIR) / f"region_{i:02d}.png"
+        cv2.imwrite(str(region_path), cv2.cvtColor(region_img, cv2.COLOR_RGB2BGR))
+        print(f"   📁 Region {i}: {region_path}")
+
+except Exception as e:
+    # Visualization is best-effort: the error is reported with a traceback
+    # but the script keeps going instead of exiting.
+    print(f"   ❌ Error: {e}")
+    import traceback
+    traceback.print_exc()
+
+# Final report: closing banner, output location and the two detection counts.
+divider = "=" * 80
+print("\n" + divider)
+print("Test completed!")
+print(f"Results saved to: {OUTPUT_DIR}")
+print(divider)
+print("\nSummary:")
+print(f"  - Printed text regions detected: {len(text_boxes)}")
+print(f"  - Potential handwriting regions: {len(potential_regions)}")
+print("  - Expected signatures: 2 (楊智惠, 張志銘)")
+print(divider)