Add PaddleOCR masking and region detection pipeline

- Created PaddleOCR client for remote server communication
- Implemented text masking + region detection pipeline
- Test results: 100% recall on sample PDF (found both signatures)
- Identified issues: split regions, printed text not fully masked
- Documented 5 solution options in PADDLEOCR_STATUS.md
- Next: Implement region merging and two-stage cleaning
2025-10-28 22:28:18 +08:00
parent 52612e14ba
commit 479d4e0019
6 changed files with 1118 additions and 0 deletions
--- a/test_paddleocr.py
+++ b/test_paddleocr.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+"""Smoke-test PaddleOCR text detection on a single rendered PDF page."""
+
+import sys
+import traceback
+
+import fitz  # PyMuPDF
+from paddleocr import PaddleOCR
+import numpy as np
+from PIL import Image
+import cv2
+from pathlib import Path
+
+# Configuration
+TEST_PDF = "/Volumes/NV2/PDF-Processing/signature-image-output/201301_1324_AI1_page3.pdf"
+DPI = 300
+
+print("="*80)
+print("Testing PaddleOCR on macOS Apple Silicon")
+print("="*80)
+
+# Step 1: Render PDF to image
+print("\n1. Rendering PDF to image...")
+try:
+    # The context manager closes the document even if rendering raises.
+    with fitz.open(TEST_PDF) as doc:
+        pix = doc[0].get_pixmap(matrix=fitz.Matrix(DPI/72, DPI/72))
+        image = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
+        if pix.n == 4:  # RGBA
+            image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
+    print(f"   ✅ Rendered: {image.shape[1]}x{image.shape[0]} pixels")
+except Exception as e:
+    print(f"   ❌ Error: {e}")
+    sys.exit(1)
+
+# Step 2: Initialize PaddleOCR
+print("\n2. Initializing PaddleOCR...")
+print("   (First run will download models, may take a few minutes...)")
+try:
+    # Use the correct syntax from official docs
+    ocr = PaddleOCR(
+        use_doc_orientation_classify=False,
+        use_doc_unwarping=False,
+        use_textline_orientation=False,
+        lang='ch'  # Chinese language
+    )
+    print("   ✅ PaddleOCR initialized successfully")
+except Exception as e:
+    print(f"   ❌ Error: {e}")
+    traceback.print_exc()
+    print("\n   Note: PaddleOCR requires PaddlePaddle backend.")
+    print("   If this is a module import error, PaddlePaddle may not support this platform.")
+    sys.exit(1)
+
+# Step 3: Run OCR
+print("\n3. Running OCR to detect printed text...")
+try:
+    # NOTE(review): the constructor kwargs above look like the PaddleOCR 3.x API, where
+    # predict() replaces ocr(..., cls=...) -- confirm against the installed version.
+    raw = ocr.predict(image) if hasattr(ocr, "predict") else ocr.ocr(image, cls=False)
+    first = raw[0] if raw else None
+
+    # Normalize both result layouts to (box, text, confidence) tuples.
+    if isinstance(first, dict):
+        # Presumably the 3.x layout: parallel lists keyed by name -- TODO confirm keys.
+        detections = list(zip(first["rec_polys"], first["rec_texts"], first["rec_scores"]))
+    else:
+        # Legacy layout read by the original code: [box, (text, confidence)] per region.
+        detections = [(item[0], item[1][0], item[1][1]) for item in (first or [])]
+
+    if detections:
+        print(f"   ✅ Detected {len(detections)} text regions")
+        # Show first few detections
+        print("\n   Sample detections:")
+        for i, (box, text, confidence) in enumerate(detections[:5], start=1):
+            print(f"      {i}. Text: '{text}' (confidence: {confidence:.2f})")
+            print(f"         Box: {box}")
+    else:
+        print("   ⚠️  No text detected")
+except Exception as e:
+    print(f"   ❌ Error during OCR: {e}")
+    traceback.print_exc()
+    sys.exit(1)
+
+# Step 4: Visualize detection
+print("\n4. Creating visualization...")
+try:
+    vis_image = image.copy()
+    for box, _text, _confidence in detections:
+        cv2.polylines(vis_image, [np.array(box, dtype=np.int32)], True, (0, 255, 0), 2)
+
+    # Save visualization; cv2.imwrite() signals failure by returning False, not by raising.
+    output_path = "/Volumes/NV2/PDF-Processing/signature-image-output/paddleocr_test_detection.png"
+    if not cv2.imwrite(output_path, cv2.cvtColor(vis_image, cv2.COLOR_RGB2BGR)):
+        raise OSError(f"cv2.imwrite could not write {output_path}")
+    print(f"   ✅ Saved visualization: {output_path}")
+except Exception as e:
+    print(f"   ❌ Error during visualization: {e}")
+
+print("\n" + "="*80)
+print("PaddleOCR test completed!")
+print("="*80)