Complete OpenCV Method 3 implementation with 86.5% handwriting retention

- Implemented comprehensive feature analysis based on size, stroke length, and regularity
- Size-based scoring: height >50px indicates handwriting
- Stroke length ratio: >0.4 indicates handwriting
- Irregularity metrics: low compactness/solidity indicates handwriting
- Successfully tested on sample PDF with 2 signatures (楊智惠, 張志銘)
- Created detailed documentation: CURRENT_STATUS.md and NEW_SESSION_HANDOFF.md
- Stable PaddleOCR 2.7.3 configuration documented (numpy 1.26.4, opencv 4.6.0.66)
- Prepared research plan for PP-OCRv5 upgrade investigation

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2025-11-27 10:35:46 +08:00
parent 479d4e0019
commit 8f231da3bc
6 changed files with 1718 additions and 0 deletions

272
test_opencv_separation.py Normal file
View File

@@ -0,0 +1,272 @@
#!/usr/bin/env python3
"""
Test OpenCV methods to separate handwriting from printed text.

Tests two methods:
1. Stroke Width Analysis (via the distance transform)
2. Connected Components + Shape Features (per-component heuristics)
"""
import cv2
import numpy as np
from pathlib import Path
# Test image - contains both printed and handwritten
TEST_IMAGE = "/Volumes/NV2/PDF-Processing/signature-image-output/paddleocr_improved/signature_02_original.png"
# All intermediate and final images are written here.
OUTPUT_DIR = "/Volumes/NV2/PDF-Processing/signature-image-output/opencv_separation_test"
print("="*80)
print("OpenCV Handwriting Separation Test")
print("="*80)
# Create output directory
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
# Load image
print(f"\nLoading test image: {Path(TEST_IMAGE).name}")
image = cv2.imread(TEST_IMAGE)
# cv2.imread returns None (no exception) when the path is missing/unreadable.
if image is None:
    print(f"Error: Cannot load image from {TEST_IMAGE}")
    exit(1)
# NOTE(review): image_rgb is never used anywhere below — dead assignment.
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
print(f"Image size: {image.shape[1]}x{image.shape[0]}")
# Convert to grayscale
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# Binarize: Otsu picks the threshold; INV makes ink = 255, paper = 0,
# which is what the distance transform / connected components expect.
_, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
# Save binary for reference
cv2.imwrite(str(Path(OUTPUT_DIR) / "00_binary.png"), binary)
print("\n📁 Saved: 00_binary.png")
print("\n" + "="*80)
print("METHOD 1: Stroke Width Analysis (笔画宽度分析)")
print("="*80)
def method1_stroke_width(binary_img, threshold_values=(2.0, 3.0, 4.0, 5.0)):
    """
    Method 1: Separate handwriting from print by stroke width.

    Uses the L2 distance transform: each foreground pixel's value is its
    distance to the nearest background pixel, so thick strokes (assumed
    handwriting) yield larger distances than thin printed glyphs.

    Args:
        binary_img: Binary image (foreground = 255, background = 0).
        threshold_values: Iterable of distance thresholds to test.
            (Default changed from a mutable list to a tuple to avoid the
            shared-mutable-default pitfall; callers are unaffected.)

    Returns:
        List of (name, result_image) tuples: the normalized distance
        transform first, then one thresholded mask per threshold.
    """
    results = []
    # Distance transform: foreground pixel -> L2 distance to background.
    dist_transform = cv2.distanceTransform(binary_img, cv2.DIST_L2, 5)
    # Normalize to 0-255 purely for visual inspection of the transform.
    dist_normalized = cv2.normalize(dist_transform, None, 0, 255, cv2.NORM_MINMAX, cv2.CV_8U)
    results.append(('distance_transform', dist_normalized))
    print("\n Distance transform statistics:")
    print(f" Min: {dist_transform.min():.2f}")
    print(f" Max: {dist_transform.max():.2f}")
    print(f" Mean: {dist_transform.mean():.2f}")
    print(f" Median: {np.median(dist_transform):.2f}")
    # Loop-invariant: total foreground count does not change per threshold,
    # so compute it once instead of on every iteration.
    total_foreground = np.count_nonzero(binary_img)
    # Test different thresholds
    print("\n Testing different stroke width thresholds:")
    for threshold in threshold_values:
        # Pixels with distance > threshold are considered "thick strokes" (handwriting)
        handwriting_mask = (dist_transform > threshold).astype(np.uint8) * 255
        handwriting_pixels = np.count_nonzero(handwriting_mask)
        percentage = (handwriting_pixels / total_foreground * 100) if total_foreground > 0 else 0
        print(f" Threshold {threshold:.1f}: {handwriting_pixels} pixels ({percentage:.1f}% of foreground)")
        results.append((f'threshold_{threshold:.1f}', handwriting_mask))
    return results
# ---- Run Method 1 with a denser threshold sweep and save every result ----
method1_results = method1_stroke_width(binary, threshold_values=[2.0, 2.5, 3.0, 3.5, 4.0, 5.0])
print("\n Saving results...")
out_dir = Path(OUTPUT_DIR)
for label, mask_img in method1_results:
    target = out_dir / f"method1_{label}.png"
    cv2.imwrite(str(target), mask_img)
    print(f" 📁 {target.name}")
# Select the threshold that looked best and apply its mask to the colour image
best_threshold = 3.0  # chosen by eye; revisit after inspecting the sweep above
matches = [m for label, m in method1_results if f'threshold_{best_threshold}' in label]
best_mask = matches[0]
# A small elliptical dilation reconnects strokes the threshold split apart
ellipse = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
grown_mask = cv2.dilate(best_mask, ellipse, iterations=1)
# Keep only the masked (handwriting) pixels of the original colour image
result_method1 = cv2.bitwise_and(image, image, mask=grown_mask)
cv2.imwrite(str(out_dir / "method1_final_result.png"), result_method1)
print(f"\n 📁 Final result: method1_final_result.png (threshold={best_threshold})")
# Banner for Method 2
print("\n" + "="*80)
print("METHOD 2: Connected Components + Shape Features (连通组件分析)")
print("="*80)
def method2_component_analysis(binary_img, original_img):
    """
    Method 2: Classify each connected component as printed or handwriting.

    Args:
        binary_img: Binary image (foreground = 255, background = 0).
        original_img: Unused in this function; the colour masks are applied
            by the caller. Kept for interface stability.

    Returns:
        (handwriting_mask, printed_mask, component_info) — two uint8 masks
        (255 = selected) plus a list of per-component feature dicts.

    Heuristics:
        Printed text: medium size, regular aspect ratio, compact.
        Handwriting: large, very elongated/tall, or anything not
        confidently printed (the default bucket).

    NOTE(review): `is_printed` combines `200 < area < 3000` with
    `area < 1000`, so the effective printed range is 200..999 and the 3000
    bound is dead. Likewise `is_handwriting` reduces to `not is_printed`
    (its extra OR terms are all implied). Left as-is pending threshold
    tuning; confirm intended ranges before productionizing.
    """
    # Find connected components (8-connectivity merges diagonal stroke pixels)
    num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(binary_img, connectivity=8)
    print(f"\n Found {num_labels - 1} connected components")
    # Accumulator masks for the two categories
    handwriting_mask = np.zeros_like(binary_img)
    printed_mask = np.zeros_like(binary_img)
    # Analyze each component
    component_info = []
    for i in range(1, num_labels): # Skip background (0)
        x, y, w, h, area = stats[i]
        # Calculate features
        aspect_ratio = w / h if h > 0 else 0
        # Outer-contour perimeter of this component. OpenCV 4.x
        # findContours returns (contours, hierarchy); every labelled
        # component yields at least one external contour, so [0][0] is safe.
        perimeter = cv2.arcLength(cv2.findContours((labels == i).astype(np.uint8),
                                   cv2.RETR_EXTERNAL,
                                   cv2.CHAIN_APPROX_SIMPLE)[0][0], True)
        # Compactness (isoperimetric ratio); recorded for analysis only —
        # it is NOT used in the classification below.
        compactness = (4 * np.pi * area) / (perimeter * perimeter) if perimeter > 0 else 0
        # Classification logic
        # Printed text: medium size, regular aspect ratio, compact
        is_printed = (
            (200 < area < 3000) and # Medium size
            (0.3 < aspect_ratio < 3.0) and # Not too elongated
            (area < 1000) # Small to medium (tightens the range above)
        )
        # Handwriting: larger, or irregular, or very wide/tall
        is_handwriting = (
            (area >= 3000) or # Large components (likely handwriting)
            (aspect_ratio > 3.0) or # Very elongated (run-on cursive strokes)
            (aspect_ratio < 0.3) or # Very tall
            not is_printed # Default to handwriting if not clearly printed
        )
        component_info.append({
            'id': i,
            'area': area,
            'aspect_ratio': aspect_ratio,
            'compactness': compactness,
            'is_printed': is_printed,
            'is_handwriting': is_handwriting
        })
        # Assign the component's pixels to its category mask
        if is_handwriting:
            handwriting_mask[labels == i] = 255
        if is_printed:
            printed_mask[labels == i] = 255
    # Print statistics
    print("\n Component statistics:")
    handwriting_components = [c for c in component_info if c['is_handwriting']]
    printed_components = [c for c in component_info if c['is_printed']]
    print(f" Handwriting components: {len(handwriting_components)}")
    print(f" Printed components: {len(printed_components)}")
    # Show top 5 largest components
    print("\n Top 5 largest components:")
    sorted_components = sorted(component_info, key=lambda c: c['area'], reverse=True)
    for i, comp in enumerate(sorted_components[:5], 1):
        comp_type = "Handwriting" if comp['is_handwriting'] else "Printed"
        print(f" {i}. Area: {comp['area']:5d}, Aspect: {comp['aspect_ratio']:.2f}, "
              f"Type: {comp_type}")
    return handwriting_mask, printed_mask, component_info
# Run Method 2 on the binarized image
handwriting_mask_m2, printed_mask_m2, components = method2_component_analysis(binary, image)
# Save Method 2 results
print("\n Saving results...")
out_dir = Path(OUTPUT_DIR)
# Handwriting mask
cv2.imwrite(str(out_dir / "method2_handwriting_mask.png"), handwriting_mask_m2)
print(" 📁 method2_handwriting_mask.png")
# Printed mask
cv2.imwrite(str(out_dir / "method2_printed_mask.png"), printed_mask_m2)
print(" 📁 method2_printed_mask.png")
# Apply each mask to the original colour image
result_handwriting = cv2.bitwise_and(image, image, mask=handwriting_mask_m2)
result_printed = cv2.bitwise_and(image, image, mask=printed_mask_m2)
cv2.imwrite(str(out_dir / "method2_handwriting_result.png"), result_handwriting)
print(" 📁 method2_handwriting_result.png")
cv2.imwrite(str(out_dir / "method2_printed_result.png"), result_printed)
print(" 📁 method2_printed_result.png")
# Overlay visualization. Colours are BGR: green = handwriting, red = printed.
# (Removed dead code: an unused `vis_components` grayscale->BGR->RGB round
# trip that was never written out or referenced anywhere.)
vis_overlay = image.copy()
vis_overlay[handwriting_mask_m2 > 0] = [0, 255, 0] # Green for handwriting
vis_overlay[printed_mask_m2 > 0] = [0, 0, 255] # Red for printed
# Blend the colour coding with the original at 60/40 for readability
vis_final = cv2.addWeighted(image, 0.6, vis_overlay, 0.4, 0)
cv2.imwrite(str(out_dir / "method2_visualization.png"), vis_final)
print(" 📁 method2_visualization.png (green=handwriting, red=printed)")
print("\n" + "="*80)
print("COMPARISON")
print("="*80)
# Helper: count "content" pixels — intensity above 10, i.e. brighter than
# the (near-black) background the masked results sit on.
def count_content_pixels(img):
    """Return the number of pixels brighter than 10 in *img* (colour or grayscale)."""
    if len(img.shape) == 3:
        channel = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    else:
        channel = img
    return np.count_nonzero(channel > 10)
# ---- Compare how much content each method retained ----
original_pixels = count_content_pixels(image)          # all content in the source image
method1_pixels = count_content_pixels(result_method1)  # retained by stroke-width mask
method2_pixels = count_content_pixels(result_handwriting)  # retained by component mask

def _pct(part, whole):
    """Percentage of `whole` that `part` represents; 0.0 when `whole` is 0.

    Guards against ZeroDivisionError for a blank source image (no content
    pixels), which the original inline divisions would crash on.
    """
    return (part / whole * 100) if whole > 0 else 0.0

print("\nContent pixels retained:")
print(f" Original image: {original_pixels:6d} pixels")
print(f" Method 1 (stroke): {method1_pixels:6d} pixels ({_pct(method1_pixels, original_pixels):.1f}%)")
print(f" Method 2 (component): {method2_pixels:6d} pixels ({_pct(method2_pixels, original_pixels):.1f}%)")
print("\n" + "="*80)
print("Test completed!")
print(f"Results saved to: {OUTPUT_DIR}")
print("="*80)
print("\nNext steps:")
print(" 1. Review the output images")
print(" 2. Check which method better preserves handwriting")
print(" 3. Adjust thresholds if needed")
print(" 4. Choose the best method for production pipeline")