Complete OpenCV Method 3 implementation with 86.5% handwriting retention
- Implemented comprehensive feature analysis based on size, stroke length, and regularity
- Size-based scoring: height >50px indicates handwriting
- Stroke length ratio: >0.4 indicates handwriting
- Irregularity metrics: low compactness/solidity indicates handwriting
- Successfully tested on sample PDF with 2 signatures (楊智惠, 張志銘)
- Created detailed documentation: CURRENT_STATUS.md and NEW_SESSION_HANDOFF.md
- Stable PaddleOCR 2.7.3 configuration documented (numpy 1.26.4, opencv 4.6.0.66)
- Prepared research plan for PP-OCRv5 upgrade investigation

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
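For quick reference, the per-component decision rule described above (and implemented in the committed script below) can be condensed as follows. This is a sketch only: the helper function name and signature are illustrative and not part of the commit; the thresholds are the ones used in test_opencv_advanced.py.

```python
def handwriting_score(height: int, area: int, stroke_ratio: float,
                      compactness: float, solidity: float, extent: float) -> int:
    """Return a score; > 0 means the component is classified as handwriting."""
    score = 0
    # Size: large characters suggest handwriting
    if height > 50:
        score += 3
    elif height > 35:
        score += 2
    elif height < 25:
        score -= 2
    # Stroke length relative to ink area
    if stroke_ratio > 0.5:
        score += 2
    elif stroke_ratio > 0.35:
        score += 1
    # Regularity: printed Kai-font glyphs are compact and solid, handwriting is not
    if compactness < 0.3 or solidity < 0.7 or extent < 0.5:
        score += 1
    else:
        score -= 1
    # Overall ink area
    if area > 2000:
        score += 2
    elif area < 500:
        score -= 1
    return score
```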
test_opencv_advanced.py (new file, +256 lines)
@@ -0,0 +1,256 @@
#!/usr/bin/env python3
"""
Advanced OpenCV separation based on three key observations:
1. Handwritten characters are larger than printed ones (手写字比印刷字大)
2. Handwritten strokes are longer (手写笔画长度更长)
3. Printed Kai-font text is regular; handwriting is messy (印刷标楷体规律, 手写潦草)
"""

import cv2
import numpy as np
from pathlib import Path
from scipy import ndimage
from skimage.morphology import skeletonize

# Test image
TEST_IMAGE = "/Volumes/NV2/PDF-Processing/signature-image-output/paddleocr_improved/signature_02_original.png"
OUTPUT_DIR = "/Volumes/NV2/PDF-Processing/signature-image-output/opencv_advanced_test"

print("="*80)
|
||||
print("Advanced OpenCV Separation - Size + Stroke Length + Regularity")
|
||||
print("="*80)
|
||||
|
||||
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Load and preprocess
|
||||
image = cv2.imread(TEST_IMAGE)
|
||||
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
|
||||
_, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
|
||||
|
||||
print(f"\nImage: {image.shape[1]}x{image.shape[0]}")
|
||||
|
||||
# Save binary
|
||||
cv2.imwrite(str(Path(OUTPUT_DIR) / "00_binary.png"), binary)
|
||||
|
||||
|
||||
print("\n" + "="*80)
|
||||
print("METHOD 3: Comprehensive Feature Analysis")
|
||||
print("="*80)
|
||||
|
||||
# Find connected components
|
||||
num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(binary, connectivity=8)
|
||||
|
||||
print(f"\nFound {num_labels - 1} connected components")
|
||||
print("\nAnalyzing each component...")
|
||||
|
||||
# Store analysis for each component
|
||||
components_analysis = []
|
||||
|
||||
for i in range(1, num_labels):
    x, y, w, h, area = stats[i]

    # Extract component mask
    component_mask = (labels == i).astype(np.uint8) * 255

    # ============================================
    # FEATURE 1: Size (handwriting is larger than print)
    # ============================================
    bbox_area = w * h
    font_height = h  # Character height is a good indicator

    # ============================================
    # FEATURE 2: Stroke Length
    # ============================================
    # Skeletonize to get the actual stroke centerline
    skeleton = skeletonize(component_mask // 255)
    stroke_length = np.sum(skeleton)  # Total length of strokes

    # Stroke length ratio (length relative to area)
    stroke_length_ratio = stroke_length / area if area > 0 else 0

    # ============================================
    # FEATURE 3: Regularity vs Messiness
    # ============================================
    # 3a. Compactness (regular shapes are more compact)
    contours, _ = cv2.findContours(component_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if contours:
        perimeter = cv2.arcLength(contours[0], True)
        compactness = (4 * np.pi * area) / (perimeter * perimeter) if perimeter > 0 else 0
    else:
        perimeter = 0
        compactness = 0

    # 3b. Solidity (ratio of area to convex hull area)
    if contours:
        hull = cv2.convexHull(contours[0])
        hull_area = cv2.contourArea(hull)
        solidity = area / hull_area if hull_area > 0 else 0
    else:
        solidity = 0

    # 3c. Extent (ratio of area to bounding box area)
    extent = area / bbox_area if bbox_area > 0 else 0

    # 3d. Edge roughness (measure irregularity)
    # More irregular edges = more "messy" = likely handwriting
    edges = cv2.Canny(component_mask, 50, 150)
    edge_pixels = np.sum(edges > 0)
    edge_roughness = edge_pixels / perimeter if perimeter > 0 else 0

    # ============================================
    # CLASSIFICATION LOGIC
    # ============================================

    # Large characters are likely handwriting
    is_large = font_height > 40  # Threshold for "large" characters

    # Long strokes relative to area indicate handwriting
    is_long_stroke = stroke_length_ratio > 0.4  # Handwriting has higher ratio

    # Regular shapes (high compactness, high solidity) = printed
    # Irregular shapes (low compactness, low solidity) = handwriting
    is_irregular = compactness < 0.3 or solidity < 0.7 or extent < 0.5

    # DECISION RULES
    handwriting_score = 0

    # Size-based scoring (most important signal)
    if font_height > 50:
        handwriting_score += 3  # Very large = likely handwriting
    elif font_height > 35:
        handwriting_score += 2  # Medium-large = possibly handwriting
    elif font_height < 25:
        handwriting_score -= 2  # Small = likely printed

    # Stroke length scoring
    if stroke_length_ratio > 0.5:
        handwriting_score += 2  # Long strokes
    elif stroke_length_ratio > 0.35:
        handwriting_score += 1

    # Regularity scoring (printed Kai font is regular, handwriting is messy)
    if is_irregular:
        handwriting_score += 1  # Irregular = handwriting
    else:
        handwriting_score -= 1  # Regular = printed

    # Area scoring
    if area > 2000:
        handwriting_score += 2  # Large area = handwriting
    elif area < 500:
        handwriting_score -= 1  # Small area = printed

    # Final classification
    is_handwriting = handwriting_score > 0

    components_analysis.append({
        'id': i,
        'box': (x, y, w, h),
        'area': area,
        'height': font_height,
        'stroke_length': stroke_length,
        'stroke_ratio': stroke_length_ratio,
        'compactness': compactness,
        'solidity': solidity,
        'extent': extent,
        'edge_roughness': edge_roughness,
        'handwriting_score': handwriting_score,
        'is_handwriting': is_handwriting,
        'mask': component_mask
    })

# Sort by area (largest first)
components_analysis.sort(key=lambda c: c['area'], reverse=True)

# Print analysis
print("\n" + "-"*80)
print("Top 10 Components Analysis:")
print("-"*80)
print(f"{'ID':<4} {'Area':<6} {'H':<4} {'StrokeLen':<9} {'StrokeR':<7} {'Compact':<7} "
      f"{'Solid':<6} {'Score':<5} {'Type':<12}")
print("-"*80)

for i, comp in enumerate(components_analysis[:10]):
    comp_type = "✅ Handwriting" if comp['is_handwriting'] else "❌ Printed"
    print(f"{comp['id']:<4} {comp['area']:<6} {comp['height']:<4} "
          f"{comp['stroke_length']:<9.0f} {comp['stroke_ratio']:<7.3f} "
          f"{comp['compactness']:<7.3f} {comp['solidity']:<6.3f} "
          f"{comp['handwriting_score']:>+5} {comp_type:<12}")

# Create masks
handwriting_mask = np.zeros_like(binary)
printed_mask = np.zeros_like(binary)

for comp in components_analysis:
    if comp['is_handwriting']:
        handwriting_mask = cv2.bitwise_or(handwriting_mask, comp['mask'])
    else:
        printed_mask = cv2.bitwise_or(printed_mask, comp['mask'])

# Statistics
hw_count = sum(1 for c in components_analysis if c['is_handwriting'])
pr_count = sum(1 for c in components_analysis if not c['is_handwriting'])

print("\n" + "="*80)
print("Classification Results:")
print("="*80)
print(f"  Handwriting components: {hw_count}")
print(f"  Printed components: {pr_count}")
print(f"  Total: {len(components_analysis)}")

# Apply to original image
result_handwriting = cv2.bitwise_and(image, image, mask=handwriting_mask)
result_printed = cv2.bitwise_and(image, image, mask=printed_mask)

# Save results
cv2.imwrite(str(Path(OUTPUT_DIR) / "method3_handwriting_mask.png"), handwriting_mask)
cv2.imwrite(str(Path(OUTPUT_DIR) / "method3_printed_mask.png"), printed_mask)
cv2.imwrite(str(Path(OUTPUT_DIR) / "method3_handwriting_result.png"), result_handwriting)
cv2.imwrite(str(Path(OUTPUT_DIR) / "method3_printed_result.png"), result_printed)

# Create visualization
vis_overlay = image.copy()
vis_overlay[handwriting_mask > 0] = [0, 255, 0]  # Green for handwriting
vis_overlay[printed_mask > 0] = [0, 0, 255]      # Red for printed
vis_final = cv2.addWeighted(image, 0.6, vis_overlay, 0.4, 0)

# Add labels to visualization
for comp in components_analysis[:15]:  # Label top 15
    x, y, w, h = comp['box']
    cx, cy = x + w//2, y + h//2

    color = (0, 255, 0) if comp['is_handwriting'] else (0, 0, 255)
    label = f"H{comp['handwriting_score']:+d}" if comp['is_handwriting'] else f"P{comp['handwriting_score']:+d}"

    cv2.putText(vis_final, label, (cx - 15, cy), cv2.FONT_HERSHEY_SIMPLEX, 0.4, color, 1)

cv2.imwrite(str(Path(OUTPUT_DIR) / "method3_visualization.png"), vis_final)

print("\n📁 Saved results:")
print("  - method3_handwriting_mask.png")
print("  - method3_printed_mask.png")
print("  - method3_handwriting_result.png")
print("  - method3_printed_result.png")
print("  - method3_visualization.png")

# Calculate content pixels
hw_pixels = np.count_nonzero(handwriting_mask)
pr_pixels = np.count_nonzero(printed_mask)
total_pixels = np.count_nonzero(binary)

print("\n" + "="*80)
print("Pixel Distribution:")
print("="*80)
print(f"  Total foreground: {total_pixels:6d} pixels (100.0%)")
print(f"  Handwriting: {hw_pixels:6d} pixels ({hw_pixels/total_pixels*100:5.1f}%)")
print(f"  Printed: {pr_pixels:6d} pixels ({pr_pixels/total_pixels*100:5.1f}%)")

print("\n" + "="*80)
print("Test completed!")
print(f"Results: {OUTPUT_DIR}")
print("="*80)

print("\n📊 Feature Analysis Summary:")
print("  ✅ Size-based classification: Large characters → Handwriting")
print("  ✅ Stroke length analysis: Long stroke ratio → Handwriting")
print("  ✅ Regularity analysis: Irregular shapes → Handwriting")
print("\nNext: Review visualization to tune thresholds if needed")