Files
pdf_signature_extraction/test_opencv_advanced.py
gbanyan 8f231da3bc Complete OpenCV Method 3 implementation with 86.5% handwriting retention
- Implemented comprehensive feature analysis based on size, stroke length, and regularity
- Size-based scoring: height >50px indicates handwriting
- Stroke length ratio: >0.4 indicates handwriting
- Irregularity metrics: low compactness/solidity indicates handwriting
- Successfully tested on sample PDF with 2 signatures (楊智惠, 張志銘)
- Created detailed documentation: CURRENT_STATUS.md and NEW_SESSION_HANDOFF.md
- Stable PaddleOCR 2.7.3 configuration documented (numpy 1.26.4, opencv 4.6.0.66)
- Prepared research plan for PP-OCRv5 upgrade investigation

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-27 10:35:46 +08:00

257 lines
9.0 KiB
Python

#!/usr/bin/env python3
"""
Advanced OpenCV separation of handwriting from printed text, based on three
key observations:
1. Handwritten characters are LARGER than printed ones.
2. Handwritten strokes are LONGER.
3. Printed (standard Kai typeface) text is regular; handwriting is messy.
"""
import cv2
import numpy as np
from pathlib import Path
from scipy import ndimage

# Input image (a signature crop produced by the PaddleOCR pipeline) and the
# directory where all intermediate/result images are written.
TEST_IMAGE = "/Volumes/NV2/PDF-Processing/signature-image-output/paddleocr_improved/signature_02_original.png"
OUTPUT_DIR = "/Volumes/NV2/PDF-Processing/signature-image-output/opencv_advanced_test"

print("="*80)
print("Advanced OpenCV Separation - Size + Stroke Length + Regularity")
print("="*80)

Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

# Load and preprocess.
image = cv2.imread(TEST_IMAGE)
if image is None:
    # cv2.imread returns None (no exception) for a missing/unreadable file;
    # fail early with a clear message instead of a cryptic cvtColor error.
    raise FileNotFoundError(f"Cannot read test image: {TEST_IMAGE}")
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# Otsu threshold, inverted so that ink becomes white (255) foreground.
_, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
print(f"\nImage: {image.shape[1]}x{image.shape[0]}")

# Save binary for inspection.
cv2.imwrite(str(Path(OUTPUT_DIR) / "00_binary.png"), binary)
print("\n" + "="*80)
print("METHOD 3: Comprehensive Feature Analysis")
print("="*80)

# Find connected components (8-connectivity); label 0 is the background, so
# the real component count is num_labels - 1.
num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(binary, connectivity=8)
print(f"\nFound {num_labels - 1} connected components")
print("\nAnalyzing each component...")

# Per-component feature dictionaries, filled by the analysis loop below.
components_analysis = []
# Hoisted out of the loop: previously this import was executed on every
# single iteration.
from skimage.morphology import skeletonize

for i in range(1, num_labels):
    x, y, w, h, area = stats[i]
    # Binary mask (0/255) containing only this component.
    component_mask = (labels == i).astype(np.uint8) * 255

    # ============================================
    # FEATURE 1: Size (handwriting is larger than print)
    # ============================================
    bbox_area = w * h
    font_height = h  # character height is a good size indicator

    # ============================================
    # FEATURE 2: Stroke length
    # ============================================
    # Skeletonize down to the stroke centerline; the number of skeleton
    # pixels approximates the total stroke length.
    skeleton = skeletonize(component_mask // 255)
    stroke_length = np.sum(skeleton)
    # Stroke length relative to filled area (handwriting tends to be higher).
    stroke_length_ratio = stroke_length / area if area > 0 else 0

    # ============================================
    # FEATURE 3: Regularity vs messiness
    # ============================================
    contours, _ = cv2.findContours(component_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if contours:
        # 3a. Compactness: regular shapes approach 1 (a perfect circle).
        perimeter = cv2.arcLength(contours[0], True)
        compactness = (4 * np.pi * area) / (perimeter * perimeter) if perimeter > 0 else 0
        # 3b. Solidity: area relative to the convex-hull area.
        hull = cv2.convexHull(contours[0])
        hull_area = cv2.contourArea(hull)
        solidity = area / hull_area if hull_area > 0 else 0
    else:
        # BUG FIX: `perimeter` was previously assigned only inside the
        # `if contours:` branch, so the edge-roughness computation below
        # raised a NameError for components with no detectable contour.
        perimeter = 0
        compactness = 0
        solidity = 0

    # 3c. Extent: area relative to the bounding-box area.
    extent = area / bbox_area if bbox_area > 0 else 0

    # 3d. Edge roughness: more irregular edges = more "messy" = likely handwriting.
    edges = cv2.Canny(component_mask, 50, 150)
    edge_pixels = np.sum(edges > 0)
    edge_roughness = edge_pixels / perimeter if perimeter > 0 else 0

    # ============================================
    # CLASSIFICATION LOGIC
    # ============================================
    # Regular shapes (high compactness/solidity/extent) suggest print;
    # irregular shapes suggest handwriting.
    # (Unused flags `is_large` and `is_long_stroke` were removed — the score
    # below tests the raw thresholds directly.)
    is_irregular = compactness < 0.3 or solidity < 0.7 or extent < 0.5

    # DECISION RULES: accumulate a signed score; positive = handwriting.
    handwriting_score = 0

    # Size-based scoring (important!)
    if font_height > 50:
        handwriting_score += 3  # very large = likely handwriting
    elif font_height > 35:
        handwriting_score += 2  # medium-large = possibly handwriting
    elif font_height < 25:
        handwriting_score -= 2  # small = likely printed

    # Stroke-length scoring
    if stroke_length_ratio > 0.5:
        handwriting_score += 2  # long strokes
    elif stroke_length_ratio > 0.35:
        handwriting_score += 1

    # Regularity scoring (standard Kai print is regular, handwriting is messy)
    if is_irregular:
        handwriting_score += 1  # irregular = handwriting
    else:
        handwriting_score -= 1  # regular = printed

    # Area scoring
    if area > 2000:
        handwriting_score += 2  # large area = handwriting
    elif area < 500:
        handwriting_score -= 1  # small area = printed

    # Final classification
    is_handwriting = handwriting_score > 0

    components_analysis.append({
        'id': i,
        'box': (x, y, w, h),
        'area': area,
        'height': font_height,
        'stroke_length': stroke_length,
        'stroke_ratio': stroke_length_ratio,
        'compactness': compactness,
        'solidity': solidity,
        'extent': extent,
        'edge_roughness': edge_roughness,
        'handwriting_score': handwriting_score,
        'is_handwriting': is_handwriting,
        'mask': component_mask
    })
# Sort by area (largest first)
components_analysis.sort(key=lambda c: c['area'], reverse=True)

# Print an analysis table for the ten largest components.
print("\n" + "-"*80)
print("Top 10 Components Analysis:")
print("-"*80)
print(f"{'ID':<4} {'Area':<6} {'H':<4} {'StrokeLen':<9} {'StrokeR':<7} {'Compact':<7} "
      f"{'Solid':<6} {'Score':<5} {'Type':<12}")
print("-"*80)
# FIX: dropped the unused `enumerate` index that the loop previously bound.
for comp in components_analysis[:10]:
    comp_type = "✅ Handwriting" if comp['is_handwriting'] else "❌ Printed"
    print(f"{comp['id']:<4} {comp['area']:<6} {comp['height']:<4} "
          f"{comp['stroke_length']:<9.0f} {comp['stroke_ratio']:<7.3f} "
          f"{comp['compactness']:<7.3f} {comp['solidity']:<6.3f} "
          f"{comp['handwriting_score']:>+5} {comp_type:<12}")
# Build one binary mask per class and tally the component counts in a
# single pass over the classified components.
handwriting_mask = np.zeros_like(binary)
printed_mask = np.zeros_like(binary)
hw_count = 0
pr_count = 0
for component in components_analysis:
    if component['is_handwriting']:
        handwriting_mask = cv2.bitwise_or(handwriting_mask, component['mask'])
        hw_count += 1
    else:
        printed_mask = cv2.bitwise_or(printed_mask, component['mask'])
        pr_count += 1

# Report how many components landed in each class.
print("\n" + "="*80)
print("Classification Results:")
print("="*80)
print(f" Handwriting components: {hw_count}")
print(f" Printed components: {pr_count}")
print(f" Total: {len(components_analysis)}")
# Apply the class masks to the original (color) image.
result_handwriting = cv2.bitwise_and(image, image, mask=handwriting_mask)
result_printed = cv2.bitwise_and(image, image, mask=printed_mask)

# Save masks and masked results.
cv2.imwrite(str(Path(OUTPUT_DIR) / "method3_handwriting_mask.png"), handwriting_mask)
cv2.imwrite(str(Path(OUTPUT_DIR) / "method3_printed_mask.png"), printed_mask)
cv2.imwrite(str(Path(OUTPUT_DIR) / "method3_handwriting_result.png"), result_handwriting)
cv2.imwrite(str(Path(OUTPUT_DIR) / "method3_printed_result.png"), result_printed)

# Create a semi-transparent color visualization (colors are BGR).
vis_overlay = image.copy()
vis_overlay[handwriting_mask > 0] = [0, 255, 0]  # Green for handwriting
vis_overlay[printed_mask > 0] = [0, 0, 255]  # Red for printed
vis_final = cv2.addWeighted(image, 0.6, vis_overlay, 0.4, 0)

# Label the 15 largest components with class letter + score, drawn roughly
# at the center of each component's bounding box.
for comp in components_analysis[:15]:  # Label top 15
    x, y, w, h = comp['box']
    cx, cy = x + w//2, y + h//2
    color = (0, 255, 0) if comp['is_handwriting'] else (0, 0, 255)
    label = f"H{comp['handwriting_score']:+d}" if comp['is_handwriting'] else f"P{comp['handwriting_score']:+d}"
    cv2.putText(vis_final, label, (cx-15, cy), cv2.FONT_HERSHEY_SIMPLEX, 0.4, color, 1)
cv2.imwrite(str(Path(OUTPUT_DIR) / "method3_visualization.png"), vis_final)
print("\n📁 Saved results:")
print(" - method3_handwriting_mask.png")
print(" - method3_printed_mask.png")
print(" - method3_handwriting_result.png")
print(" - method3_printed_result.png")
print(" - method3_visualization.png")

# Pixel-level breakdown of how the foreground was split between classes.
hw_pixels = np.count_nonzero(handwriting_mask)
pr_pixels = np.count_nonzero(printed_mask)
total_pixels = np.count_nonzero(binary)
# FIX: guard against division by zero when the binary image contains no
# foreground at all (e.g. a blank input page). Output is unchanged for any
# non-empty image.
hw_pct = hw_pixels / total_pixels * 100 if total_pixels else 0.0
pr_pct = pr_pixels / total_pixels * 100 if total_pixels else 0.0

print("\n" + "="*80)
print("Pixel Distribution:")
print("="*80)
print(f" Total foreground: {total_pixels:6d} pixels (100.0%)")
print(f" Handwriting: {hw_pixels:6d} pixels ({hw_pct:5.1f}%)")
print(f" Printed: {pr_pixels:6d} pixels ({pr_pct:5.1f}%)")

print("\n" + "="*80)
print("Test completed!")
print(f"Results: {OUTPUT_DIR}")
print("="*80)

print("\n📊 Feature Analysis Summary:")
print(" ✅ Size-based classification: Large characters → Handwriting")
print(" ✅ Stroke length analysis: Long stroke ratio → Handwriting")
print(" ✅ Regularity analysis: Irregular shapes → Handwriting")
print("\nNext: Review visualization to tune thresholds if needed")