Files
pdf_signature_extraction/test_opencv_advanced.py
gbanyan 8f231da3bc Complete OpenCV Method 3 implementation with 86.5% handwriting retention
- Implemented comprehensive feature analysis based on size, stroke length, and regularity
- Size-based scoring: height >50px indicates handwriting
- Stroke length ratio: >0.4 indicates handwriting
- Irregularity metrics: low compactness/solidity indicates handwriting
- Successfully tested on sample PDF with 2 signatures (楊智惠, 張志銘)
- Created detailed documentation: CURRENT_STATUS.md and NEW_SESSION_HANDOFF.md
- Stable PaddleOCR 2.7.3 configuration documented (numpy 1.26.4, opencv 4.6.0.66)
- Prepared research plan for PP-OCRv5 upgrade investigation

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-27 10:35:46 +08:00

257 lines
9.0 KiB
Python

#!/usr/bin/env python3
"""
Advanced OpenCV separation of handwriting from printed text, based on three
key observations:
1. Handwritten characters are LARGER than printed ones.
2. Handwritten strokes are LONGER.
3. Printed (standard Kai typeface) text is regular; handwriting is messy.
"""
import cv2
import numpy as np
from pathlib import Path
from scipy import ndimage

# Input image (a signature crop produced by the PaddleOCR pipeline) and the
# directory where all intermediate/result images are written.
TEST_IMAGE = "/Volumes/NV2/PDF-Processing/signature-image-output/paddleocr_improved/signature_02_original.png"
OUTPUT_DIR = "/Volumes/NV2/PDF-Processing/signature-image-output/opencv_advanced_test"

print("="*80)
print("Advanced OpenCV Separation - Size + Stroke Length + Regularity")
print("="*80)

Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

# Load and preprocess.
image = cv2.imread(TEST_IMAGE)
if image is None:
    # cv2.imread returns None (no exception) for a missing/unreadable file;
    # fail early with a clear message instead of a cryptic cvtColor error.
    raise FileNotFoundError(f"Cannot read test image: {TEST_IMAGE}")
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# Otsu threshold, inverted so that ink becomes white (255) foreground.
_, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
print(f"\nImage: {image.shape[1]}x{image.shape[0]}")

# Save binary for inspection.
cv2.imwrite(str(Path(OUTPUT_DIR) / "00_binary.png"), binary)
print("\n" + "="*80)
print("METHOD 3: Comprehensive Feature Analysis")
print("="*80)

# Find connected components (8-connectivity); label 0 is the background, so
# the real component count is num_labels - 1.
num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(binary, connectivity=8)
print(f"\nFound {num_labels - 1} connected components")
print("\nAnalyzing each component...")

# Per-component feature dictionaries, filled by the analysis loop below.
components_analysis = []
# Hoisted out of the loop: previously this import was executed on every
# single iteration.
from skimage.morphology import skeletonize

for i in range(1, num_labels):
    x, y, w, h, area = stats[i]
    # Binary mask (0/255) containing only this component.
    component_mask = (labels == i).astype(np.uint8) * 255

    # ============================================
    # FEATURE 1: Size (handwriting is larger than print)
    # ============================================
    bbox_area = w * h
    font_height = h  # character height is a good size indicator

    # ============================================
    # FEATURE 2: Stroke length
    # ============================================
    # Skeletonize down to the stroke centerline; the number of skeleton
    # pixels approximates the total stroke length.
    skeleton = skeletonize(component_mask // 255)
    stroke_length = np.sum(skeleton)
    # Stroke length relative to filled area (handwriting tends to be higher).
    stroke_length_ratio = stroke_length / area if area > 0 else 0

    # ============================================
    # FEATURE 3: Regularity vs messiness
    # ============================================
    contours, _ = cv2.findContours(component_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if contours:
        # 3a. Compactness: regular shapes approach 1 (a perfect circle).
        perimeter = cv2.arcLength(contours[0], True)
        compactness = (4 * np.pi * area) / (perimeter * perimeter) if perimeter > 0 else 0
        # 3b. Solidity: area relative to the convex-hull area.
        hull = cv2.convexHull(contours[0])
        hull_area = cv2.contourArea(hull)
        solidity = area / hull_area if hull_area > 0 else 0
    else:
        # BUG FIX: `perimeter` was previously assigned only inside the
        # `if contours:` branch, so the edge-roughness computation below
        # raised a NameError for components with no detectable contour.
        perimeter = 0
        compactness = 0
        solidity = 0

    # 3c. Extent: area relative to the bounding-box area.
    extent = area / bbox_area if bbox_area > 0 else 0

    # 3d. Edge roughness: more irregular edges = more "messy" = likely handwriting.
    edges = cv2.Canny(component_mask, 50, 150)
    edge_pixels = np.sum(edges > 0)
    edge_roughness = edge_pixels / perimeter if perimeter > 0 else 0

    # ============================================
    # CLASSIFICATION LOGIC
    # ============================================
    # Regular shapes (high compactness/solidity/extent) suggest print;
    # irregular shapes suggest handwriting.
    # (Unused flags `is_large` and `is_long_stroke` were removed — the score
    # below tests the raw thresholds directly.)
    is_irregular = compactness < 0.3 or solidity < 0.7 or extent < 0.5

    # DECISION RULES: accumulate a signed score; positive = handwriting.
    handwriting_score = 0

    # Size-based scoring (important!)
    if font_height > 50:
        handwriting_score += 3  # very large = likely handwriting
    elif font_height > 35:
        handwriting_score += 2  # medium-large = possibly handwriting
    elif font_height < 25:
        handwriting_score -= 2  # small = likely printed

    # Stroke-length scoring
    if stroke_length_ratio > 0.5:
        handwriting_score += 2  # long strokes
    elif stroke_length_ratio > 0.35:
        handwriting_score += 1

    # Regularity scoring (standard Kai print is regular, handwriting is messy)
    if is_irregular:
        handwriting_score += 1  # irregular = handwriting
    else:
        handwriting_score -= 1  # regular = printed

    # Area scoring
    if area > 2000:
        handwriting_score += 2  # large area = handwriting
    elif area < 500:
        handwriting_score -= 1  # small area = printed

    # Final classification
    is_handwriting = handwriting_score > 0

    components_analysis.append({
        'id': i,
        'box': (x, y, w, h),
        'area': area,
        'height': font_height,
        'stroke_length': stroke_length,
        'stroke_ratio': stroke_length_ratio,
        'compactness': compactness,
        'solidity': solidity,
        'extent': extent,
        'edge_roughness': edge_roughness,
        'handwriting_score': handwriting_score,
        'is_handwriting': is_handwriting,
        'mask': component_mask
    })
# Sort by area (largest first)
components_analysis.sort(key=lambda c: c['area'], reverse=True)

# Print an analysis table for the ten largest components.
print("\n" + "-"*80)
print("Top 10 Components Analysis:")
print("-"*80)
print(f"{'ID':<4} {'Area':<6} {'H':<4} {'StrokeLen':<9} {'StrokeR':<7} {'Compact':<7} "
      f"{'Solid':<6} {'Score':<5} {'Type':<12}")
print("-"*80)
# FIX: dropped the unused `enumerate` index that the loop previously bound.
for comp in components_analysis[:10]:
    comp_type = "✅ Handwriting" if comp['is_handwriting'] else "❌ Printed"
    print(f"{comp['id']:<4} {comp['area']:<6} {comp['height']:<4} "
          f"{comp['stroke_length']:<9.0f} {comp['stroke_ratio']:<7.3f} "
          f"{comp['compactness']:<7.3f} {comp['solidity']:<6.3f} "
          f"{comp['handwriting_score']:>+5} {comp_type:<12}")
# Build one binary mask per class and tally the component counts in a
# single pass over the classified components.
handwriting_mask = np.zeros_like(binary)
printed_mask = np.zeros_like(binary)
hw_count = 0
pr_count = 0
for component in components_analysis:
    if component['is_handwriting']:
        handwriting_mask = cv2.bitwise_or(handwriting_mask, component['mask'])
        hw_count += 1
    else:
        printed_mask = cv2.bitwise_or(printed_mask, component['mask'])
        pr_count += 1

# Report how many components landed in each class.
print("\n" + "="*80)
print("Classification Results:")
print("="*80)
print(f" Handwriting components: {hw_count}")
print(f" Printed components: {pr_count}")
print(f" Total: {len(components_analysis)}")
# Apply the class masks to the original (color) image.
result_handwriting = cv2.bitwise_and(image, image, mask=handwriting_mask)
result_printed = cv2.bitwise_and(image, image, mask=printed_mask)

# Save masks and masked results.
cv2.imwrite(str(Path(OUTPUT_DIR) / "method3_handwriting_mask.png"), handwriting_mask)
cv2.imwrite(str(Path(OUTPUT_DIR) / "method3_printed_mask.png"), printed_mask)
cv2.imwrite(str(Path(OUTPUT_DIR) / "method3_handwriting_result.png"), result_handwriting)
cv2.imwrite(str(Path(OUTPUT_DIR) / "method3_printed_result.png"), result_printed)

# Create a semi-transparent color visualization (colors are BGR).
vis_overlay = image.copy()
vis_overlay[handwriting_mask > 0] = [0, 255, 0]  # Green for handwriting
vis_overlay[printed_mask > 0] = [0, 0, 255]  # Red for printed
vis_final = cv2.addWeighted(image, 0.6, vis_overlay, 0.4, 0)

# Label the 15 largest components with class letter + score, drawn roughly
# at the center of each component's bounding box.
for comp in components_analysis[:15]:  # Label top 15
    x, y, w, h = comp['box']
    cx, cy = x + w//2, y + h//2
    color = (0, 255, 0) if comp['is_handwriting'] else (0, 0, 255)
    label = f"H{comp['handwriting_score']:+d}" if comp['is_handwriting'] else f"P{comp['handwriting_score']:+d}"
    cv2.putText(vis_final, label, (cx-15, cy), cv2.FONT_HERSHEY_SIMPLEX, 0.4, color, 1)
cv2.imwrite(str(Path(OUTPUT_DIR) / "method3_visualization.png"), vis_final)
print("\n📁 Saved results:")
print(" - method3_handwriting_mask.png")
print(" - method3_printed_mask.png")
print(" - method3_handwriting_result.png")
print(" - method3_printed_result.png")
print(" - method3_visualization.png")

# Pixel-level breakdown of how the foreground was split between classes.
hw_pixels = np.count_nonzero(handwriting_mask)
pr_pixels = np.count_nonzero(printed_mask)
total_pixels = np.count_nonzero(binary)
# FIX: guard against division by zero when the binary image contains no
# foreground at all (e.g. a blank input page). Output is unchanged for any
# non-empty image.
hw_pct = hw_pixels / total_pixels * 100 if total_pixels else 0.0
pr_pct = pr_pixels / total_pixels * 100 if total_pixels else 0.0

print("\n" + "="*80)
print("Pixel Distribution:")
print("="*80)
print(f" Total foreground: {total_pixels:6d} pixels (100.0%)")
print(f" Handwriting: {hw_pixels:6d} pixels ({hw_pct:5.1f}%)")
print(f" Printed: {pr_pixels:6d} pixels ({pr_pct:5.1f}%)")

print("\n" + "="*80)
print("Test completed!")
print(f"Results: {OUTPUT_DIR}")
print("="*80)

print("\n📊 Feature Analysis Summary:")
print(" ✅ Size-based classification: Large characters → Handwriting")
print(" ✅ Stroke length analysis: Long stroke ratio → Handwriting")
print(" ✅ Regularity analysis: Irregular shapes → Handwriting")
print("\nNext: Review visualization to tune thresholds if needed")