# Commit notes (from the original change description):
# - Implemented comprehensive feature analysis based on size, stroke length, and regularity
# - Size-based scoring: height >50px indicates handwriting
# - Stroke length ratio: >0.4 indicates handwriting
# - Irregularity metrics: low compactness/solidity indicates handwriting
# - Successfully tested on sample PDF with 2 signatures (楊智惠, 張志銘)
# - Created detailed documentation: CURRENT_STATUS.md and NEW_SESSION_HANDOFF.md
# - Stable PaddleOCR 2.7.3 configuration documented (numpy 1.26.4, opencv 4.6.0.66)
# - Prepared research plan for PP-OCRv5 upgrade investigation
# 🤖 Generated with Claude Code — Co-Authored-By: Claude <noreply@anthropic.com>
# File stats: 257 lines, 9.0 KiB, Python.
#!/usr/bin/env python3
"""Separate handwriting from printed text in a scanned image using OpenCV.

Advanced separation based on three key observations (original notes in
Chinese; English translations added):

1. 手写字比印刷字大 — handwritten characters are LARGER than printed ones.
2. 手写笔画长度更长 — handwriting strokes are LONGER (greater skeleton
   length relative to filled area).
3. 印刷标楷体规律,手写潦草 — printed (standard Kai typeface) glyphs are
   regular; handwriting is messy and irregular.

Flat experiment script: loads TEST_IMAGE, scores every connected
component on size / stroke length / regularity, and writes per-class
masks plus a color-coded visualization into OUTPUT_DIR.
"""
from pathlib import Path

import cv2
import numpy as np
from scipy import ndimage
from skimage.morphology import skeletonize
# Input image for this experiment and the directory all results go to.
TEST_IMAGE = "/Volumes/NV2/PDF-Processing/signature-image-output/paddleocr_improved/signature_02_original.png"
OUTPUT_DIR = "/Volumes/NV2/PDF-Processing/signature-image-output/opencv_advanced_test"

# Announce the run with a banner.
banner = "=" * 80
print(banner)
print("Advanced OpenCV Separation - Size + Stroke Length + Regularity")
print(banner)

# Ensure the output directory exists before any imwrite call below.
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
# Load the test image and binarize it (Otsu threshold, inverted so that
# ink pixels become foreground/255).
image = cv2.imread(TEST_IMAGE)
if image is None:
    # cv2.imread returns None silently for a missing/unreadable file;
    # fail loudly here instead of crashing later inside cvtColor.
    raise FileNotFoundError(f"Cannot read test image: {TEST_IMAGE}")
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
_, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

print(f"\nImage: {image.shape[1]}x{image.shape[0]}")

# Save the binarized image for visual inspection.
cv2.imwrite(str(Path(OUTPUT_DIR) / "00_binary.png"), binary)
# Section header for the feature-analysis pass.
section_rule = "=" * 80
print("\n" + section_rule)
print("METHOD 3: Comprehensive Feature Analysis")
print(section_rule)

# Label every 8-connected foreground blob; label 0 is the background.
num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(binary, connectivity=8)

print(f"\nFound {num_labels - 1} connected components")
print("\nAnalyzing each component...")

# One feature dict per component, filled in by the loop below.
components_analysis = []
for i in range(1, num_labels):  # label 0 is the background
    x, y, w, h, area = stats[i]

    # 0/255 mask covering only this component.
    component_mask = (labels == i).astype(np.uint8) * 255

    # ============================================
    # FEATURE 1: Size (handwriting is larger than print)
    # ============================================
    bbox_area = w * h
    font_height = h  # character height is a good size indicator

    # ============================================
    # FEATURE 2: Stroke length
    # ============================================
    # Skeletonize down to the 1-px stroke centerline; the skeleton's
    # pixel count approximates total stroke length.  (The import was
    # hoisted to module level instead of re-running every iteration.)
    skeleton = skeletonize(component_mask // 255)
    stroke_length = np.sum(skeleton)

    # Stroke length relative to filled area: thin, long handwriting
    # strokes score higher than blocky printed glyphs.
    stroke_length_ratio = stroke_length / area if area > 0 else 0

    # ============================================
    # FEATURE 3: Regularity vs messiness
    # ============================================
    contours, _ = cv2.findContours(component_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # `perimeter` must be defined even when no contour is found;
    # otherwise edge_roughness below raised NameError in the original.
    perimeter = 0.0
    compactness = 0
    solidity = 0
    if contours:
        outline = contours[0]  # one external contour per connected component
        # 3a. Compactness (4*pi*A / P^2; 1.0 for a circle) — regular
        # printed shapes are more compact.
        perimeter = cv2.arcLength(outline, True)
        if perimeter > 0:
            compactness = (4 * np.pi * area) / (perimeter * perimeter)
        # 3b. Solidity (area / convex-hull area) — messy strokes leave
        # concavities, lowering solidity.
        hull_area = cv2.contourArea(cv2.convexHull(outline))
        if hull_area > 0:
            solidity = area / hull_area

    # 3c. Extent (area / bounding-box area).
    extent = area / bbox_area if bbox_area > 0 else 0

    # 3d. Edge roughness: Canny edge pixels per unit of perimeter —
    # more irregular edges = more "messy" = likely handwriting.
    edges = cv2.Canny(component_mask, 50, 150)
    edge_pixels = np.sum(edges > 0)
    edge_roughness = edge_pixels / perimeter if perimeter > 0 else 0

    # ============================================
    # CLASSIFICATION LOGIC
    # ============================================
    # Irregular shapes (low compactness, low solidity, low extent)
    # suggest handwriting; regular shapes suggest print.
    is_irregular = compactness < 0.3 or solidity < 0.7 or extent < 0.5

    # DECISION RULES: additive score; positive total => handwriting.
    handwriting_score = 0

    # Size-based scoring (important!)
    if font_height > 50:
        handwriting_score += 3  # very large = likely handwriting
    elif font_height > 35:
        handwriting_score += 2  # medium-large = possibly handwriting
    elif font_height < 25:
        handwriting_score -= 2  # small = likely printed

    # Stroke-length scoring
    if stroke_length_ratio > 0.5:
        handwriting_score += 2  # long strokes
    elif stroke_length_ratio > 0.35:
        handwriting_score += 1

    # Regularity scoring (printed Kai type is regular, handwriting is messy)
    if is_irregular:
        handwriting_score += 1  # irregular = handwriting
    else:
        handwriting_score -= 1  # regular = printed

    # Area scoring
    if area > 2000:
        handwriting_score += 2  # large area = handwriting
    elif area < 500:
        handwriting_score -= 1  # small area = printed

    # Final classification
    is_handwriting = handwriting_score > 0

    components_analysis.append({
        'id': i,
        'box': (x, y, w, h),
        'area': area,
        'height': font_height,
        'stroke_length': stroke_length,
        'stroke_ratio': stroke_length_ratio,
        'compactness': compactness,
        'solidity': solidity,
        'extent': extent,
        'edge_roughness': edge_roughness,
        'handwriting_score': handwriting_score,
        'is_handwriting': is_handwriting,
        'mask': component_mask,
    })
# Sort by area (largest first) so the biggest glyphs are listed first.
components_analysis.sort(key=lambda c: c['area'], reverse=True)

# Print a feature table for the ten largest components.
print("\n" + "-"*80)
print("Top 10 Components Analysis:")
print("-"*80)
print(f"{'ID':<4} {'Area':<6} {'H':<4} {'StrokeLen':<9} {'StrokeR':<7} {'Compact':<7} "
      f"{'Solid':<6} {'Score':<5} {'Type':<12}")
print("-"*80)

# NOTE: the original used enumerate() but never used the index (and the
# index name shadowed the earlier loop variable) — iterate directly.
for comp in components_analysis[:10]:
    comp_type = "✅ Handwriting" if comp['is_handwriting'] else "❌ Printed"
    print(f"{comp['id']:<4} {comp['area']:<6} {comp['height']:<4} "
          f"{comp['stroke_length']:<9.0f} {comp['stroke_ratio']:<7.3f} "
          f"{comp['compactness']:<7.3f} {comp['solidity']:<6.3f} "
          f"{comp['handwriting_score']:>+5} {comp_type:<12}")
# Merge the per-component masks into one mask per class.
handwriting_mask = np.zeros_like(binary)
printed_mask = np.zeros_like(binary)

for comp in components_analysis:
    # Pick the class mask this component belongs to and stamp it in.
    # (Setting 255 where the component mask is set is equivalent to the
    # original bitwise OR, since all masks hold only 0 or 255.)
    target = handwriting_mask if comp['is_handwriting'] else printed_mask
    target[comp['mask'] > 0] = 255
# Summary statistics: every component is either handwriting or printed,
# so the printed count is just the remainder.
hw_count = sum(1 for c in components_analysis if c['is_handwriting'])
pr_count = len(components_analysis) - hw_count

rule = "=" * 80
print("\n" + rule)
print("Classification Results:")
print(rule)
print(f" Handwriting components: {hw_count}")
print(f" Printed components: {pr_count}")
print(f" Total: {len(components_analysis)}")
# Keep only the pixels of each class from the original image.
result_handwriting = cv2.bitwise_and(image, image, mask=handwriting_mask)
result_printed = cv2.bitwise_and(image, image, mask=printed_mask)

# Write all four method-3 artifacts in one pass.
out_dir = Path(OUTPUT_DIR)
for fname, img in (
    ("method3_handwriting_mask.png", handwriting_mask),
    ("method3_printed_mask.png", printed_mask),
    ("method3_handwriting_result.png", result_handwriting),
    ("method3_printed_result.png", result_printed),
):
    cv2.imwrite(str(out_dir / fname), img)

# Build a color-coded overlay (BGR): green = handwriting, red = printed,
# blended 40/60 with the original for context.
vis_overlay = image.copy()
vis_overlay[handwriting_mask > 0] = [0, 255, 0]
vis_overlay[printed_mask > 0] = [0, 0, 255]
vis_final = cv2.addWeighted(image, 0.6, vis_overlay, 0.4, 0)
# Annotate the 15 largest components on the overlay with their class
# letter (H/P) and signed handwriting score.
for comp in components_analysis[:15]:
    x, y, w, h = comp['box']
    cx, cy = x + w // 2, y + h // 2

    if comp['is_handwriting']:
        color, prefix = (0, 255, 0), "H"
    else:
        color, prefix = (0, 0, 255), "P"
    label = f"{prefix}{comp['handwriting_score']:+d}"

    cv2.putText(vis_final, label, (cx - 15, cy), cv2.FONT_HERSHEY_SIMPLEX, 0.4, color, 1)

cv2.imwrite(str(Path(OUTPUT_DIR) / "method3_visualization.png"), vis_final)
# List the files written above.
print("\n📁 Saved results:")
print(" - method3_handwriting_mask.png")
print(" - method3_printed_mask.png")
print(" - method3_handwriting_result.png")
print(" - method3_printed_result.png")
print(" - method3_visualization.png")

# Pixel-level distribution of the two classes over the foreground.
hw_pixels = np.count_nonzero(handwriting_mask)
pr_pixels = np.count_nonzero(printed_mask)
total_pixels = np.count_nonzero(binary)

print("\n" + "="*80)
print("Pixel Distribution:")
print("="*80)
# Guard against an all-background binary image (total_pixels == 0),
# which previously raised ZeroDivisionError; percentages then print 0.0.
denom = total_pixels if total_pixels > 0 else 1
print(f" Total foreground: {total_pixels:6d} pixels (100.0%)")
print(f" Handwriting: {hw_pixels:6d} pixels ({hw_pixels/denom*100:5.1f}%)")
print(f" Printed: {pr_pixels:6d} pixels ({pr_pixels/denom*100:5.1f}%)")

print("\n" + "="*80)
print("Test completed!")
print(f"Results: {OUTPUT_DIR}")
print("="*80)

# Human-readable recap of the heuristics used.
print("\n📊 Feature Analysis Summary:")
print(" ✅ Size-based classification: Large characters → Handwriting")
print(" ✅ Stroke length analysis: Long stroke ratio → Handwriting")
print(" ✅ Regularity analysis: Irregular shapes → Handwriting")
print("\nNext: Review visualization to tune thresholds if needed")