#!/usr/bin/env python3
"""Advanced OpenCV handwriting / printed-text separation.

Classification rests on three observations about the source documents:
1. Handwritten characters are LARGER than printed ones.
2. Handwritten strokes are LONGER (higher skeleton-length / area ratio).
3. Printed text (standard Kai typeface) is regular; handwriting is messy.

Reads a fixed test image, classifies each connected component as
handwriting or printed via a feature-based score, and writes masks,
separated images, and a colored visualization to OUTPUT_DIR.
"""

import cv2
import numpy as np
from pathlib import Path
from scipy import ndimage  # noqa: F401  (kept from original file; unused below)
# BUGFIX: was imported inside the per-component loop on every iteration.
from skimage.morphology import skeletonize

# Test image and output location (hard-coded for this experiment).
TEST_IMAGE = "/Volumes/NV2/PDF-Processing/signature-image-output/paddleocr_improved/signature_02_original.png"
OUTPUT_DIR = "/Volumes/NV2/PDF-Processing/signature-image-output/opencv_advanced_test"


def _extract_features(component_mask, x, y, w, h, area):
    """Compute size / stroke-length / regularity features for one component.

    ``component_mask`` is a uint8 image with 255 on the component's pixels
    and 0 elsewhere; (x, y, w, h, area) come from the stats row of
    ``cv2.connectedComponentsWithStats``. Returns a dict of scalar features.
    """
    bbox_area = w * h
    # FEATURE 1: size — character height is a good size indicator
    # (handwriting is larger than print in these documents).
    font_height = h

    # FEATURE 2: total stroke length via the skeleton (stroke centerline).
    skeleton = skeletonize(component_mask // 255)
    stroke_length = np.sum(skeleton)
    # Stroke length relative to area: handwriting has a higher ratio.
    stroke_length_ratio = stroke_length / area if area > 0 else 0

    # FEATURE 3: regularity vs. messiness.
    contours, _ = cv2.findContours(component_mask, cv2.RETR_EXTERNAL,
                                   cv2.CHAIN_APPROX_SIMPLE)
    perimeter = 0.0  # BUGFIX: was undefined (NameError) when no contour found
    if contours:
        perimeter = cv2.arcLength(contours[0], True)
        # 3a. Compactness: regular (printed) glyphs are more compact.
        compactness = ((4 * np.pi * area) / (perimeter * perimeter)
                       if perimeter > 0 else 0)
        # 3b. Solidity: area relative to the convex hull area.
        hull_area = cv2.contourArea(cv2.convexHull(contours[0]))
        solidity = area / hull_area if hull_area > 0 else 0
    else:
        compactness = 0
        solidity = 0

    # 3c. Extent: area relative to the bounding-box area.
    extent = area / bbox_area if bbox_area > 0 else 0

    # 3d. Edge roughness: more irregular edges => "messier" => handwriting.
    edges = cv2.Canny(component_mask, 50, 150)
    edge_pixels = np.sum(edges > 0)
    edge_roughness = edge_pixels / perimeter if perimeter > 0 else 0

    return {
        'box': (x, y, w, h),
        'area': area,
        'height': font_height,
        'stroke_length': stroke_length,
        'stroke_ratio': stroke_length_ratio,
        'compactness': compactness,
        'solidity': solidity,
        'extent': extent,
        'edge_roughness': edge_roughness,
    }


def _handwriting_score(features):
    """Score a component from its feature dict.

    Positive score => handwriting, non-positive => printed. Kept as a pure
    function of the features so the thresholds are easy to tune and test.
    """
    font_height = features['height']
    stroke_length_ratio = features['stroke_ratio']
    area = features['area']

    # Irregular shapes (low compactness / solidity / extent) => handwriting.
    is_irregular = (features['compactness'] < 0.3
                    or features['solidity'] < 0.7
                    or features['extent'] < 0.5)

    score = 0

    # Size-based scoring (important: handwriting is larger than print).
    if font_height > 50:
        score += 3   # very large = likely handwriting
    elif font_height > 35:
        score += 2   # medium-large = possibly handwriting
    elif font_height < 25:
        score -= 2   # small = likely printed

    # Stroke-length scoring: long strokes relative to area.
    if stroke_length_ratio > 0.5:
        score += 2   # long strokes
    elif stroke_length_ratio > 0.35:
        score += 1

    # Regularity scoring (standard Kai typeface is regular; handwriting is messy).
    score += 1 if is_irregular else -1

    # Area scoring.
    if area > 2000:
        score += 2   # large area = handwriting
    elif area < 500:
        score -= 1   # small area = printed

    return score


def main():
    """Run the full separation pipeline and write results to OUTPUT_DIR."""
    print("=" * 80)
    print("Advanced OpenCV Separation - Size + Stroke Length + Regularity")
    print("=" * 80)

    Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

    # Load and binarize (Otsu threshold, inverted so ink is foreground/255).
    image = cv2.imread(TEST_IMAGE)
    if image is None:
        # BUGFIX: fail fast with a clear message instead of an opaque cv2 error.
        raise FileNotFoundError(f"Cannot read test image: {TEST_IMAGE}")
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 0, 255,
                              cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    print(f"\nImage: {image.shape[1]}x{image.shape[0]}")

    # Save binary for inspection.
    cv2.imwrite(str(Path(OUTPUT_DIR) / "00_binary.png"), binary)

    print("\n" + "=" * 80)
    print("METHOD 3: Comprehensive Feature Analysis")
    print("=" * 80)

    # Find connected components (label 0 is the background).
    num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(
        binary, connectivity=8)
    print(f"\nFound {num_labels - 1} connected components")
    print("\nAnalyzing each component...")

    # Analyze and classify every component.
    components_analysis = []
    for i in range(1, num_labels):
        x, y, w, h, area = stats[i]
        component_mask = (labels == i).astype(np.uint8) * 255

        comp = _extract_features(component_mask, x, y, w, h, area)
        comp['id'] = i
        comp['handwriting_score'] = _handwriting_score(comp)
        comp['is_handwriting'] = comp['handwriting_score'] > 0
        comp['mask'] = component_mask
        components_analysis.append(comp)

    # Largest components first for reporting.
    components_analysis.sort(key=lambda c: c['area'], reverse=True)

    # Print analysis table for the largest components.
    print("\n" + "-" * 80)
    print("Top 10 Components Analysis:")
    print("-" * 80)
    print(f"{'ID':<4} {'Area':<6} {'H':<4} {'StrokeLen':<9} {'StrokeR':<7} {'Compact':<7} "
          f"{'Solid':<6} {'Score':<5} {'Type':<12}")
    print("-" * 80)
    for comp in components_analysis[:10]:
        comp_type = "✅ Handwriting" if comp['is_handwriting'] else "❌ Printed"
        print(f"{comp['id']:<4} {comp['area']:<6} {comp['height']:<4} "
              f"{comp['stroke_length']:<9.0f} {comp['stroke_ratio']:<7.3f} "
              f"{comp['compactness']:<7.3f} {comp['solidity']:<6.3f} "
              f"{comp['handwriting_score']:>+5} {comp_type:<12}")

    # Build the two class masks by OR-ing the component masks.
    handwriting_mask = np.zeros_like(binary)
    printed_mask = np.zeros_like(binary)
    for comp in components_analysis:
        if comp['is_handwriting']:
            handwriting_mask = cv2.bitwise_or(handwriting_mask, comp['mask'])
        else:
            printed_mask = cv2.bitwise_or(printed_mask, comp['mask'])

    # Statistics.
    hw_count = sum(1 for c in components_analysis if c['is_handwriting'])
    pr_count = len(components_analysis) - hw_count

    print("\n" + "=" * 80)
    print("Classification Results:")
    print("=" * 80)
    print(f" Handwriting components: {hw_count}")
    print(f" Printed components: {pr_count}")
    print(f" Total: {len(components_analysis)}")

    # Apply the masks to the original (color) image.
    result_handwriting = cv2.bitwise_and(image, image, mask=handwriting_mask)
    result_printed = cv2.bitwise_and(image, image, mask=printed_mask)

    out_dir = Path(OUTPUT_DIR)
    cv2.imwrite(str(out_dir / "method3_handwriting_mask.png"), handwriting_mask)
    cv2.imwrite(str(out_dir / "method3_printed_mask.png"), printed_mask)
    cv2.imwrite(str(out_dir / "method3_handwriting_result.png"), result_handwriting)
    cv2.imwrite(str(out_dir / "method3_printed_result.png"), result_printed)

    # Overlay visualization: green = handwriting, red = printed.
    vis_overlay = image.copy()
    vis_overlay[handwriting_mask > 0] = [0, 255, 0]
    vis_overlay[printed_mask > 0] = [0, 0, 255]
    vis_final = cv2.addWeighted(image, 0.6, vis_overlay, 0.4, 0)

    # Label the 15 largest components with their class and score.
    for comp in components_analysis[:15]:
        x, y, w, h = comp['box']
        cx, cy = x + w // 2, y + h // 2
        color = (0, 255, 0) if comp['is_handwriting'] else (0, 0, 255)
        prefix = "H" if comp['is_handwriting'] else "P"
        label = f"{prefix}{comp['handwriting_score']:+d}"
        cv2.putText(vis_final, label, (cx - 15, cy),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.4, color, 1)

    cv2.imwrite(str(out_dir / "method3_visualization.png"), vis_final)

    print("\n📁 Saved results:")
    print(" - method3_handwriting_mask.png")
    print(" - method3_printed_mask.png")
    print(" - method3_handwriting_result.png")
    print(" - method3_printed_result.png")
    print(" - method3_visualization.png")

    # Pixel-level distribution report.
    hw_pixels = np.count_nonzero(handwriting_mask)
    pr_pixels = np.count_nonzero(printed_mask)
    total_pixels = np.count_nonzero(binary)

    print("\n" + "=" * 80)
    print("Pixel Distribution:")
    print("=" * 80)
    print(f" Total foreground: {total_pixels:6d} pixels (100.0%)")
    if total_pixels:  # BUGFIX: guard division by zero on a blank page
        print(f" Handwriting: {hw_pixels:6d} pixels ({hw_pixels/total_pixels*100:5.1f}%)")
        print(f" Printed: {pr_pixels:6d} pixels ({pr_pixels/total_pixels*100:5.1f}%)")

    print("\n" + "=" * 80)
    print("Test completed!")
    print(f"Results: {OUTPUT_DIR}")
    print("=" * 80)

    print("\n📊 Feature Analysis Summary:")
    print(" ✅ Size-based classification: Large characters → Handwriting")
    print(" ✅ Stroke length analysis: Long stroke ratio → Handwriting")
    print(" ✅ Regularity analysis: Irregular shapes → Handwriting")
    print("\nNext: Review visualization to tune thresholds if needed")


if __name__ == "__main__":
    main()