#!/usr/bin/env python3
"""
Test OpenCV methods to separate handwriting from printed text.

Two methods are compared on a single test image:
  1. Stroke Width Analysis        - distance transform / stroke thickness
  2. Connected Components + Shape - per-component size/aspect heuristics

Intermediate and final images are written to OUTPUT_DIR for visual review.
"""

import sys
from pathlib import Path

import cv2
import numpy as np

# Test image - contains both printed and handwritten content.
TEST_IMAGE = "/Volumes/NV2/PDF-Processing/signature-image-output/paddleocr_improved/signature_02_original.png"
OUTPUT_DIR = "/Volumes/NV2/PDF-Processing/signature-image-output/opencv_separation_test"


def method1_stroke_width(binary_img, threshold_values=(2.0, 3.0, 4.0, 5.0)):
    """Separate strokes by width using the distance transform.

    Foreground pixels whose distance to the background exceeds a threshold
    belong to thick strokes, which we treat as candidate handwriting
    (pen strokes are typically thicker than printed glyph strokes).

    Args:
        binary_img: Binary image (foreground = 255, background = 0).
        threshold_values: Iterable of distance thresholds (pixels) to test.

    Returns:
        List of (name, image) tuples: the normalized distance transform
        first, then one handwriting mask per threshold, named
        'threshold_<t>' with the threshold formatted to one decimal.
    """
    results = []

    # Distance of every foreground pixel to the nearest background pixel.
    dist_transform = cv2.distanceTransform(binary_img, cv2.DIST_L2, 5)

    # Normalized copy, kept only for visual inspection.
    dist_normalized = cv2.normalize(
        dist_transform, None, 0, 255, cv2.NORM_MINMAX, cv2.CV_8U)
    results.append(('distance_transform', dist_normalized))

    print("\n  Distance transform statistics:")
    print(f"    Min:    {dist_transform.min():.2f}")
    print(f"    Max:    {dist_transform.max():.2f}")
    print(f"    Mean:   {dist_transform.mean():.2f}")
    print(f"    Median: {np.median(dist_transform):.2f}")

    print("\n  Testing different stroke width thresholds:")
    # Hoisted out of the loop: the foreground count never changes.
    total_foreground = np.count_nonzero(binary_img)
    for threshold in threshold_values:
        # Pixels with distance > threshold are "thick strokes" (handwriting).
        handwriting_mask = (dist_transform > threshold).astype(np.uint8) * 255
        handwriting_pixels = np.count_nonzero(handwriting_mask)
        percentage = (handwriting_pixels / total_foreground * 100) if total_foreground > 0 else 0
        print(f"    Threshold {threshold:.1f}: {handwriting_pixels} pixels "
              f"({percentage:.1f}% of foreground)")
        results.append((f'threshold_{threshold:.1f}', handwriting_mask))

    return results


def method2_component_analysis(binary_img, original_img):
    """Classify each connected component as printed or handwritten.

    Heuristics (tuned by visual inspection):
      - Printed text: small-to-medium area with a regular aspect ratio.
      - Handwriting:  large area (connected strokes), very elongated shapes,
        or anything not clearly printed (handwriting is the default class).

    Args:
        binary_img: Binary image (foreground = 255, background = 0).
        original_img: Original color image.  Currently unused; the parameter
            is kept so existing callers keep working.

    Returns:
        (handwriting_mask, printed_mask, component_info) where the masks are
        uint8 images (255 = selected) and component_info is a list of dicts
        with per-component features and classification flags.
    """
    num_labels, labels, stats, _centroids = cv2.connectedComponentsWithStats(
        binary_img, connectivity=8)
    print(f"\n  Found {num_labels - 1} connected components")

    handwriting_mask = np.zeros_like(binary_img)
    printed_mask = np.zeros_like(binary_img)

    component_info = []
    for i in range(1, num_labels):  # label 0 is the background
        x, y, w, h, area = stats[i]
        aspect_ratio = w / h if h > 0 else 0

        # Perimeter of the component's external contour.
        # NOTE(review): assumes OpenCV 4.x, where findContours returns
        # (contours, hierarchy) — confirm the installed version.
        component = (labels == i).astype(np.uint8)
        contours, _ = cv2.findContours(
            component, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        perimeter = cv2.arcLength(contours[0], True)
        compactness = (4 * np.pi * area) / (perimeter * perimeter) if perimeter > 0 else 0

        # Printed text: small-to-medium and not too elongated.
        # (Original combined 200<area<3000 with area<1000, which reduces
        # to 200<area<1000 — behavior unchanged, just simplified.)
        is_printed = (200 < area < 1000) and (0.3 < aspect_ratio < 3.0)

        # Handwriting: large components (connected strokes), very wide or
        # very tall shapes, or simply not clearly printed.
        is_handwriting = (
            area >= 3000
            or aspect_ratio > 3.0
            or aspect_ratio < 0.3
            or not is_printed
        )

        component_info.append({
            'id': i,
            'area': area,
            'aspect_ratio': aspect_ratio,
            'compactness': compactness,
            'is_printed': is_printed,
            'is_handwriting': is_handwriting,
        })

        if is_handwriting:
            handwriting_mask[labels == i] = 255
        if is_printed:
            printed_mask[labels == i] = 255

    handwriting_components = [c for c in component_info if c['is_handwriting']]
    printed_components = [c for c in component_info if c['is_printed']]
    print("\n  Component statistics:")
    print(f"    Handwriting components: {len(handwriting_components)}")
    print(f"    Printed components: {len(printed_components)}")

    print("\n  Top 5 largest components:")
    largest = sorted(component_info, key=lambda c: c['area'], reverse=True)[:5]
    for rank, comp in enumerate(largest, 1):
        comp_type = "Handwriting" if comp['is_handwriting'] else "Printed"
        print(f"    {rank}. Area: {comp['area']:5d}, "
              f"Aspect: {comp['aspect_ratio']:.2f}, "
              f"Type: {comp_type}")

    return handwriting_mask, printed_mask, component_info


def main():
    """Run both separation methods on TEST_IMAGE and save all results."""
    print("=" * 80)
    print("OpenCV Handwriting Separation Test")
    print("=" * 80)

    out_dir = Path(OUTPUT_DIR)
    out_dir.mkdir(parents=True, exist_ok=True)

    print(f"\nLoading test image: {Path(TEST_IMAGE).name}")
    image = cv2.imread(TEST_IMAGE)
    if image is None:
        print(f"Error: Cannot load image from {TEST_IMAGE}")
        sys.exit(1)
    print(f"Image size: {image.shape[1]}x{image.shape[0]}")

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Otsu threshold, inverted so ink (dark) becomes foreground (255).
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    cv2.imwrite(str(out_dir / "00_binary.png"), binary)
    print("\n📁 Saved: 00_binary.png")

    # --- Method 1: stroke width -------------------------------------------
    print("\n" + "=" * 80)
    print("METHOD 1: Stroke Width Analysis (笔画宽度分析)")
    print("=" * 80)
    method1_results = method1_stroke_width(
        binary, threshold_values=(2.0, 2.5, 3.0, 3.5, 4.0, 5.0))

    print("\n  Saving results...")
    for name, result_img in method1_results:
        output_path = out_dir / f"method1_{name}.png"
        cv2.imwrite(str(output_path), result_img)
        print(f"    📁 {output_path.name}")

    # Threshold chosen for the final composite; adjust after inspection.
    best_threshold = 3.0
    # Exact-name lookup (the original substring search was fragile and would
    # raise a bare IndexError on a typo'd threshold).
    best_mask = dict(method1_results)[f'threshold_{best_threshold:.1f}']

    # Slight dilation to reconnect strokes thinned by the distance threshold.
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
    best_mask_dilated = cv2.dilate(best_mask, kernel, iterations=1)
    result_method1 = cv2.bitwise_and(image, image, mask=best_mask_dilated)
    cv2.imwrite(str(out_dir / "method1_final_result.png"), result_method1)
    print(f"\n  📁 Final result: method1_final_result.png (threshold={best_threshold})")

    # --- Method 2: connected components -----------------------------------
    print("\n" + "=" * 80)
    print("METHOD 2: Connected Components + Shape Features (连通组件分析)")
    print("=" * 80)
    handwriting_mask_m2, printed_mask_m2, _components = method2_component_analysis(
        binary, image)

    print("\n  Saving results...")
    cv2.imwrite(str(out_dir / "method2_handwriting_mask.png"), handwriting_mask_m2)
    print("    📁 method2_handwriting_mask.png")
    cv2.imwrite(str(out_dir / "method2_printed_mask.png"), printed_mask_m2)
    print("    📁 method2_printed_mask.png")

    result_handwriting = cv2.bitwise_and(image, image, mask=handwriting_mask_m2)
    result_printed = cv2.bitwise_and(image, image, mask=printed_mask_m2)
    cv2.imwrite(str(out_dir / "method2_handwriting_result.png"), result_handwriting)
    print("    📁 method2_handwriting_result.png")
    cv2.imwrite(str(out_dir / "method2_printed_result.png"), result_printed)
    print("    📁 method2_printed_result.png")

    # Color-coded overlay blended with the original (BGR: green=handwriting,
    # red=printed).
    vis_overlay = image.copy()
    vis_overlay[handwriting_mask_m2 > 0] = [0, 255, 0]
    vis_overlay[printed_mask_m2 > 0] = [0, 0, 255]
    vis_final = cv2.addWeighted(image, 0.6, vis_overlay, 0.4, 0)
    cv2.imwrite(str(out_dir / "method2_visualization.png"), vis_final)
    print("    📁 method2_visualization.png (green=handwriting, red=printed)")

    # --- Comparison -------------------------------------------------------
    print("\n" + "=" * 80)
    print("COMPARISON")
    print("=" * 80)

    # Compare foreground-mask pixel counts directly.  (The previous version
    # counted gray>10 pixels of the color images, which on a white-background
    # document counted the *background* of the original image and therefore
    # produced meaningless percentages.)
    original_pixels = int(np.count_nonzero(binary))
    method1_pixels = int(np.count_nonzero(best_mask_dilated))
    method2_pixels = int(np.count_nonzero(handwriting_mask_m2))

    print("\nContent pixels retained:")
    print(f"  Original image:       {original_pixels:6d} pixels")
    if original_pixels > 0:
        print(f"  Method 1 (stroke):    {method1_pixels:6d} pixels "
              f"({method1_pixels / original_pixels * 100:.1f}%)")
        print(f"  Method 2 (component): {method2_pixels:6d} pixels "
              f"({method2_pixels / original_pixels * 100:.1f}%)")
    else:
        print(f"  Method 1 (stroke):    {method1_pixels:6d} pixels")
        print(f"  Method 2 (component): {method2_pixels:6d} pixels")

    print("\n" + "=" * 80)
    print("Test completed!")
    print(f"Results saved to: {OUTPUT_DIR}")
    print("=" * 80)
    print("\nNext steps:")
    print("  1. Review the output images")
    print("  2. Check which method better preserves handwriting")
    print("  3. Adjust thresholds if needed")
    print("  4. Choose the best method for production pipeline")


if __name__ == "__main__":
    main()