# pdf_signature_extraction/extract_handwriting.py

#!/usr/bin/env python3
"""
Script to detect and extract handwritten regions from PDF pages.
Uses computer vision to identify handwriting, not PDF image objects.
"""

import cv2
import numpy as np
import os
import sys
from pathlib import Path
from datetime import datetime
import fitz  # PyMuPDF
import csv

# Configuration
# Directory scanned (non-recursively) by main() for *.pdf input files.
PDF_INPUT_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output"
# Directory that receives the cropped PNG regions and the CSV log; main() creates it if missing.
OUTPUT_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output/handwriting"
LOG_FILE = None  # Path of the CSV log; assigned in main() with a timestamped file name

# Image processing parameters
DPI = 300  # Resolution main() passes along for rendering each PDF page
MIN_CONTOUR_AREA = 100  # A contour is kept only if its area (in pixels) is strictly greater than this
MAX_CONTOUR_AREA = 500000  # A contour is kept only if its area is strictly less than this (filters out large background elements)


def render_pdf_page_as_image(pdf_path, dpi=300):
    """
    Render the first page of a PDF as a high-resolution image.

    Args:
        pdf_path: Path of the PDF file to open.
        dpi: Rendering resolution. The page is scaled by dpi / 72
            (72 DPI is the default).

    Returns:
        (image, error) tuple. On success ``image`` is a numpy array in
        OpenCV (BGR) format and ``error`` is None. On any failure ``image``
        is None and ``error`` is the exception message as a string; the
        function never raises.
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            page = doc[0]  # Get first page (our extracted pages only have 1 page)

            # Render at high DPI for better detection
            zoom = dpi / 72
            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)

            # Convert the raw sample buffer to a (height, width, channels) array
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
                pix.height, pix.width, pix.n
            )

            # Convert to BGR for OpenCV
            if pix.n == 3:  # RGB
                img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
            elif pix.n == 1:  # Grayscale
                img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
        finally:
            # Close the document even when rendering fails, so the file
            # handle is not leaked across a long batch run.
            doc.close()

        return img, None

    except Exception as e:
        return None, str(e)


def detect_handwriting_regions(image):
    """
    Locate candidate handwriting regions in a BGR page image.

    Pipeline: grayscale -> inverted Otsu threshold -> dilation -> external
    contours -> filtering by contour area, box size and aspect ratio.

    Returns: list of bounding boxes [(x, y, w, h), ...]
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Otsu chooses the threshold automatically; the inverted mode turns
    # dark ink into white foreground pixels.
    threshold_mode = cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    _otsu_level, ink_mask = cv2.threshold(grayscale, 0, 255, threshold_mode)

    # Dilate with a wide rectangular kernel so that neighbouring pen strokes
    # fuse into a single connected blob.
    stroke_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 5))
    blobs = cv2.dilate(ink_mask, stroke_kernel, iterations=2)

    # Only the outermost outline of each blob is needed
    outlines, _hierarchy = cv2.findContours(
        blobs, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    candidates = []
    for outline in outlines:
        # Drop noise specks and very large regions
        if not (MIN_CONTOUR_AREA < cv2.contourArea(outline) < MAX_CONTOUR_AREA):
            continue

        x, y, w, h = cv2.boundingRect(outline)

        # Both sides of the box must exceed 20 pixels
        if w <= 20 or h <= 20:
            continue

        # Reject extremely thin or extremely flat boxes
        if 0.1 < w / float(h) < 20:
            candidates.append((x, y, w, h))

    return candidates


def merge_overlapping_boxes(boxes, merge_threshold=50):
    """
    Greedily combine neighbouring (x, y, w, h) boxes in a single left-to-right pass.

    Boxes are visited in order of their left edge. A box is absorbed into the
    running box when its left edge lies no more than ``merge_threshold``
    pixels beyond the running box's right edge AND the two top edges differ
    by less than ``2 * merge_threshold``. Otherwise the running box is
    emitted and the new box becomes the running box.

    Returns: list of merged boxes as tuples; [] for empty input.
    """
    if not boxes:
        return []

    ordered = sorted(boxes, key=lambda b: b[0])
    vertical_tolerance = merge_threshold * 2

    result = []
    active = tuple(ordered[0])

    for candidate in ordered[1:]:
        bx, by, bw, bh = candidate
        left, top, width, height = active

        near_horizontally = bx <= left + width + merge_threshold
        near_vertically = abs(by - top) < vertical_tolerance

        if not (near_horizontally and near_vertically):
            # Too far away: finish the running box and start a new one
            result.append(active)
            active = tuple(candidate)
            continue

        # Grow the running box to the union of both rectangles
        union_left = min(left, bx)
        union_top = min(top, by)
        union_right = max(left + width, bx + bw)
        union_bottom = max(top + height, by + bh)
        active = (
            union_left,
            union_top,
            union_right - union_left,
            union_bottom - union_top,
        )

    result.append(active)
    return result


def extract_handwriting_regions(pdf_path, output_dir, dpi=300):
    """
    Extract handwritten regions from a PDF page and save each one as a PNG.

    The page is rendered, candidate boxes are detected and merged, and every
    merged box is cropped (with a small padding, clipped to the page) and
    written to ``output_dir`` as ``<pdf stem>_handwriting_NN.png``.

    Args:
        pdf_path: Path of the PDF to process.
        output_dir: Directory the PNG crops are written into.
        dpi: Rendering resolution passed to the page renderer.

    Returns: (success_count, total_regions, region_info, error)
        success_count: number of regions actually written to disk.
        total_regions: number of regions found after merging.
        region_info: one dict per written region with the keys
            'filename', 'bbox' (x, y, w, h of the padded crop) and 'area'.
        error: None on success (including "no handwriting detected");
            otherwise a message string. Exceptions are converted into
            this string rather than raised.
    """
    padding = 10  # Pixels of context kept around each detected box

    try:
        # Render PDF as image
        image, error = render_pdf_page_as_image(pdf_path, dpi)
        if error:
            return 0, 0, [], f"Rendering error: {error}"

        if image is None:
            return 0, 0, [], "Failed to render PDF"

        # Detect handwriting regions
        boxes = detect_handwriting_regions(image)

        if not boxes:
            return 0, 0, [], None  # No handwriting detected, not an error

        # Merge overlapping/nearby boxes
        merged_boxes = merge_overlapping_boxes(boxes)
        total_regions = len(merged_boxes)

        page_height, page_width = image.shape[:2]
        pdf_name = Path(pdf_path).stem
        region_info = []

        for idx, (x, y, w, h) in enumerate(merged_boxes):
            # Pad each side independently and clip to the page. Computing the
            # far edge from the unclipped box keeps the padding at `padding`
            # pixels even when the near edge is clipped at 0.
            x_pad = max(0, x - padding)
            y_pad = max(0, y - padding)
            x_end = min(page_width, x + w + padding)
            y_end = min(page_height, y + h + padding)
            w_pad = x_end - x_pad
            h_pad = y_end - y_pad

            # Extract region
            region = image[y_pad:y_end, x_pad:x_end]

            # Save region; cv2.imwrite reports failure through its return
            # value, so only successful writes are recorded.
            output_filename = f"{pdf_name}_handwriting_{idx + 1:02d}.png"
            output_path = os.path.join(output_dir, output_filename)
            if region.size == 0 or not cv2.imwrite(output_path, region):
                continue

            region_info.append({
                'filename': output_filename,
                'bbox': (x_pad, y_pad, w_pad, h_pad),
                'area': w_pad * h_pad
            })

        if not region_info:
            # Regions were detected but nothing reached the disk: report it
            # instead of letting the caller mistake it for "no handwriting".
            return 0, total_regions, [], (
                f"Failed to save any of the {total_regions} detected "
                f"region(s) to {output_dir}"
            )

        return len(region_info), total_regions, region_info, None

    except Exception as e:
        return 0, 0, [], str(e)


def main():
    """
    Batch-process every PDF in PDF_INPUT_PATH and extract handwriting regions.

    Creates OUTPUT_PATH, writes a timestamped CSV log there (one row per
    PDF), prints per-file progress and a final summary, and lists up to five
    of the images written during this run. Sets the module-level LOG_FILE to
    the log path. Returns None; returns early, without creating a log, when
    no PDF files are found.
    """
    global LOG_FILE

    print("Starting handwriting extraction from PDFs...")
    print(f"Input path: {PDF_INPUT_PATH}")
    print(f"Output path: {OUTPUT_PATH}")
    print(f"DPI: {DPI}")
    print()

    # Create output directory
    os.makedirs(OUTPUT_PATH, exist_ok=True)

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    LOG_FILE = os.path.join(OUTPUT_PATH, f"handwriting_extraction_log_{timestamp}.csv")

    # Get PDF files
    pdf_files = sorted(Path(PDF_INPUT_PATH).glob("*.pdf"))

    if not pdf_files:
        print("ERROR: No PDF files found!")
        return

    print(f"Found {len(pdf_files)} PDF files to process\n")

    # Statistics
    stats = {
        'total_pdfs': 0,
        'pdfs_with_handwriting': 0,
        'pdfs_without_handwriting': 0,
        'total_regions': 0,
        'errors': 0
    }

    # File names reported as written during this run; used for the summary so
    # that leftovers from earlier runs are not counted.
    written_files = []

    # Open log file. The encoding is explicit so that non-ASCII PDF names do
    # not depend on the platform's default locale.
    with open(LOG_FILE, 'w', newline='', encoding='utf-8') as log_file:
        log_writer = csv.writer(log_file)
        log_writer.writerow([
            'pdf_filename', 'regions_detected', 'regions_extracted',
            'extracted_filenames', 'error'
        ])

        # Process each PDF
        for position, pdf_path in enumerate(pdf_files, start=1):
            stats['total_pdfs'] += 1
            pdf_filename = pdf_path.name

            # flush so the progress line shows up before the (slow) extraction
            print(f"[{position}/{len(pdf_files)}] Processing: {pdf_filename}... ", end='', flush=True)

            # Extract handwriting
            extracted_count, total_count, region_info, error = extract_handwriting_regions(
                str(pdf_path), OUTPUT_PATH, DPI
            )

            if error:
                print(f"ERROR: {error}")
                stats['errors'] += 1
                log_writer.writerow([pdf_filename, 0, 0, "", error])
                continue

            if extracted_count > 0:
                stats['pdfs_with_handwriting'] += 1
                stats['total_regions'] += extracted_count
                print(f"FOUND {extracted_count} regions")

                filenames = [r['filename'] for r in region_info]
                written_files.extend(filenames)
                log_writer.writerow([
                    pdf_filename,
                    total_count,
                    extracted_count,
                    ", ".join(filenames),
                    ""
                ])
            else:
                stats['pdfs_without_handwriting'] += 1
                print("No handwriting detected")
                log_writer.writerow([pdf_filename, 0, 0, "", ""])

    # Print summary
    print("\n" + "="*60)
    print("HANDWRITING EXTRACTION SUMMARY")
    print("="*60)
    print(f"Total PDFs processed:        {stats['total_pdfs']}")
    print(f"PDFs with handwriting:       {stats['pdfs_with_handwriting']}")
    print(f"PDFs without handwriting:    {stats['pdfs_without_handwriting']}")
    print(f"Total regions extracted:     {stats['total_regions']}")
    print(f"Errors:                      {stats['errors']}")
    print(f"\nLog file: {LOG_FILE}")
    print("="*60)

    # Show examples, restricted to files from this run that exist on disk
    if stats['total_regions'] > 0:
        output_files = sorted(
            path
            for path in (Path(OUTPUT_PATH) / name for name in written_files)
            if path.is_file()
        )
        print(f"\nExtracted {len(output_files)} handwriting images")
        print("Example files:")
        for img in output_files[:5]:
            size_kb = img.stat().st_size / 1024
            print(f"  - {img.name} ({size_kb:.1f} KB)")


# Script entry point: run the batch and turn any failure into exit status 1.
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl-C is reported briefly, without a traceback
        print("\n\nProcess interrupted by user.")
        raise SystemExit(1)
    except Exception as exc:
        # Anything else is unexpected: show the message and the full traceback
        print(f"\n\nFATAL ERROR: {exc}")
        import traceback
        traceback.print_exc()
        raise SystemExit(1)