#!/usr/bin/env python3
"""
Script to detect and extract handwritten regions from PDF pages.
Uses computer vision to identify handwriting, not PDF image objects.
"""

import cv2
import numpy as np
import os
import sys
from pathlib import Path
from datetime import datetime
import fitz  # PyMuPDF
import csv

# Configuration
PDF_INPUT_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output"
OUTPUT_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output/handwriting"
LOG_FILE = None  # Will be set in main()

# Image processing parameters
DPI = 300  # Resolution for rendering PDF pages
MIN_CONTOUR_AREA = 100  # Minimum area for a handwriting region (in pixels)
MAX_CONTOUR_AREA = 500000  # Maximum area (to filter out large background elements)


def render_pdf_page_as_image(pdf_path, dpi=300):
    """
    Render a PDF page as a high-resolution image.
    Returns: (image as numpy array in OpenCV BGR format or None, error string or None)
    """
    try:
        doc = fitz.open(pdf_path)
        page = doc[0]  # Get first page (our extracted pages only have 1 page)

        # Render at high DPI for better detection
        mat = fitz.Matrix(dpi / 72, dpi / 72)  # 72 DPI is the PDF default
        pix = page.get_pixmap(matrix=mat, alpha=False)

        # Convert to numpy array
        img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)

        # Convert RGB to BGR for OpenCV
        if pix.n == 3:  # RGB
            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
        elif pix.n == 1:  # Grayscale
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)

        doc.close()
        return img, None
    except Exception as e:
        return None, str(e)


def detect_handwriting_regions(image):
    """
    Detect handwritten regions in the image using computer vision.
    Returns: list of bounding boxes [(x, y, w, h), ...]
    """
    # Convert to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Apply binary threshold (Otsu's method for automatic threshold).
    # Invert so that dark ink becomes white (foreground).
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Morphological dilation to connect nearby strokes.
    # This helps group individual pen strokes into signature regions.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 5))
    dilated = cv2.dilate(binary, kernel, iterations=2)

    # Find contours (connected regions)
    contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Filter contours based on area
    bounding_boxes = []
    for contour in contours:
        area = cv2.contourArea(contour)

        # Filter by area (remove noise and very large regions)
        if MIN_CONTOUR_AREA < area < MAX_CONTOUR_AREA:
            x, y, w, h = cv2.boundingRect(contour)

            # Additional filters:
            # 1. Aspect ratio check (signatures are usually wider than tall, but not extreme)
            aspect_ratio = w / float(h) if h > 0 else 0

            # 2. Size check (not too small, not too large)
            if 0.1 < aspect_ratio < 20 and w > 20 and h > 20:
                bounding_boxes.append((x, y, w, h))

    return bounding_boxes


def merge_overlapping_boxes(boxes, merge_threshold=50):
    """
    Merge bounding boxes that are close to each other.
    This helps combine signature parts that were detected separately.
""" if not boxes: return [] # Sort boxes by x-coordinate boxes = sorted(boxes, key=lambda b: b[0]) merged = [] current = list(boxes[0]) # [x, y, w, h] for box in boxes[1:]: x, y, w, h = box cx, cy, cw, ch = current # Check if boxes are close enough to merge # Close in x direction and overlapping or close in y direction if (x <= cx + cw + merge_threshold and abs(y - cy) < merge_threshold * 2): # Merge boxes new_x = min(cx, x) new_y = min(cy, y) new_w = max(cx + cw, x + w) - new_x new_h = max(cy + ch, y + h) - new_y current = [new_x, new_y, new_w, new_h] else: merged.append(tuple(current)) current = list(box) merged.append(tuple(current)) return merged def extract_handwriting_regions(pdf_path, output_dir, dpi=300): """ Extract handwritten regions from a PDF page. Returns: (success_count, total_regions, region_info, error) """ try: # Render PDF as image image, error = render_pdf_page_as_image(pdf_path, dpi) if error: return 0, 0, [], f"Rendering error: {error}" if image is None: return 0, 0, [], "Failed to render PDF" # Detect handwriting regions boxes = detect_handwriting_regions(image) if not boxes: return 0, 0, [], None # No handwriting detected, not an error # Merge overlapping/nearby boxes merged_boxes = merge_overlapping_boxes(boxes) # Extract and save regions pdf_name = Path(pdf_path).stem region_info = [] for idx, (x, y, w, h) in enumerate(merged_boxes): # Add padding around the region padding = 10 x_pad = max(0, x - padding) y_pad = max(0, y - padding) w_pad = min(image.shape[1] - x_pad, w + 2 * padding) h_pad = min(image.shape[0] - y_pad, h + 2 * padding) # Extract region region = image[y_pad:y_pad + h_pad, x_pad:x_pad + w_pad] # Save region output_filename = f"{pdf_name}_handwriting_{idx + 1:02d}.png" output_path = os.path.join(output_dir, output_filename) cv2.imwrite(output_path, region) region_info.append({ 'filename': output_filename, 'bbox': (x_pad, y_pad, w_pad, h_pad), 'area': w_pad * h_pad }) return len(merged_boxes), len(merged_boxes), region_info, None except Exception as e: return 0, 0, [], str(e) def main(): """Main processing function""" global LOG_FILE print(f"Starting handwriting extraction from PDFs...") print(f"Input path: {PDF_INPUT_PATH}") print(f"Output path: {OUTPUT_PATH}") print(f"DPI: {DPI}") print() # Create output directory os.makedirs(OUTPUT_PATH, exist_ok=True) LOG_FILE = os.path.join(OUTPUT_PATH, f"handwriting_extraction_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv") # Get PDF files pdf_files = sorted(Path(PDF_INPUT_PATH).glob("*.pdf")) if not pdf_files: print("ERROR: No PDF files found!") return print(f"Found {len(pdf_files)} PDF files to process\n") # Statistics stats = { 'total_pdfs': 0, 'pdfs_with_handwriting': 0, 'pdfs_without_handwriting': 0, 'total_regions': 0, 'errors': 0 } # Open log file with open(LOG_FILE, 'w', newline='') as log_file: log_writer = csv.writer(log_file) log_writer.writerow([ 'pdf_filename', 'regions_detected', 'regions_extracted', 'extracted_filenames', 'error' ]) # Process each PDF for i, pdf_path in enumerate(pdf_files): stats['total_pdfs'] += 1 pdf_filename = pdf_path.name print(f"[{i+1}/{len(pdf_files)}] Processing: {pdf_filename}... 
", end='', flush=True) # Extract handwriting extracted_count, total_count, region_info, error = extract_handwriting_regions( str(pdf_path), OUTPUT_PATH, DPI ) if error: print(f"ERROR: {error}") stats['errors'] += 1 log_writer.writerow([pdf_filename, 0, 0, "", error]) continue if extracted_count > 0: stats['pdfs_with_handwriting'] += 1 stats['total_regions'] += extracted_count print(f"FOUND {extracted_count} regions") filenames = [r['filename'] for r in region_info] log_writer.writerow([ pdf_filename, total_count, extracted_count, ", ".join(filenames), "" ]) else: stats['pdfs_without_handwriting'] += 1 print("No handwriting detected") log_writer.writerow([pdf_filename, 0, 0, "", ""]) # Print summary print("\n" + "="*60) print("HANDWRITING EXTRACTION SUMMARY") print("="*60) print(f"Total PDFs processed: {stats['total_pdfs']}") print(f"PDFs with handwriting: {stats['pdfs_with_handwriting']}") print(f"PDFs without handwriting: {stats['pdfs_without_handwriting']}") print(f"Total regions extracted: {stats['total_regions']}") print(f"Errors: {stats['errors']}") print(f"\nLog file: {LOG_FILE}") print("="*60) # Show examples if stats['total_regions'] > 0: output_files = sorted(Path(OUTPUT_PATH).glob("*_handwriting_*.png")) print(f"\nExtracted {len(output_files)} handwriting images") print("Example files:") for img in output_files[:5]: size_kb = img.stat().st_size / 1024 print(f" - {img.name} ({size_kb:.1f} KB)") if __name__ == "__main__": try: main() except KeyboardInterrupt: print("\n\nProcess interrupted by user.") sys.exit(1) except Exception as e: print(f"\n\nFATAL ERROR: {e}") import traceback traceback.print_exc() sys.exit(1)