#!/usr/bin/env python3
"""
YOLO-based signature extraction from PDF documents.
Uses a trained YOLOv11n model to detect and extract handwritten signatures.

Pipeline: PDF → Render to Image → YOLO Detection → Crop Signatures → Output
"""

import csv
import json
import os
import random
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional

import cv2
import fitz  # PyMuPDF
import numpy as np
from ultralytics import YOLO

# Configuration
CSV_PATH = "/Volumes/NV2/PDF-Processing/master_signatures.csv"
PDF_BASE_PATH = "/Volumes/NV2/PDF-Processing/total-pdf"
OUTPUT_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output/yolo"
OUTPUT_PATH_NO_STAMP = "/Volumes/NV2/PDF-Processing/signature-image-output/yolo_no_stamp"
MODEL_PATH = "/Volumes/NV2/pdf_recognize/models/best.pt"

# Detection parameters
DPI = 300
CONFIDENCE_THRESHOLD = 0.5


def remove_red_stamp(image: np.ndarray) -> np.ndarray:
    """
    Remove red stamp pixels from an image by replacing them with white.

    Uses HSV color space to detect red regions (stamps are typically
    red/orange). The input image is not modified; a copy is returned.

    Args:
        image: RGB image as numpy array

    Returns:
        Image with red stamp pixels replaced by white
    """
    # Convert to HSV
    hsv = cv2.cvtColor(image, cv2.COLOR_RGB2HSV)

    # Red color wraps around in HSV, so we need two ranges
    # Range 1: H = 0-10 (red-orange)
    lower_red1 = np.array([0, 50, 50])
    upper_red1 = np.array([10, 255, 255])
    # Range 2: H = 160-180 (red-magenta)
    lower_red2 = np.array([160, 50, 50])
    upper_red2 = np.array([180, 255, 255])

    # Create masks for red regions and combine them
    mask1 = cv2.inRange(hsv, lower_red1, upper_red1)
    mask2 = cv2.inRange(hsv, lower_red2, upper_red2)
    red_mask = cv2.bitwise_or(mask1, mask2)

    # Dilate the mask slightly so anti-aliased stamp edges are caught too
    kernel = np.ones((3, 3), np.uint8)
    red_mask = cv2.dilate(red_mask, kernel, iterations=1)

    # Replace red pixels with white on a copy (the input may be a read-only view)
    result = image.copy()
    result[red_mask > 0] = [255, 255, 255]
    return result


class YOLOSignatureExtractor:
    """Extract signatures from PDF pages using YOLO object detection."""

    def __init__(self, model_path: str = MODEL_PATH,
                 conf_threshold: float = CONFIDENCE_THRESHOLD):
        """
        Initialize the extractor with a trained YOLO model.

        Args:
            model_path: Path to the YOLO model weights
            conf_threshold: Minimum confidence threshold for detections
        """
        print(f"Loading YOLO model from {model_path}...")
        self.model = YOLO(model_path)
        self.conf_threshold = conf_threshold
        self.dpi = DPI
        print(f"Model loaded. Confidence threshold: {conf_threshold}")

    def render_pdf_page(self, pdf_path: str, page_num: int) -> Optional[np.ndarray]:
        """
        Render a PDF page to an image array.

        Args:
            pdf_path: Path to the PDF file
            page_num: Page number (1-indexed)

        Returns:
            RGB image as numpy array, or None if failed. The array is built
            with np.frombuffer and is therefore read-only; copy it before
            modifying in place.
        """
        try:
            # The context manager closes the document on every exit path,
            # including exceptions raised while rendering.
            with fitz.open(pdf_path) as doc:
                page_count = len(doc)
                if page_num < 1 or page_num > page_count:
                    print(f" Invalid page number: {page_num} (PDF has {page_count} pages)")
                    return None

                page = doc[page_num - 1]
                # PDF user space is 72 units per inch; scale to the target DPI
                mat = fitz.Matrix(self.dpi / 72, self.dpi / 72)
                pix = page.get_pixmap(matrix=mat, alpha=False)
                image = np.frombuffer(pix.samples, dtype=np.uint8)
                image = image.reshape(pix.height, pix.width, pix.n)
            return image
        except Exception as e:
            # Best-effort rendering: report and let the caller count the failure
            print(f" Error rendering PDF: {e}")
            return None

    def detect_signatures(self, image: np.ndarray) -> list[dict]:
        """
        Detect signature regions in an image using YOLO.

        Args:
            image: RGB image as numpy array

        Returns:
            List of detected signatures, each a dict with 'box' (x, y, w, h),
            'xyxy' (x1, y1, x2, y2) and 'confidence', sorted top to bottom
            and then left to right
        """
        results = self.model(image, conf=self.conf_threshold, verbose=False)

        signatures = []
        for r in results:
            for box in r.boxes:
                x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().numpy())
                conf = float(box.conf[0].cpu().numpy())
                signatures.append({
                    'box': (x1, y1, x2 - x1, y2 - y1),  # x, y, w, h format
                    'xyxy': (x1, y1, x2, y2),
                    'confidence': conf
                })

        # Sort by y-coordinate (top to bottom), then x-coordinate (left to right)
        signatures.sort(key=lambda s: (s['box'][1], s['box'][0]))
        return signatures

    def extract_signature_images(self, image: np.ndarray,
                                 signatures: list[dict]) -> list[np.ndarray]:
        """
        Crop signature regions from the image.

        Boxes are clipped to the image bounds. A box lying entirely outside
        the image yields an empty (zero-area) crop so the returned list stays
        index-aligned with `signatures`.

        Args:
            image: RGB image as numpy array
            signatures: List of detected signatures (each with a 'box' entry
                in x, y, w, h format)

        Returns:
            List of cropped signature images (views into `image`)
        """
        img_h, img_w = image.shape[:2]
        cropped = []
        for sig in signatures:
            x, y, w, h = sig['box']
            # Compute the far edges from the ORIGINAL origin before clamping,
            # otherwise a negative x/y would shift the right/bottom edge.
            x1 = max(0, x)
            y1 = max(0, y)
            # max(x1, ...) keeps the stop index non-negative so a slice can
            # never wrap around via Python's negative indexing.
            x2 = max(x1, min(img_w, x + w))
            y2 = max(y1, min(img_h, y + h))
            cropped.append(image[y1:y2, x1:x2])
        return cropped

    def create_visualization(self, image: np.ndarray,
                             signatures: list[dict]) -> np.ndarray:
        """
        Create a visualization with detection boxes drawn on the image.

        Args:
            image: RGB image as numpy array
            signatures: List of detected signatures (each with 'xyxy' and
                'confidence' entries)

        Returns:
            Copy of the image with drawn bounding boxes and labels
        """
        vis = image.copy()
        for i, sig in enumerate(signatures):
            x1, y1, x2, y2 = sig['xyxy']
            conf = sig['confidence']

            # Draw box
            cv2.rectangle(vis, (x1, y1), (x2, y2), (255, 0, 0), 3)

            # Draw label on a filled background just above the box
            label = f"sig{i+1}: {conf:.2f}"
            font_scale = 0.8
            thickness = 2
            (text_w, text_h), _ = cv2.getTextSize(
                label, cv2.FONT_HERSHEY_SIMPLEX, font_scale, thickness)
            cv2.rectangle(vis, (x1, y1 - text_h - 10),
                          (x1 + text_w + 5, y1), (255, 0, 0), -1)
            cv2.putText(vis, label, (x1 + 2, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX,
                        font_scale, (255, 255, 255), thickness)
        return vis


def find_pdf_file(filename: str) -> Optional[str]:
    """
    Search for PDF file in batch directories.

    Directories matching "batch_*" under PDF_BASE_PATH are checked in sorted
    order and the first match wins.

    Args:
        filename: PDF filename to search for

    Returns:
        Full path if found, None otherwise
    """
    for batch_dir in sorted(Path(PDF_BASE_PATH).glob("batch_*")):
        pdf_path = batch_dir / filename
        if pdf_path.exists():
            return str(pdf_path)
    return None


def load_csv_samples(csv_path: str, sample_size: int = 50, seed: int = 42) -> list[dict]:
    """
    Load random samples from the CSV file.

    Args:
        csv_path: Path to master_signatures.csv
        sample_size: Number of samples to load
        seed: Random seed for reproducibility

    Returns:
        List of row dictionaries (at most `sample_size`, fewer if the CSV
        has fewer rows)
    """
    # newline="" is what the csv module expects so that quoted fields with
    # embedded newlines are parsed correctly.
    with open(csv_path, 'r', newline='', encoding='utf-8') as f:
        all_rows = list(csv.DictReader(f))

    # A private generator yields the same sample as random.seed(seed) followed
    # by random.sample(...), without reseeding the process-wide RNG.
    rng = random.Random(seed)
    return rng.sample(all_rows, min(sample_size, len(all_rows)))


def _save_rgb_image(path: str, image: np.ndarray) -> bool:
    """Write an RGB image to `path` via OpenCV; warn and return False on failure."""
    ok = cv2.imwrite(path, cv2.cvtColor(image, cv2.COLOR_RGB2BGR))
    if not ok:
        # cv2.imwrite reports failure through its return value, not an exception
        print(f"  Warning: failed to write image: {path}")
    return ok


def _result_key(files: dict, filename: str, page_num: int) -> str:
    """Return `filename`, or a page-qualified key if `filename` is already recorded."""
    if filename not in files:
        return filename
    return f"{filename}#page{page_num}"


def process_samples(extractor: YOLOSignatureExtractor, samples: list[dict],
                    output_dir: str, output_dir_no_stamp: Optional[str] = None,
                    save_visualization: bool = True) -> dict:
    """
    Process a list of PDF samples and extract signatures.

    Args:
        extractor: YOLOSignatureExtractor instance
        samples: List of sample dictionaries from CSV (each needs 'filename'
            and 'page' entries)
        output_dir: Output directory for signatures
        output_dir_no_stamp: Output directory for stamp-removed signatures (optional)
        save_visualization: Whether to save visualization images

    Returns:
        Results dictionary with statistics and per-file results. Entries in
        results['files'] are keyed by filename; if the same filename is
        sampled more than once, later entries are keyed
        "<filename>#page<N>" instead of overwriting the first.
    """
    os.makedirs(output_dir, exist_ok=True)
    if save_visualization:
        os.makedirs(os.path.join(output_dir, "visualization"), exist_ok=True)

    # Create no-stamp output directory if specified
    if output_dir_no_stamp:
        os.makedirs(output_dir_no_stamp, exist_ok=True)

    results = {
        'timestamp': datetime.now().isoformat(),
        'total_samples': len(samples),
        'processed': 0,
        'pdf_not_found': 0,
        'render_failed': 0,
        'total_signatures': 0,
        'files': {}
    }

    for i, row in enumerate(samples):
        filename = row['filename']
        page_num = int(row['page'])
        base_name = Path(filename).stem
        file_key = _result_key(results['files'], filename, page_num)

        print(f"[{i+1}/{len(samples)}] Processing: {filename}, page {page_num}...",
              end=' ', flush=True)

        # Find PDF
        pdf_path = find_pdf_file(filename)
        if pdf_path is None:
            print("PDF NOT FOUND")
            results['pdf_not_found'] += 1
            results['files'][file_key] = {'status': 'pdf_not_found'}
            continue

        # Render page
        image = extractor.render_pdf_page(pdf_path, page_num)
        if image is None:
            print("RENDER FAILED")
            results['render_failed'] += 1
            results['files'][file_key] = {'status': 'render_failed'}
            continue

        # Detect signatures
        signatures = extractor.detect_signatures(image)
        num_sigs = len(signatures)
        results['total_signatures'] += num_sigs
        results['processed'] += 1
        print(f"Found {num_sigs} signature(s)")

        # Extract and save signature crops
        crops = extractor.extract_signature_images(image, signatures)
        for j, crop in enumerate(crops):
            if crop.size == 0:
                # OpenCV cannot encode a zero-area image; skip instead of
                # aborting the whole run. Numbering stays aligned with the
                # 'signatures' list stored in the results.
                print(f"  Warning: empty crop for signature {j+1}, skipped")
                continue

            crop_filename = f"{base_name}_page{page_num}_sig{j+1}.png"
            _save_rgb_image(os.path.join(output_dir, crop_filename), crop)

            # Save stamp-removed version if output dir specified
            if output_dir_no_stamp:
                crop_no_stamp = remove_red_stamp(crop)
                _save_rgb_image(
                    os.path.join(output_dir_no_stamp, crop_filename),
                    crop_no_stamp)

        # Save visualization (only for pages with at least one detection)
        if save_visualization and signatures:
            vis_image = extractor.create_visualization(image, signatures)
            vis_filename = f"{base_name}_page{page_num}_annotated.png"
            vis_path = os.path.join(output_dir, "visualization", vis_filename)
            _save_rgb_image(vis_path, vis_image)

        # Store file results
        results['files'][file_key] = {
            'status': 'success',
            'page': page_num,
            'signatures': [
                {
                    'box': list(sig['box']),
                    'confidence': sig['confidence']
                }
                for sig in signatures
            ]
        }

    return results


def print_summary(results: dict):
    """Print processing summary."""
    print("\n" + "=" * 60)
    print("YOLO SIGNATURE EXTRACTION SUMMARY")
    print("=" * 60)
    print(f"Total samples: {results['total_samples']}")
    print(f"Successfully processed: {results['processed']}")
    print(f"PDFs not found: {results['pdf_not_found']}")
    print(f"Render failed: {results['render_failed']}")
    print(f"Total signatures found: {results['total_signatures']}")
    if results['processed'] > 0:
        avg_sigs = results['total_signatures'] / results['processed']
        print(f"Average signatures/page: {avg_sigs:.2f}")
    print("=" * 60)


def main():
    """Main entry point for signature extraction."""
    print("=" * 60)
    print("YOLO Signature Extraction Pipeline")
    print("=" * 60)
    print(f"Model: {MODEL_PATH}")
    print(f"CSV: {CSV_PATH}")
    print(f"Output (original): {OUTPUT_PATH}")
    print(f"Output (no stamp): {OUTPUT_PATH_NO_STAMP}")
    print(f"Confidence threshold: {CONFIDENCE_THRESHOLD}")
    print("=" * 60 + "\n")

    # Initialize extractor
    extractor = YOLOSignatureExtractor(MODEL_PATH, CONFIDENCE_THRESHOLD)

    # Load samples
    print("\nLoading samples from CSV...")
    samples = load_csv_samples(CSV_PATH, sample_size=50, seed=42)
    print(f"Loaded {len(samples)} samples\n")

    # Process samples (with stamp removal)
    results = process_samples(
        extractor,
        samples,
        OUTPUT_PATH,
        output_dir_no_stamp=OUTPUT_PATH_NO_STAMP,
        save_visualization=True
    )

    # Save results JSON
    results_path = os.path.join(OUTPUT_PATH, "results.json")
    with open(results_path, 'w') as f:
        json.dump(results, f, indent=2)
    print(f"\nResults saved to: {results_path}")

    # Print summary
    print_summary(results)
    print(f"\nStamp-removed signatures saved to: {OUTPUT_PATH_NO_STAMP}")


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\n\nProcess interrupted by user.")
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report the failure with a traceback and exit non-zero
        print(f"\n\nFATAL ERROR: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)