# pdf_signature_extraction/extract_signatures_yolo.py

#!/usr/bin/env python3
"""
YOLO-based signature extraction from PDF documents.
Uses a trained YOLOv11n model to detect and extract handwritten signatures.

Pipeline:
    PDF → Render to Image → YOLO Detection → Crop Signatures → Output
"""

import csv
import json
import os
import random
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional

import cv2
import fitz  # PyMuPDF
import numpy as np
from ultralytics import YOLO


# Configuration
# All locations are absolute, machine-specific paths; edit them to match the
# host the script runs on.
# CSV read by load_csv_samples(); process_samples() reads 'filename' and 'page' from each row.
CSV_PATH = "/Volumes/NV2/PDF-Processing/master_signatures.csv"
# Root directory whose "batch_*" subdirectories are searched by find_pdf_file().
PDF_BASE_PATH = "/Volumes/NV2/PDF-Processing/total-pdf"
# Destination for signature crops, the "visualization" subfolder and results.json.
OUTPUT_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output/yolo"
# Destination for the crops after remove_red_stamp() has been applied.
OUTPUT_PATH_NO_STAMP = "/Volumes/NV2/PDF-Processing/signature-image-output/yolo_no_stamp"
# Weights file handed to ultralytics.YOLO().
MODEL_PATH = "/Volumes/NV2/pdf_recognize/models/best.pt"

# Detection parameters
# Render resolution; pages are scaled by DPI / 72 before detection.
DPI = 300
# Passed to the model as `conf`: the minimum confidence for a detection.
CONFIDENCE_THRESHOLD = 0.5


def remove_red_stamp(image: np.ndarray) -> np.ndarray:
    """
    Remove red stamp pixels from an image by replacing them with white.

    Uses HSV color space to detect red regions (stamps are typically red/orange).
    The input is never modified; a new array is always returned.

    Args:
        image: RGB image as numpy array

    Returns:
        Image with red stamp pixels replaced by white. An empty input
        (zero pixels) is returned as an empty copy without any processing.
    """
    # OpenCV's color conversion rejects empty arrays; a degenerate crop has
    # nothing to clean, so hand back a copy instead of raising.
    if image.size == 0:
        return image.copy()

    # Convert to HSV
    hsv = cv2.cvtColor(image, cv2.COLOR_RGB2HSV)

    # Red wraps around the hue axis, so two ranges are needed. For 8-bit
    # images OpenCV stores hue as 0-179, so an upper bound of 180 simply
    # means "up to the maximum hue". Saturation/value floors of 50 keep
    # grey and near-black pixels (ink, paper) out of the mask.
    # Range 1: H = 0-10 (red-orange)
    lower_red1 = np.array([0, 50, 50])
    upper_red1 = np.array([10, 255, 255])

    # Range 2: H = 160-180 (red-magenta)
    lower_red2 = np.array([160, 50, 50])
    upper_red2 = np.array([180, 255, 255])

    # Create masks for both red ranges and merge them
    red_mask = cv2.bitwise_or(
        cv2.inRange(hsv, lower_red1, upper_red1),
        cv2.inRange(hsv, lower_red2, upper_red2),
    )

    # Grow the mask by one pixel so anti-aliased stamp edges are caught too
    kernel = np.ones((3, 3), np.uint8)
    red_mask = cv2.dilate(red_mask, kernel, iterations=1)

    # Replace red pixels with white on a copy of the input
    result = image.copy()
    result[red_mask > 0] = [255, 255, 255]

    return result


class YOLOSignatureExtractor:
    """Extract signatures from PDF pages using YOLO object detection.

    Typical flow: ``render_pdf_page`` -> ``detect_signatures`` ->
    ``extract_signature_images`` (and optionally ``create_visualization``).
    """

    def __init__(self, model_path: str = MODEL_PATH, conf_threshold: float = CONFIDENCE_THRESHOLD,
                 dpi: int = DPI):
        """
        Initialize the extractor with a trained YOLO model.

        Args:
            model_path: Path to the YOLO model weights
            conf_threshold: Minimum confidence threshold for detections
            dpi: Resolution used when rendering PDF pages (defaults to the
                module-level DPI setting)
        """
        print(f"Loading YOLO model from {model_path}...")
        self.model = YOLO(model_path)
        self.conf_threshold = conf_threshold
        self.dpi = dpi
        print(f"Model loaded. Confidence threshold: {conf_threshold}")

    def render_pdf_page(self, pdf_path: str, page_num: int) -> Optional[np.ndarray]:
        """
        Render a PDF page to an image array.

        Any failure (unreadable file, bad page number, rendering error) is
        reported on stdout and signalled by returning None rather than raising.

        Args:
            pdf_path: Path to the PDF file
            page_num: Page number (1-indexed)

        Returns:
            RGB image as numpy array, or None if failed
        """
        try:
            # The context manager closes the document on every exit path,
            # including exceptions raised while rendering the page.
            with fitz.open(pdf_path) as doc:
                page_count = len(doc)
                if page_num < 1 or page_num > page_count:
                    print(f"  Invalid page number: {page_num} (PDF has {page_count} pages)")
                    return None

                page = doc[page_num - 1]  # PyMuPDF pages are 0-indexed
                # Scale relative to the 72-dpi base so the output is at self.dpi.
                scale = self.dpi / 72
                pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False)
                image = np.frombuffer(pix.samples, dtype=np.uint8)
                return image.reshape(pix.height, pix.width, pix.n)
        except Exception as e:
            # Deliberately broad: one bad PDF must not abort a whole batch.
            print(f"  Error rendering PDF: {e}")
            return None

    def detect_signatures(self, image: np.ndarray) -> list[dict]:
        """
        Detect signature regions in an image using YOLO.

        Args:
            image: RGB image as numpy array

        Returns:
            List of dicts, one per detection, sorted top-to-bottom then
            left-to-right. Each dict has:
                'box': (x, y, w, h) in pixels
                'xyxy': (x1, y1, x2, y2) in pixels
                'confidence': detection confidence as float
        """
        # NOTE(review): Ultralytics documents numpy inputs as BGR (OpenCV
        # channel order) while this pipeline passes RGB - confirm this matches
        # how the model was trained/validated before changing it.
        results = self.model(image, conf=self.conf_threshold, verbose=False)
        signatures = []

        for r in results:
            for box in r.boxes:
                # Box coordinates are truncated to whole pixels.
                x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().numpy())
                conf = float(box.conf[0].cpu().numpy())
                signatures.append({
                    'box': (x1, y1, x2 - x1, y2 - y1),  # x, y, w, h format
                    'xyxy': (x1, y1, x2, y2),
                    'confidence': conf
                })

        # Sort by y-coordinate (top to bottom), then x-coordinate (left to right)
        signatures.sort(key=lambda s: (s['box'][1], s['box'][0]))

        return signatures

    def extract_signature_images(self, image: np.ndarray, signatures: list[dict]) -> list[np.ndarray]:
        """
        Crop signature regions from the image.

        Boxes are clipped to the image; a box lying entirely outside the image
        (or with zero width/height) yields an empty array.

        Args:
            image: RGB image as numpy array
            signatures: List of detected signatures (each with an 'box'
                entry in x, y, w, h format)

        Returns:
            List of cropped signature images (views into ``image``), in the
            same order as ``signatures``
        """
        img_h, img_w = image.shape[:2]
        cropped = []
        for sig in signatures:
            x, y, w, h = sig['box']
            # Derive the far edges from the *unclamped* origin: clamping x/y
            # first would shift the right/bottom edge outwards for boxes that
            # start off-image and make the crop larger than the detection.
            right, bottom = x + w, y + h
            x1 = min(max(0, x), img_w)
            y1 = min(max(0, y), img_h)
            # Never let the far edge fall before the near edge; a negative
            # stop index would otherwise be interpreted relative to the end.
            x2 = max(x1, min(img_w, right))
            y2 = max(y1, min(img_h, bottom))
            cropped.append(image[y1:y2, x1:x2])
        return cropped

    def create_visualization(self, image: np.ndarray, signatures: list[dict]) -> np.ndarray:
        """
        Create a visualization with detection boxes drawn on the image.

        Args:
            image: RGB image as numpy array
            signatures: List of detected signatures (each with 'xyxy' and
                'confidence' entries)

        Returns:
            Copy of the image with red bounding boxes and "sigN: conf" labels
        """
        # Draw on a copy so the caller's image is left untouched.
        vis = image.copy()
        font = cv2.FONT_HERSHEY_SIMPLEX
        font_scale = 0.8
        thickness = 2
        box_color = (255, 0, 0)  # red, since the image is RGB

        for i, sig in enumerate(signatures):
            x1, y1, x2, y2 = sig['xyxy']
            conf = sig['confidence']

            # Draw box
            cv2.rectangle(vis, (x1, y1), (x2, y2), box_color, 3)

            # Draw label
            label = f"sig{i+1}: {conf:.2f}"
            (text_w, text_h), _ = cv2.getTextSize(label, font, font_scale, thickness)

            # The label normally sits directly above the box. When the box is
            # too close to the top edge, slide the label down so it stays
            # inside the image instead of being drawn off-canvas.
            label_top = max(0, y1 - text_h - 10)
            label_bottom = label_top + text_h + 10

            cv2.rectangle(vis, (x1, label_top), (x1 + text_w + 5, label_bottom), box_color, -1)
            cv2.putText(vis, label, (x1 + 2, label_bottom - 5), font,
                        font_scale, (255, 255, 255), thickness)

        return vis


def find_pdf_file(filename: str, base_path: Optional[str] = None) -> Optional[str]:
    """
    Search for PDF file in batch directories.

    Every ``batch_*`` entry directly under the base directory is checked in
    sorted (lexicographic) order and the first match wins.

    Args:
        filename: PDF filename to search for
        base_path: Directory containing the ``batch_*`` folders; defaults to
            the module-level PDF_BASE_PATH when omitted

    Returns:
        Full path if found, None otherwise
    """
    # Resolve the default at call time so the module constant is only needed
    # when the caller does not supply a directory.
    root = Path(PDF_BASE_PATH if base_path is None else base_path)
    for batch_dir in sorted(root.glob("batch_*")):
        candidate = batch_dir / filename
        # is_file() rather than exists(): a directory that happens to share
        # the name cannot be rendered as a PDF.
        if candidate.is_file():
            return str(candidate)
    return None


def load_csv_samples(csv_path: str, sample_size: int = 50, seed: int = 42) -> list[dict]:
    """
    Load random samples from the CSV file.

    The whole file is read, then rows are drawn without replacement. The same
    seed always yields the same selection for the same file contents.

    Args:
        csv_path: Path to master_signatures.csv
        sample_size: Number of samples to load (capped at the number of rows)
        seed: Random seed for reproducibility

    Returns:
        List of row dictionaries keyed by the CSV header names
    """
    # newline='' is what the csv module expects so quoted fields containing
    # line breaks are parsed correctly; 'utf-8-sig' also strips a leading BOM,
    # which would otherwise become part of the first header name.
    with open(csv_path, 'r', newline='', encoding='utf-8-sig') as f:
        all_rows = list(csv.DictReader(f))

    # A private generator gives the same draw as seeding the module-level one,
    # without resetting the global random state for the rest of the program.
    rng = random.Random(seed)
    return rng.sample(all_rows, min(sample_size, len(all_rows)))


def _write_rgb_png(path: str, rgb_image: np.ndarray) -> bool:
    """Save an RGB array to ``path`` via OpenCV; return True only if it was written."""
    # OpenCV rejects empty arrays, which a degenerate detection box produces.
    if rgb_image.size == 0:
        return False
    # cv2.imwrite expects BGR channel order and reports failure via its return value.
    return bool(cv2.imwrite(path, cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR)))


def process_samples(extractor: "YOLOSignatureExtractor", samples: list[dict],
                    output_dir: str, output_dir_no_stamp: Optional[str] = None,
                    save_visualization: bool = True) -> dict:
    """
    Process a list of PDF samples and extract signatures.

    For every sample the page is located, rendered and run through the
    detector. Each detection is saved as ``<stem>_page<N>_sig<K>.png`` in
    ``output_dir`` (and, stamp-removed, in ``output_dir_no_stamp`` when given).
    Annotated pages go to ``<output_dir>/visualization``. Crops that are empty
    or cannot be written are reported and skipped instead of aborting the run.

    Args:
        extractor: YOLOSignatureExtractor instance
        samples: List of sample dictionaries from CSV; each needs a
            'filename' and a 'page' (1-indexed, convertible to int) entry
        output_dir: Output directory for signatures
        output_dir_no_stamp: Output directory for stamp-removed signatures (optional)
        save_visualization: Whether to save visualization images

    Returns:
        Results dictionary with statistics and per-file results. ``files`` is
        keyed by filename; if the same filename occurs again (another page of
        the same PDF) the later entry is stored under ``<filename>#page<N>``
        so it does not overwrite the earlier one.
    """
    os.makedirs(output_dir, exist_ok=True)
    vis_dir = os.path.join(output_dir, "visualization")
    if save_visualization:
        os.makedirs(vis_dir, exist_ok=True)

    # Create no-stamp output directory if specified
    if output_dir_no_stamp:
        os.makedirs(output_dir_no_stamp, exist_ok=True)

    results = {
        'timestamp': datetime.now().isoformat(),
        'total_samples': len(samples),
        'processed': 0,
        'pdf_not_found': 0,
        'render_failed': 0,
        'total_signatures': 0,
        'files': {}
    }
    files = results['files']

    for i, row in enumerate(samples):
        filename = row['filename']
        page_num = int(row['page'])
        base_name = Path(filename).stem

        # Keep one entry per sample even when several pages share a PDF.
        file_key = filename if filename not in files else f"{filename}#page{page_num}"

        print(f"[{i+1}/{len(samples)}] Processing: {filename}, page {page_num}...", end=' ', flush=True)

        # Find PDF
        pdf_path = find_pdf_file(filename)
        if pdf_path is None:
            print("PDF NOT FOUND")
            results['pdf_not_found'] += 1
            files[file_key] = {'status': 'pdf_not_found'}
            continue

        # Render page
        image = extractor.render_pdf_page(pdf_path, page_num)
        if image is None:
            print("RENDER FAILED")
            results['render_failed'] += 1
            files[file_key] = {'status': 'render_failed'}
            continue

        # Detect signatures
        signatures = extractor.detect_signatures(image)
        num_sigs = len(signatures)
        results['total_signatures'] += num_sigs
        results['processed'] += 1

        print(f"Found {num_sigs} signature(s)")

        # Extract and save signature crops; numbering follows detection order
        # so sig<K> always refers to the K-th entry in the results list.
        crops = extractor.extract_signature_images(image, signatures)
        for j, crop in enumerate(crops, start=1):
            crop_filename = f"{base_name}_page{page_num}_sig{j}.png"
            crop_path = os.path.join(output_dir, crop_filename)
            if not _write_rgb_png(crop_path, crop):
                print(f"  Warning: could not save signature crop: {crop_path}")
                continue

            # Save stamp-removed version if output dir specified
            if output_dir_no_stamp:
                crop_no_stamp_path = os.path.join(output_dir_no_stamp, crop_filename)
                if not _write_rgb_png(crop_no_stamp_path, remove_red_stamp(crop)):
                    print(f"  Warning: could not save stamp-removed crop: {crop_no_stamp_path}")

        # Save visualization (only for pages with at least one detection)
        if save_visualization and signatures:
            vis_image = extractor.create_visualization(image, signatures)
            vis_path = os.path.join(vis_dir, f"{base_name}_page{page_num}_annotated.png")
            if not _write_rgb_png(vis_path, vis_image):
                print(f"  Warning: could not save visualization: {vis_path}")

        # Store file results
        files[file_key] = {
            'status': 'success',
            'page': page_num,
            'signatures': [
                {
                    'box': list(sig['box']),
                    'confidence': sig['confidence']
                }
                for sig in signatures
            ]
        }

    return results


def print_summary(results: dict):
    """Write the run statistics held in ``results`` to stdout as a framed report.

    The average-per-page line is only emitted when at least one page was
    processed, which also avoids dividing by zero.
    """
    rule = "=" * 60

    print("\n" + rule)
    print("YOLO SIGNATURE EXTRACTION SUMMARY")
    print(rule)

    # (label including its trailing padding, key in the results dict)
    counters = (
        ("Total samples:        ", 'total_samples'),
        ("Successfully processed: ", 'processed'),
        ("PDFs not found:       ", 'pdf_not_found'),
        ("Render failed:        ", 'render_failed'),
        ("Total signatures found: ", 'total_signatures'),
    )
    for label, key in counters:
        print(f"{label}{results[key]}")

    pages_done = results['processed']
    if pages_done > 0:
        mean_per_page = results['total_signatures'] / pages_done
        print(f"Average signatures/page: {mean_per_page:.2f}")

    print(rule)


def main():
    """Run the whole pipeline with the module-level configuration.

    Steps: print the configuration banner, load the YOLO model, draw a fixed
    random sample of 50 rows (seed 42) from the CSV, extract signatures with
    and without stamp removal, write ``results.json`` into the output
    directory and print the summary.
    """
    divider = "=" * 60
    banner = (
        divider,
        "YOLO Signature Extraction Pipeline",
        divider,
        f"Model: {MODEL_PATH}",
        f"CSV: {CSV_PATH}",
        f"Output (original): {OUTPUT_PATH}",
        f"Output (no stamp): {OUTPUT_PATH_NO_STAMP}",
        f"Confidence threshold: {CONFIDENCE_THRESHOLD}",
        divider + "\n",
    )
    for banner_line in banner:
        print(banner_line)

    # Model loading happens inside the constructor
    detector = YOLOSignatureExtractor(MODEL_PATH, CONFIDENCE_THRESHOLD)

    # Draw the reproducible sample of rows to work on
    print("\nLoading samples from CSV...")
    picked = load_csv_samples(CSV_PATH, sample_size=50, seed=42)
    print(f"Loaded {len(picked)} samples\n")

    # Extract crops; the no-stamp directory enables the stamp-removed copies
    summary = process_samples(
        detector,
        picked,
        OUTPUT_PATH,
        output_dir_no_stamp=OUTPUT_PATH_NO_STAMP,
        save_visualization=True,
    )

    # Persist the per-file results next to the crops
    report_file = os.path.join(OUTPUT_PATH, "results.json")
    with open(report_file, 'w') as fh:
        json.dump(summary, fh, indent=2)
    print(f"\nResults saved to: {report_file}")

    # Final console report
    print_summary(summary)
    print(f"\nStamp-removed signatures saved to: {OUTPUT_PATH_NO_STAMP}")


if __name__ == "__main__":
    # Script entry point: run the pipeline and turn Ctrl-C or any unexpected
    # error into exit status 1 (with a traceback for the latter).
    exit_status = 0
    try:
        main()
    except KeyboardInterrupt:
        print("\n\nProcess interrupted by user.")
        exit_status = 1
    except Exception as e:
        print(f"\n\nFATAL ERROR: {e}")
        import traceback
        traceback.print_exc()
        exit_status = 1

    if exit_status != 0:
        sys.exit(exit_status)