Files
pdf_signature_extraction/yolo_extract_from_index.py
T
gbanyan 939a348da4 Add Paper A (IEEE TAI) complete draft with Firm A-calibrated dual-method classification
Paper draft includes all sections (Abstract through Conclusion), 36 references,
and supporting scripts. Key methodology: Cosine similarity + dHash dual-method
verification with thresholds calibrated against known-replication firm (Firm A).

Includes:
- 8 section markdown files (paper_a_*.md)
- Ablation study script (ResNet-50 vs VGG-16 vs EfficientNet-B0)
- Recalibrated classification script (84,386 PDFs, 5-tier system)
- Figure generation and Word export scripts
- Citation renumbering script ([1]-[36])
- Signature analysis pipeline (12 steps)
- YOLO extraction scripts

Three rounds of AI review completed (GPT-5.4, Claude Opus 4.6, Gemini 3 Pro).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-06 23:05:33 +08:00

381 lines
12 KiB
Python

#!/usr/bin/env python3
"""
YOLO Signature Extraction from VLM Index
Extracts signatures from PDF pages specified in master_signatures.csv.
Uses VLM-filtered index + YOLO for precise localization and cropping.
Pipeline:
CSV Index → Load specified page → YOLO Detection → Crop & Remove Red Stamp → Output
"""
import argparse
import csv
import json
import os
import sys
import time
from concurrent.futures import ProcessPoolExecutor, as_completed
from datetime import datetime
from pathlib import Path
from typing import Optional
import cv2
import fitz # PyMuPDF
import numpy as np
# Configuration
DPI = 150  # Render resolution for PDF pages (PDF native base is 72 dpi)
CONFIDENCE_THRESHOLD = 0.5  # YOLO detection confidence cutoff
# NOTE(review): the argparse --conf default (0.5) duplicates CONFIDENCE_THRESHOLD
# rather than referencing it -- keep the two in sync if either changes.
PROGRESS_SAVE_INTERVAL = 500  # Checkpoint progress.json every N processed entries
def remove_red_stamp(image: np.ndarray) -> np.ndarray:
    """Paint every red-stamp pixel in *image* white and return the result.

    Red is detected in HSV space using the two hue bands that wrap around
    0/180 in OpenCV's 8-bit hue encoding. The combined mask is dilated by
    one 3x3 pass so anti-aliased stamp edges are removed as well. The input
    array is not modified; a cleaned copy is returned.
    """
    hsv = cv2.cvtColor(image, cv2.COLOR_RGB2HSV)
    # Red wraps around the hue circle, so two separate bands are needed.
    hue_bands = (
        (np.array([0, 50, 50]), np.array([10, 255, 255])),
        (np.array([160, 50, 50]), np.array([180, 255, 255])),
    )
    red_mask = cv2.inRange(hsv, *hue_bands[0])
    red_mask = cv2.bitwise_or(red_mask, cv2.inRange(hsv, *hue_bands[1]))
    # Grow the mask slightly to catch faint borders around the stamp.
    red_mask = cv2.dilate(red_mask, np.ones((3, 3), np.uint8), iterations=1)
    cleaned = image.copy()
    cleaned[red_mask > 0] = [255, 255, 255]
    return cleaned
def render_pdf_page(pdf_path: str, page_num: int, dpi: int = DPI) -> Optional[np.ndarray]:
    """Render one PDF page to an image array.

    Args:
        pdf_path: Path to the PDF file.
        page_num: 1-indexed page number to render.
        dpi: Render resolution, scaled relative to the 72-dpi PDF base.

    Returns:
        H x W x C uint8 array, or None when the page number is out of
        range or rendering fails for any reason (best-effort by design).
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            if page_num < 1 or page_num > len(doc):
                return None
            page = doc[page_num - 1]  # fitz pages are 0-indexed
            mat = fitz.Matrix(dpi / 72, dpi / 72)
            pix = page.get_pixmap(matrix=mat, alpha=False)
            image = np.frombuffer(pix.samples, dtype=np.uint8)
            return image.reshape(pix.height, pix.width, pix.n)
        finally:
            # FIX: always release the document handle. The previous version
            # leaked it whenever get_pixmap/reshape raised, because the
            # exception path skipped doc.close().
            doc.close()
    except Exception:
        # Best-effort: callers treat None as "render failed".
        return None
def find_pdf_file(filename: str, pdf_base: str) -> Optional[str]:
    """Locate *filename* under *pdf_base*, preferring batch_* subdirectories.

    Batch subdirectories are scanned in sorted order first, then the flat
    base directory itself. Returns the path as a string, or None when the
    file cannot be found.
    """
    root = Path(pdf_base)
    # Candidate paths in priority order: sorted batch dirs, then flat base.
    candidates = [batch / filename for batch in sorted(root.glob("batch_*"))]
    candidates.append(root / filename)
    for candidate in candidates:
        if candidate.exists():
            return str(candidate)
    return None
def process_single_entry(args: tuple) -> dict:
    """
    Process a single CSV entry: render page, detect signatures, crop and save.

    Runs inside a worker process: imports ultralytics lazily, renders the
    indexed page, runs YOLO detection, then writes one red-stamp-cleaned
    crop PNG per detection into <output_dir>/images/.

    Args:
        args: Tuple of (row_dict, model_path, pdf_base, output_dir, conf_threshold)

    Returns:
        Result dictionary with keys filename, page, num_signatures,
        confidence_avg, image_files (list of crop filenames), and error
        (None on success, message string otherwise).
    """
    row, model_path, pdf_base, output_dir, conf_threshold = args
    # Imported here so only worker processes pay the ultralytics import cost.
    from ultralytics import YOLO
    filename = row['filename']
    page_num = int(row['page'])
    base_name = Path(filename).stem
    # Default result: zero detections, no error; fields are filled in below.
    result = {
        'filename': filename,
        'page': page_num,
        'num_signatures': 0,
        'confidence_avg': 0.0,
        'image_files': [],
        'error': None
    }
    try:
        # Find PDF
        pdf_path = find_pdf_file(filename, pdf_base)
        if pdf_path is None:
            result['error'] = 'PDF not found'
            return result
        # Render page
        image = render_pdf_page(pdf_path, page_num)
        if image is None:
            result['error'] = 'Render failed'
            return result
        # Load model and detect.
        # NOTE(review): the model is re-loaded from disk for every entry;
        # caching one model per worker process would likely be much faster --
        # confirm ultralytics/process-pool interaction before changing.
        model = YOLO(model_path)
        results = model(image, conf=conf_threshold, verbose=False)
        signatures = []
        for r in results:
            for box in r.boxes:
                # Convert xyxy corners to an integer (x, y, w, h) box.
                x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().numpy())
                conf = float(box.conf[0].cpu().numpy())
                signatures.append({
                    'box': (x1, y1, x2 - x1, y2 - y1),
                    'confidence': conf
                })
        if not signatures:
            result['num_signatures'] = 0
            return result
        # Sort signatures by position (top-left to bottom-right)
        signatures.sort(key=lambda s: (s['box'][1], s['box'][0]))
        result['num_signatures'] = len(signatures)
        result['confidence_avg'] = sum(s['confidence'] for s in signatures) / len(signatures)
        # Extract and save crops
        image_files = []
        for i, sig in enumerate(signatures):
            x, y, w, h = sig['box']
            # Clamp the detection box to the image bounds before cropping.
            x = max(0, x)
            y = max(0, y)
            x2 = min(image.shape[1], x + w)
            y2 = min(image.shape[0], y + h)
            crop = image[y:y2, x:x2]
            crop_clean = remove_red_stamp(crop)
            crop_filename = f"{base_name}_page{page_num}_sig{i + 1}.png"
            crop_path = os.path.join(output_dir, "images", crop_filename)
            # The rendered page is handled as RGB throughout; cv2.imwrite
            # expects BGR, hence the conversion.
            cv2.imwrite(crop_path, cv2.cvtColor(crop_clean, cv2.COLOR_RGB2BGR))
            image_files.append(crop_filename)
        result['image_files'] = image_files
    except Exception as e:
        # Any failure is recorded on the result rather than raised, so the
        # pool driver can keep processing the remaining entries.
        result['error'] = str(e)
    return result
def load_progress(progress_file: str) -> set:
    """Read the set of completed entry keys from a checkpoint file.

    A missing, unreadable, or malformed checkpoint yields an empty set so
    a fresh run can proceed (best-effort by design).
    """
    try:
        with open(progress_file, 'r') as handle:
            checkpoint = json.load(handle)
        return set(checkpoint.get('completed_keys', []))
    except Exception:
        # Missing file, bad JSON, or unexpected structure: start fresh.
        return set()
def save_progress(progress_file: str, completed: set, total: int, start_time: float):
    """Write a JSON checkpoint describing the current extraction progress.

    The checkpoint records counts, elapsed wall time, and the full list of
    completed entry keys consumed by load_progress() on --resume.
    """
    done = len(completed)
    checkpoint = {
        'last_updated': datetime.now().isoformat(),
        'total_entries': total,
        'processed': done,
        'remaining': total - done,
        'elapsed_seconds': time.time() - start_time,
        'completed_keys': list(completed),
    }
    with open(progress_file, 'w') as handle:
        json.dump(checkpoint, handle)
def main():
    """CLI entry point: YOLO signature extraction driven by a CSV index.

    Reads the VLM-filtered index CSV, fans out (filename, page) entries to
    a process pool, checkpoints progress to progress.json every
    PROGRESS_SAVE_INTERVAL entries, then writes extraction_results.csv and
    a JSON summary report into the output directory.
    """
    parser = argparse.ArgumentParser(description='YOLO Signature Extraction from VLM Index')
    parser.add_argument('--csv', required=True, help='Path to master_signatures.csv')
    parser.add_argument('--pdf-base', required=True, help='Base directory containing PDFs')
    parser.add_argument('--output', required=True, help='Output directory')
    parser.add_argument('--model', default='best.pt', help='Path to YOLO model')
    parser.add_argument('--workers', type=int, default=8, help='Number of parallel workers')
    parser.add_argument('--conf', type=float, default=0.5, help='Confidence threshold')
    parser.add_argument('--resume', action='store_true', help='Resume from checkpoint')
    args = parser.parse_args()

    # Setup output directories
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)
    (output_dir / "images").mkdir(exist_ok=True)
    progress_file = str(output_dir / "progress.json")
    csv_output = str(output_dir / "extraction_results.csv")
    report_file = str(output_dir / "extraction_report.json")

    print("=" * 70)
    print("YOLO Signature Extraction from VLM Index")
    print("=" * 70)
    print(f"CSV Index: {args.csv}")
    print(f"PDF Base: {args.pdf_base}")
    print(f"Output: {args.output}")
    print(f"Model: {args.model}")
    print(f"Workers: {args.workers}")
    print(f"Confidence: {args.conf}")
    print("=" * 70)

    # Load CSV index of (filename, page) entries.
    print("\nLoading CSV index...")
    with open(args.csv, 'r') as f:
        reader = csv.DictReader(f)
        all_entries = list(reader)
    total_entries = len(all_entries)
    print(f"Total entries: {total_entries}")

    # Load progress if resuming
    completed_keys = set()
    if args.resume:
        completed_keys = load_progress(progress_file)
        print(f"Resuming: {len(completed_keys)} entries already processed")

    # Filter out completed entries
    def entry_key(row):
        return f"{row['filename']}_{row['page']}"

    entries_to_process = [e for e in all_entries if entry_key(e) not in completed_keys]
    print(f"Entries to process: {len(entries_to_process)}")
    if not entries_to_process:
        print("All entries already processed!")
        return

    # Prepare work arguments
    work_args = [
        (entry, args.model, args.pdf_base, str(output_dir), args.conf)
        for entry in entries_to_process
    ]

    # Result buckets for this run only.
    results_success = []
    results_no_sig = []
    errors = []
    start_time = time.time()
    processed_count = len(completed_keys)
    # FIX: capture the already-completed baseline ONCE. The previous code
    # re-read progress.json from disk for every finished future just to
    # compute the rate, and because the checkpoint is rewritten during the
    # run, the reported rate drifted after each periodic save.
    baseline_count = processed_count

    print(f"\nStarting extraction with {args.workers} workers...")
    print("-" * 70)
    with ProcessPoolExecutor(max_workers=args.workers) as executor:
        futures = {executor.submit(process_single_entry, arg): arg[0] for arg in work_args}
        for future in as_completed(futures):
            entry = futures[future]
            key = entry_key(entry)
            try:
                result = future.result()
                if result['error']:
                    errors.append(result)
                elif result['num_signatures'] > 0:
                    results_success.append(result)
                else:
                    results_no_sig.append(result)
                # Errors still count as completed so they are not retried
                # on --resume (matches the original checkpoint semantics).
                completed_keys.add(key)
                processed_count += 1
                # Progress output: rate over this run only, ETA in minutes.
                elapsed = time.time() - start_time
                rate = (processed_count - baseline_count) / elapsed if elapsed > 0 else 0
                eta = (total_entries - processed_count) / rate / 60 if rate > 0 else 0
                status = f"SIG({result['num_signatures']})" if result['num_signatures'] > 0 else "---"
                if result['error']:
                    status = "ERR"
                print(f"[{processed_count}/{total_entries}] {status:8s} {result['filename'][:45]:45s} "
                      f"({rate:.1f}/s, ETA: {eta:.1f}m)")
                # Save progress
                if processed_count % PROGRESS_SAVE_INTERVAL == 0:
                    save_progress(progress_file, completed_keys, total_entries, start_time)
            except Exception as e:
                print(f"Error: {e}")
                errors.append({'filename': entry['filename'], 'error': str(e)})

    # Final progress save
    save_progress(progress_file, completed_keys, total_entries, start_time)

    # Write CSV results (entries with detected signatures only).
    print("\nWriting results CSV...")
    with open(csv_output, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=[
            'filename', 'page', 'num_signatures', 'confidence_avg', 'image_files'
        ])
        writer.writeheader()
        for r in results_success:
            writer.writerow({
                'filename': r['filename'],
                'page': r['page'],
                'num_signatures': r['num_signatures'],
                'confidence_avg': round(r['confidence_avg'], 4),
                'image_files': ','.join(r['image_files'])
            })

    # Generate report
    elapsed_total = time.time() - start_time
    total_sigs = sum(r['num_signatures'] for r in results_success)
    report = {
        'extraction_date': datetime.now().isoformat(),
        'total_index_entries': total_entries,
        'with_signatures_detected': len(results_success),
        'no_signatures_detected': len(results_no_sig),
        'errors': len(errors),
        'total_signatures_extracted': total_sigs,
        'detection_rate': f"{len(results_success) / total_entries * 100:.2f}%" if total_entries > 0 else "0%",
        'processing_time_minutes': round(elapsed_total / 60, 2),
        'processing_rate_per_second': round(len(entries_to_process) / elapsed_total, 2) if elapsed_total > 0 else 0,
        'model': args.model,
        'confidence_threshold': args.conf,
        'workers': args.workers
    }
    with open(report_file, 'w') as f:
        json.dump(report, f, indent=2)

    # Print summary
    print("\n" + "=" * 70)
    print("EXTRACTION COMPLETE")
    print("=" * 70)
    print(f"Total index entries: {total_entries}")
    print(f"With signatures: {len(results_success)} ({len(results_success)/total_entries*100:.1f}%)")
    print(f"No signatures detected: {len(results_no_sig)} ({len(results_no_sig)/total_entries*100:.1f}%)")
    print(f"Errors: {len(errors)}")
    print(f"Total signatures: {total_sigs}")
    print(f"Processing time: {elapsed_total/60:.1f} minutes")
    print(f"Rate: {len(entries_to_process)/elapsed_total:.1f} entries/second")
    print("-" * 70)
    print(f"Results saved to: {output_dir}")
    print("=" * 70)
# Script entry point: run only when executed directly, not on import
# (worker processes re-import this module and must not re-run main()).
if __name__ == "__main__":
    main()