# pdf_signature_extraction/extract_signatures_vlm.py

#!/usr/bin/env python3
"""
Script to extract signatures using VLM (Vision Language Model) guidance.
Uses Ollama instance with qwen2.5vl:32b for signature detection.
"""

import cv2
import numpy as np
import os
import sys
import json
import base64
import requests
from pathlib import Path
from datetime import datetime
import fitz  # PyMuPDF
import csv
from io import BytesIO

# Configuration
# Directory whose top-level ``*.pdf`` files are processed by main().
PDF_INPUT_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output"
# Directory that receives extracted signature images and the CSV run log.
OUTPUT_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output/signatures"
# Directory that receives crops the VLM verification step did not accept.
REJECTED_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output/signatures/rejected"
LOG_FILE = None  # Will be set in main()

# Ollama Configuration
# Base URL of the Ollama HTTP API; endpoint paths such as /api/generate
# and /api/tags are appended to it.
OLLAMA_URL = "http://192.168.30.36:11434"
# Model name sent with every generate request.
OLLAMA_MODEL = "qwen2.5vl:32b"

# Image processing parameters
DPI = 300  # Resolution for rendering PDF page


def encode_image_to_base64(image_array):
    """
    Encode a numpy image array as a base64 JPEG string for the Ollama API.

    Args:
        image_array: Image in OpenCV layout. Colour images are expected in
            BGR channel order, which is what ``cv2.imread`` and the page
            renderer in this file produce.

    Returns:
        str: Base64 text of the JPEG-encoded image.

    Raises:
        ValueError: If OpenCV reports that JPEG encoding failed.
    """
    # cv2.imencode interprets its input as BGR and writes a correctly
    # coloured JPEG, so the array is passed through unchanged. Converting to
    # RGB beforehand would swap the red and blue channels in the encoded
    # file (e.g. blue ink would reach the model as orange).
    success, buffer = cv2.imencode('.jpg', image_array)
    if not success:
        raise ValueError("Failed to encode image as JPEG")

    return base64.b64encode(buffer).decode('utf-8')


def call_ollama_vision(image_base64, prompt):
    """
    Send one base64 image plus a text prompt to the Ollama vision model.

    Returns: (response_text, error)
        On success ``response_text`` is the value of the ``response`` field
        of the JSON reply ('' when the field is absent) and ``error`` is None.
        On any exception ``response_text`` is None and ``error`` is the
        exception message.
    """
    try:
        # Non-streaming request: the whole answer arrives in a single JSON body.
        reply = requests.post(
            f"{OLLAMA_URL}/api/generate",
            json={
                "model": OLLAMA_MODEL,
                "prompt": prompt,
                "images": [image_base64],
                "stream": False
            },
            timeout=120,
        )
        reply.raise_for_status()
        return reply.json().get('response', ''), None
    except Exception as exc:
        return None, str(exc)


def render_pdf_page_as_image(pdf_path, dpi=300):
    """
    Render the first page of a PDF as a high-resolution image.

    Args:
        pdf_path: Path of the PDF file to open.
        dpi: Target resolution; the page is scaled by ``dpi / 72`` on both
            axes before rasterising.

    Returns: (image, width, height, error)
        image: numpy array (OpenCV format), or None on failure
        width, height: size of the rendered pixmap in pixels (0, 0 on failure)
        error: None on success, otherwise the exception message
    """
    try:
        # The context manager closes the document on every path, including
        # when rendering raises (for example a PDF without pages).
        with fitz.open(pdf_path) as doc:
            page = doc[0]  # Get first page

            # Render at high DPI
            zoom = dpi / 72
            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)

            # Convert to numpy array
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)

            # Convert to BGR for OpenCV; other channel counts are returned as-is
            if pix.n == 3:  # RGB
                img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
            elif pix.n == 1:  # Grayscale
                img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)

            return img, pix.width, pix.height, None

    except Exception as e:
        return None, 0, 0, str(e)


def parse_vlm_location_response(response_text, page_width, page_height):
    """
    Parse the VLM response text into pixel bounding boxes.

    Recognised entries look like
    ``Signature N: left=X%, top=Y%, width=W%, height=H%``. Matching is
    case-insensitive and tolerates whitespace around ``=``, ``%`` and the
    separating commas. Entries whose numbers are malformed (for example
    ``1.2.3``) are skipped rather than raising.

    Args:
        response_text: Raw text returned by the VLM; None or '' yields [].
        page_width: Width of the rendered page in pixels.
        page_height: Height of the rendered page in pixels.

    Returns: list of bounding boxes [(x, y, w, h), ...] in pixels
    """
    import re

    # A well-formed decimal: "12", "12.", "12.5" or ".5". A loose character
    # class such as [0-9.]+ would also accept "1.2.3" or ".", which float()
    # rejects with ValueError.
    field = r'{name}\s*=\s*(\d+(?:\.\d*)?|\.\d+)\s*%'
    fields = r'\s*,?\s*'.join(
        field.format(name=name) for name in ('left', 'top', 'width', 'height')
    )
    pattern = re.compile(r'Signature\s+\d+\s*:\s*' + fields, re.IGNORECASE)

    locations = []

    for left_s, top_s, width_s, height_s in pattern.findall(response_text or ''):
        left_pct = float(left_s)
        top_pct = float(top_s)
        width_pct = float(width_s)
        height_pct = float(height_s)

        # Convert percentages to pixel coordinates
        x = int(page_width * left_pct / 100)
        y = int(page_height * top_pct / 100)
        w = int(page_width * width_pct / 100)
        h = int(page_height * height_pct / 100)

        locations.append((x, y, w, h))

    print(f"  - Parsed {len(locations)} signature location(s)")

    return locations


def check_pdf_has_image_at_location(pdf_path, bbox):
    """
    Check if PDF has a SMALL image object at the specified location.
    If the image is a full-page scan, return False to use OpenCV cropping instead.

    NOTE: this is currently a placeholder. It inspects the images embedded in
    the first page but never selects one, so it always returns (False, None)
    and callers fall back to OpenCV cropping. ``bbox`` is accepted for the
    intended future matching logic and is not used yet.

    Args:
        pdf_path: Path of the PDF file to inspect.
        bbox: (x, y, w, h) in pixel coordinates (currently unused).

    Returns: (has_image: bool, image_xref: int or None)
    """
    try:
        # The context manager closes the document on every exit path.
        with fitz.open(pdf_path) as doc:
            page = doc[0]

            # Get all images on the page
            image_list = page.get_images(full=True)

            if not image_list:
                return False, None

            # Get page dimensions (in points, 72 DPI)
            page_rect = page.rect
            page_width = page_rect.width
            page_height = page_rect.height

            # Check each image
            for img_info in image_list:
                xref = img_info[0]

                try:
                    base_image = doc.extract_image(xref)
                    img_width = base_image["width"]
                    img_height = base_image["height"]

                    # Approx conversion of page points to pixels at 300 DPI
                    width_ratio = img_width / (page_width * 4)
                    height_ratio = img_height / (page_height * 4)
                except Exception:
                    # An image that cannot be inspected is skipped; unlike a
                    # bare ``except:`` this lets KeyboardInterrupt and
                    # SystemExit propagate.
                    continue

                # If image covers >80% of page, it's a full-page scan,
                # not a signature: don't extract it.
                if width_ratio > 0.8 and height_ratio > 0.8:
                    continue

                # This might be a small embedded image (actual signature scan).
                # For now OpenCV cropping is still used for consistency;
                # this logic can be refined later.

        # No suitable small images found, use OpenCV cropping
        return False, None

    except Exception as e:
        print(f"Error checking PDF images: {e}")
        return False, None


def extract_pdf_image_object(pdf_path, xref, output_path):
    """
    Extract an embedded image object from a PDF and write it to disk.

    Args:
        pdf_path: Path of the PDF file to open.
        xref: Cross-reference number of the image object to extract.
        output_path: Destination path WITHOUT extension; the extension
            reported for the extracted image is appended.

    Returns: (success: bool, error: str or None, output_file: str or None)
    """
    try:
        # Close the document as soon as the image data has been read, and
        # also when extraction raises.
        with fitz.open(pdf_path) as doc:
            base_image = doc.extract_image(xref)

        if not base_image:
            return False, f"No extractable image at xref {xref}", None

        image_bytes = base_image["image"]
        image_ext = base_image["ext"]

        # Save image
        output_file = f"{output_path}.{image_ext}"
        with open(output_file, "wb") as f:
            f.write(image_bytes)

        return True, None, output_file

    except Exception as e:
        return False, str(e), None


def extract_region_with_opencv(image, bbox, output_path):
    """
    Crop a padded region out of the page image and save it as PNG.

    Args:
        image: Page image as a numpy array (rows, columns[, channels]).
        bbox: (x, y, w, h) in pixel coordinates of ``image``.
        output_path: Destination path WITHOUT extension; ``.png`` is appended.

    Returns: (success: bool, error: str or None, output_file: str or None)
        ``success`` is False when the padded box does not intersect the
        image, when the file cannot be written, or when any exception occurs.
    """
    try:
        x, y, w, h = bbox

        # Add generous padding (50% of box size or minimum 50 pixels)
        # This ensures we capture the full signature even if VLM bbox is slightly off
        padding_x = max(50, int(w * 0.5))
        padding_y = max(50, int(h * 0.5))

        # Clamp the padded box to the image bounds
        x_start = max(0, x - padding_x)
        y_start = max(0, y - padding_y)
        x_end = min(image.shape[1], x + w + padding_x)
        y_end = min(image.shape[0], y + h + padding_y)

        # A box reported outside the page leaves nothing to crop.
        if x_end <= x_start or y_end <= y_start:
            return False, "Crop region is empty (bounding box lies outside the page)", None

        region = image[y_start:y_end, x_start:x_end]

        # cv2.imwrite signals some failures (e.g. a missing directory) by
        # returning False instead of raising, so the result must be checked.
        output_file = f"{output_path}.png"
        if not cv2.imwrite(output_file, region):
            return False, f"Could not write image file: {output_file}", None

        return True, None, output_file

    except Exception as e:
        return False, str(e), None


def verify_signature_with_vlm(image_path):
    """
    Ask the VLM whether an extracted image file contains a signature.

    Args:
        image_path: Path of the image file to verify.

    Returns: (is_signature: bool, error: str or None)
        ``is_signature`` is True when the model's answer contains the
        standalone word "yes" (any letter case). On failure the result is
        (False, <non-empty error message>).
    """
    import re

    try:
        # cv2.imread returns None instead of raising when the file is
        # missing or unreadable.
        image = cv2.imread(image_path)
        if image is None:
            return False, f"Could not read image: {image_path}"

        # Encode to base64
        image_base64 = encode_image_to_base64(image)

        # Ask VLM
        prompt = "Is this a signature with a Chinese name? Answer only 'yes' or 'no'."
        response, error = call_ollama_vision(image_base64, prompt)

        if error is not None:
            return False, error or "Unknown VLM error"

        # Require "yes" not to be embedded in a longer Latin word, so that
        # answers mentioning e.g. "eyes" or "yesterday" do not count.
        is_signature = re.search(r'(?<![a-z])yes(?![a-z])', response, re.IGNORECASE) is not None

        return is_signature, None

    except Exception as e:
        return False, str(e)


def process_pdf_page(pdf_path, output_dir):
    """
    Process a single PDF page to extract signatures using VLM.

    Workflow:
    1. Render the first page and ask the VLM to locate signatures
    2. Check if PDF has image objects at those locations
    3. Extract via PDF object or OpenCV cropping
    4. VLM verifies extracted regions; crops it does not accept are moved
       to REJECTED_PATH instead of being deleted

    Args:
        pdf_path: Path of the PDF file to process.
        output_dir: Directory that receives the extracted signature images.

    Returns: (signature_count, extracted_files, error)
        signature_count: number of verified signatures
        extracted_files: paths of the verified signature images
        error: None, or a message when rendering, encoding or the location
            request failed for the whole page
    """
    pdf_name = Path(pdf_path).stem

    # Step 1: Render page as image
    print("  - Rendering page...", end='', flush=True)
    image, page_width, page_height, error = render_pdf_page_as_image(pdf_path, DPI)

    # Compare against None: an exception with an empty message would
    # otherwise look like success and leave ``image`` as None.
    if error is not None or image is None:
        print(" ERROR")
        return 0, [], f"Render error: {error}"

    print(" OK")

    # Step 2: Encode image and ask VLM to locate signatures
    print("  - Asking VLM to locate signatures...", end='', flush=True)
    try:
        image_base64 = encode_image_to_base64(image)
    except Exception as exc:
        print(" ERROR")
        return 0, [], f"Encode error: {exc}"

    location_prompt = """Please analyze this document page and locate ONLY handwritten signatures with Chinese names.

IMPORTANT: Only mark areas with ACTUAL handwritten pen/ink signatures.
Do NOT mark:
- Printed text or typed names
- Dates or reference numbers
- Form field labels or instructions
- Underlines or signature lines (empty boxes)
- Stamps or seals

Look for actual handwritten Chinese characters that are signatures.

For each HANDWRITTEN signature found, provide the location as percentages from the top-left corner:
- Distance from left edge (% of page width)
- Distance from top edge (% of page height)
- Width (% of page width)
- Height (% of page height)

Format your response as:
Signature 1: left=X%, top=Y%, width=W%, height=H%
Signature 2: left=X%, top=Y%, width=W%, height=H%

If no handwritten signatures found, say "No signatures found"."""

    response, error = call_ollama_vision(image_base64, location_prompt)

    if error is not None or response is None:
        print(" ERROR")
        return 0, [], f"VLM error: {error}"

    print(" OK")
    print(f"  - VLM Response:\n{response}")

    # Step 3: Parse locations out of the VLM's free-text answer
    locations = parse_vlm_location_response(response, page_width, page_height)

    if not locations:
        print("  - No signatures located by VLM")
        return 0, [], None

    # Step 4: Extract each located signature
    extracted_files = []

    for idx, bbox in enumerate(locations):
        print(f"  - Extracting signature {idx + 1}...", end='', flush=True)

        # Check if PDF has image object
        has_image, xref = check_pdf_has_image_at_location(pdf_path, bbox)

        output_base = os.path.join(output_dir, f"{pdf_name}_signature_{idx + 1}")

        if has_image and xref:
            # Extract PDF image object
            success, error, output_file = extract_pdf_image_object(pdf_path, xref, output_base)
        else:
            # Extract with OpenCV
            success, error, output_file = extract_region_with_opencv(image, bbox, output_base)

        if not success:
            print(f" FAILED: {error}")
            continue

        print(" OK")

        # Step 5: Verify with VLM
        print(f"  - Verifying signature {idx + 1}...", end='', flush=True)
        is_signature, verify_error = verify_signature_with_vlm(output_file)

        if verify_error:
            # Verification could not be completed: the crop stays on disk
            # but is not reported as a signature.
            print(f" ERROR: {verify_error}")
            continue

        if is_signature:
            print(" VERIFIED")
            extracted_files.append(output_file)
            continue

        # Move to rejected folder instead of deleting. The folder is created
        # on demand and a failed move is reported rather than aborting the
        # whole run.
        rejected_file = os.path.join(REJECTED_PATH, os.path.basename(output_file))
        try:
            os.makedirs(REJECTED_PATH, exist_ok=True)
            os.replace(output_file, rejected_file)
        except OSError as exc:
            print(f" NOT A SIGNATURE - could not move to rejected/: {exc}")
        else:
            print(" NOT A SIGNATURE - moved to rejected/")

    return len(extracted_files), extracted_files, None


def main():
    """
    Run VLM-guided signature extraction over the input directory.

    Checks that the Ollama server answers, creates the output directories,
    processes the first 5 PDFs (sorted by path) found in PDF_INPUT_PATH,
    writes one CSV row per PDF to a timestamped log in OUTPUT_PATH and
    prints a summary. Sets the module-level LOG_FILE to the log path.
    """
    global LOG_FILE

    print("Starting VLM-guided signature extraction...")
    print(f"Ollama URL: {OLLAMA_URL}")
    print(f"Model: {OLLAMA_MODEL}")
    print(f"Input path: {PDF_INPUT_PATH}")
    print(f"Output path: {OUTPUT_PATH}")
    print()

    # Test Ollama connection
    print("Testing Ollama connection...")
    try:
        response = requests.get(f"{OLLAMA_URL}/api/tags", timeout=5)
        response.raise_for_status()
        print("✓ Ollama connection successful\n")
    except Exception as e:
        print(f"✗ Ollama connection failed: {e}")
        print(f"Please check that Ollama is running at {OLLAMA_URL}")
        return

    # Create output directories
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    os.makedirs(REJECTED_PATH, exist_ok=True)

    LOG_FILE = os.path.join(OUTPUT_PATH, f"vlm_extraction_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")

    # Get PDF files
    pdf_files = sorted(Path(PDF_INPUT_PATH).glob("*.pdf"))[:5]  # Test with first 5 files

    if not pdf_files:
        print("ERROR: No PDF files found!")
        return

    print(f"Found {len(pdf_files)} PDF files to process (testing with first 5)\n")

    # Statistics
    stats = {
        'total_pdfs': 0,
        'pdfs_with_signatures': 0,
        'total_signatures': 0,
        'errors': 0
    }

    # Open log file. The encoding is explicit because file names and error
    # messages may contain non-ASCII text, which the platform default
    # encoding cannot always represent.
    with open(LOG_FILE, 'w', newline='', encoding='utf-8') as log_file:
        log_writer = csv.writer(log_file)
        log_writer.writerow([
            'pdf_filename', 'signatures_found', 'extracted_files', 'error'
        ])

        # Process each PDF
        for i, pdf_path in enumerate(pdf_files):
            stats['total_pdfs'] += 1
            pdf_filename = pdf_path.name

            print(f"[{i+1}/{len(pdf_files)}] Processing: {pdf_filename}")

            # Extract signatures. An unexpected failure on one PDF is
            # recorded as that PDF's error so the rest of the batch and the
            # summary still run.
            try:
                sig_count, extracted_files, error = process_pdf_page(str(pdf_path), OUTPUT_PATH)
            except Exception as exc:
                sig_count, extracted_files, error = 0, [], f"Unexpected error: {exc}"

            if error:
                print(f"  ERROR: {error}\n")
                stats['errors'] += 1
                log_writer.writerow([pdf_filename, 0, "", error])
                continue

            if sig_count > 0:
                stats['pdfs_with_signatures'] += 1
                stats['total_signatures'] += sig_count
                print(f"  ✓ Extracted {sig_count} signature(s)\n")

                filenames = [Path(f).name for f in extracted_files]
                log_writer.writerow([
                    pdf_filename,
                    sig_count,
                    ", ".join(filenames),
                    ""
                ])
            else:
                print("  No signatures extracted\n")
                log_writer.writerow([pdf_filename, 0, "", ""])

    # Print summary
    print("="*60)
    print("VLM EXTRACTION SUMMARY")
    print("="*60)
    print(f"Total PDFs processed:        {stats['total_pdfs']}")
    print(f"PDFs with signatures:        {stats['pdfs_with_signatures']}")
    print(f"Total signatures extracted:  {stats['total_signatures']}")
    print(f"Errors:                      {stats['errors']}")
    print(f"\nLog file: {LOG_FILE}")
    print("="*60)


if __name__ == "__main__":
    # Script entry point: run the batch and exit with status 1 on Ctrl-C
    # or on any otherwise unhandled exception.
    try:
        main()
    except KeyboardInterrupt:
        print("\n\nProcess interrupted by user.")
        raise SystemExit(1)
    except Exception as exc:
        print(f"\n\nFATAL ERROR: {exc}")
        # Imported lazily: only needed on the failure path.
        import traceback
        traceback.print_exc()
        raise SystemExit(1)