#!/usr/bin/env python3
"""
Script to extract signatures using VLM (Vision Language Model) guidance.
Uses Ollama instance with qwen2.5vl:32b for signature detection.
"""

import base64
import csv
import json
import os
import re
import sys
import traceback
from datetime import datetime
from io import BytesIO
from pathlib import Path

import cv2
import fitz  # PyMuPDF
import numpy as np
import requests

# Configuration
PDF_INPUT_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output"
OUTPUT_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output/signatures"
REJECTED_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output/signatures/rejected"
LOG_FILE = None  # Will be set in main()

# Ollama Configuration
OLLAMA_URL = "http://192.168.30.36:11434"
OLLAMA_MODEL = "qwen2.5vl:32b"

# Image processing parameters
DPI = 300  # Resolution for rendering PDF page

# Matches one "Signature N: left=X%, top=Y%, width=W%, height=H%" entry in the
# VLM answer. The numeric groups are deliberately loose; values that are not
# valid floats are rejected when parsed.
_SIGNATURE_LOCATION_RE = re.compile(
    r'Signature\s+\d+:\s*left=([0-9.]+)%,?\s*top=([0-9.]+)%,?\s*width=([0-9.]+)%,?\s*height=([0-9.]+)%'
)

# Prompt asking the VLM to locate signatures in the format parsed by
# parse_vlm_location_response().
LOCATION_PROMPT = """Please analyze this document page and locate ONLY handwritten signatures with Chinese names.

IMPORTANT: Only mark areas with ACTUAL handwritten pen/ink signatures.
Do NOT mark:
- Printed text or typed names
- Dates or reference numbers
- Form field labels or instructions
- Underlines or signature lines (empty boxes)
- Stamps or seals

Look for actual handwritten Chinese characters that are signatures.

For each HANDWRITTEN signature found, provide the location as percentages from the top-left corner:
- Distance from left edge (% of page width)
- Distance from top edge (% of page height)
- Width (% of page width)
- Height (% of page height)

Format your response as:
Signature 1: left=X%, top=Y%, width=W%, height=H%
Signature 2: left=X%, top=Y%, width=W%, height=H%

If no handwritten signatures found, say "No signatures found"."""


def encode_image_to_base64(image_array):
    """
    Encode numpy image array to a base64 JPEG string for the Ollama API.

    Args:
        image_array: Image in OpenCV layout. Both callers in this file pass
            BGR images (from render_pdf_page_as_image() or cv2.imread()).

    Returns:
        The JPEG bytes, base64-encoded and decoded as a UTF-8 string.

    Raises:
        ValueError: If OpenCV reports that JPEG encoding failed.
    """
    # cv2.imencode expects BGR input, so the array is encoded as-is.
    # Converting to RGB first would swap red and blue in the JPEG.
    ok, buffer = cv2.imencode('.jpg', image_array)
    if not ok:
        raise ValueError("Failed to encode image as JPEG")

    return base64.b64encode(buffer).decode('utf-8')


def call_ollama_vision(image_base64, prompt):
    """
    Call Ollama vision model with image and prompt.

    Args:
        image_base64: Base64-encoded image string.
        prompt: Text prompt sent together with the image.

    Returns:
        (response_text, error): On success, the 'response' field of the JSON
        reply ('' if missing) and None. On any failure, None and the error
        message.
    """
    try:
        url = f"{OLLAMA_URL}/api/generate"
        payload = {
            "model": OLLAMA_MODEL,
            "prompt": prompt,
            "images": [image_base64],
            "stream": False,
        }

        response = requests.post(url, json=payload, timeout=120)
        response.raise_for_status()

        result = response.json()
        return result.get('response', ''), None
    except Exception as e:
        # Boundary function: callers rely on an (None, error) result rather
        # than an exception for any network/HTTP/decoding failure.
        return None, str(e)


def render_pdf_page_as_image(pdf_path, dpi=300):
    """
    Render the first page of a PDF as a high-resolution image.

    Args:
        pdf_path: Path of the PDF file to open.
        dpi: Rendering resolution; the page is scaled by dpi / 72.

    Returns:
        (image, width, height, error): The rendered page as a numpy array
        (converted to BGR when the pixmap has 1 or 3 channels), the pixmap
        width and height in pixels, and None. On failure:
        (None, 0, 0, error message).
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            page = doc[0]  # Get first page

            # Render at high DPI (PDF user space is 72 units per inch)
            mat = fitz.Matrix(dpi / 72, dpi / 72)
            pix = page.get_pixmap(matrix=mat, alpha=False)

            # Convert to numpy array
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
                pix.height, pix.width, pix.n
            )

            # Convert to BGR for OpenCV
            if pix.n == 3:  # RGB
                img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
            elif pix.n == 1:  # Grayscale
                img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)

            width, height = pix.width, pix.height
        finally:
            # Close the document even when rendering fails.
            doc.close()

        return img, width, height, None
    except Exception as e:
        return None, 0, 0, str(e)


def parse_vlm_location_response(response_text, page_width, page_height):
    """
    Parse VLM response to extract signature locations.

    Recognizes entries of the form
    "Signature N: left=X%, top=Y%, width=W%, height=H%" and converts the
    percentages to pixel coordinates. Entries whose numbers are not valid
    floats (e.g. "1.2.3") are skipped instead of raising.

    Args:
        response_text: Raw text returned by the VLM.
        page_width: Page width in pixels.
        page_height: Page height in pixels.

    Returns:
        list of bounding boxes [(x, y, w, h), ...] in pixels.
    """
    locations = []

    for match in _SIGNATURE_LOCATION_RE.findall(response_text):
        try:
            left_pct, top_pct, width_pct, height_pct = (float(v) for v in match)
        except ValueError:
            # The loose pattern can capture strings such as "." or "1.2.3".
            print(f" - Skipping malformed signature location: {match}")
            continue

        # Convert percentages to pixel coordinates
        x = int(page_width * left_pct / 100)
        y = int(page_height * top_pct / 100)
        w = int(page_width * width_pct / 100)
        h = int(page_height * height_pct / 100)

        locations.append((x, y, w, h))

    print(f" - Parsed {len(locations)} signature location(s)")
    return locations


def check_pdf_has_image_at_location(pdf_path, bbox):
    """
    Check if PDF has a SMALL image object at the specified location.

    NOTE: This is a placeholder. It inspects the images embedded in the first
    page and skips full-page scans, but it never selects an image: the
    function currently always returns (False, None), so callers always fall
    back to OpenCV cropping. `bbox` is not used yet.

    Args:
        pdf_path: Path of the PDF file to inspect.
        bbox: (x, y, w, h) in pixel coordinates (currently unused).

    Returns:
        (has_image: bool, image_xref: int or None)
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            page = doc[0]

            # Get all images on the page
            image_list = page.get_images(full=True)
            if not image_list:
                return False, None

            # Get page dimensions (in points, 72 DPI)
            page_rect = page.rect
            page_width = page_rect.width
            page_height = page_rect.height

            # Check each image
            for img_info in image_list:
                xref = img_info[0]

                try:
                    base_image = doc.extract_image(xref)
                    img_width = base_image["width"]
                    img_height = base_image["height"]

                    # Approx conversion of page size to pixels at 300 DPI
                    width_ratio = img_width / (page_width * 4)
                    height_ratio = img_height / (page_height * 4)

                    # If image covers >80% of page, it's a full-page scan,
                    # not a signature: don't extract it.
                    if width_ratio > 0.8 and height_ratio > 0.8:
                        continue

                    # This might be a small embedded image (actual signature
                    # scan). For now, OpenCV cropping is still used for
                    # consistency; this logic can be refined later.
                except Exception:
                    # Best-effort inspection: skip images that cannot be read.
                    continue

            # No suitable small images found, use OpenCV cropping
            return False, None
        finally:
            doc.close()
    except Exception as e:
        print(f"Error checking PDF images: {e}")
        return False, None


def extract_pdf_image_object(pdf_path, xref, output_path):
    """
    Extract image object from PDF and write it to disk.

    Args:
        pdf_path: Path of the PDF file.
        xref: Cross-reference number of the image object to extract.
        output_path: Output file path without extension; the extension
            reported by PyMuPDF for the image is appended.

    Returns:
        (success: bool, error: str or None, output_file: str or None)
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            # Extract image
            base_image = doc.extract_image(xref)
        finally:
            doc.close()

        image_bytes = base_image["image"]
        image_ext = base_image["ext"]

        # Save image
        output_file = f"{output_path}.{image_ext}"
        with open(output_file, "wb") as f:
            f.write(image_bytes)

        return True, None, output_file
    except Exception as e:
        return False, str(e), None


def extract_region_with_opencv(image, bbox, output_path):
    """
    Extract region from image using OpenCV with generous padding.

    Args:
        image: Source image as a numpy array (rows, cols[, channels]).
        bbox: (x, y, w, h) in pixels.
        output_path: Output file path without extension; ".png" is appended.

    Returns:
        (success: bool, error: str or None, output_file: str or None)
    """
    try:
        x, y, w, h = bbox

        # Add generous padding (50% of box size or minimum 50 pixels).
        # This ensures we capture the full signature even if the VLM bbox is
        # slightly off.
        padding_x = max(50, int(w * 0.5))
        padding_y = max(50, int(h * 0.5))

        # Clamp the padded box to the image bounds.
        x_start = max(0, x - padding_x)
        y_start = max(0, y - padding_y)
        x_end = min(image.shape[1], x + w + padding_x)
        y_end = min(image.shape[0], y + h + padding_y)

        region = image[y_start:y_end, x_start:x_end]
        if region.size == 0:
            # Happens when the box lies entirely outside the image.
            return False, "Empty region: bounding box is outside the image", None

        # Save
        output_file = f"{output_path}.png"
        if not cv2.imwrite(output_file, region):
            # imwrite signals some failures via its return value only.
            return False, f"Failed to write image: {output_file}", None

        return True, None, output_file
    except Exception as e:
        return False, str(e), None


def verify_signature_with_vlm(image_path):
    """
    Verify that extracted region contains a signature with VLM.

    Args:
        image_path: Path of the extracted image file.

    Returns:
        (is_signature: bool, error: str or None). is_signature is True when
        the model's answer contains 'yes' (case-insensitive).
    """
    try:
        # cv2.imread returns None instead of raising when the file is
        # missing or unreadable.
        image = cv2.imread(image_path)
        if image is None:
            return False, f"Could not read image: {image_path}"

        # Encode to base64
        image_base64 = encode_image_to_base64(image)

        # Ask VLM
        prompt = "Is this a signature with a Chinese name? Answer only 'yes' or 'no'."
        response, error = call_ollama_vision(image_base64, prompt)

        if error:
            return False, error

        # Check if response contains 'yes'
        is_signature = 'yes' in response.lower()
        return is_signature, None
    except Exception as e:
        return False, str(e)


def process_pdf_page(pdf_path, output_dir):
    """
    Process a single PDF page to extract signatures using VLM.

    Workflow:
    1. VLM locates signatures
    2. Check if PDF has image objects at those locations
    3. Extract via PDF object or OpenCV cropping
    4. VLM verifies extracted regions

    Regions the VLM does not confirm are moved to REJECTED_PATH.

    Args:
        pdf_path: Path of the PDF file (only the first page is processed).
        output_dir: Directory where extracted signature images are written.

    Returns:
        (signature_count, extracted_files, error)
    """
    pdf_name = Path(pdf_path).stem

    # Step 1: Render page as image
    print(" - Rendering page...", end='', flush=True)
    image, page_width, page_height, error = render_pdf_page_as_image(pdf_path, DPI)
    if error:
        print(" ERROR")
        return 0, [], f"Render error: {error}"
    print(" OK")

    # Step 2: Encode image and ask VLM to locate signatures
    print(" - Asking VLM to locate signatures...", end='', flush=True)
    try:
        image_base64 = encode_image_to_base64(image)
    except Exception as e:
        # Report through the normal error channel so one bad page does not
        # abort the whole run.
        print(" ERROR")
        return 0, [], f"Encode error: {e}"

    response, error = call_ollama_vision(image_base64, LOCATION_PROMPT)
    if error:
        print(" ERROR")
        return 0, [], f"VLM error: {error}"
    print(" OK")
    print(f" - VLM Response:\n{response}")

    # Step 3: Parse locations
    locations = parse_vlm_location_response(response, page_width, page_height)
    if not locations:
        print(" - No signatures located by VLM")
        return 0, [], None

    # Step 4: Extract each located signature
    extracted_files = []
    for idx, bbox in enumerate(locations):
        print(f" - Extracting signature {idx + 1}...", end='', flush=True)

        # Check if PDF has image object
        has_image, xref = check_pdf_has_image_at_location(pdf_path, bbox)

        output_base = os.path.join(output_dir, f"{pdf_name}_signature_{idx + 1}")

        if has_image and xref:
            # Extract PDF image object
            success, error, output_file = extract_pdf_image_object(pdf_path, xref, output_base)
        else:
            # Extract with OpenCV
            success, error, output_file = extract_region_with_opencv(image, bbox, output_base)

        if not success:
            print(f" FAILED: {error}")
            continue
        print(" OK")

        # Step 5: Verify with VLM
        print(f" - Verifying signature {idx + 1}...", end='', flush=True)
        is_signature, verify_error = verify_signature_with_vlm(output_file)

        if verify_error:
            print(f" ERROR: {verify_error}")
            continue

        if is_signature:
            print(" VERIFIED")
            extracted_files.append(output_file)
        else:
            print(" NOT A SIGNATURE - moved to rejected/")
            # Move to rejected folder instead of deleting
            os.makedirs(REJECTED_PATH, exist_ok=True)
            rejected_file = os.path.join(REJECTED_PATH, os.path.basename(output_file))
            os.rename(output_file, rejected_file)

    return len(extracted_files), extracted_files, None


def main():
    """
    Main processing function.

    Checks the Ollama connection, creates the output directories, processes
    the first 5 PDFs found in PDF_INPUT_PATH, writes one CSV log row per PDF
    and prints a summary.
    """
    global LOG_FILE

    print("Starting VLM-guided signature extraction...")
    print(f"Ollama URL: {OLLAMA_URL}")
    print(f"Model: {OLLAMA_MODEL}")
    print(f"Input path: {PDF_INPUT_PATH}")
    print(f"Output path: {OUTPUT_PATH}")
    print()

    # Test Ollama connection
    print("Testing Ollama connection...")
    try:
        response = requests.get(f"{OLLAMA_URL}/api/tags", timeout=5)
        response.raise_for_status()
        print("✓ Ollama connection successful\n")
    except Exception as e:
        print(f"✗ Ollama connection failed: {e}")
        print(f"Please check that Ollama is running at {OLLAMA_URL}")
        return

    # Create output directories
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    os.makedirs(REJECTED_PATH, exist_ok=True)

    LOG_FILE = os.path.join(
        OUTPUT_PATH,
        f"vlm_extraction_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
    )

    # Get PDF files
    pdf_files = sorted(Path(PDF_INPUT_PATH).glob("*.pdf"))[:5]  # Test with first 5 files

    if not pdf_files:
        print("ERROR: No PDF files found!")
        return

    print(f"Found {len(pdf_files)} PDF files to process (testing with first 5)\n")

    # Statistics
    stats = {
        'total_pdfs': 0,
        'pdfs_with_signatures': 0,
        'total_signatures': 0,
        'errors': 0,
    }

    # Open log file; explicit UTF-8 because PDF filenames may be non-ASCII
    with open(LOG_FILE, 'w', newline='', encoding='utf-8') as log_file:
        log_writer = csv.writer(log_file)
        log_writer.writerow([
            'pdf_filename', 'signatures_found', 'extracted_files', 'error'
        ])

        # Process each PDF
        for i, pdf_path in enumerate(pdf_files):
            stats['total_pdfs'] += 1
            pdf_filename = pdf_path.name

            print(f"[{i+1}/{len(pdf_files)}] Processing: {pdf_filename}")

            # Extract signatures
            sig_count, extracted_files, error = process_pdf_page(str(pdf_path), OUTPUT_PATH)

            if error:
                print(f" ERROR: {error}\n")
                stats['errors'] += 1
                log_writer.writerow([pdf_filename, 0, "", error])
                continue

            if sig_count > 0:
                stats['pdfs_with_signatures'] += 1
                stats['total_signatures'] += sig_count
                print(f" ✓ Extracted {sig_count} signature(s)\n")

                filenames = [Path(f).name for f in extracted_files]
                log_writer.writerow([
                    pdf_filename, sig_count, ", ".join(filenames), ""
                ])
            else:
                print(" No signatures extracted\n")
                log_writer.writerow([pdf_filename, 0, "", ""])

    # Print summary
    print("="*60)
    print("VLM EXTRACTION SUMMARY")
    print("="*60)
    print(f"Total PDFs processed: {stats['total_pdfs']}")
    print(f"PDFs with signatures: {stats['pdfs_with_signatures']}")
    print(f"Total signatures extracted: {stats['total_signatures']}")
    print(f"Errors: {stats['errors']}")
    print(f"\nLog file: {LOG_FILE}")
    print("="*60)


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\n\nProcess interrupted by user.")
        sys.exit(1)
    except Exception as e:
        print(f"\n\nFATAL ERROR: {e}")
        traceback.print_exc()
        sys.exit(1)