Files
pdf_signature_extraction/extract_signatures_vlm.py
gbanyan 52612e14ba Add hybrid signature extraction with name-based verification
Implement VLM name extraction + CV detection hybrid approach to
replace unreliable VLM coordinate system with name-based verification.

Key Features:
- VLM extracts signature names (周寶蓮, 魏興海, etc.)
- CV or PDF text layer detects regions
- VLM verifies each region against expected names
- Signatures saved with person names: signature_周寶蓮.png
- Duplicate prevention and rejection handling

Test Results:
- 5 PDF pages tested
- 7/10 signatures extracted (70% recall)
- 100% precision (no false positives)
- No blank regions extracted (previous issue resolved)

Files:
- extract_pages_from_csv.py: Extract pages from CSV (tested: 100 files)
- extract_signatures_hybrid.py: Hybrid extraction (current working solution)
- extract_handwriting.py: CV-only approach (component)
- extract_signatures_vlm.py: Deprecated VLM coordinate approach
- PROJECT_DOCUMENTATION.md: Complete project history and results
- SESSION_INIT.md: Session handoff documentation
- SESSION_CHECKLIST.md: Status checklist
- NEW_SESSION_PROMPT.txt: Template for next session
- HOW_TO_CONTINUE.txt: Visual handoff guide
- COMMIT_SUMMARY.md: Commit preparation guide
- README.md: Quick start guide
- README_page_extraction.md: Page extraction docs
- README_hybrid_extraction.md: Hybrid approach docs
- .gitignore: Exclude diagnostic scripts and outputs

Known Limitations:
- 30% of signatures missed due to conservative CV parameters
- Text layer method untested (all test PDFs are scanned images)
- Performance: ~24 seconds per PDF

Next Steps:
- Tune CV parameters for higher recall
- Test with larger dataset (100+ files)
- Process full dataset (86,073 files)

🤖 Generated with Claude Code
2025-10-26 23:39:52 +08:00

506 lines
15 KiB
Python

#!/usr/bin/env python3
"""
Script to extract signatures using VLM (Vision Language Model) guidance.
Uses Ollama instance with qwen2.5vl:32b for signature detection.
"""
import cv2
import numpy as np
import os
import sys
import json
import base64
import requests
from pathlib import Path
from datetime import datetime
import fitz # PyMuPDF
import csv
from io import BytesIO
# Configuration
# Directory that main() scans for "*.pdf" input files.
PDF_INPUT_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output"
# Directory that receives extracted signature images and the CSV log.
OUTPUT_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output/signatures"
# Directory that receives crops the VLM did not confirm as signatures.
REJECTED_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output/signatures/rejected"
# Path of the per-run CSV log; main() assigns a timestamped file name.
LOG_FILE = None # Will be set in main()
# Ollama Configuration
# Base URL of the Ollama server; API paths such as /api/generate are appended.
OLLAMA_URL = "http://192.168.30.36:11434"
# Name of the vision model requested in every generate call.
OLLAMA_MODEL = "qwen2.5vl:32b"
# Image processing parameters
DPI = 300 # Resolution for rendering PDF page
def encode_image_to_base64(image_array):
    """
    Encode an OpenCV image as a base64 JPEG string for the Ollama API.

    Args:
        image_array: Image as a numpy array in OpenCV's BGR channel order.

    Returns:
        str: Base64 text of the JPEG-encoded image.

    Raises:
        ValueError: If image_array is None or JPEG encoding fails.
    """
    if image_array is None:
        raise ValueError("Cannot encode image: image is None")
    # cv2.imencode expects BGR input, just like cv2.imwrite. Converting to RGB
    # first would swap red and blue in the JPEG the model receives, so the
    # array is encoded as-is.
    success, buffer = cv2.imencode('.jpg', image_array)
    if not success:
        raise ValueError("Cannot encode image: JPEG encoding failed")
    return base64.b64encode(buffer).decode('utf-8')
def call_ollama_vision(image_base64, prompt):
    """
    Send one base64 image and a text prompt to the Ollama generate endpoint.

    Returns a (text, error) pair. On success, text is the 'response' field of
    the JSON reply (empty string when the field is absent) and error is None.
    On any failure, text is None and error is the exception message.
    """
    try:
        endpoint = f"{OLLAMA_URL}/api/generate"
        request_body = dict(
            model=OLLAMA_MODEL,
            prompt=prompt,
            images=[image_base64],
            stream=False,
        )
        reply = requests.post(endpoint, json=request_body, timeout=120)
        # Turn HTTP error statuses into exceptions so they reach the handler.
        reply.raise_for_status()
        return reply.json().get('response', ''), None
    except Exception as exc:
        return None, str(exc)
def render_pdf_page_as_image(pdf_path, dpi=300):
    """
    Render the first page of a PDF as an OpenCV-format image.

    Args:
        pdf_path: Path of the PDF file to open.
        dpi: Rendering resolution; the page is scaled by dpi / 72.

    Returns:
        (image, width, height, error): on success image is a numpy array,
        width and height are the rendered pixmap size in pixels and error is
        None. On failure the result is (None, 0, 0, <error message>).
    """
    doc = None
    try:
        doc = fitz.open(pdf_path)
        page = doc[0]  # Only the first page is rendered
        zoom = dpi / 72
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
        # View the raw sample bytes as a (height, width, channels) array
        img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
        # Convert to the BGR layout OpenCV works with
        if pix.n == 3:  # RGB
            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
        elif pix.n == 1:  # Grayscale
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
        return img, pix.width, pix.height, None
    except Exception as e:
        return None, 0, 0, str(e)
    finally:
        # Close the document on every path, including failures after open
        if doc is not None:
            doc.close()
def parse_vlm_location_response(response_text, page_width, page_height):
    """
    Parse the VLM reply into pixel bounding boxes.

    Recognised entries look like
    "Signature N: left=X%, top=Y%, width=W%, height=H%". Matching ignores
    case and tolerates spaces around '=', ':' and before '%'. Entries whose
    numbers are malformed (for example "1.2.3") are skipped instead of
    raising.

    Args:
        response_text: Text returned by the model; None or empty yields [].
        page_width: Page width in pixels.
        page_height: Page height in pixels.

    Returns:
        list of bounding boxes [(x, y, w, h), ...] in pixel coordinates.
    """
    import re

    # A well-formed decimal: "10", "10.", "10.5" or ".5". The stricter form
    # guarantees float() below cannot fail on something like "1.2.3" or ".".
    number = r'((?:\d+(?:\.\d*)?|\.\d+))'
    pattern = (
        r'Signature\s+\d+\s*:\s*'
        rf'left\s*=\s*{number}\s*%\s*,?\s*'
        rf'top\s*=\s*{number}\s*%\s*,?\s*'
        rf'width\s*=\s*{number}\s*%\s*,?\s*'
        rf'height\s*=\s*{number}\s*%'
    )

    locations = []
    for match in re.findall(pattern, response_text or "", flags=re.IGNORECASE):
        left_pct, top_pct, width_pct, height_pct = (float(value) for value in match)
        # Convert percentages of the page size to pixel coordinates
        x = int(page_width * left_pct / 100)
        y = int(page_height * top_pct / 100)
        w = int(page_width * width_pct / 100)
        h = int(page_height * height_pct / 100)
        locations.append((x, y, w, h))
    print(f" - Parsed {len(locations)} signature location(s)")
    return locations
def check_pdf_has_image_at_location(pdf_path, bbox):
    """
    Check if the PDF has a SMALL image object that could be the signature.

    The images embedded in the first page are inspected and full-page scans
    are recognised by their size, but no image is selected yet: every path
    returns (False, None), so callers fall back to OpenCV cropping.

    Args:
        pdf_path: Path of the PDF file to inspect.
        bbox: (x, y, w, h) in pixel coordinates. Currently unused.

    Returns:
        (has_image: bool, image_xref: int or None), at present always
        (False, None).
    """
    doc = None
    try:
        doc = fitz.open(pdf_path)
        page = doc[0]
        # Get all images on the page
        image_list = page.get_images(full=True)
        if not image_list:
            return False, None
        # Page dimensions are in points (72 per inch)
        page_rect = page.rect
        page_width = page_rect.width
        page_height = page_rect.height
        for img_info in image_list:
            xref = img_info[0]
            try:
                base_image = doc.extract_image(xref)
                # Multiplying by 4 approximates points -> pixels at 300 DPI
                width_ratio = base_image["width"] / (page_width * 4)
                height_ratio = base_image["height"] / (page_height * 4)
            except Exception:
                # Image object could not be read or measured; skip it
                continue
            if width_ratio > 0.8 and height_ratio > 0.8:
                # Covers >80% of the page: a full-page scan, not a signature
                continue
            # This might be a small embedded image (actual signature scan).
            # For now OpenCV cropping is still used for consistency; this
            # logic can be refined later.
        # No suitable small images found, use OpenCV cropping
        return False, None
    except Exception as e:
        print(f"Error checking PDF images: {e}")
        return False, None
    finally:
        # Close the document on every path, including errors
        if doc is not None:
            doc.close()
def extract_pdf_image_object(pdf_path, xref, output_path):
    """
    Extract an embedded image object from a PDF and write it to disk.

    Args:
        pdf_path: Path of the PDF file.
        xref: Cross-reference number of the image object to extract.
        output_path: Destination path WITHOUT extension; the image's own
            extension is appended.

    Returns:
        (success: bool, error: str or None, output_file: str or None)
    """
    doc = None
    try:
        doc = fitz.open(pdf_path)
        base_image = doc.extract_image(xref)
        output_file = f"{output_path}.{base_image['ext']}"
        with open(output_file, "wb") as f:
            f.write(base_image["image"])
        return True, None, output_file
    except Exception as e:
        return False, str(e), None
    finally:
        # Close the document even when extraction or writing fails
        if doc is not None:
            doc.close()
def extract_region_with_opencv(image, bbox, output_path):
    """
    Crop a padded region from the page image and save it as a PNG.

    Args:
        image: Page image as a numpy array (rows, cols[, channels]).
        bbox: (x, y, w, h) in pixel coordinates.
        output_path: Destination path WITHOUT extension; ".png" is appended.

    Returns:
        (success: bool, error: str or None, output_file: str or None)
    """
    try:
        x, y, w, h = bbox
        # Add generous padding (50% of box size or minimum 50 pixels)
        # so the full signature is captured even if the VLM bbox is slightly off
        padding_x = max(50, int(w * 0.5))
        padding_y = max(50, int(h * 0.5))
        x_pad = max(0, x - padding_x)
        y_pad = max(0, y - padding_y)
        # Clamp to the image and never let the end fall before the start
        x_end = max(x_pad, min(image.shape[1], x + w + padding_x))
        y_end = max(y_pad, min(image.shape[0], y + h + padding_y))
        region = image[y_pad:y_end, x_pad:x_end]
        if region.size == 0:
            return False, f"Empty region for bbox {bbox}", None
        output_file = f"{output_path}.png"
        # cv2.imwrite signals failure through its return value, not an exception
        if not cv2.imwrite(output_file, region):
            return False, f"Failed to write {output_file}", None
        return True, None, output_file
    except Exception as e:
        return False, str(e), None
def verify_signature_with_vlm(image_path):
    """
    Ask the VLM whether an extracted image file contains a signature.

    Args:
        image_path: Path of the image file to check.

    Returns:
        (is_signature: bool, error: str or None). is_signature is True when
        the model's reply contains 'yes' (case-insensitive). On any failure
        the result is (False, <error message>).
    """
    try:
        image = cv2.imread(image_path)
        # cv2.imread returns None instead of raising when the file is
        # missing or unreadable
        if image is None:
            return False, f"Could not read image: {image_path}"
        image_base64 = encode_image_to_base64(image)
        prompt = "Is this a signature with a Chinese name? Answer only 'yes' or 'no'."
        response, error = call_ollama_vision(image_base64, prompt)
        if error:
            return False, error
        is_signature = 'yes' in response.lower()
        return is_signature, None
    except Exception as e:
        return False, str(e)
def process_pdf_page(pdf_path, output_dir):
    """
    Process a single PDF page to extract signatures using VLM.

    Workflow:
    1. Render the first page and ask the VLM to locate signatures
    2. Check if the PDF has image objects at those locations
    3. Extract via PDF object or OpenCV cropping
    4. VLM verifies each extracted region; regions it rejects are moved
       into REJECTED_PATH

    Args:
        pdf_path: Path of the PDF file to process.
        output_dir: Directory that receives the extracted signature images.

    Returns:
        (signature_count, extracted_files, error): error is None unless
        rendering or the location request failed.
    """
    pdf_name = Path(pdf_path).stem
    # Step 1: Render page as image
    print(" - Rendering page...", end='', flush=True)
    image, page_width, page_height, error = render_pdf_page_as_image(pdf_path, DPI)
    if error:
        print(" ERROR")
        return 0, [], f"Render error: {error}"
    print(" OK")
    # Step 2: Encode image and ask VLM to locate signatures
    print(" - Asking VLM to locate signatures...", end='', flush=True)
    image_base64 = encode_image_to_base64(image)
    location_prompt = """Please analyze this document page and locate ONLY handwritten signatures with Chinese names.
IMPORTANT: Only mark areas with ACTUAL handwritten pen/ink signatures.
Do NOT mark:
- Printed text or typed names
- Dates or reference numbers
- Form field labels or instructions
- Underlines or signature lines (empty boxes)
- Stamps or seals
Look for actual handwritten Chinese characters that are signatures.
For each HANDWRITTEN signature found, provide the location as percentages from the top-left corner:
- Distance from left edge (% of page width)
- Distance from top edge (% of page height)
- Width (% of page width)
- Height (% of page height)
Format your response as:
Signature 1: left=X%, top=Y%, width=W%, height=H%
Signature 2: left=X%, top=Y%, width=W%, height=H%
If no handwritten signatures found, say "No signatures found"."""
    response, error = call_ollama_vision(image_base64, location_prompt)
    if error:
        print(" ERROR")
        return 0, [], f"VLM error: {error}"
    print(" OK")
    print(f" - VLM Response:\n{response}")
    # Step 3: Parse locations out of the VLM reply
    locations = parse_vlm_location_response(response, page_width, page_height)
    if not locations:
        print(" - No signatures located by VLM")
        return 0, [], None
    # Step 4: Extract each located signature
    extracted_files = []
    for number, bbox in enumerate(locations, start=1):
        print(f" - Extracting signature {number}...", end='', flush=True)
        # Check if PDF has image object
        has_image, xref = check_pdf_has_image_at_location(pdf_path, bbox)
        output_base = os.path.join(output_dir, f"{pdf_name}_signature_{number}")
        if has_image and xref:
            # Extract PDF image object
            success, error, output_file = extract_pdf_image_object(pdf_path, xref, output_base)
        else:
            # Extract with OpenCV
            success, error, output_file = extract_region_with_opencv(image, bbox, output_base)
        if not success:
            print(f" FAILED: {error}")
            continue
        print(" OK")
        # Step 5: Verify with VLM
        print(f" - Verifying signature {number}...", end='', flush=True)
        is_signature, verify_error = verify_signature_with_vlm(output_file)
        if verify_error:
            # The crop stays in output_dir but is not reported as extracted
            print(f" ERROR: {verify_error}")
            continue
        if is_signature:
            print(" VERIFIED")
            extracted_files.append(output_file)
            continue
        print(" NOT A SIGNATURE - moved to rejected/")
        # Move to rejected folder instead of deleting. A failed move must not
        # abort the remaining signatures or the rest of the batch.
        rejected_file = os.path.join(REJECTED_PATH, os.path.basename(output_file))
        try:
            os.makedirs(REJECTED_PATH, exist_ok=True)
            # os.replace overwrites an existing destination on every platform
            os.replace(output_file, rejected_file)
        except OSError as move_error:
            print(f" WARNING: could not move {output_file} to rejected/: {move_error}")
    return len(extracted_files), extracted_files, None
def main():
    """
    Run VLM-guided signature extraction over the first five input PDFs.

    Checks that the Ollama server is reachable, creates the output and
    rejected directories, processes each PDF with process_pdf_page, writes
    one CSV row per PDF to a timestamped log file and prints a summary.
    Returns early (without raising) when Ollama is unreachable or no PDFs
    are found.
    """
    global LOG_FILE
    print(f"Starting VLM-guided signature extraction...")
    print(f"Ollama URL: {OLLAMA_URL}")
    print(f"Model: {OLLAMA_MODEL}")
    print(f"Input path: {PDF_INPUT_PATH}")
    print(f"Output path: {OUTPUT_PATH}")
    print()
    # Test Ollama connection
    print("Testing Ollama connection...")
    try:
        response = requests.get(f"{OLLAMA_URL}/api/tags", timeout=5)
        response.raise_for_status()
        print("✓ Ollama connection successful\n")
    except Exception as e:
        print(f"✗ Ollama connection failed: {e}")
        print(f"Please check that Ollama is running at {OLLAMA_URL}")
        return
    # Create output directories
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    os.makedirs(REJECTED_PATH, exist_ok=True)
    LOG_FILE = os.path.join(OUTPUT_PATH, f"vlm_extraction_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")
    # Get PDF files
    pdf_files = sorted(Path(PDF_INPUT_PATH).glob("*.pdf"))[:5]  # Test with first 5 files
    if not pdf_files:
        print("ERROR: No PDF files found!")
        return
    print(f"Found {len(pdf_files)} PDF files to process (testing with first 5)\n")
    # Statistics
    stats = {
        'total_pdfs': 0,
        'pdfs_with_signatures': 0,
        'total_signatures': 0,
        'errors': 0
    }
    # Open log file. The encoding is explicit so non-ASCII file names can be
    # written regardless of the platform's default locale encoding.
    with open(LOG_FILE, 'w', newline='', encoding='utf-8') as log_file:
        log_writer = csv.writer(log_file)
        log_writer.writerow([
            'pdf_filename', 'signatures_found', 'extracted_files', 'error'
        ])
        # Process each PDF
        for i, pdf_path in enumerate(pdf_files):
            stats['total_pdfs'] += 1
            pdf_filename = pdf_path.name
            print(f"[{i+1}/{len(pdf_files)}] Processing: {pdf_filename}")
            # Extract signatures
            sig_count, extracted_files, error = process_pdf_page(str(pdf_path), OUTPUT_PATH)
            if error:
                print(f" ERROR: {error}\n")
                stats['errors'] += 1
                log_writer.writerow([pdf_filename, 0, "", error])
                continue
            if sig_count > 0:
                stats['pdfs_with_signatures'] += 1
                stats['total_signatures'] += sig_count
                print(f" ✓ Extracted {sig_count} signature(s)\n")
                filenames = [Path(f).name for f in extracted_files]
                log_writer.writerow([
                    pdf_filename,
                    sig_count,
                    ", ".join(filenames),
                    ""
                ])
            else:
                print(f" No signatures extracted\n")
                log_writer.writerow([pdf_filename, 0, "", ""])
    # Print summary
    print("="*60)
    print("VLM EXTRACTION SUMMARY")
    print("="*60)
    print(f"Total PDFs processed: {stats['total_pdfs']}")
    print(f"PDFs with signatures: {stats['pdfs_with_signatures']}")
    print(f"Total signatures extracted: {stats['total_signatures']}")
    print(f"Errors: {stats['errors']}")
    print(f"\nLog file: {LOG_FILE}")
    print("="*60)
if __name__ == "__main__":
    # Script entry point: run main() and turn an interrupt or any uncaught
    # error into exit status 1.
    try:
        main()
    except KeyboardInterrupt:
        print("\n\nProcess interrupted by user.")
        sys.exit(1)
    except Exception as fatal:
        print(f"\n\nFATAL ERROR: {fatal}")
        # Imported lazily; only needed on the failure path
        import traceback
        traceback.print_exc()
        sys.exit(1)