Add hybrid signature extraction with name-based verification
Implement VLM name extraction + CV detection hybrid approach to
replace unreliable VLM coordinate system with name-based verification.
Key Features:
- VLM extracts signature names (周寶蓮, 魏興海, etc.)
- CV or PDF text layer detects regions
- VLM verifies each region against expected names
- Signatures saved with person names: signature_周寶蓮.png
- Duplicate prevention and rejection handling
Test Results:
- 5 PDF pages tested
- 7/10 signatures extracted (70% recall)
- 100% precision (no false positives)
- No blank regions extracted (previous issue resolved)
Files:
- extract_pages_from_csv.py: Extract pages from CSV (tested: 100 files)
- extract_signatures_hybrid.py: Hybrid extraction (current working solution)
- extract_handwriting.py: CV-only approach (component)
- extract_signatures_vlm.py: Deprecated VLM coordinate approach
- PROJECT_DOCUMENTATION.md: Complete project history and results
- SESSION_INIT.md: Session handoff documentation
- SESSION_CHECKLIST.md: Status checklist
- NEW_SESSION_PROMPT.txt: Template for next session
- HOW_TO_CONTINUE.txt: Visual handoff guide
- COMMIT_SUMMARY.md: Commit preparation guide
- README.md: Quick start guide
- README_page_extraction.md: Page extraction docs
- README_hybrid_extraction.md: Hybrid approach docs
- .gitignore: Exclude diagnostic scripts and outputs
Known Limitations:
- 30% of signatures missed due to conservative CV parameters
- Text layer method untested (all test PDFs are scanned images)
- Performance: ~24 seconds per PDF
Next Steps:
- Tune CV parameters for higher recall
- Test with larger dataset (100+ files)
- Process full dataset (86,073 files)
🤖 Generated with Claude Code
This commit is contained in:
505
extract_signatures_vlm.py
Normal file
505
extract_signatures_vlm.py
Normal file
@@ -0,0 +1,505 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Script to extract signatures using VLM (Vision Language Model) guidance.
|
||||
Uses Ollama instance with qwen2.5vl:32b for signature detection.
|
||||
"""
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import base64
|
||||
import requests
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
import fitz # PyMuPDF
|
||||
import csv
|
||||
from io import BytesIO
|
||||
|
||||
# Configuration
# Directory that main() scans for "*.pdf" input files.
PDF_INPUT_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output"
# Directory that receives the extracted signature images and the CSV log.
OUTPUT_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output/signatures"
# Directory that receives crops the VLM verification step answered "no" for.
REJECTED_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output/signatures/rejected"
LOG_FILE = None  # Will be set in main() to a timestamped CSV path under OUTPUT_PATH

# Ollama Configuration
# Base URL of the Ollama server; "/api/generate" and "/api/tags" are appended to it.
OLLAMA_URL = "http://192.168.30.36:11434"
# Model name sent in the "model" field of every generate request.
OLLAMA_MODEL = "qwen2.5vl:32b"

# Image processing parameters
DPI = 300  # Resolution for rendering PDF page (page is scaled by DPI / 72)
|
||||
|
||||
|
||||
def encode_image_to_base64(image_array):
    """
    Encode a numpy image array as a base64 JPEG string for the Ollama API.

    Args:
        image_array: Image in OpenCV layout (BGR channel order for colour
            images), e.g. the array returned by render_pdf_page_as_image()
            or cv2.imread().

    Returns:
        str: Base64 text of the JPEG-encoded image.

    Raises:
        ValueError: If image_array is None or the JPEG encoding fails.
    """
    if image_array is None:
        raise ValueError("Cannot encode image: image_array is None")

    # cv2.imencode() interprets colour input as BGR, which is already the
    # layout used throughout this script.  Converting to RGB first would
    # produce a JPEG with the red and blue channels swapped, so the array is
    # encoded as-is.
    success, buffer = cv2.imencode('.jpg', image_array)
    if not success:
        raise ValueError("Cannot encode image: JPEG encoding failed")

    return base64.b64encode(buffer).decode('utf-8')
|
||||
|
||||
|
||||
def call_ollama_vision(image_base64, prompt):
    """
    Call the Ollama vision model with one image and a prompt.

    Args:
        image_base64: Base64-encoded image, sent as the only entry of the
            request's "images" list.
        prompt: Text prompt for the model.

    Returns:
        tuple: (response_text, None) on success, or (None, error_message) on
        any failure.  error_message is always a non-empty string so callers
        can rely on a simple ``if error:`` check.
    """
    try:
        url = f"{OLLAMA_URL}/api/generate"

        payload = {
            "model": OLLAMA_MODEL,
            "prompt": prompt,
            "images": [image_base64],
            "stream": False
        }

        response = requests.post(url, json=payload, timeout=120)
        response.raise_for_status()

        result = response.json()
        # A missing or null "response" field is normalised to an empty string
        # so callers can safely call string methods on the result.
        return result.get('response') or '', None

    except Exception as e:
        # Some exceptions stringify to ''; fall back to the class name so the
        # failure is never mistaken for success by a truthiness check.
        return None, str(e) or type(e).__name__
|
||||
|
||||
|
||||
def render_pdf_page_as_image(pdf_path, dpi=300):
    """
    Render the first page of a PDF as an image.

    Args:
        pdf_path: Path of the PDF file to open.
        dpi: Rendering resolution; the page is scaled by dpi / 72.

    Returns:
        tuple: (image, width, height, error).  On success, image is a numpy
        array (converted to BGR when the pixmap has 3 or 1 channels), width
        and height are the pixmap's pixel dimensions, and error is None.
        On failure the result is (None, 0, 0, error_message).
    """
    try:
        # The context manager closes the document even when rendering raises
        # (e.g. a PDF without pages), which the explicit close() did not.
        with fitz.open(pdf_path) as doc:
            page = doc[0]  # Get first page

            # Render at high DPI
            zoom = dpi / 72
            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)

            # Convert to numpy array
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
                pix.height, pix.width, pix.n
            )

            # Convert to BGR for OpenCV
            if pix.n == 3:  # RGB
                img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
            elif pix.n == 1:  # Grayscale
                img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)

            width, height = pix.width, pix.height

        return img, width, height, None

    except Exception as e:
        return None, 0, 0, str(e)
|
||||
|
||||
|
||||
def parse_vlm_location_response(response_text, page_width, page_height):
    """
    Parse the VLM response to extract signature locations.

    The VLM is expected to answer with lines of the form
    "Signature N: left=X%, top=Y%, width=W%, height=H%".  Matching is
    case-insensitive and tolerates spaces around "=" and before "%".
    Entries whose numbers are malformed (e.g. "1.2.3") are skipped instead of
    raising.

    Args:
        response_text: Raw text returned by the VLM.
        page_width: Page width in pixels, used to scale left/width.
        page_height: Page height in pixels, used to scale top/height.

    Returns:
        list: Bounding boxes [(x, y, w, h), ...] in pixel coordinates, in the
        order they appear in the response.
    """
    import re

    # A plain decimal number: "12", "12.5", "12." or ".5".
    number = r'(\d+(?:\.\d*)?|\.\d+)'
    pattern = re.compile(
        r'Signature\s+\d+:'
        r'\s*left\s*=\s*' + number + r'\s*%,?'
        r'\s*top\s*=\s*' + number + r'\s*%,?'
        r'\s*width\s*=\s*' + number + r'\s*%,?'
        r'\s*height\s*=\s*' + number + r'\s*%',
        re.IGNORECASE,
    )

    locations = []

    for match in pattern.finditer(response_text or ''):
        left_pct, top_pct, width_pct, height_pct = (float(g) for g in match.groups())

        # Convert percentages to pixel coordinates
        x = int(page_width * left_pct / 100)
        y = int(page_height * top_pct / 100)
        w = int(page_width * width_pct / 100)
        h = int(page_height * height_pct / 100)

        locations.append((x, y, w, h))

    print(f" - Parsed {len(locations)} signature location(s)")

    return locations
|
||||
|
||||
|
||||
def check_pdf_has_image_at_location(pdf_path, bbox):
    """
    Check if the PDF has a SMALL image object at the specified location.

    Full-page scans are skipped so the caller falls back to OpenCV cropping.
    NOTE: in its current form the function never selects an image -- every
    path returns (False, None) -- and bbox is not consulted.  The per-image
    inspection is kept as the hook for refining this logic later.

    Args:
        pdf_path: Path of the PDF file to open.
        bbox: (x, y, w, h) in pixel coordinates (currently unused).

    Returns:
        tuple: (has_image: bool, image_xref: int or None)
    """
    try:
        # The context manager closes the document on every exit path,
        # including exceptions.
        with fitz.open(pdf_path) as doc:
            page = doc[0]

            # Get all images on the page
            image_list = page.get_images(full=True)

            if not image_list:
                return False, None

            # Get page dimensions (in points, 72 DPI)
            page_rect = page.rect
            page_width = page_rect.width
            page_height = page_rect.height

            # Check each image
            for img_info in image_list:
                xref = img_info[0]

                # Get image dimensions
                try:
                    base_image = doc.extract_image(xref)
                    img_width = base_image["width"]
                    img_height = base_image["height"]

                    # Check if this is a full-page scan
                    # If image is close to page size, it's a scanned page, not a signature
                    width_ratio = img_width / (page_width * 4)  # Approx conversion to pixels at 300 DPI
                    height_ratio = img_height / (page_height * 4)

                    # If image covers >80% of page, it's a full-page scan
                    if width_ratio > 0.8 and height_ratio > 0.8:
                        # This is a full-page scan, don't extract it
                        # Fall back to OpenCV cropping
                        continue

                    # This might be a small embedded image (actual signature scan)
                    # For now, we'll still use OpenCV cropping for consistency
                    # but this logic can be refined later

                except Exception:
                    # Skip images that cannot be inspected; unlike a bare
                    # "except:", this lets KeyboardInterrupt/SystemExit through.
                    continue

        # No suitable small images found, use OpenCV cropping
        return False, None

    except Exception as e:
        print(f"Error checking PDF images: {e}")
        return False, None
|
||||
|
||||
|
||||
def extract_pdf_image_object(pdf_path, xref, output_path):
    """
    Extract an embedded image object from a PDF and save it to disk.

    Args:
        pdf_path: Path of the PDF file to open.
        xref: Image reference passed to doc.extract_image().
        output_path: Output file path WITHOUT extension; the extension
            reported for the extracted image is appended.

    Returns:
        tuple: (success, error, output_file).  On success this is
        (True, None, path_written); on failure (False, error_message, None).
    """
    try:
        # Close the document even if extraction raises.
        with fitz.open(pdf_path) as doc:
            base_image = doc.extract_image(xref)

        image_bytes = base_image["image"]
        image_ext = base_image["ext"]

        # Save image
        output_file = f"{output_path}.{image_ext}"
        with open(output_file, "wb") as f:
            f.write(image_bytes)

        return True, None, output_file

    except Exception as e:
        return False, str(e), None
|
||||
|
||||
|
||||
def extract_region_with_opencv(image, bbox, output_path):
    """
    Crop a region from an image with generous padding and save it as PNG.

    Args:
        image: Source image as a numpy array (rows, columns[, channels]).
        bbox: (x, y, w, h) in pixel coordinates of `image`.
        output_path: Output file path WITHOUT extension; ".png" is appended.

    Returns:
        tuple: (success, error, output_file).  On success this is
        (True, None, path_written); on failure (False, error_message, None).
    """
    try:
        x, y, w, h = bbox

        # Add generous padding (50% of box size or minimum 50 pixels)
        # This ensures we capture the full signature even if VLM bbox is slightly off
        padding_x = max(50, int(w * 0.5))  # 50% padding on sides
        padding_y = max(50, int(h * 0.5))  # 50% padding on top/bottom

        # Clamp the padded box to the image bounds.
        x_pad = max(0, x - padding_x)
        y_pad = max(0, y - padding_y)
        x_end = min(image.shape[1], x + w + padding_x)
        y_end = min(image.shape[0], y + h + padding_y)

        # A box that lies entirely outside the image leaves nothing to crop;
        # report it explicitly instead of slicing with a negative extent.
        if x_end <= x_pad or y_end <= y_pad:
            return False, f"Region {bbox} lies outside the image bounds", None

        # Extract region
        region = image[y_pad:y_end, x_pad:x_end]

        # Save; cv2.imwrite() signals failure through its return value.
        output_file = f"{output_path}.png"
        if not cv2.imwrite(output_file, region):
            return False, f"Failed to write image file: {output_file}", None

        return True, None, output_file

    except Exception as e:
        return False, str(e), None
|
||||
|
||||
|
||||
def verify_signature_with_vlm(image_path):
    """
    Ask the VLM whether a saved crop contains a signature.

    The image at image_path is loaded, base64-encoded and sent with a yes/no
    question; the crop counts as a signature when the answer contains "yes"
    (case-insensitive).

    Returns: (is_signature: bool, error: str or None)
    """
    question = "Is this a signature with a Chinese name? Answer only 'yes' or 'no'."

    try:
        # Load the crop and prepare it for the API call
        encoded = encode_image_to_base64(cv2.imread(image_path))

        answer, vlm_error = call_ollama_vision(encoded, question)
        if vlm_error:
            return False, vlm_error

        return 'yes' in answer.lower(), None

    except Exception as exc:
        return False, str(exc)
|
||||
|
||||
|
||||
def process_pdf_page(pdf_path, output_dir):
    """
    Process a single PDF page to extract signatures using VLM.

    Workflow:
    1. Render the first page as an image
    2. VLM locates signatures (percentage coordinates)
    3. Parse the VLM answer into pixel bounding boxes
    4. Extract each box via PDF image object or OpenCV cropping
    5. VLM verifies each extracted region; rejected crops are moved to
       REJECTED_PATH

    Args:
        pdf_path: Path of the PDF file to process.
        output_dir: Directory that receives the extracted signature images.

    Returns: (signature_count, extracted_files, error)
        error is None unless rendering or the location request failed.
    """
    pdf_name = Path(pdf_path).stem

    # Step 1: Render page as image
    print(" - Rendering page...", end='', flush=True)
    image, page_width, page_height, error = render_pdf_page_as_image(pdf_path, DPI)

    if error:
        print(" ERROR")
        return 0, [], f"Render error: {error}"

    print(" OK")

    # Step 2: Encode image and ask VLM to locate signatures
    print(" - Asking VLM to locate signatures...", end='', flush=True)
    image_base64 = encode_image_to_base64(image)

    location_prompt = """Please analyze this document page and locate ONLY handwritten signatures with Chinese names.

IMPORTANT: Only mark areas with ACTUAL handwritten pen/ink signatures.
Do NOT mark:
- Printed text or typed names
- Dates or reference numbers
- Form field labels or instructions
- Underlines or signature lines (empty boxes)
- Stamps or seals

Look for actual handwritten Chinese characters that are signatures.

For each HANDWRITTEN signature found, provide the location as percentages from the top-left corner:
- Distance from left edge (% of page width)
- Distance from top edge (% of page height)
- Width (% of page width)
- Height (% of page height)

Format your response as:
Signature 1: left=X%, top=Y%, width=W%, height=H%
Signature 2: left=X%, top=Y%, width=W%, height=H%

If no handwritten signatures found, say "No signatures found"."""

    response, error = call_ollama_vision(image_base64, location_prompt)

    if error:
        print(" ERROR")
        return 0, [], f"VLM error: {error}"

    print(" OK")
    print(f" - VLM Response:\n{response}")

    # Step 3: Parse locations
    locations = parse_vlm_location_response(response, page_width, page_height)

    if not locations:
        print(" - No signatures located by VLM")
        return 0, [], None

    # Step 4: Extract each located signature
    extracted_files = []

    for idx, bbox in enumerate(locations):
        print(f" - Extracting signature {idx + 1}...", end='', flush=True)

        # Check if PDF has image object
        has_image, xref = check_pdf_has_image_at_location(pdf_path, bbox)

        output_base = os.path.join(output_dir, f"{pdf_name}_signature_{idx + 1}")

        if has_image and xref:
            # Extract PDF image object
            success, error, output_file = extract_pdf_image_object(pdf_path, xref, output_base)
        else:
            # Extract with OpenCV
            success, error, output_file = extract_region_with_opencv(image, bbox, output_base)

        if not success:
            print(f" FAILED: {error}")
            continue

        print(" OK")

        # Step 5: Verify with VLM
        print(f" - Verifying signature {idx + 1}...", end='', flush=True)
        is_signature, verify_error = verify_signature_with_vlm(output_file)

        if verify_error:
            print(f" ERROR: {verify_error}")
            continue

        if is_signature:
            print(" VERIFIED")
            extracted_files.append(output_file)
            continue

        # Move to rejected folder instead of deleting.  A failed move must
        # not abort the remaining signatures (or the whole batch), so it is
        # reported and processing continues.
        rejected_file = os.path.join(REJECTED_PATH, os.path.basename(output_file))
        try:
            os.makedirs(REJECTED_PATH, exist_ok=True)
            # os.replace() overwrites a crop left over from an earlier run.
            os.replace(output_file, rejected_file)
            print(" NOT A SIGNATURE - moved to rejected/")
        except OSError as move_error:
            print(f" NOT A SIGNATURE - could not move to rejected/: {move_error}")

    return len(extracted_files), extracted_files, None
|
||||
|
||||
|
||||
def main():
    """
    Main processing function.

    Checks that the Ollama server is reachable, creates the output
    directories, processes the first 5 PDFs found in PDF_INPUT_PATH and
    writes one CSV log row per PDF.  Sets the module-level LOG_FILE to the
    path of the log that was written.  Returns None.
    """
    global LOG_FILE

    print("Starting VLM-guided signature extraction...")
    print(f"Ollama URL: {OLLAMA_URL}")
    print(f"Model: {OLLAMA_MODEL}")
    print(f"Input path: {PDF_INPUT_PATH}")
    print(f"Output path: {OUTPUT_PATH}")
    print()

    # Test Ollama connection
    print("Testing Ollama connection...")
    try:
        response = requests.get(f"{OLLAMA_URL}/api/tags", timeout=5)
        response.raise_for_status()
        print("✓ Ollama connection successful\n")
    except Exception as e:
        print(f"✗ Ollama connection failed: {e}")
        print(f"Please check that Ollama is running at {OLLAMA_URL}")
        return

    # Create output directories
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    os.makedirs(REJECTED_PATH, exist_ok=True)

    LOG_FILE = os.path.join(OUTPUT_PATH, f"vlm_extraction_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")

    # Get PDF files
    pdf_files = sorted(Path(PDF_INPUT_PATH).glob("*.pdf"))[:5]  # Test with first 5 files

    if not pdf_files:
        print("ERROR: No PDF files found!")
        return

    print(f"Found {len(pdf_files)} PDF files to process (testing with first 5)\n")

    # Statistics
    stats = {
        'total_pdfs': 0,
        'pdfs_with_signatures': 0,
        'total_signatures': 0,
        'errors': 0
    }

    # Open log file.  The encoding is pinned to UTF-8 because PDF and output
    # file names may contain non-ASCII characters, which the platform default
    # encoding cannot always represent.
    with open(LOG_FILE, 'w', newline='', encoding='utf-8') as log_file:
        log_writer = csv.writer(log_file)
        log_writer.writerow([
            'pdf_filename', 'signatures_found', 'extracted_files', 'error'
        ])

        # Process each PDF
        for i, pdf_path in enumerate(pdf_files):
            stats['total_pdfs'] += 1
            pdf_filename = pdf_path.name

            print(f"[{i+1}/{len(pdf_files)}] Processing: {pdf_filename}")

            # Extract signatures
            sig_count, extracted_files, error = process_pdf_page(str(pdf_path), OUTPUT_PATH)

            if error:
                print(f" ERROR: {error}\n")
                stats['errors'] += 1
                log_writer.writerow([pdf_filename, 0, "", error])
                continue

            if sig_count > 0:
                stats['pdfs_with_signatures'] += 1
                stats['total_signatures'] += sig_count
                print(f" ✓ Extracted {sig_count} signature(s)\n")

                filenames = [Path(f).name for f in extracted_files]
                log_writer.writerow([
                    pdf_filename,
                    sig_count,
                    ", ".join(filenames),
                    ""
                ])
            else:
                print(" No signatures extracted\n")
                log_writer.writerow([pdf_filename, 0, "", ""])

    # Print summary
    print("="*60)
    print("VLM EXTRACTION SUMMARY")
    print("="*60)
    print(f"Total PDFs processed: {stats['total_pdfs']}")
    print(f"PDFs with signatures: {stats['pdfs_with_signatures']}")
    print(f"Total signatures extracted: {stats['total_signatures']}")
    print(f"Errors: {stats['errors']}")
    print(f"\nLog file: {LOG_FILE}")
    print("="*60)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the batch and turn an interrupt or an
    # unexpected failure into exit status 1.
    try:
        main()
    except KeyboardInterrupt:
        print("\n\nProcess interrupted by user.")
        raise SystemExit(1)
    except Exception as fatal:
        import traceback

        print(f"\n\nFATAL ERROR: {fatal}")
        traceback.print_exc()
        raise SystemExit(1)
|
||||
Reference in New Issue
Block a user