Implement a hybrid approach (VLM name extraction plus CV detection) that
replaces the unreliable VLM coordinate system with name-based verification.
Key Features:
- VLM extracts signature names (周寶蓮, 魏興海, etc.)
- CV or PDF text layer detects regions
- VLM verifies each region against expected names
- Signatures saved with person names: signature_周寶蓮.png
- Duplicate prevention and rejection handling
Test Results:
- 5 PDF pages tested
- 7/10 signatures extracted (70% recall)
- 100% precision (no false positives)
- No blank regions extracted (previous issue resolved)
Files:
- extract_pages_from_csv.py: Extract pages from CSV (tested: 100 files)
- extract_signatures_hybrid.py: Hybrid extraction (current working solution)
- extract_handwriting.py: CV-only approach (component)
- extract_signatures_vlm.py: Deprecated VLM coordinate approach
- PROJECT_DOCUMENTATION.md: Complete project history and results
- SESSION_INIT.md: Session handoff documentation
- SESSION_CHECKLIST.md: Status checklist
- NEW_SESSION_PROMPT.txt: Template for next session
- HOW_TO_CONTINUE.txt: Visual handoff guide
- COMMIT_SUMMARY.md: Commit preparation guide
- README.md: Quick start guide
- README_page_extraction.md: Page extraction docs
- README_hybrid_extraction.md: Hybrid approach docs
- .gitignore: Exclude diagnostic scripts and outputs
Known Limitations:
- 30% of signatures missed due to conservative CV parameters
- Text layer method untested (all test PDFs are scanned images)
- Performance: ~24 seconds per PDF
Next Steps:
- Tune CV parameters for higher recall
- Test with larger dataset (100+ files)
- Process full dataset (86,073 files)
🤖 Generated with Claude Code
297 lines
9.4 KiB
Python
297 lines
9.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Script to detect and extract handwritten regions from PDF pages.
|
|
Uses computer vision to identify handwriting, not PDF image objects.
|
|
"""
|
|
|
|
import cv2
|
|
import numpy as np
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
import fitz # PyMuPDF
|
|
import csv
|
|
|
|
# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
# Directory that main() scans for "*.pdf" files to process.
PDF_INPUT_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output"
# Directory that receives the cropped region PNGs and the CSV run log.
OUTPUT_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output/handwriting"
# Path of the CSV log for the current run; main() assigns it at start-up.
LOG_FILE = None

# ---------------------------------------------------------------------------
# Image processing parameters
# ---------------------------------------------------------------------------
# Resolution (dots per inch) used when rendering a PDF page to an image.
DPI = 300
# Contours whose area (in pixels) is not strictly above this are dropped as noise.
MIN_CONTOUR_AREA = 100
# Contours whose area is not strictly below this are dropped as large
# background elements.
MAX_CONTOUR_AREA = 500_000
|
|
|
|
|
|
def render_pdf_page_as_image(pdf_path, dpi=300):
    """
    Render the first page of a PDF as a high-resolution image.

    Args:
        pdf_path: Path of the PDF file to open.
        dpi: Rendering resolution; the page is scaled by ``dpi / 72``.

    Returns: ``(image, None)`` on success, where ``image`` is a numpy uint8
    array (converted to BGR when the pixmap has 1 or 3 channels), or
    ``(None, error_message)`` when anything goes wrong. No exception is
    raised for rendering failures.
    """
    doc = None
    try:
        doc = fitz.open(pdf_path)
        page = doc[0]  # Only the first page is rendered

        # Render at high DPI for better detection (72 DPI is the PDF default)
        scale = dpi / 72
        pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False)

        # Convert the raw sample buffer to a (height, width, channels) array
        img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
            pix.height, pix.width, pix.n
        )

        # OpenCV expects BGR channel order
        if pix.n == 3:  # RGB
            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
        elif pix.n == 1:  # Grayscale
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)

        return img, None

    except Exception as e:
        return None, str(e)

    finally:
        # Close the document on every path; previously a failure after
        # fitz.open() left the document (and its file handle) open.
        if doc is not None:
            doc.close()
|
|
|
|
|
|
def detect_handwriting_regions(image):
    """
    Locate candidate handwritten regions in a page image.

    The image is converted from BGR to grayscale, binarised with an inverted
    Otsu threshold (dark ink becomes foreground), dilated so that nearby
    strokes fuse into blobs, and the external contours of those blobs are
    filtered by area, size and aspect ratio.

    Returns: list of bounding boxes [(x, y, w, h), ...]
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Otsu picks the threshold automatically; inversion makes ink white
    threshold_mode = cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    _, ink_mask = cv2.threshold(grayscale, 0, 255, threshold_mode)

    # A wide, short kernel joins strokes that sit next to each other
    # horizontally, grouping pen strokes into signature-sized regions
    stroke_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 5))
    joined = cv2.dilate(ink_mask, stroke_kernel, iterations=2)

    outlines, _ = cv2.findContours(
        joined, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    candidates = []
    for outline in outlines:
        # Discard noise and very large regions
        if not (MIN_CONTOUR_AREA < cv2.contourArea(outline) < MAX_CONTOUR_AREA):
            continue

        x, y, w, h = cv2.boundingRect(outline)

        # Discard boxes that are too small in either dimension
        if w <= 20 or h <= 20:
            continue

        # Discard extreme shapes (very thin lines in either direction)
        width_to_height = w / float(h)
        if 0.1 < width_to_height < 20:
            candidates.append((x, y, w, h))

    return candidates
|
|
|
|
|
|
def merge_overlapping_boxes(boxes, merge_threshold=50):
    """
    Merge bounding boxes that are close to each other.

    This helps combine signature parts that were detected separately.

    Two boxes (the one with the smaller x first) are considered close when
    the second starts no further than ``merge_threshold`` pixels to the right
    of the first one's right edge and their top edges differ by less than
    ``2 * merge_threshold`` pixels.

    Each box is compared against every cluster built so far, not only the
    most recent one, so an unrelated box that happens to sort between two
    close boxes (for example one much further down the page) no longer
    prevents them from being merged.

    Args:
        boxes: Iterable of ``(x, y, w, h)`` boxes.
        merge_threshold: Proximity threshold in pixels.

    Returns: list of merged ``(x, y, w, h)`` tuples ordered by x-coordinate.
    """
    if not boxes:
        return []

    clusters = []
    for box in sorted(boxes, key=lambda b: b[0]):
        candidate = tuple(box)

        # Absorbing a cluster enlarges the candidate, which may bring it
        # within range of clusters it did not touch before; rescan until a
        # full pass absorbs nothing.
        absorbed = True
        while absorbed:
            absorbed = False
            for index, cluster in enumerate(clusters):
                if cluster[0] <= candidate[0]:
                    left, right = cluster, candidate
                else:
                    left, right = candidate, cluster
                if _boxes_are_close(left, right, merge_threshold):
                    candidate = _union_box(cluster, candidate)
                    del clusters[index]
                    absorbed = True
                    break

        clusters.append(candidate)

    # Stable sort keeps input order for clusters that share an x-coordinate
    return sorted(clusters, key=lambda b: b[0])


def _boxes_are_close(left, right, merge_threshold):
    """Return True when ``right`` (the box with the larger x) is near ``left``."""
    return (right[0] <= left[0] + left[2] + merge_threshold and
            abs(right[1] - left[1]) < merge_threshold * 2)


def _union_box(a, b):
    """Return the smallest ``(x, y, w, h)`` box that contains both boxes."""
    x0 = min(a[0], b[0])
    y0 = min(a[1], b[1])
    x1 = max(a[0] + a[2], b[0] + b[2])
    y1 = max(a[1] + a[3], b[1] + b[3])
    return (x0, y0, x1 - x0, y1 - y0)
|
|
|
|
|
|
def extract_handwriting_regions(pdf_path, output_dir, dpi=300):
    """
    Extract handwritten regions from a PDF page and save them as PNG files.

    The page is rendered, candidate regions are detected and merged, and each
    merged region is cropped with 10 pixels of padding (clamped to the image
    bounds) and written to ``output_dir`` as
    ``<pdf stem>_handwriting_<NN>.png``.

    Args:
        pdf_path: Path of the PDF to process.
        output_dir: Directory for the cropped images (created if missing).
        dpi: Rendering resolution passed to the page renderer.

    Returns: (success_count, total_regions, region_info, error)
        success_count: number of region images actually written.
        total_regions: number of merged regions detected.
        region_info: one dict per written image with 'filename', 'bbox'
            (x, y, w, h of the padded crop) and 'area'.
        error: None on success (including "nothing detected"), otherwise a
            message string. Exceptions are reported here, not raised.
    """
    try:
        # Render PDF as image
        image, error = render_pdf_page_as_image(pdf_path, dpi)
        if error:
            return 0, 0, [], f"Rendering error: {error}"

        if image is None:
            return 0, 0, [], "Failed to render PDF"

        # Detect handwriting regions
        boxes = detect_handwriting_regions(image)

        if not boxes:
            return 0, 0, [], None  # No handwriting detected, not an error

        # Merge overlapping/nearby boxes
        merged_boxes = merge_overlapping_boxes(boxes)

        # cv2.imwrite reports a missing directory only through its return
        # value, so make sure the destination exists before writing.
        os.makedirs(output_dir, exist_ok=True)

        pdf_name = Path(pdf_path).stem
        image_height, image_width = image.shape[:2]
        padding = 10
        region_info = []

        for idx, (x, y, w, h) in enumerate(merged_boxes, start=1):
            # Clamp each edge independently so a box near the left/top edge
            # does not receive extra padding on the right/bottom.
            left = max(0, x - padding)
            top = max(0, y - padding)
            right = min(image_width, x + w + padding)
            bottom = min(image_height, y + h + padding)

            region = image[top:bottom, left:right]

            output_filename = f"{pdf_name}_handwriting_{idx:02d}.png"
            output_path = os.path.join(output_dir, output_filename)

            # imwrite returns False when the file could not be written;
            # such regions must not be reported as extracted.
            if not cv2.imwrite(output_path, region):
                continue

            crop_width = right - left
            crop_height = bottom - top
            region_info.append({
                'filename': output_filename,
                'bbox': (left, top, crop_width, crop_height),
                'area': crop_width * crop_height,
            })

        if not region_info:
            return 0, len(merged_boxes), [], (
                f"Failed to write any of the {len(merged_boxes)} "
                f"detected regions to {output_dir}"
            )

        return len(region_info), len(merged_boxes), region_info, None

    except Exception as e:
        return 0, 0, [], str(e)
|
|
|
|
|
|
def main():
    """
    Main processing function.

    Processes every "*.pdf" file found in PDF_INPUT_PATH, writes the cropped
    handwriting regions to OUTPUT_PATH, records one row per PDF in a
    timestamped CSV log (its path is stored in the module-level LOG_FILE),
    and prints a summary followed by up to five example output files from
    this run.
    """
    global LOG_FILE

    print("Starting handwriting extraction from PDFs...")
    print(f"Input path: {PDF_INPUT_PATH}")
    print(f"Output path: {OUTPUT_PATH}")
    print(f"DPI: {DPI}")
    print()

    # Create output directory
    os.makedirs(OUTPUT_PATH, exist_ok=True)

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    LOG_FILE = os.path.join(OUTPUT_PATH, f"handwriting_extraction_log_{timestamp}.csv")

    # Get PDF files
    pdf_files = sorted(Path(PDF_INPUT_PATH).glob("*.pdf"))

    if not pdf_files:
        print("ERROR: No PDF files found!")
        return

    print(f"Found {len(pdf_files)} PDF files to process\n")

    # Statistics
    stats = {
        'total_pdfs': 0,
        'pdfs_with_handwriting': 0,
        'pdfs_without_handwriting': 0,
        'total_regions': 0,
        'errors': 0
    }

    # Filenames written during this run. The example listing uses these
    # instead of globbing OUTPUT_PATH, which would also count images left
    # over from earlier runs.
    written_files = []

    # Explicit UTF-8 keeps the log independent of the platform's locale
    # encoding, so non-ASCII PDF names cannot break the write.
    with open(LOG_FILE, 'w', newline='', encoding='utf-8') as log_file:
        log_writer = csv.writer(log_file)
        log_writer.writerow([
            'pdf_filename', 'regions_detected', 'regions_extracted',
            'extracted_filenames', 'error'
        ])

        # Process each PDF
        for i, pdf_path in enumerate(pdf_files):
            stats['total_pdfs'] += 1
            pdf_filename = pdf_path.name

            print(f"[{i+1}/{len(pdf_files)}] Processing: {pdf_filename}... ", end='', flush=True)

            # Extract handwriting
            extracted_count, total_count, region_info, error = extract_handwriting_regions(
                str(pdf_path), OUTPUT_PATH, DPI
            )

            if error:
                print(f"ERROR: {error}")
                stats['errors'] += 1
                log_writer.writerow([pdf_filename, total_count, extracted_count, "", error])
                continue

            if extracted_count > 0:
                stats['pdfs_with_handwriting'] += 1
                stats['total_regions'] += extracted_count
                print(f"FOUND {extracted_count} regions")

                filenames = [r['filename'] for r in region_info]
                written_files.extend(filenames)
                log_writer.writerow([
                    pdf_filename,
                    total_count,
                    extracted_count,
                    ", ".join(filenames),
                    ""
                ])
            else:
                stats['pdfs_without_handwriting'] += 1
                print("No handwriting detected")
                log_writer.writerow([pdf_filename, 0, 0, "", ""])

    # Print summary
    print("\n" + "="*60)
    print("HANDWRITING EXTRACTION SUMMARY")
    print("="*60)
    print(f"Total PDFs processed: {stats['total_pdfs']}")
    print(f"PDFs with handwriting: {stats['pdfs_with_handwriting']}")
    print(f"PDFs without handwriting: {stats['pdfs_without_handwriting']}")
    print(f"Total regions extracted: {stats['total_regions']}")
    print(f"Errors: {stats['errors']}")
    print(f"\nLog file: {LOG_FILE}")
    print("="*60)

    # Show examples from this run
    if written_files:
        print(f"\nExtracted {len(written_files)} handwriting images")
        print("Example files:")
        for name in sorted(written_files)[:5]:
            size_kb = (Path(OUTPUT_PATH) / name).stat().st_size / 1024
            print(f" - {name} ({size_kb:.1f} KB)")
|
|
|
|
|
|
if __name__ == "__main__":
    import traceback

    # Script entry point: run main() and turn an interrupt or an unexpected
    # failure into exit status 1; a clean run falls through with status 0.
    exit_status = 0
    try:
        main()
    except KeyboardInterrupt:
        print("\n\nProcess interrupted by user.")
        exit_status = 1
    except Exception as exc:
        print(f"\n\nFATAL ERROR: {exc}")
        traceback.print_exc()
        exit_status = 1

    if exit_status:
        sys.exit(exit_status)
|