Add hybrid signature extraction with name-based verification
Implement a hybrid approach (VLM name extraction + CV region detection) that
replaces the unreliable VLM coordinate approach with name-based verification.
Key Features:
- VLM extracts signature names (周寶蓮, 魏興海, etc.)
- CV or PDF text layer detects regions
- VLM verifies each region against the expected names (flow sketched after this list)
- Signatures saved with person names: signature_周寶蓮.png
- Duplicate prevention and rejection handling
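A minimal sketch of how these pieces could fit together; the vlm_extract_names, detect_regions, and vlm_verify_region callables are hypothetical placeholders, not the actual API of extract_signatures_hybrid.py:

import cv2

def extract_signatures(page_image, output_dir, vlm_extract_names, detect_regions, vlm_verify_region):
    # 1. VLM reads the signer names present on the page
    expected = set(vlm_extract_names(page_image))
    # 2. CV (or the PDF text layer) proposes candidate regions as (x, y, w, h) boxes
    for x, y, w, h in detect_regions(page_image):
        region = page_image[y:y + h, x:x + w]
        # 3. VLM is asked which expected name, if any, the region shows
        name = vlm_verify_region(region, expected)
        if name in expected:  # non-matches are rejected; removing the name prevents duplicates
            cv2.imwrite(f"{output_dir}/signature_{name}.png", region)
            expected.discard(name)
    return expected  # names still unmatched after processing the page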
Test Results:
- 5 PDF pages tested
- 7/10 signatures extracted (70% recall)
- 100% precision (no false positives)
- No blank regions extracted (previous issue resolved)
Files:
- extract_pages_from_csv.py: Extract pages from CSV (tested: 100 files)
- extract_signatures_hybrid.py: Hybrid extraction (current working solution)
- extract_handwriting.py: CV-only approach (component)
- extract_signatures_vlm.py: Deprecated VLM coordinate approach
- PROJECT_DOCUMENTATION.md: Complete project history and results
- SESSION_INIT.md: Session handoff documentation
- SESSION_CHECKLIST.md: Status checklist
- NEW_SESSION_PROMPT.txt: Template for next session
- HOW_TO_CONTINUE.txt: Visual handoff guide
- COMMIT_SUMMARY.md: Commit preparation guide
- README.md: Quick start guide
- README_page_extraction.md: Page extraction docs
- README_hybrid_extraction.md: Hybrid approach docs
- .gitignore: Exclude diagnostic scripts and outputs
Known Limitations:
- 30% of signatures missed due to conservative CV parameters
- Text layer method untested (all test PDFs are scanned images)
- Performance: ~24 seconds per PDF
Next Steps:
- Tune CV parameters for higher recall (a possible parameter sweep is sketched after this list)
- Test with larger dataset (100+ files)
- Process full dataset (86,073 files)
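A rough sketch of such a sweep against extract_handwriting.py, assuming a small hand-labelled set of pages; the labelled_pages structure and the candidate values are illustrative assumptions, not measured settings:

import extract_handwriting as eh

def sweep_min_area(labelled_pages, candidates=(50, 100, 200, 400)):
    """labelled_pages: list of (page_image, true_signature_count) pairs."""
    results = []
    for min_area in candidates:
        eh.MIN_CONTOUR_AREA = min_area  # module-level constant read by detect_handwriting_regions()
        detected = [len(eh.merge_overlapping_boxes(eh.detect_handwriting_regions(img)))
                    for img, _ in labelled_pages]
        truth = [count for _, count in labelled_pages]
        # min(d, t) is a rough proxy for true hits, reasonable while precision stays near 100%
        recall = sum(min(d, t) for d, t in zip(detected, truth)) / max(sum(truth), 1)
        results.append((min_area, recall))
    return results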
🤖 Generated with Claude Code
extract_handwriting.py (new file, 296 lines):
#!/usr/bin/env python3
"""
Script to detect and extract handwritten regions from PDF pages.
Uses computer vision to identify handwriting, not PDF image objects.
"""

import cv2
import numpy as np
import os
import sys
from pathlib import Path
from datetime import datetime
import fitz  # PyMuPDF
import csv

# Configuration
PDF_INPUT_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output"
OUTPUT_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output/handwriting"
LOG_FILE = None  # Will be set in main()

# Image processing parameters
DPI = 300  # Resolution for rendering PDF page
MIN_CONTOUR_AREA = 100  # Minimum area for a handwriting region (in pixels)
MAX_CONTOUR_AREA = 500000  # Maximum area (to filter out large background elements)


def render_pdf_page_as_image(pdf_path, dpi=300):
    """
    Render PDF page as a high-resolution image.
    Returns: numpy array (OpenCV format)
    """
    try:
        doc = fitz.open(pdf_path)
        page = doc[0]  # Get first page (our extracted pages only have 1 page)

        # Render at high DPI for better detection
        mat = fitz.Matrix(dpi / 72, dpi / 72)  # 72 DPI is default
        pix = page.get_pixmap(matrix=mat, alpha=False)

        # Convert to numpy array
        img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)

        # Convert RGB to BGR for OpenCV
        if pix.n == 3:  # RGB
            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
        elif pix.n == 1:  # Grayscale
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)

        doc.close()
        return img, None

    except Exception as e:
        return None, str(e)


def detect_handwriting_regions(image):
    """
    Detect handwritten regions in the image using computer vision.
    Returns: list of bounding boxes [(x, y, w, h), ...]
    """
    # Convert to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Apply binary threshold (Otsu's method for automatic threshold)
    # Invert so that dark ink becomes white (foreground)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Morphological operations to connect nearby strokes
    # This helps group individual pen strokes into signature regions
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 5))
    dilated = cv2.dilate(binary, kernel, iterations=2)

    # Find contours (connected regions)
    contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Filter contours based on area
    bounding_boxes = []
    for contour in contours:
        area = cv2.contourArea(contour)

        # Filter by area (remove noise and very large regions)
        if MIN_CONTOUR_AREA < area < MAX_CONTOUR_AREA:
            x, y, w, h = cv2.boundingRect(contour)

            # Additional filters:
            # 1. Aspect ratio check (signatures are usually wider than tall, but not extreme)
            aspect_ratio = w / float(h) if h > 0 else 0

            # 2. Size check (not too small, not too large)
            if 0.1 < aspect_ratio < 20 and w > 20 and h > 20:
                bounding_boxes.append((x, y, w, h))

    return bounding_boxes


def merge_overlapping_boxes(boxes, merge_threshold=50):
    """
    Merge bounding boxes that are close to each other.
    This helps combine signature parts that were detected separately.
    """
    if not boxes:
        return []

    # Sort boxes by x-coordinate
    boxes = sorted(boxes, key=lambda b: b[0])

    merged = []
    current = list(boxes[0])  # [x, y, w, h]

    for box in boxes[1:]:
        x, y, w, h = box
        cx, cy, cw, ch = current

        # Check if boxes are close enough to merge
        # Close in x direction and overlapping or close in y direction
        if (x <= cx + cw + merge_threshold and
                abs(y - cy) < merge_threshold * 2):
            # Merge boxes
            new_x = min(cx, x)
            new_y = min(cy, y)
            new_w = max(cx + cw, x + w) - new_x
            new_h = max(cy + ch, y + h) - new_y
            current = [new_x, new_y, new_w, new_h]
        else:
            merged.append(tuple(current))
            current = list(box)

    merged.append(tuple(current))
    return merged


def extract_handwriting_regions(pdf_path, output_dir, dpi=300):
    """
    Extract handwritten regions from a PDF page.
    Returns: (success_count, total_regions, region_info, error)
    """
    try:
        # Render PDF as image
        image, error = render_pdf_page_as_image(pdf_path, dpi)
        if error:
            return 0, 0, [], f"Rendering error: {error}"

        if image is None:
            return 0, 0, [], "Failed to render PDF"

        # Detect handwriting regions
        boxes = detect_handwriting_regions(image)

        if not boxes:
            return 0, 0, [], None  # No handwriting detected, not an error

        # Merge overlapping/nearby boxes
        merged_boxes = merge_overlapping_boxes(boxes)

        # Extract and save regions
        pdf_name = Path(pdf_path).stem
        region_info = []

        for idx, (x, y, w, h) in enumerate(merged_boxes):
            # Add padding around the region
            padding = 10
            x_pad = max(0, x - padding)
            y_pad = max(0, y - padding)
            w_pad = min(image.shape[1] - x_pad, w + 2 * padding)
            h_pad = min(image.shape[0] - y_pad, h + 2 * padding)

            # Extract region
            region = image[y_pad:y_pad + h_pad, x_pad:x_pad + w_pad]

            # Save region
            output_filename = f"{pdf_name}_handwriting_{idx + 1:02d}.png"
            output_path = os.path.join(output_dir, output_filename)
            cv2.imwrite(output_path, region)

            region_info.append({
                'filename': output_filename,
                'bbox': (x_pad, y_pad, w_pad, h_pad),
                'area': w_pad * h_pad
            })

        return len(merged_boxes), len(merged_boxes), region_info, None

    except Exception as e:
        return 0, 0, [], str(e)


def main():
    """Main processing function"""
    global LOG_FILE

    print(f"Starting handwriting extraction from PDFs...")
    print(f"Input path: {PDF_INPUT_PATH}")
    print(f"Output path: {OUTPUT_PATH}")
    print(f"DPI: {DPI}")
    print()

    # Create output directory
    os.makedirs(OUTPUT_PATH, exist_ok=True)

    LOG_FILE = os.path.join(OUTPUT_PATH, f"handwriting_extraction_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")

    # Get PDF files
    pdf_files = sorted(Path(PDF_INPUT_PATH).glob("*.pdf"))

    if not pdf_files:
        print("ERROR: No PDF files found!")
        return

    print(f"Found {len(pdf_files)} PDF files to process\n")

    # Statistics
    stats = {
        'total_pdfs': 0,
        'pdfs_with_handwriting': 0,
        'pdfs_without_handwriting': 0,
        'total_regions': 0,
        'errors': 0
    }

    # Open log file
    with open(LOG_FILE, 'w', newline='') as log_file:
        log_writer = csv.writer(log_file)
        log_writer.writerow([
            'pdf_filename', 'regions_detected', 'regions_extracted',
            'extracted_filenames', 'error'
        ])

        # Process each PDF
        for i, pdf_path in enumerate(pdf_files):
            stats['total_pdfs'] += 1
            pdf_filename = pdf_path.name

            print(f"[{i+1}/{len(pdf_files)}] Processing: {pdf_filename}... ", end='', flush=True)

            # Extract handwriting
            extracted_count, total_count, region_info, error = extract_handwriting_regions(
                str(pdf_path), OUTPUT_PATH, DPI
            )

            if error:
                print(f"ERROR: {error}")
                stats['errors'] += 1
                log_writer.writerow([pdf_filename, 0, 0, "", error])
                continue

            if extracted_count > 0:
                stats['pdfs_with_handwriting'] += 1
                stats['total_regions'] += extracted_count
                print(f"FOUND {extracted_count} regions")

                filenames = [r['filename'] for r in region_info]
                log_writer.writerow([
                    pdf_filename,
                    total_count,
                    extracted_count,
                    ", ".join(filenames),
                    ""
                ])
            else:
                stats['pdfs_without_handwriting'] += 1
                print("No handwriting detected")
                log_writer.writerow([pdf_filename, 0, 0, "", ""])

    # Print summary
    print("\n" + "="*60)
    print("HANDWRITING EXTRACTION SUMMARY")
    print("="*60)
    print(f"Total PDFs processed: {stats['total_pdfs']}")
    print(f"PDFs with handwriting: {stats['pdfs_with_handwriting']}")
    print(f"PDFs without handwriting: {stats['pdfs_without_handwriting']}")
    print(f"Total regions extracted: {stats['total_regions']}")
    print(f"Errors: {stats['errors']}")
    print(f"\nLog file: {LOG_FILE}")
    print("="*60)

    # Show examples
    if stats['total_regions'] > 0:
        output_files = sorted(Path(OUTPUT_PATH).glob("*_handwriting_*.png"))
        print(f"\nExtracted {len(output_files)} handwriting images")
        print("Example files:")
        for img in output_files[:5]:
            size_kb = img.stat().st_size / 1024
            print(f"  - {img.name} ({size_kb:.1f} KB)")


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\n\nProcess interrupted by user.")
        sys.exit(1)
    except Exception as e:
        print(f"\n\nFATAL ERROR: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)