Add hybrid signature extraction with name-based verification

Implement a hybrid approach (VLM name extraction + CV region detection)
that replaces the unreliable VLM coordinate output with name-based
verification.

Key Features:
- VLM extracts signature names (周寶蓮, 魏興海, etc.)
- CV or PDF text layer detects regions
- VLM verifies each region against expected names (see the sketch after this list)
- Signatures saved with person names: signature_周寶蓮.png
- Duplicate prevention and rejection handling
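
A minimal sketch of the verification loop, assuming a verify_region
callable that wraps the VLM check and candidate regions given as
(x, y, w, h) pixel boxes; the function and parameter names here are
illustrative, not the actual API of extract_signatures_hybrid.py:

# Illustrative sketch only; the real logic lives in extract_signatures_hybrid.py.
from pathlib import Path
import cv2

def save_verified_signatures(page_image, expected_names, candidate_boxes,
                             verify_region, output_dir):
    """
    page_image      : rendered page as a BGR numpy array
    expected_names  : names the VLM read from the page, e.g. ["周寶蓮", "魏興海"]
    candidate_boxes : (x, y, w, h) regions from CV or the PDF text layer
    verify_region   : callable(crop, names) -> matched name or None (VLM check)
    """
    output_dir = Path(output_dir)
    saved, rejected = set(), 0
    for x, y, w, h in candidate_boxes:
        crop = page_image[y:y + h, x:x + w]
        name = verify_region(crop, expected_names)
        if name is None or name in saved:      # rejection + duplicate prevention
            rejected += 1
            continue
        cv2.imwrite(str(output_dir / f"signature_{name}.png"), crop)
        saved.add(name)
    return sorted(saved), rejected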

Test Results:
- 5 PDF pages tested
- 7/10 signatures extracted (70% recall)
- 100% precision (no false positives)
- No blank regions extracted (previous issue resolved)

Files:
- extract_pages_from_csv.py: Extract pages from CSV (tested: 100 files)
- extract_signatures_hybrid.py: Hybrid extraction (current working solution)
- extract_handwriting.py: CV-only approach (component)
- extract_signatures_vlm.py: Deprecated VLM coordinate approach
- PROJECT_DOCUMENTATION.md: Complete project history and results
- SESSION_INIT.md: Session handoff documentation
- SESSION_CHECKLIST.md: Status checklist
- NEW_SESSION_PROMPT.txt: Template for next session
- HOW_TO_CONTINUE.txt: Visual handoff guide
- COMMIT_SUMMARY.md: Commit preparation guide
- README.md: Quick start guide
- README_page_extraction.md: Page extraction docs
- README_hybrid_extraction.md: Hybrid approach docs
- .gitignore: Exclude diagnostic scripts and outputs

Known Limitations:
- 30% of signatures missed due to conservative CV parameters
- Text layer method untested (all test PDFs are scanned images; see the PyMuPDF sketch below)
- Performance: ~24 seconds per PDF
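
For the untested text-layer path, a hedged PyMuPDF sketch (find_name_boxes
is an illustrative name, not existing code): locate each expected name in
the text layer and convert the hit to pixel coordinates at the render DPI.
search_for returns nothing on scanned pages, which is why this path has
not been exercised; a real implementation would then look near the printed
name for the handwritten signature.

# Hypothetical text-layer lookup (untested path); boxes are pixels at `dpi`.
import fitz  # PyMuPDF

def find_name_boxes(pdf_path, expected_names, dpi=300):
    """Return {name: (x, y, w, h)} for names found in the PDF text layer."""
    doc = fitz.open(pdf_path)
    page = doc[0]
    scale = dpi / 72.0                  # match fitz.Matrix(dpi / 72, dpi / 72)
    boxes = {}
    for name in expected_names:
        hits = page.search_for(name)    # empty list when there is no text layer
        if hits:
            r = hits[0]
            boxes[name] = (int(r.x0 * scale), int(r.y0 * scale),
                           int((r.x1 - r.x0) * scale), int((r.y1 - r.y0) * scale))
    doc.close()
    return boxes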

Next Steps:
- Tune CV parameters for higher recall (example starting values after this list)
- Test with larger dataset (100+ files)
- Process full dataset (86,073 files)
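
The conservative detection parameters referenced above live in
extract_handwriting.py (MIN_CONTOUR_AREA, the 15x5 dilation kernel,
the 20 px minimum box size). Untested starting points for a
recall-oriented sweep might look like the values below; they are
guesses to try, not measured settings.

# Hypothetical looser settings for extract_handwriting.py; DILATE_KERNEL and
# DILATE_ITERATIONS name values that are currently hard-coded inside
# detect_handwriting_regions().
MIN_CONTOUR_AREA = 50           # was 100: keep smaller pen strokes
MAX_CONTOUR_AREA = 500000       # unchanged
DILATE_KERNEL = (25, 9)         # was (15, 5): bridge wider gaps between strokes
DILATE_ITERATIONS = 3           # was 2
MIN_BOX_W = MIN_BOX_H = 15      # was 20: accept smaller signature crops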

🤖 Generated with Claude Code
commit 52612e14ba (2025-10-26 23:39:52 +08:00)
14 changed files with 3583 additions and 0 deletions

extract_handwriting.py (new file, 296 lines)
#!/usr/bin/env python3
"""
Script to detect and extract handwritten regions from PDF pages.
Uses computer vision to identify handwriting, not PDF image objects.
"""
import cv2
import numpy as np
import os
import sys
from pathlib import Path
from datetime import datetime
import fitz # PyMuPDF
import csv

# Configuration
PDF_INPUT_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output"
OUTPUT_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output/handwriting"
LOG_FILE = None # Will be set in main()

# Image processing parameters
DPI = 300 # Resolution for rendering PDF page
MIN_CONTOUR_AREA = 100 # Minimum area for a handwriting region (in pixels)
MAX_CONTOUR_AREA = 500000 # Maximum area (to filter out large background elements)

def render_pdf_page_as_image(pdf_path, dpi=300):
"""
Render PDF page as a high-resolution image.
Returns: numpy array (OpenCV format)
"""
try:
doc = fitz.open(pdf_path)
page = doc[0] # Get first page (our extracted pages only have 1 page)
# Render at high DPI for better detection
mat = fitz.Matrix(dpi / 72, dpi / 72) # 72 DPI is default
pix = page.get_pixmap(matrix=mat, alpha=False)
# Convert to numpy array
img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
# Convert RGB to BGR for OpenCV
if pix.n == 3: # RGB
img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
elif pix.n == 1: # Grayscale
img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
doc.close()
return img, None
except Exception as e:
return None, str(e)

def detect_handwriting_regions(image):
"""
Detect handwritten regions in the image using computer vision.
Returns: list of bounding boxes [(x, y, w, h), ...]
"""
# Convert to grayscale
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# Apply binary threshold (Otsu's method for automatic threshold)
# Invert so that dark ink becomes white (foreground)
_, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
# Morphological operations to connect nearby strokes
# This helps group individual pen strokes into signature regions
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 5))
dilated = cv2.dilate(binary, kernel, iterations=2)
# Find contours (connected regions)
contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
# Filter contours based on area
bounding_boxes = []
for contour in contours:
area = cv2.contourArea(contour)
# Filter by area (remove noise and very large regions)
if MIN_CONTOUR_AREA < area < MAX_CONTOUR_AREA:
x, y, w, h = cv2.boundingRect(contour)
# Additional filters:
# 1. Aspect ratio check (signatures are usually wider than tall, but not extreme)
aspect_ratio = w / float(h) if h > 0 else 0
# 2. Size check (not too small, not too large)
if 0.1 < aspect_ratio < 20 and w > 20 and h > 20:
bounding_boxes.append((x, y, w, h))
return bounding_boxes

def merge_overlapping_boxes(boxes, merge_threshold=50):
"""
Merge bounding boxes that are close to each other.
This helps combine signature parts that were detected separately.
"""
if not boxes:
return []
# Sort boxes by x-coordinate
boxes = sorted(boxes, key=lambda b: b[0])
merged = []
current = list(boxes[0]) # [x, y, w, h]
for box in boxes[1:]:
x, y, w, h = box
cx, cy, cw, ch = current
# Check if boxes are close enough to merge
# Close in x direction and overlapping or close in y direction
if (x <= cx + cw + merge_threshold and
abs(y - cy) < merge_threshold * 2):
# Merge boxes
new_x = min(cx, x)
new_y = min(cy, y)
new_w = max(cx + cw, x + w) - new_x
new_h = max(cy + ch, y + h) - new_y
current = [new_x, new_y, new_w, new_h]
else:
merged.append(tuple(current))
current = list(box)
merged.append(tuple(current))
return merged

def extract_handwriting_regions(pdf_path, output_dir, dpi=300):
"""
Extract handwritten regions from a PDF page.
Returns: (success_count, total_regions, region_info, error)
"""
try:
# Render PDF as image
image, error = render_pdf_page_as_image(pdf_path, dpi)
if error:
return 0, 0, [], f"Rendering error: {error}"
if image is None:
return 0, 0, [], "Failed to render PDF"
# Detect handwriting regions
boxes = detect_handwriting_regions(image)
if not boxes:
return 0, 0, [], None # No handwriting detected, not an error
# Merge overlapping/nearby boxes
merged_boxes = merge_overlapping_boxes(boxes)
# Extract and save regions
pdf_name = Path(pdf_path).stem
region_info = []
for idx, (x, y, w, h) in enumerate(merged_boxes):
# Add padding around the region
padding = 10
x_pad = max(0, x - padding)
y_pad = max(0, y - padding)
w_pad = min(image.shape[1] - x_pad, w + 2 * padding)
h_pad = min(image.shape[0] - y_pad, h + 2 * padding)
# Extract region
region = image[y_pad:y_pad + h_pad, x_pad:x_pad + w_pad]
# Save region
output_filename = f"{pdf_name}_handwriting_{idx + 1:02d}.png"
output_path = os.path.join(output_dir, output_filename)
cv2.imwrite(output_path, region)
region_info.append({
'filename': output_filename,
'bbox': (x_pad, y_pad, w_pad, h_pad),
'area': w_pad * h_pad
})
return len(merged_boxes), len(merged_boxes), region_info, None
except Exception as e:
return 0, 0, [], str(e)

def main():
"""Main processing function"""
global LOG_FILE
print(f"Starting handwriting extraction from PDFs...")
print(f"Input path: {PDF_INPUT_PATH}")
print(f"Output path: {OUTPUT_PATH}")
print(f"DPI: {DPI}")
print()
# Create output directory
os.makedirs(OUTPUT_PATH, exist_ok=True)
LOG_FILE = os.path.join(OUTPUT_PATH, f"handwriting_extraction_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")
# Get PDF files
pdf_files = sorted(Path(PDF_INPUT_PATH).glob("*.pdf"))
if not pdf_files:
print("ERROR: No PDF files found!")
return
print(f"Found {len(pdf_files)} PDF files to process\n")
# Statistics
stats = {
'total_pdfs': 0,
'pdfs_with_handwriting': 0,
'pdfs_without_handwriting': 0,
'total_regions': 0,
'errors': 0
}
# Open log file
with open(LOG_FILE, 'w', newline='') as log_file:
log_writer = csv.writer(log_file)
log_writer.writerow([
'pdf_filename', 'regions_detected', 'regions_extracted',
'extracted_filenames', 'error'
])
# Process each PDF
for i, pdf_path in enumerate(pdf_files):
stats['total_pdfs'] += 1
pdf_filename = pdf_path.name
print(f"[{i+1}/{len(pdf_files)}] Processing: {pdf_filename}... ", end='', flush=True)
# Extract handwriting
extracted_count, total_count, region_info, error = extract_handwriting_regions(
str(pdf_path), OUTPUT_PATH, DPI
)
if error:
print(f"ERROR: {error}")
stats['errors'] += 1
log_writer.writerow([pdf_filename, 0, 0, "", error])
continue
if extracted_count > 0:
stats['pdfs_with_handwriting'] += 1
stats['total_regions'] += extracted_count
print(f"FOUND {extracted_count} regions")
filenames = [r['filename'] for r in region_info]
log_writer.writerow([
pdf_filename,
total_count,
extracted_count,
", ".join(filenames),
""
])
else:
stats['pdfs_without_handwriting'] += 1
print("No handwriting detected")
log_writer.writerow([pdf_filename, 0, 0, "", ""])
# Print summary
print("\n" + "="*60)
print("HANDWRITING EXTRACTION SUMMARY")
print("="*60)
print(f"Total PDFs processed: {stats['total_pdfs']}")
print(f"PDFs with handwriting: {stats['pdfs_with_handwriting']}")
print(f"PDFs without handwriting: {stats['pdfs_without_handwriting']}")
print(f"Total regions extracted: {stats['total_regions']}")
print(f"Errors: {stats['errors']}")
print(f"\nLog file: {LOG_FILE}")
print("="*60)
# Show examples
if stats['total_regions'] > 0:
output_files = sorted(Path(OUTPUT_PATH).glob("*_handwriting_*.png"))
print(f"\nExtracted {len(output_files)} handwriting images")
print("Example files:")
for img in output_files[:5]:
size_kb = img.stat().st_size / 1024
print(f" - {img.name} ({size_kb:.1f} KB)")

if __name__ == "__main__":
try:
main()
except KeyboardInterrupt:
print("\n\nProcess interrupted by user.")
sys.exit(1)
except Exception as e:
print(f"\n\nFATAL ERROR: {e}")
import traceback
traceback.print_exc()
sys.exit(1)