Add hybrid signature extraction with name-based verification

Implement a hybrid approach (VLM name extraction + CV region detection)
that replaces the unreliable VLM coordinate output with name-based
verification.

Key Features:
- VLM extracts signature names (周寶蓮, 魏興海, etc.)
- CV or PDF text layer detects regions
- VLM verifies each region against expected names (see the sketch after this list)
- Signatures saved with person names: signature_周寶蓮.png
- Duplicate prevention and rejection handling
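
A minimal sketch of the verification loop, assuming a verify_region
callable that wraps the VLM check and candidate regions given as
(x, y, w, h) pixel boxes; the function and parameter names here are
illustrative, not the actual API of extract_signatures_hybrid.py:

# Illustrative sketch only; the real logic lives in extract_signatures_hybrid.py.
from pathlib import Path
import cv2

def save_verified_signatures(page_image, expected_names, candidate_boxes,
                             verify_region, output_dir):
    """
    page_image      : rendered page as a BGR numpy array
    expected_names  : names the VLM read from the page, e.g. ["周寶蓮", "魏興海"]
    candidate_boxes : (x, y, w, h) regions from CV or the PDF text layer
    verify_region   : callable(crop, names) -> matched name or None (VLM check)
    """
    output_dir = Path(output_dir)
    saved, rejected = set(), 0
    for x, y, w, h in candidate_boxes:
        crop = page_image[y:y + h, x:x + w]
        name = verify_region(crop, expected_names)
        if name is None or name in saved:      # rejection + duplicate prevention
            rejected += 1
            continue
        cv2.imwrite(str(output_dir / f"signature_{name}.png"), crop)
        saved.add(name)
    return sorted(saved), rejected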

Test Results:
- 5 PDF pages tested
- 7/10 signatures extracted (70% recall)
- 100% precision (no false positives)
- No blank regions extracted (previous issue resolved)

Files:
- extract_pages_from_csv.py: Extract pages from CSV (tested: 100 files)
- extract_signatures_hybrid.py: Hybrid extraction (current working solution)
- extract_handwriting.py: CV-only approach (component)
- extract_signatures_vlm.py: Deprecated VLM coordinate approach
- PROJECT_DOCUMENTATION.md: Complete project history and results
- SESSION_INIT.md: Session handoff documentation
- SESSION_CHECKLIST.md: Status checklist
- NEW_SESSION_PROMPT.txt: Template for next session
- HOW_TO_CONTINUE.txt: Visual handoff guide
- COMMIT_SUMMARY.md: Commit preparation guide
- README.md: Quick start guide
- README_page_extraction.md: Page extraction docs
- README_hybrid_extraction.md: Hybrid approach docs
- .gitignore: Exclude diagnostic scripts and outputs

Known Limitations:
- 30% of signatures missed due to conservative CV parameters
- Text layer method untested (all test PDFs are scanned images; see the PyMuPDF sketch below)
- Performance: ~24 seconds per PDF
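
For the untested text-layer path, a hedged PyMuPDF sketch (find_name_boxes
is an illustrative name, not existing code): locate each expected name in
the text layer and convert the hit to pixel coordinates at the render DPI.
search_for returns nothing on scanned pages, which is why this path has
not been exercised; a real implementation would then look near the printed
name for the handwritten signature.

# Hypothetical text-layer lookup (untested path); boxes are pixels at `dpi`.
import fitz  # PyMuPDF

def find_name_boxes(pdf_path, expected_names, dpi=300):
    """Return {name: (x, y, w, h)} for names found in the PDF text layer."""
    doc = fitz.open(pdf_path)
    page = doc[0]
    scale = dpi / 72.0                  # match fitz.Matrix(dpi / 72, dpi / 72)
    boxes = {}
    for name in expected_names:
        hits = page.search_for(name)    # empty list when there is no text layer
        if hits:
            r = hits[0]
            boxes[name] = (int(r.x0 * scale), int(r.y0 * scale),
                           int((r.x1 - r.x0) * scale), int((r.y1 - r.y0) * scale))
    doc.close()
    return boxes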

Next Steps:
- Tune CV parameters for higher recall (example starting values after this list)
- Test with larger dataset (100+ files)
- Process full dataset (86,073 files)
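
The conservative detection parameters referenced above live in
extract_handwriting.py (MIN_CONTOUR_AREA, the 15x5 dilation kernel,
the 20 px minimum box size). Untested starting points for a
recall-oriented sweep might look like the values below; they are
guesses to try, not measured settings.

# Hypothetical looser settings for extract_handwriting.py; DILATE_KERNEL and
# DILATE_ITERATIONS name values that are currently hard-coded inside
# detect_handwriting_regions().
MIN_CONTOUR_AREA = 50           # was 100: keep smaller pen strokes
MAX_CONTOUR_AREA = 500000       # unchanged
DILATE_KERNEL = (25, 9)         # was (15, 5): bridge wider gaps between strokes
DILATE_ITERATIONS = 3           # was 2
MIN_BOX_W = MIN_BOX_H = 15      # was 20: accept smaller signature crops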

🤖 Generated with Claude Code
commit 52612e14ba (2025-10-26 23:39:52 +08:00)
14 changed files with 3583 additions and 0 deletions

extract_handwriting.py (new file, 296 lines)
#!/usr/bin/env python3
"""
Script to detect and extract handwritten regions from PDF pages.
Uses computer vision to identify handwriting, not PDF image objects.
"""
import cv2
import numpy as np
import os
import sys
from pathlib import Path
from datetime import datetime
import fitz # PyMuPDF
import csv

# Configuration
PDF_INPUT_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output"
OUTPUT_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output/handwriting"
LOG_FILE = None # Will be set in main()

# Image processing parameters
DPI = 300 # Resolution for rendering PDF page
MIN_CONTOUR_AREA = 100 # Minimum area for a handwriting region (in pixels)
MAX_CONTOUR_AREA = 500000 # Maximum area (to filter out large background elements)

def render_pdf_page_as_image(pdf_path, dpi=300):
"""
Render PDF page as a high-resolution image.
Returns: numpy array (OpenCV format)
"""
try:
doc = fitz.open(pdf_path)
page = doc[0] # Get first page (our extracted pages only have 1 page)
# Render at high DPI for better detection
mat = fitz.Matrix(dpi / 72, dpi / 72) # 72 DPI is default
pix = page.get_pixmap(matrix=mat, alpha=False)
# Convert to numpy array
img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
# Convert RGB to BGR for OpenCV
if pix.n == 3: # RGB
img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
elif pix.n == 1: # Grayscale
img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
doc.close()
return img, None
except Exception as e:
return None, str(e)

def detect_handwriting_regions(image):
"""
Detect handwritten regions in the image using computer vision.
Returns: list of bounding boxes [(x, y, w, h), ...]
"""
# Convert to grayscale
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# Apply binary threshold (Otsu's method for automatic threshold)
# Invert so that dark ink becomes white (foreground)
_, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
# Morphological operations to connect nearby strokes
# This helps group individual pen strokes into signature regions
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 5))
dilated = cv2.dilate(binary, kernel, iterations=2)
# Find contours (connected regions)
contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
# Filter contours based on area
bounding_boxes = []
for contour in contours:
area = cv2.contourArea(contour)
# Filter by area (remove noise and very large regions)
if MIN_CONTOUR_AREA < area < MAX_CONTOUR_AREA:
x, y, w, h = cv2.boundingRect(contour)
# Additional filters:
# 1. Aspect ratio check (signatures are usually wider than tall, but not extreme)
aspect_ratio = w / float(h) if h > 0 else 0
# 2. Size check (not too small, not too large)
if 0.1 < aspect_ratio < 20 and w > 20 and h > 20:
bounding_boxes.append((x, y, w, h))
return bounding_boxes

def merge_overlapping_boxes(boxes, merge_threshold=50):
"""
Merge bounding boxes that are close to each other.
This helps combine signature parts that were detected separately.
"""
if not boxes:
return []
# Sort boxes by x-coordinate
boxes = sorted(boxes, key=lambda b: b[0])
merged = []
current = list(boxes[0]) # [x, y, w, h]
for box in boxes[1:]:
x, y, w, h = box
cx, cy, cw, ch = current
# Check if boxes are close enough to merge
# Close in x direction and overlapping or close in y direction
if (x <= cx + cw + merge_threshold and
abs(y - cy) < merge_threshold * 2):
# Merge boxes
new_x = min(cx, x)
new_y = min(cy, y)
new_w = max(cx + cw, x + w) - new_x
new_h = max(cy + ch, y + h) - new_y
current = [new_x, new_y, new_w, new_h]
else:
merged.append(tuple(current))
current = list(box)
merged.append(tuple(current))
return merged

def extract_handwriting_regions(pdf_path, output_dir, dpi=300):
"""
Extract handwritten regions from a PDF page.
Returns: (success_count, total_regions, region_info, error)
"""
try:
# Render PDF as image
image, error = render_pdf_page_as_image(pdf_path, dpi)
if error:
return 0, 0, [], f"Rendering error: {error}"
if image is None:
return 0, 0, [], "Failed to render PDF"
# Detect handwriting regions
boxes = detect_handwriting_regions(image)
if not boxes:
return 0, 0, [], None # No handwriting detected, not an error
# Merge overlapping/nearby boxes
merged_boxes = merge_overlapping_boxes(boxes)
# Extract and save regions
pdf_name = Path(pdf_path).stem
region_info = []
for idx, (x, y, w, h) in enumerate(merged_boxes):
# Add padding around the region
padding = 10
x_pad = max(0, x - padding)
y_pad = max(0, y - padding)
w_pad = min(image.shape[1] - x_pad, w + 2 * padding)
h_pad = min(image.shape[0] - y_pad, h + 2 * padding)
# Extract region
region = image[y_pad:y_pad + h_pad, x_pad:x_pad + w_pad]
# Save region
output_filename = f"{pdf_name}_handwriting_{idx + 1:02d}.png"
output_path = os.path.join(output_dir, output_filename)
cv2.imwrite(output_path, region)
region_info.append({
'filename': output_filename,
'bbox': (x_pad, y_pad, w_pad, h_pad),
'area': w_pad * h_pad
})
return len(merged_boxes), len(merged_boxes), region_info, None
except Exception as e:
return 0, 0, [], str(e)

def main():
"""Main processing function"""
global LOG_FILE
print(f"Starting handwriting extraction from PDFs...")
print(f"Input path: {PDF_INPUT_PATH}")
print(f"Output path: {OUTPUT_PATH}")
print(f"DPI: {DPI}")
print()
# Create output directory
os.makedirs(OUTPUT_PATH, exist_ok=True)
LOG_FILE = os.path.join(OUTPUT_PATH, f"handwriting_extraction_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")
# Get PDF files
pdf_files = sorted(Path(PDF_INPUT_PATH).glob("*.pdf"))
if not pdf_files:
print("ERROR: No PDF files found!")
return
print(f"Found {len(pdf_files)} PDF files to process\n")
# Statistics
stats = {
'total_pdfs': 0,
'pdfs_with_handwriting': 0,
'pdfs_without_handwriting': 0,
'total_regions': 0,
'errors': 0
}
# Open log file
with open(LOG_FILE, 'w', newline='') as log_file:
log_writer = csv.writer(log_file)
log_writer.writerow([
'pdf_filename', 'regions_detected', 'regions_extracted',
'extracted_filenames', 'error'
])
# Process each PDF
for i, pdf_path in enumerate(pdf_files):
stats['total_pdfs'] += 1
pdf_filename = pdf_path.name
print(f"[{i+1}/{len(pdf_files)}] Processing: {pdf_filename}... ", end='', flush=True)
# Extract handwriting
extracted_count, total_count, region_info, error = extract_handwriting_regions(
str(pdf_path), OUTPUT_PATH, DPI
)
if error:
print(f"ERROR: {error}")
stats['errors'] += 1
log_writer.writerow([pdf_filename, 0, 0, "", error])
continue
if extracted_count > 0:
stats['pdfs_with_handwriting'] += 1
stats['total_regions'] += extracted_count
print(f"FOUND {extracted_count} regions")
filenames = [r['filename'] for r in region_info]
log_writer.writerow([
pdf_filename,
total_count,
extracted_count,
", ".join(filenames),
""
])
else:
stats['pdfs_without_handwriting'] += 1
print("No handwriting detected")
log_writer.writerow([pdf_filename, 0, 0, "", ""])
# Print summary
print("\n" + "="*60)
print("HANDWRITING EXTRACTION SUMMARY")
print("="*60)
print(f"Total PDFs processed: {stats['total_pdfs']}")
print(f"PDFs with handwriting: {stats['pdfs_with_handwriting']}")
print(f"PDFs without handwriting: {stats['pdfs_without_handwriting']}")
print(f"Total regions extracted: {stats['total_regions']}")
print(f"Errors: {stats['errors']}")
print(f"\nLog file: {LOG_FILE}")
print("="*60)
# Show examples
if stats['total_regions'] > 0:
output_files = sorted(Path(OUTPUT_PATH).glob("*_handwriting_*.png"))
print(f"\nExtracted {len(output_files)} handwriting images")
print("Example files:")
for img in output_files[:5]:
size_kb = img.stat().st_size / 1024
print(f" - {img.name} ({size_kb:.1f} KB)")

if __name__ == "__main__":
try:
main()
except KeyboardInterrupt:
print("\n\nProcess interrupted by user.")
sys.exit(1)
except Exception as e:
print(f"\n\nFATAL ERROR: {e}")
import traceback
traceback.print_exc()
sys.exit(1)