Add hybrid signature extraction with name-based verification

Implement VLM name extraction + CV detection hybrid approach to replace unreliable VLM coordinate system with name-based verification. Key Features: - VLM extracts signature names (周寶蓮, 魏興海, etc.) - CV or PDF text layer detects regions - VLM verifies each region against expected names - Signatures saved with person names: signature_周寶蓮.png - Duplicate prevention and rejection handling Test Results: - 5 PDF pages tested - 7/10 signatures extracted (70% recall) - 100% precision (no false positives) - No blank regions extracted (previous issue resolved) Files: - extract_pages_from_csv.py: Extract pages from CSV (tested: 100 files) - extract_signatures_hybrid.py: Hybrid extraction (current working solution) - extract_handwriting.py: CV-only approach (component) - extract_signatures_vlm.py: Deprecated VLM coordinate approach - PROJECT_DOCUMENTATION.md: Complete project history and results - SESSION_INIT.md: Session handoff documentation - SESSION_CHECKLIST.md: Status checklist - NEW_SESSION_PROMPT.txt: Template for next session - HOW_TO_CONTINUE.txt: Visual handoff guide - COMMIT_SUMMARY.md: Commit preparation guide - README.md: Quick start guide - README_page_extraction.md: Page extraction docs - README_hybrid_extraction.md: Hybrid approach docs - .gitignore: Exclude diagnostic scripts and outputs Known Limitations: - 30% of signatures missed due to conservative CV parameters - Text layer method untested (all test PDFs are scanned images) - Performance: ~24 seconds per PDF Next Steps: - Tune CV parameters for higher recall - Test with larger dataset (100+ files) - Process full dataset (86,073 files) 🤖 Generated with Claude Code
2025-10-26 23:39:52 +08:00
commit 52612e14ba
14 changed files with 3583 additions and 0 deletions
--- a/extract_signatures_vlm.py
+++ b/extract_signatures_vlm.py
@@ -0,0 +1,505 @@
+#!/usr/bin/env python3
+"""
+Script to extract signatures using VLM (Vision Language Model) guidance.
+Uses Ollama instance with qwen2.5vl:32b for signature detection.
+"""
+
+import cv2
+import numpy as np
+import os
+import sys
+import json
+import base64
+import requests
+from pathlib import Path
+from datetime import datetime
+import fitz  # PyMuPDF
+import csv
+from io import BytesIO
+
+# Configuration
+# Directory that main() scans (non-recursively) for "*.pdf" files.
+PDF_INPUT_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output"
+# Directory that receives extracted signature images and the CSV run log.
+OUTPUT_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output/signatures"
+# Directory that receives crops the VLM did not confirm as signatures.
+REJECTED_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output/signatures/rejected"
+LOG_FILE = None  # Will be set in main()
+
+# Ollama Configuration
+# Base URL of the Ollama server; "/api/generate" and "/api/tags" are appended.
+OLLAMA_URL = "http://192.168.30.36:11434"
+# Model name sent in the "model" field of every generate request.
+OLLAMA_MODEL = "qwen2.5vl:32b"
+
+# Image processing parameters
+DPI = 300  # Resolution for rendering PDF page
+
+
+def encode_image_to_base64(image_array):
+    """
+    Encode numpy image array to base64 string for Ollama API.
+    """
+    # Convert BGR to RGB
+    image_rgb = cv2.cvtColor(image_array, cv2.COLOR_BGR2RGB)
+
+    # Encode as JPEG
+    _, buffer = cv2.imencode('.jpg', image_rgb)
+
+    # Convert to base64
+    image_base64 = base64.b64encode(buffer).decode('utf-8')
+
+    return image_base64
+
+
+def call_ollama_vision(image_base64, prompt):
+    """
+    Call Ollama vision model with image and prompt.
+    Returns the model's text response.
+    """
+    try:
+        url = f"{OLLAMA_URL}/api/generate"
+
+        payload = {
+            "model": OLLAMA_MODEL,
+            "prompt": prompt,
+            "images": [image_base64],
+            "stream": False
+        }
+
+        response = requests.post(url, json=payload, timeout=120)
+        response.raise_for_status()
+
+        result = response.json()
+        return result.get('response', ''), None
+
+    except Exception as e:
+        return None, str(e)
+
+
+def render_pdf_page_as_image(pdf_path, dpi=300):
+    """
+    Render PDF page as a high-resolution image.
+    Returns: numpy array (OpenCV format)
+    """
+    try:
+        doc = fitz.open(pdf_path)
+        page = doc[0]  # Get first page
+
+        # Render at high DPI
+        mat = fitz.Matrix(dpi / 72, dpi / 72)
+        pix = page.get_pixmap(matrix=mat, alpha=False)
+
+        # Convert to numpy array
+        img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
+
+        # Convert RGB to BGR for OpenCV
+        if pix.n == 3:  # RGB
+            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
+        elif pix.n == 1:  # Grayscale
+            img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
+
+        doc.close()
+        return img, pix.width, pix.height, None
+
+    except Exception as e:
+        return None, 0, 0, str(e)
+
+
+def parse_vlm_location_response(response_text, page_width, page_height):
+    """
+    Parse VLM response to extract signature locations.
+    Expected format from VLM should include percentages or pixel coordinates.
+
+    Returns: list of bounding boxes [(x, y, w, h), ...]
+    """
+    import re
+
+    locations = []
+
+    # Pattern to match: "Signature N: left=X%, top=Y%, width=W%, height=H%"
+    pattern = r'Signature\s+\d+:\s*left=([0-9.]+)%,?\s*top=([0-9.]+)%,?\s*width=([0-9.]+)%,?\s*height=([0-9.]+)%'
+
+    matches = re.findall(pattern, response_text)
+
+    for match in matches:
+        left_pct = float(match[0])
+        top_pct = float(match[1])
+        width_pct = float(match[2])
+        height_pct = float(match[3])
+
+        # Convert percentages to pixel coordinates
+        x = int(page_width * left_pct / 100)
+        y = int(page_height * top_pct / 100)
+        w = int(page_width * width_pct / 100)
+        h = int(page_height * height_pct / 100)
+
+        locations.append((x, y, w, h))
+
+    print(f"  - Parsed {len(locations)} signature location(s)")
+
+    return locations
+
+
+def check_pdf_has_image_at_location(pdf_path, bbox):
+    """
+    Check if PDF has a SMALL image object at the specified location.
+    If the image is a full-page scan, return False to use OpenCV cropping instead.
+    bbox: (x, y, w, h) in pixel coordinates
+    Returns: (has_image: bool, image_xref: int or None)
+    """
+    try:
+        doc = fitz.open(pdf_path)
+        page = doc[0]
+
+        # Get all images on the page
+        image_list = page.get_images(full=True)
+
+        if not image_list:
+            doc.close()
+            return False, None
+
+        # Get page dimensions (in points, 72 DPI)
+        page_rect = page.rect
+        page_width = page_rect.width
+        page_height = page_rect.height
+
+        # Check each image
+        for img_info in image_list:
+            xref = img_info[0]
+
+            # Get image dimensions
+            try:
+                base_image = doc.extract_image(xref)
+                img_width = base_image["width"]
+                img_height = base_image["height"]
+
+                # Check if this is a full-page scan
+                # If image is close to page size, it's a scanned page, not a signature
+                width_ratio = img_width / (page_width * 4)  # Approx conversion to pixels at 300 DPI
+                height_ratio = img_height / (page_height * 4)
+
+                # If image covers >80% of page, it's a full-page scan
+                if width_ratio > 0.8 and height_ratio > 0.8:
+                    # This is a full-page scan, don't extract it
+                    # Fall back to OpenCV cropping
+                    continue
+
+                # This might be a small embedded image (actual signature scan)
+                # For now, we'll still use OpenCV cropping for consistency
+                # but this logic can be refined later
+
+            except:
+                continue
+
+        # No suitable small images found, use OpenCV cropping
+        doc.close()
+        return False, None
+
+    except Exception as e:
+        print(f"Error checking PDF images: {e}")
+        return False, None
+
+
+def extract_pdf_image_object(pdf_path, xref, output_path):
+    """
+    Extract image object from PDF.
+    Returns: (success: bool, error: str)
+    """
+    try:
+        doc = fitz.open(pdf_path)
+
+        # Extract image
+        base_image = doc.extract_image(xref)
+        image_bytes = base_image["image"]
+        image_ext = base_image["ext"]
+
+        # Save image
+        output_file = f"{output_path}.{image_ext}"
+        with open(output_file, "wb") as f:
+            f.write(image_bytes)
+
+        doc.close()
+        return True, None, output_file
+
+    except Exception as e:
+        return False, str(e), None
+
+
+def extract_region_with_opencv(image, bbox, output_path):
+    """
+    Extract region from image using OpenCV with generous padding.
+    bbox: (x, y, w, h)
+    Returns: (success: bool, error: str)
+    """
+    try:
+        x, y, w, h = bbox
+
+        # Add generous padding (50% of box size or minimum 50 pixels)
+        # This ensures we capture the full signature even if VLM bbox is slightly off
+        padding_x = max(50, int(w * 0.5))  # 50% padding on sides
+        padding_y = max(50, int(h * 0.5))  # 50% padding on top/bottom
+
+        x_pad = max(0, x - padding_x)
+        y_pad = max(0, y - padding_y)
+        x_end = min(image.shape[1], x + w + padding_x)
+        y_end = min(image.shape[0], y + h + padding_y)
+
+        w_pad = x_end - x_pad
+        h_pad = y_end - y_pad
+
+        # Extract region
+        region = image[y_pad:y_pad + h_pad, x_pad:x_pad + w_pad]
+
+        # Save
+        output_file = f"{output_path}.png"
+        cv2.imwrite(output_file, region)
+
+        return True, None, output_file
+
+    except Exception as e:
+        return False, str(e), None
+
+
+def verify_signature_with_vlm(image_path):
+    """
+    Verify that extracted region contains a signature with VLM.
+    Returns: (is_signature: bool, error: str)
+    """
+    try:
+        # Read image
+        image = cv2.imread(image_path)
+
+        # Encode to base64
+        image_base64 = encode_image_to_base64(image)
+
+        # Ask VLM
+        prompt = "Is this a signature with a Chinese name? Answer only 'yes' or 'no'."
+        response, error = call_ollama_vision(image_base64, prompt)
+
+        if error:
+            return False, error
+
+        # Check if response contains 'yes'
+        is_signature = 'yes' in response.lower()
+
+        return is_signature, None
+
+    except Exception as e:
+        return False, str(e)
+
+
+def process_pdf_page(pdf_path, output_dir):
+    """
+    Process a single PDF page to extract signatures using VLM.
+
+    Workflow:
+    1. VLM locates signatures
+    2. Check if PDF has image objects at those locations
+    3. Extract via PDF object or OpenCV cropping
+    4. VLM verifies extracted regions
+
+    Returns: (signature_count, extracted_files, error)
+    """
+    pdf_name = Path(pdf_path).stem
+
+    # Step 1: Render page as image
+    print("  - Rendering page...", end='', flush=True)
+    image, page_width, page_height, error = render_pdf_page_as_image(pdf_path, DPI)
+
+    if error:
+        print(f" ERROR")
+        return 0, [], f"Render error: {error}"
+
+    print(" OK")
+
+    # Step 2: Encode image and ask VLM to locate signatures
+    print("  - Asking VLM to locate signatures...", end='', flush=True)
+    image_base64 = encode_image_to_base64(image)
+
+    location_prompt = """Please analyze this document page and locate ONLY handwritten signatures with Chinese names.
+
+IMPORTANT: Only mark areas with ACTUAL handwritten pen/ink signatures.
+Do NOT mark:
+- Printed text or typed names
+- Dates or reference numbers
+- Form field labels or instructions
+- Underlines or signature lines (empty boxes)
+- Stamps or seals
+
+Look for actual handwritten Chinese characters that are signatures.
+
+For each HANDWRITTEN signature found, provide the location as percentages from the top-left corner:
+- Distance from left edge (% of page width)
+- Distance from top edge (% of page height)
+- Width (% of page width)
+- Height (% of page height)
+
+Format your response as:
+Signature 1: left=X%, top=Y%, width=W%, height=H%
+Signature 2: left=X%, top=Y%, width=W%, height=H%
+
+If no handwritten signatures found, say "No signatures found"."""
+
+    response, error = call_ollama_vision(image_base64, location_prompt)
+
+    if error:
+        print(f" ERROR")
+        return 0, [], f"VLM error: {error}"
+
+    print(" OK")
+    print(f"  - VLM Response:\n{response}")
+
+    # Step 3: Parse locations (this needs to be implemented based on actual VLM responses)
+    locations = parse_vlm_location_response(response, page_width, page_height)
+
+    if not locations:
+        print("  - No signatures located by VLM")
+        return 0, [], None
+
+    # Step 4: Extract each located signature
+    extracted_files = []
+
+    for idx, bbox in enumerate(locations):
+        print(f"  - Extracting signature {idx + 1}...", end='', flush=True)
+
+        # Check if PDF has image object
+        has_image, xref = check_pdf_has_image_at_location(pdf_path, bbox)
+
+        output_base = os.path.join(output_dir, f"{pdf_name}_signature_{idx + 1}")
+
+        if has_image and xref:
+            # Extract PDF image object
+            success, error, output_file = extract_pdf_image_object(pdf_path, xref, output_base)
+        else:
+            # Extract with OpenCV
+            success, error, output_file = extract_region_with_opencv(image, bbox, output_base)
+
+        if not success:
+            print(f" FAILED: {error}")
+            continue
+
+        print(f" OK")
+
+        # Step 5: Verify with VLM
+        print(f"  - Verifying signature {idx + 1}...", end='', flush=True)
+        is_signature, verify_error = verify_signature_with_vlm(output_file)
+
+        if verify_error:
+            print(f" ERROR: {verify_error}")
+            continue
+
+        if is_signature:
+            print(" VERIFIED")
+            extracted_files.append(output_file)
+        else:
+            print(" NOT A SIGNATURE - moved to rejected/")
+            # Move to rejected folder instead of deleting
+            rejected_file = os.path.join(REJECTED_PATH, os.path.basename(output_file))
+            os.rename(output_file, rejected_file)
+
+    return len(extracted_files), extracted_files, None
+
+
+def main():
+    """Main processing function"""
+    global LOG_FILE
+
+    print(f"Starting VLM-guided signature extraction...")
+    print(f"Ollama URL: {OLLAMA_URL}")
+    print(f"Model: {OLLAMA_MODEL}")
+    print(f"Input path: {PDF_INPUT_PATH}")
+    print(f"Output path: {OUTPUT_PATH}")
+    print()
+
+    # Test Ollama connection
+    print("Testing Ollama connection...")
+    try:
+        response = requests.get(f"{OLLAMA_URL}/api/tags", timeout=5)
+        response.raise_for_status()
+        print("✓ Ollama connection successful\n")
+    except Exception as e:
+        print(f"✗ Ollama connection failed: {e}")
+        print(f"Please check that Ollama is running at {OLLAMA_URL}")
+        return
+
+    # Create output directories
+    os.makedirs(OUTPUT_PATH, exist_ok=True)
+    os.makedirs(REJECTED_PATH, exist_ok=True)
+
+    LOG_FILE = os.path.join(OUTPUT_PATH, f"vlm_extraction_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")
+
+    # Get PDF files
+    pdf_files = sorted(Path(PDF_INPUT_PATH).glob("*.pdf"))[:5]  # Test with first 5 files
+
+    if not pdf_files:
+        print("ERROR: No PDF files found!")
+        return
+
+    print(f"Found {len(pdf_files)} PDF files to process (testing with first 5)\n")
+
+    # Statistics
+    stats = {
+        'total_pdfs': 0,
+        'pdfs_with_signatures': 0,
+        'total_signatures': 0,
+        'errors': 0
+    }
+
+    # Open log file
+    with open(LOG_FILE, 'w', newline='') as log_file:
+        log_writer = csv.writer(log_file)
+        log_writer.writerow([
+            'pdf_filename', 'signatures_found', 'extracted_files', 'error'
+        ])
+
+        # Process each PDF
+        for i, pdf_path in enumerate(pdf_files):
+            stats['total_pdfs'] += 1
+            pdf_filename = pdf_path.name
+
+            print(f"[{i+1}/{len(pdf_files)}] Processing: {pdf_filename}")
+
+            # Extract signatures
+            sig_count, extracted_files, error = process_pdf_page(str(pdf_path), OUTPUT_PATH)
+
+            if error:
+                print(f"  ERROR: {error}\n")
+                stats['errors'] += 1
+                log_writer.writerow([pdf_filename, 0, "", error])
+                continue
+
+            if sig_count > 0:
+                stats['pdfs_with_signatures'] += 1
+                stats['total_signatures'] += sig_count
+                print(f"  ✓ Extracted {sig_count} signature(s)\n")
+
+                filenames = [Path(f).name for f in extracted_files]
+                log_writer.writerow([
+                    pdf_filename,
+                    sig_count,
+                    ", ".join(filenames),
+                    ""
+                ])
+            else:
+                print(f"  No signatures extracted\n")
+                log_writer.writerow([pdf_filename, 0, "", ""])
+
+    # Print summary
+    print("="*60)
+    print("VLM EXTRACTION SUMMARY")
+    print("="*60)
+    print(f"Total PDFs processed:        {stats['total_pdfs']}")
+    print(f"PDFs with signatures:        {stats['pdfs_with_signatures']}")
+    print(f"Total signatures extracted:  {stats['total_signatures']}")
+    print(f"Errors:                      {stats['errors']}")
+    print(f"\nLog file: {LOG_FILE}")
+    print("="*60)
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except KeyboardInterrupt:
+        print("\n\nProcess interrupted by user.")
+        sys.exit(1)
+    except Exception as e:
+        print(f"\n\nFATAL ERROR: {e}")
+        import traceback
+        traceback.print_exc()
+        sys.exit(1)