Add hybrid signature extraction with name-based verification
Implement VLM name extraction + CV detection hybrid approach to
replace unreliable VLM coordinate system with name-based verification.
Key Features:
- VLM extracts signature names (周寶蓮, 魏興海, etc.)
- CV or PDF text layer detects regions
- VLM verifies each region against expected names
- Signatures saved with person names: signature_周寶蓮.png
- Duplicate prevention and rejection handling
Test Results:
- 5 PDF pages tested
- 7/10 signatures extracted (70% recall)
- 100% precision (no false positives)
- No blank regions extracted (previous issue resolved)
Files:
- extract_pages_from_csv.py: Extract pages from CSV (tested: 100 files)
- extract_signatures_hybrid.py: Hybrid extraction (current working solution)
- extract_handwriting.py: CV-only approach (component)
- extract_signatures_vlm.py: Deprecated VLM coordinate approach
- PROJECT_DOCUMENTATION.md: Complete project history and results
- SESSION_INIT.md: Session handoff documentation
- SESSION_CHECKLIST.md: Status checklist
- NEW_SESSION_PROMPT.txt: Template for next session
- HOW_TO_CONTINUE.txt: Visual handoff guide
- COMMIT_SUMMARY.md: Commit preparation guide
- README.md: Quick start guide
- README_page_extraction.md: Page extraction docs
- README_hybrid_extraction.md: Hybrid approach docs
- .gitignore: Exclude diagnostic scripts and outputs
Known Limitations:
- 30% of signatures missed due to conservative CV parameters
- Text layer method untested (all test PDFs are scanned images)
- Performance: ~24 seconds per PDF
Next Steps:
- Tune CV parameters for higher recall
- Test with larger dataset (100+ files)
- Process full dataset (86,073 files)
🤖 Generated with Claude Code
This commit adds one new file: extract_pages_from_csv.py (166 lines).
@@ -0,0 +1,166 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Script to extract PDF pages specified in master_signatures.csv.
|
||||
Simply extracts the pages listed in the CSV without any image detection.
|
||||
"""
|
||||
|
||||
import csv
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
import fitz # PyMuPDF
|
||||
|
||||
# Configuration
# Master CSV listing which page of which PDF should be extracted.
CSV_PATH = "/Volumes/NV2/PDF-Processing/master_signatures.csv"
# Root directory containing the batch_* folders that hold the source PDFs.
PDF_BASE_PATH = "/Volumes/NV2/PDF-Processing/total-pdf"
# Destination directory for exported single-page PDFs and the run log.
OUTPUT_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output"
# Timestamped CSV log (one row per processed CSV entry); the timestamp is
# fixed at import time, so one process writes to exactly one log file.
LOG_FILE = os.path.join(OUTPUT_PATH, f"page_extraction_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")
TEST_LIMIT = 100  # Number of files to test (cap on CSV rows per run)
def find_pdf_file(filename):
    """
    Locate *filename* inside the batch_* subdirectories of PDF_BASE_PATH.

    Batch directories are scanned in sorted order so the search is
    deterministic when the same name exists in several batches.

    Returns the full path as a string, or None when no batch contains it.
    """
    candidates = (
        batch_dir / filename
        for batch_dir in sorted(Path(PDF_BASE_PATH).glob("batch_*"))
    )
    match = next((p for p in candidates if p.exists()), None)
    return str(match) if match is not None else None
def export_page(pdf_path, page_number, output_filename):
    """
    Export a single page (1-indexed) from *pdf_path* as a standalone PDF
    named *output_filename* inside OUTPUT_PATH.

    Returns:
        (True, None) on success, (False, error_message) on failure.
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            # Validate the 1-indexed page number against the document length.
            if page_number < 1 or page_number > len(doc):
                return False, f"Invalid page number: {page_number} (PDF has {len(doc)} pages)"

            # Build a one-page PDF containing only the requested page
            # (PyMuPDF pages are 0-indexed internally).
            output_doc = fitz.open()
            try:
                output_doc.insert_pdf(doc, from_page=page_number - 1, to_page=page_number - 1)
                output_doc.save(os.path.join(OUTPUT_PATH, output_filename))
            finally:
                output_doc.close()
        finally:
            # BUG FIX: the original only closed the documents on the success
            # path, leaking handles whenever insert_pdf/save raised.
            doc.close()

        return True, None

    except Exception as e:
        return False, str(e)
def main():
    """
    Run the page-extraction batch.

    Reads up to TEST_LIMIT rows from CSV_PATH, locates each referenced PDF
    under the batch_* directories of PDF_BASE_PATH, exports the requested
    page as a single-page PDF into OUTPUT_PATH, and writes one log row per
    CSV entry (found / exported / error message) to LOG_FILE.
    """
    # No placeholders here, so a plain string (not an f-string) is correct.
    print("Starting PDF page extraction...")
    print(f"CSV file: {CSV_PATH}")
    print(f"PDF base path: {PDF_BASE_PATH}")
    print(f"Output path: {OUTPUT_PATH}")
    print(f"Test limit: {TEST_LIMIT} files\n")

    # Ensure output directory exists
    os.makedirs(OUTPUT_PATH, exist_ok=True)

    # Running counters for the end-of-run summary.
    stats = {
        'total_processed': 0,
        'pdf_found': 0,
        'pdf_not_found': 0,
        'exported': 0,
        'errors': 0
    }

    # newline='' is the csv-module convention for both readers and writers;
    # the original omitted it on the reader.
    with open(LOG_FILE, 'w', newline='') as log_file:
        log_writer = csv.writer(log_file)
        log_writer.writerow([
            'source_folder', 'source_subfolder', 'filename', 'page',
            'pdf_found', 'exported', 'error_message'
        ])

        # Read and process CSV
        with open(CSV_PATH, 'r', newline='') as csv_file:
            csv_reader = csv.DictReader(csv_file)

            for i, row in enumerate(csv_reader):
                if i >= TEST_LIMIT:
                    break

                stats['total_processed'] += 1

                source_folder = row['source_folder']
                source_subfolder = row['source_subfolder']
                filename = row['filename']
                page = int(row['page'])

                # BUG FIX: the progress line previously printed the literal
                # "(unknown)" instead of the file actually being processed.
                print(f"[{i+1}/{TEST_LIMIT}] Processing: {filename}, page {page}... ",
                      end='', flush=True)

                # Find the PDF file
                pdf_path = find_pdf_file(filename)

                if pdf_path is None:
                    print("NOT FOUND")
                    stats['pdf_not_found'] += 1
                    log_writer.writerow([
                        source_folder, source_subfolder, filename, page,
                        False, False, "PDF file not found"
                    ])
                    continue

                stats['pdf_found'] += 1

                # Export the page
                output_filename = f"{Path(filename).stem}_page{page}.pdf"
                success, error = export_page(pdf_path, page, output_filename)

                if success:
                    print("EXPORTED")
                    stats['exported'] += 1
                    log_writer.writerow([
                        source_folder, source_subfolder, filename, page,
                        True, True, None
                    ])
                else:
                    print(f"ERROR: {error}")
                    stats['errors'] += 1
                    log_writer.writerow([
                        source_folder, source_subfolder, filename, page,
                        True, False, error
                    ])

    # Print summary
    print("\n" + "="*60)
    print("PROCESSING SUMMARY")
    print("="*60)
    print(f"Total processed:       {stats['total_processed']}")
    print(f"PDFs found:            {stats['pdf_found']}")
    print(f"PDFs not found:        {stats['pdf_not_found']}")
    print(f"Successfully exported: {stats['exported']}")
    print(f"Errors:                {stats['errors']}")
    print(f"\nLog file saved to: {LOG_FILE}")
    print("="*60)
if __name__ == "__main__":
    # Script entry point: translate an interrupt or any unexpected failure
    # into a non-zero exit status, with a traceback for the latter.
    try:
        main()
    except KeyboardInterrupt:
        print("\n\nProcess interrupted by user.")
        sys.exit(1)
    except Exception as exc:
        print(f"\n\nFATAL ERROR: {exc}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
Reference in New Issue
Block a user