Files
pdf_signature_extraction/extract_pages_from_csv.py
gbanyan 52612e14ba Add hybrid signature extraction with name-based verification
Implement a hybrid VLM name extraction + CV detection approach that
replaces the unreliable VLM coordinate output with name-based verification.

Key Features:
- VLM extracts signature names (周寶蓮, 魏興海, etc.)
- CV or the PDF text layer detects candidate signature regions
- VLM verifies each region against the expected names (see the sketch after this list)
- Signatures saved with person names: signature_周寶蓮.png
- Duplicate prevention and rejection handling
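
The verification loop works roughly as follows. This is a minimal sketch: the helper names
(extract_expected_names, detect_candidate_regions, verify_region) are hypothetical stand-ins
for the VLM and CV calls, page_image is assumed to be a PIL image and output_dir a pathlib.Path;
it is not the actual extract_signatures_hybrid.py implementation.

    # Sketch only: the three helpers below are hypothetical stand-ins.
    def extract_signatures(page_image, output_dir):
        expected_names = extract_expected_names(page_image)    # VLM: e.g. ["周寶蓮", "魏興海"]
        regions = detect_candidate_regions(page_image)         # CV or PDF text layer
        saved = set()
        for region in regions:
            name = verify_region(page_image, region, expected_names)  # VLM check against expected names
            if name is None or name in saved:                  # rejection / duplicate handling
                continue
            page_image.crop(region).save(output_dir / f"signature_{name}.png")
            saved.add(name)
        return saved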

Test Results:
- 5 PDF pages tested
- 7/10 signatures extracted (70% recall)
- 100% precision (no false positives)
- No blank regions extracted (previous issue resolved)

Files:
- extract_pages_from_csv.py: Extract pages from CSV (tested: 100 files)
- extract_signatures_hybrid.py: Hybrid extraction (current working solution)
- extract_handwriting.py: CV-only approach (component)
- extract_signatures_vlm.py: Deprecated VLM coordinate approach
- PROJECT_DOCUMENTATION.md: Complete project history and results
- SESSION_INIT.md: Session handoff documentation
- SESSION_CHECKLIST.md: Status checklist
- NEW_SESSION_PROMPT.txt: Template for next session
- HOW_TO_CONTINUE.txt: Visual handoff guide
- COMMIT_SUMMARY.md: Commit preparation guide
- README.md: Quick start guide
- README_page_extraction.md: Page extraction docs
- README_hybrid_extraction.md: Hybrid approach docs
- .gitignore: Exclude diagnostic scripts and outputs

Known Limitations:
- 30% of signatures missed due to conservative CV parameters
- Text layer method untested (all test PDFs are scanned images)
- Performance: ~24 seconds per PDF

Next Steps:
- Tune CV parameters for higher recall (see the illustrative parameters below)
- Test with larger dataset (100+ files)
- Process full dataset (86,073 files)
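
For context on the parameters mentioned above, the sketch below shows the kind of tunable
region-detection settings involved. It is a hypothetical OpenCV illustration (Otsu threshold,
dilation kernel, area and ink-ratio filters), not the actual extract_handwriting.py code or
its values; loosening MIN_AREA / MAX_AREA / MIN_INK_RATIO trades precision for recall.

    import cv2

    # Hypothetical detection parameters (illustrative only).
    MIN_AREA = 2000        # minimum bounding-box area in pixels
    MAX_AREA = 200000      # maximum bounding-box area in pixels
    MIN_INK_RATIO = 0.02   # minimum fraction of dark pixels inside the box

    def detect_candidate_regions(gray):
        """Return bounding boxes of likely handwriting regions in a grayscale page image."""
        _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
        # Dilate so the strokes of one signature merge into a single component
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 9))
        merged = cv2.dilate(binary, kernel)
        contours, _ = cv2.findContours(merged, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        boxes = []
        for c in contours:
            x, y, w, h = cv2.boundingRect(c)
            area = w * h
            if area < MIN_AREA or area > MAX_AREA:
                continue
            ink = cv2.countNonZero(binary[y:y + h, x:x + w]) / float(area)
            if ink < MIN_INK_RATIO:
                continue
            boxes.append((x, y, w, h))
        return boxes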

🤖 Generated with Claude Code
2025-10-26 23:39:52 +08:00


#!/usr/bin/env python3
"""
Script to extract PDF pages specified in master_signatures.csv.
Simply extracts the pages listed in the CSV without any image detection.
"""
import csv
import os
import sys
from pathlib import Path
from datetime import datetime

import fitz  # PyMuPDF

# Configuration
CSV_PATH = "/Volumes/NV2/PDF-Processing/master_signatures.csv"
PDF_BASE_PATH = "/Volumes/NV2/PDF-Processing/total-pdf"
OUTPUT_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output"
LOG_FILE = os.path.join(OUTPUT_PATH, f"page_extraction_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")
TEST_LIMIT = 100  # Number of files to test


def find_pdf_file(filename):
    """
    Search for PDF file in batch directories.
    Returns the full path if found, None otherwise.
    """
    # Search in all batch directories
    for batch_dir in sorted(Path(PDF_BASE_PATH).glob("batch_*")):
        pdf_path = batch_dir / filename
        if pdf_path.exists():
            return str(pdf_path)
    return None


def export_page(pdf_path, page_number, output_filename):
    """
    Export a specific page from PDF to the output directory.
    Returns (success: bool, error: str)
    """
    try:
        doc = fitz.open(pdf_path)
        # Check if page number is valid (convert to 0-indexed)
        if page_number < 1 or page_number > len(doc):
            doc.close()
            return False, f"Invalid page number: {page_number} (PDF has {len(doc)} pages)"
        # Create a new PDF with just this page
        output_doc = fitz.open()
        output_doc.insert_pdf(doc, from_page=page_number - 1, to_page=page_number - 1)
        # Save to output directory
        output_path = os.path.join(OUTPUT_PATH, output_filename)
        output_doc.save(output_path)
        output_doc.close()
        doc.close()
        return True, None
    except Exception as e:
        return False, str(e)


def main():
    """Main processing function"""
    print("Starting PDF page extraction...")
    print(f"CSV file: {CSV_PATH}")
    print(f"PDF base path: {PDF_BASE_PATH}")
    print(f"Output path: {OUTPUT_PATH}")
    print(f"Test limit: {TEST_LIMIT} files\n")

    # Ensure output directory exists
    os.makedirs(OUTPUT_PATH, exist_ok=True)

    # Statistics
    stats = {
        'total_processed': 0,
        'pdf_found': 0,
        'pdf_not_found': 0,
        'exported': 0,
        'errors': 0
    }

    # Open log file for writing
    with open(LOG_FILE, 'w', newline='') as log_file:
        log_writer = csv.writer(log_file)
        log_writer.writerow([
            'source_folder', 'source_subfolder', 'filename', 'page',
            'pdf_found', 'exported', 'error_message'
        ])

        # Read and process CSV
        with open(CSV_PATH, 'r') as csv_file:
            csv_reader = csv.DictReader(csv_file)
            for i, row in enumerate(csv_reader):
                if i >= TEST_LIMIT:
                    break

                stats['total_processed'] += 1
                source_folder = row['source_folder']
                source_subfolder = row['source_subfolder']
                filename = row['filename']
                page = int(row['page'])

                print(f"[{i+1}/{TEST_LIMIT}] Processing: {filename}, page {page}... ", end='', flush=True)

                # Find the PDF file
                pdf_path = find_pdf_file(filename)
                if pdf_path is None:
                    print("NOT FOUND")
                    stats['pdf_not_found'] += 1
                    log_writer.writerow([
                        source_folder, source_subfolder, filename, page,
                        False, False, "PDF file not found"
                    ])
                    continue

                stats['pdf_found'] += 1

                # Export the page
                output_filename = f"{Path(filename).stem}_page{page}.pdf"
                success, error = export_page(pdf_path, page, output_filename)
                if success:
                    print("EXPORTED")
                    stats['exported'] += 1
                    log_writer.writerow([
                        source_folder, source_subfolder, filename, page,
                        True, True, None
                    ])
                else:
                    print(f"ERROR: {error}")
                    stats['errors'] += 1
                    log_writer.writerow([
                        source_folder, source_subfolder, filename, page,
                        True, False, error
                    ])

    # Print summary
    print("\n" + "=" * 60)
    print("PROCESSING SUMMARY")
    print("=" * 60)
    print(f"Total processed: {stats['total_processed']}")
    print(f"PDFs found: {stats['pdf_found']}")
    print(f"PDFs not found: {stats['pdf_not_found']}")
    print(f"Successfully exported: {stats['exported']}")
    print(f"Errors: {stats['errors']}")
    print(f"\nLog file saved to: {LOG_FILE}")
    print("=" * 60)


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\n\nProcess interrupted by user.")
        sys.exit(1)
    except Exception as e:
        print(f"\n\nFATAL ERROR: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)