Implement a hybrid approach (VLM name extraction + CV region detection) that
replaces the unreliable VLM coordinate system with name-based verification.
Key Features:
- VLM extracts signature names (周寶蓮, 魏興海, etc.)
- CV or PDF text layer detects regions
- VLM verifies each region against expected names
- Signatures saved with person names: signature_周寶蓮.png
- Duplicate prevention and rejection handling
Test Results:
- 5 PDF pages tested
- 7/10 signatures extracted (70% recall)
- 100% precision (no false positives)
- No blank regions extracted (previous issue resolved)
Files:
- extract_pages_from_csv.py: Extract the PDF pages listed in master_signatures.csv (tested: 100 files)
- extract_signatures_hybrid.py: Hybrid extraction (current working solution)
- extract_handwriting.py: CV-only approach (component)
- extract_signatures_vlm.py: Deprecated VLM coordinate approach
- PROJECT_DOCUMENTATION.md: Complete project history and results
- SESSION_INIT.md: Session handoff documentation
- SESSION_CHECKLIST.md: Status checklist
- NEW_SESSION_PROMPT.txt: Template for next session
- HOW_TO_CONTINUE.txt: Visual handoff guide
- COMMIT_SUMMARY.md: Commit preparation guide
- README.md: Quick start guide
- README_page_extraction.md: Page extraction docs
- README_hybrid_extraction.md: Hybrid approach docs
- .gitignore: Exclude diagnostic scripts and outputs
Known Limitations:
- 30% of signatures missed due to conservative CV parameters
- Text layer method untested (all test PDFs are scanned images)
- Performance: ~24 seconds per PDF
Next Steps:
- Tune CV parameters for higher recall
- Test with larger dataset (100+ files)
- Process full dataset (86,073 files)
🤖 Generated with Claude Code
167 lines
5.2 KiB
Python
167 lines
5.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Script to extract PDF pages specified in master_signatures.csv.
|
|
Simply extracts the pages listed in the CSV without any image detection.
|
|
"""
|
|
|
|
import csv
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
import fitz # PyMuPDF
|
|
|
|
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# CSV listing which page of which PDF should be extracted.
CSV_PATH = '/Volumes/NV2/PDF-Processing/master_signatures.csv'

# Directory that is searched for "batch_*" subdirectories holding the PDFs.
PDF_BASE_PATH = '/Volumes/NV2/PDF-Processing/total-pdf'

# Directory that receives the extracted single-page PDFs and the run log.
OUTPUT_PATH = '/Volumes/NV2/PDF-Processing/signature-image-output'

# Per-run log file; the timestamp is taken once, when the module is loaded.
LOG_FILE = os.path.join(
    OUTPUT_PATH,
    datetime.now().strftime('page_extraction_log_%Y%m%d_%H%M%S.csv'),
)

# Maximum number of CSV rows handled in one run.
TEST_LIMIT = 100
|
|
|
|
|
|
def find_pdf_file(filename, base_path=None):
    """
    Search the batch directories for a PDF file.

    Args:
        filename: Name of the PDF to look for inside each ``batch_*``
            directory.
        base_path: Directory containing the ``batch_*`` subdirectories.
            Defaults to the module-level ``PDF_BASE_PATH``.

    Returns:
        The full path (as a string) of the first match, scanning the batch
        directories in sorted order, or None when no batch holds the file.
    """
    if not filename:
        # An empty name would resolve to the batch directory itself.
        return None

    if base_path is None:
        base_path = PDF_BASE_PATH

    for batch_dir in sorted(Path(base_path).glob("batch_*")):
        candidate = batch_dir / filename
        # is_file() rather than exists(): a directory that happens to share
        # the name is not a PDF that can be opened.
        if candidate.is_file():
            return str(candidate)

    return None
|
|
|
|
|
|
def export_page(pdf_path, page_number, output_filename):
    """
    Export one page of a PDF into OUTPUT_PATH as a single-page PDF.

    Args:
        pdf_path: Path of the source PDF.
        page_number: 1-based number of the page to export.
        output_filename: File name to write inside OUTPUT_PATH.

    Returns:
        (success: bool, error: str) -- ``(True, None)`` on success,
        otherwise ``(False, message)``.  Any exception raised while
        opening, copying or saving is reported through ``message``.
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            # Read the page count while the document is still open so it
            # can be used in the error message below.
            page_count = len(doc)
            if page_number < 1 or page_number > page_count:
                return False, f"Invalid page number: {page_number} (PDF has {page_count} pages)"

            # Create a new PDF holding just this page (insert_pdf takes
            # 0-based page indices).
            output_doc = fitz.open()
            try:
                page_index = page_number - 1
                output_doc.insert_pdf(doc, from_page=page_index, to_page=page_index)
                output_doc.save(os.path.join(OUTPUT_PATH, output_filename))
            finally:
                output_doc.close()
        finally:
            # Runs on every path, including early return and exceptions.
            doc.close()

        return True, None

    except Exception as e:
        return False, str(e)
|
|
|
|
|
|
def main():
    """
    Extract the pages listed in the CSV and log the outcome of every row.

    Reads up to TEST_LIMIT rows from CSV_PATH (columns ``source_folder``,
    ``source_subfolder``, ``filename``, ``page``), locates each PDF with
    find_pdf_file(), exports the requested page with export_page(), and
    writes one result row per input row to LOG_FILE.  A summary is printed
    at the end.

    A row whose ``page`` value is not an integer is logged as an error for
    that row; it does not abort the run.
    """
    print("Starting PDF page extraction...")
    print(f"CSV file: {CSV_PATH}")
    print(f"PDF base path: {PDF_BASE_PATH}")
    print(f"Output path: {OUTPUT_PATH}")
    print(f"Test limit: {TEST_LIMIT} files\n")

    # Ensure output directory exists (the log file lives there too)
    os.makedirs(OUTPUT_PATH, exist_ok=True)

    # Statistics
    stats = {
        'total_processed': 0,
        'pdf_found': 0,
        'pdf_not_found': 0,
        'exported': 0,
        'errors': 0,
    }

    # Open log file for writing
    with open(LOG_FILE, 'w', newline='', encoding='utf-8') as log_file:
        log_writer = csv.writer(log_file)
        log_writer.writerow([
            'source_folder', 'source_subfolder', 'filename', 'page',
            'pdf_found', 'exported', 'error_message'
        ])

        # Read and process CSV.  newline='' is what the csv module expects;
        # utf-8-sig also accepts plain UTF-8 and strips a leading BOM that
        # would otherwise corrupt the first header name.
        with open(CSV_PATH, 'r', newline='', encoding='utf-8-sig') as csv_file:
            csv_reader = csv.DictReader(csv_file)

            for i, row in enumerate(csv_reader):
                if i >= TEST_LIMIT:
                    break

                stats['total_processed'] += 1

                source_folder = row['source_folder']
                source_subfolder = row['source_subfolder']
                filename = row['filename']

                # A malformed page cell must only fail this row.
                raw_page = row['page']
                try:
                    page = int(raw_page)
                except (TypeError, ValueError):
                    page = None
                page_label = raw_page if page is None else page

                print(f"[{i+1}/{TEST_LIMIT}] Processing: {filename}, page {page_label}... ", end='', flush=True)

                # Find the PDF file
                pdf_path = find_pdf_file(filename)

                if pdf_path is None:
                    print("NOT FOUND")
                    stats['pdf_not_found'] += 1
                    log_writer.writerow([
                        source_folder, source_subfolder, filename, page_label,
                        False, False, "PDF file not found"
                    ])
                    continue

                stats['pdf_found'] += 1

                # Export the page
                if page is None:
                    success, error = False, f"Invalid page value: {raw_page!r}"
                else:
                    output_filename = f"{Path(filename).stem}_page{page}.pdf"
                    success, error = export_page(pdf_path, page, output_filename)

                if success:
                    print("EXPORTED")
                    stats['exported'] += 1
                    log_writer.writerow([
                        source_folder, source_subfolder, filename, page_label,
                        True, True, None
                    ])
                else:
                    print(f"ERROR: {error}")
                    stats['errors'] += 1
                    log_writer.writerow([
                        source_folder, source_subfolder, filename, page_label,
                        True, False, error
                    ])

    # Print summary
    print("\n" + "="*60)
    print("PROCESSING SUMMARY")
    print("="*60)
    print(f"Total processed: {stats['total_processed']}")
    print(f"PDFs found: {stats['pdf_found']}")
    print(f"PDFs not found: {stats['pdf_not_found']}")
    print(f"Successfully exported: {stats['exported']}")
    print(f"Errors: {stats['errors']}")
    print(f"\nLog file saved to: {LOG_FILE}")
    print("="*60)
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the extraction and turn any failure into a
    # non-zero exit status.
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl-C: report it and stop without a traceback.
        print("\n\nProcess interrupted by user.")
        raise SystemExit(1)
    except Exception as exc:
        # Anything else is unexpected: show the message and full traceback.
        import traceback

        print(f"\n\nFATAL ERROR: {exc}")
        traceback.print_exc()
        raise SystemExit(1)
|