Add hybrid signature extraction with name-based verification
Implement VLM name extraction + CV detection hybrid approach to
replace unreliable VLM coordinate system with name-based verification.
Key Features:
- VLM extracts signature names (周寶蓮, 魏興海, etc.)
- CV or PDF text layer detects regions
- VLM verifies each region against expected names
- Signatures saved with person names: signature_周寶蓮.png
- Duplicate prevention and rejection handling
Test Results:
- 5 PDF pages tested
- 7/10 signatures extracted (70% recall)
- 100% precision (no false positives)
- No blank regions extracted (previous issue resolved)
Files:
- extract_pages_from_csv.py: Extract pages from CSV (tested: 100 files)
- extract_signatures_hybrid.py: Hybrid extraction (current working solution)
- extract_handwriting.py: CV-only approach (component)
- extract_signatures_vlm.py: Deprecated VLM coordinate approach
- PROJECT_DOCUMENTATION.md: Complete project history and results
- SESSION_INIT.md: Session handoff documentation
- SESSION_CHECKLIST.md: Status checklist
- NEW_SESSION_PROMPT.txt: Template for next session
- HOW_TO_CONTINUE.txt: Visual handoff guide
- COMMIT_SUMMARY.md: Commit preparation guide
- README.md: Quick start guide
- README_page_extraction.md: Page extraction docs
- README_hybrid_extraction.md: Hybrid approach docs
- .gitignore: Exclude diagnostic scripts and outputs
Known Limitations:
- 30% of signatures missed due to conservative CV parameters
- Text layer method untested (all test PDFs are scanned images)
- Performance: ~24 seconds per PDF
Next Steps:
- Tune CV parameters for higher recall
- Test with larger dataset (100+ files)
- Process full dataset (86,073 files)
🤖 Generated with Claude Code
This commit adds one new file: extract_pages_from_csv.py (166 lines).
@@ -0,0 +1,166 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Script to extract PDF pages specified in master_signatures.csv.
|
||||
Simply extracts the pages listed in the CSV without any image detection.
|
||||
"""
|
||||
|
||||
import csv
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
import fitz # PyMuPDF
|
||||
|
||||
# Configuration
# Master CSV listing which page of which PDF should be extracted.
CSV_PATH = "/Volumes/NV2/PDF-Processing/master_signatures.csv"
# Root directory containing the batch_* folders that hold the source PDFs.
PDF_BASE_PATH = "/Volumes/NV2/PDF-Processing/total-pdf"
# Destination directory for exported single-page PDFs and the run log.
OUTPUT_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output"
# Timestamped CSV log (one row per processed CSV entry); the timestamp is
# fixed at import time, so one process writes to exactly one log file.
LOG_FILE = os.path.join(OUTPUT_PATH, f"page_extraction_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")
TEST_LIMIT = 100  # Number of files to test (cap on CSV rows per run)
def find_pdf_file(filename):
    """
    Locate *filename* inside the batch_* subdirectories of PDF_BASE_PATH.

    Batch directories are scanned in sorted order so the search is
    deterministic when the same name exists in several batches.

    Returns the full path as a string, or None when no batch contains it.
    """
    candidates = (
        batch_dir / filename
        for batch_dir in sorted(Path(PDF_BASE_PATH).glob("batch_*"))
    )
    match = next((p for p in candidates if p.exists()), None)
    return str(match) if match is not None else None
def export_page(pdf_path, page_number, output_filename):
    """
    Export a single page (1-indexed) from *pdf_path* as a standalone PDF
    named *output_filename* inside OUTPUT_PATH.

    Returns:
        (True, None) on success, (False, error_message) on failure.
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            # Validate the 1-indexed page number against the document length.
            if page_number < 1 or page_number > len(doc):
                return False, f"Invalid page number: {page_number} (PDF has {len(doc)} pages)"

            # Build a one-page PDF containing only the requested page
            # (PyMuPDF pages are 0-indexed internally).
            output_doc = fitz.open()
            try:
                output_doc.insert_pdf(doc, from_page=page_number - 1, to_page=page_number - 1)
                output_doc.save(os.path.join(OUTPUT_PATH, output_filename))
            finally:
                output_doc.close()
        finally:
            # BUG FIX: the original only closed the documents on the success
            # path, leaking handles whenever insert_pdf/save raised.
            doc.close()

        return True, None

    except Exception as e:
        return False, str(e)
def main():
    """
    Run the page-extraction batch.

    Reads up to TEST_LIMIT rows from CSV_PATH, locates each referenced PDF
    under the batch_* directories of PDF_BASE_PATH, exports the requested
    page as a single-page PDF into OUTPUT_PATH, and writes one log row per
    CSV entry (found / exported / error message) to LOG_FILE.
    """
    # No placeholders here, so a plain string (not an f-string) is correct.
    print("Starting PDF page extraction...")
    print(f"CSV file: {CSV_PATH}")
    print(f"PDF base path: {PDF_BASE_PATH}")
    print(f"Output path: {OUTPUT_PATH}")
    print(f"Test limit: {TEST_LIMIT} files\n")

    # Ensure output directory exists
    os.makedirs(OUTPUT_PATH, exist_ok=True)

    # Running counters for the end-of-run summary.
    stats = {
        'total_processed': 0,
        'pdf_found': 0,
        'pdf_not_found': 0,
        'exported': 0,
        'errors': 0
    }

    # newline='' is the csv-module convention for both readers and writers;
    # the original omitted it on the reader.
    with open(LOG_FILE, 'w', newline='') as log_file:
        log_writer = csv.writer(log_file)
        log_writer.writerow([
            'source_folder', 'source_subfolder', 'filename', 'page',
            'pdf_found', 'exported', 'error_message'
        ])

        # Read and process CSV
        with open(CSV_PATH, 'r', newline='') as csv_file:
            csv_reader = csv.DictReader(csv_file)

            for i, row in enumerate(csv_reader):
                if i >= TEST_LIMIT:
                    break

                stats['total_processed'] += 1

                source_folder = row['source_folder']
                source_subfolder = row['source_subfolder']
                filename = row['filename']
                page = int(row['page'])

                # BUG FIX: the progress line previously printed the literal
                # "(unknown)" instead of the file actually being processed.
                print(f"[{i+1}/{TEST_LIMIT}] Processing: {filename}, page {page}... ",
                      end='', flush=True)

                # Find the PDF file
                pdf_path = find_pdf_file(filename)

                if pdf_path is None:
                    print("NOT FOUND")
                    stats['pdf_not_found'] += 1
                    log_writer.writerow([
                        source_folder, source_subfolder, filename, page,
                        False, False, "PDF file not found"
                    ])
                    continue

                stats['pdf_found'] += 1

                # Export the page
                output_filename = f"{Path(filename).stem}_page{page}.pdf"
                success, error = export_page(pdf_path, page, output_filename)

                if success:
                    print("EXPORTED")
                    stats['exported'] += 1
                    log_writer.writerow([
                        source_folder, source_subfolder, filename, page,
                        True, True, None
                    ])
                else:
                    print(f"ERROR: {error}")
                    stats['errors'] += 1
                    log_writer.writerow([
                        source_folder, source_subfolder, filename, page,
                        True, False, error
                    ])

    # Print summary
    print("\n" + "="*60)
    print("PROCESSING SUMMARY")
    print("="*60)
    print(f"Total processed:       {stats['total_processed']}")
    print(f"PDFs found:            {stats['pdf_found']}")
    print(f"PDFs not found:        {stats['pdf_not_found']}")
    print(f"Successfully exported: {stats['exported']}")
    print(f"Errors:                {stats['errors']}")
    print(f"\nLog file saved to: {LOG_FILE}")
    print("="*60)
if __name__ == "__main__":
    # Script entry point: translate an interrupt or any unexpected failure
    # into a non-zero exit status, with a traceback for the latter.
    try:
        main()
    except KeyboardInterrupt:
        print("\n\nProcess interrupted by user.")
        sys.exit(1)
    except Exception as exc:
        print(f"\n\nFATAL ERROR: {exc}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
Reference in New Issue
Block a user