#!/usr/bin/env python3
"""
Script to extract PDF pages specified in master_signatures.csv.
Simply extracts the pages listed in the CSV without any image detection.
"""

import csv
import os
import sys
import traceback
from datetime import datetime
from itertools import islice
from pathlib import Path

import fitz  # PyMuPDF

# Configuration
CSV_PATH = "/Volumes/NV2/PDF-Processing/master_signatures.csv"
PDF_BASE_PATH = "/Volumes/NV2/PDF-Processing/total-pdf"
OUTPUT_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output"
# The timestamp is taken once, at import time, so a run writes a single log.
LOG_FILE = os.path.join(
    OUTPUT_PATH,
    f"page_extraction_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
)
TEST_LIMIT = 100  # Maximum number of CSV rows processed per run


def find_pdf_file(filename):
    """
    Search for a PDF file in the batch directories.

    Every directory matching ``batch_*`` directly under PDF_BASE_PATH is
    checked in sorted order and the first match wins.

    Args:
        filename: File name to look for inside each batch directory.

    Returns:
        The full path as a string if found, None otherwise (including when
        ``filename`` is empty or None).
    """
    # An empty name would make ``batch_dir / filename`` resolve to the batch
    # directory itself, and None cannot be joined to a path at all.
    if not filename:
        return None

    for batch_dir in sorted(Path(PDF_BASE_PATH).glob("batch_*")):
        pdf_path = batch_dir / filename
        # is_file() rather than exists(): a directory is never a usable PDF.
        if pdf_path.is_file():
            return str(pdf_path)

    return None


def export_page(pdf_path, page_number, output_filename):
    """
    Export a specific page from a PDF to the output directory.

    The page is copied into a new single-page PDF that is saved as
    ``output_filename`` inside OUTPUT_PATH.

    Args:
        pdf_path: Path of the source PDF.
        page_number: 1-based number of the page to export.
        output_filename: File name of the PDF to create in OUTPUT_PATH.

    Returns:
        (success: bool, error: str or None). Failures are reported through
        the return value; exceptions are not propagated to the caller.
    """
    try:
        # Context managers close both documents on every path, including
        # when insert_pdf() or save() raises.
        with fitz.open(pdf_path) as doc:
            # Read the page count while the document is still open; it is
            # needed both for validation and for the error message.
            page_count = len(doc)
            if page_number < 1 or page_number > page_count:
                return False, f"Invalid page number: {page_number} (PDF has {page_count} pages)"

            # Create a new PDF with just this page (insert_pdf is 0-indexed)
            with fitz.open() as output_doc:
                output_doc.insert_pdf(doc, from_page=page_number - 1, to_page=page_number - 1)
                output_doc.save(os.path.join(OUTPUT_PATH, output_filename))

        return True, None

    except Exception as e:
        # Best-effort per-file handling: one bad PDF must not stop the run.
        # Fall back to the exception type when the message is empty so the
        # log never records a failure with a blank reason.
        return False, str(e) or type(e).__name__


def _parse_page(raw_page):
    """Return the CSV page value as an int, or None if it is not an integer."""
    try:
        return int(raw_page)
    except (TypeError, ValueError):
        return None


def main():
    """
    Main processing function.

    Reads up to TEST_LIMIT rows from CSV_PATH, exports the requested page of
    each referenced PDF into OUTPUT_PATH, writes one result row per input
    row to LOG_FILE and prints a summary.

    Raises:
        KeyError: If the CSV lacks one of the columns ``source_folder``,
            ``source_subfolder``, ``filename`` or ``page``.
        OSError: If the CSV cannot be read or the log cannot be written.
    """
    print("Starting PDF page extraction...")
    print(f"CSV file: {CSV_PATH}")
    print(f"PDF base path: {PDF_BASE_PATH}")
    print(f"Output path: {OUTPUT_PATH}")
    print(f"Test limit: {TEST_LIMIT} files\n")

    # Ensure output directory exists
    os.makedirs(OUTPUT_PATH, exist_ok=True)

    # Statistics
    stats = {
        'total_processed': 0,
        'pdf_found': 0,
        'pdf_not_found': 0,
        'exported': 0,
        'errors': 0,
    }

    # Open log file for writing
    with open(LOG_FILE, 'w', newline='', encoding='utf-8') as log_file:
        log_writer = csv.writer(log_file)
        log_writer.writerow([
            'source_folder', 'source_subfolder', 'filename', 'page',
            'pdf_found', 'exported', 'error_message',
        ])

        # Read and process CSV. newline='' lets the csv module handle line
        # endings itself; utf-8-sig drops a leading BOM that would otherwise
        # become part of the first column name.
        with open(CSV_PATH, 'r', newline='', encoding='utf-8-sig') as csv_file:
            csv_reader = csv.DictReader(csv_file)

            for i, row in enumerate(islice(csv_reader, TEST_LIMIT), start=1):
                stats['total_processed'] += 1

                source_folder = row['source_folder']
                source_subfolder = row['source_subfolder']
                filename = row['filename']
                raw_page = row['page']

                # A malformed page value is reported for this row only; it
                # must not abort the whole run.
                page = _parse_page(raw_page)
                shown_page = raw_page if page is None else page

                print(f"[{i}/{TEST_LIMIT}] Processing: {filename}, page {shown_page}... ", end='', flush=True)

                # Find the PDF file
                pdf_path = find_pdf_file(filename)

                if pdf_path is None:
                    print("NOT FOUND")
                    stats['pdf_not_found'] += 1
                    log_writer.writerow([
                        source_folder, source_subfolder, filename, shown_page,
                        False, False, "PDF file not found",
                    ])
                    continue

                stats['pdf_found'] += 1

                # Export the page
                if page is None:
                    success, error = False, f"Invalid page value: {raw_page!r}"
                else:
                    output_filename = f"{Path(filename).stem}_page{page}.pdf"
                    success, error = export_page(pdf_path, page, output_filename)

                if success:
                    print("EXPORTED")
                    stats['exported'] += 1
                    log_writer.writerow([
                        source_folder, source_subfolder, filename, shown_page,
                        True, True, None,
                    ])
                else:
                    print(f"ERROR: {error}")
                    stats['errors'] += 1
                    log_writer.writerow([
                        source_folder, source_subfolder, filename, shown_page,
                        True, False, error,
                    ])

    # Print summary
    print("\n" + "="*60)
    print("PROCESSING SUMMARY")
    print("="*60)
    print(f"Total processed: {stats['total_processed']}")
    print(f"PDFs found: {stats['pdf_found']}")
    print(f"PDFs not found: {stats['pdf_not_found']}")
    print(f"Successfully exported: {stats['exported']}")
    print(f"Errors: {stats['errors']}")
    print(f"\nLog file saved to: {LOG_FILE}")
    print("="*60)


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\n\nProcess interrupted by user.")
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report the failure with its traceback and exit
        # with a non-zero status.
        print(f"\n\nFATAL ERROR: {e}")
        traceback.print_exc()
        sys.exit(1)