Files
pdf_signature_extraction/extract_pages_from_csv.py
gbanyan 52612e14ba Add hybrid signature extraction with name-based verification
Implement a hybrid VLM name extraction + CV detection approach that
replaces the unreliable VLM coordinate output with name-based verification.

Key Features:
- VLM extracts signature names (周寶蓮, 魏興海, etc.)
- CV or the PDF text layer detects candidate signature regions
- VLM verifies each region against the expected names (see the sketch after this list)
- Signatures saved with person names: signature_周寶蓮.png
- Duplicate prevention and rejection handling
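
The verification loop works roughly as follows. This is a minimal sketch: the helper names
(extract_expected_names, detect_candidate_regions, verify_region) are hypothetical stand-ins
for the VLM and CV calls, page_image is assumed to be a PIL image and output_dir a pathlib.Path;
it is not the actual extract_signatures_hybrid.py implementation.

    # Sketch only: the three helpers below are hypothetical stand-ins.
    def extract_signatures(page_image, output_dir):
        expected_names = extract_expected_names(page_image)    # VLM: e.g. ["周寶蓮", "魏興海"]
        regions = detect_candidate_regions(page_image)         # CV or PDF text layer
        saved = set()
        for region in regions:
            name = verify_region(page_image, region, expected_names)  # VLM check against expected names
            if name is None or name in saved:                  # rejection / duplicate handling
                continue
            page_image.crop(region).save(output_dir / f"signature_{name}.png")
            saved.add(name)
        return saved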

Test Results:
- 5 PDF pages tested
- 7/10 signatures extracted (70% recall)
- 100% precision (no false positives)
- No blank regions extracted (previous issue resolved)

Files:
- extract_pages_from_csv.py: Extract pages from CSV (tested: 100 files)
- extract_signatures_hybrid.py: Hybrid extraction (current working solution)
- extract_handwriting.py: CV-only approach (component)
- extract_signatures_vlm.py: Deprecated VLM coordinate approach
- PROJECT_DOCUMENTATION.md: Complete project history and results
- SESSION_INIT.md: Session handoff documentation
- SESSION_CHECKLIST.md: Status checklist
- NEW_SESSION_PROMPT.txt: Template for next session
- HOW_TO_CONTINUE.txt: Visual handoff guide
- COMMIT_SUMMARY.md: Commit preparation guide
- README.md: Quick start guide
- README_page_extraction.md: Page extraction docs
- README_hybrid_extraction.md: Hybrid approach docs
- .gitignore: Exclude diagnostic scripts and outputs

Known Limitations:
- 30% of signatures missed due to conservative CV parameters
- Text layer method untested (all test PDFs are scanned images)
- Performance: ~24 seconds per PDF

Next Steps:
- Tune CV parameters for higher recall (see the illustrative parameters below)
- Test with larger dataset (100+ files)
- Process full dataset (86,073 files)
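
For context on the parameters mentioned above, the sketch below shows the kind of tunable
region-detection settings involved. It is a hypothetical OpenCV illustration (Otsu threshold,
dilation kernel, area and ink-ratio filters), not the actual extract_handwriting.py code or
its values; loosening MIN_AREA / MAX_AREA / MIN_INK_RATIO trades precision for recall.

    import cv2

    # Hypothetical detection parameters (illustrative only).
    MIN_AREA = 2000        # minimum bounding-box area in pixels
    MAX_AREA = 200000      # maximum bounding-box area in pixels
    MIN_INK_RATIO = 0.02   # minimum fraction of dark pixels inside the box

    def detect_candidate_regions(gray):
        """Return bounding boxes of likely handwriting regions in a grayscale page image."""
        _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
        # Dilate so the strokes of one signature merge into a single component
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 9))
        merged = cv2.dilate(binary, kernel)
        contours, _ = cv2.findContours(merged, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        boxes = []
        for c in contours:
            x, y, w, h = cv2.boundingRect(c)
            area = w * h
            if area < MIN_AREA or area > MAX_AREA:
                continue
            ink = cv2.countNonZero(binary[y:y + h, x:x + w]) / float(area)
            if ink < MIN_INK_RATIO:
                continue
            boxes.append((x, y, w, h))
        return boxes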

🤖 Generated with Claude Code
2025-10-26 23:39:52 +08:00


#!/usr/bin/env python3
"""
Script to extract PDF pages specified in master_signatures.csv.
Simply extracts the pages listed in the CSV without any image detection.
"""
import csv
import os
import sys
from pathlib import Path
from datetime import datetime

import fitz  # PyMuPDF

# Configuration
CSV_PATH = "/Volumes/NV2/PDF-Processing/master_signatures.csv"
PDF_BASE_PATH = "/Volumes/NV2/PDF-Processing/total-pdf"
OUTPUT_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output"
LOG_FILE = os.path.join(OUTPUT_PATH, f"page_extraction_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")
TEST_LIMIT = 100  # Number of files to test


def find_pdf_file(filename):
    """
    Search for PDF file in batch directories.
    Returns the full path if found, None otherwise.
    """
    # Search in all batch directories
    for batch_dir in sorted(Path(PDF_BASE_PATH).glob("batch_*")):
        pdf_path = batch_dir / filename
        if pdf_path.exists():
            return str(pdf_path)
    return None


def export_page(pdf_path, page_number, output_filename):
    """
    Export a specific page from PDF to the output directory.
    Returns (success: bool, error: str)
    """
    try:
        doc = fitz.open(pdf_path)
        # Check if page number is valid (convert to 0-indexed)
        if page_number < 1 or page_number > len(doc):
            doc.close()
            return False, f"Invalid page number: {page_number} (PDF has {len(doc)} pages)"
        # Create a new PDF with just this page
        output_doc = fitz.open()
        output_doc.insert_pdf(doc, from_page=page_number - 1, to_page=page_number - 1)
        # Save to output directory
        output_path = os.path.join(OUTPUT_PATH, output_filename)
        output_doc.save(output_path)
        output_doc.close()
        doc.close()
        return True, None
    except Exception as e:
        return False, str(e)


def main():
    """Main processing function"""
    print("Starting PDF page extraction...")
    print(f"CSV file: {CSV_PATH}")
    print(f"PDF base path: {PDF_BASE_PATH}")
    print(f"Output path: {OUTPUT_PATH}")
    print(f"Test limit: {TEST_LIMIT} files\n")

    # Ensure output directory exists
    os.makedirs(OUTPUT_PATH, exist_ok=True)

    # Statistics
    stats = {
        'total_processed': 0,
        'pdf_found': 0,
        'pdf_not_found': 0,
        'exported': 0,
        'errors': 0
    }

    # Open log file for writing
    with open(LOG_FILE, 'w', newline='') as log_file:
        log_writer = csv.writer(log_file)
        log_writer.writerow([
            'source_folder', 'source_subfolder', 'filename', 'page',
            'pdf_found', 'exported', 'error_message'
        ])

        # Read and process CSV
        with open(CSV_PATH, 'r') as csv_file:
            csv_reader = csv.DictReader(csv_file)
            for i, row in enumerate(csv_reader):
                if i >= TEST_LIMIT:
                    break

                stats['total_processed'] += 1
                source_folder = row['source_folder']
                source_subfolder = row['source_subfolder']
                filename = row['filename']
                page = int(row['page'])

                print(f"[{i+1}/{TEST_LIMIT}] Processing: {filename}, page {page}... ", end='', flush=True)

                # Find the PDF file
                pdf_path = find_pdf_file(filename)
                if pdf_path is None:
                    print("NOT FOUND")
                    stats['pdf_not_found'] += 1
                    log_writer.writerow([
                        source_folder, source_subfolder, filename, page,
                        False, False, "PDF file not found"
                    ])
                    continue

                stats['pdf_found'] += 1

                # Export the page
                output_filename = f"{Path(filename).stem}_page{page}.pdf"
                success, error = export_page(pdf_path, page, output_filename)
                if success:
                    print("EXPORTED")
                    stats['exported'] += 1
                    log_writer.writerow([
                        source_folder, source_subfolder, filename, page,
                        True, True, None
                    ])
                else:
                    print(f"ERROR: {error}")
                    stats['errors'] += 1
                    log_writer.writerow([
                        source_folder, source_subfolder, filename, page,
                        True, False, error
                    ])

    # Print summary
    print("\n" + "=" * 60)
    print("PROCESSING SUMMARY")
    print("=" * 60)
    print(f"Total processed: {stats['total_processed']}")
    print(f"PDFs found: {stats['pdf_found']}")
    print(f"PDFs not found: {stats['pdf_not_found']}")
    print(f"Successfully exported: {stats['exported']}")
    print(f"Errors: {stats['errors']}")
    print(f"\nLog file saved to: {LOG_FILE}")
    print("=" * 60)


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\n\nProcess interrupted by user.")
        sys.exit(1)
    except Exception as e:
        print(f"\n\nFATAL ERROR: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)