# yolo-label/scripts/extract_training_images.py

#!/usr/bin/env python3
"""
從 master_signatures.csv 隨機抽取 500 筆記錄，
將對應的 PDF 頁面轉換為 PNG 圖片，供 YOLO 訓練標注使用。

基於 extract_pages_from_csv.py 的邏輯修改。
"""

import csv
import os
import random
from pathlib import Path
from datetime import datetime
import fitz  # PyMuPDF

# Configuration: input index, PDF source tree, and output locations.
CSV_PATH = "/Volumes/NV2/PDF-Processing/master_signatures.csv"
PDF_BASE_PATH = "/Volumes/NV2/PDF-Processing/total-pdf"
OUTPUT_PATH = "/Volumes/NV2/yolo-signature-detection/images"
LOG_FILE = "/Volumes/NV2/yolo-signature-detection/extraction_log.csv"

# Number of records to sample from the CSV
SAMPLE_SIZE = 500

# PNG output settings
DPI = 150  # Resolution; 150 DPI is sharp enough for annotation without making files too large


def find_pdf_file(filename):
    """
    Locate a PDF by name inside the ``batch_*`` directories of PDF_BASE_PATH.

    The batch directories are visited in sorted order and the first one
    containing ``filename`` wins.

    Returns the matching path as a string, or None when no batch
    directory contains the file.
    """
    batch_dirs = sorted(Path(PDF_BASE_PATH).glob("batch_*"))
    candidates = (folder / filename for folder in batch_dirs)
    # next() stops at the first existing candidate, mirroring an early return.
    match = next((candidate for candidate in candidates if candidate.exists()), None)
    return None if match is None else str(match)


def export_page_as_png(pdf_path, page_number, output_path):
    """
    Render a single page of a PDF to a PNG file at the module-level DPI.

    Args:
        pdf_path: Path of the PDF document to open.
        page_number: 1-based number of the page to render.
        output_path: Destination path of the PNG file.

    Returns:
        (success, error): ``(True, None)`` when the PNG was written,
        otherwise ``(False, message)``. Any exception raised while opening,
        rendering or saving is caught and reported through ``message``
        instead of propagating.
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            # Read the page count while the document is still open so the
            # error message below can report it.
            page_count = len(doc)
            if page_number < 1 or page_number > page_count:
                return False, f"Invalid page number: {page_number} (PDF has {page_count} pages)"

            # load_page() is 0-indexed, the CSV page numbers are 1-indexed.
            page = doc.load_page(page_number - 1)

            # PDF coordinates use 72 units per inch, so DPI / 72 is the zoom
            # factor needed to render at the requested resolution.
            zoom = DPI / 72
            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))

            pix.save(output_path)
            return True, None
        finally:
            # Always release the document, including when rendering or
            # saving fails part-way through.
            doc.close()

    except Exception as e:
        return False, str(e)


def load_csv_records(csv_path=None):
    """
    Load every row of a CSV file as a dictionary keyed by the header row.

    Args:
        csv_path: Path of the CSV file to read. Defaults to the module-level
            CSV_PATH when omitted or None.

    Returns:
        A list with one dict per data row, in file order.
    """
    if csv_path is None:
        csv_path = CSV_PATH
    # newline='' lets the csv module handle line endings itself, which is
    # required for quoted fields that contain embedded newlines. The explicit
    # encoding keeps non-ASCII filenames independent of the platform locale.
    with open(csv_path, 'r', encoding='utf-8', newline='') as csv_file:
        return list(csv.DictReader(csv_file))


def main():
    """
    Sample records from the CSV index and export each referenced page as PNG.

    Steps:
      1. Create OUTPUT_PATH if needed and load all rows from the CSV index.
      2. Draw up to SAMPLE_SIZE rows with a fixed random seed so repeated
         runs pick the same records.
      3. For every sampled row, locate the PDF, render the requested page to
         ``<pdf stem>_page<N>.png`` in OUTPUT_PATH, and append one row to
         LOG_FILE describing the outcome.
      4. Print a summary of the counters.

    A row whose ``page`` value is not an integer is logged and counted as an
    error (without looking up the PDF) instead of aborting the whole run.
    """
    separator = "=" * 60
    print(separator)
    print("YOLO 訓練數據提取工具")
    print(separator)
    print(f"CSV 索引檔: {CSV_PATH}")
    print(f"PDF 來源: {PDF_BASE_PATH}")
    print(f"輸出目錄: {OUTPUT_PATH}")
    print(f"提取數量: {SAMPLE_SIZE}")
    print(f"輸出 DPI: {DPI}")
    print(separator + "\n")

    # Ensure output directory exists
    os.makedirs(OUTPUT_PATH, exist_ok=True)

    # Load all records
    print("載入 CSV 索引檔...")
    all_records = load_csv_records()
    print(f"總共 {len(all_records)} 筆記錄\n")

    # Random sample
    print(f"隨機抽取 {SAMPLE_SIZE} 筆...")
    random.seed(42)  # Fixed seed so the same sample is drawn on every run
    sampled_records = random.sample(all_records, min(SAMPLE_SIZE, len(all_records)))
    total = len(sampled_records)
    print(f"已抽取 {total} 筆\n")

    # Statistics
    stats = {
        'total': total,
        'pdf_found': 0,
        'pdf_not_found': 0,
        'exported': 0,
        'errors': 0,
    }

    # Process and log. The log may contain non-ASCII filenames, so the
    # encoding is pinned rather than left to the platform default.
    with open(LOG_FILE, 'w', encoding='utf-8', newline='') as log_file:
        log_writer = csv.writer(log_file)
        log_writer.writerow([
            'index', 'filename', 'page', 'output_file',
            'pdf_found', 'exported', 'error_message'
        ])

        for index, row in enumerate(sampled_records, start=1):
            filename = row['filename']
            raw_page = row['page']

            # A malformed page value only invalidates this record; log it
            # and keep going rather than losing the rest of the batch.
            try:
                page = int(raw_page)
            except (TypeError, ValueError):
                message = f"Invalid page value: {raw_page!r}"
                print(f"[{index}/{total}] {filename} (頁 {raw_page})... 錯誤: {message}")
                stats['errors'] += 1
                log_writer.writerow([
                    index, filename, raw_page, None,
                    False, False, message
                ])
                continue

            print(f"[{index}/{total}] {filename} (頁 {page})... ", end='', flush=True)

            # Find the PDF file
            pdf_path = find_pdf_file(filename)

            if pdf_path is None:
                print("找不到 PDF")
                stats['pdf_not_found'] += 1
                log_writer.writerow([
                    index, filename, page, None,
                    False, False, "PDF file not found"
                ])
                continue

            stats['pdf_found'] += 1

            # Generate output filename
            output_filename = f"{Path(filename).stem}_page{page}.png"
            output_path = os.path.join(OUTPUT_PATH, output_filename)

            # Export as PNG
            success, error = export_page_as_png(pdf_path, page, output_path)

            if success:
                print("✓")
                stats['exported'] += 1
                log_writer.writerow([
                    index, filename, page, output_filename,
                    True, True, None
                ])
            else:
                print(f"錯誤: {error}")
                stats['errors'] += 1
                log_writer.writerow([
                    index, filename, page, None,
                    True, False, error
                ])

    # Print summary
    print("\n" + separator)
    print("提取完成!")
    print(separator)
    print(f"總共處理:     {stats['total']}")
    print(f"PDF 找到:     {stats['pdf_found']}")
    print(f"PDF 找不到:   {stats['pdf_not_found']}")
    print(f"成功輸出:     {stats['exported']}")
    print(f"錯誤:         {stats['errors']}")
    print(f"\n輸出目錄: {OUTPUT_PATH}")
    print(f"日誌檔案: {LOG_FILE}")
    print(separator)


# Script entry point: run the extraction and report failures both on the
# console and through the process exit status.
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\n\n使用者中斷")
        # 130 = 128 + SIGINT, the conventional status for a Ctrl-C abort.
        raise SystemExit(130)
    except Exception as e:
        print(f"\n\n錯誤: {e}")
        # Imported here because it is only needed on the failure path.
        import traceback
        traceback.print_exc()
        # Exit non-zero so calling shells/schedulers can detect the failure.
        raise SystemExit(1)