#!/usr/bin/env python3
"""
Randomly sample records from master_signatures.csv and render the
corresponding PDF pages to PNG images for YOLO training annotation.

Adapted from the logic in extract_pages_from_csv.py.
"""

import csv
import os
import random
from pathlib import Path
from datetime import datetime

try:
    import fitz  # PyMuPDF
except ImportError:
    # Keep the stdlib-only helpers (CSV loading, PDF lookup, sampling)
    # importable without PyMuPDF. main() refuses to run when it is missing.
    fitz = None

# Configuration
CSV_PATH = "/Volumes/NV2/PDF-Processing/master_signatures.csv"
PDF_BASE_PATH = "/Volumes/NV2/PDF-Processing/total-pdf"
OUTPUT_PATH = "/Volumes/NV2/yolo-signature-detection/images"
LOG_FILE = "/Volumes/NV2/yolo-signature-detection/extraction_log.csv"

# Number of records to extract
SAMPLE_SIZE = 500

# Fixed seed so the same sample is drawn on every run
SAMPLE_SEED = 42

# PNG output settings
DPI = 150  # sharp enough for annotation without producing oversized files

# Columns every CSV record must provide
REQUIRED_COLUMNS = ("filename", "page")


def list_batch_dirs(base_path=None):
    """
    Return the ``batch_*`` entries directly under *base_path*, sorted by name.

    *base_path* defaults to PDF_BASE_PATH (read at call time).
    """
    if base_path is None:
        base_path = PDF_BASE_PATH
    return sorted(Path(base_path).glob("batch_*"))


def find_pdf_file(filename, batch_dirs=None):
    """
    Search for a PDF file in the batch directories.

    *batch_dirs* is an optional, already-ordered iterable of directories to
    search; when omitted, the ``batch_*`` directories under PDF_BASE_PATH are
    listed on every call. Callers looking up many files should list the
    directories once and pass them in.

    Returns the full path (str) of the first match, or None if not found.
    """
    if batch_dirs is None:
        batch_dirs = list_batch_dirs()
    for batch_dir in batch_dirs:
        pdf_path = Path(batch_dir) / filename
        if pdf_path.exists():
            return str(pdf_path)
    return None


def export_page_as_png(pdf_path, page_number, output_path, dpi=None):
    """
    Export one page of a PDF to a PNG image.

    *page_number* is 1-based. *dpi* defaults to the module-level DPI.

    Returns (success: bool, error: str | None). Never raises: any failure
    while opening, rendering or saving is reported through the error string.
    """
    if fitz is None:
        return False, "PyMuPDF (fitz) is not installed"
    if dpi is None:
        dpi = DPI

    try:
        # The context manager closes the document even if rendering fails.
        with fitz.open(pdf_path) as doc:
            # Read the page count while the document is still open so the
            # error message below can always be built.
            page_count = len(doc)
            if page_number < 1 or page_number > page_count:
                return False, f"Invalid page number: {page_number} (PDF has {page_count} pages)"

            # load_page() is 0-indexed
            page = doc.load_page(page_number - 1)

            # 72 is the default PDF DPI, so dpi / 72 is the zoom factor
            zoom = dpi / 72
            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
            pix.save(output_path)
        return True, None
    except Exception as e:
        # Some exceptions carry no message; fall back to the class name.
        return False, str(e) or type(e).__name__


def load_csv_records(csv_path=None):
    """
    Load all records from the CSV index as a list of dicts.

    *csv_path* defaults to CSV_PATH (read at call time). The file is decoded
    as UTF-8 and a leading byte-order mark, if present, is dropped so it does
    not end up inside the first column name.
    """
    if csv_path is None:
        csv_path = CSV_PATH
    # newline='' lets the csv module handle embedded line breaks itself.
    with open(csv_path, "r", newline="", encoding="utf-8-sig") as csv_file:
        return list(csv.DictReader(csv_file))


def sample_records(records, sample_size, seed=SAMPLE_SEED):
    """
    Return up to *sample_size* records drawn at random without replacement.

    A private random.Random(seed) is used, so the draw is reproducible and
    the global random state is left untouched. If fewer than *sample_size*
    records exist, all of them are returned (in random order).
    """
    rng = random.Random(seed)
    return rng.sample(records, min(sample_size, len(records)))


def main():
    """Sample the CSV index, render each sampled page to PNG and log the outcome."""
    if fitz is None:
        # Fail before touching the output directory or truncating the log.
        raise SystemExit("PyMuPDF is required (pip install pymupdf)")

    separator = "=" * 60
    print(separator)
    print("YOLO 訓練數據提取工具")
    print(separator)
    print(f"CSV 索引檔: {CSV_PATH}")
    print(f"PDF 來源: {PDF_BASE_PATH}")
    print(f"輸出目錄: {OUTPUT_PATH}")
    print(f"提取數量: {SAMPLE_SIZE}")
    print(f"輸出 DPI: {DPI}")
    print(separator + "\n")

    # Ensure output directory exists
    os.makedirs(OUTPUT_PATH, exist_ok=True)

    # Load all records
    print("載入 CSV 索引檔...")
    all_records = load_csv_records()
    print(f"總共 {len(all_records)} 筆記錄\n")

    # A missing column is a configuration error: stop before any work is done.
    if all_records:
        missing = [col for col in REQUIRED_COLUMNS if col not in all_records[0]]
        if missing:
            raise KeyError(f"CSV is missing required column(s): {missing}")

    # Random sample
    print(f"隨機抽取 {SAMPLE_SIZE} 筆...")
    sampled_records = sample_records(all_records, SAMPLE_SIZE)
    total = len(sampled_records)
    print(f"已抽取 {total} 筆\n")

    # Statistics
    stats = {
        'total': total,
        'pdf_found': 0,
        'pdf_not_found': 0,
        'exported': 0,
        'errors': 0,
    }

    # List the batch directories once instead of once per record.
    batch_dirs = list_batch_dirs()

    # Process and log
    with open(LOG_FILE, 'w', newline='', encoding='utf-8') as log_file:
        log_writer = csv.writer(log_file)
        log_writer.writerow([
            'index', 'filename', 'page', 'output_file',
            'pdf_found', 'exported', 'error_message',
        ])

        for index, row in enumerate(sampled_records, start=1):
            filename = row['filename']
            raw_page = row['page']

            # A blank or non-numeric page (or a blank filename) is logged
            # and skipped rather than aborting the whole run.
            try:
                page = int(raw_page)
            except (TypeError, ValueError):
                page = None
            if not filename or page is None:
                error = f"Malformed CSV row: filename={filename!r}, page={raw_page!r}"
                print(f"[{index}/{total}] 錯誤: {error}")
                stats['errors'] += 1
                log_writer.writerow([
                    index, filename, raw_page, None, False, False, error,
                ])
                continue

            print(f"[{index}/{total}] {filename} (頁 {page})... ", end='', flush=True)

            # Find the PDF file
            pdf_path = find_pdf_file(filename, batch_dirs)
            if pdf_path is None:
                print("找不到 PDF")
                stats['pdf_not_found'] += 1
                log_writer.writerow([
                    index, filename, page, None, False, False, "PDF file not found",
                ])
                continue

            stats['pdf_found'] += 1

            # Generate output filename
            output_filename = f"{Path(filename).stem}_page{page}.png"
            output_path = os.path.join(OUTPUT_PATH, output_filename)

            # Export as PNG
            success, error = export_page_as_png(pdf_path, page, output_path)
            if success:
                print("✓")
                stats['exported'] += 1
                log_writer.writerow([
                    index, filename, page, output_filename, True, True, None,
                ])
            else:
                print(f"錯誤: {error}")
                stats['errors'] += 1
                log_writer.writerow([
                    index, filename, page, None, True, False, error,
                ])

    # Print summary
    print("\n" + separator)
    print("提取完成!")
    print(separator)
    print(f"總共處理: {stats['total']}")
    print(f"PDF 找到: {stats['pdf_found']}")
    print(f"PDF 找不到: {stats['pdf_not_found']}")
    print(f"成功輸出: {stats['exported']}")
    print(f"錯誤: {stats['errors']}")
    print(f"\n輸出目錄: {OUTPUT_PATH}")
    print(f"日誌檔案: {LOG_FILE}")
    print(separator)


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\n\n使用者中斷")
    except Exception as e:
        print(f"\n\n錯誤: {e}")
        import traceback
        traceback.print_exc()