Initial commit

2025-11-27 16:39:26 +08:00
commit a97ca8cb7c
508 changed files with 1087 additions and 0 deletions
--- a/scripts/extract_training_images.py
+++ b/scripts/extract_training_images.py
@@ -0,0 +1,188 @@
+#!/usr/bin/env python3
+"""
+Randomly sample 500 records from master_signatures.csv and convert the
+corresponding PDF pages to PNG images for YOLO training annotation.
+
+Adapted from the logic in extract_pages_from_csv.py.
+"""
+
+import csv
+import os
+import random
+from pathlib import Path
+from datetime import datetime
+import fitz  # PyMuPDF
+
+# Configuration
+CSV_PATH = "/Volumes/NV2/PDF-Processing/master_signatures.csv"
+PDF_BASE_PATH = "/Volumes/NV2/PDF-Processing/total-pdf"
+OUTPUT_PATH = "/Volumes/NV2/yolo-signature-detection/images"
+LOG_FILE = "/Volumes/NV2/yolo-signature-detection/extraction_log.csv"
+
+# Number of records to sample
+SAMPLE_SIZE = 500
+
+# PNG output settings
+DPI = 150  # Resolution; 150 DPI is sharp enough for annotation without making the files too large
+
+
+def find_pdf_file(filename):
+    """
+    Search for PDF file in batch directories.
+    Returns the full path if found, None otherwise.
+    """
+    for batch_dir in sorted(Path(PDF_BASE_PATH).glob("batch_*")):
+        pdf_path = batch_dir / filename
+        if pdf_path.exists():
+            return str(pdf_path)
+    return None
+
+
+def export_page_as_png(pdf_path, page_number, output_path):
+    """
+    Export a specific page from PDF to PNG image.
+    Returns (success: bool, error: str)
+    """
+    try:
+        doc = fitz.open(pdf_path)
+
+        # Check if page number is valid (convert to 0-indexed)
+        if page_number < 1 or page_number > len(doc):
+            doc.close()
+            return False, f"Invalid page number: {page_number} (PDF has {len(doc)} pages)"
+
+        # Get the page
+        page = doc.load_page(page_number - 1)
+
+        # Render to image with specified DPI
+        mat = fitz.Matrix(DPI / 72, DPI / 72)  # 72 is the default PDF DPI
+        pix = page.get_pixmap(matrix=mat)
+
+        # Save as PNG
+        pix.save(output_path)
+
+        doc.close()
+        return True, None
+
+    except Exception as e:
+        return False, str(e)
+
+
+def load_csv_records():
+    """Load all records from CSV file."""
+    records = []
+    with open(CSV_PATH, 'r') as csv_file:
+        csv_reader = csv.DictReader(csv_file)
+        for row in csv_reader:
+            records.append(row)
+    return records
+
+
+def main():
+    """Main processing function"""
+    print(f"=" * 60)
+    print(f"YOLO 訓練數據提取工具")
+    print(f"=" * 60)
+    print(f"CSV 索引檔: {CSV_PATH}")
+    print(f"PDF 來源: {PDF_BASE_PATH}")
+    print(f"輸出目錄: {OUTPUT_PATH}")
+    print(f"提取數量: {SAMPLE_SIZE}")
+    print(f"輸出 DPI: {DPI}")
+    print(f"=" * 60 + "\n")
+
+    # Ensure output directory exists
+    os.makedirs(OUTPUT_PATH, exist_ok=True)
+
+    # Load all records
+    print("載入 CSV 索引檔...")
+    all_records = load_csv_records()
+    print(f"總共 {len(all_records)} 筆記錄\n")
+
+    # Random sample
+    print(f"隨機抽取 {SAMPLE_SIZE} 筆...")
+    random.seed(42)  # 固定種子以便重現
+    sampled_records = random.sample(all_records, min(SAMPLE_SIZE, len(all_records)))
+    print(f"已抽取 {len(sampled_records)} 筆\n")
+
+    # Statistics
+    stats = {
+        'total': len(sampled_records),
+        'pdf_found': 0,
+        'pdf_not_found': 0,
+        'exported': 0,
+        'errors': 0
+    }
+
+    # Process and log
+    with open(LOG_FILE, 'w', newline='') as log_file:
+        log_writer = csv.writer(log_file)
+        log_writer.writerow([
+            'index', 'filename', 'page', 'output_file',
+            'pdf_found', 'exported', 'error_message'
+        ])
+
+        for i, row in enumerate(sampled_records):
+            filename = row['filename']
+            page = int(row['page'])
+
+            print(f"[{i+1}/{len(sampled_records)}] {filename} (頁 {page})... ", end='', flush=True)
+
+            # Find the PDF file
+            pdf_path = find_pdf_file(filename)
+
+            if pdf_path is None:
+                print("找不到 PDF")
+                stats['pdf_not_found'] += 1
+                log_writer.writerow([
+                    i+1, filename, page, None,
+                    False, False, "PDF file not found"
+                ])
+                continue
+
+            stats['pdf_found'] += 1
+
+            # Generate output filename
+            output_filename = f"{Path(filename).stem}_page{page}.png"
+            output_path = os.path.join(OUTPUT_PATH, output_filename)
+
+            # Export as PNG
+            success, error = export_page_as_png(pdf_path, page, output_path)
+
+            if success:
+                print("✓")
+                stats['exported'] += 1
+                log_writer.writerow([
+                    i+1, filename, page, output_filename,
+                    True, True, None
+                ])
+            else:
+                print(f"錯誤: {error}")
+                stats['errors'] += 1
+                log_writer.writerow([
+                    i+1, filename, page, None,
+                    True, False, error
+                ])
+
+    # Print summary
+    print("\n" + "=" * 60)
+    print("提取完成!")
+    print("=" * 60)
+    print(f"總共處理:     {stats['total']}")
+    print(f"PDF 找到:     {stats['pdf_found']}")
+    print(f"PDF 找不到:   {stats['pdf_not_found']}")
+    print(f"成功輸出:     {stats['exported']}")
+    print(f"錯誤:         {stats['errors']}")
+    print(f"\n輸出目錄: {OUTPUT_PATH}")
+    print(f"日誌檔案: {LOG_FILE}")
+    print("=" * 60)
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except KeyboardInterrupt:
+        print("\n\n使用者中斷")
+    except Exception as e:
+        print(f"\n\n錯誤: {e}")
+        import traceback
+        traceback.print_exc()