Files
yolo-label/scripts/extract_training_images.py
2025-11-27 16:39:26 +08:00

189 lines
5.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
從 master_signatures.csv 隨機抽取 500 筆記錄,
將對應的 PDF 頁面轉換為 PNG 圖片,供 YOLO 訓練標注使用。
基於 extract_pages_from_csv.py 的邏輯修改。
"""
import csv
import os
import random
from pathlib import Path
from datetime import datetime
import fitz # PyMuPDF
# Configuration
CSV_PATH = "/Volumes/NV2/PDF-Processing/master_signatures.csv"
PDF_BASE_PATH = "/Volumes/NV2/PDF-Processing/total-pdf"
OUTPUT_PATH = "/Volumes/NV2/yolo-signature-detection/images"
LOG_FILE = "/Volumes/NV2/yolo-signature-detection/extraction_log.csv"
# Number of records to extract
SAMPLE_SIZE = 500
# PNG output settings
DPI = 150 # Resolution: 150 DPI is sharp enough for labeling without making the files too large
def find_pdf_file(filename, base_path=None):
    """
    Search the batch directories for a PDF file.

    Every ``batch_*`` entry directly under the base directory is checked in
    sorted (lexicographic) order and the first match wins.

    Args:
        filename: File name of the PDF, relative to a batch directory.
        base_path: Directory that contains the ``batch_*`` folders. Defaults
            to the module-level ``PDF_BASE_PATH`` when omitted.

    Returns:
        The full path of the first match as a string, or None when no batch
        directory contains a regular file with that name.
    """
    root = Path(PDF_BASE_PATH if base_path is None else base_path)
    for batch_dir in sorted(root.glob("batch_*")):
        candidate = batch_dir / filename
        # is_file() rather than exists(): a directory with a matching name
        # (or an empty filename, which resolves to the batch directory
        # itself) must not be reported as a PDF.
        if candidate.is_file():
            return str(candidate)
    return None
def export_page_as_png(pdf_path, page_number, output_path):
    """
    Render a single page of a PDF to a PNG image.

    Args:
        pdf_path: Path of the PDF document to open.
        page_number: 1-based number of the page to export.
        output_path: Destination path of the PNG file.

    Returns:
        A ``(success, error)`` tuple: ``(True, None)`` when the image was
        written, otherwise ``(False, message)`` describing the failure.
        Exceptions raised while opening, rendering or saving are caught and
        reported through the message instead of being propagated.
    """
    try:
        # The context manager closes the document on every exit path,
        # including when rendering or saving raises.
        with fitz.open(pdf_path) as doc:
            # Read the page count while the document is still open; it is
            # needed both for validation and for the error message.
            page_count = len(doc)
            if page_number < 1 or page_number > page_count:
                return False, f"Invalid page number: {page_number} (PDF has {page_count} pages)"
            # load_page() is 0-indexed, page_number is 1-based.
            page = doc.load_page(page_number - 1)
            # Scale from the default PDF DPI (72) to the configured DPI.
            zoom = DPI / 72
            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
            pix.save(output_path)
        return True, None
    except Exception as e:
        return False, str(e)
def load_csv_records(csv_path=None):
    """
    Load every data row of the CSV index file.

    Args:
        csv_path: Path of the CSV file to read. Defaults to the module-level
            ``CSV_PATH`` when omitted.

    Returns:
        A list with one dict per data row, keyed by the header row.
    """
    source = CSV_PATH if csv_path is None else csv_path
    # newline='' leaves line-ending handling to the csv module so that line
    # breaks inside quoted fields are preserved. utf-8-sig reads plain UTF-8
    # unchanged and strips a leading BOM, which would otherwise become part
    # of the first column name.
    with open(source, 'r', newline='', encoding='utf-8-sig') as csv_file:
        return list(csv.DictReader(csv_file))
def main():
    """
    Sample records from the CSV index and export each page as a PNG.

    Steps:
      1. Print the run configuration and create the output directory.
      2. Load all CSV records and draw a reproducible random sample of at
         most ``SAMPLE_SIZE`` records (fixed seed).
      3. For each sampled record, locate the PDF, render the page to a PNG
         in ``OUTPUT_PATH`` and append one row to the CSV log ``LOG_FILE``.
      4. Print summary statistics.

    A failure on one record (PDF not found, unparsable page value, render
    error) is logged and counted; it does not stop the remaining records.
    """
    banner = "=" * 60
    print(banner)
    print("YOLO 訓練數據提取工具")
    print(banner)
    print(f"CSV 索引檔: {CSV_PATH}")
    print(f"PDF 來源: {PDF_BASE_PATH}")
    print(f"輸出目錄: {OUTPUT_PATH}")
    print(f"提取數量: {SAMPLE_SIZE}")
    print(f"輸出 DPI: {DPI}")
    print(banner + "\n")

    # Ensure output directory exists
    os.makedirs(OUTPUT_PATH, exist_ok=True)

    # Load all records
    print("載入 CSV 索引檔...")
    all_records = load_csv_records()
    print(f"總共 {len(all_records)} 筆記錄\n")

    # Random sample; the seed is fixed so a rerun selects the same records.
    print(f"隨機抽取 {SAMPLE_SIZE} 筆...")
    random.seed(42)
    sampled_records = random.sample(all_records, min(SAMPLE_SIZE, len(all_records)))
    print(f"已抽取 {len(sampled_records)}\n")

    # Statistics
    stats = {
        'total': len(sampled_records),
        'pdf_found': 0,
        'pdf_not_found': 0,
        'exported': 0,
        'errors': 0,
    }

    # Process and log. The encoding is explicit so that non-ASCII file names
    # are written the same way regardless of the locale.
    with open(LOG_FILE, 'w', newline='', encoding='utf-8') as log_file:
        log_writer = csv.writer(log_file)
        log_writer.writerow([
            'index', 'filename', 'page', 'output_file',
            'pdf_found', 'exported', 'error_message'
        ])

        for index, row in enumerate(sampled_records, start=1):
            filename = row['filename']
            raw_page = row['page']
            # A malformed page value must only fail this record, not abort
            # the whole batch and leave the log half-written.
            try:
                page = int(raw_page)
            except (TypeError, ValueError):
                page = None

            shown_page = raw_page if page is None else page
            print(f"[{index}/{stats['total']}] {filename} (頁 {shown_page})... ", end='', flush=True)

            if page is None:
                error = f"Invalid page value: {raw_page!r}"
                print(f"錯誤: {error}")
                stats['errors'] += 1
                log_writer.writerow([
                    index, filename, raw_page, None,
                    False, False, error
                ])
                continue

            # Find the PDF file
            pdf_path = find_pdf_file(filename)
            if pdf_path is None:
                print("找不到 PDF")
                stats['pdf_not_found'] += 1
                log_writer.writerow([
                    index, filename, page, None,
                    False, False, "PDF file not found"
                ])
                continue
            stats['pdf_found'] += 1

            # Generate output filename
            output_filename = f"{Path(filename).stem}_page{page}.png"
            output_path = os.path.join(OUTPUT_PATH, output_filename)

            # Export as PNG
            success, error = export_page_as_png(pdf_path, page, output_path)
            if success:
                print("")
                stats['exported'] += 1
                log_writer.writerow([
                    index, filename, page, output_filename,
                    True, True, None
                ])
            else:
                print(f"錯誤: {error}")
                stats['errors'] += 1
                log_writer.writerow([
                    index, filename, page, None,
                    True, False, error
                ])

    # Print summary
    print("\n" + banner)
    print("提取完成!")
    print(banner)
    print(f"總共處理: {stats['total']}")
    print(f"PDF 找到: {stats['pdf_found']}")
    print(f"PDF 找不到: {stats['pdf_not_found']}")
    print(f"成功輸出: {stats['exported']}")
    print(f"錯誤: {stats['errors']}")
    print(f"\n輸出目錄: {OUTPUT_PATH}")
    print(f"日誌檔案: {LOG_FILE}")
    print(banner)
if __name__ == "__main__":
    # Script entry point: run main() and report failures through the exit
    # status so that shells and schedulers can detect an unsuccessful run.
    try:
        main()
    except KeyboardInterrupt:
        print("\n\n使用者中斷")
        # 130 = 128 + SIGINT, the conventional status after Ctrl-C.
        raise SystemExit(130)
    except Exception as e:
        print(f"\n\n錯誤: {e}")
        import traceback
        traceback.print_exc()
        raise SystemExit(1)