#!/usr/bin/env python3
"""
Randomly sample 500 records from master_signatures.csv and convert the
corresponding PDF pages to PNG images for YOLO training annotation.

Adapted from the logic in extract_pages_from_csv.py.
"""

import csv
import os
import random
import sys
import traceback
from datetime import datetime
from pathlib import Path

import fitz  # PyMuPDF

# Configuration: input index, PDF source tree, and output locations.
CSV_PATH = "/Volumes/NV2/PDF-Processing/master_signatures.csv"
PDF_BASE_PATH = "/Volumes/NV2/PDF-Processing/total-pdf"
OUTPUT_PATH = "/Volumes/NV2/yolo-signature-detection/images"
LOG_FILE = "/Volumes/NV2/yolo-signature-detection/extraction_log.csv"

# Number of records to sample from the CSV index.
SAMPLE_SIZE = 500

# PNG output settings.
# Resolution: 150 DPI is sharp enough for annotation while keeping files small.
DPI = 150


def find_pdf_file(filename, base_path=None):
    """Locate a PDF by name inside the ``batch_*`` sub-directories.

    Args:
        filename: Name of the PDF, relative to a batch directory.
        base_path: Directory that contains the ``batch_*`` folders. When
            omitted, the module-level ``PDF_BASE_PATH`` is used.

    Returns:
        The full path (as a string) of the first match, scanning batch
        directories in sorted order, or ``None`` when no batch directory
        holds a regular file with that name.
    """
    if not filename:
        # An empty name would resolve to the batch directory itself.
        return None

    root = Path(PDF_BASE_PATH if base_path is None else base_path)
    for batch_dir in sorted(root.glob("batch_*")):
        candidate = batch_dir / filename
        # is_file() rather than exists(): a directory is never a valid PDF.
        if candidate.is_file():
            return str(candidate)
    return None


def export_page_as_png(pdf_path, page_number, output_path):
    """Render one page of a PDF to a PNG file.

    Args:
        pdf_path: Path of the PDF to open.
        page_number: 1-based page number to export.
        output_path: Destination path of the PNG image.

    Returns:
        A ``(success, error)`` tuple: ``(True, None)`` when the image was
        written, otherwise ``(False, message)``. Exceptions raised while
        opening, rendering, or saving are reported through the tuple rather
        than propagated.
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            # Read the page count while the document is still open; the
            # error message below needs it.
            page_count = len(doc)
            if page_number < 1 or page_number > page_count:
                return False, f"Invalid page number: {page_number} (PDF has {page_count} pages)"

            # load_page() is 0-indexed, page_number is 1-indexed.
            page = doc.load_page(page_number - 1)

            # Scale from the 72 DPI PDF default up to the configured DPI.
            zoom = DPI / 72
            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
            pix.save(output_path)
        finally:
            # Always release the document, including on render/save failure.
            doc.close()
        return True, None
    except Exception as e:
        return False, str(e)


def load_csv_records(csv_path=None, encoding="utf-8-sig"):
    """Load every row of the CSV index as a list of dicts.

    Args:
        csv_path: CSV file to read. When omitted, the module-level
            ``CSV_PATH`` is used.
        encoding: Text encoding of the file. The default ``utf-8-sig`` reads
            plain UTF-8 and also strips a leading byte-order mark, which
            would otherwise become part of the first column name.

    Returns:
        A list with one dict per data row, keyed by the header row.
    """
    source = CSV_PATH if csv_path is None else csv_path
    # newline='' lets the csv module do its own newline handling, so line
    # breaks inside quoted fields are preserved unchanged.
    with open(source, "r", newline="", encoding=encoding) as csv_file:
        return list(csv.DictReader(csv_file))


def main():
    """Sample records from the CSV index and export each page as a PNG.

    Steps:
      1. Print the configuration and create the output directories.
      2. Load all CSV records and draw a reproducible random sample of at
         most ``SAMPLE_SIZE`` rows (fixed seed 42).
      3. For each sampled row, locate the PDF, render the requested page to
         ``<pdf stem>_page<N>.png`` in ``OUTPUT_PATH``, and append one line
         to the CSV log at ``LOG_FILE``.
      4. Print a summary of the counters.

    A row with a missing filename or a non-integer page is logged, counted
    under ``errors``, and skipped instead of aborting the whole run.
    """
    separator = "=" * 60
    print(separator)
    print("YOLO 訓練數據提取工具")
    print(separator)
    print(f"CSV 索引檔: {CSV_PATH}")
    print(f"PDF 來源: {PDF_BASE_PATH}")
    print(f"輸出目錄: {OUTPUT_PATH}")
    print(f"提取數量: {SAMPLE_SIZE}")
    print(f"輸出 DPI: {DPI}")
    print(separator + "\n")

    # Ensure the output directory (and the log file's directory) exist.
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    log_dir = os.path.dirname(LOG_FILE)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    # Load all records.
    print("載入 CSV 索引檔...")
    all_records = load_csv_records()
    print(f"總共 {len(all_records)} 筆記錄\n")

    # Random sample. A private generator with a fixed seed keeps the sample
    # reproducible without reseeding the process-wide random state.
    print(f"隨機抽取 {SAMPLE_SIZE} 筆...")
    rng = random.Random(42)
    sampled_records = rng.sample(all_records, min(SAMPLE_SIZE, len(all_records)))
    total = len(sampled_records)
    print(f"已抽取 {total} 筆\n")

    # Statistics.
    stats = {
        'total': total,
        'pdf_found': 0,
        'pdf_not_found': 0,
        'exported': 0,
        'errors': 0,
    }

    # Process and log.
    with open(LOG_FILE, 'w', newline='', encoding='utf-8') as log_file:
        log_writer = csv.writer(log_file)
        log_writer.writerow([
            'index', 'filename', 'page', 'output_file',
            'pdf_found', 'exported', 'error_message'
        ])

        for index, row in enumerate(sampled_records, start=1):
            # Validate the row first so one bad record cannot stop the batch.
            try:
                filename = row['filename']
                if not filename:
                    raise ValueError("empty filename")
                page = int(row['page'])
            except (KeyError, TypeError, ValueError) as exc:
                error = f"Invalid record: {exc!r}"
                print(f"[{index}/{total}] 錯誤: {error}")
                stats['errors'] += 1
                log_writer.writerow([
                    index, row.get('filename'), row.get('page'), None,
                    False, False, error
                ])
                continue

            print(f"[{index}/{total}] {filename} (頁 {page})... ", end='', flush=True)

            # Find the PDF file.
            pdf_path = find_pdf_file(filename)
            if pdf_path is None:
                print("找不到 PDF")
                stats['pdf_not_found'] += 1
                log_writer.writerow([
                    index, filename, page, None,
                    False, False, "PDF file not found"
                ])
                continue

            stats['pdf_found'] += 1

            # Generate the output filename.
            output_filename = f"{Path(filename).stem}_page{page}.png"
            output_path = os.path.join(OUTPUT_PATH, output_filename)

            # Export as PNG.
            success, error = export_page_as_png(pdf_path, page, output_path)
            if success:
                print("✓")
                stats['exported'] += 1
                log_writer.writerow([
                    index, filename, page, output_filename,
                    True, True, None
                ])
            else:
                print(f"錯誤: {error}")
                stats['errors'] += 1
                log_writer.writerow([
                    index, filename, page, None,
                    True, False, error
                ])

    # Print summary.
    print("\n" + separator)
    print("提取完成!")
    print(separator)
    print(f"總共處理: {stats['total']}")
    print(f"PDF 找到: {stats['pdf_found']}")
    print(f"PDF 找不到: {stats['pdf_not_found']}")
    print(f"成功輸出: {stats['exported']}")
    print(f"錯誤: {stats['errors']}")
    print(f"\n輸出目錄: {OUTPUT_PATH}")
    print(f"日誌檔案: {LOG_FILE}")
    print(separator)


# Script entry point: run the extraction and report failures through the
# process exit status so shells and schedulers can detect them.
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\n\n使用者中斷")
        # 128 + SIGINT(2): conventional exit status for Ctrl-C.
        sys.exit(130)
    except Exception as e:
        print(f"\n\n錯誤: {e}")
        traceback.print_exc()
        sys.exit(1)