Initial commit

This commit is contained in:
2025-11-27 16:39:26 +08:00
commit a97ca8cb7c
508 changed files with 1087 additions and 0 deletions

View File

@@ -0,0 +1,188 @@
#!/usr/bin/env python3
"""
Randomly sample 500 records from master_signatures.csv and
convert the corresponding PDF pages to PNG images for YOLO training annotation.
Adapted from the logic in extract_pages_from_csv.py.
"""
import csv
import os
import random
from pathlib import Path
from datetime import datetime
import fitz # PyMuPDF
# Configuration: CSV index, PDF source tree, and output locations
CSV_PATH = "/Volumes/NV2/PDF-Processing/master_signatures.csv"
PDF_BASE_PATH = "/Volumes/NV2/PDF-Processing/total-pdf"
OUTPUT_PATH = "/Volumes/NV2/yolo-signature-detection/images"
LOG_FILE = "/Volumes/NV2/yolo-signature-detection/extraction_log.csv"
# Number of records to extract
SAMPLE_SIZE = 500
# PNG output settings
DPI = 150 # Resolution: 150 DPI is sharp enough for annotation without making the files too large
def find_pdf_file(filename, base_path=None):
    """
    Search the batch directories for a PDF file.

    Every ``batch_*`` directory directly under the base path is checked in
    sorted name order; the first regular file named *filename* wins.

    Args:
        filename: File name to look for inside each batch directory.
        base_path: Directory that contains the ``batch_*`` directories.
            Defaults to the module-level ``PDF_BASE_PATH``.

    Returns:
        The full path as a string if found, None otherwise.
    """
    if not filename:
        # An empty name would resolve to the batch directory itself.
        return None
    if base_path is None:
        base_path = PDF_BASE_PATH
    for batch_dir in sorted(Path(base_path).glob("batch_*")):
        pdf_path = batch_dir / filename
        # is_file() rather than exists(): a directory is not a usable PDF.
        if pdf_path.is_file():
            return str(pdf_path)
    return None
def export_page_as_png(pdf_path, page_number, output_path):
    """
    Export a specific page from a PDF to a PNG image.

    The page is rendered at the module-level ``DPI`` setting.

    Args:
        pdf_path: Path of the PDF file to open.
        page_number: 1-based page number to render.
        output_path: Destination path of the PNG file.

    Returns:
        (success: bool, error: str or None). Any exception raised while
        opening, rendering or saving is caught and reported as the error
        string instead of propagating.
    """
    try:
        # The context manager closes the document on every exit path,
        # including when rendering or saving raises.
        with fitz.open(pdf_path) as doc:
            # Read the page count while the document is still open so the
            # error message below never has to query a closed document.
            page_count = len(doc)
            if page_number < 1 or page_number > page_count:
                return False, f"Invalid page number: {page_number} (PDF has {page_count} pages)"
            # load_page() is 0-indexed, page_number is 1-indexed
            page = doc.load_page(page_number - 1)
            # Scale from the 72 DPI PDF default to the configured DPI
            mat = fitz.Matrix(DPI / 72, DPI / 72)
            pix = page.get_pixmap(matrix=mat)
            pix.save(output_path)
        return True, None
    except Exception as e:
        return False, str(e)
def load_csv_records(csv_path=None):
    """
    Load all records from the CSV index file.

    Args:
        csv_path: Path of the CSV file to read. Defaults to the
            module-level ``CSV_PATH``.

    Returns:
        A list with one dict per data row, keyed by the header row.
    """
    if csv_path is None:
        csv_path = CSV_PATH
    # newline='' lets the csv module handle line endings itself (needed for
    # quoted fields that contain newlines); utf-8-sig drops a leading BOM
    # that would otherwise become part of the first column name.
    with open(csv_path, 'r', newline='', encoding='utf-8-sig') as csv_file:
        return list(csv.DictReader(csv_file))
def main():
    """
    Main processing function.

    Loads the CSV index, draws a reproducible random sample of up to
    SAMPLE_SIZE records, renders the referenced PDF page of each record to a
    PNG in OUTPUT_PATH, writes one row per record to LOG_FILE, and prints a
    summary. A record whose ``page`` value is not an integer is logged as an
    error and skipped instead of aborting the whole run.
    """
    print("=" * 60)
    print("YOLO 訓練數據提取工具")
    print("=" * 60)
    print(f"CSV 索引檔: {CSV_PATH}")
    print(f"PDF 來源: {PDF_BASE_PATH}")
    print(f"輸出目錄: {OUTPUT_PATH}")
    print(f"提取數量: {SAMPLE_SIZE}")
    print(f"輸出 DPI: {DPI}")
    print("=" * 60 + "\n")

    # Ensure output directory exists
    os.makedirs(OUTPUT_PATH, exist_ok=True)

    # Load all records
    print("載入 CSV 索引檔...")
    all_records = load_csv_records()
    print(f"總共 {len(all_records)} 筆記錄\n")

    # Random sample; a private generator with a fixed seed keeps the sample
    # reproducible without reseeding the global random module.
    print(f"隨機抽取 {SAMPLE_SIZE} 筆...")
    rng = random.Random(42)
    sampled_records = rng.sample(all_records, min(SAMPLE_SIZE, len(all_records)))
    print(f"已抽取 {len(sampled_records)}\n")

    # Statistics
    total = len(sampled_records)
    stats = {
        'total': total,
        'pdf_found': 0,
        'pdf_not_found': 0,
        'exported': 0,
        'errors': 0,
    }

    # Process and log. The log is written as UTF-8 so non-ASCII file names
    # do not depend on the platform's default encoding.
    with open(LOG_FILE, 'w', newline='', encoding='utf-8') as log_file:
        log_writer = csv.writer(log_file)
        log_writer.writerow([
            'index', 'filename', 'page', 'output_file',
            'pdf_found', 'exported', 'error_message'
        ])

        for index, row in enumerate(sampled_records, start=1):
            filename = row['filename']
            raw_page = row['page']

            # A malformed page value must not abort the remaining records.
            try:
                page = int(raw_page)
            except (TypeError, ValueError):
                error = f"Invalid page value: {raw_page!r}"
                print(f"[{index}/{total}] {filename} (頁 {raw_page})... 錯誤: {error}")
                stats['errors'] += 1
                # pdf_found is False here because no lookup was attempted.
                log_writer.writerow([
                    index, filename, raw_page, None,
                    False, False, error
                ])
                continue

            print(f"[{index}/{total}] {filename} (頁 {page})... ", end='', flush=True)

            # Find the PDF file
            pdf_path = find_pdf_file(filename)
            if pdf_path is None:
                print("找不到 PDF")
                stats['pdf_not_found'] += 1
                log_writer.writerow([
                    index, filename, page, None,
                    False, False, "PDF file not found"
                ])
                continue
            stats['pdf_found'] += 1

            # Generate output filename
            output_filename = f"{Path(filename).stem}_page{page}.png"
            output_path = os.path.join(OUTPUT_PATH, output_filename)

            # Export as PNG
            success, error = export_page_as_png(pdf_path, page, output_path)
            if success:
                print("")
                stats['exported'] += 1
                log_writer.writerow([
                    index, filename, page, output_filename,
                    True, True, None
                ])
            else:
                print(f"錯誤: {error}")
                stats['errors'] += 1
                log_writer.writerow([
                    index, filename, page, None,
                    True, False, error
                ])

    # Print summary
    print("\n" + "=" * 60)
    print("提取完成!")
    print("=" * 60)
    print(f"總共處理: {stats['total']}")
    print(f"PDF 找到: {stats['pdf_found']}")
    print(f"PDF 找不到: {stats['pdf_not_found']}")
    print(f"成功輸出: {stats['exported']}")
    print(f"錯誤: {stats['errors']}")
    print(f"\n輸出目錄: {OUTPUT_PATH}")
    print(f"日誌檔案: {LOG_FILE}")
    print("=" * 60)
# Script entry point: run main() and report failures through the exit status
# so shells and schedulers can detect an unsuccessful run.
if __name__ == "__main__":
    import sys
    import traceback

    try:
        main()
    except KeyboardInterrupt:
        print("\n\n使用者中斷")
        # 130 = 128 + SIGINT, the conventional status for a Ctrl-C abort
        sys.exit(130)
    except Exception as e:
        print(f"\n\n錯誤: {e}")
        traceback.print_exc()
        sys.exit(1)