#!/usr/bin/env python3
"""
Step 1: Build the SQLite database and import signature records.

Imports rows from extraction_results.csv, expanding every image into its own
record, parses each image filename to fill the filename-derived columns
(year_month, serial_number, doc_type, page_number, sig_index), and reads each
image's width and height.
"""

import sqlite3
import pandas as pd
import cv2
import os
import re
from contextlib import closing
from pathlib import Path
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Path configuration
IMAGES_DIR = Path("/Volumes/NV2/PDF-Processing/yolo-signatures/images")
CSV_PATH = Path("/Volumes/NV2/PDF-Processing/yolo-signatures/extraction_results.csv")
OUTPUT_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis")
DB_PATH = OUTPUT_DIR / "signature_analysis.db"

# Filename pattern: {YYYYMM}_{SERIAL}_{DOCTYPE}_page{PAGE}_sig{N}
# Compiled once because it is applied to every image record.
_IMAGE_NAME_RE = re.compile(r'^(\d{6})_([^_]+)_([^_]+)_page(\d+)_sig(\d+)$')
_IMAGE_SUFFIX = '.png'


def parse_image_filename(filename: str) -> dict:
    """
    Parse an image filename into its structured components.

    Example: 201301_2458_AI1_page4_sig1.png

    Args:
        filename: Image file name, with or without the ``.png`` extension.

    Returns:
        A dict with the keys ``year_month``, ``serial_number``, ``doc_type``,
        ``page_number`` and ``sig_index``. ``page_number`` and ``sig_index``
        are ints, the others are strings. When the name does not match the
        expected pattern every value is ``None``.
    """
    # Strip only a trailing ".png"; str.replace() would also remove the
    # substring from the middle of the name and silently alter other fields.
    name = filename
    if name.endswith(_IMAGE_SUFFIX):
        name = name[:-len(_IMAGE_SUFFIX)]

    match = _IMAGE_NAME_RE.match(name)
    if match is None:
        # Unparseable name: keep the same keys so callers can index blindly.
        return {
            'year_month': None,
            'serial_number': None,
            'doc_type': None,
            'page_number': None,
            'sig_index': None,
        }

    year_month, serial, doc_type, page, sig_index = match.groups()
    return {
        'year_month': year_month,
        'serial_number': serial,
        'doc_type': doc_type,
        'page_number': int(page),
        'sig_index': int(sig_index),
    }


def get_image_dimensions(image_path: Path) -> tuple:
    """
    Read the pixel dimensions of an image.

    Args:
        image_path: Path of the image file to open.

    Returns:
        ``(width, height)``, or ``(None, None)`` when the image cannot be
        read.
    """
    try:
        img = cv2.imread(str(image_path))
    except Exception:
        # Deliberate best effort: an unreadable image must not abort the
        # whole import, it is simply stored without dimensions.
        return None, None

    if img is None:
        return None, None

    # shape is (rows, cols[, channels]) -> (height, width)
    h, w = img.shape[:2]
    return w, h


def process_single_image(args: tuple) -> dict:
    """
    Build the database record for a single image.

    Args:
        args: ``(image_filename, source_pdf, confidence_avg)`` as produced by
            ``expand_csv_to_records``.

    Returns:
        A dict holding the values for the filename-derived columns, the
        detection confidence and the image dimensions.
    """
    image_filename, source_pdf, confidence_avg = args

    # Fields derived from the filename
    parsed = parse_image_filename(image_filename)

    # Image dimensions read from disk
    width, height = get_image_dimensions(IMAGES_DIR / image_filename)

    return {
        'image_filename': image_filename,
        'source_pdf': source_pdf,
        'year_month': parsed['year_month'],
        'serial_number': parsed['serial_number'],
        'doc_type': parsed['doc_type'],
        'page_number': parsed['page_number'],
        'sig_index': parsed['sig_index'],
        'detection_confidence': confidence_avg,
        'image_width': width,
        'image_height': height,
    }


def create_database():
    """
    Create the database schema.

    Creates OUTPUT_DIR if needed, then the ``signatures`` table and its
    indexes. Every statement uses ``IF NOT EXISTS``, so calling this on an
    existing database leaves existing data untouched.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # closing() guarantees the connection is released even if a statement
    # raises part-way through.
    with closing(sqlite3.connect(DB_PATH)) as conn:
        cursor = conn.cursor()

        # signatures table
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS signatures (
                signature_id INTEGER PRIMARY KEY AUTOINCREMENT,
                image_filename TEXT UNIQUE NOT NULL,
                source_pdf TEXT NOT NULL,
                year_month TEXT,
                serial_number TEXT,
                doc_type TEXT,
                page_number INTEGER,
                sig_index INTEGER,
                detection_confidence REAL,
                image_width INTEGER,
                image_height INTEGER,
                accountant_name TEXT,
                accountant_id INTEGER,
                feature_vector BLOB,
                cluster_id INTEGER,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')

        # Indexes
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_source_pdf ON signatures(source_pdf)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_year_month ON signatures(year_month)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_accountant_id ON signatures(accountant_id)')

        conn.commit()

    print(f"資料庫已建立: {DB_PATH}")


def expand_csv_to_records(csv_path: Path) -> list:
    """
    Expand the CSV into one record per image.

    CSV columns: filename,page,num_signatures,confidence_avg,image_files
    The ``image_files`` column is a comma-separated list that is expanded
    into one record per file.

    Args:
        csv_path: Path of the extraction results CSV.

    Returns:
        A list of ``(image_filename, source_pdf, confidence_avg)`` tuples in
        CSV order. Rows with a missing ``image_files`` value contribute
        nothing.
    """
    df = pd.read_csv(csv_path)

    records = []
    # Iterate only the three columns that are used; this avoids building a
    # Series object per row as iterrows() does.
    rows = zip(df['filename'], df['confidence_avg'], df['image_files'])
    for source_pdf, confidence_avg, image_files_str in rows:
        if pd.isna(image_files_str):
            continue

        for fragment in image_files_str.split(','):
            img_file = fragment.strip()
            # A trailing or doubled comma yields an empty fragment; an empty
            # filename would resolve to the images directory itself.
            if img_file:
                records.append((img_file, source_pdf, confidence_avg))

    return records


def import_data():
    """
    Import the CSV data into the database.

    Expands the CSV into per-image records, reads image dimensions with a
    thread pool, inserts the rows into ``signatures`` and prints summary
    statistics.
    """
    print("讀取 CSV 並展開記錄...")
    records = expand_csv_to_records(CSV_PATH)
    print(f"共 {len(records)} 張簽名圖片待處理")

    print("處理圖片資訊(讀取尺寸)...")

    # Threads speed up reading image dimensions. executor.map() yields
    # results in input order, so the insertion order (and therefore the
    # auto-assigned signature_id) is the same on every run.
    with ThreadPoolExecutor(max_workers=8) as executor:
        processed_records = list(
            tqdm(
                executor.map(process_single_image, records),
                total=len(records),
                desc="處理圖片",
            )
        )

    print("寫入資料庫...")

    # Batch insert; OR IGNORE skips rows that violate a constraint, e.g. an
    # image_filename that is already present (UNIQUE).
    insert_sql = '''
        INSERT OR IGNORE INTO signatures (
            image_filename, source_pdf, year_month, serial_number, doc_type,
            page_number, sig_index, detection_confidence, image_width, image_height
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    '''

    batch_data = [
        (
            r['image_filename'], r['source_pdf'], r['year_month'],
            r['serial_number'], r['doc_type'], r['page_number'],
            r['sig_index'], r['detection_confidence'],
            r['image_width'], r['image_height'],
        )
        for r in processed_records
    ]

    # closing() releases the connection even if the insert or a query fails.
    with closing(sqlite3.connect(DB_PATH)) as conn:
        cursor = conn.cursor()

        cursor.executemany(insert_sql, batch_data)
        conn.commit()

        # Summary statistics
        cursor.execute('SELECT COUNT(*) FROM signatures')
        total = cursor.fetchone()[0]

        cursor.execute('SELECT COUNT(DISTINCT source_pdf) FROM signatures')
        pdf_count = cursor.fetchone()[0]

        cursor.execute('SELECT COUNT(DISTINCT year_month) FROM signatures')
        period_count = cursor.fetchone()[0]

        cursor.execute('SELECT MIN(year_month), MAX(year_month) FROM signatures')
        min_date, max_date = cursor.fetchone()

    print("\n" + "=" * 50)
    print("資料庫建立完成")
    print("=" * 50)
    print(f"簽名總數: {total:,}")
    print(f"PDF 檔案數: {pdf_count:,}")
    print(f"時間範圍: {min_date} ~ {max_date} ({period_count} 個月)")
    print(f"資料庫位置: {DB_PATH}")


def main():
    """Check that the input files exist, then create and fill the database."""
    print("=" * 50)
    print("Step 1: 建立簽名分析資料庫")
    print("=" * 50)

    # Verify the source files before touching the database
    if not CSV_PATH.exists():
        print(f"錯誤: 找不到 CSV 檔案 {CSV_PATH}")
        return

    if not IMAGES_DIR.exists():
        print(f"錯誤: 找不到圖片目錄 {IMAGES_DIR}")
        return

    # Create the schema
    create_database()

    # Import the data
    import_data()


if __name__ == "__main__":
    main()