Add Paper A (IEEE TAI) complete draft with Firm A-calibrated dual-method classification
Paper draft includes all sections (Abstract through Conclusion), 36 references, and supporting scripts. Key methodology: Cosine similarity + dHash dual-method verification with thresholds calibrated against known-replication firm (Firm A). Includes: - 8 section markdown files (paper_a_*.md) - Ablation study script (ResNet-50 vs VGG-16 vs EfficientNet-B0) - Recalibrated classification script (84,386 PDFs, 5-tier system) - Figure generation and Word export scripts - Citation renumbering script ([1]-[36]) - Signature analysis pipeline (12 steps) - YOLO extraction scripts Three rounds of AI review completed (GPT-5.4, Claude Opus 4.6, Gemini 3 Pro). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,246 @@
|
||||
#!/usr/bin/env python3
"""
Step 1: Build the SQLite database and import signature records.

Imports data from extraction_results.csv, expanding each listed image
into its own record.
Parses each image filename to populate year_month and sig_index.
Reads each image to record its width and height.
"""

import sqlite3
import pandas as pd
import cv2
import os
import re
from pathlib import Path
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Path configuration
IMAGES_DIR = Path("/Volumes/NV2/PDF-Processing/yolo-signatures/images")
CSV_PATH = Path("/Volumes/NV2/PDF-Processing/yolo-signatures/extraction_results.csv")
OUTPUT_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis")
DB_PATH = OUTPUT_DIR / "signature_analysis.db"
def parse_image_filename(filename: str) -> dict:
    """
    Parse a signature-image filename into structured fields.

    Expected pattern: {YYYYMM}_{SERIAL}_{DOCTYPE}_page{PAGE}_sig{N}.png
    Example: 201301_2458_AI1_page4_sig1.png

    Returns a dict with keys year_month, serial_number, doc_type,
    page_number, sig_index. All values are None when the name does not
    match the expected pattern.
    """
    # Strip only a *trailing* '.png' extension. The previous
    # str.replace('.png', '') would also delete '.png' occurring
    # mid-name and silently corrupt the serial segment.
    name = filename[:-4] if filename.endswith('.png') else filename

    # Pattern: {YYYYMM}_{SERIAL}_{DOCTYPE}_page{PAGE}_sig{N}
    match = re.match(r'^(\d{6})_([^_]+)_([^_]+)_page(\d+)_sig(\d+)$', name)

    if match:
        year_month, serial, doc_type, page, sig_index = match.groups()
        return {
            'year_month': year_month,
            'serial_number': serial,
            'doc_type': doc_type,
            'page_number': int(page),
            'sig_index': int(sig_index)
        }

    # Unparseable filename: keep the schema but null every field.
    return {
        'year_month': None,
        'serial_number': None,
        'doc_type': None,
        'page_number': None,
        'sig_index': None
    }
def get_image_dimensions(image_path: Path) -> tuple:
    """Return (width, height) of the image, or (None, None) on failure."""
    try:
        loaded = cv2.imread(str(image_path))
        if loaded is None:
            # cv2.imread reports a missing/unreadable file by returning None.
            return None, None
        height, width = loaded.shape[:2]
        return width, height
    except Exception:
        return None, None
def process_single_image(args: tuple) -> dict:
    """
    Build one database record for a single signature image.

    *args* is a (image_filename, source_pdf, confidence_avg) triple.
    """
    image_filename, source_pdf, confidence_avg = args

    # Structured fields come from the filename itself.
    parsed = parse_image_filename(image_filename)

    # Pixel dimensions require opening the file on disk.
    width, height = get_image_dimensions(IMAGES_DIR / image_filename)

    return {
        'image_filename': image_filename,
        'source_pdf': source_pdf,
        **{key: parsed[key] for key in (
            'year_month', 'serial_number', 'doc_type',
            'page_number', 'sig_index')},
        'detection_confidence': confidence_avg,
        'image_width': width,
        'image_height': height
    }
def create_database():
    """Create the signature-analysis schema (table + indexes); idempotent."""
    # Make sure the output directory exists before connecting.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()

    # One row per extracted signature image; the filename is the natural key.
    cur.execute('''
        CREATE TABLE IF NOT EXISTS signatures (
            signature_id INTEGER PRIMARY KEY AUTOINCREMENT,
            image_filename TEXT UNIQUE NOT NULL,
            source_pdf TEXT NOT NULL,
            year_month TEXT,
            serial_number TEXT,
            doc_type TEXT,
            page_number INTEGER,
            sig_index INTEGER,
            detection_confidence REAL,
            image_width INTEGER,
            image_height INTEGER,
            accountant_name TEXT,
            accountant_id INTEGER,
            feature_vector BLOB,
            cluster_id INTEGER,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    ''')

    # Indexes for the common lookup columns.
    for column in ('source_pdf', 'year_month', 'accountant_id'):
        cur.execute(f'CREATE INDEX IF NOT EXISTS idx_{column} ON signatures({column})')

    conn.commit()
    conn.close()

    print(f"資料庫已建立: {DB_PATH}")
def expand_csv_to_records(csv_path: Path) -> list:
    """
    Expand the extraction CSV into one tuple per signature image.

    CSV columns: filename,page,num_signatures,confidence_avg,image_files
    where image_files is a comma-separated list. Each listed image yields
    one (image_filename, source_pdf, confidence_avg) tuple; rows with an
    empty image_files cell contribute nothing.
    """
    frame = pd.read_csv(csv_path)

    records = []
    for row in frame.itertuples(index=False):
        if pd.isna(row.image_files):
            # No extracted images for this PDF; skip the row.
            continue
        for image_name in row.image_files.split(','):
            records.append((image_name.strip(), row.filename, row.confidence_avg))

    return records
def import_data():
    """Expand the CSV, probe image dimensions, and bulk-load the table."""
    print("讀取 CSV 並展開記錄...")
    records = expand_csv_to_records(CSV_PATH)
    print(f"共 {len(records)} 張簽名圖片待處理")

    print("處理圖片資訊(讀取尺寸)...")
    processed_records = []

    # Dimension probing is disk-I/O bound, so a thread pool speeds it up.
    with ThreadPoolExecutor(max_workers=8) as pool:
        pending = {pool.submit(process_single_image, rec): rec for rec in records}
        for done in tqdm(as_completed(pending), total=len(records), desc="處理圖片"):
            processed_records.append(done.result())

    print("寫入資料庫...")
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()

    # Bulk insert; duplicates (by image_filename) are silently skipped.
    insert_sql = '''
        INSERT OR IGNORE INTO signatures (
            image_filename, source_pdf, year_month, serial_number, doc_type,
            page_number, sig_index, detection_confidence, image_width, image_height
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    '''

    column_order = (
        'image_filename', 'source_pdf', 'year_month', 'serial_number',
        'doc_type', 'page_number', 'sig_index', 'detection_confidence',
        'image_width', 'image_height',
    )
    cursor.executemany(
        insert_sql,
        [tuple(rec[col] for col in column_order) for rec in processed_records],
    )
    conn.commit()

    # Summary statistics for the console report.
    def _one_row(sql: str) -> tuple:
        cursor.execute(sql)
        return cursor.fetchone()

    (total,) = _one_row('SELECT COUNT(*) FROM signatures')
    (pdf_count,) = _one_row('SELECT COUNT(DISTINCT source_pdf) FROM signatures')
    (period_count,) = _one_row('SELECT COUNT(DISTINCT year_month) FROM signatures')
    min_date, max_date = _one_row('SELECT MIN(year_month), MAX(year_month) FROM signatures')

    conn.close()

    print("\n" + "=" * 50)
    print("資料庫建立完成")
    print("=" * 50)
    print(f"簽名總數: {total:,}")
    print(f"PDF 檔案數: {pdf_count:,}")
    print(f"時間範圍: {min_date} ~ {max_date} ({period_count} 個月)")
    print(f"資料庫位置: {DB_PATH}")
def main():
    """Entry point: validate input paths, create the schema, import data."""
    banner = "=" * 50
    print(banner)
    print("Step 1: 建立簽名分析資料庫")
    print(banner)

    # Abort early if either required input is missing.
    required = (
        (CSV_PATH, "錯誤: 找不到 CSV 檔案"),
        (IMAGES_DIR, "錯誤: 找不到圖片目錄"),
    )
    for path, message in required:
        if not path.exists():
            print(f"{message} {path}")
            return

    create_database()
    import_data()
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user