Add Paper A (IEEE TAI) complete draft with Firm A-calibrated dual-method classification

Paper draft includes all sections (Abstract through Conclusion), 36 references,
and supporting scripts. Key methodology: Cosine similarity + dHash dual-method
verification with thresholds calibrated against known-replication firm (Firm A).

Includes:
- 8 section markdown files (paper_a_*.md)
- Ablation study script (ResNet-50 vs VGG-16 vs EfficientNet-B0)
- Recalibrated classification script (84,386 PDFs, 5-tier system)
- Figure generation and Word export scripts
- Citation renumbering script ([1]-[36])
- Signature analysis pipeline (12 steps)
- YOLO extraction scripts

Three rounds of AI review completed (GPT-5.4, Claude Opus 4.6, Gemini 3 Pro).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-06 23:05:33 +08:00
parent 21df0ff387
commit 939a348da4
33 changed files with 9315 additions and 0 deletions
+246
View File
@@ -0,0 +1,246 @@
#!/usr/bin/env python3
"""
Step 1: Build the SQLite database and import signature records.

Imports rows from extraction_results.csv, expanding each image listed in a
row into its own record, parses each image filename to populate
year_month / sig_index, and reads every image to record its width / height.
"""
import sqlite3
import pandas as pd
import cv2
import os
import re
from pathlib import Path
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
# Path configuration
IMAGES_DIR = Path("/Volumes/NV2/PDF-Processing/yolo-signatures/images")  # extracted signature crop images
CSV_PATH = Path("/Volumes/NV2/PDF-Processing/yolo-signatures/extraction_results.csv")  # one row per PDF page with its image list
OUTPUT_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis")  # created on demand by create_database()
DB_PATH = OUTPUT_DIR / "signature_analysis.db"  # SQLite database produced by this script
def parse_image_filename(filename: str) -> dict:
    """Parse an image filename into its structured components.

    Expected pattern: {YYYYMM}_{SERIAL}_{DOCTYPE}_page{PAGE}_sig{N}.png
    Example: 201301_2458_AI1_page4_sig1.png

    Args:
        filename: Image file name, with or without the .png extension.

    Returns:
        dict with keys year_month, serial_number, doc_type, page_number,
        sig_index. page_number and sig_index are ints. All values are None
        when the name does not match the expected pattern.
    """
    # Strip only a trailing '.png'. The previous str.replace('.png', '')
    # removed EVERY '.png' occurrence, so a name whose serial or doc-type
    # segment happened to contain '.png' was silently corrupted.
    name = filename[:-4] if filename.endswith('.png') else filename
    # Pattern: {YYYYMM}_{SERIAL}_{DOCTYPE}_page{PAGE}_sig{N}
    match = re.match(r'^(\d{6})_([^_]+)_([^_]+)_page(\d+)_sig(\d+)$', name)
    if not match:
        # Unparseable name: keep the record but leave all fields empty.
        return dict.fromkeys(
            ('year_month', 'serial_number', 'doc_type', 'page_number', 'sig_index')
        )
    year_month, serial, doc_type, page, sig_index = match.groups()
    return {
        'year_month': year_month,
        'serial_number': serial,
        'doc_type': doc_type,
        'page_number': int(page),
        'sig_index': int(sig_index),
    }
def get_image_dimensions(image_path: Path) -> tuple:
    """Return (width, height) of the image at *image_path*, or (None, None)
    when the file is missing, unreadable, or decoding fails."""
    try:
        loaded = cv2.imread(str(image_path))
    except Exception:
        return None, None
    if loaded is None:
        # cv2.imread signals an unreadable or missing file by returning None
        # rather than raising.
        return None, None
    height, width = loaded.shape[:2]
    return width, height
def process_single_image(args: tuple) -> dict:
    """Build one database-ready record for a single signature image.

    Args:
        args: (image_filename, source_pdf, confidence_avg) triple as
            produced by expand_csv_to_records().

    Returns:
        Flat dict whose keys match the insert columns of the signatures
        table (filename-derived fields are None when parsing fails;
        dimensions are None when the image cannot be read).
    """
    image_filename, source_pdf, confidence_avg = args
    parsed = parse_image_filename(image_filename)
    width, height = get_image_dimensions(IMAGES_DIR / image_filename)
    return dict(
        image_filename=image_filename,
        source_pdf=source_pdf,
        year_month=parsed['year_month'],
        serial_number=parsed['serial_number'],
        doc_type=parsed['doc_type'],
        page_number=parsed['page_number'],
        sig_index=parsed['sig_index'],
        detection_confidence=confidence_avg,
        image_width=width,
        image_height=height,
    )
def create_database():
    """Create the output directory, the SQLite database file, and the
    signatures schema. Idempotent: uses IF NOT EXISTS throughout."""
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    # Main table: one row per extracted signature image. accountant_*,
    # feature_vector and cluster_id are left NULL here and filled by
    # later pipeline steps.
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS signatures (
            signature_id INTEGER PRIMARY KEY AUTOINCREMENT,
            image_filename TEXT UNIQUE NOT NULL,
            source_pdf TEXT NOT NULL,
            year_month TEXT,
            serial_number TEXT,
            doc_type TEXT,
            page_number INTEGER,
            sig_index INTEGER,
            detection_confidence REAL,
            image_width INTEGER,
            image_height INTEGER,
            accountant_name TEXT,
            accountant_id INTEGER,
            feature_vector BLOB,
            cluster_id INTEGER,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    ''')
    # Secondary indexes for the most common lookup columns.
    index_statements = (
        'CREATE INDEX IF NOT EXISTS idx_source_pdf ON signatures(source_pdf)',
        'CREATE INDEX IF NOT EXISTS idx_year_month ON signatures(year_month)',
        'CREATE INDEX IF NOT EXISTS idx_accountant_id ON signatures(accountant_id)',
    )
    for statement in index_statements:
        cursor.execute(statement)
    conn.commit()
    conn.close()
    print(f"資料庫已建立: {DB_PATH}")
def expand_csv_to_records(csv_path: Path) -> list:
    """Expand the extraction CSV into one record per signature image.

    CSV columns: filename,page,num_signatures,confidence_avg,image_files
    where image_files is a comma-separated list of image file names.

    Args:
        csv_path: Path to extraction_results.csv.

    Returns:
        List of (image_filename, source_pdf, confidence_avg) tuples, one
        per individual image. Rows with an empty image_files cell
        contribute nothing.
    """
    df = pd.read_csv(csv_path)
    records = []
    # itertuples avoids the per-row Series construction of iterrows and is
    # markedly faster on large CSVs.
    for row in df.itertuples(index=False):
        image_files_str = row.image_files
        if pd.isna(image_files_str):
            continue
        for img_file in str(image_files_str).split(','):
            img_file = img_file.strip()
            # Skip empty entries produced by trailing or duplicate commas,
            # which previously yielded bogus empty-filename records.
            if img_file:
                records.append((img_file, row.filename, row.confidence_avg))
    return records
def import_data():
    """Load the CSV, enrich each image record (filename parse + dimensions),
    bulk-insert everything into SQLite, then print summary statistics."""
    print("讀取 CSV 並展開記錄...")
    pending = expand_csv_to_records(CSV_PATH)
    print(f"{len(pending)} 張簽名圖片待處理")
    print("處理圖片資訊(讀取尺寸)...")
    enriched = []
    # Reading image sizes is I/O bound, so a thread pool overlaps disk waits.
    with ThreadPoolExecutor(max_workers=8) as pool:
        jobs = {pool.submit(process_single_image, item): item for item in pending}
        for done in tqdm(as_completed(jobs), total=len(pending), desc="處理圖片"):
            enriched.append(done.result())
    print("寫入資料庫...")
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    # Bulk insert; OR IGNORE makes reruns safe against the UNIQUE
    # image_filename constraint.
    insert_sql = '''
        INSERT OR IGNORE INTO signatures (
            image_filename, source_pdf, year_month, serial_number, doc_type,
            page_number, sig_index, detection_confidence, image_width, image_height
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    '''
    columns = (
        'image_filename', 'source_pdf', 'year_month', 'serial_number', 'doc_type',
        'page_number', 'sig_index', 'detection_confidence', 'image_width', 'image_height',
    )
    cursor.executemany(
        insert_sql,
        (tuple(record[col] for col in columns) for record in enriched),
    )
    conn.commit()
    # Summary statistics for the console report below.
    def fetch_row(query):
        cursor.execute(query)
        return cursor.fetchone()
    total, = fetch_row('SELECT COUNT(*) FROM signatures')
    pdf_count, = fetch_row('SELECT COUNT(DISTINCT source_pdf) FROM signatures')
    period_count, = fetch_row('SELECT COUNT(DISTINCT year_month) FROM signatures')
    min_date, max_date = fetch_row('SELECT MIN(year_month), MAX(year_month) FROM signatures')
    conn.close()
    print("\n" + "=" * 50)
    print("資料庫建立完成")
    print("=" * 50)
    print(f"簽名總數: {total:,}")
    print(f"PDF 檔案數: {pdf_count:,}")
    print(f"時間範圍: {min_date} ~ {max_date} ({period_count} 個月)")
    print(f"資料庫位置: {DB_PATH}")
def main():
    """Entry point: validate that the input files exist, then build the
    database schema and import all records."""
    banner = "=" * 50
    print(banner)
    print("Step 1: 建立簽名分析資料庫")
    print(banner)
    # Guard clauses: bail out with a message as soon as a required input
    # is missing (CSV first, then the image directory).
    required_inputs = (
        (CSV_PATH, f"錯誤: 找不到 CSV 檔案 {CSV_PATH}"),
        (IMAGES_DIR, f"錯誤: 找不到圖片目錄 {IMAGES_DIR}"),
    )
    for path, error_message in required_inputs:
        if not path.exists():
            print(error_message)
            return
    create_database()
    import_data()


if __name__ == "__main__":
    main()