Add Paper A (IEEE TAI) complete draft with Firm A-calibrated dual-method classification
Paper draft includes all sections (Abstract through Conclusion), 36 references, and supporting scripts. Key methodology: Cosine similarity + dHash dual-method verification with thresholds calibrated against known-replication firm (Firm A). Includes: - 8 section markdown files (paper_a_*.md) - Ablation study script (ResNet-50 vs VGG-16 vs EfficientNet-B0) - Recalibrated classification script (84,386 PDFs, 5-tier system) - Figure generation and Word export scripts - Citation renumbering script ([1]-[36]) - Signature analysis pipeline (12 steps) - YOLO extraction scripts Three rounds of AI review completed (GPT-5.4, Claude Opus 4.6, Gemini 3 Pro). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,246 @@
|
||||
#!/usr/bin/env python3
"""
Step 1: Build the SQLite database and import signature records.

Imports data from extraction_results.csv, expanding each listed image
into its own record.
Parses each image filename to populate year_month and sig_index.
Reads each image to record its width and height.
"""

import sqlite3
import pandas as pd
import cv2
import os
import re
from pathlib import Path
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Path configuration
IMAGES_DIR = Path("/Volumes/NV2/PDF-Processing/yolo-signatures/images")
CSV_PATH = Path("/Volumes/NV2/PDF-Processing/yolo-signatures/extraction_results.csv")
OUTPUT_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis")
DB_PATH = OUTPUT_DIR / "signature_analysis.db"
def parse_image_filename(filename: str) -> dict:
    """
    Parse a signature-image filename into structured fields.

    Expected pattern: {YYYYMM}_{SERIAL}_{DOCTYPE}_page{PAGE}_sig{N}.png
    Example: 201301_2458_AI1_page4_sig1.png

    Returns a dict with keys year_month, serial_number, doc_type,
    page_number, sig_index. All values are None when the name does not
    match the expected pattern.
    """
    # Strip only a *trailing* '.png' extension. The previous
    # str.replace('.png', '') would also delete '.png' occurring
    # mid-name and silently corrupt the serial segment.
    name = filename[:-4] if filename.endswith('.png') else filename

    # Pattern: {YYYYMM}_{SERIAL}_{DOCTYPE}_page{PAGE}_sig{N}
    match = re.match(r'^(\d{6})_([^_]+)_([^_]+)_page(\d+)_sig(\d+)$', name)

    if match:
        year_month, serial, doc_type, page, sig_index = match.groups()
        return {
            'year_month': year_month,
            'serial_number': serial,
            'doc_type': doc_type,
            'page_number': int(page),
            'sig_index': int(sig_index)
        }

    # Unparseable filename: keep the schema but null every field.
    return {
        'year_month': None,
        'serial_number': None,
        'doc_type': None,
        'page_number': None,
        'sig_index': None
    }
def get_image_dimensions(image_path: Path) -> tuple:
    """Return (width, height) of the image, or (None, None) on failure."""
    try:
        loaded = cv2.imread(str(image_path))
        if loaded is None:
            # cv2.imread reports a missing/unreadable file by returning None.
            return None, None
        height, width = loaded.shape[:2]
        return width, height
    except Exception:
        return None, None
def process_single_image(args: tuple) -> dict:
    """
    Build one database record for a single signature image.

    *args* is a (image_filename, source_pdf, confidence_avg) triple.
    """
    image_filename, source_pdf, confidence_avg = args

    # Structured fields come from the filename itself.
    parsed = parse_image_filename(image_filename)

    # Pixel dimensions require opening the file on disk.
    width, height = get_image_dimensions(IMAGES_DIR / image_filename)

    return {
        'image_filename': image_filename,
        'source_pdf': source_pdf,
        **{key: parsed[key] for key in (
            'year_month', 'serial_number', 'doc_type',
            'page_number', 'sig_index')},
        'detection_confidence': confidence_avg,
        'image_width': width,
        'image_height': height
    }
def create_database():
    """Create the signature-analysis schema (table + indexes); idempotent."""
    # Make sure the output directory exists before connecting.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()

    # One row per extracted signature image; the filename is the natural key.
    cur.execute('''
        CREATE TABLE IF NOT EXISTS signatures (
            signature_id INTEGER PRIMARY KEY AUTOINCREMENT,
            image_filename TEXT UNIQUE NOT NULL,
            source_pdf TEXT NOT NULL,
            year_month TEXT,
            serial_number TEXT,
            doc_type TEXT,
            page_number INTEGER,
            sig_index INTEGER,
            detection_confidence REAL,
            image_width INTEGER,
            image_height INTEGER,
            accountant_name TEXT,
            accountant_id INTEGER,
            feature_vector BLOB,
            cluster_id INTEGER,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    ''')

    # Indexes for the common lookup columns.
    for column in ('source_pdf', 'year_month', 'accountant_id'):
        cur.execute(f'CREATE INDEX IF NOT EXISTS idx_{column} ON signatures({column})')

    conn.commit()
    conn.close()

    print(f"資料庫已建立: {DB_PATH}")
def expand_csv_to_records(csv_path: Path) -> list:
    """
    Expand the extraction CSV into one tuple per signature image.

    CSV columns: filename,page,num_signatures,confidence_avg,image_files
    where image_files is a comma-separated list. Each listed image yields
    one (image_filename, source_pdf, confidence_avg) tuple; rows with an
    empty image_files cell contribute nothing.
    """
    frame = pd.read_csv(csv_path)

    records = []
    for row in frame.itertuples(index=False):
        if pd.isna(row.image_files):
            # No extracted images for this PDF; skip the row.
            continue
        for image_name in row.image_files.split(','):
            records.append((image_name.strip(), row.filename, row.confidence_avg))

    return records
def import_data():
    """Expand the CSV, probe image dimensions, and bulk-load the table."""
    print("讀取 CSV 並展開記錄...")
    records = expand_csv_to_records(CSV_PATH)
    print(f"共 {len(records)} 張簽名圖片待處理")

    print("處理圖片資訊(讀取尺寸)...")
    processed_records = []

    # Dimension probing is disk-I/O bound, so a thread pool speeds it up.
    with ThreadPoolExecutor(max_workers=8) as pool:
        pending = {pool.submit(process_single_image, rec): rec for rec in records}
        for done in tqdm(as_completed(pending), total=len(records), desc="處理圖片"):
            processed_records.append(done.result())

    print("寫入資料庫...")
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()

    # Bulk insert; duplicates (by image_filename) are silently skipped.
    insert_sql = '''
        INSERT OR IGNORE INTO signatures (
            image_filename, source_pdf, year_month, serial_number, doc_type,
            page_number, sig_index, detection_confidence, image_width, image_height
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    '''

    column_order = (
        'image_filename', 'source_pdf', 'year_month', 'serial_number',
        'doc_type', 'page_number', 'sig_index', 'detection_confidence',
        'image_width', 'image_height',
    )
    cursor.executemany(
        insert_sql,
        [tuple(rec[col] for col in column_order) for rec in processed_records],
    )
    conn.commit()

    # Summary statistics for the console report.
    def _one_row(sql: str) -> tuple:
        cursor.execute(sql)
        return cursor.fetchone()

    (total,) = _one_row('SELECT COUNT(*) FROM signatures')
    (pdf_count,) = _one_row('SELECT COUNT(DISTINCT source_pdf) FROM signatures')
    (period_count,) = _one_row('SELECT COUNT(DISTINCT year_month) FROM signatures')
    min_date, max_date = _one_row('SELECT MIN(year_month), MAX(year_month) FROM signatures')

    conn.close()

    print("\n" + "=" * 50)
    print("資料庫建立完成")
    print("=" * 50)
    print(f"簽名總數: {total:,}")
    print(f"PDF 檔案數: {pdf_count:,}")
    print(f"時間範圍: {min_date} ~ {max_date} ({period_count} 個月)")
    print(f"資料庫位置: {DB_PATH}")
def main():
    """Entry point: validate input paths, create the schema, import data."""
    banner = "=" * 50
    print(banner)
    print("Step 1: 建立簽名分析資料庫")
    print(banner)

    # Abort early if either required input is missing.
    required = (
        (CSV_PATH, "錯誤: 找不到 CSV 檔案"),
        (IMAGES_DIR, "錯誤: 找不到圖片目錄"),
    )
    for path, message in required:
        if not path.exists():
            print(f"{message} {path}")
            return

    create_database()
    import_data()
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user