Add Paper A (IEEE TAI) complete draft with Firm A-calibrated dual-method classification

Paper draft includes all sections (Abstract through Conclusion), 36 references,
and supporting scripts. Key methodology: Cosine similarity + dHash dual-method
verification with thresholds calibrated against known-replication firm (Firm A).

Includes:
- 8 section markdown files (paper_a_*.md)
- Ablation study script (ResNet-50 vs VGG-16 vs EfficientNet-B0)
- Recalibrated classification script (84,386 PDFs, 5-tier system)
- Figure generation and Word export scripts
- Citation renumbering script ([1]-[36])
- Signature analysis pipeline (12 steps)
- YOLO extraction scripts

Three rounds of AI review completed (GPT-5.4, Claude Opus 4.6, Gemini 3 Pro).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-06 23:05:33 +08:00
parent 21df0ff387
commit 939a348da4
33 changed files with 9315 additions and 0 deletions
+246
View File
@@ -0,0 +1,246 @@
#!/usr/bin/env python3
"""
Step 1: 建立 SQLite 資料庫,匯入簽名記錄
從 extraction_results.csv 匯入資料,展開每個圖片為獨立記錄
解析圖片檔名填充 year_month, sig_index
計算圖片尺寸 width, height
"""
import sqlite3
import pandas as pd
import cv2
import os
import re
from pathlib import Path
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
# --- Path configuration ---
# YOLO signature crops and the extraction CSV are read from the external
# volume; all analysis artifacts (including the SQLite DB) go to OUTPUT_DIR.
IMAGES_DIR = Path("/Volumes/NV2/PDF-Processing/yolo-signatures/images")
CSV_PATH = Path("/Volumes/NV2/PDF-Processing/yolo-signatures/extraction_results.csv")
OUTPUT_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis")
DB_PATH = OUTPUT_DIR / "signature_analysis.db"
def parse_image_filename(filename: str) -> dict:
    """Parse a signature crop filename into structured fields.

    Expected form: {YYYYMM}_{SERIAL}_{DOCTYPE}_page{PAGE}_sig{N}.png,
    e.g. 201301_2458_AI1_page4_sig1.png.

    Returns:
        dict with keys year_month, serial_number, doc_type, page_number,
        sig_index; every value is None when the name does not match.
    """
    # Strip the extension only when it is actually the suffix; the original
    # str.replace('.png', '') would also remove '.png' occurring mid-name.
    name = filename[:-4] if filename.endswith('.png') else filename
    match = re.match(r'^(\d{6})_([^_]+)_([^_]+)_page(\d+)_sig(\d+)$', name)
    if not match:
        # Unparseable name: keep the schema but signal "unknown" with Nones.
        return {
            'year_month': None,
            'serial_number': None,
            'doc_type': None,
            'page_number': None,
            'sig_index': None
        }
    year_month, serial, doc_type, page, sig_index = match.groups()
    return {
        'year_month': year_month,
        'serial_number': serial,
        'doc_type': doc_type,
        'page_number': int(page),
        'sig_index': int(sig_index)
    }
def get_image_dimensions(image_path: Path) -> tuple:
    """Return (width, height) of the image at image_path, or (None, None) if unreadable."""
    try:
        data = cv2.imread(str(image_path))
        if data is not None:
            height, width = data.shape[:2]
            return width, height
    except Exception:
        # Fall through: treat any read failure like a missing image.
        pass
    return None, None
def process_single_image(args: tuple) -> dict:
    """Build one DB record for a crop: filename metadata plus pixel dimensions.

    Args:
        args: (image_filename, source_pdf, confidence_avg) tuple.
    """
    image_filename, source_pdf, confidence_avg = args
    parsed = parse_image_filename(image_filename)
    width, height = get_image_dimensions(IMAGES_DIR / image_filename)
    # Assemble the record in the column order used by the INSERT later on.
    record = {'image_filename': image_filename, 'source_pdf': source_pdf}
    for field in ('year_month', 'serial_number', 'doc_type', 'page_number', 'sig_index'):
        record[field] = parsed[field]
    record['detection_confidence'] = confidence_avg
    record['image_width'] = width
    record['image_height'] = height
    return record
def create_database():
    """Create the signature analysis database and its schema (idempotent)."""
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    # Main table: one row per extracted signature image.
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS signatures (
            signature_id INTEGER PRIMARY KEY AUTOINCREMENT,
            image_filename TEXT UNIQUE NOT NULL,
            source_pdf TEXT NOT NULL,
            year_month TEXT,
            serial_number TEXT,
            doc_type TEXT,
            page_number INTEGER,
            sig_index INTEGER,
            detection_confidence REAL,
            image_width INTEGER,
            image_height INTEGER,
            accountant_name TEXT,
            accountant_id INTEGER,
            feature_vector BLOB,
            cluster_id INTEGER,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    ''')
    # Lookup indexes used by the later pipeline steps.
    for index_sql in (
        'CREATE INDEX IF NOT EXISTS idx_source_pdf ON signatures(source_pdf)',
        'CREATE INDEX IF NOT EXISTS idx_year_month ON signatures(year_month)',
        'CREATE INDEX IF NOT EXISTS idx_accountant_id ON signatures(accountant_id)',
    ):
        cursor.execute(index_sql)
    conn.commit()
    conn.close()
    print(f"資料庫已建立: {DB_PATH}")
def expand_csv_to_records(csv_path: Path) -> list:
    """Expand each CSV row into one tuple per extracted image.

    CSV columns: filename,page,num_signatures,confidence_avg,image_files
    where image_files is a comma-separated list.

    Returns:
        list of (image_filename, source_pdf, confidence_avg) tuples.
    """
    frame = pd.read_csv(csv_path)
    expanded = []
    for pdf_name, confidence, files_field in zip(
            frame['filename'], frame['confidence_avg'], frame['image_files']):
        # Rows without any extracted images carry NaN in image_files.
        if pd.isna(files_field):
            continue
        expanded.extend(
            (name.strip(), pdf_name, confidence) for name in files_field.split(',')
        )
    return expanded
def import_data():
    """Read the extraction CSV, enrich each image record, and bulk-insert into SQLite."""
    print("讀取 CSV 並展開記錄...")
    records = expand_csv_to_records(CSV_PATH)
    print(f"{len(records)} 張簽名圖片待處理")
    print("處理圖片資訊(讀取尺寸)...")
    processed_records = []
    # Reading image dimensions is I/O bound, so a thread pool speeds it up.
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = {executor.submit(process_single_image, r): r for r in records}
        for future in tqdm(as_completed(futures), total=len(records), desc="處理圖片"):
            result = future.result()
            processed_records.append(result)
    print("寫入資料庫...")
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    # Batch insert; OR IGNORE skips rows whose image_filename already exists.
    insert_sql = '''
        INSERT OR IGNORE INTO signatures (
            image_filename, source_pdf, year_month, serial_number, doc_type,
            page_number, sig_index, detection_confidence, image_width, image_height
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    '''
    batch_data = [
        (
            r['image_filename'], r['source_pdf'], r['year_month'], r['serial_number'],
            r['doc_type'], r['page_number'], r['sig_index'], r['detection_confidence'],
            r['image_width'], r['image_height']
        )
        for r in processed_records
    ]
    cursor.executemany(insert_sql, batch_data)
    conn.commit()
    # Summary statistics for the console report below.
    cursor.execute('SELECT COUNT(*) FROM signatures')
    total = cursor.fetchone()[0]
    cursor.execute('SELECT COUNT(DISTINCT source_pdf) FROM signatures')
    pdf_count = cursor.fetchone()[0]
    cursor.execute('SELECT COUNT(DISTINCT year_month) FROM signatures')
    period_count = cursor.fetchone()[0]
    cursor.execute('SELECT MIN(year_month), MAX(year_month) FROM signatures')
    min_date, max_date = cursor.fetchone()
    conn.close()
    print("\n" + "=" * 50)
    print("資料庫建立完成")
    print("=" * 50)
    print(f"簽名總數: {total:,}")
    print(f"PDF 檔案數: {pdf_count:,}")
    print(f"時間範圍: {min_date} ~ {max_date} ({period_count} 個月)")
    print(f"資料庫位置: {DB_PATH}")
def main():
    """Entry point: validate the source files, then build and populate the DB."""
    print("=" * 50)
    print("Step 1: 建立簽名分析資料庫")
    print("=" * 50)
    # Fail fast if either input location is missing.
    source_checks = (
        (CSV_PATH, f"錯誤: 找不到 CSV 檔案 {CSV_PATH}"),
        (IMAGES_DIR, f"錯誤: 找不到圖片目錄 {IMAGES_DIR}"),
    )
    for path, complaint in source_checks:
        if not path.exists():
            print(complaint)
            return
    # Schema first, then bulk import.
    create_database()
    import_data()


if __name__ == "__main__":
    main()
+241
View File
@@ -0,0 +1,241 @@
#!/usr/bin/env python3
"""
Step 2: 使用 ResNet-50 提取簽名圖片的特徵向量
預處理流程:
1. 載入圖片 (RGB)
2. 縮放至 224x224(保持比例,填充白色)
3. 正規化 (ImageNet mean/std)
4. 通過 ResNet-50 (去掉最後分類層)
5. L2 正規化
6. 輸出 2048 維特徵向量
"""
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import numpy as np
import cv2
import sqlite3
from pathlib import Path
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
# --- Path configuration ---
# Crops produced by the Step 1 pipeline; features and the DB live under OUTPUT_DIR.
IMAGES_DIR = Path("/Volumes/NV2/PDF-Processing/yolo-signatures/images")
OUTPUT_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis")
DB_PATH = OUTPUT_DIR / "signature_analysis.db"
FEATURES_PATH = OUTPUT_DIR / "features"
# --- Model configuration ---
BATCH_SIZE = 64  # images per forward pass
NUM_WORKERS = 4  # DataLoader worker processes
# Device preference: Apple MPS, then CUDA, else CPU.
DEVICE = torch.device("mps" if torch.backends.mps.is_available() else
                      "cuda" if torch.cuda.is_available() else "cpu")
class SignatureDataset(Dataset):
    """Dataset yielding (image tensor, filename) pairs for signature crops."""

    def __init__(self, image_paths: list, transform=None):
        # Paths are only resolved lazily, inside __getitem__.
        self.image_paths = image_paths
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        path = self.image_paths[idx]
        raw = cv2.imread(str(path))
        if raw is None:
            # Unreadable file: substitute an all-white 224x224 placeholder.
            rgb = np.full((224, 224, 3), 255, dtype=np.uint8)
        else:
            rgb = cv2.cvtColor(raw, cv2.COLOR_BGR2RGB)
        # Aspect-preserving resize onto a white 224x224 canvas.
        rgb = self.resize_with_padding(rgb, 224, 224)
        if self.transform:
            rgb = self.transform(rgb)
        return rgb, str(path.name)

    @staticmethod
    def resize_with_padding(img, target_w, target_h):
        """Scale img to fit the target box and centre it on a white canvas."""
        src_h, src_w = img.shape[:2]
        ratio = min(target_w / src_w, target_h / src_h)
        scaled_w, scaled_h = int(src_w * ratio), int(src_h * ratio)
        scaled = cv2.resize(img, (scaled_w, scaled_h), interpolation=cv2.INTER_AREA)
        board = np.full((target_h, target_w, 3), 255, dtype=np.uint8)
        left = (target_w - scaled_w) // 2
        top = (target_h - scaled_h) // 2
        board[top:top + scaled_h, left:left + scaled_w] = scaled
        return board
class FeatureExtractor:
    """Headless ResNet-50 mapping image batches to L2-normalized 2048-d vectors."""

    def __init__(self, device):
        self.device = device
        print(f"載入 ResNet-50 模型... (device: {device})")
        backbone = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
        # Drop the final FC classifier; keep everything through global pooling.
        headless = nn.Sequential(*list(backbone.children())[:-1])
        self.model = headless.to(device)
        self.model.eval()
        # Standard ImageNet normalization applied after ToTensor.
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225]
            )
        ])

    @torch.no_grad()
    def extract_batch(self, images):
        """Run one batch through the backbone; returns a [B, 2048] numpy array."""
        pooled = self.model(images.to(self.device))
        flat = pooled.squeeze(-1).squeeze(-1)  # [B, 2048, 1, 1] -> [B, 2048]
        unit = nn.functional.normalize(flat, p=2, dim=1)
        return unit.cpu().numpy()
def get_image_list_from_db(db_path=None):
    """Return every image_filename from the signatures table, ordered by id.

    Args:
        db_path: optional database path; defaults to the module-level DB_PATH.
            (Parameter added so the query is reusable/testable against other DBs.)

    Returns:
        list[str] of filenames in signature_id order (matches feature-row order).
    """
    conn = sqlite3.connect(DB_PATH if db_path is None else db_path)
    try:
        rows = conn.execute(
            'SELECT image_filename FROM signatures ORDER BY signature_id'
        ).fetchall()
    finally:
        # Always release the handle, even if the query fails.
        conn.close()
    return [filename for (filename,) in rows]
def save_features_to_db(features_dict: dict, db_path=None):
    """Persist feature vectors as BLOBs on their matching signatures rows.

    Args:
        features_dict: {image_filename: 1-D float numpy array}.
        db_path: optional database path; defaults to the module-level DB_PATH.

    Improvement vs. original: a single executemany with one commit replaces the
    per-row execute loop (and its tqdm progress bar), which is far faster for
    tens of thousands of rows; the connection is closed even on failure.
    """
    conn = sqlite3.connect(DB_PATH if db_path is None else db_path)
    try:
        conn.executemany(
            'UPDATE signatures SET feature_vector = ? WHERE image_filename = ?',
            [(vec.tobytes(), name) for name, vec in features_dict.items()],
        )
        conn.commit()
    finally:
        conn.close()
def main():
    """Extract a 2048-d feature vector for every signature and persist the results."""
    print("=" * 60)
    print("Step 2: ResNet-50 特徵向量提取")
    print("=" * 60)
    print(f"裝置: {DEVICE}")
    # Make sure the output directory exists.
    FEATURES_PATH.mkdir(parents=True, exist_ok=True)
    # Image list comes from the Step 1 database, ordered by signature_id.
    print("從資料庫讀取圖片列表...")
    filenames = get_image_list_from_db()
    print(f"{len(filenames):,} 張圖片待處理")
    image_paths = [IMAGES_DIR / f for f in filenames]
    extractor = FeatureExtractor(DEVICE)
    dataset = SignatureDataset(image_paths, transform=extractor.transform)
    dataloader = DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,  # preserve DB order so features align with filenames
        num_workers=NUM_WORKERS,
        pin_memory=True
    )
    # Extract features batch by batch.
    print(f"\n開始提取特徵 (batch_size={BATCH_SIZE})...")
    all_features = []
    all_filenames = []
    for images, batch_filenames in tqdm(dataloader, desc="提取特徵"):
        features = extractor.extract_batch(images)
        all_features.append(features)
        all_filenames.extend(batch_filenames)
    # Stack per-batch arrays into one [N, 2048] matrix.
    all_features = np.vstack(all_features)
    print(f"\n特徵矩陣形狀: {all_features.shape}")
    # Save a .npy backup alongside the database copy.
    npy_path = FEATURES_PATH / "signature_features.npy"
    np.save(npy_path, all_features)
    print(f"特徵向量已儲存: {npy_path} ({all_features.nbytes / 1e9:.2f} GB)")
    # The filename list preserves row order for later index lookups.
    filenames_path = FEATURES_PATH / "signature_filenames.txt"
    with open(filenames_path, 'w') as f:
        for fn in all_filenames:
            f.write(fn + '\n')
    print(f"檔名列表已儲存: {filenames_path}")
    # Mirror the vectors into the signatures table.
    print("\n更新資料庫中的特徵向量...")
    features_dict = dict(zip(all_filenames, all_features))
    save_features_to_db(features_dict)
    # Final statistics.
    print("\n" + "=" * 60)
    print("特徵提取完成")
    print("=" * 60)
    print(f"處理圖片數: {len(all_filenames):,}")
    print(f"特徵維度: {all_features.shape[1]}")
    print(f"特徵檔案: {npy_path}")
    print(f"檔案大小: {all_features.nbytes / 1e9:.2f} GB")
    # Sanity statistics over the whole matrix.
    print("\n特徵統計:")
    print(f" 平均值: {all_features.mean():.6f}")
    print(f" 標準差: {all_features.std():.6f}")
    print(f" 最小值: {all_features.min():.6f}")
    print(f" 最大值: {all_features.max():.6f}")
    # Every row should have L2 norm 1.0 after normalization.
    norms = np.linalg.norm(all_features, axis=1)
    print(f" L2 norm: {norms.mean():.6f} ± {norms.std():.6f}")


if __name__ == "__main__":
    main()
@@ -0,0 +1,368 @@
#!/usr/bin/env python3
"""
Step 3: 相似度分布探索
1. 隨機抽樣 100,000 對簽名
2. 計算 cosine similarity
3. 繪製直方圖分布
4. 找出高相似度對 (>0.95)
5. 分析高相似度對的來源
"""
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from tqdm import tqdm
import random
from collections import defaultdict
import json
# --- Path configuration ---
OUTPUT_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis")
FEATURES_PATH = OUTPUT_DIR / "features" / "signature_features.npy"
FILENAMES_PATH = OUTPUT_DIR / "features" / "signature_filenames.txt"
REPORTS_PATH = OUTPUT_DIR / "reports"
# --- Analysis configuration ---
NUM_RANDOM_PAIRS = 100000  # random pairs sampled for the distribution estimate
HIGH_SIMILARITY_THRESHOLD = 0.95  # cut-off for "high similarity"
VERY_HIGH_SIMILARITY_THRESHOLD = 0.99  # cut-off for near-duplicates
def load_data(features_path=None, filenames_path=None):
    """Load the feature matrix and its row-aligned filename list.

    Args:
        features_path: optional .npy path; defaults to FEATURES_PATH.
        filenames_path: optional text-file path; defaults to FILENAMES_PATH.
            (Parameters added so the loader can target other runs or tests.)

    Returns:
        (features ndarray, list[str] of filenames), row i of features
        corresponds to filenames[i].
    """
    features_path = FEATURES_PATH if features_path is None else Path(features_path)
    filenames_path = FILENAMES_PATH if filenames_path is None else Path(filenames_path)
    print("載入特徵向量...")
    features = np.load(features_path)
    print(f"特徵矩陣形狀: {features.shape}")
    print("載入檔名列表...")
    with open(filenames_path, 'r') as f:
        # Iterate the file directly; readlines() materialized it needlessly.
        filenames = [line.strip() for line in f]
    print(f"檔名數量: {len(filenames)}")
    return features, filenames
def parse_filename(filename: str) -> dict:
    """Split a crop filename into its components (Step 3 local variant).

    Example: 201301_2458_AI1_page4_sig1.png ->
    {'year_month': '201301', 'serial': '2458', 'doc_type': 'AI1',
     'page': '4', 'sig_index': '1'}.
    Unparseable names yield {'raw': filename}.
    """
    # Only strip a real '.png' suffix; str.replace would also remove
    # '.png' occurring in the middle of the name.
    stem = filename[:-4] if filename.endswith('.png') else filename
    parts = stem.split('_')
    if len(parts) >= 5:
        return {
            'year_month': parts[0],
            'serial': parts[1],
            'doc_type': parts[2],
            'page': parts[3].replace('page', ''),
            'sig_index': parts[4].replace('sig', '')
        }
    return {'raw': filename}
def cosine_similarity(v1, v2):
    """Cosine similarity of two already L2-normalized vectors (= inner product)."""
    return np.inner(v1, v2)
def random_sampling_analysis(features, filenames, n_pairs=100000):
    """Estimate the similarity distribution from n_pairs random signature pairs.

    Returns:
        (ndarray of similarities, list of sampled (i, j) index pairs).
    """
    print(f"\n隨機抽樣 {n_pairs:,} 對簽名...")
    total = len(filenames)
    sampled_sims = []
    sampled_pairs = []
    for _ in tqdm(range(n_pairs), desc="計算相似度"):
        # random.sample guarantees two distinct indices.
        a, b = random.sample(range(total), 2)
        sampled_pairs.append((a, b))
        sampled_sims.append(cosine_similarity(features[a], features[b]))
    return np.array(sampled_sims), sampled_pairs
def find_high_similarity_pairs(features, filenames, threshold=0.95, sample_size=100000):
    """Random-pair scan for signature pairs whose similarity exceeds threshold.

    Exhaustive n^2 comparison is infeasible at this scale, so sample_size
    random pairs are drawn and only the hits are kept.
    NOTE(review): main() uses systematic_high_similarity_search instead;
    this function appears unused.
    """
    print(f"\n搜尋相似度 > {threshold} 的簽名對...")
    total = len(filenames)
    hits = []
    for _ in tqdm(range(sample_size), desc="搜尋高相似度"):
        a, b = random.sample(range(total), 2)
        score = cosine_similarity(features[a], features[b])
        if score > threshold:
            hits.append({
                'idx1': a,
                'idx2': b,
                'file1': filenames[a],
                'file2': filenames[b],
                'similarity': float(score),
                'parsed1': parse_filename(filenames[a]),
                'parsed2': parse_filename(filenames[b])
            })
    return hits
def systematic_high_similarity_search(features, filenames, threshold=0.95, batch_size=1000):
    """For a random subset of query signatures, find all others above threshold.

    Samples up to 5000 query indices and compares each one against the full
    matrix with a single matrix-vector product; duplicate (i, j) pairs are
    deduplicated via a canonical ordered key. batch_size is currently unused
    (kept for interface compatibility).
    """
    print(f"\n系統化搜尋高相似度對 (threshold={threshold})...")
    print("這會對每個簽名找出最相似的候選...")
    total = len(filenames)
    found = []
    visited = set()
    queries = random.sample(range(total), min(5000, total))
    for q in tqdm(queries, desc="搜尋"):
        # One matrix-vector product gives similarities against everyone.
        scores = features @ features[q]
        for raw in np.where(scores > threshold)[0]:
            other = int(raw)
            if other == q:
                continue  # skip self-match
            key = (q, other) if q < other else (other, q)
            if key in visited:
                continue
            visited.add(key)
            found.append({
                'idx1': int(q),
                'idx2': other,
                'file1': filenames[q],
                'file2': filenames[other],
                'similarity': float(scores[other]),
                'parsed1': parse_filename(filenames[q]),
                'parsed2': parse_filename(filenames[other])
            })
    return found
def analyze_high_similarity_sources(high_sim_pairs):
    """Classify each high-similarity pair by how its two signatures relate.

    Buckets (first match wins): same PDF (year_month + serial + doc_type all
    equal), same month, same document type, or entirely different.
    """
    print("\n分析高相似度對的來源...")
    tally = {
        'same_pdf': 0,
        'same_year_month': 0,
        'same_doc_type': 0,
        'different_everything': 0,
        'total': len(high_sim_pairs)
    }
    for entry in high_sim_pairs:
        a = entry.get('parsed1', {})
        b = entry.get('parsed2', {})
        same_month = a.get('year_month') == b.get('year_month')
        same_type = a.get('doc_type') == b.get('doc_type')
        if same_month and same_type and a.get('serial') == b.get('serial'):
            tally['same_pdf'] += 1
        elif same_month:
            tally['same_year_month'] += 1
        elif same_type:
            tally['same_doc_type'] += 1
        else:
            tally['different_everything'] += 1
    return tally
def plot_similarity_distribution(similarities, output_path):
    """Plot the sampled similarity distribution: full range plus a zoom on > 0.8."""
    print("\n繪製分布圖...")
    try:
        # Work with a plain Python list to sidestep numpy/matplotlib dtype issues.
        sim_list = similarities.tolist()
        fig, axes = plt.subplots(1, 2, figsize=(14, 5))
        # Left panel: full distribution, with explicit bin edges.
        ax1 = axes[0]
        ax1.hist(sim_list, bins=np.linspace(min(sim_list), max(sim_list), 101).tolist(),
                 density=True, alpha=0.7, color='steelblue', edgecolor='white')
        ax1.axvline(x=0.95, color='red', linestyle='--', label='0.95 threshold')
        ax1.axvline(x=0.99, color='darkred', linestyle='--', label='0.99 threshold')
        ax1.set_xlabel('Cosine Similarity', fontsize=12)
        ax1.set_ylabel('Density', fontsize=12)
        ax1.set_title('Signature Similarity Distribution (Random Sampling)', fontsize=14)
        ax1.legend()
        # Mean/std annotation box in the top-left corner.
        mean_sim = float(np.mean(similarities))
        std_sim = float(np.std(similarities))
        ax1.annotate(f'Mean: {mean_sim:.4f}\nStd: {std_sim:.4f}',
                     xy=(0.02, 0.95), xycoords='axes fraction',
                     fontsize=10, verticalalignment='top',
                     bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
        # Right panel: zoom on the high-similarity tail (> 0.8).
        ax2 = axes[1]
        high_sim_list = [x for x in sim_list if x > 0.8]
        if len(high_sim_list) > 0:
            ax2.hist(high_sim_list, bins=np.linspace(0.8, max(high_sim_list), 51).tolist(),
                     density=True, alpha=0.7, color='coral', edgecolor='white')
        ax2.axvline(x=0.95, color='red', linestyle='--', label='0.95 threshold')
        ax2.axvline(x=0.99, color='darkred', linestyle='--', label='0.99 threshold')
        ax2.set_xlabel('Cosine Similarity', fontsize=12)
        ax2.set_ylabel('Density', fontsize=12)
        ax2.set_title('High Similarity Region (> 0.8)', fontsize=14)
        ax2.legend()
        # Share of sampled pairs above each threshold.
        pct_95 = int((similarities > 0.95).sum()) / len(similarities) * 100
        pct_99 = int((similarities > 0.99).sum()) / len(similarities) * 100
        ax2.annotate(f'> 0.95: {pct_95:.4f}%\n> 0.99: {pct_99:.4f}%',
                     xy=(0.98, 0.95), xycoords='axes fraction',
                     fontsize=10, verticalalignment='top', horizontalalignment='right',
                     bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
        plt.tight_layout()
        plt.savefig(output_path, dpi=150, bbox_inches='tight')
        plt.close()
        print(f"分布圖已儲存: {output_path}")
    except Exception as e:
        # Plotting is best-effort; the numeric report is produced regardless.
        print(f"繪圖失敗: {e}")
        print("跳過繪圖,繼續其他分析...")
def generate_statistics_report(similarities, high_sim_pairs, source_stats, output_path,
                               threshold=None):
    """Assemble the JSON statistics report and write it to output_path.

    Args:
        similarities: 1-D array of sampled cosine similarities.
        high_sim_pairs: pair dicts from the high-similarity search (each must
            carry a 'similarity' key).
        source_stats: tally produced by analyze_high_similarity_sources.
        output_path: destination JSON file.
        threshold: threshold recorded in the report; defaults to the module
            constant HIGH_SIMILARITY_THRESHOLD, resolved at call time so the
            function is usable/testable in isolation.

    Returns:
        The report dict that was serialized.
    """
    if threshold is None:
        threshold = HIGH_SIMILARITY_THRESHOLD
    report = {
        'random_sampling': {
            'n_pairs': len(similarities),
            'mean': float(np.mean(similarities)),
            'std': float(np.std(similarities)),
            'min': float(np.min(similarities)),
            'max': float(np.max(similarities)),
            'percentiles': {
                '25%': float(np.percentile(similarities, 25)),
                '50%': float(np.percentile(similarities, 50)),
                '75%': float(np.percentile(similarities, 75)),
                '90%': float(np.percentile(similarities, 90)),
                '95%': float(np.percentile(similarities, 95)),
                '99%': float(np.percentile(similarities, 99)),
            },
            'above_thresholds': {
                '>0.90': int((similarities > 0.90).sum()),
                '>0.95': int((similarities > 0.95).sum()),
                '>0.99': int((similarities > 0.99).sum()),
            }
        },
        'high_similarity_search': {
            'threshold': threshold,
            'pairs_found': len(high_sim_pairs),
            'source_analysis': source_stats,
            'top_10_pairs': sorted(high_sim_pairs, key=lambda x: x['similarity'], reverse=True)[:10]
        }
    }
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)
    print(f"統計報告已儲存: {output_path}")
    return report
def print_summary(report):
    """Print a console summary of the report built by generate_statistics_report."""
    bar = "=" * 70
    print("\n" + bar)
    print("相似度分布分析摘要")
    print(bar)
    sampling = report['random_sampling']
    print(f"\n隨機抽樣統計 ({sampling['n_pairs']:,} 對):")
    print(f" 平均相似度: {sampling['mean']:.4f}")
    print(f" 標準差: {sampling['std']:.4f}")
    print(f" 範圍: [{sampling['min']:.4f}, {sampling['max']:.4f}]")
    print(f"\n百分位數:")
    for label, value in sampling['percentiles'].items():
        print(f" {label}: {value:.4f}")
    print(f"\n高相似度對數量:")
    for label, count in sampling['above_thresholds'].items():
        share = count / sampling['n_pairs'] * 100
        print(f" {label}: {count:,} ({share:.4f}%)")
    search = report['high_similarity_search']
    print(f"\n系統化搜尋結果 (threshold={search['threshold']}):")
    print(f" 發現高相似度對: {search['pairs_found']:,}")
    if search['source_analysis']['total'] > 0:
        sa = search['source_analysis']
        print(f"\n來源分析:")
        print(f" 同一 PDF: {sa['same_pdf']} ({sa['same_pdf']/sa['total']*100:.1f}%)")
        print(f" 同月份: {sa['same_year_month']} ({sa['same_year_month']/sa['total']*100:.1f}%)")
        print(f" 同類型: {sa['same_doc_type']} ({sa['same_doc_type']/sa['total']*100:.1f}%)")
        print(f" 完全不同: {sa['different_everything']} ({sa['different_everything']/sa['total']*100:.1f}%)")
    if search['top_10_pairs']:
        print(f"\nTop 10 高相似度對:")
        for rank, pair in enumerate(search['top_10_pairs'], 1):
            print(f" {rank}. {pair['similarity']:.4f}")
            print(f" {pair['file1']}")
            print(f" {pair['file2']}")
def main():
    """Step 3 pipeline: sample distribution, plot, search, report, summarize."""
    print("=" * 70)
    print("Step 3: 相似度分布探索")
    print("=" * 70)
    REPORTS_PATH.mkdir(parents=True, exist_ok=True)
    features, filenames = load_data()
    # Distribution estimate from random pairs.
    similarities, _pair_indices = random_sampling_analysis(features, filenames, NUM_RANDOM_PAIRS)
    plot_similarity_distribution(
        similarities,
        REPORTS_PATH / "similarity_distribution.png"
    )
    # Exhaustive-per-query search for actual high-similarity pairs.
    pairs = systematic_high_similarity_search(
        features, filenames,
        threshold=HIGH_SIMILARITY_THRESHOLD
    )
    source_stats = analyze_high_similarity_sources(pairs)
    report = generate_statistics_report(
        similarities, pairs, source_stats,
        REPORTS_PATH / "similarity_statistics.json"
    )
    # Persist the raw pair list for the Step 4 visual report.
    pairs_file = REPORTS_PATH / "high_similarity_pairs.json"
    with open(pairs_file, 'w', encoding='utf-8') as fh:
        json.dump(pairs, fh, indent=2, ensure_ascii=False)
    print(f"高相似度對列表已儲存: {pairs_file}")
    print_summary(report)


if __name__ == "__main__":
    main()
@@ -0,0 +1,274 @@
#!/usr/bin/env python3
"""
Step 4: 生成高相似度案例的視覺化報告
讀取 high_similarity_pairs.json
為 Top N 高相似度對生成並排對比圖
生成 HTML 報告
"""
import json
import cv2
import numpy as np
from pathlib import Path
from tqdm import tqdm
import base64
from io import BytesIO
# --- Path configuration ---
IMAGES_DIR = Path("/Volumes/NV2/PDF-Processing/yolo-signatures/images")
REPORTS_PATH = Path("/Volumes/NV2/PDF-Processing/signature-analysis/reports")
HIGH_SIM_JSON = REPORTS_PATH / "high_similarity_pairs.json"
# --- Report configuration ---
TOP_N = 100  # number of top-ranked pairs rendered into the HTML report
def load_image(filename: str) -> np.ndarray:
    """Read a signature crop; unreadable files yield a blank 100x200 white image."""
    data = cv2.imread(str(IMAGES_DIR / filename))
    if data is None:
        return np.full((100, 200, 3), 255, dtype=np.uint8)
    return data
def create_comparison_image(file1: str, file2: str, similarity: float) -> np.ndarray:
    """Place two signature images side by side, separated by a grey bar.

    similarity is accepted for interface compatibility but is not drawn.
    """
    left = load_image(file1)
    right = load_image(file2)
    # Bring both images to a common height (at least 100 px), keeping aspect.
    target_h = max(left.shape[0], right.shape[0], 100)

    def _scale_to(img, height):
        h, w = img.shape[:2]
        if h == height:
            return img
        factor = height / h
        return cv2.resize(img, (int(w * factor), height))

    left = _scale_to(left, target_h)
    right = _scale_to(right, target_h)
    divider = np.full((target_h, 20, 3), 200, dtype=np.uint8)
    return np.hstack([left, divider, right])
def image_to_base64(img: np.ndarray) -> str:
    """Encode an image as PNG and return its base64 text representation."""
    encoded = cv2.imencode('.png', img)[1]
    return base64.b64encode(encoded).decode('utf-8')
def generate_html_report(pairs: list, output_path: Path):
    """Render the Top-N pair comparisons into a self-contained HTML report.

    Each pair card embeds its side-by-side comparison image as base64 PNG, so
    the output file needs no external assets.
    NOTE(review): the 659,111 pair count in the summary banner is hard-coded —
    confirm it still matches the current pairs file.
    """
    # Static header: styles plus the summary banner.
    html_content = """
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>簽名相似度分析報告 - 高相似度案例</title>
<style>
body {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
max-width: 1400px;
margin: 0 auto;
padding: 20px;
background-color: #f5f5f5;
}
h1 {
color: #333;
text-align: center;
border-bottom: 2px solid #666;
padding-bottom: 10px;
}
.summary {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 20px;
border-radius: 10px;
margin-bottom: 30px;
}
.summary h2 {
margin-top: 0;
}
.pair-card {
background: white;
border-radius: 10px;
padding: 20px;
margin-bottom: 20px;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
}
.pair-header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 15px;
padding-bottom: 10px;
border-bottom: 1px solid #eee;
}
.pair-number {
font-size: 1.2em;
font-weight: bold;
color: #333;
}
.similarity-badge {
background: #dc3545;
color: white;
padding: 5px 15px;
border-radius: 20px;
font-weight: bold;
}
.similarity-badge.high {
background: #dc3545;
}
.similarity-badge.very-high {
background: #8b0000;
}
.file-info {
font-family: monospace;
font-size: 0.9em;
color: #666;
margin-bottom: 10px;
}
.comparison-image {
max-width: 100%;
border: 1px solid #ddd;
border-radius: 5px;
}
.analysis {
margin-top: 15px;
padding: 10px;
background: #f8f9fa;
border-radius: 5px;
font-size: 0.9em;
}
.tag {
display: inline-block;
padding: 2px 8px;
border-radius: 3px;
margin-right: 5px;
font-size: 0.8em;
}
.tag-same-serial { background: #ffebee; color: #c62828; }
.tag-same-month { background: #fff3e0; color: #e65100; }
.tag-diff { background: #e8f5e9; color: #2e7d32; }
</style>
</head>
<body>
<h1>簽名相似度分析報告 - 高相似度案例</h1>
<div class="summary">
<h2>摘要</h2>
<p><strong>分析結果:</strong>發現 659,111 對高相似度簽名 (>0.95)</p>
<p><strong>本報告顯示:</strong>Top """ + str(TOP_N) + """ 最高相似度案例</p>
<p><strong>結論:</strong>存在大量相似度接近或等於 1.0 的簽名對,強烈暗示「複製貼上」行為</p>
</div>
<div class="pairs-container">
"""
    # One card per pair, highest similarity first (caller pre-sorts).
    for i, pair in enumerate(pairs[:TOP_N], 1):
        sim = pair['similarity']
        file1 = pair['file1']
        file2 = pair['file2']
        p1 = pair.get('parsed1', {})
        p2 = pair.get('parsed2', {})
        # Relationship tags shown under each comparison.
        tags = []
        if p1.get('serial') == p2.get('serial'):
            tags.append(('<span class="tag tag-same-serial">同序號</span>', ''))
        if p1.get('year_month') == p2.get('year_month'):
            tags.append(('<span class="tag tag-same-month">同月份</span>', ''))
        if p1.get('year_month') != p2.get('year_month') and p1.get('serial') != p2.get('serial'):
            tags.append(('<span class="tag tag-diff">完全不同文件</span>', ''))
        badge_class = 'very-high' if sim >= 0.99 else 'high'
        # Build the side-by-side image; fall back to an inline error note.
        try:
            comparison_img = create_comparison_image(file1, file2, sim)
            img_base64 = image_to_base64(comparison_img)
            img_html = f'<img src="data:image/png;base64,{img_base64}" class="comparison-image">'
        except Exception as e:
            img_html = f'<p style="color:red">無法載入圖片: {e}</p>'
        tag_html = ''.join([t[0] for t in tags])
        html_content += f"""
<div class="pair-card">
<div class="pair-header">
<span class="pair-number">#{i}</span>
<span class="similarity-badge {badge_class}">相似度: {sim:.4f}</span>
</div>
<div class="file-info">
<strong>簽名 1:</strong> {file1}<br>
<strong>簽名 2:</strong> {file2}
</div>
{img_html}
<div class="analysis">
{tag_html}
<br><small>日期: {p1.get('year_month', 'N/A')} vs {p2.get('year_month', 'N/A')} |
序號: {p1.get('serial', 'N/A')} vs {p2.get('serial', 'N/A')}</small>
</div>
</div>
"""
    # Footer closes the container and the document.
    html_content += """
</div>
<div style="text-align: center; margin-top: 30px; color: #666;">
<p>生成時間: 2024 | 簽名真實性研究計劃</p>
</div>
</body>
</html>
"""
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(html_content)
    print(f"HTML 報告已儲存: {output_path}")
def main():
    """Build the Top-N visual comparison report from the Step 3 pairs JSON."""
    print("=" * 60)
    print("Step 4: 生成高相似度案例視覺化報告")
    print("=" * 60)
    print("載入高相似度對資料...")
    with open(HIGH_SIM_JSON, 'r', encoding='utf-8') as fh:
        all_pairs = json.load(fh)
    print(f"{len(all_pairs):,} 對高相似度簽名")
    # Rank by similarity, highest first.
    ranked = sorted(all_pairs, key=lambda p: p['similarity'], reverse=True)
    # Quick tier counts for the console.
    count_identical = len([p for p in ranked if p['similarity'] >= 0.9999])
    count_99 = len([p for p in ranked if p['similarity'] >= 0.99])
    count_97 = len([p for p in ranked if p['similarity'] >= 0.97])
    print(f"\n相似度統計:")
    print(f" = 1.0 (完全相同): {count_identical:,}")
    print(f" >= 0.99: {count_99:,}")
    print(f" >= 0.97: {count_97:,}")
    print(f"\n生成 Top {TOP_N} 視覺化報告...")
    generate_html_report(ranked, REPORTS_PATH / "high_similarity_report.html")
    print("\n完成!")


if __name__ == "__main__":
    main()
+432
View File
@@ -0,0 +1,432 @@
#!/usr/bin/env python3
"""
Step 5: 從 PDF 提取會計師印刷姓名
流程:
1. 從資料庫讀取簽名記錄,按 (PDF, page) 分組
2. 對每個頁面重新執行 YOLO 獲取簽名框座標
3. 對整頁執行 PaddleOCR 提取印刷文字
4. 過濾出候選姓名(2-4 個中文字)
5. 配對簽名與最近的印刷姓名
6. 更新資料庫的 accountant_name 欄位
"""
import sqlite3
import json
import re
import sys
import time
from pathlib import Path
from typing import Optional, List, Dict, Tuple
from collections import defaultdict
from tqdm import tqdm
import numpy as np
import cv2
import fitz # PyMuPDF
# 加入父目錄到路徑以便匯入
sys.path.insert(0, str(Path(__file__).parent.parent))
from paddleocr_client import PaddleOCRClient
# --- Path configuration ---
PDF_BASE = Path("/Volumes/NV2/PDF-Processing/total-pdf")
YOLO_MODEL_PATH = Path("/Volumes/NV2/pdf_recognize/models/best.pt")
DB_PATH = Path("/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db")
REPORTS_PATH = Path("/Volumes/NV2/PDF-Processing/signature-analysis/reports")
# --- Processing configuration ---
DPI = 150  # render resolution for PDF pages
CONFIDENCE_THRESHOLD = 0.5  # minimum YOLO detection confidence
NAME_SEARCH_MARGIN = 200  # pixel radius around a signature box to search for a printed name
PROGRESS_SAVE_INTERVAL = 100  # save progress every N processed pages
# Printed Chinese personal names: exactly 2-4 CJK ideographs.
CHINESE_NAME_PATTERN = re.compile(r'^[\u4e00-\u9fff]{2,4}$')
def find_pdf_file(filename: str) -> Optional[str]:
    """Locate a PDF by name: batch_* subdirectories first, then the top level."""
    candidates = [batch_dir / filename for batch_dir in sorted(PDF_BASE.glob("batch_*"))]
    candidates.append(PDF_BASE / filename)
    for candidate in candidates:
        if candidate.exists():
            return str(candidate)
    return None
def render_pdf_page(pdf_path: str, page_num: int) -> Optional[np.ndarray]:
    """Rasterize one page of a PDF to an ndarray at the configured DPI.

    Args:
        pdf_path: path to the PDF file.
        page_num: 1-based page number.

    Returns:
        HxWxC uint8 image, or None if the page is out of range or rendering
        fails (failure is logged to stdout).

    Fix vs. original: the document handle is now closed via try/finally, so it
    is released even when get_pixmap or the buffer reshape raises (the
    original leaked the open document on such exceptions).
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            if page_num < 1 or page_num > len(doc):
                return None
            page = doc[page_num - 1]
            # Scale from PDF's native 72 dpi up to the target DPI.
            mat = fitz.Matrix(DPI / 72, DPI / 72)
            pix = page.get_pixmap(matrix=mat, alpha=False)
            image = np.frombuffer(pix.samples, dtype=np.uint8)
            return image.reshape(pix.height, pix.width, pix.n)
        finally:
            doc.close()
    except Exception as e:
        print(f"渲染失敗: {pdf_path} page {page_num}: {e}")
        return None
def detect_signatures_yolo(image: np.ndarray, model) -> List[Dict]:
    """Run the YOLO model on a page image and return signature boxes.

    Boxes carry pixel geometry, detection confidence and centre coordinates,
    sorted into reading order (top-to-bottom, then left-to-right).
    """
    detections = []
    for result in model(image, conf=CONFIDENCE_THRESHOLD, verbose=False):
        for det in result.boxes:
            x_min, y_min, x_max, y_max = (int(v) for v in det.xyxy[0].cpu().numpy())
            detections.append({
                'x': x_min,
                'y': y_min,
                'width': x_max - x_min,
                'height': y_max - y_min,
                'confidence': float(det.conf[0].cpu().numpy()),
                'center_x': (x_min + x_max) / 2,
                'center_y': (y_min + y_max) / 2
            })
    detections.sort(key=lambda d: (d['y'], d['x']))
    return detections
def extract_text_candidates(image: np.ndarray, ocr_client: PaddleOCRClient) -> List[Dict]:
    """Run OCR over the page image and return text boxes with centres/extents.

    Returns an empty list (and logs) on any OCR failure.
    """
    try:
        detected = []
        for item in ocr_client.ocr(image):
            content = item.get('text', '').strip()
            quad = item.get('box', [])
            # Skip entries without geometry or without text.
            if not quad or not content:
                continue
            xs = [pt[0] for pt in quad]
            ys = [pt[1] for pt in quad]
            detected.append({
                'text': content,
                'center_x': sum(xs) / len(xs),
                'center_y': sum(ys) / len(ys),
                'x': min(xs),
                'y': min(ys),
                'width': max(xs) - min(xs),
                'height': max(ys) - min(ys),
                'confidence': item.get('confidence', 0.0)
            })
        return detected
    except Exception as e:
        print(f"OCR 失敗: {e}")
        return []
def filter_name_candidates(candidates: List[Dict], pattern=None) -> List[Dict]:
    """Keep OCR boxes whose cleaned text looks like a Chinese personal name.

    The cleaned text (whitespace and common punctuation removed) is stored
    under 'text_clean' on each kept candidate dict (mutated in place).

    Args:
        candidates: OCR boxes from extract_text_candidates.
        pattern: optional compiled regex a valid name must fully match;
            defaults to the module-level CHINESE_NAME_PATTERN (2-4 CJK chars).

    Fix vs. original: the strip character class was double-escaped, so it
    silently stripped literal backslashes as well; it now strips only
    whitespace plus half/full-width colon, comma and period characters.
    """
    if pattern is None:
        pattern = CHINESE_NAME_PATTERN
    strip_punct = re.compile(r'[\s:,.。、,:.]')
    names = []
    for candidate in candidates:
        cleaned = strip_punct.sub('', candidate['text'])
        if pattern.match(cleaned):
            candidate['text_clean'] = cleaned
            names.append(candidate)
    return names
def match_signature_to_name(
    sig: Dict,
    name_candidates: List[Dict],
    margin: Optional[int] = None
) -> Optional[str]:
    """Pair a signature box with the nearest printed-name candidate.

    A candidate qualifies when its centre lies within margin (plus half the
    signature's extent) of the signature centre on both axes; the closest
    qualifying candidate's 'text_clean' is returned, or None if none qualify.

    Args:
        sig: signature box dict with center_x/center_y/width/height.
        name_candidates: candidates from filter_name_candidates (must carry
            'text_clean').
        margin: search radius in pixels; defaults to NAME_SEARCH_MARGIN.
            (The original bound the module constant at def-time; resolving it
            at call time keeps the default in sync and makes the function
            usable in isolation.)
    """
    if margin is None:
        margin = NAME_SEARCH_MARGIN
    cx, cy = sig['center_x'], sig['center_y']
    best = None
    best_dist = None
    for cand in name_candidates:
        dx = abs(cand['center_x'] - cx)
        dy = abs(cand['center_y'] - cy)
        # Rectangular gate first, then Euclidean distance for ranking.
        if dx <= margin + sig['width'] / 2 and dy <= margin + sig['height'] / 2:
            dist = (dx ** 2 + dy ** 2) ** 0.5
            if best_dist is None or dist < best_dist:
                best, best_dist = cand, dist
    return best['text_clean'] if best is not None else None
def get_pages_to_process(conn: sqlite3.Connection) -> List[Tuple[str, int, List[int]]]:
    """List the (PDF, page) groups whose signatures still lack a matched name.

    Returns:
        List of (source_pdf, page_number, [signature_ids]) tuples, ordered by
        PDF then page.
    """
    rows = conn.execute('''
        SELECT source_pdf, page_number, GROUP_CONCAT(signature_id)
        FROM signatures
        WHERE accountant_name IS NULL OR accountant_name = ''
        GROUP BY source_pdf, page_number
        ORDER BY source_pdf, page_number
    ''').fetchall()
    # GROUP_CONCAT yields a comma-separated id string; split it back to ints.
    return [
        (source_pdf, page_number, [int(part) for part in id_csv.split(',')])
        for source_pdf, page_number, id_csv in rows
    ]
def update_signature_names(
    conn: sqlite3.Connection,
    updates: List[Tuple[int, str, int, int, int, int]]
):
    """
    Persist matched accountant names and box coordinates.
    Args:
        updates: List of (signature_id, accountant_name, x, y, width, height)
    """
    cursor = conn.cursor()
    # Create the coordinate table on first use.
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS signature_boxes (
            signature_id INTEGER PRIMARY KEY,
            x INTEGER,
            y INTEGER,
            width INTEGER,
            height INTEGER,
            FOREIGN KEY (signature_id) REFERENCES signatures(signature_id)
        )
    ''')
    cursor.executemany('''
        UPDATE signatures SET accountant_name = ? WHERE signature_id = ?
        ''', [(name, sig_id) for sig_id, name, *_ in updates])
    cursor.executemany('''
        INSERT OR REPLACE INTO signature_boxes (signature_id, x, y, width, height)
        VALUES (?, ?, ?, ?, ?)
        ''', [(sig_id, x, y, w, h) for sig_id, _, x, y, w, h in updates])
    conn.commit()
def process_page(
    source_pdf: str,
    page_number: int,
    sig_ids: List[int],
    yolo_model,
    ocr_client: PaddleOCRClient,
    conn: sqlite3.Connection
) -> Dict:
    """
    Process a single page: detect signature boxes, extract printed names
    via OCR, and pair each recorded signature with the nearest name.

    Args:
        source_pdf: PDF filename recorded in the database.
        page_number: 1-based page number within that PDF.
        sig_ids: signature_id values recorded for this page.
        yolo_model: loaded YOLO detector object.
        ocr_client: client for the remote PaddleOCR server.
        conn: open SQLite connection; results are written through it.

    Returns:
        Per-page statistics dict (matched / unmatched counts, error flag).
    """
    result = {
        'source_pdf': source_pdf,
        'page_number': page_number,
        'num_signatures': len(sig_ids),
        'matched': 0,
        'unmatched': 0,
        'error': None
    }
    # Locate the PDF on disk
    pdf_path = find_pdf_file(source_pdf)
    if pdf_path is None:
        result['error'] = 'PDF not found'
        return result
    # Render the page to an image
    image = render_pdf_page(pdf_path, page_number)
    if image is None:
        result['error'] = 'Render failed'
        return result
    # Detect signature boxes with YOLO
    sig_boxes = detect_signatures_yolo(image, yolo_model)
    if len(sig_boxes) != len(sig_ids):
        # Detection count differs from the recorded count; fall through and
        # pair in order (the zip below truncates to the shorter list)
        pass
    # OCR the whole page
    text_candidates = extract_text_candidates(image, ocr_client)
    # Keep only name-like candidates
    name_candidates = filter_name_candidates(text_candidates)
    # Pair each signature box with the closest name
    updates = []
    for i, (sig_id, sig_box) in enumerate(zip(sig_ids, sig_boxes)):
        matched_name = match_signature_to_name(sig_box, name_candidates)
        if matched_name:
            result['matched'] += 1
        else:
            result['unmatched'] += 1
            matched_name = ''  # empty string marks "no match"
        updates.append((
            sig_id,
            matched_name,
            sig_box['x'],
            sig_box['y'],
            sig_box['width'],
            sig_box['height']
        ))
    # If YOLO found fewer boxes than recorded, store the leftovers as unmatched
    if len(sig_boxes) < len(sig_ids):
        for sig_id in sig_ids[len(sig_boxes):]:
            updates.append((sig_id, '', 0, 0, 0, 0))
            result['unmatched'] += 1
    # Persist names and coordinates
    update_signature_names(conn, updates)
    return result
def main():
    """Entry point: pair every unnamed signature with its printed accountant name."""
    print("=" * 60)
    print("Step 5: 從 PDF 提取會計師印刷姓名")
    print("=" * 60)
    # Make sure the report directory exists
    REPORTS_PATH.mkdir(parents=True, exist_ok=True)
    # Connect to the database
    print("\n連接資料庫...")
    conn = sqlite3.connect(DB_PATH)
    # Fetch the (PDF, page) groups still missing names
    print("查詢待處理頁面...")
    pages = get_pages_to_process(conn)
    print(f"共 {len(pages)} 個頁面待處理")
    if not pages:
        print("沒有需要處理的頁面")
        conn.close()
        return
    # Load the YOLO detector
    print("\n載入 YOLO 模型...")
    from ultralytics import YOLO
    yolo_model = YOLO(str(YOLO_MODEL_PATH))
    # Connect to the OCR server
    print("連接 PaddleOCR 伺服器...")
    ocr_client = PaddleOCRClient()
    if not ocr_client.health_check():
        print("錯誤: PaddleOCR 伺服器無法連接")
        print("請確認伺服器 http://192.168.30.36:5555 正在運行")
        conn.close()
        return
    print("OCR 伺服器連接成功")
    # Running totals
    stats = {
        'total_pages': len(pages),
        'processed': 0,
        'matched': 0,
        'unmatched': 0,
        'errors': 0,
        'start_time': time.time()
    }
    # Process every page
    print(f"\n開始處理 {len(pages)} 個頁面...")
    for source_pdf, page_number, sig_ids in tqdm(pages, desc="處理頁面"):
        result = process_page(
            source_pdf, page_number, sig_ids,
            yolo_model, ocr_client, conn
        )
        stats['processed'] += 1
        stats['matched'] += result['matched']
        stats['unmatched'] += result['unmatched']
        if result['error']:
            stats['errors'] += 1
        # Periodically print progress and an ETA
        if stats['processed'] % PROGRESS_SAVE_INTERVAL == 0:
            elapsed = time.time() - stats['start_time']
            rate = stats['processed'] / elapsed
            remaining = (stats['total_pages'] - stats['processed']) / rate if rate > 0 else 0
            print(f"\n進度: {stats['processed']}/{stats['total_pages']} "
                  f"({stats['processed']/stats['total_pages']*100:.1f}%)")
            print(f"配對成功: {stats['matched']}, 未配對: {stats['unmatched']}")
            print(f"預估剩餘時間: {remaining/60:.1f} 分鐘")
    # Final statistics
    elapsed = time.time() - stats['start_time']
    stats['elapsed_seconds'] = elapsed
    print("\n" + "=" * 60)
    print("處理完成")
    print("=" * 60)
    print(f"總頁面數: {stats['total_pages']}")
    print(f"處理成功: {stats['processed']}")
    print(f"配對成功: {stats['matched']}")
    print(f"未配對: {stats['unmatched']}")
    print(f"錯誤: {stats['errors']}")
    print(f"耗時: {elapsed/60:.1f} 分鐘")
    # Save the JSON report
    report_path = REPORTS_PATH / "name_extraction_report.json"
    with open(report_path, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)
    print(f"\n報告已儲存: {report_path}")
    conn.close()
if __name__ == "__main__":
    main()
+402
View File
@@ -0,0 +1,402 @@
#!/usr/bin/env python3
"""
Step 5: extract accountant names from PDFs - full processing version.

Pipeline:
1. Read signature records from the database, grouped by (PDF, page).
2. Re-run YOLO on each page to recover signature-box coordinates.
3. Run PaddleOCR on the whole page to extract text.
4. Filter name candidates (2-4 Chinese characters).
5. Pair each signature with the nearest name.
6. Update the database and generate a report.
"""
import sqlite3
import json
import re
import sys
import time
from pathlib import Path
from typing import Optional, List, Dict, Tuple
from collections import defaultdict
from datetime import datetime
from tqdm import tqdm
import numpy as np
import fitz  # PyMuPDF
# Make the parent directory importable (provides paddleocr_client)
sys.path.insert(0, str(Path(__file__).parent.parent))
from paddleocr_client import PaddleOCRClient
# Path configuration
PDF_BASE = Path("/Volumes/NV2/PDF-Processing/total-pdf")
YOLO_MODEL_PATH = Path("/Volumes/NV2/pdf_recognize/models/best.pt")
DB_PATH = Path("/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db")
REPORTS_PATH = Path("/Volumes/NV2/PDF-Processing/signature-analysis/reports")
# Processing configuration
DPI = 150  # page rasterization resolution
CONFIDENCE_THRESHOLD = 0.5  # minimum YOLO detection confidence
NAME_SEARCH_MARGIN = 200  # px search window around a box when pairing names
PROGRESS_SAVE_INTERVAL = 100  # pages between progress printouts
BATCH_COMMIT_SIZE = 50  # buffered DB updates before a commit
# Regex for a plausible Chinese personal name (2-4 CJK characters)
CHINESE_NAME_PATTERN = re.compile(r'^[\u4e00-\u9fff]{2,4}$')
# Common non-name words to exclude (NOTE(review): the '' entry is redundant —
# empty strings never match CHINESE_NAME_PATTERN anyway)
EXCLUDE_WORDS = {'會計', '會計師', '事務所', '', '聯合', '出具報告'}
def find_pdf_file(filename: str) -> Optional[str]:
    """Locate *filename* under the batch_* directories, then the base directory."""
    search_dirs = sorted(PDF_BASE.glob("batch_*")) + [PDF_BASE]
    for directory in search_dirs:
        candidate = directory / filename
        if candidate.exists():
            return str(candidate)
    return None
def render_pdf_page(pdf_path: str, page_num: int) -> Optional[np.ndarray]:
    """
    Rasterize one PDF page to an image array.

    Args:
        pdf_path: path to the PDF file.
        page_num: 1-based page number.

    Returns:
        (height, width, channels) uint8 ndarray, or None when the page is out
        of range or rendering fails.
    """
    try:
        doc = fitz.open(pdf_path)
        # Fix: the original leaked the document handle when get_pixmap (or any
        # later call) raised; `finally` guarantees close on every path.
        try:
            if not (1 <= page_num <= len(doc)):
                return None
            page = doc[page_num - 1]
            # Scale from PDF's native 72 dpi to the configured DPI
            mat = fitz.Matrix(DPI / 72, DPI / 72)
            pix = page.get_pixmap(matrix=mat, alpha=False)
            image = np.frombuffer(pix.samples, dtype=np.uint8)
            return image.reshape(pix.height, pix.width, pix.n)
        finally:
            doc.close()
    except Exception:
        return None
def detect_signatures_yolo(image: np.ndarray, model) -> List[Dict]:
    """Run the YOLO detector and return signature boxes sorted top-to-bottom, left-to-right."""
    detections: List[Dict] = []
    for prediction in model(image, conf=CONFIDENCE_THRESHOLD, verbose=False):
        for det in prediction.boxes:
            x1, y1, x2, y2 = map(int, det.xyxy[0].cpu().numpy())
            detections.append({
                'x': x1, 'y': y1,
                'width': x2 - x1, 'height': y2 - y1,
                'confidence': float(det.conf[0].cpu().numpy()),
                'center_x': (x1 + x2) / 2,
                'center_y': (y1 + y2) / 2
            })
    detections.sort(key=lambda s: (s['y'], s['x']))
    return detections
def extract_and_filter_names(image: np.ndarray, ocr_client: PaddleOCRClient) -> List[Dict]:
    """OCR the page image and return name-like text items with their box centers."""
    try:
        ocr_items = ocr_client.ocr(image)
    except Exception:
        return []
    names: List[Dict] = []
    for item in ocr_items:
        raw_text = item.get('text', '').strip()
        box = item.get('box', [])
        if not raw_text or not box:
            continue
        # Drop whitespace/punctuation before matching the name pattern.
        cleaned = re.sub(r'[\s\:\\,\\.\\、]', '', raw_text)
        if not CHINESE_NAME_PATTERN.match(cleaned) or cleaned in EXCLUDE_WORDS:
            continue
        xs = [pt[0] for pt in box]
        ys = [pt[1] for pt in box]
        names.append({
            'text': cleaned,
            'center_x': sum(xs) / len(xs),
            'center_y': sum(ys) / len(ys),
        })
    return names
def match_signature_to_name(sig: Dict, name_candidates: List[Dict]) -> Optional[str]:
    """Return the closest candidate name within the search window, or None."""
    window = NAME_SEARCH_MARGIN
    best: Optional[Tuple[str, float]] = None
    for cand in name_candidates:
        dx = abs(cand['center_x'] - sig['center_x'])
        dy = abs(cand['center_y'] - sig['center_y'])
        # Candidates outside the window around the box are ignored.
        if dx > window + sig['width'] / 2 or dy > window + sig['height'] / 2:
            continue
        dist = (dx ** 2 + dy ** 2) ** 0.5
        if best is None or dist < best[1]:
            best = (cand['text'], dist)
    return best[0] if best else None
def get_pages_to_process(conn: sqlite3.Connection) -> List[Tuple[str, int, List[int]]]:
    """Return (source_pdf, page_number, [signature_ids]) groups still lacking names."""
    cursor = conn.execute('''
        SELECT source_pdf, page_number, GROUP_CONCAT(signature_id)
        FROM signatures
        WHERE accountant_name IS NULL OR accountant_name = ''
        GROUP BY source_pdf, page_number
        ORDER BY source_pdf, page_number
    ''')
    grouped: List[Tuple[str, int, List[int]]] = []
    for source_pdf, page_number, id_list in cursor:
        grouped.append((source_pdf, page_number,
                        [int(token) for token in id_list.split(',')]))
    return grouped
def process_page(
    source_pdf: str, page_number: int, sig_ids: List[int],
    yolo_model, ocr_client: PaddleOCRClient
) -> Dict:
    """
    Process one page: render it, detect signature boxes, OCR the text and
    pair each recorded signature with the nearest printed name.

    Unlike the earlier variant, this function does not write to the database:
    the caller receives the (signature_id, name, x, y, width, height) tuples
    in result['updates'] and commits them in batches.
    """
    result = {
        'source_pdf': source_pdf,
        'page_number': page_number,
        'num_signatures': len(sig_ids),
        'matched': 0,
        'unmatched': 0,
        'error': None,
        'updates': []
    }
    pdf_path = find_pdf_file(source_pdf)
    if pdf_path is None:
        result['error'] = 'PDF not found'
        return result
    image = render_pdf_page(pdf_path, page_number)
    if image is None:
        result['error'] = 'Render failed'
        return result
    sig_boxes = detect_signatures_yolo(image, yolo_model)
    name_candidates = extract_and_filter_names(image, ocr_client)
    # Pair recorded signatures with detected boxes by top-to-bottom order;
    # signatures beyond the detected count get the (0, 0, 0, 0) sentinel.
    for i, sig_id in enumerate(sig_ids):
        if i < len(sig_boxes):
            sig = sig_boxes[i]
            matched_name = match_signature_to_name(sig, name_candidates)
            if matched_name:
                result['matched'] += 1
            else:
                result['unmatched'] += 1
                matched_name = ''
            result['updates'].append((
                sig_id, matched_name,
                sig['x'], sig['y'], sig['width'], sig['height']
            ))
        else:
            result['updates'].append((sig_id, '', 0, 0, 0, 0))
            result['unmatched'] += 1
    return result
def save_updates_to_db(conn: sqlite3.Connection, updates: List[Tuple]):
    """
    Batch-write accountant names and (where detected) box coordinates.

    Each update is (signature_id, accountant_name, x, y, width, height);
    rows without a detected box carry the (0, 0, 0, 0) sentinel and only
    get their name written.
    """
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS signature_boxes (
            signature_id INTEGER PRIMARY KEY,
            x INTEGER, y INTEGER, width INTEGER, height INTEGER,
            FOREIGN KEY (signature_id) REFERENCES signatures(signature_id)
        )
    ''')
    for sig_id, name, x, y, w, h in updates:
        cursor.execute('UPDATE signatures SET accountant_name = ? WHERE signature_id = ?', (name, sig_id))
        # Fix: detect the "no box" sentinel via width, not x — a real
        # detection always has a positive width, whereas testing `x > 0`
        # also dropped legitimate boxes touching the left page edge (x == 0).
        if w > 0:
            cursor.execute('''
                INSERT OR REPLACE INTO signature_boxes (signature_id, x, y, width, height)
                VALUES (?, ?, ?, ?, ?)
            ''', (sig_id, x, y, w, h))
    conn.commit()
def generate_report(stats: Dict, output_path: Path):
    """
    Write the final JSON report plus a human-readable Markdown twin.

    Args:
        stats: accumulated run statistics from main().
        output_path: target .json path; the .md report goes next to it.

    Returns:
        The report dict that was serialized.
    """
    report = {
        'title': '會計師姓名提取報告',
        'generated_at': datetime.now().isoformat(),
        'summary': {
            'total_pages': stats['total_pages'],
            'processed_pages': stats['processed'],
            'total_signatures': stats['total_sigs'],
            'matched_signatures': stats['matched'],
            'unmatched_signatures': stats['unmatched'],
            'match_rate': f"{stats['matched']/stats['total_sigs']*100:.1f}%" if stats['total_sigs'] > 0 else "N/A",
            'errors': stats['errors'],
            'elapsed_seconds': stats['elapsed_seconds'],
            'elapsed_human': f"{stats['elapsed_seconds']/3600:.1f} 小時"
        },
        'methodology': {
            'step1': 'YOLO 模型偵測簽名框座標',
            'step2': 'PaddleOCR 整頁 OCR 提取文字',
            'step3': '過濾 2-4 個中文字作為姓名候選',
            'step4': f'在簽名框周圍 {NAME_SEARCH_MARGIN}px 範圍內配對最近的姓名',
            'dpi': DPI,
            'yolo_confidence': CONFIDENCE_THRESHOLD
        },
        'name_distribution': stats.get('name_distribution', {}),
        'error_samples': stats.get('error_samples', [])
    }
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)
    # Also emit a Markdown report next to the JSON one
    md_path = output_path.with_suffix('.md')
    with open(md_path, 'w', encoding='utf-8') as f:
        f.write(f"# {report['title']}\n\n")
        f.write(f"生成時間: {report['generated_at']}\n\n")
        f.write("## 摘要\n\n")
        f.write(f"| 指標 | 數值 |\n|------|------|\n")
        for k, v in report['summary'].items():
            f.write(f"| {k} | {v} |\n")
        f.write("\n## 方法論\n\n")
        for k, v in report['methodology'].items():
            f.write(f"- **{k}**: {v}\n")
        f.write("\n## 姓名分布 (Top 50)\n\n")
        # Sort names by descending count, keep the 50 most frequent
        names = sorted(report['name_distribution'].items(), key=lambda x: -x[1])[:50]
        for name, count in names:
            f.write(f"- {name}: {count}\n")
    return report
def main():
    """Entry point: run the full name-extraction pipeline with batched DB commits."""
    print("=" * 70)
    print("Step 5: 從 PDF 提取會計師姓名 - 完整處理")
    print("=" * 70)
    print(f"開始時間: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    REPORTS_PATH.mkdir(parents=True, exist_ok=True)
    # Connect to the database and list the pending (PDF, page) groups
    conn = sqlite3.connect(DB_PATH)
    pages = get_pages_to_process(conn)
    print(f"\n待處理頁面: {len(pages):,}")
    if not pages:
        print("沒有需要處理的頁面")
        conn.close()
        return
    # Load the YOLO detector
    print("\n載入 YOLO 模型...")
    from ultralytics import YOLO
    yolo_model = YOLO(str(YOLO_MODEL_PATH))
    # Connect to the OCR server
    print("連接 PaddleOCR 伺服器...")
    ocr_client = PaddleOCRClient()
    if not ocr_client.health_check():
        print("錯誤: PaddleOCR 伺服器無法連接")
        conn.close()
        return
    print("OCR 伺服器連接成功\n")
    # Running totals
    stats = {
        'total_pages': len(pages),
        'processed': 0,
        'total_sigs': sum(len(p[2]) for p in pages),
        'matched': 0,
        'unmatched': 0,
        'errors': 0,
        'error_samples': [],
        'name_distribution': defaultdict(int),
        'start_time': time.time()
    }
    all_updates = []
    # Process every page
    for source_pdf, page_number, sig_ids in tqdm(pages, desc="處理頁面"):
        result = process_page(source_pdf, page_number, sig_ids, yolo_model, ocr_client)
        stats['processed'] += 1
        stats['matched'] += result['matched']
        stats['unmatched'] += result['unmatched']
        if result['error']:
            stats['errors'] += 1
            # Keep at most 20 sample errors for the report
            if len(stats['error_samples']) < 20:
                stats['error_samples'].append({
                    'pdf': source_pdf,
                    'page': page_number,
                    'error': result['error']
                })
        else:
            all_updates.extend(result['updates'])
            for update in result['updates']:
                if update[1]:  # has a matched name
                    stats['name_distribution'][update[1]] += 1
        # Commit in batches
        if len(all_updates) >= BATCH_COMMIT_SIZE:
            save_updates_to_db(conn, all_updates)
            all_updates = []
        # Periodically print progress and an ETA
        if stats['processed'] % PROGRESS_SAVE_INTERVAL == 0:
            elapsed = time.time() - stats['start_time']
            rate = stats['processed'] / elapsed
            remaining = (stats['total_pages'] - stats['processed']) / rate if rate > 0 else 0
            print(f"\n[進度] {stats['processed']:,}/{stats['total_pages']:,} "
                  f"({stats['processed']/stats['total_pages']*100:.1f}%) | "
                  f"配對: {stats['matched']:,} | "
                  f"剩餘: {remaining/60:.1f} 分鐘")
    # Flush the final batch
    if all_updates:
        save_updates_to_db(conn, all_updates)
    stats['elapsed_seconds'] = time.time() - stats['start_time']
    stats['name_distribution'] = dict(stats['name_distribution'])
    # Generate the reports
    print("\n生成報告...")
    report_path = REPORTS_PATH / "name_extraction_report.json"
    generate_report(stats, report_path)
    print("\n" + "=" * 70)
    print("處理完成!")
    print("=" * 70)
    print(f"總頁面: {stats['total_pages']:,}")
    print(f"總簽名: {stats['total_sigs']:,}")
    print(f"配對成功: {stats['matched']:,} ({stats['matched']/stats['total_sigs']*100:.1f}%)")
    print(f"未配對: {stats['unmatched']:,}")
    print(f"錯誤: {stats['errors']:,}")
    print(f"耗時: {stats['elapsed_seconds']/3600:.2f} 小時")
    print(f"\n報告已儲存:")
    print(f" - {report_path}")
    print(f" - {report_path.with_suffix('.md')}")
    conn.close()
if __name__ == "__main__":
    main()
+450
View File
@@ -0,0 +1,450 @@
#!/usr/bin/env python3
"""
Signature cleanup and accountant assignment.

1. Flag PDFs with sig_count > 2 and keep only the best 2 signatures.
2. Assign signatures to accountants via OCR names or Y-coordinates.
3. Build the accountants table.
"""
import sqlite3
import json
from collections import defaultdict
from datetime import datetime
from opencc import OpenCC
# Simplified -> Traditional Chinese converter (OCR may emit simplified forms)
cc_s2t = OpenCC('s2t')
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
REPORT_DIR = '/Volumes/NV2/PDF-Processing/signature-analysis/reports'
def get_connection():
    """Open the analysis database with dict-like row access (sqlite3.Row)."""
    connection = sqlite3.connect(DB_PATH)
    connection.row_factory = sqlite3.Row
    return connection
def add_columns_if_needed(conn):
    """Add the is_valid / assigned_accountant columns to signatures when missing."""
    cur = conn.cursor()
    # Inspect the current schema first so reruns are idempotent.
    cur.execute("PRAGMA table_info(signatures)")
    existing = {row[1] for row in cur.fetchall()}
    if 'is_valid' not in existing:
        cur.execute("ALTER TABLE signatures ADD COLUMN is_valid INTEGER DEFAULT 1")
        print("已添加 is_valid 欄位")
    if 'assigned_accountant' not in existing:
        cur.execute("ALTER TABLE signatures ADD COLUMN assigned_accountant TEXT")
        print("已添加 assigned_accountant 欄位")
    conn.commit()
def create_accountants_table(conn):
    """Ensure the accountants table exists (no-op when it already does)."""
    conn.execute("""
        CREATE TABLE IF NOT EXISTS accountants (
            accountant_id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT UNIQUE NOT NULL,
            signature_count INTEGER DEFAULT 0,
            firm TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)
    conn.commit()
    print("accountants 表已建立")
def get_pdf_signatures(conn):
    """Group every signature row (joined with its box, if any) by source PDF."""
    cursor = conn.execute("""
        SELECT s.signature_id, s.source_pdf, s.page_number, s.accountant_name,
               s.excel_accountant1, s.excel_accountant2, s.excel_firm,
               sb.x, sb.y, sb.width, sb.height
        FROM signatures s
        LEFT JOIN signature_boxes sb ON s.signature_id = sb.signature_id
        ORDER BY s.source_pdf, s.page_number, sb.y
    """)
    grouped = defaultdict(list)
    for row in cursor:
        grouped[row['source_pdf']].append(dict(row))
    return grouped
def normalize_name(name):
    """Convert a name to Traditional Chinese; None/empty passes through as None."""
    return cc_s2t.convert(name) if name else None
def names_match(ocr_name, excel_name):
    """True when the OCR name equals the Excel name, directly or after s2t conversion."""
    if not ocr_name or not excel_name:
        return False
    # Exact match, or match once the OCR text is converted to Traditional Chinese.
    return ocr_name == excel_name or normalize_name(ocr_name) == excel_name
def score_signature(sig, excel_acc1, excel_acc2):
    """Heuristic quality score used to pick the best signature boxes on a page."""
    ocr_name = sig.get('accountant_name', '')
    width = sig.get('width', 0) or 0
    height = sig.get('height', 0) or 0
    y = sig.get('y', 0) or 0
    score = 0
    # +100 when the OCR text matches either Excel accountant.
    if names_match(ocr_name, excel_acc1) or names_match(ocr_name, excel_acc2):
        score += 100
    # +20 for a plausible signature size.
    if 30 < width < 500 and 20 < height < 200:
        score += 20
    # Lower boxes on the page score higher, capped at +15.
    score += min(y / 100, 15)
    # Oversized boxes are likely stamps: penalize them.
    if width > 300 or height > 150:
        score -= 30
    return score
def select_best_two(signatures, excel_acc1, excel_acc2):
    """Return the two best-scoring signatures (everything when there are <= 2)."""
    if len(signatures) <= 2:
        return signatures
    # Stable sort keeps the original order among equal scores.
    ranked = sorted(
        signatures,
        key=lambda sig: score_signature(sig, excel_acc1, excel_acc2),
        reverse=True,
    )
    return ranked[:2]
def assign_to_accountant(sig1, sig2, excel_acc1, excel_acc2):
    """Decide which Excel accountant each of the two signatures belongs to."""
    ocr1 = sig1.get('accountant_name', '')
    ocr2 = sig2.get('accountant_name', '')
    # Method A: trust the first OCR name match, in this fixed priority order.
    if names_match(ocr1, excel_acc1):
        return [(sig1, excel_acc1), (sig2, excel_acc2)]
    if names_match(ocr1, excel_acc2):
        return [(sig1, excel_acc2), (sig2, excel_acc1)]
    if names_match(ocr2, excel_acc1):
        return [(sig1, excel_acc2), (sig2, excel_acc1)]
    if names_match(ocr2, excel_acc2):
        return [(sig1, excel_acc1), (sig2, excel_acc2)]
    # Method B: fall back to page order (accountant 1 is assumed on top).
    top_first = (sig1.get('y', 0) or 0) <= (sig2.get('y', 0) or 0)
    if top_first:
        return [(sig1, excel_acc1), (sig2, excel_acc2)]
    return [(sig1, excel_acc2), (sig2, excel_acc1)]
def process_all_pdfs(conn):
    """
    Classify every PDF by signature count, keep the best two signatures where
    more were detected, and assign each kept signature to an accountant.

    Writes assigned_accountant / is_valid back to the signatures table and
    returns the run statistics dict.
    """
    print("正在載入簽名資料...")
    pdf_sigs = get_pdf_signatures(conn)
    print(f"共 {len(pdf_sigs)} 份 PDF")
    cur = conn.cursor()
    stats = {
        'total_pdfs': len(pdf_sigs),
        'sig_count_1': 0,
        'sig_count_2': 0,
        'sig_count_gt2': 0,
        'valid_signatures': 0,
        'invalid_signatures': 0,
        'ocr_matched': 0,
        'y_coordinate_assigned': 0,
        'no_excel_data': 0,
    }
    assignments = []  # (signature_id, assigned_accountant, is_valid)
    for pdf_name, sigs in pdf_sigs.items():
        sig_count = len(sigs)
        excel_acc1 = sigs[0].get('excel_accountant1') if sigs else None
        excel_acc2 = sigs[0].get('excel_accountant2') if sigs else None
        if not excel_acc1 and not excel_acc2:
            # No Excel reference data: keep signatures valid but unassigned
            stats['no_excel_data'] += 1
            for sig in sigs:
                assignments.append((sig['signature_id'], None, 1))
            continue
        if sig_count == 1:
            stats['sig_count_1'] += 1
            # Single signature: keep it, assign only on an OCR name match
            sig = sigs[0]
            ocr_name = sig.get('accountant_name', '')
            if names_match(ocr_name, excel_acc1):
                assignments.append((sig['signature_id'], excel_acc1, 1))
                stats['ocr_matched'] += 1
            elif names_match(ocr_name, excel_acc2):
                assignments.append((sig['signature_id'], excel_acc2, 1))
                stats['ocr_matched'] += 1
            else:
                # Undecidable: keep as valid but leave unassigned
                assignments.append((sig['signature_id'], None, 1))
            stats['valid_signatures'] += 1
        elif sig_count == 2:
            stats['sig_count_2'] += 1
            # Normal case: one signature per accountant
            sig1, sig2 = sigs[0], sigs[1]
            pairs = assign_to_accountant(sig1, sig2, excel_acc1, excel_acc2)
            for sig, acc in pairs:
                assignments.append((sig['signature_id'], acc, 1))
                stats['valid_signatures'] += 1
                # Record which method decided the assignment
                ocr_name = sig.get('accountant_name', '')
                if names_match(ocr_name, acc):
                    stats['ocr_matched'] += 1
                else:
                    stats['y_coordinate_assigned'] += 1
        else:
            stats['sig_count_gt2'] += 1
            # Too many detections: keep only the two best-scoring boxes
            best_two = select_best_two(sigs, excel_acc1, excel_acc2)
            # Mark every box valid/invalid
            valid_ids = {s['signature_id'] for s in best_two}
            for sig in sigs:
                if sig['signature_id'] in valid_ids:
                    is_valid = 1
                    stats['valid_signatures'] += 1
                else:
                    is_valid = 0
                    stats['invalid_signatures'] += 1
                assignments.append((sig['signature_id'], None, is_valid))
            # Assign the two kept signatures.
            # NOTE(review): kept signatures are appended twice (above with
            # None, below with the real accountant); the later row wins only
            # because the UPDATE loop below replays assignments in list order.
            if len(best_two) == 2:
                sig1, sig2 = best_two[0], best_two[1]
                pairs = assign_to_accountant(sig1, sig2, excel_acc1, excel_acc2)
                for sig, acc in pairs:
                    assignments.append((sig['signature_id'], acc, 1))
                    ocr_name = sig.get('accountant_name', '')
                    if names_match(ocr_name, acc):
                        stats['ocr_matched'] += 1
                    else:
                        stats['y_coordinate_assigned'] += 1
            elif len(best_two) == 1:
                sig = best_two[0]
                ocr_name = sig.get('accountant_name', '')
                if names_match(ocr_name, excel_acc1):
                    assignments.append((sig['signature_id'], excel_acc1, 1))
                elif names_match(ocr_name, excel_acc2):
                    assignments.append((sig['signature_id'], excel_acc2, 1))
                else:
                    assignments.append((sig['signature_id'], None, 1))
    # Replay all assignments against the signatures table
    print(f"正在更新 {len(assignments)} 筆簽名...")
    for sig_id, acc, is_valid in assignments:
        cur.execute("""
            UPDATE signatures
            SET assigned_accountant = ?, is_valid = ?
            WHERE signature_id = ?
        """, (acc, is_valid, sig_id))
    conn.commit()
    return stats
def build_accountants_table(conn):
    """
    Rebuild the accountants table from valid, assigned signatures and link
    signatures.accountant_id back to it.

    Returns:
        The number of unique accountants inserted.
    """
    cur = conn.cursor()
    # Rebuild from scratch on every run
    cur.execute("DELETE FROM accountants")
    # Count signatures per (accountant, firm). Fix: the grouping must include
    # excel_firm — grouping by name alone makes SQLite return one arbitrary
    # firm per accountant, so the "most common firm" logic below never saw
    # the alternatives.
    cur.execute("""
        SELECT assigned_accountant, excel_firm, COUNT(*) as cnt
        FROM signatures
        WHERE assigned_accountant IS NOT NULL AND is_valid = 1
        GROUP BY assigned_accountant, excel_firm
    """)
    accountants = {}
    for name, firm, count in cur.fetchall():
        entry = accountants.setdefault(name, {'count': 0, 'firms': defaultdict(int)})
        entry['count'] += count
        if firm:
            entry['firms'][firm] += count
    # Insert one row per accountant with their most frequent firm
    for name, data in accountants.items():
        main_firm = None
        if data['firms']:
            main_firm = max(data['firms'].items(), key=lambda x: x[1])[0]
        cur.execute("""
            INSERT INTO accountants (name, signature_count, firm)
            VALUES (?, ?, ?)
        """, (name, data['count'], main_firm))
    conn.commit()
    # Backfill accountant_id on signatures via a name lookup
    cur.execute("""
        UPDATE signatures
        SET accountant_id = (
            SELECT accountant_id FROM accountants
            WHERE accountants.name = signatures.assigned_accountant
        )
        WHERE assigned_accountant IS NOT NULL
    """)
    conn.commit()
    return len(accountants)
def generate_report(stats, accountant_count):
    """
    Write the cleanup/assignment summary as JSON and Markdown reports.

    Args:
        stats: statistics dict returned by process_all_pdfs().
        accountant_count: number of unique accountants inserted.

    Returns:
        The report dict that was serialized.
    """
    report = {
        'generated_at': datetime.now().isoformat(),
        'summary': {
            'total_pdfs': stats['total_pdfs'],
            'pdfs_with_1_sig': stats['sig_count_1'],
            'pdfs_with_2_sigs': stats['sig_count_2'],
            'pdfs_with_gt2_sigs': stats['sig_count_gt2'],
            'pdfs_without_excel': stats['no_excel_data'],
        },
        'signatures': {
            'valid': stats['valid_signatures'],
            'invalid': stats['invalid_signatures'],
            'total': stats['valid_signatures'] + stats['invalid_signatures'],
        },
        'assignment_method': {
            'ocr_matched': stats['ocr_matched'],
            'y_coordinate': stats['y_coordinate_assigned'],
        },
        'accountants': {
            'total_unique': accountant_count,
        }
    }
    # Save the JSON report
    json_path = f"{REPORT_DIR}/signature_cleanup_report.json"
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(report, f, ensure_ascii=False, indent=2)
    # Save the Markdown report
    md_path = f"{REPORT_DIR}/signature_cleanup_report.md"
    with open(md_path, 'w', encoding='utf-8') as f:
        f.write("# 簽名清理與歸檔報告\n\n")
        f.write(f"生成時間: {report['generated_at']}\n\n")
        f.write("## PDF 分布\n\n")
        f.write("| 類型 | 數量 |\n")
        f.write("|------|------|\n")
        f.write(f"| 總 PDF 數 | {stats['total_pdfs']} |\n")
        f.write(f"| 1 個簽名 | {stats['sig_count_1']} |\n")
        f.write(f"| 2 個簽名 (正常) | {stats['sig_count_2']} |\n")
        f.write(f"| >2 個簽名 (需篩選) | {stats['sig_count_gt2']} |\n")
        f.write(f"| 無 Excel 資料 | {stats['no_excel_data']} |\n")
        f.write("\n## 簽名統計\n\n")
        f.write("| 類型 | 數量 |\n")
        f.write("|------|------|\n")
        f.write(f"| 有效簽名 | {stats['valid_signatures']} |\n")
        f.write(f"| 無效簽名 (誤判) | {stats['invalid_signatures']} |\n")
        f.write("\n## 歸檔方式\n\n")
        f.write("| 方式 | 數量 |\n")
        f.write("|------|------|\n")
        f.write(f"| OCR 姓名匹配 | {stats['ocr_matched']} |\n")
        f.write(f"| Y 座標推斷 | {stats['y_coordinate_assigned']} |\n")
        f.write(f"\n## 會計師\n\n")
        f.write(f"唯一會計師數: **{accountant_count}**\n")
    print(f"報告已儲存: {json_path}")
    print(f"報告已儲存: {md_path}")
    return report
def main():
    """Entry point: prepare schema, assign signatures, build accountants, report."""
    print("=" * 60)
    print("簽名清理與會計師歸檔")
    print("=" * 60)
    conn = get_connection()
    # 1. Prepare the database schema
    print("\n[1/4] 準備資料庫...")
    add_columns_if_needed(conn)
    create_accountants_table(conn)
    # 2. Process every PDF's signatures
    print("\n[2/4] 處理 PDF 簽名...")
    stats = process_all_pdfs(conn)
    # 3. Build the accountants table
    print("\n[3/4] 建立會計師表...")
    accountant_count = build_accountants_table(conn)
    # 4. Generate the reports
    print("\n[4/4] 生成報告...")
    report = generate_report(stats, accountant_count)
    conn.close()
    print("\n" + "=" * 60)
    print("完成!")
    print("=" * 60)
    print(f"有效簽名: {stats['valid_signatures']}")
    print(f"無效簽名: {stats['invalid_signatures']}")
    print(f"唯一會計師: {accountant_count}")
if __name__ == '__main__':
    main()
@@ -0,0 +1,272 @@
#!/usr/bin/env python3
"""
Stage 3: same-person signature clustering analysis.

Runs pairwise similarity analysis over each accountant's signatures to flag
possible "copy-paste" (replicated image) behavior.
"""
import sqlite3
import numpy as np
import json
from collections import defaultdict
from datetime import datetime
from tqdm import tqdm
# Input/output locations
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
FEATURES_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/features/signature_features.npy'
REPORT_DIR = '/Volumes/NV2/PDF-Processing/signature-analysis/reports'
def load_data():
    """Load the feature matrix plus per-accountant signature assignments."""
    print("載入特徵向量...")
    features = np.load(FEATURES_PATH)
    print(f"特徵矩陣形狀: {features.shape}")
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()
    # Map signature_id -> row index of the feature matrix (same ordering).
    cur.execute("SELECT signature_id FROM signatures ORDER BY signature_id")
    sig_id_to_idx = {row[0]: idx for idx, row in enumerate(cur.fetchall())}
    # Valid, assigned signatures grouped by accountant.
    cur.execute("""
        SELECT s.signature_id, s.assigned_accountant, s.accountant_id, a.name, a.firm
        FROM signatures s
        LEFT JOIN accountants a ON s.accountant_id = a.accountant_id
        WHERE s.is_valid = 1 AND s.assigned_accountant IS NOT NULL
        ORDER BY s.signature_id
    """)
    acc_signatures = defaultdict(list)
    acc_info = {}
    for sig_id, _, acc_id, acc_name, firm in cur.fetchall():
        if acc_id and sig_id in sig_id_to_idx:
            acc_signatures[acc_id].append(sig_id)
            acc_info.setdefault(acc_id, {'name': acc_name, 'firm': firm})
    conn.close()
    return features, sig_id_to_idx, acc_signatures, acc_info
def compute_similarity_stats(features, sig_ids, sig_id_to_idx):
    """Pairwise cosine-similarity statistics for one accountant's signatures."""
    if len(sig_ids) < 2:
        return None
    # L2-normalize the selected feature rows (zero vectors stay zero).
    selected = features[[sig_id_to_idx[sid] for sid in sig_ids]]
    lengths = np.linalg.norm(selected, axis=1, keepdims=True)
    lengths[lengths == 0] = 1
    unit = selected / lengths
    # Upper triangle of the cosine-similarity matrix: each unique pair once.
    pairwise = np.dot(unit, unit.T)
    sims = pairwise[np.triu_indices(len(pairwise), k=1)]
    if len(sims) == 0:
        return None
    stats = {
        'total_pairs': len(sims),
        'min_sim': float(sims.min()),
        'max_sim': float(sims.max()),
        'mean_sim': float(sims.mean()),
        'std_sim': float(sims.std()),
        'pairs_gt_90': int((sims > 0.90).sum()),
        'pairs_gt_95': int((sims > 0.95).sum()),
        'pairs_gt_99': int((sims > 0.99).sum()),
    }
    # Derived proportions of high-similarity pairs.
    for cut in ('90', '95', '99'):
        stats[f'ratio_gt_{cut}'] = stats[f'pairs_gt_{cut}'] / stats['total_pairs']
    return stats
def analyze_all_accountants(features, sig_id_to_idx, acc_signatures, acc_info):
    """Compute similarity statistics for every accountant with enough signatures."""
    reports = []
    for acc_id, sig_ids in tqdm(acc_signatures.items(), desc="分析會計師"):
        stats = compute_similarity_stats(features, sig_ids, sig_id_to_idx)
        if stats is None:
            # Fewer than two usable signatures: nothing to compare.
            continue
        info = acc_info.get(acc_id, {})
        reports.append({
            'accountant_id': acc_id,
            'name': info.get('name', ''),
            'firm': info.get('firm', ''),
            'signature_count': len(sig_ids),
            **stats
        })
    return reports
def classify_risk(result):
    """Map similarity statistics to a 'high' / 'medium' / 'low' risk label."""
    ratio_95 = result.get('ratio_gt_95', 0)
    ratio_99 = result.get('ratio_gt_99', 0)
    mean_sim = result.get('mean_sim', 0)
    # High: a large share of near-identical pairs.
    if ratio_99 > 0.05 or ratio_95 > 0.3:
        return 'high'
    # Medium: elevated similarity overall.
    if ratio_95 > 0.1 or mean_sim > 0.85:
        return 'medium'
    return 'low'
def save_results(results, acc_signatures):
    """
    Label every accountant with a risk level and write JSON/Markdown reports.

    Args:
        results: per-accountant stats dicts from analyze_all_accountants().
        acc_signatures: accountant -> signature-id mapping (unused here;
            kept for interface compatibility with the caller).

    Returns:
        (summary dict, results sorted by descending risk metrics).
    """
    # Classify risk for every accountant
    for r in results:
        r['risk_level'] = classify_risk(r)
    # Tally risk levels
    risk_counts = defaultdict(int)
    for r in results:
        risk_counts[r['risk_level']] += 1
    summary = {
        'generated_at': datetime.now().isoformat(),
        'total_accountants': len(results),
        'risk_distribution': dict(risk_counts),
        'high_risk_count': risk_counts['high'],
        'medium_risk_count': risk_counts['medium'],
        'low_risk_count': risk_counts['low'],
    }
    # Sort with the riskiest accountants first
    results_sorted = sorted(results, key=lambda x: (-x.get('ratio_gt_95', 0), -x.get('mean_sim', 0)))
    # Save JSON
    output = {
        'summary': summary,
        'accountants': results_sorted
    }
    json_path = f"{REPORT_DIR}/accountant_similarity_analysis.json"
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    print(f"已儲存: {json_path}")
    # Save the Markdown report
    md_path = f"{REPORT_DIR}/accountant_similarity_analysis.md"
    with open(md_path, 'w', encoding='utf-8') as f:
        f.write("# 會計師簽名相似度分析報告\n\n")
        f.write(f"生成時間: {summary['generated_at']}\n\n")
        f.write("## 摘要\n\n")
        f.write(f"| 指標 | 數值 |\n")
        f.write(f"|------|------|\n")
        f.write(f"| 總會計師數 | {summary['total_accountants']} |\n")
        f.write(f"| 高風險 | {risk_counts['high']} |\n")
        f.write(f"| 中風險 | {risk_counts['medium']} |\n")
        f.write(f"| 低風險 | {risk_counts['low']} |\n")
        f.write("\n## 風險分類標準\n\n")
        f.write("- **高風險**: >5% 的簽名對相似度 >0.99,或 >30% 的簽名對相似度 >0.95\n")
        f.write("- **中風險**: >10% 的簽名對相似度 >0.95,或平均相似度 >0.85\n")
        f.write("- **低風險**: 其他情況\n")
        f.write("\n## 高風險會計師 (Top 30)\n\n")
        f.write("| 排名 | 姓名 | 事務所 | 簽名數 | 平均相似度 | >0.95比例 | >0.99比例 |\n")
        f.write("|------|------|--------|--------|------------|-----------|----------|\n")
        high_risk = [r for r in results_sorted if r['risk_level'] == 'high']
        for i, r in enumerate(high_risk[:30], 1):
            f.write(f"| {i} | {r['name']} | {r['firm'] or '-'} | {r['signature_count']} | ")
            f.write(f"{r['mean_sim']:.3f} | {r['ratio_gt_95']*100:.1f}% | {r['ratio_gt_99']*100:.1f}% |\n")
        f.write("\n## 所有會計師統計分布\n\n")
        # Distribution of per-accountant mean similarity
        mean_sims = [r['mean_sim'] for r in results]
        f.write("### 平均相似度分布\n\n")
        f.write(f"- 最小: {min(mean_sims):.3f}\n")
        f.write(f"- 最大: {max(mean_sims):.3f}\n")
        f.write(f"- 平均: {np.mean(mean_sims):.3f}\n")
        f.write(f"- 中位數: {np.median(mean_sims):.3f}\n")
    print(f"已儲存: {md_path}")
    return summary, results_sorted
def update_database(results):
    """
    Write per-accountant risk level and similarity metrics back to the DB.

    Args:
        results: dicts containing risk_level, mean_sim, ratio_gt_95 and
            accountant_id for each accountant.
    """
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()
    # Fix: attempt each ALTER independently and catch only OperationalError.
    # The original wrapped all three in one bare `except: pass`, so (a) one
    # pre-existing column aborted the remaining ALTERs and (b) real errors
    # (e.g. a locked database) were silently swallowed.
    for ddl in (
        "ALTER TABLE accountants ADD COLUMN risk_level TEXT",
        "ALTER TABLE accountants ADD COLUMN mean_similarity REAL",
        "ALTER TABLE accountants ADD COLUMN ratio_gt_95 REAL",
    ):
        try:
            cur.execute(ddl)
        except sqlite3.OperationalError:
            pass  # column already exists
    for r in results:
        cur.execute("""
            UPDATE accountants
            SET risk_level = ?, mean_similarity = ?, ratio_gt_95 = ?
            WHERE accountant_id = ?
        """, (r['risk_level'], r['mean_sim'], r['ratio_gt_95'], r['accountant_id']))
    conn.commit()
    conn.close()
    print("資料庫已更新")
def main():
    """Entry point: load data, analyze every accountant, persist the results."""
    print("=" * 60)
    print("第三階段:同人簽名聚類分析")
    print("=" * 60)
    # Load features and accountant assignments
    features, sig_id_to_idx, acc_signatures, acc_info = load_data()
    print(f"會計師數: {len(acc_signatures)}")
    # Analyze every accountant
    print("\n開始分析...")
    results = analyze_all_accountants(features, sig_id_to_idx, acc_signatures, acc_info)
    # Save the reports
    print("\n儲存結果...")
    summary, results_sorted = save_results(results, acc_signatures)
    # Write risk levels back to the database
    update_database(results_sorted)
    print("\n" + "=" * 60)
    print("完成!")
    print("=" * 60)
    print(f"總會計師: {summary['total_accountants']}")
    print(f"高風險: {summary['high_risk_count']}")
    print(f"中風險: {summary['medium_risk_count']}")
    print(f"低風險: {summary['low_risk_count']}")
if __name__ == '__main__':
    main()
@@ -0,0 +1,371 @@
#!/usr/bin/env python3
"""
Stage 4: per-PDF signature authenticity classification.

For every PDF, decide whether its signatures are hand-signed ("authentic")
or pasted copies ("copy").
"""
import sqlite3
import numpy as np
import json
import csv
from collections import defaultdict
from datetime import datetime
from tqdm import tqdm
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
FEATURES_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/features/signature_features.npy'
REPORT_DIR = '/Volumes/NV2/PDF-Processing/signature-analysis/reports'
# Decision thresholds
THRESHOLD_COPY = 0.95  # above this similarity: classified as "copy-paste"
THRESHOLD_AUTHENTIC = 0.85  # below this similarity: classified as "hand-signed"
# Values in between are classified as "uncertain"
def load_data():
    """Load L2-normalized feature vectors plus signature/PDF/accountant mappings.

    Returns:
        (features_norm, sig_data, pdf_signatures, acc_signatures,
         pdf_info, sig_id_to_idx) where features_norm rows are unit vectors
        so a plain dot product equals cosine similarity.
    """
    print("載入特徵向量...")
    features = np.load(FEATURES_PATH)
    # Normalize rows to unit length so dot products are cosine similarities
    norms = np.linalg.norm(features, axis=1, keepdims=True)
    norms[norms == 0] = 1  # guard against all-zero vectors (avoid divide-by-zero)
    features_norm = features / norms
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()
    # Fetch metadata for every valid, assigned signature
    cur.execute("""
        SELECT s.signature_id, s.source_pdf, s.assigned_accountant,
               s.excel_accountant1, s.excel_accountant2, s.excel_firm
        FROM signatures s
        WHERE s.is_valid = 1 AND s.assigned_accountant IS NOT NULL
        ORDER BY s.signature_id
    """)
    sig_data = {}
    pdf_signatures = defaultdict(list)
    acc_signatures = defaultdict(list)
    pdf_info = {}
    for row in cur.fetchall():
        sig_id, pdf, acc_name, acc1, acc2, firm = row
        sig_data[sig_id] = {
            'pdf': pdf,
            'accountant': acc_name,
        }
        pdf_signatures[pdf].append((sig_id, acc_name))
        acc_signatures[acc_name].append(sig_id)
        if pdf not in pdf_info:
            pdf_info[pdf] = {
                'accountant1': acc1,
                'accountant2': acc2,
                'firm': firm
            }
    # signature_id -> feature row index; ordering must match the rows of the
    # .npy feature file (both are ordered by signature_id)
    cur.execute("SELECT signature_id FROM signatures ORDER BY signature_id")
    all_sig_ids = [row[0] for row in cur.fetchall()]
    sig_id_to_idx = {sid: idx for idx, sid in enumerate(all_sig_ids)}
    conn.close()
    return features_norm, sig_data, pdf_signatures, acc_signatures, pdf_info, sig_id_to_idx
def get_max_similarity_to_others(sig_id, acc_name, acc_signatures, sig_id_to_idx, features_norm):
    """Return (best_similarity, best_sig_id) versus the same accountant's other signatures.

    Compares the unit-normalized feature of *sig_id* against every other
    indexed signature belonging to *acc_name*. Returns (None, None) when
    the accountant has no other comparable signature.
    """
    candidates = [
        other for other in acc_signatures[acc_name]
        if other != sig_id and other in sig_id_to_idx
    ]
    if not candidates:
        return None, None
    query_vec = features_norm[sig_id_to_idx[sig_id]]
    candidate_rows = features_norm[[sig_id_to_idx[c] for c in candidates]]
    scores = candidate_rows @ query_vec
    best = int(scores.argmax())
    return float(scores[best]), candidates[best]
def classify_signature(max_sim, copy_threshold=None, authentic_threshold=None):
    """Classify one signature from its max similarity to the same accountant's others.

    Args:
        max_sim: highest cosine similarity against any other signature of
            the same accountant, or None when there is nothing to compare.
        copy_threshold: similarity at/above which the signature is judged a
            copy-paste; defaults to the module-level THRESHOLD_COPY.
        authentic_threshold: similarity at/below which it is judged genuinely
            hand-signed; defaults to the module-level THRESHOLD_AUTHENTIC.

    Returns:
        One of 'unknown', 'copy', 'authentic', 'uncertain'.
    """
    if max_sim is None:
        return 'unknown'  # no other signature to compare against
    # Resolve defaults lazily so callers may override per call
    if copy_threshold is None:
        copy_threshold = THRESHOLD_COPY
    if authentic_threshold is None:
        authentic_threshold = THRESHOLD_AUTHENTIC
    if max_sim >= copy_threshold:
        return 'copy'  # copy-paste
    elif max_sim <= authentic_threshold:
        return 'authentic'  # hand-signed
    else:
        return 'uncertain'  # between the two thresholds
def classify_pdf(verdicts):
    """Derive a whole-PDF verdict from its per-signature verdicts.

    Precedence: any 'copy' taints the PDF; all-'authentic' clears it;
    any 'uncertain' leaves it open; anything else is 'unknown'.
    """
    if not verdicts:
        return 'unknown'
    unique = set(verdicts)
    if 'copy' in unique:
        return 'copy'
    if unique == {'authentic'}:
        return 'authentic'
    if 'uncertain' in unique:
        return 'uncertain'
    return 'unknown'
def analyze_all_pdfs(features_norm, sig_data, pdf_signatures, acc_signatures, pdf_info, sig_id_to_idx):
    """Judge every PDF: classify each of its signatures, then aggregate.

    Returns a list of dicts, one per PDF, each carrying the per-signature
    similarities/verdicts plus the combined 'pdf_verdict'.
    """
    results = []
    for pdf, sigs in tqdm(pdf_signatures.items(), desc="分析 PDF"):
        info = pdf_info.get(pdf, {})
        pdf_result = {
            'pdf': pdf,
            'accountant1': info.get('accountant1', ''),
            'accountant2': info.get('accountant2', ''),
            'firm': info.get('firm', ''),
            'signatures': []
        }
        verdicts = []
        for sig_id, acc_name in sigs:
            # Highest similarity of this signature to the same accountant's others
            max_sim, most_similar_sig = get_max_similarity_to_others(
                sig_id, acc_name, acc_signatures, sig_id_to_idx, features_norm
            )
            verdict = classify_signature(max_sim)
            verdicts.append(verdict)
            pdf_result['signatures'].append({
                'signature_id': sig_id,
                'accountant': acc_name,
                'max_similarity': max_sim,
                'verdict': verdict
            })
        # Combine the per-signature verdicts into one PDF-level verdict
        pdf_result['pdf_verdict'] = classify_pdf(verdicts)
        results.append(pdf_result)
    return results
def generate_statistics(results):
    """Tally PDF-level and signature-level verdict counts plus per-firm breakdowns."""
    pdf_counts = defaultdict(int)
    sig_counts = defaultdict(int)
    firm_counts = defaultdict(lambda: defaultdict(int))
    for entry in results:
        pdf_verdict = entry['pdf_verdict']
        pdf_counts[pdf_verdict] += 1
        # Missing firm names are bucketed under '未知' (unknown)
        firm_counts[entry['firm'] or '未知'][pdf_verdict] += 1
        for sig in entry['signatures']:
            sig_counts[sig['verdict']] += 1
    return {
        'total_pdfs': len(results),
        'pdf_verdicts': pdf_counts,
        'signature_verdicts': sig_counts,
        'by_firm': firm_counts,
    }
def save_results(results, stats):
    """Write the verdict outputs: full JSON, a flat CSV, and a Markdown report.

    Bug fix: a similarity of exactly 0.0 is a real measurement and must be
    written to the CSV — the old truthiness test rendered it as an empty cell.

    Returns the stats dict unchanged for the caller's convenience.
    """
    timestamp = datetime.now().isoformat()
    # 1. Full JSON dump (thresholds + statistics + per-PDF detail)
    json_path = f"{REPORT_DIR}/pdf_signature_verdicts.json"
    output = {
        'generated_at': timestamp,
        'thresholds': {
            'copy': THRESHOLD_COPY,
            'authentic': THRESHOLD_AUTHENTIC
        },
        'statistics': {
            'total_pdfs': stats['total_pdfs'],
            'pdf_verdicts': dict(stats['pdf_verdicts']),
            'signature_verdicts': dict(stats['signature_verdicts'])
        },
        'results': results
    }
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    print(f"已儲存: {json_path}")
    # 2. Simplified CSV (one row per PDF, at most two signatures shown)
    csv_path = f"{REPORT_DIR}/pdf_signature_verdicts.csv"
    with open(csv_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['PDF', '會計師1', '會計師2', '事務所', '判定結果',
                         '簽名1_會計師', '簽名1_相似度', '簽名1_判定',
                         '簽名2_會計師', '簽名2_相似度', '簽名2_判定'])
        for r in results:
            row = [
                r['pdf'],
                r['accountant1'],
                r['accountant2'],
                r['firm'] or '',
                r['pdf_verdict']
            ]
            for sig in r['signatures'][:2]:  # at most 2 signatures per row
                row.extend([
                    sig['accountant'],
                    # FIX: compare against None, not truthiness, so a
                    # legitimate 0.000 similarity is still written out.
                    f"{sig['max_similarity']:.3f}" if sig['max_similarity'] is not None else '',
                    sig['verdict']
                ])
            # Pad to the fixed 11-column layout
            while len(row) < 11:
                row.append('')
            writer.writerow(row)
    print(f"已儲存: {csv_path}")
    # 3. Markdown report
    md_path = f"{REPORT_DIR}/pdf_signature_verdict_report.md"
    with open(md_path, 'w', encoding='utf-8') as f:
        f.write("# PDF 簽名真偽判定報告\n\n")
        f.write(f"生成時間: {timestamp}\n\n")
        f.write("## 判定標準\n\n")
        f.write(f"- **複製貼上 (copy)**: 與同一會計師其他簽名相似度 ≥ {THRESHOLD_COPY}\n")
        f.write(f"- **親簽 (authentic)**: 與同一會計師其他簽名相似度 ≤ {THRESHOLD_AUTHENTIC}\n")
        f.write(f"- **不確定 (uncertain)**: 相似度介於 {THRESHOLD_AUTHENTIC} ~ {THRESHOLD_COPY}\n")
        f.write(f"- **無法判定 (unknown)**: 該會計師只有此一份簽名,無法比對\n\n")
        f.write("## 整體統計\n\n")
        f.write("### PDF 判定結果\n\n")
        f.write("| 判定 | 數量 | 百分比 |\n")
        f.write("|------|------|--------|\n")
        total = stats['total_pdfs']
        for verdict in ['copy', 'uncertain', 'authentic', 'unknown']:
            count = stats['pdf_verdicts'].get(verdict, 0)
            pct = count / total * 100 if total > 0 else 0
            label = {
                'copy': '複製貼上',
                'authentic': '親簽',
                'uncertain': '不確定',
                'unknown': '無法判定'
            }.get(verdict, verdict)
            f.write(f"| {label} | {count:,} | {pct:.1f}% |\n")
        f.write(f"\n**總計: {total:,} 份 PDF**\n")
        f.write("\n### 簽名判定結果\n\n")
        f.write("| 判定 | 數量 | 百分比 |\n")
        f.write("|------|------|--------|\n")
        sig_total = sum(stats['signature_verdicts'].values())
        for verdict in ['copy', 'uncertain', 'authentic', 'unknown']:
            count = stats['signature_verdicts'].get(verdict, 0)
            pct = count / sig_total * 100 if sig_total > 0 else 0
            label = {
                'copy': '複製貼上',
                'authentic': '親簽',
                'uncertain': '不確定',
                'unknown': '無法判定'
            }.get(verdict, verdict)
            f.write(f"| {label} | {count:,} | {pct:.1f}% |\n")
        f.write(f"\n**總計: {sig_total:,} 個簽名**\n")
        f.write("\n### 按事務所統計\n\n")
        f.write("| 事務所 | 複製貼上 | 不確定 | 親簽 | 無法判定 | 總計 |\n")
        f.write("|--------|----------|--------|------|----------|------|\n")
        # Firms sorted by total PDF count; only the top 20 are listed
        firms_sorted = sorted(stats['by_firm'].items(),
                              key=lambda x: sum(x[1].values()), reverse=True)
        for firm, verdicts in firms_sorted[:20]:
            copy_n = verdicts.get('copy', 0)
            uncertain_n = verdicts.get('uncertain', 0)
            authentic_n = verdicts.get('authentic', 0)
            unknown_n = verdicts.get('unknown', 0)
            total_n = copy_n + uncertain_n + authentic_n + unknown_n
            f.write(f"| {firm} | {copy_n:,} | {uncertain_n:,} | {authentic_n:,} | {unknown_n:,} | {total_n:,} |\n")
    print(f"已儲存: {md_path}")
    return stats
def update_database(results):
    """Write per-signature verdicts and max similarities back into SQLite."""
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()
    # Add each column in its own try: the old single bare except stopped at
    # the first duplicate column and silently skipped the second ALTER on a
    # partially-migrated schema; it also swallowed unrelated errors.
    for ddl in (
        "ALTER TABLE signatures ADD COLUMN signature_verdict TEXT",
        "ALTER TABLE signatures ADD COLUMN max_similarity_to_same_accountant REAL",
    ):
        try:
            cur.execute(ddl)
        except sqlite3.OperationalError:
            pass  # column already exists
    # Flatten the nested results into one executemany batch.
    params = [
        (sig['verdict'], sig['max_similarity'], sig['signature_id'])
        for r in results
        for sig in r['signatures']
    ]
    cur.executemany(
        """
        UPDATE signatures
        SET signature_verdict = ?, max_similarity_to_same_accountant = ?
        WHERE signature_id = ?
        """,
        params,
    )
    conn.commit()
    conn.close()
    print("資料庫已更新")
def main():
    """Entry point for phase 4: per-PDF signature authenticity verdicts."""
    print("=" * 60)
    print("第四階段:PDF 簽名真偽判定")
    print("=" * 60)
    # Load normalized features and the PDF/accountant mappings
    features_norm, sig_data, pdf_signatures, acc_signatures, pdf_info, sig_id_to_idx = load_data()
    print(f"PDF 數: {len(pdf_signatures)}")
    print(f"有效簽名: {len(sig_data)}")
    # Classify every signature and aggregate per PDF
    print("\n開始分析...")
    results = analyze_all_pdfs(
        features_norm, sig_data, pdf_signatures, acc_signatures, pdf_info, sig_id_to_idx
    )
    # Tally verdict counts
    stats = generate_statistics(results)
    # Write JSON/CSV/Markdown outputs
    print("\n儲存結果...")
    save_results(results, stats)
    # Persist verdicts into the database
    update_database(results)
    print("\n" + "=" * 60)
    print("完成!")
    print("=" * 60)
    print(f"\nPDF 判定結果:")
    print(f" 複製貼上: {stats['pdf_verdicts'].get('copy', 0):,}")
    print(f" 不確定: {stats['pdf_verdicts'].get('uncertain', 0):,}")
    print(f" 親簽: {stats['pdf_verdicts'].get('authentic', 0):,}")
    print(f" 無法判定: {stats['pdf_verdicts'].get('unknown', 0):,}")


if __name__ == '__main__':
    main()
File diff suppressed because it is too large Load Diff
+319
View File
@@ -0,0 +1,319 @@
#!/usr/bin/env python3
"""
Compute SSIM and pHash for all signature pairs (closest match per accountant).
Uses multiprocessing for parallel image loading and computation.
Saves results to database and outputs complete CSV.
"""
import sqlite3
import numpy as np
import cv2
import os
import sys
import json
import csv
import time
from datetime import datetime
from collections import defaultdict
from multiprocessing import Pool, cpu_count
from pathlib import Path
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
IMAGE_DIR = '/Volumes/NV2/PDF-Processing/yolo-signatures/images'
OUTPUT_CSV = '/Volumes/NV2/PDF-Processing/signature-analysis/reports/complete_pdf_report.csv'
CHECKPOINT_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/ssim_checkpoint.json'
NUM_WORKERS = max(1, cpu_count() - 2) # Leave 2 cores free
BATCH_SIZE = 1000
def compute_phash(img, hash_size=8):
    """Compute a perceptual hash of a grayscale image.

    NOTE(review): despite the name, this is a *difference* hash (dHash):
    the image is resized to (hash_size+1) x hash_size and each bit records
    whether a pixel is brighter than its left neighbor — it is not a
    DCT-based pHash. Returns a flat boolean array of hash_size*hash_size
    bits, compared elsewhere via Hamming distance.
    """
    resized = cv2.resize(img, (hash_size + 1, hash_size))
    # True where the right-hand pixel is brighter than its left neighbor
    diff = resized[:, 1:] > resized[:, :-1]
    return diff.flatten()
def compute_pair_ssim(args):
    """Compare two signature images with SSIM, dHash and histogram correlation.

    Args:
        args: tuple (sig_id, file1, file2, cosine_sim); filenames are
            relative to IMAGE_DIR, cosine_sim is passed through untouched.

    Returns:
        dict with ssim / phash_distance / histogram_corr / pixel_identical;
        metric fields stay None when an image is unreadable or too small.
        Designed as a Pool.imap_unordered worker, so it never raises:
        per-pair failures return the partial result instead.
    """
    sig_id, file1, file2, cosine_sim = args
    path1 = os.path.join(IMAGE_DIR, file1)
    path2 = os.path.join(IMAGE_DIR, file2)
    result = {
        'signature_id': sig_id,
        'match_file': file2,
        'cosine_similarity': cosine_sim,
        'ssim': None,
        'phash_distance': None,
        'histogram_corr': None,
        'pixel_identical': False,
    }
    try:
        img1 = cv2.imread(path1, cv2.IMREAD_GRAYSCALE)
        img2 = cv2.imread(path2, cv2.IMREAD_GRAYSCALE)
        if img1 is None or img2 is None:
            return result  # unreadable image: leave all metrics as None
        # Resize both images to their common (smaller) dimensions
        h = min(img1.shape[0], img2.shape[0])
        w = min(img1.shape[1], img2.shape[1])
        if h < 3 or w < 3:
            return result  # too small for any windowed metric
        img1_r = cv2.resize(img1, (w, h))
        img2_r = cv2.resize(img2, (w, h))
        # Exact equality after resize is the strongest copy evidence
        result['pixel_identical'] = bool(np.array_equal(img1_r, img2_r))
        # SSIM (scikit-image is an optional dependency; skip silently if absent)
        try:
            from skimage.metrics import structural_similarity as ssim
            win_size = min(7, min(h, w))
            if win_size % 2 == 0:
                win_size -= 1  # SSIM requires an odd window size
            if win_size >= 3:
                result['ssim'] = float(ssim(img1_r, img2_r, win_size=win_size))
            else:
                result['ssim'] = None
        except Exception:
            result['ssim'] = None
        # Grayscale histogram correlation
        hist1 = cv2.calcHist([img1_r], [0], None, [256], [0, 256])
        hist2 = cv2.calcHist([img2_r], [0], None, [256], [0, 256])
        result['histogram_corr'] = float(cv2.compareHist(hist1, hist2, cv2.HISTCMP_CORREL))
        # dHash Hamming distance
        h1 = compute_phash(img1_r)
        h2 = compute_phash(img2_r)
        result['phash_distance'] = int(np.sum(h1 != h2))
    except Exception:
        # Deliberate best-effort behavior: return whatever was computed
        # (the old code bound the exception to an unused variable).
        pass
    return result
def load_checkpoint():
    """Return the set of signature IDs recorded in the checkpoint file (empty if none)."""
    if not os.path.exists(CHECKPOINT_PATH):
        return set()
    with open(CHECKPOINT_PATH, 'r') as fh:
        payload = json.load(fh)
    return set(payload.get('processed_ids', []))
def save_checkpoint(processed_ids):
    """Persist the processed-ID set (with a timestamp) to the checkpoint file."""
    payload = {
        'processed_ids': list(processed_ids),
        'timestamp': str(datetime.now()),
    }
    with open(CHECKPOINT_PATH, 'w') as fh:
        json.dump(payload, fh)
def main():
    """Pipeline driver: closest-match pairing, parallel SSIM/dHash scoring,
    database write-back, and a final per-signature CSV export.

    Resumable: a JSON checkpoint records processed signature IDs so an
    interrupted run continues where it stopped; the checkpoint is removed
    once the run completes.
    """
    start_time = time.time()
    print("=" * 70)
    print("SSIM & pHash Computation for All Signature Pairs")
    print(f"Workers: {NUM_WORKERS}")
    print("=" * 70)
    # --- Step 1: Load data ---
    print("\n[1/4] Loading data from database...")
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()
    cur.execute('''
        SELECT signature_id, image_filename, assigned_accountant, feature_vector
        FROM signatures
        WHERE feature_vector IS NOT NULL AND assigned_accountant IS NOT NULL
    ''')
    rows = cur.fetchall()
    sig_ids = []
    filenames = []
    accountants = []
    features = []
    for row in rows:
        sig_ids.append(row[0])
        filenames.append(row[1])
        accountants.append(row[2])
        # Feature vectors are stored as raw float32 blobs
        features.append(np.frombuffer(row[3], dtype=np.float32))
    features = np.array(features)
    print(f" Loaded {len(sig_ids)} signatures")
    # --- Step 2: Find closest match per signature ---
    print("\n[2/4] Finding closest match per signature (same accountant)...")
    acct_groups = defaultdict(list)
    for i, acct in enumerate(accountants):
        acct_groups[acct].append(i)
    # Load checkpoint of already-processed IDs (resume support)
    processed_ids = load_checkpoint()
    print(f" Checkpoint: {len(processed_ids)} already processed")
    # Prepare one task per signature: (id, own file, closest file, cosine)
    tasks = []
    for acct, indices in acct_groups.items():
        if len(indices) < 2:
            continue  # nothing to compare against
        vecs = features[indices]
        # NOTE(review): treats the dot product as cosine similarity, which
        # assumes the stored vectors are already unit-normalized — confirm
        # against the feature-extraction step.
        sim_matrix = vecs @ vecs.T
        np.fill_diagonal(sim_matrix, -1)  # Exclude self
        for local_i, global_i in enumerate(indices):
            if sig_ids[global_i] in processed_ids:
                continue
            best_local = np.argmax(sim_matrix[local_i])
            best_global = indices[best_local]
            best_sim = float(sim_matrix[local_i, best_local])
            tasks.append((
                sig_ids[global_i],
                filenames[global_i],
                filenames[best_global],
                best_sim
            ))
    print(f" Tasks to process: {len(tasks)}")
    # --- Step 3: Compute SSIM/pHash in parallel ---
    print(f"\n[3/4] Computing SSIM & pHash ({len(tasks)} pairs, {NUM_WORKERS} workers)...")
    # Add SSIM columns to database if not exist
    try:
        cur.execute('ALTER TABLE signatures ADD COLUMN ssim_to_closest REAL')
    except:
        pass
    try:
        cur.execute('ALTER TABLE signatures ADD COLUMN phash_distance_to_closest INTEGER')
    except:
        pass
    try:
        cur.execute('ALTER TABLE signatures ADD COLUMN histogram_corr_to_closest REAL')
    except:
        pass
    try:
        cur.execute('ALTER TABLE signatures ADD COLUMN pixel_identical_to_closest INTEGER')
    except:
        pass
    try:
        cur.execute('ALTER TABLE signatures ADD COLUMN closest_match_file TEXT')
    except:
        pass
    conn.commit()
    total = len(tasks)
    done = 0
    batch_results = []
    with Pool(NUM_WORKERS) as pool:
        # imap_unordered: results arrive in completion order, which is fine
        # because each result carries its own signature_id
        for result in pool.imap_unordered(compute_pair_ssim, tasks, chunksize=50):
            batch_results.append(result)
            done += 1
            if done % BATCH_SIZE == 0 or done == total:
                # Save batch to database, then checkpoint the processed IDs
                for r in batch_results:
                    cur.execute('''
                        UPDATE signatures SET
                            ssim_to_closest = ?,
                            phash_distance_to_closest = ?,
                            histogram_corr_to_closest = ?,
                            pixel_identical_to_closest = ?,
                            closest_match_file = ?
                        WHERE signature_id = ?
                    ''', (
                        r['ssim'],
                        r['phash_distance'],
                        r['histogram_corr'],
                        1 if r['pixel_identical'] else 0,
                        r['match_file'],
                        r['signature_id']
                    ))
                    processed_ids.add(r['signature_id'])
                conn.commit()
                save_checkpoint(processed_ids)
                batch_results = []
                elapsed = time.time() - start_time
                rate = done / elapsed
                eta = (total - done) / rate if rate > 0 else 0
                print(f" {done:,}/{total:,} ({100*done/total:.1f}%) "
                      f"| {rate:.1f} pairs/s | ETA: {eta/60:.1f} min")
    # --- Step 4: Generate complete CSV ---
    print(f"\n[4/4] Generating complete CSV...")
    cur.execute('''
        SELECT
            s.source_pdf,
            s.year_month,
            s.serial_number,
            s.doc_type,
            s.page_number,
            s.sig_index,
            s.image_filename,
            s.assigned_accountant,
            s.excel_accountant1,
            s.excel_accountant2,
            s.excel_firm,
            s.detection_confidence,
            s.signature_verdict,
            s.max_similarity_to_same_accountant,
            s.ssim_to_closest,
            s.phash_distance_to_closest,
            s.histogram_corr_to_closest,
            s.pixel_identical_to_closest,
            s.closest_match_file,
            a.risk_level,
            a.mean_similarity as acct_mean_similarity,
            a.ratio_gt_95 as acct_ratio_gt_95
        FROM signatures s
        LEFT JOIN accountants a ON s.assigned_accountant = a.name
        ORDER BY s.source_pdf, s.sig_index
    ''')
    # CSV header: order must mirror the SELECT column order above
    columns = [
        'source_pdf', 'year_month', 'serial_number', 'doc_type',
        'page_number', 'sig_index', 'image_filename',
        'assigned_accountant', 'excel_accountant1', 'excel_accountant2', 'excel_firm',
        'detection_confidence', 'signature_verdict',
        'max_cosine_similarity', 'ssim_to_closest', 'phash_distance_to_closest',
        'histogram_corr_to_closest', 'pixel_identical_to_closest', 'closest_match_file',
        'accountant_risk_level', 'accountant_mean_similarity', 'accountant_ratio_gt_95'
    ]
    with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(columns)
        for row in cur:
            writer.writerow(row)
    # Count rows
    cur.execute('SELECT COUNT(*) FROM signatures')
    total_sigs = cur.fetchone()[0]
    cur.execute('SELECT COUNT(DISTINCT source_pdf) FROM signatures')
    total_pdfs = cur.fetchone()[0]
    conn.close()
    elapsed = time.time() - start_time
    print(f"\n{'='*70}")
    print(f"Complete!")
    print(f" Total signatures: {total_sigs:,}")
    print(f" Total PDFs: {total_pdfs:,}")
    print(f" Output: {OUTPUT_CSV}")
    print(f" Time: {elapsed/60:.1f} minutes")
    print(f"{'='*70}")
    # Clean up checkpoint: the run finished, so a stale checkpoint would
    # wrongly skip work on the next invocation
    if os.path.exists(CHECKPOINT_PATH):
        os.remove(CHECKPOINT_PATH)


if __name__ == '__main__':
    main()
@@ -0,0 +1,407 @@
#!/usr/bin/env python3
"""
Generate PDF-level aggregated report with multi-method verdicts.
One row per PDF with all Group A-F columns plus new SSIM/pHash/combined verdicts.
"""
import sqlite3
import csv
import numpy as np
from datetime import datetime
from collections import defaultdict
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
OUTPUT_CSV = '/Volumes/NV2/PDF-Processing/signature-analysis/reports/pdf_level_complete_report.csv'
# Thresholds from statistical analysis
COSINE_THRESHOLD = 0.95
COSINE_STATISTICAL = 0.944 # mu + 2*sigma
KDE_CROSSOVER = 0.838
SSIM_HIGH = 0.95
SSIM_MEDIUM = 0.80
PHASH_IDENTICAL = 0
PHASH_SIMILAR = 5
def classify_overall(max_cosine, max_ssim, min_phash, has_pixel_identical):
    """Multi-method combined verdict for one PDF.

    Args:
        max_cosine: highest same-accountant cosine similarity (or None).
        max_ssim: highest SSIM against the closest match (or None).
        min_phash: smallest dHash Hamming distance (or None).
        has_pixel_identical: whether any signature pair is pixel-identical
            (None when the check never ran).

    Returns:
        4-tuple (verdict, confidence_level, n_methods_copy, n_methods_total).
        (The previous docstring advertised a 3-tuple, but callers unpack four
        values; the dead, never-read `evidence_genuine` counter is removed.)
    """
    evidence_copy = 0
    total_methods = 0
    # Method 1: Cosine similarity
    if max_cosine is not None:
        total_methods += 1
        if max_cosine > COSINE_THRESHOLD:
            evidence_copy += 1
    # Method 2: SSIM
    if max_ssim is not None:
        total_methods += 1
        if max_ssim > SSIM_HIGH:
            evidence_copy += 1
    # Method 3: pHash
    if min_phash is not None:
        total_methods += 1
        if min_phash <= PHASH_IDENTICAL:
            evidence_copy += 1
    # Method 4: Pixel identical
    if has_pixel_identical is not None:
        total_methods += 1
        if has_pixel_identical:
            evidence_copy += 1
    # Decision cascade: strongest, most direct evidence first
    if has_pixel_identical:
        verdict = 'definite_copy'
        confidence = 'very_high'
    elif max_ssim is not None and max_ssim > SSIM_HIGH and min_phash is not None and min_phash <= PHASH_SIMILAR:
        verdict = 'definite_copy'
        confidence = 'very_high'
    elif evidence_copy >= 3:
        verdict = 'very_likely_copy'
        confidence = 'high'
    elif evidence_copy >= 2:
        verdict = 'likely_copy'
        confidence = 'medium'
    elif max_cosine is not None and max_cosine > COSINE_THRESHOLD:
        verdict = 'likely_copy'
        confidence = 'medium'
    elif max_cosine is not None and max_cosine > KDE_CROSSOVER:
        verdict = 'uncertain'
        confidence = 'low'
    elif max_cosine is not None and max_cosine <= KDE_CROSSOVER:
        verdict = 'likely_genuine'
        confidence = 'medium'
    else:
        verdict = 'unknown'
        confidence = 'none'
    return verdict, confidence, evidence_copy, total_methods
def main():
    """Aggregate per-signature metrics into one row per PDF and write the CSV.

    Combines cosine / SSIM / dHash / histogram / pixel-identity evidence into
    per-method verdicts (groups G-L) plus a multi-method overall verdict
    (group M), then prints distribution summaries.
    """
    print("=" * 70)
    print("PDF-Level Aggregated Report Generator")
    print("=" * 70)
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()
    # Load all signature data grouped by PDF
    print("\n[1/3] Loading signature data...")
    cur.execute('''
        SELECT
            s.source_pdf,
            s.year_month,
            s.serial_number,
            s.doc_type,
            s.page_number,
            s.sig_index,
            s.assigned_accountant,
            s.excel_accountant1,
            s.excel_accountant2,
            s.excel_firm,
            s.detection_confidence,
            s.signature_verdict,
            s.max_similarity_to_same_accountant,
            s.ssim_to_closest,
            s.phash_distance_to_closest,
            s.histogram_corr_to_closest,
            s.pixel_identical_to_closest,
            a.risk_level,
            a.mean_similarity,
            a.ratio_gt_95,
            a.signature_count
        FROM signatures s
        LEFT JOIN accountants a ON s.assigned_accountant = a.name
        ORDER BY s.source_pdf, s.sig_index
    ''')
    # Group rows by PDF (row[0] is source_pdf)
    pdf_data = defaultdict(list)
    for row in cur:
        pdf_data[row[0]].append(row)
    print(f" {len(pdf_data)} PDFs loaded")
    # Generate PDF-level rows
    print("\n[2/3] Aggregating per-PDF statistics...")
    columns = [
        # Group A: PDF Identity
        'source_pdf', 'year_month', 'serial_number', 'doc_type',
        # Group B: Excel Master Data
        'accountant_1', 'accountant_2', 'firm',
        # Group C: YOLO Detection
        'n_signatures_detected', 'avg_detection_confidence',
        # Group D: Cosine Similarity
        'max_cosine_similarity', 'min_cosine_similarity', 'avg_cosine_similarity',
        # Group E: Verdict (original per-sig)
        'sig1_cosine_verdict', 'sig2_cosine_verdict',
        # Group F: Accountant Risk
        'acct1_name', 'acct1_risk_level', 'acct1_mean_similarity',
        'acct1_ratio_gt_95', 'acct1_total_signatures',
        'acct2_name', 'acct2_risk_level', 'acct2_mean_similarity',
        'acct2_ratio_gt_95', 'acct2_total_signatures',
        # Group G: SSIM (NEW)
        'max_ssim', 'min_ssim', 'avg_ssim',
        'verdict_ssim',
        # Group H: pHash (NEW)
        'min_phash_distance', 'max_phash_distance', 'avg_phash_distance',
        'verdict_phash',
        # Group I: Histogram Correlation (NEW)
        'max_histogram_corr', 'avg_histogram_corr',
        # Group J: Pixel Identity (NEW)
        'has_pixel_identical',
        'verdict_pixel',
        # Group K: Statistical Threshold (NEW)
        'verdict_statistical',  # Based on mu+2sigma (0.944)
        # Group L: KDE Crossover (NEW)
        'verdict_kde',  # Based on KDE crossover (0.838)
        # Group M: Multi-Method Combined (NEW)
        'overall_verdict',
        'confidence_level',
        'n_methods_copy',
        'n_methods_total',
    ]
    rows = []
    for pdf_name, sigs in pdf_data.items():
        # Group A: Identity (from first signature)
        first = sigs[0]
        year_month = first[1]
        serial_number = first[2]
        doc_type = first[3]
        # Group B: Excel data
        excel_acct1 = first[7]
        excel_acct2 = first[8]
        excel_firm = first[9]
        # Group C: Detection
        n_sigs = len(sigs)
        confidences = [s[10] for s in sigs if s[10] is not None]
        avg_conf = np.mean(confidences) if confidences else None
        # Group D: Cosine similarity
        cosines = [s[12] for s in sigs if s[12] is not None]
        max_cosine = max(cosines) if cosines else None
        min_cosine = min(cosines) if cosines else None
        avg_cosine = np.mean(cosines) if cosines else None
        # Group E: Per-sig verdicts
        verdicts = [s[11] for s in sigs]
        sig1_verdict = verdicts[0] if len(verdicts) > 0 else None
        sig2_verdict = verdicts[1] if len(verdicts) > 1 else None
        # Group F: Accountant risk - separate for acct1 and acct2
        # Match by assigned_accountant to excel_accountant1/2; the last two
        # elifs are a positional fallback when the assigned name matches
        # neither Excel column.
        # NOTE(review): that fallback can place an unmatched accountant in
        # the acct1 slot even when it corresponds to accountant 2 — confirm
        # this ordering assumption against the assignment step.
        acct1_info = {'name': None, 'risk': None, 'mean_sim': None, 'ratio': None, 'count': None}
        acct2_info = {'name': None, 'risk': None, 'mean_sim': None, 'ratio': None, 'count': None}
        for s in sigs:
            assigned = s[6]
            if assigned and assigned == excel_acct1 and acct1_info['name'] is None:
                acct1_info = {
                    'name': assigned, 'risk': s[17],
                    'mean_sim': s[18], 'ratio': s[19], 'count': s[20]
                }
            elif assigned and assigned == excel_acct2 and acct2_info['name'] is None:
                acct2_info = {
                    'name': assigned, 'risk': s[17],
                    'mean_sim': s[18], 'ratio': s[19], 'count': s[20]
                }
            elif assigned and acct1_info['name'] is None:
                acct1_info = {
                    'name': assigned, 'risk': s[17],
                    'mean_sim': s[18], 'ratio': s[19], 'count': s[20]
                }
            elif assigned and acct2_info['name'] is None:
                acct2_info = {
                    'name': assigned, 'risk': s[17],
                    'mean_sim': s[18], 'ratio': s[19], 'count': s[20]
                }
        # Group G: SSIM
        ssims = [s[13] for s in sigs if s[13] is not None]
        max_ssim = max(ssims) if ssims else None
        min_ssim = min(ssims) if ssims else None
        avg_ssim = np.mean(ssims) if ssims else None
        if max_ssim is not None:
            if max_ssim > SSIM_HIGH:
                verdict_ssim = 'copy'
            elif max_ssim > SSIM_MEDIUM:
                verdict_ssim = 'suspicious'
            else:
                verdict_ssim = 'genuine'
        else:
            verdict_ssim = None
        # Group H: pHash
        phashes = [s[14] for s in sigs if s[14] is not None]
        min_phash = min(phashes) if phashes else None
        max_phash = max(phashes) if phashes else None
        avg_phash = np.mean(phashes) if phashes else None
        if min_phash is not None:
            if min_phash <= PHASH_IDENTICAL:
                verdict_phash = 'copy'
            elif min_phash <= PHASH_SIMILAR:
                verdict_phash = 'suspicious'
            else:
                verdict_phash = 'genuine'
        else:
            verdict_phash = None
        # Group I: Histogram correlation
        histcorrs = [s[15] for s in sigs if s[15] is not None]
        max_histcorr = max(histcorrs) if histcorrs else None
        avg_histcorr = np.mean(histcorrs) if histcorrs else None
        # Group J: Pixel identical
        pixel_ids = [s[16] for s in sigs if s[16] is not None]
        has_pixel = any(p == 1 for p in pixel_ids) if pixel_ids else False
        verdict_pixel = 'copy' if has_pixel else 'genuine'
        # Group K: Statistical threshold (mu+2sigma = 0.944)
        if max_cosine is not None:
            if max_cosine > COSINE_STATISTICAL:
                verdict_stat = 'copy'
            elif max_cosine > KDE_CROSSOVER:
                verdict_stat = 'uncertain'
            else:
                verdict_stat = 'genuine'
        else:
            verdict_stat = None
        # Group L: KDE crossover (0.838)
        if max_cosine is not None:
            if max_cosine > KDE_CROSSOVER:
                verdict_kde = 'above_crossover'
            else:
                verdict_kde = 'below_crossover'
        else:
            verdict_kde = None
        # Group M: Multi-method combined
        overall, confidence, n_copy, n_total = classify_overall(
            max_cosine, max_ssim, min_phash, has_pixel)
        rows.append([
            # A
            pdf_name, year_month, serial_number, doc_type,
            # B
            excel_acct1, excel_acct2, excel_firm,
            # C
            n_sigs, avg_conf,
            # D
            max_cosine, min_cosine, avg_cosine,
            # E
            sig1_verdict, sig2_verdict,
            # F
            acct1_info['name'], acct1_info['risk'], acct1_info['mean_sim'],
            acct1_info['ratio'], acct1_info['count'],
            acct2_info['name'], acct2_info['risk'], acct2_info['mean_sim'],
            acct2_info['ratio'], acct2_info['count'],
            # G
            max_ssim, min_ssim, avg_ssim, verdict_ssim,
            # H
            min_phash, max_phash, avg_phash, verdict_phash,
            # I
            max_histcorr, avg_histcorr,
            # J
            1 if has_pixel else 0, verdict_pixel,
            # K
            verdict_stat,
            # L
            verdict_kde,
            # M
            overall, confidence, n_copy, n_total,
        ])
    # Write CSV
    print(f"\n[3/3] Writing {len(rows)} PDF rows to CSV...")
    with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(columns)
        writer.writerows(rows)
    conn.close()
    # Print summary statistics
    print(f"\n{'='*70}")
    print("SUMMARY")
    print(f"{'='*70}")
    print(f"Total PDFs: {len(rows):,}")
    # Overall verdict distribution (r[-4]/r[-3] are overall_verdict/confidence)
    verdict_counts = defaultdict(int)
    confidence_counts = defaultdict(int)
    for r in rows:
        verdict_counts[r[-4]] += 1
        confidence_counts[r[-3]] += 1
    print(f"\n--- Overall Verdict Distribution ---")
    for v in ['definite_copy', 'very_likely_copy', 'likely_copy', 'uncertain', 'likely_genuine', 'unknown']:
        c = verdict_counts.get(v, 0)
        print(f" {v:20s}: {c:>6,} ({100*c/len(rows):5.1f}%)")
    print(f"\n--- Confidence Level Distribution ---")
    for c_level in ['very_high', 'high', 'medium', 'low', 'none']:
        c = confidence_counts.get(c_level, 0)
        print(f" {c_level:10s}: {c:>6,} ({100*c/len(rows):5.1f}%)")
    # Per-method verdict distribution
    # Column indices: verdict_ssim=27, verdict_phash=31, verdict_pixel=35, verdict_stat=36, verdict_kde=37
    print(f"\n--- Per-Method Verdict Distribution ---")
    for col_idx, method_name in [(27, 'SSIM'), (31, 'pHash'), (35, 'Pixel'), (36, 'Statistical'), (37, 'KDE')]:
        counts = defaultdict(int)
        for r in rows:
            counts[r[col_idx]] += 1
        print(f"\n {method_name}:")
        for k, v in sorted(counts.items(), key=lambda x: -x[1]):
            print(f" {str(k):20s}: {v:>6,} ({100*v/len(rows):5.1f}%)")
    # Cross-method agreement (r[9] = max_cosine_similarity, r[34] = has_pixel_identical)
    print(f"\n--- Method Agreement (cosine>0.95 PDFs) ---")
    cosine_copy = [r for r in rows if r[9] is not None and r[9] > COSINE_THRESHOLD]
    if cosine_copy:
        ssim_agree = sum(1 for r in cosine_copy if r[27] == 'copy')
        phash_agree = sum(1 for r in cosine_copy if r[31] == 'copy')
        pixel_agree = sum(1 for r in cosine_copy if r[34] == 1)
        print(f" PDFs with cosine > 0.95: {len(cosine_copy):,}")
        print(f" Also SSIM > 0.95: {ssim_agree:>6,} ({100*ssim_agree/len(cosine_copy):5.1f}%)")
        print(f" Also pHash = 0: {phash_agree:>6,} ({100*phash_agree/len(cosine_copy):5.1f}%)")
        print(f" Also pixel-identical: {pixel_agree:>4,} ({100*pixel_agree/len(cosine_copy):5.1f}%)")
    print(f"\nOutput: {OUTPUT_CSV}")
    print(f"{'='*70}")


if __name__ == '__main__':
    main()