Add Paper A (IEEE TAI) complete draft with Firm A-calibrated dual-method classification
Paper draft includes all sections (Abstract through Conclusion), 36 references, and supporting scripts. Key methodology: Cosine similarity + dHash dual-method verification with thresholds calibrated against known-replication firm (Firm A). Includes: - 8 section markdown files (paper_a_*.md) - Ablation study script (ResNet-50 vs VGG-16 vs EfficientNet-B0) - Recalibrated classification script (84,386 PDFs, 5-tier system) - Figure generation and Word export scripts - Citation renumbering script ([1]-[36]) - Signature analysis pipeline (12 steps) - YOLO extraction scripts Three rounds of AI review completed (GPT-5.4, Claude Opus 4.6, Gemini 3 Pro). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,246 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Step 1: 建立 SQLite 資料庫,匯入簽名記錄
|
||||
|
||||
從 extraction_results.csv 匯入資料,展開每個圖片為獨立記錄
|
||||
解析圖片檔名填充 year_month, sig_index
|
||||
計算圖片尺寸 width, height
|
||||
"""
|
||||
|
||||
import sqlite3
|
||||
import pandas as pd
|
||||
import cv2
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
from tqdm import tqdm
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
# 路徑配置
|
||||
IMAGES_DIR = Path("/Volumes/NV2/PDF-Processing/yolo-signatures/images")
|
||||
CSV_PATH = Path("/Volumes/NV2/PDF-Processing/yolo-signatures/extraction_results.csv")
|
||||
OUTPUT_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis")
|
||||
DB_PATH = OUTPUT_DIR / "signature_analysis.db"
|
||||
|
||||
|
||||
def parse_image_filename(filename: str) -> dict:
    """Parse a signature-image filename into structured fields.

    Expected pattern: {YYYYMM}_{SERIAL}_{DOCTYPE}_page{PAGE}_sig{N}.png
    Example: 201301_2458_AI1_page4_sig1.png

    Returns a dict with keys year_month, serial_number, doc_type,
    page_number, sig_index; every value is None when the name does not
    match the expected pattern.
    """
    # Strip only a trailing '.png' — str.replace would also delete the
    # substring '.png' occurring anywhere else in the name.
    name = filename[:-4] if filename.endswith('.png') else filename

    # Pattern: {YYYYMM}_{SERIAL}_{DOCTYPE}_page{PAGE}_sig{N}
    match = re.match(r'^(\d{6})_([^_]+)_([^_]+)_page(\d+)_sig(\d+)$', name)
    if not match:
        # Unparseable name: keep the schema stable, fill with None.
        return dict.fromkeys(
            ('year_month', 'serial_number', 'doc_type', 'page_number', 'sig_index')
        )

    year_month, serial, doc_type, page, sig_index = match.groups()
    return {
        'year_month': year_month,
        'serial_number': serial,
        'doc_type': doc_type,
        'page_number': int(page),
        'sig_index': int(sig_index)
    }
|
||||
|
||||
|
||||
def get_image_dimensions(image_path: Path) -> tuple:
    """Return (width, height) of the image, or (None, None) on failure."""
    try:
        loaded = cv2.imread(str(image_path))
        if loaded is None:
            # cv2.imread signals unreadable files by returning None.
            return None, None
        height, width = loaded.shape[:2]
        return width, height
    except Exception:
        return None, None
|
||||
|
||||
|
||||
def process_single_image(args: tuple) -> dict:
    """Build one database record for a single signature image.

    args is a (image_filename, source_pdf, confidence_avg) tuple.
    """
    image_filename, source_pdf, confidence_avg = args

    # Structured fields parsed out of the filename.
    parsed = parse_image_filename(image_filename)

    # Pixel dimensions of the crop on disk.
    width, height = get_image_dimensions(IMAGES_DIR / image_filename)

    return {
        'image_filename': image_filename,
        'source_pdf': source_pdf,
        **{key: parsed[key] for key in
           ('year_month', 'serial_number', 'doc_type', 'page_number', 'sig_index')},
        'detection_confidence': confidence_avg,
        'image_width': width,
        'image_height': height
    }
|
||||
|
||||
|
||||
def create_database():
    """Create the SQLite schema: signatures table plus lookup indexes."""
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()

    # One row per extracted signature image; later steps fill in the
    # accountant/feature/cluster columns.
    cur.execute('''
        CREATE TABLE IF NOT EXISTS signatures (
            signature_id INTEGER PRIMARY KEY AUTOINCREMENT,
            image_filename TEXT UNIQUE NOT NULL,
            source_pdf TEXT NOT NULL,
            year_month TEXT,
            serial_number TEXT,
            doc_type TEXT,
            page_number INTEGER,
            sig_index INTEGER,
            detection_confidence REAL,
            image_width INTEGER,
            image_height INTEGER,
            accountant_name TEXT,
            accountant_id INTEGER,
            feature_vector BLOB,
            cluster_id INTEGER,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    ''')

    # Lookup indexes for the query patterns used downstream.
    for index_sql in (
        'CREATE INDEX IF NOT EXISTS idx_source_pdf ON signatures(source_pdf)',
        'CREATE INDEX IF NOT EXISTS idx_year_month ON signatures(year_month)',
        'CREATE INDEX IF NOT EXISTS idx_accountant_id ON signatures(accountant_id)',
    ):
        cur.execute(index_sql)

    conn.commit()
    conn.close()

    print(f"資料庫已建立: {DB_PATH}")
|
||||
|
||||
|
||||
def expand_csv_to_records(csv_path: Path) -> list:
    """Expand the extraction CSV into one record per signature image.

    CSV columns: filename,page,num_signatures,confidence_avg,image_files.
    image_files is comma-separated; each entry becomes a
    (image_filename, source_pdf, confidence_avg) tuple.
    """
    frame = pd.read_csv(csv_path)

    expanded = []
    for _, row in frame.iterrows():
        image_files_str = row['image_files']
        if not pd.notna(image_files_str):
            # Pages with no extracted images have NaN here — skip them.
            continue
        pdf_name = row['filename']
        confidence = row['confidence_avg']
        expanded.extend(
            (name.strip(), pdf_name, confidence)
            for name in image_files_str.split(',')
        )

    return expanded
|
||||
|
||||
|
||||
def import_data():
    """Import the expanded CSV records into the signatures table.

    Reads each image's dimensions with a thread pool, bulk-inserts all
    rows, then prints summary statistics queried from the database.
    """
    print("讀取 CSV 並展開記錄...")
    records = expand_csv_to_records(CSV_PATH)
    print(f"共 {len(records)} 張簽名圖片待處理")

    print("處理圖片資訊(讀取尺寸)...")
    processed_records = []

    # Thread pool speeds up reading image dimensions (I/O-bound work).
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = {executor.submit(process_single_image, r): r for r in records}

        # as_completed means results arrive out of order; row order in the
        # table is not significant.
        for future in tqdm(as_completed(futures), total=len(records), desc="處理圖片"):
            result = future.result()
            processed_records.append(result)

    print("寫入資料庫...")
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()

    # Batch insert; OR IGNORE skips duplicates on the UNIQUE image_filename.
    insert_sql = '''
        INSERT OR IGNORE INTO signatures (
            image_filename, source_pdf, year_month, serial_number, doc_type,
            page_number, sig_index, detection_confidence, image_width, image_height
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    '''

    batch_data = [
        (
            r['image_filename'], r['source_pdf'], r['year_month'], r['serial_number'],
            r['doc_type'], r['page_number'], r['sig_index'], r['detection_confidence'],
            r['image_width'], r['image_height']
        )
        for r in processed_records
    ]

    cursor.executemany(insert_sql, batch_data)
    conn.commit()

    # Summary statistics for the console report below.
    cursor.execute('SELECT COUNT(*) FROM signatures')
    total = cursor.fetchone()[0]

    cursor.execute('SELECT COUNT(DISTINCT source_pdf) FROM signatures')
    pdf_count = cursor.fetchone()[0]

    cursor.execute('SELECT COUNT(DISTINCT year_month) FROM signatures')
    period_count = cursor.fetchone()[0]

    cursor.execute('SELECT MIN(year_month), MAX(year_month) FROM signatures')
    min_date, max_date = cursor.fetchone()

    conn.close()

    print("\n" + "=" * 50)
    print("資料庫建立完成")
    print("=" * 50)
    print(f"簽名總數: {total:,}")
    print(f"PDF 檔案數: {pdf_count:,}")
    print(f"時間範圍: {min_date} ~ {max_date} ({period_count} 個月)")
    print(f"資料庫位置: {DB_PATH}")
|
||||
|
||||
|
||||
def main():
    """Entry point for Step 1: validate inputs, build schema, import data."""
    banner = "=" * 50
    print(banner)
    print("Step 1: 建立簽名分析資料庫")
    print(banner)

    # Bail out early when either source artifact is missing.
    if not CSV_PATH.exists():
        print(f"錯誤: 找不到 CSV 檔案 {CSV_PATH}")
        return
    if not IMAGES_DIR.exists():
        print(f"錯誤: 找不到圖片目錄 {IMAGES_DIR}")
        return

    create_database()
    import_data()


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,241 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Step 2: 使用 ResNet-50 提取簽名圖片的特徵向量
|
||||
|
||||
預處理流程:
|
||||
1. 載入圖片 (RGB)
|
||||
2. 縮放至 224x224(保持比例,填充白色)
|
||||
3. 正規化 (ImageNet mean/std)
|
||||
4. 通過 ResNet-50 (去掉最後分類層)
|
||||
5. L2 正規化
|
||||
6. 輸出 2048 維特徵向量
|
||||
"""
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torchvision.models as models
|
||||
import torchvision.transforms as transforms
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
import numpy as np
|
||||
import cv2
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from tqdm import tqdm
|
||||
import warnings
|
||||
warnings.filterwarnings('ignore')
|
||||
|
||||
# 路徑配置
|
||||
IMAGES_DIR = Path("/Volumes/NV2/PDF-Processing/yolo-signatures/images")
|
||||
OUTPUT_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis")
|
||||
DB_PATH = OUTPUT_DIR / "signature_analysis.db"
|
||||
FEATURES_PATH = OUTPUT_DIR / "features"
|
||||
|
||||
# 模型配置
|
||||
BATCH_SIZE = 64
|
||||
NUM_WORKERS = 4
|
||||
DEVICE = torch.device("mps" if torch.backends.mps.is_available() else
|
||||
"cuda" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
|
||||
class SignatureDataset(Dataset):
    """Dataset of signature crops; yields (image, filename) pairs."""

    def __init__(self, image_paths: list, transform=None):
        self.image_paths = image_paths
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        path = self.image_paths[idx]

        raw = cv2.imread(str(path))
        if raw is None:
            # Unreadable file: substitute a plain white 224x224 image so
            # the batch keeps its shape instead of crashing the loader.
            raw = np.full((224, 224, 3), 255, dtype=np.uint8)
        else:
            raw = cv2.cvtColor(raw, cv2.COLOR_BGR2RGB)

        # Letterbox to 224x224 (aspect ratio kept, white padding).
        raw = self.resize_with_padding(raw, 224, 224)

        if self.transform:
            raw = self.transform(raw)

        return raw, str(path.name)

    @staticmethod
    def resize_with_padding(img, target_w, target_h):
        """Scale img to fit the target box, centred on a white canvas."""
        h, w = img.shape[:2]

        # Uniform scale so the longer side just fits.
        ratio = min(target_w / w, target_h / h)
        scaled_w = int(w * ratio)
        scaled_h = int(h * ratio)
        scaled = cv2.resize(img, (scaled_w, scaled_h), interpolation=cv2.INTER_AREA)

        # White canvas, image pasted in the centre.
        canvas = np.full((target_h, target_w, 3), 255, dtype=np.uint8)
        left = (target_w - scaled_w) // 2
        top = (target_h - scaled_h) // 2
        canvas[top:top + scaled_h, left:left + scaled_w] = scaled

        return canvas
|
||||
|
||||
|
||||
class FeatureExtractor:
    """Headless ResNet-50 producing L2-normalised 2048-d embeddings."""

    def __init__(self, device):
        self.device = device

        print(f"載入 ResNet-50 模型... (device: {device})")
        backbone = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)

        # Drop the final fc layer; keep everything up to global pooling.
        self.model = nn.Sequential(*list(backbone.children())[:-1]).to(device)
        self.model.eval()

        # Standard ImageNet input normalisation.
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225]
            )
        ])

    @torch.no_grad()
    def extract_batch(self, images):
        """Embed one batch; returns a [B, 2048] numpy array of unit vectors."""
        out = self.model(images.to(self.device))
        out = out.squeeze(-1).squeeze(-1)  # [B, 2048, 1, 1] -> [B, 2048]

        # Unit-normalise so dot products equal cosine similarity.
        out = nn.functional.normalize(out, p=2, dim=1)

        return out.cpu().numpy()
|
||||
|
||||
|
||||
def get_image_list_from_db():
    """Return every image_filename from the signatures table, in id order."""
    conn = sqlite3.connect(DB_PATH)
    rows = conn.execute(
        'SELECT image_filename FROM signatures ORDER BY signature_id'
    ).fetchall()
    conn.close()
    return [name for (name,) in rows]
|
||||
|
||||
|
||||
def save_features_to_db(features_dict: dict):
    """Store each feature vector as raw bytes on its signatures row."""
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()

    update_sql = '''
            UPDATE signatures
            SET feature_vector = ?
            WHERE image_filename = ?
        '''
    for filename, feature in tqdm(features_dict.items(), desc="寫入資料庫"):
        # tobytes() serialises the numpy vector for the BLOB column.
        cur.execute(update_sql, (feature.tobytes(), filename))

    conn.commit()
    conn.close()
|
||||
|
||||
|
||||
def main():
    """Extract ResNet-50 embeddings for every signature in the database.

    Saves the feature matrix as .npy, the filename order as .txt, and
    writes each vector back into the signatures table.
    """
    print("=" * 60)
    print("Step 2: ResNet-50 特徵向量提取")
    print("=" * 60)
    print(f"裝置: {DEVICE}")

    # Ensure the output directory exists.
    FEATURES_PATH.mkdir(parents=True, exist_ok=True)

    # Fetch the image list from the database (signature_id order).
    print("從資料庫讀取圖片列表...")
    filenames = get_image_list_from_db()
    print(f"共 {len(filenames):,} 張圖片待處理")

    # Build absolute image paths.
    image_paths = [IMAGES_DIR / f for f in filenames]

    # Initialise the feature extractor.
    extractor = FeatureExtractor(DEVICE)

    # Dataset + loader; shuffle=False keeps output aligned with filenames.
    dataset = SignatureDataset(image_paths, transform=extractor.transform)
    dataloader = DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=NUM_WORKERS,
        pin_memory=True
    )

    # Extract features batch by batch.
    print(f"\n開始提取特徵 (batch_size={BATCH_SIZE})...")
    all_features = []
    all_filenames = []

    for images, batch_filenames in tqdm(dataloader, desc="提取特徵"):
        features = extractor.extract_batch(images)
        all_features.append(features)
        all_filenames.extend(batch_filenames)

    # Stack into a single [N, 2048] matrix.
    all_features = np.vstack(all_features)
    print(f"\n特徵矩陣形狀: {all_features.shape}")

    # Save as a .npy file (backup copy).
    npy_path = FEATURES_PATH / "signature_features.npy"
    np.save(npy_path, all_features)
    print(f"特徵向量已儲存: {npy_path} ({all_features.nbytes / 1e9:.2f} GB)")

    # Save the filename order (needed to index matrix rows later).
    filenames_path = FEATURES_PATH / "signature_filenames.txt"
    with open(filenames_path, 'w') as f:
        for fn in all_filenames:
            f.write(fn + '\n')
    print(f"檔名列表已儲存: {filenames_path}")

    # Write each vector back onto its database row.
    print("\n更新資料庫中的特徵向量...")
    features_dict = dict(zip(all_filenames, all_features))
    save_features_to_db(features_dict)

    # Summary statistics.
    print("\n" + "=" * 60)
    print("特徵提取完成")
    print("=" * 60)
    print(f"處理圖片數: {len(all_filenames):,}")
    print(f"特徵維度: {all_features.shape[1]}")
    print(f"特徵檔案: {npy_path}")
    print(f"檔案大小: {all_features.nbytes / 1e9:.2f} GB")

    # Quick sanity statistics on the embeddings.
    print("\n特徵統計:")
    print(f"  平均值: {all_features.mean():.6f}")
    print(f"  標準差: {all_features.std():.6f}")
    print(f"  最小值: {all_features.min():.6f}")
    print(f"  最大值: {all_features.max():.6f}")

    # After normalize(p=2) every row norm should be 1.0.
    norms = np.linalg.norm(all_features, axis=1)
    print(f"  L2 norm: {norms.mean():.6f} ± {norms.std():.6f}")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,368 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Step 3: 相似度分布探索
|
||||
|
||||
1. 隨機抽樣 100,000 對簽名
|
||||
2. 計算 cosine similarity
|
||||
3. 繪製直方圖分布
|
||||
4. 找出高相似度對 (>0.95)
|
||||
5. 分析高相似度對的來源
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
from pathlib import Path
|
||||
from tqdm import tqdm
|
||||
import random
|
||||
from collections import defaultdict
|
||||
import json
|
||||
|
||||
# 路徑配置
|
||||
OUTPUT_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis")
|
||||
FEATURES_PATH = OUTPUT_DIR / "features" / "signature_features.npy"
|
||||
FILENAMES_PATH = OUTPUT_DIR / "features" / "signature_filenames.txt"
|
||||
REPORTS_PATH = OUTPUT_DIR / "reports"
|
||||
|
||||
# 分析配置
|
||||
NUM_RANDOM_PAIRS = 100000
|
||||
HIGH_SIMILARITY_THRESHOLD = 0.95
|
||||
VERY_HIGH_SIMILARITY_THRESHOLD = 0.99
|
||||
|
||||
|
||||
def load_data():
    """Load the feature matrix and the parallel filename list from disk."""
    print("載入特徵向量...")
    features = np.load(FEATURES_PATH)
    print(f"特徵矩陣形狀: {features.shape}")

    print("載入檔名列表...")
    with open(FILENAMES_PATH, 'r') as handle:
        filenames = [line.strip() for line in handle]
    print(f"檔名數量: {len(filenames)}")

    return features, filenames
|
||||
|
||||
|
||||
def parse_filename(filename: str) -> dict:
    """Parse a signature filename into its components.

    Expected: {YYYYMM}_{SERIAL}_{DOCTYPE}_page{P}_sig{N}.png,
    e.g. 201301_2458_AI1_page4_sig1.png. Falls back to {'raw': filename}
    when the name does not have at least five '_'-separated parts.
    """
    # Strip only a trailing '.png' — str.replace would also delete the
    # substring '.png' appearing mid-name.
    stem = filename[:-4] if filename.endswith('.png') else filename
    parts = stem.split('_')
    if len(parts) >= 5:
        return {
            'year_month': parts[0],
            'serial': parts[1],
            'doc_type': parts[2],
            'page': parts[3].replace('page', ''),
            'sig_index': parts[4].replace('sig', '')
        }
    return {'raw': filename}
|
||||
|
||||
|
||||
def cosine_similarity(v1, v2):
    """Inner product of two vectors; equals cosine similarity because
    the feature vectors are already L2-normalised."""
    return np.inner(v1, v2)
|
||||
|
||||
|
||||
def random_sampling_analysis(features, filenames, n_pairs=100000):
    """Sample random signature pairs and compute their cosine similarities.

    Returns (similarities array, list of sampled (i, j) index pairs).
    """
    print(f"\n隨機抽樣 {n_pairs:,} 對簽名...")

    total = len(filenames)
    sims = []
    sampled_pairs = []

    # Draw distinct index pairs uniformly at random.
    for _ in tqdm(range(n_pairs), desc="計算相似度"):
        a, b = random.sample(range(total), 2)
        sims.append(cosine_similarity(features[a], features[b]))
        sampled_pairs.append((a, b))

    return np.array(sims), sampled_pairs
|
||||
|
||||
|
||||
def find_high_similarity_pairs(features, filenames, threshold=0.95, sample_size=100000):
    """Randomly sample pairs and keep those whose similarity exceeds threshold."""
    print(f"\n搜尋相似度 > {threshold} 的簽名對...")

    total = len(filenames)
    hits = []

    # Exhaustive comparison is infeasible (n^2 ≈ 33 billion pairs),
    # so a random-sampling strategy is used instead.
    for _ in tqdm(range(sample_size), desc="搜尋高相似度"):
        a, b = random.sample(range(total), 2)
        score = cosine_similarity(features[a], features[b])
        if score > threshold:
            hits.append({
                'idx1': a,
                'idx2': b,
                'file1': filenames[a],
                'file2': filenames[b],
                'similarity': float(score),
                'parsed1': parse_filename(filenames[a]),
                'parsed2': parse_filename(filenames[b])
            })

    return hits
|
||||
|
||||
|
||||
def systematic_high_similarity_search(features, filenames, threshold=0.95, batch_size=1000):
    """
    More systematic search: for a random subset of query signatures,
    find every other signature whose similarity exceeds the threshold.
    """
    print(f"\n系統化搜尋高相似度對 (threshold={threshold})...")
    print("這會對每個簽名找出最相似的候選...")

    total = len(filenames)
    found = []
    emitted = set()

    # Query with a random subset of at most 5,000 signatures.
    queries = random.sample(range(total), min(5000, total))

    for query in tqdm(queries, desc="搜尋"):
        # One matrix-vector product gives this query's similarity to
        # every signature at once (vectors are unit-normalised).
        scores = features @ features[query]

        # All candidates above the threshold (the query itself included).
        for cand in np.where(scores > threshold)[0]:
            if cand == query:
                continue  # skip the self-match
            key = tuple(sorted([query, int(cand)]))
            if key in emitted:
                continue  # already reported from the other endpoint
            emitted.add(key)
            found.append({
                'idx1': int(query),
                'idx2': int(cand),
                'file1': filenames[query],
                'file2': filenames[int(cand)],
                'similarity': float(scores[cand]),
                'parsed1': parse_filename(filenames[query]),
                'parsed2': parse_filename(filenames[int(cand)])
            })

    return found
|
||||
|
||||
|
||||
def analyze_high_similarity_sources(high_sim_pairs):
    """Categorise each high-similarity pair by what its two sources share.

    Buckets (first match wins): same PDF, same filing month, same
    document type, or entirely different documents.
    """
    print("\n分析高相似度對的來源...")

    stats = {
        'same_pdf': 0,
        'same_year_month': 0,
        'same_doc_type': 0,
        'different_everything': 0,
        'total': len(high_sim_pairs)
    }

    for pair in high_sim_pairs:
        first = pair.get('parsed1', {})
        second = pair.get('parsed2', {})

        same_month = first.get('year_month') == second.get('year_month')
        same_serial = first.get('serial') == second.get('serial')
        same_type = first.get('doc_type') == second.get('doc_type')

        if same_month and same_serial and same_type:
            stats['same_pdf'] += 1            # identical source document
        elif same_month:
            stats['same_year_month'] += 1     # same filing month
        elif same_type:
            stats['same_doc_type'] += 1       # same document type
        else:
            stats['different_everything'] += 1

    return stats
|
||||
|
||||
|
||||
def plot_similarity_distribution(similarities, output_path):
    """Plot the sampled similarity distribution: full range plus a
    zoomed-in panel for the high-similarity region (> 0.8)."""
    print("\n繪製分布圖...")

    try:
        # Convert to a plain Python list to sidestep numpy/matplotlib
        # interop issues (per the original author's note).
        sim_list = similarities.tolist()

        fig, axes = plt.subplots(1, 2, figsize=(14, 5))

        # Left panel: full distribution, 100 bins over the observed range.
        ax1 = axes[0]
        ax1.hist(sim_list, bins=np.linspace(min(sim_list), max(sim_list), 101).tolist(),
                 density=True, alpha=0.7, color='steelblue', edgecolor='white')
        ax1.axvline(x=0.95, color='red', linestyle='--', label='0.95 threshold')
        ax1.axvline(x=0.99, color='darkred', linestyle='--', label='0.99 threshold')
        ax1.set_xlabel('Cosine Similarity', fontsize=12)
        ax1.set_ylabel('Density', fontsize=12)
        ax1.set_title('Signature Similarity Distribution (Random Sampling)', fontsize=14)
        ax1.legend()

        # Annotate mean / std in the corner.
        mean_sim = float(np.mean(similarities))
        std_sim = float(np.std(similarities))
        ax1.annotate(f'Mean: {mean_sim:.4f}\nStd: {std_sim:.4f}',
                     xy=(0.02, 0.95), xycoords='axes fraction',
                     fontsize=10, verticalalignment='top',
                     bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

        # Right panel: zoom into the high-similarity region (> 0.8).
        ax2 = axes[1]
        high_sim_list = [x for x in sim_list if x > 0.8]
        if len(high_sim_list) > 0:
            # Guarded: max() on an empty list would raise.
            ax2.hist(high_sim_list, bins=np.linspace(0.8, max(high_sim_list), 51).tolist(),
                     density=True, alpha=0.7, color='coral', edgecolor='white')
        ax2.axvline(x=0.95, color='red', linestyle='--', label='0.95 threshold')
        ax2.axvline(x=0.99, color='darkred', linestyle='--', label='0.99 threshold')
        ax2.set_xlabel('Cosine Similarity', fontsize=12)
        ax2.set_ylabel('Density', fontsize=12)
        ax2.set_title('High Similarity Region (> 0.8)', fontsize=14)
        ax2.legend()

        # Percentage of sampled pairs above each headline threshold.
        pct_95 = int((similarities > 0.95).sum()) / len(similarities) * 100
        pct_99 = int((similarities > 0.99).sum()) / len(similarities) * 100
        ax2.annotate(f'> 0.95: {pct_95:.4f}%\n> 0.99: {pct_99:.4f}%',
                     xy=(0.98, 0.95), xycoords='axes fraction',
                     fontsize=10, verticalalignment='top', horizontalalignment='right',
                     bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

        plt.tight_layout()
        plt.savefig(output_path, dpi=150, bbox_inches='tight')
        plt.close()

        print(f"分布圖已儲存: {output_path}")
    except Exception as e:
        # Deliberate best-effort: plotting failures must not abort the
        # rest of the analysis pipeline.
        print(f"繪圖失敗: {e}")
        print("跳過繪圖,繼續其他分析...")
|
||||
|
||||
|
||||
def generate_statistics_report(similarities, high_sim_pairs, source_stats, output_path):
    """Write the similarity statistics JSON report and return it as a dict."""

    def pct(q):
        # Percentile of the sampled similarity distribution, as a float.
        return float(np.percentile(similarities, q))

    report = {
        'random_sampling': {
            'n_pairs': len(similarities),
            'mean': float(np.mean(similarities)),
            'std': float(np.std(similarities)),
            'min': float(np.min(similarities)),
            'max': float(np.max(similarities)),
            'percentiles': {f'{q}%': pct(q) for q in (25, 50, 75, 90, 95, 99)},
            'above_thresholds': {
                f'>{t:.2f}': int((similarities > t).sum()) for t in (0.90, 0.95, 0.99)
            }
        },
        'high_similarity_search': {
            'threshold': HIGH_SIMILARITY_THRESHOLD,
            'pairs_found': len(high_sim_pairs),
            'source_analysis': source_stats,
            'top_10_pairs': sorted(high_sim_pairs, key=lambda x: x['similarity'], reverse=True)[:10]
        }
    }

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)

    print(f"統計報告已儲存: {output_path}")
    return report
|
||||
|
||||
|
||||
def print_summary(report):
    """Print a human-readable console summary of the statistics report.

    Expects the dict produced by generate_statistics_report.
    """
    print("\n" + "=" * 70)
    print("相似度分布分析摘要")
    print("=" * 70)

    # Random-sampling section.
    rs = report['random_sampling']
    print(f"\n隨機抽樣統計 ({rs['n_pairs']:,} 對):")
    print(f"  平均相似度: {rs['mean']:.4f}")
    print(f"  標準差: {rs['std']:.4f}")
    print(f"  範圍: [{rs['min']:.4f}, {rs['max']:.4f}]")
    print(f"\n百分位數:")
    for k, v in rs['percentiles'].items():
        print(f"  {k}: {v:.4f}")

    print(f"\n高相似度對數量:")
    for k, v in rs['above_thresholds'].items():
        pct = v / rs['n_pairs'] * 100
        print(f"  {k}: {v:,} ({pct:.4f}%)")

    # Systematic-search section.
    hs = report['high_similarity_search']
    print(f"\n系統化搜尋結果 (threshold={hs['threshold']}):")
    print(f"  發現高相似度對: {hs['pairs_found']:,}")

    if hs['source_analysis']['total'] > 0:
        sa = hs['source_analysis']
        print(f"\n來源分析:")
        print(f"  同一 PDF: {sa['same_pdf']} ({sa['same_pdf']/sa['total']*100:.1f}%)")
        print(f"  同月份: {sa['same_year_month']} ({sa['same_year_month']/sa['total']*100:.1f}%)")
        print(f"  同類型: {sa['same_doc_type']} ({sa['same_doc_type']/sa['total']*100:.1f}%)")
        print(f"  完全不同: {sa['different_everything']} ({sa['different_everything']/sa['total']*100:.1f}%)")

    if hs['top_10_pairs']:
        print(f"\nTop 10 高相似度對:")
        for i, pair in enumerate(hs['top_10_pairs'], 1):
            print(f"  {i}. {pair['similarity']:.4f}")
            print(f"     {pair['file1']}")
            print(f"     {pair['file2']}")
|
||||
|
||||
|
||||
def main():
    """Run the Step-3 pipeline: sample, plot, search, report, summarise."""
    print("=" * 70)
    print("Step 3: 相似度分布探索")
    print("=" * 70)

    # Ensure the reports directory exists.
    REPORTS_PATH.mkdir(parents=True, exist_ok=True)

    # Load features and the matching filename list.
    features, filenames = load_data()

    # Random-pair sampling analysis.
    similarities, pair_indices = random_sampling_analysis(features, filenames, NUM_RANDOM_PAIRS)

    # Plot the distribution (best-effort; failures are swallowed inside).
    plot_similarity_distribution(
        similarities,
        REPORTS_PATH / "similarity_distribution.png"
    )

    # Systematic high-similarity search over a query subset.
    high_sim_pairs = systematic_high_similarity_search(
        features, filenames,
        threshold=HIGH_SIMILARITY_THRESHOLD
    )

    # Categorise where each high-similarity pair comes from.
    source_stats = analyze_high_similarity_sources(high_sim_pairs)

    # Generate the JSON statistics report.
    report = generate_statistics_report(
        similarities, high_sim_pairs, source_stats,
        REPORTS_PATH / "similarity_statistics.json"
    )

    # Persist the full list of high-similarity pairs.
    high_sim_output = REPORTS_PATH / "high_similarity_pairs.json"
    with open(high_sim_output, 'w', encoding='utf-8') as f:
        json.dump(high_sim_pairs, f, indent=2, ensure_ascii=False)
    print(f"高相似度對列表已儲存: {high_sim_output}")

    # Console summary.
    print_summary(report)


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,274 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Step 4: 生成高相似度案例的視覺化報告
|
||||
|
||||
讀取 high_similarity_pairs.json
|
||||
為 Top N 高相似度對生成並排對比圖
|
||||
生成 HTML 報告
|
||||
"""
|
||||
|
||||
import json
|
||||
import cv2
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
from tqdm import tqdm
|
||||
import base64
|
||||
from io import BytesIO
|
||||
|
||||
# 路徑配置
|
||||
IMAGES_DIR = Path("/Volumes/NV2/PDF-Processing/yolo-signatures/images")
|
||||
REPORTS_PATH = Path("/Volumes/NV2/PDF-Processing/signature-analysis/reports")
|
||||
HIGH_SIM_JSON = REPORTS_PATH / "high_similarity_pairs.json"
|
||||
|
||||
# 報告配置
|
||||
TOP_N = 100 # 顯示前 N 對
|
||||
|
||||
|
||||
def load_image(filename: str) -> np.ndarray:
    """Load a signature image; returns a blank placeholder when unreadable."""
    img = cv2.imread(str(IMAGES_DIR / filename))
    if img is None:
        # White placeholder so report generation never crashes on a
        # missing or corrupt file.
        return np.full((100, 200, 3), 255, dtype=np.uint8)
    return img
|
||||
|
||||
|
||||
def create_comparison_image(file1: str, file2: str, similarity: float) -> np.ndarray:
    """Return the two signatures side by side, separated by a grey bar.

    `similarity` is accepted for interface compatibility; it is not drawn.
    """
    left = load_image(file1)
    right = load_image(file2)

    # Normalise both crops to a common height (at least 100 px).
    row_h = max(left.shape[0], right.shape[0], 100)

    def _to_height(img):
        # Scale to row_h keeping aspect ratio; no-op when already there.
        h, w = img.shape[:2]
        if h == row_h:
            return img
        factor = row_h / h
        return cv2.resize(img, (int(w * factor), row_h))

    left = _to_height(left)
    right = _to_height(right)

    # Light-grey vertical divider between the two crops.
    divider = np.full((row_h, 20, 3), 200, dtype=np.uint8)

    return np.hstack([left, divider, right])
|
||||
|
||||
|
||||
def image_to_base64(img: np.ndarray) -> str:
    """Encode an image as PNG and return the bytes as a base64 ASCII string."""
    encoded = cv2.imencode('.png', img)[1]
    return base64.b64encode(encoded).decode('utf-8')
|
||||
|
||||
|
||||
def generate_html_report(pairs: list, output_path: Path):
    """Render the Top-N high-similarity pairs into a standalone HTML report.

    Each side-by-side comparison image is embedded as base64, so the
    output is a single self-contained file.
    """
    # Accumulate fragments and join once at the end: repeated `+=` on a
    # growing string is O(n^2) in the number of pair cards.
    fragments = ["""
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>簽名相似度分析報告 - 高相似度案例</title>
<style>
body {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
max-width: 1400px;
margin: 0 auto;
padding: 20px;
background-color: #f5f5f5;
}
h1 {
color: #333;
text-align: center;
border-bottom: 2px solid #666;
padding-bottom: 10px;
}
.summary {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 20px;
border-radius: 10px;
margin-bottom: 30px;
}
.summary h2 {
margin-top: 0;
}
.pair-card {
background: white;
border-radius: 10px;
padding: 20px;
margin-bottom: 20px;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
}
.pair-header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 15px;
padding-bottom: 10px;
border-bottom: 1px solid #eee;
}
.pair-number {
font-size: 1.2em;
font-weight: bold;
color: #333;
}
.similarity-badge {
background: #dc3545;
color: white;
padding: 5px 15px;
border-radius: 20px;
font-weight: bold;
}
.similarity-badge.high {
background: #dc3545;
}
.similarity-badge.very-high {
background: #8b0000;
}
.file-info {
font-family: monospace;
font-size: 0.9em;
color: #666;
margin-bottom: 10px;
}
.comparison-image {
max-width: 100%;
border: 1px solid #ddd;
border-radius: 5px;
}
.analysis {
margin-top: 15px;
padding: 10px;
background: #f8f9fa;
border-radius: 5px;
font-size: 0.9em;
}
.tag {
display: inline-block;
padding: 2px 8px;
border-radius: 3px;
margin-right: 5px;
font-size: 0.8em;
}
.tag-same-serial { background: #ffebee; color: #c62828; }
.tag-same-month { background: #fff3e0; color: #e65100; }
.tag-diff { background: #e8f5e9; color: #2e7d32; }
</style>
</head>
<body>
<h1>簽名相似度分析報告 - 高相似度案例</h1>

<div class="summary">
<h2>摘要</h2>
<p><strong>分析結果:</strong>發現 659,111 對高相似度簽名 (>0.95)</p>
<p><strong>本報告顯示:</strong>Top """ + str(TOP_N) + """ 最高相似度案例</p>
<p><strong>結論:</strong>存在大量相似度接近或等於 1.0 的簽名對,強烈暗示「複製貼上」行為</p>
</div>

<div class="pairs-container">
"""]

    for i, pair in enumerate(pairs[:TOP_N], 1):
        sim = pair['similarity']
        file1 = pair['file1']
        file2 = pair['file2']
        p1 = pair.get('parsed1', {})
        p2 = pair.get('parsed2', {})

        # Relationship tags between the two source documents.
        # (Stored as plain strings — the old code kept useless 2-tuples.)
        tags = []
        if p1.get('serial') == p2.get('serial'):
            tags.append('<span class="tag tag-same-serial">同序號</span>')
        if p1.get('year_month') == p2.get('year_month'):
            tags.append('<span class="tag tag-same-month">同月份</span>')
        if p1.get('year_month') != p2.get('year_month') and p1.get('serial') != p2.get('serial'):
            tags.append('<span class="tag tag-diff">完全不同文件</span>')

        badge_class = 'very-high' if sim >= 0.99 else 'high'

        # Build the side-by-side image; fall back to an inline error note
        # so one broken image does not abort the whole report.
        try:
            comparison_img = create_comparison_image(file1, file2, sim)
            img_base64 = image_to_base64(comparison_img)
            img_html = f'<img src="data:image/png;base64,{img_base64}" class="comparison-image">'
        except Exception as e:
            img_html = f'<p style="color:red">無法載入圖片: {e}</p>'

        tag_html = ''.join(tags)

        fragments.append(f"""
<div class="pair-card">
<div class="pair-header">
<span class="pair-number">#{i}</span>
<span class="similarity-badge {badge_class}">相似度: {sim:.4f}</span>
</div>
<div class="file-info">
<strong>簽名 1:</strong> {file1}<br>
<strong>簽名 2:</strong> {file2}
</div>
{img_html}
<div class="analysis">
{tag_html}
<br><small>日期: {p1.get('year_month', 'N/A')} vs {p2.get('year_month', 'N/A')} |
序號: {p1.get('serial', 'N/A')} vs {p2.get('serial', 'N/A')}</small>
</div>
</div>
""")

    fragments.append("""
</div>

<div style="text-align: center; margin-top: 30px; color: #666;">
<p>生成時間: 2024 | 簽名真實性研究計劃</p>
</div>
</body>
</html>
""")

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(''.join(fragments))

    print(f"HTML 報告已儲存: {output_path}")
|
||||
|
||||
|
||||
def main():
    """Entry point for Step 4: build the high-similarity visual report."""
    banner = "=" * 60
    print(banner)
    print("Step 4: 生成高相似度案例視覺化報告")
    print(banner)

    # Load the precomputed high-similarity pairs from JSON.
    print("載入高相似度對資料...")
    with open(HIGH_SIM_JSON, 'r', encoding='utf-8') as fh:
        pairs = json.load(fh)

    print(f"共 {len(pairs):,} 對高相似度簽名")

    # Most similar pairs first.
    ranked = sorted(pairs, key=lambda p: p['similarity'], reverse=True)

    # Tally how many pairs clear each similarity threshold.
    sim_1 = sum(1 for p in ranked if p['similarity'] >= 0.9999)
    sim_99 = sum(1 for p in ranked if p['similarity'] >= 0.99)
    sim_97 = sum(1 for p in ranked if p['similarity'] >= 0.97)

    print(f"\n相似度統計:")
    print(f"  = 1.0 (完全相同): {sim_1:,}")
    print(f"  >= 0.99: {sim_99:,}")
    print(f"  >= 0.97: {sim_97:,}")

    # Render the Top-N HTML report.
    print(f"\n生成 Top {TOP_N} 視覺化報告...")
    generate_html_report(ranked, REPORTS_PATH / "high_similarity_report.html")

    print("\n完成!")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,432 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Step 5: 從 PDF 提取會計師印刷姓名
|
||||
|
||||
流程:
|
||||
1. 從資料庫讀取簽名記錄,按 (PDF, page) 分組
|
||||
2. 對每個頁面重新執行 YOLO 獲取簽名框座標
|
||||
3. 對整頁執行 PaddleOCR 提取印刷文字
|
||||
4. 過濾出候選姓名(2-4 個中文字)
|
||||
5. 配對簽名與最近的印刷姓名
|
||||
6. 更新資料庫的 accountant_name 欄位
|
||||
"""
|
||||
|
||||
import sqlite3
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Optional, List, Dict, Tuple
|
||||
from collections import defaultdict
|
||||
from tqdm import tqdm
|
||||
import numpy as np
|
||||
import cv2
|
||||
import fitz # PyMuPDF
|
||||
|
||||
# 加入父目錄到路徑以便匯入
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
from paddleocr_client import PaddleOCRClient
|
||||
|
||||
# 路徑配置
|
||||
PDF_BASE = Path("/Volumes/NV2/PDF-Processing/total-pdf")
|
||||
YOLO_MODEL_PATH = Path("/Volumes/NV2/pdf_recognize/models/best.pt")
|
||||
DB_PATH = Path("/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db")
|
||||
REPORTS_PATH = Path("/Volumes/NV2/PDF-Processing/signature-analysis/reports")
|
||||
|
||||
# 處理配置
|
||||
DPI = 150
|
||||
CONFIDENCE_THRESHOLD = 0.5
|
||||
NAME_SEARCH_MARGIN = 200 # 簽名框周圍搜索姓名的像素範圍
|
||||
PROGRESS_SAVE_INTERVAL = 100 # 每處理 N 個頁面保存一次進度
|
||||
|
||||
# 中文姓名正則
|
||||
CHINESE_NAME_PATTERN = re.compile(r'^[\u4e00-\u9fff]{2,4}$')
|
||||
|
||||
|
||||
def find_pdf_file(filename: str) -> Optional[str]:
    """Locate *filename* under PDF_BASE.

    batch_* subdirectories are searched first (in sorted order), then the
    top-level directory. Returns the path as a string, or None if absent.
    """
    candidates = [d / filename for d in sorted(PDF_BASE.glob("batch_*"))]
    candidates.append(PDF_BASE / filename)
    for candidate in candidates:
        if candidate.exists():
            return str(candidate)
    return None
|
||||
|
||||
|
||||
def render_pdf_page(pdf_path: str, page_num: int) -> Optional[np.ndarray]:
    """Render one PDF page to a uint8 numpy image at DPI.

    Args:
        pdf_path: path to the PDF file.
        page_num: 1-based page number.

    Returns:
        HxWxC uint8 array, or None if the page is out of range or
        rendering fails (the failure is logged to stdout).
    """
    doc = None
    try:
        doc = fitz.open(pdf_path)
        if page_num < 1 or page_num > len(doc):
            return None

        page = doc[page_num - 1]
        # Scale factor relative to PDF user space (72 dpi).
        mat = fitz.Matrix(DPI / 72, DPI / 72)
        pix = page.get_pixmap(matrix=mat, alpha=False)
        image = np.frombuffer(pix.samples, dtype=np.uint8)
        return image.reshape(pix.height, pix.width, pix.n)
    except Exception as e:
        print(f"渲染失敗: {pdf_path} page {page_num}: {e}")
        return None
    finally:
        # Bug fix: the original leaked the document handle whenever
        # get_pixmap() raised; always close it.
        if doc is not None:
            doc.close()
|
||||
|
||||
|
||||
def detect_signatures_yolo(image: np.ndarray, model) -> List[Dict]:
    """Run the YOLO detector on *image* and return signature boxes.

    Each box dict carries pixel x/y/width/height, detection confidence
    and the box centre; results are ordered top-to-bottom, left-to-right.
    """
    detections = []
    for res in model(image, conf=CONFIDENCE_THRESHOLD, verbose=False):
        for det in res.boxes:
            left, top, right, bottom = (int(v) for v in det.xyxy[0].cpu().numpy())
            detections.append({
                'x': left,
                'y': top,
                'width': right - left,
                'height': bottom - top,
                'confidence': float(det.conf[0].cpu().numpy()),
                'center_x': (left + right) / 2,
                'center_y': (top + bottom) / 2
            })

    # Reading order: top-to-bottom, then left-to-right.
    detections.sort(key=lambda d: (d['y'], d['x']))

    return detections
|
||||
|
||||
|
||||
def extract_text_candidates(image: np.ndarray, ocr_client: PaddleOCRClient) -> List[Dict]:
    """OCR the whole page image and return one dict per recognised line.

    Each entry records the text, its bounding box (x/y/width/height),
    the box centre and the OCR confidence. Returns [] on any OCR error.
    """
    try:
        ocr_results = ocr_client.ocr(image)

        collected = []
        for item in ocr_results:
            text = item.get('text', '').strip()
            box = item.get('box', [])
            confidence = item.get('confidence', 0.0)

            # Skip entries without text or geometry.
            if not box or not text:
                continue

            xs = [pt[0] for pt in box]
            ys = [pt[1] for pt in box]

            collected.append({
                'text': text,
                'center_x': sum(xs) / len(xs),
                'center_y': sum(ys) / len(ys),
                'x': min(xs),
                'y': min(ys),
                'width': max(xs) - min(xs),
                'height': max(ys) - min(ys),
                'confidence': confidence
            })

        return collected
    except Exception as e:
        print(f"OCR 失敗: {e}")
        return []
|
||||
|
||||
|
||||
def filter_name_candidates(candidates: List[Dict]) -> List[Dict]:
    """Keep only OCR entries that look like a person's name.

    A candidate qualifies when, after stripping whitespace and common
    punctuation, it is 2-4 CJK characters (CHINESE_NAME_PATTERN). The
    cleaned text is stored under 'text_clean' on each surviving dict.
    """
    punct = re.compile(r'[\s\:\:\,\,\.\。]')
    names = []
    for cand in candidates:
        cleaned = punct.sub('', cand['text'])
        if CHINESE_NAME_PATTERN.match(cleaned):
            cand['text_clean'] = cleaned
            names.append(cand)
    return names
|
||||
|
||||
|
||||
def match_signature_to_name(
    sig: Dict,
    name_candidates: List[Dict],
    margin: int = NAME_SEARCH_MARGIN
) -> Optional[str]:
    """Pair a signature box with the closest printed-name candidate.

    Only names whose centre falls within *margin* (plus half the box
    size) of the signature centre on both axes are considered; the
    nearest by Euclidean distance wins. Returns None when no candidate
    is in range.
    """
    cx = sig['center_x']
    cy = sig['center_y']
    x_limit = margin + sig['width'] / 2
    y_limit = margin + sig['height'] / 2

    best_name = None
    best_dist = None
    for cand in name_candidates:
        dx = abs(cand['center_x'] - cx)
        dy = abs(cand['center_y'] - cy)

        # Reject candidates outside the search window.
        if dx > x_limit or dy > y_limit:
            continue

        dist = (dx ** 2 + dy ** 2) ** 0.5
        if best_dist is None or dist < best_dist:
            best_dist = dist
            best_name = cand['text_clean']

    return best_name
|
||||
|
||||
|
||||
def get_pages_to_process(conn: sqlite3.Connection) -> List[Tuple[str, int, List[int]]]:
    """List (pdf, page, signature_ids) groups still lacking a name.

    Signatures whose accountant_name is NULL or empty are grouped by
    (source_pdf, page_number); ordering is deterministic so interrupted
    runs resume in the same sequence.

    Returns:
        List of (source_pdf, page_number, [signature_ids]).
    """
    cur = conn.cursor()

    cur.execute('''
        SELECT source_pdf, page_number, GROUP_CONCAT(signature_id)
        FROM signatures
        WHERE accountant_name IS NULL OR accountant_name = ''
        GROUP BY source_pdf, page_number
        ORDER BY source_pdf, page_number
    ''')

    return [
        (pdf, page, [int(tok) for tok in ids_csv.split(',')])
        for pdf, page, ids_csv in cur.fetchall()
    ]
|
||||
|
||||
|
||||
def update_signature_names(
    conn: sqlite3.Connection,
    updates: List[Tuple[int, str, int, int, int, int]]
):
    """Persist matched names and detected box geometry.

    Args:
        conn: open database connection (committed before returning).
        updates: (signature_id, accountant_name, x, y, width, height)
            tuples; the name goes into signatures.accountant_name and
            the geometry into the signature_boxes side table.
    """
    cur = conn.cursor()

    # The side table is created lazily on first use.
    cur.execute('''
        CREATE TABLE IF NOT EXISTS signature_boxes (
            signature_id INTEGER PRIMARY KEY,
            x INTEGER,
            y INTEGER,
            width INTEGER,
            height INTEGER,
            FOREIGN KEY (signature_id) REFERENCES signatures(signature_id)
        )
    ''')

    name_rows = [(name, sig_id) for sig_id, name, *_ in updates]
    box_rows = [(sig_id, x, y, w, h) for sig_id, _, x, y, w, h in updates]

    cur.executemany('''
            UPDATE signatures SET accountant_name = ? WHERE signature_id = ?
        ''', name_rows)

    cur.executemany('''
            INSERT OR REPLACE INTO signature_boxes (signature_id, x, y, width, height)
            VALUES (?, ?, ?, ?, ?)
        ''', box_rows)

    conn.commit()
|
||||
|
||||
|
||||
def process_page(
    source_pdf: str,
    page_number: int,
    sig_ids: List[int],
    yolo_model,
    ocr_client: PaddleOCRClient,
    conn: sqlite3.Connection
) -> Dict:
    """Process one PDF page: detect signature boxes, OCR names, pair them.

    Args:
        source_pdf: PDF file name as stored in the database.
        page_number: 1-based page number.
        sig_ids: signature ids recorded for this page, in reading order.
        yolo_model: loaded Ultralytics YOLO model.
        ocr_client: PaddleOCR HTTP client.
        conn: database connection to persist results.

    Returns:
        Per-page statistics with 'matched'/'unmatched' counts and an
        'error' string when the page could not be processed.
    """
    result = {
        'source_pdf': source_pdf,
        'page_number': page_number,
        'num_signatures': len(sig_ids),
        'matched': 0,
        'unmatched': 0,
        'error': None
    }

    # Locate the PDF on disk.
    pdf_path = find_pdf_file(source_pdf)
    if pdf_path is None:
        result['error'] = 'PDF not found'
        return result

    # Render the page to an image.
    image = render_pdf_page(pdf_path, page_number)
    if image is None:
        result['error'] = 'Render failed'
        return result

    # Detect signature boxes and collect printed-name candidates.
    sig_boxes = detect_signatures_yolo(image, yolo_model)
    text_candidates = extract_text_candidates(image, ocr_client)
    name_candidates = filter_name_candidates(text_candidates)

    # Pair each recorded signature with a detected box, in order.
    # When YOLO finds a different number of boxes than the database
    # records, zip() pairs only the common prefix; extra boxes are
    # ignored and extra records are handled below. (The original had a
    # dead `if len(sig_boxes) != len(sig_ids): pass` branch and an
    # unused enumerate index here — both removed.)
    updates = []
    for sig_id, sig_box in zip(sig_ids, sig_boxes):
        matched_name = match_signature_to_name(sig_box, name_candidates)

        if matched_name:
            result['matched'] += 1
        else:
            result['unmatched'] += 1
            matched_name = ''  # empty string marks "no name found"

        updates.append((
            sig_id,
            matched_name,
            sig_box['x'],
            sig_box['y'],
            sig_box['width'],
            sig_box['height']
        ))

    # Records without a detected box get a zeroed placeholder row.
    for sig_id in sig_ids[len(sig_boxes):]:
        updates.append((sig_id, '', 0, 0, 0, 0))
        result['unmatched'] += 1

    # Persist names and box coordinates.
    update_signature_names(conn, updates)

    return result
|
||||
|
||||
|
||||
def main():
    """Run the Step 5 name-extraction pipeline end to end."""
    print("=" * 60)
    print("Step 5: 從 PDF 提取會計師印刷姓名")
    print("=" * 60)

    # Make sure the reports directory exists
    REPORTS_PATH.mkdir(parents=True, exist_ok=True)

    # Connect to the database
    print("\n連接資料庫...")
    conn = sqlite3.connect(DB_PATH)

    # Fetch the (pdf, page) groups that still need a name
    print("查詢待處理頁面...")
    pages = get_pages_to_process(conn)
    print(f"共 {len(pages)} 個頁面待處理")

    if not pages:
        print("沒有需要處理的頁面")
        conn.close()
        return

    # Initialise YOLO (imported lazily; loading ultralytics is slow)
    print("\n載入 YOLO 模型...")
    from ultralytics import YOLO
    yolo_model = YOLO(str(YOLO_MODEL_PATH))

    # Initialise the OCR client; abort if the server is unreachable
    print("連接 PaddleOCR 伺服器...")
    ocr_client = PaddleOCRClient()
    if not ocr_client.health_check():
        print("錯誤: PaddleOCR 伺服器無法連接")
        print("請確認伺服器 http://192.168.30.36:5555 正在運行")
        conn.close()
        return
    print("OCR 伺服器連接成功")

    # Job-wide counters
    stats = {
        'total_pages': len(pages),
        'processed': 0,
        'matched': 0,
        'unmatched': 0,
        'errors': 0,
        'start_time': time.time()
    }

    # Process every page
    print(f"\n開始處理 {len(pages)} 個頁面...")

    for source_pdf, page_number, sig_ids in tqdm(pages, desc="處理頁面"):
        result = process_page(
            source_pdf, page_number, sig_ids,
            yolo_model, ocr_client, conn
        )

        stats['processed'] += 1
        stats['matched'] += result['matched']
        stats['unmatched'] += result['unmatched']
        if result['error']:
            stats['errors'] += 1

        # Periodically report progress and an ETA
        if stats['processed'] % PROGRESS_SAVE_INTERVAL == 0:
            elapsed = time.time() - stats['start_time']
            rate = stats['processed'] / elapsed
            remaining = (stats['total_pages'] - stats['processed']) / rate if rate > 0 else 0

            print(f"\n進度: {stats['processed']}/{stats['total_pages']} "
                  f"({stats['processed']/stats['total_pages']*100:.1f}%)")
            print(f"配對成功: {stats['matched']}, 未配對: {stats['unmatched']}")
            print(f"預估剩餘時間: {remaining/60:.1f} 分鐘")

    # Final statistics
    elapsed = time.time() - stats['start_time']
    stats['elapsed_seconds'] = elapsed

    print("\n" + "=" * 60)
    print("處理完成")
    print("=" * 60)
    print(f"總頁面數: {stats['total_pages']}")
    print(f"處理成功: {stats['processed']}")
    print(f"配對成功: {stats['matched']}")
    print(f"未配對: {stats['unmatched']}")
    print(f"錯誤: {stats['errors']}")
    print(f"耗時: {elapsed/60:.1f} 分鐘")

    # Save the JSON report
    report_path = REPORTS_PATH / "name_extraction_report.json"
    with open(report_path, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)
    print(f"\n報告已儲存: {report_path}")

    conn.close()


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,402 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Step 5: 從 PDF 提取會計師姓名 - 完整處理版本
|
||||
|
||||
流程:
|
||||
1. 從資料庫讀取簽名記錄,按 (PDF, page) 分組
|
||||
2. 對每個頁面重新執行 YOLO 獲取簽名框座標
|
||||
3. 對整頁執行 PaddleOCR 提取文字
|
||||
4. 過濾出候選姓名(2-4 個中文字)
|
||||
5. 配對簽名與最近的姓名
|
||||
6. 更新資料庫並生成報告
|
||||
"""
|
||||
|
||||
import sqlite3
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Optional, List, Dict, Tuple
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
from tqdm import tqdm
|
||||
import numpy as np
|
||||
import fitz # PyMuPDF
|
||||
|
||||
# 加入父目錄到路徑
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
from paddleocr_client import PaddleOCRClient
|
||||
|
||||
# 路徑配置
|
||||
PDF_BASE = Path("/Volumes/NV2/PDF-Processing/total-pdf")
|
||||
YOLO_MODEL_PATH = Path("/Volumes/NV2/pdf_recognize/models/best.pt")
|
||||
DB_PATH = Path("/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db")
|
||||
REPORTS_PATH = Path("/Volumes/NV2/PDF-Processing/signature-analysis/reports")
|
||||
|
||||
# 處理配置
|
||||
DPI = 150
|
||||
CONFIDENCE_THRESHOLD = 0.5
|
||||
NAME_SEARCH_MARGIN = 200
|
||||
PROGRESS_SAVE_INTERVAL = 100
|
||||
BATCH_COMMIT_SIZE = 50
|
||||
|
||||
# 中文姓名正則
|
||||
CHINESE_NAME_PATTERN = re.compile(r'^[\u4e00-\u9fff]{2,4}$')
|
||||
# 排除的常見詞
|
||||
EXCLUDE_WORDS = {'會計', '會計師', '事務所', '師', '聯合', '出具報告'}
|
||||
|
||||
|
||||
def find_pdf_file(filename: str) -> Optional[str]:
    """Locate *filename* under PDF_BASE (batch_* dirs first, then root)."""
    search_dirs = sorted(PDF_BASE.glob("batch_*")) + [PDF_BASE]
    for directory in search_dirs:
        candidate = directory / filename
        if candidate.exists():
            return str(candidate)
    return None
|
||||
|
||||
|
||||
def render_pdf_page(pdf_path: str, page_num: int) -> Optional[np.ndarray]:
    """Render a 1-based PDF page to a uint8 numpy image at DPI.

    Returns None for out-of-range pages or on any rendering error
    (errors are deliberately swallowed here; the caller records the
    page as 'Render failed').
    """
    doc = None
    try:
        doc = fitz.open(pdf_path)
        if page_num < 1 or page_num > len(doc):
            return None
        page = doc[page_num - 1]
        mat = fitz.Matrix(DPI / 72, DPI / 72)  # PDF user space is 72 dpi
        pix = page.get_pixmap(matrix=mat, alpha=False)
        image = np.frombuffer(pix.samples, dtype=np.uint8)
        return image.reshape(pix.height, pix.width, pix.n)
    except Exception:
        return None
    finally:
        # Bug fix: close the document even when get_pixmap() raises,
        # otherwise the handle leaks on every failing page.
        if doc is not None:
            doc.close()
|
||||
|
||||
|
||||
def detect_signatures_yolo(image: np.ndarray, model) -> List[Dict]:
    """Detect signature boxes with YOLO, sorted top-to-bottom then left-to-right."""
    found = []
    for res in model(image, conf=CONFIDENCE_THRESHOLD, verbose=False):
        for det in res.boxes:
            left, top, right, bottom = (int(v) for v in det.xyxy[0].cpu().numpy())
            found.append({
                'x': left, 'y': top,
                'width': right - left, 'height': bottom - top,
                'confidence': float(det.conf[0].cpu().numpy()),
                'center_x': (left + right) / 2,
                'center_y': (top + bottom) / 2
            })
    found.sort(key=lambda d: (d['y'], d['x']))
    return found
|
||||
|
||||
|
||||
def extract_and_filter_names(image: np.ndarray, ocr_client: PaddleOCRClient) -> List[Dict]:
    """OCR the page and return centre points of plausible person names.

    A candidate survives when, after stripping whitespace/punctuation,
    it is 2-4 CJK characters and not a known boilerplate word
    (EXCLUDE_WORDS). OCR failures yield an empty list.
    """
    try:
        ocr_hits = ocr_client.ocr(image)
    except Exception:
        return []

    punct = re.compile(r'[\s\:\:\,\,\.\。\、]')
    names = []
    for hit in ocr_hits:
        raw = hit.get('text', '').strip()
        box = hit.get('box', [])
        if not box or not raw:
            continue

        # Strip whitespace and punctuation before the name test.
        cleaned = punct.sub('', raw)

        if CHINESE_NAME_PATTERN.match(cleaned) and cleaned not in EXCLUDE_WORDS:
            xs = [pt[0] for pt in box]
            ys = [pt[1] for pt in box]
            names.append({
                'text': cleaned,
                'center_x': sum(xs) / len(xs),
                'center_y': sum(ys) / len(ys),
            })

    return names
|
||||
|
||||
|
||||
def match_signature_to_name(sig: Dict, name_candidates: List[Dict]) -> Optional[str]:
    """Return the nearest in-range name text for a signature box, or None."""
    x_limit = NAME_SEARCH_MARGIN + sig['width'] / 2
    y_limit = NAME_SEARCH_MARGIN + sig['height'] / 2

    best = None  # (distance, text)
    for cand in name_candidates:
        dx = abs(cand['center_x'] - sig['center_x'])
        dy = abs(cand['center_y'] - sig['center_y'])
        if dx > x_limit or dy > y_limit:
            continue
        dist = (dx ** 2 + dy ** 2) ** 0.5
        if best is None or dist < best[0]:
            best = (dist, cand['text'])

    return best[1] if best is not None else None
|
||||
|
||||
|
||||
def get_pages_to_process(conn: sqlite3.Connection) -> List[Tuple[str, int, List[int]]]:
    """Group still-unnamed signatures into (pdf, page, [signature_ids])."""
    cur = conn.cursor()
    cur.execute('''
        SELECT source_pdf, page_number, GROUP_CONCAT(signature_id)
        FROM signatures
        WHERE accountant_name IS NULL OR accountant_name = ''
        GROUP BY source_pdf, page_number
        ORDER BY source_pdf, page_number
    ''')
    grouped = []
    for pdf, page, ids_csv in cur.fetchall():
        grouped.append((pdf, page, [int(tok) for tok in ids_csv.split(',')]))
    return grouped
|
||||
|
||||
|
||||
def process_page(
    source_pdf: str, page_number: int, sig_ids: List[int],
    yolo_model, ocr_client: PaddleOCRClient
) -> Dict:
    """Process one page: detect boxes, OCR names, pair them.

    Unlike the DB-writing variant of this function, nothing is persisted
    here; the pending row updates are returned in result['updates'] so
    the caller can commit in batches.

    Returns:
        Dict with matched/unmatched counts, an 'error' string on
        failure, and 'updates' as a list of
        (signature_id, name, x, y, width, height) tuples.
    """
    result = {
        'source_pdf': source_pdf,
        'page_number': page_number,
        'num_signatures': len(sig_ids),
        'matched': 0,
        'unmatched': 0,
        'error': None,
        'updates': []
    }

    # Locate the PDF on disk.
    pdf_path = find_pdf_file(source_pdf)
    if pdf_path is None:
        result['error'] = 'PDF not found'
        return result

    # Render the page; None means the page was out of range or broken.
    image = render_pdf_page(pdf_path, page_number)
    if image is None:
        result['error'] = 'Render failed'
        return result

    sig_boxes = detect_signatures_yolo(image, yolo_model)
    name_candidates = extract_and_filter_names(image, ocr_client)

    # Pair recorded signature ids with detected boxes positionally.
    for i, sig_id in enumerate(sig_ids):
        if i < len(sig_boxes):
            sig = sig_boxes[i]
            matched_name = match_signature_to_name(sig, name_candidates)

            if matched_name:
                result['matched'] += 1
            else:
                result['unmatched'] += 1
                matched_name = ''

            result['updates'].append((
                sig_id, matched_name,
                sig['x'], sig['y'], sig['width'], sig['height']
            ))
        else:
            # More DB records than detected boxes: zeroed placeholder row.
            result['updates'].append((sig_id, '', 0, 0, 0, 0))
            result['unmatched'] += 1

    return result
|
||||
|
||||
|
||||
def save_updates_to_db(conn: sqlite3.Connection, updates: List[Tuple]):
    """Batch-persist matched names and box geometry.

    Args:
        conn: open database connection (committed before returning).
        updates: (signature_id, accountant_name, x, y, width, height)
            tuples. All-zero geometry is the "no box detected"
            placeholder and is not written to signature_boxes.
    """
    cursor = conn.cursor()

    # Side table for detected box geometry (created lazily).
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS signature_boxes (
            signature_id INTEGER PRIMARY KEY,
            x INTEGER, y INTEGER, width INTEGER, height INTEGER,
            FOREIGN KEY (signature_id) REFERENCES signatures(signature_id)
        )
    ''')

    for sig_id, name, x, y, w, h in updates:
        cursor.execute('UPDATE signatures SET accountant_name = ? WHERE signature_id = ?', (name, sig_id))
        # Bug fix: the "no detected box" placeholder is all-zero geometry.
        # Testing `x > 0` also dropped legitimate boxes touching the left
        # edge (x == 0); test the size instead.
        if w > 0 and h > 0:
            cursor.execute('''
                INSERT OR REPLACE INTO signature_boxes (signature_id, x, y, width, height)
                VALUES (?, ?, ?, ?, ?)
            ''', (sig_id, x, y, w, h))

    conn.commit()
|
||||
|
||||
|
||||
def generate_report(stats: Dict, output_path: Path):
    """Write the extraction statistics as JSON plus a Markdown summary.

    Args:
        stats: accumulated job statistics (see main()); must contain
            total_pages/processed/total_sigs/matched/unmatched/errors
            and elapsed_seconds.
        output_path: target .json path; a sibling .md file is written too.

    Returns:
        The report dict that was serialised.
    """
    report = {
        'title': '會計師姓名提取報告',
        'generated_at': datetime.now().isoformat(),
        'summary': {
            'total_pages': stats['total_pages'],
            'processed_pages': stats['processed'],
            'total_signatures': stats['total_sigs'],
            'matched_signatures': stats['matched'],
            'unmatched_signatures': stats['unmatched'],
            # Guard against division by zero when nothing was processed
            'match_rate': f"{stats['matched']/stats['total_sigs']*100:.1f}%" if stats['total_sigs'] > 0 else "N/A",
            'errors': stats['errors'],
            'elapsed_seconds': stats['elapsed_seconds'],
            'elapsed_human': f"{stats['elapsed_seconds']/3600:.1f} 小時"
        },
        'methodology': {
            'step1': 'YOLO 模型偵測簽名框座標',
            'step2': 'PaddleOCR 整頁 OCR 提取文字',
            'step3': '過濾 2-4 個中文字作為姓名候選',
            'step4': f'在簽名框周圍 {NAME_SEARCH_MARGIN}px 範圍內配對最近的姓名',
            'dpi': DPI,
            'yolo_confidence': CONFIDENCE_THRESHOLD
        },
        'name_distribution': stats.get('name_distribution', {}),
        'error_samples': stats.get('error_samples', [])
    }

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)

    # Also emit a human-readable Markdown version next to the JSON
    md_path = output_path.with_suffix('.md')
    with open(md_path, 'w', encoding='utf-8') as f:
        f.write(f"# {report['title']}\n\n")
        f.write(f"生成時間: {report['generated_at']}\n\n")
        f.write("## 摘要\n\n")
        f.write(f"| 指標 | 數值 |\n|------|------|\n")
        for k, v in report['summary'].items():
            f.write(f"| {k} | {v} |\n")
        f.write("\n## 方法論\n\n")
        for k, v in report['methodology'].items():
            f.write(f"- **{k}**: {v}\n")
        f.write("\n## 姓名分布 (Top 50)\n\n")
        # Top 50 names by descending occurrence count
        names = sorted(report['name_distribution'].items(), key=lambda x: -x[1])[:50]
        for name, count in names:
            f.write(f"- {name}: {count}\n")

    return report
|
||||
|
||||
|
||||
def main():
    """Run the full Step 5 pipeline: detect, OCR, match, persist, report."""
    print("=" * 70)
    print("Step 5: 從 PDF 提取會計師姓名 - 完整處理")
    print("=" * 70)
    print(f"開始時間: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    REPORTS_PATH.mkdir(parents=True, exist_ok=True)

    # Connect and enumerate the pages that still need names
    conn = sqlite3.connect(DB_PATH)
    pages = get_pages_to_process(conn)
    print(f"\n待處理頁面: {len(pages):,}")

    if not pages:
        print("沒有需要處理的頁面")
        conn.close()
        return

    # Load YOLO lazily (importing ultralytics is slow)
    print("\n載入 YOLO 模型...")
    from ultralytics import YOLO
    yolo_model = YOLO(str(YOLO_MODEL_PATH))

    # Connect to the OCR server; abort if it is unreachable
    print("連接 PaddleOCR 伺服器...")
    ocr_client = PaddleOCRClient()
    if not ocr_client.health_check():
        print("錯誤: PaddleOCR 伺服器無法連接")
        conn.close()
        return
    print("OCR 伺服器連接成功\n")

    # Job-wide counters
    stats = {
        'total_pages': len(pages),
        'processed': 0,
        'total_sigs': sum(len(p[2]) for p in pages),
        'matched': 0,
        'unmatched': 0,
        'errors': 0,
        'error_samples': [],
        'name_distribution': defaultdict(int),
        'start_time': time.time()
    }

    all_updates = []

    # Process pages, batching DB writes
    for source_pdf, page_number, sig_ids in tqdm(pages, desc="處理頁面"):
        result = process_page(source_pdf, page_number, sig_ids, yolo_model, ocr_client)

        stats['processed'] += 1
        stats['matched'] += result['matched']
        stats['unmatched'] += result['unmatched']

        if result['error']:
            stats['errors'] += 1
            # Keep only the first 20 error samples for the report
            if len(stats['error_samples']) < 20:
                stats['error_samples'].append({
                    'pdf': source_pdf,
                    'page': page_number,
                    'error': result['error']
                })
        else:
            all_updates.extend(result['updates'])
            for update in result['updates']:
                if update[1]:  # update[1] is the matched name
                    stats['name_distribution'][update[1]] += 1

        # Commit pending updates in batches
        if len(all_updates) >= BATCH_COMMIT_SIZE:
            save_updates_to_db(conn, all_updates)
            all_updates = []

        # Periodic progress line with ETA
        if stats['processed'] % PROGRESS_SAVE_INTERVAL == 0:
            elapsed = time.time() - stats['start_time']
            rate = stats['processed'] / elapsed
            remaining = (stats['total_pages'] - stats['processed']) / rate if rate > 0 else 0
            print(f"\n[進度] {stats['processed']:,}/{stats['total_pages']:,} "
                  f"({stats['processed']/stats['total_pages']*100:.1f}%) | "
                  f"配對: {stats['matched']:,} | "
                  f"剩餘: {remaining/60:.1f} 分鐘")

    # Flush the final partial batch
    if all_updates:
        save_updates_to_db(conn, all_updates)

    stats['elapsed_seconds'] = time.time() - stats['start_time']
    stats['name_distribution'] = dict(stats['name_distribution'])

    # Write the JSON + Markdown report
    print("\n生成報告...")
    report_path = REPORTS_PATH / "name_extraction_report.json"
    generate_report(stats, report_path)

    print("\n" + "=" * 70)
    print("處理完成!")
    print("=" * 70)
    print(f"總頁面: {stats['total_pages']:,}")
    print(f"總簽名: {stats['total_sigs']:,}")
    print(f"配對成功: {stats['matched']:,} ({stats['matched']/stats['total_sigs']*100:.1f}%)")
    print(f"未配對: {stats['unmatched']:,}")
    print(f"錯誤: {stats['errors']:,}")
    print(f"耗時: {stats['elapsed_seconds']/3600:.2f} 小時")
    print(f"\n報告已儲存:")
    print(f" - {report_path}")
    print(f" - {report_path.with_suffix('.md')}")

    conn.close()


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,450 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
簽名清理與會計師歸檔
|
||||
|
||||
1. 標記 sig_count > 2 的 PDF,篩選最佳 2 個簽名
|
||||
2. 用 OCR 或座標歸檔到會計師
|
||||
3. 建立 accountants 表
|
||||
"""
|
||||
|
||||
import sqlite3
|
||||
import json
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
from opencc import OpenCC
|
||||
|
||||
# 簡繁轉換
|
||||
cc_s2t = OpenCC('s2t')
|
||||
|
||||
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
|
||||
REPORT_DIR = '/Volumes/NV2/PDF-Processing/signature-analysis/reports'
|
||||
|
||||
|
||||
def get_connection():
    """Open the analysis database with dict-like row access."""
    connection = sqlite3.connect(DB_PATH)
    connection.row_factory = sqlite3.Row
    return connection
|
||||
|
||||
|
||||
def add_columns_if_needed(conn):
    """Ensure the is_valid / assigned_accountant columns exist on signatures."""
    cur = conn.cursor()

    # Inspect the current schema; row[1] is the column name.
    cur.execute("PRAGMA table_info(signatures)")
    existing = {row[1] for row in cur.fetchall()}

    if 'is_valid' not in existing:
        cur.execute("ALTER TABLE signatures ADD COLUMN is_valid INTEGER DEFAULT 1")
        print("已添加 is_valid 欄位")

    if 'assigned_accountant' not in existing:
        cur.execute("ALTER TABLE signatures ADD COLUMN assigned_accountant TEXT")
        print("已添加 assigned_accountant 欄位")

    conn.commit()
|
||||
|
||||
|
||||
def create_accountants_table(conn):
    """Create the accountants table if it does not already exist."""
    conn.cursor().execute("""
        CREATE TABLE IF NOT EXISTS accountants (
            accountant_id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT UNIQUE NOT NULL,
            signature_count INTEGER DEFAULT 0,
            firm TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)
    conn.commit()
    print("accountants 表已建立")
|
||||
|
||||
|
||||
def get_pdf_signatures(conn):
    """Load every signature (joined with its box, if any), grouped by PDF.

    Returns:
        defaultdict: source_pdf -> list of row dicts ordered by page,
        then by box y-coordinate (rows without a box sort first: NULL y).
    """
    cur = conn.cursor()
    cur.execute("""
        SELECT s.signature_id, s.source_pdf, s.page_number, s.accountant_name,
               s.excel_accountant1, s.excel_accountant2, s.excel_firm,
               sb.x, sb.y, sb.width, sb.height
        FROM signatures s
        LEFT JOIN signature_boxes sb ON s.signature_id = sb.signature_id
        ORDER BY s.source_pdf, s.page_number, sb.y
    """)

    grouped = defaultdict(list)
    for row in cur.fetchall():
        grouped[row['source_pdf']].append(dict(row))
    return grouped
|
||||
|
||||
|
||||
def normalize_name(name):
    """Convert *name* to Traditional Chinese; falsy input yields None."""
    return cc_s2t.convert(name) if name else None
|
||||
|
||||
|
||||
def names_match(ocr_name, excel_name):
    """Return True when the OCR name equals the Excel name.

    Equality is tested directly first, then after Simplified-to-
    Traditional conversion of the OCR side only. Either side missing
    means no match.
    """
    if not ocr_name or not excel_name:
        return False
    return ocr_name == excel_name or normalize_name(ocr_name) == excel_name
|
||||
|
||||
|
||||
def score_signature(sig, excel_acc1, excel_acc2):
    """Heuristic quality score for ranking signature crops.

    Rewards an OCR-name match against either Excel accountant (+100),
    a plausible crop size (+20) and a lower page position (up to +15);
    penalises stamp-sized crops (-30).
    """
    ocr_name = sig.get('accountant_name', '')
    w = sig.get('width', 0) or 0
    h = sig.get('height', 0) or 0
    y_pos = sig.get('y', 0) or 0

    total = 0
    # 1. OCR name agrees with an Excel accountant
    if names_match(ocr_name, excel_acc1) or names_match(ocr_name, excel_acc2):
        total += 100
    # 2. Plausible signature dimensions
    if 30 < w < 500 and 20 < h < 200:
        total += 20
    # 3. Lower on the page is better (capped at +15)
    total += min(y_pos / 100, 15)
    # 4. Oversized crops are likely stamps, not signatures
    if w > 300 or h > 150:
        total -= 30

    return total
|
||||
|
||||
|
||||
def select_best_two(signatures, excel_acc1, excel_acc2):
    """Return the two highest-scoring signatures (all of them if <= 2).

    The sort is stable, so equal scores keep their original order.
    """
    if len(signatures) <= 2:
        return signatures
    ranked = sorted(
        signatures,
        key=lambda s: -score_signature(s, excel_acc1, excel_acc2)
    )
    return ranked[:2]
|
||||
|
||||
|
||||
def assign_to_accountant(sig1, sig2, excel_acc1, excel_acc2):
    """Decide which Excel accountant each of the two signatures belongs to.

    Strategy A: trust an OCR-name match on either signature.
    Strategy B: fall back to vertical order (accountant 1 assumed on top).

    Returns:
        [(sig1, assigned_name), (sig2, assigned_name)].
    """
    ocr1 = sig1.get('accountant_name', '')
    ocr2 = sig2.get('accountant_name', '')

    straight = [(sig1, excel_acc1), (sig2, excel_acc2)]
    swapped = [(sig1, excel_acc2), (sig2, excel_acc1)]

    # Strategy A: OCR evidence decides the pairing.
    if names_match(ocr1, excel_acc1):
        return straight
    if names_match(ocr1, excel_acc2):
        return swapped
    if names_match(ocr2, excel_acc1):
        return swapped
    if names_match(ocr2, excel_acc2):
        return straight

    # Strategy B: y-coordinate ordering.
    y1 = sig1.get('y', 0) or 0
    y2 = sig2.get('y', 0) or 0
    return straight if y1 <= y2 else swapped
|
||||
|
||||
|
||||
def process_all_pdfs(conn):
    """Validate and assign every PDF's signatures to its two accountants.

    Per-PDF handling:
      * no Excel accountants -> keep signatures, no assignment possible
      * 1 signature          -> assign via OCR name match when possible
      * 2 signatures         -> assign both via assign_to_accountant()
      * >2 signatures        -> keep the best two (select_best_two), mark the
                                rest invalid, then assign the kept pair

    Writes (assigned_accountant, is_valid) back to the signatures table and
    returns a stats dict summarising the run.
    """
    print("正在載入簽名資料...")
    pdf_sigs = get_pdf_signatures(conn)
    print(f"共 {len(pdf_sigs)} 份 PDF")

    cur = conn.cursor()

    stats = {
        'total_pdfs': len(pdf_sigs),
        'sig_count_1': 0,
        'sig_count_2': 0,
        'sig_count_gt2': 0,
        'valid_signatures': 0,
        'invalid_signatures': 0,
        'ocr_matched': 0,
        'y_coordinate_assigned': 0,
        'no_excel_data': 0,
    }

    assignments = []  # (signature_id, assigned_accountant, is_valid)

    for pdf_name, sigs in pdf_sigs.items():
        sig_count = len(sigs)
        excel_acc1 = sigs[0].get('excel_accountant1') if sigs else None
        excel_acc2 = sigs[0].get('excel_accountant2') if sigs else None

        if not excel_acc1 and not excel_acc2:
            # No Excel record: keep signatures but leave them unassigned.
            stats['no_excel_data'] += 1
            for sig in sigs:
                assignments.append((sig['signature_id'], None, 1))
            continue

        if sig_count == 1:
            stats['sig_count_1'] += 1
            # Single signature: only an OCR name match can assign it.
            sig = sigs[0]
            ocr_name = sig.get('accountant_name', '')
            if names_match(ocr_name, excel_acc1):
                assignments.append((sig['signature_id'], excel_acc1, 1))
                stats['ocr_matched'] += 1
            elif names_match(ocr_name, excel_acc2):
                assignments.append((sig['signature_id'], excel_acc2, 1))
                stats['ocr_matched'] += 1
            else:
                # Cannot tell which accountant this is; keep it unassigned.
                assignments.append((sig['signature_id'], None, 1))
            stats['valid_signatures'] += 1

        elif sig_count == 2:
            stats['sig_count_2'] += 1
            # Normal case: one signature per accountant.
            sig1, sig2 = sigs[0], sigs[1]
            pairs = assign_to_accountant(sig1, sig2, excel_acc1, excel_acc2)

            for sig, acc in pairs:
                assignments.append((sig['signature_id'], acc, 1))
                stats['valid_signatures'] += 1

                # Track how the assignment was made (OCR vs layout fallback).
                ocr_name = sig.get('accountant_name', '')
                if names_match(ocr_name, acc):
                    stats['ocr_matched'] += 1
                else:
                    stats['y_coordinate_assigned'] += 1

        else:
            stats['sig_count_gt2'] += 1
            # Too many detections: keep only the two most plausible ones.
            best_two = select_best_two(sigs, excel_acc1, excel_acc2)
            valid_ids = {s['signature_id'] for s in best_two}

            # BUGFIX: previously every signature (including the kept ones)
            # was appended here with assigned_accountant=None and then the
            # kept ones were appended a *second* time below, relying on
            # UPDATE ordering to win.  Now rejected signatures are recorded
            # exactly once here, and kept ones only in the branch below.
            for sig in sigs:
                if sig['signature_id'] in valid_ids:
                    stats['valid_signatures'] += 1
                else:
                    stats['invalid_signatures'] += 1
                    assignments.append((sig['signature_id'], None, 0))

            # Assign the two kept signatures.
            if len(best_two) == 2:
                sig1, sig2 = best_two[0], best_two[1]
                pairs = assign_to_accountant(sig1, sig2, excel_acc1, excel_acc2)

                for sig, acc in pairs:
                    assignments.append((sig['signature_id'], acc, 1))
                    ocr_name = sig.get('accountant_name', '')
                    if names_match(ocr_name, acc):
                        stats['ocr_matched'] += 1
                    else:
                        stats['y_coordinate_assigned'] += 1
            elif len(best_two) == 1:
                # Defensive: select_best_two returns 2 items when count > 2,
                # so this branch should be unreachable; kept for safety.
                sig = best_two[0]
                ocr_name = sig.get('accountant_name', '')
                if names_match(ocr_name, excel_acc1):
                    assignments.append((sig['signature_id'], excel_acc1, 1))
                elif names_match(ocr_name, excel_acc2):
                    assignments.append((sig['signature_id'], excel_acc2, 1))
                else:
                    assignments.append((sig['signature_id'], None, 1))

    # Batch write-back; executemany avoids a Python-level round trip per row.
    print(f"正在更新 {len(assignments)} 筆簽名...")
    cur.executemany("""
        UPDATE signatures
        SET assigned_accountant = ?, is_valid = ?
        WHERE signature_id = ?
    """, [(acc, is_valid, sig_id) for sig_id, acc, is_valid in assignments])

    conn.commit()

    return stats
|
||||
|
||||
|
||||
def build_accountants_table(conn):
    """Rebuild the accountants table from valid, assigned signatures.

    Counts signatures per accountant, determines each accountant's most
    common firm, inserts one row per accountant, and back-fills
    signatures.accountant_id.

    Args:
        conn: open sqlite3 connection with `signatures` and `accountants`
            tables present.

    Returns:
        Number of unique accountants inserted.
    """
    cur = conn.cursor()

    # Start from a clean table.
    cur.execute("DELETE FROM accountants")

    # Count signatures per (accountant, firm).
    # BUGFIX: the previous query grouped only by assigned_accountant while
    # selecting the non-aggregated excel_firm, so SQLite returned one
    # arbitrary firm per accountant and the per-firm tally below could never
    # see more than a single firm.  Grouping by both columns makes the
    # "most common firm" computation meaningful.
    cur.execute("""
        SELECT assigned_accountant, excel_firm, COUNT(*) as cnt
        FROM signatures
        WHERE assigned_accountant IS NOT NULL AND is_valid = 1
        GROUP BY assigned_accountant, excel_firm
    """)

    accountants = {}
    for name, firm, count in cur.fetchall():
        if name not in accountants:
            accountants[name] = {'count': 0, 'firms': defaultdict(int)}
        accountants[name]['count'] += count
        if firm:
            accountants[name]['firms'][firm] += count

    # Insert one row per accountant, tagged with their most common firm.
    for name, data in accountants.items():
        main_firm = None
        if data['firms']:
            main_firm = max(data['firms'].items(), key=lambda x: x[1])[0]

        cur.execute("""
            INSERT INTO accountants (name, signature_count, firm)
            VALUES (?, ?, ?)
        """, (name, data['count'], main_firm))

    conn.commit()

    # Back-fill the foreign key on signatures.
    cur.execute("""
        UPDATE signatures
        SET accountant_id = (
            SELECT accountant_id FROM accountants
            WHERE accountants.name = signatures.assigned_accountant
        )
        WHERE assigned_accountant IS NOT NULL
    """)
    conn.commit()

    return len(accountants)
|
||||
|
||||
|
||||
def generate_report(stats, accountant_count):
    """Write the signature-cleanup summary as JSON and Markdown to REPORT_DIR.

    Args:
        stats: counters produced by process_all_pdfs().
        accountant_count: unique-accountant count from build_accountants_table().

    Returns:
        The report dict that was serialised to JSON.
    """
    report = {
        'generated_at': datetime.now().isoformat(),
        'summary': {
            'total_pdfs': stats['total_pdfs'],
            'pdfs_with_1_sig': stats['sig_count_1'],
            'pdfs_with_2_sigs': stats['sig_count_2'],
            'pdfs_with_gt2_sigs': stats['sig_count_gt2'],
            'pdfs_without_excel': stats['no_excel_data'],
        },
        'signatures': {
            'valid': stats['valid_signatures'],
            'invalid': stats['invalid_signatures'],
            'total': stats['valid_signatures'] + stats['invalid_signatures'],
        },
        'assignment_method': {
            'ocr_matched': stats['ocr_matched'],
            'y_coordinate': stats['y_coordinate_assigned'],
        },
        'accountants': {
            'total_unique': accountant_count,
        }
    }

    # Save machine-readable JSON.
    json_path = f"{REPORT_DIR}/signature_cleanup_report.json"
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(report, f, ensure_ascii=False, indent=2)

    # Save human-readable Markdown (tables mirror the JSON summary).
    md_path = f"{REPORT_DIR}/signature_cleanup_report.md"
    with open(md_path, 'w', encoding='utf-8') as f:
        f.write("# 簽名清理與歸檔報告\n\n")
        f.write(f"生成時間: {report['generated_at']}\n\n")

        f.write("## PDF 分布\n\n")
        f.write("| 類型 | 數量 |\n")
        f.write("|------|------|\n")
        f.write(f"| 總 PDF 數 | {stats['total_pdfs']} |\n")
        f.write(f"| 1 個簽名 | {stats['sig_count_1']} |\n")
        f.write(f"| 2 個簽名 (正常) | {stats['sig_count_2']} |\n")
        f.write(f"| >2 個簽名 (需篩選) | {stats['sig_count_gt2']} |\n")
        f.write(f"| 無 Excel 資料 | {stats['no_excel_data']} |\n")

        f.write("\n## 簽名統計\n\n")
        f.write("| 類型 | 數量 |\n")
        f.write("|------|------|\n")
        f.write(f"| 有效簽名 | {stats['valid_signatures']} |\n")
        f.write(f"| 無效簽名 (誤判) | {stats['invalid_signatures']} |\n")

        f.write("\n## 歸檔方式\n\n")
        f.write("| 方式 | 數量 |\n")
        f.write("|------|------|\n")
        f.write(f"| OCR 姓名匹配 | {stats['ocr_matched']} |\n")
        f.write(f"| Y 座標推斷 | {stats['y_coordinate_assigned']} |\n")

        f.write(f"\n## 會計師\n\n")
        f.write(f"唯一會計師數: **{accountant_count}**\n")

    print(f"報告已儲存: {json_path}")
    print(f"報告已儲存: {md_path}")

    return report
|
||||
|
||||
|
||||
def main():
    """Run the cleanup/assignment pipeline: schema, assignment, accountants, report."""
    banner = "=" * 60
    print(banner)
    print("簽名清理與會計師歸檔")
    print(banner)

    conn = get_connection()

    # Step 1: make sure the columns and the accountants table exist.
    print("\n[1/4] 準備資料庫...")
    add_columns_if_needed(conn)
    create_accountants_table(conn)

    # Step 2: validate every PDF's signatures and assign accountants.
    print("\n[2/4] 處理 PDF 簽名...")
    stats = process_all_pdfs(conn)

    # Step 3: aggregate per-accountant rows.
    print("\n[3/4] 建立會計師表...")
    accountant_count = build_accountants_table(conn)

    # Step 4: persist the summary report (JSON + Markdown).
    print("\n[4/4] 生成報告...")
    generate_report(stats, accountant_count)

    conn.close()

    print("\n" + banner)
    print("完成!")
    print(banner)
    print(f"有效簽名: {stats['valid_signatures']}")
    print(f"無效簽名: {stats['invalid_signatures']}")
    print(f"唯一會計師: {accountant_count}")


if __name__ == '__main__':
    main()
|
||||
@@ -0,0 +1,272 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
第三階段:同人簽名聚類分析
|
||||
|
||||
對每位會計師的簽名進行相似度分析,判斷是否有「複製貼上」行為。
|
||||
"""
|
||||
|
||||
import sqlite3
|
||||
import numpy as np
|
||||
import json
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
from tqdm import tqdm
|
||||
|
||||
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
|
||||
FEATURES_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/features/signature_features.npy'
|
||||
REPORT_DIR = '/Volumes/NV2/PDF-Processing/signature-analysis/reports'
|
||||
|
||||
|
||||
def load_data():
    """Load the feature matrix plus the per-accountant signature grouping.

    Returns:
        (features, sig_id_to_idx, acc_signatures, acc_info) where row i of
        `features` corresponds to the i-th signature_id in ascending order,
        acc_signatures maps accountant_id -> [signature_id, ...] (valid,
        assigned signatures only) and acc_info maps accountant_id ->
        {'name', 'firm'}.
    """
    print("載入特徵向量...")
    features = np.load(FEATURES_PATH)
    print(f"特徵矩陣形狀: {features.shape}")

    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()

    # Feature rows are aligned with signature_id in ascending order, so
    # build that index first.
    cur.execute("SELECT signature_id FROM signatures ORDER BY signature_id")
    sig_id_to_idx = {row[0]: pos for pos, row in enumerate(cur.fetchall())}

    # Group valid, assigned signatures by accountant.
    cur.execute("""
        SELECT s.signature_id, s.assigned_accountant, s.accountant_id, a.name, a.firm
        FROM signatures s
        LEFT JOIN accountants a ON s.accountant_id = a.accountant_id
        WHERE s.is_valid = 1 AND s.assigned_accountant IS NOT NULL
        ORDER BY s.signature_id
    """)

    acc_signatures = defaultdict(list)
    acc_info = {}
    for sig_id, _unused_name, acc_id, acc_name, firm in cur.fetchall():
        if acc_id and sig_id in sig_id_to_idx:
            acc_signatures[acc_id].append(sig_id)
            acc_info.setdefault(acc_id, {'name': acc_name, 'firm': firm})

    conn.close()

    return features, sig_id_to_idx, acc_signatures, acc_info
|
||||
|
||||
|
||||
def compute_similarity_stats(features, sig_ids, sig_id_to_idx):
    """Cosine-similarity statistics over all pairs within one signature group.

    Args:
        features: 2-D feature matrix (one row per signature).
        sig_ids: signature IDs belonging to a single accountant.
        sig_id_to_idx: maps signature_id -> row index in `features`.

    Returns:
        None when fewer than two signatures are available; otherwise a dict
        with min/max/mean/std of the pairwise cosine similarities plus counts
        and ratios of pairs above the 0.90 / 0.95 / 0.99 thresholds.
    """
    if len(sig_ids) < 2:
        return None

    # Gather and L2-normalise the relevant rows (guarding zero vectors).
    rows = features[[sig_id_to_idx[sid] for sid in sig_ids]]
    lengths = np.linalg.norm(rows, axis=1, keepdims=True)
    lengths[lengths == 0] = 1
    unit = rows / lengths

    # Pairwise cosine similarities = strict upper triangle of the Gram matrix.
    gram = unit @ unit.T
    pair_sims = gram[np.triu_indices(len(gram), k=1)]

    if len(pair_sims) == 0:
        return None

    n_pairs = len(pair_sims)
    result = {
        'total_pairs': n_pairs,
        'min_sim': float(pair_sims.min()),
        'max_sim': float(pair_sims.max()),
        'mean_sim': float(pair_sims.mean()),
        'std_sim': float(pair_sims.std()),
        'pairs_gt_90': int((pair_sims > 0.90).sum()),
        'pairs_gt_95': int((pair_sims > 0.95).sum()),
        'pairs_gt_99': int((pair_sims > 0.99).sum()),
    }

    # Convert the threshold counts into ratios of all pairs.
    for level in (90, 95, 99):
        result[f'ratio_gt_{level}'] = result[f'pairs_gt_{level}'] / n_pairs

    return result
|
||||
|
||||
|
||||
def analyze_all_accountants(features, sig_id_to_idx, acc_signatures, acc_info):
    """Run the pairwise-similarity analysis for every accountant.

    Returns a list of dicts (one per accountant with at least two comparable
    signatures) combining identity info with the similarity statistics.
    """
    results = []

    for acc_id, sig_ids in tqdm(acc_signatures.items(), desc="分析會計師"):
        stats = compute_similarity_stats(features, sig_ids, sig_id_to_idx)
        if not stats:
            # Fewer than two signatures: nothing to compare.
            continue
        meta = acc_info.get(acc_id, {})
        results.append({
            'accountant_id': acc_id,
            'name': meta.get('name', ''),
            'firm': meta.get('firm', ''),
            'signature_count': len(sig_ids),
            **stats,
        })

    return results
|
||||
|
||||
|
||||
def classify_risk(result):
    """Map one accountant's similarity stats to 'high' / 'medium' / 'low'.

    high:   >5% of pairs above 0.99, or >30% of pairs above 0.95
    medium: >10% of pairs above 0.95, or mean similarity above 0.85
    low:    everything else
    """
    above_95 = result.get('ratio_gt_95', 0)
    above_99 = result.get('ratio_gt_99', 0)
    avg_sim = result.get('mean_sim', 0)

    if above_99 > 0.05 or above_95 > 0.3:
        return 'high'
    if above_95 > 0.1 or avg_sim > 0.85:
        return 'medium'
    return 'low'
|
||||
|
||||
|
||||
def save_results(results, acc_signatures):
    """Rank accountants by risk and write JSON + Markdown reports to REPORT_DIR.

    Args:
        results: per-accountant stat dicts from analyze_all_accountants();
            mutated in place — a 'risk_level' field is added to each.
        acc_signatures: accepted for interface symmetry; not used here.

    Returns:
        (summary, results_sorted) — the summary dict and the results ordered
        by descending ratio_gt_95, then descending mean similarity.
    """
    # Attach a risk label to every accountant.
    for r in results:
        r['risk_level'] = classify_risk(r)

    # Tally accountants per risk level.
    risk_counts = defaultdict(int)
    for r in results:
        risk_counts[r['risk_level']] += 1

    summary = {
        'generated_at': datetime.now().isoformat(),
        'total_accountants': len(results),
        'risk_distribution': dict(risk_counts),
        'high_risk_count': risk_counts['high'],
        'medium_risk_count': risk_counts['medium'],
        'low_risk_count': risk_counts['low'],
    }

    # Riskiest first: share of near-identical pairs, then mean similarity.
    results_sorted = sorted(results, key=lambda x: (-x.get('ratio_gt_95', 0), -x.get('mean_sim', 0)))

    # Save machine-readable JSON.
    output = {
        'summary': summary,
        'accountants': results_sorted
    }

    json_path = f"{REPORT_DIR}/accountant_similarity_analysis.json"
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    print(f"已儲存: {json_path}")

    # Save human-readable Markdown report.
    md_path = f"{REPORT_DIR}/accountant_similarity_analysis.md"
    with open(md_path, 'w', encoding='utf-8') as f:
        f.write("# 會計師簽名相似度分析報告\n\n")
        f.write(f"生成時間: {summary['generated_at']}\n\n")

        f.write("## 摘要\n\n")
        f.write(f"| 指標 | 數值 |\n")
        f.write(f"|------|------|\n")
        f.write(f"| 總會計師數 | {summary['total_accountants']} |\n")
        f.write(f"| 高風險 | {risk_counts['high']} |\n")
        f.write(f"| 中風險 | {risk_counts['medium']} |\n")
        f.write(f"| 低風險 | {risk_counts['low']} |\n")

        f.write("\n## 風險分類標準\n\n")
        f.write("- **高風險**: >5% 的簽名對相似度 >0.99,或 >30% 的簽名對相似度 >0.95\n")
        f.write("- **中風險**: >10% 的簽名對相似度 >0.95,或平均相似度 >0.85\n")
        f.write("- **低風險**: 其他情況\n")

        f.write("\n## 高風險會計師 (Top 30)\n\n")
        f.write("| 排名 | 姓名 | 事務所 | 簽名數 | 平均相似度 | >0.95比例 | >0.99比例 |\n")
        f.write("|------|------|--------|--------|------------|-----------|----------|\n")

        # Only the top 30 high-risk accountants appear in the table.
        high_risk = [r for r in results_sorted if r['risk_level'] == 'high']
        for i, r in enumerate(high_risk[:30], 1):
            f.write(f"| {i} | {r['name']} | {r['firm'] or '-'} | {r['signature_count']} | ")
            f.write(f"{r['mean_sim']:.3f} | {r['ratio_gt_95']*100:.1f}% | {r['ratio_gt_99']*100:.1f}% |\n")

        f.write("\n## 所有會計師統計分布\n\n")

        # Distribution of the mean similarity across all accountants.
        # NOTE(review): min()/max() raise on an empty `results` list —
        # assumes at least one analysed accountant; confirm for empty runs.
        mean_sims = [r['mean_sim'] for r in results]
        f.write("### 平均相似度分布\n\n")
        f.write(f"- 最小: {min(mean_sims):.3f}\n")
        f.write(f"- 最大: {max(mean_sims):.3f}\n")
        f.write(f"- 平均: {np.mean(mean_sims):.3f}\n")
        f.write(f"- 中位數: {np.median(mean_sims):.3f}\n")

    print(f"已儲存: {md_path}")

    return summary, results_sorted
|
||||
|
||||
|
||||
def update_database(results):
    """Persist per-accountant risk fields onto the accountants table.

    Adds the risk_level / mean_similarity / ratio_gt_95 columns when missing,
    then updates one row per analysed accountant.
    """
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()

    # Add the columns one at a time.  BUGFIX: previously a single try block
    # with a bare `except:` covered all three ALTERs, so if the first column
    # already existed the remaining ones were never added — and any unrelated
    # error was silently swallowed.  Catch only the "duplicate column"
    # OperationalError, per statement.
    for ddl in (
        "ALTER TABLE accountants ADD COLUMN risk_level TEXT",
        "ALTER TABLE accountants ADD COLUMN mean_similarity REAL",
        "ALTER TABLE accountants ADD COLUMN ratio_gt_95 REAL",
    ):
        try:
            cur.execute(ddl)
        except sqlite3.OperationalError:
            pass  # column already exists

    # Batch update all analysed accountants.
    cur.executemany("""
        UPDATE accountants
        SET risk_level = ?, mean_similarity = ?, ratio_gt_95 = ?
        WHERE accountant_id = ?
    """, [(r['risk_level'], r['mean_sim'], r['ratio_gt_95'], r['accountant_id'])
          for r in results])

    conn.commit()
    conn.close()
    print("資料庫已更新")
|
||||
|
||||
|
||||
def main():
    """Stage 3 driver: load features, analyse accountants, report, persist."""
    banner = "=" * 60
    print(banner)
    print("第三階段:同人簽名聚類分析")
    print(banner)

    # Load features and the per-accountant grouping.
    features, sig_id_to_idx, acc_signatures, acc_info = load_data()
    print(f"會計師數: {len(acc_signatures)}")

    # Pairwise-similarity analysis per accountant.
    print("\n開始分析...")
    results = analyze_all_accountants(features, sig_id_to_idx, acc_signatures, acc_info)

    # Persist reports, then write risk fields back to the database.
    print("\n儲存結果...")
    summary, results_sorted = save_results(results, acc_signatures)

    update_database(results_sorted)

    print("\n" + banner)
    print("完成!")
    print(banner)
    print(f"總會計師: {summary['total_accountants']}")
    print(f"高風險: {summary['high_risk_count']}")
    print(f"中風險: {summary['medium_risk_count']}")
    print(f"低風險: {summary['low_risk_count']}")


if __name__ == '__main__':
    main()
|
||||
@@ -0,0 +1,371 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
第四階段:PDF 簽名真偽判定
|
||||
|
||||
對每份 PDF 的簽名判斷是「親簽」還是「複製貼上」
|
||||
"""
|
||||
|
||||
import sqlite3
|
||||
import numpy as np
|
||||
import json
|
||||
import csv
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
from tqdm import tqdm
|
||||
|
||||
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
|
||||
FEATURES_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/features/signature_features.npy'
|
||||
REPORT_DIR = '/Volumes/NV2/PDF-Processing/signature-analysis/reports'
|
||||
|
||||
# 門檻設定
|
||||
THRESHOLD_COPY = 0.95 # 高於此值判定為「複製貼上」
|
||||
THRESHOLD_AUTHENTIC = 0.85 # 低於此值判定為「親簽」
|
||||
# 介於兩者之間為「不確定」
|
||||
|
||||
|
||||
def load_data():
    """Load normalised features and the PDF / accountant signature indexes.

    Returns:
        (features_norm, sig_data, pdf_signatures, acc_signatures, pdf_info,
        sig_id_to_idx).  features_norm is row-wise L2-normalised so that
        plain dot products are cosine similarities; row i corresponds to the
        i-th signature_id in ascending order.
    """
    print("載入特徵向量...")
    raw = np.load(FEATURES_PATH)

    # L2-normalise each row (zero rows left untouched) so later dot products
    # are cosine similarities.
    row_norms = np.linalg.norm(raw, axis=1, keepdims=True)
    row_norms[row_norms == 0] = 1
    features_norm = raw / row_norms

    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()

    # Valid, assigned signatures with their PDF and accountant context.
    cur.execute("""
        SELECT s.signature_id, s.source_pdf, s.assigned_accountant,
               s.excel_accountant1, s.excel_accountant2, s.excel_firm
        FROM signatures s
        WHERE s.is_valid = 1 AND s.assigned_accountant IS NOT NULL
        ORDER BY s.signature_id
    """)

    sig_data = {}
    pdf_signatures = defaultdict(list)
    acc_signatures = defaultdict(list)
    pdf_info = {}

    for sig_id, pdf, acc_name, acc1, acc2, firm in cur.fetchall():
        sig_data[sig_id] = {'pdf': pdf, 'accountant': acc_name}
        pdf_signatures[pdf].append((sig_id, acc_name))
        acc_signatures[acc_name].append(sig_id)
        pdf_info.setdefault(pdf, {
            'accountant1': acc1,
            'accountant2': acc2,
            'firm': firm,
        })

    # signature_id -> feature row index (ascending id order).
    cur.execute("SELECT signature_id FROM signatures ORDER BY signature_id")
    sig_id_to_idx = {row[0]: pos for pos, row in enumerate(cur.fetchall())}

    conn.close()

    return features_norm, sig_data, pdf_signatures, acc_signatures, pdf_info, sig_id_to_idx
|
||||
|
||||
|
||||
def get_max_similarity_to_others(sig_id, acc_name, acc_signatures, sig_id_to_idx, features_norm):
    """Highest cosine similarity between one signature and the same
    accountant's other signatures.

    Returns:
        (max_similarity, signature_id_of_best_match), or (None, None) when
        the accountant has no other comparable signature.
    """
    peers = [s for s in acc_signatures[acc_name]
             if s != sig_id and s in sig_id_to_idx]
    if not peers:
        return None, None

    target = features_norm[sig_id_to_idx[sig_id]]
    peer_matrix = features_norm[[sig_id_to_idx[s] for s in peers]]

    # Rows are already L2-normalised, so dot products are cosine similarities.
    sims = peer_matrix @ target
    best = int(sims.argmax())

    return float(sims[best]), peers[best]
|
||||
|
||||
|
||||
def classify_signature(max_sim):
    """Label one signature from its closest same-accountant similarity.

    'unknown'   -> nothing to compare against (max_sim is None)
    'copy'      -> similarity >= THRESHOLD_COPY (likely a pasted image)
    'authentic' -> similarity <= THRESHOLD_AUTHENTIC (likely hand-signed)
    'uncertain' -> anything in between
    """
    if max_sim is None:
        return 'unknown'
    if max_sim >= THRESHOLD_COPY:
        return 'copy'
    if max_sim <= THRESHOLD_AUTHENTIC:
        return 'authentic'
    return 'uncertain'
|
||||
|
||||
|
||||
def classify_pdf(verdicts):
    """Combine the per-signature verdicts into one verdict for the PDF.

    Any 'copy' dominates; all-'authentic' is authentic; any 'uncertain'
    makes the PDF uncertain; otherwise (e.g. only 'unknown') -> 'unknown'.
    """
    if not verdicts:
        return 'unknown'
    if 'copy' in verdicts:
        return 'copy'
    if all(v == 'authentic' for v in verdicts):
        return 'authentic'
    if 'uncertain' in verdicts:
        return 'uncertain'
    return 'unknown'
|
||||
|
||||
|
||||
def analyze_all_pdfs(features_norm, sig_data, pdf_signatures, acc_signatures, pdf_info, sig_id_to_idx):
    """Classify every signature of every PDF and derive per-PDF verdicts.

    Note: `sig_data` is accepted for interface symmetry with load_data() but
    is not used by this function.
    """
    results = []

    for pdf, sig_list in tqdm(pdf_signatures.items(), desc="分析 PDF"):
        meta = pdf_info.get(pdf, {})

        entry = {
            'pdf': pdf,
            'accountant1': meta.get('accountant1', ''),
            'accountant2': meta.get('accountant2', ''),
            'firm': meta.get('firm', ''),
            'signatures': []
        }

        verdict_list = []

        for sig_id, acc_name in sig_list:
            # Closest same-accountant match drives the verdict.
            max_sim, _best_match = get_max_similarity_to_others(
                sig_id, acc_name, acc_signatures, sig_id_to_idx, features_norm
            )
            verdict = classify_signature(max_sim)
            verdict_list.append(verdict)

            entry['signatures'].append({
                'signature_id': sig_id,
                'accountant': acc_name,
                'max_similarity': max_sim,
                'verdict': verdict
            })

        entry['pdf_verdict'] = classify_pdf(verdict_list)
        results.append(entry)

    return results
|
||||
|
||||
|
||||
def generate_statistics(results):
    """Aggregate verdict counts overall, per signature, and per firm.

    PDFs without a firm are bucketed under "未知" (unknown).
    """
    stats = {
        'total_pdfs': len(results),
        'pdf_verdicts': defaultdict(int),
        'signature_verdicts': defaultdict(int),
        'by_firm': defaultdict(lambda: defaultdict(int))
    }

    for entry in results:
        verdict = entry['pdf_verdict']
        stats['pdf_verdicts'][verdict] += 1
        stats['by_firm'][entry['firm'] or '未知'][verdict] += 1

        for sig in entry['signatures']:
            stats['signature_verdicts'][sig['verdict']] += 1

    return stats
|
||||
|
||||
|
||||
def save_results(results, stats):
    """Write the per-PDF verdicts as JSON, CSV and a Markdown summary.

    Args:
        results: per-PDF dicts from analyze_all_pdfs().
        stats: aggregate counters from generate_statistics().

    Returns:
        The stats dict (unchanged), for caller convenience.
    """
    timestamp = datetime.now().isoformat()

    # 1. Full JSON dump: thresholds + statistics + every per-PDF result.
    json_path = f"{REPORT_DIR}/pdf_signature_verdicts.json"
    output = {
        'generated_at': timestamp,
        'thresholds': {
            'copy': THRESHOLD_COPY,
            'authentic': THRESHOLD_AUTHENTIC
        },
        'statistics': {
            'total_pdfs': stats['total_pdfs'],
            'pdf_verdicts': dict(stats['pdf_verdicts']),
            'signature_verdicts': dict(stats['signature_verdicts'])
        },
        'results': results
    }
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    print(f"已儲存: {json_path}")

    # 2. Flat CSV: one row per PDF, with up to two signatures inline.
    csv_path = f"{REPORT_DIR}/pdf_signature_verdicts.csv"
    with open(csv_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['PDF', '會計師1', '會計師2', '事務所', '判定結果',
                         '簽名1_會計師', '簽名1_相似度', '簽名1_判定',
                         '簽名2_會計師', '簽名2_相似度', '簽名2_判定'])

        for r in results:
            row = [
                r['pdf'],
                r['accountant1'],
                r['accountant2'],
                r['firm'] or '',
                r['pdf_verdict']
            ]

            for sig in r['signatures'][:2]:  # at most 2 signatures per row
                row.extend([
                    sig['accountant'],
                    # NOTE(review): a genuine 0.0 similarity is falsy and is
                    # written as '' here, same as None — confirm intended.
                    f"{sig['max_similarity']:.3f}" if sig['max_similarity'] else '',
                    sig['verdict']
                ])

            # Pad so every row has the full 11 columns.
            while len(row) < 11:
                row.append('')

            writer.writerow(row)
    print(f"已儲存: {csv_path}")

    # 3. Human-readable Markdown report.
    md_path = f"{REPORT_DIR}/pdf_signature_verdict_report.md"
    with open(md_path, 'w', encoding='utf-8') as f:
        f.write("# PDF 簽名真偽判定報告\n\n")
        f.write(f"生成時間: {timestamp}\n\n")

        f.write("## 判定標準\n\n")
        f.write(f"- **複製貼上 (copy)**: 與同一會計師其他簽名相似度 ≥ {THRESHOLD_COPY}\n")
        f.write(f"- **親簽 (authentic)**: 與同一會計師其他簽名相似度 ≤ {THRESHOLD_AUTHENTIC}\n")
        f.write(f"- **不確定 (uncertain)**: 相似度介於 {THRESHOLD_AUTHENTIC} ~ {THRESHOLD_COPY}\n")
        f.write(f"- **無法判定 (unknown)**: 該會計師只有此一份簽名,無法比對\n\n")

        f.write("## 整體統計\n\n")
        f.write("### PDF 判定結果\n\n")
        f.write("| 判定 | 數量 | 百分比 |\n")
        f.write("|------|------|--------|\n")

        total = stats['total_pdfs']
        for verdict in ['copy', 'uncertain', 'authentic', 'unknown']:
            count = stats['pdf_verdicts'].get(verdict, 0)
            pct = count / total * 100 if total > 0 else 0
            # Chinese display labels for the four verdict codes.
            label = {
                'copy': '複製貼上',
                'authentic': '親簽',
                'uncertain': '不確定',
                'unknown': '無法判定'
            }.get(verdict, verdict)
            f.write(f"| {label} | {count:,} | {pct:.1f}% |\n")

        f.write(f"\n**總計: {total:,} 份 PDF**\n")

        f.write("\n### 簽名判定結果\n\n")
        f.write("| 判定 | 數量 | 百分比 |\n")
        f.write("|------|------|--------|\n")

        sig_total = sum(stats['signature_verdicts'].values())
        for verdict in ['copy', 'uncertain', 'authentic', 'unknown']:
            count = stats['signature_verdicts'].get(verdict, 0)
            pct = count / sig_total * 100 if sig_total > 0 else 0
            label = {
                'copy': '複製貼上',
                'authentic': '親簽',
                'uncertain': '不確定',
                'unknown': '無法判定'
            }.get(verdict, verdict)
            f.write(f"| {label} | {count:,} | {pct:.1f}% |\n")

        f.write(f"\n**總計: {sig_total:,} 個簽名**\n")

        f.write("\n### 按事務所統計\n\n")
        f.write("| 事務所 | 複製貼上 | 不確定 | 親簽 | 無法判定 | 總計 |\n")
        f.write("|--------|----------|--------|------|----------|------|\n")

        # Firms ordered by total PDF count; only the top 20 are listed.
        firms_sorted = sorted(stats['by_firm'].items(),
                              key=lambda x: sum(x[1].values()), reverse=True)

        for firm, verdicts in firms_sorted[:20]:
            copy_n = verdicts.get('copy', 0)
            uncertain_n = verdicts.get('uncertain', 0)
            authentic_n = verdicts.get('authentic', 0)
            unknown_n = verdicts.get('unknown', 0)
            total_n = copy_n + uncertain_n + authentic_n + unknown_n
            f.write(f"| {firm} | {copy_n:,} | {uncertain_n:,} | {authentic_n:,} | {unknown_n:,} | {total_n:,} |\n")

    print(f"已儲存: {md_path}")

    return stats
|
||||
|
||||
|
||||
def update_database(results):
    """Write per-signature verdicts and max similarities back to the DB."""
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()

    # Add the columns one at a time.  BUGFIX: previously a single try block
    # with a bare `except:` covered both ALTERs, so if the first column
    # already existed the second was never added — and any unrelated error
    # was silently swallowed.  Catch only the "duplicate column"
    # OperationalError, per statement.
    for ddl in (
        "ALTER TABLE signatures ADD COLUMN signature_verdict TEXT",
        "ALTER TABLE signatures ADD COLUMN max_similarity_to_same_accountant REAL",
    ):
        try:
            cur.execute(ddl)
        except sqlite3.OperationalError:
            pass  # column already exists

    # Batch update every classified signature.
    cur.executemany("""
        UPDATE signatures
        SET signature_verdict = ?, max_similarity_to_same_accountant = ?
        WHERE signature_id = ?
    """, [(sig['verdict'], sig['max_similarity'], sig['signature_id'])
          for r in results for sig in r['signatures']])

    conn.commit()
    conn.close()
    print("資料庫已更新")
|
||||
|
||||
|
||||
def main():
    """Stage 4 driver: load features, judge every PDF, report, persist."""
    banner = "=" * 60
    print(banner)
    print("第四階段:PDF 簽名真偽判定")
    print(banner)

    # Load normalised features and the PDF/accountant indexes.
    features_norm, sig_data, pdf_signatures, acc_signatures, pdf_info, sig_id_to_idx = load_data()
    print(f"PDF 數: {len(pdf_signatures)}")
    print(f"有效簽名: {len(sig_data)}")

    # Classify every signature and derive per-PDF verdicts.
    print("\n開始分析...")
    results = analyze_all_pdfs(
        features_norm, sig_data, pdf_signatures, acc_signatures, pdf_info, sig_id_to_idx
    )

    stats = generate_statistics(results)

    # Persist reports, then write verdicts back to the database.
    print("\n儲存結果...")
    save_results(results, stats)

    update_database(results)

    print("\n" + banner)
    print("完成!")
    print(banner)
    print(f"\nPDF 判定結果:")
    print(f" 複製貼上: {stats['pdf_verdicts'].get('copy', 0):,}")
    print(f" 不確定: {stats['pdf_verdicts'].get('uncertain', 0):,}")
    print(f" 親簽: {stats['pdf_verdicts'].get('authentic', 0):,}")
    print(f" 無法判定: {stats['pdf_verdicts'].get('unknown', 0):,}")


if __name__ == '__main__':
    main()
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,319 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Compute SSIM and pHash for all signature pairs (closest match per accountant).
|
||||
Uses multiprocessing for parallel image loading and computation.
|
||||
Saves results to database and outputs complete CSV.
|
||||
"""
|
||||
|
||||
import sqlite3
|
||||
import numpy as np
|
||||
import cv2
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import csv
|
||||
import time
|
||||
from datetime import datetime
|
||||
from collections import defaultdict
|
||||
from multiprocessing import Pool, cpu_count
|
||||
from pathlib import Path
|
||||
|
||||
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
|
||||
IMAGE_DIR = '/Volumes/NV2/PDF-Processing/yolo-signatures/images'
|
||||
OUTPUT_CSV = '/Volumes/NV2/PDF-Processing/signature-analysis/reports/complete_pdf_report.csv'
|
||||
CHECKPOINT_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/ssim_checkpoint.json'
|
||||
NUM_WORKERS = max(1, cpu_count() - 2) # Leave 2 cores free
|
||||
BATCH_SIZE = 1000
|
||||
|
||||
|
||||
def compute_phash(img, hash_size=8):
    """Compute a difference hash of a grayscale image.

    NOTE(review): despite the name, this is a dHash (horizontal
    gradient-sign hash), not a DCT-based pHash.  Each bit records whether a
    pixel is brighter than its left neighbour after resizing to
    (hash_size+1) x hash_size.

    Returns:
        Flat boolean array of hash_size * hash_size bits; compare two hashes
        with a Hamming distance (count of differing bits).
    """
    resized = cv2.resize(img, (hash_size + 1, hash_size))
    diff = resized[:, 1:] > resized[:, :-1]
    return diff.flatten()
|
||||
|
||||
|
||||
def compute_pair_ssim(args):
    """Compute SSIM, hash distance and histogram correlation for one image pair.

    Designed as a multiprocessing worker: takes a single packed argument.

    Args:
        args: (sig_id, file1, file2, cosine_sim) — filenames relative to
            IMAGE_DIR; cosine_sim is carried through to the result unchanged.

    Returns:
        Result dict; metric fields stay None (and pixel_identical False)
        when an image is unreadable, too small, or a metric computation
        fails — the function is best-effort by design and never raises.
    """
    sig_id, file1, file2, cosine_sim = args

    path1 = os.path.join(IMAGE_DIR, file1)
    path2 = os.path.join(IMAGE_DIR, file2)

    # Defaults: metrics remain None unless successfully computed below.
    result = {
        'signature_id': sig_id,
        'match_file': file2,
        'cosine_similarity': cosine_sim,
        'ssim': None,
        'phash_distance': None,
        'histogram_corr': None,
        'pixel_identical': False,
    }

    try:
        img1 = cv2.imread(path1, cv2.IMREAD_GRAYSCALE)
        img2 = cv2.imread(path2, cv2.IMREAD_GRAYSCALE)

        if img1 is None or img2 is None:
            return result

        # Bring both images to the same (smaller of the two) dimensions.
        h = min(img1.shape[0], img2.shape[0])
        w = min(img1.shape[1], img2.shape[1])
        if h < 3 or w < 3:
            return result

        img1_r = cv2.resize(img1, (w, h))
        img2_r = cv2.resize(img2, (w, h))

        # Exact pixel-level equality after resizing.
        result['pixel_identical'] = bool(np.array_equal(img1_r, img2_r))

        # SSIM (scikit-image); the window must be odd and fit both images.
        try:
            from skimage.metrics import structural_similarity as ssim
            win_size = min(7, min(h, w))
            if win_size % 2 == 0:
                win_size -= 1
            if win_size >= 3:
                result['ssim'] = float(ssim(img1_r, img2_r, win_size=win_size))
            else:
                result['ssim'] = None
        except Exception:
            # skimage missing or SSIM failed — leave the field as None.
            result['ssim'] = None

        # Grayscale histogram correlation (1.0 = identical distributions).
        hist1 = cv2.calcHist([img1_r], [0], None, [256], [0, 256])
        hist2 = cv2.calcHist([img2_r], [0], None, [256], [0, 256])
        result['histogram_corr'] = float(cv2.compareHist(hist1, hist2, cv2.HISTCMP_CORREL))

        # Hamming distance between the two difference hashes.
        h1 = compute_phash(img1_r)
        h2 = compute_phash(img2_r)
        result['phash_distance'] = int(np.sum(h1 != h2))

    except Exception as e:
        # Best-effort worker: any failure returns the partial result as-is.
        pass

    return result
|
||||
|
||||
|
||||
def load_checkpoint():
    """Return the set of signature IDs recorded in the checkpoint file.

    Yields an empty set when no checkpoint file exists yet.
    """
    if not os.path.exists(CHECKPOINT_PATH):
        return set()
    with open(CHECKPOINT_PATH, 'r') as f:
        payload = json.load(f)
    return set(payload.get('processed_ids', []))
||||
def save_checkpoint(processed_ids):
    """Write the processed signature IDs plus a timestamp as JSON."""
    state = {
        'processed_ids': list(processed_ids),
        'timestamp': str(datetime.now()),
    }
    with open(CHECKPOINT_PATH, 'w') as f:
        json.dump(state, f)
||||
def _ensure_report_columns(cur):
    """Add the SSIM/pHash result columns to ``signatures`` if missing.

    SQLite has no ``ADD COLUMN IF NOT EXISTS``; each ALTER is attempted and
    the ``OperationalError`` raised for an already-existing column (the
    expected case on re-runs) is ignored.  Only ``OperationalError`` is
    swallowed — a bare ``except:`` here would also hide KeyboardInterrupt
    and genuine database failures.
    """
    column_ddl = (
        'ALTER TABLE signatures ADD COLUMN ssim_to_closest REAL',
        'ALTER TABLE signatures ADD COLUMN phash_distance_to_closest INTEGER',
        'ALTER TABLE signatures ADD COLUMN histogram_corr_to_closest REAL',
        'ALTER TABLE signatures ADD COLUMN pixel_identical_to_closest INTEGER',
        'ALTER TABLE signatures ADD COLUMN closest_match_file TEXT',
    )
    for ddl in column_ddl:
        try:
            cur.execute(ddl)
        except sqlite3.OperationalError:
            pass  # column already exists from a previous run


def main():
    """Compute SSIM/pHash/histogram metrics for each signature's closest
    same-accountant match, persist them to SQLite, and export a CSV.

    Progress is checkpointed every BATCH_SIZE results so an interrupted run
    can resume without recomputing finished pairs; the checkpoint file is
    removed once the run completes.
    """
    start_time = time.time()
    print("=" * 70)
    print("SSIM & pHash Computation for All Signature Pairs")
    print(f"Workers: {NUM_WORKERS}")
    print("=" * 70)

    # --- Step 1: Load data ---
    print("\n[1/4] Loading data from database...")
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()

    cur.execute('''
        SELECT signature_id, image_filename, assigned_accountant, feature_vector
        FROM signatures
        WHERE feature_vector IS NOT NULL AND assigned_accountant IS NOT NULL
    ''')
    rows = cur.fetchall()

    sig_ids = []
    filenames = []
    accountants = []
    features = []

    for row in rows:
        sig_ids.append(row[0])
        filenames.append(row[1])
        accountants.append(row[2])
        # Feature vectors are stored as raw float32 bytes in the BLOB column.
        features.append(np.frombuffer(row[3], dtype=np.float32))

    features = np.array(features)
    print(f" Loaded {len(sig_ids)} signatures")

    # --- Step 2: Find closest match per signature ---
    print("\n[2/4] Finding closest match per signature (same accountant)...")
    acct_groups = defaultdict(list)
    for i, acct in enumerate(accountants):
        acct_groups[acct].append(i)

    # Load checkpoint of already-finished signature IDs.
    processed_ids = load_checkpoint()
    print(f" Checkpoint: {len(processed_ids)} already processed")

    # Prepare tasks: for each signature, its most-similar sibling from the
    # same accountant.  The dot product is used as the similarity score —
    # presumably the vectors are L2-normalized upstream so this equals
    # cosine similarity; TODO confirm in the feature-extraction step.
    tasks = []
    for acct, indices in acct_groups.items():
        if len(indices) < 2:
            continue
        vecs = features[indices]
        sim_matrix = vecs @ vecs.T
        np.fill_diagonal(sim_matrix, -1)  # Exclude self

        for local_i, global_i in enumerate(indices):
            if sig_ids[global_i] in processed_ids:
                continue
            best_local = np.argmax(sim_matrix[local_i])
            best_global = indices[best_local]
            best_sim = float(sim_matrix[local_i, best_local])
            tasks.append((
                sig_ids[global_i],
                filenames[global_i],
                filenames[best_global],
                best_sim
            ))

    print(f" Tasks to process: {len(tasks)}")

    # --- Step 3: Compute SSIM/pHash in parallel ---
    print(f"\n[3/4] Computing SSIM & pHash ({len(tasks)} pairs, {NUM_WORKERS} workers)...")

    # Add SSIM result columns to the database if they do not exist yet.
    _ensure_report_columns(cur)
    conn.commit()

    total = len(tasks)
    done = 0
    batch_results = []

    with Pool(NUM_WORKERS) as pool:
        for result in pool.imap_unordered(compute_pair_ssim, tasks, chunksize=50):
            batch_results.append(result)
            done += 1

            # Flush in batches so progress survives an interruption.
            if done % BATCH_SIZE == 0 or done == total:
                # Save batch to database
                for r in batch_results:
                    cur.execute('''
                        UPDATE signatures SET
                            ssim_to_closest = ?,
                            phash_distance_to_closest = ?,
                            histogram_corr_to_closest = ?,
                            pixel_identical_to_closest = ?,
                            closest_match_file = ?
                        WHERE signature_id = ?
                    ''', (
                        r['ssim'],
                        r['phash_distance'],
                        r['histogram_corr'],
                        1 if r['pixel_identical'] else 0,
                        r['match_file'],
                        r['signature_id']
                    ))
                    processed_ids.add(r['signature_id'])
                conn.commit()
                save_checkpoint(processed_ids)
                batch_results = []

                elapsed = time.time() - start_time
                rate = done / elapsed
                eta = (total - done) / rate if rate > 0 else 0
                print(f" {done:,}/{total:,} ({100*done/total:.1f}%) "
                      f"| {rate:.1f} pairs/s | ETA: {eta/60:.1f} min")

    # --- Step 4: Generate complete CSV ---
    print(f"\n[4/4] Generating complete CSV...")

    cur.execute('''
        SELECT
            s.source_pdf,
            s.year_month,
            s.serial_number,
            s.doc_type,
            s.page_number,
            s.sig_index,
            s.image_filename,
            s.assigned_accountant,
            s.excel_accountant1,
            s.excel_accountant2,
            s.excel_firm,
            s.detection_confidence,
            s.signature_verdict,
            s.max_similarity_to_same_accountant,
            s.ssim_to_closest,
            s.phash_distance_to_closest,
            s.histogram_corr_to_closest,
            s.pixel_identical_to_closest,
            s.closest_match_file,
            a.risk_level,
            a.mean_similarity as acct_mean_similarity,
            a.ratio_gt_95 as acct_ratio_gt_95
        FROM signatures s
        LEFT JOIN accountants a ON s.assigned_accountant = a.name
        ORDER BY s.source_pdf, s.sig_index
    ''')

    # Header names must stay aligned with the SELECT column order above.
    columns = [
        'source_pdf', 'year_month', 'serial_number', 'doc_type',
        'page_number', 'sig_index', 'image_filename',
        'assigned_accountant', 'excel_accountant1', 'excel_accountant2', 'excel_firm',
        'detection_confidence', 'signature_verdict',
        'max_cosine_similarity', 'ssim_to_closest', 'phash_distance_to_closest',
        'histogram_corr_to_closest', 'pixel_identical_to_closest', 'closest_match_file',
        'accountant_risk_level', 'accountant_mean_similarity', 'accountant_ratio_gt_95'
    ]

    with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(columns)
        for row in cur:
            writer.writerow(row)

    # Count rows for the final summary.
    cur.execute('SELECT COUNT(*) FROM signatures')
    total_sigs = cur.fetchone()[0]
    cur.execute('SELECT COUNT(DISTINCT source_pdf) FROM signatures')
    total_pdfs = cur.fetchone()[0]

    conn.close()

    elapsed = time.time() - start_time
    print(f"\n{'='*70}")
    print(f"Complete!")
    print(f" Total signatures: {total_sigs:,}")
    print(f" Total PDFs: {total_pdfs:,}")
    print(f" Output: {OUTPUT_CSV}")
    print(f" Time: {elapsed/60:.1f} minutes")
    print(f"{'='*70}")

    # Clean up checkpoint — the run finished, so resume data is obsolete.
    if os.path.exists(CHECKPOINT_PATH):
        os.remove(CHECKPOINT_PATH)


if __name__ == '__main__':
    main()
|
||||
@@ -0,0 +1,407 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Generate PDF-level aggregated report with multi-method verdicts.
|
||||
One row per PDF with all Group A-F columns plus new SSIM/pHash/combined verdicts.
|
||||
"""
|
||||
|
||||
import sqlite3
|
||||
import csv
|
||||
import numpy as np
|
||||
from datetime import datetime
|
||||
from collections import defaultdict
|
||||
|
||||
# Input database and output report location.
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
OUTPUT_CSV = '/Volumes/NV2/PDF-Processing/signature-analysis/reports/pdf_level_complete_report.csv'

# Thresholds from statistical analysis
COSINE_THRESHOLD = 0.95      # cosine similarity above this counts as copy evidence
COSINE_STATISTICAL = 0.944   # mu + 2*sigma
KDE_CROSSOVER = 0.838        # KDE crossover between score distributions; below = likely genuine
SSIM_HIGH = 0.95             # SSIM above this counts as copy evidence
SSIM_MEDIUM = 0.80           # SSIM above this is only "suspicious", not "copy"
PHASH_IDENTICAL = 0          # pHash Hamming distance 0 = perceptually identical
PHASH_SIMILAR = 5            # pHash distance <= 5 treated as near-duplicate
|
||||
|
||||
def classify_overall(max_cosine, max_ssim, min_phash, has_pixel_identical, *,
                     cosine_threshold=0.95, kde_crossover=0.838,
                     ssim_high=0.95, phash_identical=0, phash_similar=5):
    """Combine the per-method signals into one multi-method verdict.

    Args:
        max_cosine: Highest cosine similarity among the PDF's signatures,
            or None when unavailable.
        max_ssim: Highest SSIM to the closest same-accountant signature,
            or None.
        min_phash: Smallest pHash Hamming distance, or None.
        has_pixel_identical: Truthy when any signature is pixel-identical
            to its closest match; None means "no data".
        cosine_threshold, kde_crossover, ssim_high, phash_identical,
            phash_similar: Calibrated decision thresholds (keyword-only);
            the defaults mirror the module-level constants.

    Returns:
        Tuple ``(verdict, confidence, evidence_copy, total_methods)`` where
        verdict is one of 'definite_copy', 'very_likely_copy', 'likely_copy',
        'uncertain', 'likely_genuine', 'unknown'; confidence is one of
        'very_high', 'high', 'medium', 'low', 'none'; evidence_copy is the
        number of methods voting "copy"; total_methods is the number of
        methods that had usable data.
    """
    evidence_copy = 0
    total_methods = 0

    # Method 1: cosine similarity of the feature vectors.
    if max_cosine is not None:
        total_methods += 1
        if max_cosine > cosine_threshold:
            evidence_copy += 1

    # Method 2: SSIM against the closest match.
    if max_ssim is not None:
        total_methods += 1
        if max_ssim > ssim_high:
            evidence_copy += 1

    # Method 3: pHash Hamming distance.
    if min_phash is not None:
        total_methods += 1
        if min_phash <= phash_identical:
            evidence_copy += 1

    # Method 4: exact pixel identity.
    if has_pixel_identical is not None:
        total_methods += 1
        if has_pixel_identical:
            evidence_copy += 1

    # Decision ladder, strongest evidence first: pixel identity, then
    # SSIM+pHash agreement, then the copy-vote count, then cosine-only
    # tiers.  Only copy evidence feeds the verdict — the "genuine" side
    # is inferred from max_cosine falling below the KDE crossover.
    if has_pixel_identical:
        verdict, confidence = 'definite_copy', 'very_high'
    elif (max_ssim is not None and max_ssim > ssim_high
            and min_phash is not None and min_phash <= phash_similar):
        verdict, confidence = 'definite_copy', 'very_high'
    elif evidence_copy >= 3:
        verdict, confidence = 'very_likely_copy', 'high'
    elif evidence_copy >= 2:
        verdict, confidence = 'likely_copy', 'medium'
    elif max_cosine is not None and max_cosine > cosine_threshold:
        verdict, confidence = 'likely_copy', 'medium'
    elif max_cosine is not None and max_cosine > kde_crossover:
        verdict, confidence = 'uncertain', 'low'
    elif max_cosine is not None and max_cosine <= kde_crossover:
        verdict, confidence = 'likely_genuine', 'medium'
    else:
        verdict, confidence = 'unknown', 'none'

    return verdict, confidence, evidence_copy, total_methods
|
||||
def main():
    """Build the PDF-level aggregated report: one CSV row per PDF.

    Loads every signature row (joined with per-accountant risk stats),
    groups them by source PDF, derives per-method verdicts (SSIM, pHash,
    histogram correlation, pixel identity, statistical threshold, KDE
    crossover) plus the multi-method combined verdict, writes the CSV,
    and prints distribution summaries to the console.
    """
    print("=" * 70)
    print("PDF-Level Aggregated Report Generator")
    print("=" * 70)

    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()

    # Load all signature data grouped by PDF.
    # Result-tuple indices used throughout the aggregation below:
    #   0 source_pdf, 1 year_month, 2 serial_number, 3 doc_type,
    #   4 page_number, 5 sig_index, 6 assigned_accountant,
    #   7 excel_accountant1, 8 excel_accountant2, 9 excel_firm,
    #   10 detection_confidence, 11 signature_verdict,
    #   12 max_similarity_to_same_accountant, 13 ssim_to_closest,
    #   14 phash_distance_to_closest, 15 histogram_corr_to_closest,
    #   16 pixel_identical_to_closest, 17 risk_level, 18 mean_similarity,
    #   19 ratio_gt_95, 20 signature_count
    print("\n[1/3] Loading signature data...")
    cur.execute('''
        SELECT
            s.source_pdf,
            s.year_month,
            s.serial_number,
            s.doc_type,
            s.page_number,
            s.sig_index,
            s.assigned_accountant,
            s.excel_accountant1,
            s.excel_accountant2,
            s.excel_firm,
            s.detection_confidence,
            s.signature_verdict,
            s.max_similarity_to_same_accountant,
            s.ssim_to_closest,
            s.phash_distance_to_closest,
            s.histogram_corr_to_closest,
            s.pixel_identical_to_closest,
            a.risk_level,
            a.mean_similarity,
            a.ratio_gt_95,
            a.signature_count
        FROM signatures s
        LEFT JOIN accountants a ON s.assigned_accountant = a.name
        ORDER BY s.source_pdf, s.sig_index
    ''')

    # Group by PDF
    pdf_data = defaultdict(list)
    for row in cur:
        pdf_data[row[0]].append(row)

    print(f" {len(pdf_data)} PDFs loaded")

    # Generate PDF-level rows
    print("\n[2/3] Aggregating per-PDF statistics...")

    # Output column order; the hard-coded indices in the summary section
    # (27, 31, 34, 35, 36, 37 and the r[-4]/r[-3] lookups) depend on it.
    columns = [
        # Group A: PDF Identity
        'source_pdf', 'year_month', 'serial_number', 'doc_type',

        # Group B: Excel Master Data
        'accountant_1', 'accountant_2', 'firm',

        # Group C: YOLO Detection
        'n_signatures_detected', 'avg_detection_confidence',

        # Group D: Cosine Similarity
        'max_cosine_similarity', 'min_cosine_similarity', 'avg_cosine_similarity',

        # Group E: Verdict (original per-sig)
        'sig1_cosine_verdict', 'sig2_cosine_verdict',

        # Group F: Accountant Risk
        'acct1_name', 'acct1_risk_level', 'acct1_mean_similarity',
        'acct1_ratio_gt_95', 'acct1_total_signatures',
        'acct2_name', 'acct2_risk_level', 'acct2_mean_similarity',
        'acct2_ratio_gt_95', 'acct2_total_signatures',

        # Group G: SSIM (NEW)
        'max_ssim', 'min_ssim', 'avg_ssim',
        'verdict_ssim',

        # Group H: pHash (NEW)
        'min_phash_distance', 'max_phash_distance', 'avg_phash_distance',
        'verdict_phash',

        # Group I: Histogram Correlation (NEW)
        'max_histogram_corr', 'avg_histogram_corr',

        # Group J: Pixel Identity (NEW)
        'has_pixel_identical',
        'verdict_pixel',

        # Group K: Statistical Threshold (NEW)
        'verdict_statistical',  # Based on mu+2sigma (0.944)

        # Group L: KDE Crossover (NEW)
        'verdict_kde',  # Based on KDE crossover (0.838)

        # Group M: Multi-Method Combined (NEW)
        'overall_verdict',
        'confidence_level',
        'n_methods_copy',
        'n_methods_total',
    ]

    rows = []
    for pdf_name, sigs in pdf_data.items():
        # Group A: Identity (from first signature)
        first = sigs[0]
        year_month = first[1]
        serial_number = first[2]
        doc_type = first[3]

        # Group B: Excel data
        excel_acct1 = first[7]
        excel_acct2 = first[8]
        excel_firm = first[9]

        # Group C: Detection
        n_sigs = len(sigs)
        confidences = [s[10] for s in sigs if s[10] is not None]
        avg_conf = np.mean(confidences) if confidences else None

        # Group D: Cosine similarity
        cosines = [s[12] for s in sigs if s[12] is not None]
        max_cosine = max(cosines) if cosines else None
        min_cosine = min(cosines) if cosines else None
        avg_cosine = np.mean(cosines) if cosines else None

        # Group E: Per-sig verdicts
        verdicts = [s[11] for s in sigs]
        sig1_verdict = verdicts[0] if len(verdicts) > 0 else None
        sig2_verdict = verdicts[1] if len(verdicts) > 1 else None

        # Group F: Accountant risk - separate for acct1 and acct2
        # Match by assigned_accountant to excel_accountant1/2
        acct1_info = {'name': None, 'risk': None, 'mean_sim': None, 'ratio': None, 'count': None}
        acct2_info = {'name': None, 'risk': None, 'mean_sim': None, 'ratio': None, 'count': None}

        for s in sigs:
            assigned = s[6]
            if assigned and assigned == excel_acct1 and acct1_info['name'] is None:
                acct1_info = {
                    'name': assigned, 'risk': s[17],
                    'mean_sim': s[18], 'ratio': s[19], 'count': s[20]
                }
            elif assigned and assigned == excel_acct2 and acct2_info['name'] is None:
                acct2_info = {
                    'name': assigned, 'risk': s[17],
                    'mean_sim': s[18], 'ratio': s[19], 'count': s[20]
                }
            # NOTE(review): the two fallbacks below fill an empty slot with
            # any assigned accountant that matched neither Excel name, so
            # slot assignment is first-come-first-served in that case —
            # confirm this is the intended tie-breaking.
            elif assigned and acct1_info['name'] is None:
                acct1_info = {
                    'name': assigned, 'risk': s[17],
                    'mean_sim': s[18], 'ratio': s[19], 'count': s[20]
                }
            elif assigned and acct2_info['name'] is None:
                acct2_info = {
                    'name': assigned, 'risk': s[17],
                    'mean_sim': s[18], 'ratio': s[19], 'count': s[20]
                }

        # Group G: SSIM
        ssims = [s[13] for s in sigs if s[13] is not None]
        max_ssim = max(ssims) if ssims else None
        min_ssim = min(ssims) if ssims else None
        avg_ssim = np.mean(ssims) if ssims else None

        if max_ssim is not None:
            if max_ssim > SSIM_HIGH:
                verdict_ssim = 'copy'
            elif max_ssim > SSIM_MEDIUM:
                verdict_ssim = 'suspicious'
            else:
                verdict_ssim = 'genuine'
        else:
            verdict_ssim = None

        # Group H: pHash (lower distance = more similar)
        phashes = [s[14] for s in sigs if s[14] is not None]
        min_phash = min(phashes) if phashes else None
        max_phash = max(phashes) if phashes else None
        avg_phash = np.mean(phashes) if phashes else None

        if min_phash is not None:
            if min_phash <= PHASH_IDENTICAL:
                verdict_phash = 'copy'
            elif min_phash <= PHASH_SIMILAR:
                verdict_phash = 'suspicious'
            else:
                verdict_phash = 'genuine'
        else:
            verdict_phash = None

        # Group I: Histogram correlation
        histcorrs = [s[15] for s in sigs if s[15] is not None]
        max_histcorr = max(histcorrs) if histcorrs else None
        avg_histcorr = np.mean(histcorrs) if histcorrs else None

        # Group J: Pixel identical (stored as 0/1 integers in SQLite)
        pixel_ids = [s[16] for s in sigs if s[16] is not None]
        has_pixel = any(p == 1 for p in pixel_ids) if pixel_ids else False
        verdict_pixel = 'copy' if has_pixel else 'genuine'

        # Group K: Statistical threshold (mu+2sigma = 0.944)
        if max_cosine is not None:
            if max_cosine > COSINE_STATISTICAL:
                verdict_stat = 'copy'
            elif max_cosine > KDE_CROSSOVER:
                verdict_stat = 'uncertain'
            else:
                verdict_stat = 'genuine'
        else:
            verdict_stat = None

        # Group L: KDE crossover (0.838)
        if max_cosine is not None:
            if max_cosine > KDE_CROSSOVER:
                verdict_kde = 'above_crossover'
            else:
                verdict_kde = 'below_crossover'
        else:
            verdict_kde = None

        # Group M: Multi-method combined
        overall, confidence, n_copy, n_total = classify_overall(
            max_cosine, max_ssim, min_phash, has_pixel)

        # Append in the exact order declared in `columns` above.
        rows.append([
            # A
            pdf_name, year_month, serial_number, doc_type,
            # B
            excel_acct1, excel_acct2, excel_firm,
            # C
            n_sigs, avg_conf,
            # D
            max_cosine, min_cosine, avg_cosine,
            # E
            sig1_verdict, sig2_verdict,
            # F
            acct1_info['name'], acct1_info['risk'], acct1_info['mean_sim'],
            acct1_info['ratio'], acct1_info['count'],
            acct2_info['name'], acct2_info['risk'], acct2_info['mean_sim'],
            acct2_info['ratio'], acct2_info['count'],
            # G
            max_ssim, min_ssim, avg_ssim, verdict_ssim,
            # H
            min_phash, max_phash, avg_phash, verdict_phash,
            # I
            max_histcorr, avg_histcorr,
            # J
            1 if has_pixel else 0, verdict_pixel,
            # K
            verdict_stat,
            # L
            verdict_kde,
            # M
            overall, confidence, n_copy, n_total,
        ])

    # Write CSV
    print(f"\n[3/3] Writing {len(rows)} PDF rows to CSV...")
    with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(columns)
        writer.writerows(rows)

    conn.close()

    # Print summary statistics
    # NOTE(review): the percentage prints below divide by len(rows) and
    # would raise ZeroDivisionError on an empty signatures table.
    print(f"\n{'='*70}")
    print("SUMMARY")
    print(f"{'='*70}")
    print(f"Total PDFs: {len(rows):,}")

    # Overall verdict distribution; r[-4] = overall_verdict, r[-3] = confidence_level.
    verdict_counts = defaultdict(int)
    confidence_counts = defaultdict(int)
    for r in rows:
        verdict_counts[r[-4]] += 1
        confidence_counts[r[-3]] += 1

    print(f"\n--- Overall Verdict Distribution ---")
    for v in ['definite_copy', 'very_likely_copy', 'likely_copy', 'uncertain', 'likely_genuine', 'unknown']:
        c = verdict_counts.get(v, 0)
        print(f" {v:20s}: {c:>6,} ({100*c/len(rows):5.1f}%)")

    print(f"\n--- Confidence Level Distribution ---")
    for c_level in ['very_high', 'high', 'medium', 'low', 'none']:
        c = confidence_counts.get(c_level, 0)
        print(f" {c_level:10s}: {c:>6,} ({100*c/len(rows):5.1f}%)")

    # Per-method verdict distribution
    # Column indices: verdict_ssim=27, verdict_phash=31, verdict_pixel=35, verdict_stat=36, verdict_kde=37
    print(f"\n--- Per-Method Verdict Distribution ---")
    for col_idx, method_name in [(27, 'SSIM'), (31, 'pHash'), (35, 'Pixel'), (36, 'Statistical'), (37, 'KDE')]:
        counts = defaultdict(int)
        for r in rows:
            counts[r[col_idx]] += 1
        print(f"\n {method_name}:")
        for k, v in sorted(counts.items(), key=lambda x: -x[1]):
            print(f" {str(k):20s}: {v:>6,} ({100*v/len(rows):5.1f}%)")

    # Cross-method agreement among PDFs the cosine method flags as copies.
    # r[9] = max_cosine_similarity, r[27] = verdict_ssim,
    # r[31] = verdict_phash, r[34] = has_pixel_identical (0/1).
    print(f"\n--- Method Agreement (cosine>0.95 PDFs) ---")
    cosine_copy = [r for r in rows if r[9] is not None and r[9] > COSINE_THRESHOLD]
    if cosine_copy:
        ssim_agree = sum(1 for r in cosine_copy if r[27] == 'copy')
        phash_agree = sum(1 for r in cosine_copy if r[31] == 'copy')
        pixel_agree = sum(1 for r in cosine_copy if r[34] == 1)
        print(f" PDFs with cosine > 0.95: {len(cosine_copy):,}")
        print(f" Also SSIM > 0.95: {ssim_agree:>6,} ({100*ssim_agree/len(cosine_copy):5.1f}%)")
        print(f" Also pHash = 0: {phash_agree:>6,} ({100*phash_agree/len(cosine_copy):5.1f}%)")
        print(f" Also pixel-identical: {pixel_agree:>4,} ({100*pixel_agree/len(cosine_copy):5.1f}%)")

    print(f"\nOutput: {OUTPUT_CSV}")
    print(f"{'='*70}")


if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user