Add Paper A (IEEE TAI) complete draft with Firm A-calibrated dual-method classification

Paper draft includes all sections (Abstract through Conclusion), 36 references,
and supporting scripts. Key methodology: Cosine similarity + dHash dual-method
verification with thresholds calibrated against known-replication firm (Firm A).

Includes:
- 8 section markdown files (paper_a_*.md)
- Ablation study script (ResNet-50 vs VGG-16 vs EfficientNet-B0)
- Recalibrated classification script (84,386 PDFs, 5-tier system)
- Figure generation and Word export scripts
- Citation renumbering script ([1]-[36])
- Signature analysis pipeline (12 steps)
- YOLO extraction scripts

Three rounds of AI review completed (GPT-5.4, Claude Opus 4.6, Gemini 3 Pro).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-06 23:05:33 +08:00
parent 21df0ff387
commit 939a348da4
33 changed files with 9315 additions and 0 deletions
+246
View File
@@ -0,0 +1,246 @@
#!/usr/bin/env python3
"""
Step 1: 建立 SQLite 資料庫,匯入簽名記錄
從 extraction_results.csv 匯入資料,展開每個圖片為獨立記錄
解析圖片檔名填充 year_month, sig_index
計算圖片尺寸 width, height
"""
import sqlite3
import pandas as pd
import cv2
import os
import re
from pathlib import Path
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
# --- Path configuration ---
# YOLO signature crops and the extraction CSV are read from the external
# volume; all analysis artifacts (including the SQLite DB) go to OUTPUT_DIR.
IMAGES_DIR = Path("/Volumes/NV2/PDF-Processing/yolo-signatures/images")
CSV_PATH = Path("/Volumes/NV2/PDF-Processing/yolo-signatures/extraction_results.csv")
OUTPUT_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis")
DB_PATH = OUTPUT_DIR / "signature_analysis.db"
def parse_image_filename(filename: str) -> dict:
    """Parse a signature crop filename into structured fields.

    Expected form: {YYYYMM}_{SERIAL}_{DOCTYPE}_page{PAGE}_sig{N}.png,
    e.g. 201301_2458_AI1_page4_sig1.png.

    Returns:
        dict with keys year_month, serial_number, doc_type, page_number,
        sig_index; every value is None when the name does not match.
    """
    # Strip the extension only when it is actually the suffix; the original
    # str.replace('.png', '') would also remove '.png' occurring mid-name.
    name = filename[:-4] if filename.endswith('.png') else filename
    match = re.match(r'^(\d{6})_([^_]+)_([^_]+)_page(\d+)_sig(\d+)$', name)
    if not match:
        # Unparseable name: keep the schema but signal "unknown" with Nones.
        return {
            'year_month': None,
            'serial_number': None,
            'doc_type': None,
            'page_number': None,
            'sig_index': None
        }
    year_month, serial, doc_type, page, sig_index = match.groups()
    return {
        'year_month': year_month,
        'serial_number': serial,
        'doc_type': doc_type,
        'page_number': int(page),
        'sig_index': int(sig_index)
    }
def get_image_dimensions(image_path: Path) -> tuple:
    """Return (width, height) of the image at image_path, or (None, None) if unreadable."""
    try:
        data = cv2.imread(str(image_path))
        if data is not None:
            height, width = data.shape[:2]
            return width, height
    except Exception:
        # Fall through: treat any read failure like a missing image.
        pass
    return None, None
def process_single_image(args: tuple) -> dict:
    """Build one DB record for a crop: filename metadata plus pixel dimensions.

    Args:
        args: (image_filename, source_pdf, confidence_avg) tuple.
    """
    image_filename, source_pdf, confidence_avg = args
    parsed = parse_image_filename(image_filename)
    width, height = get_image_dimensions(IMAGES_DIR / image_filename)
    # Assemble the record in the column order used by the INSERT later on.
    record = {'image_filename': image_filename, 'source_pdf': source_pdf}
    for field in ('year_month', 'serial_number', 'doc_type', 'page_number', 'sig_index'):
        record[field] = parsed[field]
    record['detection_confidence'] = confidence_avg
    record['image_width'] = width
    record['image_height'] = height
    return record
def create_database():
    """Create the signature analysis database and its schema (idempotent)."""
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    # Main table: one row per extracted signature image.
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS signatures (
            signature_id INTEGER PRIMARY KEY AUTOINCREMENT,
            image_filename TEXT UNIQUE NOT NULL,
            source_pdf TEXT NOT NULL,
            year_month TEXT,
            serial_number TEXT,
            doc_type TEXT,
            page_number INTEGER,
            sig_index INTEGER,
            detection_confidence REAL,
            image_width INTEGER,
            image_height INTEGER,
            accountant_name TEXT,
            accountant_id INTEGER,
            feature_vector BLOB,
            cluster_id INTEGER,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    ''')
    # Lookup indexes used by the later pipeline steps.
    for index_sql in (
        'CREATE INDEX IF NOT EXISTS idx_source_pdf ON signatures(source_pdf)',
        'CREATE INDEX IF NOT EXISTS idx_year_month ON signatures(year_month)',
        'CREATE INDEX IF NOT EXISTS idx_accountant_id ON signatures(accountant_id)',
    ):
        cursor.execute(index_sql)
    conn.commit()
    conn.close()
    print(f"資料庫已建立: {DB_PATH}")
def expand_csv_to_records(csv_path: Path) -> list:
    """Expand each CSV row into one tuple per extracted image.

    CSV columns: filename,page,num_signatures,confidence_avg,image_files
    where image_files is a comma-separated list.

    Returns:
        list of (image_filename, source_pdf, confidence_avg) tuples.
    """
    frame = pd.read_csv(csv_path)
    expanded = []
    for pdf_name, confidence, files_field in zip(
            frame['filename'], frame['confidence_avg'], frame['image_files']):
        # Rows without any extracted images carry NaN in image_files.
        if pd.isna(files_field):
            continue
        expanded.extend(
            (name.strip(), pdf_name, confidence) for name in files_field.split(',')
        )
    return expanded
def import_data():
    """Read the extraction CSV, enrich each image record, and bulk-insert into SQLite."""
    print("讀取 CSV 並展開記錄...")
    records = expand_csv_to_records(CSV_PATH)
    print(f"{len(records)} 張簽名圖片待處理")
    print("處理圖片資訊(讀取尺寸)...")
    processed_records = []
    # Reading image dimensions is I/O bound, so a thread pool speeds it up.
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = {executor.submit(process_single_image, r): r for r in records}
        for future in tqdm(as_completed(futures), total=len(records), desc="處理圖片"):
            result = future.result()
            processed_records.append(result)
    print("寫入資料庫...")
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    # Batch insert; OR IGNORE skips rows whose image_filename already exists.
    insert_sql = '''
        INSERT OR IGNORE INTO signatures (
            image_filename, source_pdf, year_month, serial_number, doc_type,
            page_number, sig_index, detection_confidence, image_width, image_height
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    '''
    batch_data = [
        (
            r['image_filename'], r['source_pdf'], r['year_month'], r['serial_number'],
            r['doc_type'], r['page_number'], r['sig_index'], r['detection_confidence'],
            r['image_width'], r['image_height']
        )
        for r in processed_records
    ]
    cursor.executemany(insert_sql, batch_data)
    conn.commit()
    # Summary statistics for the console report below.
    cursor.execute('SELECT COUNT(*) FROM signatures')
    total = cursor.fetchone()[0]
    cursor.execute('SELECT COUNT(DISTINCT source_pdf) FROM signatures')
    pdf_count = cursor.fetchone()[0]
    cursor.execute('SELECT COUNT(DISTINCT year_month) FROM signatures')
    period_count = cursor.fetchone()[0]
    cursor.execute('SELECT MIN(year_month), MAX(year_month) FROM signatures')
    min_date, max_date = cursor.fetchone()
    conn.close()
    print("\n" + "=" * 50)
    print("資料庫建立完成")
    print("=" * 50)
    print(f"簽名總數: {total:,}")
    print(f"PDF 檔案數: {pdf_count:,}")
    print(f"時間範圍: {min_date} ~ {max_date} ({period_count} 個月)")
    print(f"資料庫位置: {DB_PATH}")
def main():
    """Entry point: validate the source files, then build and populate the DB."""
    print("=" * 50)
    print("Step 1: 建立簽名分析資料庫")
    print("=" * 50)
    # Fail fast if either input location is missing.
    source_checks = (
        (CSV_PATH, f"錯誤: 找不到 CSV 檔案 {CSV_PATH}"),
        (IMAGES_DIR, f"錯誤: 找不到圖片目錄 {IMAGES_DIR}"),
    )
    for path, complaint in source_checks:
        if not path.exists():
            print(complaint)
            return
    # Schema first, then bulk import.
    create_database()
    import_data()


if __name__ == "__main__":
    main()
+241
View File
@@ -0,0 +1,241 @@
#!/usr/bin/env python3
"""
Step 2: 使用 ResNet-50 提取簽名圖片的特徵向量
預處理流程:
1. 載入圖片 (RGB)
2. 縮放至 224x224(保持比例,填充白色)
3. 正規化 (ImageNet mean/std)
4. 通過 ResNet-50 (去掉最後分類層)
5. L2 正規化
6. 輸出 2048 維特徵向量
"""
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import numpy as np
import cv2
import sqlite3
from pathlib import Path
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
# --- Path configuration ---
# Crops produced by the Step 1 pipeline; features and the DB live under OUTPUT_DIR.
IMAGES_DIR = Path("/Volumes/NV2/PDF-Processing/yolo-signatures/images")
OUTPUT_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis")
DB_PATH = OUTPUT_DIR / "signature_analysis.db"
FEATURES_PATH = OUTPUT_DIR / "features"
# --- Model configuration ---
BATCH_SIZE = 64  # images per forward pass
NUM_WORKERS = 4  # DataLoader worker processes
# Device preference: Apple MPS, then CUDA, else CPU.
DEVICE = torch.device("mps" if torch.backends.mps.is_available() else
                      "cuda" if torch.cuda.is_available() else "cpu")
class SignatureDataset(Dataset):
    """Dataset yielding (image tensor, filename) pairs for signature crops."""

    def __init__(self, image_paths: list, transform=None):
        # Paths are only resolved lazily, inside __getitem__.
        self.image_paths = image_paths
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        path = self.image_paths[idx]
        raw = cv2.imread(str(path))
        if raw is None:
            # Unreadable file: substitute an all-white 224x224 placeholder.
            rgb = np.full((224, 224, 3), 255, dtype=np.uint8)
        else:
            rgb = cv2.cvtColor(raw, cv2.COLOR_BGR2RGB)
        # Aspect-preserving resize onto a white 224x224 canvas.
        rgb = self.resize_with_padding(rgb, 224, 224)
        if self.transform:
            rgb = self.transform(rgb)
        return rgb, str(path.name)

    @staticmethod
    def resize_with_padding(img, target_w, target_h):
        """Scale img to fit the target box and centre it on a white canvas."""
        src_h, src_w = img.shape[:2]
        ratio = min(target_w / src_w, target_h / src_h)
        scaled_w, scaled_h = int(src_w * ratio), int(src_h * ratio)
        scaled = cv2.resize(img, (scaled_w, scaled_h), interpolation=cv2.INTER_AREA)
        board = np.full((target_h, target_w, 3), 255, dtype=np.uint8)
        left = (target_w - scaled_w) // 2
        top = (target_h - scaled_h) // 2
        board[top:top + scaled_h, left:left + scaled_w] = scaled
        return board
class FeatureExtractor:
    """Headless ResNet-50 mapping image batches to L2-normalized 2048-d vectors."""

    def __init__(self, device):
        self.device = device
        print(f"載入 ResNet-50 模型... (device: {device})")
        backbone = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
        # Drop the final FC classifier; keep everything through global pooling.
        headless = nn.Sequential(*list(backbone.children())[:-1])
        self.model = headless.to(device)
        self.model.eval()
        # Standard ImageNet normalization applied after ToTensor.
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225]
            )
        ])

    @torch.no_grad()
    def extract_batch(self, images):
        """Run one batch through the backbone; returns a [B, 2048] numpy array."""
        pooled = self.model(images.to(self.device))
        flat = pooled.squeeze(-1).squeeze(-1)  # [B, 2048, 1, 1] -> [B, 2048]
        unit = nn.functional.normalize(flat, p=2, dim=1)
        return unit.cpu().numpy()
def get_image_list_from_db(db_path=None):
    """Return every image_filename from the signatures table, ordered by id.

    Args:
        db_path: optional database path; defaults to the module-level DB_PATH.
            (Parameter added so the query is reusable/testable against other DBs.)

    Returns:
        list[str] of filenames in signature_id order (matches feature-row order).
    """
    conn = sqlite3.connect(DB_PATH if db_path is None else db_path)
    try:
        rows = conn.execute(
            'SELECT image_filename FROM signatures ORDER BY signature_id'
        ).fetchall()
    finally:
        # Always release the handle, even if the query fails.
        conn.close()
    return [filename for (filename,) in rows]
def save_features_to_db(features_dict: dict, db_path=None):
    """Persist feature vectors as BLOBs on their matching signatures rows.

    Args:
        features_dict: {image_filename: 1-D float numpy array}.
        db_path: optional database path; defaults to the module-level DB_PATH.

    Improvement vs. original: a single executemany with one commit replaces the
    per-row execute loop (and its tqdm progress bar), which is far faster for
    tens of thousands of rows; the connection is closed even on failure.
    """
    conn = sqlite3.connect(DB_PATH if db_path is None else db_path)
    try:
        conn.executemany(
            'UPDATE signatures SET feature_vector = ? WHERE image_filename = ?',
            [(vec.tobytes(), name) for name, vec in features_dict.items()],
        )
        conn.commit()
    finally:
        conn.close()
def main():
    """Extract a 2048-d feature vector for every signature and persist the results."""
    print("=" * 60)
    print("Step 2: ResNet-50 特徵向量提取")
    print("=" * 60)
    print(f"裝置: {DEVICE}")
    # Make sure the output directory exists.
    FEATURES_PATH.mkdir(parents=True, exist_ok=True)
    # Image list comes from the Step 1 database, ordered by signature_id.
    print("從資料庫讀取圖片列表...")
    filenames = get_image_list_from_db()
    print(f"{len(filenames):,} 張圖片待處理")
    image_paths = [IMAGES_DIR / f for f in filenames]
    extractor = FeatureExtractor(DEVICE)
    dataset = SignatureDataset(image_paths, transform=extractor.transform)
    dataloader = DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,  # preserve DB order so features align with filenames
        num_workers=NUM_WORKERS,
        pin_memory=True
    )
    # Extract features batch by batch.
    print(f"\n開始提取特徵 (batch_size={BATCH_SIZE})...")
    all_features = []
    all_filenames = []
    for images, batch_filenames in tqdm(dataloader, desc="提取特徵"):
        features = extractor.extract_batch(images)
        all_features.append(features)
        all_filenames.extend(batch_filenames)
    # Stack per-batch arrays into one [N, 2048] matrix.
    all_features = np.vstack(all_features)
    print(f"\n特徵矩陣形狀: {all_features.shape}")
    # Save a .npy backup alongside the database copy.
    npy_path = FEATURES_PATH / "signature_features.npy"
    np.save(npy_path, all_features)
    print(f"特徵向量已儲存: {npy_path} ({all_features.nbytes / 1e9:.2f} GB)")
    # The filename list preserves row order for later index lookups.
    filenames_path = FEATURES_PATH / "signature_filenames.txt"
    with open(filenames_path, 'w') as f:
        for fn in all_filenames:
            f.write(fn + '\n')
    print(f"檔名列表已儲存: {filenames_path}")
    # Mirror the vectors into the signatures table.
    print("\n更新資料庫中的特徵向量...")
    features_dict = dict(zip(all_filenames, all_features))
    save_features_to_db(features_dict)
    # Final statistics.
    print("\n" + "=" * 60)
    print("特徵提取完成")
    print("=" * 60)
    print(f"處理圖片數: {len(all_filenames):,}")
    print(f"特徵維度: {all_features.shape[1]}")
    print(f"特徵檔案: {npy_path}")
    print(f"檔案大小: {all_features.nbytes / 1e9:.2f} GB")
    # Sanity statistics over the whole matrix.
    print("\n特徵統計:")
    print(f" 平均值: {all_features.mean():.6f}")
    print(f" 標準差: {all_features.std():.6f}")
    print(f" 最小值: {all_features.min():.6f}")
    print(f" 最大值: {all_features.max():.6f}")
    # Every row should have L2 norm 1.0 after normalization.
    norms = np.linalg.norm(all_features, axis=1)
    print(f" L2 norm: {norms.mean():.6f} ± {norms.std():.6f}")


if __name__ == "__main__":
    main()
@@ -0,0 +1,368 @@
#!/usr/bin/env python3
"""
Step 3: 相似度分布探索
1. 隨機抽樣 100,000 對簽名
2. 計算 cosine similarity
3. 繪製直方圖分布
4. 找出高相似度對 (>0.95)
5. 分析高相似度對的來源
"""
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from tqdm import tqdm
import random
from collections import defaultdict
import json
# --- Path configuration ---
OUTPUT_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis")
FEATURES_PATH = OUTPUT_DIR / "features" / "signature_features.npy"
FILENAMES_PATH = OUTPUT_DIR / "features" / "signature_filenames.txt"
REPORTS_PATH = OUTPUT_DIR / "reports"
# --- Analysis configuration ---
NUM_RANDOM_PAIRS = 100000  # random pairs sampled for the distribution estimate
HIGH_SIMILARITY_THRESHOLD = 0.95  # cut-off for "high similarity"
VERY_HIGH_SIMILARITY_THRESHOLD = 0.99  # cut-off for near-duplicates
def load_data(features_path=None, filenames_path=None):
    """Load the feature matrix and its row-aligned filename list.

    Args:
        features_path: optional .npy path; defaults to FEATURES_PATH.
        filenames_path: optional text-file path; defaults to FILENAMES_PATH.
            (Parameters added so the loader can target other runs or tests.)

    Returns:
        (features ndarray, list[str] of filenames), row i of features
        corresponds to filenames[i].
    """
    features_path = FEATURES_PATH if features_path is None else Path(features_path)
    filenames_path = FILENAMES_PATH if filenames_path is None else Path(filenames_path)
    print("載入特徵向量...")
    features = np.load(features_path)
    print(f"特徵矩陣形狀: {features.shape}")
    print("載入檔名列表...")
    with open(filenames_path, 'r') as f:
        # Iterate the file directly; readlines() materialized it needlessly.
        filenames = [line.strip() for line in f]
    print(f"檔名數量: {len(filenames)}")
    return features, filenames
def parse_filename(filename: str) -> dict:
    """Split a crop filename into its components (Step 3 local variant).

    Example: 201301_2458_AI1_page4_sig1.png ->
    {'year_month': '201301', 'serial': '2458', 'doc_type': 'AI1',
     'page': '4', 'sig_index': '1'}.
    Unparseable names yield {'raw': filename}.
    """
    # Only strip a real '.png' suffix; str.replace would also remove
    # '.png' occurring in the middle of the name.
    stem = filename[:-4] if filename.endswith('.png') else filename
    parts = stem.split('_')
    if len(parts) >= 5:
        return {
            'year_month': parts[0],
            'serial': parts[1],
            'doc_type': parts[2],
            'page': parts[3].replace('page', ''),
            'sig_index': parts[4].replace('sig', '')
        }
    return {'raw': filename}
def cosine_similarity(v1, v2):
    """Cosine similarity of two already L2-normalized vectors (= inner product)."""
    return np.inner(v1, v2)
def random_sampling_analysis(features, filenames, n_pairs=100000):
    """Estimate the similarity distribution from n_pairs random signature pairs.

    Returns:
        (ndarray of similarities, list of sampled (i, j) index pairs).
    """
    print(f"\n隨機抽樣 {n_pairs:,} 對簽名...")
    total = len(filenames)
    sampled_sims = []
    sampled_pairs = []
    for _ in tqdm(range(n_pairs), desc="計算相似度"):
        # random.sample guarantees two distinct indices.
        a, b = random.sample(range(total), 2)
        sampled_pairs.append((a, b))
        sampled_sims.append(cosine_similarity(features[a], features[b]))
    return np.array(sampled_sims), sampled_pairs
def find_high_similarity_pairs(features, filenames, threshold=0.95, sample_size=100000):
    """Random-pair scan for signature pairs whose similarity exceeds threshold.

    Exhaustive n^2 comparison is infeasible at this scale, so sample_size
    random pairs are drawn and only the hits are kept.
    NOTE(review): main() uses systematic_high_similarity_search instead;
    this function appears unused.
    """
    print(f"\n搜尋相似度 > {threshold} 的簽名對...")
    total = len(filenames)
    hits = []
    for _ in tqdm(range(sample_size), desc="搜尋高相似度"):
        a, b = random.sample(range(total), 2)
        score = cosine_similarity(features[a], features[b])
        if score > threshold:
            hits.append({
                'idx1': a,
                'idx2': b,
                'file1': filenames[a],
                'file2': filenames[b],
                'similarity': float(score),
                'parsed1': parse_filename(filenames[a]),
                'parsed2': parse_filename(filenames[b])
            })
    return hits
def systematic_high_similarity_search(features, filenames, threshold=0.95, batch_size=1000):
    """For a random subset of query signatures, find all others above threshold.

    Samples up to 5000 query indices and compares each one against the full
    matrix with a single matrix-vector product; duplicate (i, j) pairs are
    deduplicated via a canonical ordered key. batch_size is currently unused
    (kept for interface compatibility).
    """
    print(f"\n系統化搜尋高相似度對 (threshold={threshold})...")
    print("這會對每個簽名找出最相似的候選...")
    total = len(filenames)
    found = []
    visited = set()
    queries = random.sample(range(total), min(5000, total))
    for q in tqdm(queries, desc="搜尋"):
        # One matrix-vector product gives similarities against everyone.
        scores = features @ features[q]
        for raw in np.where(scores > threshold)[0]:
            other = int(raw)
            if other == q:
                continue  # skip self-match
            key = (q, other) if q < other else (other, q)
            if key in visited:
                continue
            visited.add(key)
            found.append({
                'idx1': int(q),
                'idx2': other,
                'file1': filenames[q],
                'file2': filenames[other],
                'similarity': float(scores[other]),
                'parsed1': parse_filename(filenames[q]),
                'parsed2': parse_filename(filenames[other])
            })
    return found
def analyze_high_similarity_sources(high_sim_pairs):
    """Classify each high-similarity pair by how its two signatures relate.

    Buckets (first match wins): same PDF (year_month + serial + doc_type all
    equal), same month, same document type, or entirely different.
    """
    print("\n分析高相似度對的來源...")
    tally = {
        'same_pdf': 0,
        'same_year_month': 0,
        'same_doc_type': 0,
        'different_everything': 0,
        'total': len(high_sim_pairs)
    }
    for entry in high_sim_pairs:
        a = entry.get('parsed1', {})
        b = entry.get('parsed2', {})
        same_month = a.get('year_month') == b.get('year_month')
        same_type = a.get('doc_type') == b.get('doc_type')
        if same_month and same_type and a.get('serial') == b.get('serial'):
            tally['same_pdf'] += 1
        elif same_month:
            tally['same_year_month'] += 1
        elif same_type:
            tally['same_doc_type'] += 1
        else:
            tally['different_everything'] += 1
    return tally
def plot_similarity_distribution(similarities, output_path):
    """Plot the sampled similarity distribution: full range plus a zoom on > 0.8."""
    print("\n繪製分布圖...")
    try:
        # Work with a plain Python list to sidestep numpy/matplotlib dtype issues.
        sim_list = similarities.tolist()
        fig, axes = plt.subplots(1, 2, figsize=(14, 5))
        # Left panel: full distribution, with explicit bin edges.
        ax1 = axes[0]
        ax1.hist(sim_list, bins=np.linspace(min(sim_list), max(sim_list), 101).tolist(),
                 density=True, alpha=0.7, color='steelblue', edgecolor='white')
        ax1.axvline(x=0.95, color='red', linestyle='--', label='0.95 threshold')
        ax1.axvline(x=0.99, color='darkred', linestyle='--', label='0.99 threshold')
        ax1.set_xlabel('Cosine Similarity', fontsize=12)
        ax1.set_ylabel('Density', fontsize=12)
        ax1.set_title('Signature Similarity Distribution (Random Sampling)', fontsize=14)
        ax1.legend()
        # Mean/std annotation box in the top-left corner.
        mean_sim = float(np.mean(similarities))
        std_sim = float(np.std(similarities))
        ax1.annotate(f'Mean: {mean_sim:.4f}\nStd: {std_sim:.4f}',
                     xy=(0.02, 0.95), xycoords='axes fraction',
                     fontsize=10, verticalalignment='top',
                     bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
        # Right panel: zoom on the high-similarity tail (> 0.8).
        ax2 = axes[1]
        high_sim_list = [x for x in sim_list if x > 0.8]
        if len(high_sim_list) > 0:
            ax2.hist(high_sim_list, bins=np.linspace(0.8, max(high_sim_list), 51).tolist(),
                     density=True, alpha=0.7, color='coral', edgecolor='white')
        ax2.axvline(x=0.95, color='red', linestyle='--', label='0.95 threshold')
        ax2.axvline(x=0.99, color='darkred', linestyle='--', label='0.99 threshold')
        ax2.set_xlabel('Cosine Similarity', fontsize=12)
        ax2.set_ylabel('Density', fontsize=12)
        ax2.set_title('High Similarity Region (> 0.8)', fontsize=14)
        ax2.legend()
        # Share of sampled pairs above each threshold.
        pct_95 = int((similarities > 0.95).sum()) / len(similarities) * 100
        pct_99 = int((similarities > 0.99).sum()) / len(similarities) * 100
        ax2.annotate(f'> 0.95: {pct_95:.4f}%\n> 0.99: {pct_99:.4f}%',
                     xy=(0.98, 0.95), xycoords='axes fraction',
                     fontsize=10, verticalalignment='top', horizontalalignment='right',
                     bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
        plt.tight_layout()
        plt.savefig(output_path, dpi=150, bbox_inches='tight')
        plt.close()
        print(f"分布圖已儲存: {output_path}")
    except Exception as e:
        # Plotting is best-effort; the numeric report is produced regardless.
        print(f"繪圖失敗: {e}")
        print("跳過繪圖,繼續其他分析...")
def generate_statistics_report(similarities, high_sim_pairs, source_stats, output_path,
                               threshold=None):
    """Assemble the JSON statistics report and write it to output_path.

    Args:
        similarities: 1-D array of sampled cosine similarities.
        high_sim_pairs: pair dicts from the high-similarity search (each must
            carry a 'similarity' key).
        source_stats: tally produced by analyze_high_similarity_sources.
        output_path: destination JSON file.
        threshold: threshold recorded in the report; defaults to the module
            constant HIGH_SIMILARITY_THRESHOLD, resolved at call time so the
            function is usable/testable in isolation.

    Returns:
        The report dict that was serialized.
    """
    if threshold is None:
        threshold = HIGH_SIMILARITY_THRESHOLD
    report = {
        'random_sampling': {
            'n_pairs': len(similarities),
            'mean': float(np.mean(similarities)),
            'std': float(np.std(similarities)),
            'min': float(np.min(similarities)),
            'max': float(np.max(similarities)),
            'percentiles': {
                '25%': float(np.percentile(similarities, 25)),
                '50%': float(np.percentile(similarities, 50)),
                '75%': float(np.percentile(similarities, 75)),
                '90%': float(np.percentile(similarities, 90)),
                '95%': float(np.percentile(similarities, 95)),
                '99%': float(np.percentile(similarities, 99)),
            },
            'above_thresholds': {
                '>0.90': int((similarities > 0.90).sum()),
                '>0.95': int((similarities > 0.95).sum()),
                '>0.99': int((similarities > 0.99).sum()),
            }
        },
        'high_similarity_search': {
            'threshold': threshold,
            'pairs_found': len(high_sim_pairs),
            'source_analysis': source_stats,
            'top_10_pairs': sorted(high_sim_pairs, key=lambda x: x['similarity'], reverse=True)[:10]
        }
    }
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)
    print(f"統計報告已儲存: {output_path}")
    return report
def print_summary(report):
    """Print a console summary of the report built by generate_statistics_report."""
    bar = "=" * 70
    print("\n" + bar)
    print("相似度分布分析摘要")
    print(bar)
    sampling = report['random_sampling']
    print(f"\n隨機抽樣統計 ({sampling['n_pairs']:,} 對):")
    print(f" 平均相似度: {sampling['mean']:.4f}")
    print(f" 標準差: {sampling['std']:.4f}")
    print(f" 範圍: [{sampling['min']:.4f}, {sampling['max']:.4f}]")
    print(f"\n百分位數:")
    for label, value in sampling['percentiles'].items():
        print(f" {label}: {value:.4f}")
    print(f"\n高相似度對數量:")
    for label, count in sampling['above_thresholds'].items():
        share = count / sampling['n_pairs'] * 100
        print(f" {label}: {count:,} ({share:.4f}%)")
    search = report['high_similarity_search']
    print(f"\n系統化搜尋結果 (threshold={search['threshold']}):")
    print(f" 發現高相似度對: {search['pairs_found']:,}")
    if search['source_analysis']['total'] > 0:
        sa = search['source_analysis']
        print(f"\n來源分析:")
        print(f" 同一 PDF: {sa['same_pdf']} ({sa['same_pdf']/sa['total']*100:.1f}%)")
        print(f" 同月份: {sa['same_year_month']} ({sa['same_year_month']/sa['total']*100:.1f}%)")
        print(f" 同類型: {sa['same_doc_type']} ({sa['same_doc_type']/sa['total']*100:.1f}%)")
        print(f" 完全不同: {sa['different_everything']} ({sa['different_everything']/sa['total']*100:.1f}%)")
    if search['top_10_pairs']:
        print(f"\nTop 10 高相似度對:")
        for rank, pair in enumerate(search['top_10_pairs'], 1):
            print(f" {rank}. {pair['similarity']:.4f}")
            print(f" {pair['file1']}")
            print(f" {pair['file2']}")
def main():
    """Step 3 pipeline: sample distribution, plot, search, report, summarize."""
    print("=" * 70)
    print("Step 3: 相似度分布探索")
    print("=" * 70)
    REPORTS_PATH.mkdir(parents=True, exist_ok=True)
    features, filenames = load_data()
    # Distribution estimate from random pairs.
    similarities, _pair_indices = random_sampling_analysis(features, filenames, NUM_RANDOM_PAIRS)
    plot_similarity_distribution(
        similarities,
        REPORTS_PATH / "similarity_distribution.png"
    )
    # Exhaustive-per-query search for actual high-similarity pairs.
    pairs = systematic_high_similarity_search(
        features, filenames,
        threshold=HIGH_SIMILARITY_THRESHOLD
    )
    source_stats = analyze_high_similarity_sources(pairs)
    report = generate_statistics_report(
        similarities, pairs, source_stats,
        REPORTS_PATH / "similarity_statistics.json"
    )
    # Persist the raw pair list for the Step 4 visual report.
    pairs_file = REPORTS_PATH / "high_similarity_pairs.json"
    with open(pairs_file, 'w', encoding='utf-8') as fh:
        json.dump(pairs, fh, indent=2, ensure_ascii=False)
    print(f"高相似度對列表已儲存: {pairs_file}")
    print_summary(report)


if __name__ == "__main__":
    main()
@@ -0,0 +1,274 @@
#!/usr/bin/env python3
"""
Step 4: 生成高相似度案例的視覺化報告
讀取 high_similarity_pairs.json
為 Top N 高相似度對生成並排對比圖
生成 HTML 報告
"""
import json
import cv2
import numpy as np
from pathlib import Path
from tqdm import tqdm
import base64
from io import BytesIO
# --- Path configuration ---
IMAGES_DIR = Path("/Volumes/NV2/PDF-Processing/yolo-signatures/images")
REPORTS_PATH = Path("/Volumes/NV2/PDF-Processing/signature-analysis/reports")
HIGH_SIM_JSON = REPORTS_PATH / "high_similarity_pairs.json"
# --- Report configuration ---
TOP_N = 100  # number of top-ranked pairs rendered into the HTML report
def load_image(filename: str) -> np.ndarray:
    """Read a signature crop; unreadable files yield a blank 100x200 white image."""
    data = cv2.imread(str(IMAGES_DIR / filename))
    if data is None:
        return np.full((100, 200, 3), 255, dtype=np.uint8)
    return data
def create_comparison_image(file1: str, file2: str, similarity: float) -> np.ndarray:
    """Place two signature images side by side, separated by a grey bar.

    similarity is accepted for interface compatibility but is not drawn.
    """
    left = load_image(file1)
    right = load_image(file2)
    # Bring both images to a common height (at least 100 px), keeping aspect.
    target_h = max(left.shape[0], right.shape[0], 100)

    def _scale_to(img, height):
        h, w = img.shape[:2]
        if h == height:
            return img
        factor = height / h
        return cv2.resize(img, (int(w * factor), height))

    left = _scale_to(left, target_h)
    right = _scale_to(right, target_h)
    divider = np.full((target_h, 20, 3), 200, dtype=np.uint8)
    return np.hstack([left, divider, right])
def image_to_base64(img: np.ndarray) -> str:
    """Encode an image as PNG and return its base64 text representation."""
    encoded = cv2.imencode('.png', img)[1]
    return base64.b64encode(encoded).decode('utf-8')
def generate_html_report(pairs: list, output_path: Path):
    """Render the Top-N pair comparisons into a self-contained HTML report.

    Each pair card embeds its side-by-side comparison image as base64 PNG, so
    the output file needs no external assets.
    NOTE(review): the 659,111 pair count in the summary banner is hard-coded —
    confirm it still matches the current pairs file.
    """
    # Static header: styles plus the summary banner.
    html_content = """
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>簽名相似度分析報告 - 高相似度案例</title>
<style>
body {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
max-width: 1400px;
margin: 0 auto;
padding: 20px;
background-color: #f5f5f5;
}
h1 {
color: #333;
text-align: center;
border-bottom: 2px solid #666;
padding-bottom: 10px;
}
.summary {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 20px;
border-radius: 10px;
margin-bottom: 30px;
}
.summary h2 {
margin-top: 0;
}
.pair-card {
background: white;
border-radius: 10px;
padding: 20px;
margin-bottom: 20px;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
}
.pair-header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 15px;
padding-bottom: 10px;
border-bottom: 1px solid #eee;
}
.pair-number {
font-size: 1.2em;
font-weight: bold;
color: #333;
}
.similarity-badge {
background: #dc3545;
color: white;
padding: 5px 15px;
border-radius: 20px;
font-weight: bold;
}
.similarity-badge.high {
background: #dc3545;
}
.similarity-badge.very-high {
background: #8b0000;
}
.file-info {
font-family: monospace;
font-size: 0.9em;
color: #666;
margin-bottom: 10px;
}
.comparison-image {
max-width: 100%;
border: 1px solid #ddd;
border-radius: 5px;
}
.analysis {
margin-top: 15px;
padding: 10px;
background: #f8f9fa;
border-radius: 5px;
font-size: 0.9em;
}
.tag {
display: inline-block;
padding: 2px 8px;
border-radius: 3px;
margin-right: 5px;
font-size: 0.8em;
}
.tag-same-serial { background: #ffebee; color: #c62828; }
.tag-same-month { background: #fff3e0; color: #e65100; }
.tag-diff { background: #e8f5e9; color: #2e7d32; }
</style>
</head>
<body>
<h1>簽名相似度分析報告 - 高相似度案例</h1>
<div class="summary">
<h2>摘要</h2>
<p><strong>分析結果:</strong>發現 659,111 對高相似度簽名 (>0.95)</p>
<p><strong>本報告顯示:</strong>Top """ + str(TOP_N) + """ 最高相似度案例</p>
<p><strong>結論:</strong>存在大量相似度接近或等於 1.0 的簽名對,強烈暗示「複製貼上」行為</p>
</div>
<div class="pairs-container">
"""
    # One card per pair, highest similarity first (caller pre-sorts).
    for i, pair in enumerate(pairs[:TOP_N], 1):
        sim = pair['similarity']
        file1 = pair['file1']
        file2 = pair['file2']
        p1 = pair.get('parsed1', {})
        p2 = pair.get('parsed2', {})
        # Relationship tags shown under each comparison.
        tags = []
        if p1.get('serial') == p2.get('serial'):
            tags.append(('<span class="tag tag-same-serial">同序號</span>', ''))
        if p1.get('year_month') == p2.get('year_month'):
            tags.append(('<span class="tag tag-same-month">同月份</span>', ''))
        if p1.get('year_month') != p2.get('year_month') and p1.get('serial') != p2.get('serial'):
            tags.append(('<span class="tag tag-diff">完全不同文件</span>', ''))
        badge_class = 'very-high' if sim >= 0.99 else 'high'
        # Build the side-by-side image; fall back to an inline error note.
        try:
            comparison_img = create_comparison_image(file1, file2, sim)
            img_base64 = image_to_base64(comparison_img)
            img_html = f'<img src="data:image/png;base64,{img_base64}" class="comparison-image">'
        except Exception as e:
            img_html = f'<p style="color:red">無法載入圖片: {e}</p>'
        tag_html = ''.join([t[0] for t in tags])
        html_content += f"""
<div class="pair-card">
<div class="pair-header">
<span class="pair-number">#{i}</span>
<span class="similarity-badge {badge_class}">相似度: {sim:.4f}</span>
</div>
<div class="file-info">
<strong>簽名 1:</strong> {file1}<br>
<strong>簽名 2:</strong> {file2}
</div>
{img_html}
<div class="analysis">
{tag_html}
<br><small>日期: {p1.get('year_month', 'N/A')} vs {p2.get('year_month', 'N/A')} |
序號: {p1.get('serial', 'N/A')} vs {p2.get('serial', 'N/A')}</small>
</div>
</div>
"""
    # Footer closes the container and the document.
    html_content += """
</div>
<div style="text-align: center; margin-top: 30px; color: #666;">
<p>生成時間: 2024 | 簽名真實性研究計劃</p>
</div>
</body>
</html>
"""
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(html_content)
    print(f"HTML 報告已儲存: {output_path}")
def main():
    """Build the Top-N visual comparison report from the Step 3 pairs JSON."""
    print("=" * 60)
    print("Step 4: 生成高相似度案例視覺化報告")
    print("=" * 60)
    print("載入高相似度對資料...")
    with open(HIGH_SIM_JSON, 'r', encoding='utf-8') as fh:
        all_pairs = json.load(fh)
    print(f"{len(all_pairs):,} 對高相似度簽名")
    # Rank by similarity, highest first.
    ranked = sorted(all_pairs, key=lambda p: p['similarity'], reverse=True)
    # Quick tier counts for the console.
    count_identical = len([p for p in ranked if p['similarity'] >= 0.9999])
    count_99 = len([p for p in ranked if p['similarity'] >= 0.99])
    count_97 = len([p for p in ranked if p['similarity'] >= 0.97])
    print(f"\n相似度統計:")
    print(f" = 1.0 (完全相同): {count_identical:,}")
    print(f" >= 0.99: {count_99:,}")
    print(f" >= 0.97: {count_97:,}")
    print(f"\n生成 Top {TOP_N} 視覺化報告...")
    generate_html_report(ranked, REPORTS_PATH / "high_similarity_report.html")
    print("\n完成!")


if __name__ == "__main__":
    main()
+432
View File
@@ -0,0 +1,432 @@
#!/usr/bin/env python3
"""
Step 5: 從 PDF 提取會計師印刷姓名
流程:
1. 從資料庫讀取簽名記錄,按 (PDF, page) 分組
2. 對每個頁面重新執行 YOLO 獲取簽名框座標
3. 對整頁執行 PaddleOCR 提取印刷文字
4. 過濾出候選姓名(2-4 個中文字)
5. 配對簽名與最近的印刷姓名
6. 更新資料庫的 accountant_name 欄位
"""
import sqlite3
import json
import re
import sys
import time
from pathlib import Path
from typing import Optional, List, Dict, Tuple
from collections import defaultdict
from tqdm import tqdm
import numpy as np
import cv2
import fitz # PyMuPDF
# 加入父目錄到路徑以便匯入
sys.path.insert(0, str(Path(__file__).parent.parent))
from paddleocr_client import PaddleOCRClient
# --- Path configuration ---
PDF_BASE = Path("/Volumes/NV2/PDF-Processing/total-pdf")
YOLO_MODEL_PATH = Path("/Volumes/NV2/pdf_recognize/models/best.pt")
DB_PATH = Path("/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db")
REPORTS_PATH = Path("/Volumes/NV2/PDF-Processing/signature-analysis/reports")
# --- Processing configuration ---
DPI = 150  # render resolution for PDF pages
CONFIDENCE_THRESHOLD = 0.5  # minimum YOLO detection confidence
NAME_SEARCH_MARGIN = 200  # pixel radius around a signature box to search for a printed name
PROGRESS_SAVE_INTERVAL = 100  # save progress every N processed pages
# Printed Chinese personal names: exactly 2-4 CJK ideographs.
CHINESE_NAME_PATTERN = re.compile(r'^[\u4e00-\u9fff]{2,4}$')
def find_pdf_file(filename: str) -> Optional[str]:
    """Locate a PDF by name: batch_* subdirectories first, then the top level."""
    candidates = [batch_dir / filename for batch_dir in sorted(PDF_BASE.glob("batch_*"))]
    candidates.append(PDF_BASE / filename)
    for candidate in candidates:
        if candidate.exists():
            return str(candidate)
    return None
def render_pdf_page(pdf_path: str, page_num: int) -> Optional[np.ndarray]:
    """Rasterize one page of a PDF to an ndarray at the configured DPI.

    Args:
        pdf_path: path to the PDF file.
        page_num: 1-based page number.

    Returns:
        HxWxC uint8 image, or None if the page is out of range or rendering
        fails (failure is logged to stdout).

    Fix vs. original: the document handle is now closed via try/finally, so it
    is released even when get_pixmap or the buffer reshape raises (the
    original leaked the open document on such exceptions).
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            if page_num < 1 or page_num > len(doc):
                return None
            page = doc[page_num - 1]
            # Scale from PDF's native 72 dpi up to the target DPI.
            mat = fitz.Matrix(DPI / 72, DPI / 72)
            pix = page.get_pixmap(matrix=mat, alpha=False)
            image = np.frombuffer(pix.samples, dtype=np.uint8)
            return image.reshape(pix.height, pix.width, pix.n)
        finally:
            doc.close()
    except Exception as e:
        print(f"渲染失敗: {pdf_path} page {page_num}: {e}")
        return None
def detect_signatures_yolo(image: np.ndarray, model) -> List[Dict]:
    """Run the YOLO model on a page image and return signature boxes.

    Boxes carry pixel geometry, detection confidence and centre coordinates,
    sorted into reading order (top-to-bottom, then left-to-right).
    """
    detections = []
    for result in model(image, conf=CONFIDENCE_THRESHOLD, verbose=False):
        for det in result.boxes:
            x_min, y_min, x_max, y_max = (int(v) for v in det.xyxy[0].cpu().numpy())
            detections.append({
                'x': x_min,
                'y': y_min,
                'width': x_max - x_min,
                'height': y_max - y_min,
                'confidence': float(det.conf[0].cpu().numpy()),
                'center_x': (x_min + x_max) / 2,
                'center_y': (y_min + y_max) / 2
            })
    detections.sort(key=lambda d: (d['y'], d['x']))
    return detections
def extract_text_candidates(image: np.ndarray, ocr_client: PaddleOCRClient) -> List[Dict]:
    """Run OCR over the page image and return text boxes with centres/extents.

    Returns an empty list (and logs) on any OCR failure.
    """
    try:
        detected = []
        for item in ocr_client.ocr(image):
            content = item.get('text', '').strip()
            quad = item.get('box', [])
            # Skip entries without geometry or without text.
            if not quad or not content:
                continue
            xs = [pt[0] for pt in quad]
            ys = [pt[1] for pt in quad]
            detected.append({
                'text': content,
                'center_x': sum(xs) / len(xs),
                'center_y': sum(ys) / len(ys),
                'x': min(xs),
                'y': min(ys),
                'width': max(xs) - min(xs),
                'height': max(ys) - min(ys),
                'confidence': item.get('confidence', 0.0)
            })
        return detected
    except Exception as e:
        print(f"OCR 失敗: {e}")
        return []
def filter_name_candidates(candidates: List[Dict], pattern=None) -> List[Dict]:
    """Keep OCR boxes whose cleaned text looks like a Chinese personal name.

    The cleaned text (whitespace and common punctuation removed) is stored
    under 'text_clean' on each kept candidate dict (mutated in place).

    Args:
        candidates: OCR boxes from extract_text_candidates.
        pattern: optional compiled regex a valid name must fully match;
            defaults to the module-level CHINESE_NAME_PATTERN (2-4 CJK chars).

    Fix vs. original: the strip character class was double-escaped, so it
    silently stripped literal backslashes as well; it now strips only
    whitespace plus half/full-width colon, comma and period characters.
    """
    if pattern is None:
        pattern = CHINESE_NAME_PATTERN
    strip_punct = re.compile(r'[\s:,.。、,:.]')
    names = []
    for candidate in candidates:
        cleaned = strip_punct.sub('', candidate['text'])
        if pattern.match(cleaned):
            candidate['text_clean'] = cleaned
            names.append(candidate)
    return names
def match_signature_to_name(
    sig: Dict,
    name_candidates: List[Dict],
    margin: Optional[int] = None
) -> Optional[str]:
    """Pair a signature box with the nearest printed-name candidate.

    A candidate qualifies when its centre lies within margin (plus half the
    signature's extent) of the signature centre on both axes; the closest
    qualifying candidate's 'text_clean' is returned, or None if none qualify.

    Args:
        sig: signature box dict with center_x/center_y/width/height.
        name_candidates: candidates from filter_name_candidates (must carry
            'text_clean').
        margin: search radius in pixels; defaults to NAME_SEARCH_MARGIN.
            (The original bound the module constant at def-time; resolving it
            at call time keeps the default in sync and makes the function
            usable in isolation.)
    """
    if margin is None:
        margin = NAME_SEARCH_MARGIN
    cx, cy = sig['center_x'], sig['center_y']
    best = None
    best_dist = None
    for cand in name_candidates:
        dx = abs(cand['center_x'] - cx)
        dy = abs(cand['center_y'] - cy)
        # Rectangular gate first, then Euclidean distance for ranking.
        if dx <= margin + sig['width'] / 2 and dy <= margin + sig['height'] / 2:
            dist = (dx ** 2 + dy ** 2) ** 0.5
            if best_dist is None or dist < best_dist:
                best, best_dist = cand, dist
    return best['text_clean'] if best is not None else None
def get_pages_to_process(conn: sqlite3.Connection) -> List[Tuple[str, int, List[int]]]:
    """List the (PDF, page) groups whose signatures still lack a matched name.

    Returns:
        List of (source_pdf, page_number, [signature_ids]) tuples, ordered by
        PDF then page.
    """
    rows = conn.execute('''
        SELECT source_pdf, page_number, GROUP_CONCAT(signature_id)
        FROM signatures
        WHERE accountant_name IS NULL OR accountant_name = ''
        GROUP BY source_pdf, page_number
        ORDER BY source_pdf, page_number
    ''').fetchall()
    # GROUP_CONCAT yields a comma-separated id string; split it back to ints.
    return [
        (source_pdf, page_number, [int(part) for part in id_csv.split(',')])
        for source_pdf, page_number, id_csv in rows
    ]
def update_signature_names(
    conn: sqlite3.Connection,
    updates: List[Tuple[int, str, int, int, int, int]]
):
    """
    Persist matched accountant names and box coordinates.
    Args:
        updates: List of (signature_id, accountant_name, x, y, width, height)
    """
    cursor = conn.cursor()
    # Create the coordinate table on first use.
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS signature_boxes (
            signature_id INTEGER PRIMARY KEY,
            x INTEGER,
            y INTEGER,
            width INTEGER,
            height INTEGER,
            FOREIGN KEY (signature_id) REFERENCES signatures(signature_id)
        )
    ''')
    cursor.executemany('''
        UPDATE signatures SET accountant_name = ? WHERE signature_id = ?
        ''', [(name, sig_id) for sig_id, name, *_ in updates])
    cursor.executemany('''
        INSERT OR REPLACE INTO signature_boxes (signature_id, x, y, width, height)
        VALUES (?, ?, ?, ?, ?)
        ''', [(sig_id, x, y, w, h) for sig_id, _, x, y, w, h in updates])
    conn.commit()
def process_page(
    source_pdf: str,
    page_number: int,
    sig_ids: List[int],
    yolo_model,
    ocr_client: PaddleOCRClient,
    conn: sqlite3.Connection
) -> Dict:
    """
    Process a single page: detect signature boxes, extract printed names
    via OCR, and pair each recorded signature with the nearest name.

    Args:
        source_pdf: PDF filename recorded in the database.
        page_number: 1-based page number within that PDF.
        sig_ids: signature_id values recorded for this page.
        yolo_model: loaded YOLO detector object.
        ocr_client: client for the remote PaddleOCR server.
        conn: open SQLite connection; results are written through it.

    Returns:
        Per-page statistics dict (matched / unmatched counts, error flag).
    """
    result = {
        'source_pdf': source_pdf,
        'page_number': page_number,
        'num_signatures': len(sig_ids),
        'matched': 0,
        'unmatched': 0,
        'error': None
    }
    # Locate the PDF on disk
    pdf_path = find_pdf_file(source_pdf)
    if pdf_path is None:
        result['error'] = 'PDF not found'
        return result
    # Render the page to an image
    image = render_pdf_page(pdf_path, page_number)
    if image is None:
        result['error'] = 'Render failed'
        return result
    # Detect signature boxes with YOLO
    sig_boxes = detect_signatures_yolo(image, yolo_model)
    if len(sig_boxes) != len(sig_ids):
        # Detection count differs from the recorded count; fall through and
        # pair in order (the zip below truncates to the shorter list)
        pass
    # OCR the whole page
    text_candidates = extract_text_candidates(image, ocr_client)
    # Keep only name-like candidates
    name_candidates = filter_name_candidates(text_candidates)
    # Pair each signature box with the closest name
    updates = []
    for i, (sig_id, sig_box) in enumerate(zip(sig_ids, sig_boxes)):
        matched_name = match_signature_to_name(sig_box, name_candidates)
        if matched_name:
            result['matched'] += 1
        else:
            result['unmatched'] += 1
            matched_name = ''  # empty string marks "no match"
        updates.append((
            sig_id,
            matched_name,
            sig_box['x'],
            sig_box['y'],
            sig_box['width'],
            sig_box['height']
        ))
    # If YOLO found fewer boxes than recorded, store the leftovers as unmatched
    if len(sig_boxes) < len(sig_ids):
        for sig_id in sig_ids[len(sig_boxes):]:
            updates.append((sig_id, '', 0, 0, 0, 0))
            result['unmatched'] += 1
    # Persist names and coordinates
    update_signature_names(conn, updates)
    return result
def main():
    """Entry point: pair every unnamed signature with its printed accountant name."""
    print("=" * 60)
    print("Step 5: 從 PDF 提取會計師印刷姓名")
    print("=" * 60)
    # Make sure the report directory exists
    REPORTS_PATH.mkdir(parents=True, exist_ok=True)
    # Connect to the database
    print("\n連接資料庫...")
    conn = sqlite3.connect(DB_PATH)
    # Fetch the (PDF, page) groups still missing names
    print("查詢待處理頁面...")
    pages = get_pages_to_process(conn)
    print(f"共 {len(pages)} 個頁面待處理")
    if not pages:
        print("沒有需要處理的頁面")
        conn.close()
        return
    # Load the YOLO detector
    print("\n載入 YOLO 模型...")
    from ultralytics import YOLO
    yolo_model = YOLO(str(YOLO_MODEL_PATH))
    # Connect to the OCR server
    print("連接 PaddleOCR 伺服器...")
    ocr_client = PaddleOCRClient()
    if not ocr_client.health_check():
        print("錯誤: PaddleOCR 伺服器無法連接")
        print("請確認伺服器 http://192.168.30.36:5555 正在運行")
        conn.close()
        return
    print("OCR 伺服器連接成功")
    # Running totals
    stats = {
        'total_pages': len(pages),
        'processed': 0,
        'matched': 0,
        'unmatched': 0,
        'errors': 0,
        'start_time': time.time()
    }
    # Process every page
    print(f"\n開始處理 {len(pages)} 個頁面...")
    for source_pdf, page_number, sig_ids in tqdm(pages, desc="處理頁面"):
        result = process_page(
            source_pdf, page_number, sig_ids,
            yolo_model, ocr_client, conn
        )
        stats['processed'] += 1
        stats['matched'] += result['matched']
        stats['unmatched'] += result['unmatched']
        if result['error']:
            stats['errors'] += 1
        # Periodically print progress and an ETA
        if stats['processed'] % PROGRESS_SAVE_INTERVAL == 0:
            elapsed = time.time() - stats['start_time']
            rate = stats['processed'] / elapsed
            remaining = (stats['total_pages'] - stats['processed']) / rate if rate > 0 else 0
            print(f"\n進度: {stats['processed']}/{stats['total_pages']} "
                  f"({stats['processed']/stats['total_pages']*100:.1f}%)")
            print(f"配對成功: {stats['matched']}, 未配對: {stats['unmatched']}")
            print(f"預估剩餘時間: {remaining/60:.1f} 分鐘")
    # Final statistics
    elapsed = time.time() - stats['start_time']
    stats['elapsed_seconds'] = elapsed
    print("\n" + "=" * 60)
    print("處理完成")
    print("=" * 60)
    print(f"總頁面數: {stats['total_pages']}")
    print(f"處理成功: {stats['processed']}")
    print(f"配對成功: {stats['matched']}")
    print(f"未配對: {stats['unmatched']}")
    print(f"錯誤: {stats['errors']}")
    print(f"耗時: {elapsed/60:.1f} 分鐘")
    # Save the JSON report
    report_path = REPORTS_PATH / "name_extraction_report.json"
    with open(report_path, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)
    print(f"\n報告已儲存: {report_path}")
    conn.close()
if __name__ == "__main__":
    main()
+402
View File
@@ -0,0 +1,402 @@
#!/usr/bin/env python3
"""
Step 5: extract accountant names from PDFs - full processing version.

Pipeline:
1. Read signature records from the database, grouped by (PDF, page).
2. Re-run YOLO on each page to recover signature-box coordinates.
3. Run PaddleOCR on the whole page to extract text.
4. Filter name candidates (2-4 Chinese characters).
5. Pair each signature with the nearest name.
6. Update the database and generate a report.
"""
import sqlite3
import json
import re
import sys
import time
from pathlib import Path
from typing import Optional, List, Dict, Tuple
from collections import defaultdict
from datetime import datetime
from tqdm import tqdm
import numpy as np
import fitz  # PyMuPDF
# Make the parent directory importable (provides paddleocr_client)
sys.path.insert(0, str(Path(__file__).parent.parent))
from paddleocr_client import PaddleOCRClient
# Path configuration
PDF_BASE = Path("/Volumes/NV2/PDF-Processing/total-pdf")
YOLO_MODEL_PATH = Path("/Volumes/NV2/pdf_recognize/models/best.pt")
DB_PATH = Path("/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db")
REPORTS_PATH = Path("/Volumes/NV2/PDF-Processing/signature-analysis/reports")
# Processing configuration
DPI = 150  # page rasterization resolution
CONFIDENCE_THRESHOLD = 0.5  # minimum YOLO detection confidence
NAME_SEARCH_MARGIN = 200  # px search window around a box when pairing names
PROGRESS_SAVE_INTERVAL = 100  # pages between progress printouts
BATCH_COMMIT_SIZE = 50  # buffered DB updates before a commit
# Regex for a plausible Chinese personal name (2-4 CJK characters)
CHINESE_NAME_PATTERN = re.compile(r'^[\u4e00-\u9fff]{2,4}$')
# Common non-name words to exclude (NOTE(review): the '' entry is redundant —
# empty strings never match CHINESE_NAME_PATTERN anyway)
EXCLUDE_WORDS = {'會計', '會計師', '事務所', '', '聯合', '出具報告'}
def find_pdf_file(filename: str) -> Optional[str]:
    """Locate *filename* under the batch_* directories, then the base directory."""
    search_dirs = sorted(PDF_BASE.glob("batch_*")) + [PDF_BASE]
    for directory in search_dirs:
        candidate = directory / filename
        if candidate.exists():
            return str(candidate)
    return None
def render_pdf_page(pdf_path: str, page_num: int) -> Optional[np.ndarray]:
    """
    Rasterize one PDF page to an image array.

    Args:
        pdf_path: path to the PDF file.
        page_num: 1-based page number.

    Returns:
        (height, width, channels) uint8 ndarray, or None when the page is out
        of range or rendering fails.
    """
    try:
        doc = fitz.open(pdf_path)
        # Fix: the original leaked the document handle when get_pixmap (or any
        # later call) raised; `finally` guarantees close on every path.
        try:
            if not (1 <= page_num <= len(doc)):
                return None
            page = doc[page_num - 1]
            # Scale from PDF's native 72 dpi to the configured DPI
            mat = fitz.Matrix(DPI / 72, DPI / 72)
            pix = page.get_pixmap(matrix=mat, alpha=False)
            image = np.frombuffer(pix.samples, dtype=np.uint8)
            return image.reshape(pix.height, pix.width, pix.n)
        finally:
            doc.close()
    except Exception:
        return None
def detect_signatures_yolo(image: np.ndarray, model) -> List[Dict]:
    """Run the YOLO detector and return signature boxes sorted top-to-bottom, left-to-right."""
    detections: List[Dict] = []
    for prediction in model(image, conf=CONFIDENCE_THRESHOLD, verbose=False):
        for det in prediction.boxes:
            x1, y1, x2, y2 = map(int, det.xyxy[0].cpu().numpy())
            detections.append({
                'x': x1, 'y': y1,
                'width': x2 - x1, 'height': y2 - y1,
                'confidence': float(det.conf[0].cpu().numpy()),
                'center_x': (x1 + x2) / 2,
                'center_y': (y1 + y2) / 2
            })
    detections.sort(key=lambda s: (s['y'], s['x']))
    return detections
def extract_and_filter_names(image: np.ndarray, ocr_client: PaddleOCRClient) -> List[Dict]:
    """OCR the page image and return name-like text items with their box centers."""
    try:
        ocr_items = ocr_client.ocr(image)
    except Exception:
        return []
    names: List[Dict] = []
    for item in ocr_items:
        raw_text = item.get('text', '').strip()
        box = item.get('box', [])
        if not raw_text or not box:
            continue
        # Drop whitespace/punctuation before matching the name pattern.
        cleaned = re.sub(r'[\s\:\\,\\.\\、]', '', raw_text)
        if not CHINESE_NAME_PATTERN.match(cleaned) or cleaned in EXCLUDE_WORDS:
            continue
        xs = [pt[0] for pt in box]
        ys = [pt[1] for pt in box]
        names.append({
            'text': cleaned,
            'center_x': sum(xs) / len(xs),
            'center_y': sum(ys) / len(ys),
        })
    return names
def match_signature_to_name(sig: Dict, name_candidates: List[Dict]) -> Optional[str]:
    """Return the closest candidate name within the search window, or None."""
    window = NAME_SEARCH_MARGIN
    best: Optional[Tuple[str, float]] = None
    for cand in name_candidates:
        dx = abs(cand['center_x'] - sig['center_x'])
        dy = abs(cand['center_y'] - sig['center_y'])
        # Candidates outside the window around the box are ignored.
        if dx > window + sig['width'] / 2 or dy > window + sig['height'] / 2:
            continue
        dist = (dx ** 2 + dy ** 2) ** 0.5
        if best is None or dist < best[1]:
            best = (cand['text'], dist)
    return best[0] if best else None
def get_pages_to_process(conn: sqlite3.Connection) -> List[Tuple[str, int, List[int]]]:
    """Return (source_pdf, page_number, [signature_ids]) groups still lacking names."""
    cursor = conn.execute('''
        SELECT source_pdf, page_number, GROUP_CONCAT(signature_id)
        FROM signatures
        WHERE accountant_name IS NULL OR accountant_name = ''
        GROUP BY source_pdf, page_number
        ORDER BY source_pdf, page_number
    ''')
    grouped: List[Tuple[str, int, List[int]]] = []
    for source_pdf, page_number, id_list in cursor:
        grouped.append((source_pdf, page_number,
                        [int(token) for token in id_list.split(',')]))
    return grouped
def process_page(
    source_pdf: str, page_number: int, sig_ids: List[int],
    yolo_model, ocr_client: PaddleOCRClient
) -> Dict:
    """
    Process one page: render it, detect signature boxes, OCR the text and
    pair each recorded signature with the nearest printed name.

    Unlike the earlier variant, this function does not write to the database:
    the caller receives the (signature_id, name, x, y, width, height) tuples
    in result['updates'] and commits them in batches.
    """
    result = {
        'source_pdf': source_pdf,
        'page_number': page_number,
        'num_signatures': len(sig_ids),
        'matched': 0,
        'unmatched': 0,
        'error': None,
        'updates': []
    }
    pdf_path = find_pdf_file(source_pdf)
    if pdf_path is None:
        result['error'] = 'PDF not found'
        return result
    image = render_pdf_page(pdf_path, page_number)
    if image is None:
        result['error'] = 'Render failed'
        return result
    sig_boxes = detect_signatures_yolo(image, yolo_model)
    name_candidates = extract_and_filter_names(image, ocr_client)
    # Pair recorded signatures with detected boxes by top-to-bottom order;
    # signatures beyond the detected count get the (0, 0, 0, 0) sentinel.
    for i, sig_id in enumerate(sig_ids):
        if i < len(sig_boxes):
            sig = sig_boxes[i]
            matched_name = match_signature_to_name(sig, name_candidates)
            if matched_name:
                result['matched'] += 1
            else:
                result['unmatched'] += 1
                matched_name = ''
            result['updates'].append((
                sig_id, matched_name,
                sig['x'], sig['y'], sig['width'], sig['height']
            ))
        else:
            result['updates'].append((sig_id, '', 0, 0, 0, 0))
            result['unmatched'] += 1
    return result
def save_updates_to_db(conn: sqlite3.Connection, updates: List[Tuple]):
    """
    Batch-write accountant names and (where detected) box coordinates.

    Each update is (signature_id, accountant_name, x, y, width, height);
    rows without a detected box carry the (0, 0, 0, 0) sentinel and only
    get their name written.
    """
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS signature_boxes (
            signature_id INTEGER PRIMARY KEY,
            x INTEGER, y INTEGER, width INTEGER, height INTEGER,
            FOREIGN KEY (signature_id) REFERENCES signatures(signature_id)
        )
    ''')
    for sig_id, name, x, y, w, h in updates:
        cursor.execute('UPDATE signatures SET accountant_name = ? WHERE signature_id = ?', (name, sig_id))
        # Fix: detect the "no box" sentinel via width, not x — a real
        # detection always has a positive width, whereas testing `x > 0`
        # also dropped legitimate boxes touching the left page edge (x == 0).
        if w > 0:
            cursor.execute('''
                INSERT OR REPLACE INTO signature_boxes (signature_id, x, y, width, height)
                VALUES (?, ?, ?, ?, ?)
            ''', (sig_id, x, y, w, h))
    conn.commit()
def generate_report(stats: Dict, output_path: Path):
    """
    Write the final JSON report plus a human-readable Markdown twin.

    Args:
        stats: accumulated run statistics from main().
        output_path: target .json path; the .md report goes next to it.

    Returns:
        The report dict that was serialized.
    """
    report = {
        'title': '會計師姓名提取報告',
        'generated_at': datetime.now().isoformat(),
        'summary': {
            'total_pages': stats['total_pages'],
            'processed_pages': stats['processed'],
            'total_signatures': stats['total_sigs'],
            'matched_signatures': stats['matched'],
            'unmatched_signatures': stats['unmatched'],
            'match_rate': f"{stats['matched']/stats['total_sigs']*100:.1f}%" if stats['total_sigs'] > 0 else "N/A",
            'errors': stats['errors'],
            'elapsed_seconds': stats['elapsed_seconds'],
            'elapsed_human': f"{stats['elapsed_seconds']/3600:.1f} 小時"
        },
        'methodology': {
            'step1': 'YOLO 模型偵測簽名框座標',
            'step2': 'PaddleOCR 整頁 OCR 提取文字',
            'step3': '過濾 2-4 個中文字作為姓名候選',
            'step4': f'在簽名框周圍 {NAME_SEARCH_MARGIN}px 範圍內配對最近的姓名',
            'dpi': DPI,
            'yolo_confidence': CONFIDENCE_THRESHOLD
        },
        'name_distribution': stats.get('name_distribution', {}),
        'error_samples': stats.get('error_samples', [])
    }
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)
    # Also emit a Markdown report next to the JSON one
    md_path = output_path.with_suffix('.md')
    with open(md_path, 'w', encoding='utf-8') as f:
        f.write(f"# {report['title']}\n\n")
        f.write(f"生成時間: {report['generated_at']}\n\n")
        f.write("## 摘要\n\n")
        f.write(f"| 指標 | 數值 |\n|------|------|\n")
        for k, v in report['summary'].items():
            f.write(f"| {k} | {v} |\n")
        f.write("\n## 方法論\n\n")
        for k, v in report['methodology'].items():
            f.write(f"- **{k}**: {v}\n")
        f.write("\n## 姓名分布 (Top 50)\n\n")
        # Sort names by descending count, keep the 50 most frequent
        names = sorted(report['name_distribution'].items(), key=lambda x: -x[1])[:50]
        for name, count in names:
            f.write(f"- {name}: {count}\n")
    return report
def main():
    """Entry point: run the full name-extraction pipeline with batched DB commits."""
    print("=" * 70)
    print("Step 5: 從 PDF 提取會計師姓名 - 完整處理")
    print("=" * 70)
    print(f"開始時間: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    REPORTS_PATH.mkdir(parents=True, exist_ok=True)
    # Connect to the database and list the pending (PDF, page) groups
    conn = sqlite3.connect(DB_PATH)
    pages = get_pages_to_process(conn)
    print(f"\n待處理頁面: {len(pages):,}")
    if not pages:
        print("沒有需要處理的頁面")
        conn.close()
        return
    # Load the YOLO detector
    print("\n載入 YOLO 模型...")
    from ultralytics import YOLO
    yolo_model = YOLO(str(YOLO_MODEL_PATH))
    # Connect to the OCR server
    print("連接 PaddleOCR 伺服器...")
    ocr_client = PaddleOCRClient()
    if not ocr_client.health_check():
        print("錯誤: PaddleOCR 伺服器無法連接")
        conn.close()
        return
    print("OCR 伺服器連接成功\n")
    # Running totals
    stats = {
        'total_pages': len(pages),
        'processed': 0,
        'total_sigs': sum(len(p[2]) for p in pages),
        'matched': 0,
        'unmatched': 0,
        'errors': 0,
        'error_samples': [],
        'name_distribution': defaultdict(int),
        'start_time': time.time()
    }
    all_updates = []
    # Process every page
    for source_pdf, page_number, sig_ids in tqdm(pages, desc="處理頁面"):
        result = process_page(source_pdf, page_number, sig_ids, yolo_model, ocr_client)
        stats['processed'] += 1
        stats['matched'] += result['matched']
        stats['unmatched'] += result['unmatched']
        if result['error']:
            stats['errors'] += 1
            # Keep at most 20 sample errors for the report
            if len(stats['error_samples']) < 20:
                stats['error_samples'].append({
                    'pdf': source_pdf,
                    'page': page_number,
                    'error': result['error']
                })
        else:
            all_updates.extend(result['updates'])
            for update in result['updates']:
                if update[1]:  # has a matched name
                    stats['name_distribution'][update[1]] += 1
        # Commit in batches
        if len(all_updates) >= BATCH_COMMIT_SIZE:
            save_updates_to_db(conn, all_updates)
            all_updates = []
        # Periodically print progress and an ETA
        if stats['processed'] % PROGRESS_SAVE_INTERVAL == 0:
            elapsed = time.time() - stats['start_time']
            rate = stats['processed'] / elapsed
            remaining = (stats['total_pages'] - stats['processed']) / rate if rate > 0 else 0
            print(f"\n[進度] {stats['processed']:,}/{stats['total_pages']:,} "
                  f"({stats['processed']/stats['total_pages']*100:.1f}%) | "
                  f"配對: {stats['matched']:,} | "
                  f"剩餘: {remaining/60:.1f} 分鐘")
    # Flush the final batch
    if all_updates:
        save_updates_to_db(conn, all_updates)
    stats['elapsed_seconds'] = time.time() - stats['start_time']
    stats['name_distribution'] = dict(stats['name_distribution'])
    # Generate the reports
    print("\n生成報告...")
    report_path = REPORTS_PATH / "name_extraction_report.json"
    generate_report(stats, report_path)
    print("\n" + "=" * 70)
    print("處理完成!")
    print("=" * 70)
    print(f"總頁面: {stats['total_pages']:,}")
    print(f"總簽名: {stats['total_sigs']:,}")
    print(f"配對成功: {stats['matched']:,} ({stats['matched']/stats['total_sigs']*100:.1f}%)")
    print(f"未配對: {stats['unmatched']:,}")
    print(f"錯誤: {stats['errors']:,}")
    print(f"耗時: {stats['elapsed_seconds']/3600:.2f} 小時")
    print(f"\n報告已儲存:")
    print(f" - {report_path}")
    print(f" - {report_path.with_suffix('.md')}")
    conn.close()
if __name__ == "__main__":
    main()
+450
View File
@@ -0,0 +1,450 @@
#!/usr/bin/env python3
"""
Signature cleanup and accountant assignment.

1. Flag PDFs with sig_count > 2 and keep only the best 2 signatures.
2. Assign signatures to accountants via OCR names or Y-coordinates.
3. Build the accountants table.
"""
import sqlite3
import json
from collections import defaultdict
from datetime import datetime
from opencc import OpenCC
# Simplified -> Traditional Chinese converter (OCR may emit simplified forms)
cc_s2t = OpenCC('s2t')
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
REPORT_DIR = '/Volumes/NV2/PDF-Processing/signature-analysis/reports'
def get_connection():
    """Open the analysis database with dict-like row access (sqlite3.Row)."""
    connection = sqlite3.connect(DB_PATH)
    connection.row_factory = sqlite3.Row
    return connection
def add_columns_if_needed(conn):
    """Add the is_valid / assigned_accountant columns to signatures when missing."""
    cur = conn.cursor()
    # Inspect the current schema first so reruns are idempotent.
    cur.execute("PRAGMA table_info(signatures)")
    existing = {row[1] for row in cur.fetchall()}
    if 'is_valid' not in existing:
        cur.execute("ALTER TABLE signatures ADD COLUMN is_valid INTEGER DEFAULT 1")
        print("已添加 is_valid 欄位")
    if 'assigned_accountant' not in existing:
        cur.execute("ALTER TABLE signatures ADD COLUMN assigned_accountant TEXT")
        print("已添加 assigned_accountant 欄位")
    conn.commit()
def create_accountants_table(conn):
    """Ensure the accountants table exists (no-op when it already does)."""
    conn.execute("""
        CREATE TABLE IF NOT EXISTS accountants (
            accountant_id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT UNIQUE NOT NULL,
            signature_count INTEGER DEFAULT 0,
            firm TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)
    conn.commit()
    print("accountants 表已建立")
def get_pdf_signatures(conn):
    """Group every signature row (joined with its box, if any) by source PDF."""
    cursor = conn.execute("""
        SELECT s.signature_id, s.source_pdf, s.page_number, s.accountant_name,
               s.excel_accountant1, s.excel_accountant2, s.excel_firm,
               sb.x, sb.y, sb.width, sb.height
        FROM signatures s
        LEFT JOIN signature_boxes sb ON s.signature_id = sb.signature_id
        ORDER BY s.source_pdf, s.page_number, sb.y
    """)
    grouped = defaultdict(list)
    for row in cursor:
        grouped[row['source_pdf']].append(dict(row))
    return grouped
def normalize_name(name):
    """Convert a name to Traditional Chinese; None/empty passes through as None."""
    return cc_s2t.convert(name) if name else None
def names_match(ocr_name, excel_name):
    """True when the OCR name equals the Excel name, directly or after s2t conversion."""
    if not ocr_name or not excel_name:
        return False
    # Exact match, or match once the OCR text is converted to Traditional Chinese.
    return ocr_name == excel_name or normalize_name(ocr_name) == excel_name
def score_signature(sig, excel_acc1, excel_acc2):
    """Heuristic quality score used to pick the best signature boxes on a page."""
    ocr_name = sig.get('accountant_name', '')
    width = sig.get('width', 0) or 0
    height = sig.get('height', 0) or 0
    y = sig.get('y', 0) or 0
    score = 0
    # +100 when the OCR text matches either Excel accountant.
    if names_match(ocr_name, excel_acc1) or names_match(ocr_name, excel_acc2):
        score += 100
    # +20 for a plausible signature size.
    if 30 < width < 500 and 20 < height < 200:
        score += 20
    # Lower boxes on the page score higher, capped at +15.
    score += min(y / 100, 15)
    # Oversized boxes are likely stamps: penalize them.
    if width > 300 or height > 150:
        score -= 30
    return score
def select_best_two(signatures, excel_acc1, excel_acc2):
    """Return the two best-scoring signatures (everything when there are <= 2)."""
    if len(signatures) <= 2:
        return signatures
    # Stable sort keeps the original order among equal scores.
    ranked = sorted(
        signatures,
        key=lambda sig: score_signature(sig, excel_acc1, excel_acc2),
        reverse=True,
    )
    return ranked[:2]
def assign_to_accountant(sig1, sig2, excel_acc1, excel_acc2):
    """Decide which Excel accountant each of the two signatures belongs to."""
    ocr1 = sig1.get('accountant_name', '')
    ocr2 = sig2.get('accountant_name', '')
    # Method A: trust the first OCR name match, in this fixed priority order.
    if names_match(ocr1, excel_acc1):
        return [(sig1, excel_acc1), (sig2, excel_acc2)]
    if names_match(ocr1, excel_acc2):
        return [(sig1, excel_acc2), (sig2, excel_acc1)]
    if names_match(ocr2, excel_acc1):
        return [(sig1, excel_acc2), (sig2, excel_acc1)]
    if names_match(ocr2, excel_acc2):
        return [(sig1, excel_acc1), (sig2, excel_acc2)]
    # Method B: fall back to page order (accountant 1 is assumed on top).
    top_first = (sig1.get('y', 0) or 0) <= (sig2.get('y', 0) or 0)
    if top_first:
        return [(sig1, excel_acc1), (sig2, excel_acc2)]
    return [(sig1, excel_acc2), (sig2, excel_acc1)]
def process_all_pdfs(conn):
    """
    Classify every PDF by signature count, keep the best two signatures where
    more were detected, and assign each kept signature to an accountant.

    Writes assigned_accountant / is_valid back to the signatures table and
    returns the run statistics dict.
    """
    print("正在載入簽名資料...")
    pdf_sigs = get_pdf_signatures(conn)
    print(f"共 {len(pdf_sigs)} 份 PDF")
    cur = conn.cursor()
    stats = {
        'total_pdfs': len(pdf_sigs),
        'sig_count_1': 0,
        'sig_count_2': 0,
        'sig_count_gt2': 0,
        'valid_signatures': 0,
        'invalid_signatures': 0,
        'ocr_matched': 0,
        'y_coordinate_assigned': 0,
        'no_excel_data': 0,
    }
    assignments = []  # (signature_id, assigned_accountant, is_valid)
    for pdf_name, sigs in pdf_sigs.items():
        sig_count = len(sigs)
        excel_acc1 = sigs[0].get('excel_accountant1') if sigs else None
        excel_acc2 = sigs[0].get('excel_accountant2') if sigs else None
        if not excel_acc1 and not excel_acc2:
            # No Excel reference data: keep signatures valid but unassigned
            stats['no_excel_data'] += 1
            for sig in sigs:
                assignments.append((sig['signature_id'], None, 1))
            continue
        if sig_count == 1:
            stats['sig_count_1'] += 1
            # Single signature: keep it, assign only on an OCR name match
            sig = sigs[0]
            ocr_name = sig.get('accountant_name', '')
            if names_match(ocr_name, excel_acc1):
                assignments.append((sig['signature_id'], excel_acc1, 1))
                stats['ocr_matched'] += 1
            elif names_match(ocr_name, excel_acc2):
                assignments.append((sig['signature_id'], excel_acc2, 1))
                stats['ocr_matched'] += 1
            else:
                # Undecidable: keep as valid but leave unassigned
                assignments.append((sig['signature_id'], None, 1))
            stats['valid_signatures'] += 1
        elif sig_count == 2:
            stats['sig_count_2'] += 1
            # Normal case: one signature per accountant
            sig1, sig2 = sigs[0], sigs[1]
            pairs = assign_to_accountant(sig1, sig2, excel_acc1, excel_acc2)
            for sig, acc in pairs:
                assignments.append((sig['signature_id'], acc, 1))
                stats['valid_signatures'] += 1
                # Record which method decided the assignment
                ocr_name = sig.get('accountant_name', '')
                if names_match(ocr_name, acc):
                    stats['ocr_matched'] += 1
                else:
                    stats['y_coordinate_assigned'] += 1
        else:
            stats['sig_count_gt2'] += 1
            # Too many detections: keep only the two best-scoring boxes
            best_two = select_best_two(sigs, excel_acc1, excel_acc2)
            # Mark every box valid/invalid
            valid_ids = {s['signature_id'] for s in best_two}
            for sig in sigs:
                if sig['signature_id'] in valid_ids:
                    is_valid = 1
                    stats['valid_signatures'] += 1
                else:
                    is_valid = 0
                    stats['invalid_signatures'] += 1
                assignments.append((sig['signature_id'], None, is_valid))
            # Assign the two kept signatures.
            # NOTE(review): kept signatures are appended twice (above with
            # None, below with the real accountant); the later row wins only
            # because the UPDATE loop below replays assignments in list order.
            if len(best_two) == 2:
                sig1, sig2 = best_two[0], best_two[1]
                pairs = assign_to_accountant(sig1, sig2, excel_acc1, excel_acc2)
                for sig, acc in pairs:
                    assignments.append((sig['signature_id'], acc, 1))
                    ocr_name = sig.get('accountant_name', '')
                    if names_match(ocr_name, acc):
                        stats['ocr_matched'] += 1
                    else:
                        stats['y_coordinate_assigned'] += 1
            elif len(best_two) == 1:
                sig = best_two[0]
                ocr_name = sig.get('accountant_name', '')
                if names_match(ocr_name, excel_acc1):
                    assignments.append((sig['signature_id'], excel_acc1, 1))
                elif names_match(ocr_name, excel_acc2):
                    assignments.append((sig['signature_id'], excel_acc2, 1))
                else:
                    assignments.append((sig['signature_id'], None, 1))
    # Replay all assignments against the signatures table
    print(f"正在更新 {len(assignments)} 筆簽名...")
    for sig_id, acc, is_valid in assignments:
        cur.execute("""
            UPDATE signatures
            SET assigned_accountant = ?, is_valid = ?
            WHERE signature_id = ?
        """, (acc, is_valid, sig_id))
    conn.commit()
    return stats
def build_accountants_table(conn):
    """
    Rebuild the accountants table from valid, assigned signatures and link
    signatures.accountant_id back to it.

    Returns:
        The number of unique accountants inserted.
    """
    cur = conn.cursor()
    # Rebuild from scratch on every run
    cur.execute("DELETE FROM accountants")
    # Count signatures per (accountant, firm). Fix: the grouping must include
    # excel_firm — grouping by name alone makes SQLite return one arbitrary
    # firm per accountant, so the "most common firm" logic below never saw
    # the alternatives.
    cur.execute("""
        SELECT assigned_accountant, excel_firm, COUNT(*) as cnt
        FROM signatures
        WHERE assigned_accountant IS NOT NULL AND is_valid = 1
        GROUP BY assigned_accountant, excel_firm
    """)
    accountants = {}
    for name, firm, count in cur.fetchall():
        entry = accountants.setdefault(name, {'count': 0, 'firms': defaultdict(int)})
        entry['count'] += count
        if firm:
            entry['firms'][firm] += count
    # Insert one row per accountant with their most frequent firm
    for name, data in accountants.items():
        main_firm = None
        if data['firms']:
            main_firm = max(data['firms'].items(), key=lambda x: x[1])[0]
        cur.execute("""
            INSERT INTO accountants (name, signature_count, firm)
            VALUES (?, ?, ?)
        """, (name, data['count'], main_firm))
    conn.commit()
    # Backfill accountant_id on signatures via a name lookup
    cur.execute("""
        UPDATE signatures
        SET accountant_id = (
            SELECT accountant_id FROM accountants
            WHERE accountants.name = signatures.assigned_accountant
        )
        WHERE assigned_accountant IS NOT NULL
    """)
    conn.commit()
    return len(accountants)
def generate_report(stats, accountant_count):
    """
    Write the cleanup/assignment summary as JSON and Markdown reports.

    Args:
        stats: statistics dict returned by process_all_pdfs().
        accountant_count: number of unique accountants inserted.

    Returns:
        The report dict that was serialized.
    """
    report = {
        'generated_at': datetime.now().isoformat(),
        'summary': {
            'total_pdfs': stats['total_pdfs'],
            'pdfs_with_1_sig': stats['sig_count_1'],
            'pdfs_with_2_sigs': stats['sig_count_2'],
            'pdfs_with_gt2_sigs': stats['sig_count_gt2'],
            'pdfs_without_excel': stats['no_excel_data'],
        },
        'signatures': {
            'valid': stats['valid_signatures'],
            'invalid': stats['invalid_signatures'],
            'total': stats['valid_signatures'] + stats['invalid_signatures'],
        },
        'assignment_method': {
            'ocr_matched': stats['ocr_matched'],
            'y_coordinate': stats['y_coordinate_assigned'],
        },
        'accountants': {
            'total_unique': accountant_count,
        }
    }
    # Save the JSON report
    json_path = f"{REPORT_DIR}/signature_cleanup_report.json"
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(report, f, ensure_ascii=False, indent=2)
    # Save the Markdown report
    md_path = f"{REPORT_DIR}/signature_cleanup_report.md"
    with open(md_path, 'w', encoding='utf-8') as f:
        f.write("# 簽名清理與歸檔報告\n\n")
        f.write(f"生成時間: {report['generated_at']}\n\n")
        f.write("## PDF 分布\n\n")
        f.write("| 類型 | 數量 |\n")
        f.write("|------|------|\n")
        f.write(f"| 總 PDF 數 | {stats['total_pdfs']} |\n")
        f.write(f"| 1 個簽名 | {stats['sig_count_1']} |\n")
        f.write(f"| 2 個簽名 (正常) | {stats['sig_count_2']} |\n")
        f.write(f"| >2 個簽名 (需篩選) | {stats['sig_count_gt2']} |\n")
        f.write(f"| 無 Excel 資料 | {stats['no_excel_data']} |\n")
        f.write("\n## 簽名統計\n\n")
        f.write("| 類型 | 數量 |\n")
        f.write("|------|------|\n")
        f.write(f"| 有效簽名 | {stats['valid_signatures']} |\n")
        f.write(f"| 無效簽名 (誤判) | {stats['invalid_signatures']} |\n")
        f.write("\n## 歸檔方式\n\n")
        f.write("| 方式 | 數量 |\n")
        f.write("|------|------|\n")
        f.write(f"| OCR 姓名匹配 | {stats['ocr_matched']} |\n")
        f.write(f"| Y 座標推斷 | {stats['y_coordinate_assigned']} |\n")
        f.write(f"\n## 會計師\n\n")
        f.write(f"唯一會計師數: **{accountant_count}**\n")
    print(f"報告已儲存: {json_path}")
    print(f"報告已儲存: {md_path}")
    return report
def main():
    """Entry point: prepare schema, assign signatures, build accountants, report."""
    print("=" * 60)
    print("簽名清理與會計師歸檔")
    print("=" * 60)
    conn = get_connection()
    # 1. Prepare the database schema
    print("\n[1/4] 準備資料庫...")
    add_columns_if_needed(conn)
    create_accountants_table(conn)
    # 2. Process every PDF's signatures
    print("\n[2/4] 處理 PDF 簽名...")
    stats = process_all_pdfs(conn)
    # 3. Build the accountants table
    print("\n[3/4] 建立會計師表...")
    accountant_count = build_accountants_table(conn)
    # 4. Generate the reports
    print("\n[4/4] 生成報告...")
    report = generate_report(stats, accountant_count)
    conn.close()
    print("\n" + "=" * 60)
    print("完成!")
    print("=" * 60)
    print(f"有效簽名: {stats['valid_signatures']}")
    print(f"無效簽名: {stats['invalid_signatures']}")
    print(f"唯一會計師: {accountant_count}")
if __name__ == '__main__':
    main()
@@ -0,0 +1,272 @@
#!/usr/bin/env python3
"""
Stage 3: same-person signature clustering analysis.

Runs pairwise similarity analysis over each accountant's signatures to flag
possible "copy-paste" (replicated image) behavior.
"""
import sqlite3
import numpy as np
import json
from collections import defaultdict
from datetime import datetime
from tqdm import tqdm
# Input/output locations
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
FEATURES_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/features/signature_features.npy'
REPORT_DIR = '/Volumes/NV2/PDF-Processing/signature-analysis/reports'
def load_data():
    """Load the feature matrix plus per-accountant signature assignments."""
    print("載入特徵向量...")
    features = np.load(FEATURES_PATH)
    print(f"特徵矩陣形狀: {features.shape}")
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()
    # Map signature_id -> row index of the feature matrix (same ordering).
    cur.execute("SELECT signature_id FROM signatures ORDER BY signature_id")
    sig_id_to_idx = {row[0]: idx for idx, row in enumerate(cur.fetchall())}
    # Valid, assigned signatures grouped by accountant.
    cur.execute("""
        SELECT s.signature_id, s.assigned_accountant, s.accountant_id, a.name, a.firm
        FROM signatures s
        LEFT JOIN accountants a ON s.accountant_id = a.accountant_id
        WHERE s.is_valid = 1 AND s.assigned_accountant IS NOT NULL
        ORDER BY s.signature_id
    """)
    acc_signatures = defaultdict(list)
    acc_info = {}
    for sig_id, _, acc_id, acc_name, firm in cur.fetchall():
        if acc_id and sig_id in sig_id_to_idx:
            acc_signatures[acc_id].append(sig_id)
            acc_info.setdefault(acc_id, {'name': acc_name, 'firm': firm})
    conn.close()
    return features, sig_id_to_idx, acc_signatures, acc_info
def compute_similarity_stats(features, sig_ids, sig_id_to_idx):
    """Pairwise cosine-similarity statistics for one accountant's signatures."""
    if len(sig_ids) < 2:
        return None
    # L2-normalize the selected feature rows (zero vectors stay zero).
    selected = features[[sig_id_to_idx[sid] for sid in sig_ids]]
    lengths = np.linalg.norm(selected, axis=1, keepdims=True)
    lengths[lengths == 0] = 1
    unit = selected / lengths
    # Upper triangle of the cosine-similarity matrix: each unique pair once.
    pairwise = np.dot(unit, unit.T)
    sims = pairwise[np.triu_indices(len(pairwise), k=1)]
    if len(sims) == 0:
        return None
    stats = {
        'total_pairs': len(sims),
        'min_sim': float(sims.min()),
        'max_sim': float(sims.max()),
        'mean_sim': float(sims.mean()),
        'std_sim': float(sims.std()),
        'pairs_gt_90': int((sims > 0.90).sum()),
        'pairs_gt_95': int((sims > 0.95).sum()),
        'pairs_gt_99': int((sims > 0.99).sum()),
    }
    # Derived proportions of high-similarity pairs.
    for cut in ('90', '95', '99'):
        stats[f'ratio_gt_{cut}'] = stats[f'pairs_gt_{cut}'] / stats['total_pairs']
    return stats
def analyze_all_accountants(features, sig_id_to_idx, acc_signatures, acc_info):
    """Compute similarity statistics for every accountant with enough signatures."""
    reports = []
    for acc_id, sig_ids in tqdm(acc_signatures.items(), desc="分析會計師"):
        stats = compute_similarity_stats(features, sig_ids, sig_id_to_idx)
        if stats is None:
            # Fewer than two usable signatures: nothing to compare.
            continue
        info = acc_info.get(acc_id, {})
        reports.append({
            'accountant_id': acc_id,
            'name': info.get('name', ''),
            'firm': info.get('firm', ''),
            'signature_count': len(sig_ids),
            **stats
        })
    return reports
def classify_risk(result):
    """Map similarity statistics to a 'high' / 'medium' / 'low' risk label."""
    ratio_95 = result.get('ratio_gt_95', 0)
    ratio_99 = result.get('ratio_gt_99', 0)
    mean_sim = result.get('mean_sim', 0)
    # High: a large share of near-identical pairs.
    if ratio_99 > 0.05 or ratio_95 > 0.3:
        return 'high'
    # Medium: elevated similarity overall.
    if ratio_95 > 0.1 or mean_sim > 0.85:
        return 'medium'
    return 'low'
def save_results(results, acc_signatures):
    """
    Label every accountant with a risk level and write JSON/Markdown reports.

    Args:
        results: per-accountant stats dicts from analyze_all_accountants().
        acc_signatures: accountant -> signature-id mapping (unused here;
            kept for interface compatibility with the caller).

    Returns:
        (summary dict, results sorted by descending risk metrics).
    """
    # Classify risk for every accountant
    for r in results:
        r['risk_level'] = classify_risk(r)
    # Tally risk levels
    risk_counts = defaultdict(int)
    for r in results:
        risk_counts[r['risk_level']] += 1
    summary = {
        'generated_at': datetime.now().isoformat(),
        'total_accountants': len(results),
        'risk_distribution': dict(risk_counts),
        'high_risk_count': risk_counts['high'],
        'medium_risk_count': risk_counts['medium'],
        'low_risk_count': risk_counts['low'],
    }
    # Sort with the riskiest accountants first
    results_sorted = sorted(results, key=lambda x: (-x.get('ratio_gt_95', 0), -x.get('mean_sim', 0)))
    # Save JSON
    output = {
        'summary': summary,
        'accountants': results_sorted
    }
    json_path = f"{REPORT_DIR}/accountant_similarity_analysis.json"
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    print(f"已儲存: {json_path}")
    # Save the Markdown report
    md_path = f"{REPORT_DIR}/accountant_similarity_analysis.md"
    with open(md_path, 'w', encoding='utf-8') as f:
        f.write("# 會計師簽名相似度分析報告\n\n")
        f.write(f"生成時間: {summary['generated_at']}\n\n")
        f.write("## 摘要\n\n")
        f.write(f"| 指標 | 數值 |\n")
        f.write(f"|------|------|\n")
        f.write(f"| 總會計師數 | {summary['total_accountants']} |\n")
        f.write(f"| 高風險 | {risk_counts['high']} |\n")
        f.write(f"| 中風險 | {risk_counts['medium']} |\n")
        f.write(f"| 低風險 | {risk_counts['low']} |\n")
        f.write("\n## 風險分類標準\n\n")
        f.write("- **高風險**: >5% 的簽名對相似度 >0.99,或 >30% 的簽名對相似度 >0.95\n")
        f.write("- **中風險**: >10% 的簽名對相似度 >0.95,或平均相似度 >0.85\n")
        f.write("- **低風險**: 其他情況\n")
        f.write("\n## 高風險會計師 (Top 30)\n\n")
        f.write("| 排名 | 姓名 | 事務所 | 簽名數 | 平均相似度 | >0.95比例 | >0.99比例 |\n")
        f.write("|------|------|--------|--------|------------|-----------|----------|\n")
        high_risk = [r for r in results_sorted if r['risk_level'] == 'high']
        for i, r in enumerate(high_risk[:30], 1):
            f.write(f"| {i} | {r['name']} | {r['firm'] or '-'} | {r['signature_count']} | ")
            f.write(f"{r['mean_sim']:.3f} | {r['ratio_gt_95']*100:.1f}% | {r['ratio_gt_99']*100:.1f}% |\n")
        f.write("\n## 所有會計師統計分布\n\n")
        # Distribution of per-accountant mean similarity
        mean_sims = [r['mean_sim'] for r in results]
        f.write("### 平均相似度分布\n\n")
        f.write(f"- 最小: {min(mean_sims):.3f}\n")
        f.write(f"- 最大: {max(mean_sims):.3f}\n")
        f.write(f"- 平均: {np.mean(mean_sims):.3f}\n")
        f.write(f"- 中位數: {np.median(mean_sims):.3f}\n")
    print(f"已儲存: {md_path}")
    return summary, results_sorted
def update_database(results):
    """
    Write per-accountant risk level and similarity metrics back to the DB.

    Args:
        results: dicts containing risk_level, mean_sim, ratio_gt_95 and
            accountant_id for each accountant.
    """
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()
    # Fix: attempt each ALTER independently and catch only OperationalError.
    # The original wrapped all three in one bare `except: pass`, so (a) one
    # pre-existing column aborted the remaining ALTERs and (b) real errors
    # (e.g. a locked database) were silently swallowed.
    for ddl in (
        "ALTER TABLE accountants ADD COLUMN risk_level TEXT",
        "ALTER TABLE accountants ADD COLUMN mean_similarity REAL",
        "ALTER TABLE accountants ADD COLUMN ratio_gt_95 REAL",
    ):
        try:
            cur.execute(ddl)
        except sqlite3.OperationalError:
            pass  # column already exists
    for r in results:
        cur.execute("""
            UPDATE accountants
            SET risk_level = ?, mean_similarity = ?, ratio_gt_95 = ?
            WHERE accountant_id = ?
        """, (r['risk_level'], r['mean_sim'], r['ratio_gt_95'], r['accountant_id']))
    conn.commit()
    conn.close()
    print("資料庫已更新")
def main():
    """Entry point: load data, analyze every accountant, persist the results."""
    print("=" * 60)
    print("第三階段:同人簽名聚類分析")
    print("=" * 60)
    # Load features and accountant assignments
    features, sig_id_to_idx, acc_signatures, acc_info = load_data()
    print(f"會計師數: {len(acc_signatures)}")
    # Analyze every accountant
    print("\n開始分析...")
    results = analyze_all_accountants(features, sig_id_to_idx, acc_signatures, acc_info)
    # Save the reports
    print("\n儲存結果...")
    summary, results_sorted = save_results(results, acc_signatures)
    # Write risk levels back to the database
    update_database(results_sorted)
    print("\n" + "=" * 60)
    print("完成!")
    print("=" * 60)
    print(f"總會計師: {summary['total_accountants']}")
    print(f"高風險: {summary['high_risk_count']}")
    print(f"中風險: {summary['medium_risk_count']}")
    print(f"低風險: {summary['low_risk_count']}")
if __name__ == '__main__':
    main()
@@ -0,0 +1,371 @@
#!/usr/bin/env python3
"""
Stage 4: per-PDF signature authenticity classification.

For every PDF, decide whether its signatures are hand-signed ("authentic")
or pasted copies ("copy").
"""
import sqlite3
import numpy as np
import json
import csv
from collections import defaultdict
from datetime import datetime
from tqdm import tqdm
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
FEATURES_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/features/signature_features.npy'
REPORT_DIR = '/Volumes/NV2/PDF-Processing/signature-analysis/reports'
# Decision thresholds
THRESHOLD_COPY = 0.95  # above this similarity: classified as "copy-paste"
THRESHOLD_AUTHENTIC = 0.85  # below this similarity: classified as "hand-signed"
# Values in between are classified as "uncertain"
def load_data():
    """Load L2-normalized feature vectors plus signature/PDF/accountant mappings.

    Returns:
        (features_norm, sig_data, pdf_signatures, acc_signatures,
         pdf_info, sig_id_to_idx) where features_norm rows are unit vectors
        so a plain dot product equals cosine similarity.
    """
    print("載入特徵向量...")
    features = np.load(FEATURES_PATH)
    # Normalize rows to unit length so dot products are cosine similarities
    norms = np.linalg.norm(features, axis=1, keepdims=True)
    norms[norms == 0] = 1  # guard against all-zero vectors (avoid divide-by-zero)
    features_norm = features / norms
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()
    # Fetch metadata for every valid, assigned signature
    cur.execute("""
        SELECT s.signature_id, s.source_pdf, s.assigned_accountant,
               s.excel_accountant1, s.excel_accountant2, s.excel_firm
        FROM signatures s
        WHERE s.is_valid = 1 AND s.assigned_accountant IS NOT NULL
        ORDER BY s.signature_id
    """)
    sig_data = {}
    pdf_signatures = defaultdict(list)
    acc_signatures = defaultdict(list)
    pdf_info = {}
    for row in cur.fetchall():
        sig_id, pdf, acc_name, acc1, acc2, firm = row
        sig_data[sig_id] = {
            'pdf': pdf,
            'accountant': acc_name,
        }
        pdf_signatures[pdf].append((sig_id, acc_name))
        acc_signatures[acc_name].append(sig_id)
        if pdf not in pdf_info:
            pdf_info[pdf] = {
                'accountant1': acc1,
                'accountant2': acc2,
                'firm': firm
            }
    # signature_id -> feature row index; ordering must match the rows of the
    # .npy feature file (both are ordered by signature_id)
    cur.execute("SELECT signature_id FROM signatures ORDER BY signature_id")
    all_sig_ids = [row[0] for row in cur.fetchall()]
    sig_id_to_idx = {sid: idx for idx, sid in enumerate(all_sig_ids)}
    conn.close()
    return features_norm, sig_data, pdf_signatures, acc_signatures, pdf_info, sig_id_to_idx
def get_max_similarity_to_others(sig_id, acc_name, acc_signatures, sig_id_to_idx, features_norm):
    """Return (best_similarity, best_sig_id) versus the same accountant's other signatures.

    Compares the unit-normalized feature of *sig_id* against every other
    indexed signature belonging to *acc_name*. Returns (None, None) when
    the accountant has no other comparable signature.
    """
    candidates = [
        other for other in acc_signatures[acc_name]
        if other != sig_id and other in sig_id_to_idx
    ]
    if not candidates:
        return None, None
    query_vec = features_norm[sig_id_to_idx[sig_id]]
    candidate_rows = features_norm[[sig_id_to_idx[c] for c in candidates]]
    scores = candidate_rows @ query_vec
    best = int(scores.argmax())
    return float(scores[best]), candidates[best]
def classify_signature(max_sim, copy_threshold=None, authentic_threshold=None):
    """Classify one signature from its max similarity to the same accountant's others.

    Args:
        max_sim: highest cosine similarity against any other signature of
            the same accountant, or None when there is nothing to compare.
        copy_threshold: similarity at/above which the signature is judged a
            copy-paste; defaults to the module-level THRESHOLD_COPY.
        authentic_threshold: similarity at/below which it is judged genuinely
            hand-signed; defaults to the module-level THRESHOLD_AUTHENTIC.

    Returns:
        One of 'unknown', 'copy', 'authentic', 'uncertain'.
    """
    if max_sim is None:
        return 'unknown'  # no other signature to compare against
    # Resolve defaults lazily so callers may override per call
    if copy_threshold is None:
        copy_threshold = THRESHOLD_COPY
    if authentic_threshold is None:
        authentic_threshold = THRESHOLD_AUTHENTIC
    if max_sim >= copy_threshold:
        return 'copy'  # copy-paste
    elif max_sim <= authentic_threshold:
        return 'authentic'  # hand-signed
    else:
        return 'uncertain'  # between the two thresholds
def classify_pdf(verdicts):
    """Derive a whole-PDF verdict from its per-signature verdicts.

    Precedence: any 'copy' taints the PDF; all-'authentic' clears it;
    any 'uncertain' leaves it open; anything else is 'unknown'.
    """
    if not verdicts:
        return 'unknown'
    unique = set(verdicts)
    if 'copy' in unique:
        return 'copy'
    if unique == {'authentic'}:
        return 'authentic'
    if 'uncertain' in unique:
        return 'uncertain'
    return 'unknown'
def analyze_all_pdfs(features_norm, sig_data, pdf_signatures, acc_signatures, pdf_info, sig_id_to_idx):
    """Judge every PDF: classify each of its signatures, then aggregate.

    Returns a list of dicts, one per PDF, each carrying the per-signature
    similarities/verdicts plus the combined 'pdf_verdict'.
    """
    results = []
    for pdf, sigs in tqdm(pdf_signatures.items(), desc="分析 PDF"):
        info = pdf_info.get(pdf, {})
        pdf_result = {
            'pdf': pdf,
            'accountant1': info.get('accountant1', ''),
            'accountant2': info.get('accountant2', ''),
            'firm': info.get('firm', ''),
            'signatures': []
        }
        verdicts = []
        for sig_id, acc_name in sigs:
            # Highest similarity of this signature to the same accountant's others
            max_sim, most_similar_sig = get_max_similarity_to_others(
                sig_id, acc_name, acc_signatures, sig_id_to_idx, features_norm
            )
            verdict = classify_signature(max_sim)
            verdicts.append(verdict)
            pdf_result['signatures'].append({
                'signature_id': sig_id,
                'accountant': acc_name,
                'max_similarity': max_sim,
                'verdict': verdict
            })
        # Combine the per-signature verdicts into one PDF-level verdict
        pdf_result['pdf_verdict'] = classify_pdf(verdicts)
        results.append(pdf_result)
    return results
def generate_statistics(results):
    """Tally PDF-level and signature-level verdict counts plus per-firm breakdowns."""
    pdf_counts = defaultdict(int)
    sig_counts = defaultdict(int)
    firm_counts = defaultdict(lambda: defaultdict(int))
    for entry in results:
        pdf_verdict = entry['pdf_verdict']
        pdf_counts[pdf_verdict] += 1
        # Missing firm names are bucketed under '未知' (unknown)
        firm_counts[entry['firm'] or '未知'][pdf_verdict] += 1
        for sig in entry['signatures']:
            sig_counts[sig['verdict']] += 1
    return {
        'total_pdfs': len(results),
        'pdf_verdicts': pdf_counts,
        'signature_verdicts': sig_counts,
        'by_firm': firm_counts,
    }
def save_results(results, stats):
    """Write the verdict outputs: full JSON, a flat CSV, and a Markdown report.

    Bug fix: a similarity of exactly 0.0 is a real measurement and must be
    written to the CSV — the old truthiness test rendered it as an empty cell.

    Returns the stats dict unchanged for the caller's convenience.
    """
    timestamp = datetime.now().isoformat()
    # 1. Full JSON dump (thresholds + statistics + per-PDF detail)
    json_path = f"{REPORT_DIR}/pdf_signature_verdicts.json"
    output = {
        'generated_at': timestamp,
        'thresholds': {
            'copy': THRESHOLD_COPY,
            'authentic': THRESHOLD_AUTHENTIC
        },
        'statistics': {
            'total_pdfs': stats['total_pdfs'],
            'pdf_verdicts': dict(stats['pdf_verdicts']),
            'signature_verdicts': dict(stats['signature_verdicts'])
        },
        'results': results
    }
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    print(f"已儲存: {json_path}")
    # 2. Simplified CSV (one row per PDF, at most two signatures shown)
    csv_path = f"{REPORT_DIR}/pdf_signature_verdicts.csv"
    with open(csv_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['PDF', '會計師1', '會計師2', '事務所', '判定結果',
                         '簽名1_會計師', '簽名1_相似度', '簽名1_判定',
                         '簽名2_會計師', '簽名2_相似度', '簽名2_判定'])
        for r in results:
            row = [
                r['pdf'],
                r['accountant1'],
                r['accountant2'],
                r['firm'] or '',
                r['pdf_verdict']
            ]
            for sig in r['signatures'][:2]:  # at most 2 signatures per row
                row.extend([
                    sig['accountant'],
                    # FIX: compare against None, not truthiness, so a
                    # legitimate 0.000 similarity is still written out.
                    f"{sig['max_similarity']:.3f}" if sig['max_similarity'] is not None else '',
                    sig['verdict']
                ])
            # Pad to the fixed 11-column layout
            while len(row) < 11:
                row.append('')
            writer.writerow(row)
    print(f"已儲存: {csv_path}")
    # 3. Markdown report
    md_path = f"{REPORT_DIR}/pdf_signature_verdict_report.md"
    with open(md_path, 'w', encoding='utf-8') as f:
        f.write("# PDF 簽名真偽判定報告\n\n")
        f.write(f"生成時間: {timestamp}\n\n")
        f.write("## 判定標準\n\n")
        f.write(f"- **複製貼上 (copy)**: 與同一會計師其他簽名相似度 ≥ {THRESHOLD_COPY}\n")
        f.write(f"- **親簽 (authentic)**: 與同一會計師其他簽名相似度 ≤ {THRESHOLD_AUTHENTIC}\n")
        f.write(f"- **不確定 (uncertain)**: 相似度介於 {THRESHOLD_AUTHENTIC} ~ {THRESHOLD_COPY}\n")
        f.write(f"- **無法判定 (unknown)**: 該會計師只有此一份簽名,無法比對\n\n")
        f.write("## 整體統計\n\n")
        f.write("### PDF 判定結果\n\n")
        f.write("| 判定 | 數量 | 百分比 |\n")
        f.write("|------|------|--------|\n")
        total = stats['total_pdfs']
        for verdict in ['copy', 'uncertain', 'authentic', 'unknown']:
            count = stats['pdf_verdicts'].get(verdict, 0)
            pct = count / total * 100 if total > 0 else 0
            label = {
                'copy': '複製貼上',
                'authentic': '親簽',
                'uncertain': '不確定',
                'unknown': '無法判定'
            }.get(verdict, verdict)
            f.write(f"| {label} | {count:,} | {pct:.1f}% |\n")
        f.write(f"\n**總計: {total:,} 份 PDF**\n")
        f.write("\n### 簽名判定結果\n\n")
        f.write("| 判定 | 數量 | 百分比 |\n")
        f.write("|------|------|--------|\n")
        sig_total = sum(stats['signature_verdicts'].values())
        for verdict in ['copy', 'uncertain', 'authentic', 'unknown']:
            count = stats['signature_verdicts'].get(verdict, 0)
            pct = count / sig_total * 100 if sig_total > 0 else 0
            label = {
                'copy': '複製貼上',
                'authentic': '親簽',
                'uncertain': '不確定',
                'unknown': '無法判定'
            }.get(verdict, verdict)
            f.write(f"| {label} | {count:,} | {pct:.1f}% |\n")
        f.write(f"\n**總計: {sig_total:,} 個簽名**\n")
        f.write("\n### 按事務所統計\n\n")
        f.write("| 事務所 | 複製貼上 | 不確定 | 親簽 | 無法判定 | 總計 |\n")
        f.write("|--------|----------|--------|------|----------|------|\n")
        # Firms sorted by total PDF count; only the top 20 are listed
        firms_sorted = sorted(stats['by_firm'].items(),
                              key=lambda x: sum(x[1].values()), reverse=True)
        for firm, verdicts in firms_sorted[:20]:
            copy_n = verdicts.get('copy', 0)
            uncertain_n = verdicts.get('uncertain', 0)
            authentic_n = verdicts.get('authentic', 0)
            unknown_n = verdicts.get('unknown', 0)
            total_n = copy_n + uncertain_n + authentic_n + unknown_n
            f.write(f"| {firm} | {copy_n:,} | {uncertain_n:,} | {authentic_n:,} | {unknown_n:,} | {total_n:,} |\n")
    print(f"已儲存: {md_path}")
    return stats
def update_database(results):
    """Write per-signature verdicts and max similarities back into SQLite."""
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()
    # Add each column in its own try: the old single bare except stopped at
    # the first duplicate column and silently skipped the second ALTER on a
    # partially-migrated schema; it also swallowed unrelated errors.
    for ddl in (
        "ALTER TABLE signatures ADD COLUMN signature_verdict TEXT",
        "ALTER TABLE signatures ADD COLUMN max_similarity_to_same_accountant REAL",
    ):
        try:
            cur.execute(ddl)
        except sqlite3.OperationalError:
            pass  # column already exists
    # Flatten the nested results into one executemany batch.
    params = [
        (sig['verdict'], sig['max_similarity'], sig['signature_id'])
        for r in results
        for sig in r['signatures']
    ]
    cur.executemany(
        """
        UPDATE signatures
        SET signature_verdict = ?, max_similarity_to_same_accountant = ?
        WHERE signature_id = ?
        """,
        params,
    )
    conn.commit()
    conn.close()
    print("資料庫已更新")
def main():
    """Entry point for phase 4: per-PDF signature authenticity verdicts."""
    print("=" * 60)
    print("第四階段:PDF 簽名真偽判定")
    print("=" * 60)
    # Load normalized features and the PDF/accountant mappings
    features_norm, sig_data, pdf_signatures, acc_signatures, pdf_info, sig_id_to_idx = load_data()
    print(f"PDF 數: {len(pdf_signatures)}")
    print(f"有效簽名: {len(sig_data)}")
    # Classify every signature and aggregate per PDF
    print("\n開始分析...")
    results = analyze_all_pdfs(
        features_norm, sig_data, pdf_signatures, acc_signatures, pdf_info, sig_id_to_idx
    )
    # Tally verdict counts
    stats = generate_statistics(results)
    # Write JSON/CSV/Markdown outputs
    print("\n儲存結果...")
    save_results(results, stats)
    # Persist verdicts into the database
    update_database(results)
    print("\n" + "=" * 60)
    print("完成!")
    print("=" * 60)
    print(f"\nPDF 判定結果:")
    print(f" 複製貼上: {stats['pdf_verdicts'].get('copy', 0):,}")
    print(f" 不確定: {stats['pdf_verdicts'].get('uncertain', 0):,}")
    print(f" 親簽: {stats['pdf_verdicts'].get('authentic', 0):,}")
    print(f" 無法判定: {stats['pdf_verdicts'].get('unknown', 0):,}")


if __name__ == '__main__':
    main()
File diff suppressed because it is too large Load Diff
+319
View File
@@ -0,0 +1,319 @@
#!/usr/bin/env python3
"""
Compute SSIM and pHash for all signature pairs (closest match per accountant).
Uses multiprocessing for parallel image loading and computation.
Saves results to database and outputs complete CSV.
"""
import sqlite3
import numpy as np
import cv2
import os
import sys
import json
import csv
import time
from datetime import datetime
from collections import defaultdict
from multiprocessing import Pool, cpu_count
from pathlib import Path
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
IMAGE_DIR = '/Volumes/NV2/PDF-Processing/yolo-signatures/images'
OUTPUT_CSV = '/Volumes/NV2/PDF-Processing/signature-analysis/reports/complete_pdf_report.csv'
CHECKPOINT_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/ssim_checkpoint.json'
NUM_WORKERS = max(1, cpu_count() - 2) # Leave 2 cores free
BATCH_SIZE = 1000
def compute_phash(img, hash_size=8):
    """Compute a perceptual hash of a grayscale image.

    NOTE(review): despite the name, this is a *difference* hash (dHash):
    the image is resized to (hash_size+1) x hash_size and each bit records
    whether a pixel is brighter than its left neighbor — it is not a
    DCT-based pHash. Returns a flat boolean array of hash_size*hash_size
    bits, compared elsewhere via Hamming distance.
    """
    resized = cv2.resize(img, (hash_size + 1, hash_size))
    # True where the right-hand pixel is brighter than its left neighbor
    diff = resized[:, 1:] > resized[:, :-1]
    return diff.flatten()
def compute_pair_ssim(args):
    """Compare two signature images with SSIM, dHash and histogram correlation.

    Args:
        args: tuple (sig_id, file1, file2, cosine_sim); filenames are
            relative to IMAGE_DIR, cosine_sim is passed through untouched.

    Returns:
        dict with ssim / phash_distance / histogram_corr / pixel_identical;
        metric fields stay None when an image is unreadable or too small.
        Designed as a Pool.imap_unordered worker, so it never raises:
        per-pair failures return the partial result instead.
    """
    sig_id, file1, file2, cosine_sim = args
    path1 = os.path.join(IMAGE_DIR, file1)
    path2 = os.path.join(IMAGE_DIR, file2)
    result = {
        'signature_id': sig_id,
        'match_file': file2,
        'cosine_similarity': cosine_sim,
        'ssim': None,
        'phash_distance': None,
        'histogram_corr': None,
        'pixel_identical': False,
    }
    try:
        img1 = cv2.imread(path1, cv2.IMREAD_GRAYSCALE)
        img2 = cv2.imread(path2, cv2.IMREAD_GRAYSCALE)
        if img1 is None or img2 is None:
            return result  # unreadable image: leave all metrics as None
        # Resize both images to their common (smaller) dimensions
        h = min(img1.shape[0], img2.shape[0])
        w = min(img1.shape[1], img2.shape[1])
        if h < 3 or w < 3:
            return result  # too small for any windowed metric
        img1_r = cv2.resize(img1, (w, h))
        img2_r = cv2.resize(img2, (w, h))
        # Exact equality after resize is the strongest copy evidence
        result['pixel_identical'] = bool(np.array_equal(img1_r, img2_r))
        # SSIM (scikit-image is an optional dependency; skip silently if absent)
        try:
            from skimage.metrics import structural_similarity as ssim
            win_size = min(7, min(h, w))
            if win_size % 2 == 0:
                win_size -= 1  # SSIM requires an odd window size
            if win_size >= 3:
                result['ssim'] = float(ssim(img1_r, img2_r, win_size=win_size))
            else:
                result['ssim'] = None
        except Exception:
            result['ssim'] = None
        # Grayscale histogram correlation
        hist1 = cv2.calcHist([img1_r], [0], None, [256], [0, 256])
        hist2 = cv2.calcHist([img2_r], [0], None, [256], [0, 256])
        result['histogram_corr'] = float(cv2.compareHist(hist1, hist2, cv2.HISTCMP_CORREL))
        # dHash Hamming distance
        h1 = compute_phash(img1_r)
        h2 = compute_phash(img2_r)
        result['phash_distance'] = int(np.sum(h1 != h2))
    except Exception:
        # Deliberate best-effort behavior: return whatever was computed
        # (the old code bound the exception to an unused variable).
        pass
    return result
def load_checkpoint():
    """Return the set of signature IDs recorded in the checkpoint file (empty if none)."""
    if not os.path.exists(CHECKPOINT_PATH):
        return set()
    with open(CHECKPOINT_PATH, 'r') as fh:
        payload = json.load(fh)
    return set(payload.get('processed_ids', []))
def save_checkpoint(processed_ids):
    """Persist the processed-ID set (with a timestamp) to the checkpoint file."""
    payload = {
        'processed_ids': list(processed_ids),
        'timestamp': str(datetime.now()),
    }
    with open(CHECKPOINT_PATH, 'w') as fh:
        json.dump(payload, fh)
def main():
    """Pipeline driver: closest-match pairing, parallel SSIM/dHash scoring,
    database write-back, and a final per-signature CSV export.

    Resumable: a JSON checkpoint records processed signature IDs so an
    interrupted run continues where it stopped; the checkpoint is removed
    once the run completes.
    """
    start_time = time.time()
    print("=" * 70)
    print("SSIM & pHash Computation for All Signature Pairs")
    print(f"Workers: {NUM_WORKERS}")
    print("=" * 70)
    # --- Step 1: Load data ---
    print("\n[1/4] Loading data from database...")
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()
    cur.execute('''
        SELECT signature_id, image_filename, assigned_accountant, feature_vector
        FROM signatures
        WHERE feature_vector IS NOT NULL AND assigned_accountant IS NOT NULL
    ''')
    rows = cur.fetchall()
    sig_ids = []
    filenames = []
    accountants = []
    features = []
    for row in rows:
        sig_ids.append(row[0])
        filenames.append(row[1])
        accountants.append(row[2])
        # Feature vectors are stored as raw float32 blobs
        features.append(np.frombuffer(row[3], dtype=np.float32))
    features = np.array(features)
    print(f" Loaded {len(sig_ids)} signatures")
    # --- Step 2: Find closest match per signature ---
    print("\n[2/4] Finding closest match per signature (same accountant)...")
    acct_groups = defaultdict(list)
    for i, acct in enumerate(accountants):
        acct_groups[acct].append(i)
    # Load checkpoint of already-processed IDs (resume support)
    processed_ids = load_checkpoint()
    print(f" Checkpoint: {len(processed_ids)} already processed")
    # Prepare one task per signature: (id, own file, closest file, cosine)
    tasks = []
    for acct, indices in acct_groups.items():
        if len(indices) < 2:
            continue  # nothing to compare against
        vecs = features[indices]
        # NOTE(review): treats the dot product as cosine similarity, which
        # assumes the stored vectors are already unit-normalized — confirm
        # against the feature-extraction step.
        sim_matrix = vecs @ vecs.T
        np.fill_diagonal(sim_matrix, -1)  # Exclude self
        for local_i, global_i in enumerate(indices):
            if sig_ids[global_i] in processed_ids:
                continue
            best_local = np.argmax(sim_matrix[local_i])
            best_global = indices[best_local]
            best_sim = float(sim_matrix[local_i, best_local])
            tasks.append((
                sig_ids[global_i],
                filenames[global_i],
                filenames[best_global],
                best_sim
            ))
    print(f" Tasks to process: {len(tasks)}")
    # --- Step 3: Compute SSIM/pHash in parallel ---
    print(f"\n[3/4] Computing SSIM & pHash ({len(tasks)} pairs, {NUM_WORKERS} workers)...")
    # Add SSIM columns to database if not exist
    try:
        cur.execute('ALTER TABLE signatures ADD COLUMN ssim_to_closest REAL')
    except:
        pass
    try:
        cur.execute('ALTER TABLE signatures ADD COLUMN phash_distance_to_closest INTEGER')
    except:
        pass
    try:
        cur.execute('ALTER TABLE signatures ADD COLUMN histogram_corr_to_closest REAL')
    except:
        pass
    try:
        cur.execute('ALTER TABLE signatures ADD COLUMN pixel_identical_to_closest INTEGER')
    except:
        pass
    try:
        cur.execute('ALTER TABLE signatures ADD COLUMN closest_match_file TEXT')
    except:
        pass
    conn.commit()
    total = len(tasks)
    done = 0
    batch_results = []
    with Pool(NUM_WORKERS) as pool:
        # imap_unordered: results arrive in completion order, which is fine
        # because each result carries its own signature_id
        for result in pool.imap_unordered(compute_pair_ssim, tasks, chunksize=50):
            batch_results.append(result)
            done += 1
            if done % BATCH_SIZE == 0 or done == total:
                # Save batch to database, then checkpoint the processed IDs
                for r in batch_results:
                    cur.execute('''
                        UPDATE signatures SET
                            ssim_to_closest = ?,
                            phash_distance_to_closest = ?,
                            histogram_corr_to_closest = ?,
                            pixel_identical_to_closest = ?,
                            closest_match_file = ?
                        WHERE signature_id = ?
                    ''', (
                        r['ssim'],
                        r['phash_distance'],
                        r['histogram_corr'],
                        1 if r['pixel_identical'] else 0,
                        r['match_file'],
                        r['signature_id']
                    ))
                    processed_ids.add(r['signature_id'])
                conn.commit()
                save_checkpoint(processed_ids)
                batch_results = []
                elapsed = time.time() - start_time
                rate = done / elapsed
                eta = (total - done) / rate if rate > 0 else 0
                print(f" {done:,}/{total:,} ({100*done/total:.1f}%) "
                      f"| {rate:.1f} pairs/s | ETA: {eta/60:.1f} min")
    # --- Step 4: Generate complete CSV ---
    print(f"\n[4/4] Generating complete CSV...")
    cur.execute('''
        SELECT
            s.source_pdf,
            s.year_month,
            s.serial_number,
            s.doc_type,
            s.page_number,
            s.sig_index,
            s.image_filename,
            s.assigned_accountant,
            s.excel_accountant1,
            s.excel_accountant2,
            s.excel_firm,
            s.detection_confidence,
            s.signature_verdict,
            s.max_similarity_to_same_accountant,
            s.ssim_to_closest,
            s.phash_distance_to_closest,
            s.histogram_corr_to_closest,
            s.pixel_identical_to_closest,
            s.closest_match_file,
            a.risk_level,
            a.mean_similarity as acct_mean_similarity,
            a.ratio_gt_95 as acct_ratio_gt_95
        FROM signatures s
        LEFT JOIN accountants a ON s.assigned_accountant = a.name
        ORDER BY s.source_pdf, s.sig_index
    ''')
    # CSV header: order must mirror the SELECT column order above
    columns = [
        'source_pdf', 'year_month', 'serial_number', 'doc_type',
        'page_number', 'sig_index', 'image_filename',
        'assigned_accountant', 'excel_accountant1', 'excel_accountant2', 'excel_firm',
        'detection_confidence', 'signature_verdict',
        'max_cosine_similarity', 'ssim_to_closest', 'phash_distance_to_closest',
        'histogram_corr_to_closest', 'pixel_identical_to_closest', 'closest_match_file',
        'accountant_risk_level', 'accountant_mean_similarity', 'accountant_ratio_gt_95'
    ]
    with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(columns)
        for row in cur:
            writer.writerow(row)
    # Count rows
    cur.execute('SELECT COUNT(*) FROM signatures')
    total_sigs = cur.fetchone()[0]
    cur.execute('SELECT COUNT(DISTINCT source_pdf) FROM signatures')
    total_pdfs = cur.fetchone()[0]
    conn.close()
    elapsed = time.time() - start_time
    print(f"\n{'='*70}")
    print(f"Complete!")
    print(f" Total signatures: {total_sigs:,}")
    print(f" Total PDFs: {total_pdfs:,}")
    print(f" Output: {OUTPUT_CSV}")
    print(f" Time: {elapsed/60:.1f} minutes")
    print(f"{'='*70}")
    # Clean up checkpoint: the run finished, so a stale checkpoint would
    # wrongly skip work on the next invocation
    if os.path.exists(CHECKPOINT_PATH):
        os.remove(CHECKPOINT_PATH)


if __name__ == '__main__':
    main()
@@ -0,0 +1,407 @@
#!/usr/bin/env python3
"""
Generate PDF-level aggregated report with multi-method verdicts.
One row per PDF with all Group A-F columns plus new SSIM/pHash/combined verdicts.
"""
import sqlite3
import csv
import numpy as np
from datetime import datetime
from collections import defaultdict
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
OUTPUT_CSV = '/Volumes/NV2/PDF-Processing/signature-analysis/reports/pdf_level_complete_report.csv'
# Thresholds from statistical analysis
COSINE_THRESHOLD = 0.95
COSINE_STATISTICAL = 0.944 # mu + 2*sigma
KDE_CROSSOVER = 0.838
SSIM_HIGH = 0.95
SSIM_MEDIUM = 0.80
PHASH_IDENTICAL = 0
PHASH_SIMILAR = 5
def classify_overall(max_cosine, max_ssim, min_phash, has_pixel_identical):
    """Multi-method combined verdict for one PDF.

    Args:
        max_cosine: highest same-accountant cosine similarity (or None).
        max_ssim: highest SSIM against the closest match (or None).
        min_phash: smallest dHash Hamming distance (or None).
        has_pixel_identical: whether any signature pair is pixel-identical
            (None when the check never ran).

    Returns:
        4-tuple (verdict, confidence_level, n_methods_copy, n_methods_total).
        (The previous docstring advertised a 3-tuple, but callers unpack four
        values; the dead, never-read `evidence_genuine` counter is removed.)
    """
    evidence_copy = 0
    total_methods = 0
    # Method 1: Cosine similarity
    if max_cosine is not None:
        total_methods += 1
        if max_cosine > COSINE_THRESHOLD:
            evidence_copy += 1
    # Method 2: SSIM
    if max_ssim is not None:
        total_methods += 1
        if max_ssim > SSIM_HIGH:
            evidence_copy += 1
    # Method 3: pHash
    if min_phash is not None:
        total_methods += 1
        if min_phash <= PHASH_IDENTICAL:
            evidence_copy += 1
    # Method 4: Pixel identical
    if has_pixel_identical is not None:
        total_methods += 1
        if has_pixel_identical:
            evidence_copy += 1
    # Decision cascade: strongest, most direct evidence first
    if has_pixel_identical:
        verdict = 'definite_copy'
        confidence = 'very_high'
    elif max_ssim is not None and max_ssim > SSIM_HIGH and min_phash is not None and min_phash <= PHASH_SIMILAR:
        verdict = 'definite_copy'
        confidence = 'very_high'
    elif evidence_copy >= 3:
        verdict = 'very_likely_copy'
        confidence = 'high'
    elif evidence_copy >= 2:
        verdict = 'likely_copy'
        confidence = 'medium'
    elif max_cosine is not None and max_cosine > COSINE_THRESHOLD:
        verdict = 'likely_copy'
        confidence = 'medium'
    elif max_cosine is not None and max_cosine > KDE_CROSSOVER:
        verdict = 'uncertain'
        confidence = 'low'
    elif max_cosine is not None and max_cosine <= KDE_CROSSOVER:
        verdict = 'likely_genuine'
        confidence = 'medium'
    else:
        verdict = 'unknown'
        confidence = 'none'
    return verdict, confidence, evidence_copy, total_methods
def main():
    """Aggregate per-signature metrics into one row per PDF and write the CSV.

    Combines cosine / SSIM / dHash / histogram / pixel-identity evidence into
    per-method verdicts (groups G-L) plus a multi-method overall verdict
    (group M), then prints distribution summaries.
    """
    print("=" * 70)
    print("PDF-Level Aggregated Report Generator")
    print("=" * 70)
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()
    # Load all signature data grouped by PDF
    print("\n[1/3] Loading signature data...")
    cur.execute('''
        SELECT
            s.source_pdf,
            s.year_month,
            s.serial_number,
            s.doc_type,
            s.page_number,
            s.sig_index,
            s.assigned_accountant,
            s.excel_accountant1,
            s.excel_accountant2,
            s.excel_firm,
            s.detection_confidence,
            s.signature_verdict,
            s.max_similarity_to_same_accountant,
            s.ssim_to_closest,
            s.phash_distance_to_closest,
            s.histogram_corr_to_closest,
            s.pixel_identical_to_closest,
            a.risk_level,
            a.mean_similarity,
            a.ratio_gt_95,
            a.signature_count
        FROM signatures s
        LEFT JOIN accountants a ON s.assigned_accountant = a.name
        ORDER BY s.source_pdf, s.sig_index
    ''')
    # Group rows by PDF (row[0] is source_pdf)
    pdf_data = defaultdict(list)
    for row in cur:
        pdf_data[row[0]].append(row)
    print(f" {len(pdf_data)} PDFs loaded")
    # Generate PDF-level rows
    print("\n[2/3] Aggregating per-PDF statistics...")
    columns = [
        # Group A: PDF Identity
        'source_pdf', 'year_month', 'serial_number', 'doc_type',
        # Group B: Excel Master Data
        'accountant_1', 'accountant_2', 'firm',
        # Group C: YOLO Detection
        'n_signatures_detected', 'avg_detection_confidence',
        # Group D: Cosine Similarity
        'max_cosine_similarity', 'min_cosine_similarity', 'avg_cosine_similarity',
        # Group E: Verdict (original per-sig)
        'sig1_cosine_verdict', 'sig2_cosine_verdict',
        # Group F: Accountant Risk
        'acct1_name', 'acct1_risk_level', 'acct1_mean_similarity',
        'acct1_ratio_gt_95', 'acct1_total_signatures',
        'acct2_name', 'acct2_risk_level', 'acct2_mean_similarity',
        'acct2_ratio_gt_95', 'acct2_total_signatures',
        # Group G: SSIM (NEW)
        'max_ssim', 'min_ssim', 'avg_ssim',
        'verdict_ssim',
        # Group H: pHash (NEW)
        'min_phash_distance', 'max_phash_distance', 'avg_phash_distance',
        'verdict_phash',
        # Group I: Histogram Correlation (NEW)
        'max_histogram_corr', 'avg_histogram_corr',
        # Group J: Pixel Identity (NEW)
        'has_pixel_identical',
        'verdict_pixel',
        # Group K: Statistical Threshold (NEW)
        'verdict_statistical',  # Based on mu+2sigma (0.944)
        # Group L: KDE Crossover (NEW)
        'verdict_kde',  # Based on KDE crossover (0.838)
        # Group M: Multi-Method Combined (NEW)
        'overall_verdict',
        'confidence_level',
        'n_methods_copy',
        'n_methods_total',
    ]
    rows = []
    for pdf_name, sigs in pdf_data.items():
        # Group A: Identity (from first signature)
        first = sigs[0]
        year_month = first[1]
        serial_number = first[2]
        doc_type = first[3]
        # Group B: Excel data
        excel_acct1 = first[7]
        excel_acct2 = first[8]
        excel_firm = first[9]
        # Group C: Detection
        n_sigs = len(sigs)
        confidences = [s[10] for s in sigs if s[10] is not None]
        avg_conf = np.mean(confidences) if confidences else None
        # Group D: Cosine similarity
        cosines = [s[12] for s in sigs if s[12] is not None]
        max_cosine = max(cosines) if cosines else None
        min_cosine = min(cosines) if cosines else None
        avg_cosine = np.mean(cosines) if cosines else None
        # Group E: Per-sig verdicts
        verdicts = [s[11] for s in sigs]
        sig1_verdict = verdicts[0] if len(verdicts) > 0 else None
        sig2_verdict = verdicts[1] if len(verdicts) > 1 else None
        # Group F: Accountant risk - separate for acct1 and acct2
        # Match by assigned_accountant to excel_accountant1/2; the last two
        # elifs are a positional fallback when the assigned name matches
        # neither Excel column.
        # NOTE(review): that fallback can place an unmatched accountant in
        # the acct1 slot even when it corresponds to accountant 2 — confirm
        # this ordering assumption against the assignment step.
        acct1_info = {'name': None, 'risk': None, 'mean_sim': None, 'ratio': None, 'count': None}
        acct2_info = {'name': None, 'risk': None, 'mean_sim': None, 'ratio': None, 'count': None}
        for s in sigs:
            assigned = s[6]
            if assigned and assigned == excel_acct1 and acct1_info['name'] is None:
                acct1_info = {
                    'name': assigned, 'risk': s[17],
                    'mean_sim': s[18], 'ratio': s[19], 'count': s[20]
                }
            elif assigned and assigned == excel_acct2 and acct2_info['name'] is None:
                acct2_info = {
                    'name': assigned, 'risk': s[17],
                    'mean_sim': s[18], 'ratio': s[19], 'count': s[20]
                }
            elif assigned and acct1_info['name'] is None:
                acct1_info = {
                    'name': assigned, 'risk': s[17],
                    'mean_sim': s[18], 'ratio': s[19], 'count': s[20]
                }
            elif assigned and acct2_info['name'] is None:
                acct2_info = {
                    'name': assigned, 'risk': s[17],
                    'mean_sim': s[18], 'ratio': s[19], 'count': s[20]
                }
        # Group G: SSIM
        ssims = [s[13] for s in sigs if s[13] is not None]
        max_ssim = max(ssims) if ssims else None
        min_ssim = min(ssims) if ssims else None
        avg_ssim = np.mean(ssims) if ssims else None
        if max_ssim is not None:
            if max_ssim > SSIM_HIGH:
                verdict_ssim = 'copy'
            elif max_ssim > SSIM_MEDIUM:
                verdict_ssim = 'suspicious'
            else:
                verdict_ssim = 'genuine'
        else:
            verdict_ssim = None
        # Group H: pHash
        phashes = [s[14] for s in sigs if s[14] is not None]
        min_phash = min(phashes) if phashes else None
        max_phash = max(phashes) if phashes else None
        avg_phash = np.mean(phashes) if phashes else None
        if min_phash is not None:
            if min_phash <= PHASH_IDENTICAL:
                verdict_phash = 'copy'
            elif min_phash <= PHASH_SIMILAR:
                verdict_phash = 'suspicious'
            else:
                verdict_phash = 'genuine'
        else:
            verdict_phash = None
        # Group I: Histogram correlation
        histcorrs = [s[15] for s in sigs if s[15] is not None]
        max_histcorr = max(histcorrs) if histcorrs else None
        avg_histcorr = np.mean(histcorrs) if histcorrs else None
        # Group J: Pixel identical
        pixel_ids = [s[16] for s in sigs if s[16] is not None]
        has_pixel = any(p == 1 for p in pixel_ids) if pixel_ids else False
        verdict_pixel = 'copy' if has_pixel else 'genuine'
        # Group K: Statistical threshold (mu+2sigma = 0.944)
        if max_cosine is not None:
            if max_cosine > COSINE_STATISTICAL:
                verdict_stat = 'copy'
            elif max_cosine > KDE_CROSSOVER:
                verdict_stat = 'uncertain'
            else:
                verdict_stat = 'genuine'
        else:
            verdict_stat = None
        # Group L: KDE crossover (0.838)
        if max_cosine is not None:
            if max_cosine > KDE_CROSSOVER:
                verdict_kde = 'above_crossover'
            else:
                verdict_kde = 'below_crossover'
        else:
            verdict_kde = None
        # Group M: Multi-method combined
        overall, confidence, n_copy, n_total = classify_overall(
            max_cosine, max_ssim, min_phash, has_pixel)
        rows.append([
            # A
            pdf_name, year_month, serial_number, doc_type,
            # B
            excel_acct1, excel_acct2, excel_firm,
            # C
            n_sigs, avg_conf,
            # D
            max_cosine, min_cosine, avg_cosine,
            # E
            sig1_verdict, sig2_verdict,
            # F
            acct1_info['name'], acct1_info['risk'], acct1_info['mean_sim'],
            acct1_info['ratio'], acct1_info['count'],
            acct2_info['name'], acct2_info['risk'], acct2_info['mean_sim'],
            acct2_info['ratio'], acct2_info['count'],
            # G
            max_ssim, min_ssim, avg_ssim, verdict_ssim,
            # H
            min_phash, max_phash, avg_phash, verdict_phash,
            # I
            max_histcorr, avg_histcorr,
            # J
            1 if has_pixel else 0, verdict_pixel,
            # K
            verdict_stat,
            # L
            verdict_kde,
            # M
            overall, confidence, n_copy, n_total,
        ])
    # Write CSV
    print(f"\n[3/3] Writing {len(rows)} PDF rows to CSV...")
    with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(columns)
        writer.writerows(rows)
    conn.close()
    # Print summary statistics
    print(f"\n{'='*70}")
    print("SUMMARY")
    print(f"{'='*70}")
    print(f"Total PDFs: {len(rows):,}")
    # Overall verdict distribution (r[-4]/r[-3] are overall_verdict/confidence)
    verdict_counts = defaultdict(int)
    confidence_counts = defaultdict(int)
    for r in rows:
        verdict_counts[r[-4]] += 1
        confidence_counts[r[-3]] += 1
    print(f"\n--- Overall Verdict Distribution ---")
    for v in ['definite_copy', 'very_likely_copy', 'likely_copy', 'uncertain', 'likely_genuine', 'unknown']:
        c = verdict_counts.get(v, 0)
        print(f" {v:20s}: {c:>6,} ({100*c/len(rows):5.1f}%)")
    print(f"\n--- Confidence Level Distribution ---")
    for c_level in ['very_high', 'high', 'medium', 'low', 'none']:
        c = confidence_counts.get(c_level, 0)
        print(f" {c_level:10s}: {c:>6,} ({100*c/len(rows):5.1f}%)")
    # Per-method verdict distribution
    # Column indices: verdict_ssim=27, verdict_phash=31, verdict_pixel=35, verdict_stat=36, verdict_kde=37
    print(f"\n--- Per-Method Verdict Distribution ---")
    for col_idx, method_name in [(27, 'SSIM'), (31, 'pHash'), (35, 'Pixel'), (36, 'Statistical'), (37, 'KDE')]:
        counts = defaultdict(int)
        for r in rows:
            counts[r[col_idx]] += 1
        print(f"\n {method_name}:")
        for k, v in sorted(counts.items(), key=lambda x: -x[1]):
            print(f" {str(k):20s}: {v:>6,} ({100*v/len(rows):5.1f}%)")
    # Cross-method agreement (r[9] = max_cosine_similarity, r[34] = has_pixel_identical)
    print(f"\n--- Method Agreement (cosine>0.95 PDFs) ---")
    cosine_copy = [r for r in rows if r[9] is not None and r[9] > COSINE_THRESHOLD]
    if cosine_copy:
        ssim_agree = sum(1 for r in cosine_copy if r[27] == 'copy')
        phash_agree = sum(1 for r in cosine_copy if r[31] == 'copy')
        pixel_agree = sum(1 for r in cosine_copy if r[34] == 1)
        print(f" PDFs with cosine > 0.95: {len(cosine_copy):,}")
        print(f" Also SSIM > 0.95: {ssim_agree:>6,} ({100*ssim_agree/len(cosine_copy):5.1f}%)")
        print(f" Also pHash = 0: {phash_agree:>6,} ({100*phash_agree/len(cosine_copy):5.1f}%)")
        print(f" Also pixel-identical: {pixel_agree:>4,} ({100*pixel_agree/len(cosine_copy):5.1f}%)")
    print(f"\nOutput: {OUTPUT_CSV}")
    print(f"{'='*70}")


if __name__ == '__main__':
    main()