#!/usr/bin/env python3
"""
Step 4: 生成高相似度案例的視覺化報告
讀取 high_similarity_pairs.json
為 Top N 高相似度對生成並排對比圖
生成 HTML 報告
"""
import json
import cv2
import numpy as np
from pathlib import Path
from tqdm import tqdm
import base64
from io import BytesIO
# Path configuration
# Directory that load_image() resolves file names against.
IMAGES_DIR = Path("/Volumes/NV2/PDF-Processing/yolo-signatures/images")
# Directory holding the input JSON; main() also writes the HTML report here.
REPORTS_PATH = Path("/Volumes/NV2/PDF-Processing/signature-analysis/reports")
# Pair list loaded by main().
HIGH_SIM_JSON = REPORTS_PATH / "high_similarity_pairs.json"
# Report configuration
TOP_N = 100 # show the top N pairs
def load_image(filename: str) -> np.ndarray:
    """Read an image from IMAGES_DIR, falling back to a white placeholder.

    Args:
        filename: File name resolved relative to IMAGES_DIR.

    Returns:
        Whatever ``cv2.imread`` returns for the file, or a 100x200,
        3-channel uint8 array filled with 255 when ``cv2.imread``
        returns ``None``.
    """
    loaded = cv2.imread(str(IMAGES_DIR / filename))
    if loaded is not None:
        return loaded
    # cv2.imread signals failure with None rather than raising; substitute
    # a blank white canvas so the caller can still build a comparison.
    return np.full((100, 200, 3), 255, dtype=np.uint8)
def create_comparison_image(file1: str, file2: str, similarity: float) -> np.ndarray:
    """Compose two images side by side at a common height.

    Both images are loaded with ``load_image``. Any image whose height
    differs from the target height is rescaled (width scaled by the same
    factor), then the two are joined horizontally with a 20-pixel-wide
    separator of value 200 between them.

    Args:
        file1: File name of the left image.
        file2: File name of the right image.
        similarity: Accepted for interface compatibility; not used here.

    Returns:
        The horizontally stacked comparison image.
    """
    panels = [load_image(file1), load_image(file2)]

    # Common height: the taller of the two images, but never below 100.
    target_h = max(100, *(panel.shape[0] for panel in panels))

    for idx, panel in enumerate(panels):
        height, width = panel.shape[:2]
        if height != target_h:
            factor = target_h / height
            panels[idx] = cv2.resize(panel, (int(width * factor), target_h))

    divider = np.full((target_h, 20, 3), 200, dtype=np.uint8)
    left, right = panels
    return np.hstack([left, divider, right])
def image_to_base64(img: np.ndarray) -> str:
    """Encode an image as PNG and return it as a base64 text string.

    Args:
        img: Image array accepted by ``cv2.imencode``.

    Returns:
        The base64 encoding of the PNG bytes, decoded as UTF-8 text.

    Raises:
        ValueError: If ``cv2.imencode`` reports that encoding failed.
    """
    # cv2.imencode returns a (success, buffer) pair; the flag must be checked,
    # otherwise a failed encode would be silently turned into a bogus string.
    ok, buffer = cv2.imencode('.png', img)
    if not ok:
        raise ValueError("cv2.imencode failed to encode the image as PNG")
    return base64.b64encode(buffer).decode('utf-8')
def generate_html_report(pairs: list, output_path: Path):
"""Build the report for the highest-similarity pairs and write it to disk.

At most TOP_N entries of ``pairs`` are rendered, in the order given. Each
entry must provide 'similarity', 'file1' and 'file2'; 'parsed1' and 'parsed2'
are optional dicts from which 'serial' and 'year_month' are read.

Args:
pairs: Pair records. The order is used as-is; main() passes them sorted
by descending similarity.
output_path: Destination file, written as UTF-8.

NOTE(review): file names and exception text are interpolated into the output
without escaping -- confirm the inputs are trusted.
"""
# Report header and summary. The pair count quoted in the text is a fixed
# literal; only TOP_N is interpolated.
html_content = """
簽名相似度分析報告 - 高相似度案例
簽名相似度分析報告 - 高相似度案例
摘要
分析結果:發現 659,111 對高相似度簽名 (>0.95)
本報告顯示:Top """ + str(TOP_N) + """ 最高相似度案例
結論:存在大量相似度接近或等於 1.0 的簽名對,強烈暗示「複製貼上」行為
"""
# Render at most TOP_N pairs; the counter starts at 1.
for i, pair in enumerate(pairs[:TOP_N], 1):
sim = pair['similarity']
file1 = pair['file1']
file2 = pair['file2']
# Parsed metadata is optional; default to empty dicts so .get() is safe.
p1 = pair.get('parsed1', {})
p2 = pair.get('parsed2', {})
# Classify how the two source documents relate.
# NOTE(review): when a key is missing on both sides, None == None is True,
# so the "same serial"/"same month" tags are also added for missing
# metadata -- confirm this is intended.
tags = []
if p1.get('serial') == p2.get('serial'):
tags.append(('
同序號', ''))
if p1.get('year_month') == p2.get('year_month'):
tags.append(('
同月份', ''))
if p1.get('year_month') != p2.get('year_month') and p1.get('serial') != p2.get('serial'):
tags.append(('
完全不同文件', ''))
# Bucket the score: 'very-high' at >= 0.99, otherwise 'high'.
badge_class = 'very-high' if sim >= 0.99 else 'high'
# Build the side-by-side comparison image and encode it as base64.
# Any exception is caught so that one bad pair does not abort the report;
# the error text is placed in img_html instead.
try:
comparison_img = create_comparison_image(file1, file2, sim)
img_base64 = image_to_base64(comparison_img)
img_html = f'

'
except Exception as e:
img_html = f'
無法載入圖片: {e}
'
# Only the first element of each tag tuple is emitted.
tag_html = ''.join([t[0] for t in tags])
# Append this pair's section to the document.
html_content += f"""
簽名 1: {file1}
簽名 2: {file2}
{img_html}
{tag_html}
日期: {p1.get('year_month', 'N/A')} vs {p2.get('year_month', 'N/A')} |
序號: {p1.get('serial', 'N/A')} vs {p2.get('serial', 'N/A')}
"""
# Append the trailing segment after the last pair.
html_content += """
"""
# Write the assembled document as UTF-8 and report where it was saved.
with open(output_path, 'w', encoding='utf-8') as f:
f.write(html_content)
print(f"HTML 報告已儲存: {output_path}")
def main():
    """Load the high-similarity pairs, print summary counts, and write the report.

    Reads the JSON list at HIGH_SIM_JSON, sorts it by descending
    'similarity', prints how many pairs reach the 0.9999, 0.99 and 0.97
    thresholds, and passes the sorted list to ``generate_html_report``,
    which writes ``high_similarity_report.html`` under REPORTS_PATH.
    """
    banner = "=" * 60
    print(banner)
    print("Step 4: 生成高相似度案例視覺化報告")
    print(banner)

    # Load the high-similarity pairs.
    print("載入高相似度對資料...")
    with HIGH_SIM_JSON.open('r', encoding='utf-8') as fh:
        pairs = json.load(fh)
    print(f"共 {len(pairs):,} 對高相似度簽名")

    # Highest similarity first.
    pairs_sorted = sorted(pairs, key=lambda rec: rec['similarity'], reverse=True)

    def count_at_least(cutoff):
        """Return how many pairs have similarity >= cutoff."""
        return sum(1 for rec in pairs_sorted if rec['similarity'] >= cutoff)

    # All counts are computed before anything is printed.
    # 0.9999 is the cutoff used for the "= 1.0" line of the summary.
    sim_1 = count_at_least(0.9999)
    sim_99 = count_at_least(0.99)
    sim_97 = count_at_least(0.97)

    print("\n相似度統計:")
    print(f" = 1.0 (完全相同): {sim_1:,}")
    print(f" >= 0.99: {sim_99:,}")
    print(f" >= 0.97: {sim_97:,}")

    # Generate the report.
    print(f"\n生成 Top {TOP_N} 視覺化報告...")
    report_path = REPORTS_PATH / "high_similarity_report.html"
    generate_html_report(pairs_sorted, report_path)
    print("\n完成!")
# Script entry point: run the report pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()