#!/usr/bin/env python3
"""Step 4: build a visual report of the highest-similarity signature pairs.

Reads ``high_similarity_pairs.json``, renders a side-by-side comparison image
for each of the top N pairs and writes everything into one self-contained
HTML report (images are embedded as base64 data URIs).
"""

import base64
import html
import json
from datetime import datetime
from io import BytesIO
from pathlib import Path

import cv2
import numpy as np
from tqdm import tqdm

# Path configuration
IMAGES_DIR = Path("/Volumes/NV2/PDF-Processing/yolo-signatures/images")
REPORTS_PATH = Path("/Volumes/NV2/PDF-Processing/signature-analysis/reports")
HIGH_SIM_JSON = REPORTS_PATH / "high_similarity_pairs.json"

# Report configuration
TOP_N = 100  # number of top pairs shown in the report

# Kept out of the f-strings below so the CSS braces need no escaping.
_REPORT_CSS = """
body { font-family: -apple-system, "Helvetica Neue", Arial, sans-serif;
       margin: 2em auto; max-width: 1200px; padding: 0 1em;
       background: #f5f5f5; color: #222; }
.summary, .pair { background: #fff; border-radius: 8px; padding: 1em 1.5em;
                  margin-bottom: 1.5em; box-shadow: 0 1px 3px rgba(0,0,0,.15); }
.pair img { max-width: 100%; border: 1px solid #ddd; margin: .5em 0; }
.badge { display: inline-block; padding: .2em .6em; border-radius: 4px;
         color: #fff; font-weight: bold; }
.badge.very-high { background: #c0392b; }
.badge.high { background: #e67e22; }
.tag { display: inline-block; padding: .1em .5em; margin-right: .4em;
       border-radius: 3px; font-size: .85em; background: #ecf0f1; }
.tag.same-serial { background: #d6eaf8; }
.tag.same-month { background: #d5f5e3; }
.tag.different { background: #fadbd8; }
.files, .meta { font-family: monospace; font-size: .9em; color: #555; }
.error { color: #c0392b; }
footer { text-align: center; color: #888; font-size: .85em; }
"""


def load_image(filename: str) -> np.ndarray:
    """Load a signature image from ``IMAGES_DIR``.

    Args:
        filename: File name relative to ``IMAGES_DIR``.

    Returns:
        The image as read by ``cv2.imread``. If the file cannot be read, a
        white 100x200 three-channel placeholder is returned instead, so one
        missing file does not abort the whole report.
    """
    img = cv2.imread(str(IMAGES_DIR / filename))
    if img is None:
        return np.full((100, 200, 3), 255, dtype=np.uint8)
    return img


def _resize_to_height(img: np.ndarray, target_h: int) -> np.ndarray:
    """Scale *img* to *target_h* rows, keeping its aspect ratio."""
    h, w = img.shape[:2]
    if h == target_h:
        return img
    scale = target_h / h
    # Clamp to one pixel: a very narrow image would otherwise round down to
    # width 0, which cv2.resize rejects.
    return cv2.resize(img, (max(1, int(w * scale)), target_h))


def create_comparison_image(file1: str, file2: str, similarity: float) -> np.ndarray:
    """Build a side-by-side comparison image of two signatures.

    Both images are scaled to a common height (the taller of the two, but at
    least 100 px) and joined horizontally with a 20 px light-grey separator.

    Args:
        file1: File name of the first signature image.
        file2: File name of the second signature image.
        similarity: Similarity score of the pair. Not used for rendering;
            kept so existing callers keep working.

    Returns:
        The combined image.
    """
    img1 = load_image(file1)
    img2 = load_image(file2)

    target_h = max(img1.shape[0], img2.shape[0], 100)
    img1 = _resize_to_height(img1, target_h)
    img2 = _resize_to_height(img2, target_h)

    separator = np.full((target_h, 20, 3), 200, dtype=np.uint8)
    return np.hstack([img1, separator, img2])


def image_to_base64(img: np.ndarray) -> str:
    """Encode an image as PNG and return it as a base64 ASCII string.

    Raises:
        ValueError: If OpenCV reports that PNG encoding failed.
    """
    ok, buffer = cv2.imencode('.png', img)
    if not ok:
        raise ValueError("PNG encoding failed")
    return base64.b64encode(buffer.tobytes()).decode('utf-8')


def _esc(value) -> str:
    """HTML-escape any value for safe inclusion in the report."""
    return html.escape(str(value))


def _relation_tags(p1: dict, p2: dict) -> list:
    """Return ``(label, css_class)`` tags describing how two documents relate.

    A tag is only emitted when the fields it compares are present on both
    sides; otherwise two records with missing metadata would compare equal
    (``None == None``) and be labelled as sharing a serial or month.
    """
    serial1, serial2 = p1.get('serial'), p2.get('serial')
    month1, month2 = p1.get('year_month'), p2.get('year_month')
    have_serials = serial1 is not None and serial2 is not None
    have_months = month1 is not None and month2 is not None

    tags = []
    if have_serials and serial1 == serial2:
        tags.append(('同序號', 'same-serial'))
    if have_months and month1 == month2:
        tags.append(('同月份', 'same-month'))
    if have_serials and have_months and serial1 != serial2 and month1 != month2:
        tags.append(('完全不同文件', 'different'))
    return tags


def _render_pair(index: int, pair: dict) -> str:
    """Render the HTML card for one similarity pair."""
    sim = pair['similarity']
    file1 = pair['file1']
    file2 = pair['file2']
    # ``or {}`` also covers an explicit JSON null for the parsed metadata.
    p1 = pair.get('parsed1') or {}
    p2 = pair.get('parsed2') or {}

    badge_class = 'very-high' if sim >= 0.99 else 'high'

    # Best effort per pair: a failure to render one image is reported inside
    # the card instead of aborting the whole report.
    try:
        comparison_img = create_comparison_image(file1, file2, sim)
        img_base64 = image_to_base64(comparison_img)
        img_html = (
            f'<img src="data:image/png;base64,{img_base64}" '
            f'alt="{_esc(file1)} vs {_esc(file2)}">'
        )
    except Exception as e:
        img_html = f'<p class="error">無法載入圖片: {_esc(e)}</p>'

    tag_html = ''.join(
        f'<span class="tag {css}">{_esc(label)}</span>'
        for label, css in _relation_tags(p1, p2)
    )

    return f"""
<div class="pair">
  <h3>#{index} <span class="badge {badge_class}">相似度: {sim:.4f}</span></h3>
  <div class="files">簽名 1: {_esc(file1)}</div>
  <div class="files">簽名 2: {_esc(file2)}</div>
  {img_html}
  <div class="tags">{tag_html}</div>
  <div class="meta">日期: {_esc(p1.get('year_month', 'N/A'))} vs {_esc(p2.get('year_month', 'N/A'))} | 序號: {_esc(p1.get('serial', 'N/A'))} vs {_esc(p2.get('serial', 'N/A'))}</div>
</div>
"""


def generate_html_report(pairs: list, output_path: Path, top_n: int = TOP_N):
    """Write the HTML report for the first *top_n* entries of *pairs*.

    Args:
        pairs: Pair records, expected to be sorted by descending similarity.
            Each record needs ``similarity``, ``file1`` and ``file2``;
            ``parsed1`` / ``parsed2`` (dicts with ``serial`` and
            ``year_month``) are optional.
        output_path: Destination of the HTML file.
        top_n: Number of leading pairs to render. Defaults to ``TOP_N``.
    """
    parts = [f"""<!DOCTYPE html>
<html lang="zh-Hant">
<head>
<meta charset="utf-8">
<title>簽名相似度分析報告 - 高相似度案例</title>
<style>{_REPORT_CSS}</style>
</head>
<body>
<h1>簽名相似度分析報告 - 高相似度案例</h1>
<div class="summary">
  <h2>摘要</h2>
  <p>分析結果:發現 {len(pairs):,} 對高相似度簽名 (>0.95)</p>
  <p>本報告顯示:Top {top_n} 最高相似度案例</p>
  <p>結論:存在大量相似度接近或等於 1.0 的簽名對,強烈暗示「複製貼上」行為</p>
</div>
"""]

    # Collect fragments and join once instead of growing a string with +=.
    parts.extend(
        _render_pair(i, pair) for i, pair in enumerate(pairs[:top_n], 1)
    )

    parts.append(f"""
<footer>
  <p>生成時間: {datetime.now():%Y-%m-%d %H:%M} | 簽名真實性研究計劃</p>
</footer>
</body>
</html>
""")

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(''.join(parts))

    print(f"HTML 報告已儲存: {output_path}")


def main():
    """Load the similarity pairs, print summary statistics, write the report."""
    print("=" * 60)
    print("Step 4: 生成高相似度案例視覺化報告")
    print("=" * 60)

    # Load the high-similarity pairs
    print("載入高相似度對資料...")
    with open(HIGH_SIM_JSON, 'r', encoding='utf-8') as f:
        pairs = json.load(f)

    print(f"共 {len(pairs):,} 對高相似度簽名")

    # Highest similarity first
    pairs_sorted = sorted(pairs, key=lambda x: x['similarity'], reverse=True)

    # Statistics; >= 0.9999 is reported as "identical" to allow for float noise
    sim_1 = sum(1 for p in pairs_sorted if p['similarity'] >= 0.9999)
    sim_99 = sum(1 for p in pairs_sorted if p['similarity'] >= 0.99)
    sim_97 = sum(1 for p in pairs_sorted if p['similarity'] >= 0.97)

    print(f"\n相似度統計:")
    print(f" = 1.0 (完全相同): {sim_1:,}")
    print(f" >= 0.99: {sim_99:,}")
    print(f" >= 0.97: {sim_97:,}")

    # Generate the report
    print(f"\n生成 Top {TOP_N} 視覺化報告...")
    generate_html_report(pairs_sorted, REPORTS_PATH / "high_similarity_report.html")

    print("\n完成!")


if __name__ == "__main__":
    main()