pdf_signature_extraction/signature_analysis/04_generate_visual_report.py

#!/usr/bin/env python3
"""
Step 4: 生成高相似度案例的視覺化報告

讀取 high_similarity_pairs.json
為 Top N 高相似度對生成並排對比圖
生成 HTML 報告
"""

import json
import cv2
import numpy as np
from pathlib import Path
from tqdm import tqdm
import base64
from io import BytesIO

# Path configuration
# Directory that load_image() reads signature image files from.
IMAGES_DIR = Path("/Volumes/NV2/PDF-Processing/yolo-signatures/images")
# Directory holding the input JSON and receiving the generated HTML report.
REPORTS_PATH = Path("/Volumes/NV2/PDF-Processing/signature-analysis/reports")
# JSON file of high-similarity pairs that main() loads.
HIGH_SIM_JSON = REPORTS_PATH / "high_similarity_pairs.json"

# Report configuration
TOP_N = 100  # render only the first N pairs in the HTML report


def load_image(filename: str) -> np.ndarray:
    """Read an image from IMAGES_DIR, substituting a white placeholder on failure.

    Args:
        filename: File name relative to ``IMAGES_DIR``.

    Returns:
        The array returned by ``cv2.imread``; if that call returns ``None``,
        a 100x200 three-channel ``uint8`` array filled with 255 instead.
    """
    loaded = cv2.imread(str(IMAGES_DIR / filename))
    if loaded is not None:
        return loaded
    # A None result from cv2.imread is replaced by a blank white image so the
    # caller always receives an array it can lay out.
    return np.full((100, 200, 3), 255, dtype=np.uint8)


def create_comparison_image(file1: str, file2: str, similarity: float) -> np.ndarray:
    """Build a side-by-side comparison image for two signature files.

    Both images are brought to a common height (the taller of the two, but
    never less than 100 pixels), with widths scaled by the same factor, then
    joined horizontally with a 20-pixel grey divider between them.

    Args:
        file1: File name of the left image, resolved by ``load_image``.
        file2: File name of the right image, resolved by ``load_image``.
        similarity: Accepted for interface compatibility; not used here.

    Returns:
        A single image array: left image, divider, right image.
    """
    left = load_image(file1)
    right = load_image(file2)
    target_h = max(left.shape[0], right.shape[0], 100)

    def fit_height(image: np.ndarray) -> np.ndarray:
        # Leave images already at the target height untouched; otherwise
        # scale width by the same factor as height (truncated to an int).
        height, width = image.shape[:2]
        if height == target_h:
            return image
        factor = target_h / height
        return cv2.resize(image, (int(width * factor), target_h))

    left = fit_height(left)
    right = fit_height(right)

    # Light-grey vertical strip separating the two signatures.
    divider = np.full((target_h, 20, 3), 200, dtype=np.uint8)

    return np.hstack((left, divider, right))


def image_to_base64(img: np.ndarray) -> str:
    """Encode an image as PNG and return the bytes as base64 text.

    Args:
        img: Image array to pass to ``cv2.imencode``.

    Returns:
        Base64 string of the PNG-encoded image, usable after a
        ``data:image/png;base64,`` prefix.

    Raises:
        ValueError: If ``cv2.imencode`` reports that encoding did not succeed.
    """
    ok, buffer = cv2.imencode('.png', img)
    # imencode returns a success flag alongside the buffer; ignoring it would
    # let a failed encode be embedded as a broken image.
    if not ok:
        raise ValueError("cv2.imencode failed to encode the image as PNG")
    return base64.b64encode(buffer).decode('utf-8')


def generate_html_report(pairs: list, output_path: Path):
    """Write a standalone HTML report for the first TOP_N similarity pairs.

    Each rendered pair becomes a card showing its rank, similarity score, the
    two file names, an embedded side-by-side comparison image, and
    relationship tags derived from the optional ``parsed1`` / ``parsed2``
    metadata. All data-derived text is HTML-escaped before insertion.

    Args:
        pairs: Pair dicts with ``similarity``, ``file1`` and ``file2`` keys and
            optional ``parsed1`` / ``parsed2`` dicts (``serial``,
            ``year_month``). Only the first ``TOP_N`` entries are rendered, in
            the order given; the summary reports ``len(pairs)``.
        output_path: Destination of the HTML file (written as UTF-8).

    Raises:
        KeyError: If a rendered pair lacks ``similarity``, ``file1`` or
            ``file2``.
        OSError: If the output file cannot be written.
    """
    # Function-scope imports keep this block self-contained.
    from datetime import datetime
    from html import escape

    def display(value) -> str:
        # Missing metadata is shown as N/A; everything else is escaped text.
        return 'N/A' if value is None else escape(str(value))

    page_head = """
<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <title>簽名相似度分析報告 - 高相似度案例</title>
    <style>
        body {
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            max-width: 1400px;
            margin: 0 auto;
            padding: 20px;
            background-color: #f5f5f5;
        }
        h1 {
            color: #333;
            text-align: center;
            border-bottom: 2px solid #666;
            padding-bottom: 10px;
        }
        .summary {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 20px;
            border-radius: 10px;
            margin-bottom: 30px;
        }
        .summary h2 {
            margin-top: 0;
        }
        .pair-card {
            background: white;
            border-radius: 10px;
            padding: 20px;
            margin-bottom: 20px;
            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
        }
        .pair-header {
            display: flex;
            justify-content: space-between;
            align-items: center;
            margin-bottom: 15px;
            padding-bottom: 10px;
            border-bottom: 1px solid #eee;
        }
        .pair-number {
            font-size: 1.2em;
            font-weight: bold;
            color: #333;
        }
        .similarity-badge {
            background: #dc3545;
            color: white;
            padding: 5px 15px;
            border-radius: 20px;
            font-weight: bold;
        }
        .similarity-badge.high {
            background: #dc3545;
        }
        .similarity-badge.very-high {
            background: #8b0000;
        }
        .file-info {
            font-family: monospace;
            font-size: 0.9em;
            color: #666;
            margin-bottom: 10px;
        }
        .comparison-image {
            max-width: 100%;
            border: 1px solid #ddd;
            border-radius: 5px;
        }
        .analysis {
            margin-top: 15px;
            padding: 10px;
            background: #f8f9fa;
            border-radius: 5px;
            font-size: 0.9em;
        }
        .tag {
            display: inline-block;
            padding: 2px 8px;
            border-radius: 3px;
            margin-right: 5px;
            font-size: 0.8em;
        }
        .tag-same-serial { background: #ffebee; color: #c62828; }
        .tag-same-month { background: #fff3e0; color: #e65100; }
        .tag-diff { background: #e8f5e9; color: #2e7d32; }
    </style>
</head>
<body>
    <h1>簽名相似度分析報告 - 高相似度案例</h1>

    <div class="summary">
        <h2>摘要</h2>
"""

    # The pair count comes from the supplied data rather than a fixed figure,
    # so the summary stays correct when the input changes.
    summary = f"""        <p><strong>分析結果：</strong>發現 {len(pairs):,} 對高相似度簽名 (>0.95)</p>
        <p><strong>本報告顯示：</strong>Top {TOP_N} 最高相似度案例</p>
        <p><strong>結論：</strong>存在大量相似度接近或等於 1.0 的簽名對，強烈暗示「複製貼上」行為</p>
    </div>

    <div class="pairs-container">
"""

    # Collect fragments and join once; the embedded base64 images make
    # repeated string concatenation needlessly expensive.
    parts = [page_head, summary]

    for rank, pair in enumerate(pairs[:TOP_N], 1):
        sim = pair['similarity']
        file1 = pair['file1']
        file2 = pair['file2']
        # `or {}` also covers metadata stored as an explicit JSON null.
        p1 = pair.get('parsed1') or {}
        p2 = pair.get('parsed2') or {}

        serial1, serial2 = p1.get('serial'), p2.get('serial')
        month1, month2 = p1.get('year_month'), p2.get('year_month')

        # Relationship tags are only asserted when the compared values are
        # actually present; two missing values must not count as "the same".
        serials_known = serial1 is not None and serial2 is not None
        months_known = month1 is not None and month2 is not None
        tags = []
        if serials_known and serial1 == serial2:
            tags.append('<span class="tag tag-same-serial">同序號</span>')
        if months_known and month1 == month2:
            tags.append('<span class="tag tag-same-month">同月份</span>')
        if serials_known and months_known and month1 != month2 and serial1 != serial2:
            tags.append('<span class="tag tag-diff">完全不同文件</span>')
        tag_html = ''.join(tags)

        badge_class = 'very-high' if sim >= 0.99 else 'high'

        # Best effort per card: any failure while building the image is shown
        # inline so one bad pair does not abort the whole report.
        try:
            comparison_img = create_comparison_image(file1, file2, sim)
            img_base64 = image_to_base64(comparison_img)
            img_html = f'<img src="data:image/png;base64,{img_base64}" class="comparison-image">'
        except Exception as e:
            img_html = f'<p style="color:red">無法載入圖片: {escape(str(e))}</p>'

        parts.append(f"""
        <div class="pair-card">
            <div class="pair-header">
                <span class="pair-number">#{rank}</span>
                <span class="similarity-badge {badge_class}">相似度: {sim:.4f}</span>
            </div>
            <div class="file-info">
                <strong>簽名 1:</strong> {escape(str(file1))}<br>
                <strong>簽名 2:</strong> {escape(str(file2))}
            </div>
            {img_html}
            <div class="analysis">
                {tag_html}
                <br><small>日期: {display(month1)} vs {display(month2)} |
                序號: {display(serial1)} vs {display(serial2)}</small>
            </div>
        </div>
""")

    # Stamp the footer with the actual generation time.
    generated_at = datetime.now().strftime('%Y-%m-%d %H:%M')
    parts.append(f"""
    </div>

    <div style="text-align: center; margin-top: 30px; color: #666;">
        <p>生成時間: {generated_at} | 簽名真實性研究計劃</p>
    </div>
</body>
</html>
""")

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(''.join(parts))

    print(f"HTML 報告已儲存: {output_path}")


def main():
    """Load the high-similarity pairs, print summary counts, and write the report.

    Reads ``HIGH_SIM_JSON``, orders the pairs by descending ``similarity``,
    prints how many pairs reach the 0.9999, 0.99 and 0.97 thresholds, and
    writes ``high_similarity_report.html`` into ``REPORTS_PATH``.
    """
    banner = "=" * 60
    print(banner)
    print("Step 4: 生成高相似度案例視覺化報告")
    print(banner)

    # Load the pair list produced for this report.
    print("載入高相似度對資料...")
    with open(HIGH_SIM_JSON, 'r', encoding='utf-8') as source:
        pairs = json.load(source)

    print(f"共 {len(pairs):,} 對高相似度簽名")

    # Highest similarity first.
    ranked = list(pairs)
    ranked.sort(key=lambda entry: entry['similarity'], reverse=True)

    # Count pairs at or above each threshold; the "= 1.0" line printed below
    # uses a 0.9999 cutoff.
    exact_count, count_99, count_97 = [
        sum(1 for entry in ranked if entry['similarity'] >= cutoff)
        for cutoff in (0.9999, 0.99, 0.97)
    ]

    print("\n相似度統計:")
    print(f"  = 1.0 (完全相同): {exact_count:,}")
    print(f"  >= 0.99: {count_99:,}")
    print(f"  >= 0.97: {count_97:,}")

    # Render the HTML report.
    print(f"\n生成 Top {TOP_N} 視覺化報告...")
    report_path = REPORTS_PATH / "high_similarity_report.html"
    generate_html_report(ranked, report_path)

    print("\n完成！")


# Run the report generation only when this file is executed as a script.
if __name__ == "__main__":
    main()