Add Paper A (IEEE TAI) complete draft with Firm A-calibrated dual-method classification
Paper draft includes all sections (Abstract through Conclusion), 36 references, and supporting scripts.

Key methodology: cosine similarity + dHash dual-method verification, with thresholds calibrated against a known-replication firm (Firm A).

Includes:
- 8 section markdown files (paper_a_*.md)
- Ablation study script (ResNet-50 vs VGG-16 vs EfficientNet-B0)
- Recalibrated classification script (84,386 PDFs, 5-tier system)
- Figure generation and Word export scripts
- Citation renumbering script ([1]-[36])
- Signature analysis pipeline (12 steps)
- YOLO extraction scripts

Three rounds of AI review completed (GPT-5.4, Claude Opus 4.6, Gemini 3 Pro).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,274 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Step 4: 生成高相似度案例的視覺化報告
|
||||
|
||||
讀取 high_similarity_pairs.json
|
||||
為 Top N 高相似度對生成並排對比圖
|
||||
生成 HTML 報告
|
||||
"""
|
||||
|
||||
import json
|
||||
import cv2
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
from tqdm import tqdm
|
||||
import base64
|
||||
from io import BytesIO
|
||||
|
||||
# Path configuration
|
||||
# Directory holding the signature image files that load_image() reads.
IMAGES_DIR = Path("/Volumes/NV2/PDF-Processing/yolo-signatures/images")
|
||||
# Directory holding the input JSON and receiving the generated HTML report.
REPORTS_PATH = Path("/Volumes/NV2/PDF-Processing/signature-analysis/reports")
|
||||
# JSON file with the high-similarity pairs loaded by main().
HIGH_SIM_JSON = REPORTS_PATH / "high_similarity_pairs.json"
|
||||
|
||||
# Report configuration
|
||||
TOP_N = 100 # Number of highest-similarity pairs shown in the report
|
||||
|
||||
|
||||
def load_image(filename: str) -> np.ndarray:
    """Read a signature image from ``IMAGES_DIR``.

    Args:
        filename: File name relative to ``IMAGES_DIR``.

    Returns:
        The array returned by ``cv2.imread``, or a white 100x200
        three-channel ``uint8`` placeholder when OpenCV cannot read the file.
    """
    loaded = cv2.imread(str(IMAGES_DIR / filename))
    if loaded is not None:
        return loaded
    # Missing or unreadable file: fall back to a blank white canvas so the
    # report can still be rendered.
    return np.full((100, 200, 3), 255, dtype=np.uint8)
|
||||
|
||||
|
||||
def create_comparison_image(file1: str, file2: str, similarity: float) -> np.ndarray:
    """Build a side-by-side comparison image of two signatures.

    Both files are loaded with ``load_image``, resized to a common height
    (the taller of the two, but never less than 100 px) with the width scaled
    by the same factor, and stacked horizontally with a 20 px light-grey
    separator between them.

    Args:
        file1: File name of the left-hand signature.
        file2: File name of the right-hand signature.
        similarity: Similarity score of the pair; accepted for the caller's
            convenience but not used when composing the image.

    Returns:
        The combined image as a single array.
    """
    images = [load_image(file1), load_image(file2)]
    target_h = max(100, *(im.shape[0] for im in images))

    panels = []
    for im in images:
        height, width = im.shape[:2]
        if height != target_h:
            # Keep the aspect ratio while matching the shared height.
            factor = target_h / height
            im = cv2.resize(im, (int(width * factor), target_h))
        panels.append(im)

    # Light-grey vertical strip separating the two signatures.
    gap = np.full((target_h, 20, 3), 200, dtype=np.uint8)

    left, right = panels
    return np.hstack([left, gap, right])
|
||||
|
||||
|
||||
def image_to_base64(img: np.ndarray) -> str:
    """Encode an image as PNG and return it as a base64 string.

    Args:
        img: Image array accepted by ``cv2.imencode``.

    Returns:
        The PNG bytes, base64-encoded and decoded to a UTF-8 ``str``.

    Raises:
        ValueError: If OpenCV reports that PNG encoding failed.
    """
    ok, buffer = cv2.imencode('.png', img)
    if not ok:
        # The status flag was previously ignored; surface the failure instead
        # of base64-encoding whatever buffer came back.
        raise ValueError("cv2.imencode failed to encode the image as PNG")
    return base64.b64encode(buffer).decode('utf-8')
|
||||
|
||||
|
||||
def _relationship_tags(p1: dict, p2: dict) -> str:
    """Return the HTML tag badges describing how two parsed file names relate."""
    serial1, serial2 = p1.get('serial'), p2.get('serial')
    month1, month2 = p1.get('year_month'), p2.get('year_month')

    # A value missing on either side means "unknown", not "equal"; otherwise
    # pairs without parsed metadata would compare None == None and be tagged
    # as sharing both serial and month.
    serial_known = serial1 is not None and serial2 is not None
    month_known = month1 is not None and month2 is not None
    same_serial = serial_known and serial1 == serial2
    same_month = month_known and month1 == month2

    tags = []
    if same_serial:
        tags.append('<span class="tag tag-same-serial">同序號</span>')
    if same_month:
        tags.append('<span class="tag tag-same-month">同月份</span>')
    if serial_known and month_known and not same_serial and not same_month:
        tags.append('<span class="tag tag-diff">完全不同文件</span>')
    return ''.join(tags)


def _render_pair_card(index: int, pair: dict) -> str:
    """Render the HTML card (header, file names, image, tags) for one pair."""
    from html import escape

    sim = pair['similarity']
    file1 = pair['file1']
    file2 = pair['file2']
    # `or {}` also covers an explicit JSON null for the parsed metadata.
    p1 = pair.get('parsed1') or {}
    p2 = pair.get('parsed2') or {}

    badge_class = 'very-high' if sim >= 0.99 else 'high'

    # Best effort: a pair whose image cannot be produced still gets a card,
    # with the error text shown in place of the picture.
    try:
        comparison_img = create_comparison_image(file1, file2, sim)
        img_base64 = image_to_base64(comparison_img)
        img_html = f'<img src="data:image/png;base64,{img_base64}" class="comparison-image">'
    except Exception as e:
        img_html = f'<p style="color:red">無法載入圖片: {escape(str(e))}</p>'

    tag_html = _relationship_tags(p1, p2)

    # Everything that originates from the JSON file is escaped before it is
    # placed into the markup.
    name1 = escape(str(file1))
    name2 = escape(str(file2))
    month1 = escape(str(p1.get('year_month', 'N/A')))
    month2 = escape(str(p2.get('year_month', 'N/A')))
    serial1 = escape(str(p1.get('serial', 'N/A')))
    serial2 = escape(str(p2.get('serial', 'N/A')))

    return f"""
<div class="pair-card">
<div class="pair-header">
<span class="pair-number">#{index}</span>
<span class="similarity-badge {badge_class}">相似度: {sim:.4f}</span>
</div>
<div class="file-info">
<strong>簽名 1:</strong> {name1}<br>
<strong>簽名 2:</strong> {name2}
</div>
{img_html}
<div class="analysis">
{tag_html}
<br><small>日期: {month1} vs {month2} |
序號: {serial1} vs {serial2}</small>
</div>
</div>
"""


def generate_html_report(pairs: list, output_path: Path, top_n=None):
    """Write an HTML report showing the leading similarity pairs.

    Each pair is rendered as a card with its similarity score, the two file
    names, an embedded side-by-side PNG (base64 data URI) and relationship
    tags derived from the optional ``parsed1`` / ``parsed2`` metadata.

    Args:
        pairs: Pair dicts with ``similarity``, ``file1`` and ``file2`` keys,
            and optionally ``parsed1`` / ``parsed2`` dicts carrying ``serial``
            and ``year_month``. Only the first ``top_n`` entries are rendered,
            in the order given.
        output_path: Destination of the HTML file (written as UTF-8).
        top_n: Number of pairs to render (expected to be non-negative).
            Defaults to the module-level ``TOP_N`` when ``None``.
    """
    limit = TOP_N if top_n is None else top_n

    # NOTE(review): the pair count in the summary (659,111) is a fixed string,
    # not derived from `pairs` -- confirm it still matches the data set.
    parts = ["""
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>簽名相似度分析報告 - 高相似度案例</title>
<style>
body {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
max-width: 1400px;
margin: 0 auto;
padding: 20px;
background-color: #f5f5f5;
}
h1 {
color: #333;
text-align: center;
border-bottom: 2px solid #666;
padding-bottom: 10px;
}
.summary {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 20px;
border-radius: 10px;
margin-bottom: 30px;
}
.summary h2 {
margin-top: 0;
}
.pair-card {
background: white;
border-radius: 10px;
padding: 20px;
margin-bottom: 20px;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
}
.pair-header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 15px;
padding-bottom: 10px;
border-bottom: 1px solid #eee;
}
.pair-number {
font-size: 1.2em;
font-weight: bold;
color: #333;
}
.similarity-badge {
background: #dc3545;
color: white;
padding: 5px 15px;
border-radius: 20px;
font-weight: bold;
}
.similarity-badge.high {
background: #dc3545;
}
.similarity-badge.very-high {
background: #8b0000;
}
.file-info {
font-family: monospace;
font-size: 0.9em;
color: #666;
margin-bottom: 10px;
}
.comparison-image {
max-width: 100%;
border: 1px solid #ddd;
border-radius: 5px;
}
.analysis {
margin-top: 15px;
padding: 10px;
background: #f8f9fa;
border-radius: 5px;
font-size: 0.9em;
}
.tag {
display: inline-block;
padding: 2px 8px;
border-radius: 3px;
margin-right: 5px;
font-size: 0.8em;
}
.tag-same-serial { background: #ffebee; color: #c62828; }
.tag-same-month { background: #fff3e0; color: #e65100; }
.tag-diff { background: #e8f5e9; color: #2e7d32; }
</style>
</head>
<body>
<h1>簽名相似度分析報告 - 高相似度案例</h1>

<div class="summary">
<h2>摘要</h2>
<p><strong>分析結果:</strong>發現 659,111 對高相似度簽名 (>0.95)</p>
<p><strong>本報告顯示:</strong>Top """ + str(limit) + """ 最高相似度案例</p>
<p><strong>結論:</strong>存在大量相似度接近或等於 1.0 的簽名對,強烈暗示「複製貼上」行為</p>
</div>

<div class="pairs-container">
"""]

    # Collect the cards in a list and join once; each card embeds a large
    # base64 image, so repeated string concatenation would keep re-copying it.
    for i, pair in enumerate(pairs[:limit], 1):
        parts.append(_render_pair_card(i, pair))

    # NOTE(review): the footer's generation time is the fixed text "2024".
    parts.append("""
</div>

<div style="text-align: center; margin-top: 30px; color: #666;">
<p>生成時間: 2024 | 簽名真實性研究計劃</p>
</div>
</body>
</html>
""")

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(''.join(parts))

    print(f"HTML 報告已儲存: {output_path}")
|
||||
|
||||
|
||||
def main():
    """Load the high-similarity pairs, print summary counts and build the report.

    Reads ``HIGH_SIM_JSON``, orders the pairs by descending ``similarity``,
    prints how many pairs reach the 0.9999 / 0.99 / 0.97 levels, and writes
    ``high_similarity_report.html`` into ``REPORTS_PATH``.
    """
    banner = "=" * 60
    print(banner)
    print("Step 4: 生成高相似度案例視覺化報告")
    print(banner)

    # Load the pair list.
    print("載入高相似度對資料...")
    with open(HIGH_SIM_JSON, 'r', encoding='utf-8') as f:
        pairs = json.load(f)

    print(f"共 {len(pairs):,} 對高相似度簽名")

    # Highest similarity first; the loaded list itself is left untouched.
    pairs_sorted = list(pairs)
    pairs_sorted.sort(key=lambda entry: entry['similarity'], reverse=True)

    def count_at_least(threshold):
        """Count the pairs whose similarity is at least ``threshold``."""
        return sum(1 for entry in pairs_sorted if entry['similarity'] >= threshold)

    # Scores of 0.9999 and above are reported as "identical".
    sim_1 = count_at_least(0.9999)
    sim_99 = count_at_least(0.99)
    sim_97 = count_at_least(0.97)

    print(f"\n相似度統計:")
    print(f" = 1.0 (完全相同): {sim_1:,}")
    print(f" >= 0.99: {sim_99:,}")
    print(f" >= 0.97: {sim_97:,}")

    # Build the HTML report.
    print(f"\n生成 Top {TOP_N} 視覺化報告...")
    report_file = REPORTS_PATH / "high_similarity_report.html"
    generate_html_report(pairs_sorted, report_file)

    print("\n完成!")
|
||||
|
||||
|
||||
# Script entry point: generate the report only when this file is run directly.
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user