Files
pdf_signature_extraction/paper/generate_paper_figures.py
T
gbanyan 939a348da4 Add Paper A (IEEE TAI) complete draft with Firm A-calibrated dual-method classification
Paper draft includes all sections (Abstract through Conclusion), 36 references,
and supporting scripts. Key methodology: Cosine similarity + dHash dual-method
verification with thresholds calibrated against known-replication firm (Firm A).

Includes:
- 8 section markdown files (paper_a_*.md)
- Ablation study script (ResNet-50 vs VGG-16 vs EfficientNet-B0)
- Recalibrated classification script (84,386 PDFs, 5-tier system)
- Figure generation and Word export scripts
- Citation renumbering script ([1]-[36])
- Signature analysis pipeline (12 steps)
- YOLO extraction scripts

Three rounds of AI review completed (GPT-5.4, Claude Opus 4.6, Gemini 3 Pro).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-06 23:05:33 +08:00

393 lines
13 KiB
Python

#!/usr/bin/env python3
"""
Generate all figures for Paper A (IEEE TAI submission).
Outputs to /Volumes/NV2/PDF-Processing/signature-analysis/paper_figures/
"""
import numpy as np
import sqlite3
import json
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.patches import FancyBboxPatch, FancyArrowPatch
from collections import defaultdict
from pathlib import Path
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Inputs: the signature-analysis SQLite database and the JSON file holding
# pre-computed ablation results.
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
ABLATION_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/ablation/ablation_results.json'

# Output directory for every figure; created on import if it does not exist.
OUTPUT_DIR = Path('/Volumes/NV2/PDF-Processing/signature-analysis/paper_figures')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Seed NumPy's global RNG so that random pair sampling repeats between runs.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# IEEE formatting: serif fonts, small type sizes, 300 dpi, tight bounding box.
_IEEE_RC_PARAMS = {
    'font.family': 'serif',
    'font.serif': ['Times New Roman', 'DejaVu Serif'],
    'font.size': 9,
    'axes.labelsize': 10,
    'axes.titlesize': 10,
    'xtick.labelsize': 8,
    'ytick.labelsize': 8,
    'legend.fontsize': 8,
    'figure.dpi': 300,
    'savefig.dpi': 300,
    'savefig.bbox': 'tight',
    'savefig.pad_inches': 0.05,
}
plt.rcParams.update(_IEEE_RC_PARAMS)

# IEEE column widths, in inches.
COL_WIDTH = 3.5    # single column
FULL_WIDTH = 7.16  # full page width
def load_signature_data():
    """Load per-signature best-match similarities and accountant info.

    Reads every ``signatures`` row that has both an assigned accountant and a
    non-NULL ``max_similarity_to_same_accountant``, LEFT JOINed to
    ``accountants`` on the accountant name to pick up the firm.

    Returns:
        dict with three row-aligned entries:
            'accountants': list of assigned accountant names,
            'max_sims': NumPy array of the best-match similarity values,
            'firms': list of firm names (``None`` where the accountant has
                no matching row in ``accountants``).

    Raises:
        sqlite3.Error: if the database cannot be opened or queried.
    """
    # Query text is kept verbatim (hence flush-left inside the literal).
    query = '''
SELECT s.assigned_accountant, s.max_similarity_to_same_accountant, a.firm
FROM signatures s
LEFT JOIN accountants a ON s.assigned_accountant = a.name
WHERE s.max_similarity_to_same_accountant IS NOT NULL
AND s.assigned_accountant IS NOT NULL
'''
    conn = sqlite3.connect(DB_PATH)
    try:
        rows = conn.execute(query).fetchall()
    finally:
        # Release the database handle even when the query raises.
        conn.close()

    return {
        'accountants': [row[0] for row in rows],
        'max_sims': np.array([row[1] for row in rows]),
        'firms': [row[2] for row in rows],
    }
def load_intra_inter_from_features(*, n_inter_pairs=500_000, min_intra_group_size=3):
    """Compute intra/inter class distributions from feature vectors.

    Loads every signature that has both a feature vector and an assigned
    accountant, decodes each vector blob as float32, and groups row indices
    by accountant.

    * Intra-class: for each accountant with at least ``min_intra_group_size``
      signatures, the dot product of every unordered pair of that
      accountant's vectors.
    * Inter-class: ``n_inter_pairs`` random draws; each draw picks two
      distinct accountants, then one signature from each, and records the
      dot product. Sampling uses NumPy's global RNG, so results depend on
      the seed set by the caller/module.

    NOTE(review): the dot product equals cosine similarity only if the stored
    vectors are L2-normalised -- confirm against the feature-extraction step.

    Args:
        n_inter_pairs: Number of random inter-class pairs to sample.
        min_intra_group_size: Accountants with fewer signatures than this
            are skipped for the intra-class distribution.

    Returns:
        Tuple ``(intra_sims, inter_sims)`` of 1-D NumPy arrays.

    Raises:
        sqlite3.Error: if the database cannot be opened or queried.
        ValueError: if inter-class pairs are requested but fewer than two
            accountants have usable signatures.
    """
    print("Loading features for intra/inter distributions...")
    # Query text is kept verbatim (hence flush-left inside the literal).
    query = '''
SELECT assigned_accountant, feature_vector
FROM signatures
WHERE feature_vector IS NOT NULL AND assigned_accountant IS NOT NULL
'''
    conn = sqlite3.connect(DB_PATH)
    try:
        rows = conn.execute(query).fetchall()
    finally:
        # Release the database handle even when the query raises.
        conn.close()

    # Map accountant -> row indices into the feature matrix.
    acct_groups = defaultdict(list)
    features_list = []
    for idx, (accountant, blob) in enumerate(rows):
        features_list.append(np.frombuffer(blob, dtype=np.float32))
        acct_groups[accountant].append(idx)
    features = np.array(features_list)
    print(f" Loaded {len(features)} signatures, {len(acct_groups)} accountants")

    # Intra-class
    print(" Computing intra-class...")
    intra_sims = []
    for indices in acct_groups.values():
        if len(indices) < min_intra_group_size:
            continue
        vecs = features[indices]
        sim_matrix = vecs @ vecs.T
        # Strict upper triangle: each unordered pair once, no self-pairs.
        upper = np.triu_indices(len(indices), k=1)
        intra_sims.extend(sim_matrix[upper].tolist())
    intra_sims = np.array(intra_sims)
    print(f" Intra-class: {len(intra_sims):,} pairs")

    # Inter-class
    print(" Computing inter-class...")
    all_acct_list = list(acct_groups.keys())
    if n_inter_pairs > 0 and len(all_acct_list) < 2:
        raise ValueError(
            "Inter-class sampling needs at least two accountants with "
            f"feature vectors; found {len(all_acct_list)}"
        )
    inter_sims = []
    # The order of RNG calls is kept exactly as-is so that a fixed seed keeps
    # producing the same sample.
    for _ in range(n_inter_pairs):
        a1, a2 = np.random.choice(len(all_acct_list), 2, replace=False)
        i1 = np.random.choice(acct_groups[all_acct_list[a1]])
        i2 = np.random.choice(acct_groups[all_acct_list[a2]])
        inter_sims.append(float(features[i1] @ features[i2]))
    inter_sims = np.array(inter_sims)
    print(f" Inter-class: {len(inter_sims):,} pairs")
    return intra_sims, inter_sims
def fig1_pipeline(output_path):
    """Fig 1: Pipeline architecture diagram.

    Draws six rounded stage boxes from left to right, joins consecutive
    boxes with an arrow, and places a short italic annotation beneath each
    arrow. All labels and counts are hard-coded literals; nothing is read
    from the database. The figure is written twice: as PNG to
    ``output_path`` and as PDF to the same path with a ``.pdf`` suffix.

    Args:
        output_path: Destination of the PNG file (``str`` or
            ``pathlib.Path``).
    """
    print("Generating Fig 1: Pipeline...")
    # Accept plain strings as well; ``with_suffix`` below needs a Path.
    output_path = Path(output_path)

    fig, ax = plt.subplots(1, 1, figsize=(FULL_WIDTH, 1.8))
    ax.set_xlim(0, 10)
    ax.set_ylim(0, 2)
    ax.axis('off')

    # Stages: (box label, fill colour), drawn left to right.
    # NOTE(review): the fifth label says "pHash"; confirm this matches the
    # hash method named in the paper text.
    stages = [
        ("90,282\nPDFs", "#E3F2FD"),
        ("VLM\nPre-screen", "#BBDEFB"),
        ("YOLO\nDetection", "#90CAF9"),
        ("ResNet-50\nFeatures", "#64B5F6"),
        ("Cosine +\npHash", "#42A5F5"),
        ("Calibration\n& Classify", "#1E88E5"),
    ]
    # One annotation per arrow, i.e. one fewer than the number of stages.
    annotations = [
        "86,072 docs",
        "182,328 sigs",
        "2048-dim",
        "Dual verify",
        "Verdicts",
    ]

    box_w = 1.3
    box_h = 1.0
    gap = 0.38
    start_x = 0.15
    y_center = 1.0

    for i, (label, color) in enumerate(stages):
        x = start_x + i * (box_w + gap)
        box = FancyBboxPatch(
            (x, y_center - box_h/2), box_w, box_h,
            boxstyle="round,pad=0.1",
            facecolor=color, edgecolor='#1565C0', linewidth=1.2
        )
        ax.add_patch(box)
        # Dark text on the three lighter boxes, white on the darker ones.
        ax.text(x + box_w/2, y_center, label,
                ha='center', va='center', fontsize=8, fontweight='bold',
                color='#0D47A1' if i < 3 else 'white')

        # Arrow + annotation (none after the final stage)
        if i < len(stages) - 1:
            arrow_x = x + box_w + 0.02
            ax.annotate('', xy=(arrow_x + gap - 0.04, y_center),
                        xytext=(arrow_x, y_center),
                        arrowprops=dict(arrowstyle='->', color='#1565C0', lw=1.5))
            ax.text(arrow_x + gap/2, y_center - 0.62, annotations[i],
                    ha='center', va='top', fontsize=6.5, color='#555555', style='italic')

    # Save and close this specific figure rather than pyplot's "current" one.
    fig.savefig(output_path, format='png')
    fig.savefig(output_path.with_suffix('.pdf'), format='pdf')
    plt.close(fig)
    print(f" Saved: {output_path}")
def _last_kde_crossover(x_grid, y_first, y_second, lower=0.5, upper=1.0):
    """Return the largest grid point in (lower, upper) where the curves swap order, else None."""
    diff = y_first - y_second
    # Index i is reported when sign(diff) differs between grid points i and
    # i+1, so the returned x is the grid point just left of the change.
    sign_changes = np.where(np.diff(np.sign(diff)))[0]
    crossovers = x_grid[sign_changes]
    valid = crossovers[(crossovers > lower) & (crossovers < upper)]
    return valid[-1] if len(valid) > 0 else None


def fig2_intra_inter_kde(intra_sims, inter_sims, output_path):
    """Fig 2: Intra vs Inter class cosine similarity distributions.

    Fits a Gaussian KDE (bandwidth factor 0.02) to each sample, plots both
    densities on a 500-point grid over [0.3, 1.0], and marks the highest
    grid point strictly between 0.5 and 1.0 at which the two densities swap
    order, if there is one. The figure is written as PNG to ``output_path``
    and as PDF to the same path with a ``.pdf`` suffix.

    Args:
        intra_sims: 1-D array of same-accountant similarities.
        inter_sims: 1-D array of different-accountant similarities.
        output_path: Destination of the PNG file (``str`` or
            ``pathlib.Path``).
    """
    print("Generating Fig 2: Intra vs Inter KDE...")
    from scipy.stats import gaussian_kde

    # Accept plain strings as well; ``with_suffix`` below needs a Path.
    output_path = Path(output_path)

    fig, ax = plt.subplots(1, 1, figsize=(COL_WIDTH, 2.5))
    x_grid = np.linspace(0.3, 1.0, 500)
    kde_intra = gaussian_kde(intra_sims, bw_method=0.02)
    kde_inter = gaussian_kde(inter_sims, bw_method=0.02)
    y_intra = kde_intra(x_grid)
    y_inter = kde_inter(x_grid)

    ax.fill_between(x_grid, y_intra, alpha=0.3, color='#E53935', label='Intra-class (same CPA)')
    ax.fill_between(x_grid, y_inter, alpha=0.3, color='#1E88E5', label='Inter-class (diff. CPA)')
    ax.plot(x_grid, y_intra, color='#C62828', linewidth=1.5)
    ax.plot(x_grid, y_inter, color='#1565C0', linewidth=1.5)

    # Find crossover
    xover = _last_kde_crossover(x_grid, y_intra, y_inter)
    if xover is not None:
        ax.axvline(x=xover, color='#4CAF50', linestyle='--', linewidth=1.2, alpha=0.8)
        ax.text(xover + 0.01, ax.get_ylim()[1] * 0.85, f'KDE crossover\n= {xover:.3f}',
                fontsize=7, color='#2E7D32', va='top')

    ax.set_xlabel('Cosine Similarity')
    ax.set_ylabel('Density')
    ax.legend(loc='upper left', framealpha=0.9)
    ax.set_xlim(0.35, 1.0)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    fig.tight_layout()
    # Save and close this specific figure rather than pyplot's "current" one.
    fig.savefig(output_path, format='png')
    fig.savefig(output_path.with_suffix('.pdf'), format='pdf')
    plt.close(fig)
    print(f" Saved: {output_path}")
def fig3_firm_a_calibration(data, output_path, firm_a_name='勤業眾信聯合'):
    """Fig 3: Firm A calibration - per-signature best match distribution.

    Splits ``data['max_sims']`` into signatures whose firm equals
    ``firm_a_name`` and all remaining signatures (including those with no
    firm), fits a Gaussian KDE (bandwidth factor 0.015) to each group, and
    plots both densities over [0.5, 1.0]. Firm A's 1st percentile is drawn as
    a dotted, annotated vertical line and its mean as a dashed vertical line.
    The figure is written as PNG to ``output_path`` and as PDF to the same
    path with a ``.pdf`` suffix.

    Args:
        data: dict with row-aligned 'firms' (sequence) and 'max_sims'
            (NumPy array) entries, as returned by ``load_signature_data``.
        output_path: Destination of the PNG file (``str`` or
            ``pathlib.Path``).
        firm_a_name: Firm name, as stored in ``data['firms']``, that is
            plotted as "Firm A".
    """
    print("Generating Fig 3: Firm A Calibration...")
    from scipy.stats import gaussian_kde

    # Accept plain strings as well; ``with_suffix`` below needs a Path.
    output_path = Path(output_path)

    # Explicit bool dtype keeps ``~`` valid even when there are no rows.
    firm_a_mask = np.array([f == firm_a_name for f in data['firms']], dtype=bool)
    non_firm_a_mask = ~firm_a_mask
    firm_a_sims = data['max_sims'][firm_a_mask]
    others_sims = data['max_sims'][non_firm_a_mask]

    fig, ax = plt.subplots(1, 1, figsize=(COL_WIDTH, 2.5))
    x_grid = np.linspace(0.5, 1.0, 500)
    kde_a = gaussian_kde(firm_a_sims, bw_method=0.015)
    kde_others = gaussian_kde(others_sims, bw_method=0.015)
    y_a = kde_a(x_grid)
    y_others = kde_others(x_grid)

    ax.fill_between(x_grid, y_a, alpha=0.35, color='#E53935',
                    label=f'Firm A (known replication, n={len(firm_a_sims):,})')
    ax.fill_between(x_grid, y_others, alpha=0.25, color='#78909C',
                    label=f'Other CPAs (n={len(others_sims):,})')
    ax.plot(x_grid, y_a, color='#C62828', linewidth=1.5)
    ax.plot(x_grid, y_others, color='#546E7A', linewidth=1.5)

    # Mark key statistics
    p1 = np.percentile(firm_a_sims, 1)
    ax.axvline(x=p1, color='#E53935', linestyle=':', linewidth=1, alpha=0.7)
    # Put the label at mid-height; fall back to y=10 if the upper limit is
    # not positive.
    y_top = ax.get_ylim()[1]
    ax.text(p1 - 0.01, y_top * 0.5 if y_top > 0 else 10,
            f'Firm A\n1st pct\n= {p1:.3f}', fontsize=6.5, color='#C62828',
            ha='right', va='center')
    mean_a = firm_a_sims.mean()
    ax.axvline(x=mean_a, color='#E53935', linestyle='--', linewidth=1, alpha=0.7)

    ax.set_xlabel('Per-Signature Best-Match Cosine Similarity')
    ax.set_ylabel('Density')
    ax.legend(loc='upper left', framealpha=0.9, fontsize=7)
    ax.set_xlim(0.5, 1.005)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    fig.tight_layout()
    # Save and close this specific figure rather than pyplot's "current" one.
    fig.savefig(output_path, format='png')
    fig.savefig(output_path.with_suffix('.pdf'), format='pdf')
    plt.close(fig)
    print(f" Saved: {output_path}")
def _ablation_value_panel(ax, x, values, tick_labels, colors, *, ylabel, title, ylim, label_offset):
    """Draw one single-metric bar panel with a numeric label above each bar."""
    bars = ax.bar(x, values, 0.5, color=colors, alpha=0.8, edgecolor='#333', linewidth=0.5)
    ax.set_ylabel(ylabel)
    ax.set_xticks(x)
    ax.set_xticklabels(tick_labels, fontsize=7)
    ax.set_ylim(*ylim)
    ax.set_title(title, fontsize=9)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    # Add value labels
    for bar, val in zip(bars, values):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + label_offset,
                f'{val:.3f}', ha='center', va='bottom', fontsize=7, fontweight='bold')


def fig4_ablation(output_path):
    """Fig 4: Ablation backbone comparison.

    Reads the pre-computed results at ``ABLATION_PATH`` -- a JSON list of
    records, each with a 'backbone' key plus 'intra'/'inter' ('mean', 'std'),
    'cohens_d' and 'kde_crossover' -- and draws three panels for the
    backbones 'resnet50', 'vgg16' and 'efficientnet_b0':
    (a) intra/inter mean similarity with std error bars, (b) Cohen's d,
    (c) KDE crossover. The y-axis limits of all panels are fixed constants.
    The figure is written as PNG to ``output_path`` and as PDF to the same
    path with a ``.pdf`` suffix.

    Args:
        output_path: Destination of the PNG file (``str`` or
            ``pathlib.Path``).

    Raises:
        OSError: if the ablation results file cannot be read.
        KeyError: if a backbone or one of its metrics is missing.
    """
    print("Generating Fig 4: Ablation...")
    # Accept plain strings as well; ``with_suffix`` below needs a Path.
    output_path = Path(output_path)

    # Decode as UTF-8 explicitly instead of relying on the locale default.
    with open(ABLATION_PATH, encoding='utf-8') as f:
        results = json.load(f)

    backbones = ['ResNet-50\n(2048-d)', 'VGG-16\n(4096-d)', 'EfficientNet-B0\n(1280-d)']
    backbone_keys = ['resnet50', 'vgg16', 'efficientnet_b0']
    results_map = {r['backbone']: r for r in results}

    fig, axes = plt.subplots(1, 3, figsize=(FULL_WIDTH, 2.2))
    colors = ['#1E88E5', '#FFA726', '#66BB6A']
    x = np.arange(len(backbones))

    # Panel (a): Intra/Inter means with error bars
    ax = axes[0]
    width = 0.35
    intra_means = [results_map[k]['intra']['mean'] for k in backbone_keys]
    intra_stds = [results_map[k]['intra']['std'] for k in backbone_keys]
    inter_means = [results_map[k]['inter']['mean'] for k in backbone_keys]
    inter_stds = [results_map[k]['inter']['std'] for k in backbone_keys]
    ax.bar(x - width/2, intra_means, width, yerr=intra_stds,
           color='#E53935', alpha=0.7, label='Intra', capsize=3, error_kw={'linewidth': 0.8})
    ax.bar(x + width/2, inter_means, width, yerr=inter_stds,
           color='#1E88E5', alpha=0.7, label='Inter', capsize=3, error_kw={'linewidth': 0.8})
    ax.set_ylabel('Cosine Similarity')
    ax.set_xticks(x)
    ax.set_xticklabels(backbones, fontsize=7)
    ax.legend(fontsize=7)
    ax.set_ylim(0.5, 1.0)
    ax.set_title('(a) Mean Similarity', fontsize=9)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # Panel (b): Cohen's d
    cohens_ds = [results_map[k]['cohens_d'] for k in backbone_keys]
    _ablation_value_panel(
        axes[1], x, cohens_ds, backbones, colors,
        ylabel="Cohen's d", title="(b) Cohen's d",
        ylim=(0, 0.9), label_offset=0.02,
    )

    # Panel (c): KDE crossover
    crossovers = [results_map[k]['kde_crossover'] for k in backbone_keys]
    _ablation_value_panel(
        axes[2], x, crossovers, backbones, colors,
        ylabel='KDE Crossover', title='(c) KDE Crossover',
        ylim=(0.7, 0.9), label_offset=0.005,
    )

    fig.tight_layout()
    # Save and close this specific figure rather than pyplot's "current" one.
    fig.savefig(output_path, format='png')
    fig.savefig(output_path.with_suffix('.pdf'), format='pdf')
    plt.close(fig)
    print(f" Saved: {output_path}")
def main():
    """Generate all four paper figures and write them into OUTPUT_DIR.

    Order: Fig 1 and Fig 4 first (neither queries the database), then Fig 3
    from the per-signature best-match data, and finally Fig 2 from the full
    feature vectors.
    """
    rule = "=" * 60
    print(rule)
    print("Generating Paper Figures")
    print(rule)

    # Fig 1 uses no external data; Fig 4 reads the pre-computed ablation JSON.
    fig1_pipeline(OUTPUT_DIR / 'fig1_pipeline.png')
    fig4_ablation(OUTPUT_DIR / 'fig4_ablation.png')

    # Fig 3: per-signature best-match similarities from the database.
    signature_data = load_signature_data()
    print(f"Loaded {len(signature_data['max_sims']):,} signatures")
    fig3_firm_a_calibration(signature_data, OUTPUT_DIR / 'fig3_firm_a_calibration.png')

    # Fig 2: intra/inter distributions computed from the feature vectors.
    same_cpa_sims, diff_cpa_sims = load_intra_inter_from_features()
    fig2_intra_inter_kde(same_cpa_sims, diff_cpa_sims, OUTPUT_DIR / 'fig2_intra_inter_kde.png')

    print("\n" + rule)
    print("All figures saved to:", OUTPUT_DIR)
    print(rule)


if __name__ == "__main__":
    main()