#!/usr/bin/env python3
"""
Generate all figures for Paper A (IEEE TAI submission).

Outputs to /Volumes/NV2/PDF-Processing/signature-analysis/paper_figures/

Each figure is written twice: as a PNG at the requested path and as a PDF
with the same stem next to it.
"""

import json
import sqlite3
from collections import defaultdict
from contextlib import closing
from pathlib import Path

import numpy as np
import matplotlib
matplotlib.use('Agg')  # must be selected before pyplot is imported
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.patches import FancyBboxPatch, FancyArrowPatch

# Config
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
ABLATION_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/ablation/ablation_results.json'
OUTPUT_DIR = Path('/Volumes/NV2/PDF-Processing/signature-analysis/paper_figures')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Value of accountants.firm that identifies "Firm A" in Fig 3.
FIRM_A_NAME = '勤業眾信聯合'

# Number of random cross-accountant pairs sampled for the inter-class curve.
N_INTER_PAIRS = 500_000

# IEEE formatting
plt.rcParams.update({
    'font.family': 'serif',
    'font.serif': ['Times New Roman', 'DejaVu Serif'],
    'font.size': 9,
    'axes.labelsize': 10,
    'axes.titlesize': 10,
    'xtick.labelsize': 8,
    'ytick.labelsize': 8,
    'legend.fontsize': 8,
    'figure.dpi': 300,
    'savefig.dpi': 300,
    'savefig.bbox': 'tight',
    'savefig.pad_inches': 0.05,
})

# IEEE column widths
COL_WIDTH = 3.5    # single column inches
FULL_WIDTH = 7.16  # full page width inches


def _save_figure(fig, output_path):
    """Save *fig* as PNG at *output_path* and as PDF beside it, then close it.

    The figure is closed even when saving fails so that a failed run does
    not leave figures open.
    """
    output_path = Path(output_path)
    try:
        fig.savefig(output_path, format='png')
        fig.savefig(output_path.with_suffix('.pdf'), format='pdf')
    finally:
        plt.close(fig)
    print(f" Saved: {output_path}")


def _hide_top_right_spines(ax):
    """Hide the top and right axes spines of *ax*."""
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)


def load_signature_data():
    """Load per-signature best-match similarities and accountant info.

    Returns:
        dict with three parallel sequences, one entry per selected row:
        ``'accountants'`` (list of assigned accountant names),
        ``'max_sims'`` (float array of max_similarity_to_same_accountant),
        ``'firms'`` (list of firm names; an entry is None when the LEFT JOIN
        finds no matching accountant row).
    """
    query = '''
        SELECT s.assigned_accountant,
               s.max_similarity_to_same_accountant,
               a.firm
        FROM signatures s
        LEFT JOIN accountants a ON s.assigned_accountant = a.name
        WHERE s.max_similarity_to_same_accountant IS NOT NULL
          AND s.assigned_accountant IS NOT NULL
    '''
    # closing() guarantees the connection is released even if the query fails;
    # sqlite3's own context manager only manages transactions.
    with closing(sqlite3.connect(DB_PATH)) as conn:
        rows = conn.execute(query).fetchall()

    return {
        'accountants': [r[0] for r in rows],
        'max_sims': np.array([r[1] for r in rows], dtype=float),
        'firms': [r[2] for r in rows],
    }


def load_intra_inter_from_features(n_inter_pairs=N_INTER_PAIRS):
    """Compute intra/inter class distributions from feature vectors.

    Similarity is the plain dot product of two stored vectors.
    NOTE(review): the figures label this "cosine similarity", which assumes
    the stored vectors are L2-normalized -- confirm against the extractor.

    Args:
        n_inter_pairs: number of random cross-accountant pairs to sample.

    Returns:
        (intra_sims, inter_sims): float64 arrays. ``intra_sims`` holds every
        unordered same-accountant pair for accountants with at least three
        signatures; ``inter_sims`` holds ``n_inter_pairs`` sampled pairs
        drawn from two different accountants.

    Raises:
        ValueError: if pairs are requested but fewer than two accountants
            have feature vectors.
    """
    print("Loading features for intra/inter distributions...")
    query = '''
        SELECT assigned_accountant, feature_vector
        FROM signatures
        WHERE feature_vector IS NOT NULL
          AND assigned_accountant IS NOT NULL
    '''
    with closing(sqlite3.connect(DB_PATH)) as conn:
        rows = conn.execute(query).fetchall()

    # Map each accountant to the row positions of their signatures.
    acct_groups = defaultdict(list)
    features_list = []
    for idx, (accountant, blob) in enumerate(rows):
        features_list.append(np.frombuffer(blob, dtype=np.float32))
        acct_groups[accountant].append(idx)

    features = np.array(features_list)
    print(f" Loaded {len(features)} signatures, {len(acct_groups)} accountants")

    # Intra-class
    print(" Computing intra-class...")
    intra_chunks = []
    for indices in acct_groups.values():
        if len(indices) < 3:
            continue
        vecs = features[indices]
        sim_matrix = vecs @ vecs.T
        # Strict upper triangle: each unordered pair once, no self-pairs.
        intra_chunks.append(sim_matrix[np.triu_indices(len(indices), k=1)])
    if intra_chunks:
        intra_sims = np.concatenate(intra_chunks).astype(np.float64)
    else:
        intra_sims = np.array([], dtype=np.float64)
    print(f" Intra-class: {len(intra_sims):,} pairs")

    # Inter-class
    print(" Computing inter-class...")
    if n_inter_pairs > 0 and len(acct_groups) < 2:
        raise ValueError(
            "Need at least two accountants with feature vectors to sample "
            f"inter-class pairs, found {len(acct_groups)}"
        )
    # Convert each group once; np.random.choice would otherwise rebuild an
    # array from the Python list on every draw.
    group_indices = [np.asarray(indices) for indices in acct_groups.values()]
    n_groups = len(group_indices)
    inter_sims = np.empty(n_inter_pairs, dtype=np.float64)
    # The order of RNG calls is kept exactly as before so that results for a
    # given RANDOM_SEED stay reproducible.
    for k in range(n_inter_pairs):
        a1, a2 = np.random.choice(n_groups, 2, replace=False)
        i1 = np.random.choice(group_indices[a1])
        i2 = np.random.choice(group_indices[a2])
        inter_sims[k] = features[i1] @ features[i2]
    print(f" Inter-class: {len(inter_sims):,} pairs")

    return intra_sims, inter_sims


def fig1_pipeline(output_path):
    """Fig 1: Pipeline architecture diagram.

    Draws one rounded box per pipeline stage, an arrow between consecutive
    boxes, and a small italic annotation under each arrow.

    Args:
        output_path: destination of the PNG; the PDF uses the same stem.
    """
    print("Generating Fig 1: Pipeline...")

    fig, ax = plt.subplots(1, 1, figsize=(FULL_WIDTH, 1.8))
    ax.set_xlim(0, 10)
    ax.set_ylim(0, 2)
    ax.axis('off')

    # Stages: (label, face colour), ordered left to right.
    stages = [
        ("90,282\nPDFs", "#E3F2FD"),
        ("VLM\nPre-screen", "#BBDEFB"),
        ("YOLO\nDetection", "#90CAF9"),
        ("ResNet-50\nFeatures", "#64B5F6"),
        ("Cosine +\npHash", "#42A5F5"),
        ("Calibration\n& Classify", "#1E88E5"),
    ]

    # One annotation per arrow, i.e. one fewer than the number of stages.
    annotations = [
        "86,072 docs",
        "182,328 sigs",
        "2048-dim",
        "Dual verify",
        "Verdicts",
    ]

    box_w = 1.3
    box_h = 1.0
    gap = 0.38
    start_x = 0.15
    y_center = 1.0

    for i, (label, color) in enumerate(stages):
        x = start_x + i * (box_w + gap)
        box = FancyBboxPatch(
            (x, y_center - box_h / 2), box_w, box_h,
            boxstyle="round,pad=0.1",
            facecolor=color, edgecolor='#1565C0', linewidth=1.2,
        )
        ax.add_patch(box)
        # Dark text on the first three boxes, white text on the rest.
        ax.text(x + box_w / 2, y_center, label,
                ha='center', va='center', fontsize=8, fontweight='bold',
                color='#0D47A1' if i < 3 else 'white')

        # Arrow + annotation
        if i < len(stages) - 1:
            arrow_x = x + box_w + 0.02
            ax.annotate('',
                        xy=(arrow_x + gap - 0.04, y_center),
                        xytext=(arrow_x, y_center),
                        arrowprops=dict(arrowstyle='->', color='#1565C0',
                                        lw=1.5))
            ax.text(arrow_x + gap / 2, y_center - 0.62, annotations[i],
                    ha='center', va='top', fontsize=6.5, color='#555555',
                    style='italic')

    _save_figure(fig, output_path)


def fig2_intra_inter_kde(intra_sims, inter_sims, output_path):
    """Fig 2: Intra vs Inter class cosine similarity distributions.

    Plots a Gaussian KDE of each sample and marks the right-most point in
    (0.5, 1.0) where the two density curves cross, if there is one.

    Args:
        intra_sims: 1-D array of same-accountant similarities.
        inter_sims: 1-D array of different-accountant similarities.
        output_path: destination of the PNG; the PDF uses the same stem.
    """
    print("Generating Fig 2: Intra vs Inter KDE...")
    # Imported here so the figures that do not need scipy run without it.
    from scipy.stats import gaussian_kde

    fig, ax = plt.subplots(1, 1, figsize=(COL_WIDTH, 2.5))

    x_grid = np.linspace(0.3, 1.0, 500)
    y_intra = gaussian_kde(intra_sims, bw_method=0.02)(x_grid)
    y_inter = gaussian_kde(inter_sims, bw_method=0.02)(x_grid)

    ax.fill_between(x_grid, y_intra, alpha=0.3, color='#E53935',
                    label='Intra-class (same CPA)')
    ax.fill_between(x_grid, y_inter, alpha=0.3, color='#1E88E5',
                    label='Inter-class (diff. CPA)')
    ax.plot(x_grid, y_intra, color='#C62828', linewidth=1.5)
    ax.plot(x_grid, y_inter, color='#1565C0', linewidth=1.5)

    # Find crossover: grid points where the sign of (intra - inter) changes.
    diff = y_intra - y_inter
    sign_changes = np.where(np.diff(np.sign(diff)))[0]
    crossovers = x_grid[sign_changes]
    valid = crossovers[(crossovers > 0.5) & (crossovers < 1.0)]
    if len(valid) > 0:
        xover = valid[-1]
        ax.axvline(x=xover, color='#4CAF50', linestyle='--', linewidth=1.2,
                   alpha=0.8)
        ax.text(xover + 0.01, ax.get_ylim()[1] * 0.85,
                f'KDE crossover\n= {xover:.3f}',
                fontsize=7, color='#2E7D32', va='top')

    ax.set_xlabel('Cosine Similarity')
    ax.set_ylabel('Density')
    ax.legend(loc='upper left', framealpha=0.9)
    ax.set_xlim(0.35, 1.0)
    _hide_top_right_spines(ax)

    fig.tight_layout()
    _save_figure(fig, output_path)


def fig3_firm_a_calibration(data, output_path):
    """Fig 3: Firm A calibration - per-signature best match distribution.

    Compares the KDE of best-match similarities for signatures whose firm
    equals FIRM_A_NAME against all remaining signatures, and marks Firm A's
    1st percentile (dotted line, labelled) and mean (dashed line).

    Args:
        data: dict as returned by load_signature_data(); the 'firms' and
            'max_sims' entries are read.
        output_path: destination of the PNG; the PDF uses the same stem.

    Raises:
        ValueError: if either group has fewer than two samples, since a
            KDE cannot be estimated from them.
    """
    print("Generating Fig 3: Firm A Calibration...")
    from scipy.stats import gaussian_kde

    # dtype=bool keeps the mask usable (and invertible) when there are no rows.
    firm_a_mask = np.array([f == FIRM_A_NAME for f in data['firms']],
                           dtype=bool)

    firm_a_sims = data['max_sims'][firm_a_mask]
    others_sims = data['max_sims'][~firm_a_mask]

    if len(firm_a_sims) < 2 or len(others_sims) < 2:
        raise ValueError(
            "Fig 3 needs at least two signatures in each group "
            f"(Firm A: {len(firm_a_sims)}, others: {len(others_sims)})"
        )

    fig, ax = plt.subplots(1, 1, figsize=(COL_WIDTH, 2.5))

    x_grid = np.linspace(0.5, 1.0, 500)
    y_a = gaussian_kde(firm_a_sims, bw_method=0.015)(x_grid)
    y_others = gaussian_kde(others_sims, bw_method=0.015)(x_grid)

    ax.fill_between(x_grid, y_a, alpha=0.35, color='#E53935',
                    label=f'Firm A (known replication, n={len(firm_a_sims):,})')
    ax.fill_between(x_grid, y_others, alpha=0.25, color='#78909C',
                    label=f'Other CPAs (n={len(others_sims):,})')
    ax.plot(x_grid, y_a, color='#C62828', linewidth=1.5)
    ax.plot(x_grid, y_others, color='#546E7A', linewidth=1.5)

    # Mark key statistics
    p1 = np.percentile(firm_a_sims, 1)
    ax.axvline(x=p1, color='#E53935', linestyle=':', linewidth=1, alpha=0.7)
    y_top = ax.get_ylim()[1]
    ax.text(p1 - 0.01, y_top * 0.5 if y_top > 0 else 10,
            f'Firm A\n1st pct\n= {p1:.3f}',
            fontsize=6.5, color='#C62828', ha='right', va='center')

    mean_a = firm_a_sims.mean()
    ax.axvline(x=mean_a, color='#E53935', linestyle='--', linewidth=1,
               alpha=0.7)

    ax.set_xlabel('Per-Signature Best-Match Cosine Similarity')
    ax.set_ylabel('Density')
    ax.legend(loc='upper left', framealpha=0.9, fontsize=7)
    ax.set_xlim(0.5, 1.005)
    _hide_top_right_spines(ax)

    fig.tight_layout()
    _save_figure(fig, output_path)


def fig4_ablation(output_path):
    """Fig 4: Ablation backbone comparison.

    Reads ABLATION_PATH, a JSON list of per-backbone result objects, and
    draws three bar-chart panels: (a) intra/inter mean similarity with
    standard-deviation error bars, (b) Cohen's d, (c) KDE crossover.

    Args:
        output_path: destination of the PNG; the PDF uses the same stem.

    Raises:
        KeyError: if a required backbone is missing from the JSON.
    """
    print("Generating Fig 4: Ablation...")

    with open(ABLATION_PATH, encoding='utf-8') as f:
        results = json.load(f)

    backbones = ['ResNet-50\n(2048-d)', 'VGG-16\n(4096-d)',
                 'EfficientNet-B0\n(1280-d)']
    backbone_keys = ['resnet50', 'vgg16', 'efficientnet_b0']
    results_map = {r['backbone']: r for r in results}

    missing = [k for k in backbone_keys if k not in results_map]
    if missing:
        raise KeyError(
            f"Ablation results missing backbone(s): {', '.join(missing)}"
        )
    per_backbone = [results_map[k] for k in backbone_keys]

    fig, axes = plt.subplots(1, 3, figsize=(FULL_WIDTH, 2.2))

    colors = ['#1E88E5', '#FFA726', '#66BB6A']
    x = np.arange(len(backbones))

    # Panel (a): Intra/Inter means with error bars
    ax = axes[0]
    width = 0.35
    intra_means = [r['intra']['mean'] for r in per_backbone]
    intra_stds = [r['intra']['std'] for r in per_backbone]
    inter_means = [r['inter']['mean'] for r in per_backbone]
    inter_stds = [r['inter']['std'] for r in per_backbone]

    ax.bar(x - width / 2, intra_means, width, yerr=intra_stds,
           color='#E53935', alpha=0.7, label='Intra', capsize=3,
           error_kw={'linewidth': 0.8})
    ax.bar(x + width / 2, inter_means, width, yerr=inter_stds,
           color='#1E88E5', alpha=0.7, label='Inter', capsize=3,
           error_kw={'linewidth': 0.8})
    ax.set_ylabel('Cosine Similarity')
    ax.set_xticks(x)
    ax.set_xticklabels(backbones, fontsize=7)
    ax.legend(fontsize=7)
    ax.set_ylim(0.5, 1.0)
    ax.set_title('(a) Mean Similarity', fontsize=9)
    _hide_top_right_spines(ax)

    # Panel (b): Cohen's d
    ax = axes[1]
    cohens_ds = [r['cohens_d'] for r in per_backbone]
    bars = ax.bar(x, cohens_ds, 0.5, color=colors, alpha=0.8,
                  edgecolor='#333', linewidth=0.5)
    ax.set_ylabel("Cohen's d")
    ax.set_xticks(x)
    ax.set_xticklabels(backbones, fontsize=7)
    ax.set_ylim(0, 0.9)
    ax.set_title("(b) Cohen's d", fontsize=9)
    _hide_top_right_spines(ax)

    # Add value labels
    for bar, val in zip(bars, cohens_ds):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.02,
                f'{val:.3f}', ha='center', va='bottom', fontsize=7,
                fontweight='bold')

    # Panel (c): KDE crossover
    ax = axes[2]
    crossovers = [r['kde_crossover'] for r in per_backbone]
    bars = ax.bar(x, crossovers, 0.5, color=colors, alpha=0.8,
                  edgecolor='#333', linewidth=0.5)
    ax.set_ylabel('KDE Crossover')
    ax.set_xticks(x)
    ax.set_xticklabels(backbones, fontsize=7)
    ax.set_ylim(0.7, 0.9)
    ax.set_title('(c) KDE Crossover', fontsize=9)
    _hide_top_right_spines(ax)

    for bar, val in zip(bars, crossovers):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.005,
                f'{val:.3f}', ha='center', va='bottom', fontsize=7,
                fontweight='bold')

    fig.tight_layout()
    _save_figure(fig, output_path)


def main():
    """Generate every paper figure into OUTPUT_DIR.

    Figures that need no database access run first; Fig 2 runs last because
    it loads the full feature vectors and samples the inter-class pairs.
    """
    print("=" * 60)
    print("Generating Paper Figures")
    print("=" * 60)

    # Fig 1: Pipeline (no data needed)
    fig1_pipeline(OUTPUT_DIR / 'fig1_pipeline.png')

    # Fig 4: Ablation (uses pre-computed JSON)
    fig4_ablation(OUTPUT_DIR / 'fig4_ablation.png')

    # Load data for Fig 2 & 3
    data = load_signature_data()
    print(f"Loaded {len(data['max_sims']):,} signatures")

    # Fig 3: Firm A calibration (uses per-signature best match from DB)
    fig3_firm_a_calibration(data, OUTPUT_DIR / 'fig3_firm_a_calibration.png')

    # Fig 2: Intra vs Inter (needs full feature vectors)
    intra_sims, inter_sims = load_intra_inter_from_features()
    fig2_intra_inter_kde(intra_sims, inter_sims,
                         OUTPUT_DIR / 'fig2_intra_inter_kde.png')

    print("\n" + "=" * 60)
    print("All figures saved to:", OUTPUT_DIR)
    print("=" * 60)


if __name__ == "__main__":
    main()