939a348da4
Paper draft includes all sections (Abstract through Conclusion), 36 references, and supporting scripts. Key methodology: Cosine similarity + dHash dual-method verification with thresholds calibrated against known-replication firm (Firm A). Includes: - 8 section markdown files (paper_a_*.md) - Ablation study script (ResNet-50 vs VGG-16 vs EfficientNet-B0) - Recalibrated classification script (84,386 PDFs, 5-tier system) - Figure generation and Word export scripts - Citation renumbering script ([1]-[36]) - Signature analysis pipeline (12 steps) - YOLO extraction scripts Three rounds of AI review completed (GPT-5.4, Claude Opus 4.6, Gemini 3 Pro). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
393 lines
13 KiB
Python
393 lines
13 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Generate all figures for Paper A (IEEE TAI submission).
|
|
Outputs to /Volumes/NV2/PDF-Processing/signature-analysis/paper_figures/
|
|
"""
|
|
|
|
import numpy as np
|
|
import sqlite3
|
|
import json
|
|
import matplotlib
|
|
matplotlib.use('Agg')
|
|
import matplotlib.pyplot as plt
|
|
import matplotlib.patches as mpatches
|
|
from matplotlib.patches import FancyBboxPatch, FancyArrowPatch
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
|
|
# ---------------------------------------------------------------------------
# Locations of the inputs this script reads and the directory it writes to.
# ---------------------------------------------------------------------------
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
ABLATION_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/ablation/ablation_results.json'
OUTPUT_DIR = Path('/Volumes/NV2/PDF-Processing/signature-analysis/paper_figures')
# Created as soon as the module is loaded so the figure functions can save
# without checking for the directory themselves.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Seed NumPy's global RNG once so the random sampling done later is repeatable.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# IEEE column widths, in inches.
COL_WIDTH = 3.5    # single column
FULL_WIDTH = 7.16  # full page width

# IEEE formatting: serif fonts, small type, 300 dpi, tight bounding box.
_IEEE_RC_PARAMS = {
    'font.family': 'serif',
    'font.serif': ['Times New Roman', 'DejaVu Serif'],
    'font.size': 9,
    'axes.labelsize': 10,
    'axes.titlesize': 10,
    'xtick.labelsize': 8,
    'ytick.labelsize': 8,
    'legend.fontsize': 8,
    'figure.dpi': 300,
    'savefig.dpi': 300,
    'savefig.bbox': 'tight',
    'savefig.pad_inches': 0.05,
}
plt.rcParams.update(_IEEE_RC_PARAMS)
|
|
|
|
|
|
def load_signature_data(db_path=None):
    """Load per-signature best-match similarities and accountant info.

    Selects every row of ``signatures`` that has both an assigned accountant
    and a non-NULL ``max_similarity_to_same_accountant``, LEFT JOINed to
    ``accountants`` on the accountant name to pick up the firm.

    Args:
        db_path: Path of the SQLite database to read. When omitted, the
            module-level ``DB_PATH`` is used.

    Returns:
        A dict of three parallel sequences, one element per selected row:
            ``'accountants'``: list of ``assigned_accountant`` values.
            ``'max_sims'``: NumPy array of the similarity column.
            ``'firms'``: list of ``accountants.firm`` values; ``None`` for
                signatures whose accountant has no row in ``accountants``
                (a consequence of the LEFT JOIN).
    """
    if db_path is None:
        db_path = DB_PATH

    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute('''
            SELECT s.assigned_accountant, s.max_similarity_to_same_accountant, a.firm
            FROM signatures s
            LEFT JOIN accountants a ON s.assigned_accountant = a.name
            WHERE s.max_similarity_to_same_accountant IS NOT NULL
              AND s.assigned_accountant IS NOT NULL
        ''').fetchall()
    finally:
        # Release the database handle even when the query itself fails.
        conn.close()

    return {
        'accountants': [row[0] for row in rows],
        'max_sims': np.array([row[1] for row in rows]),
        'firms': [row[2] for row in rows],
    }
|
|
|
|
|
|
def load_intra_inter_from_features(db_path=None, n_inter_pairs=500_000):
    """Compute intra/inter class similarity distributions from feature vectors.

    Reads every signature that has both a feature vector and an assigned
    accountant, decodes each stored blob as a float32 vector, and builds two
    samples of pairwise dot products:

    * intra-class: every unordered pair of signatures belonging to the same
      accountant, for accountants with at least three signatures;
    * inter-class: ``n_inter_pairs`` randomly drawn pairs whose two signatures
      belong to two different accountants.

    The similarity is a plain dot product; it equals cosine similarity only
    if the stored vectors are L2-normalised.
    NOTE(review): confirm the stored features are unit-length.

    Sampling uses NumPy's global random state, so results depend on how that
    state was seeded before the call.

    Args:
        db_path: Path of the SQLite database to read. When omitted, the
            module-level ``DB_PATH`` is used.
        n_inter_pairs: Number of inter-class pairs to sample.

    Returns:
        Tuple ``(intra_sims, inter_sims)`` of 1-D NumPy arrays.

    Raises:
        ValueError: If inter-class pairs are requested but fewer than two
            distinct accountants have feature vectors.
    """
    if db_path is None:
        db_path = DB_PATH

    print("Loading features for intra/inter distributions...")
    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute('''
            SELECT assigned_accountant, feature_vector
            FROM signatures
            WHERE feature_vector IS NOT NULL AND assigned_accountant IS NOT NULL
        ''').fetchall()
    finally:
        # Release the database handle even when the query itself fails.
        conn.close()

    # Map each accountant to the row indices of their signatures.
    acct_groups = defaultdict(list)
    features_list = []
    for idx, (accountant, blob) in enumerate(rows):
        features_list.append(np.frombuffer(blob, dtype=np.float32))
        acct_groups[accountant].append(idx)

    features = np.array(features_list)
    print(f"  Loaded {len(features)} signatures, {len(acct_groups)} accountants")

    # Intra-class: strict upper triangle of each accountant's Gram matrix,
    # i.e. every unordered pair exactly once, excluding self-pairs.
    print("  Computing intra-class...")
    intra_sims = []
    for indices in acct_groups.values():
        if len(indices) < 3:
            continue
        vecs = features[indices]
        sim_matrix = vecs @ vecs.T
        triu_idx = np.triu_indices(len(indices), k=1)
        intra_sims.extend(sim_matrix[triu_idx].tolist())
    intra_sims = np.array(intra_sims)
    print(f"  Intra-class: {len(intra_sims):,} pairs")

    # Inter-class: pick two distinct accountants, then one signature from each.
    print("  Computing inter-class...")
    all_acct_list = list(acct_groups.keys())
    if n_inter_pairs > 0 and len(all_acct_list) < 2:
        raise ValueError(
            "Inter-class sampling needs at least two accountants with "
            f"feature vectors; found {len(all_acct_list)}"
        )

    inter_sims = []
    # The order of the RNG calls below is kept fixed so that a given seed
    # always produces the same sample.
    for _ in range(n_inter_pairs):
        a1, a2 = np.random.choice(len(all_acct_list), 2, replace=False)
        i1 = np.random.choice(acct_groups[all_acct_list[a1]])
        i2 = np.random.choice(acct_groups[all_acct_list[a2]])
        inter_sims.append(float(features[i1] @ features[i2]))
    inter_sims = np.array(inter_sims)
    print(f"  Inter-class: {len(inter_sims):,} pairs")

    return intra_sims, inter_sims
|
|
|
|
|
|
def fig1_pipeline(output_path):
    """Fig 1: Pipeline architecture diagram.

    Draws six rounded boxes in a row, one per processing stage, joined by
    arrows; each arrow carries a small italic caption underneath. The figure
    is written twice: to ``output_path`` as PNG and to the same path with a
    ``.pdf`` suffix as PDF.

    Args:
        output_path: ``pathlib.Path`` of the PNG file to write
            (``with_suffix`` is called on it to derive the PDF name).
    """
    print("Generating Fig 1: Pipeline...")

    figure, axis = plt.subplots(1, 1, figsize=(FULL_WIDTH, 1.8))
    axis.set_xlim(0, 10)
    axis.set_ylim(0, 2)
    axis.axis('off')

    # (box label, fill colour), listed left to right.
    # NOTE(review): the fifth label says "pHash"; confirm it names the same
    # hash method the manuscript describes.
    stage_specs = [
        ("90,282\nPDFs", "#E3F2FD"),
        ("VLM\nPre-screen", "#BBDEFB"),
        ("YOLO\nDetection", "#90CAF9"),
        ("ResNet-50\nFeatures", "#64B5F6"),
        ("Cosine +\npHash", "#42A5F5"),
        ("Calibration\n& Classify", "#1E88E5"),
    ]

    # Caption under the arrow that leaves stage i; one fewer than the stages.
    arrow_captions = [
        "86,072 docs",
        "182,328 sigs",
        "2048-dim",
        "Dual verify",
        "Verdicts",
    ]

    # Layout constants, in data units of the 10 x 2 canvas set above.
    box_width = 1.3
    box_height = 1.0
    spacing = 0.38
    left_margin = 0.15
    mid_y = 1.0
    last_index = len(stage_specs) - 1

    for pos, (caption, fill) in enumerate(stage_specs):
        left = left_margin + pos * (box_width + spacing)

        axis.add_patch(FancyBboxPatch(
            (left, mid_y - box_height/2), box_width, box_height,
            boxstyle="round,pad=0.1",
            facecolor=fill, edgecolor='#1565C0', linewidth=1.2,
        ))

        # The first three (lightest) boxes get dark text, the rest get white.
        text_colour = 'white' if pos >= 3 else '#0D47A1'
        axis.text(left + box_width/2, mid_y, caption,
                  ha='center', va='center', fontsize=8, fontweight='bold',
                  color=text_colour)

        if pos == last_index:
            # The final stage has no outgoing arrow.
            continue

        # Arrow from just right of this box to just left of the next one,
        # with its caption centred underneath.
        tail_x = left + box_width + 0.02
        axis.annotate('', xy=(tail_x + spacing - 0.04, mid_y),
                      xytext=(tail_x, mid_y),
                      arrowprops={'arrowstyle': '->', 'color': '#1565C0', 'lw': 1.5})
        axis.text(tail_x + spacing/2, mid_y - 0.62, arrow_captions[pos],
                  ha='center', va='top', fontsize=6.5, color='#555555', style='italic')

    figure.savefig(output_path, format='png')
    figure.savefig(output_path.with_suffix('.pdf'), format='pdf')
    plt.close(figure)
    print(f"  Saved: {output_path}")
|
|
|
|
|
|
def fig2_intra_inter_kde(intra_sims, inter_sims, output_path):
    """Fig 2: Intra vs Inter class cosine similarity distributions.

    Fits a Gaussian KDE to each sample, plots both densities as filled
    curves, and marks the right-most point strictly between 0.5 and 1.0 where
    the two densities swap order. The figure is saved as PNG at
    ``output_path`` and as PDF at the same path with a ``.pdf`` suffix.

    Args:
        intra_sims: 1-D sample of same-accountant similarities.
        inter_sims: 1-D sample of different-accountant similarities.
        output_path: ``pathlib.Path`` of the PNG file to write.
    """
    print("Generating Fig 2: Intra vs Inter KDE...")
    from scipy.stats import gaussian_kde

    figure, axis = plt.subplots(1, 1, figsize=(COL_WIDTH, 2.5))

    # Evaluate both densities on one shared grid so they can be subtracted.
    grid = np.linspace(0.3, 1.0, 500)
    density_intra = gaussian_kde(intra_sims, bw_method=0.02)(grid)
    density_inter = gaussian_kde(inter_sims, bw_method=0.02)(grid)

    # Filled areas first (they carry the legend labels), outlines on top.
    fills = (
        (density_intra, '#E53935', 'Intra-class (same CPA)'),
        (density_inter, '#1E88E5', 'Inter-class (diff. CPA)'),
    )
    for density, colour, legend_text in fills:
        axis.fill_between(grid, density, alpha=0.3, color=colour, label=legend_text)

    outlines = ((density_intra, '#C62828'), (density_inter, '#1565C0'))
    for density, colour in outlines:
        axis.plot(grid, density, color=colour, linewidth=1.5)

    # Find crossover: grid positions where the sign of (intra - inter) changes
    # between neighbouring samples.
    gap = density_intra - density_inter
    flip_positions = np.flatnonzero(np.diff(np.sign(gap)))
    candidates = grid[flip_positions]
    in_range = candidates[(candidates > 0.5) & (candidates < 1.0)]
    if in_range.size > 0:
        # Several crossings may exist; the right-most one is annotated.
        crossover = in_range[-1]
        axis.axvline(x=crossover, color='#4CAF50', linestyle='--', linewidth=1.2, alpha=0.8)
        axis.text(crossover + 0.01, axis.get_ylim()[1] * 0.85,
                  f'KDE crossover\n= {crossover:.3f}',
                  fontsize=7, color='#2E7D32', va='top')

    axis.set_xlabel('Cosine Similarity')
    axis.set_ylabel('Density')
    axis.legend(loc='upper left', framealpha=0.9)
    axis.set_xlim(0.35, 1.0)
    for side in ('top', 'right'):
        axis.spines[side].set_visible(False)

    plt.tight_layout()
    figure.savefig(output_path, format='png')
    figure.savefig(output_path.with_suffix('.pdf'), format='pdf')
    plt.close(figure)
    print(f"  Saved: {output_path}")
|
|
|
|
|
|
def fig3_firm_a_calibration(data, output_path, firm_name='勤業眾信聯合'):
    """Fig 3: Firm A calibration - per-signature best match distribution.

    Splits the per-signature best-match similarities into the calibration
    firm ("Firm A") and everyone else, plots a Gaussian KDE for each group,
    and marks Firm A's 1st percentile (dotted, annotated) and mean (dashed,
    not annotated). The figure is saved as PNG at ``output_path`` and as PDF
    at the same path with a ``.pdf`` suffix.

    Args:
        data: Mapping with parallel entries ``'firms'`` (one firm name or
            ``None`` per signature) and ``'max_sims'`` (array of
            similarities), as returned by ``load_signature_data``.
        output_path: ``pathlib.Path`` of the PNG file to write.
        firm_name: Value of ``data['firms']`` that identifies Firm A.

    Raises:
        ValueError: If either group has fewer than two signatures, since a
            KDE cannot be fitted to it.
    """
    print("Generating Fig 3: Firm A Calibration...")

    max_sims = np.asarray(data['max_sims'])
    # dtype=bool keeps the mask invertible even when there are no rows at all
    # (an empty list would otherwise become a float array).
    firm_a_mask = np.array([f == firm_name for f in data['firms']], dtype=bool)
    firm_a_sims = max_sims[firm_a_mask]
    others_sims = max_sims[~firm_a_mask]

    # Fail early with a message that names the empty group, before any
    # figure is created.
    for group_name, sims in (('Firm A', firm_a_sims), ('other CPAs', others_sims)):
        if sims.size < 2:
            raise ValueError(
                f"Need at least two signatures for {group_name} to fit a KDE; "
                f"found {sims.size}"
            )

    from scipy.stats import gaussian_kde

    fig, ax = plt.subplots(1, 1, figsize=(COL_WIDTH, 2.5))

    x_grid = np.linspace(0.5, 1.0, 500)
    y_a = gaussian_kde(firm_a_sims, bw_method=0.015)(x_grid)
    y_others = gaussian_kde(others_sims, bw_method=0.015)(x_grid)

    ax.fill_between(x_grid, y_a, alpha=0.35, color='#E53935',
                    label=f'Firm A (known replication, n={len(firm_a_sims):,})')
    ax.fill_between(x_grid, y_others, alpha=0.25, color='#78909C',
                    label=f'Other CPAs (n={len(others_sims):,})')
    ax.plot(x_grid, y_a, color='#C62828', linewidth=1.5)
    ax.plot(x_grid, y_others, color='#546E7A', linewidth=1.5)

    # Mark key statistics: Firm A's 1st percentile, labelled at half the
    # current axis height (falling back to y=10 if the upper limit is not
    # positive).
    p1 = np.percentile(firm_a_sims, 1)
    ax.axvline(x=p1, color='#E53935', linestyle=':', linewidth=1, alpha=0.7)
    y_top = ax.get_ylim()[1]
    label_y = y_top * 0.5 if y_top > 0 else 10
    ax.text(p1 - 0.01, label_y,
            f'Firm A\n1st pct\n= {p1:.3f}', fontsize=6.5, color='#C62828',
            ha='right', va='center')

    # Firm A mean, drawn as a dashed line.
    mean_a = firm_a_sims.mean()
    ax.axvline(x=mean_a, color='#E53935', linestyle='--', linewidth=1, alpha=0.7)

    ax.set_xlabel('Per-Signature Best-Match Cosine Similarity')
    ax.set_ylabel('Density')
    ax.legend(loc='upper left', framealpha=0.9, fontsize=7)
    ax.set_xlim(0.5, 1.005)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    plt.tight_layout()
    plt.savefig(output_path, format='png')
    plt.savefig(output_path.with_suffix('.pdf'), format='pdf')
    plt.close()
    print(f"  Saved: {output_path}")
|
|
|
|
|
|
def fig4_ablation(output_path):
    """Fig 4: Ablation backbone comparison.

    Reads the pre-computed ablation results from ``ABLATION_PATH`` (a JSON
    list of records keyed by ``'backbone'``) and draws three side-by-side bar
    charts for ResNet-50, VGG-16 and EfficientNet-B0:

    (a) intra/inter mean similarity with standard-deviation error bars,
    (b) Cohen's d,
    (c) KDE crossover.

    The figure is saved as PNG at ``output_path`` and as PDF at the same path
    with a ``.pdf`` suffix.

    Args:
        output_path: ``pathlib.Path`` of the PNG file to write.
    """
    print("Generating Fig 4: Ablation...")

    with open(ABLATION_PATH) as handle:
        records = json.load(handle)
    by_backbone = {rec['backbone']: rec for rec in records}

    tick_labels = ['ResNet-50\n(2048-d)', 'VGG-16\n(4096-d)', 'EfficientNet-B0\n(1280-d)']
    keys = ['resnet50', 'vgg16', 'efficientnet_b0']
    bar_colors = ['#1E88E5', '#FFA726', '#66BB6A']

    figure, panels = plt.subplots(1, 3, figsize=(FULL_WIDTH, 2.2))
    positions = np.arange(len(tick_labels))

    def _label_ticks(axis):
        # Same backbone tick labels on every panel.
        axis.set_xticks(positions)
        axis.set_xticklabels(tick_labels, fontsize=7)

    def _title_and_despine(axis, title):
        axis.set_title(title, fontsize=9)
        for side in ('top', 'right'):
            axis.spines[side].set_visible(False)

    def _group_stat(group, field):
        # One value per backbone, in display order.
        return [by_backbone[key][group][field] for key in keys]

    # Panel (a): Intra/Inter means with error bars, as paired bars.
    axis = panels[0]
    bar_width = 0.35
    intra_means = _group_stat('intra', 'mean')
    intra_stds = _group_stat('intra', 'std')
    inter_means = _group_stat('inter', 'mean')
    inter_stds = _group_stat('inter', 'std')

    axis.bar(positions - bar_width/2, intra_means, bar_width, yerr=intra_stds,
             color='#E53935', alpha=0.7, label='Intra', capsize=3,
             error_kw={'linewidth': 0.8})
    axis.bar(positions + bar_width/2, inter_means, bar_width, yerr=inter_stds,
             color='#1E88E5', alpha=0.7, label='Inter', capsize=3,
             error_kw={'linewidth': 0.8})

    axis.set_ylabel('Cosine Similarity')
    _label_ticks(axis)
    axis.legend(fontsize=7)
    axis.set_ylim(0.5, 1.0)
    _title_and_despine(axis, '(a) Mean Similarity')

    # Panels (b) and (c): one scalar per backbone, with the value printed
    # above each bar. Tuple layout:
    # (axis, result key, y label, y limits, title, label offset above bar)
    scalar_panels = (
        (panels[1], 'cohens_d', "Cohen's d", (0, 0.9), "(b) Cohen's d", 0.02),
        (panels[2], 'kde_crossover', 'KDE Crossover', (0.7, 0.9), '(c) KDE Crossover', 0.005),
    )
    for axis, result_key, y_label, y_limits, title, label_offset in scalar_panels:
        values = [by_backbone[key][result_key] for key in keys]
        drawn = axis.bar(positions, values, 0.5, color=bar_colors, alpha=0.8,
                         edgecolor='#333', linewidth=0.5)
        axis.set_ylabel(y_label)
        _label_ticks(axis)
        axis.set_ylim(*y_limits)
        _title_and_despine(axis, title)

        # Add value labels
        for rect, value in zip(drawn, values):
            axis.text(rect.get_x() + rect.get_width()/2, rect.get_height() + label_offset,
                      f'{value:.3f}', ha='center', va='bottom', fontsize=7,
                      fontweight='bold')

    plt.tight_layout()
    figure.savefig(output_path, format='png')
    figure.savefig(output_path.with_suffix('.pdf'), format='pdf')
    plt.close(figure)
    print(f"  Saved: {output_path}")
|
|
|
|
|
|
def main():
    """Generate every paper figure and report where the files were written.

    Figures are produced in the order 1, 4, 3, 2: the two that need no
    database access come first, then the one that uses the per-signature
    best-match table, and last the one that needs the full feature vectors.
    """
    rule = "=" * 60
    print(rule)
    print("Generating Paper Figures")
    print(rule)

    # Fig 1: Pipeline (no data needed)
    fig1_pipeline(OUTPUT_DIR / 'fig1_pipeline.png')

    # Fig 4: Ablation (uses pre-computed JSON)
    fig4_ablation(OUTPUT_DIR / 'fig4_ablation.png')

    # Fig 3: Firm A calibration (uses per-signature best match from DB)
    signature_data = load_signature_data()
    print(f"Loaded {len(signature_data['max_sims']):,} signatures")
    fig3_firm_a_calibration(signature_data, OUTPUT_DIR / 'fig3_firm_a_calibration.png')

    # Fig 2: Intra vs Inter (needs full feature vectors)
    intra, inter = load_intra_inter_from_features()
    fig2_intra_inter_kde(intra, inter, OUTPUT_DIR / 'fig2_intra_inter_kde.png')

    print("\n" + rule)
    print("All figures saved to:", OUTPUT_DIR)
    print(rule)
|
|
|
|
|
|
# Generate the figures only when this file is run as a script, not on import.
if __name__ == "__main__":
    main()
|