Add Paper A (IEEE TAI) complete draft with Firm A-calibrated dual-method classification
Paper draft includes all sections (Abstract through Conclusion), 36 references, and supporting scripts. Key methodology: Cosine similarity + dHash dual-method verification with thresholds calibrated against known-replication firm (Firm A). Includes: - 8 section markdown files (paper_a_*.md) - Ablation study script (ResNet-50 vs VGG-16 vs EfficientNet-B0) - Recalibrated classification script (84,386 PDFs, 5-tier system) - Figure generation and Word export scripts - Citation renumbering script ([1]-[36]) - Signature analysis pipeline (12 steps) - YOLO extraction scripts Three rounds of AI review completed (GPT-5.4, Claude Opus 4.6, Gemini 3 Pro). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,392 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Generate all figures for Paper A (IEEE TAI submission).
|
||||
Outputs to /Volumes/NV2/PDF-Processing/signature-analysis/paper_figures/
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import sqlite3
|
||||
import json
|
||||
import matplotlib
|
||||
matplotlib.use('Agg')
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib.patches as mpatches
|
||||
from matplotlib.patches import FancyBboxPatch, FancyArrowPatch
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
# ---------------------------------------------------------------------------
# Input / output locations
# ---------------------------------------------------------------------------
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
ABLATION_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/ablation/ablation_results.json'
OUTPUT_DIR = Path('/Volumes/NV2/PDF-Processing/signature-analysis/paper_figures')
# Created at import time so the figure functions can write without checking.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# ---------------------------------------------------------------------------
# Reproducibility: seed NumPy's global RNG once for the whole script
# ---------------------------------------------------------------------------
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# ---------------------------------------------------------------------------
# IEEE column widths (inches)
# ---------------------------------------------------------------------------
COL_WIDTH = 3.5    # single column
FULL_WIDTH = 7.16  # full page width

# ---------------------------------------------------------------------------
# IEEE formatting: matplotlib defaults shared by every figure
# ---------------------------------------------------------------------------
_IEEE_RC_PARAMS = {
    # Typeface: serif, with a fallback listed after Times New Roman.
    'font.family': 'serif',
    'font.serif': ['Times New Roman', 'DejaVu Serif'],
    # Point sizes for the individual text elements.
    'font.size': 9,
    'axes.labelsize': 10,
    'axes.titlesize': 10,
    'xtick.labelsize': 8,
    'ytick.labelsize': 8,
    'legend.fontsize': 8,
    # Resolution and cropping applied on save.
    'figure.dpi': 300,
    'savefig.dpi': 300,
    'savefig.bbox': 'tight',
    'savefig.pad_inches': 0.05,
}
plt.rcParams.update(_IEEE_RC_PARAMS)
|
||||
|
||||
|
||||
def load_signature_data(db_path=None):
    """Load per-signature best-match similarities and accountant info.

    Selects every row of ``signatures`` that has both an assigned accountant
    and a non-NULL ``max_similarity_to_same_accountant``, LEFT JOINed to
    ``accountants`` on the accountant name to pick up the firm.

    Args:
        db_path: Path of the SQLite database to read. ``None`` (the default)
            means the module-level ``DB_PATH``.

    Returns:
        dict with three parallel sequences, one entry per selected signature:
            'accountants': list of accountant names,
            'max_sims':    ``np.ndarray`` of the stored similarity values,
            'firms':       list of firm names (``None`` where the LEFT JOIN
                           found no matching ``accountants`` row).

    Raises:
        sqlite3.Error: if the database cannot be opened or queried.
    """
    if db_path is None:
        db_path = DB_PATH

    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute('''
            SELECT s.assigned_accountant, s.max_similarity_to_same_accountant, a.firm
            FROM signatures s
            LEFT JOIN accountants a ON s.assigned_accountant = a.name
            WHERE s.max_similarity_to_same_accountant IS NOT NULL
              AND s.assigned_accountant IS NOT NULL
        ''').fetchall()
    finally:
        # Close even when the query fails so the DB handle is never leaked.
        conn.close()

    return {
        'accountants': [r[0] for r in rows],
        'max_sims': np.array([r[1] for r in rows]),
        'firms': [r[2] for r in rows],
    }
|
||||
|
||||
|
||||
def load_intra_inter_from_features(db_path=None, n_inter_pairs=500_000):
    """Compute intra/inter class distributions from feature vectors.

    Reads every signature that has both a feature vector and an assigned
    accountant, then builds two samples of pairwise dot products:

    * intra-class: all unordered pairs of signatures belonging to the same
      accountant, for accountants with at least 3 signatures;
    * inter-class: ``n_inter_pairs`` random pairs, each drawn from two
      distinct accountants (accountants with fewer than 3 signatures are
      still eligible here).

    The similarity is a plain dot product, so it is a cosine similarity only
    if the stored vectors are L2-normalised.
    NOTE(review): normalisation is not visible in this function -- confirm
    against the feature-extraction step.

    Sampling uses NumPy's global RNG; seed it beforehand for reproducibility.

    Args:
        db_path: Path of the SQLite database to read. ``None`` (the default)
            means the module-level ``DB_PATH``.
        n_inter_pairs: Number of random inter-class pairs to draw.

    Returns:
        ``(intra_sims, inter_sims)`` as two 1-D ``np.ndarray`` objects.

    Raises:
        ValueError: if inter-class pairs are requested but fewer than two
            accountants have feature vectors.
        sqlite3.Error: if the database cannot be opened or queried.
    """
    if db_path is None:
        db_path = DB_PATH

    print("Loading features for intra/inter distributions...")
    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute('''
            SELECT assigned_accountant, feature_vector
            FROM signatures
            WHERE feature_vector IS NOT NULL AND assigned_accountant IS NOT NULL
        ''').fetchall()
    finally:
        # Close even when the query fails so the DB handle is never leaked.
        conn.close()

    # Map each accountant to the row indices of their signatures.
    acct_groups = defaultdict(list)
    features_list = []
    for accountant, blob in rows:
        acct_groups[accountant].append(len(features_list))
        # Blobs are decoded as raw float32 values.
        features_list.append(np.frombuffer(blob, dtype=np.float32))

    features = np.array(features_list)
    print(f" Loaded {len(features)} signatures, {len(acct_groups)} accountants")

    # Intra-class
    print(" Computing intra-class...")
    intra_sims = []
    for indices in acct_groups.values():
        if len(indices) < 3:
            continue
        vecs = features[indices]
        sim_matrix = vecs @ vecs.T
        # Strict upper triangle = each unordered pair once, no self-pairs.
        triu_idx = np.triu_indices(len(indices), k=1)
        intra_sims.extend(sim_matrix[triu_idx].tolist())
    intra_sims = np.array(intra_sims)
    print(f" Intra-class: {len(intra_sims):,} pairs")

    # Inter-class
    print(" Computing inter-class...")
    all_acct_list = list(acct_groups)
    if n_inter_pairs > 0 and len(all_acct_list) < 2:
        raise ValueError(
            "inter-class sampling needs at least two accountants with "
            f"feature vectors, found {len(all_acct_list)}"
        )
    inter_sims = []
    for _ in range(n_inter_pairs):
        # The order of RNG calls is kept stable so a seeded run reproduces
        # the same sample: pick two distinct accountants, then one signature
        # from each.
        a1, a2 = np.random.choice(len(all_acct_list), 2, replace=False)
        i1 = np.random.choice(acct_groups[all_acct_list[a1]])
        i2 = np.random.choice(acct_groups[all_acct_list[a2]])
        inter_sims.append(float(features[i1] @ features[i2]))
    inter_sims = np.array(inter_sims)
    print(f" Inter-class: {len(inter_sims):,} pairs")

    return intra_sims, inter_sims
|
||||
|
||||
|
||||
def fig1_pipeline(output_path):
    """Fig 1: Pipeline architecture diagram.

    Draws six rounded boxes left to right, one per processing stage, joined
    by arrows. Each arrow carries a small italic caption underneath. The
    figure is written twice: to ``output_path`` as PNG and to the same path
    with a ``.pdf`` suffix as PDF.

    Args:
        output_path: Destination of the PNG; must provide ``with_suffix``
            (the caller passes a ``pathlib.Path``).
    """
    print("Generating Fig 1: Pipeline...")

    fig, ax = plt.subplots(1, 1, figsize=(FULL_WIDTH, 1.8))
    ax.set_xlim(0, 10)
    ax.set_ylim(0, 2)
    ax.axis('off')

    # Stage captions and their fill colours, light to dark.
    # NOTE(review): the fifth stage is labelled "pHash"; confirm this matches
    # the hash method named in the paper text.
    stage_labels = [
        "90,282\nPDFs",
        "VLM\nPre-screen",
        "YOLO\nDetection",
        "ResNet-50\nFeatures",
        "Cosine +\npHash",
        "Calibration\n& Classify",
    ]
    stage_fills = ["#E3F2FD", "#BBDEFB", "#90CAF9", "#64B5F6", "#42A5F5", "#1E88E5"]

    # One caption per arrow, i.e. one fewer than the number of stages.
    arrow_notes = [
        "86,072 docs",
        "182,328 sigs",
        "2048-dim",
        "Dual verify",
        "Verdicts",
    ]

    # Layout in data coordinates (x spans 0..10, y spans 0..2).
    box_w, box_h = 1.3, 1.0
    gap = 0.38
    left_margin = 0.15
    mid_y = 1.0
    pitch = box_w + gap
    final_stage = len(stage_labels) - 1

    for pos, (caption, fill) in enumerate(zip(stage_labels, stage_fills)):
        left = left_margin + pos * pitch

        ax.add_patch(FancyBboxPatch(
            (left, mid_y - box_h/2), box_w, box_h,
            boxstyle="round,pad=0.1",
            facecolor=fill, edgecolor='#1565C0', linewidth=1.2,
        ))

        # Dark text on the three lightest boxes, white on the darker ones.
        caption_color = 'white' if pos >= 3 else '#0D47A1'
        ax.text(left + box_w/2, mid_y, caption,
                ha='center', va='center', fontsize=8, fontweight='bold',
                color=caption_color)

        if pos >= final_stage:
            continue  # nothing follows the last box

        # Arrow to the next box, with its caption below the row of boxes.
        tail_x = left + box_w + 0.02
        ax.annotate('', xy=(tail_x + gap - 0.04, mid_y),
                    xytext=(tail_x, mid_y),
                    arrowprops={'arrowstyle': '->', 'color': '#1565C0', 'lw': 1.5})
        ax.text(tail_x + gap/2, mid_y - 0.62, arrow_notes[pos],
                ha='center', va='top', fontsize=6.5, color='#555555', style='italic')

    fig.savefig(output_path, format='png')
    fig.savefig(output_path.with_suffix('.pdf'), format='pdf')
    plt.close(fig)
    print(f" Saved: {output_path}")
|
||||
|
||||
|
||||
def fig2_intra_inter_kde(intra_sims, inter_sims, output_path):
    """Fig 2: Intra vs Inter class cosine similarity distributions.

    Plots a Gaussian KDE of each sample on a shared grid. If the two density
    curves cross between 0.5 and 1.0, the right-most crossing is marked with
    a dashed line and labelled. The figure is saved as PNG at ``output_path``
    and as PDF next to it.

    Args:
        intra_sims: 1-D array of same-accountant similarities.
        inter_sims: 1-D array of different-accountant similarities.
        output_path: Destination of the PNG; must provide ``with_suffix``.
    """
    print("Generating Fig 2: Intra vs Inter KDE...")
    from scipy.stats import gaussian_kde

    fig, ax = plt.subplots(1, 1, figsize=(COL_WIDTH, 2.5))

    grid = np.linspace(0.3, 1.0, 500)
    density_intra = gaussian_kde(intra_sims, bw_method=0.02)(grid)
    density_inter = gaussian_kde(inter_sims, bw_method=0.02)(grid)

    # Shaded areas carry the legend entries; the outlines are unlabelled.
    curves = (
        (density_intra, '#E53935', '#C62828', 'Intra-class (same CPA)'),
        (density_inter, '#1E88E5', '#1565C0', 'Inter-class (diff. CPA)'),
    )
    for density, fill_color, _, legend_text in curves:
        ax.fill_between(grid, density, alpha=0.3, color=fill_color, label=legend_text)
    for density, _, line_color, _ in curves:
        ax.plot(grid, density, color=line_color, linewidth=1.5)

    # Find crossover: grid points where (intra - inter) changes sign.
    flips = np.nonzero(np.diff(np.sign(density_intra - density_inter)))[0]
    candidates = grid[flips]
    in_window = candidates[(candidates > 0.5) & (candidates < 1.0)]
    if in_window.size > 0:
        xover = in_window[-1]  # right-most crossing inside the window
        ax.axvline(x=xover, color='#4CAF50', linestyle='--', linewidth=1.2, alpha=0.8)
        label_y = ax.get_ylim()[1] * 0.85
        ax.text(xover + 0.01, label_y, f'KDE crossover\n= {xover:.3f}',
                fontsize=7, color='#2E7D32', va='top')

    ax.set_xlabel('Cosine Similarity')
    ax.set_ylabel('Density')
    ax.legend(loc='upper left', framealpha=0.9)
    ax.set_xlim(0.35, 1.0)
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

    plt.tight_layout()
    fig.savefig(output_path, format='png')
    fig.savefig(output_path.with_suffix('.pdf'), format='pdf')
    plt.close(fig)
    print(f" Saved: {output_path}")
|
||||
|
||||
|
||||
def fig3_firm_a_calibration(data, output_path):
    """Fig 3: Firm A calibration - per-signature best match distribution.

    Splits the signatures into those whose firm equals the hard-coded Firm A
    name and all the rest (including signatures with no firm), then plots a
    Gaussian KDE of the best-match similarity for each group. Firm A's 1st
    percentile (dotted, labelled) and mean (dashed) are marked as vertical
    lines. The figure is saved as PNG at ``output_path`` and as PDF next to
    it.

    Args:
        data: dict with parallel entries 'firms' (sequence of firm names) and
            'max_sims' (``np.ndarray`` of similarities).
        output_path: Destination of the PNG; must provide ``with_suffix``.
    """
    print("Generating Fig 3: Firm A Calibration...")
    from scipy.stats import gaussian_kde

    is_firm_a = np.array([firm == '勤業眾信聯合' for firm in data['firms']])
    is_other = ~is_firm_a

    similarities = data['max_sims']
    firm_a_sims = similarities[is_firm_a]
    others_sims = similarities[is_other]

    fig, ax = plt.subplots(1, 1, figsize=(COL_WIDTH, 2.5))

    grid = np.linspace(0.5, 1.0, 500)
    density_a = gaussian_kde(firm_a_sims, bw_method=0.015)(grid)
    density_others = gaussian_kde(others_sims, bw_method=0.015)(grid)

    # Shaded areas first (they carry the legend labels), outlines second.
    ax.fill_between(grid, density_a, alpha=0.35, color='#E53935',
                    label=f'Firm A (known replication, n={len(firm_a_sims):,})')
    ax.fill_between(grid, density_others, alpha=0.25, color='#78909C',
                    label=f'Other CPAs (n={len(others_sims):,})')
    ax.plot(grid, density_a, color='#C62828', linewidth=1.5)
    ax.plot(grid, density_others, color='#546E7A', linewidth=1.5)

    # Mark key statistics: Firm A's 1st percentile, labelled to its left.
    p1 = np.percentile(firm_a_sims, 1)
    ax.axvline(x=p1, color='#E53935', linestyle=':', linewidth=1, alpha=0.7)
    y_top = ax.get_ylim()[1]
    # Half-way up the axis; falls back to 10 if the upper limit is not positive.
    label_y = y_top * 0.5 if y_top > 0 else 10
    ax.text(p1 - 0.01, label_y,
            f'Firm A\n1st pct\n= {p1:.3f}', fontsize=6.5, color='#C62828',
            ha='right', va='center')

    # Firm A's mean, drawn without a text label.
    mean_a = firm_a_sims.mean()
    ax.axvline(x=mean_a, color='#E53935', linestyle='--', linewidth=1, alpha=0.7)

    ax.set_xlabel('Per-Signature Best-Match Cosine Similarity')
    ax.set_ylabel('Density')
    ax.legend(loc='upper left', framealpha=0.9, fontsize=7)
    ax.set_xlim(0.5, 1.005)
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

    plt.tight_layout()
    fig.savefig(output_path, format='png')
    fig.savefig(output_path.with_suffix('.pdf'), format='pdf')
    plt.close(fig)
    print(f" Saved: {output_path}")
|
||||
|
||||
|
||||
def fig4_ablation(output_path):
    """Fig 4: Ablation backbone comparison.

    Reads the ablation results JSON at ``ABLATION_PATH`` and draws three
    side-by-side bar panels comparing the ResNet-50, VGG-16 and
    EfficientNet-B0 backbones:

        (a) mean intra/inter similarity with standard-deviation error bars,
        (b) Cohen's d,
        (c) KDE crossover.

    The JSON is read as a list of records, each with the keys 'backbone',
    'intra' and 'inter' (each holding 'mean' and 'std'), 'cohens_d' and
    'kde_crossover'. The figure is saved as PNG at ``output_path`` and as PDF
    next to it.

    Args:
        output_path: Destination of the PNG; must provide ``with_suffix``.
    """
    print("Generating Fig 4: Ablation...")

    with open(ABLATION_PATH) as f:
        records = json.load(f)
    by_backbone = {rec['backbone']: rec for rec in records}

    backbone_keys = ['resnet50', 'vgg16', 'efficientnet_b0']
    backbones = ['ResNet-50\n(2048-d)', 'VGG-16\n(4096-d)', 'EfficientNet-B0\n(1280-d)']
    colors = ['#1E88E5', '#FFA726', '#66BB6A']
    x = np.arange(len(backbones))

    fig, axes = plt.subplots(1, 3, figsize=(FULL_WIDTH, 2.2))

    def _finish_axes(ax, ylabel, ylim, title):
        """Apply the labelling and frame styling shared by all three panels."""
        ax.set_ylabel(ylabel)
        ax.set_xticks(x)
        ax.set_xticklabels(backbones, fontsize=7)
        ax.set_ylim(*ylim)
        ax.set_title(title, fontsize=9)
        for side in ('top', 'right'):
            ax.spines[side].set_visible(False)

    def _value_panel(ax, metric, ylabel, ylim, title, label_pad):
        """Draw one bar per backbone for `metric`, with the value above it."""
        values = [by_backbone[key][metric] for key in backbone_keys]
        bars = ax.bar(x, values, 0.5, color=colors, alpha=0.8,
                      edgecolor='#333', linewidth=0.5)
        _finish_axes(ax, ylabel, ylim, title)
        # Add value labels
        for bar, val in zip(bars, values):
            ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + label_pad,
                    f'{val:.3f}', ha='center', va='bottom', fontsize=7,
                    fontweight='bold')

    # Panel (a): Intra/Inter means with error bars
    ax = axes[0]
    width = 0.35
    grouped = (
        ('intra', x - width/2, '#E53935', 'Intra'),
        ('inter', x + width/2, '#1E88E5', 'Inter'),
    )
    for group, positions, bar_color, legend_text in grouped:
        means = [by_backbone[key][group]['mean'] for key in backbone_keys]
        stds = [by_backbone[key][group]['std'] for key in backbone_keys]
        ax.bar(positions, means, width, yerr=stds,
               color=bar_color, alpha=0.7, label=legend_text, capsize=3,
               error_kw={'linewidth': 0.8})
    ax.legend(fontsize=7)
    _finish_axes(ax, 'Cosine Similarity', (0.5, 1.0), '(a) Mean Similarity')

    # Panel (b): Cohen's d
    _value_panel(axes[1], 'cohens_d', "Cohen's d", (0, 0.9), "(b) Cohen's d", 0.02)

    # Panel (c): KDE crossover
    _value_panel(axes[2], 'kde_crossover', 'KDE Crossover', (0.7, 0.9),
                 '(c) KDE Crossover', 0.005)

    plt.tight_layout()
    fig.savefig(output_path, format='png')
    fig.savefig(output_path.with_suffix('.pdf'), format='pdf')
    plt.close(fig)
    print(f" Saved: {output_path}")
|
||||
|
||||
|
||||
def main():
    """Generate every paper figure and write them into ``OUTPUT_DIR``.

    Figures 1 and 4 are produced first because they need no database access.
    Figure 3 follows, using the per-signature best-match values. Figure 2 is
    last because it has to load the full feature vectors.
    """
    banner = "=" * 60
    print(banner)
    print("Generating Paper Figures")
    print(banner)

    # Fig 1: Pipeline (no data needed)
    fig1_pipeline(OUTPUT_DIR / 'fig1_pipeline.png')

    # Fig 4: Ablation (uses pre-computed JSON)
    fig4_ablation(OUTPUT_DIR / 'fig4_ablation.png')

    # Fig 3: Firm A calibration (uses per-signature best match from DB)
    data = load_signature_data()
    print(f"Loaded {len(data['max_sims']):,} signatures")
    fig3_firm_a_calibration(data, OUTPUT_DIR / 'fig3_firm_a_calibration.png')

    # Fig 2: Intra vs Inter (needs full feature vectors)
    intra_sims, inter_sims = load_intra_inter_from_features()
    fig2_intra_inter_kde(intra_sims, inter_sims, OUTPUT_DIR / 'fig2_intra_inter_kde.png')

    print("\n" + banner)
    print(f"All figures saved to: {OUTPUT_DIR}")
    print(banner)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user