Add Paper A (IEEE TAI) complete draft with Firm A-calibrated dual-method classification

Paper draft includes all sections (Abstract through Conclusion), 36 references, and supporting scripts. Key methodology: Cosine similarity + dHash dual-method verification with thresholds calibrated against known-replication firm (Firm A). Includes: - 8 section markdown files (paper_a_*.md) - Ablation study script (ResNet-50 vs VGG-16 vs EfficientNet-B0) - Recalibrated classification script (84,386 PDFs, 5-tier system) - Figure generation and Word export scripts - Citation renumbering script ([1]-[36]) - Signature analysis pipeline (12 steps) - YOLO extraction scripts Three rounds of AI review completed (GPT-5.4, Claude Opus 4.6, Gemini 3 Pro). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-06 23:05:33 +08:00
parent 21df0ff387
commit 939a348da4
33 changed files with 9315 additions and 0 deletions
@@ -0,0 +1,392 @@
+#!/usr/bin/env python3
+"""
+Generate all figures for Paper A (IEEE TAI submission).
+Outputs to /Volumes/NV2/PDF-Processing/signature-analysis/paper_figures/
+"""
+
+import numpy as np
+import sqlite3
+import json
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+import matplotlib.patches as mpatches
+from matplotlib.patches import FancyBboxPatch, FancyArrowPatch
+from collections import defaultdict
+from pathlib import Path
+
+# Config: absolute input/output locations; OUTPUT_DIR is created at import time if missing
+DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'  # SQLite file opened by the load_* functions
+ABLATION_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/ablation/ablation_results.json'  # JSON read by fig4_ablation
+OUTPUT_DIR = Path('/Volumes/NV2/PDF-Processing/signature-analysis/paper_figures')  # destination directory used by main()
+OUTPUT_DIR.mkdir(parents=True, exist_ok=True)  # no error if the directory already exists
+
+RANDOM_SEED = 42
+np.random.seed(RANDOM_SEED)  # seeds NumPy's global RNG, which the inter-class pair sampling draws from
+
+# IEEE formatting: serif fonts, small type sizes, 300 dpi, tightly cropped saves
+plt.rcParams.update({
+    'font.family': 'serif',
+    'font.serif': ['Times New Roman', 'DejaVu Serif'],  # tried in this order
+    'font.size': 9,
+    'axes.labelsize': 10,
+    'axes.titlesize': 10,
+    'xtick.labelsize': 8,
+    'ytick.labelsize': 8,
+    'legend.fontsize': 8,
+    'figure.dpi': 300,
+    'savefig.dpi': 300,
+    'savefig.bbox': 'tight',  # crop saved output to the drawn content
+    'savefig.pad_inches': 0.05,
+})
+
+# IEEE column widths, used as the figure width in each plt.subplots call
+COL_WIDTH = 3.5  # single column inches
+FULL_WIDTH = 7.16  # full page width inches
+
+
+def load_signature_data():
+    """Load per-signature best-match similarities and accountant info."""
+    conn = sqlite3.connect(DB_PATH)
+    cur = conn.cursor()
+
+    cur.execute('''
+        SELECT s.assigned_accountant, s.max_similarity_to_same_accountant, a.firm
+        FROM signatures s
+        LEFT JOIN accountants a ON s.assigned_accountant = a.name
+        WHERE s.max_similarity_to_same_accountant IS NOT NULL
+        AND s.assigned_accountant IS NOT NULL
+    ''')
+    rows = cur.fetchall()
+    conn.close()
+
+    data = {
+        'accountants': [r[0] for r in rows],
+        'max_sims': np.array([r[1] for r in rows]),
+        'firms': [r[2] for r in rows],
+    }
+    return data
+
+
+def load_intra_inter_from_features():
+    """Compute intra/inter class distributions from feature vectors."""
+    print("Loading features for intra/inter distributions...")
+    conn = sqlite3.connect(DB_PATH)
+    cur = conn.cursor()
+
+    cur.execute('''
+        SELECT assigned_accountant, feature_vector
+        FROM signatures
+        WHERE feature_vector IS NOT NULL AND assigned_accountant IS NOT NULL
+    ''')
+    rows = cur.fetchall()
+    conn.close()
+
+    acct_groups = defaultdict(list)
+    features_list = []
+    accountants = []
+    for r in rows:
+        feat = np.frombuffer(r[1], dtype=np.float32)
+        idx = len(features_list)
+        features_list.append(feat)
+        accountants.append(r[0])
+        acct_groups[r[0]].append(idx)
+
+    features = np.array(features_list)
+    print(f"  Loaded {len(features)} signatures, {len(acct_groups)} accountants")
+
+    # Intra-class
+    print("  Computing intra-class...")
+    intra_sims = []
+    for acct, indices in acct_groups.items():
+        if len(indices) < 3:
+            continue
+        vecs = features[indices]
+        sim_matrix = vecs @ vecs.T
+        n = len(indices)
+        triu_idx = np.triu_indices(n, k=1)
+        intra_sims.extend(sim_matrix[triu_idx].tolist())
+    intra_sims = np.array(intra_sims)
+    print(f"  Intra-class: {len(intra_sims):,} pairs")
+
+    # Inter-class
+    print("  Computing inter-class...")
+    all_acct_list = list(acct_groups.keys())
+    inter_sims = []
+    for _ in range(500_000):
+        a1, a2 = np.random.choice(len(all_acct_list), 2, replace=False)
+        i1 = np.random.choice(acct_groups[all_acct_list[a1]])
+        i2 = np.random.choice(acct_groups[all_acct_list[a2]])
+        sim = float(features[i1] @ features[i2])
+        inter_sims.append(sim)
+    inter_sims = np.array(inter_sims)
+    print(f"  Inter-class: {len(inter_sims):,} pairs")
+
+    return intra_sims, inter_sims
+
+
+def fig1_pipeline(output_path):
+    """Fig 1: Pipeline architecture diagram."""
+    print("Generating Fig 1: Pipeline...")
+
+    fig, ax = plt.subplots(1, 1, figsize=(FULL_WIDTH, 1.8))
+    ax.set_xlim(0, 10)
+    ax.set_ylim(0, 2)
+    ax.axis('off')
+
+    # Stages
+    stages = [
+        ("90,282\nPDFs", "#E3F2FD"),
+        ("VLM\nPre-screen", "#BBDEFB"),
+        ("YOLO\nDetection", "#90CAF9"),
+        ("ResNet-50\nFeatures", "#64B5F6"),
+        ("Cosine +\npHash", "#42A5F5"),
+        ("Calibration\n& Classify", "#1E88E5"),
+    ]
+
+    annotations = [
+        "86,072 docs",
+        "182,328 sigs",
+        "2048-dim",
+        "Dual verify",
+        "Verdicts",
+    ]
+
+    box_w = 1.3
+    box_h = 1.0
+    gap = 0.38
+    start_x = 0.15
+    y_center = 1.0
+
+    for i, (label, color) in enumerate(stages):
+        x = start_x + i * (box_w + gap)
+        box = FancyBboxPatch(
+            (x, y_center - box_h/2), box_w, box_h,
+            boxstyle="round,pad=0.1",
+            facecolor=color, edgecolor='#1565C0', linewidth=1.2
+        )
+        ax.add_patch(box)
+        ax.text(x + box_w/2, y_center, label,
+                ha='center', va='center', fontsize=8, fontweight='bold',
+                color='#0D47A1' if i < 3 else 'white')
+
+        # Arrow + annotation
+        if i < len(stages) - 1:
+            arrow_x = x + box_w + 0.02
+            ax.annotate('', xy=(arrow_x + gap - 0.04, y_center),
+                       xytext=(arrow_x, y_center),
+                       arrowprops=dict(arrowstyle='->', color='#1565C0', lw=1.5))
+            ax.text(arrow_x + gap/2, y_center - 0.62, annotations[i],
+                   ha='center', va='top', fontsize=6.5, color='#555555', style='italic')
+
+    plt.savefig(output_path, format='png')
+    plt.savefig(output_path.with_suffix('.pdf'), format='pdf')
+    plt.close()
+    print(f"  Saved: {output_path}")
+
+
+def fig2_intra_inter_kde(intra_sims, inter_sims, output_path):
+    """Fig 2: Intra vs Inter class cosine similarity distributions."""
+    print("Generating Fig 2: Intra vs Inter KDE...")
+    from scipy.stats import gaussian_kde
+
+    fig, ax = plt.subplots(1, 1, figsize=(COL_WIDTH, 2.5))
+
+    x_grid = np.linspace(0.3, 1.0, 500)
+
+    kde_intra = gaussian_kde(intra_sims, bw_method=0.02)
+    kde_inter = gaussian_kde(inter_sims, bw_method=0.02)
+
+    y_intra = kde_intra(x_grid)
+    y_inter = kde_inter(x_grid)
+
+    ax.fill_between(x_grid, y_intra, alpha=0.3, color='#E53935', label='Intra-class (same CPA)')
+    ax.fill_between(x_grid, y_inter, alpha=0.3, color='#1E88E5', label='Inter-class (diff. CPA)')
+    ax.plot(x_grid, y_intra, color='#C62828', linewidth=1.5)
+    ax.plot(x_grid, y_inter, color='#1565C0', linewidth=1.5)
+
+    # Find crossover
+    diff = y_intra - y_inter
+    sign_changes = np.where(np.diff(np.sign(diff)))[0]
+    crossovers = x_grid[sign_changes]
+    valid = crossovers[(crossovers > 0.5) & (crossovers < 1.0)]
+    if len(valid) > 0:
+        xover = valid[-1]
+        ax.axvline(x=xover, color='#4CAF50', linestyle='--', linewidth=1.2, alpha=0.8)
+        ax.text(xover + 0.01, ax.get_ylim()[1] * 0.85, f'KDE crossover\n= {xover:.3f}',
+                fontsize=7, color='#2E7D32', va='top')
+
+    ax.set_xlabel('Cosine Similarity')
+    ax.set_ylabel('Density')
+    ax.legend(loc='upper left', framealpha=0.9)
+    ax.set_xlim(0.35, 1.0)
+    ax.spines['top'].set_visible(False)
+    ax.spines['right'].set_visible(False)
+
+    plt.tight_layout()
+    plt.savefig(output_path, format='png')
+    plt.savefig(output_path.with_suffix('.pdf'), format='pdf')
+    plt.close()
+    print(f"  Saved: {output_path}")
+
+
+def fig3_firm_a_calibration(data, output_path):
+    """Fig 3: Firm A calibration - per-signature best match distribution."""
+    print("Generating Fig 3: Firm A Calibration...")
+    from scipy.stats import gaussian_kde
+
+    firm_a_mask = np.array([f == '勤業眾信聯合' for f in data['firms']])
+    non_firm_a_mask = ~firm_a_mask
+
+    firm_a_sims = data['max_sims'][firm_a_mask]
+    others_sims = data['max_sims'][non_firm_a_mask]
+
+    fig, ax = plt.subplots(1, 1, figsize=(COL_WIDTH, 2.5))
+
+    x_grid = np.linspace(0.5, 1.0, 500)
+
+    kde_a = gaussian_kde(firm_a_sims, bw_method=0.015)
+    kde_others = gaussian_kde(others_sims, bw_method=0.015)
+
+    y_a = kde_a(x_grid)
+    y_others = kde_others(x_grid)
+
+    ax.fill_between(x_grid, y_a, alpha=0.35, color='#E53935',
+                    label=f'Firm A (known replication, n={len(firm_a_sims):,})')
+    ax.fill_between(x_grid, y_others, alpha=0.25, color='#78909C',
+                    label=f'Other CPAs (n={len(others_sims):,})')
+    ax.plot(x_grid, y_a, color='#C62828', linewidth=1.5)
+    ax.plot(x_grid, y_others, color='#546E7A', linewidth=1.5)
+
+    # Mark key statistics
+    p1 = np.percentile(firm_a_sims, 1)
+    ax.axvline(x=p1, color='#E53935', linestyle=':', linewidth=1, alpha=0.7)
+    ax.text(p1 - 0.01, ax.get_ylim()[1] * 0.5 if ax.get_ylim()[1] > 0 else 10,
+            f'Firm A\n1st pct\n= {p1:.3f}', fontsize=6.5, color='#C62828',
+            ha='right', va='center')
+
+    mean_a = firm_a_sims.mean()
+    ax.axvline(x=mean_a, color='#E53935', linestyle='--', linewidth=1, alpha=0.7)
+
+    ax.set_xlabel('Per-Signature Best-Match Cosine Similarity')
+    ax.set_ylabel('Density')
+    ax.legend(loc='upper left', framealpha=0.9, fontsize=7)
+    ax.set_xlim(0.5, 1.005)
+    ax.spines['top'].set_visible(False)
+    ax.spines['right'].set_visible(False)
+
+    plt.tight_layout()
+    plt.savefig(output_path, format='png')
+    plt.savefig(output_path.with_suffix('.pdf'), format='pdf')
+    plt.close()
+    print(f"  Saved: {output_path}")
+
+
+def fig4_ablation(output_path):
+    """Fig 4: Ablation backbone comparison."""
+    print("Generating Fig 4: Ablation...")
+
+    with open(ABLATION_PATH) as f:
+        results = json.load(f)
+
+    backbones = ['ResNet-50\n(2048-d)', 'VGG-16\n(4096-d)', 'EfficientNet-B0\n(1280-d)']
+    backbone_keys = ['resnet50', 'vgg16', 'efficientnet_b0']
+    results_map = {r['backbone']: r for r in results}
+
+    fig, axes = plt.subplots(1, 3, figsize=(FULL_WIDTH, 2.2))
+
+    colors = ['#1E88E5', '#FFA726', '#66BB6A']
+
+    # Panel (a): Intra/Inter means with error bars
+    ax = axes[0]
+    x = np.arange(len(backbones))
+    width = 0.35
+
+    intra_means = [results_map[k]['intra']['mean'] for k in backbone_keys]
+    intra_stds = [results_map[k]['intra']['std'] for k in backbone_keys]
+    inter_means = [results_map[k]['inter']['mean'] for k in backbone_keys]
+    inter_stds = [results_map[k]['inter']['std'] for k in backbone_keys]
+
+    bars1 = ax.bar(x - width/2, intra_means, width, yerr=intra_stds,
+                   color='#E53935', alpha=0.7, label='Intra', capsize=3, error_kw={'linewidth': 0.8})
+    bars2 = ax.bar(x + width/2, inter_means, width, yerr=inter_stds,
+                   color='#1E88E5', alpha=0.7, label='Inter', capsize=3, error_kw={'linewidth': 0.8})
+
+    ax.set_ylabel('Cosine Similarity')
+    ax.set_xticks(x)
+    ax.set_xticklabels(backbones, fontsize=7)
+    ax.legend(fontsize=7)
+    ax.set_ylim(0.5, 1.0)
+    ax.set_title('(a) Mean Similarity', fontsize=9)
+    ax.spines['top'].set_visible(False)
+    ax.spines['right'].set_visible(False)
+
+    # Panel (b): Cohen's d
+    ax = axes[1]
+    cohens_ds = [results_map[k]['cohens_d'] for k in backbone_keys]
+    bars = ax.bar(x, cohens_ds, 0.5, color=colors, alpha=0.8, edgecolor='#333', linewidth=0.5)
+    ax.set_ylabel("Cohen's d")
+    ax.set_xticks(x)
+    ax.set_xticklabels(backbones, fontsize=7)
+    ax.set_ylim(0, 0.9)
+    ax.set_title("(b) Cohen's d", fontsize=9)
+    ax.spines['top'].set_visible(False)
+    ax.spines['right'].set_visible(False)
+
+    # Add value labels
+    for bar, val in zip(bars, cohens_ds):
+        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02,
+                f'{val:.3f}', ha='center', va='bottom', fontsize=7, fontweight='bold')
+
+    # Panel (c): KDE crossover
+    ax = axes[2]
+    crossovers = [results_map[k]['kde_crossover'] for k in backbone_keys]
+    bars = ax.bar(x, crossovers, 0.5, color=colors, alpha=0.8, edgecolor='#333', linewidth=0.5)
+    ax.set_ylabel('KDE Crossover')
+    ax.set_xticks(x)
+    ax.set_xticklabels(backbones, fontsize=7)
+    ax.set_ylim(0.7, 0.9)
+    ax.set_title('(c) KDE Crossover', fontsize=9)
+    ax.spines['top'].set_visible(False)
+    ax.spines['right'].set_visible(False)
+
+    for bar, val in zip(bars, crossovers):
+        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.005,
+                f'{val:.3f}', ha='center', va='bottom', fontsize=7, fontweight='bold')
+
+    plt.tight_layout()
+    plt.savefig(output_path, format='png')
+    plt.savefig(output_path.with_suffix('.pdf'), format='pdf')
+    plt.close()
+    print(f"  Saved: {output_path}")
+
+
+def main():
+    print("=" * 60)
+    print("Generating Paper Figures")
+    print("=" * 60)
+
+    # Fig 1: Pipeline (no data needed)
+    fig1_pipeline(OUTPUT_DIR / 'fig1_pipeline.png')
+
+    # Fig 4: Ablation (uses pre-computed JSON)
+    fig4_ablation(OUTPUT_DIR / 'fig4_ablation.png')
+
+    # Load data for Fig 2 & 3
+    data = load_signature_data()
+    print(f"Loaded {len(data['max_sims']):,} signatures")
+
+    # Fig 3: Firm A calibration (uses per-signature best match from DB)
+    fig3_firm_a_calibration(data, OUTPUT_DIR / 'fig3_firm_a_calibration.png')
+
+    # Fig 2: Intra vs Inter (needs full feature vectors)
+    intra_sims, inter_sims = load_intra_inter_from_features()
+    fig2_intra_inter_kde(intra_sims, inter_sims, OUTPUT_DIR / 'fig2_intra_inter_kde.png')
+
+    print("\n" + "=" * 60)
+    print("All figures saved to:", OUTPUT_DIR)
+    print("=" * 60)
+
+
+if __name__ == "__main__":
+    main()