Files
pdf_signature_extraction/paper/v13_build/scripts/make_figs123.py
T
gbanyan 66c9194fcf Paper A v13: filled submission draft (rev7) + reproducible build bundle
Fill all 18 placeholders in the condensed v13 submission draft with
data verified against the analysis DB and LOCKED canonical scripts;
close 12/13 co-author review items (only #8b protocol first-run open).

Key changes (need co-author sign-off; see handoff doc):
- Firm A out-of-sample HC 0.01% -> 0.42% (buggy 0.0001 from Script 49
  same-pair bug, propagated v4.2->v13; never reuse 0.0001)
- §III-D empty cell ~=0 -> 7,681 honest reframe (not degenerate crops)
- low cosine cut 0.837 -> 0.8547 primary (BCD 2013-2019 closed-world,
  held-out discipline; 0.8489 confirmed = BCD all-period); HC/MC/HSC
  unchanged, UN/LH move <=0.4pp

Adds Figures 1-5 (real-data plots + schematics), full references,
Appendix A/B, UN/HSC ICCR, n-reconciliation, #13 MOPS-metadata
survival verification, "參" set-level feasibility probe (negative).
Two codex (gpt-5.5) adversarial rounds applied; no fabrication found.

Bundle: paper/v13_build/ (markdown source, harvest/figure scripts,
figures) for reproducibility. Handoff note for co-author included.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-15 03:24:50 +08:00

76 lines
4.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from matplotlib.patches import FancyBboxPatch, FancyArrowPatch, Rectangle
import numpy as np
# ============ Figure 1: data split grid ============
# One cell per (firm, period): firms are rows (first firm on top), periods are
# columns. Each cell is tinted and captioned with the role returned by role().
fig, ax = plt.subplots(figsize=(7, 3.2))
firms = ['Firm A', 'Firm B', 'Firm C', 'Firm D']
# Year ranges need a separator: without one the tick labels render as the
# eight-digit numbers "20132019" / "20202023". The en dash is spelled as an
# escape so it cannot be silently dropped or flagged as an ambiguous character.
EN_DASH = '\u2013'
periods = [f'2013{EN_DASH}2019', f'2020{EN_DASH}2023']


# role per (row firm, col period)
def role(f, p):
    """Return the ``(caption, colour)`` pair for firm *f* in period *p*.

    'Firm A' gets the "Held-out test 1" caption in every period. For any
    other firm, the first entry of ``periods`` is the calibration cell and
    every other period is "Held-out test 2".
    """
    if f == 'Firm A':
        return ('Held-out test 1\n(Firm A, full record)', '#c0392b')
    # Compare against the list entry rather than a second copy of the label,
    # so the label text has a single source of truth.
    if p == periods[0]:
        return ('Calibration\n(clean reference)', '#27ae60')
    return ('Held-out test 2\n(secondary)', '#2980b9')


for i, f in enumerate(firms):
    row_y = len(firms) - 1 - i  # y grows upward, so the first firm is the top row
    for j, p in enumerate(periods):
        txt, col = role(f, p)
        ax.add_patch(Rectangle((j, row_y), 1, 1, facecolor=col, alpha=0.30,
                               edgecolor='black', lw=1))
        ax.text(j + 0.5, row_y + 0.5, txt, ha='center', va='center', fontsize=6.5)

# Unit cells: limits and tick positions (cell centres) follow the list sizes.
ax.set_xlim(0, len(periods))
ax.set_ylim(0, len(firms))
ax.set_xticks([j + 0.5 for j in range(len(periods))])
ax.set_xticklabels(periods, fontsize=9)
ax.set_yticks([len(firms) - 1 - i + 0.5 for i in range(len(firms))])
ax.set_yticklabels(firms, fontsize=9)
ax.tick_params(length=0)
for s in ax.spines.values():
    s.set_visible(False)
ax.set_title('Figure 1. Data split: calibrate on the clean cell, test everything else', fontsize=9)
fig.tight_layout()
fig.savefig('/tmp/fig1.png', dpi=200, bbox_inches='tight')
plt.close(fig)
# ============ Figure 2: pipeline ============
# Rounded boxes, one per pipeline step, laid out left to right in axes
# coordinates; an arrow bridges the gap between each box and the next one.
fig, ax = plt.subplots(figsize=(9, 2.5))
steps = [
    'Raw PDF\nreport',
    'Find signature\npage (VLM)',
    'Detect signatures\n(YOLOv11)\n+ red-stamp removal',
    'Feature extraction\n(ResNet-50, 2048-d)',
    'Two similarities\ncosine (style)\nmin dHash (structure)',
    'Five-way\nlabel',
]
cols = ['#ecf0f1', '#d6eaf8', '#d5f5e3', '#fcf3cf', '#fadbd8', '#e8daef']
n = len(steps)
w = 1.0/n  # horizontal slot allotted to each step, as a fraction of the axes
for i, (s, c) in enumerate(zip(steps, cols)):
    x = i*w + 0.01  # left edge of the box, inset from the start of its slot
    box = FancyBboxPatch(
        (x, 0.30), w-0.02, 0.40,
        boxstyle='round,pad=0.005,rounding_size=0.02',
        facecolor=c, edgecolor='black', lw=1, transform=ax.transAxes,
    )
    ax.add_patch(box)
    ax.text(x+(w-0.02)/2, 0.50, s, ha='center', va='center', fontsize=6.8,
            transform=ax.transAxes)
    if i >= n-1:
        continue  # the last step has no successor, hence no outgoing arrow
    arrow = FancyArrowPatch(
        (x+w-0.012, 0.50), (x+w+0.002, 0.50), transform=ax.transAxes,
        arrowstyle='-|>', mutation_scale=10, lw=1.2, color='black',
    )
    ax.add_patch(arrow)
ax.axis('off')
ax.set_title('Figure 2. The screening pipeline', fontsize=9, y=0.92)
fig.savefig('/tmp/fig2.png', dpi=200, bbox_inches='tight')
plt.close(fig)
# ============ Figure 3: two-measure plane, five regions ============
# The cosine/dHash plane is split into five shaded rectangles: two full-height
# columns at low and middle cosine, and a high-cosine column cut into three
# bands by dHash distance.
fig, ax = plt.subplots(figsize=(5.2, 4.2))
LO, HI = 0.8547, 0.95    # cosine cut-offs (vertical boundaries)
DH1, DH2 = 5, 15         # dHash cut-offs (horizontal boundaries, high-cosine column only)
xmin, xmax = 0.70, 1.005
ymin, ymax = -1, 30

# (x0, y0, width, height, facecolor, alpha), drawn in this order
regions = [
    (xmin, ymin, LO-xmin, ymax-ymin, '#bdc3c7', 0.5),   # LH (cos<=LO): whole column
    (LO, ymin, HI-LO, ymax-ymin, '#f7dc6f', 0.5),       # UN (LO<cos<=HI)
    # high-cosine band subdivided by dHash
    (HI, ymin, xmax-HI, DH1-ymin, '#cb4335', 0.55),     # HC dHash<=5
    (HI, DH1, xmax-HI, DH2-DH1, '#eb984e', 0.55),       # MC 5<dHash<=15
    (HI, DH2, xmax-HI, ymax-DH2, '#aed6f1', 0.6),       # HSC dHash>15
]
for x0, y0, width, height, face, a in regions:
    ax.add_patch(Rectangle((x0, y0), width, height, facecolor=face, alpha=a))

# Boundary lines: dotted at LO, dashed at HI, and the two dHash cuts drawn
# only across the high-cosine column.
ax.axvline(LO, color='gray', ls=':', lw=1)
ax.axvline(HI, color='black', ls='--', lw=1)
for cut in (DH1, DH2):
    ax.plot([HI, xmax], [cut, cut], 'k--', lw=0.8)

# (x, y, text, extra text kwargs) for the region names
labels = [
    ((xmin+LO)/2, 22, 'LH', {'fontsize': 11}),
    ((LO+HI)/2, 22, 'UN', {'fontsize': 11}),
    ((HI+xmax)/2, 2, 'HC', {'fontsize': 11, 'color': 'white'}),
    ((HI+xmax)/2, 9.5, 'MC', {'fontsize': 11}),
    ((HI+xmax)/2, 22, 'HSC', {'fontsize': 10}),
]
for lx, ly, name, extra in labels:
    ax.text(lx, ly, name, ha='center', weight='bold', **extra)

# Threshold annotations sit 1.5 data units below the lower y-limit. The text is
# derived from the constants so it cannot drift when a cut-off is revised
# (str() yields '0.8547' and '0.95' for the current values).
for cut in (LO, HI):
    ax.text(cut, ymin-1.5, str(cut), ha='center', fontsize=7)

ax.set_xlim(xmin, xmax)
ax.set_ylim(ymin, ymax)
ax.set_xlabel('cosine similarity (style)')
ax.set_ylabel('dHash distance (structure)')
ax.set_title('Figure 3. The two measures and the five regions', fontsize=9)
fig.tight_layout()
fig.savefig('/tmp/fig3.png', dpi=200, bbox_inches='tight')
plt.close(fig)
print('figs 1/2/3 OK')