Files
pdf_signature_extraction/paper/v13_build/scripts/make_figs.py
T
gbanyan 66c9194fcf Paper A v13: filled submission draft (rev7) + reproducible build bundle
Fill all 18 placeholders in the condensed v13 submission draft with
data verified against the analysis DB and LOCKED canonical scripts;
close 12/13 co-author review items (only #8b protocol first-run open).

Key changes (need co-author sign-off; see handoff doc):
- Firm A out-of-sample HC 0.01% -> 0.42% (buggy 0.0001 from Script 49
  same-pair bug, propagated v4.2->v13; never reuse 0.0001)
- §III-D empty cell ~=0 -> 7,681 honest reframe (not degenerate crops)
- low cosine cut 0.837 -> 0.8547 primary (BCD 2013-2019 closed-world,
  held-out discipline; 0.8489 confirmed = BCD all-period); HC/MC/HSC
  unchanged, UN/LH move <=0.4pp

Adds Figures 1-5 (real-data plots + schematics), full references,
Appendix A/B, UN/HSC ICCR, n-reconciliation, #13 MOPS-metadata
survival verification, "參" set-level feasibility probe (negative).
Two codex (gpt-5.5) adversarial rounds applied; no fabrication found.

Bundle: paper/v13_build/ (markdown source, harvest/figure scripts,
figures) for reproducibility. Handoff note for co-author included.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-15 03:24:50 +08:00

59 lines
4.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os
import sqlite3

import matplotlib
import numpy as np

matplotlib.use('Agg')  # select the headless backend before pyplot is imported
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
# Path of the SQLite analysis database; it is opened read-only below.
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
# Firm name as stored in accountants.firm -> single-letter label used in the figures.
ALIAS = {'勤業眾信聯合': 'A', '安侯建業聯合': 'B', '資誠聯合': 'C', '安永聯合': 'D'}
# Plot colour per firm label.
COL = {'A': '#c0392b', 'B': '#2980b9', 'C': '#27ae60', 'D': '#8e44ad'}

# One '?' per firm in ALIAS: the IN (...) list is bound from ALIAS itself, so
# the query can never return a firm that ALIAS cannot translate.
_firm_slots = ','.join('?' * len(ALIAS))
c = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
try:
    rows = c.execute(
        f"""SELECT a.firm, s.max_similarity_to_same_accountant, s.min_dhash_independent,
        s.assigned_accountant, CAST(substr(s.year_month,1,4) AS INT)
        FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
        WHERE s.max_similarity_to_same_accountant IS NOT NULL AND s.min_dhash_independent IS NOT NULL
        AND a.firm IN ({_firm_slots})""",
        tuple(ALIAS),
    ).fetchall()
finally:
    # Using a sqlite3 connection as a context manager only manages the
    # transaction; the handle has to be closed explicitly.
    c.close()

# Column-wise views of the result set; all arrays share the same row order.
firm = np.array([ALIAS[r[0]] for r in rows])   # firm label 'A'..'D'
cos = np.array([r[1] for r in rows], float)    # max_similarity_to_same_accountant
dh = np.array([r[2] for r in rows], float)     # min_dhash_independent
acc = np.array([r[3] for r in rows])           # assigned_accountant
yr = np.array([r[4] for r in rows])            # first four characters of year_month, cast to int

# Row masks: Firm A versus the pooled firms B, C and D.
A = firm == 'A'
BCD = np.isin(firm, ['B', 'C', 'D'])
# ---- Figure 4: two panels, Firm A vs BCD ----
fig, ax = plt.subplots(1, 2, figsize=(9, 3.4))

# Panel (a): cosine histograms on a shared 0.7-1.0 grid. Values below 0.7 lie
# outside the bins, so they are not drawn and do not enter the density.
cos_bins = np.linspace(0.7, 1.0, 60)
ax[0].hist(cos[A], bins=cos_bins, density=True, alpha=0.6, color='#c0392b', label='Firm A')
ax[0].hist(cos[BCD], bins=cos_bins, density=True, alpha=0.5, color='#34495e', label='Firms B/C/D')
# Reference lines at the two cosine cuts (0.95 dashed, 0.8547 dotted).
ax[0].axvline(0.95, ls='--', c='k', lw=0.8)
ax[0].axvline(0.8547, ls=':', c='gray', lw=0.8)
ax[0].set_title('(a) Within-accountant cosine', fontsize=10)
ax[0].set_xlabel('max cosine to same accountant')
ax[0].set_ylabel('density')
ax[0].text(0.952, ax[0].get_ylim()[1] * 0.9, '0.95', fontsize=7)
ax[0].legend(fontsize=8, frameon=False)
# NOTE(review): the medians in the labels and arrow targets are hard-coded,
# not computed from cos[A] / cos[BCD]; re-check them if the database changes.
ax[0].annotate('A median 0.986', (0.986, 0), (0.80, ax[0].get_ylim()[1] * 0.55),
               fontsize=7, color='#c0392b',
               arrowprops=dict(arrowstyle='->', color='#c0392b', lw=0.7))
ax[0].annotate('B/C/D median 0.959', (0.959, 0), (0.72, ax[0].get_ylim()[1] * 0.35),
               fontsize=7, color='#34495e',
               arrowprops=dict(arrowstyle='->', color='#34495e', lw=0.7))

# Panel (b): dHash histograms with one unit-wide bin centred on each integer
# 0..20. dHash is clipped to 20, so the last bin collects every value >= 20.
# The edges must therefore run to 20.5: with edges ending at 19.5 the clipped
# values fall outside the histogram, are dropped, and density=True
# renormalises over the remaining rows.
bins = np.arange(0, 22) - 0.5
ax[1].hist(np.clip(dh[A], 0, 20), bins=bins, density=True, alpha=0.6, color='#c0392b', label='Firm A')
ax[1].hist(np.clip(dh[BCD], 0, 20), bins=bins, density=True, alpha=0.5, color='#34495e', label='Firms B/C/D')
# Label the overflow bin so it is not read as "exactly 20".
ax[1].set_xticks([0, 5, 10, 15, 20])
ax[1].set_xticklabels(['0', '5', '10', '15', '20+'])
ax[1].axvline(5, ls='--', c='k', lw=0.8)
ax[1].set_title('(b) Within-accountant dHash', fontsize=10)
ax[1].set_xlabel('min dHash to same accountant')
ax[1].set_ylabel('density')
ax[1].text(5.1, ax[1].get_ylim()[1] * 0.9, '5', fontsize=7)
ax[1].legend(fontsize=8, frameon=False)
# NOTE(review): hard-coded medians, same caveat as in panel (a).
ax[1].text(0.50, 0.62, 'A median 2 / B,C,D median 7', transform=ax[1].transAxes, fontsize=7)

# Caption below the axes; the quoted rates are fixed text, not computed here.
fig.text(0.5, -0.02, 'Cross-firm held-out HC rate 0.42% sits at/below the clean reference ICCR 0.59%; within-Firm-A HC rate is 82%.', ha='center', fontsize=7, style='italic')
fig.tight_layout()
fig.savefig('/tmp/fig4.png', dpi=200, bbox_inches='tight')
plt.close(fig)
# ---- Figure 5: per-accountant HC rate, ranked, per period ----
def hc_by_acc(mask, *, cos_cut=0.95, dh_cut=5, min_n=5):
    """Return the per-accountant high-confidence (HC) rate among the rows in *mask*.

    A row counts as HC when its cosine is strictly greater than *cos_cut* and
    its dHash is less than or equal to *dh_cut*. The function reads the
    module-level arrays ``acc``, ``cos``, ``dh`` and ``firm``, which must be
    aligned row-for-row with *mask*.

    Args:
        mask: boolean row selector applied to the module-level arrays.
        cos_cut: cosine threshold (exclusive lower bound).
        dh_cut: dHash threshold (inclusive upper bound).
        min_n: accountants with fewer than this many selected rows are omitted.

    Returns:
        dict mapping accountant name -> ``(hc_rate, firm)``, where ``hc_rate``
        is the HC fraction in [0, 1] and ``firm`` is the firm label of the
        accountant's first selected row. Keys follow ``np.unique`` (sorted) order.
    """
    a = acc[mask]
    if a.size == 0:
        return {}
    hit = ((cos[mask] > cos_cut) & (dh[mask] <= dh_cut)).astype(float)
    f = firm[mask]
    # Group once instead of building one boolean mask per accountant:
    # `first` is the index of each accountant's first selected row, `inverse`
    # maps every row to its accountant, `counts` is the rows per accountant.
    names, first, inverse, counts = np.unique(
        a, return_index=True, return_inverse=True, return_counts=True)
    # The sum of 0/1 hits per accountant divided by the row count is the mean.
    rate = np.bincount(inverse, weights=hit, minlength=len(names)) / counts
    return {
        name: (rate[i], f[first[i]])
        for i, name in enumerate(names)
        if counts[i] >= min_n
    }
# Two panels sharing the y axis, one per period: each accountant's HC rate,
# ranked from highest to lowest and coloured by firm.
fig, ax = plt.subplots(1, 2, figsize=(9, 3.4), sharey=True)
# (first year, last year, panel title). \u2013 is an en dash, written as an
# escape so the source carries no ambiguous Unicode character.
periods = [(2013, 2019, '(a) 2013\u20132019'), (2020, 2023, '(b) 2020\u20132023')]
for j, (lo, hi, ttl) in enumerate(periods):
    # All four firms, restricted to the years lo..hi inclusive.
    d = hc_by_acc((yr >= lo) & (yr <= hi))
    # Highest rate first; the sort is stable, so ties keep the dict's order.
    items = sorted(d.items(), key=lambda kv: -kv[1][0])
    xs = np.arange(len(items))
    ys = [v[0] * 100 for _, v in items]   # fraction -> percent
    cs = [COL[v[1]] for _, v in items]    # firm label -> colour
    ax[j].scatter(xs, ys, c=cs, s=10)
    ax[j].set_title(ttl, fontsize=10)
    ax[j].set_xlabel('accountant rank')
    if j == 0:
        # The y axis is shared, so only the left panel carries the label.
        ax[j].set_ylabel('per-accountant HC rate (%)')

# Proxy handles: scatter() drew every firm in one call, so the legend entries
# are built by hand, one marker per firm colour.
handles = [Line2D([0], [0], marker='o', ls='', color=COL[k]) for k in 'ABCD']
ax[1].legend(handles, ['Firm A', 'Firm B', 'Firm C', 'Firm D'],
             fontsize=7, frameon=False, loc='upper right')
fig.tight_layout()
fig.savefig('/tmp/fig5.png', dpi=200, bbox_inches='tight')
plt.close(fig)

# Report the size in bytes of both output files; getsize raises if one is missing.
print('figs OK', os.path.getsize('/tmp/fig4.png'), os.path.getsize('/tmp/fig5.png'))