Paper A v13: filled submission draft (rev7) + reproducible build bundle
Fill all 18 placeholders in the condensed v13 submission draft with data verified against the analysis DB and LOCKED canonical scripts; close 12/13 co-author review items (only #8b protocol first-run open).

Key changes (need co-author sign-off; see handoff doc):
- Firm A out-of-sample HC 0.01% -> 0.42% (buggy 0.0001 from Script 49 same-pair bug, propagated v4.2->v13; never reuse 0.0001)
- §III-D empty cell ~=0 -> 7,681 honest reframe (not degenerate crops)
- low cosine cut 0.837 -> 0.8547 primary (BCD 2013-2019 closed-world, held-out discipline; 0.8489 confirmed = BCD all-period); HC/MC/HSC unchanged, UN/LH move <=0.4pp

Adds Figures 1-5 (real-data plots + schematics), full references, Appendix A/B, UN/HSC ICCR, n-reconciliation, #13 MOPS-metadata survival verification, "參" set-level feasibility probe (negative).

Two codex (gpt-5.5) adversarial rounds applied; no fabrication found.

Bundle: paper/v13_build/ (markdown source, harvest/figure scripts, figures) for reproducibility. Handoff note for co-author included.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,58 @@
|
||||
import sqlite3, numpy as np
|
||||
import matplotlib
|
||||
matplotlib.use('Agg')
|
||||
import matplotlib.pyplot as plt
|
||||
# ---- Inputs: per-signature arrays shared by every figure in this script ----
# Path of the SQLite analysis database (opened read-only below).
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
# Firm name as stored in accountants.firm -> single-letter alias used in the plots.
ALIAS = {'勤業眾信聯合': 'A', '安侯建業聯合': 'B', '資誠聯合': 'C', '安永聯合': 'D'}
# Plot colour for each firm alias.
COL = {'A': '#c0392b', 'B': '#2980b9', 'C': '#27ae60', 'D': '#8e44ad'}

# mode=ro in the URI makes SQLite refuse any write to the database.
c = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
try:
    # The firm filter is generated from ALIAS (one bound parameter per firm) so
    # the SQL and the alias map cannot drift apart.
    firm_slots = ','.join('?' * len(ALIAS))
    rows = c.execute(
        f"""SELECT a.firm, s.max_similarity_to_same_accountant, s.min_dhash_independent,
                   s.assigned_accountant, CAST(substr(s.year_month,1,4) AS INT)
            FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
            WHERE s.max_similarity_to_same_accountant IS NOT NULL
              AND s.min_dhash_independent IS NOT NULL
              AND a.firm IN ({firm_slots})""",
        tuple(ALIAS),
    ).fetchall()
finally:
    # A sqlite3 connection is not closed automatically; release it explicitly
    # even when the query raises.
    c.close()

# Fail loudly instead of silently rendering empty figures.
if not rows:
    raise SystemExit(f'no signatures for firms {sorted(ALIAS.values())} found in {DB}')

# Column-wise arrays, all aligned row-for-row with `rows`.
firm = np.array([ALIAS[r[0]] for r in rows])        # firm alias 'A'..'D'
cos = np.array([r[1] for r in rows], float)         # max_similarity_to_same_accountant
dh = np.array([r[2] for r in rows], float)          # min_dhash_independent
acc = np.array([r[3] for r in rows])                # assigned_accountant
# The query does not filter NULL year_month; map a NULL year to NaN so that
# year-window comparisons evaluate to False for that row instead of raising
# on an object array containing None.
yr = np.array([np.nan if r[4] is None else r[4] for r in rows], float)

# Boolean row masks: Firm A alone, and Firms B/C/D pooled.
A = firm == 'A'
BCD = np.isin(firm, ['B', 'C', 'D'])
|
||||
|
||||
# ---- Figure 4: two panels, Firm A vs BCD ----
# Side-by-side density histograms of the two per-signature measures, Firm A
# against Firms B/C/D pooled. Written to /tmp/fig4.png.
fig, ax = plt.subplots(1, 2, figsize=(9, 3.4))

# Panel (a): cosine similarity. Only the 0.7-1.0 range is binned; values below
# 0.7 fall outside the bins and are not drawn.
cos_bins = np.linspace(0.7, 1.0, 60)
ax[0].hist(cos[A], bins=cos_bins, density=True, alpha=0.6, color='#c0392b', label='Firm A')
ax[0].hist(cos[BCD], bins=cos_bins, density=True, alpha=0.5, color='#34495e', label='Firms B/C/D')
# Vertical reference lines at the 0.95 and 0.8547 cosine values.
ax[0].axvline(0.95, ls='--', c='k', lw=0.8)
ax[0].axvline(0.8547, ls=':', c='gray', lw=0.8)
ax[0].set_title('(a) Within-accountant cosine', fontsize=10)
ax[0].set_xlabel('max cosine to same accountant')
ax[0].set_ylabel('density')
# Text heights are fractions of the y-limit, read after the histograms are drawn.
ax[0].text(0.952, ax[0].get_ylim()[1] * 0.9, '0.95', fontsize=7)
ax[0].legend(fontsize=8, frameon=False)
# NOTE(review): the median values in these labels are hard-coded, not computed
# from `cos`; confirm they still match the data whenever the DB changes.
ax[0].annotate('A median 0.986', (0.986, 0), (0.80, ax[0].get_ylim()[1] * 0.55),
               fontsize=7, color='#c0392b',
               arrowprops=dict(arrowstyle='->', color='#c0392b', lw=0.7))
ax[0].annotate('B/C/D median 0.959', (0.959, 0), (0.72, ax[0].get_ylim()[1] * 0.35),
               fontsize=7, color='#34495e',
               arrowprops=dict(arrowstyle='->', color='#34495e', lw=0.7))

# Panel (b): dHash distance, clipped to 20 so that bin 20 collects the overflow.
# The edges must therefore run to 20.5: with edges stopping at 19.5 every
# clipped value fell outside the last bin and was dropped from the histogram
# and from the density normalisation.
# Unit-wide bins centred on 0..20 (assumes dHash distances are integers - TODO confirm).
dh_bins = np.arange(0, 22) - 0.5
ax[1].hist(np.clip(dh[A], 0, 20), bins=dh_bins, density=True, alpha=0.6, color='#c0392b', label='Firm A')
ax[1].hist(np.clip(dh[BCD], 0, 20), bins=dh_bins, density=True, alpha=0.5, color='#34495e', label='Firms B/C/D')
ax[1].axvline(5, ls='--', c='k', lw=0.8)
ax[1].set_title('(b) Within-accountant dHash', fontsize=10)
ax[1].set_xlabel('min dHash to same accountant')
ax[1].set_ylabel('density')
ax[1].text(5.1, ax[1].get_ylim()[1] * 0.9, '5', fontsize=7)
ax[1].legend(fontsize=8, frameon=False)
# NOTE(review): hard-coded medians, same caveat as in panel (a).
ax[1].text(0.50, 0.62, 'A median 2 / B,C,D median 7', transform=ax[1].transAxes, fontsize=7)

# Italic footnote placed just below the axes (figure coordinates).
fig.text(0.5, -0.02, 'Cross-firm held-out HC rate 0.42% sits at/below the clean reference ICCR 0.59%; within-Firm-A HC rate is 82%.', ha='center', fontsize=7, style='italic')
fig.tight_layout()
# bbox_inches='tight' keeps the footnote, which sits outside the default canvas.
fig.savefig('/tmp/fig4.png', dpi=200, bbox_inches='tight')
plt.close(fig)
|
||||
|
||||
# ---- Figure 5: per-accountant HC rate, ranked, per period ----
|
||||
def hc_by_acc(mask, cos_min=0.95, dh_max=5, min_n=5):
    """Return the per-accountant HC rate over the rows selected by ``mask``.

    Reads the module-level per-signature arrays ``acc`` (accountant),
    ``cos`` (cosine similarity), ``dh`` (dHash distance) and ``firm``
    (firm alias), which are all aligned row-for-row. A row is flagged HC when
    ``cos > cos_min`` and ``dh <= dh_max``.

    Args:
        mask: boolean array (same length as the module-level arrays)
            selecting the rows to include.
        cos_min: exclusive lower bound on the cosine similarity of an HC row.
        dh_max: inclusive upper bound on the dHash distance of an HC row.
        min_n: accountants with fewer than this many selected rows are omitted.

    Returns:
        dict mapping accountant -> ``(hc_fraction, firm)``. ``hc_fraction`` is
        the share of that accountant's selected rows flagged HC (0..1) and
        ``firm`` is the firm alias of the accountant's first selected row.
        Keys are inserted in sorted accountant order.
    """
    names = acc[mask]
    if names.size == 0:
        return {}
    firms = firm[mask]
    is_hc = ((cos[mask] > cos_min) & (dh[mask] <= dh_max)).astype(float)

    # One pass instead of rescanning every row per accountant: `first` is the
    # index of each accountant's first selected row, `inverse` maps each row to
    # its accountant slot, `counts` is the number of selected rows per accountant.
    uniq, first, inverse, counts = np.unique(
        names, return_index=True, return_inverse=True, return_counts=True)
    hits = np.bincount(inverse.ravel(), weights=is_hc, minlength=uniq.size)
    rates = hits / counts

    return {uniq[i]: (rates[i], firms[first[i]])
            for i in np.flatnonzero(counts >= min_n)}
|
||||
# Per-accountant HC rate, ranked from highest to lowest, one panel per period.
# Written to /tmp/fig5.png.
import os
from matplotlib.lines import Line2D

fig, ax = plt.subplots(1, 2, figsize=(9, 3.4), sharey=True)
periods = [(2013, 2019, '(a) 2013–2019'), (2020, 2023, '(b) 2020–2023')]
for j, (lo, hi, ttl) in enumerate(periods):
    # Every loaded row whose year falls in [lo, hi]. The former
    # `BCD|A if False else (...)` wrapper always took its else branch, so the
    # mask was already just this year window.
    d = hc_by_acc((yr >= lo) & (yr <= hi))
    # Highest rate first; each value is a (rate, firm alias) pair.
    items = sorted(d.items(), key=lambda kv: -kv[1][0])
    xs = np.arange(len(items))
    ys = [v[0] * 100 for _, v in items]      # fraction -> percent
    cs = [COL[v[1]] for _, v in items]       # point colour = firm colour
    ax[j].scatter(xs, ys, c=cs, s=10)
    ax[j].set_title(ttl, fontsize=10)
    ax[j].set_xlabel('accountant rank')
    # The y axis is shared, so only the left panel carries the label.
    if j == 0:
        ax[j].set_ylabel('per-accountant HC rate (%)')

# Proxy handles: the scatter is coloured per point, so the legend entries are
# built by hand, one marker per firm colour.
ax[1].legend([Line2D([0], [0], marker='o', ls='', color=COL[k]) for k in 'ABCD'],
             ['Firm A', 'Firm B', 'Firm C', 'Firm D'],
             fontsize=7, frameon=False, loc='upper right')
fig.tight_layout()
fig.savefig('/tmp/fig5.png', dpi=200, bbox_inches='tight')
plt.close(fig)

# Report both output sizes; getsize raises if either file was not written.
print('figs OK', os.path.getsize('/tmp/fig4.png'), os.path.getsize('/tmp/fig5.png'))
|
||||
Reference in New Issue
Block a user