"""Render Figure 4 and Figure 5 from the signature-analysis SQLite database.

Figure 4 (``/tmp/fig4.png``): two histogram panels comparing Firm A with
Firms B/C/D on (a) the per-signature maximum cosine similarity to the same
accountant and (b) the per-signature minimum dHash value.

Figure 5 (``/tmp/fig5.png``): per-accountant "HC" rate (share of signatures
with cosine > 0.95 and dHash <= 5), ranked in descending order, one panel per
period.

The script runs top to bottom on import/execution: it reads the database in
read-only mode, writes both PNG files, and prints their sizes.
"""
import os
import sqlite3
from contextlib import closing

import numpy as np
import matplotlib

matplotlib.use('Agg')  # select the non-interactive backend before pyplot is imported
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
# Firm name as stored in accountants.firm -> anonymised single-letter label.
ALIAS = {'勤業眾信聯合': 'A', '安侯建業聯合': 'B', '資誠聯合': 'C', '安永聯合': 'D'}
# Scatter colour per anonymised firm label (Figure 5).
COL = {'A': '#c0392b', 'B': '#2980b9', 'C': '#27ae60', 'D': '#8e44ad'}

COS_HC = 0.95   # a signature counts as "HC" when cosine > COS_HC ...
DH_HC = 5       # ... and dHash <= DH_HC
DH_CLIP = 20    # dHash values above this are folded into the last histogram bin
MIN_SIGS = 5    # accountants with fewer signatures in a period are not plotted

FIRM_A_COLOR = COL['A']
OTHERS_COLOR = '#34495e'
FIG4_PATH = '/tmp/fig4.png'
FIG5_PATH = '/tmp/fig5.png'

# ---- Load data ----
# The firm filter is bound as parameters generated from ALIAS so the list of
# firms lives in exactly one place.
_placeholders = ','.join('?' * len(ALIAS))
_query = f"""SELECT a.firm, s.max_similarity_to_same_accountant, s.min_dhash_independent,
       s.assigned_accountant, CAST(substr(s.year_month,1,4) AS INT)
FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
WHERE s.max_similarity_to_same_accountant IS NOT NULL
  AND s.min_dhash_independent IS NOT NULL
  AND a.firm IN ({_placeholders})"""

# closing() guarantees the read-only connection is released even if the query
# fails; sqlite3's own context manager only manages transactions.
with closing(sqlite3.connect(f'file:{DB}?mode=ro', uri=True)) as c:
    rows = c.execute(_query, tuple(ALIAS)).fetchall()

# Column-wise arrays, all aligned with `rows`.
firm = np.array([ALIAS[r[0]] for r in rows])
cos = np.array([r[1] for r in rows], float)
dh = np.array([r[2] for r in rows], float)
acc = np.array([r[3] for r in rows])
# A NULL year_month yields None from the CAST; store it as NaN so the period
# comparisons below evaluate to False instead of raising on an object array.
yr = np.array([np.nan if r[4] is None else r[4] for r in rows], float)

A = firm == 'A'
BCD = np.isin(firm, ['B', 'C', 'D'])

# ---- Figure 4: two panels, Firm A vs BCD ----
fig, ax = plt.subplots(1, 2, figsize=(9, 3.4))

# Panel (a): cosine similarity. Values below 0.7 fall outside the bins and are
# not drawn.
cos_bins = np.linspace(0.7, 1.0, 60)
ax[0].hist(cos[A], bins=cos_bins, density=True, alpha=0.6,
           color=FIRM_A_COLOR, label='Firm A')
ax[0].hist(cos[BCD], bins=cos_bins, density=True, alpha=0.5,
           color=OTHERS_COLOR, label='Firms B/C/D')
ax[0].axvline(COS_HC, ls='--', c='k', lw=0.8)
# Second reference line; the meaning of 0.8547 is not documented in this file.
ax[0].axvline(0.8547, ls=':', c='gray', lw=0.8)
ax[0].set_title('(a) Within-accountant cosine', fontsize=10)
ax[0].set_xlabel('max cosine to same accountant')
ax[0].set_ylabel('density')
ax[0].text(0.952, ax[0].get_ylim()[1] * 0.9, '0.95', fontsize=7)
ax[0].legend(fontsize=8, frameon=False)
# NOTE(review): the medians in these annotations (and in the panel (b) label
# and the figure caption below) are hard-coded text, not recomputed from the
# loaded data -- confirm they still match when the database changes.
ax[0].annotate('A median 0.986', (0.986, 0), (0.80, ax[0].get_ylim()[1] * 0.55),
               fontsize=7, color=FIRM_A_COLOR,
               arrowprops=dict(arrowstyle='->', color=FIRM_A_COLOR, lw=0.7))
ax[0].annotate('B/C/D median 0.959', (0.959, 0), (0.72, ax[0].get_ylim()[1] * 0.35),
               fontsize=7, color=OTHERS_COLOR,
               arrowprops=dict(arrowstyle='->', color=OTHERS_COLOR, lw=0.7))

# Panel (b): dHash. One unit-wide bin centred on each integer 0..DH_CLIP.
# The edges must reach DH_CLIP + 0.5: with edges ending at DH_CLIP - 0.5 every
# value clipped to DH_CLIP lies outside the histogram range and is silently
# dropped, which also distorts the density normalisation.
bins = np.arange(0, DH_CLIP + 2) - 0.5
ax[1].hist(np.clip(dh[A], 0, DH_CLIP), bins=bins, density=True, alpha=0.6,
           color=FIRM_A_COLOR, label='Firm A')
ax[1].hist(np.clip(dh[BCD], 0, DH_CLIP), bins=bins, density=True, alpha=0.5,
           color=OTHERS_COLOR, label='Firms B/C/D')
ax[1].axvline(DH_HC, ls='--', c='k', lw=0.8)
ax[1].set_title('(b) Within-accountant dHash', fontsize=10)
ax[1].set_xlabel('min dHash to same accountant')
ax[1].set_ylabel('density')
ax[1].text(5.1, ax[1].get_ylim()[1] * 0.9, '5', fontsize=7)
ax[1].legend(fontsize=8, frameon=False)
ax[1].text(0.50, 0.62, 'A median 2 / B,C,D median 7',
           transform=ax[1].transAxes, fontsize=7)

fig.text(0.5, -0.02,
         'Cross-firm held-out HC rate 0.42% sits at/below the clean reference ICCR 0.59%; within-Firm-A HC rate is 82%.',
         ha='center', fontsize=7, style='italic')
fig.tight_layout()
fig.savefig(FIG4_PATH, dpi=200, bbox_inches='tight')
plt.close(fig)


# ---- Figure 5: per-accountant HC rate, ranked, per period ----
def hc_by_acc(mask, min_sigs=MIN_SIGS):
    """Return the HC rate and firm label of each accountant within *mask*.

    Reads the module-level arrays ``acc``, ``cos``, ``dh`` and ``firm``.

    Args:
        mask: Boolean array aligned with those arrays selecting the
            signatures to consider.
        min_sigs: Minimum number of selected signatures an accountant needs
            to be included in the result.

    Returns:
        Dict mapping accountant name to ``(rate, firm_label)``, where *rate*
        is the fraction of the accountant's selected signatures with
        ``cos > COS_HC`` and ``dh <= DH_HC``, and *firm_label* is the label of
        the accountant's first selected signature.
    """
    names = acc[mask]
    flagged = ((cos[mask] > COS_HC) & (dh[mask] <= DH_HC)).astype(float)
    firms = firm[mask]
    out = {}
    for name in np.unique(names):
        sel = names == name
        if sel.sum() >= min_sigs:
            out[name] = (flagged[sel].mean(), firms[sel][0])
    return out


fig, ax = plt.subplots(1, 2, figsize=(9, 3.4), sharey=True)
periods = [(2013, 2019, '(a) 2013–2019'), (2020, 2023, '(b) 2020–2023')]
for j, (lo, hi, ttl) in enumerate(periods):
    # All four firms are included; only the year range restricts the rows.
    d = hc_by_acc((yr >= lo) & (yr <= hi))
    # Highest rate first; ties keep dictionary order.
    items = sorted(d.items(), key=lambda kv: -kv[1][0])
    xs = np.arange(len(items))
    ys = [v[0] * 100 for _, v in items]
    cs = [COL[v[1]] for _, v in items]
    ax[j].scatter(xs, ys, c=cs, s=10)
    ax[j].set_title(ttl, fontsize=10)
    ax[j].set_xlabel('accountant rank')
    if j == 0:
        # Panels share the y axis, so only the left one is labelled.
        ax[j].set_ylabel('per-accountant HC rate (%)')

# Proxy handles: the scatter points are coloured per point, so the legend is
# built from one dummy marker per firm.
ax[1].legend([Line2D([0], [0], marker='o', ls='', color=COL[k]) for k in 'ABCD'],
             ['Firm A', 'Firm B', 'Firm C', 'Firm D'],
             fontsize=7, frameon=False, loc='upper right')
fig.tight_layout()
fig.savefig(FIG5_PATH, dpi=200, bbox_inches='tight')
plt.close(fig)

print('figs OK', os.path.getsize(FIG4_PATH), os.path.getsize(FIG5_PATH))