Paper A v13: filled submission draft (rev7) + reproducible build bundle
Fill all 18 placeholders in the condensed v13 submission draft with data verified against the analysis DB and LOCKED canonical scripts; close 12/13 co-author review items (only #8b protocol first-run open). Key changes (need co-author sign-off; see handoff doc): - Firm A out-of-sample HC 0.01% -> 0.42% (buggy 0.0001 from Script 49 same-pair bug, propagated v4.2->v13; never reuse 0.0001) - §III-D empty cell ~=0 -> 7,681 honest reframe (not degenerate crops) - low cosine cut 0.837 -> 0.8547 primary (BCD 2013-2019 closed-world, held-out discipline; 0.8489 confirmed = BCD all-period); HC/MC/HSC unchanged, UN/LH move <=0.4pp Adds Figures 1-5 (real-data plots + schematics), full references, Appendix A/B, UN/HSC ICCR, n-reconciliation, #13 MOPS-metadata survival verification, "參" set-level feasibility probe (negative). Two codex (gpt-5.5) adversarial rounds applied; no fabrication found. Bundle: paper/v13_build/ (markdown source, harvest/figure scripts, figures) for reproducibility. Handoff note for co-author included. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,79 @@
|
||||
import sqlite3, numpy as np
|
||||
from collections import defaultdict
|
||||
from scipy.stats import gaussian_kde
|
||||
# Analysis database; every script opens it read-only.
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'

# Firm names as they are compared against accountants.firm.  The callers in
# this script build the "BCD" pool as BIG4 minus FIRM_A.
BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合')
FIRM_A = '勤業眾信聯合'

# One fixed seed for every random generator created in this script.
SEED = 42

# POP[b] is the number of set bits in the byte value b; indexing POP with an
# XOR of two uint8 arrays and summing gives a Hamming distance.
POP = np.array([bin(byte).count('1') for byte in range(256)], dtype=np.uint8)
|
||||
def load():
    """Read all usable signature rows from the analysis database.

    The database is opened read-only.  Only rows with an assigned accountant,
    a firm, a feature vector and a dHash vector are returned.

    Returns:
        list of tuples ``(assigned_accountant, firm, source_pdf,
        feature_vector, dhash_vector, year)`` where ``year`` is the first four
        characters of ``year_month`` cast to an integer by SQLite.
    """
    c = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
    try:
        return c.execute("""SELECT s.assigned_accountant,a.firm,s.source_pdf,s.feature_vector,s.dhash_vector,
            CAST(substr(s.year_month,1,4) AS INT) FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
            WHERE s.assigned_accountant IS NOT NULL AND a.firm IS NOT NULL AND s.feature_vector IS NOT NULL AND s.dhash_vector IS NOT NULL""").fetchall()
    finally:
        # Release the handle even when the query raises.
        c.close()
|
||||
def crossover(keep, label):
    """Print the cosine values where the intra- and inter-accountant KDEs cross.

    Args:
        keep: rows shaped like the output of ``load()``; index 0 is the
            accountant and index 3 the float32 feature blob.
        label: tag echoed in the printed line.

    100,000 same-accountant pairs and 100,000 different-accountant pairs are
    sampled with a generator seeded by SEED.  The order of the random draws
    below is part of the result and must not be rearranged.
    """
    # Unit-normalise so that a dot product is a cosine similarity.
    vecs = np.stack([np.frombuffer(row[3], np.float32) for row in keep]).astype(np.float32)
    vecs /= np.clip(np.linalg.norm(vecs, axis=1, keepdims=True), 1e-9, None)

    owners = np.array([row[0] for row in keep])
    grouped = defaultdict(list)
    for pos, owner in enumerate(owners):
        grouped[owner].append(pos)
    # Only accountants with at least three signatures take part.
    by = {owner: np.array(members) for owner, members in grouped.items() if len(members) >= 3}
    accts = list(by.keys())

    # An accountant is drawn with probability proportional to its number of
    # unordered signature pairs.
    weights = np.array([len(by[a]) * (len(by[a]) - 1) / 2 for a in accts], float)
    weights /= weights.sum()

    rng = np.random.default_rng(SEED)
    M = 100_000

    # Same-accountant pairs.
    intra = np.empty(M, np.float32)
    picks = rng.choice(len(accts), M, p=weights)
    for t, pick in enumerate(picks):
        first, second = rng.choice(by[accts[pick]], 2, replace=False)
        intra[t] = vecs[first] @ vecs[second]

    # Different-accountant pairs: two distinct accountants, one signature each.
    inter = np.empty(M, np.float32)
    for t in range(M):
        i, j = rng.choice(len(accts), 2, replace=False)
        left = vecs[rng.choice(by[accts[i]])]
        right = vecs[rng.choice(by[accts[j]])]
        inter[t] = left @ right

    # Sign changes of the density difference on a fixed grid mark crossovers;
    # only those strictly inside (0.6, 0.99) are reported.
    xs = np.linspace(0.3, 1.0, 10000)
    diff = gaussian_kde(intra)(xs) - gaussian_kde(inter)(xs)
    flips = np.where(np.diff(np.sign(diff)))[0]
    cr = [float(x) for x in xs[flips] if 0.6 < x < 0.99]
    print(f' [{label}] crossover {[f"{x:.4f}" for x in cr]} (n={len(keep)}, accts>=3={len(accts)})')
|
||||
def percomp_bands(keep, label, M=500_000, low_cut=0.837):
    """Print the per-comparison band rates for random cross-accountant pairs.

    Args:
        keep: rows shaped like the output of ``load()`` (index 0 accountant,
            index 3 float32 feature blob, index 4 uint8 dHash blob).
        label: tag echoed in the printed line.
        M: maximum number of cross-accountant pairs to score.  ``2*M`` index
            pairs are drawn and same-accountant pairs are discarded, so fewer
            than ``M`` pairs may remain; the printed ``M=`` is the real count.
        low_cut: cosine boundary between the LH and UN bands.  The default
            reproduces the value that was previously hard-coded.

    Bands (cos = cosine, d = dHash Hamming distance):
        HC  cos > 0.95 and d <= 5
        MC  cos > 0.95 and 5 < d <= 15
        HSC cos > 0.95 and d > 15
        UN  low_cut < cos <= 0.95
        LH  cos <= low_cut
    """
    feats = np.stack([np.frombuffer(r[3], np.float32) for r in keep]).astype(np.float32)
    feats /= np.clip(np.linalg.norm(feats, axis=1, keepdims=True), 1e-9, None)
    dh = np.stack([np.frombuffer(r[4], np.uint8) for r in keep])
    cpas = np.array([r[0] for r in keep])

    # NOTE: the original built a per-accountant index here that was never
    # read; it consumed no random numbers, so removing it changes no output.
    rng = np.random.default_rng(SEED)
    n = len(keep)
    ii = rng.integers(0, n, M * 2)
    jj = rng.integers(0, n, M * 2)
    cross = cpas[ii] != cpas[jj]
    ii = ii[cross][:M]
    jj = jj[cross][:M]

    cos = np.einsum('ij,ij->i', feats[ii], feats[jj])
    # Bitwise XOR then per-byte popcount gives the Hamming distance per pair.
    d = POP[dh[ii] ^ dh[jj]].sum(1)

    hc = (cos > 0.95) & (d <= 5)
    mc = (cos > 0.95) & (d > 5) & (d <= 15)
    hsc = (cos > 0.95) & (d > 15)
    un = (cos > low_cut) & (cos <= 0.95)
    lh = cos <= low_cut
    print(f' [{label}] per-COMPARISON ICCR (M={len(ii)}): HC {hc.mean():.6f} MC {mc.mean():.6f} HSC {hsc.mean():.6f} UN {un.mean():.4f} LH {lh.mean():.4f}')
|
||||
def persig_perdoc_bands(keep, label):
    """Print per-signature and per-report UN / HSC band rates.

    Args:
        keep: rows shaped like the output of ``load()`` (index 0 accountant,
            index 2 source PDF, index 3 float32 feature blob, index 4 uint8
            dHash blob).
        label: tag echoed in the printed lines.

    For a signature whose accountant has k signatures, k-1 candidates from
    other accountants are sampled (with replacement).  The signature's score
    is the maximum cosine and, independently, the minimum dHash distance over
    those candidates.  A report counts as in-band when any of its signatures
    is.  Signatures with no candidates keep the defaults (cosine 0, dHash 64).
    """
    n = len(keep)
    feats = np.stack([np.frombuffer(r[3], np.float32) for r in keep]).astype(np.float32)
    feats /= np.clip(np.linalg.norm(feats, axis=1, keepdims=True), 1e-9, None)
    dh = np.stack([np.frombuffer(r[4], np.uint8) for r in keep])
    cpas = np.array([r[0] for r in keep])
    docs = np.array([r[2] for r in keep])

    ci = defaultdict(list)
    for i, c in enumerate(cpas):
        ci[c].append(i)
    ci = {c: np.array(v) for c, v in ci.items()}
    ps = {c: len(v) - 1 for c, v in ci.items()}

    allidx = np.arange(n)
    rng = np.random.default_rng(SEED)
    mc = np.zeros(n, np.float32)
    md = np.full(n, 64, np.int32)
    for si in range(n):
        p = ps[cpas[si]]
        if p <= 0:
            continue
        same = ci[cpas[si]]
        need = p
        cand = []
        att = 0
        # Rejection sampling: over-draw, drop same-accountant indices.
        while need > 0 and att < 10:
            dr = rng.choice(n, size=need * 2, replace=True)
            ok = dr[~np.isin(dr, same)]
            cand.extend(ok[:need].tolist())
            need -= len(ok[:need])
            att += 1
        if need > 0:
            # Rejection sampling ran out of attempts.  Previously the pool was
            # silently left short (or empty, which made .max() raise).  Fill
            # it directly from the eligible indices instead; this path only
            # runs in that formerly-broken case, so normal runs draw the same
            # random numbers as before.
            eligible_mask = np.ones(n, bool)
            eligible_mask[same] = False
            eligible = allidx[eligible_mask]
            if eligible.size:
                cand.extend(rng.choice(eligible, size=need, replace=eligible.size < need).tolist())
        if not cand:
            # No signature from any other accountant exists.
            continue
        cand = np.array(cand[:p], dtype=np.int64)
        mc[si] = (feats[cand] @ feats[si]).max()
        md[si] = int(POP[dh[cand] ^ dh[si]].sum(1).min())

    un = (mc > 0.837) & (mc <= 0.95)
    hsc = (mc > 0.95) & (md > 15)

    # per-doc: any signature in band
    dd = defaultdict(list)
    for i, doc in enumerate(docs):
        dd[doc].append(i)
    docs_un = np.mean([un[v].any() for v in dd.values()])
    docs_hsc = np.mean([hsc[v].any() for v in dd.values()])
    print(f' [{label}] per-SIGNATURE ICCR: UN {un.mean():.4f} HSC {hsc.mean():.6f}')
    print(f' [{label}] per-REPORT ICCR: UN {docs_un:.4f} HSC {docs_hsc:.6f} (n_doc={len(dd)})')
|
||||
|
||||
# ---- driver: load once, then run the three harvest items ----
rows = load()

# "BCD" = Big-4 rows that are not Firm A; bcd_19 narrows that to 2013-2019.
bcd_all = [r for r in rows if r[1] != FIRM_A and r[1] in BIG4]
bcd_19 = [r for r in bcd_all if 2013 <= r[5] <= 2019]

print("=== ITEM 11: KDE crossover (verify corpus 0.837 / BCD-all 0.8489, then closed-world 2013-2019) ===")
for subset, tag in (
    (rows, 'corpus-wide (verify ~0.8367)'),
    (bcd_all, 'BCD-only ALL period (verify 0.8489)'),
    (bcd_19, 'BCD 2013-2019 CLOSED-WORLD (NEW primary candidate)'),
):
    crossover(subset, tag)

print("\n=== ITEM 3: UN / HSC full ICCR on BCD 2013-2019 ===")
percomp_bands(bcd_19, 'BCD 2013-2019')
persig_perdoc_bands(bcd_19, 'BCD 2013-2019')

print("\n=== ITEM 12: n reconciliation ===")
n_all = len(bcd_all)
n_19 = len(bcd_19)
print(f" BCD full-period (2013-2023) signatures = {n_all} <- Script53 logged n=89,994")
print(f" BCD 2013-2019 signatures = {n_19} <- headline ICCR base (reproduces 0.0059)")
|
||||
@@ -0,0 +1,70 @@
|
||||
import sqlite3
|
||||
from collections import defaultdict, Counter
|
||||
import numpy as np
|
||||
# Analysis database; opened read-only by load().
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'

# Firm names as compared against accountants.firm, and their one-letter aliases.
BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合')
FIRM_A = '勤業眾信聯合'
ALIAS = {
    '勤業眾信聯合': 'A',
    '安侯建業聯合': 'B',
    '資誠聯合': 'C',
    '安永聯合': 'D',
}

# One fixed seed for every random generator created in this script.
SEED = 42

# POP[b] is the number of set bits in the byte value b (Hamming-distance lookup).
POP = np.array([bin(byte).count('1') for byte in range(256)], dtype=np.uint8)
|
||||
def wilson(k, n, z=1.96):
    """Wilson score interval for ``k`` successes out of ``n`` trials.

    Returns ``(lower, upper)`` clipped to [0, 1], or ``(None, None)`` when
    ``n`` is zero.
    """
    if n == 0:
        return (None, None)
    p = k / n
    zz = z * z
    denom = 1 + zz / n
    centre = (p + zz / (2 * n)) / denom
    half = z * np.sqrt(p * (1 - p) / n + zz / (4 * n * n)) / denom
    lower = max(0, centre - half)
    upper = min(1, centre + half)
    return (lower, upper)
|
||||
def load():
    """Read all usable signature rows from the analysis database.

    The database is opened read-only.  Only rows with an assigned accountant,
    a firm, a feature vector and a dHash vector are returned.

    Returns:
        list of tuples ``(assigned_accountant, firm, source_pdf,
        feature_vector, dhash_vector, year)`` where ``year`` is the first four
        characters of ``year_month`` cast to an integer by SQLite.
    """
    c = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
    try:
        cur = c.cursor()
        cur.execute("""SELECT s.assigned_accountant,a.firm,s.source_pdf,s.feature_vector,s.dhash_vector,
            CAST(substr(s.year_month,1,4) AS INT) FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
            WHERE s.assigned_accountant IS NOT NULL AND a.firm IS NOT NULL AND s.feature_vector IS NOT NULL AND s.dhash_vector IS NOT NULL""")
        return cur.fetchall()
    finally:
        # Release the handle even when the query raises.
        c.close()
|
||||
def canonical_sampler(rng, n, n_pool, same_cpa, all_idx):
    """Sample ``n_pool`` indices from ``range(n)`` that are not in ``same_cpa``.

    Args:
        rng: a ``numpy.random.Generator``.
        n: population size; indices are drawn from ``0 .. n-1``.
        n_pool: number of indices wanted.
        same_cpa: array of indices that must be excluded.
        all_idx: ``np.arange(n)``, passed in so it is not rebuilt per call.

    Returns:
        int64 array of at most ``n_pool`` indices.  It is shorter (empty) only
        when every index is excluded.

    Up to ten rounds of over-draw-and-reject (with replacement) are tried
    first.  If that does not fill the pool, the remainder is drawn directly
    from the eligible indices.
    """
    need = n_pool
    cand = []
    att = 0
    while need > 0 and att < 10:
        draw = rng.choice(n, size=need * 2, replace=True)
        ok = draw[~np.isin(draw, same_cpa)]
        taken = ok[:need]
        cand.extend(taken.tolist())
        need -= len(taken)
        att += 1
    if need > 0:
        pm = np.ones(n, bool)
        pm[same_cpa] = False
        eligible = all_idx[pm]
        # The fallback used to sample without replacement unconditionally,
        # which raises when fewer than `need` eligible indices exist (the very
        # situation that makes rejection sampling fail).  Sampling without
        # replacement is kept whenever it is possible, so existing results
        # are unchanged; otherwise fall back to sampling with replacement,
        # as the main loop does.
        if eligible.size:
            cand.extend(rng.choice(eligible, size=need, replace=eligible.size < need).tolist())
    return np.array(cand[:n_pool], dtype=np.int64)
|
||||
def simulate(keep):
    """Score every signature against a random pool of other accountants.

    Args:
        keep: rows shaped like the output of ``load()`` (index 0 accountant,
            index 3 float32 feature blob, index 4 uint8 dHash blob).

    Returns:
        ``(mc, md)``: per-signature maximum cosine (float32) and minimum dHash
        Hamming distance (int32) over the sampled pool.  The two extremes are
        taken independently and need not come from the same candidate.  A
        signature whose accountant has no other signature keeps the defaults
        (cosine 0, distance 64).
    """
    n = len(keep)
    feats = np.stack([np.frombuffer(row[3], np.float32) for row in keep]).astype(np.float32)
    norms = np.linalg.norm(feats, axis=1, keepdims=True)
    norms[norms == 0] = 1  # leave all-zero vectors as they are
    feats = feats / norms
    dh = np.stack([np.frombuffer(row[4], np.uint8) for row in keep])
    cpas = np.array([row[0] for row in keep])

    groups = defaultdict(list)
    for pos, who in enumerate(cpas):
        groups[who].append(pos)
    cpa_idx = {who: np.array(members) for who, members in groups.items()}

    all_idx = np.arange(n)
    rng = np.random.default_rng(SEED)
    mc = np.zeros(n, np.float32)
    md = np.full(n, 64, np.int32)
    for si, who in enumerate(cpas):
        members = cpa_idx[who]
        # Pool size = number of the accountant's other signatures.
        pool_size = len(members) - 1
        if pool_size <= 0:
            continue
        cand = canonical_sampler(rng, n, pool_size, members, all_idx)
        mc[si] = (feats[cand] @ feats[si]).max()
        md[si] = int(POP[dh[cand] ^ dh[si]].sum(axis=1).min())
    return mc, md
|
||||
def iccr(keep, label):
    """Print per-signature band rates with Wilson intervals for ``keep``.

    Runs ``simulate(keep)`` and reports four bands: HC (cos > 0.95, dHash <= 5),
    HC+MC (cos > 0.95, dHash <= 15), UN-band (0.837 < cos <= 0.95) and
    HSC-band (cos > 0.95, dHash > 15).
    """
    mc, md = simulate(keep)
    n = len(keep)
    tight = mc > 0.95
    bands = [
        ('HC', tight & (md <= 5)),
        ('HC+MC', tight & (md <= 15)),
        ('UN-band', (mc > 0.837) & (mc <= 0.95)),
        ('HSC-band', tight & (md > 15)),
    ]
    print(f"\n== {label} (n_sig={n:,}) ==")
    for nm, flags in bands:
        k = int(flags.sum())
        lo, hi = wilson(k, n)
        print(f" ICCR per-sig {nm}: {k/n:.6f} ({k}/{n}) [{lo:.5f},{hi:.5f}]")
|
||||
def a_oos(rows, label):
    """Print Firm A's per-signature HC rate against a pool of BCD signatures.

    Args:
        rows: rows shaped like the output of ``load()``; the Firm A rows and
            the non-A Big-4 ("BCD") rows are both taken from this list.
        label: tag echoed in the printed line.

    Each Firm A signature whose accountant has k signatures is compared with
    k-1 BCD signatures drawn with replacement.  It counts as HC when the
    maximum cosine exceeds 0.95 and the minimum dHash distance is at most 5
    (the two extremes are taken independently over the pool).
    """
    firm_a = [r for r in rows if r[1] == FIRM_A]
    pool = [r for r in rows if r[1] in BIG4 and r[1] != FIRM_A]

    bf = np.stack([np.frombuffer(r[3], np.float32) for r in pool]).astype(np.float32)
    pool_norm = np.linalg.norm(bf, axis=1, keepdims=True)
    pool_norm[pool_norm == 0] = 1
    bf = bf / pool_norm
    bdh = np.stack([np.frombuffer(r[4], np.uint8) for r in pool])
    nb = bf.shape[0]

    # Signatures per Firm A accountant; the pool size is that count minus one.
    sig_count = Counter(r[0] for r in firm_a)

    rng = np.random.default_rng(SEED)
    hc = np.zeros(len(firm_a), bool)
    for i, r in enumerate(firm_a):
        p = sig_count[r[0]] - 1
        if p <= 0:
            continue
        cand = rng.choice(nb, size=p, replace=True)
        sf = np.frombuffer(r[3], np.float32).astype(np.float32)
        sf = sf / max(np.linalg.norm(sf), 1e-9)
        best_cos = (bf[cand] @ sf).max()
        best_dh = int(POP[bdh[cand] ^ np.frombuffer(r[4], np.uint8)].sum(axis=1).min())
        hc[i] = (best_cos > 0.95) and (best_dh <= 5)

    k = int(hc.sum())
    n = len(firm_a)
    lo, hi = wilson(k, n)
    print(f"\n== Firm A OOS vs {label} BCD pool == per-sig HC: {k/n:.6f} ({k}/{n}) [{lo:.6f},{hi:.6f}]")
|
||||
# ---- driver: verify the per-signature ICCR and the Firm A out-of-sample rate ----
rows = load()

# "BCD" = Big-4 rows that are not Firm A; bcd_19 narrows that to 2013-2019.
bcd_all = [r for r in rows if r[1] != FIRM_A and r[1] in BIG4]
bcd_19 = [r for r in bcd_all if 2013 <= r[5] <= 2019]

iccr(bcd_19, 'BCD 2013-2019 (verify per-sig HC~0.0059)')

# Firm A against the BCD pool: first restricted to 2013-2019, then all years.
rows_19 = [r for r in rows if 2013 <= r[5] <= 2019]
for subset, tag in ((rows_19, '2013-2019'), (rows, 'full-period')):
    a_oos(subset, tag)
|
||||
@@ -0,0 +1,58 @@
|
||||
import sqlite3, numpy as np
|
||||
import matplotlib
|
||||
matplotlib.use('Agg')
|
||||
import matplotlib.pyplot as plt
|
||||
# Analysis database; opened read-only below.
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
# Firm name -> one-letter alias, and alias -> plot colour.
ALIAS = {'勤業眾信聯合': 'A', '安侯建業聯合': 'B', '資誠聯合': 'C', '安永聯合': 'D'}
COL = {'A': '#c0392b', 'B': '#2980b9', 'C': '#27ae60', 'D': '#8e44ad'}

# The firm filter is bound from ALIAS so the SQL and the alias table cannot
# drift apart (the names used to be repeated as SQL literals).
placeholders = ','.join('?' * len(ALIAS))
c = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
try:
    rows = c.execute(f"""SELECT a.firm, s.max_similarity_to_same_accountant, s.min_dhash_independent,
        s.assigned_accountant, CAST(substr(s.year_month,1,4) AS INT)
        FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
        WHERE s.max_similarity_to_same_accountant IS NOT NULL AND s.min_dhash_independent IS NOT NULL
        AND a.firm IN ({placeholders})""", tuple(ALIAS)).fetchall()
finally:
    # The connection was previously left open for the rest of the script.
    c.close()

# Column arrays, one entry per signature row.
firm = np.array([ALIAS[r[0]] for r in rows])
cos = np.array([r[1] for r in rows], float)
dh = np.array([r[2] for r in rows], float)
acc = np.array([r[3] for r in rows])
yr = np.array([r[4] for r in rows])

# Row masks: Firm A versus the other three firms.
A = firm == 'A'
BCD = np.isin(firm, ['B', 'C', 'D'])
|
||||
|
||||
# ---- Figure 4: two panels, Firm A vs BCD ----
# Left panel: max cosine to the same accountant.  Right panel: min dHash.
# NOTE: the medians and rates written on the figure are fixed strings, not
# values computed from the data loaded above.
fig, ax = plt.subplots(1, 2, figsize=(9, 3.4))
left, right = ax
RED, SLATE = '#c0392b', '#34495e'
groups = ((A, RED, 0.6, 'Firm A'), (BCD, SLATE, 0.5, 'Firms B/C/D'))

# (a) cosine histograms with the 0.95 and 0.8547 reference lines
cos_bins = np.linspace(0.7, 1.0, 60)
for mask, colour, opacity, tag in groups:
    left.hist(cos[mask], bins=cos_bins, density=True, alpha=opacity, color=colour, label=tag)
left.axvline(0.95, ls='--', c='k', lw=0.8)
left.axvline(0.8547, ls=':', c='gray', lw=0.8)
left.set_title('(a) Within-accountant cosine', fontsize=10)
left.set_xlabel('max cosine to same accountant')
left.set_ylabel('density')
left.text(0.952, left.get_ylim()[1] * 0.9, '0.95', fontsize=7)
left.legend(fontsize=8, frameon=False)
left.annotate(
    'A median 0.986', (0.986, 0), (0.80, left.get_ylim()[1] * 0.55),
    fontsize=7, color=RED, arrowprops=dict(arrowstyle='->', color=RED, lw=0.7),
)
left.annotate(
    'B/C/D median 0.959', (0.959, 0), (0.72, left.get_ylim()[1] * 0.35),
    fontsize=7, color=SLATE, arrowprops=dict(arrowstyle='->', color=SLATE, lw=0.7),
)

# (b) dHash histograms; distances above 20 are clipped into the last bin
bins = np.arange(0, 21) - 0.5
for mask, colour, opacity, tag in groups:
    right.hist(np.clip(dh[mask], 0, 20), bins=bins, density=True, alpha=opacity, color=colour, label=tag)
right.axvline(5, ls='--', c='k', lw=0.8)
right.set_title('(b) Within-accountant dHash', fontsize=10)
right.set_xlabel('min dHash to same accountant')
right.set_ylabel('density')
right.text(5.1, right.get_ylim()[1] * 0.9, '5', fontsize=7)
right.legend(fontsize=8, frameon=False)
right.text(0.50, 0.62, 'A median 2 / B,C,D median 7', transform=right.transAxes, fontsize=7)

fig.text(0.5, -0.02, 'Cross-firm held-out HC rate 0.42% sits at/below the clean reference ICCR 0.59%; within-Firm-A HC rate is 82%.', ha='center', fontsize=7, style='italic')
fig.tight_layout()
fig.savefig('/tmp/fig4.png', dpi=200, bbox_inches='tight')
plt.close(fig)
|
||||
|
||||
# ---- Figure 5: per-accountant HC rate, ranked, per period ----
def hc_by_acc(mask):
    """Per-accountant HC rate among the rows selected by ``mask``.

    Reads the module-level arrays ``acc``, ``cos``, ``dh`` and ``firm``.
    A row is HC when cos > 0.95 and dh <= 5.  Accountants with fewer than
    five selected rows are left out.

    Returns:
        dict mapping accountant -> ``(HC fraction, firm alias of the
        accountant's first selected row)``, in ``np.unique`` order.
    """
    names = acc[mask]
    aliases = firm[mask]
    is_hc = ((cos[mask] > 0.95) & (dh[mask] <= 5)).astype(float)
    out = {}
    for who in np.unique(names):
        sel = names == who
        if sel.sum() < 5:
            continue
        out[who] = (is_hc[sel].mean(), aliases[sel][0])
    return out
|
||||
import os
from matplotlib.lines import Line2D

# One panel per period: accountants ranked by HC rate (highest first),
# coloured by firm.
fig, ax = plt.subplots(1, 2, figsize=(9, 3.4), sharey=True)
for j, (lo, hi, ttl) in enumerate([(2013, 2019, '(a) 2013–2019'), (2020, 2023, '(b) 2020–2023')]):
    # The mask used to be written as `BCD|A if False else (...)`; the first
    # branch could never be taken, so only the year filter is kept.
    d = hc_by_acc((yr >= lo) & (yr <= hi))
    items = sorted(d.items(), key=lambda kv: -kv[1][0])
    xs = np.arange(len(items))
    ys = [v[0] * 100 for _, v in items]
    cs = [COL[v[1]] for _, v in items]
    ax[j].scatter(xs, ys, c=cs, s=10)
    ax[j].set_title(ttl, fontsize=10)
    ax[j].set_xlabel('accountant rank')
    if j == 0:
        # The y axis is shared, so only the left panel is labelled.
        ax[j].set_ylabel('per-accountant HC rate (%)')

# Proxy handles: the scatter calls carry no labels of their own.
ax[1].legend([Line2D([0], [0], marker='o', ls='', color=COL[k]) for k in 'ABCD'], ['Firm A', 'Firm B', 'Firm C', 'Firm D'], fontsize=7, frameon=False, loc='upper right')
fig.tight_layout()
fig.savefig('/tmp/fig5.png', dpi=200, bbox_inches='tight')
plt.close(fig)

# File sizes double as a check that both figures were actually written.
print('figs OK', os.path.getsize('/tmp/fig4.png'), os.path.getsize('/tmp/fig5.png'))
|
||||
@@ -0,0 +1,75 @@
|
||||
import matplotlib
|
||||
matplotlib.use('Agg')
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib.patches import FancyBboxPatch, FancyArrowPatch, Rectangle
|
||||
import numpy as np
|
||||
|
||||
# ============ Figure 1: data split grid ============
# Grid axes: one row per firm, one column per period.
periods = ['2013–2019', '2020–2023']
firms = ['Firm A', 'Firm B', 'Firm C', 'Firm D']
fig, ax = plt.subplots(figsize=(7, 3.2))
|
||||
# role per (row firm, col period)
def role(f, p):
    """Return ``(cell text, fill colour)`` for firm ``f`` in period ``p``.

    Firm A is a held-out test in both periods.  Any other firm is the
    calibration cell in '2013–2019' and the secondary held-out test otherwise.
    """
    if f != 'Firm A':
        if p != '2013–2019':
            return ('Held-out test 2\n(secondary)', '#2980b9')
        return ('Calibration\n(clean reference)', '#27ae60')
    return ('Held-out test 1\n(Firm A, full record)', '#c0392b')
|
||||
# One unit square per (firm, period); the first firm goes in the top row.
n_rows = len(firms)
for i, f in enumerate(firms):
    y0 = n_rows - 1 - i
    for j, p in enumerate(periods):
        txt, col = role(f, p)
        cell = Rectangle((j, y0), 1, 1, facecolor=col, alpha=0.30, edgecolor='black', lw=1)
        ax.add_patch(cell)
        ax.text(j + 0.5, y0 + 0.5, txt, ha='center', va='center', fontsize=6.5)

ax.set_xlim(0, 2)
ax.set_ylim(0, 4)
# Tick labels sit at the cell centres; tick marks and the frame are hidden.
ax.set_xticks([0.5, 1.5])
ax.set_xticklabels(periods, fontsize=9)
ax.set_yticks([3.5, 2.5, 1.5, 0.5])
ax.set_yticklabels(firms, fontsize=9)
ax.tick_params(length=0)
for spine in ax.spines.values():
    spine.set_visible(False)
ax.set_title('Figure 1. Data split: calibrate on the clean cell, test everything else', fontsize=9)

fig.tight_layout()
fig.savefig('/tmp/fig1.png', dpi=200, bbox_inches='tight')
plt.close(fig)
|
||||
|
||||
# ============ Figure 2: pipeline ============
# A row of equally wide rounded boxes in axes coordinates, joined by arrows.
fig, ax = plt.subplots(figsize=(9, 2.5))
steps = [
    'Raw PDF\nreport',
    'Find signature\npage (VLM)',
    'Detect signatures\n(YOLOv11)\n+ red-stamp removal',
    'Feature extraction\n(ResNet-50, 2048-d)',
    'Two similarities\ncosine (style)\nmin dHash (structure)',
    'Five-way\nlabel',
]
cols = ['#ecf0f1', '#d6eaf8', '#d5f5e3', '#fcf3cf', '#fadbd8', '#e8daef']
n = len(steps)
w = 1.0 / n          # horizontal slot per step
box_w = w - 0.02     # box width; leaves a 0.01 margin on each side of the slot
for i, (s, c) in enumerate(zip(steps, cols)):
    x = i * w + 0.01
    box = FancyBboxPatch((x, 0.30), box_w, 0.40, boxstyle='round,pad=0.005,rounding_size=0.02',
                         facecolor=c, edgecolor='black', lw=1, transform=ax.transAxes)
    ax.add_patch(box)
    ax.text(x + box_w / 2, 0.50, s, ha='center', va='center', fontsize=6.8, transform=ax.transAxes)
    if i >= n - 1:
        continue  # no arrow after the last box
    arrow = FancyArrowPatch((x + w - 0.012, 0.50), (x + w + 0.002, 0.50), transform=ax.transAxes,
                            arrowstyle='-|>', mutation_scale=10, lw=1.2, color='black')
    ax.add_patch(arrow)
ax.axis('off')
ax.set_title('Figure 2. The screening pipeline', fontsize=9, y=0.92)
fig.savefig('/tmp/fig2.png', dpi=200, bbox_inches='tight')
plt.close(fig)
|
||||
|
||||
# ============ Figure 3: two-measure plane, five regions ============
fig, ax = plt.subplots(figsize=(5.2, 4.2))
LO, HI = 0.8547, 0.95        # cosine cuts
DH1, DH2 = 5, 15             # dHash cuts
xmin, xmax = 0.70, 1.005
ymin, ymax = -1, 30

# (lower-left corner, width, height, colour, alpha), drawn in this order
regions = [
    ((xmin, ymin), LO - xmin, ymax - ymin, '#bdc3c7', 0.5),   # LH (cos<=LO): whole column
    ((LO, ymin), HI - LO, ymax - ymin, '#f7dc6f', 0.5),       # UN (LO<cos<=HI)
    # high-cosine band subdivided by dHash
    ((HI, ymin), xmax - HI, DH1 - ymin, '#cb4335', 0.55),     # HC dHash<=5
    ((HI, DH1), xmax - HI, DH2 - DH1, '#eb984e', 0.55),       # MC 5<dHash<=15
    ((HI, DH2), xmax - HI, ymax - DH2, '#aed6f1', 0.6),       # HSC dHash>15
]
for corner, width, height, colour, opacity in regions:
    ax.add_patch(Rectangle(corner, width, height, facecolor=colour, alpha=opacity))

# Boundary lines: the two cosine cuts, and the dHash cuts inside the right band.
ax.axvline(LO, color='gray', ls=':', lw=1)
ax.axvline(HI, color='black', ls='--', lw=1)
for level in (DH1, DH2):
    ax.plot([HI, xmax], [level, level], 'k--', lw=0.8)

# Region names.
right_mid = (HI + xmax) / 2
ax.text((xmin + LO) / 2, 22, 'LH', ha='center', fontsize=11, weight='bold')
ax.text((LO + HI) / 2, 22, 'UN', ha='center', fontsize=11, weight='bold')
ax.text(right_mid, 2, 'HC', ha='center', fontsize=11, weight='bold', color='white')
ax.text(right_mid, 9.5, 'MC', ha='center', fontsize=11, weight='bold')
ax.text(right_mid, 22, 'HSC', ha='center', fontsize=10, weight='bold')

# Cut values written just below the lower edge of the plot.
ax.text(LO, ymin - 1.5, '0.8547', ha='center', fontsize=7)
ax.text(HI, ymin - 1.5, '0.95', ha='center', fontsize=7)

ax.set_xlim(xmin, xmax)
ax.set_ylim(ymin, ymax)
ax.set_xlabel('cosine similarity (style)')
ax.set_ylabel('dHash distance (structure)')
ax.set_title('Figure 3. The two measures and the five regions', fontsize=9)
fig.tight_layout()
fig.savefig('/tmp/fig3.png', dpi=200, bbox_inches='tight')
plt.close(fig)
print('figs 1/2/3 OK')
|
||||
@@ -0,0 +1,49 @@
|
||||
import sqlite3, numpy as np
|
||||
from collections import defaultdict

# Analysis database; opened read-only below.
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
# The three firms whose 2013-2019 signatures are analysed.
BCD = ('安侯建業聯合', '資誠聯合', '安永聯合')

# The firm filter is bound from BCD; the constant used to be unused while the
# same names were repeated as SQL literals.
placeholders = ','.join('?' * len(BCD))
c = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
try:
    rows = c.execute(f"""SELECT s.assigned_accountant, s.max_similarity_to_same_accountant, s.min_dhash_independent
        FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
        WHERE a.firm IN ({placeholders})
        AND CAST(substr(s.year_month,1,4) AS INT) BETWEEN 2013 AND 2019
        AND s.max_similarity_to_same_accountant IS NOT NULL AND s.min_dhash_independent IS NOT NULL""", BCD).fetchall()
finally:
    # The connection was previously left open for the rest of the script.
    c.close()

# accountant -> list of (cosine, dHash) pairs
by = defaultdict(list)
for a, cos, dh in rows:
    by[a].append((cos, dh))
# Keep accountants with at least 15 signatures; each value becomes a (k, 2) array.
accs = {a: np.array(v) for a, v in by.items() if len(v) >= 15}
print(f"BCD 2013-2019: {len(accs)} accountants with >=15 signatures (of {len(by)} total)")
|
||||
|
||||
import collections


def _klass_of(rf):
    """Class label from the HC share: <0.10 pure-hand, >0.90 pure-stamp, else mixed."""
    if rf < 0.10:
        return 'pure-hand'
    if rf > 0.90:
        return 'pure-stamp'
    return 'mixed'


# Per accountant:
#   rep     share of signatures with cos > 0.95 and dHash <= 5
#   tight   share of signatures with cos > 0.95
#   rem_med median cosine of the signatures with cos <= 0.95 (NaN if fewer than 3)
#   klass   label derived from rep
rep, tight, rem_med, klass = [], [], [], []
for a, v in accs.items():
    cos, dh = v[:, 0], v[:, 1]
    hc = (cos > 0.95) & (dh <= 5)
    rf = hc.mean()
    tf = (cos > 0.95).mean()
    isolated = cos[cos <= 0.95]
    if len(isolated) >= 3:
        rm = np.median(isolated)
    else:
        rm = np.nan
    rep.append(rf)
    tight.append(tf)
    rem_med.append(rm)
    klass.append(_klass_of(rf))
rep = np.array(rep)
tight = np.array(tight)
rem_med = np.array(rem_med)
klass = np.array(klass)

print("\n=== Per-accountant replication-fraction (HC share) distribution ===")
# Half-open bins; the last upper edge is 1.01 so a share of exactly 1.0 is counted.
for lo, hi in [(0, 0.1), (0.1, 0.3), (0.3, 0.5), (0.5, 0.7), (0.7, 0.9), (0.9, 1.01)]:
    in_bin = (rep >= lo) & (rep < hi)
    n = in_bin.sum()
    print(f" rep_frac [{lo:.1f},{hi:.1f}): {n:3d} accountants")
print(" class counts:", dict(collections.Counter(klass)))
|
||||
|
||||
# ---- accountants labelled 'mixed': how dispersed is the cos<=0.95 remainder? ----
mixed = klass == 'mixed'
print(f"\n=== MIXED accountants (n={mixed.sum()}): is the non-tight remainder dispersed (separable)? ===")
# Remainder medians of mixed accountants, dropping those without one (NaN).
rm_mixed = rem_med[mixed & ~np.isnan(rem_med)]
rm_centre = np.median(rm_mixed)
rm_q1 = np.percentile(rm_mixed, 25)
rm_q3 = np.percentile(rm_mixed, 75)
print(f" remainder (cos<=0.95) median cosine across mixed accountants: median={rm_centre:.3f}, IQR[{rm_q1:.3f},{rm_q3:.3f}]")
print(f" fraction of mixed accountants whose remainder median < 0.90 (clearly dispersed): {(rm_mixed<0.90).mean():.2f}")
print(f" fraction with remainder median < 0.85 (very dispersed): {(rm_mixed<0.85).mean():.2f}")

# Gap between the tight group (cos>0.95) and the remainder (cos<=0.95).
# This runs over every accountant in accs, not only the mixed ones; an
# accountant contributes when both parts have at least three signatures.
gaps = []
for v in accs.values():
    cos = v[:, 0]
    t = cos[cos > 0.95]
    r = cos[cos <= 0.95]
    if len(t) < 3 or len(r) < 3:
        continue
    gaps.append(np.median(t) - np.median(r))
gaps = np.array(gaps)
print(f"\n=== Tight-vs-remainder cosine gap (all accountants with both parts, n={len(gaps)}) ===")
print(f" median gap = {np.median(gaps):.3f} (large gap => two-component structure is real & separable)")
print(f" fraction with gap > 0.10: {(gaps>0.10).mean():.2f}")
|
||||
Reference in New Issue
Block a user