Paper A v13: filled submission draft (rev7) + reproducible build bundle

Fill all 18 placeholders in the condensed v13 submission draft with
data verified against the analysis DB and LOCKED canonical scripts;
close 12/13 co-author review items (only #8b protocol first-run open).

Key changes (need co-author sign-off; see handoff doc):
- Firm A out-of-sample HC 0.01% -> 0.42% (buggy 0.0001 from Script 49
  same-pair bug, propagated v4.2->v13; never reuse 0.0001)
- §III-D empty cell ~=0 -> 7,681 honest reframe (not degenerate crops)
- low cosine cut 0.837 -> 0.8547 primary (BCD 2013-2019 closed-world,
  held-out discipline; 0.8489 confirmed = BCD all-period); HC/MC/HSC
  unchanged, UN/LH move <=0.4pp

Adds Figures 1-5 (real-data plots + schematics), full references,
Appendix A/B, UN/HSC ICCR, n-reconciliation, #13 MOPS-metadata
survival verification, "參" set-level feasibility probe (negative).
Two codex (gpt-5.5) adversarial rounds applied; no fabrication found.

Bundle: paper/v13_build/ (markdown source, harvest/figure scripts,
figures) for reproducibility. Handoff note for co-author included.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-15 03:24:50 +08:00
parent 1e8466f7a8
commit 66c9194fcf
13 changed files with 749 additions and 0 deletions
+79
View File
@@ -0,0 +1,79 @@
import sqlite3, numpy as np
from collections import defaultdict
from scipy.stats import gaussian_kde
# SQLite analysis database; load() opens it through a read-only file: URI.
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'

# Firm A is removed from BIG4 when the "BCD" subsets are built further down.
FIRM_A = '勤業眾信聯合'
BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合')

# One fixed seed shared by every sampler in this script.
SEED = 42

# POP[b] is the number of set bits in byte value b (0..255); it turns the XOR
# of two dHash byte vectors into a Hamming distance.
POP = np.unpackbits(
    np.arange(256, dtype=np.uint8)[:, None], axis=1
).sum(axis=1).astype(np.uint8)
def load():
    """Fetch every usable signature row from the analysis database.

    The database is opened read-only. Each returned tuple is
    (assigned_accountant, firm, source_pdf, feature_vector, dhash_vector, year),
    where year is the first four characters of year_month cast to INT. Rows
    lacking an accountant, a firm, a feature vector or a dHash vector are
    excluded by the query.

    Returns:
        list[tuple]: all matching rows.
    """
    c = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
    try:
        return c.execute("""SELECT s.assigned_accountant,a.firm,s.source_pdf,s.feature_vector,s.dhash_vector,
CAST(substr(s.year_month,1,4) AS INT) FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
WHERE s.assigned_accountant IS NOT NULL AND a.firm IS NOT NULL AND s.feature_vector IS NOT NULL AND s.dhash_vector IS NOT NULL""").fetchall()
    finally:
        # Close even when the query fails; the original leaked the handle then.
        c.close()
def crossover(keep, label):
    """Print the cosine values where the intra- and inter-accountant KDEs cross.

    Feature vectors (row[3]) are L2-normalised, so a dot product is a cosine.
    Only accountants with at least three signatures take part. 100,000
    same-accountant pairs are drawn, choosing the accountant with probability
    proportional to its number of distinct pairs, and 100,000 pairs across two
    different accountants. A Gaussian KDE is fitted to each sample and the
    sign changes of their difference on a 10,000-point grid over [0.3, 1.0]
    are reported, restricted to the open interval (0.6, 0.99).

    Args:
        keep: rows as returned by load().
        label: tag shown in the printed line.
    """
    n_pairs = 100_000

    vecs = np.stack([np.frombuffer(row[3], np.float32) for row in keep]).astype(np.float32)
    vecs /= np.clip(np.linalg.norm(vecs, axis=1, keepdims=True), 1e-9, None)

    # Signature positions per accountant, keeping those with >= 3 signatures.
    names = np.array([row[0] for row in keep])
    grouped = defaultdict(list)
    for pos, name in enumerate(names):
        grouped[name].append(pos)
    members = {name: np.array(idx) for name, idx in grouped.items() if len(idx) >= 3}
    accts = list(members)

    # Weight each accountant by its number of unordered signature pairs.
    weights = np.array(
        [len(members[name]) * (len(members[name]) - 1) / 2 for name in accts], float
    )
    weights /= weights.sum()

    rng = np.random.default_rng(SEED)

    intra = np.empty(n_pairs, np.float32)
    picks = rng.choice(len(accts), n_pairs, p=weights)
    for t, pick in enumerate(picks):
        a, b = rng.choice(members[accts[pick]], 2, replace=False)
        intra[t] = vecs[a] @ vecs[b]

    inter = np.empty(n_pairs, np.float32)
    for t in range(n_pairs):
        i, j = rng.choice(len(accts), 2, replace=False)
        # Draw the left signature before the right one (fixed RNG order).
        left = vecs[rng.choice(members[accts[i]])]
        right = vecs[rng.choice(members[accts[j]])]
        inter[t] = left @ right

    grid = np.linspace(0.3, 1.0, 10000)
    diff = gaussian_kde(intra)(grid) - gaussian_kde(inter)(grid)
    flips = np.where(np.diff(np.sign(diff)))[0]
    cr = [float(x) for x in grid[flips] if 0.6 < x < 0.99]
    print(f' [{label}] crossover {[f"{x:.4f}" for x in cr]} (n={len(keep)}, accts>=3={len(accts)})')
def percomp_bands(keep, label, M=500_000, low_cut=0.837):
    """Print per-comparison band rates for random cross-accountant pairs.

    2*M index pairs are drawn uniformly with replacement from `keep`, pairs
    whose two signatures share an accountant are dropped, and the first M
    survivors are kept (fewer if not enough survive). Each pair gets a cosine
    (dot product of L2-normalised feature vectors) and a dHash Hamming
    distance, and is placed in one band:

        HC   cos > 0.95 and d <= 5
        MC   cos > 0.95 and 5 < d <= 15
        HSC  cos > 0.95 and d > 15
        UN   low_cut < cos <= 0.95
        LH   cos <= low_cut

    Args:
        keep: rows as returned by load().
        label: tag shown in the printed line.
        M: maximum number of cross-accountant pairs evaluated.
        low_cut: lower cosine cut separating UN from LH. The default keeps the
            0.837 this script has always used.
            NOTE(review): the figure scripts draw this cut at 0.8547; pass it
            explicitly if the bands should follow that value.
    """
    feats = np.stack([np.frombuffer(r[3], np.float32) for r in keep]).astype(np.float32)
    feats /= np.clip(np.linalg.norm(feats, axis=1, keepdims=True), 1e-9, None)
    dh = np.stack([np.frombuffer(r[4], np.uint8) for r in keep])
    cpas = np.array([r[0] for r in keep])
    # The original also built a per-accountant index here that nothing read;
    # it is dropped. It never touched the RNG, so sampled pairs are unchanged.
    rng = np.random.default_rng(SEED)
    n = len(keep)
    ii = rng.integers(0, n, M * 2)
    jj = rng.integers(0, n, M * 2)
    cross = cpas[ii] != cpas[jj]
    ii = ii[cross][:M]
    jj = jj[cross][:M]
    cos = np.einsum('ij,ij->i', feats[ii], feats[jj])
    # XOR of dHash bytes, then popcount per byte, summed per pair.
    d = POP[dh[ii] ^ dh[jj]].sum(1)
    hc = (cos > 0.95) & (d <= 5)
    mc = (cos > 0.95) & (d > 5) & (d <= 15)
    hsc = (cos > 0.95) & (d > 15)
    un = (cos > low_cut) & (cos <= 0.95)
    lh = cos <= low_cut
    print(f' [{label}] per-COMPARISON ICCR (M={len(ii)}): HC {hc.mean():.6f} MC {mc.mean():.6f} HSC {hsc.mean():.6f} UN {un.mean():.4f} LH {lh.mean():.4f}')
def persig_perdoc_bands(keep, label, low_cut=0.837):
    """Print per-signature and per-report UN / HSC rates against a random pool.

    For every signature whose accountant has p+1 signatures (p >= 1), p
    signatures from *other* accountants are sampled; the signature's score is
    the maximum cosine and the minimum dHash Hamming distance to that pool.
    A report (source_pdf, row[2]) counts as being in a band when any of its
    signatures is.

    Args:
        keep: rows as returned by load().
        label: tag shown in the printed lines.
        low_cut: lower cosine cut of the UN band; defaults to the 0.837 this
            script has always used.
            NOTE(review): the figure scripts draw this cut at 0.8547.
    """
    n = len(keep)
    feats = np.stack([np.frombuffer(r[3], np.float32) for r in keep]).astype(np.float32)
    feats /= np.clip(np.linalg.norm(feats, axis=1, keepdims=True), 1e-9, None)
    dh = np.stack([np.frombuffer(r[4], np.uint8) for r in keep])
    cpas = np.array([r[0] for r in keep])
    docs = np.array([r[2] for r in keep])

    ci = defaultdict(list)
    for i, c in enumerate(cpas):
        ci[c].append(i)
    ci = {c: np.array(v) for c, v in ci.items()}
    ps = {c: len(v) - 1 for c, v in ci.items()}

    allidx = np.arange(n)
    rng = np.random.default_rng(SEED)
    mc = np.zeros(n, np.float32)
    md = np.full(n, 64, np.int32)
    for si in range(n):
        p = ps[cpas[si]]
        if p <= 0:
            continue
        same = ci[cpas[si]]
        need = p
        cand = []
        att = 0
        # Rejection sampling: over-draw by 2x and discard same-accountant hits.
        while need > 0 and att < 10:
            dr = rng.choice(n, size=need * 2, replace=True)
            ok = dr[~np.isin(dr, same)]
            cand.extend(ok[:need].tolist())
            need -= len(ok[:need])
            att += 1
        if need > 0:
            # Exhaustive fallback, matching canonical_sampler in the companion
            # ICCR script. Without it a short or empty candidate list reached
            # the scoring line and .max() failed on an empty array.
            other = np.ones(n, bool)
            other[same] = False
            pool = allidx[other]
            if pool.size:
                cand.extend(rng.choice(pool, size=need, replace=pool.size < need).tolist())
        if not cand:
            # No other accountant exists to compare against; keep the defaults.
            continue
        cand = np.array(cand[:p], dtype=np.int64)
        mc[si] = (feats[cand] @ feats[si]).max()
        md[si] = int(POP[dh[cand] ^ dh[si]].sum(1).min())

    un = (mc > low_cut) & (mc <= 0.95)
    hsc = (mc > 0.95) & (md > 15)
    # per-doc: any signature in band
    dd = defaultdict(list)
    for i in range(n):
        dd[docs[i]].append(i)
    docs_un = np.mean([un[v].any() for v in dd.values()])
    docs_hsc = np.mean([hsc[v].any() for v in dd.values()])
    print(f' [{label}] per-SIGNATURE ICCR: UN {un.mean():.4f} HSC {hsc.mean():.6f}')
    print(f' [{label}] per-REPORT ICCR: UN {docs_un:.4f} HSC {docs_hsc:.6f} (n_doc={len(dd)})')
# Driver: load once, derive the two BCD subsets, then run the three items.
rows = load()
bcd_all = [row for row in rows if row[1] != FIRM_A and row[1] in BIG4]
bcd_19 = [row for row in bcd_all if 2013 <= row[5] <= 2019]

print("=== ITEM 11: KDE crossover (verify corpus 0.837 / BCD-all 0.8489, then closed-world 2013-2019) ===")
for subset, tag in (
    (rows, 'corpus-wide (verify ~0.8367)'),
    (bcd_all, 'BCD-only ALL period (verify 0.8489)'),
    (bcd_19, 'BCD 2013-2019 CLOSED-WORLD (NEW primary candidate)'),
):
    crossover(subset, tag)

print("\n=== ITEM 3: UN / HSC full ICCR on BCD 2013-2019 ===")
for band_report in (percomp_bands, persig_perdoc_bands):
    band_report(bcd_19, 'BCD 2013-2019')

print("\n=== ITEM 12: n reconciliation ===")
# NOTE(review): bcd_all carries no year filter here; the "(2013-2023)" in the
# label presumably describes the span of the data itself - confirm.
print(f" BCD full-period (2013-2023) signatures = {len(bcd_all)} <- Script53 logged n=89,994")
print(f" BCD 2013-2019 signatures = {len(bcd_19)} <- headline ICCR base (reproduces 0.0059)")
+70
View File
@@ -0,0 +1,70 @@
import sqlite3
from collections import defaultdict, Counter
import numpy as np
# SQLite analysis database; load() opens it through a read-only file: URI.
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'

# Firm A is scored out-of-sample against the other three firms ("BCD").
FIRM_A = '勤業眾信聯合'
BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合')

# Anonymised single-letter alias for each firm, in BIG4 order.
ALIAS = dict(zip(BIG4, 'ABCD'))

# One fixed seed shared by every sampler in this script.
SEED = 42

# POP[b] is the number of set bits in byte value b (0..255); it turns the XOR
# of two dHash byte vectors into a Hamming distance.
POP = np.unpackbits(
    np.arange(256, dtype=np.uint8)[:, None], axis=1
).sum(axis=1).astype(np.uint8)
def wilson(k, n, z=1.96):
    """Return the Wilson score interval for k successes out of n trials.

    Args:
        k: number of successes.
        n: number of trials.
        z: normal quantile (1.96 by default).

    Returns:
        (lower, upper), clamped to [0, 1]; (None, None) when n is 0.
    """
    if n == 0:
        return (None, None)
    phat = k / n
    denom = 1 + z * z / n
    centre = (phat + z * z / (2 * n)) / denom
    half_width = z * np.sqrt(phat * (1 - phat) / n + z * z / (4 * n * n)) / denom
    lower = max(0, centre - half_width)
    upper = min(1, centre + half_width)
    return (lower, upper)
def load():
    """Fetch every usable signature row from the analysis database.

    The database is opened read-only. Each returned tuple is
    (assigned_accountant, firm, source_pdf, feature_vector, dhash_vector, year),
    where year is the first four characters of year_month cast to INT. Rows
    lacking an accountant, a firm, a feature vector or a dHash vector are
    excluded by the query.

    Returns:
        list[tuple]: all matching rows.
    """
    c = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
    try:
        cur = c.cursor()
        cur.execute("""SELECT s.assigned_accountant,a.firm,s.source_pdf,s.feature_vector,s.dhash_vector,
CAST(substr(s.year_month,1,4) AS INT) FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
WHERE s.assigned_accountant IS NOT NULL AND a.firm IS NOT NULL AND s.feature_vector IS NOT NULL AND s.dhash_vector IS NOT NULL""")
        return cur.fetchall()
    finally:
        # Close even when the query fails; the original leaked the handle then.
        c.close()
def canonical_sampler(rng, n, n_pool, same_cpa, all_idx):
    """Draw n_pool indices in [0, n) that do not belong to `same_cpa`.

    Up to ten rounds of rejection sampling are tried: each round draws twice
    the number still missing (with replacement) and keeps the draws that are
    not in `same_cpa`. If indices are still missing afterwards, the rest is
    drawn without replacement from every index outside `same_cpa`.

    Args:
        rng: numpy random Generator; its state is advanced.
        n: population size.
        n_pool: number of indices wanted.
        same_cpa: indices to exclude.
        all_idx: np.arange(n), supplied by the caller.

    Returns:
        int64 array of n_pool indices (duplicates are possible).
    """
    picked = []
    missing = n_pool
    for _ in range(10):
        if missing <= 0:
            break
        draw = rng.choice(n, size=missing * 2, replace=True)
        accepted = draw[~np.isin(draw, same_cpa)][:missing]
        picked.extend(accepted.tolist())
        missing -= len(accepted)
    if missing > 0:
        allowed = np.ones(n, bool)
        allowed[same_cpa] = False
        picked.extend(rng.choice(all_idx[allowed], size=missing, replace=False).tolist())
    return np.array(picked[:n_pool], dtype=np.int64)
def simulate(keep):
    """Score each signature against a random pool drawn from other accountants.

    A signature whose accountant has p+1 signatures (p >= 1) gets a pool of p
    indices from canonical_sampler. Its score is the maximum cosine (dot
    product of L2-normalised feature vectors) and the minimum dHash Hamming
    distance to that pool. Signatures with p <= 0 keep the defaults 0.0 and 64.

    Args:
        keep: rows as returned by load().

    Returns:
        (max_cosine float32 array, min_dhash int32 array), one entry per row.
    """
    total = len(keep)

    vecs = np.stack([np.frombuffer(row[3], np.float32) for row in keep]).astype(np.float32)
    norms = np.linalg.norm(vecs, axis=1, keepdims=True)
    norms[norms == 0] = 1  # leave all-zero vectors as they are
    vecs = vecs / norms

    hashes = np.stack([np.frombuffer(row[4], np.uint8) for row in keep])
    owners = np.array([row[0] for row in keep])

    grouped = defaultdict(list)
    for pos, owner in enumerate(owners):
        grouped[owner].append(pos)
    members = {owner: np.array(idx) for owner, idx in grouped.items()}

    every = np.arange(total)
    rng = np.random.default_rng(SEED)
    best_cos = np.zeros(total, np.float32)
    best_dh = np.full(total, 64, np.int32)
    for si, owner in enumerate(owners):
        own = members[owner]
        pool_size = len(own) - 1
        if pool_size <= 0:
            continue
        cand = canonical_sampler(rng, total, pool_size, own, every)
        best_cos[si] = (vecs[cand] @ vecs[si]).max()
        best_dh[si] = int(POP[hashes[cand] ^ hashes[si]].sum(axis=1).min())
    return best_cos, best_dh
def iccr(keep, label, low_cut=0.837):
    """Print per-signature band rates with Wilson intervals for `keep`.

    Bands are computed from the (max cosine, min dHash) pair returned by
    simulate():

        HC        cos > 0.95 and d <= 5
        HC+MC     cos > 0.95 and d <= 15
        UN-band   low_cut < cos <= 0.95
        HSC-band  cos > 0.95 and d > 15

    Args:
        keep: rows as returned by load().
        label: heading for the printed block.
        low_cut: lower cosine cut of the UN band; defaults to the 0.837 this
            script has always used.
            NOTE(review): the figure scripts draw this cut at 0.8547.
    """
    n = len(keep)
    print(f"\n== {label} (n_sig={n:,}) ==")
    if n == 0:
        # Previously this crashed: np.stack([]) in simulate(), then k/n and
        # formatting the None bounds from wilson() would also fail.
        print(" (no signatures; ICCR undefined)")
        return
    mc, md = simulate(keep)
    hc = (mc > 0.95) & (md <= 5)
    d2 = (mc > 0.95) & (md <= 15)
    un = (mc > low_cut) & (mc <= 0.95)
    hsc = (mc > 0.95) & (md > 15)
    for nm, a in [('HC', hc), ('HC+MC', d2), ('UN-band', un), ('HSC-band', hsc)]:
        k = int(a.sum())
        lo, hi = wilson(k, n)
        print(f" ICCR per-sig {nm}: {k/n:.6f} ({k}/{n}) [{lo:.5f},{hi:.5f}]")
def a_oos(rows, label):
    """Print Firm A's out-of-sample HC rate against a pool of BCD signatures.

    Every Firm A signature whose accountant has p+1 Firm A signatures (p >= 1)
    is compared with p signatures drawn with replacement from the BCD rows
    (firms in BIG4 other than FIRM_A). It counts as HC when the maximum cosine
    exceeds 0.95 and the minimum dHash Hamming distance is at most 5. The rate
    is reported over all Firm A signatures with a Wilson interval.

    Args:
        rows: rows as returned by load(); Firm A and BCD are both taken from it.
        label: tag describing the period of `rows`, used in the printed line.
    """
    A = [r for r in rows if r[1] == FIRM_A]
    BCD = [r for r in rows if r[1] in BIG4 and r[1] != FIRM_A]
    if not A or not BCD:
        # Previously an empty side crashed (np.stack([]), rng.choice(0, ...),
        # or k/n with n == 0); report and skip instead.
        print(f"\n== Firm A OOS vs {label} BCD pool == skipped (n_A={len(A)}, n_BCD={len(BCD)})")
        return

    bf = np.stack([np.frombuffer(r[3], np.float32) for r in BCD]).astype(np.float32)
    bn = np.linalg.norm(bf, axis=1, keepdims=True)
    bn[bn == 0] = 1
    bf = bf / bn
    bdh = np.stack([np.frombuffer(r[4], np.uint8) for r in BCD])
    nb = bf.shape[0]

    # Pool size per accountant = that accountant's Firm A signature count - 1.
    ps = {c: cnt - 1 for c, cnt in Counter(r[0] for r in A).items()}

    rng = np.random.default_rng(SEED)
    hc = np.zeros(len(A), bool)
    for i, r in enumerate(A):
        p = ps[r[0]]
        if p <= 0:
            continue
        cand = rng.choice(nb, size=p, replace=True)
        sf = np.frombuffer(r[3], np.float32).astype(np.float32)
        sf = sf / max(np.linalg.norm(sf), 1e-9)
        sdh = np.frombuffer(r[4], np.uint8)
        mc = (bf[cand] @ sf).max()
        mdv = int(POP[bdh[cand] ^ sdh].sum(axis=1).min())
        hc[i] = (mc > 0.95) and (mdv <= 5)

    k = int(hc.sum())
    n = len(A)
    lo, hi = wilson(k, n)
    print(f"\n== Firm A OOS vs {label} BCD pool == per-sig HC: {k/n:.6f} ({k}/{n}) [{lo:.6f},{hi:.6f}]")
# Driver: verify the BCD 2013-2019 ICCR, then score Firm A out-of-sample
# against the 2013-2019 pool and against the full-period pool.
def _in_2013_2019(row):
    """True when the row's year (row[5]) lies in 2013..2019 inclusive."""
    return 2013 <= row[5] <= 2019


rows = load()
bcd_all = [row for row in rows if row[1] != FIRM_A and row[1] in BIG4]
bcd_19 = list(filter(_in_2013_2019, bcd_all))
iccr(bcd_19, 'BCD 2013-2019 (verify per-sig HC~0.0059)')
for subset, tag in (
    (list(filter(_in_2013_2019, rows)), '2013-2019'),
    (rows, 'full-period'),
):
    a_oos(subset, tag)
+58
View File
@@ -0,0 +1,58 @@
import sqlite3, numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
# SQLite analysis database, opened read-only below.
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
# Firm name -> single-letter alias, and alias -> plot colour.
ALIAS = {'勤業眾信聯合': 'A', '安侯建業聯合': 'B', '資誠聯合': 'C', '安永聯合': 'D'}
COL = {'A': '#c0392b', 'B': '#2980b9', 'C': '#27ae60', 'D': '#8e44ad'}

# The firm filter is bound from ALIAS so the query and the alias table cannot
# drift apart (the original repeated the four names inside the SQL text).
_firm_marks = ','.join('?' for _ in ALIAS)
c = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
try:
    rows = c.execute(f"""SELECT a.firm, s.max_similarity_to_same_accountant, s.min_dhash_independent,
s.assigned_accountant, CAST(substr(s.year_month,1,4) AS INT)
FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
WHERE s.max_similarity_to_same_accountant IS NOT NULL AND s.min_dhash_independent IS NOT NULL
AND a.firm IN ({_firm_marks})""", tuple(ALIAS)).fetchall()
finally:
    # The original never closed this connection.
    c.close()

# Column arrays, one entry per signature row.
firm = np.array([ALIAS[r[0]] for r in rows])
cos = np.array([r[1] for r in rows], float)
dh = np.array([r[2] for r in rows], float)
acc = np.array([r[3] for r in rows])
yr = np.array([r[4] for r in rows])
# Boolean masks: Firm A versus the other three firms.
A = firm == 'A'
BCD = np.isin(firm, ['B', 'C', 'D'])
# ---- Figure 4: two panels, Firm A vs BCD ----
# Left panel: histogram of max same-accountant cosine; right panel: histogram
# of min dHash (clipped to 0..20). Firm A is drawn first, then B/C/D.
# NOTE(review): the median annotations and the caption figures are literal
# text, not computed from the arrays loaded above - confirm they still match.
fig, ax = plt.subplots(1, 2, figsize=(9, 3.4))
cos_ax, dh_ax = ax
groups = (
    (A, 0.6, '#c0392b', 'Firm A'),
    (BCD, 0.5, '#34495e', 'Firms B/C/D'),
)

for mask, alpha, colour, name in groups:
    cos_ax.hist(cos[mask], bins=np.linspace(0.7, 1.0, 60), density=True,
                alpha=alpha, color=colour, label=name)
cos_ax.axvline(0.95, ls='--', c='k', lw=0.8)
cos_ax.axvline(0.8547, ls=':', c='gray', lw=0.8)
cos_ax.set_title('(a) Within-accountant cosine', fontsize=10)
cos_ax.set_xlabel('max cosine to same accountant')
cos_ax.set_ylabel('density')
cos_ax.text(0.952, cos_ax.get_ylim()[1] * 0.9, '0.95', fontsize=7)
cos_ax.legend(fontsize=8, frameon=False)
for text, tip_x, text_x, height, colour in (
    ('A median 0.986', 0.986, 0.80, 0.55, '#c0392b'),
    ('B/C/D median 0.959', 0.959, 0.72, 0.35, '#34495e'),
):
    cos_ax.annotate(text, (tip_x, 0), (text_x, cos_ax.get_ylim()[1] * height),
                    fontsize=7, color=colour,
                    arrowprops=dict(arrowstyle='->', color=colour, lw=0.7))

bins = np.arange(0, 21) - 0.5
for mask, alpha, colour, name in groups:
    dh_ax.hist(np.clip(dh[mask], 0, 20), bins=bins, density=True,
               alpha=alpha, color=colour, label=name)
dh_ax.axvline(5, ls='--', c='k', lw=0.8)
dh_ax.set_title('(b) Within-accountant dHash', fontsize=10)
dh_ax.set_xlabel('min dHash to same accountant')
dh_ax.set_ylabel('density')
dh_ax.text(5.1, dh_ax.get_ylim()[1] * 0.9, '5', fontsize=7)
dh_ax.legend(fontsize=8, frameon=False)
dh_ax.text(0.50, 0.62, 'A median 2 / B,C,D median 7', transform=dh_ax.transAxes, fontsize=7)

fig.text(0.5, -0.02, 'Cross-firm held-out HC rate 0.42% sits at/below the clean reference ICCR 0.59%; within-Firm-A HC rate is 82%.', ha='center', fontsize=7, style='italic')
fig.tight_layout()
fig.savefig('/tmp/fig4.png', dpi=200, bbox_inches='tight')
plt.close(fig)
# ---- Figure 5: per-accountant HC rate, ranked, per period ----
import os
from matplotlib.lines import Line2D


def hc_by_acc(mask):
    """Return {accountant: (HC rate, firm alias)} for the rows selected by mask.

    A row is HC when cos > 0.95 and dh <= 5. Accountants with fewer than five
    selected rows are left out. The firm alias is taken from the accountant's
    first selected row.
    """
    out = {}
    a = acc[mask]
    h = ((cos[mask] > 0.95) & (dh[mask] <= 5)).astype(float)
    f = firm[mask]
    for ai in np.unique(a):
        m = a == ai
        if m.sum() >= 5:
            out[ai] = (h[m].mean(), f[m][0])
    return out


fig, ax = plt.subplots(1, 2, figsize=(9, 3.4), sharey=True)
# Panel titles carry an en dash; the original strings had lost it and read
# "20132019" / "20202023".
for j, (lo, hi, ttl) in enumerate([(2013, 2019, '(a) 2013–2019'), (2020, 2023, '(b) 2020–2023')]):
    # The original wrapped this mask in `BCD|A if False else (...)`, which
    # always took the else branch; only the year window selects rows.
    d = hc_by_acc((yr >= lo) & (yr <= hi))
    items = sorted(d.items(), key=lambda kv: -kv[1][0])
    xs = np.arange(len(items))
    ys = [v[0] * 100 for _, v in items]
    cs = [COL[v[1]] for _, v in items]
    ax[j].scatter(xs, ys, c=cs, s=10)
    ax[j].set_title(ttl, fontsize=10)
    ax[j].set_xlabel('accountant rank')
    if j == 0:
        ax[j].set_ylabel('per-accountant HC rate (%)')
ax[1].legend([Line2D([0], [0], marker='o', ls='', color=COL[k]) for k in 'ABCD'],
             ['Firm A', 'Firm B', 'Firm C', 'Firm D'], fontsize=7, frameon=False, loc='upper right')
fig.tight_layout()
fig.savefig('/tmp/fig5.png', dpi=200, bbox_inches='tight')
plt.close(fig)
print('figs OK', os.path.getsize('/tmp/fig4.png'), os.path.getsize('/tmp/fig5.png'))
+75
View File
@@ -0,0 +1,75 @@
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from matplotlib.patches import FancyBboxPatch, FancyArrowPatch, Rectangle
import numpy as np
# ============ Figure 1: data split grid ============
fig, ax = plt.subplots(figsize=(7, 3.2))
firms = ['Firm A', 'Firm B', 'Firm C', 'Firm D']
# Period labels carry an en dash; the original strings had lost it and were
# rendered on the x axis as "20132019" / "20202023".
CALIBRATION_PERIOD = '2013–2019'
LATER_PERIOD = '2020–2023'
periods = [CALIBRATION_PERIOD, LATER_PERIOD]


# role per (row firm, col period)
def role(f, p):
    """Return (cell text, fill colour) for firm label f in period label p."""
    if f == 'Firm A':
        # Firm A is held out in both periods.
        return ('Held-out test 1\n(Firm A, full record)', '#c0392b')
    if p == CALIBRATION_PERIOD:
        return ('Calibration\n(clean reference)', '#27ae60')
    return ('Held-out test 2\n(secondary)', '#2980b9')


for i, f in enumerate(firms):
    for j, p in enumerate(periods):
        txt, col = role(f, p)
        # Row 0 (Firm A) is drawn at the top of the grid.
        ax.add_patch(Rectangle((j, len(firms) - 1 - i), 1, 1, facecolor=col, alpha=0.30, edgecolor='black', lw=1))
        ax.text(j + 0.5, len(firms) - 1 - i + 0.5, txt, ha='center', va='center', fontsize=6.5)
ax.set_xlim(0, 2)
ax.set_ylim(0, 4)
ax.set_xticks([0.5, 1.5])
ax.set_xticklabels(periods, fontsize=9)
ax.set_yticks([3.5, 2.5, 1.5, 0.5])
ax.set_yticklabels(firms, fontsize=9)
ax.tick_params(length=0)
for s in ax.spines.values():
    s.set_visible(False)
ax.set_title('Figure 1. Data split: calibrate on the clean cell, test everything else', fontsize=9)
fig.tight_layout()
fig.savefig('/tmp/fig1.png', dpi=200, bbox_inches='tight')
plt.close(fig)
# ============ Figure 2: pipeline ============
# Six rounded boxes laid out left to right in axes coordinates, joined by
# arrows; the axes frame itself is hidden.
fig, ax = plt.subplots(figsize=(9, 2.5))
steps = [
    'Raw PDF\nreport',
    'Find signature\npage (VLM)',
    'Detect signatures\n(YOLOv11)\n+ red-stamp removal',
    'Feature extraction\n(ResNet-50, 2048-d)',
    'Two similarities\ncosine (style)\nmin dHash (structure)',
    'Five-way\nlabel',
]
cols = ['#ecf0f1', '#d6eaf8', '#d5f5e3', '#fcf3cf', '#fadbd8', '#e8daef']
n = len(steps)
w = 1.0 / n  # horizontal slot per step
last = n - 1
for i, (s, c) in enumerate(zip(steps, cols)):
    x = i * w + 0.01
    box_w = w - 0.02
    box = FancyBboxPatch((x, 0.30), box_w, 0.40,
                         boxstyle='round,pad=0.005,rounding_size=0.02',
                         facecolor=c, edgecolor='black', lw=1, transform=ax.transAxes)
    ax.add_patch(box)
    ax.text(x + box_w / 2, 0.50, s, ha='center', va='center', fontsize=6.8, transform=ax.transAxes)
    if i == last:
        continue
    # Short arrow bridging the gap to the next box.
    arrow = FancyArrowPatch((x + w - 0.012, 0.50), (x + w + 0.002, 0.50), transform=ax.transAxes,
                            arrowstyle='-|>', mutation_scale=10, lw=1.2, color='black')
    ax.add_patch(arrow)
ax.axis('off')
ax.set_title('Figure 2. The screening pipeline', fontsize=9, y=0.92)
fig.savefig('/tmp/fig2.png', dpi=200, bbox_inches='tight')
plt.close(fig)
# ============ Figure 3: two-measure plane, five regions ============
fig, ax = plt.subplots(figsize=(5.2, 4.2))
LO, HI = 0.8547, 0.95   # cosine cuts
DH1, DH2 = 5, 15        # dHash cuts inside the high-cosine band
xmin, xmax = 0.70, 1.005
ymin, ymax = -1, 30

# (x0, y0, width, height, colour, alpha), drawn in this order.
regions = (
    (xmin, ymin, LO - xmin, ymax - ymin, '#bdc3c7', 0.5),   # LH (cos<=LO): whole column
    (LO, ymin, HI - LO, ymax - ymin, '#f7dc6f', 0.5),       # UN (LO<cos<=HI)
    (HI, ymin, xmax - HI, DH1 - ymin, '#cb4335', 0.55),     # HC dHash<=5
    (HI, DH1, xmax - HI, DH2 - DH1, '#eb984e', 0.55),       # MC 5<dHash<=15
    (HI, DH2, xmax - HI, ymax - DH2, '#aed6f1', 0.6),       # HSC dHash>15
)
for x0, y0, width, height, colour, alpha in regions:
    ax.add_patch(Rectangle((x0, y0), width, height, facecolor=colour, alpha=alpha))

# Boundary lines: the two cosine cuts, then the two dHash cuts.
ax.axvline(LO, color='gray', ls=':', lw=1)
ax.axvline(HI, color='black', ls='--', lw=1)
for level in (DH1, DH2):
    ax.plot([HI, xmax], [level, level], 'k--', lw=0.8)

# Region names: (x, y, text, extra text keywords).
region_labels = (
    ((xmin + LO) / 2, 22, 'LH', dict(fontsize=11)),
    ((LO + HI) / 2, 22, 'UN', dict(fontsize=11)),
    ((HI + xmax) / 2, 2, 'HC', dict(fontsize=11, color='white')),
    ((HI + xmax) / 2, 9.5, 'MC', dict(fontsize=11)),
    ((HI + xmax) / 2, 22, 'HSC', dict(fontsize=10)),
)
for x, y, text, extra in region_labels:
    ax.text(x, y, text, ha='center', weight='bold', **extra)

# Numeric cut values written just below the plotting area.
for cut, text in ((LO, '0.8547'), (HI, '0.95')):
    ax.text(cut, ymin - 1.5, text, ha='center', fontsize=7)

ax.set_xlim(xmin, xmax)
ax.set_ylim(ymin, ymax)
ax.set_xlabel('cosine similarity (style)')
ax.set_ylabel('dHash distance (structure)')
ax.set_title('Figure 3. The two measures and the five regions', fontsize=9)
fig.tight_layout()
fig.savefig('/tmp/fig3.png', dpi=200, bbox_inches='tight')
plt.close(fig)
print('figs 1/2/3 OK')
+49
View File
@@ -0,0 +1,49 @@
import sqlite3, numpy as np
from collections import Counter, defaultdict

# SQLite analysis database, opened read-only below.
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
# The three firms other than Firm A; bound into the query below (the original
# defined this tuple but repeated the names inside the SQL text).
BCD = ('安侯建業聯合', '資誠聯合', '安永聯合')

_firm_marks = ','.join('?' for _ in BCD)
c = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
try:
    rows = c.execute(f"""SELECT s.assigned_accountant, s.max_similarity_to_same_accountant, s.min_dhash_independent
FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
WHERE a.firm IN ({_firm_marks})
AND CAST(substr(s.year_month,1,4) AS INT) BETWEEN 2013 AND 2019
AND s.max_similarity_to_same_accountant IS NOT NULL AND s.min_dhash_independent IS NOT NULL""", BCD).fetchall()
finally:
    # The original never closed this connection.
    c.close()

# Group (cosine, dHash) pairs per accountant; keep those with >= 15 signatures.
by = defaultdict(list)
for a, cos, dh in rows:
    by[a].append((cos, dh))
accs = {a: np.array(v) for a, v in by.items() if len(v) >= 15}
print(f"BCD 2013-2019: {len(accs)} accountants with >=15 signatures (of {len(by)} total)")

# Per accountant: HC share, median cosine of the non-tight remainder, and a
# three-way class derived from the HC share.
rep = []
rem_med = []
klass = []
for v in accs.values():
    cos = v[:, 0]
    dh = v[:, 1]
    rf = ((cos > 0.95) & (dh <= 5)).mean()
    isolated = cos[cos <= 0.95]
    # Need at least three remainder signatures for a usable median.
    rm = np.median(isolated) if len(isolated) >= 3 else np.nan
    rep.append(rf)
    rem_med.append(rm)
    klass.append('pure-hand' if rf < 0.10 else ('pure-stamp' if rf > 0.90 else 'mixed'))
# Count classes from the plain-str list so the printed dict shows ordinary
# string keys regardless of how numpy renders its own string scalars.
class_counts = dict(Counter(klass))
rep = np.array(rep)
rem_med = np.array(rem_med)
klass = np.array(klass)

print("\n=== Per-accountant replication-fraction (HC share) distribution ===")
for lo, hi in [(0, 0.1), (0.1, 0.3), (0.3, 0.5), (0.5, 0.7), (0.7, 0.9), (0.9, 1.01)]:
    n = ((rep >= lo) & (rep < hi)).sum()
    print(f" rep_frac [{lo:.1f},{hi:.1f}): {n:3d} accountants")
print(" class counts:", class_counts)

mixed = klass == 'mixed'
print(f"\n=== MIXED accountants (n={mixed.sum()}): is the non-tight remainder dispersed (separable)? ===")
rm_mixed = rem_med[mixed & ~np.isnan(rem_med)]
print(f" remainder (cos<=0.95) median cosine across mixed accountants: median={np.median(rm_mixed):.3f}, IQR[{np.percentile(rm_mixed,25):.3f},{np.percentile(rm_mixed,75):.3f}]")
print(f" fraction of mixed accountants whose remainder median < 0.90 (clearly dispersed): {(rm_mixed<0.90).mean():.2f}")
print(f" fraction with remainder median < 0.85 (very dispersed): {(rm_mixed<0.85).mean():.2f}")

# Gap between the tight group (cos>0.95) and the remainder, for every
# accountant (not only the mixed ones) that has at least three of each.
gaps = []
for v in accs.values():
    cos = v[:, 0]
    t = cos[cos > 0.95]
    r = cos[cos <= 0.95]
    if len(t) >= 3 and len(r) >= 3:
        gaps.append(np.median(t) - np.median(r))
gaps = np.array(gaps)
print(f"\n=== Tight-vs-remainder cosine gap (all accountants with both parts, n={len(gaps)}) ===")
print(f" median gap = {np.median(gaps):.3f} (large gap => two-component structure is real & separable)")
print(f" fraction with gap > 0.10: {(gaps>0.10).mean():.2f}")