Paper A v13: filled submission draft (rev7) + reproducible build bundle

Fill all 18 placeholders in the condensed v13 submission draft with data verified against the analysis DB and LOCKED canonical scripts; close 12/13 co-author review items (only #8b protocol first-run open). Key changes (need co-author sign-off; see handoff doc): - Firm A out-of-sample HC 0.01% -> 0.42% (buggy 0.0001 from Script 49 same-pair bug, propagated v4.2->v13; never reuse 0.0001) - §III-D empty cell ~=0 -> 7,681 honest reframe (not degenerate crops) - low cosine cut 0.837 -> 0.8547 primary (BCD 2013-2019 closed-world, held-out discipline; 0.8489 confirmed = BCD all-period); HC/MC/HSC unchanged, UN/LH move <=0.4pp Adds Figures 1-5 (real-data plots + schematics), full references, Appendix A/B, UN/HSC ICCR, n-reconciliation, #13 MOPS-metadata survival verification, "參" set-level feasibility probe (negative). Two codex (gpt-5.5) adversarial rounds applied; no fabrication found. Bundle: paper/v13_build/ (markdown source, harvest/figure scripts, figures) for reproducibility. Handoff note for co-author included. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-15 03:24:50 +08:00
parent 1e8466f7a8
commit 66c9194fcf
13 changed files with 749 additions and 0 deletions
@@ -0,0 +1,79 @@
+import sqlite3, numpy as np
+from collections import defaultdict
+from scipy.stats import gaussian_kde
+DB='/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
+FIRM_A='勤業眾信聯合'; BIG4=('勤業眾信聯合','安侯建業聯合','資誠聯合','安永聯合')
+SEED=42; POP=np.array([bin(i).count('1') for i in range(256)],dtype=np.uint8)
+def load():
+    c=sqlite3.connect(f'file:{DB}?mode=ro',uri=True)
+    r=c.execute("""SELECT s.assigned_accountant,a.firm,s.source_pdf,s.feature_vector,s.dhash_vector,
+      CAST(substr(s.year_month,1,4) AS INT) FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
+      WHERE s.assigned_accountant IS NOT NULL AND a.firm IS NOT NULL AND s.feature_vector IS NOT NULL AND s.dhash_vector IS NOT NULL""").fetchall()
+    c.close(); return r
+def crossover(keep,label):
+    feats=np.stack([np.frombuffer(r[3],np.float32) for r in keep]).astype(np.float32)
+    feats/=np.clip(np.linalg.norm(feats,axis=1,keepdims=True),1e-9,None)
+    cpas=np.array([r[0] for r in keep]); by=defaultdict(list)
+    for i,c in enumerate(cpas): by[c].append(i)
+    by={c:np.array(v) for c,v in by.items() if len(v)>=3}; accts=list(by.keys())
+    pw=np.array([len(by[c])*(len(by[c])-1)/2 for c in accts],float); pw/=pw.sum()
+    rng=np.random.default_rng(SEED); M=100_000
+    intra=np.empty(M,np.float32); ci=rng.choice(len(accts),M,p=pw)
+    for t in range(M):
+        a,b=rng.choice(by[accts[ci[t]]],2,replace=False); intra[t]=feats[a]@feats[b]
+    inter=np.empty(M,np.float32)
+    for t in range(M):
+        i,j=rng.choice(len(accts),2,replace=False); inter[t]=feats[rng.choice(by[accts[i]])]@feats[rng.choice(by[accts[j]])]
+    xs=np.linspace(0.3,1.0,10000); diff=gaussian_kde(intra)(xs)-gaussian_kde(inter)(xs)
+    cr=[float(x) for x in xs[np.where(np.diff(np.sign(diff)))[0]] if 0.6<x<0.99]
+    print(f'  [{label}] crossover {[f"{x:.4f}" for x in cr]} (n={len(keep)}, accts>=3={len(accts)})')
+def percomp_bands(keep,label,M=500_000):
+    feats=np.stack([np.frombuffer(r[3],np.float32) for r in keep]).astype(np.float32)
+    feats/=np.clip(np.linalg.norm(feats,axis=1,keepdims=True),1e-9,None)
+    dh=np.stack([np.frombuffer(r[4],np.uint8) for r in keep]); cpas=np.array([r[0] for r in keep])
+    by=defaultdict(list)
+    for i,c in enumerate(cpas): by[c].append(i)
+    accts=[c for c,v in by.items() if len(v)>=1]; rng=np.random.default_rng(SEED)
+    n=len(keep); ii=rng.integers(0,n,M*2); jj=rng.integers(0,n,M*2)
+    keepm=cpas[ii]!=cpas[jj]; ii=ii[keepm][:M]; jj=jj[keepm][:M]
+    cos=np.einsum('ij,ij->i',feats[ii],feats[jj]); d=POP[dh[ii]^dh[jj]].sum(1)
+    hc=(cos>0.95)&(d<=5); mc=(cos>0.95)&(d>5)&(d<=15); hsc=(cos>0.95)&(d>15)
+    un=(cos>0.837)&(cos<=0.95); lh=cos<=0.837
+    print(f'  [{label}] per-COMPARISON ICCR (M={len(ii)}): HC {hc.mean():.6f}  MC {mc.mean():.6f}  HSC {hsc.mean():.6f}  UN {un.mean():.4f}  LH {lh.mean():.4f}')
+def persig_perdoc_bands(keep,label):
+    n=len(keep); feats=np.stack([np.frombuffer(r[3],np.float32) for r in keep]).astype(np.float32)
+    feats/=np.clip(np.linalg.norm(feats,axis=1,keepdims=True),1e-9,None)
+    dh=np.stack([np.frombuffer(r[4],np.uint8) for r in keep]); cpas=np.array([r[0] for r in keep]); docs=np.array([r[2] for r in keep])
+    ci=defaultdict(list)
+    for i,c in enumerate(cpas): ci[c].append(i)
+    ci={c:np.array(v) for c,v in ci.items()}; ps={c:len(v)-1 for c,v in ci.items()}
+    allidx=np.arange(n); rng=np.random.default_rng(SEED); mc=np.zeros(n,np.float32); md=np.full(n,64,np.int32)
+    for si in range(n):
+        p=ps[cpas[si]]
+        if p<=0: continue
+        same=ci[cpas[si]]; need=p; cand=[]; att=0
+        while need>0 and att<10:
+            dr=rng.choice(n,size=need*2,replace=True); ok=dr[~np.isin(dr,same)]; cand.extend(ok[:need].tolist()); need-=len(ok[:need]); att+=1
+        cand=np.array(cand[:p],dtype=np.int64)
+        mc[si]=(feats[cand]@feats[si]).max(); md[si]=int(POP[dh[cand]^dh[si]].sum(1).min())
+    un=(mc>0.837)&(mc<=0.95); hsc=(mc>0.95)&(md>15)
+    # per-doc: any signature in band
+    dd=defaultdict(list)
+    for i in range(n): dd[docs[i]].append(i)
+    docs_un=np.mean([un[v].any() for v in dd.values()]); docs_hsc=np.mean([hsc[v].any() for v in dd.values()])
+    print(f'  [{label}] per-SIGNATURE ICCR: UN {un.mean():.4f}  HSC {hsc.mean():.6f}')
+    print(f'  [{label}] per-REPORT ICCR:    UN {docs_un:.4f}  HSC {docs_hsc:.6f}  (n_doc={len(dd)})')
+
+rows=load()
+bcd_all=[r for r in rows if r[1] in BIG4 and r[1]!=FIRM_A]
+bcd_19=[r for r in bcd_all if 2013<=r[5]<=2019]
+print("=== ITEM 11: KDE crossover (verify corpus 0.837 / BCD-all 0.8489, then closed-world 2013-2019) ===")
+crossover(rows,'corpus-wide (verify ~0.8367)')
+crossover(bcd_all,'BCD-only ALL period (verify 0.8489)')
+crossover(bcd_19,'BCD 2013-2019 CLOSED-WORLD (NEW primary candidate)')
+print("\n=== ITEM 3: UN / HSC full ICCR on BCD 2013-2019 ===")
+percomp_bands(bcd_19,'BCD 2013-2019')
+persig_perdoc_bands(bcd_19,'BCD 2013-2019')
+print("\n=== ITEM 12: n reconciliation ===")
+print(f"  BCD full-period (2013-2023) signatures = {len(bcd_all)}  <- Script53 logged n=89,994")
+print(f"  BCD 2013-2019 signatures              = {len(bcd_19)}  <- headline ICCR base (reproduces 0.0059)")
@@ -0,0 +1,70 @@
+import sqlite3
+from collections import defaultdict, Counter
+import numpy as np
+DB='/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
+FIRM_A='勤業眾信聯合'; BIG4=('勤業眾信聯合','安侯建業聯合','資誠聯合','安永聯合')
+ALIAS={'勤業眾信聯合':'A','安侯建業聯合':'B','資誠聯合':'C','安永聯合':'D'}
+SEED=42; POP=np.array([bin(i).count('1') for i in range(256)],dtype=np.uint8)
+def wilson(k,n,z=1.96):
+    if n==0: return (None,None)
+    p=k/n; d=1+z*z/n; c=(p+z*z/(2*n))/d; h=z*np.sqrt(p*(1-p)/n+z*z/(4*n*n))/d
+    return (max(0,c-h),min(1,c+h))
+def load():
+    c=sqlite3.connect(f'file:{DB}?mode=ro',uri=True); cur=c.cursor()
+    cur.execute("""SELECT s.assigned_accountant,a.firm,s.source_pdf,s.feature_vector,s.dhash_vector,
+        CAST(substr(s.year_month,1,4) AS INT) FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
+        WHERE s.assigned_accountant IS NOT NULL AND a.firm IS NOT NULL AND s.feature_vector IS NOT NULL AND s.dhash_vector IS NOT NULL""")
+    r=cur.fetchall(); c.close(); return r
+def canonical_sampler(rng,n,n_pool,same_cpa,all_idx):
+    need=n_pool; cand=[]; att=0
+    while need>0 and att<10:
+        draw=rng.choice(n,size=need*2,replace=True); ok=draw[~np.isin(draw,same_cpa)]
+        cand.extend(ok[:need].tolist()); need-=len(ok[:need]); att+=1
+    if need>0:
+        pm=np.ones(n,bool); pm[same_cpa]=False
+        cand.extend(rng.choice(all_idx[pm],size=need,replace=False).tolist())
+    return np.array(cand[:n_pool],dtype=np.int64)
+def simulate(keep):
+    n=len(keep); feats=np.stack([np.frombuffer(r[3],np.float32) for r in keep]).astype(np.float32)
+    nr=np.linalg.norm(feats,axis=1,keepdims=True); nr[nr==0]=1; feats=feats/nr
+    dh=np.stack([np.frombuffer(r[4],np.uint8) for r in keep]); cpas=np.array([r[0] for r in keep])
+    cpa_idx=defaultdict(list)
+    for i,c in enumerate(cpas): cpa_idx[c].append(i)
+    cpa_idx={c:np.array(v) for c,v in cpa_idx.items()}; ps={c:len(v)-1 for c,v in cpa_idx.items()}
+    all_idx=np.arange(n); rng=np.random.default_rng(SEED)
+    mc=np.zeros(n,np.float32); md=np.full(n,64,np.int32)
+    for si in range(n):
+        p=ps[cpas[si]]
+        if p<=0: continue
+        cand=canonical_sampler(rng,n,p,cpa_idx[cpas[si]],all_idx)
+        mc[si]=(feats[cand]@feats[si]).max(); md[si]=int(POP[dh[cand]^dh[si]].sum(axis=1).min())
+    return mc,md
+def iccr(keep,label):
+    mc,md=simulate(keep); n=len(keep)
+    hc=(mc>0.95)&(md<=5); d2=(mc>0.95)&(md<=15)
+    un=(mc>0.837)&(mc<=0.95); hsc=(mc>0.95)&(md>15)
+    print(f"\n== {label} (n_sig={n:,}) ==")
+    for nm,a in [('HC',hc),('HC+MC',d2),('UN-band',un),('HSC-band',hsc)]:
+        k=int(a.sum()); lo,hi=wilson(k,n); print(f"  ICCR per-sig {nm}: {k/n:.6f} ({k}/{n}) [{lo:.5f},{hi:.5f}]")
+def a_oos(rows,label):
+    A=[r for r in rows if r[1]==FIRM_A]; BCD=[r for r in rows if r[1] in BIG4 and r[1]!=FIRM_A]
+    bf=np.stack([np.frombuffer(r[3],np.float32) for r in BCD]).astype(np.float32)
+    bn=np.linalg.norm(bf,axis=1,keepdims=True); bn[bn==0]=1; bf=bf/bn
+    bdh=np.stack([np.frombuffer(r[4],np.uint8) for r in BCD]); nb=bf.shape[0]
+    ac=defaultdict(list)
+    for i,r in enumerate(A): ac[r[0]].append(i)
+    ps={c:len(v)-1 for c,v in ac.items()}; rng=np.random.default_rng(SEED); hc=np.zeros(len(A),bool)
+    for i,r in enumerate(A):
+        p=ps[r[0]]
+        if p<=0: continue
+        cand=rng.choice(nb,size=p,replace=True); sf=np.frombuffer(r[3],np.float32).astype(np.float32); sf=sf/max(np.linalg.norm(sf),1e-9)
+        mc=(bf[cand]@sf).max(); mdv=int(POP[bdh[cand]^np.frombuffer(r[4],np.uint8)].sum(axis=1).min())
+        hc[i]=(mc>0.95)and(mdv<=5)
+    k=int(hc.sum()); n=len(A); lo,hi=wilson(k,n)
+    print(f"\n== Firm A OOS vs {label} BCD pool == per-sig HC: {k/n:.6f} ({k}/{n}) [{lo:.6f},{hi:.6f}]")
+rows=load()
+bcd_all=[r for r in rows if r[1] in BIG4 and r[1]!=FIRM_A]
+bcd_19=[r for r in bcd_all if 2013<=r[5]<=2019]
+iccr(bcd_19,'BCD 2013-2019 (verify per-sig HC~0.0059)')
+a_oos([r for r in rows if 2013<=r[5]<=2019],'2013-2019')
+a_oos(rows,'full-period')
@@ -0,0 +1,58 @@
+import sqlite3, numpy as np
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+DB='/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
+ALIAS={'勤業眾信聯合':'A','安侯建業聯合':'B','資誠聯合':'C','安永聯合':'D'}
+COL={'A':'#c0392b','B':'#2980b9','C':'#27ae60','D':'#8e44ad'}
+c=sqlite3.connect(f'file:{DB}?mode=ro',uri=True)
+rows=c.execute("""SELECT a.firm, s.max_similarity_to_same_accountant, s.min_dhash_independent,
+  s.assigned_accountant, CAST(substr(s.year_month,1,4) AS INT)
+  FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
+  WHERE s.max_similarity_to_same_accountant IS NOT NULL AND s.min_dhash_independent IS NOT NULL
+    AND a.firm IN ('勤業眾信聯合','安侯建業聯合','資誠聯合','安永聯合')""").fetchall()
+firm=np.array([ALIAS[r[0]] for r in rows]); cos=np.array([r[1] for r in rows],float)
+dh=np.array([r[2] for r in rows],float); acc=np.array([r[3] for r in rows]); yr=np.array([r[4] for r in rows])
+A=firm=='A'; BCD=np.isin(firm,['B','C','D'])
+
+# ---- Figure 4: two panels, Firm A vs BCD ----
+fig,ax=plt.subplots(1,2,figsize=(9,3.4))
+ax[0].hist(cos[A],bins=np.linspace(0.7,1.0,60),density=True,alpha=0.6,color='#c0392b',label='Firm A')
+ax[0].hist(cos[BCD],bins=np.linspace(0.7,1.0,60),density=True,alpha=0.5,color='#34495e',label='Firms B/C/D')
+ax[0].axvline(0.95,ls='--',c='k',lw=0.8); ax[0].axvline(0.8547,ls=':',c='gray',lw=0.8)
+ax[0].set_title('(a) Within-accountant cosine',fontsize=10)
+ax[0].set_xlabel('max cosine to same accountant'); ax[0].set_ylabel('density')
+ax[0].text(0.952,ax[0].get_ylim()[1]*0.9,'0.95',fontsize=7); ax[0].legend(fontsize=8,frameon=False)
+ax[0].annotate('A median 0.986',(0.986,0),(0.80,ax[0].get_ylim()[1]*0.55),fontsize=7,color='#c0392b',arrowprops=dict(arrowstyle='->',color='#c0392b',lw=0.7))
+ax[0].annotate('B/C/D median 0.959',(0.959,0),(0.72,ax[0].get_ylim()[1]*0.35),fontsize=7,color='#34495e',arrowprops=dict(arrowstyle='->',color='#34495e',lw=0.7))
+bins=np.arange(0,21)-0.5
+ax[1].hist(np.clip(dh[A],0,20),bins=bins,density=True,alpha=0.6,color='#c0392b',label='Firm A')
+ax[1].hist(np.clip(dh[BCD],0,20),bins=bins,density=True,alpha=0.5,color='#34495e',label='Firms B/C/D')
+ax[1].axvline(5,ls='--',c='k',lw=0.8)
+ax[1].set_title('(b) Within-accountant dHash',fontsize=10)
+ax[1].set_xlabel('min dHash to same accountant'); ax[1].set_ylabel('density')
+ax[1].text(5.1,ax[1].get_ylim()[1]*0.9,'5',fontsize=7); ax[1].legend(fontsize=8,frameon=False)
+ax[1].text(0.50,0.62,'A median 2 / B,C,D median 7',transform=ax[1].transAxes,fontsize=7)
+fig.text(0.5,-0.02,'Cross-firm held-out HC rate 0.42% sits at/below the clean reference ICCR 0.59%; within-Firm-A HC rate is 82%.',ha='center',fontsize=7,style='italic')
+fig.tight_layout(); fig.savefig('/tmp/fig4.png',dpi=200,bbox_inches='tight'); plt.close(fig)
+
+# ---- Figure 5: per-accountant HC rate, ranked, per period ----
+def hc_by_acc(mask):
+    out={}
+    a=acc[mask]; h=((cos[mask]>0.95)&(dh[mask]<=5)).astype(float); f=firm[mask]
+    for ai in np.unique(a):
+        m=a==ai
+        if m.sum()>=5: out[ai]=(h[m].mean(),f[m][0])
+    return out
+fig,ax=plt.subplots(1,2,figsize=(9,3.4),sharey=True)
+for j,(lo,hi,ttl) in enumerate([(2013,2019,'(a) 2013–2019'),(2020,2023,'(b) 2020–2023')]):
+    d=hc_by_acc(BCD|A if False else ((yr>=lo)&(yr<=hi)))
+    items=sorted(d.items(),key=lambda kv:-kv[1][0])
+    xs=np.arange(len(items)); ys=[v[0]*100 for _,v in items]; cs=[COL[v[1]] for _,v in items]
+    ax[j].scatter(xs,ys,c=cs,s=10)
+    ax[j].set_title(ttl,fontsize=10); ax[j].set_xlabel('accountant rank'); 
+    if j==0: ax[j].set_ylabel('per-accountant HC rate (%)')
+from matplotlib.lines import Line2D
+ax[1].legend([Line2D([0],[0],marker='o',ls='',color=COL[k]) for k in 'ABCD'],['Firm A','Firm B','Firm C','Firm D'],fontsize=7,frameon=False,loc='upper right')
+fig.tight_layout(); fig.savefig('/tmp/fig5.png',dpi=200,bbox_inches='tight'); plt.close(fig)
+print('figs OK', __import__('os').path.getsize('/tmp/fig4.png'), __import__('os').path.getsize('/tmp/fig5.png'))
@@ -0,0 +1,75 @@
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+from matplotlib.patches import FancyBboxPatch, FancyArrowPatch, Rectangle
+import numpy as np
+
+# ============ Figure 1: data split grid ============
+fig, ax = plt.subplots(figsize=(7, 3.2))
+firms = ['Firm A', 'Firm B', 'Firm C', 'Firm D']
+periods = ['2013–2019', '2020–2023']
+# role per (row firm, col period)
+def role(f, p):
+    if f == 'Firm A':
+        return ('Held-out test 1\n(Firm A, full record)', '#c0392b')
+    if p == '2013–2019':
+        return ('Calibration\n(clean reference)', '#27ae60')
+    return ('Held-out test 2\n(secondary)', '#2980b9')
+for i, f in enumerate(firms):
+    for j, p in enumerate(periods):
+        txt, col = role(f, p)
+        ax.add_patch(Rectangle((j, len(firms)-1-i), 1, 1, facecolor=col, alpha=0.30, edgecolor='black', lw=1))
+        ax.text(j+0.5, len(firms)-1-i+0.5, txt, ha='center', va='center', fontsize=6.5)
+ax.set_xlim(0, 2); ax.set_ylim(0, 4)
+ax.set_xticks([0.5, 1.5]); ax.set_xticklabels(periods, fontsize=9)
+ax.set_yticks([3.5, 2.5, 1.5, 0.5]); ax.set_yticklabels(firms, fontsize=9)
+ax.tick_params(length=0)
+for s in ax.spines.values(): s.set_visible(False)
+ax.set_title('Figure 1. Data split: calibrate on the clean cell, test everything else', fontsize=9)
+fig.tight_layout(); fig.savefig('/tmp/fig1.png', dpi=200, bbox_inches='tight'); plt.close(fig)
+
+# ============ Figure 2: pipeline ============
+fig, ax = plt.subplots(figsize=(9, 2.5))
+steps = ['Raw PDF\nreport', 'Find signature\npage (VLM)', 'Detect signatures\n(YOLOv11)\n+ red-stamp removal',
+         'Feature extraction\n(ResNet-50, 2048-d)', 'Two similarities\ncosine (style)\nmin dHash (structure)', 'Five-way\nlabel']
+n = len(steps); w = 1.0/n
+cols = ['#ecf0f1', '#d6eaf8', '#d5f5e3', '#fcf3cf', '#fadbd8', '#e8daef']
+for i, (s, c) in enumerate(zip(steps, cols)):
+    x = i*w + 0.01
+    ax.add_patch(FancyBboxPatch((x, 0.30), w-0.02, 0.40, boxstyle='round,pad=0.005,rounding_size=0.02',
+                                facecolor=c, edgecolor='black', lw=1, transform=ax.transAxes))
+    ax.text(x+(w-0.02)/2, 0.50, s, ha='center', va='center', fontsize=6.8, transform=ax.transAxes)
+    if i < n-1:
+        ax.add_patch(FancyArrowPatch((x+w-0.012, 0.50), (x+w+0.002, 0.50), transform=ax.transAxes,
+                                     arrowstyle='-|>', mutation_scale=10, lw=1.2, color='black'))
+ax.axis('off')
+ax.set_title('Figure 2. The screening pipeline', fontsize=9, y=0.92)
+fig.savefig('/tmp/fig2.png', dpi=200, bbox_inches='tight'); plt.close(fig)
+
+# ============ Figure 3: two-measure plane, five regions ============
+fig, ax = plt.subplots(figsize=(5.2, 4.2))
+LO, HI = 0.8547, 0.95
+DH1, DH2 = 5, 15
+xmin, xmax = 0.70, 1.005
+ymin, ymax = -1, 30
+# LH (cos<=LO): whole column
+ax.add_patch(Rectangle((xmin, ymin), LO-xmin, ymax-ymin, facecolor='#bdc3c7', alpha=0.5))
+# UN (LO<cos<=HI)
+ax.add_patch(Rectangle((LO, ymin), HI-LO, ymax-ymin, facecolor='#f7dc6f', alpha=0.5))
+# high-cosine band subdivided by dHash
+ax.add_patch(Rectangle((HI, ymin), xmax-HI, DH1-ymin, facecolor='#cb4335', alpha=0.55))   # HC dHash<=5
+ax.add_patch(Rectangle((HI, DH1), xmax-HI, DH2-DH1, facecolor='#eb984e', alpha=0.55))      # MC 5<dHash<=15
+ax.add_patch(Rectangle((HI, DH2), xmax-HI, ymax-DH2, facecolor='#aed6f1', alpha=0.6))      # HSC dHash>15
+ax.axvline(LO, color='gray', ls=':', lw=1); ax.axvline(HI, color='black', ls='--', lw=1)
+ax.plot([HI, xmax], [DH1, DH1], 'k--', lw=0.8); ax.plot([HI, xmax], [DH2, DH2], 'k--', lw=0.8)
+ax.text((xmin+LO)/2, 22, 'LH', ha='center', fontsize=11, weight='bold')
+ax.text((LO+HI)/2, 22, 'UN', ha='center', fontsize=11, weight='bold')
+ax.text((HI+xmax)/2, 2, 'HC', ha='center', fontsize=11, weight='bold', color='white')
+ax.text((HI+xmax)/2, 9.5, 'MC', ha='center', fontsize=11, weight='bold')
+ax.text((HI+xmax)/2, 22, 'HSC', ha='center', fontsize=10, weight='bold')
+ax.text(LO, ymin-1.5, '0.8547', ha='center', fontsize=7); ax.text(HI, ymin-1.5, '0.95', ha='center', fontsize=7)
+ax.set_xlim(xmin, xmax); ax.set_ylim(ymin, ymax)
+ax.set_xlabel('cosine similarity (style)'); ax.set_ylabel('dHash distance (structure)')
+ax.set_title('Figure 3. The two measures and the five regions', fontsize=9)
+fig.tight_layout(); fig.savefig('/tmp/fig3.png', dpi=200, bbox_inches='tight'); plt.close(fig)
+print('figs 1/2/3 OK')
@@ -0,0 +1,49 @@
+import sqlite3, numpy as np
+DB='/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
+BCD=('安侯建業聯合','資誠聯合','安永聯合')
+c=sqlite3.connect(f'file:{DB}?mode=ro',uri=True)
+rows=c.execute("""SELECT s.assigned_accountant, s.max_similarity_to_same_accountant, s.min_dhash_independent
+  FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
+  WHERE a.firm IN ('安侯建業聯合','資誠聯合','安永聯合')
+    AND CAST(substr(s.year_month,1,4) AS INT) BETWEEN 2013 AND 2019
+    AND s.max_similarity_to_same_accountant IS NOT NULL AND s.min_dhash_independent IS NOT NULL""").fetchall()
+from collections import defaultdict
+by=defaultdict(list)
+for a,cos,dh in rows: by[a].append((cos,dh))
+accs={a:np.array(v) for a,v in by.items() if len(v)>=15}
+print(f"BCD 2013-2019: {len(accs)} accountants with >=15 signatures (of {len(by)} total)")
+
+rep=[]; tight=[]; rem_med=[]; klass=[]
+for a,v in accs.items():
+    cos=v[:,0]; dh=v[:,1]
+    hc=(cos>0.95)&(dh<=5)
+    rf=hc.mean(); tf=(cos>0.95).mean()
+    isolated=cos[cos<=0.95]
+    rm=np.median(isolated) if len(isolated)>=3 else np.nan
+    rep.append(rf); tight.append(tf); rem_med.append(rm)
+    klass.append('pure-hand' if rf<0.10 else ('pure-stamp' if rf>0.90 else 'mixed'))
+rep=np.array(rep); tight=np.array(tight); rem_med=np.array(rem_med); klass=np.array(klass)
+
+import collections
+print("\n=== Per-accountant replication-fraction (HC share) distribution ===")
+for lo,hi in [(0,0.1),(0.1,0.3),(0.3,0.5),(0.5,0.7),(0.7,0.9),(0.9,1.01)]:
+    n=((rep>=lo)&(rep<hi)).sum(); print(f"  rep_frac [{lo:.1f},{hi:.1f}): {n:3d} accountants")
+print("  class counts:", dict(collections.Counter(klass)))
+
+mixed=klass=='mixed'
+print(f"\n=== MIXED accountants (n={mixed.sum()}): is the non-tight remainder dispersed (separable)? ===")
+rm_mixed=rem_med[mixed & ~np.isnan(rem_med)]
+print(f"  remainder (cos<=0.95) median cosine across mixed accountants: median={np.median(rm_mixed):.3f}, IQR[{np.percentile(rm_mixed,25):.3f},{np.percentile(rm_mixed,75):.3f}]")
+print(f"  fraction of mixed accountants whose remainder median < 0.90 (clearly dispersed): {(rm_mixed<0.90).mean():.2f}")
+print(f"  fraction with remainder median < 0.85 (very dispersed): {(rm_mixed<0.85).mean():.2f}")
+# gap between tight group (cos>0.95) and remainder: per mixed accountant
+gaps=[]
+for a,v in accs.items():
+    cos=v[:,0]
+    t=cos[cos>0.95]; r=cos[cos<=0.95]
+    if len(t)>=3 and len(r)>=3:
+        gaps.append(np.median(t)-np.median(r))
+gaps=np.array(gaps)
+print(f"\n=== Tight-vs-remainder cosine gap (all accountants with both parts, n={len(gaps)}) ===")
+print(f"  median gap = {np.median(gaps):.3f}  (large gap => two-component structure is real & separable)")
+print(f"  fraction with gap > 0.10: {(gaps>0.10).mean():.2f}")