"""Signature-similarity analysis: KDE crossover and ICCR band rates.

Reads signature rows from a read-only SQLite database and prints
(a) the cosine-similarity value(s) at which the intra-accountant and
inter-accountant kernel density estimates cross, and (b) the share of
cross-accountant comparisons / signatures / reports that fall into the
cosine + dHash bands labelled HC, MC, HSC, UN and LH.
"""
import sqlite3
from collections import defaultdict
from contextlib import closing

import numpy as np
from scipy.stats import gaussian_kde

DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
FIRM_A = '勤業眾信聯合'
BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合')
SEED = 42
# POP[b] = number of set bits in byte b; used to turn XOR-ed dHash bytes
# into a Hamming distance.
POP = np.array([bin(i).count('1') for i in range(256)], dtype=np.uint8)


def _unit_features(keep):
    """Decode column 3 of each row as float32 and L2-normalise every row."""
    feats = np.stack(
        [np.frombuffer(r[3], np.float32) for r in keep]
    ).astype(np.float32)
    # Clip the norm so an all-zero vector does not divide by zero.
    feats /= np.clip(np.linalg.norm(feats, axis=1, keepdims=True), 1e-9, None)
    return feats


def _dhash_bytes(keep):
    """Decode column 4 of each row as a uint8 byte vector."""
    # NOTE(review): the default distance of 64 used further down suggests
    # 8-byte (64-bit) hashes -- confirm against the producer of dhash_vector.
    return np.stack([np.frombuffer(r[4], np.uint8) for r in keep])


def load():
    """Fetch all usable signature rows from the database (opened read-only).

    Returns:
        A list of tuples ``(assigned_accountant, firm, source_pdf,
        feature_vector, dhash_vector, year)`` where ``year`` is the first
        four characters of ``year_month`` cast to an integer. Rows with a
        NULL accountant, firm, feature vector or dHash vector are excluded
        by the query.
    """
    query = """SELECT s.assigned_accountant,a.firm,s.source_pdf,s.feature_vector,s.dhash_vector,
        CAST(substr(s.year_month,1,4) AS INT)
        FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
        WHERE s.assigned_accountant IS NOT NULL AND a.firm IS NOT NULL
        AND s.feature_vector IS NOT NULL AND s.dhash_vector IS NOT NULL"""
    # closing() guarantees the connection is released even when the query
    # raises; a bare `with connection:` only manages the transaction.
    with closing(sqlite3.connect(f'file:{DB}?mode=ro', uri=True)) as conn:
        return conn.execute(query).fetchall()


def crossover(keep, label, M=100_000, window=(0.6, 1.0)):
    """Print (and return) where the intra/inter cosine KDEs cross.

    Only accountants with at least three signatures take part. ``M``
    same-accountant pairs are sampled with accountants weighted by their
    number of possible pairs, and ``M`` different-accountant pairs are
    sampled with accountants chosen uniformly. A Gaussian KDE is fitted to
    each sample and sign changes of their difference are located on a
    10000-point grid over [0.3, 1.0].

    Args:
        keep: rows as returned by ``load()``.
        label: tag printed in front of the result.
        M: number of sampled pairs per distribution.
        window: open interval ``(lo, hi)``; only crossings strictly inside
            it are reported.

    Returns:
        List of crossing positions (floats) inside ``window``.

    Raises:
        ValueError: if fewer than two accountants have >= 3 signatures.
    """
    feats = _unit_features(keep)
    cpas = np.array([r[0] for r in keep])
    by = defaultdict(list)
    for i, c in enumerate(cpas):
        by[c].append(i)
    by = {c: np.array(v) for c, v in by.items() if len(v) >= 3}
    accts = list(by)
    if len(accts) < 2:
        raise ValueError(
            f'[{label}] need at least 2 accountants with >=3 signatures, '
            f'got {len(accts)}'
        )

    # Weight each accountant by its number of unordered signature pairs.
    pw = np.array([len(by[c]) * (len(by[c]) - 1) / 2 for c in accts], float)
    pw /= pw.sum()

    # The per-draw loops are deliberately left un-vectorised: changing the
    # order of RNG calls would change the sampled pairs and therefore the
    # reported crossover values.
    rng = np.random.default_rng(SEED)
    intra = np.empty(M, np.float32)
    ci = rng.choice(len(accts), M, p=pw)
    for t in range(M):
        a, b = rng.choice(by[accts[ci[t]]], 2, replace=False)
        intra[t] = feats[a] @ feats[b]

    inter = np.empty(M, np.float32)
    for t in range(M):
        i, j = rng.choice(len(accts), 2, replace=False)
        inter[t] = feats[rng.choice(by[accts[i]])] @ feats[rng.choice(by[accts[j]])]

    xs = np.linspace(0.3, 1.0, 10000)
    diff = gaussian_kde(intra)(xs) - gaussian_kde(inter)(xs)

    # NOTE(review): in the recovered source the text between `0.6` and
    # `=3={len(accts)})` was lost (it looked like a stripped `<...>` span).
    # The lower bound 0.6 is original; the upper bound and the wording of
    # the message before `>=3=` are reconstructions -- confirm against the
    # original script before relying on the printed format.
    lo, hi = window
    cr = [float(x) for x in xs[np.where(np.diff(np.sign(diff)))[0]] if lo < x < hi]
    print(f' [{label}] KDE crossover={cr} (n_sig={len(keep)}, n_cpa>=3={len(accts)})')
    return cr


def percomp_bands(keep, label, M=500_000):
    """Print (and return) band rates over random cross-accountant pairs.

    ``2 * M`` index pairs are drawn uniformly with replacement, pairs whose
    two signatures share an accountant are dropped, and at most the first
    ``M`` survivors are kept. Each pair is then assigned to exactly one
    band from its cosine similarity and dHash Hamming distance.

    Args:
        keep: rows as returned by ``load()``.
        label: tag printed in front of the result.
        M: maximum number of cross-accountant pairs to evaluate.

    Returns:
        Dict with the fraction of pairs in each band (keys ``'HC'``,
        ``'MC'``, ``'HSC'``, ``'UN'``, ``'LH'``) and the number of pairs
        actually used (``'n'``).
    """
    feats = _unit_features(keep)
    dh = _dhash_bytes(keep)
    cpas = np.array([r[0] for r in keep])

    rng = np.random.default_rng(SEED)
    n = len(keep)
    ii = rng.integers(0, n, M * 2)
    jj = rng.integers(0, n, M * 2)
    keepm = cpas[ii] != cpas[jj]
    ii = ii[keepm][:M]
    jj = jj[keepm][:M]

    cos = np.einsum('ij,ij->i', feats[ii], feats[jj])
    d = POP[dh[ii] ^ dh[jj]].sum(1)

    hc = (cos > 0.95) & (d <= 5)
    mc = (cos > 0.95) & (d > 5) & (d <= 15)
    hsc = (cos > 0.95) & (d > 15)
    un = (cos > 0.837) & (cos <= 0.95)
    lh = cos <= 0.837
    print(f' [{label}] per-COMPARISON ICCR (M={len(ii)}): HC {hc.mean():.6f} MC {mc.mean():.6f} HSC {hsc.mean():.6f} UN {un.mean():.4f} LH {lh.mean():.4f}')
    return {
        'HC': float(hc.mean()),
        'MC': float(mc.mean()),
        'HSC': float(hsc.mean()),
        'UN': float(un.mean()),
        'LH': float(lh.mean()),
        'n': len(ii),
    }


def persig_perdoc_bands(keep, label):
    """Print (and return) UN / HSC rates per signature and per report.

    For every signature whose accountant has ``p >= 1`` other signatures,
    up to ``p`` signatures of *other* accountants are drawn at random (at
    most 10 rejection-sampling rounds). The signature is scored by the
    maximum cosine similarity and the minimum dHash Hamming distance to
    those candidates. A report (``source_pdf``) is in a band when any of
    its signatures is.

    Signatures with no same-accountant peer, or for which no
    other-accountant candidate could be drawn, keep the defaults
    (max cosine 0, min distance 64) and therefore fall in neither band.

    Args:
        keep: rows as returned by ``load()``.
        label: tag printed in front of the result.

    Returns:
        Dict with keys ``'sig_UN'``, ``'sig_HSC'``, ``'doc_UN'``,
        ``'doc_HSC'`` (fractions) and ``'n_doc'`` (number of reports).
    """
    n = len(keep)
    feats = _unit_features(keep)
    dh = _dhash_bytes(keep)
    cpas = np.array([r[0] for r in keep])

    # Integer code per accountant: "same accountant" becomes a cheap
    # vectorised comparison instead of an np.isin against an index list.
    _, code, counts = np.unique(cpas, return_inverse=True, return_counts=True)
    peers = counts[code] - 1  # same-accountant signatures besides itself

    rng = np.random.default_rng(SEED)
    max_cos = np.zeros(n, np.float32)
    min_dist = np.full(n, 64, np.int32)
    for si in range(n):
        p = int(peers[si])
        if p <= 0:
            continue
        need, cand, att = p, [], 0
        while need > 0 and att < 10:
            dr = rng.choice(n, size=need * 2, replace=True)
            ok = dr[code[dr] != code[si]][:need]
            cand.extend(ok.tolist())
            need -= len(ok)
            att += 1
        if not cand:
            # No other-accountant signature available (e.g. the input has
            # a single accountant); max()/min() of an empty array would raise.
            continue
        cand = np.array(cand[:p], dtype=np.int64)
        max_cos[si] = (feats[cand] @ feats[si]).max()
        min_dist[si] = int(POP[dh[cand] ^ dh[si]].sum(1).min())

    un = (max_cos > 0.837) & (max_cos <= 0.95)
    hsc = (max_cos > 0.95) & (min_dist > 15)

    # per-doc: any signature in band
    dd = defaultdict(list)
    for i, row in enumerate(keep):
        dd[row[2]].append(i)
    docs_un = np.mean([un[v].any() for v in dd.values()])
    docs_hsc = np.mean([hsc[v].any() for v in dd.values()])
    print(f' [{label}] per-SIGNATURE ICCR: UN {un.mean():.4f} HSC {hsc.mean():.6f}')
    print(f' [{label}] per-REPORT ICCR: UN {docs_un:.4f} HSC {docs_hsc:.6f} (n_doc={len(dd)})')
    return {
        'sig_UN': float(un.mean()),
        'sig_HSC': float(hsc.mean()),
        'doc_UN': float(docs_un),
        'doc_HSC': float(docs_hsc),
        'n_doc': len(dd),
    }
# Driver: load every row once, derive the two sub-populations, then run
# the three reporting items in order.
rows = load()
# "BCD" = the Big-4 firms other than FIRM_A.
bcd_all = [row for row in rows if row[1] in BIG4 and row[1] != FIRM_A]
# Same population restricted to years 2013..2019 inclusive (row[5] is the year).
bcd_19 = [row for row in bcd_all if 2013 <= row[5] <= 2019]

print("=== ITEM 11: KDE crossover (verify corpus 0.837 / BCD-all 0.8489, then closed-world 2013-2019) ===")
for subset, tag in (
    (rows, 'corpus-wide (verify ~0.8367)'),
    (bcd_all, 'BCD-only ALL period (verify 0.8489)'),
    (bcd_19, 'BCD 2013-2019 CLOSED-WORLD (NEW primary candidate)'),
):
    crossover(subset, tag)

print("\n=== ITEM 3: UN / HSC full ICCR on BCD 2013-2019 ===")
percomp_bands(bcd_19, 'BCD 2013-2019')
persig_perdoc_bands(bcd_19, 'BCD 2013-2019')

print("\n=== ITEM 12: n reconciliation ===")
print(f" BCD full-period (2013-2023) signatures = {len(bcd_all)} <- Script53 logged n=89,994")
print(f" BCD 2013-2019 signatures = {len(bcd_19)} <- headline ICCR base (reproduces 0.0059)")