"""Per-accountant replication statistics for signatures of selected firms.

Reads (accountant, max same-accountant similarity, min independent dHash)
triples from the signature-analysis SQLite database, groups them by
accountant, and prints:

* how many accountants have enough signatures to be analysed,
* the distribution of the per-accountant replication fraction, and
* the gap between the median cosine of the "tight" part (cos > 0.95) and
  the remainder (cos <= 0.95) of each accountant's signatures.
"""
import collections  # kept: its consumer was presumably in the lost section (see main)
import sqlite3
from collections import defaultdict
from contextlib import closing

import numpy as np

DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
BCD = ('安侯建業聯合', '資誠聯合', '安永聯合')

YEAR_FROM, YEAR_TO = 2013, 2019   # inclusive year window applied to year_month
MIN_SIGNATURES = 15               # accountants with fewer rows are skipped
COS_TIGHT = 0.95                  # similarity above this counts as "tight"
DHASH_MAX = 5                     # dHash at or below this counts as replicated
MIN_PART = 3                      # minimum sample size before taking a median
REP_BINS = [(0, 0.1), (0.1, 0.3), (0.3, 0.5), (0.5, 0.7), (0.7, 0.9), (0.9, 1.01)]


def load_rows(db_path=DB, firms=BCD, year_from=YEAR_FROM, year_to=YEAR_TO):
    """Return ``(accountant, cos, dhash)`` rows for *firms* in the year window.

    The database is opened read-only and the connection is closed before
    returning. Rows with a NULL similarity or NULL dHash are excluded by the
    query itself.
    """
    # Only the "?" placeholders are interpolated; every value is bound.
    placeholders = ",".join("?" for _ in firms)
    sql = f"""
        SELECT s.assigned_accountant,
               s.max_similarity_to_same_accountant,
               s.min_dhash_independent
        FROM signatures s
        JOIN accountants a ON s.assigned_accountant = a.name
        WHERE a.firm IN ({placeholders})
          AND CAST(substr(s.year_month,1,4) AS INT) BETWEEN ? AND ?
          AND s.max_similarity_to_same_accountant IS NOT NULL
          AND s.min_dhash_independent IS NOT NULL
    """
    with closing(sqlite3.connect(f'file:{db_path}?mode=ro', uri=True)) as conn:
        return conn.execute(sql, (*firms, year_from, year_to)).fetchall()


def group_by_accountant(rows, min_signatures=MIN_SIGNATURES):
    """Group rows by accountant and keep those with enough signatures.

    Returns ``(accs, total)`` where *accs* maps accountant name to an
    ``(n, 2)`` array with columns ``(cos, dhash)`` and *total* is the number
    of distinct accountants seen before the size filter.
    """
    grouped = defaultdict(list)
    for name, cos, dh in rows:
        grouped[name].append((cos, dh))
    accs = {
        name: np.array(pairs)
        for name, pairs in grouped.items()
        if len(pairs) >= min_signatures
    }
    return accs, len(grouped)


def summarize_accountant(v):
    """Return ``(rf, tf, rm, label)`` for one accountant's ``(cos, dhash)`` array.

    * ``rf`` - fraction of rows with cos > 0.95 AND dhash <= 5.
    * ``tf`` - fraction of rows with cos > 0.95.
    * ``rm`` - median cos of the rows with cos <= 0.95, or NaN when fewer
      than three such rows exist.
    * ``label`` - 'pure-hand' if rf < 0.10, 'pure-stamp' if rf > 0.90,
      otherwise 'mixed'.
    """
    cos = v[:, 0]
    dh = v[:, 1]
    tight_mask = cos > COS_TIGHT
    rf = (tight_mask & (dh <= DHASH_MAX)).mean()
    tf = tight_mask.mean()
    isolated = cos[cos <= COS_TIGHT]
    rm = np.median(isolated) if len(isolated) >= MIN_PART else np.nan
    if rf < 0.10:
        label = 'pure-hand'
    elif rf > 0.90:
        label = 'pure-stamp'
    else:
        label = 'mixed'
    return rf, tf, rm, label


def rep_histogram(rep, bins=REP_BINS):
    """Count how many values of *rep* fall in each half-open bin ``[lo, hi)``."""
    return [int(((rep >= lo) & (rep < hi)).sum()) for lo, hi in bins]


def cosine_gaps(accs):
    """Return median(tight cos) - median(remainder cos) per accountant.

    Only accountants with at least three rows in BOTH the tight part
    (cos > 0.95) and the remainder (cos <= 0.95) contribute a value.
    """
    gaps = []
    for v in accs.values():
        cos = v[:, 0]
        t = cos[cos > COS_TIGHT]
        r = cos[cos <= COS_TIGHT]
        if len(t) >= MIN_PART and len(r) >= MIN_PART:
            gaps.append(np.median(t) - np.median(r))
    return np.array(gaps)


def main():
    """Run the analysis against ``DB`` and print the report."""
    accs, total = group_by_accountant(load_rows())
    print(f"BCD {YEAR_FROM}-{YEAR_TO}: {len(accs)} accountants with "
          f">={MIN_SIGNATURES} signatures (of {total} total)")

    rep = []; tight = []; rem_med = []; klass = []
    for v in accs.values():
        rf, tf, rm, label = summarize_accountant(v)
        rep.append(rf); tight.append(tf); rem_med.append(rm); klass.append(label)
    rep = np.array(rep); tight = np.array(tight)
    rem_med = np.array(rem_med); klass = np.array(klass)

    print("\n=== Per-accountant replication-fraction (HC share) distribution ===")
    # NOTE(review): the source text between `(rep` and `0.95) and remainder`
    # was lost (everything between a '<' and the next '>' was dropped). The
    # `rep < hi` comparison is restored from context; the original print format
    # for this loop, and whatever reported `tight`, `rem_med` and `klass`
    # (presumably using `collections`), could not be recovered. Restore them
    # from the original script if it is available.
    for (lo, hi), n in zip(REP_BINS, rep_histogram(rep)):
        print(f"  [{lo:.2f}, {hi:.2f}): {n}")

    # Gap between the tight part (cos>0.95) and the remainder, per accountant.
    gaps = cosine_gaps(accs)
    print(f"\n=== Tight-vs-remainder cosine gap (all accountants with both parts, n={len(gaps)}) ===")
    print(f" median gap = {np.median(gaps):.3f} (large gap => two-component structure is real & separable)")
    print(f" fraction with gap > 0.10: {(gaps>0.10).mean():.2f}")


if __name__ == "__main__":
    main()