#!/usr/bin/env python3
"""Script 48: full-fidelity (no subsample) BCD-only recompute of per-signature
and per-document inter-CPA any-pair ICCR, plus corpus-style KDE crossover.

Vectorized popcount. Scopes: ABCD, BCD-only, BCD+non-Big-4. Read-only.
"""
import contextlib
import sqlite3
from collections import defaultdict

import numpy as np
from scipy.stats import gaussian_kde

DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
FIRM_A = '勤業眾信聯合'
BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合')
ALIAS = {'勤業眾信聯合': 'A', '安侯建業聯合': 'B', '資誠聯合': 'C', '安永聯合': 'D'}
SEED = 42
# Bit count of every byte value; indexing it with an XOR of two dhash byte
# arrays yields per-byte Hamming distances without a Python-level loop.
POP = np.array([bin(i).count('1') for i in range(256)], dtype=np.uint8)

# (scope label, predicate on the firm name) pairs evaluated by main().
SCOPES = [
    ('ABCD', lambda fm: fm in BIG4),
    ('BCD-only', lambda fm: fm in BIG4 and fm != FIRM_A),
    ('BCD+nonBig4', lambda fm: fm != FIRM_A),
]


def load():
    """Fetch every usable signature row from the database, read-only.

    Returns:
        A list of tuples ``(assigned_accountant, firm, source_pdf,
        feature_vector, dhash_vector)``; rows with a NULL in any of the
        accountant, firm, feature-vector or dhash columns are excluded by
        the query.
    """
    # ``closing`` guarantees the connection is released even when the query
    # raises (sqlite3's own context manager only manages transactions).
    with contextlib.closing(
            sqlite3.connect(f'file:{DB}?mode=ro', uri=True)) as conn:
        cur = conn.cursor()
        cur.execute("""
            SELECT s.assigned_accountant, a.firm, s.source_pdf,
                   s.feature_vector, s.dhash_vector
            FROM signatures s
            JOIN accountants a ON s.assigned_accountant=a.name
            WHERE s.assigned_accountant IS NOT NULL
              AND a.firm IS NOT NULL
              AND s.feature_vector IS NOT NULL
              AND s.dhash_vector IS NOT NULL""")
        return cur.fetchall()


def wilson(k: int, n: int, z: float = 1.96):
    """Return the Wilson score interval for ``k`` successes out of ``n``.

    Args:
        k: Number of successes.
        n: Number of trials.
        z: Normal quantile of the desired confidence level (1.96 ~ 95%).

    Returns:
        ``(low, high)`` clipped to ``[0, 1]``, or ``(None, None)`` when
        ``n`` is zero.
    """
    if n == 0:
        return (None, None)
    p = k / n
    denom = 1 + z * z / n
    centre = (p + z * z / (2 * n)) / denom
    half = z * np.sqrt(p * (1 - p) / n + z * z / (4 * n * n)) / denom
    return (max(0.0, centre - half), min(1.0, centre + half))


def prep(rows, keep_fn):
    """Filter rows by firm and decode them into aligned arrays.

    Args:
        rows: Row tuples shaped like the output of :func:`load`.
        keep_fn: Predicate called with the firm name (``row[1]``); rows for
            which it is falsy are dropped.

    Returns:
        ``(feats, dh, cpas, firms, docs)`` where ``feats`` holds the
        L2-normalised float32 feature vectors, ``dh`` the dhash bytes as
        uint8 (the original code notes the shape as ``(n, 8)``), ``cpas``
        the accountant names, ``firms`` the one-letter firm alias (``'X'``
        for firms absent from ``ALIAS``) and ``docs`` the source PDF names.

    Raises:
        ValueError: If no row passes ``keep_fn``.
    """
    keep = [r for r in rows if keep_fn(r[1])]
    if not keep:
        raise ValueError('prep: no rows match the requested scope')
    feats = np.stack(
        [np.frombuffer(r[3], np.float32) for r in keep]).astype(np.float32)
    # Unit-normalise so a dot product is a cosine similarity; the clip
    # guards against division by zero for an all-zero vector.
    feats /= np.clip(np.linalg.norm(feats, axis=1, keepdims=True), 1e-9, None)
    dh = np.stack([np.frombuffer(r[4], np.uint8) for r in keep])
    cpas = np.array([r[0] for r in keep])
    firms = np.array([ALIAS.get(r[1], 'X') for r in keep])
    docs = np.array([r[2] for r in keep])
    return feats, dh, cpas, firms, docs


def crossover(feats, cpas, label, n_pairs: int = 100_000) -> None:
    """Print where the intra-CPA and inter-CPA cosine KDEs cross.

    Samples ``n_pairs`` same-CPA pairs and ``n_pairs`` different-CPA pairs
    (only CPAs with at least two signatures take part), fits a Gaussian KDE
    to each similarity sample and reports the grid points in ``(0.6, 0.99)``
    at which the sign of the density difference changes.

    Args:
        feats: Unit-normalised feature matrix, one row per signature.
        cpas: Accountant name per signature, aligned with ``feats``.
        label: Scope label used in the printed line.
        n_pairs: Number of pairs drawn for each of the two samples.

    Raises:
        ValueError: If fewer than two CPAs have at least two signatures.
    """
    by = defaultdict(list)
    for i, c in enumerate(cpas):
        by[c].append(i)
    by = {c: np.array(v) for c, v in by.items() if len(v) >= 2}
    accts = list(by)
    if len(accts) < 2:
        raise ValueError(
            'crossover: need at least two CPAs with two or more signatures')

    rng = np.random.default_rng(SEED)
    intra = np.empty(n_pairs, np.float32)
    inter = np.empty(n_pairs, np.float32)
    ks = rng.integers(0, len(accts), n_pairs)
    # The order of RNG calls below is kept exactly as in the original so the
    # seeded output stays reproducible.
    for t in range(n_pairs):
        idx = by[accts[ks[t]]]
        a, b = rng.choice(idx, 2, replace=False)
        intra[t] = feats[a] @ feats[b]
        i, j = rng.choice(len(accts), 2, replace=False)
        left = feats[rng.choice(by[accts[i]])]
        right = feats[rng.choice(by[accts[j]])]
        inter[t] = left @ right

    xs = np.linspace(0.3, 1.0, 10000)
    diff = gaussian_kde(intra)(xs) - gaussian_kde(inter)(xs)
    # A non-zero diff of the sign sequence marks a sign change between
    # neighbouring grid points; the left grid point is reported.
    flips = np.where(np.diff(np.sign(diff)))[0]
    cross = [float(x) for x in xs[flips] if 0.6 < x < 0.99]
    print(f' [{label}] crossover {[f"{x:.4f}" for x in cross]} '
          f'(intra {intra.mean():.4f} / inter {inter.mean():.4f})')


def _simulate_pools(feats, dh, cpas, docs):
    """Flag signatures/documents that match a random other-CPA pool."""
    n = len(cpas)
    # Integer code per CPA: equality of codes == same accountant.
    _, codes = np.unique(cpas, return_inverse=True)
    codes = codes.reshape(-1)
    per_cpa = np.bincount(codes)
    doc_list = docs.tolist()

    rng = np.random.default_rng(SEED)
    sig_hc = np.zeros(n, bool)
    # Pre-seed so every document is present even if none of its signatures
    # ever matches.
    doc_hit = dict.fromkeys(doc_list, False)
    for si in range(n):
        same_count = int(per_cpa[codes[si]])
        npool = same_count - 1
        if npool <= 0:
            continue
        # Over-draw (with replacement), drop same-CPA indices, keep the
        # first npool survivors.
        # NOTE(review): if one CPA owns a large share of the scope, fewer
        # than npool candidates can survive and the pool is silently
        # smaller; confirm this is acceptable for the scopes analysed.
        draw = rng.integers(0, n, size=npool + same_count + 20)
        cand = draw[codes[draw] != codes[si]][:npool]
        close = feats[cand] @ feats[si] > 0.95
        if not close.any():
            continue
        # Hamming distance between dhashes via the byte popcount table.
        dist = POP[dh[cand] ^ dh[si]].sum(axis=1)
        sig_hc[si] = bool((close & (dist <= 5)).any())
        if (close & (dist <= 15)).any():
            doc_hit[doc_list[si]] = True
    return sig_hc, doc_hit


def _print_firm_rates(flags, firm_of) -> None:
    """Print the share of set ``flags`` for each firm label in ``firm_of``."""
    for f in sorted(set(firm_of.tolist())):
        m = firm_of == f
        print(f' Firm {f}: {flags[m].sum()/m.sum():.4f} '
              f'({int(flags[m].sum())}/{int(m.sum())})')


def pool_sim(feats, dh, cpas, firms, docs, label) -> None:
    """Print per-signature and per-document inter-CPA any-pair rates.

    For each signature whose CPA has at least one other signature, a random
    pool of other-CPA signatures (size: that CPA's signature count minus
    one) is drawn. The signature is flagged when any pool member has cosine
    > 0.95 and dhash distance <= 5; its document is flagged when any pool
    member has cosine > 0.95 and dhash distance <= 15. Overall and per-firm
    rates are printed, with Wilson 95% intervals for the overall rates.

    Args:
        feats: Unit-normalised feature matrix, one row per signature.
        dh: uint8 dhash bytes per signature, aligned with ``feats``.
        cpas: Accountant name per signature.
        firms: Firm alias per signature.
        docs: Source document per signature.
        label: Scope label used in the printed lines.
    """
    cpas = np.asarray(cpas)
    firms = np.asarray(firms)
    docs = np.asarray(docs)
    n = len(cpas)
    sig_hc, doc_hit = _simulate_pools(feats, dh, cpas, docs)

    k = int(sig_hc.sum())
    lo, hi = wilson(k, n)
    print(f'\n [{label}] per-SIGNATURE any-pair HC (cos>0.95 & dh<=5): '
          f'{k/n:.4f} ({k}/{n}) Wilson95% [{lo:.4f},{hi:.4f}]')
    _print_firm_rates(sig_hc, firms)

    # per-doc, with firm of first sig
    doc_firm = {}
    for d, f in zip(docs.tolist(), firms.tolist()):
        doc_firm.setdefault(d, f)
    doc_ids = list(doc_hit)
    dv = np.array([doc_hit[d] for d in doc_ids], dtype=bool)
    df = np.array([doc_firm[d] for d in doc_ids])
    dk = int(dv.sum())
    dm = len(dv)
    dlo, dhi = wilson(dk, dm)
    print(f' [{label}] per-DOCUMENT HC+MC (cos>0.95 & dh<=15): '
          f'{dk/dm:.4f} ({dk}/{dm}) Wilson95% [{dlo:.4f},{dhi:.4f}]')
    _print_firm_rates(dv, df)


def main() -> None:
    """Load the corpus once and run both analyses over the scopes."""
    rows = load()
    print('=== KDE crossover ===')
    for name, fn in SCOPES[:2]:
        f, _, c, _, _ = prep(rows, fn)
        crossover(f, c, name)
    print('\n=== per-signature & per-document inter-CPA ICCR (full) ===')
    for name, fn in SCOPES:
        f, dh, c, fm, dc = prep(rows, fn)
        pool_sim(f, dh, c, fm, dc, name)


if __name__ == '__main__':
    main()