#!/usr/bin/env python3
"""Script 49: Firm A as out-of-sample target against a clean BCD baseline.

(1) A signatures scored against a BCD-only candidate pool (true
    out-of-sample inter-firm coincidence).
(2) Observed deployed rate on ACTUAL same-CPA pools, per firm (the real
    fired rate, from precomputed deployed descriptors), to juxtapose
    against the clean BCD inter-CPA coincidence floor.

Read-only: the database is opened with ``mode=ro``.

Row layout used throughout (order of the SELECT in ``load_rows``):
    r[0] assigned_accountant, r[1] firm, r[2] source_pdf,
    r[3] feature_vector (read as a float32 buffer),
    r[4] dhash_vector (read as a uint8 buffer),
    r[5] max_similarity_to_same_accountant, r[6] min_dhash_independent
"""
import sqlite3
from collections import Counter, defaultdict
from contextlib import closing

import numpy as np

DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
FIRM_A = '勤業眾信聯合'
BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合')
ALIAS = {'勤業眾信聯合': 'A', '安侯建業聯合': 'B', '資誠聯合': 'C', '安永聯合': 'D'}
SEED = 42
# Per-byte popcount lookup table: POP[b] is the number of set bits in byte b.
POP = np.array([bin(i).count('1') for i in range(256)], dtype=np.uint8)

# Decision thresholds shared by both sections.
COS_THRESHOLD = 0.95   # cosine similarity must be strictly greater than this
HC_MAX_DHASH = 5       # "HC": Hamming distance <= 5
HCMC_MAX_DHASH = 15    # "HC+MC": Hamming distance <= 15


def wilson(k, n, z=1.96):
    """Return the Wilson score interval ``(lo, hi)`` for ``k`` hits in ``n`` trials.

    Returns ``(None, None)`` when ``n == 0`` because the interval is undefined.
    The bounds are clamped to ``[0.0, 1.0]``.
    """
    if n == 0:
        return (None, None)
    p = k / n
    d = 1 + z * z / n
    c = (p + z * z / (2 * n)) / d
    h = z * np.sqrt(p * (1 - p) / n + z * z / (4 * n * n)) / d
    return (max(0.0, c - h), min(1.0, c + h))


def format_rate(k, n):
    """Format ``k/n`` with four decimals, or ``'n/a'`` when ``n`` is zero."""
    return f'{k/n:.4f}' if n else 'n/a'


def format_interval(lo, hi):
    """Format an interval as ``[lo,hi]``; undefined bounds render as ``n/a``."""
    if lo is None or hi is None:
        return '[n/a,n/a]'
    return f'[{lo:.4f},{hi:.4f}]'


def load_rows(db_path=DB):
    """Fetch all Big-4 signature rows that have both descriptor vectors.

    The connection is opened read-only and is closed even if the query fails.
    """
    query = """
        SELECT s.assigned_accountant, a.firm, s.source_pdf, s.feature_vector,
               s.dhash_vector, s.max_similarity_to_same_accountant,
               s.min_dhash_independent
        FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
        WHERE s.assigned_accountant IS NOT NULL AND a.firm IN (?,?,?,?)
          AND s.feature_vector IS NOT NULL AND s.dhash_vector IS NOT NULL"""
    with closing(sqlite3.connect(f'file:{db_path}?mode=ro', uri=True)) as conn:
        return conn.execute(query, BIG4).fetchall()


def score_firm_a_vs_bcd(rows, seed=SEED):
    """Score every Firm A signature against a random BCD-only candidate pool.

    For each Firm A row, a pool of BCD rows is drawn uniformly with
    replacement; the pool size is that CPA's own same-CPA count minus one
    (at least 1), to match negative-anchor construction.

    Returns ``(sig_hits, n_sigs, doc_hits, n_docs)`` where a signature hit
    needs some candidate with cos > 0.95 and dHash distance <= 5, and a
    document hit needs any of its signatures to have a candidate with
    cos > 0.95 and dHash distance <= 15.

    Raises:
        ValueError: if ``rows`` contains no B/C/D rows to draw candidates from.
    """
    a_rows = [r for r in rows if r[1] == FIRM_A]
    bcd_rows = [r for r in rows if r[1] in BIG4 and r[1] != FIRM_A]
    if not bcd_rows:
        raise ValueError('BCD candidate pool is empty; cannot score Firm A against it')

    # L2-normalise the pool once so a dot product is the cosine similarity.
    bcd_feat = np.stack(
        [np.frombuffer(r[3], np.float32) for r in bcd_rows]).astype(np.float32)
    bcd_feat /= np.clip(np.linalg.norm(bcd_feat, axis=1, keepdims=True), 1e-9, None)
    bcd_dh = np.stack([np.frombuffer(r[4], np.uint8) for r in bcd_rows])
    nb = len(bcd_rows)

    per_cpa = Counter(r[0] for r in a_rows)
    rng = np.random.default_rng(seed)
    sig_hc = np.zeros(len(a_rows), bool)
    doc_hcmc = {}
    for i, r in enumerate(a_rows):
        npool = max(per_cpa[r[0]] - 1, 1)
        cand = rng.integers(0, nb, size=npool)
        # astype() copies, so the in-place normalisation never touches the
        # read-only buffer returned by frombuffer.
        sf = np.frombuffer(r[3], np.float32).astype(np.float32)
        sf /= max(np.linalg.norm(sf), 1e-9)
        cg = (bcd_feat[cand] @ sf) > COS_THRESHOLD
        # Register the document even when nothing fires, so it counts in n_docs.
        doc_hcmc.setdefault(r[2], False)
        if cg.any():
            # Hamming distance: popcount of the XOR, summed over hash bytes.
            dist = POP[bcd_dh[cand] ^ np.frombuffer(r[4], np.uint8)].sum(axis=1)
            sig_hc[i] = bool((cg & (dist <= HC_MAX_DHASH)).any())
            if (cg & (dist <= HCMC_MAX_DHASH)).any():
                doc_hcmc[r[2]] = True
    return (int(sig_hc.sum()), len(a_rows),
            int(sum(doc_hcmc.values())), len(doc_hcmc))


def observed_deployed_rates(rows):
    """Tally the fired rate from the precomputed same-CPA descriptors.

    Rows whose ``r[5]`` or ``r[6]`` is ``None`` are skipped. A document is
    attributed to the firm of its first counted row and is a hit if any of
    its counted rows has max_sim > 0.95 and min_dh <= 15.

    Returns ``(sig_counts, doc_counts)``; each maps a firm alias to a
    ``(hits, total)`` tuple.
    """
    sig_counts = defaultdict(lambda: [0, 0])
    doc_hit = {}
    doc_firm = {}
    for r in rows:
        ms, md = r[5], r[6]
        if ms is None or md is None:
            continue
        fm = ALIAS[r[1]]
        similar = ms > COS_THRESHOLD
        sig_counts[fm][0] += int(similar and md <= HC_MAX_DHASH)
        sig_counts[fm][1] += 1
        doc_firm.setdefault(r[2], fm)
        doc_hit[r[2]] = doc_hit.get(r[2], False) or (similar and md <= HCMC_MAX_DHASH)

    doc_counts = defaultdict(lambda: [0, 0])
    for doc, hit in doc_hit.items():
        fm = doc_firm[doc]
        doc_counts[fm][0] += int(hit)
        doc_counts[fm][1] += 1
    return ({fm: tuple(v) for fm, v in sig_counts.items()},
            {fm: tuple(v) for fm, v in doc_counts.items()})


def main():
    """Load the rows and print both report sections."""
    rows = load_rows()

    # ---- (1) Firm A source vs BCD-only candidate pool ----
    print('=== (1) Firm A out-of-sample vs clean BCD candidate pool ===')
    k, n, dk, dm = score_firm_a_vs_bcd(rows)
    lo, hi = wilson(k, n)
    print(f' A-source vs BCD-pool per-SIGNATURE HC (cos>0.95 & dh<=5): '
          f'{format_rate(k, n)} ({k}/{n}) Wilson95% {format_interval(lo, hi)}')
    dlo, dhi = wilson(dk, dm)
    print(f' A-source vs BCD-pool per-DOCUMENT HC+MC (cos>0.95 & dh<=15): '
          f'{format_rate(dk, dm)} ({dk}/{dm}) Wilson95% {format_interval(dlo, dhi)}')

    # ---- (2) Observed deployed rate on ACTUAL same-CPA pools, per firm ----
    print('\n=== (2) Observed deployed rate on actual same-CPA pools (real fired rate) ===')
    print(' per-signature HC = max_sim>0.95 & min_dh<=5 ; per-doc HC+MC worst-case dh<=15')
    sig_counts, doc_counts = observed_deployed_rates(rows)
    for fm in sorted(sig_counts):
        k, n = sig_counts[fm]
        lo, hi = wilson(k, n)
        print(f' Firm {fm} per-SIGNATURE HC: {format_rate(k, n)} ({k}/{n}) '
              f'{format_interval(lo, hi)}')
    for fm in sorted(doc_counts):
        k, n = doc_counts[fm]
        print(f' Firm {fm} per-DOCUMENT HC+MC: {format_rate(k, n)} ({k}/{n})')
    print(f'\n Clean BCD inter-CPA coincidence FLOOR: per-sig HC=0.0048, per-doc HC+MC=0.1281')


if __name__ == '__main__':
    main()