#!/usr/bin/env python3 """Script 46: BCD-only (exclude Firm A) per-comparison ICCR recompute. Replicates 40b's inter-CPA negative-anchor pair sampling (N=500k, seed=42) but compares three negative-anchor pool compositions: - ABCD : all Big-4 (current paper baseline) - BCD : Big-4 excluding Firm A (normative-baseline proposal) - BCD+nonB4 : BCD plus all non-Big-4 firms Reports marginal cos>0.95, dHash<=5, and the joint HC rule cos>0.95 & dHash<=5. Read-only. """ import sqlite3 from collections import defaultdict import numpy as np DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db' N_PAIRS = 500_000 SEED = 42 FIRM_A = '勤業眾信聯合' BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合') def hamming(a, b): return (int.from_bytes(a, 'big') ^ int.from_bytes(b, 'big')).bit_count() def load(): conn = sqlite3.connect(f'file:{DB}?mode=ro', uri=True) cur = conn.cursor() cur.execute(""" SELECT s.assigned_accountant, a.firm, s.feature_vector, s.dhash_vector FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name WHERE s.assigned_accountant IS NOT NULL AND a.firm IS NOT NULL AND s.feature_vector IS NOT NULL AND s.dhash_vector IS NOT NULL""") rows = cur.fetchall() conn.close() return rows def wilson(k, n, z=1.96): if n == 0: return (None, None) p = k / n d = 1 + z*z/n c = (p + z*z/(2*n)) / d h = z*np.sqrt(p*(1-p)/n + z*z/(4*n*n)) / d return (max(0.0, c-h), min(1.0, c+h)) def iccr(rows, label): by = defaultdict(list) for acct, firm, fv, dh in rows: by[acct].append((fv, dh)) accts = list(by.keys()) feats = {a: np.stack([np.frombuffer(r[0], dtype=np.float32) for r in by[a]]) for a in accts} dhs = {a: [r[1] for r in by[a]] for a in accts} rng = np.random.default_rng(SEED) cos = np.empty(N_PAIRS, np.float32) dv = np.empty(N_PAIRS, np.int32) na = len(accts) for t in range(N_PAIRS): i, j = rng.choice(na, 2, replace=False) a1, a2 = accts[i], accts[j] k1 = int(rng.integers(0, len(by[a1]))) k2 = int(rng.integers(0, len(by[a2]))) cos[t] = float(feats[a1][k1] @ feats[a2][k2]) dv[t] = hamming(dhs[a1][k1], dhs[a2][k2]) n = N_PAIRS m_cos = int((cos > 0.95).sum()) m_dh = int((dv <= 5).sum()) joint = int(((cos > 0.95) & (dv <= 5)).sum()) jlo, jhi = wilson(joint, n) print(f'\n== {label} ==') print(f' signatures={len(rows):,} accountants={na} pairs={n:,}') print(f' cos>0.95 ICCR = {m_cos/n:.5f} ({m_cos})') print(f' dHash<=5 ICCR = {m_dh/n:.5f} ({m_dh})') print(f' JOINT (HC rule) ICCR = {joint/n:.6f} ({joint}) Wilson95% [{jlo:.6f},{jhi:.6f}]') return joint/n rows = load() abcd = [r for r in rows if r[1] in BIG4] bcd = [r for r in rows if r[1] in BIG4 and r[1] != FIRM_A] bcd_non = [r for r in rows if r[1] != FIRM_A] iccr(abcd, 'ABCD (current paper baseline)') iccr(bcd, 'BCD only (exclude Firm A)') iccr(bcd_non, 'BCD + non-Big-4')