pdf_signature_extraction/signature_analysis/46_bcd_only_iccr.py

#!/usr/bin/env python3
"""Script 46: BCD-only (exclude Firm A) per-comparison ICCR recompute.

Replicates 40b's inter-CPA negative-anchor pair sampling (N=500k, seed=42)
but compares three negative-anchor pool compositions:
  - ABCD      : all Big-4 (current paper baseline)
  - BCD       : Big-4 excluding Firm A (normative-baseline proposal)
  - BCD+nonB4 : BCD plus all non-Big-4 firms
Reports marginal cos>0.95, dHash<=5, and the joint HC rule cos>0.95 & dHash<=5.
Read-only.
"""
import sqlite3
from collections import defaultdict
import numpy as np

DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
N_PAIRS = 500_000
SEED = 42
FIRM_A = '勤業眾信聯合'
BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合')


def hamming(a, b):
    return (int.from_bytes(a, 'big') ^ int.from_bytes(b, 'big')).bit_count()


def load():
    """Read every signature row usable for pair sampling from the database.

    The database at ``DB`` is opened read-only (SQLite URI ``mode=ro``).
    Only rows whose accountant, firm, feature vector and dHash vector are
    all non-NULL are selected; the firm comes from joining ``signatures``
    to ``accountants`` on the accountant name.

    Returns:
        A list of ``(assigned_accountant, firm, feature_vector,
        dhash_vector)`` tuples. The two vector columns are presumably raw
        BLOBs (they are later decoded with ``np.frombuffer`` and
        ``int.from_bytes``) -- verify against the schema.

    Raises:
        sqlite3.Error: if the database cannot be opened or queried.
    """
    conn = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
    # try/finally guarantees the handle is released even when the query
    # fails; the original only closed the connection on the success path.
    try:
        cur = conn.cursor()
        cur.execute("""
        SELECT s.assigned_accountant, a.firm, s.feature_vector, s.dhash_vector
        FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
        WHERE s.assigned_accountant IS NOT NULL AND a.firm IS NOT NULL
          AND s.feature_vector IS NOT NULL AND s.dhash_vector IS NOT NULL""")
        return cur.fetchall()
    finally:
        conn.close()


def wilson(k, n, z=1.96):
    """Return the Wilson score interval for ``k`` successes in ``n`` trials.

    Args:
        k: number of successes.
        n: number of trials.
        z: normal quantile of the desired confidence level (1.96 ~ 95%).

    Returns:
        ``(lower, upper)`` clipped to ``[0.0, 1.0]``, or ``(None, None)``
        when ``n`` is zero and no proportion can be formed.
    """
    if n == 0:
        return (None, None)

    p = k / n
    denom = 1 + z*z/n
    centre = (p + z*z/(2*n)) / denom
    spread = z*np.sqrt(p*(1-p)/n + z*z/(4*n*n)) / denom

    lower = max(0.0, centre - spread)
    upper = min(1.0, centre + spread)
    return (lower, upper)


def iccr(rows, label):
    """Sample inter-accountant signature pairs and print/return ICCR rates.

    ``rows`` are ``(accountant, firm, feature_blob, dhash_blob)`` tuples.
    ``N_PAIRS`` pairs are drawn with a generator seeded by ``SEED`` (so
    every call starts from the same random stream): first two distinct
    accountants uniformly, then one signature uniformly from each. For
    every pair the feature dot product and the dHash Hamming distance are
    recorded, and the share of pairs satisfying ``cos>0.95``,
    ``dHash<=5`` and both together is printed under the heading ``label``.

    The dot product is used as the cosine similarity, which presumably
    relies on the stored feature vectors being L2-normalised -- TODO
    confirm. At least two distinct accountants are required, because the
    accountant pair is drawn without replacement.

    Returns:
        The joint rate (fraction of pairs with cos>0.95 and dHash<=5).
    """
    # Bucket each accountant's signatures, keeping first-seen order so the
    # index -> accountant mapping used by the sampler is deterministic.
    by = defaultdict(list)
    for acct, _firm, fv, dh in rows:
        by[acct].append((fv, dh))
    accts = list(by)

    # Decode the blobs once: an (n_signatures, dim) float32 matrix and a
    # parallel list of raw dHash byte strings per accountant.
    feats = {}
    dhs = {}
    for acct in accts:
        feats[acct] = np.stack(
            [np.frombuffer(fv_blob, dtype=np.float32) for fv_blob, _ in by[acct]]
        )
        dhs[acct] = [dh_blob for _, dh_blob in by[acct]]

    rng = np.random.default_rng(SEED)
    na = len(accts)
    cos = np.empty(N_PAIRS, np.float32)
    dv = np.empty(N_PAIRS, np.int32)

    # Draw order per iteration (accountant pair, then one signature index
    # for each side) must stay fixed to keep results reproducible.
    for t in range(N_PAIRS):
        picked = rng.choice(na, 2, replace=False)
        first, second = accts[picked[0]], accts[picked[1]]
        k1 = int(rng.integers(0, len(by[first])))
        k2 = int(rng.integers(0, len(by[second])))
        cos[t] = float(feats[first][k1] @ feats[second][k2])
        dv[t] = hamming(dhs[first][k1], dhs[second][k2])

    # Marginal and joint exceedance counts over the sampled pairs.
    cos_hit = cos > 0.95
    dh_hit = dv <= 5
    n = N_PAIRS
    m_cos = int(cos_hit.sum())
    m_dh = int(dh_hit.sum())
    joint = int((cos_hit & dh_hit).sum())
    jlo, jhi = wilson(joint, n)

    print(f'\n== {label} ==')
    print(f'  signatures={len(rows):,}  accountants={na}  pairs={n:,}')
    print(f'  cos>0.95         ICCR = {m_cos/n:.5f}  ({m_cos})')
    print(f'  dHash<=5         ICCR = {m_dh/n:.5f}  ({m_dh})')
    print(f'  JOINT (HC rule)  ICCR = {joint/n:.6f}  ({joint})  Wilson95% [{jlo:.6f},{jhi:.6f}]')
    return joint/n


# Build the three negative-anchor pools from one read of the database.
# Index 1 of every row is the firm name returned by load().
rows = load()

# All Big-4 signatures.
abcd = [row for row in rows if row[1] in BIG4]
# Big-4 without Firm A; filtering the Big-4 pool keeps the same rows in
# the same order as filtering the full row set on both conditions.
bcd = [row for row in abcd if row[1] != FIRM_A]
# Everything except Firm A, i.e. the remaining Big-4 plus non-Big-4 firms.
bcd_non = [row for row in rows if row[1] != FIRM_A]

# Each call reseeds its own generator, so the pools are compared under
# the same random stream.
iccr(abcd, 'ABCD (current paper baseline)')
iccr(bcd, 'BCD only (exclude Firm A)')
iccr(bcd_non, 'BCD + non-Big-4')