Paper A v4.1: BCD-baseline reframe + screening positioning + trim

- Re-anchor inter-CPA coincidence-rate (ICCR) calibration on a normative non-Firm-A baseline (Firms B/C/D); Firm A held out as an out-of-sample target. Locked canonical numbers (codex-audited; Scripts 46/52/53): per-comparison HC 0.00014->0.000018, per-signature HC 0.0116, per-document HC+MC 0.34->0.1905; KDE crossover 0.837 retained corpus-wide.
- Reposition as an operator-tunable, semi-automated screening/triage framework (title -> "Automated Screening..."): HC = high-specificity operating point; MC band demoted to low-specificity advisory; Firm A = demonstration that the screening surfaces a templated end, audit-quality implications deferred.
- Apply codex prose-review fixes: triage-neutral five-way labels, soften mechanism/specificity wording, supersede MC claim-strength, update stale Appendix script references (40b/43/45 -> 46/52/53).
- Trim pass: compress Sec. V discussion + Sec. III echoes (27.7k -> 26.8k words); no substantive content removed.
- Add analysis scripts 45-53 (firm-year trends; BCD-only ICCR recompute; canonical-sampler locked numbers; Firm-A out-of-sample; BCD regression + cross-firm hit matrix).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-04 19:35:10 +08:00
parent becce857e1
commit 3c7fcc010f
11 changed files with 1225 additions and 184 deletions
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
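+"""Script 48: full-fidelity BCD-only recompute of per-signature and
+per-document inter-CPA any-pair ICCR, plus corpus-style KDE crossover.
+
+"Full-fidelity" here means every signature in scope is iterated (no
+subsample of signatures); each signature's inter-CPA comparison pool is
+still drawn at random under a fixed seed. dHash distances use a vectorized
+popcount. Scopes: ABCD, BCD-only, BCD+non-Big-4 (the KDE crossover is run
+for the first two only). The database is opened read-only.
+"""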
+import sqlite3
+from collections import defaultdict
+import numpy as np
+from scipy.stats import gaussian_kde
+
+# SQLite database holding the signatures / accountants tables queried by load().
+DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
+# Firm name that the BCD scopes exclude; aliased to 'A' below.
+FIRM_A = '勤業眾信聯合'
+# The four firm names treated as "Big 4" by the scope predicates.
+BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合')
+# Anonymised single-letter labels used in printed output; firms missing from
+# this map are labelled 'X' by prep().
+ALIAS = {'勤業眾信聯合': 'A', '安侯建業聯合': 'B', '資誠聯合': 'C', '安永聯合': 'D'}
+# Seed passed to np.random.default_rng so every run draws the same samples.
+SEED = 42
+# 256-entry lookup table: POP[b] is the number of set bits in byte value b.
+POP = np.array([bin(i).count('1') for i in range(256)], dtype=np.uint8)
+
+
+def load():
+    """Fetch every usable signature row from the analysis database.
+
+    The database is opened through a ``mode=ro`` SQLite URI, so this script
+    cannot write to it.  Only signatures that have an assigned accountant, a
+    firm for that accountant, a feature vector and a dHash vector are
+    returned.
+
+    Returns:
+        list of ``(assigned_accountant, firm, source_pdf, feature_vector,
+        dhash_vector)`` tuples, exactly as produced by ``fetchall()``.
+
+    Raises:
+        sqlite3.Error: if the database cannot be opened or queried.
+    """
+    query = """
+        SELECT s.assigned_accountant, a.firm, s.source_pdf,
+               s.feature_vector, s.dhash_vector
+        FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
+        WHERE s.assigned_accountant IS NOT NULL AND a.firm IS NOT NULL
+          AND s.feature_vector IS NOT NULL AND s.dhash_vector IS NOT NULL"""
+    conn = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
+    try:
+        cur = conn.cursor()
+        cur.execute(query)
+        return cur.fetchall()
+    finally:
+        # Close the handle even when the query raises, so a failed run does
+        # not leave the database file open.
+        conn.close()
+
+
+def wilson(k, n, z=1.96):
+    """Return the Wilson score interval for ``k`` successes out of ``n`` trials.
+
+    Args:
+        k: number of successes.
+        n: number of trials.
+        z: normal quantile for the desired confidence level (1.96 by default).
+
+    Returns:
+        ``(lower, upper)`` clipped to ``[0, 1]``, or ``(None, None)`` when
+        ``n`` is zero and no interval can be formed.
+    """
+    if n == 0:
+        return (None, None)
+    z_sq = z*z
+    rate = k/n
+    denom = 1+z_sq/n
+    centre = (rate+z_sq/(2*n))/denom
+    half_width = z*np.sqrt(rate*(1-rate)/n+z_sq/(4*n*n))/denom
+    lower = max(0.0, centre-half_width)
+    upper = min(1.0, centre+half_width)
+    return (lower, upper)
+
+
+def prep(rows, keep_fn):
+    """Filter rows by firm and unpack them into aligned NumPy arrays.
+
+    Args:
+        rows: tuples laid out as ``(accountant, firm, source_pdf,
+            feature_blob, dhash_blob)``.
+        keep_fn: predicate called with each row's raw firm name; rows for
+            which it is falsy are dropped.
+
+    Returns:
+        ``(feats, dh, cpas, firms, docs)`` where ``feats`` holds the float32
+        feature vectors scaled to unit L2 norm, ``dh`` the dHash bytes,
+        ``cpas`` the accountant names, ``firms`` the aliased firm letters
+        (``'X'`` for any firm without an alias) and ``docs`` the source PDFs.
+    """
+    selected = [row for row in rows if keep_fn(row[1])]
+
+    raw_feats = [np.frombuffer(row[3], np.float32) for row in selected]
+    feats = np.stack(raw_feats).astype(np.float32)
+    # Clip the norms away from zero so an all-zero vector cannot divide by 0.
+    norms = np.linalg.norm(feats, axis=1, keepdims=True)
+    feats /= np.clip(norms, 1e-9, None)
+
+    # The original note gives the expected shape as (n, 8); not validated here.
+    dh = np.stack([np.frombuffer(row[4], np.uint8) for row in selected])
+
+    cpas = np.array([row[0] for row in selected])
+    firms = np.array([ALIAS.get(row[1], 'X') for row in selected])
+    docs = np.array([row[2] for row in selected])
+    return feats, dh, cpas, firms, docs
+
+
+def crossover(feats, cpas, label):
+    """Print where the intra-CPA and inter-CPA similarity densities cross.
+
+    Samples 100,000 same-accountant pairs and 100,000 different-accountant
+    pairs (seeded by ``SEED``), fits a Gaussian KDE to each set of dot
+    products, and reports the grid points in (0.6, 0.99) at which the sign of
+    ``KDE(intra) - KDE(inter)`` changes, along with both sample means.
+
+    Args:
+        feats: per-signature feature matrix; dot products are used as the
+            similarity (presumably unit-normalised rows, so cosine).
+        cpas: accountant label for each row of ``feats``.
+        label: scope name shown in the printed line.
+
+    Returns:
+        None; the result is only printed.
+    """
+    groups = defaultdict(list)
+    for pos, cpa in enumerate(cpas):
+        groups[cpa].append(pos)
+    # An intra-CPA pair needs two distinct signatures, so accountants with a
+    # single signature are dropped (from the inter-CPA sampling as well).
+    groups = {cpa: np.array(members)
+              for cpa, members in groups.items() if len(members) >= 2}
+    names = list(groups)
+    n_names = len(names)
+
+    rng = np.random.default_rng(SEED)
+    n_draws = 100_000
+    intra = np.empty(n_draws, np.float32)
+    inter = np.empty(n_draws, np.float32)
+    picks = rng.integers(0, n_names, n_draws)
+    # NOTE: the order of the rng calls below fixes the sampled pairs for a
+    # given seed; keep it unchanged to reproduce earlier output.
+    for t, pick in enumerate(picks):
+        own = groups[names[pick]]
+        a, b = rng.choice(own, 2, replace=False)
+        intra[t] = feats[a] @ feats[b]
+        i, j = rng.choice(n_names, 2, replace=False)
+        left = feats[rng.choice(groups[names[i]])]
+        right = feats[rng.choice(groups[names[j]])]
+        inter[t] = left @ right
+
+    grid = np.linspace(0.3, 1.0, 10000)
+    gap = gaussian_kde(intra)(grid) - gaussian_kde(inter)(grid)
+    # Indices where consecutive grid points differ in sign of the gap.
+    flips = np.where(np.diff(np.sign(gap)))[0]
+    cross = [float(x) for x in grid[flips] if 0.6 < x < 0.99]
+    print(f'  [{label}] crossover {[f"{x:.4f}" for x in cross]}  '
+          f'(intra {intra.mean():.4f} / inter {inter.mean():.4f})')
+
+
+def pool_sim(feats, dh, cpas, firms, docs, label):
+    """Simulate inter-CPA coincidences and print per-signature/per-document rates.
+
+    For every signature a random pool of signatures from *other* accountants
+    is drawn (seeded by ``SEED``); the pool size is that accountant's own
+    signature count minus one.  A signature is flagged HC when some pool
+    member has dot-product similarity > 0.95 and dHash Hamming distance <= 5.
+    A document is flagged HC+MC when any of its signatures has a pool member
+    with similarity > 0.95 and distance <= 15.  Rates are printed overall and
+    per firm, with Wilson intervals for the overall figures.
+
+    Args:
+        feats: per-signature feature matrix (presumably unit-normalised rows,
+            so the dot product is a cosine similarity).
+        dh: per-signature dHash bytes, one row per signature.
+        cpas: accountant label per signature.
+        firms: firm label per signature.
+        docs: source document per signature.
+        label: scope name shown in the printed lines.
+
+    Returns:
+        None; results are only printed.
+    """
+    n = len(cpas)
+    if n == 0:
+        # Every rate below divides by n; report and stop instead of raising
+        # ZeroDivisionError on an empty scope.
+        print(f'\n  [{label}] no signatures in scope; nothing to compute')
+        return
+
+    cpa_idx = defaultdict(list)
+    for i, c in enumerate(cpas):
+        cpa_idx[c].append(i)
+    cpa_idx = {c: np.array(v) for c, v in cpa_idx.items()}
+    pool_size = {c: len(v)-1 for c, v in cpa_idx.items()}
+
+    rng = np.random.default_rng(SEED)
+    sig_hc = np.zeros(n, bool)
+    # Seed every document as "not flagged" so documents whose signatures are
+    # skipped or never match still count in the per-document denominator.
+    doc_hcmc = {d: False for d in docs}
+    for si in range(n):
+        c = cpas[si]
+        npool = pool_size[c]
+        if npool <= 0:
+            # Accountant has a single signature: empty pool, nothing to test.
+            continue
+        same = cpa_idx[c]
+        # Draw with replacement from the whole scope, over-drawing by
+        # same.size + 20 to make up for draws that hit this accountant's own
+        # signatures and are discarded.
+        # NOTE(review): if more draws are discarded than that margin, the
+        # pool ends up shorter than npool.
+        draw = rng.integers(0, n, size=npool + same.size + 20)
+        cand = draw[~np.isin(draw, same)][:npool]
+        cosv = feats[cand] @ feats[si]
+        cg = cosv > 0.95
+        if not cg.any():
+            continue
+        # Byte-wise XOR then per-byte popcount, summed per row: the Hamming
+        # distance between the two dHash vectors.
+        dist = POP[dh[cand] ^ dh[si]].sum(axis=1)
+        sig_hc[si] = bool((cg & (dist <= 5)).any())
+        if (cg & (dist <= 15)).any():
+            doc_hcmc[docs[si]] = True
+
+    k = int(sig_hc.sum())
+    lo, hi = wilson(k, n)
+    print(f'\n  [{label}] per-SIGNATURE any-pair HC (cos>0.95 & dh<=5): '
+          f'{k/n:.4f} ({k}/{n}) Wilson95% [{lo:.4f},{hi:.4f}]')
+    for f in sorted(set(firms)):
+        m = firms == f
+        print(f'      Firm {f}: {sig_hc[m].sum()/m.sum():.4f} ({int(sig_hc[m].sum())}/{int(m.sum())})')
+
+    # Per-document: a document is attributed to the firm of its first signature.
+    dfirm = {}
+    for i, d in enumerate(docs):
+        dfirm.setdefault(d, firms[i])
+    dl = list(doc_hcmc.keys())
+    dv = np.array([doc_hcmc[d] for d in dl])
+    df = np.array([dfirm[d] for d in dl])
+    dk = int(dv.sum())
+    dm = len(dv)
+    dlo, dhi = wilson(dk, dm)
+    print(f'  [{label}] per-DOCUMENT HC+MC (cos>0.95 & dh<=15): '
+          f'{dk/dm:.4f} ({dk}/{dm}) Wilson95% [{dlo:.4f},{dhi:.4f}]')
+    for f in sorted(set(df)):
+        m = df == f
+        print(f'      Firm {f}: {dv[m].sum()/m.sum():.4f} ({int(dv[m].sum())}/{int(m.sum())})')
+
+
+# Load the signature rows once; every scope below filters this same list.
+rows = load()
+# (scope name, predicate on the raw firm name) pairs handed to prep():
+#   ABCD        - the four BIG4 firms
+#   BCD-only    - BIG4 firms other than FIRM_A
+#   BCD+nonBig4 - every firm except FIRM_A
+SCOPES = [('ABCD', lambda fm: fm in BIG4),
+          ('BCD-only', lambda fm: fm in BIG4 and fm != FIRM_A),
+          ('BCD+nonBig4', lambda fm: fm != FIRM_A)]
+
+print('=== KDE crossover ===')
+# The crossover is reported for the first two scopes only.
+for name, fn in SCOPES[:2]:
+    f, _, c, _, _ = prep(rows, fn)
+    crossover(f, c, name)
+
+print('\n=== per-signature & per-document inter-CPA ICCR (full) ===')
+# prep() is re-run per scope, so each simulation sees only that scope's rows.
+for name, fn in SCOPES:
+    f, dh, c, fm, dc = prep(rows, fn)
+    pool_sim(f, dh, c, fm, dc, name)