Paper A v4.1: BCD-baseline reframe + screening positioning + trim
- Re-anchor inter-CPA coincidence-rate (ICCR) calibration on a normative non-Firm-A baseline (Firms B/C/D); Firm A held out as an out-of-sample target. Locked canonical numbers (codex-audited; Scripts 46/52/53): per-comparison HC 0.00014->0.000018, per-signature HC 0.0116, per-document HC+MC 0.34->0.1905; KDE crossover 0.837 retained corpus-wide. - Reposition as an operator-tunable, semi-automated screening/triage framework (title -> "Automated Screening..."): HC = high-specificity operating point; MC band demoted to low-specificity advisory; Firm A = demonstration that the screening surfaces a templated end, audit-quality implications deferred. - Apply codex prose-review fixes: triage-neutral five-way labels, soften mechanism/specificity wording, supersede MC claim-strength, update stale Appendix script references (40b/43/45 -> 46/52/53). - Trim pass: compress Sec. V discussion + Sec. III echoes (27.7k -> 26.8k words); no substantive content removed. - Add analysis scripts 45-53 (firm-year trends; BCD-only ICCR recompute; canonical-sampler locked numbers; Firm-A out-of-sample; BCD regression + cross-firm hit matrix). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,104 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Script 49: Firm A as out-of-sample target against a clean BCD baseline.
|
||||
(1) A signatures scored against a BCD-only candidate pool (true out-of-sample
|
||||
inter-firm coincidence).
|
||||
(2) Observed deployed rate on ACTUAL same-CPA pools, per firm (the real fired
|
||||
rate, from precomputed deployed descriptors), to juxtapose against the
|
||||
clean BCD inter-CPA coincidence floor. Read-only.
|
||||
"""
|
||||
import sqlite3
|
||||
from collections import defaultdict
|
||||
import numpy as np
|
||||
|
||||
# Path of the SQLite database that the script opens read-only.
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'

# Firm treated as the held-out ("A") target; the other three form the BCD pool.
FIRM_A = '勤業眾信聯合'

# Firm name (as matched against accountants.firm) -> anonymised one-letter label.
ALIAS = {'勤業眾信聯合': 'A', '安侯建業聯合': 'B', '資誠聯合': 'C', '安永聯合': 'D'}

# The four firm names, in A/B/C/D order; bound to the four SQL placeholders.
# Derived from ALIAS so the firm list is written down exactly once.
BIG4 = tuple(ALIAS)

# Seed for the candidate-sampling RNG, so sampled pools are reproducible.
SEED = 42

# POP[b] = number of set bits in byte value b; lookup table used to turn an
# XOR of two dHash byte arrays into a Hamming distance.
POP = np.array([bin(i).count('1') for i in range(256)], dtype=np.uint8)
|
||||
|
||||
|
||||
def wilson(k, n, z=1.96):
    """Return the Wilson score interval for ``k`` successes out of ``n`` trials.

    Args:
        k: Number of successes.
        n: Number of trials.
        z: Normal quantile that sets the interval width (default 1.96).

    Returns:
        A ``(lower, upper)`` tuple clamped to ``[0.0, 1.0]``, or
        ``(None, None)`` when ``n`` is 0 and no interval is defined.
    """
    if n == 0:
        return (None, None)
    p = k / n
    d = 1 + z * z / n                      # shared denominator 1 + z^2/n
    c = (p + z * z / (2 * n)) / d          # interval centre
    h = z * np.sqrt(p * (1 - p) / n + z * z / (4 * n * n)) / d  # half-width
    return (max(0.0, c - h), min(1.0, c + h))
|
||||
|
||||
|
||||
# Open the database read-only (SQLite URI with mode=ro) and load every
# signature row of the four firms that has both descriptors present.
# Row layout: r[0]=assigned_accountant, r[1]=firm, r[2]=source_pdf,
# r[3]=feature_vector, r[4]=dhash_vector,
# r[5]=max_similarity_to_same_accountant, r[6]=min_dhash_independent.
conn = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
try:
    cur = conn.cursor()
    cur.execute("""
        SELECT s.assigned_accountant, a.firm, s.source_pdf, s.feature_vector,
               s.dhash_vector, s.max_similarity_to_same_accountant, s.min_dhash_independent
        FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
        WHERE s.assigned_accountant IS NOT NULL AND a.firm IN (?,?,?,?)
        AND s.feature_vector IS NOT NULL AND s.dhash_vector IS NOT NULL""", BIG4)
    rows = cur.fetchall()
finally:
    # Release the connection even when the query fails.
    conn.close()
|
||||
|
||||
# ---- (1) Firm A source vs BCD-only candidate pool ----
# Each Firm A signature is compared against candidates drawn at random from
# the non-A (BCD) signatures, then summarised as a per-signature HC rate and
# a per-document HC+MC rate with Wilson intervals.
print('=== (1) Firm A out-of-sample vs clean BCD candidate pool ===')
A = [r for r in rows if r[1] == FIRM_A]
BCD = [r for r in rows if r[1] in BIG4 and r[1] != FIRM_A]
if not A or not BCD:
    # Without this guard an empty set fails later with an opaque error
    # (np.stack on an empty list, rng.integers(0, 0), or division by zero).
    raise SystemExit(f'need Firm A and BCD signatures; got A={len(A)}, BCD={len(BCD)}')

# BCD feature blobs decoded as float32 and L2-normalised row-wise, so a dot
# product with a normalised query vector is the cosine similarity.
bcd_feat = np.stack([np.frombuffer(r[3], np.float32) for r in BCD]).astype(np.float32)
bcd_feat /= np.clip(np.linalg.norm(bcd_feat, axis=1, keepdims=True), 1e-9, None)
# BCD dHash blobs decoded as raw bytes, one row per signature.
bcd_dh = np.stack([np.frombuffer(r[4], np.uint8) for r in BCD])
nb = len(BCD)

# A CPA pool sizes (their own same-CPA count - 1), to match negative-anchor construction
a_cpa_idx = defaultdict(list)
for i, r in enumerate(A):
    a_cpa_idx[r[0]].append(i)
pool_size = {c: len(v) - 1 for c, v in a_cpa_idx.items()}

rng = np.random.default_rng(SEED)
sig_hc = np.zeros(len(A), bool)   # per-signature HC flag
doc_hcmc = {}                     # source_pdf -> any signature hit HC+MC
for i, r in enumerate(A):
    # At least one candidate, even for a CPA with a single signature.
    npool = max(pool_size[r[0]], 1)
    # Uniform draw of BCD row indices; the same index may be drawn twice.
    cand = rng.integers(0, nb, size=npool)
    # astype() copies, which makes the read-only frombuffer view writable.
    sf = np.frombuffer(r[3], np.float32).astype(np.float32)
    sf /= max(np.linalg.norm(sf), 1e-9)
    cosv = bcd_feat[cand] @ sf
    cg = cosv > 0.95
    # Register the document even when none of its signatures fires.
    doc_hcmc.setdefault(r[2], False)
    if cg.any():
        # Hamming distance: XOR the dHash bytes, then sum per-byte popcounts.
        dist = POP[bcd_dh[cand] ^ np.frombuffer(r[4], np.uint8)].sum(axis=1)
        sig_hc[i] = bool((cg & (dist <= 5)).any())
        if (cg & (dist <= 15)).any():
            doc_hcmc[r[2]] = True

k = int(sig_hc.sum())
n = len(A)
lo, hi = wilson(k, n)
print(f' A-source vs BCD-pool per-SIGNATURE HC (cos>0.95 & dh<=5): '
      f'{k/n:.4f} ({k}/{n}) Wilson95% [{lo:.4f},{hi:.4f}]')
dv = np.array(list(doc_hcmc.values()))
dk = int(dv.sum())
dm = len(dv)
dlo, dhi = wilson(dk, dm)
print(f' A-source vs BCD-pool per-DOCUMENT HC+MC (cos>0.95 & dh<=15): '
      f'{dk/dm:.4f} ({dk}/{dm}) Wilson95% [{dlo:.4f},{dhi:.4f}]')
|
||||
|
||||
# ---- (2) Observed deployed rate on ACTUAL same-CPA pools, per firm ----
# Uses the stored per-signature descriptors (r[5] = max same-accountant
# similarity, r[6] = min independent dHash distance); nothing is recomputed.
print('\n=== (2) Observed deployed rate on actual same-CPA pools (real fired rate) ===')
print(' per-signature HC = max_sim>0.95 & min_dh<=5 ; per-doc HC+MC worst-case dh<=15')
by_firm_sig = defaultdict(lambda: [0, 0])   # firm label -> [HC hits, signatures]
doc_obs = {}    # source_pdf -> any signature in the document hit HC+MC
doc_firm = {}   # source_pdf -> firm label of the first signature seen for it
for r in rows:
    fm = ALIAS[r[1]]
    ms, md = r[5], r[6]
    if ms is None or md is None:
        # Rows missing either stored descriptor are left out of both tallies.
        continue
    hc = (ms > 0.95) and (md <= 5)
    hcmc = (ms > 0.95) and (md <= 15)
    by_firm_sig[fm][0] += int(hc)
    by_firm_sig[fm][1] += 1
    doc_firm.setdefault(r[2], fm)
    doc_obs[r[2]] = doc_obs.get(r[2], False) or hcmc

for fm in sorted(by_firm_sig):
    k, n = by_firm_sig[fm]
    lo, hi = wilson(k, n)
    print(f' Firm {fm} per-SIGNATURE HC: {k/n:.4f} ({k}/{n}) [{lo:.4f},{hi:.4f}]')

dd = defaultdict(lambda: [0, 0])            # firm label -> [doc hits, documents]
for d, hit in doc_obs.items():
    fm = doc_firm[d]
    dd[fm][0] += int(hit)
    dd[fm][1] += 1
for fm in sorted(dd):
    k, n = dd[fm]
    print(f' Firm {fm} per-DOCUMENT HC+MC: {k/n:.4f} ({k}/{n})')

# NOTE(review): the floor values below are hard-coded literals, not computed
# in this script -- confirm they match the current BCD baseline run.
print('\n Clean BCD inter-CPA coincidence FLOOR: per-sig HC=0.0048, per-doc HC+MC=0.1281')
|
||||
Reference in New Issue
Block a user