3c7fcc010f
- Re-anchor inter-CPA coincidence-rate (ICCR) calibration on a normative non-Firm-A baseline (Firms B/C/D); Firm A held out as an out-of-sample target. Locked canonical numbers (codex-audited; Scripts 46/52/53): per-comparison HC 0.00014->0.000018, per-signature HC 0.0116, per-document HC+MC 0.34->0.1905; KDE crossover 0.837 retained corpus-wide. - Reposition as an operator-tunable, semi-automated screening/triage framework (title -> "Automated Screening..."): HC = high-specificity operating point; MC band demoted to low-specificity advisory; Firm A = demonstration that the screening surfaces a templated end, audit-quality implications deferred. - Apply codex prose-review fixes: triage-neutral five-way labels, soften mechanism/specificity wording, supersede MC claim-strength, update stale Appendix script references (40b/43/45 -> 46/52/53). - Trim pass: compress Sec. V discussion + Sec. III echoes (27.7k -> 26.8k words); no substantive content removed. - Add analysis scripts 45-53 (firm-year trends; BCD-only ICCR recompute; canonical-sampler locked numbers; Firm-A out-of-sample; BCD regression + cross-firm hit matrix). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
88 lines
3.0 KiB
Python
88 lines
3.0 KiB
Python
#!/usr/bin/env python3
|
|
"""Script 46: BCD-only (exclude Firm A) per-comparison ICCR recompute.
|
|
|
|
Replicates 40b's inter-CPA negative-anchor pair sampling (N=500k, seed=42)
|
|
but compares three negative-anchor pool compositions:
|
|
- ABCD : all Big-4 (current paper baseline)
|
|
- BCD : Big-4 excluding Firm A (normative-baseline proposal)
|
|
- BCD+nonB4 : BCD plus all non-Big-4 firms
|
|
Reports marginal cos>0.95, dHash<=5, and the joint HC rule cos>0.95 & dHash<=5.
|
|
Read-only.
|
|
"""
|
|
import sqlite3
|
|
from collections import defaultdict
|
|
import numpy as np
|
|
|
|
# SQLite database that load() opens read-only.
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'

# Number of inter-CPA pairs drawn per pool, and the RNG seed used for the draw.
N_PAIRS, SEED = 500_000, 42

# Firm A: the Big-4 firm that the BCD-style pools leave out.
FIRM_A = '勤業眾信聯合'

# The four Big-4 firm names as compared against accountants.firm; Firm A first.
BIG4 = (FIRM_A, '安侯建業聯合', '資誠聯合', '安永聯合')
|
|
|
|
|
|
def hamming(a, b):
|
|
return (int.from_bytes(a, 'big') ^ int.from_bytes(b, 'big')).bit_count()
|
|
|
|
|
|
def load():
    """Fetch every usable signature row from the analysis database.

    Opens ``DB`` through a SQLite URI with ``mode=ro`` so the file cannot be
    modified, joins ``signatures`` to ``accountants`` on the accountant name,
    and keeps only rows whose assigned accountant, firm, feature vector and
    dHash vector are all non-NULL.

    Returns:
        A list of ``(assigned_accountant, firm, feature_vector, dhash_vector)``
        tuples as produced by ``cursor.fetchall()``.

    Raises:
        sqlite3.Error: if the database cannot be opened or the query fails.
    """
    conn = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
    try:
        cur = conn.cursor()
        # NOTE(review): there is no ORDER BY, so the row order is whatever
        # SQLite happens to return. iccr() indexes accountants by first
        # appearance in this list, so its seeded sampling is reproducible only
        # while that order stays the same.
        cur.execute("""
            SELECT s.assigned_accountant, a.firm, s.feature_vector, s.dhash_vector
            FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
            WHERE s.assigned_accountant IS NOT NULL AND a.firm IS NOT NULL
            AND s.feature_vector IS NOT NULL AND s.dhash_vector IS NOT NULL""")
        return cur.fetchall()
    finally:
        # Release the handle even when execute()/fetchall() raises; the
        # original only closed the connection on the success path.
        conn.close()
|
|
|
|
|
|
def wilson(k, n, z=1.96):
    """Wilson score interval for the binomial proportion ``k / n``.

    Args:
        k: number of successes.
        n: number of trials.
        z: normal quantile for the desired confidence level (the default
            1.96 corresponds to a two-sided 95% interval).

    Returns:
        ``(lower, upper)`` clipped to ``[0.0, 1.0]``, or ``(None, None)``
        when ``n`` is zero and no proportion can be formed.
    """
    if n == 0:
        return (None, None)

    z_sq = z*z
    p_hat = k / n
    denom = 1 + z_sq/n
    centre = (p_hat + z_sq/(2*n)) / denom
    half_width = z*np.sqrt(p_hat*(1-p_hat)/n + z_sq/(4*n*n)) / denom

    lower = max(0.0, centre - half_width)
    upper = min(1.0, centre + half_width)
    return (lower, upper)
|
|
|
|
|
|
def iccr(rows, label):
    """Estimate and print inter-CPA coincidence rates for one pool of rows.

    ``N_PAIRS`` pairs are drawn with a generator seeded by ``SEED``: each draw
    picks two distinct accountants uniformly (so accountants, not signatures,
    are equally weighted) and then one signature uniformly from each. For
    every pair the dot product of the two float32 feature vectors and the
    Hamming distance of the two dHash blobs are recorded.

    NOTE(review): the dot product is reported as "cos"; that is a cosine
    similarity only if the stored feature vectors are L2-normalised - confirm.

    Args:
        rows: ``(accountant, firm, feature_blob, dhash_blob)`` tuples.
        label: heading printed above this pool's results.

    Returns:
        The joint rate: fraction of pairs with ``cos > 0.95`` and
        ``dHash <= 5``.
    """
    # Group blobs per accountant. Dict insertion order (first appearance in
    # ``rows``) defines the accountant index the sampler draws from.
    grouped = defaultdict(list)
    for acct, _firm, fv, dh in rows:
        grouped[acct].append((fv, dh))
    accts = list(grouped)

    feats = {}
    dhs = {}
    for acct in accts:
        feats[acct] = np.stack(
            [np.frombuffer(fv, dtype=np.float32) for fv, _ in grouped[acct]]
        )
        dhs[acct] = [dh for _, dh in grouped[acct]]

    na = len(accts)
    n = N_PAIRS
    rng = np.random.default_rng(SEED)
    cos = np.empty(n, np.float32)
    dv = np.empty(n, np.int32)

    for t in range(n):
        # The order of RNG calls (choice, integers, integers) determines the
        # sampled pairs for a given seed; do not reorder.
        i, j = rng.choice(na, 2, replace=False)
        a1, a2 = accts[i], accts[j]
        k1 = int(rng.integers(0, len(dhs[a1])))
        k2 = int(rng.integers(0, len(dhs[a2])))
        cos[t] = float(feats[a1][k1] @ feats[a2][k2])
        dv[t] = hamming(dhs[a1][k1], dhs[a2][k2])

    # Marginal and joint hit masks for the HC rule thresholds.
    cos_hit = cos > 0.95
    dh_hit = dv <= 5
    m_cos = int(cos_hit.sum())
    m_dh = int(dh_hit.sum())
    joint = int((cos_hit & dh_hit).sum())
    jlo, jhi = wilson(joint, n)

    print(f'\n== {label} ==')
    print(f' signatures={len(rows):,} accountants={na} pairs={n:,}')
    print(f' cos>0.95 ICCR = {m_cos/n:.5f} ({m_cos})')
    print(f' dHash<=5 ICCR = {m_dh/n:.5f} ({m_dh})')
    print(f' JOINT (HC rule) ICCR = {joint/n:.6f} ({joint}) Wilson95% [{jlo:.6f},{jhi:.6f}]')
    return joint/n
|
|
|
|
|
|
# Load once, then carve out the three negative-anchor pools by firm (row[1]).
rows = load()

# ABCD: every Big-4 signature.
abcd = [row for row in rows if row[1] in BIG4]
# BCD: the Big-4 pool with Firm A removed (same rows, same order as filtering
# ``rows`` directly).
bcd = [row for row in abcd if row[1] != FIRM_A]
# BCD + non-Big-4: everything except Firm A.
bcd_non = [row for row in rows if row[1] != FIRM_A]

iccr(abcd, 'ABCD (current paper baseline)')
iccr(bcd, 'BCD only (exclude Firm A)')
iccr(bcd_non, 'BCD + non-Big-4')
|