Paper A v4.2: re-anchor primary calibration to clean BCD 2013-2019 baseline
- Restrict the calibration negative anchor to Firms B/C/D, fiscal years 2013-2019 (pre-electronic-signature hand-signing period); B/C/D adopted e-signing post-2020 at staggered times, so 2013-2019 is the construct-clean baseline. Firm A scored across its full 2013-2023 record against it. - New locked numbers (codex-audited, Scripts 54/55): per-comparison HC floor 0.000010; per-signature HC floor 0.0059 [boot 0.0045-0.0073]; per-document HC 0.0117 / HC+MC 0.1753; per-firm HC+MC B 0.162 / C 0.225 / D 0.089. Firm A observed 0.817 = ~139x the clean floor (was ~70x on all-period BCD); Firm A out-of-sample vs clean pool 0.0001 (below floor -> never resembles genuine hand-signing). BCD 2020+ robustness: per-sig 0.0105, per-comparison 0.000036 (~2x pre-2020) quantifies the e-signing contamination. - Propagated through abstract / Sec. I / III-L / IV-M / V / conclusion; 0.837 crossover kept corpus-wide; ABCD retained as contamination comparison. - Grounded the 2013-2019 choice on data (floor drift) + e-sign-adoption background, not on in-text interview claims (double-blind). - Add Scripts 54 (temporal floor stability) and 55 (BCD 2013-2019 primary calibration + Firm A scoring). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,99 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Script 54: temporal stability of the BCD inter-CPA floor.
|
||||
Does the normative BCD per-comparison HC coincidence floor drift over time /
|
||||
get contaminated by post-2020 e-signing? Compares eras full / 2013-2019 /
|
||||
2020-2023 using the pool-size-independent per-comparison joint HC ICCR
|
||||
(cos>0.95 & dHash<=5) on BCD inter-CPA pairs (N=500k, seed 42), plus the
|
||||
observed deployed per-signature HC rate by firm by era. Read-only.
|
||||
"""
|
||||
import sqlite3
|
||||
from collections import defaultdict
|
||||
import numpy as np
|
||||
|
||||
# Source database; opened read-only further down.
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'

# Firm name as stored in the database -> anonymised letter used in output.
ALIAS = {
    '勤業眾信聯合': 'A',
    '安侯建業聯合': 'B',
    '資誠聯合': 'C',
    '安永聯合': 'D',
}
# The four firms in A, B, C, D order (dicts keep insertion order).
BIG4 = tuple(ALIAS)
# Firm A is the scored firm; the remaining three form the BCD baseline.
FIRM_A = BIG4[0]

# Seed for the pair sampler and number of inter-CPA pairs drawn per era.
SEED = 42
N_PAIRS = 500_000

# Lookup table: byte value -> number of set bits, used for Hamming distances.
POP = np.fromiter(
    (format(byte, 'b').count('1') for byte in range(256)),
    dtype=np.uint8,
    count=256,
)
|
||||
|
||||
|
||||
def wilson(k, n, z=1.96):
    """Return the Wilson score interval for k successes out of n trials.

    The result is a ``(lower, upper)`` pair clipped to [0, 1]. When ``n`` is
    zero no interval exists and ``(None, None)`` is returned. ``z`` is the
    normal quantile (the default 1.96 corresponds to a ~95% interval).
    """
    if n == 0:
        return (None, None)
    phat = k/n
    denom = 1+z*z/n
    centre = (phat+z*z/(2*n))/denom
    half_width = z*np.sqrt(phat*(1-phat)/n+z*z/(4*n*n))/denom
    lower = max(0.0, centre-half_width)
    upper = min(1.0, centre+half_width)
    return (lower, upper)
|
||||
|
||||
|
||||
# Load every signature of the four firms that has a year, a feature vector
# and a dHash.  The database is opened through a read-only URI.
# Row layout used by the rest of the script:
#   r[0] CPA name, r[1] firm name, r[2] year (int), r[3] feature-vector blob,
#   r[4] dHash blob, r[5] max similarity to same accountant,
#   r[6] independent minimum dHash distance.
conn = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
try:
    cur = conn.cursor()
    cur.execute("""
        SELECT s.assigned_accountant, a.firm, CAST(substr(s.year_month,1,4) AS INT),
               s.feature_vector, s.dhash_vector,
               s.max_similarity_to_same_accountant, s.min_dhash_independent
        FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
        WHERE a.firm IN (?,?,?,?) AND s.year_month IS NOT NULL
          AND s.feature_vector IS NOT NULL AND s.dhash_vector IS NOT NULL""", BIG4)
    rows = cur.fetchall()
finally:
    # Release the connection even when the query or the fetch fails.
    conn.close()
|
||||
|
||||
def _in_years(first, last):
    """Build a predicate that is true for years in [first, last]."""
    return lambda y: first <= y <= last


# Era label -> predicate on the integer year.  The 'full' era accepts every
# year present in the data; the other two are closed year ranges.
ERAS = {
    'full 2013-2023': lambda y: True,
    '2013-2019 (pre-drift)': _in_years(2013, 2019),
    '2020-2023': _in_years(2020, 2023),
}
|
||||
|
||||
|
||||
def per_comparison_floor(era_fn, label):
    """Estimate the BCD inter-CPA per-comparison HC coincidence rate for an era.

    Rows of every firm except Firm A whose year satisfies ``era_fn`` form the
    pool.  ``N_PAIRS`` pairs are drawn, each made of two distinct CPAs and one
    uniformly chosen signature from each; a pair counts as a hit when its
    cosine exceeds 0.95 and its dHash Hamming distance is at most 5.  The rate
    is printed with a Wilson 95% interval (tagged with ``label``) and returned.
    """
    # Baseline pool: non-Firm-A rows inside the requested era.
    pool = [rec for rec in rows if rec[1] != FIRM_A and era_fn(rec[2])]

    # Unit-normalise the feature vectors so a dot product is the cosine.
    vecs = np.stack([np.frombuffer(rec[3], np.float32) for rec in pool]).astype(np.float32)
    vecs /= np.clip(np.linalg.norm(vecs, axis=1, keepdims=True), 1e-9, None)
    hashes = np.stack([np.frombuffer(rec[4], np.uint8) for rec in pool])

    # CPA -> indices of that CPA's signatures within the pool.
    owners = np.array([rec[0] for rec in pool])
    members = defaultdict(list)
    for idx, owner in enumerate(owners):
        members[owner].append(idx)
    cpa_names = list(members.keys())
    n_cpa = len(cpa_names)

    rng = np.random.default_rng(SEED)
    cos_sim = np.empty(N_PAIRS, np.float32)
    ham = np.empty(N_PAIRS, np.int32)
    for t in range(N_PAIRS):
        # Two different CPAs, then one signature from each (in this order).
        first, second = rng.choice(n_cpa, 2, replace=False)
        left_pool = members[cpa_names[first]]
        right_pool = members[cpa_names[second]]
        left = left_pool[int(rng.integers(0, len(left_pool)))]
        right = right_pool[int(rng.integers(0, len(right_pool)))]
        cos_sim[t] = vecs[left] @ vecs[right]
        # Hamming distance = popcount of the XOR of the two hash byte arrays.
        ham[t] = POP[hashes[left] ^ hashes[right]].sum()

    hits = int(np.count_nonzero((cos_sim > 0.95) & (ham <= 5)))
    rate = hits/N_PAIRS
    lo, hi = wilson(hits, N_PAIRS)
    print(f' [{label}] BCD per-comparison HC floor = {rate:.6f} '
          f'({hits}/{N_PAIRS}) Wilson95% [{lo:.6f},{hi:.6f}] '
          f'(n_sig={len(pool):,}, CPAs={n_cpa})')
    return rate
|
||||
|
||||
|
||||
def _observed_hc(firm, in_era):
    """Count (HC hits, scored signatures) for one firm within one era.

    Only rows carrying both stored pool statistics are scored; a hit has
    stored max similarity > 0.95 and stored minimum dHash <= 5.
    """
    scored = [r for r in rows if r[1] == firm and in_era(r[2])
              and r[5] is not None and r[6] is not None]
    hits = sum(1 for r in scored if r[5] > 0.95 and r[6] <= 5)
    return hits, len(scored)


print('=== (1) BCD per-comparison HC floor by era (pool-size-independent) ===')
floors = {}
for era_label, era_pred in ERAS.items():
    floors[era_label] = per_comparison_floor(era_pred, era_label)

print('\n=== (2) Observed deployed per-signature HC rate by firm by era ===')
print(' (max_sim>0.95 & min_dh<=5 on actual same-CPA pools)')
for era_label, era_pred in ERAS.items():
    print(f' --- {era_label} ---')
    for firm_name in BIG4:
        hits, total = _observed_hc(firm_name, era_pred)
        if total == 0:
            # Nothing scored for this firm in this era: print no line.
            continue
        print(f' Firm {ALIAS[firm_name]}: {hits/total:.4f} ({hits}/{total})')

print('\n=== A-vs-floor multiple by era (observed A HC / BCD floor) ===')
for era_label, era_pred in ERAS.items():
    hits, total = _observed_hc(FIRM_A, era_pred)
    a_rate = hits/total if total else 0
    # The per-comparison floor and the observed pooled per-signature rate are
    # different quantities, so they are printed side by side and no ratio is
    # computed here.
    print(f' {era_label}: observed A HC = {a_rate:.3f}; per-comparison floor = {floors[era_label]:.6f}')
|
||||
@@ -0,0 +1,144 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Script 55: PRIMARY calibration on the clean pre-e-signature baseline
|
||||
BCD 2013-2019 (Firms B/C/D, fiscal years 2013-2019). Rationale: co-author
|
||||
interviews confirm B/C/D progressively adopted e-signature systems after 2020
|
||||
(staggered timing), so 2013-2019 BCD is the construct-clean hand-signing
|
||||
baseline. Canonical retry-loop sampler (matches Scripts 43/45/52), any-pair.
|
||||
Reports the floor + Firm A (all years) scored out-of-sample against it, and
|
||||
BCD 2020+ scored against the same threshold. Read-only.
|
||||
"""
|
||||
import sqlite3
|
||||
from collections import defaultdict, Counter
|
||||
import numpy as np
|
||||
|
||||
# Source database; opened read-only further down.
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'

# Firm name as stored in the database -> anonymised letter used in output.
ALIAS = {
    '勤業眾信聯合': 'A',
    '安侯建業聯合': 'B',
    '資誠聯合': 'C',
    '安永聯合': 'D',
}
# The four firms in A, B, C, D order (dicts keep insertion order).
BIG4 = tuple(ALIAS)
# Firm A is the scored firm; the remaining three form the BCD baseline.
FIRM_A = BIG4[0]

# Base seed for the samplers and number of CPA-level bootstrap replicates.
SEED = 42
N_BOOT = 1000

# Lookup table: byte value -> number of set bits, used for Hamming distances.
POP = np.fromiter(
    (format(byte, 'b').count('1') for byte in range(256)),
    dtype=np.uint8,
    count=256,
)
|
||||
|
||||
|
||||
def wilson(k, n, z=1.96):
    """Wilson score interval for ``k`` hits in ``n`` trials.

    Returns ``(lower, upper)`` clipped to [0, 1], or ``(None, None)`` when
    ``n`` is zero.  ``z`` is the normal quantile (1.96 gives ~95%).
    """
    if n == 0:
        return (None, None)
    share = k/n
    scale = 1+z*z/n
    midpoint = (share+z*z/(2*n))/scale
    spread = z*np.sqrt(share*(1-share)/n+z*z/(4*n*n))/scale
    return (max(0.0, midpoint-spread), min(1.0, midpoint+spread))
|
||||
|
||||
|
||||
def canon_sampler(rng, n, npool, same, all_idx):
    """Draw ``npool`` indices from ``range(n)`` that are not in ``same``.

    Up to ten rounds of rejection sampling are tried: each round draws twice
    the outstanding count with replacement and keeps the draws outside
    ``same``.  If indices are still missing afterwards, the remainder is drawn
    without replacement from ``all_idx`` restricted to the allowed positions.

    Args:
        rng: NumPy random generator supplying the draws.
        n: size of the index range to draw from.
        npool: number of indices wanted.
        same: indices to exclude.
        all_idx: array of all indices, used only by the fallback draw.

    Returns:
        An int64 array holding at most ``npool`` indices.
    """
    picked = []
    remaining = npool
    for _ in range(10):
        if remaining <= 0:
            break
        draw = rng.choice(n, size=remaining*2, replace=True)
        usable = draw[np.isin(draw, same, invert=True)][:remaining]
        picked.extend(usable.tolist())
        remaining -= len(usable)
    if remaining > 0:
        # Rejection sampling fell short: draw the rest from the allowed set.
        allowed = np.ones(n, bool)
        allowed[same] = False
        extra = rng.choice(all_idx[allowed], size=remaining, replace=False)
        picked.extend(extra.tolist())
    return np.array(picked[:npool], dtype=np.int64)
|
||||
|
||||
|
||||
# Load every signature of the four firms that has a year, a feature vector
# and a dHash.  The database is opened through a read-only URI.
# Row layout used by the rest of the script:
#   r[0] CPA name, r[1] firm name, r[2] year (int), r[3] source PDF,
#   r[4] feature-vector blob, r[5] dHash blob,
#   r[6] max similarity to same accountant, r[7] independent minimum dHash.
conn = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
try:
    cur = conn.cursor()
    cur.execute("""SELECT s.assigned_accountant,a.firm,CAST(substr(s.year_month,1,4) AS INT),
        s.source_pdf,s.feature_vector,s.dhash_vector,
        s.max_similarity_to_same_accountant,s.min_dhash_independent
        FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
        WHERE a.firm IN (?,?,?,?) AND s.year_month IS NOT NULL
        AND s.feature_vector IS NOT NULL AND s.dhash_vector IS NOT NULL""", BIG4)
    rows = cur.fetchall()
finally:
    # Release the connection even when the query or the fetch fails.
    conn.close()
|
||||
|
||||
|
||||
def prep(rec):
    """Decode the feature and dHash blobs of ``rec`` into stacked arrays.

    Column 4 of each row is read as float32 values and column 5 as uint8
    values.  Feature rows are scaled to unit length; an all-zero row is left
    as zeros because its norm is replaced by 1 before dividing.

    Returns:
        ``(feats, dh)``: the normalised float32 feature matrix and the uint8
        hash matrix, one row per record.
    """
    raw_vectors = [np.frombuffer(row[4], np.float32) for row in rec]
    feats = np.stack(raw_vectors).astype(np.float32)
    lengths = np.linalg.norm(feats, axis=1, keepdims=True)
    lengths[lengths == 0] = 1.0
    feats /= lengths
    raw_hashes = [np.frombuffer(row[5], np.uint8) for row in rec]
    return feats, np.stack(raw_hashes)
|
||||
|
||||
|
||||
def floor_on(baseline_rec, label):
    """Canonical per-sig/per-doc HC floor on a baseline population.

    For every signature a comparison pool is drawn from signatures of other
    CPAs in the same baseline (via ``canon_sampler``), sized to the
    signature's own CPA pool minus one.  A signature is flagged HC when the
    pool's maximum cosine exceeds 0.95 and the pool's minimum dHash distance
    is <= 5; the two extremes are taken independently over the pool, so they
    need not come from the same candidate.  The HC+MC flag uses the same
    cosine cut with a dHash cut of 15.

    Prints the per-signature HC rate with a CPA-level bootstrap interval, the
    per-document HC and HC+MC rates, and the per-document HC+MC rate for
    firms B, C and D.

    Args:
        baseline_rec: rows as loaded at module level; the indices read here
            are 0 (CPA), 1 (firm name), 3 (source PDF) and, through ``prep``,
            4 (feature blob) and 5 (dHash blob).
        label: tag printed in the header line.

    Returns:
        The per-signature HC rate ``k/n``.
    """
    feats, dh = prep(baseline_rec)
    n = len(baseline_rec)
    cpas = np.array([r[0] for r in baseline_rec])
    firms = np.array([ALIAS[r[1]] for r in baseline_rec])
    docs = np.array([r[3] for r in baseline_rec])
    # CPA -> indices of that CPA's signatures in the baseline.
    cidx = defaultdict(list)
    for i, c in enumerate(cpas):
        cidx[c].append(i)
    cidx = {c: np.array(v) for c, v in cidx.items()}
    # Comparison-pool size per CPA: own signature count minus one.
    psize = {c: len(v)-1 for c, v in cidx.items()}
    all_idx = np.arange(n)
    rng = np.random.default_rng(SEED)
    # Defaults (cosine 0, distance 64) lie outside both thresholds, so a
    # signature that is skipped below is never flagged.
    mx = np.zeros(n, np.float32); mn = np.full(n, 64, np.int32)
    for si in range(n):
        np_ = psize[cpas[si]]
        if np_ <= 0:
            # CPA with a single signature: no pool to compare against.
            continue
        # Candidates exclude every signature of this signature's own CPA.
        cand = canon_sampler(rng, n, np_, cidx[cpas[si]], all_idx)
        cosv = feats[cand] @ feats[si]
        # Max cosine and min Hamming distance are taken independently.
        mx[si] = cosv.max(); mn[si] = int(POP[dh[cand] ^ dh[si]].sum(axis=1).min())
    hc = (mx > 0.95) & (mn <= 5); d2 = (mx > 0.95) & (mn <= 15)
    k = int(hc.sum())
    # Bootstrap over CPAs (resampled with replacement) using a separate
    # generator seeded with SEED+1; each replicate is the mean HC flag over
    # all signatures of the resampled CPAs.
    rng2 = np.random.default_rng(SEED+1); cl = list(cidx.keys())
    bs = np.array([hc[np.concatenate([cidx[cl[i]] for i in rng2.choice(len(cl), len(cl), True)])].mean()
                   for _ in range(N_BOOT)])
    print(f'\n [{label}] n_sig={n:,}, CPAs={len(cidx)}')
    print(f' per-sig HC floor = {k/n:.4f} ({k}/{n}) CPA-boot95% [{np.percentile(bs,2.5):.4f},{np.percentile(bs,97.5):.4f}]')
    # Document-level flags: a document is flagged when any of its signatures
    # is flagged; dfirm collects the firm letter of each signature per document.
    dd1 = defaultdict(bool); dd2 = defaultdict(bool); dfirm = {}
    for i in range(n):
        if hc[i]: dd1[docs[i]] = True
        if d2[i]: dd2[docs[i]] = True
        dfirm.setdefault(docs[i], []).append(firms[i])
        # Make sure unflagged documents still appear in the denominators.
        dd1.setdefault(docs[i], False); dd2.setdefault(docs[i], False)
    dl = list(dd1.keys()); nd = len(dl)
    print(f' per-doc HC = {sum(dd1[d] for d in dl)/nd:.4f}; per-doc HC+MC = {sum(dd2[d] for d in dl)/nd:.4f} (n_doc={nd:,})')
    # Assign each document to the firm most common among its signatures.
    dom = {d: Counter(dfirm[d]).most_common(1)[0][0] for d in dl}
    for f in ['B', 'C', 'D']:
        ds = [d for d in dl if dom[d] == f]
        if ds:
            print(f' Firm {f} per-doc HC+MC: {sum(dd2[d] for d in ds)/len(ds):.4f} ({sum(dd2[d] for d in ds)}/{len(ds)})')
    return k/n
|
||||
|
||||
|
||||
def a_vs_baseline(baseline_rec, a_rec, label, any_pair=True):
    """Score Firm A signatures out-of-sample against a baseline pool.

    For each signature in ``a_rec`` a pool of baseline signatures is drawn
    uniformly with replacement, sized to the signature's own CPA pool minus
    one, and the HC rule (cosine > 0.95, dHash distance <= 5) is evaluated
    against that pool.  The resulting per-signature rate is printed with a
    Wilson 95% interval.

    Args:
        baseline_rec: baseline rows (same layout as the module-level rows).
        a_rec: rows to score; index 0 is the CPA, 4 the feature blob and 5
            the dHash blob.
        label: tag printed at the start of the output line.
        any_pair: when True (default) the rule matches ``floor_on`` and the
            stored observed columns: the pool's maximum cosine and minimum
            dHash distance are taken independently.  When False the stricter
            same-pair rule is used (one candidate must satisfy both cuts),
            which reproduces the earlier output of this function.  The random
            draws are identical in both modes.

    Returns:
        None.  The result is printed.
    """
    bf, bdh = prep(baseline_rec)
    nb = len(baseline_rec)
    n = len(a_rec)
    if n == 0:
        # No rate can be formed; avoid dividing by zero below.
        print(f' [{label}] Firm A (all yrs) vs BCD-2013-2019 pool: no signatures to score')
        return

    # CPA -> indices of that CPA's signatures, to size each comparison pool.
    a_cpa = defaultdict(list)
    for i, r in enumerate(a_rec):
        a_cpa[r[0]].append(i)
    psize = {c: len(v)-1 for c, v in a_cpa.items()}

    rng = np.random.default_rng(SEED)
    hc = np.zeros(n, bool)
    for i, r in enumerate(a_rec):
        np_ = psize[r[0]]
        if np_ <= 0:
            # CPA with a single signature: no pool size to mirror.
            continue
        cand = rng.integers(0, nb, size=np_)
        sf = np.frombuffer(r[4], np.float32).astype(np.float32)
        sf /= max(np.linalg.norm(sf), 1e-9)
        cosv = bf[cand] @ sf
        close = cosv > 0.95
        if close.any():
            # Hamming distances are only needed once the cosine cut is met.
            dist = POP[bdh[cand] ^ np.frombuffer(r[5], np.uint8)].sum(axis=1)
            if any_pair:
                hc[i] = bool(dist.min() <= 5)
            else:
                hc[i] = bool((close & (dist <= 5)).any())

    k = int(hc.sum())
    lo, hi = wilson(k, n)
    print(f' [{label}] Firm A (all yrs) vs BCD-2013-2019 pool: per-sig HC = {k/n:.4f} ({k}/{n}) [{lo:.5f},{hi:.5f}]')
|
||||
|
||||
|
||||
# Baseline slices: firms other than Firm A, split at the 2019/2020 boundary.
bcd_pre = [r for r in rows if r[1] != FIRM_A and 2013 <= r[2] <= 2019]
bcd_post = [r for r in rows if r[1] != FIRM_A and r[2] >= 2020]
A_all = [r for r in rows if r[1] == FIRM_A]

print('=== PRIMARY floor: BCD 2013-2019 ===')
fl = floor_on(bcd_pre, 'BCD 2013-2019 (PRIMARY)')

print('\n=== Firm A scored against the BCD-2013-2019 threshold ===')
a_vs_baseline(bcd_pre, A_all, 'A out-of-sample')
# Observed rate from the stored per-signature pool statistics; rows missing
# either statistic are left out of the denominator.
A_obs = [r for r in A_all if r[6] is not None and r[7] is not None]
ak = sum(1 for r in A_obs if r[6] > 0.95 and r[7] <= 5)
if not A_obs:
    # No scored Firm A rows: there is no rate to report.
    print(' Firm A observed (all yrs, own pools): no scored signatures')
elif fl > 0:
    print(f' Firm A observed (all yrs, own pools): per-sig HC = {ak/len(A_obs):.4f} -> {ak/len(A_obs)/fl:.0f}x the BCD-2013-2019 floor')
else:
    # A zero floor makes the multiple undefined; report the rate on its own.
    print(f' Firm A observed (all yrs, own pools): per-sig HC = {ak/len(A_obs):.4f} -> multiple undefined (BCD-2013-2019 floor is 0)')

print('\n=== (optional) BCD 2020+ floor, same method (may be inflated by e-signing) ===')
if bcd_post:
    floor_on(bcd_post, 'BCD 2020-2023 (post e-signing)')
else:
    # floor_on cannot stack an empty population, so skip this optional check.
    print(' no BCD signatures from 2020 onward; skipped')
|
||||
Reference in New Issue
Block a user