Paper A v4.1: BCD-baseline reframe + screening positioning + trim
- Re-anchor inter-CPA coincidence-rate (ICCR) calibration on a normative non-Firm-A baseline (Firms B/C/D); Firm A held out as an out-of-sample target. Locked canonical numbers (codex-audited; Scripts 46/52/53): per-comparison HC 0.00014->0.000018, per-signature HC 0.0116, per-document HC+MC 0.34->0.1905; KDE crossover 0.837 retained corpus-wide.
- Reposition as an operator-tunable, semi-automated screening/triage framework (title -> "Automated Screening..."): HC = high-specificity operating point; MC band demoted to low-specificity advisory; Firm A = demonstration that the screening surfaces a templated end, audit-quality implications deferred.
- Apply codex prose-review fixes: triage-neutral five-way labels, soften mechanism/specificity wording, supersede MC claim-strength, update stale Appendix script references (40b/43/45 -> 46/52/53).
- Trim pass: compress Sec. V discussion + Sec. III echoes (27.7k -> 26.8k words); no substantive content removed.
- Add analysis scripts 45-53 (firm-year trends; BCD-only ICCR recompute; canonical-sampler locked numbers; Firm-A out-of-sample; BCD regression + cross-firm hit matrix).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,160 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Script 47: BCD-only recompute of (1) KDE crossover, (2) per-signature
|
||||
pool-normalized any-pair ICCR (cos>0.95 & dHash<=5), (3) per-document HC+MC
|
||||
inter-CPA ICCR (cos>0.95 & dHash<=15), each for ABCD vs BCD-only negative-anchor
|
||||
pools. Replicates Scripts 10/43/44 methodology. Document-level subsampling used
|
||||
for the pool simulation (exact same-CPA pool sizes retained). Read-only.
|
||||
"""
|
||||
import sqlite3
|
||||
from collections import defaultdict
|
||||
import numpy as np
|
||||
from scipy.stats import gaussian_kde
|
||||
|
||||
# SQLite database that load() opens in read-only mode.
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'

# Firm name -> single-letter alias; pool_sim() uses the alias both for scope
# selection and for the per-firm lines it prints.
ALIAS = {
    '勤業眾信聯合': 'A',
    '安侯建業聯合': 'B',
    '資誠聯合': 'C',
    '安永聯合': 'D',
}

# The four firm names passed to the SQL filter in load(); same order as ALIAS.
BIG4 = tuple(ALIAS)

# Firm excluded when building the BCD-only masks.
FIRM_A = '勤業眾信聯合'

# Seed handed to np.random.default_rng in every sampling routine.
SEED = 42

# Number of same-CPA / different-CPA pairs sampled by kde_crossover().
N_INTRA = 200_000
N_INTER = 500_000

N_DOC_SUBSAMPLE = 9000  # documents processed in pool simulation per scope
|
||||
|
||||
|
||||
def load(db_path=None, firms=None):
    """Fetch every usable signature row for the selected firms.

    The database is opened read-only (``mode=ro`` URI) and the connection is
    always closed, even when the query fails.

    Args:
        db_path: Path of the SQLite database. Defaults to the module-level
            ``DB`` when omitted.
        firms: Sequence of firm names to keep. Defaults to the module-level
            ``BIG4`` when omitted.

    Returns:
        A list of tuples ``(signature_id, assigned_accountant, firm,
        source_pdf, feature_vector, dhash_vector)``. Only signatures with an
        assigned accountant and non-NULL feature and dHash vectors are
        returned.
    """
    if db_path is None:
        db_path = DB
    firms = tuple(BIG4 if firms is None else firms)
    # One "?" per firm so the IN-list always matches the parameter count.
    placeholders = ','.join('?' * len(firms))
    conn = sqlite3.connect(f'file:{db_path}?mode=ro', uri=True)
    try:
        cur = conn.execute(f"""
            SELECT s.signature_id, s.assigned_accountant, a.firm, s.source_pdf,
                   s.feature_vector, s.dhash_vector
            FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
            WHERE s.assigned_accountant IS NOT NULL AND a.firm IN ({placeholders})
              AND s.feature_vector IS NOT NULL AND s.dhash_vector IS NOT NULL""", firms)
        return cur.fetchall()
    finally:
        # Release the file handle whether or not the query succeeded.
        conn.close()
|
||||
|
||||
|
||||
def hamming1(q, c):
|
||||
return (int.from_bytes(q, 'big') ^ int.from_bytes(c, 'big')).bit_count()
|
||||
|
||||
|
||||
def wilson(k, n, z=1.96):
    """Wilson score interval for ``k`` successes out of ``n`` trials.

    Args:
        k: Number of successes.
        n: Number of trials.
        z: Normal quantile for the desired confidence level (1.96 by default).

    Returns:
        ``(lower, upper)`` clamped to ``[0.0, 1.0]``, or ``(None, None)`` when
        ``n`` is zero.
    """
    if n == 0:
        return (None, None)

    rate = k / n
    denom = 1 + z * z / n
    centre = (rate + z * z / (2 * n)) / denom
    half_width = z * np.sqrt(rate * (1 - rate) / n + z * z / (4 * n * n)) / denom

    lower = max(0.0, centre - half_width)
    upper = min(1.0, centre + half_width)
    return (lower, upper)
|
||||
|
||||
|
||||
def kde_crossover(feats, cpas, label):
    """Locate where the intra-CPA and inter-CPA cosine densities cross.

    Args:
        feats: Array indexed by signature position; rows are combined with
            ``@`` to obtain the similarity of a pair.
        cpas: Per-signature CPA identifier, aligned with ``feats``.
        label: Tag printed in front of the summary line.

    Returns:
        List of crossover positions (floats) lying strictly between 0.6 and
        0.99 on a 10000-point grid over [0.3, 1.0].
    """
    # Group signature positions by CPA; CPAs with a single signature are
    # dropped because no same-CPA pair can be drawn from them.
    members = defaultdict(list)
    for pos, cpa in enumerate(cpas):
        members[cpa].append(pos)
    pools = [np.array(ix) for ix in members.values() if len(ix) >= 2]
    n_pools = len(pools)

    rng = np.random.default_rng(SEED)

    # Same-CPA pairs: pick the CPA for every trial up front, then two distinct
    # signatures of that CPA per trial.
    intra = np.empty(N_INTRA, np.float32)
    picks = rng.integers(0, n_pools, N_INTRA)
    for t, which in enumerate(picks):
        first, second = rng.choice(pools[which], 2, replace=False)
        intra[t] = feats[first] @ feats[second]

    # Different-CPA pairs: two distinct CPAs, one signature from each.
    inter = np.empty(N_INTER, np.float32)
    for t in range(N_INTER):
        left, right = rng.choice(n_pools, 2, replace=False)
        first = rng.choice(pools[left])
        second = rng.choice(pools[right])
        inter[t] = feats[first] @ feats[second]

    # Each KDE is fitted on at most the first 100000 samples.
    grid = np.linspace(0.3, 1.0, 10000)
    dens_intra = gaussian_kde(intra[:100000])
    dens_inter = gaussian_kde(inter[:100000])
    gap = dens_intra(grid) - dens_inter(grid)

    # A crossover is a grid step on which the sign of the gap changes.
    flips = grid[np.nonzero(np.diff(np.sign(gap)))[0]]
    cross = flips[(flips > 0.6) & (flips < 0.99)].tolist()

    shown = [f"{x:.4f}" for x in cross]
    print(f' [{label}] intra mean={intra.mean():.4f} inter mean={inter.mean():.4f}'
          f' KDE crossover(s): {shown}')
    return cross
|
||||
|
||||
|
||||
def _report_rates(heading, hits, firm_of):
    """Print the overall hit rate with its Wilson 95% interval, then one line per firm."""
    hits = np.asarray(hits, dtype=bool)
    firm_of = np.asarray(firm_of)
    k = int(hits.sum())
    m = len(hits)
    lo, hi = wilson(k, m)
    print(f'{heading}{k/m:.4f} ({k}/{m}) Wilson95% [{lo:.4f},{hi:.4f}]')
    for f in sorted(set(firm_of)):
        msk = firm_of == f
        kk = int(hits[msk].sum())
        mm = int(msk.sum())
        print(f' Firm {f}: {kk/mm:.4f} ({kk}/{mm})')


def pool_sim(rows, scope_firms, label):
    """Per-signature & per-document inter-CPA any-pair ICCR over a doc subsample.

    For every signature of every sampled document, a pool of other-CPA
    candidates is drawn and checked for a coincidental match:

    * per-signature hit: some candidate has cosine > 0.95 and dHash
      distance <= 5;
    * per-document hit: some signature of the document has a candidate with
      cosine > 0.95 and dHash distance <= 15.

    Results are printed (overall rate with Wilson 95% interval, then per
    firm); nothing is returned.

    Args:
        rows: Tuples as produced by ``load()``; index 1 is the CPA, 2 the
            firm name, 3 the source document, 4 the float32 feature blob and
            5 the dHash bytes.
        scope_firms: Collection of firm aliases (values of ``ALIAS``) that
            restricts both the queried signatures and the candidate pool.
        label: Tag printed in front of each summary line.

    Raises:
        ValueError: If no row belongs to ``scope_firms``.
    """
    keep = [r for r in rows if ALIAS[r[2]] in scope_firms]
    if not keep:
        raise ValueError(f'pool_sim[{label}]: no signatures for firms {sorted(scope_firms)}')

    # L2-normalise so that a dot product is a cosine similarity.
    feats = np.stack([np.frombuffer(r[4], np.float32) for r in keep]).astype(np.float32)
    feats /= np.clip(np.linalg.norm(feats, axis=1, keepdims=True), 1e-9, None)
    cpas = [r[1] for r in keep]
    firms = [ALIAS[r[2]] for r in keep]
    docs = [r[3] for r in keep]
    dh = [r[5] for r in keep]
    n = len(keep)

    cpa_idx = defaultdict(list)
    for i, c in enumerate(cpas):
        cpa_idx[c].append(i)
    cpa_idx = {c: np.array(v) for c, v in cpa_idx.items()}

    doc_idx = defaultdict(list)
    for i, d in enumerate(docs):
        doc_idx[d].append(i)

    rng = np.random.default_rng(SEED)
    all_docs = list(doc_idx.keys())
    sub = rng.choice(len(all_docs), min(N_DOC_SUBSAMPLE, len(all_docs)), replace=False)

    sig_hc = []     # per-signature: any-pair cos>0.95 & dh<=5
    sig_firm = []
    doc_hcmc = []   # per-document worst-case: any sig with cos>0.95 & dh<=15
    doc_firm = []
    for d in (all_docs[i] for i in sub):
        dhit = False
        for si in doc_idx[d]:
            sig_firm.append(firms[si])
            same = cpa_idx[cpas[si]]
            # Target pool size = number of *other* signatures of the same CPA.
            npool = len(same) - 1
            if npool <= 0:
                sig_hc.append(False)
                continue
            # Oversample with replacement, discard same-CPA indices, keep the
            # first npool survivors. The pool can end up smaller than npool
            # (and may contain repeats) when too many draws are discarded.
            draw = rng.choice(n, size=min(npool*2+10, n), replace=True)
            cand = draw[~np.isin(draw, same)][:npool]
            # Only candidates passing the cosine gate can produce a hit, so
            # the dHash distance is computed for those alone.
            near = cand[feats[cand] @ feats[si] > 0.95]
            best = min((hamming1(dh[si], dh[j]) for j in near), default=None)
            sig_hc.append(best is not None and best <= 5)
            if best is not None and best <= 15:
                dhit = True
        doc_hcmc.append(dhit)
        doc_firm.append(firms[doc_idx[d][0]])

    _report_rates(
        f'\n [{label}] per-SIGNATURE any-pair HC ICCR (cos>0.95 & dh<=5): ',
        sig_hc, sig_firm)
    _report_rates(
        f' [{label}] per-DOCUMENT HC+MC ICCR (cos>0.95 & dh<=15): ',
        doc_hcmc, doc_firm)
|
||||
|
||||
|
||||
# Script body: load all rows once, then run each analysis on the full
# (ABCD) scope and on the scope without Firm A (BCD-only).
rows = load()

# L2-normalised feature matrix and per-row CPA, aligned with `rows`.
allf = np.stack([np.frombuffer(rec[4], np.float32) for rec in rows]).astype(np.float32)
allf /= np.clip(np.linalg.norm(allf, axis=1, keepdims=True), 1e-9, None)
allc = [cpa for _, cpa, *_ in rows]

abcd_mask = [True for _ in rows]
bcd_mask = [firm != FIRM_A for _, _, firm, *_ in rows]

print('=== (1) KDE crossover (intra vs inter cosine) ===')
kde_crossover(allf, allc, 'ABCD')
kde_crossover(
    allf[bcd_mask],
    [cpa for cpa, keep_it in zip(allc, bcd_mask) if keep_it],
    'BCD-only',
)

print('\n=== (2)(3) per-signature & per-document inter-CPA ICCR ===')
for scope, tag in (({'A', 'B', 'C', 'D'}, 'ABCD (reproduce)'),
                   ({'B', 'C', 'D'}, 'BCD-only')):
    pool_sim(rows, scope, tag)
|
||||
Reference in New Issue
Block a user