Files
pdf_signature_extraction/signature_analysis/46_bcd_only_iccr.py
T
gbanyan 3c7fcc010f Paper A v4.1: BCD-baseline reframe + screening positioning + trim
- Re-anchor inter-CPA coincidence-rate (ICCR) calibration on a normative
  non-Firm-A baseline (Firms B/C/D); Firm A held out as an out-of-sample
  target. Locked canonical numbers (codex-audited; Scripts 46/52/53):
  per-comparison HC 0.00014->0.000018, per-signature HC 0.0116, per-document
  HC+MC 0.34->0.1905; KDE crossover 0.837 retained corpus-wide.
- Reposition as an operator-tunable, semi-automated screening/triage framework
  (title -> "Automated Screening..."): HC = high-specificity operating point;
  MC band demoted to low-specificity advisory; Firm A = demonstration that the
  screening surfaces a templated end, audit-quality implications deferred.
- Apply codex prose-review fixes: triage-neutral five-way labels, soften
  mechanism/specificity wording, supersede MC claim-strength, update stale
  Appendix script references (40b/43/45 -> 46/52/53).
- Trim pass: compress Sec. V discussion + Sec. III echoes (27.7k -> 26.8k
  words); no substantive content removed.
- Add analysis scripts 45-53 (firm-year trends; BCD-only ICCR recompute;
  canonical-sampler locked numbers; Firm-A out-of-sample; BCD regression +
  cross-firm hit matrix).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-04 19:35:10 +08:00

88 lines
3.0 KiB
Python

#!/usr/bin/env python3
"""Script 46: BCD-only (exclude Firm A) per-comparison ICCR recompute.
Replicates 40b's inter-CPA negative-anchor pair sampling (N=500k, seed=42)
but compares three negative-anchor pool compositions:
- ABCD : all Big-4 (current paper baseline)
- BCD : Big-4 excluding Firm A (normative-baseline proposal)
- BCD+nonB4 : BCD plus all non-Big-4 firms
Reports marginal cos>0.95, dHash<=5, and the joint HC rule cos>0.95 & dHash<=5.
Read-only.
"""
import sqlite3
from collections import defaultdict
import numpy as np
# SQLite database holding the `signatures` and `accountants` tables;
# load() opens it read-only.
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'

# Number of inter-CPA pairs drawn per pool and the RNG seed that fixes the draw.
N_PAIRS, SEED = 500_000, 42

# Firm A as spelled in accountants.firm; it is the firm dropped from the
# BCD pools built at the bottom of the script.
FIRM_A = '勤業眾信聯合'

# The four Big-4 firm names; Firm A is listed first.
BIG4 = (FIRM_A, '安侯建業聯合', '資誠聯合', '安永聯合')
def hamming(a, b):
    """Return the number of differing bits between two byte strings.

    Both arguments are read as unsigned big-endian integers, so inputs of
    different lengths are compared as if the shorter one were left-padded
    with zero bytes.
    """
    left = int.from_bytes(a, 'big')
    right = int.from_bytes(b, 'big')
    # Set bits of the XOR are exactly the positions where the inputs differ.
    return bin(left ^ right).count('1')
def load(db_path=None):
    """Fetch every usable signature row from the analysis database.

    A row is kept only when the signature has an assigned accountant, that
    accountant has a firm, and both the feature vector and the dHash are
    non-NULL.

    Args:
        db_path: Filesystem path of the SQLite database. Defaults to the
            module-level ``DB`` when omitted.

    Returns:
        A list of ``(assigned_accountant, firm, feature_vector, dhash_vector)``
        tuples, in the order SQLite returns them.

    Raises:
        sqlite3.Error: If the database cannot be opened or queried.
    """
    path = DB if db_path is None else db_path
    # URI form with mode=ro opens the file read-only, so this script can
    # never modify the database.
    conn = sqlite3.connect(f'file:{path}?mode=ro', uri=True)
    try:
        cur = conn.cursor()
        cur.execute("""
            SELECT s.assigned_accountant, a.firm, s.feature_vector, s.dhash_vector
            FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
            WHERE s.assigned_accountant IS NOT NULL AND a.firm IS NOT NULL
            AND s.feature_vector IS NOT NULL AND s.dhash_vector IS NOT NULL""")
        return cur.fetchall()
    finally:
        # Close even when the query raises, so the handle is never leaked.
        conn.close()
def wilson(k, n, z=1.96):
    """Return the Wilson score interval for ``k`` successes in ``n`` trials.

    Args:
        k: Number of successes.
        n: Number of trials.
        z: Normal quantile for the desired coverage (1.96 by default).

    Returns:
        ``(lower, upper)`` clipped to ``[0, 1]``, or ``(None, None)`` when
        ``n`` is zero.
    """
    if n == 0:
        return (None, None)

    z_sq = z * z
    p_hat = k / n
    denom = 1 + z_sq / n
    centre = (p_hat + z_sq / (2 * n)) / denom
    spread = np.sqrt(p_hat * (1 - p_hat) / n + z_sq / (4 * n * n))
    half_width = z * spread / denom

    lower = max(0.0, centre - half_width)
    upper = min(1.0, centre + half_width)
    return (lower, upper)
def iccr(rows, label):
    """Estimate and print inter-CPA coincidence rates for one pool of rows.

    Draws ``N_PAIRS`` random signature pairs, each from two distinct
    accountants, using a generator seeded with ``SEED``. For every pair it
    records the dot product of the two feature vectors and the Hamming
    distance of the two dHashes, then prints the share of pairs with
    cos>0.95, with dHash<=5, and with both (with a Wilson 95% interval).

    Args:
        rows: Sequence of ``(accountant, firm, feature_vector, dhash_vector)``
            tuples; the firm column is not used here.
        label: Heading printed above the results.

    Returns:
        The joint rate (cos>0.95 and dHash<=5) as a float.
    """
    # Group the raw blobs per accountant, keeping first-seen order.
    blobs_by_acct = defaultdict(list)
    hashes_by_acct = defaultdict(list)
    for acct, _firm, feature_blob, dhash_blob in rows:
        blobs_by_acct[acct].append(feature_blob)
        hashes_by_acct[acct].append(dhash_blob)

    # One float32 matrix (one row per signature) and one dHash list per
    # accountant, addressed by position.
    matrices = [
        np.stack([np.frombuffer(blob, dtype=np.float32) for blob in blobs])
        for blobs in blobs_by_acct.values()
    ]
    hash_lists = list(hashes_by_acct.values())
    sig_counts = [len(hashes) for hashes in hash_lists]
    n_accts = len(matrices)

    rng = np.random.default_rng(SEED)
    n = N_PAIRS
    cos = np.empty(n, np.float32)
    dv = np.empty(n, np.int32)
    for t in range(n):
        # Keep this exact order of RNG calls: it determines the seeded draw.
        first, second = rng.choice(n_accts, 2, replace=False)
        row_a = int(rng.integers(0, sig_counts[first]))
        row_b = int(rng.integers(0, sig_counts[second]))
        # NOTE(review): this is a raw dot product; it equals cosine similarity
        # only if the stored feature vectors are L2-normalised - confirm.
        cos[t] = float(matrices[first][row_a] @ matrices[second][row_b])
        dv[t] = hamming(hash_lists[first][row_a], hash_lists[second][row_b])

    cos_hit = cos > 0.95
    dh_hit = dv <= 5
    m_cos = int(cos_hit.sum())
    m_dh = int(dh_hit.sum())
    joint = int((cos_hit & dh_hit).sum())
    jlo, jhi = wilson(joint, n)

    print(f'\n== {label} ==')
    print(f' signatures={len(rows):,} accountants={n_accts} pairs={n:,}')
    print(f' cos>0.95 ICCR = {m_cos/n:.5f} ({m_cos})')
    print(f' dHash<=5 ICCR = {m_dh/n:.5f} ({m_dh})')
    print(f' JOINT (HC rule) ICCR = {joint/n:.6f} ({joint}) Wilson95% [{jlo:.6f},{jhi:.6f}]')
    return joint / n
# Load once, then derive the three negative-anchor pools from the same rows.
rows = load()
abcd = [row for row in rows if row[1] in BIG4]       # all Big-4 firms
bcd = [row for row in abcd if row[1] != FIRM_A]      # Big-4 minus Firm A
bcd_non = [row for row in rows if row[1] != FIRM_A]  # every firm except Firm A

# Report each pool in turn; iccr() reseeds its generator on every call.
for _pool, _label in (
    (abcd, 'ABCD (current paper baseline)'),
    (bcd, 'BCD only (exclude Firm A)'),
    (bcd_non, 'BCD + non-Big-4'),
):
    iccr(_pool, _label)