Paper A v4.1: BCD-baseline reframe + screening positioning + trim
- Re-anchor inter-CPA coincidence-rate (ICCR) calibration on a normative non-Firm-A baseline (Firms B/C/D); Firm A held out as an out-of-sample target. Locked canonical numbers (codex-audited; Scripts 46/52/53): per-comparison HC 0.00014->0.000018, per-signature HC 0.0116, per-document HC+MC 0.34->0.1905; KDE crossover 0.837 retained corpus-wide.
- Reposition as an operator-tunable, semi-automated screening/triage framework (title -> "Automated Screening..."): HC = high-specificity operating point; MC band demoted to low-specificity advisory; Firm A = demonstration that the screening surfaces a templated end, audit-quality implications deferred.
- Apply codex prose-review fixes: triage-neutral five-way labels, soften mechanism/specificity wording, supersede MC claim-strength, update stale Appendix script references (40b/43/45 -> 46/52/53).
- Trim pass: compress Sec. V discussion + Sec. III echoes (27.7k -> 26.8k words); no substantive content removed.
- Add analysis scripts 45-53 (firm-year trends; BCD-only ICCR recompute; canonical-sampler locked numbers; Firm-A out-of-sample; BCD regression + cross-firm hit matrix).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,60 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Firm x year descriptor trends (B-gate diagnostic).
|
||||
|
||||
Plots per-firm yearly mean cosine, mean dHash, and HC-box hit share to test
|
||||
whether Firms B/C/D show a 2020 structural break converging toward Firm A.
|
||||
Read-only against the production DB.
|
||||
"""
|
||||
import sqlite3
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
# SQLite database path; opened below through a URI with mode=ro (read-only).
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
# One entry per firm: (value matched against accountants.firm, legend label,
# matplotlib line colour).
FIRMS = [('勤業眾信聯合', 'Firm A (Deloitte)', '#d62728'),
         ('安侯建業聯合', 'Firm B (KPMG)', '#1f77b4'),
         ('資誠聯合', 'Firm C (PwC)', '#2ca02c'),
         ('安永聯合', 'Firm D (EY)', '#ff7f0e')]

# Module-level connection and cursor used by series(); the connection is
# closed by the last statement of the script.
conn = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
cur = conn.cursor()
|
||||
|
||||
|
||||
def series(firm_zh):
    """Return per-year descriptor aggregates for one firm.

    Runs a single grouped query on the module-level cursor ``cur``. Each
    returned row is ``(year, mean max_similarity_to_same_accountant,
    mean min_dhash_independent, share of signatures with similarity > 0.95
    and dHash <= 5, signature count)``, ordered by year. Signatures with a
    NULL year_month or NULL descriptor are excluded.
    """
    yearly_sql = """
        SELECT CAST(substr(s.year_month,1,4) AS INT) AS yr,
               AVG(s.max_similarity_to_same_accountant),
               AVG(s.min_dhash_independent),
               AVG(CASE WHEN s.max_similarity_to_same_accountant>0.95
                         AND s.min_dhash_independent<=5 THEN 1.0 ELSE 0.0 END),
               COUNT(*)
        FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
        WHERE a.firm=? AND s.year_month IS NOT NULL
          AND s.max_similarity_to_same_accountant IS NOT NULL
          AND s.min_dhash_independent IS NOT NULL
        GROUP BY yr ORDER BY yr"""
    # Cursor.execute returns the cursor itself, so the fetch can be chained.
    return cur.execute(yearly_sql, (firm_zh,)).fetchall()
|
||||
|
||||
|
||||
fig, axes = plt.subplots(1, 3, figsize=(16, 4.8))
# One line per firm on every panel: row columns 1..3 feed panels 0..2.
for firm_zh, label, color in FIRMS:
    rows = series(firm_zh)
    yrs = [r[0] for r in rows]
    for col, panel in enumerate(axes, start=1):
        panel.plot(yrs, [r[col] for r in rows], 'o-', color=color, label=label)

for ax in axes:
    ax.axvline(2020, ls='--', color='grey', alpha=0.6)
    # The marker label sits at the lower y-limit as it is at this point,
    # i.e. before the horizontal reference lines are added below.
    y_floor = ax.get_ylim()[0]
    ax.text(2020.05, y_floor, ' 2020', color='grey', fontsize=8, va='bottom')
    ax.set_xlabel('Fiscal year')
    ax.grid(alpha=0.3)

# (panel title, y-value of the dotted reference line or None for no line)
panels = [
    ('Mean best-match cosine', 0.95),
    ('Mean independent-min dHash', 5),
    ('HC-box share (cos>0.95 & dHash$\\leq$5)', None),
]
for ax, (panel_title, ref_level) in zip(axes, panels):
    ax.set_title(panel_title)
    if ref_level is not None:
        ax.axhline(ref_level, ls=':', color='k', alpha=0.4)
axes[0].legend(fontsize=8, loc='lower right')
fig.suptitle('Big-4 descriptor trends 2013–2023 (2023 = partial, to Apr) — no 2020 break, no convergence to A',
             fontsize=11)
fig.tight_layout()

out = '/Volumes/NV2/pdf_recognize/signature_analysis/firm_year_trends.png'
fig.savefig(out, dpi=130, bbox_inches='tight')
print('saved', out)
conn.close()
|
||||
@@ -0,0 +1,87 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Script 46: BCD-only (exclude Firm A) per-comparison ICCR recompute.
|
||||
|
||||
Replicates 40b's inter-CPA negative-anchor pair sampling (N=500k, seed=42)
|
||||
but compares three negative-anchor pool compositions:
|
||||
- ABCD : all Big-4 (current paper baseline)
|
||||
- BCD : Big-4 excluding Firm A (normative-baseline proposal)
|
||||
- BCD+nonB4 : BCD plus all non-Big-4 firms
|
||||
Reports marginal cos>0.95, dHash<=5, and the joint HC rule cos>0.95 & dHash<=5.
|
||||
Read-only.
|
||||
"""
|
||||
import sqlite3
|
||||
from collections import defaultdict
|
||||
import numpy as np
|
||||
|
||||
# SQLite database path; load() opens it read-only.
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
# Number of random inter-CPA signature pairs drawn by iccr() per pool.
N_PAIRS = 500_000
# Seed passed to numpy's default_rng; iccr() re-seeds on every call.
SEED = 42
# Firm value (matched against accountants.firm) that is excluded in the
# BCD pools.
FIRM_A = '勤業眾信聯合'
# The four firm values that make up the Big-4 scope.
BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合')
|
||||
|
||||
|
||||
def hamming(a, b):
    """Return the number of bit positions in which byte strings a and b differ.

    Both arguments are interpreted as big-endian unsigned integers before
    being XOR-ed, so the result is the Hamming distance of the two hashes.
    """
    differing = int.from_bytes(a, 'big') ^ int.from_bytes(b, 'big')
    # The XOR is non-negative, so counting '1' digits equals the popcount.
    return bin(differing).count('1')
|
||||
|
||||
|
||||
def load():
    """Fetch all signatures that have an accountant, a firm and both descriptors.

    Returns:
        list of ``(assigned_accountant, firm, feature_vector, dhash_vector)``
        tuples, one per signature row.

    The database is opened read-only. The connection is released in a
    ``finally`` clause so it is closed even when the query raises.
    """
    conn = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
    try:
        cur = conn.cursor()
        cur.execute("""
            SELECT s.assigned_accountant, a.firm, s.feature_vector, s.dhash_vector
            FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
            WHERE s.assigned_accountant IS NOT NULL AND a.firm IS NOT NULL
              AND s.feature_vector IS NOT NULL AND s.dhash_vector IS NOT NULL""")
        return cur.fetchall()
    finally:
        conn.close()
|
||||
|
||||
|
||||
def wilson(k, n, z=1.96):
    """Wilson score interval for the proportion k/n.

    Args:
        k: number of hits.
        n: number of trials.
        z: normal quantile (default 1.96).

    Returns:
        ``(lower, upper)`` clipped to [0, 1], or ``(None, None)`` when n is 0.
    """
    if n == 0:
        return (None, None)
    z_sq = z*z
    p_hat = k / n
    denom = 1 + z_sq/n
    centre = (p_hat + z_sq/(2*n)) / denom
    half_width = z*np.sqrt(p_hat*(1-p_hat)/n + z_sq/(4*n*n)) / denom
    lower = max(0.0, centre-half_width)
    upper = min(1.0, centre+half_width)
    return (lower, upper)
|
||||
|
||||
|
||||
def iccr(rows, label):
    """Estimate and print inter-CPA coincidence rates for one signature pool.

    Draws N_PAIRS random pairs, each made of one signature from each of two
    distinct accountants, and reports how often the pair exceeds cos > 0.95,
    has dHash Hamming distance <= 5, and satisfies both at once.

    Args:
        rows: ``(assigned_accountant, firm, feature_vector, dhash_vector)``
            tuples; the firm column is not used here.
        label: heading printed above the results.

    Returns:
        The joint (cos > 0.95 and dHash <= 5) rate as a float.
    """
    sigs_by_cpa = defaultdict(list)
    for acct, _firm, fv, dh in rows:
        sigs_by_cpa[acct].append((fv, dh))
    accts = list(sigs_by_cpa)

    feats = {}
    dhs = {}
    for acct in accts:
        sigs = sigs_by_cpa[acct]
        feats[acct] = np.stack([np.frombuffer(fv, dtype=np.float32) for fv, _ in sigs])
        dhs[acct] = [dh for _, dh in sigs]

    rng = np.random.default_rng(SEED)
    cos = np.empty(N_PAIRS, np.float32)
    dv = np.empty(N_PAIRS, np.int32)
    na = len(accts)
    for t in range(N_PAIRS):
        # Draw order (CPA pair, then one signature index per CPA) fixes the
        # random stream and must not be rearranged.
        i, j = rng.choice(na, 2, replace=False)
        first, second = accts[i], accts[j]
        k1 = int(rng.integers(0, len(dhs[first])))
        k2 = int(rng.integers(0, len(dhs[second])))
        # NOTE(review): a plain dot product equals the cosine only if the
        # stored feature vectors are unit-norm -- confirm against the DB.
        cos[t] = float(feats[first][k1] @ feats[second][k2])
        dv[t] = hamming(dhs[first][k1], dhs[second][k2])

    n = N_PAIRS
    high_cos = cos > 0.95
    near_dhash = dv <= 5
    m_cos = int(high_cos.sum())
    m_dh = int(near_dhash.sum())
    joint = int((high_cos & near_dhash).sum())
    jlo, jhi = wilson(joint, n)
    print(f'\n== {label} ==')
    print(f' signatures={len(rows):,} accountants={na} pairs={n:,}')
    print(f' cos>0.95 ICCR = {m_cos/n:.5f} ({m_cos})')
    print(f' dHash<=5 ICCR = {m_dh/n:.5f} ({m_dh})')
    print(f' JOINT (HC rule) ICCR = {joint/n:.6f} ({joint}) Wilson95% [{jlo:.6f},{jhi:.6f}]')
    return joint/n
|
||||
|
||||
|
||||
rows = load()
# Three negative-anchor pools: all Big-4, Big-4 without Firm A, and every
# firm except Firm A.
abcd = [r for r in rows if r[1] in BIG4]
bcd = [r for r in abcd if r[1] != FIRM_A]
bcd_non = [r for r in rows if r[1] != FIRM_A]
for pool, heading in (
        (abcd, 'ABCD (current paper baseline)'),
        (bcd, 'BCD only (exclude Firm A)'),
        (bcd_non, 'BCD + non-Big-4'),
):
    iccr(pool, heading)
|
||||
@@ -0,0 +1,160 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Script 47: BCD-only recompute of (1) KDE crossover, (2) per-signature
|
||||
pool-normalized any-pair ICCR (cos>0.95 & dHash<=5), (3) per-document HC+MC
|
||||
inter-CPA ICCR (cos>0.95 & dHash<=15), each for ABCD vs BCD-only negative-anchor
|
||||
pools. Replicates Scripts 10/43/44 methodology. Document-level subsampling used
|
||||
for the pool simulation (exact same-CPA pool sizes retained). Read-only.
|
||||
"""
|
||||
import sqlite3
|
||||
from collections import defaultdict
|
||||
import numpy as np
|
||||
from scipy.stats import gaussian_kde
|
||||
|
||||
# SQLite database path; load() opens it read-only.
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
# Firm value (matched against accountants.firm) dropped for the BCD-only scope.
FIRM_A = '勤業眾信聯合'
# Firm values bound to the IN (?,?,?,?) filter in load().
BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合')
# Firm value -> single-letter label used in scope sets and printed output.
ALIAS = {'勤業眾信聯合': 'A', '安侯建業聯合': 'B', '資誠聯合': 'C', '安永聯合': 'D'}
# Seed for every numpy default_rng created in this script.
SEED = 42
# Number of same-CPA (intra) pairs sampled by kde_crossover().
N_INTRA = 200_000
# Number of different-CPA (inter) pairs sampled by kde_crossover().
N_INTER = 500_000
N_DOC_SUBSAMPLE = 9000  # documents processed in pool simulation per scope
|
||||
|
||||
|
||||
def load():
    """Fetch Big-4 signatures that have an accountant and both descriptors.

    Returns:
        list of ``(signature_id, assigned_accountant, firm, source_pdf,
        feature_vector, dhash_vector)`` tuples restricted to the firms in
        BIG4.

    The database is opened read-only, and the connection is closed in a
    ``finally`` clause so a failing query does not leak it.
    """
    conn = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
    try:
        cur = conn.cursor()
        cur.execute("""
            SELECT s.signature_id, s.assigned_accountant, a.firm, s.source_pdf,
                   s.feature_vector, s.dhash_vector
            FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
            WHERE s.assigned_accountant IS NOT NULL AND a.firm IN (?,?,?,?)
              AND s.feature_vector IS NOT NULL AND s.dhash_vector IS NOT NULL""", BIG4)
        return cur.fetchall()
    finally:
        conn.close()
|
||||
|
||||
|
||||
def hamming1(q, c):
    """Return the Hamming distance (differing bit count) between two byte strings."""
    query_bits = int.from_bytes(q, 'big')
    cand_bits = int.from_bytes(c, 'big')
    # Binary rendering of the non-negative XOR; its '1' digits are the
    # differing bits.
    return format(query_bits ^ cand_bits, 'b').count('1')
|
||||
|
||||
|
||||
def wilson(k, n, z=1.96):
    """Return the Wilson score interval ``(lower, upper)`` for k hits in n trials.

    The bounds are clipped to [0, 1]. With ``n == 0`` no interval exists and
    ``(None, None)`` is returned. ``z`` is the normal quantile (1.96 default).
    """
    if n == 0:
        return (None, None)
    rate = k/n
    scale = 1+z*z/n
    midpoint = (rate+z*z/(2*n))/scale
    spread = z*np.sqrt(rate*(1-rate)/n+z*z/(4*n*n))/scale
    return (max(0.0, midpoint-spread), min(1.0, midpoint+spread))
|
||||
|
||||
|
||||
def kde_crossover(feats, cpas, label):
    """Print and return where the intra-CPA and inter-CPA cosine KDEs cross.

    Args:
        feats: 2-D array of feature vectors, one row per signature; dot
            products of rows are used as cosine similarities.
        cpas: accountant identifier for each row of ``feats``.
        label: tag printed in front of the result.

    Returns:
        list of x-positions in (0.6, 0.99) where the sign of
        ``KDE_intra - KDE_inter`` changes on a 10000-point grid over [0.3, 1].

    Only accountants with at least two signatures are sampled. Intra pairs
    pick a uniformly random accountant and two distinct signatures; inter
    pairs pick two distinct accountants and one signature from each.
    """
    grouped = defaultdict(list)
    for row, cpa in enumerate(cpas):
        grouped[cpa].append(row)
    by = {cpa: np.array(members) for cpa, members in grouped.items() if len(members) >= 2}
    accts = list(by)
    n_accts = len(accts)
    rng = np.random.default_rng(SEED)

    # intra: two sigs from same random CPA (accountant picks are pre-drawn)
    intra = np.empty(N_INTRA, np.float32)
    picks = rng.integers(0, n_accts, N_INTRA)
    for t, pick in enumerate(picks):
        a, b = rng.choice(by[accts[pick]], 2, replace=False)
        intra[t] = feats[a] @ feats[b]

    # inter: two sigs from different CPAs
    inter = np.empty(N_INTER, np.float32)
    for t in range(N_INTER):
        i, j = rng.choice(n_accts, 2, replace=False)
        a = rng.choice(by[accts[i]])
        b = rng.choice(by[accts[j]])
        inter[t] = feats[a] @ feats[b]

    xs = np.linspace(0.3, 1.0, 10000)
    # Only the first 100000 samples of each set feed the density estimate.
    kde_intra = gaussian_kde(intra[:100000])
    kde_inter = gaussian_kde(inter[:100000])
    diff = kde_intra(xs) - kde_inter(xs)
    sign_flips = np.where(np.diff(np.sign(diff)))[0]
    cross = [float(x) for x in xs[sign_flips] if 0.6 < x < 0.99]
    print(f' [{label}] intra mean={intra.mean():.4f} inter mean={inter.mean():.4f}'
          f' KDE crossover(s): {[f"{x:.4f}" for x in cross]}')
    return cross
|
||||
|
||||
|
||||
def pool_sim(rows, scope_firms, label):
    """Per-signature & per-document inter-CPA any-pair ICCR over a doc subsample.

    Args:
        rows: tuples as returned by ``load()``.
        scope_firms: set of firm letters (values of ALIAS) to keep.
        label: tag printed in front of each result line.

    For every signature of up to N_DOC_SUBSAMPLE randomly chosen documents, a
    candidate pool of at most ``same-CPA count - 1`` signatures from other
    CPAs is drawn. A signature is an HC hit when some candidate has
    cos > 0.95 and dHash <= 5; a document is an HC+MC hit when any of its
    signatures has a candidate with cos > 0.95 and dHash <= 15. Results are
    printed overall (with Wilson interval) and per firm.
    """
    keep = [r for r in rows if ALIAS[r[2]] in scope_firms]
    feats = np.stack([np.frombuffer(r[4], np.float32) for r in keep]).astype(np.float32)
    feats /= np.clip(np.linalg.norm(feats, axis=1, keepdims=True), 1e-9, None)
    cpas = [r[1] for r in keep]
    firms = [ALIAS[r[2]] for r in keep]
    dh = [r[5] for r in keep]
    n = len(keep)

    # Row indices grouped by accountant and by source document.
    members = defaultdict(list)
    doc_idx = defaultdict(list)
    for i, r in enumerate(keep):
        members[r[1]].append(i)
        doc_idx[r[3]].append(i)
    cpa_idx = {c: np.array(v) for c, v in members.items()}

    rng = np.random.default_rng(SEED)
    all_docs = list(doc_idx)
    n_sel = min(N_DOC_SUBSAMPLE, len(all_docs))
    sel_docs = [all_docs[i] for i in rng.choice(len(all_docs), n_sel, replace=False)]

    sig_hc = []      # per-signature: any-pair cos>0.95 & dh<=5
    sig_firm = []
    doc_hcmc = {}    # per-document worst-case: any sig with cos>0.95 & dh<=15
    doc_firm = {}
    for d in sel_docs:
        doc_hit = False
        for si in doc_idx[d]:
            sig_firm.append(firms[si])
            same = cpa_idx[cpas[si]]
            npool = len(same) - 1
            if npool <= 0:
                # Accountant has no other signature: counted as a non-hit.
                sig_hc.append(False)
                continue
            draw = rng.choice(n, size=min(npool*2+10, n), replace=True)
            cand = draw[~np.isin(draw, same)][:npool]
            high_cos = (feats[cand] @ feats[si]) > 0.95
            dhv = np.fromiter((hamming1(dh[si], dh[c2]) for c2 in cand), np.int32, len(cand))
            sig_hc.append(bool((high_cos & (dhv <= 5)).any()))
            if (high_cos & (dhv <= 15)).any():
                doc_hit = True
        doc_hcmc[d] = doc_hit
        # A document is attributed to the firm of its first signature.
        doc_firm[d] = firms[doc_idx[d][0]]

    def _per_firm(flags, firm_of):
        # Print the hit rate of `flags` separately for each firm label.
        for f in sorted(set(firm_of)):
            msk = firm_of == f
            kk = int(flags[msk].sum())
            mm = int(msk.sum())
            print(f' Firm {f}: {kk/mm:.4f} ({kk}/{mm})')

    sig_hc = np.array(sig_hc)
    sig_firm = np.array(sig_firm)
    k = int(sig_hc.sum())
    m = len(sig_hc)
    lo, hi = wilson(k, m)
    print(f'\n [{label}] per-SIGNATURE any-pair HC ICCR (cos>0.95 & dh<=5): '
          f'{k/m:.4f} ({k}/{m}) Wilson95% [{lo:.4f},{hi:.4f}]')
    _per_firm(sig_hc, sig_firm)

    dvals = np.array(list(doc_hcmc.values()))
    dfirm = np.array(list(doc_firm.values()))
    dk = int(dvals.sum())
    dm = len(dvals)
    dlo, dhi = wilson(dk, dm)
    print(f' [{label}] per-DOCUMENT HC+MC ICCR (cos>0.95 & dh<=15): '
          f'{dk/dm:.4f} ({dk}/{dm}) Wilson95% [{dlo:.4f},{dhi:.4f}]')
    _per_firm(dvals, dfirm)
|
||||
|
||||
|
||||
rows = load()
# L2-normalised feature matrix and accountant list for all loaded signatures.
allf = np.stack([np.frombuffer(r[4], np.float32) for r in rows]).astype(np.float32)
allf /= np.clip(np.linalg.norm(allf, axis=1, keepdims=True), 1e-9, None)
allc = [r[1] for r in rows]
abcd_mask = [True]*len(rows)  # not referenced below
bcd_mask = [r[2] != FIRM_A for r in rows]

print('=== (1) KDE crossover (intra vs inter cosine) ===')
kde_crossover(allf, allc, 'ABCD')
bcd_cpas = [cpa for cpa, in_bcd in zip(allc, bcd_mask) if in_bcd]
kde_crossover(allf[np.array(bcd_mask)], bcd_cpas, 'BCD-only')

print('\n=== (2)(3) per-signature & per-document inter-CPA ICCR ===')
for scope, heading in (({'A', 'B', 'C', 'D'}, 'ABCD (reproduce)'),
                       ({'B', 'C', 'D'}, 'BCD-only')):
    pool_sim(rows, scope, heading)
|
||||
@@ -0,0 +1,141 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Script 48: full-fidelity (no subsample) BCD-only recompute of per-signature
|
||||
and per-document inter-CPA any-pair ICCR, plus corpus-style KDE crossover.
|
||||
Vectorized popcount. Scopes: ABCD, BCD-only, BCD+non-Big-4. Read-only.
|
||||
"""
|
||||
import sqlite3
|
||||
from collections import defaultdict
|
||||
import numpy as np
|
||||
from scipy.stats import gaussian_kde
|
||||
|
||||
# SQLite database path; load() opens it read-only.
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
# Firm value (matched against accountants.firm) excluded from the BCD scopes.
FIRM_A = '勤業眾信聯合'
# Firm values that make up the Big-4 scope.
BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合')
# Firm value -> single-letter label; prep() labels any other firm 'X'.
ALIAS = {'勤業眾信聯合': 'A', '安侯建業聯合': 'B', '資誠聯合': 'C', '安永聯合': 'D'}
# Seed for every numpy default_rng created in this script.
SEED = 42
# Lookup table: POP[b] is the number of set bits in the byte value b (0..255).
POP = np.array([bin(i).count('1') for i in range(256)], dtype=np.uint8)
|
||||
|
||||
|
||||
def load():
    """Fetch all signatures that have an accountant, a firm and both descriptors.

    Returns:
        list of ``(assigned_accountant, firm, source_pdf, feature_vector,
        dhash_vector)`` tuples.

    The read-only connection is closed in a ``finally`` clause, so it is
    released even when the query raises.
    """
    conn = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
    try:
        cur = conn.cursor()
        cur.execute("""
            SELECT s.assigned_accountant, a.firm, s.source_pdf,
                   s.feature_vector, s.dhash_vector
            FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
            WHERE s.assigned_accountant IS NOT NULL AND a.firm IS NOT NULL
              AND s.feature_vector IS NOT NULL AND s.dhash_vector IS NOT NULL""")
        return cur.fetchall()
    finally:
        conn.close()
|
||||
|
||||
|
||||
def wilson(k, n, z=1.96):
    """Wilson score interval for k successes out of n.

    Returns ``(None, None)`` for ``n == 0``; otherwise the lower and upper
    bounds, limited to the range [0, 1].
    """
    if n == 0:
        return (None, None)
    p = k/n
    d = 1+z*z/n
    c = (p+z*z/(2*n))/d
    h = z*np.sqrt(p*(1-p)/n+z*z/(4*n*n))/d
    low, high = c-h, c+h
    return (max(0.0, low), min(1.0, high))
|
||||
|
||||
|
||||
def prep(rows, keep_fn):
    """Filter rows by firm and convert them into parallel numpy arrays.

    Args:
        rows: tuples as returned by ``load()``.
        keep_fn: predicate called with the firm value; rows for which it is
            falsy are dropped.

    Returns:
        ``(feats, dh, cpas, firms, docs)``: row-wise L2-normalised float32
        features, dHash bytes as a uint8 matrix, accountant per row, firm
        label per row (ALIAS letter, or 'X' for firms not in ALIAS) and
        source document per row.
    """
    selected = [r for r in rows if keep_fn(r[1])]
    feats = np.stack([np.frombuffer(r[3], np.float32) for r in selected]).astype(np.float32)
    norms = np.linalg.norm(feats, axis=1, keepdims=True)
    # Clipping guards the division against all-zero vectors.
    feats /= np.clip(norms, 1e-9, None)
    dh = np.stack([np.frombuffer(r[4], np.uint8) for r in selected])  # (n,8)
    cpas = np.array([r[0] for r in selected])
    firms = np.array([ALIAS.get(r[1], 'X') for r in selected])
    docs = np.array([r[2] for r in selected])
    return feats, dh, cpas, firms, docs
|
||||
|
||||
|
||||
def crossover(feats, cpas, label):
    """Print where the intra-CPA and inter-CPA cosine densities cross.

    Samples 100000 intra pairs (two distinct signatures of one random
    accountant) and 100000 inter pairs (one signature from each of two
    distinct accountants), fits a Gaussian KDE to each and prints the grid
    points in (0.6, 0.99) where the density difference changes sign, along
    with both sample means. Accountants with fewer than two signatures are
    ignored.
    """
    grouped = defaultdict(list)
    for row, cpa in enumerate(cpas):
        grouped[cpa].append(row)
    by = {cpa: np.array(v) for cpa, v in grouped.items() if len(v) >= 2}
    accts = list(by)
    n_accts = len(accts)
    rng = np.random.default_rng(SEED)
    N = 100_000
    intra = np.empty(N, np.float32)
    inter = np.empty(N, np.float32)
    picks = rng.integers(0, n_accts, N)
    for t in range(N):
        # The intra and inter draws alternate within one iteration; this
        # order defines the random stream.
        a, b = rng.choice(by[accts[picks[t]]], 2, replace=False)
        intra[t] = feats[a] @ feats[b]
        i, j = rng.choice(n_accts, 2, replace=False)
        left = feats[rng.choice(by[accts[i]])]
        right = feats[rng.choice(by[accts[j]])]
        inter[t] = left @ right
    xs = np.linspace(0.3, 1.0, 10000)
    diff = gaussian_kde(intra)(xs) - gaussian_kde(inter)(xs)
    sign_flips = np.where(np.diff(np.sign(diff)))[0]
    cross = [float(x) for x in xs[sign_flips] if 0.6 < x < 0.99]
    print(f' [{label}] crossover {[f"{x:.4f}" for x in cross]} '
          f'(intra {intra.mean():.4f} / inter {inter.mean():.4f})')
|
||||
|
||||
|
||||
def pool_sim(feats, dh, cpas, firms, docs, label):
    """Simulate inter-CPA candidate pools for every signature and print rates.

    Args:
        feats, dh, cpas, firms, docs: parallel arrays as produced by
            ``prep()``.
        label: tag printed in front of each result line.

    For each signature whose accountant has at least one other signature, up
    to ``same-CPA count - 1`` candidates from other accountants are drawn
    with replacement. The signature is an HC hit when a candidate has
    cos > 0.95 and dHash <= 5; its document is an HC+MC hit when a candidate
    has cos > 0.95 and dHash <= 15. Signatures without a pool stay in the
    denominators as non-hits. Overall rates carry a Wilson interval and are
    followed by per-firm rates (documents use the firm of their first row).
    """
    n = len(cpas)
    grouped = defaultdict(list)
    for i, c in enumerate(cpas):
        grouped[c].append(i)
    cpa_idx = {c: np.array(v) for c, v in grouped.items()}

    rng = np.random.default_rng(SEED)
    sig_hc = np.zeros(n, bool)
    doc_hcmc = {}
    for si in range(n):
        same = cpa_idx[cpas[si]]
        npool = same.size - 1
        if npool <= 0:
            continue
        # Over-draw so that enough candidates remain once the signature's
        # own accountant is filtered out.
        draw = rng.integers(0, n, size=npool + same.size + 20)
        cand = draw[~np.isin(draw, same)][:npool]
        high_cos = (feats[cand] @ feats[si]) > 0.95
        if not high_cos.any():
            continue
        # Per-byte popcount of the XOR, summed per candidate.
        dist = POP[dh[cand] ^ dh[si]].sum(axis=1)
        sig_hc[si] = bool((high_cos & (dist <= 5)).any())
        if (high_cos & (dist <= 15)).any():
            doc_hcmc[docs[si]] = True
    # ensure every doc present
    for d in docs:
        doc_hcmc.setdefault(d, False)

    k = int(sig_hc.sum())
    lo, hi = wilson(k, n)
    print(f'\n [{label}] per-SIGNATURE any-pair HC (cos>0.95 & dh<=5): '
          f'{k/n:.4f} ({k}/{n}) Wilson95% [{lo:.4f},{hi:.4f}]')
    for f in sorted(set(firms)):
        m = firms == f
        hits = int(sig_hc[m].sum())
        total = int(m.sum())
        print(f' Firm {f}: {hits/total:.4f} ({hits}/{total})')

    # per-doc, with firm of first sig
    dfirm = {}
    for d, firm in zip(docs, firms):
        dfirm.setdefault(d, firm)
    dl = list(doc_hcmc)
    dv = np.array([doc_hcmc[d] for d in dl])
    df = np.array([dfirm[d] for d in dl])
    dk = int(dv.sum())
    dm = len(dv)
    dlo, dhi = wilson(dk, dm)
    print(f' [{label}] per-DOCUMENT HC+MC (cos>0.95 & dh<=15): '
          f'{dk/dm:.4f} ({dk}/{dm}) Wilson95% [{dlo:.4f},{dhi:.4f}]')
    for f in sorted(set(df)):
        m = df == f
        hits = int(dv[m].sum())
        total = int(m.sum())
        print(f' Firm {f}: {hits/total:.4f} ({hits}/{total})')
|
||||
|
||||
|
||||
rows = load()


def _in_big4(fm):
    # Scope predicate: any Big-4 firm.
    return fm in BIG4


def _in_bcd(fm):
    # Scope predicate: Big-4 except Firm A.
    return fm in BIG4 and fm != FIRM_A


def _not_firm_a(fm):
    # Scope predicate: every firm except Firm A.
    return fm != FIRM_A


SCOPES = [('ABCD', _in_big4),
          ('BCD-only', _in_bcd),
          ('BCD+nonBig4', _not_firm_a)]

print('=== KDE crossover ===')
# The crossover is computed for the two Big-4 scopes only.
for name, fn in SCOPES[:2]:
    f, _, c, _, _ = prep(rows, fn)
    crossover(f, c, name)

print('\n=== per-signature & per-document inter-CPA ICCR (full) ===')
for name, fn in SCOPES:
    f, dh, c, fm, dc = prep(rows, fn)
    pool_sim(f, dh, c, fm, dc, name)
|
||||
@@ -0,0 +1,104 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Script 49: Firm A as out-of-sample target against a clean BCD baseline.
|
||||
(1) A signatures scored against a BCD-only candidate pool (true out-of-sample
|
||||
inter-firm coincidence).
|
||||
(2) Observed deployed rate on ACTUAL same-CPA pools, per firm (the real fired
|
||||
rate, from precomputed deployed descriptors), to juxtapose against the
|
||||
clean BCD inter-CPA coincidence floor. Read-only.
|
||||
"""
|
||||
import sqlite3
|
||||
from collections import defaultdict
|
||||
import numpy as np
|
||||
|
||||
# SQLite database path; opened read-only further down.
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
# Firm value (matched against accountants.firm) of the out-of-sample target.
FIRM_A = '勤業眾信聯合'
# Firm values bound to the IN (?,?,?,?) filter of the query.
BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合')
# Firm value -> single-letter label used in the printed per-firm lines.
ALIAS = {'勤業眾信聯合': 'A', '安侯建業聯合': 'B', '資誠聯合': 'C', '安永聯合': 'D'}
# Seed for the numpy default_rng used in part (1).
SEED = 42
# Lookup table: POP[b] is the number of set bits in the byte value b (0..255).
POP = np.array([bin(i).count('1') for i in range(256)], dtype=np.uint8)
|
||||
|
||||
|
||||
def wilson(k, n, z=1.96):
    """Compute the Wilson score interval of the proportion k/n.

    Args:
        k: hit count.
        n: sample size; 0 yields ``(None, None)``.
        z: normal quantile, 1.96 by default.

    Returns:
        Tuple ``(lower, upper)`` with both bounds clipped to [0, 1].
    """
    if n == 0:
        return (None, None)
    share = k/n
    norm = 1+z*z/n
    mid = (share+z*z/(2*n))/norm
    margin = z*np.sqrt(share*(1-share)/n+z*z/(4*n*n))/norm
    return (max(0.0, mid-margin), min(1.0, mid+margin))
|
||||
|
||||
|
||||
# Load every Big-4 signature that has an accountant and both raw descriptors.
# Row layout: (assigned_accountant, firm, source_pdf, feature_vector,
# dhash_vector, max_similarity_to_same_accountant, min_dhash_independent).
conn = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
try:
    cur = conn.cursor()
    cur.execute("""
        SELECT s.assigned_accountant, a.firm, s.source_pdf, s.feature_vector,
               s.dhash_vector, s.max_similarity_to_same_accountant, s.min_dhash_independent
        FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
        WHERE s.assigned_accountant IS NOT NULL AND a.firm IN (?,?,?,?)
          AND s.feature_vector IS NOT NULL AND s.dhash_vector IS NOT NULL""", BIG4)
    rows = cur.fetchall()
finally:
    # Release the read-only connection even if the query fails.
    conn.close()
|
||||
|
||||
# ---- (1) Firm A source vs BCD-only candidate pool ----
print('=== (1) Firm A out-of-sample vs clean BCD candidate pool ===')
A = [r for r in rows if r[1] == FIRM_A]
BCD = [r for r in rows if r[1] in BIG4 and r[1] != FIRM_A]
bcd_feat = np.stack([np.frombuffer(r[3], np.float32) for r in BCD]).astype(np.float32)
bcd_feat /= np.clip(np.linalg.norm(bcd_feat, axis=1, keepdims=True), 1e-9, None)
bcd_dh = np.stack([np.frombuffer(r[4], np.uint8) for r in BCD])
nb = len(BCD)

# A CPA pool sizes (their own same-CPA count - 1), to match negative-anchor construction
a_sig_count = defaultdict(int)
for r in A:
    a_sig_count[r[0]] += 1
pool_size = {cpa: count - 1 for cpa, count in a_sig_count.items()}

rng = np.random.default_rng(SEED)
sig_hc = np.zeros(len(A), bool)
doc_hcmc = {}
for i, (cpa, _firm, doc, fv, dh_bytes, _ms, _md) in enumerate(A):
    # Accountants with a single signature still get a pool of one candidate.
    npool = max(pool_size[cpa], 1)
    cand = rng.integers(0, nb, size=npool)
    sf = np.frombuffer(fv, np.float32).astype(np.float32)
    sf /= max(np.linalg.norm(sf), 1e-9)
    high_cos = (bcd_feat[cand] @ sf) > 0.95
    doc_hcmc.setdefault(doc, False)
    if not high_cos.any():
        continue
    # Per-byte popcount of the XOR, summed per candidate.
    dist = POP[bcd_dh[cand] ^ np.frombuffer(dh_bytes, np.uint8)].sum(axis=1)
    sig_hc[i] = bool((high_cos & (dist <= 5)).any())
    if (high_cos & (dist <= 15)).any():
        doc_hcmc[doc] = True

k = int(sig_hc.sum())
n = len(A)
lo, hi = wilson(k, n)
print(f' A-source vs BCD-pool per-SIGNATURE HC (cos>0.95 & dh<=5): '
      f'{k/n:.4f} ({k}/{n}) Wilson95% [{lo:.4f},{hi:.4f}]')
dv = np.array(list(doc_hcmc.values()))
dk = int(dv.sum())
dm = len(dv)
dlo, dhi = wilson(dk, dm)
print(f' A-source vs BCD-pool per-DOCUMENT HC+MC (cos>0.95 & dh<=15): '
      f'{dk/dm:.4f} ({dk}/{dm}) Wilson95% [{dlo:.4f},{dhi:.4f}]')
|
||||
|
||||
# ---- (2) Observed deployed rate on ACTUAL same-CPA pools, per firm ----
print('\n=== (2) Observed deployed rate on actual same-CPA pools (real fired rate) ===')
print(' per-signature HC = max_sim>0.95 & min_dh<=5 ; per-doc HC+MC worst-case dh<=15')
by_firm_sig = defaultdict(lambda: [0, 0])  # firm letter -> [HC hits, signatures]
doc_obs = {}    # document -> True once any of its signatures is HC or MC
doc_firm = {}   # document -> firm letter of its first scored signature
for r in rows:
    ms, md = r[5], r[6]
    if ms is None or md is None:
        # Signatures without precomputed descriptors are not scored.
        continue
    fm = ALIAS[r[1]]
    high_sim = ms > 0.95
    hc = high_sim and (md <= 5)
    hcmc = high_sim and (md <= 15)
    tally = by_firm_sig[fm]
    tally[0] += int(hc)
    tally[1] += 1
    doc_firm.setdefault(r[2], fm)
    doc_obs[r[2]] = doc_obs.get(r[2], False) or hcmc
for fm in sorted(by_firm_sig):
    k, n = by_firm_sig[fm]
    lo, hi = wilson(k, n)
    print(f' Firm {fm} per-SIGNATURE HC: {k/n:.4f} ({k}/{n}) [{lo:.4f},{hi:.4f}]')

dd = defaultdict(lambda: [0, 0])  # firm letter -> [hit documents, documents]
for d, hit in doc_obs.items():
    fm = doc_firm[d]
    dd[fm][0] += int(hit)
    dd[fm][1] += 1
for fm in sorted(dd):
    k, n = dd[fm]
    print(f' Firm {fm} per-DOCUMENT HC+MC: {k/n:.4f} ({k}/{n})')
# NOTE(review): the floor figures below are hard-coded literals, not values
# computed by this script -- confirm they match the current baseline run.
print('\n Clean BCD inter-CPA coincidence FLOOR: per-sig HC=0.0048, per-doc HC+MC=0.1281')
|
||||
@@ -0,0 +1,107 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Script 50: publication-grade scoped inter-CPA anchor recompute.
|
||||
Faithfully reproduces Script 45's any-pair five-way pool simulation
|
||||
(max_cos & min_dh over a random same-size inter-CPA pool, excl. same-CPA),
|
||||
then reports for scopes ABCD / BCD / BCD+nonBig4:
|
||||
- per-signature HC (D1) and HC+MC (D2) any-pair FAR
|
||||
- per-document HC (D1) and HC+MC (D2) any-pair FAR
|
||||
- per-firm per-document D2
|
||||
ABCD is printed first to verify reproduction of published values
|
||||
(per-sig HC~0.1102, per-doc D2~0.3375, Firm A~0.62). Read-only.
|
||||
"""
|
||||
import sqlite3
|
||||
from collections import defaultdict
|
||||
import numpy as np
|
||||
|
||||
# SQLite database path; load() opens it read-only.
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
# Firm value (matched against accountants.firm) excluded from the BCD scopes.
FIRM_A = '勤業眾信聯合'
# Firm values that make up the Big-4 scope.
BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合')
# Firm value -> single-letter label; run() labels any other firm 'NonB4'.
ALIAS = {'勤業眾信聯合': 'A', '安侯建業聯合': 'B', '資誠聯合': 'C', '安永聯合': 'D'}
# Seed for the numpy default_rng created in run().
SEED = 42
# Lookup table: POP[b] is the number of set bits in the byte value b (0..255).
POP = np.array([bin(i).count('1') for i in range(256)], dtype=np.uint8)
|
||||
|
||||
|
||||
def wilson(k, n, z=1.96):
    """Wilson score confidence bounds for the rate k/n.

    Yields ``(None, None)`` when there are no trials; otherwise a
    ``(lower, upper)`` pair clipped to the unit interval.
    """
    if n == 0:
        return (None, None)
    p = k/n
    zz = z*z
    d = 1+zz/n
    c = (p+zz/(2*n))/d
    h = z*np.sqrt(p*(1-p)/n+zz/(4*n*n))/d
    bounds = (max(0.0, c-h), min(1.0, c+h))
    return bounds
|
||||
|
||||
|
||||
def load():
    """Fetch all signatures that have an accountant, a firm and both descriptors.

    Returns:
        list of ``(assigned_accountant, firm, source_pdf, feature_vector,
        dhash_vector)`` tuples.

    The database is opened read-only; ``finally`` guarantees the connection
    is closed when the query raises as well as on success.
    """
    conn = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
    try:
        cur = conn.cursor()
        cur.execute("""
            SELECT s.assigned_accountant, a.firm, s.source_pdf,
                   s.feature_vector, s.dhash_vector
            FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
            WHERE s.assigned_accountant IS NOT NULL AND a.firm IS NOT NULL
              AND s.feature_vector IS NOT NULL AND s.dhash_vector IS NOT NULL""")
        return cur.fetchall()
    finally:
        conn.close()
|
||||
|
||||
|
||||
def run(rows, keep_fn, label):
    """Simulate inter-CPA pools for one firm scope and print any-pair rates.

    Args:
        rows: ``(assigned_accountant, firm, source_pdf, feature_vector,
            dhash_vector)`` tuples as returned by ``load()``.
        keep_fn: predicate on the firm value selecting the scope.
        label: heading printed above the results.

    For each kept signature whose accountant has another signature in scope,
    up to ``same-CPA count - 1`` candidates from other accountants are drawn
    with replacement. The signature is scored by the maximum cosine and the
    minimum dHash distance over that pool; the two extremes are taken
    independently and need not come from the same candidate. HC (D1) is
    ``max_cos > 0.95 and min_dh <= 5``; HC+MC (D2) widens the dHash bound to
    15. Signatures without a usable pool keep the defaults (cosine 0,
    distance 64) and therefore count as non-hits. Per-signature,
    per-document (any signature hit) and per-firm per-document D2 rates are
    printed.
    """
    keep = [r for r in rows if keep_fn(r[1])]
    n = len(keep)
    feats = np.stack([np.frombuffer(r[3], np.float32) for r in keep]).astype(np.float32)
    feats /= np.clip(np.linalg.norm(feats, axis=1, keepdims=True), 1e-9, None)
    dh = np.stack([np.frombuffer(r[4], np.uint8) for r in keep])
    cpas = np.array([r[0] for r in keep])
    firms = np.array([ALIAS.get(r[1], 'NonB4') for r in keep])
    docs = np.array([r[2] for r in keep])
    cpa_idx = defaultdict(list)
    for i, c in enumerate(cpas):
        cpa_idx[c].append(i)
    cpa_idx = {c: np.array(v) for c, v in cpa_idx.items()}
    pool_size = {c: len(v)-1 for c, v in cpa_idx.items()}
    rng = np.random.default_rng(SEED)
    max_cos = np.zeros(n, np.float32)
    min_dh = np.full(n, 64, np.int32)
    for si in range(n):
        c = cpas[si]
        npool = pool_size[c]
        if npool <= 0:
            continue
        same = cpa_idx[c]
        # Over-draw so enough candidates remain after same-CPA rows are removed.
        draw = rng.integers(0, n, size=npool + same.size + 20)
        cand = draw[~np.isin(draw, same)][:npool]
        if cand.size == 0:
            # Every draw hit the signature's own accountant (possible when one
            # accountant dominates the scope). max()/min() of an empty array
            # raise ValueError, so treat this like a missing pool.
            continue
        cosv = feats[cand] @ feats[si]
        # Per-byte popcount of the XOR, summed per candidate.
        dist = POP[dh[cand] ^ dh[si]].sum(axis=1)
        max_cos[si] = cosv.max()
        min_dh[si] = int(dist.min())
    # any-pair classification
    hc = (max_cos > 0.95) & (min_dh <= 5)
    mc = (max_cos > 0.95) & (min_dh > 5) & (min_dh <= 15)
    d1 = hc
    d2 = hc | mc
    print(f'\n===== {label} (n_sig={n:,}) =====')
    for nm, arr in [('per-sig HC (D1)', d1), ('per-sig HC+MC (D2)', d2)]:
        k = int(arr.sum())
        lo, hi = wilson(k, n)
        print(f' {nm}: {k/n:.4f} ({k}/{n}) [{lo:.4f},{hi:.4f}]')
    # per-document worst-case
    doc_d1 = defaultdict(bool)
    doc_d2 = defaultdict(bool)
    doc_firm = {}
    for i in range(n):
        if d1[i]:
            doc_d1[docs[i]] = True
        if d2[i]:
            doc_d2[docs[i]] = True
        # A document is attributed to the firm of its first signature.
        doc_firm.setdefault(docs[i], firms[i])
        doc_d1.setdefault(docs[i], False)
        doc_d2.setdefault(docs[i], False)
    dl = list(doc_d2.keys())
    nd = len(dl)
    k1 = sum(doc_d1[d] for d in dl)
    k2 = sum(doc_d2[d] for d in dl)
    l1 = wilson(k1, nd)
    l2 = wilson(k2, nd)
    print(f' per-doc HC (D1): {k1/nd:.4f} ({k1}/{nd}) [{l1[0]:.4f},{l1[1]:.4f}]')
    print(f' per-doc HC+MC (D2):{k2/nd:.4f} ({k2}/{nd}) [{l2[0]:.4f},{l2[1]:.4f}]')
    df = np.array([doc_firm[d] for d in dl])
    dv = np.array([doc_d2[d] for d in dl])
    for f in sorted(set(df)):
        m = df == f
        print(f' Firm {f} per-doc D2: {dv[m].sum()/m.sum():.4f} ({int(dv[m].sum())}/{int(m.sum())})')
|
||||
|
||||
|
||||
rows = load()
# Scopes in print order; ABCD comes first so it can be checked against the
# published values quoted in its heading.
scope_runs = (
    (lambda fm: fm in BIG4, 'ABCD (verify vs published: HC~0.110 / D2~0.338 / A~0.62)'),
    (lambda fm: fm in BIG4 and fm != FIRM_A, 'BCD-only'),
    (lambda fm: fm != FIRM_A, 'BCD + non-Big4'),
)
for scope_fn, heading in scope_runs:
    run(rows, scope_fn, heading)
|
||||
@@ -0,0 +1,135 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Script 51: publication polish.
|
||||
Part A: CPA-block bootstrap (1000 reps) on per-signature HC any-pair rate, and
|
||||
document-level bootstrap on per-document HC+MC, for ABCD & BCD.
|
||||
Part B: corpus-wide KDE crossover (pair-weighted intra, reproduce 0.837) plus
|
||||
BCD-only and BCD+nonBig4 variants.
|
||||
Read-only.
|
||||
"""
|
||||
import sqlite3
|
||||
from collections import defaultdict
|
||||
import numpy as np
|
||||
from scipy.stats import gaussian_kde
|
||||
|
||||
# SQLite database path; load() opens it read-only.
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
# Firm value (matched against accountants.firm) referred to as Firm A.
FIRM_A = '勤業眾信聯合'
# Firm values that make up the Big-4 scope.
BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合')
# Firm value -> single-letter label.
ALIAS = {'勤業眾信聯合': 'A', '安侯建業聯合': 'B', '資誠聯合': 'C', '安永聯合': 'D'}
# Base seed: simulate() and crossover() use SEED, boot_part() uses SEED + 1.
SEED = 42
# Number of bootstrap replicates per statistic in boot_part().
N_BOOT = 1000
# Lookup table: POP[b] is the number of set bits in the byte value b (0..255).
POP = np.array([bin(i).count('1') for i in range(256)], dtype=np.uint8)
|
||||
|
||||
|
||||
def load():
    """Fetch all signatures that have an accountant, a firm and both descriptors.

    Returns:
        list of ``(assigned_accountant, firm, source_pdf, feature_vector,
        dhash_vector)`` tuples.

    The read-only connection is always closed, including when the query
    raises.
    """
    conn = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
    try:
        cur = conn.cursor()
        cur.execute("""
            SELECT s.assigned_accountant, a.firm, s.source_pdf,
                   s.feature_vector, s.dhash_vector
            FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
            WHERE s.assigned_accountant IS NOT NULL AND a.firm IS NOT NULL
              AND s.feature_vector IS NOT NULL AND s.dhash_vector IS NOT NULL""")
        return cur.fetchall()
    finally:
        conn.close()
|
||||
|
||||
|
||||
# ============ Part A: bootstrap on anchor rates ============
|
||||
def simulate(keep):
    """Run the inter-CPA pool simulation and return per-signature flags.

    Args:
        keep: ``(assigned_accountant, firm, source_pdf, feature_vector,
            dhash_vector)`` tuples already restricted to the desired scope.

    Returns:
        ``(hc, d2, cpa_idx, docs)`` where ``hc`` and ``d2`` are boolean
        arrays (max cosine > 0.95 together with min dHash <= 5, respectively
        <= 15, over the simulated pool), ``cpa_idx`` maps each accountant to
        the array of its row indices and ``docs`` holds the source document
        of each row.

    Signatures without a usable candidate pool keep cosine 0 / distance 64
    and are therefore never flagged.
    """
    n = len(keep)
    feats = np.stack([np.frombuffer(r[3], np.float32) for r in keep]).astype(np.float32)
    feats /= np.clip(np.linalg.norm(feats, axis=1, keepdims=True), 1e-9, None)
    dh = np.stack([np.frombuffer(r[4], np.uint8) for r in keep])
    cpas = np.array([r[0] for r in keep])
    docs = np.array([r[2] for r in keep])
    cpa_idx = defaultdict(list)
    for i, c in enumerate(cpas):
        cpa_idx[c].append(i)
    cpa_idx = {c: np.array(v) for c, v in cpa_idx.items()}
    pool_size = {c: len(v)-1 for c, v in cpa_idx.items()}
    rng = np.random.default_rng(SEED)
    max_cos = np.zeros(n, np.float32)
    min_dh = np.full(n, 64, np.int32)
    for si in range(n):
        c = cpas[si]
        npool = pool_size[c]
        if npool <= 0:
            continue
        same = cpa_idx[c]
        # Over-draw so enough candidates remain after same-CPA rows are removed.
        draw = rng.integers(0, n, size=npool + same.size + 20)
        cand = draw[~np.isin(draw, same)][:npool]
        if cand.size == 0:
            # All draws landed on the signature's own accountant; max()/min()
            # of an empty array raise ValueError, so skip it like an
            # accountant without a pool.
            continue
        cosv = feats[cand] @ feats[si]
        # Per-byte popcount of the XOR, summed per candidate.
        dist = POP[dh[cand] ^ dh[si]].sum(axis=1)
        max_cos[si] = cosv.max()
        min_dh[si] = int(dist.min())
    hc = (max_cos > 0.95) & (min_dh <= 5)
    d2 = (max_cos > 0.95) & (min_dh <= 15)
    return hc, d2, cpa_idx, docs
|
||||
|
||||
|
||||
def boot_part(keep, label):
    """Print point estimates and bootstrap 95% intervals for one scope.

    Two resampling schemes are run on the output of ``simulate(keep)``:
    a CPA-block bootstrap of the per-signature HC rate (whole CPAs are
    resampled with replacement) and a document-level bootstrap of the share
    of documents having at least one signature in the wider (d2) box.

    Args:
        keep: signature rows for the scope being evaluated.
        label: tag inserted into both printed lines.
    """
    hc, d2, cpa_idx, docs = simulate(keep)
    rng = np.random.default_rng(SEED + 1)

    # --- per-signature HC, resampling whole CPAs ---
    blocks = list(cpa_idx.values())
    n_cpa = len(blocks)
    sig_boot = np.empty(N_BOOT)
    for rep in range(N_BOOT):
        picks = rng.choice(n_cpa, n_cpa, replace=True)
        sig_boot[rep] = hc[np.concatenate([blocks[p] for p in picks])].mean()

    # --- per-document flag: any signature of the document inside d2 ---
    flagged = {}
    for doc, hit in zip(docs, d2):
        flagged[doc] = flagged.get(doc, False) or bool(hit)
    doc_hit = np.array(list(flagged.values()))
    n_doc = len(doc_hit)
    doc_boot = np.empty(N_BOOT)
    for rep in range(N_BOOT):
        doc_boot[rep] = doc_hit[rng.integers(0, n_doc, n_doc)].mean()

    sig_lo, sig_hi = np.percentile(sig_boot, [2.5, 97.5])
    doc_lo, doc_hi = np.percentile(doc_boot, [2.5, 97.5])
    print(f'\n [{label}] per-sig HC point={hc.mean():.4f} '
          f'CPA-block boot95% [{sig_lo:.4f}, {sig_hi:.4f}]')
    print(f' [{label}] per-doc HC+MC point={doc_hit.mean():.4f} '
          f'doc boot95% [{doc_lo:.4f}, {doc_hi:.4f}]')
|
||||
|
||||
|
||||
# ============ Part B: pair-weighted KDE crossover ============
|
||||
def crossover(keep, label):
    """Print where the intra-CPA and inter-CPA cosine KDEs cross.

    Only CPAs with at least three signatures take part. Intra-CPA pairs are
    sampled by first picking a CPA with probability proportional to its
    number of pairs and then two distinct signatures of that CPA; inter-CPA
    pairs pick two distinct CPAs uniformly and one signature from each.
    Sign changes of (intra KDE - inter KDE) on a grid over [0.3, 1.0] that
    fall strictly between 0.6 and 0.99 are reported, together with the
    mean/median of both samples.

    Args:
        keep: signature rows (index 0 = CPA name, index 3 = float32 features).
        label: tag inserted into the printed line.
    """
    vecs = np.stack([np.frombuffer(row[3], np.float32) for row in keep]).astype(np.float32)
    vecs /= np.clip(np.linalg.norm(vecs, axis=1, keepdims=True), 1e-9, None)

    owners = np.array([row[0] for row in keep])
    grouped = defaultdict(list)
    for pos, who in enumerate(owners):
        grouped[who].append(pos)
    # One index array per eligible CPA, in first-appearance order.
    members = [np.array(ix) for ix in grouped.values() if len(ix) >= 3]
    n_cpa = len(members)

    pair_w = np.array([m.size * (m.size - 1) / 2 for m in members], float)
    pair_w /= pair_w.sum()

    rng = np.random.default_rng(SEED)
    M = 100_000

    # Intra-CPA sample: weighting by pair count makes it uniform over pairs.
    chosen = rng.choice(n_cpa, M, p=pair_w)
    intra = np.empty(M, np.float32)
    for t, k in enumerate(chosen):
        a, b = rng.choice(members[k], 2, replace=False)
        intra[t] = vecs[a] @ vecs[b]

    # Inter-CPA sample; the left signature is drawn before the right one.
    inter = np.empty(M, np.float32)
    for t in range(M):
        i, j = rng.choice(n_cpa, 2, replace=False)
        left = vecs[rng.choice(members[i])]
        right = vecs[rng.choice(members[j])]
        inter[t] = left @ right

    xs = np.linspace(0.3, 1.0, 10000)
    diff = gaussian_kde(intra)(xs) - gaussian_kde(inter)(xs)
    flips = np.where(np.diff(np.sign(diff)))[0]
    cr = [float(x) for x in xs[flips] if 0.6 < x < 0.99]
    shown = [f"{x:.4f}" for x in cr]
    print(f' [{label}] crossover {shown} '
          f'(intra {intra.mean():.4f}/{np.median(intra):.4f} inter {inter.mean():.4f}/{np.median(inter):.4f})')
|
||||
|
||||
|
||||
# ---- driver: load once, then run both parts over each scope ----
rows = load()
abcd = [r for r in rows if r[1] in BIG4]
bcd = [r for r in abcd if r[1] != FIRM_A]
non_a = [r for r in rows if r[1] != FIRM_A]

print('=== Part A: bootstrap CIs on anchor rates ===')
for scope, tag in ((abcd, 'ABCD (verify ~0.109 / ~0.338)'),
                   (bcd, 'BCD-only')):
    boot_part(scope, tag)

print('\n=== Part B: KDE crossover (pair-weighted intra, corpus-wide reproduces 0.837) ===')
for scope, tag in ((rows, 'corpus-wide (all firms)'),
                   (bcd, 'BCD-only'),
                   (non_a, 'BCD + non-Big4')):
    crossover(scope, tag)
|
||||
@@ -0,0 +1,169 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Script 52: canonical-correct locked publication numbers (supersedes 48/50,
|
||||
fixes the per-firm assignment and A-out-of-sample any-pair issues codex flagged).
|
||||
|
||||
Uses the EXACT canonical candidate sampler of Scripts 43/45 (retry-loop to
|
||||
collect exactly n_pool non-same-CPA candidates, rng.choice, default_rng(42)),
|
||||
any-pair max-cos/min-dHash five-way classification, and dominant-firm document
|
||||
assignment. Scopes: ABCD / BCD / BCD+nonBig4, plus Firm-A out-of-sample vs a
|
||||
clean BCD candidate pool. Read-only.
|
||||
"""
|
||||
import sqlite3
|
||||
from collections import defaultdict, Counter
|
||||
import numpy as np
|
||||
|
||||
# Production signature database (opened read-only by load()).
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'

# Firm name -> anonymised letter; insertion order is A, B, C, D.
ALIAS = {
    '勤業眾信聯合': 'A',
    '安侯建業聯合': 'B',
    '資誠聯合': 'C',
    '安永聯合': 'D',
}
BIG4 = tuple(ALIAS)
FIRM_A = BIG4[0]

SEED = 42
N_BOOT = 1000

# Per-byte popcount table used to turn XOR-ed hash bytes into Hamming distance.
POP = np.array([format(i, 'b').count('1') for i in range(256)], dtype=np.uint8)
|
||||
|
||||
|
||||
def wilson(k, n, z=1.96):
    """Return the Wilson score interval for k successes out of n trials.

    Args:
        k: number of successes.
        n: number of trials.
        z: normal quantile (1.96 by default).

    Returns:
        ``(lower, upper)`` clamped to [0, 1], or ``(None, None)`` when n is 0.
    """
    if n == 0:
        return (None, None)
    phat = k / n
    z2 = z * z
    denom = 1 + z2 / n
    centre = (phat + z2 / (2 * n)) / denom
    half = z * np.sqrt(phat * (1 - phat) / n + z2 / (4 * n * n)) / denom
    lower = max(0.0, centre - half)
    upper = min(1.0, centre + half)
    return (lower, upper)
|
||||
|
||||
|
||||
def load():
    """Fetch every usable signature row from the signature DB (read-only).

    Returns:
        A list of tuples ``(assigned_accountant, firm, source_pdf,
        feature_vector, dhash_vector)`` limited to signatures that have an
        assigned accountant, a non-NULL firm and both descriptor columns
        populated.
    """
    # mode=ro via a file: URI so the analysis can never write to the DB.
    conn = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
    try:
        cur = conn.cursor()
        cur.execute("""
            SELECT s.assigned_accountant, a.firm, s.source_pdf,
                   s.feature_vector, s.dhash_vector
            FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
            WHERE s.assigned_accountant IS NOT NULL AND a.firm IS NOT NULL
              AND s.feature_vector IS NOT NULL AND s.dhash_vector IS NOT NULL""")
        return cur.fetchall()
    finally:
        # Release the handle even if the query or the fetch raises.
        conn.close()
|
||||
|
||||
|
||||
def canonical_sampler(rng, n, n_pool, same_cpa, all_idx):
    """Draw exactly ``n_pool`` indices that are not in ``same_cpa``.

    Up to ten rejection rounds each draw twice the outstanding number of
    indices from ``range(n)`` (with replacement) and keep those outside
    ``same_cpa``. Anything still missing afterwards is filled by sampling
    without replacement from the eligible indices.

    Args:
        rng: ``numpy.random.Generator`` supplying the draws.
        n: size of the index universe.
        n_pool: number of candidates to return.
        same_cpa: indices that must be excluded.
        all_idx: ``np.arange(n)``, passed in to avoid rebuilding it per call.

    Returns:
        An int64 array of candidate indices (duplicates possible).
    """
    picked = []
    missing = n_pool
    for _ in range(10):
        if missing <= 0:
            break
        batch = rng.choice(n, size=missing * 2, replace=True)
        usable = batch[np.isin(batch, same_cpa, invert=True)][:missing]
        picked += usable.tolist()
        missing -= usable.size
    if missing > 0:
        # Rejection sampling ran out of rounds: take the rest directly from
        # the indices that are allowed.
        eligible = np.ones(n, dtype=bool)
        eligible[same_cpa] = False
        picked += rng.choice(all_idx[eligible], size=missing, replace=False).tolist()
    return np.array(picked[:n_pool], dtype=np.int64)
|
||||
|
||||
|
||||
def simulate(keep):
    """Compute any-pair max-cosine / min-dHash against random other-CPA pools.

    For every signature whose CPA has at least two signatures, a pool of
    (same-CPA count - 1) candidates from other CPAs is drawn with
    ``canonical_sampler``; the highest cosine and the lowest dHash Hamming
    distance over that pool are stored. Other signatures keep the defaults
    (cosine 0, distance 64).

    Args:
        keep: rows shaped like the output of ``load()``.

    Returns:
        ``(max_cos, min_dh, cpas, firms, docs, cpa_idx)``: two per-signature
        result arrays, the per-signature CPA / firm-alias / source-PDF arrays
        (firms outside ``ALIAS`` are labelled ``'NonB4'``) and the
        ``{cpa: index array}`` mapping.
    """
    n = len(keep)

    feats = np.stack([np.frombuffer(row[3], np.float32) for row in keep]).astype(np.float32)
    norms = np.linalg.norm(feats, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # leave all-zero vectors untouched
    feats = feats / norms
    dh = np.stack([np.frombuffer(row[4], np.uint8) for row in keep])

    cpas = np.array([row[0] for row in keep])
    firms = np.array([ALIAS.get(row[1], 'NonB4') for row in keep])
    docs = np.array([row[2] for row in keep])

    grouped = {}
    for pos, who in enumerate(cpas):
        grouped.setdefault(who, []).append(pos)
    cpa_idx = {who: np.array(ix) for who, ix in grouped.items()}

    all_idx = np.arange(n)
    rng = np.random.default_rng(SEED)
    max_cos = np.zeros(n, np.float32)
    min_dh = np.full(n, 64, np.int32)
    for si, who in enumerate(cpas):
        peers = cpa_idx[who]
        want = peers.size - 1
        if want <= 0:
            continue
        cand = canonical_sampler(rng, n, want, peers, all_idx)
        max_cos[si] = (feats[cand] @ feats[si]).max()
        # Popcount of XOR-ed hash bytes, summed per candidate = Hamming distance.
        min_dh[si] = int(POP[dh[cand] ^ dh[si]].sum(axis=1).min())
    return max_cos, min_dh, cpas, firms, docs, cpa_idx
|
||||
|
||||
|
||||
def report(keep, label):
    """Print per-signature and per-document coincidence rates for one scope.

    Output, in order: per-signature HC and HC+MC rates with Wilson intervals,
    a CPA-block bootstrap interval for the per-signature HC rate, overall
    per-document HC and HC+MC rates (a document counts when any of its
    signatures is flagged), and the per-document HC+MC rate per firm, where a
    document is assigned to the most common firm among its signatures.

    Args:
        keep: signature rows for the scope.
        label: heading text for the printed section.
    """
    max_cos, min_dh, cpas, firms, docs, cpa_idx = simulate(keep)
    n = len(cpas)
    similar = max_cos > 0.95
    hc = similar & (min_dh <= 5)
    d2 = similar & (min_dh <= 15)

    print(f'\n===== {label} (n_sig={n:,}) =====')
    for title, flags in (('per-sig HC', hc), ('per-sig HC+MC', d2)):
        k = int(flags.sum())
        lo, hi = wilson(k, n)
        print(f' {title}: {k/n:.6f} ({k}/{n}) [{lo:.4f},{hi:.4f}]')

    # CPA-block bootstrap on per-sig HC
    rng = np.random.default_rng(SEED + 1)
    blocks = list(cpa_idx.values())
    n_cpa = len(blocks)
    boot = np.empty(N_BOOT)
    for rep in range(N_BOOT):
        picks = rng.choice(n_cpa, n_cpa, replace=True)
        boot[rep] = hc[np.concatenate([blocks[p] for p in picks])].mean()
    b_lo, b_hi = np.percentile(boot, [2.5, 97.5])
    print(f' per-sig HC CPA-block boot95% [{b_lo:.4f},{b_hi:.4f}]')

    # per-doc, dominant-firm assignment (canonical)
    by_doc = defaultdict(list)
    for pos, doc in enumerate(docs):
        by_doc[doc].append(pos)
    groups = list(by_doc.values())
    nd = len(groups)
    doc_d1 = np.array([hc[g].any() for g in groups])
    doc_d2 = np.array([d2[g].any() for g in groups])
    doc_firm = np.array([Counter(firms[g]).most_common(1)[0][0] for g in groups])
    print(f' per-doc HC: {doc_d1.mean():.6f} ({int(doc_d1.sum())}/{nd})')
    print(f' per-doc HC+MC: {doc_d2.mean():.6f} ({int(doc_d2.sum())}/{nd})')
    for firm in sorted(set(doc_firm)):
        of_firm = doc_d2[doc_firm == firm]
        print(f' Firm {firm} per-doc D2: {of_firm.mean():.4f} ({int(of_firm.sum())}/{len(of_firm)})')
|
||||
|
||||
|
||||
def a_out_of_sample(rows):
    """Score Firm A signatures against a candidate pool of Firms B/C/D only.

    Each Firm A signature whose CPA has at least two Firm A signatures is
    compared with (same-CPA count - 1) candidates drawn with replacement from
    the B/C/D signatures; the best cosine and the lowest dHash distance over
    those candidates (any-pair) decide the HC and HC+MC flags. Prints the
    per-signature HC rate with its Wilson interval and the per-document
    HC+MC rate.

    Args:
        rows: all signature rows as returned by ``load()``.
    """
    firm_a = [r for r in rows if r[1] == FIRM_A]
    pool = [r for r in rows if r[1] in BIG4 and r[1] != FIRM_A]

    pool_feat = np.stack([np.frombuffer(r[3], np.float32) for r in pool]).astype(np.float32)
    n_cand = pool_feat.shape[0]
    pool_norm = np.linalg.norm(pool_feat, axis=1, keepdims=True)
    pool_norm[pool_norm == 0] = 1.0
    pool_feat = pool_feat / pool_norm
    pool_dh = np.stack([np.frombuffer(r[4], np.uint8) for r in pool])

    # Number of Firm A signatures per CPA; the pool size is that count - 1.
    per_cpa = Counter(r[0] for r in firm_a)

    rng = np.random.default_rng(SEED)
    n = len(firm_a)
    hc = np.zeros(n, bool)
    d2 = np.zeros(n, bool)
    docs = np.array([r[2] for r in firm_a])
    for i, r in enumerate(firm_a):
        draws = per_cpa[r[0]] - 1
        if draws <= 0:  # singleton CPA: no same-CPA pool, skip (canonical)
            continue
        # Firm A is absent from the pool, so no same-CPA rejection is needed.
        cand = rng.choice(n_cand, size=draws, replace=True)
        probe = np.frombuffer(r[3], np.float32).astype(np.float32)
        probe = probe / max(np.linalg.norm(probe), 1e-9)
        best_cos = (pool_feat[cand] @ probe).max()
        best_dh = int(POP[pool_dh[cand] ^ np.frombuffer(r[4], np.uint8)].sum(axis=1).min())
        hc[i] = (best_cos > 0.95) and (best_dh <= 5)
        d2[i] = (best_cos > 0.95) and (best_dh <= 15)

    k = int(hc.sum())
    lo, hi = wilson(k, n)
    print('\n===== Firm A out-of-sample vs clean BCD pool (any-pair) =====')
    print(f' per-sig HC: {k/n:.6f} ({k}/{n}) [{lo:.5f},{hi:.5f}]')

    # A document is flagged when any of its signatures is in the d2 box.
    doc_flag = {}
    for doc, hit in zip(docs, d2):
        doc_flag[doc] = doc_flag.get(doc, False) or bool(hit)
    dd2 = np.array(list(doc_flag.values()))
    print(f' per-doc HC+MC: {dd2.mean():.6f} ({int(dd2.sum())}/{len(doc_flag)})')
|
||||
|
||||
|
||||
# ---- driver: three reporting scopes, then Firm A held out ----
rows = load()
big4_rows = [r for r in rows if r[1] in BIG4]
report(big4_rows, 'ABCD (verify: per-sig HC~0.1102 / per-doc D2~0.3375)')
report([r for r in big4_rows if r[1] != FIRM_A],
       'BCD-only (verify codex: HC~0.0116 / doc HC~0.0226 / doc D2~0.1905)')
report([r for r in rows if r[1] != FIRM_A], 'BCD + non-Big4')
a_out_of_sample(rows)
|
||||
@@ -0,0 +1,125 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Script 53: BCD-only firm-effect logistic regression (Firm D reference) and
|
||||
BCD-only cross-firm hit matrix. Candidate pool = BCD (exclude Firm A and
|
||||
same-CPA). Canonical retry-loop sampler, any-pair + same-pair. Read-only.
|
||||
Replicates Script 44's logistic_fit and matrix logic, restricted to BCD.
|
||||
"""
|
||||
import sqlite3
|
||||
from collections import defaultdict
|
||||
import numpy as np
|
||||
|
||||
# Production signature database (opened read-only below).
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'

# Firm name -> anonymised letter; insertion order is A, B, C, D.
ALIAS = {
    '勤業眾信聯合': 'A',
    '安侯建業聯合': 'B',
    '資誠聯合': 'C',
    '安永聯合': 'D',
}
BIG4 = tuple(ALIAS)
FIRM_A = BIG4[0]

SEED = 42

# Per-byte popcount table used to turn XOR-ed hash bytes into Hamming distance.
POP = np.array([format(i, 'b').count('1') for i in range(256)], dtype=np.uint8)
|
||||
|
||||
|
||||
def logistic_fit(X, y, max_iter=200, l2=0.001):
    """Fit an L2-penalised logistic regression by Newton-Raphson.

    Args:
        X: (n, k) design matrix; include a column of ones for an intercept.
        y: length-n vector of 0/1 outcomes.
        max_iter: maximum number of Newton iterations.
        l2: ridge penalty applied to every coefficient (intercept included).

    Returns:
        ``(beta, se)``: the coefficient vector and standard errors taken as
        ``sqrt(diag(inv(X'WX + l2*I)))`` at the final ``beta``.

    Raises:
        numpy.linalg.LinAlgError: if the final penalised information matrix
            is singular and cannot be inverted.
    """
    k = X.shape[1]
    ridge = l2 * np.eye(k)
    beta = np.zeros(k)
    for _ in range(max_iter):
        # Clip the linear predictor so exp() cannot overflow.
        eta = np.clip(X @ beta, -30, 30)
        p = 1.0 / (1.0 + np.exp(-eta))
        grad = X.T @ (y - p) - l2 * beta      # gradient of the penalised log-likelihood
        W = p * (1 - p)
        H = -(X.T * W) @ X - ridge            # its Hessian (negative definite)
        try:
            delta = np.linalg.solve(H, grad)
        except np.linalg.LinAlgError:
            # Singular Hessian: fall back to a damped gradient step. The
            # update below is beta - delta, so delta must be the NEGATIVE
            # gradient for the step to go uphill on the log-likelihood
            # (the previous +0.3*grad moved away from the optimum).
            delta = -0.3 * grad
        nb = beta - delta
        if np.max(np.abs(nb - beta)) < 1e-8:
            beta = nb
            break
        beta = nb
    eta = np.clip(X @ beta, -30, 30)
    p = 1.0 / (1.0 + np.exp(-eta))
    W = p * (1 - p)
    cov = np.linalg.inv((X.T * W) @ X + ridge)
    return beta, np.sqrt(np.diag(cov))
|
||||
|
||||
|
||||
# Read-only load of the Firm B/C/D signatures; Firm A is excluded by the
# query itself, so it can never enter the candidate pool.
conn = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
try:
    cur = conn.cursor()
    cur.execute("""SELECT s.assigned_accountant, a.firm, s.feature_vector, s.dhash_vector
        FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
        WHERE s.assigned_accountant IS NOT NULL AND a.firm IN (?,?,?)
          AND s.feature_vector IS NOT NULL AND s.dhash_vector IS NOT NULL""",
                ('安侯建業聯合', '資誠聯合', '安永聯合'))
    rows = cur.fetchall()
finally:
    # Release the handle even if the query or the fetch raises.
    conn.close()
|
||||
|
||||
# ---- decode descriptors and index signatures by CPA ----
n = len(rows)

# Unit-length float32 feature vectors (all-zero vectors are left as they are).
feats = np.stack([np.frombuffer(row[2], np.float32) for row in rows]).astype(np.float32)
norms = np.linalg.norm(feats, axis=1, keepdims=True)
norms[norms == 0] = 1.0
feats /= norms

dh = np.stack([np.frombuffer(row[3], np.uint8) for row in rows])
cpas = np.array([row[0] for row in rows])
firms = np.array([ALIAS[row[1]] for row in rows])

cpa_idx = {}
for pos, who in enumerate(cpas):
    cpa_idx.setdefault(who, []).append(pos)
cpa_idx = {who: np.array(ix) for who, ix in cpa_idx.items()}
# Candidate-pool size per CPA = number of same-CPA peers.
pool_size = {who: len(ix) - 1 for who, ix in cpa_idx.items()}
all_idx = np.arange(n)
print(f'BCD signatures: {n:,}; CPAs: {len(cpa_idx)}')
|
||||
|
||||
# ---- per-signature simulation against a random other-CPA pool ----
rng = np.random.default_rng(SEED)
hit_any = np.zeros(n, bool)
hit_same = np.zeros(n, bool)
cand_firm_maxcos = np.empty(n, dtype=object)  # any-pair partner firm
cand_firm_same = np.empty(n, dtype=object)
psize = np.zeros(n, np.int32)
for si in range(n):
    same = cpa_idx[cpas[si]]
    np_ = pool_size[cpas[si]]
    psize[si] = np_
    if np_ <= 0:
        continue

    # Retry-loop sampler: up to ten rejection rounds, each drawing twice the
    # outstanding count, then a without-replacement fill for any remainder.
    cand = []
    need = np_
    for _ in range(10):
        if need <= 0:
            break
        draw = rng.choice(n, size=need * 2, replace=True)
        ok = draw[np.isin(draw, same, invert=True)][:need]
        cand.extend(ok.tolist())
        need -= ok.size
    if need > 0:
        pm = np.ones(n, bool)
        pm[same] = False
        cand.extend(rng.choice(all_idx[pm], size=need, replace=False).tolist())
    cand = np.array(cand[:np_], dtype=np.int64)

    cosv = feats[cand] @ feats[si]
    dist = POP[dh[cand] ^ dh[si]].sum(axis=1)

    # any-pair: best cosine and best dHash may come from different candidates;
    # the recorded partner is the max-cosine candidate.
    best = int(np.argmax(cosv))
    if cosv[best] > 0.95 and dist.min() <= 5:
        hit_any[si] = True
        cand_firm_maxcos[si] = firms[cand[best]]

    # same-pair: one candidate must satisfy both thresholds; the first such
    # candidate is recorded as the partner.
    joint = np.flatnonzero((cosv > 0.95) & (dist <= 5))
    if joint.size:
        hit_same[si] = True
        cand_firm_same[si] = firms[cand[joint[0]]]
|
||||
|
||||
# ---- Logistic regression: hit_any ~ FirmB + FirmC + log(pool), Firm D reference ----
# Signatures with an empty pool were never simulated, so they are left out.
hp = psize > 0
y = hit_any[hp].astype(np.float64)
fa = firms[hp]
lp = np.log(psize[hp].astype(np.float64))
lp -= lp.mean()  # centre so the intercept refers to the average log pool size
X = np.stack([np.ones_like(y),
              (fa == 'B').astype(float),
              (fa == 'C').astype(float),
              lp], axis=1)
beta, se = logistic_fit(X, y)
print(f'\n[BCD logistic: hit_any ~ FirmB + FirmC + log(pool); Firm D = reference] n={len(y):,}, y_mean={y.mean():.4f}')
terms = ('intercept(FirmD)', 'FirmB', 'FirmC', 'log(pool,centred)')
for nm, b, s in zip(terms, beta, se):
    zval = abs(b) / s if s > 0 else float("inf")
    print(f' {nm}: beta={b:+.4f} SE={s:.4f} OR={np.exp(b):.4f} z~{zval:.2f}')
|
||||
|
||||
# ---- Cross-firm hit matrix (any-pair max-cos partner) ----
print('\n[BCD cross-firm hit matrix: any-pair, source firm x max-cos partner firm]')
print(' src -> B C D | within-firm% (n hits)')
for sf in ('B', 'C', 'D'):
    # Partner firms of this source firm's any-pair hits.
    parts = cand_firm_maxcos[(firms == sf) & hit_any]
    tot = len(parts)
    cnt = {f: int(np.count_nonzero(parts == f)) for f in ('B', 'C', 'D')}
    wf = cnt[sf] / tot if tot else 0
    print(f' {sf} {cnt["B"]:5d} {cnt["C"]:5d} {cnt["D"]:5d} | {wf:6.1%} ({tot})')

print('\n[BCD same-pair within-firm concentration]')
for sf in ('B', 'C', 'D'):
    parts = cand_firm_same[(firms == sf) & hit_same]
    tot = len(parts)
    within = int(np.count_nonzero(parts == sf))
    wf = within / tot if tot else 0
    print(f' Firm {sf}: {wf:.1%} within-firm ({within}/{tot})')
|
||||
Reference in New Issue
Block a user