Add Scripts 39b/c/d/e + 40b + 43: anchor-based FAR diagnostics
Spike checkpoint in response to codex rounds 28-30 review:
- 39b/c: signature-level dip test on Big-4 and non-Big-4 marginals
- 39d: dHash discrete-value robustness (raw vs jittered + histogram
valleys + firm residualization); confirms within-firm dHash dip
rejection is integer-mass-point artefact
- 39e: dHash firm-residualized + jittered 2x2 factorial decomposition;
confirms Big-4 pooled dh "multimodality" is composition + integer
artefact (centered + jittered p=0.35, 0/5 seeds reject)
- 40b: inter-CPA per-pair FAR sweep (cos + dh marginal + joint +
conditional); replicates v3 cos>0.95 FAR=0.0006 and provides
v4-new dh FAR curve
- 43: pool-normalized per-signature FAR (codex round-30 fix for
per-pair vs per-signature conflation); per-sig FAR for deployed
any-pair rule = 11.02%, per-firm structure shows Firm A 20% vs
B/C/D <1%
These scripts replace the distributional path (K=3 mixture / dip /
antimode) with anchor-based threshold derivation. Companion
artefacts in reports/v4_big4/{signature_level_diptest,
midsmall_signature_diptest, dhash_discrete_robustness,
inter_cpa_far_sweep, pool_normalized_far}/.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,195 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Script 39b: Signature-Level Dip Test (multimodality at the signature cloud)
|
||||
============================================================================
|
||||
Phase 5 pre-emptive evidence. Script 34 / 36 already report Hartigan
|
||||
dip tests on the 437 accountant-level (cos_mean, dh_mean) means and
|
||||
both marginals reject unimodality at p < 5e-4. Reviewers may ask
|
||||
whether the same multimodality is detectable at the signature level
|
||||
itself (n = 150,442 Big-4 signatures) and whether the multimodality
|
||||
is a within-firm or only a between-firm phenomenon.
|
||||
|
||||
This script supplies the missing dip evidence on the raw signature
|
||||
cloud. It is a *diagnostic* in the same role as Scripts 34/36 dip
|
||||
tests: it does not derive an operational threshold; it characterises
|
||||
the marginal distributions of (cos, dh_indep) at the signature level.
|
||||
|
||||
Outputs:
|
||||
reports/v4_big4/signature_level_diptest/
|
||||
sig_diptest_results.json
|
||||
sig_diptest_report.md
|
||||
|
||||
Tests performed:
|
||||
A. Pooled Big-4 marginals (cos, dh_indep), n = 150,442
|
||||
B. Per-firm marginals (Firm A / B / C / D separately)
|
||||
"""
|
||||
|
||||
import json
|
||||
import sqlite3
|
||||
import numpy as np
|
||||
import diptest
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from scipy import stats
|
||||
from scipy.signal import find_peaks
|
||||
|
||||
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
|
||||
OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/'
|
||||
'v4_big4/signature_level_diptest')
|
||||
OUT.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合')
|
||||
ALIAS = {'勤業眾信聯合': 'Firm A',
|
||||
'安侯建業聯合': 'Firm B',
|
||||
'資誠聯合': 'Firm C',
|
||||
'安永聯合': 'Firm D'}
|
||||
N_BOOT = 2000
|
||||
|
||||
|
||||
def load_big4_signatures():
    """Fetch one row per Big-4 signature from the corpus DB.

    Returns a list of tuples
    ``(assigned_accountant, firm, max_similarity_to_same_accountant,
    min_dhash_independent)`` with the dHash value CAST to REAL so numpy
    receives floats.

    Fix vs. original: open the database read-only (URI ``mode=ro``) —
    this diagnostic must never mutate the corpus and Script 39d already
    connects this way — and close the connection in ``finally`` so it is
    not leaked if the query raises.
    """
    conn = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
    try:
        cur = conn.cursor()
        cur.execute('''
            SELECT s.assigned_accountant, a.firm,
                   s.max_similarity_to_same_accountant,
                   CAST(s.min_dhash_independent AS REAL)
            FROM signatures s
            JOIN accountants a ON s.assigned_accountant = a.name
            WHERE s.assigned_accountant IS NOT NULL
              AND s.max_similarity_to_same_accountant IS NOT NULL
              AND s.min_dhash_independent IS NOT NULL
              AND a.firm IN (?, ?, ?, ?)
        ''', BIG4)
        rows = cur.fetchall()
    finally:
        conn.close()
    return rows
|
||||
|
||||
|
||||
def kde_dip(values, n_boot=N_BOOT):
    """Hartigan dip test plus a KDE mode/antimode summary for one marginal.

    Parameters
    ----------
    values : array-like of float
        Raw marginal values; non-finite entries are dropped.
    n_boot : int
        Bootstrap replicates for the dip-test p-value.

    Returns a JSON-serialisable dict with the dip statistic, bootstrap
    p-value, KDE peak count/locations and the antimode (local density
    minimum) between each adjacent peak pair.

    Fix vs. original: add the same small-sample guard Script 39c's twin
    of this function has — ``gaussian_kde`` raises on degenerate inputs
    and the dip test is meaningless on a handful of points — so the two
    scripts now behave consistently.
    """
    arr = np.asarray(values, dtype=float)
    arr = arr[np.isfinite(arr)]
    # Guard (mirrors Script 39c): too few points to support KDE/dip.
    if len(arr) < 10:
        return {'n': int(len(arr)), 'skipped': 'too few points'}
    dip, pval = diptest.diptest(arr, boot_pval=True, n_boot=n_boot)
    kde = stats.gaussian_kde(arr, bw_method='silverman')
    xs = np.linspace(arr.min(), arr.max(), 2000)
    density = kde(xs)
    # Peaks must rise at least 2% of the max density above their base.
    peaks, _ = find_peaks(density, prominence=density.max() * 0.02)
    antimodes = []
    for i in range(len(peaks) - 1):
        seg = density[peaks[i]:peaks[i + 1]]
        if not len(seg):
            continue
        # Deepest point of the density between two adjacent peaks.
        local = peaks[i] + int(np.argmin(seg))
        antimodes.append(float(xs[local]))
    return {
        'n': int(len(arr)),
        'dip': float(dip),
        'dip_pvalue': float(pval),
        'unimodal_alpha05': bool(pval > 0.05),
        'n_modes': int(len(peaks)),
        'mode_locations': [float(xs[p]) for p in peaks],
        'antimodes': antimodes,
        'n_boot': int(n_boot),
    }
|
||||
|
||||
|
||||
def _fmt_p(p):
|
||||
if p == 0.0:
|
||||
return '< 5e-4 (no bootstrap replicate exceeded observed dip)'
|
||||
return f'{p:.4g}'
|
||||
|
||||
|
||||
def main():
    """Run the signature-level dip diagnostic and write JSON + markdown.

    Loads every Big-4 signature, runs kde_dip on the pooled (cos,
    dh_indep) marginals and on each firm's marginals separately, then
    emits sig_diptest_results.json and sig_diptest_report.md into OUT.
    """
    print('=' * 72)
    print('Script 39b: Signature-Level Dip Test')
    print('=' * 72)
    rows = load_big4_signatures()
    # Column order from load_big4_signatures: (accountant, firm, cos, dh).
    cos_all = np.array([r[2] for r in rows], dtype=float)
    dh_all = np.array([r[3] for r in rows], dtype=float)
    firms = np.array([ALIAS[r[1]] for r in rows])
    print(f'\nLoaded {len(rows):,} Big-4 signatures')
    for f in sorted(set(firms)):
        print(f' {f}: {(firms == f).sum():,}')

    results = {
        'meta': {
            'script': '39b',
            'timestamp': datetime.now().isoformat(timespec='seconds'),
            'n_total': int(len(rows)),
            'n_boot': N_BOOT,
            'note': ('Signature-level Hartigan dip test on Big-4 '
                     '(cos, dh_indep) marginals; pooled and per-firm.'),
        },
        'pooled': {},
        'per_firm': {},
    }

    # A. Pooled: both marginals over all Big-4 signatures at once.
    print('\n[A] Pooled Big-4')
    for desc, arr in [('cos', cos_all), ('dh_indep', dh_all)]:
        r = kde_dip(arr)
        results['pooled'][desc] = r
        print(f' {desc}: n={r["n"]:,}, dip={r["dip"]:.5f}, '
              f'p={_fmt_p(r["dip_pvalue"])}, n_modes={r["n_modes"]}')

    # B. Per-firm: same two marginals restricted to each firm's mask.
    print('\n[B] Per-firm')
    for f in sorted(set(firms)):
        mask = firms == f
        results['per_firm'][f] = {}
        for desc, arr in [('cos', cos_all[mask]), ('dh_indep', dh_all[mask])]:
            r = kde_dip(arr)
            results['per_firm'][f][desc] = r
            print(f' {f} {desc}: n={r["n"]:,}, dip={r["dip"]:.5f}, '
                  f'p={_fmt_p(r["dip_pvalue"])}, n_modes={r["n_modes"]}')

    # Machine-readable artefact.
    json_path = OUT / 'sig_diptest_results.json'
    json_path.write_text(json.dumps(results, indent=2, ensure_ascii=False),
                         encoding='utf-8')
    print(f'\n[json] {json_path}')

    # Human-readable report: pooled table, per-firm table, reading guide.
    md = ['# Signature-Level Dip Test (Script 39b)',
          '',
          f'Generated: {results["meta"]["timestamp"]}',
          f'Bootstrap replicates: {N_BOOT}',
          '',
          '## A. Pooled Big-4 signature cloud',
          '',
          f'n = {results["meta"]["n_total"]:,} signatures',
          '',
          '| Marginal | dip | p (boot) | n_modes | unimodal @0.05 |',
          '|---|---|---|---|---|']
    for desc in ['cos', 'dh_indep']:
        r = results['pooled'][desc]
        md.append(f'| {desc} | {r["dip"]:.5f} | {_fmt_p(r["dip_pvalue"])} | '
                  f'{r["n_modes"]} | {r["unimodal_alpha05"]} |')

    md += ['', '## B. Per-firm signature-level dip tests', '',
           '| Firm | Marginal | n | dip | p (boot) | n_modes | unimodal @0.05 |',
           '|---|---|---|---|---|---|---|']
    for f in sorted(results['per_firm']):
        for desc in ['cos', 'dh_indep']:
            r = results['per_firm'][f][desc]
            md.append(f'| {f} | {desc} | {r["n"]:,} | {r["dip"]:.5f} | '
                      f'{_fmt_p(r["dip_pvalue"])} | {r["n_modes"]} | '
                      f'{r["unimodal_alpha05"]} |')
    md += ['',
           '## Reading guide',
           '',
           ('A unimodality rejection at the signature level confirms '
            'multimodal structure independent of accountant-level '
            'aggregation. A within-firm rejection further indicates the '
            'multimodality is not solely a between-firm artefact. A '
            'within-firm non-rejection (e.g., Firm A) is consistent with '
            'that firm being concentrated in a single mechanism corner.'),
           '',
           ('All thresholds and operational classifiers remain those of '
            'v3.x §III-K and v4.0 §III-J; this script supplies diagnostic '
            'evidence only.'),
           '']
    md_path = OUT / 'sig_diptest_report.md'
    md_path.write_text('\n'.join(md), encoding='utf-8')
    print(f'[md ] {md_path}')


if __name__ == '__main__':
    # CLI entry point: run the diagnostic and write both artefacts.
    main()
|
||||
@@ -0,0 +1,214 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Script 39c: Mid/Small-Firm Signature-Level Dip Test
|
||||
====================================================
|
||||
Companion to Script 39b. 39b showed every Big-4 firm rejects
|
||||
unimodality on the dHash signature marginal (p < 5e-4 in each
|
||||
of A/B/C/D) while every Big-4 firm fails to reject unimodality
|
||||
on the cosine marginal. This script asks the same questions of
|
||||
the mid/small-firm population (non-Big-4):
|
||||
|
||||
1. Does the pooled mid/small-firm signature cloud show the same
|
||||
dHash multimodality?
|
||||
2. Within individual mid/small firms (those with enough
|
||||
signatures to support the test), does the dHash multimodality
|
||||
hold firm-internally as it does in Big-4?
|
||||
|
||||
If yes, the dHash signature-level multimodality is corpus-universal
|
||||
and the Big-4 scope restriction of v4.0 is not necessary on dHash
|
||||
grounds (cf §III-G item 2 which currently rests on Big-4-level
|
||||
multimodality). The cosine axis is reported alongside for
|
||||
completeness, but no v4.0 claim turns on cosine multimodality
|
||||
outside Big-4.
|
||||
|
||||
Outputs:
|
||||
reports/v4_big4/midsmall_signature_diptest/
|
||||
midsmall_diptest_results.json
|
||||
midsmall_diptest_report.md
|
||||
"""
|
||||
|
||||
import json
|
||||
import sqlite3
|
||||
import numpy as np
|
||||
import diptest
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from scipy import stats
|
||||
from scipy.signal import find_peaks
|
||||
|
||||
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
|
||||
OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/'
|
||||
'v4_big4/midsmall_signature_diptest')
|
||||
OUT.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合')
|
||||
N_BOOT = 2000
|
||||
SINGLE_FIRM_MIN_SIG = 500 # minimum signature count to run a per-firm dip test
|
||||
|
||||
|
||||
def load_non_big4_signatures():
    """Fetch one row per non-Big-4 signature from the corpus DB.

    Returns a list of tuples
    ``(firm, max_similarity_to_same_accountant, min_dhash_independent)``
    with the dHash value CAST to REAL so numpy receives floats.

    Fix vs. original: open the database read-only (URI ``mode=ro``) for
    consistency with Script 39d — diagnostics must never mutate the
    corpus — and close the connection in ``finally`` so it is not leaked
    if the query raises.
    """
    conn = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
    try:
        cur = conn.cursor()
        cur.execute('''
            SELECT a.firm,
                   s.max_similarity_to_same_accountant,
                   CAST(s.min_dhash_independent AS REAL)
            FROM signatures s
            JOIN accountants a ON s.assigned_accountant = a.name
            WHERE s.assigned_accountant IS NOT NULL
              AND s.max_similarity_to_same_accountant IS NOT NULL
              AND s.min_dhash_independent IS NOT NULL
              AND a.firm IS NOT NULL
              AND a.firm NOT IN (?, ?, ?, ?)
        ''', BIG4)
        rows = cur.fetchall()
    finally:
        conn.close()
    return rows
|
||||
|
||||
|
||||
def kde_dip(values, n_boot=N_BOOT):
    """Dip test plus KDE peak/antimode summary for a 1-D marginal.

    Drops non-finite values, refuses samples of fewer than 10 points
    (returns a 'skipped' marker instead), then reports the Hartigan dip
    statistic with a bootstrap p-value and a Silverman-bandwidth KDE
    mode count, mode locations, and the antimode between each adjacent
    pair of modes.
    """
    data = np.asarray(values, dtype=float)
    data = data[np.isfinite(data)]
    if data.size < 10:
        return {'n': int(data.size), 'skipped': 'too few points'}
    dip_stat, p_value = diptest.diptest(data, boot_pval=True, n_boot=n_boot)
    grid = np.linspace(data.min(), data.max(), 2000)
    dens = stats.gaussian_kde(data, bw_method='silverman')(grid)
    peak_idx, _ = find_peaks(dens, prominence=dens.max() * 0.02)
    # Antimode = deepest density point strictly between adjacent peaks.
    antimodes = []
    for lo, hi in zip(peak_idx[:-1], peak_idx[1:]):
        if hi <= lo:
            continue
        antimodes.append(float(grid[lo + int(np.argmin(dens[lo:hi]))]))
    return {
        'n': int(data.size),
        'dip': float(dip_stat),
        'dip_pvalue': float(p_value),
        'unimodal_alpha05': bool(p_value > 0.05),
        'n_modes': int(peak_idx.size),
        'mode_locations': [float(grid[i]) for i in peak_idx],
        'antimodes': antimodes,
        'n_boot': int(n_boot),
    }
|
||||
|
||||
|
||||
def _fmt_p(p):
|
||||
if p == 0.0:
|
||||
return '< 5e-4'
|
||||
return f'{p:.4g}'
|
||||
|
||||
|
||||
def main():
    """Run the mid/small-firm dip diagnostic and write JSON + markdown.

    Loads all non-Big-4 signatures, runs kde_dip on the pooled (cos,
    dh_indep) marginals and on each firm large enough to support the
    test (>= SINGLE_FIRM_MIN_SIG signatures), then emits
    midsmall_diptest_results.json and midsmall_diptest_report.md.
    """
    print('=' * 72)
    print('Script 39c: Mid/Small-Firm Signature-Level Dip Test')
    print('=' * 72)
    rows = load_non_big4_signatures()
    # Column order from load_non_big4_signatures: (firm, cos, dh).
    cos_all = np.array([r[1] for r in rows], dtype=float)
    dh_all = np.array([r[2] for r in rows], dtype=float)
    firms = np.array([r[0] for r in rows])
    n_total = len(rows)
    print(f'\nLoaded {n_total:,} non-Big-4 signatures across '
          f'{len(set(firms))} firms')

    # Firm size table (plain dict count; also saved into results).
    firm_counts = {}
    for f in firms:
        firm_counts[f] = firm_counts.get(f, 0) + 1
    top = sorted(firm_counts.items(), key=lambda x: -x[1])
    print('\nTop firms by signature count:')
    for f, n in top[:10]:
        print(f' {f}: {n:,}')

    results = {
        'meta': {
            'script': '39c',
            'timestamp': datetime.now().isoformat(timespec='seconds'),
            'n_total': int(n_total),
            'n_firms': int(len(firm_counts)),
            'n_boot': N_BOOT,
            'single_firm_min_sig': SINGLE_FIRM_MIN_SIG,
        },
        'pooled': {},
        'per_firm_eligible': {},
        'firm_counts': dict(firm_counts),
    }

    # A. Pooled non-Big-4: both marginals over the whole population.
    print('\n[A] Pooled non-Big-4')
    for desc, arr in [('cos', cos_all), ('dh_indep', dh_all)]:
        r = kde_dip(arr)
        results['pooled'][desc] = r
        print(f' {desc}: n={r["n"]:,}, dip={r["dip"]:.5f}, '
              f'p={_fmt_p(r["dip_pvalue"])}, n_modes={r["n_modes"]}')

    # B. Per-firm (only firms with >= SINGLE_FIRM_MIN_SIG signatures,
    # so the dip test has enough support to be meaningful).
    eligible = [f for f, n in firm_counts.items() if n >= SINGLE_FIRM_MIN_SIG]
    print(f'\n[B] Per-firm dip test '
          f'(firms with >= {SINGLE_FIRM_MIN_SIG} signatures: {len(eligible)})')
    for f in sorted(eligible, key=lambda x: -firm_counts[x]):
        mask = firms == f
        results['per_firm_eligible'][f] = {'n': int(mask.sum())}
        for desc, arr in [('cos', cos_all[mask]), ('dh_indep', dh_all[mask])]:
            r = kde_dip(arr)
            results['per_firm_eligible'][f][desc] = r
            print(f' {f[:20]:<22s} {desc}: n={r["n"]:,}, dip={r["dip"]:.5f}, '
                  f'p={_fmt_p(r["dip_pvalue"])}, n_modes={r["n_modes"]}')

    # Machine-readable artefact.
    json_path = OUT / 'midsmall_diptest_results.json'
    json_path.write_text(json.dumps(results, indent=2, ensure_ascii=False),
                         encoding='utf-8')
    print(f'\n[json] {json_path}')

    # Human-readable report: pooled table, eligible-firm table, guide.
    md = ['# Mid/Small-Firm Signature-Level Dip Test (Script 39c)',
          '',
          f'Generated: {results["meta"]["timestamp"]}',
          f'Bootstrap replicates: {N_BOOT}',
          '',
          '## A. Pooled non-Big-4 signature cloud',
          '',
          f'n = {n_total:,} signatures across '
          f'{results["meta"]["n_firms"]} firms',
          '',
          '| Marginal | dip | p (boot) | n_modes | unimodal @0.05 |',
          '|---|---|---|---|---|']
    for desc in ['cos', 'dh_indep']:
        r = results['pooled'][desc]
        md.append(f'| {desc} | {r["dip"]:.5f} | {_fmt_p(r["dip_pvalue"])} | '
                  f'{r["n_modes"]} | {r["unimodal_alpha05"]} |')

    md += ['', f'## B. Single mid/small firms (>= {SINGLE_FIRM_MIN_SIG} '
           f'signatures), {len(eligible)} qualify', '',
           '| Firm | Marginal | n | dip | p (boot) | n_modes | unimodal @0.05 |',
           '|---|---|---|---|---|---|---|']
    for f in sorted(eligible, key=lambda x: -firm_counts[x]):
        for desc in ['cos', 'dh_indep']:
            r = results['per_firm_eligible'][f][desc]
            md.append(f'| {f[:20]} | {desc} | {r["n"]:,} | {r["dip"]:.5f} | '
                      f'{_fmt_p(r["dip_pvalue"])} | {r["n_modes"]} | '
                      f'{r["unimodal_alpha05"]} |')

    md += ['',
           '## Reading guide',
           '',
           ('If the pooled-non-Big-4 dHash marginal rejects unimodality '
            'AND the qualifying individual mid/small firms also reject, '
            'the dHash within-firm replication regime structure is '
            'corpus-universal and not Big-4-specific. In that case the '
            'Big-4 scope of v4.0 is justified on cosine-axis grounds '
            '(Firm-A composition; §III-G item 1) and accountant-level '
            'LOOO reproducibility (§III-G item 3), but not on dHash '
            'multimodality grounds (§III-G item 2 should be re-scoped or '
            'qualified). If the per-firm dHash tests instead fail to '
            'reject inside mid/small firms, the dHash multimodality is '
            'Big-4-specific and §III-G item 2 holds as stated.'),
           '']
    md_path = OUT / 'midsmall_diptest_report.md'
    md_path.write_text('\n'.join(md), encoding='utf-8')
    print(f'[md ] {md_path}')


if __name__ == '__main__':
    # CLI entry point: run the diagnostic and write both artefacts.
    main()
|
||||
@@ -0,0 +1,446 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Script 39d: dHash Discrete-Value Robustness Diagnostics
|
||||
========================================================
|
||||
Codex (gpt-5.5 xhigh) attack on Script 39b/39c findings revealed that
|
||||
the within-firm dHash dip-test rejections are driven by integer mass
|
||||
points (dHash takes integer values 0..64). A uniform jitter of
|
||||
[-0.5, +0.5] eliminates dip rejection in every firm tested. This
|
||||
script consolidates that finding into a permanent diagnostic and adds:
|
||||
|
||||
1. Raw vs jittered dip with multi-seed robustness (5 seeds)
|
||||
2. Integer-histogram valley analysis: locate local minima between
|
||||
adjacent peaks in the binned integer distribution; report whether
|
||||
any valley centers near dh = 5
|
||||
3. Firm-residualized dip on dHash (analog of cosine firm-mean
|
||||
centering that confirmed the cosine reframe)
|
||||
4. Pairwise pair-coincidence: does the same same-CPA pair achieve
|
||||
both max cosine and min dHash, or are the two descriptors
|
||||
attached to different pairs? Foundation for "is (cos, dh) a
|
||||
joint signature regime descriptor or two parallel descriptors"
|
||||
|
||||
This script does not derive operational thresholds; it characterises
|
||||
whether the v4.0 K=3 mixture and v3.x cos>0.95 AND dh<=5 rule are
|
||||
robustly supported once integer-discreteness artifacts are removed.
|
||||
|
||||
Outputs:
|
||||
reports/v4_big4/dhash_discrete_robustness/
|
||||
dhash_discrete_results.json
|
||||
dhash_discrete_report.md
|
||||
"""
|
||||
|
||||
import json
|
||||
import sqlite3
|
||||
import numpy as np
|
||||
import diptest
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
|
||||
OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/'
|
||||
'v4_big4/dhash_discrete_robustness')
|
||||
OUT.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合')
|
||||
ALIAS = {'勤業眾信聯合': 'Firm A',
|
||||
'安侯建業聯合': 'Firm B',
|
||||
'資誠聯合': 'Firm C',
|
||||
'安永聯合': 'Firm D'}
|
||||
N_BOOT = 2000
|
||||
JITTER_SEEDS = [42, 43, 44, 45, 46]
|
||||
SINGLE_FIRM_MIN_SIG = 500
|
||||
|
||||
|
||||
def load_signatures():
    """Fetch (firm, accountant, cos, dh_indep) rows for every signature.

    The database is opened read-only via the SQLite URI form so this
    diagnostic can never mutate the corpus; the dHash column is CAST to
    REAL so numpy receives floats.
    """
    connection = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
    cursor = connection.cursor()
    cursor.execute('''
        SELECT a.firm, s.assigned_accountant,
               s.max_similarity_to_same_accountant,
               CAST(s.min_dhash_independent AS REAL)
        FROM signatures s
        JOIN accountants a ON s.assigned_accountant = a.name
        WHERE s.assigned_accountant IS NOT NULL
          AND s.max_similarity_to_same_accountant IS NOT NULL
          AND s.min_dhash_independent IS NOT NULL
          AND a.firm IS NOT NULL
    ''')
    fetched = cursor.fetchall()
    connection.close()
    return fetched
|
||||
|
||||
|
||||
def dip(values, n_boot=N_BOOT):
    """Hartigan dip statistic and bootstrap p-value over finite values."""
    data = np.asarray(values, dtype=float)
    finite = data[np.isfinite(data)]
    stat, pval = diptest.diptest(finite, boot_pval=True, n_boot=n_boot)
    return float(stat), float(pval)
|
||||
|
||||
|
||||
def multi_seed_jitter_dip(values, seeds=JITTER_SEEDS, n_boot=N_BOOT):
    """Jittered dip test repeated over several RNG seeds.

    Adds uniform noise drawn from [-0.5, 0.5) to break the integer ties
    of dHash, runs the bootstrap dip test once per seed, and returns the
    per-seed results together with the spread of p-values and dip
    statistics plus how many seeds reject at alpha = 0.05.
    """
    base = np.asarray(values, dtype=float)
    base = base[np.isfinite(base)]
    per_seed = []
    for seed in seeds:
        noise = np.random.default_rng(seed).uniform(-0.5, 0.5, len(base))
        d, p = diptest.diptest(base + noise, boot_pval=True, n_boot=n_boot)
        per_seed.append({'seed': seed, 'dip': float(d), 'p': float(p)})
    pvals = [entry['p'] for entry in per_seed]
    dips = [entry['dip'] for entry in per_seed]
    return {
        'n_seeds': len(seeds),
        'p_min': min(pvals),
        'p_max': max(pvals),
        'p_median': float(np.median(pvals)),
        'dip_min': min(dips),
        'dip_max': max(dips),
        'reject_at_05_count': int(sum(1 for p in pvals if p <= 0.05)),
        'per_seed': per_seed,
    }
|
||||
|
||||
|
||||
def integer_histogram_valleys(values, max_bin=20):
    """Locate local minima of the integer count histogram on bins 0..max_bin.

    A valley is a bin whose count is strictly below both neighbours;
    its relative depth is measured against the shallower neighbour.
    Returns the bin counts, the valley records, and an interpretation
    note for the report.
    """
    data = np.asarray(values, dtype=float)
    data = data[np.isfinite(data)]
    edges = np.arange(0, max_bin + 2)  # integer bin edges 0..max_bin+1
    counts, bounds = np.histogram(data, bins=edges)
    mids = (bounds[:-1] + bounds[1:]) / 2.0
    found = []
    for i in range(1, len(counts) - 1):
        here, left, right = counts[i], counts[i - 1], counts[i + 1]
        if not (here < left and here < right):
            continue
        shallower = min(left, right)
        rel = (shallower - here) / shallower if shallower else 0
        found.append({
            'bin_center': float(mids[i]),
            'count': int(here),
            'left_peak_bin': int(mids[i - 1]),
            'left_peak_count': int(left),
            'right_peak_bin': int(mids[i + 1]),
            'right_peak_count': int(right),
            'depth_rel': float(rel),
        })
    return {
        'histogram_bins_0_to_max': counts[:max_bin + 1].tolist(),
        'valleys': found,
        'note': ('valleys are bins where count < both neighbours; '
                 'depth_rel = (min(neighbour) - bin) / min(neighbour). '
                 'A genuine antimode would have a deep, stable valley '
                 'with depth_rel > 0.1.'),
    }
|
||||
|
||||
|
||||
def firm_residualized(values, firm_labels):
    """Center each firm's values at the grand mean.

    Subtracts the per-firm mean and adds back the overall mean, so
    between-firm location shifts are removed while within-firm shape
    is preserved. Returns a new array; the input is not modified.
    """
    vals = np.asarray(values, dtype=float)
    labels = np.asarray(firm_labels)
    grand_mean = float(vals.mean())
    centered = vals.copy()
    for label in np.unique(labels):
        sel = labels == label
        centered[sel] = vals[sel] - float(vals[sel].mean()) + grand_mean
    return centered
|
||||
|
||||
|
||||
def pair_coincidence_rate():
    """Fraction of signatures whose max-cosine partner equals the
    min-dHash partner within the same-CPA cross-year pool.

    Reads the signatures table read-only and counts, in one aggregate
    query: rows where both pair-id columns are present and equal, rows
    where both are present but differ, and rows where either is NULL.
    Raises sqlite3.OperationalError if the pair-id columns do not exist
    in this DB schema (the caller catches and records that case).
    """
    conn = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
    cur = conn.cursor()
    # NOTE(review): assumes columns max_cosine_pair_id / min_dhash_pair_id
    # exist in this schema version — the caller treats a missing-column
    # OperationalError as "analysis skipped".
    cur.execute('''
        SELECT COUNT(*) AS n_total,
               SUM(CASE WHEN max_cosine_pair_id IS NOT NULL
                        AND min_dhash_pair_id IS NOT NULL
                        AND max_cosine_pair_id = min_dhash_pair_id
                        THEN 1 ELSE 0 END) AS n_same_pair,
               SUM(CASE WHEN max_cosine_pair_id IS NOT NULL
                        AND min_dhash_pair_id IS NOT NULL
                        AND max_cosine_pair_id != min_dhash_pair_id
                        THEN 1 ELSE 0 END) AS n_diff_pair,
               SUM(CASE WHEN max_cosine_pair_id IS NULL
                        OR min_dhash_pair_id IS NULL
                        THEN 1 ELSE 0 END) AS n_null
        FROM signatures
    ''')
    row = cur.fetchone()
    conn.close()
    n_total, n_same, n_diff, n_null = row
    # SQL SUM over zero rows yields NULL -> None; coalesce to 0 here.
    n_with_both = (n_same or 0) + (n_diff or 0)
    return {
        'n_total': int(n_total or 0),
        'n_with_both_pair_ids': int(n_with_both),
        'n_same_pair': int(n_same or 0),
        'n_diff_pair': int(n_diff or 0),
        'n_null': int(n_null or 0),
        # None (not 0) when no row has both pair ids, so callers can
        # distinguish "not computable" from "rate is zero".
        'same_pair_rate': (float(n_same) / n_with_both
                           if n_with_both else None),
        'note': ('rate computed over signatures where both '
                 'max_cosine_pair_id and min_dhash_pair_id are present'),
    }
|
||||
|
||||
|
||||
def _fmt_p(p):
|
||||
return '< 5e-4' if p == 0.0 else f'{p:.4g}'
|
||||
|
||||
|
||||
def main():
    """Run the dHash discreteness diagnostics and write JSON + markdown.

    Four panels: (A) raw vs multi-seed-jittered dip tests on Big-4
    pooled, each Big-4 firm, and non-Big-4 pooled; (B) integer-histogram
    valley analysis; (C) firm-residualized dip; (D) max-cos/min-dh
    pair-coincidence rate (schema-permitting). Emits
    dhash_discrete_results.json and dhash_discrete_report.md into OUT.

    Fix vs. original: the cosine column (r[2]) was loaded into an unused
    local ``cos``; every panel here is dHash-only, so that binding is
    removed.
    """
    print('=' * 72)
    print('Script 39d: dHash Discrete-Value Robustness Diagnostics')
    print('=' * 72)
    rows = load_signatures()
    # Column order: (firm, accountant, cos, dh). The cosine column r[2]
    # is intentionally not bound — all panels below are dHash-only.
    firms_raw = np.array([r[0] for r in rows])
    dh = np.array([r[3] for r in rows], dtype=float)
    is_big4 = np.isin(firms_raw, BIG4)
    n = len(rows)
    print(f'\nLoaded {n:,} signatures; Big-4 {is_big4.sum():,}, '
          f'non-Big-4 {(~is_big4).sum():,}')

    results = {
        'meta': {
            'script': '39d',
            'timestamp': datetime.now().isoformat(timespec='seconds'),
            'n_total_signatures': int(n),
            'n_big4': int(is_big4.sum()),
            'n_non_big4': int((~is_big4).sum()),
            'n_boot': N_BOOT,
            'jitter_seeds': JITTER_SEEDS,
            'note': ('Diagnostic for dHash integer-mass-point artifact '
                     'in dip test; codex round-29 attack on Script 39b/c'),
        },
    }

    # ---- A. Raw vs multi-seed jittered dip ----
    print('\n[A] Raw vs jittered dip (5 seeds, n_boot=2000)')
    panels = {}
    # Big-4 pooled
    print(' Big-4 pooled:')
    raw_d, raw_p = dip(dh[is_big4])
    j = multi_seed_jitter_dip(dh[is_big4])
    panels['big4_pooled'] = {
        'n': int(is_big4.sum()),
        'raw': {'dip': raw_d, 'p': raw_p},
        'jittered': j,
    }
    print(f' raw: dip={raw_d:.5f}, p={_fmt_p(raw_p)}')
    print(f' jitter: p_median={j["p_median"]:.4g}, '
          f'p_range=[{j["p_min"]:.4g}, {j["p_max"]:.4g}], '
          f'reject@.05 in {j["reject_at_05_count"]}/5 seeds')
    # Each Big-4 firm separately
    for f in BIG4:
        mask = firms_raw == f
        if mask.sum() == 0:
            continue
        raw_d, raw_p = dip(dh[mask])
        j = multi_seed_jitter_dip(dh[mask])
        panels[ALIAS[f]] = {
            'n': int(mask.sum()),
            'raw': {'dip': raw_d, 'p': raw_p},
            'jittered': j,
        }
        print(f' {ALIAS[f]} (n={mask.sum():,}):')
        print(f' raw: dip={raw_d:.5f}, p={_fmt_p(raw_p)}')
        print(f' jitter: p_median={j["p_median"]:.4g}, '
              f'reject@.05 in {j["reject_at_05_count"]}/5 seeds')
    # Non-Big-4 pooled
    print(' Non-Big-4 pooled:')
    raw_d, raw_p = dip(dh[~is_big4])
    j = multi_seed_jitter_dip(dh[~is_big4])
    panels['non_big4_pooled'] = {
        'n': int((~is_big4).sum()),
        'raw': {'dip': raw_d, 'p': raw_p},
        'jittered': j,
    }
    print(f' raw: dip={raw_d:.5f}, p={_fmt_p(raw_p)}')
    print(f' jitter: p_median={j["p_median"]:.4g}, '
          f'reject@.05 in {j["reject_at_05_count"]}/5 seeds')
    results['raw_vs_jittered_dip'] = panels

    # ---- B. Integer-histogram valley analysis ----
    print('\n[B] Integer-histogram valley analysis (bins 0..20)')
    valleys = {}
    valleys['big4_pooled'] = integer_histogram_valleys(dh[is_big4])
    print(f' Big-4 pooled: {len(valleys["big4_pooled"]["valleys"])} valleys')
    for v in valleys['big4_pooled']['valleys']:
        print(f' bin {v["bin_center"]:.1f}: count={v["count"]}, '
              f'depth_rel={v["depth_rel"]:.3f}')
    for f in BIG4:
        mask = firms_raw == f
        if mask.sum() == 0:
            continue
        valleys[ALIAS[f]] = integer_histogram_valleys(dh[mask])
        print(f' {ALIAS[f]}: '
              f'{len(valleys[ALIAS[f]]["valleys"])} valleys')
        for v in valleys[ALIAS[f]]['valleys']:
            print(f' bin {v["bin_center"]:.1f}: count={v["count"]}, '
                  f'depth_rel={v["depth_rel"]:.3f}')
    valleys['non_big4_pooled'] = integer_histogram_valleys(dh[~is_big4])
    print(f' Non-Big-4 pooled: '
          f'{len(valleys["non_big4_pooled"]["valleys"])} valleys')
    for v in valleys['non_big4_pooled']['valleys']:
        print(f' bin {v["bin_center"]:.1f}: count={v["count"]}, '
              f'depth_rel={v["depth_rel"]:.3f}')
    results['integer_histogram_valleys'] = valleys

    # ---- C. Firm-residualized dip on dHash, signature level ----
    print('\n[C] Firm-residualized dHash dip (signature level)')
    firm_labels = np.array([
        ALIAS[f] if f in ALIAS else f'M:{f}'
        for f in firms_raw
    ])
    # Big-4 only, residualized over A/B/C/D
    dh_resid_big4 = firm_residualized(dh[is_big4], firm_labels[is_big4])
    raw_d, raw_p = dip(dh[is_big4])
    res_d, res_p = dip(dh_resid_big4)
    print(f' Big-4 raw: dip={raw_d:.5f}, p={_fmt_p(raw_p)}')
    print(f' Big-4 residualized: dip={res_d:.5f}, p={_fmt_p(res_p)}')
    # Also non-Big-4, residualized over their own firms
    dh_resid_nbig4 = firm_residualized(dh[~is_big4], firm_labels[~is_big4])
    raw_d_n, raw_p_n = dip(dh[~is_big4])
    res_d_n, res_p_n = dip(dh_resid_nbig4)
    print(f' Non-Big-4 raw: dip={raw_d_n:.5f}, p={_fmt_p(raw_p_n)}')
    print(f' Non-Big-4 residualized: dip={res_d_n:.5f}, p={_fmt_p(res_p_n)}')
    results['firm_residualized_dh_dip'] = {
        'big4': {
            'raw': {'dip': raw_d, 'p': raw_p},
            'firm_residualized': {'dip': res_d, 'p': res_p},
        },
        'non_big4': {
            'raw': {'dip': raw_d_n, 'p': raw_p_n},
            'firm_residualized': {'dip': res_d_n, 'p': res_p_n},
        },
        'note': ('Residualization subtracts each firm mean dh and adds '
                 'back the grand mean. If residual dip rejects, there is '
                 'genuine within-firm dh multimodality independent of '
                 'between-firm mean shifts. If residual fails to reject, '
                 'all dh "multimodality" was between-firm composition.'),
    }

    # ---- D. Pair-coincidence rate (schema-dependent; may be skipped) ----
    print('\n[D] Pair-coincidence rate (max-cos pair vs min-dh pair)')
    try:
        pc = pair_coincidence_rate()
        if pc['same_pair_rate'] is not None:
            print(f' n_with_both: {pc["n_with_both_pair_ids"]:,}, '
                  f'same-pair rate: {pc["same_pair_rate"]:.4f}')
        else:
            print(' Pair IDs not stored in signatures table (skipped)')
        results['pair_coincidence'] = pc
    except sqlite3.OperationalError as e:
        # Missing pair-id columns -> record the skip, do not abort.
        print(f' SQL error (pair_id columns may not exist): {e}')
        results['pair_coincidence'] = {
            'error': str(e),
            'note': ('signatures table lacks max_cosine_pair_id / '
                     'min_dhash_pair_id columns; analysis skipped'),
        }

    # Machine-readable artefact.
    json_path = OUT / 'dhash_discrete_results.json'
    json_path.write_text(json.dumps(results, indent=2, ensure_ascii=False),
                         encoding='utf-8')
    print(f'\n[json] {json_path}')

    # ---- Report markdown ----
    md = ['# dHash Discrete-Value Robustness Diagnostics (Script 39d)',
          '', f'Generated: {results["meta"]["timestamp"]}',
          f'Bootstrap replicates: {N_BOOT}; jitter seeds: {JITTER_SEEDS}',
          '',
          '## A. Raw vs jittered dHash dip (signature level)',
          '',
          ('dHash is integer-valued in [0, 64]. A raw dip test on '
           'integer mass points may reject unimodality due to discrete '
           'spikes rather than a continuous bimodal density. We add '
           'uniform jitter in [-0.5, +0.5] over 5 seeds and re-test.'),
          '',
          '| Scope | n | raw dip | raw p | jitter p median | jitter reject@.05 / 5 seeds |',
          '|---|---|---|---|---|---|']
    for key, label in [('big4_pooled', 'Big-4 pooled')] + \
                      [(ALIAS[f], ALIAS[f]) for f in BIG4] + \
                      [('non_big4_pooled', 'Non-Big-4 pooled')]:
        if key in panels:
            p = panels[key]
            md.append(f'| {label} | {p["n"]:,} | '
                      f'{p["raw"]["dip"]:.5f} | '
                      f'{_fmt_p(p["raw"]["p"])} | '
                      f'{p["jittered"]["p_median"]:.4g} | '
                      f'{p["jittered"]["reject_at_05_count"]}/5 |')
    md += ['',
           '**Interpretation.** If jittered dip ceases to reject in all '
           'panels, the raw-data rejection was driven by integer ties '
           'rather than a continuous bimodal density. Codex round-29 '
           'observed this pattern; this script confirms with multi-seed '
           'robustness.',
           '',
           '## B. Integer-histogram valley locations (bins 0..20)',
           '',
           ('For each scope, list bins where count is strictly less '
            'than both neighbours, with relative depth '
            '(min(neighbour) - bin) / min(neighbour). A genuine '
            'antimode would show a deep, stable valley; integer-noise '
            'valleys are shallow and inconsistent across firms.'),
           '']
    for key, label in [('big4_pooled', 'Big-4 pooled')] + \
                      [(ALIAS[f], ALIAS[f]) for f in BIG4] + \
                      [('non_big4_pooled', 'Non-Big-4 pooled')]:
        if key in valleys:
            v_list = valleys[key]['valleys']
            if not v_list:
                md.append(f'- **{label}**: no integer-histogram valleys '
                          f'in 0..20')
            else:
                desc = ', '.join(
                    f'dh={v["bin_center"]:.0f} (depth_rel={v["depth_rel"]:.3f})'
                    for v in v_list)
                md.append(f'- **{label}**: {desc}')
    md += ['',
           '## C. Firm-residualized dHash dip',
           '',
           ('Subtract each firm mean dHash; add back grand mean. If '
            'residual rejects, within-firm multimodality is genuine. '
            'If residual fails to reject, all dh "multimodality" was '
            'between-firm composition.'),
           '',
           '| Scope | raw dip | raw p | residualized dip | residualized p |',
           '|---|---|---|---|---|']
    fr = results['firm_residualized_dh_dip']
    md += [f'| Big-4 | {fr["big4"]["raw"]["dip"]:.5f} | '
           f'{_fmt_p(fr["big4"]["raw"]["p"])} | '
           f'{fr["big4"]["firm_residualized"]["dip"]:.5f} | '
           f'{_fmt_p(fr["big4"]["firm_residualized"]["p"])} |',
           f'| Non-Big-4 | {fr["non_big4"]["raw"]["dip"]:.5f} | '
           f'{_fmt_p(fr["non_big4"]["raw"]["p"])} | '
           f'{fr["non_big4"]["firm_residualized"]["dip"]:.5f} | '
           f'{_fmt_p(fr["non_big4"]["firm_residualized"]["p"])} |']
    md += ['',
           '## D. Max-cos pair vs min-dh pair coincidence',
           '']
    pc = results.get('pair_coincidence', {})
    if 'same_pair_rate' in pc and pc['same_pair_rate'] is not None:
        md += [f'- n_signatures with both pair IDs: '
               f'{pc["n_with_both_pair_ids"]:,}',
               f'- same-pair rate: {pc["same_pair_rate"]:.4f} '
               f'({pc["n_same_pair"]:,} of '
               f'{pc["n_with_both_pair_ids"]:,})',
               '',
               ('A high rate (>0.8) supports a single-pair regime '
                'descriptor language (cos and dh attached to the same '
                'partner). A low rate indicates the two descriptors '
                'attach to different partners and should be discussed '
                'as parallel-but-different evidence.')]
    elif 'error' in pc:
        md += [f'- column not present in DB: {pc["error"]}',
               ('- note: schema-dependent; pair IDs not currently stored '
                'in signatures table.')]
    md.append('')
    md_path = OUT / 'dhash_discrete_report.md'
    md_path.write_text('\n'.join(md), encoding='utf-8')
    print(f'[md ] {md_path}')


if __name__ == '__main__':
    # CLI entry point: run all four panels and write both artefacts.
    main()
|
||||
@@ -0,0 +1,250 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Script 39e: dHash Firm-Residualized + Jittered Dip (final test)
|
||||
================================================================
|
||||
Script 39d showed:
|
||||
- Within-firm dh dip rejections all vanish after jitter (integer
|
||||
artifact)
|
||||
- Big-4 pooled dh dip survives jitter (p_median=0 over 5 seeds)
|
||||
|
||||
But Firm A mean dh = 2.73 vs Firms B/C/D ~6.5-7.4 -- a large
|
||||
between-firm location shift, analogous to the cosine case where
|
||||
firm-mean centering eliminated rejection.
|
||||
|
||||
This script applies BOTH corrections simultaneously:
|
||||
1. Firm-mean centering (remove between-firm location shifts)
|
||||
2. Uniform jitter in [-0.5, +0.5] (remove integer ties)
|
||||
|
||||
If the doubly-corrected dh distribution rejects unimodality, the
|
||||
Big-4 pooled multimodality is a genuine within-population, continuous
|
||||
phenomenon. If it fails to reject, dh "multimodality" is fully
|
||||
explained by between-firm composition (same conclusion as cosine).
|
||||
|
||||
Multi-seed (5 seeds) for robustness.
|
||||
|
||||
Outputs:
|
||||
reports/v4_big4/dhash_discrete_robustness/
|
||||
dhash_residualized_jittered_results.json
|
||||
dhash_residualized_jittered_report.md
|
||||
"""
|
||||
|
||||
import json
|
||||
import sqlite3
|
||||
import numpy as np
|
||||
import diptest
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
|
||||
OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/'
|
||||
'v4_big4/dhash_discrete_robustness')
|
||||
OUT.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合')
|
||||
ALIAS = {'勤業眾信聯合': 'Firm A',
|
||||
'安侯建業聯合': 'Firm B',
|
||||
'資誠聯合': 'Firm C',
|
||||
'安永聯合': 'Firm D'}
|
||||
N_BOOT = 2000
|
||||
SEEDS = [42, 43, 44, 45, 46]
|
||||
|
||||
|
||||
def load_signatures():
    """Fetch (firm, min_dhash_independent) for every assigned signature.

    Opens the database read-only (URI mode) and keeps only rows where
    all descriptor fields used downstream are present.
    """
    conn = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
    try:
        fetched = conn.execute('''
        SELECT a.firm, CAST(s.min_dhash_independent AS REAL)
        FROM signatures s
        JOIN accountants a ON s.assigned_accountant = a.name
        WHERE s.assigned_accountant IS NOT NULL
          AND s.max_similarity_to_same_accountant IS NOT NULL
          AND s.min_dhash_independent IS NOT NULL
          AND a.firm IS NOT NULL
        ''').fetchall()
    finally:
        conn.close()
    return fetched
|
||||
|
||||
|
||||
def firm_residualize(values, firm_labels):
    """Remove between-firm location shifts from *values*.

    Each observation has its firm's mean subtracted and the grand mean
    added back, so the overall level is preserved while per-firm mean
    differences are eliminated.
    """
    vals = np.asarray(values, dtype=float)
    labels = np.asarray(firm_labels)
    grand_mean = float(np.mean(vals))
    centred = vals.copy()
    for label in np.unique(labels):
        sel = labels == label
        centred[sel] = vals[sel] - float(np.mean(vals[sel])) + grand_mean
    return centred
|
||||
|
||||
|
||||
def dip_multi(values, seeds, with_jitter, n_boot=N_BOOT):
    """Run Hartigan's dip test once per seed.

    With `with_jitter=True`, uniform noise in [-0.5, 0.5) is added per
    seed to break integer mass-point ties before testing. Without
    jitter the data are deterministic, so only the first seed is run.
    Returns a list of {'seed', 'dip', 'p'} dicts.
    """
    data = np.asarray(values, dtype=float)
    data = data[np.isfinite(data)]
    out = []
    for seed in seeds:
        gen = np.random.default_rng(seed)
        if with_jitter:
            sample = data + gen.uniform(-0.5, 0.5, len(data))
        else:
            sample = data
        stat, pval = diptest.diptest(sample, boot_pval=True, n_boot=n_boot)
        out.append({'seed': seed, 'dip': float(stat), 'p': float(pval)})
        if not with_jitter:
            # Deterministic input: additional seeds would repeat the
            # same test, so stop after one.
            break
    return out
|
||||
|
||||
|
||||
def _fmt_p(p):
|
||||
return '< 5e-4' if p == 0.0 else f'{p:.4g}'
|
||||
|
||||
|
||||
def summarize(name, results):
    """Collapse per-seed dip results into min / max / median summaries
    plus a count of seeds rejecting unimodality at alpha = 0.05."""
    pvals = [r['p'] for r in results]
    dips = [r['dip'] for r in results]
    n_reject = sum(1 for p in pvals if p <= 0.05)
    return {
        'name': name,
        'n_seeds': len(results),
        'dip_min': min(dips),
        'dip_max': max(dips),
        'dip_median': float(np.median(dips)),
        'p_min': min(pvals),
        'p_max': max(pvals),
        'p_median': float(np.median(pvals)),
        'reject_at_05_count': int(n_reject),
        'per_seed': results,
    }
|
||||
|
||||
|
||||
def main():
    """Script 39e driver: 2x2 (firm-centering x jitter) dip-test
    factorial on Big-4 signature-level dHash values.

    Loads all signatures, restricts to the Big-4 subset, runs the dip
    test under four conditions (raw; centered; jittered; centered +
    jittered), and writes JSON + markdown reports under OUT.
    """
    print('=' * 72)
    print('Script 39e: dHash Firm-Residualized + Jittered Dip')
    print('=' * 72)
    rows = load_signatures()
    firms_raw = np.array([r[0] for r in rows])
    dh = np.array([r[1] for r in rows], dtype=float)
    is_big4 = np.isin(firms_raw, BIG4)
    big4_dh = dh[is_big4]
    # Anonymize firm names via ALIAS for all downstream reporting.
    big4_firms = np.array([ALIAS[f] for f in firms_raw[is_big4]])

    print(f'\nLoaded {len(rows):,} signatures; Big-4 {is_big4.sum():,}')
    print('\nPer-firm Big-4 dh summary:')
    for f in sorted(set(big4_firms)):
        v = big4_dh[big4_firms == f]
        print(f'  {f}: n={len(v):,} mean={v.mean():.3f} '
              f'median={np.median(v):.1f} sd={v.std():.3f}')

    # ---- Test conditions, all on Big-4 signature-level dh ----
    panels = {}

    # 1. Raw (no centering, no jitter)
    print('\n[1] Raw dh')
    r = dip_multi(big4_dh, [42], with_jitter=False)
    panels['raw'] = summarize('raw', r)
    print(f'  dip={r[0]["dip"]:.5f}, p={_fmt_p(r[0]["p"])}')

    # 2. Centered only (no jitter; integer values preserved)
    print('\n[2] Firm-mean centered, no jitter')
    centered = firm_residualize(big4_dh, big4_firms)
    r = dip_multi(centered, [42], with_jitter=False)
    panels['centered_only'] = summarize('centered_only', r)
    print(f'  dip={r[0]["dip"]:.5f}, p={_fmt_p(r[0]["p"])}')

    # 3. Jittered only (no centering)
    print('\n[3] Jittered (5 seeds), no centering')
    r = dip_multi(big4_dh, SEEDS, with_jitter=True)
    panels['jitter_only'] = summarize('jitter_only', r)
    print(f'  p_median={panels["jitter_only"]["p_median"]:.4g}, '
          f'reject@.05 in '
          f'{panels["jitter_only"]["reject_at_05_count"]}/5 seeds')

    # 4. Centered + jittered (THE key test)
    print('\n[4] Firm-mean centered + jittered (5 seeds) -- KEY TEST')
    r = dip_multi(centered, SEEDS, with_jitter=True)
    panels['centered_jittered'] = summarize('centered_jittered', r)
    print(f'  p_median={panels["centered_jittered"]["p_median"]:.4g}, '
          f'reject@.05 in '
          f'{panels["centered_jittered"]["reject_at_05_count"]}/5 seeds')
    for s in r:
        print(f'  seed {s["seed"]}: dip={s["dip"]:.5f}, p={_fmt_p(s["p"])}')

    # Per-firm dh stats (re-confirm Firm A shift)
    firm_stats = {}
    for f in sorted(set(big4_firms)):
        v = big4_dh[big4_firms == f]
        firm_stats[f] = {
            'n': int(len(v)),
            'mean': float(v.mean()),
            'median': float(np.median(v)),
            'sd': float(v.std()),
            'p25': float(np.percentile(v, 25)),
            'p75': float(np.percentile(v, 75)),
            'pct_le_5': float(np.mean(v <= 5)),
            'pct_gt_15': float(np.mean(v > 15)),
        }

    # JSON artefact: full panel + per-firm stats for downstream use.
    results = {
        'meta': {
            'script': '39e',
            'timestamp': datetime.now().isoformat(timespec='seconds'),
            'n_big4_signatures': int(big4_dh.size),
            'n_boot': N_BOOT,
            'seeds': SEEDS,
            'note': ('Final test: does Big-4 pooled dh multimodality '
                     'survive BOTH firm-mean centering and integer-tie '
                     'jitter?'),
        },
        'panels': panels,
        'per_firm_dh_stats': firm_stats,
    }

    json_path = OUT / 'dhash_residualized_jittered_results.json'
    json_path.write_text(json.dumps(results, indent=2, ensure_ascii=False),
                         encoding='utf-8')
    print(f'\n[json] {json_path}')

    # Markdown report mirrors the printed tables.
    md = [
        '# dHash Firm-Residualized + Jittered Dip (Script 39e)',
        '', f'Generated: {results["meta"]["timestamp"]}',
        f'Bootstrap replicates: {N_BOOT}; jitter seeds: {SEEDS}',
        '',
        '## Per-firm Big-4 dh summary',
        '', '| Firm | n | mean | median | sd | P25 | P75 | %<=5 | %>15 |',
        '|---|---|---|---|---|---|---|---|---|',
    ]
    for f, s in firm_stats.items():
        md.append(f'| {f} | {s["n"]:,} | {s["mean"]:.3f} | '
                  f'{s["median"]:.1f} | {s["sd"]:.3f} | '
                  f'{s["p25"]:.1f} | {s["p75"]:.1f} | '
                  f'{s["pct_le_5"]:.3f} | {s["pct_gt_15"]:.3f} |')
    md += [
        '',
        '## Dip test under four conditions (Big-4 pooled, sig-level)',
        '',
        '| Condition | dip | p (or p_median) | reject@.05 (seeds) |',
        '|---|---|---|---|',
        f'| 1. Raw (integer values) | {panels["raw"]["dip_median"]:.5f} '
        f'| {_fmt_p(panels["raw"]["p_median"])} | n/a (1 seed) |',
        f'| 2. Firm-mean centered, no jitter '
        f'| {panels["centered_only"]["dip_median"]:.5f} '
        f'| {_fmt_p(panels["centered_only"]["p_median"])} | n/a (1 seed) |',
        f'| 3. Jittered only (5 seeds) '
        f'| median {panels["jitter_only"]["dip_median"]:.5f} '
        f'| median {_fmt_p(panels["jitter_only"]["p_median"])} '
        f'| {panels["jitter_only"]["reject_at_05_count"]}/5 |',
        f'| 4. **Centered + jittered (5 seeds)** '
        f'| median {panels["centered_jittered"]["dip_median"]:.5f} '
        f'| median {_fmt_p(panels["centered_jittered"]["p_median"])} '
        f'| {panels["centered_jittered"]["reject_at_05_count"]}/5 |',
        '',
        '## Interpretation',
        '',
        ('If Condition 4 still rejects unimodality, Big-4 dh has '
         'genuine within-population continuous multimodality '
         'independent of both between-firm location shifts and '
         'integer mass points. If Condition 4 fails to reject, the '
         'Big-4 pooled dh multimodality is fully explained by '
         '(between-firm mean shift) + (integer mass points). In the '
         'latter case, the dh axis carries no independent within-firm '
         'regime evidence beyond the cos axis.'),
        '',
    ]
    md_path = OUT / 'dhash_residualized_jittered_report.md'
    md_path.write_text('\n'.join(md), encoding='utf-8')
    print(f'[md  ] {md_path}')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -0,0 +1,413 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Script 40b: Inter-CPA FAR Sweep for cos and dHash (joint + marginal)
|
||||
=====================================================================
|
||||
After codex round-29 destroyed the distributional path to thresholds
|
||||
(K=3 mixture / dip / antimode shown composition-driven by Scripts
|
||||
39b–39e), v4.0 pivots to an anchor-based threshold framework:
|
||||
empirically derived from inter-CPA negative anchor specificity.
|
||||
|
||||
Inter-CPA pairs (different CPAs, all-firm) are the negative anchor:
|
||||
they are by definition not same-CPA replications, and the user's
|
||||
within-CPA mechanism-transition concern (a CPA might switch from
|
||||
hand-sign to template mid-career) does not enter the inter-CPA
|
||||
calibration because each sampled pair crosses CPA boundaries.
|
||||
|
||||
This script samples a large number of inter-CPA pairs and computes
|
||||
both descriptors per pair (cosine via feature_vector dot product;
|
||||
Hamming distance via dhash_vector XOR). It then sweeps:
|
||||
|
||||
1. FAR(cos > k) across k in [0.80, 0.99]
|
||||
2. FAR(dHash <= k) across k in [0, 20]
|
||||
3. Joint FAR(cos > 0.95 AND dHash <= k) for k in [0, 20]
|
||||
4. Conditional FAR(dHash <= k | cos > 0.95) -- the v3 inherited
|
||||
rule's marginal specificity contribution from dHash
|
||||
|
||||
Outputs:
|
||||
reports/v4_big4/inter_cpa_far_sweep/
|
||||
far_sweep_results.json
|
||||
far_sweep_report.md
|
||||
|
||||
Sample size: 500,000 inter-CPA pairs (matches v3 Script 10
|
||||
convention). Big-4-only and full-corpus variants both reported.
|
||||
"""
|
||||
|
||||
import json
|
||||
import sqlite3
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from collections import defaultdict
|
||||
|
||||
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
|
||||
OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/'
|
||||
'v4_big4/inter_cpa_far_sweep')
|
||||
OUT.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合')
|
||||
ALIAS = {'勤業眾信聯合': 'Firm A',
|
||||
'安侯建業聯合': 'Firm B',
|
||||
'資誠聯合': 'Firm C',
|
||||
'安永聯合': 'Firm D'}
|
||||
N_PAIRS = 500_000
|
||||
SEED = 42
|
||||
|
||||
COS_GRID = [0.80, 0.83, 0.85, 0.87, 0.89, 0.90, 0.91, 0.92, 0.93, 0.94,
|
||||
0.945, 0.95, 0.955, 0.96, 0.965, 0.97, 0.975, 0.98, 0.985,
|
||||
0.99]
|
||||
DH_GRID = list(range(0, 21))
|
||||
|
||||
|
||||
def hamming_64bit(a_bytes, b_bytes):
|
||||
"""Hamming distance between two 8-byte (64-bit) dHash byte strings."""
|
||||
a = int.from_bytes(a_bytes, 'big')
|
||||
b = int.from_bytes(b_bytes, 'big')
|
||||
return (a ^ b).bit_count()
|
||||
|
||||
|
||||
def load_signatures():
    """Fetch every assigned signature with both descriptor blobs.

    Returns rows of (signature_id, accountant, firm, feature_vector
    blob, dhash_vector blob) for all firms, via a read-only connection.
    """
    query = '''
        SELECT s.signature_id, s.assigned_accountant, a.firm,
               s.feature_vector, s.dhash_vector
        FROM signatures s
        JOIN accountants a ON s.assigned_accountant = a.name
        WHERE s.assigned_accountant IS NOT NULL
          AND s.feature_vector IS NOT NULL
          AND s.dhash_vector IS NOT NULL
          AND a.firm IS NOT NULL
    '''
    conn = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
    try:
        return conn.execute(query).fetchall()
    finally:
        conn.close()
|
||||
|
||||
|
||||
def sample_inter_cpa_pairs(rows, n_pairs, seed, restrict_to_big4=False):
    """Sample inter-CPA pairs and compute (cos, dh) for each.

    rows: DB rows (signature_id, accountant, firm, feature blob,
        dhash blob) as returned by load_signatures().
    n_pairs: number of inter-CPA pairs to draw.
    seed: RNG seed. NOTE: the draw order (one `choice` for the two
        accountants, then two `integers` for the signature indices) is
        part of the reproducibility contract -- do not reorder.
    restrict_to_big4: when True, sample only among Big-4 signatures.

    Returns (scope_label, cos_vals float32[n_pairs], dh_vals
    int32[n_pairs]).
    """
    rng = np.random.default_rng(seed)
    if restrict_to_big4:
        rows = [r for r in rows if r[2] in BIG4]
        scope = 'big4_only'
    else:
        scope = 'all_firms'
    print(f'  [{scope}] {len(rows):,} signatures available')

    # Group by accountant; an inter-CPA pair is one signature from each
    # of two distinct accountants.
    by_acct = defaultdict(list)
    for r in rows:
        by_acct[r[1]].append(r)
    accountants = list(by_acct.keys())
    n_acct = len(accountants)
    print(f'  [{scope}] {n_acct} accountants')

    # Per-accountant float32 feature matrices and raw dHash bytes.
    features = {a: np.stack(
        [np.frombuffer(r[3], dtype=np.float32) for r in by_acct[a]]
    ) for a in accountants}
    dhashes = {a: [r[4] for r in by_acct[a]] for a in accountants}

    cos_vals = np.empty(n_pairs, dtype=np.float32)
    dh_vals = np.empty(n_pairs, dtype=np.int32)
    n_done = 0
    for _ in range(n_pairs):
        # Two distinct accountants, uniform over accountants (not over
        # signatures), then one uniform signature from each.
        i, j = rng.choice(n_acct, 2, replace=False)
        a1, a2 = accountants[i], accountants[j]
        n1, n2 = len(by_acct[a1]), len(by_acct[a2])
        k1 = int(rng.integers(0, n1))
        k2 = int(rng.integers(0, n2))
        f1 = features[a1][k1]
        f2 = features[a2][k2]
        # Dot product used as cosine; assumes feature vectors are
        # already L2-normalized upstream -- TODO confirm against the
        # pipeline that writes feature_vector.
        cos = float(f1 @ f2)
        d = hamming_64bit(dhashes[a1][k1], dhashes[a2][k2])
        cos_vals[n_done] = cos
        dh_vals[n_done] = d
        n_done += 1
    return scope, cos_vals, dh_vals
|
||||
|
||||
|
||||
def wilson_ci(k, n, z=1.96):
    """Wilson score interval for a binomial proportion k/n.

    Returns (lo, hi) clipped to [0, 1], or (None, None) when n == 0.
    """
    if n == 0:
        return (None, None)
    p_obs = k / n
    scale = 1 + z * z / n
    mid = (p_obs + z * z / (2 * n)) / scale
    spread = z * np.sqrt(p_obs * (1 - p_obs) / n + z * z / (4 * n * n)) / scale
    return (max(0.0, mid - spread), min(1.0, mid + spread))
|
||||
|
||||
|
||||
def far_at_cos(cos_vals, k):
    """FAR of the rule `cos > k` on inter-CPA pairs, with Wilson 95% CI."""
    total = len(cos_vals)
    n_hit = int(np.count_nonzero(cos_vals > k))
    ci_lo, ci_hi = wilson_ci(n_hit, total)
    return {'k': float(k), 'n': total, 'hits': n_hit,
            'far': n_hit / total, 'ci95_lo': ci_lo, 'ci95_hi': ci_hi}
|
||||
|
||||
|
||||
def far_at_dh_le(dh_vals, k):
    """FAR of the rule `dh <= k` on inter-CPA pairs, with Wilson 95% CI."""
    total = len(dh_vals)
    n_hit = int(np.count_nonzero(dh_vals <= k))
    ci_lo, ci_hi = wilson_ci(n_hit, total)
    return {'k': int(k), 'n': total, 'hits': n_hit,
            'far': n_hit / total, 'ci95_lo': ci_lo, 'ci95_hi': ci_hi}
|
||||
|
||||
|
||||
def joint_far(cos_vals, dh_vals, cos_k, dh_k):
    """FAR of the conjunctive rule `cos > cos_k AND dh <= dh_k`."""
    total = len(cos_vals)
    both = (cos_vals > cos_k) & (dh_vals <= dh_k)
    n_hit = int(both.sum())
    ci_lo, ci_hi = wilson_ci(n_hit, total)
    return {'cos_k': float(cos_k), 'dh_k': int(dh_k),
            'n': total, 'hits': n_hit,
            'far': n_hit / total, 'ci95_lo': ci_lo, 'ci95_hi': ci_hi}
|
||||
|
||||
|
||||
def cond_far(cos_vals, dh_vals, cos_k, dh_k):
    """FAR(dh<=k | cos>cos_k)"""
    gate = cos_vals > cos_k
    n_gated = int(gate.sum())
    if n_gated == 0:
        # Nothing clears the cosine gate: conditional rate undefined.
        return {'cos_k': float(cos_k), 'dh_k': int(dh_k),
                'n_cond': 0, 'hits': 0,
                'cond_far': None, 'ci95_lo': None, 'ci95_hi': None}
    n_hit = int(((dh_vals <= dh_k) & gate).sum())
    ci_lo, ci_hi = wilson_ci(n_hit, n_gated)
    return {'cos_k': float(cos_k), 'dh_k': int(dh_k),
            'n_cond': n_gated, 'hits': n_hit,
            'cond_far': n_hit / n_gated, 'ci95_lo': ci_lo, 'ci95_hi': ci_hi}
|
||||
|
||||
|
||||
def invert_far_target(curve_entries, target, key='far'):
    """Return the single curve entry with the largest FAR still <= target.

    `curve_entries` is a list of dicts each carrying a FAR value under
    `key`. The entry whose FAR is closest to the target from below is
    returned -- i.e. the least strict operating point that still meets
    the specificity target. Returns None when no entry meets the
    target (including an empty curve).

    Ties on FAR keep the later entry in ascending stable-sorted order,
    matching the original linear-scan behaviour.

    (Fixes the previous docstring, which wrongly claimed the function
    returned "the entries bracketing the target"; it has always
    returned one entry or None.)
    """
    ranked = sorted(curve_entries, key=lambda e: e[key])
    eligible = [e for e in ranked if e[key] <= target]
    return eligible[-1] if eligible else None
|
||||
|
||||
|
||||
def _fmt(x, fmt='.5f'):
|
||||
return 'None' if x is None else format(x, fmt)
|
||||
|
||||
|
||||
def run_scope(rows, scope_name, restrict_to_big4):
    """Run the full FAR sweep for one scope and return its results dict.

    Samples N_PAIRS inter-CPA pairs, computes the cos / dHash marginal
    FAR curves, the joint and conditional curves anchored at cos > 0.95
    (the inherited v3 operating point), and inverts the curves at the
    target FARs. All curves are also printed to stdout.
    """
    print(f'\n== Scope: {scope_name} ==')
    scope_label, cos_vals, dh_vals = sample_inter_cpa_pairs(
        rows, N_PAIRS, SEED, restrict_to_big4=restrict_to_big4)
    print(f'  Sampled {len(cos_vals):,} inter-CPA pairs')
    print(f'  cos: mean={cos_vals.mean():.4f}, '
          f'median={np.median(cos_vals):.4f}, '
          f'std={cos_vals.std():.4f}')
    print(f'  dh : mean={dh_vals.mean():.4f}, '
          f'median={np.median(dh_vals):.4f}, '
          f'std={dh_vals.std():.4f}')

    # Marginal, joint and conditional FAR curves over the grids.
    cos_curve = [far_at_cos(cos_vals, k) for k in COS_GRID]
    dh_curve = [far_at_dh_le(dh_vals, k) for k in DH_GRID]
    joint_curve_95 = [joint_far(cos_vals, dh_vals, 0.95, k) for k in DH_GRID]
    cond_curve_95 = [cond_far(cos_vals, dh_vals, 0.95, k) for k in DH_GRID]

    print('\n  [Cos FAR sweep]')
    for e in cos_curve:
        print(f'    cos > {e["k"]:.3f}: FAR={_fmt(e["far"])}, '
              f'CI=[{_fmt(e["ci95_lo"])}, {_fmt(e["ci95_hi"])}], '
              f'hits={e["hits"]}/{e["n"]}')

    print('\n  [dHash FAR sweep]')
    for e in dh_curve:
        print(f'    dh <= {e["k"]:2d}: FAR={_fmt(e["far"])}, '
              f'CI=[{_fmt(e["ci95_lo"])}, {_fmt(e["ci95_hi"])}], '
              f'hits={e["hits"]}/{e["n"]}')

    print('\n  [Joint FAR (cos > 0.95 AND dh <= k)]')
    for e in joint_curve_95:
        print(f'    dh <= {e["dh_k"]:2d}: FAR={_fmt(e["far"])}, '
              f'hits={e["hits"]}/{e["n"]}')

    print('\n  [Conditional FAR(dh <= k | cos > 0.95)]')
    for e in cond_curve_95:
        cf = e['cond_far']
        print(f'    dh <= {e["dh_k"]:2d}: P(dh<=k | cos>0.95)='
              f'{_fmt(cf) if cf is not None else "n/a"}, '
              f'hits={e["hits"]}/{e["n_cond"]}')

    # Invert each curve at the specificity targets: find the least
    # strict threshold whose FAR still meets the target.
    targets = [0.005, 0.001, 0.0005, 0.0001]
    inv = {}
    for t in targets:
        inv[f'cos_far_<=_{t}'] = invert_far_target(cos_curve, t, 'far')
        inv[f'dh_far_<=_{t}'] = invert_far_target(dh_curve, t, 'far')
        inv[f'joint_at_cos95_far_<=_{t}'] = invert_far_target(
            joint_curve_95, t, 'far')

    print('\n  [Threshold inversion]')
    for tgt in targets:
        e = inv[f'cos_far_<=_{tgt}']
        if e is not None:
            print(f'    FAR <= {tgt}: max cos threshold with FAR<=tgt is '
                  f'cos > {e["k"]:.3f} (FAR={e["far"]:.5f})')
        e = inv[f'dh_far_<=_{tgt}']
        if e is not None:
            print(f'    FAR <= {tgt}: max dh threshold with FAR<=tgt is '
                  f'dh <= {e["k"]} (FAR={e["far"]:.5f})')
        e = inv[f'joint_at_cos95_far_<=_{tgt}']
        if e is not None:
            print(f'    FAR <= {tgt}: under cos>0.95, max dh threshold '
                  f'with joint FAR<=tgt is dh <= {e["dh_k"]} '
                  f'(joint FAR={e["far"]:.5f})')

    # Everything serialized into the per-scope results dict.
    return {
        'scope': scope_label,
        'n_pairs': int(len(cos_vals)),
        'cos_summary': {
            'mean': float(cos_vals.mean()),
            'median': float(np.median(cos_vals)),
            'std': float(cos_vals.std()),
            'p99': float(np.percentile(cos_vals, 99)),
            'p999': float(np.percentile(cos_vals, 99.9)),
            'max': float(cos_vals.max()),
        },
        'dh_summary': {
            'mean': float(dh_vals.mean()),
            'median': float(np.median(dh_vals)),
            'std': float(dh_vals.std()),
            'p01': float(np.percentile(dh_vals, 1)),
            'p001': float(np.percentile(dh_vals, 0.1)),
            'min': int(dh_vals.min()),
        },
        'cos_far_curve': cos_curve,
        'dh_far_curve': dh_curve,
        'joint_far_at_cos95_curve': joint_curve_95,
        'cond_far_at_cos95_curve': cond_curve_95,
        'threshold_inversions': inv,
    }
|
||||
|
||||
|
||||
def main():
    """Script 40b driver: load the full corpus once, run the FAR sweep
    for both scopes (Big-4 only, all firms), and write JSON + markdown
    reports under OUT."""
    print('=' * 72)
    print('Script 40b: Inter-CPA FAR Sweep (cos + dHash, joint + marginal)')
    print('=' * 72)
    rows = load_signatures()
    print(f'\nLoaded {len(rows):,} signatures (full corpus)')

    results = {
        'meta': {
            'script': '40b',
            'timestamp': datetime.now().isoformat(timespec='seconds'),
            'n_pairs_sampled': N_PAIRS,
            'seed': SEED,
            'note': ('Inter-CPA pair-level FAR sweep for cos and dHash. '
                     'Anchor-based threshold derivation; replaces '
                     'distributional path attacked in codex round-29.'),
        },
        'scopes': {},
    }

    # Same seed for both scopes; the Big-4 restriction changes the row
    # subset, not the RNG stream.
    results['scopes']['big4_only'] = run_scope(
        rows, 'Big-4 only', restrict_to_big4=True)
    results['scopes']['all_firms'] = run_scope(
        rows, 'All firms', restrict_to_big4=False)

    json_path = OUT / 'far_sweep_results.json'
    json_path.write_text(json.dumps(results, indent=2, ensure_ascii=False),
                         encoding='utf-8')
    print(f'\n[json] {json_path}')

    # Markdown report: one section per scope with all four curves and
    # the threshold-inversion table.
    md = [
        '# Inter-CPA FAR Sweep (Script 40b)',
        '',
        f'Generated: {results["meta"]["timestamp"]}',
        f'Inter-CPA pair samples per scope: {N_PAIRS:,}; seed: {SEED}',
        '',
        ('Anchor-based threshold derivation. For each scope (Big-4 only '
         'or all firms), sample random inter-CPA pairs and compute '
         'cosine + Hamming distance per pair. Report False Acceptance '
         'Rates (FAR) at various thresholds; invert FAR target to '
         'derive thresholds with empirical specificity guarantees.'),
        '',
    ]

    for scope in ['big4_only', 'all_firms']:
        s = results['scopes'][scope]
        md += [f'## Scope: {scope} ({s["n_pairs"]:,} pairs)', '',
               '### Cosine FAR curve', '',
               '| cos > k | FAR | 95% CI | hits / n |',
               '|---|---|---|---|']
        for e in s['cos_far_curve']:
            md.append(f'| {e["k"]:.3f} | {_fmt(e["far"])} | '
                      f'[{_fmt(e["ci95_lo"])}, {_fmt(e["ci95_hi"])}] | '
                      f'{e["hits"]:,} / {e["n"]:,} |')
        md += ['', '### dHash FAR curve', '',
               '| dh <= k | FAR | 95% CI | hits / n |',
               '|---|---|---|---|']
        for e in s['dh_far_curve']:
            md.append(f'| {e["k"]:2d} | {_fmt(e["far"])} | '
                      f'[{_fmt(e["ci95_lo"])}, {_fmt(e["ci95_hi"])}] | '
                      f'{e["hits"]:,} / {e["n"]:,} |')
        md += ['', '### Joint FAR (cos > 0.95 AND dh <= k)', '',
               '| dh <= k | Joint FAR | hits / n |',
               '|---|---|---|']
        for e in s['joint_far_at_cos95_curve']:
            md.append(f'| {e["dh_k"]:2d} | {_fmt(e["far"])} | '
                      f'{e["hits"]:,} / {e["n"]:,} |')
        md += ['',
               '### Conditional FAR(dh <= k | cos > 0.95)',
               '',
               'Among inter-CPA pairs that already exceed cos > 0.95, '
               'what fraction also have dh <= k? This quantifies '
               "dHash's marginal specificity contribution given the cos "
               "gate is already applied.",
               '',
               '| dh <= k | Conditional FAR | hits / n_cond |',
               '|---|---|---|']
        for e in s['cond_far_at_cos95_curve']:
            cf = e['cond_far']
            md.append(f'| {e["dh_k"]:2d} | '
                      f'{_fmt(cf) if cf is not None else "n/a"} | '
                      f'{e["hits"]:,} / {e["n_cond"]:,} |')
        md += ['', '### Threshold inversion', '',
               '| FAR target | cos thresh | dh thresh | joint dh thresh '
               '(under cos>0.95) |',
               '|---|---|---|---|']
        for tgt in [0.005, 0.001, 0.0005, 0.0001]:
            e_c = s['threshold_inversions'].get(f'cos_far_<=_{tgt}')
            e_d = s['threshold_inversions'].get(f'dh_far_<=_{tgt}')
            e_j = s['threshold_inversions'].get(
                f'joint_at_cos95_far_<=_{tgt}')
            # None means the target FAR is unachievable on the grid.
            c_str = (f'cos > {e_c["k"]:.3f} (FAR={e_c["far"]:.5f})'
                     if e_c else 'unachievable')
            d_str = (f'dh <= {e_d["k"]} (FAR={e_d["far"]:.5f})'
                     if e_d else 'unachievable')
            j_str = (f'dh <= {e_j["dh_k"]} (FAR={e_j["far"]:.5f})'
                     if e_j else 'unachievable')
            md.append(f'| {tgt} | {c_str} | {d_str} | {j_str} |')
        md.append('')

    md += [
        '## Interpretation',
        '',
        ('- The cosine FAR curve replicates and extends v3.x §IV-I '
         'Table X (which reported FAR=0.0005 at cos>0.95 from a '
         'similar but smaller-sample inter-CPA negative anchor).'),
        ('- The dHash FAR curve is the v4 contribution: prior v3.x '
         'work used dh<=5 by convention without an empirical '
         'specificity derivation. This script derives a specificity '
         "target → dh threshold mapping."),
        ('- The conditional FAR(dh<=k | cos>0.95) curve tells us '
         'whether dHash adds specificity given the cos gate. If the '
         "conditional FAR at dh<=5 is meaningfully lower than 1.0, "
         'dHash is providing additional specificity. If it is near '
         '1.0, dHash is largely redundant given cos>0.95 and the '
         'five-way rule should be simplified.'),
        ('- Thresholds derived by inverting FAR targets are '
         'specificity-anchored operating points, not distributional '
         'antimodes. They are robust to the integer-mass-point and '
         'between-firm-composition artefacts identified in Scripts '
         '39b–39e.'),
        '',
    ]
    md_path = OUT / 'far_sweep_report.md'
    md_path.write_text('\n'.join(md), encoding='utf-8')
    print(f'[md  ] {md_path}')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -0,0 +1,568 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Script 43: Pool-Normalized Per-Signature FAR (anchor-based calibration)
|
||||
========================================================================
|
||||
Codex round-30 verdict on Script 40b: per-pair FAR (~0.00060 at
|
||||
cos>0.95) is NOT the per-signature classifier specificity. The
|
||||
deployed classifier uses max-cosine and min-dHash over each CPA's
|
||||
same-CPA pool, so the inter-CPA-equivalent specificity for a
|
||||
signature with pool size n is approximately 1 - (1 - pair_FAR)^n,
|
||||
which for Big-4 median pool ~280 is several percent, not 0.00014.
|
||||
|
||||
This script computes pool-normalized per-signature FAR by drawing,
|
||||
for each source signature s, a random inter-CPA candidate pool of
|
||||
size n_pool(s) (= same-CPA pool size of s), and computing the
|
||||
deployed descriptors against the random pool. The fraction of
|
||||
source signatures whose max-cosine exceeds k (and/or min-dHash <= k)
|
||||
is the per-signature FAR at that operating point.
|
||||
|
||||
We also report:
|
||||
- "Any-pair" joint FAR: max_cos > c AND min_dh <= d (descriptors
|
||||
may come from different candidates)
|
||||
- "Same-pair" joint FAR: at least one candidate has both
|
||||
cos > c AND dh <= d
|
||||
- Per-firm and pool-size-decile stratification
|
||||
- CPA-block bootstrap CI on key FAR points
|
||||
- Threshold inversion for target per-signature FAR
|
||||
|
||||
Inputs: full Big-4 sub-corpus (n=150,453 sigs / 468 CPAs).
|
||||
Random pool draws use one realisation per source signature, with
|
||||
seed control. CPA-block bootstrap quantifies sampling noise.
|
||||
|
||||
Outputs:
|
||||
reports/v4_big4/pool_normalized_far/
|
||||
pool_normalized_results.json
|
||||
pool_normalized_report.md
|
||||
"""
|
||||
|
||||
import json
|
||||
import sqlite3
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from collections import defaultdict
|
||||
|
||||
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
|
||||
OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/'
|
||||
'v4_big4/pool_normalized_far')
|
||||
OUT.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合')
|
||||
ALIAS = {'勤業眾信聯合': 'Firm A',
|
||||
'安侯建業聯合': 'Firm B',
|
||||
'資誠聯合': 'Firm C',
|
||||
'安永聯合': 'Firm D'}
|
||||
SEED = 42
|
||||
BATCH = 200 # source signatures per batch
|
||||
N_BOOT_CPA = 1000 # CPA-block bootstrap replicates
|
||||
|
||||
COS_KS = [0.90, 0.92, 0.93, 0.94, 0.945, 0.95, 0.955, 0.96, 0.97, 0.98]
|
||||
DH_KS = [2, 3, 4, 5, 6, 8, 10, 15]
|
||||
|
||||
|
||||
def load_big4():
    """Fetch all Big-4 signatures with both descriptor blobs present.

    Returns rows of (signature_id, accountant, firm, source_pdf,
    feature_vector blob, dhash_vector blob), read-only.
    """
    conn = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
    try:
        cursor = conn.execute('''
        SELECT s.signature_id, s.assigned_accountant, a.firm, s.source_pdf,
               s.feature_vector, s.dhash_vector
        FROM signatures s
        JOIN accountants a ON s.assigned_accountant = a.name
        WHERE s.assigned_accountant IS NOT NULL
          AND s.feature_vector IS NOT NULL
          AND s.dhash_vector IS NOT NULL
          AND a.firm IN (?, ?, ?, ?)
        ''', BIG4)
        return cursor.fetchall()
    finally:
        conn.close()
|
||||
|
||||
|
||||
def hamming_vec(query_bytes, cand_bytes_array):
|
||||
"""Hamming between one 8-byte hash and an array of 8-byte hashes."""
|
||||
q = int.from_bytes(query_bytes, 'big')
|
||||
out = np.empty(len(cand_bytes_array), dtype=np.int32)
|
||||
for i, c in enumerate(cand_bytes_array):
|
||||
c_int = int.from_bytes(c, 'big')
|
||||
out[i] = (q ^ c_int).bit_count()
|
||||
return out
|
||||
|
||||
|
||||
def wilson_ci(k, n, z=1.96):
|
||||
if n == 0:
|
||||
return (None, None)
|
||||
phat = k / n
|
||||
denom = 1 + z * z / n
|
||||
centre = (phat + z * z / (2 * n)) / denom
|
||||
half = z * np.sqrt(phat * (1 - phat) / n + z * z / (4 * n * n)) / denom
|
||||
return (max(0.0, centre - half), min(1.0, centre + half))
|
||||
|
||||
|
||||
def main():
    """Compute pool-normalized per-signature FAR for Big-4 signatures.

    Pipeline: load all Big-4 signature rows; for each source signature,
    simulate an inter-CPA candidate pool of the same size as its own
    CPA's pool (n_pool = same-CPA count - 1); record the deployed
    statistics (max cosine, min dHash, joint/same-pair indicators);
    aggregate into marginal/joint FAR curves with Wilson CIs, per-firm
    and per-pool-decile stratification, a CPA-block bootstrap, and
    document-level rates; write JSON + markdown artefacts into OUT.
    """
    print('=' * 72)
    print('Script 43: Pool-Normalized Per-Signature FAR')
    print('=' * 72)
    rows = load_big4()
    n_sigs = len(rows)
    print(f'\nLoaded {n_sigs:,} Big-4 signatures')

    # Build index arrays
    # NOTE(review): sig_ids is built but never used below.
    sig_ids = np.array([r[0] for r in rows], dtype=np.int64)
    cpas = np.array([r[1] for r in rows])
    firms = np.array([ALIAS[r[2]] for r in rows])
    source_pdfs = np.array([r[3] for r in rows])

    # Feature matrix -- one float32 embedding row per signature.
    # assumes every feature_vector BLOB has identical length -- np.stack
    # would raise otherwise.
    feats = np.stack([np.frombuffer(r[4], dtype=np.float32)
                      for r in rows]).astype(np.float32)
    print(f' Feature matrix: {feats.shape}, '
          f'{feats.nbytes / 1e9:.2f} GB')
    # L2-normalize so that a plain dot product below is cosine similarity.
    norms = np.linalg.norm(feats, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # guard all-zero rows against divide-by-zero
    feats = feats / norms

    # dHash bytes
    dhashes = [r[5] for r in rows]

    # CPA → indices
    cpa_to_idx = defaultdict(list)
    for i, c in enumerate(cpas):
        cpa_to_idx[c].append(i)
    cpa_to_idx = {c: np.array(v, dtype=np.int64)
                  for c, v in cpa_to_idx.items()}
    n_cpas = len(cpa_to_idx)
    # Pool size of a signature = its CPA's signature count minus itself.
    pool_sizes = {c: len(v) - 1 for c, v in cpa_to_idx.items()}
    print(f' CPAs: {n_cpas}; pool-size summary: '
          f'min={min(pool_sizes.values())}, '
          f'median={int(np.median(list(pool_sizes.values())))}, '
          f'max={max(pool_sizes.values())}')

    # Pre-compute: for sampling non-same-CPA candidates, we need fast
    # index sampling. The total available pool for each source sig is
    # all_indices \ same_cpa_indices.
    all_idx = np.arange(n_sigs, dtype=np.int64)

    # ── Per-source-signature simulation ─────────────────────
    print('\nSimulating per-source-signature inter-CPA-equivalent pool...')
    rng = np.random.default_rng(SEED)

    # Per-signature stored statistics
    max_cos = np.zeros(n_sigs, dtype=np.float32)
    min_dh = np.zeros(n_sigs, dtype=np.int32)
    cos_at_min_dh = np.zeros(n_sigs, dtype=np.float32)
    dh_at_max_cos = np.zeros(n_sigs, dtype=np.int32)
    pool_size_arr = np.zeros(n_sigs, dtype=np.int32)

    # For each source signature, we also record indicator for same-pair
    # joint event at (cos>0.95, dh<=5) -- the headline operational rule.
    # This requires keeping per-signature any() flag for that pair.
    headline_same_pair_95_5 = np.zeros(n_sigs, dtype=bool)
    headline_same_pair_95_4 = np.zeros(n_sigs, dtype=bool)
    headline_same_pair_95_3 = np.zeros(n_sigs, dtype=bool)

    # process batches of source signatures (batching only drives the
    # progress printout; the per-signature work is independent)
    for batch_start in range(0, n_sigs, BATCH):
        batch_end = min(batch_start + BATCH, n_sigs)
        if batch_start % 5000 == 0:
            pct = batch_start / n_sigs * 100
            print(f' {batch_start:,}/{n_sigs:,} ({pct:.1f}%)')

        for si in range(batch_start, batch_end):
            s_cpa = cpas[si]
            n_pool = pool_sizes[s_cpa]
            pool_size_arr[si] = n_pool

            if n_pool <= 0:
                # Singleton CPA: no same-CPA comparison exists; use
                # sentinel values that can never trigger a threshold.
                max_cos[si] = 0.0
                min_dh[si] = 64
                continue

            # Sample n_pool candidates from non-same-CPA indices.
            # same_cpa includes si itself, so the source signature is
            # excluded from its own candidate pool.
            same_cpa = cpa_to_idx[s_cpa]
            # Use random.choice over all_idx excluding same_cpa is slow;
            # instead reject-sample from all_idx
            # NOTE(review): replace=True means a candidate can appear
            # more than once in the pool -- presumably acceptable noise
            # for FAR estimation; confirm intent.
            need = n_pool
            cand_indices = []
            attempts = 0
            while need > 0 and attempts < 10:
                draw = rng.choice(n_sigs, size=need * 2, replace=True)
                # filter out same_cpa
                same_mask = np.isin(draw, same_cpa)
                ok = draw[~same_mask]
                cand_indices.extend(ok[:need].tolist())
                need -= len(ok[:need])
                attempts += 1
            if need > 0:
                # fallback: deterministic sample without same-CPA
                pool_mask = np.ones(n_sigs, dtype=bool)
                pool_mask[same_cpa] = False
                pool_idx = all_idx[pool_mask]
                fb = rng.choice(pool_idx, size=need, replace=False)
                cand_indices.extend(fb.tolist())
            cand_indices = np.array(cand_indices[:n_pool], dtype=np.int64)

            # Cosine: source feat @ cand feats (rows are L2-normalized,
            # so the dot product IS cosine similarity)
            cos_vec = feats[cand_indices] @ feats[si]
            # dHash
            dh_vec = hamming_vec(dhashes[si],
                                 [dhashes[c] for c in cand_indices])

            # Deployed per-signature statistics plus the cross statistic
            # at each extremum (dh at the max-cos candidate and cos at
            # the min-dh candidate).
            mc_idx = int(np.argmax(cos_vec))
            md_idx = int(np.argmin(dh_vec))
            max_cos[si] = float(cos_vec[mc_idx])
            min_dh[si] = int(dh_vec[md_idx])
            dh_at_max_cos[si] = int(dh_vec[mc_idx])
            cos_at_min_dh[si] = float(cos_vec[md_idx])

            # Same-pair joint indicators: does a SINGLE candidate satisfy
            # both conditions at once (stricter than max/min marginals)?
            cos_gt = cos_vec > 0.95
            if cos_gt.any():
                dh_under_5 = dh_vec <= 5
                dh_under_4 = dh_vec <= 4
                dh_under_3 = dh_vec <= 3
                headline_same_pair_95_5[si] = bool((cos_gt & dh_under_5).any())
                headline_same_pair_95_4[si] = bool((cos_gt & dh_under_4).any())
                headline_same_pair_95_3[si] = bool((cos_gt & dh_under_3).any())

    print(f' Done.')

    # ── Aggregate ──────────────────────────────────────────
    print('\nAggregating per-signature FAR statistics...')

    def far_marginal_cos(k):
        # Per-signature FAR of the marginal rule max-cos > k.
        hits = int((max_cos > k).sum())
        lo, hi = wilson_ci(hits, n_sigs)
        return {'k': k, 'hits': hits, 'n': n_sigs,
                'far': hits / n_sigs,
                'ci95_lo': lo, 'ci95_hi': hi}

    def far_marginal_dh(k):
        # Per-signature FAR of the marginal rule min-dh <= k.
        hits = int((min_dh <= k).sum())
        lo, hi = wilson_ci(hits, n_sigs)
        return {'k': k, 'hits': hits, 'n': n_sigs,
                'far': hits / n_sigs,
                'ci95_lo': lo, 'ci95_hi': hi}

    def far_any_pair_joint(cos_k, dh_k):
        # "Any-pair" joint rule: the max-cos and min-dh candidates need
        # not be the same candidate.
        hits = int(((max_cos > cos_k) & (min_dh <= dh_k)).sum())
        lo, hi = wilson_ci(hits, n_sigs)
        return {'cos_k': cos_k, 'dh_k': dh_k,
                'hits': hits, 'n': n_sigs,
                'far': hits / n_sigs,
                'ci95_lo': lo, 'ci95_hi': hi}

    def far_same_pair_joint(cos_k, dh_k, indicator):
        # "Same-pair" joint rule: one candidate satisfies both at once;
        # cos_k/dh_k are recorded for labeling only, the precomputed
        # indicator array carries the actual decision.
        hits = int(indicator.sum())
        lo, hi = wilson_ci(hits, n_sigs)
        return {'cos_k': cos_k, 'dh_k': dh_k,
                'hits': hits, 'n': n_sigs,
                'far': hits / n_sigs,
                'ci95_lo': lo, 'ci95_hi': hi}

    cos_curve = [far_marginal_cos(k) for k in COS_KS]
    dh_curve = [far_marginal_dh(k) for k in DH_KS]
    any_pair_curve = [far_any_pair_joint(0.95, k) for k in DH_KS]
    same_pair_curve = [
        far_same_pair_joint(0.95, 5, headline_same_pair_95_5),
        far_same_pair_joint(0.95, 4, headline_same_pair_95_4),
        far_same_pair_joint(0.95, 3, headline_same_pair_95_3),
    ]

    print('\n[Per-signature marginal cos FAR]')
    for e in cos_curve:
        print(f' max-cos > {e["k"]:.3f}: FAR={e["far"]:.4f}, '
              f'CI=[{e["ci95_lo"]:.4f}, {e["ci95_hi"]:.4f}], '
              f'hits={e["hits"]}/{e["n"]:,}')

    print('\n[Per-signature marginal dh FAR]')
    for e in dh_curve:
        print(f' min-dh <= {e["k"]:2d}: FAR={e["far"]:.4f}, '
              f'CI=[{e["ci95_lo"]:.4f}, {e["ci95_hi"]:.4f}], '
              f'hits={e["hits"]}/{e["n"]:,}')

    print('\n[Per-signature any-pair joint FAR (cos>0.95 AND dh<=k)]')
    for e in any_pair_curve:
        print(f' dh <= {e["dh_k"]:2d}: FAR={e["far"]:.4f}, '
              f'hits={e["hits"]}/{e["n"]:,}')

    print('\n[Per-signature SAME-pair joint FAR]')
    for e in same_pair_curve:
        print(f' cos>0.95 AND dh<={e["dh_k"]}: FAR={e["far"]:.4f}, '
              f'CI=[{e["ci95_lo"]:.4f}, {e["ci95_hi"]:.4f}], '
              f'hits={e["hits"]}/{e["n"]:,}')

    # Per-firm and per-pool-decile stratification
    print('\n[Per-firm headline FAR (any-pair, cos>0.95 AND dh<=5)]')
    per_firm = {}
    for f in sorted(set(firms)):
        mask = firms == f
        n_f = int(mask.sum())
        hits_anypair = int(((max_cos[mask] > 0.95) &
                            (min_dh[mask] <= 5)).sum())
        hits_samepair = int(headline_same_pair_95_5[mask].sum())
        per_firm[f] = {
            'n': n_f,
            'any_pair_far': hits_anypair / n_f,
            'same_pair_far': hits_samepair / n_f,
        }
        print(f' {f}: n={n_f:,} '
              f'any-pair FAR={hits_anypair/n_f:.4f}, '
              f'same-pair FAR={hits_samepair/n_f:.4f}')

    print('\n[Pool-size decile × headline FAR]')
    pool_arr = pool_size_arr
    deciles = np.percentile(pool_arr, np.arange(0, 110, 10))
    per_decile = {}
    for d in range(10):
        lo, hi = deciles[d], deciles[d + 1]
        # Half-open bins [lo, hi) except the last, which is closed so
        # the max pool size is not dropped.
        mask = (pool_arr >= lo) & (pool_arr <= hi if d == 9
                                   else pool_arr < hi)
        n_d = int(mask.sum())
        if n_d == 0:
            continue
        hits_any = int(((max_cos[mask] > 0.95) &
                        (min_dh[mask] <= 5)).sum())
        hits_same = int(headline_same_pair_95_5[mask].sum())
        per_decile[f'decile_{d+1}'] = {
            'pool_range': [float(lo), float(hi)],
            'n': n_d,
            'any_pair_far': hits_any / n_d,
            'same_pair_far': hits_same / n_d,
        }
        print(f' Decile {d+1} (pool {lo:.0f}-{hi:.0f}): n={n_d:,} '
              f'any-FAR={hits_any/n_d:.4f}, '
              f'same-FAR={hits_same/n_d:.4f}')

    # CPA bootstrap on headline (cos>0.95 AND dh<=5, same-pair):
    # resample whole CPAs with replacement so the CI respects the
    # clustered (per-accountant) structure of the data.
    print(f'\n[CPA-block bootstrap {N_BOOT_CPA} replicates]')
    rng_b = np.random.default_rng(SEED + 1)
    all_cpa_list = list(cpa_to_idx.keys())
    boot_anypair = np.zeros(N_BOOT_CPA)
    boot_samepair = np.zeros(N_BOOT_CPA)
    for b in range(N_BOOT_CPA):
        cpas_b = rng_b.choice(all_cpa_list, size=len(all_cpa_list),
                              replace=True)
        idx_b = np.concatenate([cpa_to_idx[c] for c in cpas_b])
        n_b = len(idx_b)  # NOTE(review): unused
        boot_anypair[b] = ((max_cos[idx_b] > 0.95) &
                           (min_dh[idx_b] <= 5)).mean()
        boot_samepair[b] = headline_same_pair_95_5[idx_b].mean()
    boot_anypair_ci = (float(np.percentile(boot_anypair, 2.5)),
                       float(np.percentile(boot_anypair, 97.5)))
    boot_samepair_ci = (float(np.percentile(boot_samepair, 2.5)),
                        float(np.percentile(boot_samepair, 97.5)))
    print(f' any-pair FAR boot mean={boot_anypair.mean():.4f}, '
          f'95% CI={boot_anypair_ci}')
    print(f' same-pair FAR boot mean={boot_samepair.mean():.4f}, '
          f'95% CI={boot_samepair_ci}')

    # Document-level aggregation: a document is flagged if any of its
    # signatures has max_cos > 0.95 AND min_dh <= 5 (the worst-case rule)
    print('\n[Document-level aggregation]')
    doc_idx = defaultdict(list)
    for i, pdf in enumerate(source_pdfs):
        doc_idx[pdf].append(i)
    n_docs = len(doc_idx)
    doc_anypair_flag = 0
    doc_samepair_flag = 0
    for pdf, idxs in doc_idx.items():
        idxs_a = np.array(idxs, dtype=np.int64)
        if ((max_cos[idxs_a] > 0.95) & (min_dh[idxs_a] <= 5)).any():
            doc_anypair_flag += 1
        if headline_same_pair_95_5[idxs_a].any():
            doc_samepair_flag += 1
    print(f' n_documents: {n_docs:,}')
    print(f' doc-level any-pair FAR (any sig flagged) = '
          f'{doc_anypair_flag/n_docs:.4f} ({doc_anypair_flag}/{n_docs})')
    print(f' doc-level same-pair FAR = '
          f'{doc_samepair_flag/n_docs:.4f} ({doc_samepair_flag}/{n_docs})')

    # Threshold inversion: find cos and dh thresholds that hit per-sig
    # FAR targets at the marginal level. Curves are ordered so the first
    # entry meeting the target is the loosest admissible threshold.
    print('\n[Per-signature marginal threshold inversion]')
    inversions = {}
    for tgt in [0.10, 0.05, 0.02, 0.01, 0.005]:
        c_pick = None
        for e in cos_curve:
            if e['far'] <= tgt:
                c_pick = e
                break
        d_pick = None
        for e in dh_curve:
            if e['far'] <= tgt:
                d_pick = e
                break
        any_pick = None
        for e in any_pair_curve:
            if e['far'] <= tgt:
                any_pick = e
                break
        same_pick = None
        for e in same_pair_curve:
            if e['far'] <= tgt:
                same_pick = e
                break
        inversions[f'per_sig_far_<=_{tgt}'] = {
            'marginal_cos': c_pick, 'marginal_dh': d_pick,
            'any_pair_joint': any_pick, 'same_pair_joint': same_pick,
        }
        print(f' per-sig FAR <= {tgt}:')
        if c_pick:
            print(f'   marginal cos: cos > {c_pick["k"]} '
                  f'(FAR={c_pick["far"]:.4f})')
        if d_pick:
            print(f'   marginal dh: dh <= {d_pick["k"]} '
                  f'(FAR={d_pick["far"]:.4f})')
        if any_pick:
            print(f'   any-pair joint: dh <= {any_pick["dh_k"]} '
                  f'(FAR={any_pick["far"]:.4f})')
        if same_pick:
            print(f'   same-pair joint: dh <= {same_pick["dh_k"]} '
                  f'(FAR={same_pick["far"]:.4f})')

    # ── Persist artefacts: JSON (machine-readable) + markdown report ──
    results = {
        'meta': {
            'script': '43',
            'timestamp': datetime.now().isoformat(timespec='seconds'),
            'n_signatures': n_sigs,
            'n_cpas': n_cpas,
            'n_boot_cpa': N_BOOT_CPA,
            'seed': SEED,
            'note': ('Pool-normalized per-signature FAR. For each '
                     'source signature, simulate inter-CPA candidate '
                     'pool of size n_pool(s); compute deployed max-cos '
                     'and min-dh; aggregate per-signature FAR.'),
        },
        'marginal_cos_curve': cos_curve,
        'marginal_dh_curve': dh_curve,
        'any_pair_joint_curve': any_pair_curve,
        'same_pair_joint': same_pair_curve,
        'per_firm_headline': per_firm,
        'per_pool_decile_headline': per_decile,
        'cpa_bootstrap_headline': {
            'any_pair_mean': float(boot_anypair.mean()),
            'any_pair_ci95': boot_anypair_ci,
            'same_pair_mean': float(boot_samepair.mean()),
            'same_pair_ci95': boot_samepair_ci,
        },
        'document_level_headline': {
            'n_docs': n_docs,
            'any_pair_far': doc_anypair_flag / n_docs,
            'same_pair_far': doc_samepair_flag / n_docs,
        },
        'threshold_inversions': inversions,
    }

    json_path = OUT / 'pool_normalized_results.json'
    json_path.write_text(json.dumps(results, indent=2, ensure_ascii=False),
                         encoding='utf-8')
    print(f'\n[json] {json_path}')

    md = ['# Pool-Normalized Per-Signature FAR (Script 43)',
          '', f'Generated: {results["meta"]["timestamp"]}',
          (f'Big-4 source signatures: {n_sigs:,} across {n_cpas} CPAs; '
           f'pool-size median={int(np.median(list(pool_sizes.values())))}, '
           f'max={max(pool_sizes.values())}'),
          (f'CPA-block bootstrap: {N_BOOT_CPA} replicates. Per source '
           'signature, one realisation of n_pool(s)-sized random '
           'inter-CPA candidate pool.'),
          '',
          '## Headline (cos>0.95 AND dh<=5)',
          '',
          '| Variant | per-sig FAR | 95% Wilson CI | CPA-bootstrap 95% CI |',
          '|---|---|---|---|']
    md.append(f'| any-pair joint | '
              f'{((max_cos > 0.95) & (min_dh <= 5)).mean():.4f} | '
              f'see JSON | [{boot_anypair_ci[0]:.4f}, '
              f'{boot_anypair_ci[1]:.4f}] |')
    md.append(f'| same-pair joint | '
              f'{headline_same_pair_95_5.mean():.4f} | '
              f'see JSON | [{boot_samepair_ci[0]:.4f}, '
              f'{boot_samepair_ci[1]:.4f}] |')
    md += [
        '',
        '## Marginal cos FAR (per-signature)',
        '',
        '| max-cos > k | FAR | 95% CI | hits / n |',
        '|---|---|---|---|']
    for e in cos_curve:
        md.append(f'| {e["k"]} | {e["far"]:.4f} | '
                  f'[{e["ci95_lo"]:.4f}, {e["ci95_hi"]:.4f}] | '
                  f'{e["hits"]} / {e["n"]:,} |')
    md += ['', '## Marginal dh FAR (per-signature)', '',
           '| min-dh <= k | FAR | 95% CI | hits / n |',
           '|---|---|---|---|']
    for e in dh_curve:
        md.append(f'| {e["k"]} | {e["far"]:.4f} | '
                  f'[{e["ci95_lo"]:.4f}, {e["ci95_hi"]:.4f}] | '
                  f'{e["hits"]} / {e["n"]:,} |')
    md += ['',
           '## Any-pair joint FAR (cos>0.95 AND dh<=k)',
           '',
           '| dh <= k | FAR | hits / n |',
           '|---|---|---|']
    for e in any_pair_curve:
        md.append(f'| {e["dh_k"]} | {e["far"]:.4f} | '
                  f'{e["hits"]} / {e["n"]:,} |')
    md += ['',
           '## Same-pair joint FAR (one candidate satisfies both)',
           '',
           '| cos>0.95 AND dh<=k | FAR | 95% CI | hits / n |',
           '|---|---|---|---|']
    for e in same_pair_curve:
        md.append(f'| dh <= {e["dh_k"]} | {e["far"]:.4f} | '
                  f'[{e["ci95_lo"]:.4f}, {e["ci95_hi"]:.4f}] | '
                  f'{e["hits"]} / {e["n"]:,} |')
    md += ['', '## Per-firm headline', '',
           '| Firm | n | any-pair FAR | same-pair FAR |',
           '|---|---|---|---|']
    for f, s in per_firm.items():
        md.append(f'| {f} | {s["n"]:,} | {s["any_pair_far"]:.4f} | '
                  f'{s["same_pair_far"]:.4f} |')
    md += ['', '## Per-pool-decile headline', '',
           '| Decile | pool range | n | any-pair FAR | same-pair FAR |',
           '|---|---|---|---|---|']
    for k, s in per_decile.items():
        md.append(f'| {k} | {s["pool_range"][0]:.0f}-'
                  f'{s["pool_range"][1]:.0f} | {s["n"]:,} | '
                  f'{s["any_pair_far"]:.4f} | '
                  f'{s["same_pair_far"]:.4f} |')
    md += ['', '## Document-level',
           '',
           f'- n_documents: {n_docs:,}',
           f'- any-pair FAR (any sig flagged): '
           f'{doc_anypair_flag/n_docs:.4f} '
           f'({doc_anypair_flag}/{n_docs})',
           f'- same-pair FAR: {doc_samepair_flag/n_docs:.4f} '
           f'({doc_samepair_flag}/{n_docs})',
           '',
           '## Threshold inversion (per-signature FAR targets)',
           '',
           '| target | marginal cos | marginal dh | any-pair joint '
           '| same-pair joint |',
           '|---|---|---|---|---|']
    for tgt in [0.10, 0.05, 0.02, 0.01, 0.005]:
        inv = inversions[f'per_sig_far_<=_{tgt}']
        c = inv['marginal_cos']
        d = inv['marginal_dh']
        a = inv['any_pair_joint']
        s = inv['same_pair_joint']
        cs = (f'cos > {c["k"]} (FAR={c["far"]:.4f})'
              if c else 'unachievable')
        ds = (f'dh <= {d["k"]} (FAR={d["far"]:.4f})'
              if d else 'unachievable')
        as_ = (f'dh <= {a["dh_k"]} (FAR={a["far"]:.4f})'
               if a else 'unachievable')
        ss = (f'dh <= {s["dh_k"]} (FAR={s["far"]:.4f})'
              if s else 'unachievable')
        md.append(f'| {tgt} | {cs} | {ds} | {as_} | {ss} |')
    md.append('')

    md_path = OUT / 'pool_normalized_report.md'
    md_path.write_text('\n'.join(md), encoding='utf-8')
    print(f'[md ] {md_path}')
||||
|
||||
# Script entry point: run the full pool-normalized FAR pipeline.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user