d4f370bd5e
Spike checkpoint in response to codex rounds 28-30 review:
- 39b/c: signature-level dip test on Big-4 and non-Big-4 marginals
- 39d: dHash discrete-value robustness (raw vs jittered + histogram
valleys + firm residualization); confirms within-firm dHash dip
rejection is integer-mass-point artefact
- 39e: dHash firm-residualized + jittered 2x2 factorial decomposition;
confirms Big-4 pooled dh "multimodality" is composition + integer
artefact (centered + jittered p=0.35, 0/5 seeds reject)
- 40b: inter-CPA per-pair FAR sweep (cos + dh marginal + joint +
conditional); replicates v3 cos>0.95 FAR=0.0006 and provides
v4-new dh FAR curve
- 43: pool-normalized per-signature FAR (codex round-30 fix for
per-pair vs per-signature conflation); per-sig FAR for deployed
any-pair rule = 11.02%, per-firm structure shows Firm A 20% vs
B/C/D <1%
These scripts replace the distributional path (K=3 mixture / dip /
antimode) with anchor-based threshold derivation. Companion
artefacts in reports/v4_big4/{signature_level_diptest,
midsmall_signature_diptest, dhash_discrete_robustness,
inter_cpa_far_sweep, pool_normalized_far}/.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
196 lines
7.1 KiB
Python
196 lines
7.1 KiB
Python
#!/usr/bin/env python3
"""
Script 39b: Signature-Level Dip Test (multimodality at the signature cloud)
============================================================================
Phase 5 pre-emptive evidence. Scripts 34 / 36 already report Hartigan
dip tests on the 437 accountant-level (cos_mean, dh_mean) means and
both marginals reject unimodality at p < 5e-4. Reviewers may ask
whether the same multimodality is detectable at the signature level
itself (n = 150,442 Big-4 signatures) and whether the multimodality
is a within-firm or only a between-firm phenomenon.

This script supplies the missing dip evidence on the raw signature
cloud. It is a *diagnostic* in the same role as Scripts 34/36 dip
tests: it does not derive an operational threshold; it characterises
the marginal distributions of (cos, dh_indep) at the signature level.

Outputs:
    reports/v4_big4/signature_level_diptest/
        sig_diptest_results.json
        sig_diptest_report.md

Tests performed:
    A. Pooled Big-4 marginals (cos, dh_indep), n = 150,442
    B. Per-firm marginals (Firm A / B / C / D separately)
"""

import json
import sqlite3
from contextlib import closing
from datetime import datetime
from pathlib import Path

import diptest
import numpy as np
from scipy import stats
from scipy.signal import find_peaks

# SQLite database holding the `signatures` and `accountants` tables.
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'

# Report directory for this script; created on import if it is missing.
OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis',
           'reports', 'v4_big4', 'signature_level_diptest')
OUT.mkdir(parents=True, exist_ok=True)

# Firm names as stored in accountants.firm; used as the SQL IN-list.
BIG4 = (
    '勤業眾信聯合',
    '安侯建業聯合',
    '資誠聯合',
    '安永聯合',
)

# Anonymised label for each firm, in the same order as BIG4.
ALIAS = dict(zip(BIG4, ('Firm A', 'Firm B', 'Firm C', 'Firm D')))

# Default number of bootstrap replicates for the dip-test p-value.
N_BOOT = 2000


def load_big4_signatures(db_path=None, firms=None):
    """Fetch the signature rows that feed the dip tests.

    A signature is returned only when it has an assigned accountant, a
    non-NULL ``max_similarity_to_same_accountant``, a non-NULL
    ``min_dhash_independent``, and its accountant's firm is in ``firms``.

    Args:
        db_path: Path of the SQLite database. Defaults to the module-level
            ``DB`` when omitted.
        firms: Iterable of firm names to keep. Defaults to the module-level
            ``BIG4`` when omitted.

    Returns:
        A list of ``(assigned_accountant, firm, max_similarity, min_dhash)``
        tuples, where ``min_dhash`` has been cast to REAL by the query.
        An empty ``firms`` selection yields an empty list.

    Raises:
        sqlite3.Error: If the database cannot be opened or queried.
    """
    database = DB if db_path is None else db_path
    firm_names = BIG4 if firms is None else tuple(firms)
    if not firm_names:
        return []

    # Only '?' markers are interpolated into the SQL text; the firm names
    # themselves are always passed as bound parameters.
    placeholders = ', '.join('?' for _ in firm_names)
    query = f'''
        SELECT s.assigned_accountant, a.firm,
               s.max_similarity_to_same_accountant,
               CAST(s.min_dhash_independent AS REAL)
        FROM signatures s
        JOIN accountants a ON s.assigned_accountant = a.name
        WHERE s.assigned_accountant IS NOT NULL
          AND s.max_similarity_to_same_accountant IS NOT NULL
          AND s.min_dhash_independent IS NOT NULL
          AND a.firm IN ({placeholders})
    '''
    # closing() guarantees the connection is released even when the query
    # raises; sqlite3's own context manager only manages transactions.
    with closing(sqlite3.connect(database)) as conn:
        return conn.execute(query, firm_names).fetchall()


def kde_dip(values, n_boot=N_BOOT):
    """Run the dip test on one marginal and locate its KDE modes.

    Non-finite entries are dropped first. The dip statistic and its
    bootstrap p-value come from ``diptest.diptest``. Modes are the peaks of
    a Silverman-bandwidth Gaussian KDE evaluated on 2000 evenly spaced
    points between the sample minimum and maximum, keeping peaks whose
    prominence is at least 2% of the maximum density. Each antimode is the
    grid point of lowest density between two consecutive modes.

    Args:
        values: Array-like of numeric observations.
        n_boot: Number of bootstrap replicates for the p-value.

    Returns:
        A JSON-serialisable dict with keys ``n``, ``dip``, ``dip_pvalue``,
        ``unimodal_alpha05``, ``n_modes``, ``mode_locations``,
        ``antimodes`` and ``n_boot``.
    """
    sample = np.asarray(values, dtype=float)
    sample = sample[np.isfinite(sample)]

    dip_stat, p_value = diptest.diptest(sample, boot_pval=True, n_boot=n_boot)

    grid = np.linspace(sample.min(), sample.max(), 2000)
    density = stats.gaussian_kde(sample, bw_method='silverman')(grid)
    mode_idx, _ = find_peaks(density, prominence=density.max() * 0.02)

    antimodes = []
    for left, right in zip(mode_idx[:-1], mode_idx[1:]):
        valley = density[left:right]
        if valley.size == 0:
            continue
        # argmin is relative to the slice, so shift it back onto the grid.
        antimodes.append(float(grid[left + int(np.argmin(valley))]))

    return {
        'n': int(len(sample)),
        'dip': float(dip_stat),
        'dip_pvalue': float(p_value),
        'unimodal_alpha05': bool(p_value > 0.05),
        'n_modes': int(len(mode_idx)),
        'mode_locations': [float(grid[idx]) for idx in mode_idx],
        'antimodes': antimodes,
        'n_boot': int(n_boot),
    }


def _fmt_p(p):
|
|
if p == 0.0:
|
|
return '< 5e-4 (no bootstrap replicate exceeded observed dip)'
|
|
return f'{p:.4g}'
|
|
|
|
|
|
def main():
    """Run the pooled and per-firm dip tests and write both reports.

    Loads the signature rows, runs ``kde_dip`` on the cos and dh_indep
    marginals (pooled first, then firm by firm in sorted label order),
    prints one summary line per test, and writes
    ``sig_diptest_results.json`` and ``sig_diptest_report.md`` into ``OUT``.
    """
    rule = '=' * 72
    print(rule)
    print('Script 39b: Signature-Level Dip Test')
    print(rule)

    records = load_big4_signatures()
    # Row layout follows the loader's SELECT: index 1 is the firm name,
    # index 2 the cosine similarity, index 3 the independent dHash value.
    cos_all = np.array([rec[2] for rec in records], dtype=float)
    dh_all = np.array([rec[3] for rec in records], dtype=float)
    firms = np.array([ALIAS[rec[1]] for rec in records])
    firm_labels = sorted(set(firms))

    print(f'\nLoaded {len(records):,} Big-4 signatures')
    for label in firm_labels:
        print(f' {label}: {(firms == label).sum():,}')

    meta = {
        'script': '39b',
        'timestamp': datetime.now().isoformat(timespec='seconds'),
        'n_total': int(len(records)),
        'n_boot': N_BOOT,
        'note': ('Signature-level Hartigan dip test on Big-4 '
                 '(cos, dh_indep) marginals; pooled and per-firm.'),
    }
    results = {'meta': meta, 'pooled': {}, 'per_firm': {}}
    marginals = ('cos', 'dh_indep')

    def run_test(tag, values):
        """Run kde_dip on one marginal and echo its one-line summary."""
        res = kde_dip(values)
        print(f' {tag}: n={res["n"]:,}, dip={res["dip"]:.5f}, '
              f'p={_fmt_p(res["dip_pvalue"])}, n_modes={res["n_modes"]}')
        return res

    # A. Pooled
    print('\n[A] Pooled Big-4')
    for name, values in zip(marginals, (cos_all, dh_all)):
        results['pooled'][name] = run_test(name, values)

    # B. Per-firm
    print('\n[B] Per-firm')
    for label in firm_labels:
        in_firm = firms == label
        firm_results = results['per_firm'][label] = {}
        subsets = (cos_all[in_firm], dh_all[in_firm])
        for name, values in zip(marginals, subsets):
            firm_results[name] = run_test(f'{label} {name}', values)

    json_path = OUT / 'sig_diptest_results.json'
    payload = json.dumps(results, indent=2, ensure_ascii=False)
    json_path.write_text(payload, encoding='utf-8')
    print(f'\n[json] {json_path}')

    # Markdown report: header, pooled table, per-firm table, reading guide.
    md = [
        '# Signature-Level Dip Test (Script 39b)',
        '',
        f'Generated: {meta["timestamp"]}',
        f'Bootstrap replicates: {N_BOOT}',
        '',
        '## A. Pooled Big-4 signature cloud',
        '',
        f'n = {meta["n_total"]:,} signatures',
        '',
        '| Marginal | dip | p (boot) | n_modes | unimodal @0.05 |',
        '|---|---|---|---|---|',
    ]
    for name in marginals:
        res = results['pooled'][name]
        md.append(f'| {name} | {res["dip"]:.5f} | '
                  f'{_fmt_p(res["dip_pvalue"])} | '
                  f'{res["n_modes"]} | {res["unimodal_alpha05"]} |')

    md.extend([
        '',
        '## B. Per-firm signature-level dip tests',
        '',
        '| Firm | Marginal | n | dip | p (boot) | n_modes | unimodal @0.05 |',
        '|---|---|---|---|---|---|---|',
    ])
    for label in sorted(results['per_firm']):
        for name in marginals:
            res = results['per_firm'][label][name]
            md.append(f'| {label} | {name} | {res["n"]:,} | '
                      f'{res["dip"]:.5f} | '
                      f'{_fmt_p(res["dip_pvalue"])} | {res["n_modes"]} | '
                      f'{res["unimodal_alpha05"]} |')

    reading_guide = (
        'A unimodality rejection at the signature level confirms '
        'multimodal structure independent of accountant-level '
        'aggregation. A within-firm rejection further indicates the '
        'multimodality is not solely a between-firm artefact. A '
        'within-firm non-rejection (e.g., Firm A) is consistent with '
        'that firm being concentrated in a single mechanism corner.'
    )
    scope_note = (
        'All thresholds and operational classifiers remain those of '
        'v3.x §III-K and v4.0 §III-J; this script supplies diagnostic '
        'evidence only.'
    )
    md.extend(['', '## Reading guide', '', reading_guide, '', scope_note, ''])

    md_path = OUT / 'sig_diptest_report.md'
    md_path.write_text('\n'.join(md), encoding='utf-8')
    print(f'[md ] {md_path}')


# Run the analysis only when this file is executed as a script.
if __name__ == '__main__':
    main()