#!/usr/bin/env python3
"""
Script 39b: Signature-Level Dip Test (multimodality at the signature cloud)
============================================================================
Phase 5 pre-emptive evidence. Script 34 / 36 already report Hartigan dip
tests on the 437 accountant-level (cos_mean, dh_mean) means and both
marginals reject unimodality at p < 5e-4. Reviewers may ask whether the
same multimodality is detectable at the signature level itself
(n = 150,442 Big-4 signatures) and whether the multimodality is a
within-firm or only a between-firm phenomenon.

This script supplies the missing dip evidence on the raw signature cloud.
It is a *diagnostic* in the same role as Scripts 34/36 dip tests: it does
not derive an operational threshold; it characterises the marginal
distributions of (cos, dh_indep) at the signature level.

Outputs:
    reports/v4_big4/signature_level_diptest/
        sig_diptest_results.json
        sig_diptest_report.md

Tests performed:
    A. Pooled Big-4 marginals (cos, dh_indep), n = 150,442
    B. Per-firm marginals (Firm A / B / C / D separately)
"""

import json
import sqlite3
from contextlib import closing
from datetime import datetime
from pathlib import Path

import diptest
import numpy as np
from scipy import stats
from scipy.signal import find_peaks

DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/'
           'v4_big4/signature_level_diptest')
OUT.mkdir(parents=True, exist_ok=True)

BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合')
ALIAS = {'勤業眾信聯合': 'Firm A',
         '安侯建業聯合': 'Firm B',
         '資誠聯合': 'Firm C',
         '安永聯合': 'Firm D'}
N_BOOT = 2000


def load_big4_signatures():
    """Fetch every Big-4 signature row that has both metrics populated.

    Returns:
        A list of tuples ``(assigned_accountant, firm,
        max_similarity_to_same_accountant, min_dhash_independent)``; the
        last column is cast to REAL by the query itself.

    Raises:
        sqlite3.Error: if the database cannot be opened or queried.
    """
    query = '''
        SELECT s.assigned_accountant, a.firm,
               s.max_similarity_to_same_accountant,
               CAST(s.min_dhash_independent AS REAL)
        FROM signatures s
        JOIN accountants a ON s.assigned_accountant = a.name
        WHERE s.assigned_accountant IS NOT NULL
          AND s.max_similarity_to_same_accountant IS NOT NULL
          AND s.min_dhash_independent IS NOT NULL
          AND a.firm IN (?, ?, ?, ?)
    '''
    # closing() guarantees the connection is released even when the query
    # raises; sqlite3's own context manager only manages transactions.
    with closing(sqlite3.connect(DB)) as conn:
        return conn.execute(query, BIG4).fetchall()


def kde_dip(values, n_boot=N_BOOT):
    """Run a bootstrap dip test and a KDE mode scan on one marginal.

    Non-finite entries are dropped before any computation.

    Args:
        values: 1-D sequence of numbers.
        n_boot: number of bootstrap replicates passed to ``diptest``.

    Returns:
        A JSON-serialisable dict with keys ``n``, ``dip``, ``dip_pvalue``,
        ``unimodal_alpha05`` (True when p > 0.05), ``n_modes``,
        ``mode_locations``, ``antimodes`` and ``n_boot``.

    Raises:
        ValueError: if no finite value remains after filtering.
    """
    arr = np.asarray(values, dtype=float)
    arr = arr[np.isfinite(arr)]
    if arr.size == 0:
        raise ValueError('kde_dip needs at least one finite value')

    dip, pval = diptest.diptest(arr, boot_pval=True, n_boot=n_boot)

    kde = stats.gaussian_kde(arr, bw_method='silverman')
    xs = np.linspace(arr.min(), arr.max(), 2000)
    density = kde(xs)
    # Peaks must stand out by at least 2% of the tallest density value.
    # NOTE(review): the grid stops exactly at the data extremes and
    # find_peaks appears to report interior maxima only, so a mode piled
    # up at the minimum/maximum may not be counted -- confirm whether the
    # grid should be padded.
    peaks, _ = find_peaks(density, prominence=density.max() * 0.02)

    # An antimode is the lowest-density grid point between adjacent peaks.
    antimodes = [
        float(xs[left + int(np.argmin(density[left:right]))])
        for left, right in zip(peaks[:-1], peaks[1:])
    ]

    return {
        'n': int(len(arr)),
        'dip': float(dip),
        'dip_pvalue': float(pval),
        'unimodal_alpha05': bool(pval > 0.05),
        'n_modes': int(len(peaks)),
        'mode_locations': [float(xs[p]) for p in peaks],
        'antimodes': antimodes,
        'n_boot': int(n_boot),
    }


def _fmt_p(p, n_boot=N_BOOT):
    """Format a bootstrap p-value for the console and markdown report.

    A p-value of exactly 0.0 is rendered as an upper bound of ``1/n_boot``
    rather than as zero, so the text stays correct if the replicate count
    changes.

    Args:
        p: the bootstrap p-value.
        n_boot: number of bootstrap replicates behind ``p``.

    Returns:
        The formatted string.
    """
    if p == 0.0:
        # Compact scientific notation, e.g. 1/2000 -> '5e-4'.
        mantissa, exponent = f'{1.0 / n_boot:.2e}'.split('e')
        mantissa = mantissa.rstrip('0').rstrip('.')
        return (f'< {mantissa}e{int(exponent)} '
                '(no bootstrap replicate exceeded observed dip)')
    return f'{p:.4g}'


def main():
    """Run pooled and per-firm dip tests and write the JSON/markdown reports."""
    print('=' * 72)
    print('Script 39b: Signature-Level Dip Test')
    print('=' * 72)

    rows = load_big4_signatures()
    if not rows:
        # Fail with an explicit message instead of an opaque numpy error.
        raise SystemExit(f'No Big-4 signatures found in {DB}')

    cos_all = np.array([r[2] for r in rows], dtype=float)
    dh_all = np.array([r[3] for r in rows], dtype=float)
    firms = np.array([ALIAS[r[1]] for r in rows])
    firm_names = sorted(set(firms))

    print(f'\nLoaded {len(rows):,} Big-4 signatures')
    for f in firm_names:
        print(f'  {f}: {(firms == f).sum():,}')

    results = {
        'meta': {
            'script': '39b',
            'timestamp': datetime.now().isoformat(timespec='seconds'),
            'n_total': int(len(rows)),
            'n_boot': N_BOOT,
            'note': ('Signature-level Hartigan dip test on Big-4 '
                     '(cos, dh_indep) marginals; pooled and per-firm.'),
        },
        'pooled': {},
        'per_firm': {},
    }

    # A. Pooled
    print('\n[A] Pooled Big-4')
    for desc, arr in [('cos', cos_all), ('dh_indep', dh_all)]:
        r = kde_dip(arr)
        results['pooled'][desc] = r
        print(f'  {desc}: n={r["n"]:,}, dip={r["dip"]:.5f}, '
              f'p={_fmt_p(r["dip_pvalue"], r["n_boot"])}, '
              f'n_modes={r["n_modes"]}')

    # B. Per-firm
    print('\n[B] Per-firm')
    for f in firm_names:
        mask = firms == f
        results['per_firm'][f] = {}
        for desc, arr in [('cos', cos_all[mask]),
                          ('dh_indep', dh_all[mask])]:
            r = kde_dip(arr)
            results['per_firm'][f][desc] = r
            print(f'  {f} {desc}: n={r["n"]:,}, dip={r["dip"]:.5f}, '
                  f'p={_fmt_p(r["dip_pvalue"], r["n_boot"])}, '
                  f'n_modes={r["n_modes"]}')

    json_path = OUT / 'sig_diptest_results.json'
    json_path.write_text(json.dumps(results, indent=2, ensure_ascii=False),
                         encoding='utf-8')
    print(f'\n[json] {json_path}')

    md = ['# Signature-Level Dip Test (Script 39b)',
          '',
          f'Generated: {results["meta"]["timestamp"]}',
          f'Bootstrap replicates: {N_BOOT}',
          '',
          '## A. Pooled Big-4 signature cloud',
          '',
          f'n = {results["meta"]["n_total"]:,} signatures',
          '',
          '| Marginal | dip | p (boot) | n_modes | unimodal @0.05 |',
          '|---|---|---|---|---|']
    for desc in ['cos', 'dh_indep']:
        r = results['pooled'][desc]
        md.append(f'| {desc} | {r["dip"]:.5f} | '
                  f'{_fmt_p(r["dip_pvalue"], r["n_boot"])} | '
                  f'{r["n_modes"]} | {r["unimodal_alpha05"]} |')

    md += ['',
           '## B. Per-firm signature-level dip tests',
           '',
           '| Firm | Marginal | n | dip | p (boot) | n_modes | unimodal @0.05 |',
           '|---|---|---|---|---|---|---|']
    for f in sorted(results['per_firm']):
        for desc in ['cos', 'dh_indep']:
            r = results['per_firm'][f][desc]
            md.append(f'| {f} | {desc} | {r["n"]:,} | {r["dip"]:.5f} | '
                      f'{_fmt_p(r["dip_pvalue"], r["n_boot"])} | '
                      f'{r["n_modes"]} | '
                      f'{r["unimodal_alpha05"]} |')

    md += ['',
           '## Reading guide',
           '',
           ('A unimodality rejection at the signature level confirms '
            'multimodal structure independent of accountant-level '
            'aggregation. A within-firm rejection further indicates the '
            'multimodality is not solely a between-firm artefact. A '
            'within-firm non-rejection (e.g., Firm A) is consistent with '
            'that firm being concentrated in a single mechanism corner.'),
           '',
           ('All thresholds and operational classifiers remain those of '
            'v3.x §III-K and v4.0 §III-J; this script supplies diagnostic '
            'evidence only.'),
           '']
    md_path = OUT / 'sig_diptest_report.md'
    md_path.write_text('\n'.join(md), encoding='utf-8')
    print(f'[md  ] {md_path}')


if __name__ == '__main__':
    main()