#!/usr/bin/env python3
"""
Script 39e: dHash Firm-Residualized + Jittered Dip (final test)
================================================================
Script 39d showed:
  - Within-firm dh dip rejections all vanish after jitter (integer artifact)
  - Big-4 pooled dh dip survives jitter (p_median=0 over 5 seeds)

But Firm A mean dh = 2.73 vs Firms B/C/D ~6.5-7.4 -- a large between-firm
location shift, analogous to the cosine case where firm-mean centering
eliminated rejection.

This script applies BOTH corrections simultaneously:
  1. Firm-mean centering (remove between-firm location shifts)
  2. Uniform jitter in [-0.5, +0.5] (remove integer ties)

If the doubly-corrected dh distribution rejects unimodality, the Big-4
pooled multimodality is a genuine within-population, continuous phenomenon.
If it fails to reject, dh "multimodality" is fully explained by
between-firm composition (same conclusion as cosine).

Multi-seed (5 seeds) for robustness.

Outputs:
  reports/v4_big4/dhash_discrete_robustness/
    dhash_residualized_jittered_results.json
    dhash_residualized_jittered_report.md
"""

import json
import sqlite3
from contextlib import closing
from datetime import datetime
from pathlib import Path

import diptest
import numpy as np

DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/'
           'v4_big4/dhash_discrete_robustness')
OUT.mkdir(parents=True, exist_ok=True)

BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合')
ALIAS = {'勤業眾信聯合': 'Firm A', '安侯建業聯合': 'Firm B',
         '資誠聯合': 'Firm C', '安永聯合': 'Firm D'}
N_BOOT = 2000
SEEDS = [42, 43, 44, 45, 46]


def load_signatures():
    """Return ``(firm, min_dhash_independent)`` rows from the database.

    The database is opened read-only.  Only signatures that have an
    assigned accountant, a same-accountant similarity, a dHash value and
    an accountant with a known firm are returned.  The connection is
    closed even if the query raises.
    """
    query = '''
        SELECT a.firm, CAST(s.min_dhash_independent AS REAL)
        FROM signatures s
        JOIN accountants a ON s.assigned_accountant = a.name
        WHERE s.assigned_accountant IS NOT NULL
          AND s.max_similarity_to_same_accountant IS NOT NULL
          AND s.min_dhash_independent IS NOT NULL
          AND a.firm IS NOT NULL
    '''
    with closing(sqlite3.connect(f'file:{DB}?mode=ro', uri=True)) as conn:
        return conn.execute(query).fetchall()


def firm_residualize(values, firm_labels):
    """Shift every firm's values so that all firms share the grand mean.

    For each distinct label the firm mean is subtracted and the pooled
    (grand) mean is added back, which removes between-firm location
    shifts while keeping the overall location of the data.

    Raises:
        ValueError: if ``values`` and ``firm_labels`` differ in shape.
    """
    arr = np.asarray(values, dtype=float)
    firms = np.asarray(firm_labels)
    if arr.shape != firms.shape:
        raise ValueError('values and firm_labels must have the same shape')
    out = arr.copy()
    grand = float(np.mean(arr))
    for firm in np.unique(firms):
        mask = firms == firm
        out[mask] = arr[mask] - float(np.mean(arr[mask])) + grand
    return out


def dip_multi(values, seeds, with_jitter, n_boot=N_BOOT):
    """Run the bootstrap dip test once per seed.

    Non-finite values are dropped first.  With ``with_jitter`` each seed
    drives an independent uniform jitter in [-0.5, 0.5) added to the data.
    Without jitter only the first seed is used, because the seed only
    controls the jitter and the data are identical for every seed.

    Returns:
        A list of ``{'seed', 'dip', 'p'}`` dicts, one per seed used.
    """
    arr = np.asarray(values, dtype=float)
    arr = arr[np.isfinite(arr)]
    seeds_used = list(seeds) if with_jitter else list(seeds)[:1]
    results = []
    for seed in seeds_used:
        if with_jitter:
            rng = np.random.default_rng(seed)
            sample = arr + rng.uniform(-0.5, 0.5, len(arr))
        else:
            sample = arr
        # NOTE(review): the seed is not forwarded to diptest, so the
        # bootstrap p-value itself may vary between runs -- confirm whether
        # that is intended.
        dip, p_value = diptest.diptest(sample, boot_pval=True, n_boot=n_boot)
        results.append({'seed': seed, 'dip': float(dip), 'p': float(p_value)})
    return results


def _fmt_p(p, n_boot=N_BOOT):
    """Format a bootstrap p-value for display.

    A p-value of exactly 0 is shown as an upper bound of ``1 / n_boot``
    (``'< 5e-4'`` for 2000 replicates) instead of a literal zero.
    """
    if p == 0.0:
        mantissa, exponent = f'{1.0 / n_boot:.2e}'.split('e')
        return f'< {float(mantissa):g}e{int(exponent)}'
    return f'{p:.4g}'


def summarize(name, results):
    """Collapse per-seed dip results into min/max/median summary statistics.

    Raises:
        ValueError: if ``results`` is empty.
    """
    if not results:
        raise ValueError(f'no dip results to summarize for {name!r}')
    ps = [r['p'] for r in results]
    ds = [r['dip'] for r in results]
    return {
        'name': name,
        'n_seeds': len(results),
        'dip_min': min(ds),
        'dip_max': max(ds),
        'dip_median': float(np.median(ds)),
        'p_min': min(ps),
        'p_max': max(ps),
        'p_median': float(np.median(ps)),
        'reject_at_05_count': int(sum(p <= 0.05 for p in ps)),
        'per_seed': results,
    }


def _per_firm_stats(values, firm_labels):
    """Return descriptive dh statistics per firm, keyed by sorted firm label."""
    stats = {}
    for firm in sorted(set(firm_labels)):
        v = values[firm_labels == firm]
        stats[str(firm)] = {
            'n': int(len(v)),
            'mean': float(v.mean()),
            'median': float(np.median(v)),
            'sd': float(v.std()),
            'p25': float(np.percentile(v, 25)),
            'p75': float(np.percentile(v, 75)),
            # Stored as fractions in [0, 1]; rendered as percentages in the
            # markdown report.
            'pct_le_5': float(np.mean(v <= 5)),
            'pct_gt_15': float(np.mean(v > 15)),
        }
    return stats


def _print_jitter_summary(panel):
    """Print the median p-value and rejection count of a multi-seed panel."""
    print(f' p_median={_fmt_p(panel["p_median"])}, '
          f'reject@.05 in '
          f'{panel["reject_at_05_count"]}/{panel["n_seeds"]} seeds')


def _build_report(timestamp, firm_stats, panels):
    """Return the markdown report as a list of lines."""
    jitter = panels['jitter_only']
    both = panels['centered_jittered']
    md = [
        '# dHash Firm-Residualized + Jittered Dip (Script 39e)',
        '',
        f'Generated: {timestamp}',
        f'Bootstrap replicates: {N_BOOT}; jitter seeds: {SEEDS}',
        '',
        '## Per-firm Big-4 dh summary',
        '',
        '| Firm | n | mean | median | sd | P25 | P75 | %<=5 | %>15 |',
        '|---|---|---|---|---|---|---|---|---|',
    ]
    for firm, s in firm_stats.items():
        md.append(f'| {firm} | {s["n"]:,} | {s["mean"]:.3f} | '
                  f'{s["median"]:.1f} | {s["sd"]:.3f} | '
                  f'{s["p25"]:.1f} | {s["p75"]:.1f} | '
                  f'{s["pct_le_5"]:.1%} | {s["pct_gt_15"]:.1%} |')
    md += [
        '',
        '## Dip test under four conditions (Big-4 pooled, sig-level)',
        '',
        '| Condition | dip | p (or p_median) | reject@.05 (seeds) |',
        '|---|---|---|---|',
        f'| 1. Raw (integer values) | {panels["raw"]["dip_median"]:.5f} '
        f'| {_fmt_p(panels["raw"]["p_median"])} | n/a (1 seed) |',
        f'| 2. Firm-mean centered, no jitter '
        f'| {panels["centered_only"]["dip_median"]:.5f} '
        f'| {_fmt_p(panels["centered_only"]["p_median"])} | n/a (1 seed) |',
        f'| 3. Jittered only ({jitter["n_seeds"]} seeds) '
        f'| median {jitter["dip_median"]:.5f} '
        f'| median {_fmt_p(jitter["p_median"])} '
        f'| {jitter["reject_at_05_count"]}/{jitter["n_seeds"]} |',
        f'| 4. **Centered + jittered ({both["n_seeds"]} seeds)** '
        f'| median {both["dip_median"]:.5f} '
        f'| median {_fmt_p(both["p_median"])} '
        f'| {both["reject_at_05_count"]}/{both["n_seeds"]} |',
        '',
        '## Interpretation',
        '',
        ('If Condition 4 still rejects unimodality, Big-4 dh has '
         'genuine within-population continuous multimodality '
         'independent of both between-firm location shifts and '
         'integer mass points. If Condition 4 fails to reject, the '
         'Big-4 pooled dh multimodality is fully explained by '
         '(between-firm mean shift) + (integer mass points). In the '
         'latter case, the dh axis carries no independent within-firm '
         'regime evidence beyond the cos axis.'),
        '',
    ]
    return md


def main():
    """Run the four dip-test conditions and write the JSON and markdown reports."""
    banner = '=' * 72
    print(banner)
    print('Script 39e: dHash Firm-Residualized + Jittered Dip')
    print(banner)

    rows = load_signatures()
    firms_raw = np.array([firm for firm, _ in rows])
    dh = np.array([value for _, value in rows], dtype=float)

    is_big4 = np.isin(firms_raw, BIG4)
    big4_dh = dh[is_big4]
    big4_firms = np.array([ALIAS[f] for f in firms_raw[is_big4]])
    print(f'\nLoaded {len(rows):,} signatures; Big-4 {int(is_big4.sum()):,}')
    if big4_dh.size == 0:
        raise ValueError('no Big-4 signatures found; nothing to test')

    # Computed once and reused for the console summary, JSON and markdown.
    firm_stats = _per_firm_stats(big4_dh, big4_firms)
    print('\nPer-firm Big-4 dh summary:')
    for firm, s in firm_stats.items():
        print(f' {firm}: n={s["n"]:,} mean={s["mean"]:.3f} '
              f'median={s["median"]:.1f} sd={s["sd"]:.3f}')

    # ---- Test conditions, all on Big-4 signature-level dh ----
    n_seeds = len(SEEDS)
    single_seed = SEEDS[:1]
    panels = {}

    # 1. Raw (no centering, no jitter)
    print('\n[1] Raw dh')
    r = dip_multi(big4_dh, single_seed, with_jitter=False)
    panels['raw'] = summarize('raw', r)
    print(f' dip={r[0]["dip"]:.5f}, p={_fmt_p(r[0]["p"])}')

    # 2. Centered only (no jitter; integer values preserved)
    print('\n[2] Firm-mean centered, no jitter')
    centered = firm_residualize(big4_dh, big4_firms)
    r = dip_multi(centered, single_seed, with_jitter=False)
    panels['centered_only'] = summarize('centered_only', r)
    print(f' dip={r[0]["dip"]:.5f}, p={_fmt_p(r[0]["p"])}')

    # 3. Jittered only (no centering)
    print(f'\n[3] Jittered ({n_seeds} seeds), no centering')
    r = dip_multi(big4_dh, SEEDS, with_jitter=True)
    panels['jitter_only'] = summarize('jitter_only', r)
    _print_jitter_summary(panels['jitter_only'])

    # 4. Centered + jittered (THE key test)
    print(f'\n[4] Firm-mean centered + jittered ({n_seeds} seeds) -- KEY TEST')
    r = dip_multi(centered, SEEDS, with_jitter=True)
    panels['centered_jittered'] = summarize('centered_jittered', r)
    _print_jitter_summary(panels['centered_jittered'])
    for s in r:
        print(f' seed {s["seed"]}: dip={s["dip"]:.5f}, p={_fmt_p(s["p"])}')

    results = {
        'meta': {
            'script': '39e',
            'timestamp': datetime.now().isoformat(timespec='seconds'),
            'n_big4_signatures': int(big4_dh.size),
            'n_boot': N_BOOT,
            'seeds': SEEDS,
            'note': ('Final test: does Big-4 pooled dh multimodality '
                     'survive BOTH firm-mean centering and integer-tie '
                     'jitter?'),
        },
        'panels': panels,
        'per_firm_dh_stats': firm_stats,
    }

    json_path = OUT / 'dhash_residualized_jittered_results.json'
    json_path.write_text(json.dumps(results, indent=2, ensure_ascii=False),
                         encoding='utf-8')
    print(f'\n[json] {json_path}')

    md = _build_report(results['meta']['timestamp'], firm_stats, panels)
    md_path = OUT / 'dhash_residualized_jittered_report.md'
    md_path.write_text('\n'.join(md), encoding='utf-8')
    print(f'[md ] {md_path}')


if __name__ == '__main__':
    main()