#!/usr/bin/env python3
"""
Script 15: Hartigan Dip Test for Unimodality
=============================================
Runs the proper Hartigan & Hartigan (1985) dip test via the `diptest` package
on the empirical signature-similarity distributions.

Purpose:
  Confirm/refute bimodality assumption underpinning threshold-selection
  methods. Prior finding (2026-04-16): signature-level distribution is
  unimodal long-tail; the story is that bimodality only emerges at the
  accountant level.

Tests:
  1. Firm A (Deloitte) cosine max-similarity      -> expected UNIMODAL
  2. Firm A (Deloitte) independent min dHash      -> expected UNIMODAL
  3. Full-sample cosine max-similarity            -> test
  4. Full-sample independent min dHash            -> test
  5. Accountant-level cosine mean (per-accountant)
                                     -> expected BIMODAL / MULTIMODAL
  6. Accountant-level dhash mean (per-accountant)
                                     -> expected BIMODAL / MULTIMODAL

Output:
  reports/dip_test/dip_test_report.md
  reports/dip_test/dip_test_results.json
"""

import json
import sqlite3
from contextlib import closing
from datetime import datetime
from pathlib import Path

import diptest
import numpy as np

DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/dip_test')
OUT.mkdir(parents=True, exist_ok=True)

FIRM_A = '勤業眾信聯合'

# Bootstrap replicate counts. The Markdown "Method" section is rendered from
# these so the report text cannot drift from what was actually run.
DEFAULT_N_BOOT = 2000
FULL_SAMPLE_N_BOOT = 500


def run_dip(values, label, n_boot=DEFAULT_N_BOOT):
    """Run the Hartigan dip test on *values* and return a structured result.

    Args:
        values: Array-like of numbers. Non-finite entries (NaN, +/-inf) are
            dropped before testing.
        label: Human-readable name stored in the result and shown in reports.
        n_boot: Number of bootstrap replicates used for the p-value.

    Returns:
        dict: With fewer than 4 usable observations, a dict holding only
        ``label``, ``n`` and ``error``. Otherwise ``label``, ``n``, summary
        statistics (``mean``, ``std``, ``min``, ``max``), ``dip``,
        ``p_value``, ``n_boot`` and ``verdict_alpha_05``. All numbers are
        plain Python ``int``/``float`` so the dict is JSON-serialisable.
    """
    arr = np.asarray(values, dtype=float)
    # isfinite (rather than only ~isnan) also removes infinities, which
    # would otherwise end up as invalid `Infinity` tokens in the JSON output.
    arr = arr[np.isfinite(arr)]
    n_obs = int(arr.size)
    if n_obs < 4:
        return {'label': label, 'n': n_obs, 'error': 'too few observations'}

    dip, pval = diptest.diptest(arr, boot_pval=True, n_boot=n_boot)
    verdict = ('UNIMODAL (accept H0)' if pval > 0.05
               else 'MULTIMODAL (reject H0)')
    return {
        'label': label,
        'n': n_obs,
        'mean': float(np.mean(arr)),
        'std': float(np.std(arr)),
        'min': float(np.min(arr)),
        'max': float(np.max(arr)),
        'dip': float(dip),
        'p_value': float(pval),
        'n_boot': int(n_boot),
        'verdict_alpha_05': verdict,
    }


def _fetch_rows(sql, params=()):
    """Execute one query against ``DB`` and return every row.

    The connection is closed even when the query raises. Note that using a
    sqlite3 connection directly as a context manager only manages the
    transaction; ``closing`` is what guarantees the handle is released.
    """
    with closing(sqlite3.connect(DB)) as conn:
        return conn.execute(sql, params).fetchall()


def fetch_firm_a():
    """Return ``(cosine, dhash)`` arrays for signatures assigned to FIRM_A.

    Rows are restricted to signatures with a non-NULL cosine value. NULL
    dHash values are dropped from the dHash array only, so the two arrays
    may differ in length.
    """
    rows = _fetch_rows(
        '''
        SELECT s.max_similarity_to_same_accountant,
               s.min_dhash_independent
        FROM signatures s
        JOIN accountants a ON s.assigned_accountant = a.name
        WHERE a.firm = ?
          AND s.max_similarity_to_same_accountant IS NOT NULL
        ''',
        (FIRM_A,),
    )
    cos = [r[0] for r in rows if r[0] is not None]
    dh = [r[1] for r in rows if r[1] is not None]
    return np.array(cos), np.array(dh)


def fetch_full_sample():
    """Return ``(cosine, dhash)`` arrays for every signature in the database.

    Same filtering as ``fetch_firm_a`` but without the firm restriction.
    """
    rows = _fetch_rows(
        '''
        SELECT max_similarity_to_same_accountant,
               min_dhash_independent
        FROM signatures
        WHERE max_similarity_to_same_accountant IS NOT NULL
        '''
    )
    cos = np.array([r[0] for r in rows if r[0] is not None])
    dh = np.array([r[1] for r in rows if r[1] is not None])
    return cos, dh


def fetch_accountant_aggregates(min_sigs=10):
    """Per-accountant mean cosine and mean independent dHash.

    Args:
        min_sigs: Minimum number of signatures (with both metrics non-NULL)
            an accountant needs in order to be included.

    Returns:
        tuple: ``(cos_means, dh_means, n_accountants)`` where the two arrays
        hold one entry per included accountant.
    """
    rows = _fetch_rows(
        '''
        SELECT s.assigned_accountant,
               AVG(s.max_similarity_to_same_accountant) AS cos_mean,
               AVG(CAST(s.min_dhash_independent AS REAL)) AS dh_mean,
               COUNT(*) AS n
        FROM signatures s
        WHERE s.assigned_accountant IS NOT NULL
          AND s.max_similarity_to_same_accountant IS NOT NULL
          AND s.min_dhash_independent IS NOT NULL
        GROUP BY s.assigned_accountant
        HAVING n >= ?
        ''',
        (min_sigs,),
    )
    cos_means = np.array([r[1] for r in rows])
    dh_means = np.array([r[2] for r in rows])
    return cos_means, dh_means, len(rows)


def main():
    """Run all six dip tests, print a summary, and write JSON + Markdown."""
    print('='*70)
    print('Script 15: Hartigan Dip Test for Unimodality')
    print('='*70)

    # One timestamp for both output files so they always agree.
    generated = datetime.now()
    results = {}

    # Firm A
    print('\n[1/3] Firm A (Deloitte)...')
    fa_cos, fa_dh = fetch_firm_a()
    print(f' Firm A cosine N={len(fa_cos):,}, dHash N={len(fa_dh):,}')
    results['firm_a_cosine'] = run_dip(fa_cos, 'Firm A cosine max-similarity')
    results['firm_a_dhash'] = run_dip(fa_dh, 'Firm A independent min dHash')

    # Full sample
    print('\n[2/3] Full sample...')
    all_cos, all_dh = fetch_full_sample()
    print(f' Full cosine N={len(all_cos):,}, dHash N={len(all_dh):,}')
    # Dip test on >=10k obs can be slow with the default replicate count, so
    # the full sample uses the smaller FULL_SAMPLE_N_BOOT.
    results['full_cosine'] = run_dip(
        all_cos, 'Full-sample cosine max-similarity',
        n_boot=FULL_SAMPLE_N_BOOT)
    results['full_dhash'] = run_dip(
        all_dh, 'Full-sample independent min dHash',
        n_boot=FULL_SAMPLE_N_BOOT)

    # Accountant-level aggregates
    print('\n[3/3] Accountant-level aggregates (min 10 sigs)...')
    acct_cos, acct_dh, n_acct = fetch_accountant_aggregates(min_sigs=10)
    print(f' Accountants analyzed: {n_acct}')
    results['accountant_cos_mean'] = run_dip(
        acct_cos, 'Per-accountant cosine mean')
    results['accountant_dh_mean'] = run_dip(
        acct_dh, 'Per-accountant dHash mean')

    # Print summary
    print('\n' + '='*70)
    print('RESULTS SUMMARY')
    print('='*70)
    print(f"{'Test':<40} {'N':>8} {'dip':>8} {'p':>10} Verdict")
    print('-'*90)
    for r in results.values():
        if 'error' in r:
            # Show failed tests instead of silently skipping them, so the
            # console agrees with the Markdown report.
            print(f"{r['label']:<40} {r['n']:>8,} {'n/a':>8} "
                  f"{'n/a':>10} {r['error']}")
            continue
        print(f"{r['label']:<40} {r['n']:>8,} {r['dip']:>8.4f} "
              f"{r['p_value']:>10.4f} {r['verdict_alpha_05']}")

    # Write JSON. Explicit UTF-8 because ensure_ascii=False emits raw
    # non-ASCII characters, which the platform default codec may reject.
    json_path = OUT / 'dip_test_results.json'
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump({
            'generated_at': generated.isoformat(),
            'db': DB,
            'results': results,
        }, f, indent=2, ensure_ascii=False)
    print(f'\nJSON saved: {json_path}')

    # Write Markdown report
    md = [
        '# Hartigan Dip Test Report',
        f"Generated: {generated.strftime('%Y-%m-%d %H:%M:%S')}",
        '',
        '## Method',
        '',
        'Hartigan & Hartigan (1985) dip test via `diptest` Python package.',
        'H0: distribution is unimodal. H1: multimodal (two or more modes).',
        'p-value computed by bootstrap against a uniform null '
        f'({DEFAULT_N_BOOT} reps for',
        f'Firm A/accountant-level, {FULL_SAMPLE_N_BOOT} reps for '
        'full-sample due to size).',
        '',
        '## Results',
        '',
        '| Test | N | dip | p-value | Verdict (α=0.05) |',
        '|------|---|-----|---------|------------------|',
    ]
    for r in results.values():
        if 'error' in r:
            md.append(f"| {r['label']} | {r['n']} | — | — | {r['error']} |")
            continue
        md.append(
            f"| {r['label']} | {r['n']:,} | {r['dip']:.4f} | "
            f"{r['p_value']:.4f} | {r['verdict_alpha_05']} |"
        )
    md += [
        '',
        '## Interpretation',
        '',
        '* **Signature level** (Firm A + full sample): the dip test indicates',
        ' whether a single mode explains the max-cosine/min-dHash distribution.',
        ' Prior finding (2026-04-16) suggested unimodal long-tail; this script',
        ' provides the formal test.',
        '',
        '* **Accountant level** (per-accountant mean): if multimodal here but',
        ' unimodal at the signature level, this confirms the interpretation',
        " that signing-behaviour is discrete across accountants (replication",
        ' vs hand-signing), while replication quality itself is a continuous',
        ' spectrum.',
        '',
        '## Downstream implication',
        '',
        'Methods that assume bimodality (KDE antimode, 2-component Beta mixture)',
        'should be applied at the level where dip test rejects H0. If the',
        "signature-level dip test fails to reject, the paper should report this",
        'and shift the mixture analysis to the accountant level (see Script 18).',
    ]
    md_path = OUT / 'dip_test_report.md'
    md_path.write_text('\n'.join(md), encoding='utf-8')
    print(f'Report saved: {md_path}')


if __name__ == '__main__':
    main()