pdf_signature_extraction/signature_analysis/15_hartigan_dip_test.py

#!/usr/bin/env python3
"""
Script 15: Hartigan Dip Test for Unimodality
=============================================
Runs the proper Hartigan & Hartigan (1985) dip test via the `diptest` package
on the empirical signature-similarity distributions.

Purpose:
  Confirm/refute bimodality assumption underpinning threshold-selection methods.
  Prior finding (2026-04-16): signature-level distribution is unimodal long-tail;
  the story is that bimodality only emerges at the accountant level.

Firm A framing (2026-04-20, corrected):
  Interviews with multiple Firm A accountants confirm that MOST use
  replication (stamping / firm-level e-signing) but do NOT exclude a
  minority of hand-signers. Firm A is therefore a "replication-dominated"
  population, NOT a "pure" one. This framing is consistent with:
    - 92.5% of Firm A signatures exceed cosine 0.95
    - The long left tail (7.5% below 0.95) captures the minority
      hand-signers, not scan noise
    - Script 18: of 180 Firm A accountants, 139 cluster in C1
      (high-replication) and 32 in C2 (middle band = minority hand-signers)

Tests:
  1. Firm A (Deloitte) cosine max-similarity       -> expected UNIMODAL
  2. Firm A (Deloitte) independent min dHash       -> expected UNIMODAL
  3. Full-sample cosine max-similarity             -> test
  4. Full-sample independent min dHash             -> test
  5. Accountant-level cosine mean (per-accountant) -> expected BIMODAL / MULTIMODAL
  6. Accountant-level dhash mean (per-accountant)  -> expected BIMODAL / MULTIMODAL

Output:
  reports/dip_test/dip_test_report.md
  reports/dip_test/dip_test_results.json
"""

import sqlite3
import json
import numpy as np
import diptest
from pathlib import Path
from datetime import datetime

# Path of the SQLite database opened by every fetch_* function below.
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
# Directory that receives dip_test_results.json and dip_test_report.md.
OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/dip_test')
# NOTE: runs at import time, so merely importing this module creates the
# report directory (and any missing parents) on the target volume.
OUT.mkdir(parents=True, exist_ok=True)

# Value matched against accountants.firm to select "Firm A" (labelled
# "Deloitte" in the module docstring and console output).
FIRM_A = '勤業眾信聯合'


def run_dip(values, label, n_boot=2000):
    """Apply the Hartigan dip test to one sample and summarise the outcome.

    Args:
        values: Array-like of numbers; converted to a float array and
            stripped of NaN entries before testing.
        label: Human-readable name of the sample, echoed in the result.
        n_boot: Bootstrap replicate count forwarded to ``diptest.diptest``.

    Returns:
        A dict. With fewer than four non-NaN observations it holds only
        ``label``, ``n`` and ``error``. Otherwise it holds ``label``, ``n``,
        descriptive statistics (``mean``, ``std``, ``min``, ``max``), the
        ``dip`` statistic, its ``p_value``, ``n_boot`` and
        ``verdict_alpha_05``.
    """
    sample = np.asarray(values, dtype=float)
    sample = sample[np.logical_not(np.isnan(sample))]
    n_obs = int(len(sample))

    # Guard: samples this small are reported as an error, not tested.
    if n_obs < 4:
        return {'label': label, 'n': n_obs, 'error': 'too few observations'}

    dip_stat, p_value = diptest.diptest(sample, boot_pval=True, n_boot=n_boot)

    # Fixed 5% level; "accept H0" is the script's label for "fail to reject".
    if p_value > 0.05:
        verdict = 'UNIMODAL (accept H0)'
    else:
        verdict = 'MULTIMODAL (reject H0)'

    summary = {'label': label, 'n': n_obs}
    summary['mean'] = float(sample.mean())
    summary['std'] = float(sample.std())  # population std (ddof=0)
    summary['min'] = float(sample.min())
    summary['max'] = float(sample.max())
    summary['dip'] = float(dip_stat)
    summary['p_value'] = float(p_value)
    summary['n_boot'] = int(n_boot)
    summary['verdict_alpha_05'] = verdict
    return summary


def fetch_firm_a():
    """Load signature-level cosine and dHash values for Firm A.

    Joins ``signatures`` to ``accountants`` on the accountant name, keeps
    rows whose firm equals ``FIRM_A`` and whose
    ``max_similarity_to_same_accountant`` is not NULL.

    Returns:
        ``(cos, dh)`` as two numpy arrays. ``cos`` has one entry per selected
        row. ``dh`` keeps only rows whose ``min_dhash_independent`` is not
        NULL, so it may be shorter than ``cos`` and is not row-aligned
        with it.
    """
    # NOTE(review): if accountants.name is not unique, this join repeats
    # signature rows once per matching accountant row -- confirm the schema.
    conn = sqlite3.connect(DB)
    try:
        rows = conn.execute('''
            SELECT s.max_similarity_to_same_accountant,
                   s.min_dhash_independent
            FROM signatures s
            JOIN accountants a ON s.assigned_accountant = a.name
            WHERE a.firm = ?
              AND s.max_similarity_to_same_accountant IS NOT NULL
        ''', (FIRM_A,)).fetchall()
    finally:
        # Release the database handle even when the query raises.
        conn.close()

    # The WHERE clause already guarantees column 0 is non-NULL; only the
    # dHash column needs filtering here.
    cos = [r[0] for r in rows]
    dh = [r[1] for r in rows if r[1] is not None]
    return np.array(cos), np.array(dh)


def fetch_full_sample():
    """Load signature-level cosine and dHash values for the whole table.

    Selects every ``signatures`` row whose
    ``max_similarity_to_same_accountant`` is not NULL.

    Returns:
        ``(cos, dh)`` as two numpy arrays. ``cos`` has one entry per selected
        row. ``dh`` keeps only rows whose ``min_dhash_independent`` is not
        NULL, so it may be shorter than ``cos`` and is not row-aligned
        with it.
    """
    conn = sqlite3.connect(DB)
    try:
        rows = conn.execute('''
            SELECT max_similarity_to_same_accountant, min_dhash_independent
            FROM signatures
            WHERE max_similarity_to_same_accountant IS NOT NULL
        ''').fetchall()
    finally:
        # Release the database handle even when the query raises.
        conn.close()

    # Column 0 is non-NULL by construction of the WHERE clause; only the
    # dHash column can still contain NULLs.
    cos = np.array([r[0] for r in rows])
    dh = np.array([r[1] for r in rows if r[1] is not None])
    return cos, dh


def fetch_accountant_aggregates(min_sigs=10):
    """Compute per-accountant mean cosine and mean independent dHash.

    Only signatures with an assigned accountant and with BOTH metrics
    non-NULL contribute, so each accountant's two means are computed over
    the same set of signatures.

    Args:
        min_sigs: Minimum number of contributing signatures an accountant
            must have to be included.

    Returns:
        ``(cos_means, dh_means, n_accountants)``: two row-aligned numpy
        arrays with one entry per retained accountant, and the number of
        retained accountants.
    """
    conn = sqlite3.connect(DB)
    try:
        rows = conn.execute('''
            SELECT s.assigned_accountant,
                   AVG(s.max_similarity_to_same_accountant) AS cos_mean,
                   AVG(CAST(s.min_dhash_independent AS REAL)) AS dh_mean,
                   COUNT(*) AS n
            FROM signatures s
            WHERE s.assigned_accountant IS NOT NULL
              AND s.max_similarity_to_same_accountant IS NOT NULL
              AND s.min_dhash_independent IS NOT NULL
            GROUP BY s.assigned_accountant
            HAVING n >= ?
        ''', (min_sigs,)).fetchall()
    finally:
        # Release the database handle even when the query raises.
        conn.close()

    # Row layout: (accountant, cos_mean, dh_mean, n).
    cos_means = np.array([r[1] for r in rows])
    dh_means = np.array([r[2] for r in rows])
    return cos_means, dh_means, len(rows)


def main():
    """Run the six dip tests, print a summary, and write both reports.

    Steps:
      1. Fetch Firm A, full-sample and per-accountant data and run
         ``run_dip`` on the cosine and dHash series of each (six results).
      2. Print a fixed-width summary table to stdout.
      3. Write ``dip_test_results.json`` and ``dip_test_report.md`` into
         ``OUT``.

    Results that carry an ``error`` key (too few observations) are shown in
    both the console summary and the Markdown table rather than dropped.
    """
    print('='*70)
    print('Script 15: Hartigan Dip Test for Unimodality')
    print('='*70)

    # One timestamp for the whole run so the JSON and Markdown artefacts
    # cannot disagree about when they were generated.
    generated = datetime.now()

    results = {}

    # Firm A
    print('\n[1/3] Firm A (Deloitte)...')
    fa_cos, fa_dh = fetch_firm_a()
    print(f'  Firm A cosine N={len(fa_cos):,}, dHash N={len(fa_dh):,}')
    results['firm_a_cosine'] = run_dip(fa_cos, 'Firm A cosine max-similarity')
    results['firm_a_dhash'] = run_dip(fa_dh, 'Firm A independent min dHash')

    # Full sample
    print('\n[2/3] Full sample...')
    all_cos, all_dh = fetch_full_sample()
    print(f'  Full cosine N={len(all_cos):,}, dHash N={len(all_dh):,}')
    # Dip test on >=10k obs can be slow with 2000 boot; use 500 for full sample
    results['full_cosine'] = run_dip(all_cos, 'Full-sample cosine max-similarity',
                                     n_boot=500)
    results['full_dhash'] = run_dip(all_dh, 'Full-sample independent min dHash',
                                    n_boot=500)

    # Accountant-level aggregates
    print('\n[3/3] Accountant-level aggregates (min 10 sigs)...')
    acct_cos, acct_dh, n_acct = fetch_accountant_aggregates(min_sigs=10)
    print(f'  Accountants analyzed: {n_acct}')
    results['accountant_cos_mean'] = run_dip(acct_cos,
                                             'Per-accountant cosine mean')
    results['accountant_dh_mean'] = run_dip(acct_dh,
                                            'Per-accountant dHash mean')

    # Print summary
    print('\n' + '='*70)
    print('RESULTS SUMMARY')
    print('='*70)
    print(f"{'Test':<40} {'N':>8} {'dip':>8} {'p':>10} Verdict")
    print('-'*90)
    for r in results.values():
        if 'error' in r:
            # Surface untestable samples instead of hiding them; the
            # Markdown report lists them too.
            print(f"{r['label']:<40} {r['n']:>8,} {'-':>8} "
                  f"{'-':>10} {r['error']}")
            continue
        print(f"{r['label']:<40} {r['n']:>8,} {r['dip']:>8.4f} "
              f"{r['p_value']:>10.4f} {r['verdict_alpha_05']}")

    # Write JSON. ensure_ascii=False emits raw non-ASCII characters, so the
    # file encoding is pinned to UTF-8 instead of the platform default
    # (matching how the Markdown report is written below).
    json_path = OUT / 'dip_test_results.json'
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump({
            'generated_at': generated.isoformat(),
            'db': DB,
            'results': results,
        }, f, indent=2, ensure_ascii=False)
    print(f'\nJSON saved: {json_path}')

    # Write Markdown report
    md = [
        '# Hartigan Dip Test Report',
        f"Generated: {generated.strftime('%Y-%m-%d %H:%M:%S')}",
        '',
        '## Method',
        '',
        'Hartigan & Hartigan (1985) dip test via `diptest` Python package.',
        'H0: distribution is unimodal. H1: multimodal (two or more modes).',
        'p-value computed by bootstrap against a uniform null (2000 reps for',
        'Firm A/accountant-level, 500 reps for full-sample due to size).',
        '',
        '## Results',
        '',
        '| Test | N | dip | p-value | Verdict (α=0.05) |',
        '|------|---|-----|---------|------------------|',
    ]
    for r in results.values():
        if 'error' in r:
            md.append(f"| {r['label']} | {r['n']} | — | — | {r['error']} |")
            continue
        md.append(
            f"| {r['label']} | {r['n']:,} | {r['dip']:.4f} | "
            f"{r['p_value']:.4f} | {r['verdict_alpha_05']} |"
        )
    md += [
        '',
        '## Interpretation',
        '',
        '* **Signature level** (Firm A + full sample): the dip test indicates',
        '  whether a single mode explains the max-cosine/min-dHash distribution.',
        '  Prior finding (2026-04-16) suggested unimodal long-tail; this script',
        '  provides the formal test.',
        '',
        '* **Accountant level** (per-accountant mean): if multimodal here but',
        '  unimodal at the signature level, this confirms the interpretation',
        "  that signing-behaviour is discrete across accountants (replication",
        '  vs hand-signing), while replication quality itself is a continuous',
        '  spectrum.',
        '',
        '## Downstream implication',
        '',
        'Methods that assume bimodality (KDE antimode, 2-component Beta mixture)',
        'should be applied at the level where dip test rejects H0. If the',
        "signature-level dip test fails to reject, the paper should report this",
        'and shift the mixture analysis to the accountant level (see Script 18).',
    ]
    md_path = OUT / 'dip_test_report.md'
    md_path.write_text('\n'.join(md), encoding='utf-8')
    print(f'Report saved: {md_path}')


if __name__ == '__main__':
    main()