Add Scripts 39b/c/d/e + 40b + 43: anchor-based FAR diagnostics

Spike checkpoint in response to codex rounds 28-30 review:

- 39b/c: signature-level dip test on Big-4 and non-Big-4 marginals
- 39d: dHash discrete-value robustness (raw vs jittered + histogram valleys + firm residualization); confirms within-firm dHash dip rejection is integer-mass-point artefact
- 39e: dHash firm-residualized + jittered 2x2 factorial decomposition; confirms Big-4 pooled dh "multimodality" is composition + integer artefact (centered + jittered p=0.35, 0/5 seeds reject)
- 40b: inter-CPA per-pair FAR sweep (cos + dh marginal + joint + conditional); replicates v3 cos>0.95 FAR=0.0006 and provides v4-new dh FAR curve
- 43: pool-normalized per-signature FAR (codex round-30 fix for per-pair vs per-signature conflation); per-sig FAR for deployed any-pair rule = 11.02%, per-firm structure shows Firm A 20% vs B/C/D <1%

These scripts replace the distributional path (K=3 mixture / dip / antimode) with anchor-based threshold derivation.

Companion artefacts in reports/v4_big4/{signature_level_diptest, midsmall_signature_diptest, dhash_discrete_robustness, inter_cpa_far_sweep, pool_normalized_far}/.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 14:08:49 +08:00
parent 6db5d635f5
commit d4f370bd5e
6 changed files with 2086 additions and 0 deletions
@@ -0,0 +1,195 @@
+#!/usr/bin/env python3
+"""
+Script 39b: Signature-Level Dip Test (multimodality at the signature cloud)
+============================================================================
+Phase 5 pre-emptive evidence. Script 34 / 36 already report Hartigan
+dip tests on the 437 accountant-level (cos_mean, dh_mean) means and
+both marginals reject unimodality at p < 5e-4. Reviewers may ask
+whether the same multimodality is detectable at the signature level
+itself (n = 150,442 Big-4 signatures) and whether the multimodality
+is a within-firm or only a between-firm phenomenon.
+
+This script supplies the missing dip evidence on the raw signature
+cloud. It is a *diagnostic* in the same role as Scripts 34/36 dip
+tests: it does not derive an operational threshold; it characterises
+the marginal distributions of (cos, dh_indep) at the signature level.
+
+Outputs:
+  reports/v4_big4/signature_level_diptest/
+    sig_diptest_results.json
+    sig_diptest_report.md
+
+Tests performed:
+  A. Pooled Big-4 marginals (cos, dh_indep), n = 150,442
+  B. Per-firm marginals (Firm A / B / C / D separately)
+"""
+
+import json
+import sqlite3
+import numpy as np
+import diptest
+from pathlib import Path
+from datetime import datetime
+from scipy import stats
+from scipy.signal import find_peaks
+
+# Absolute, machine-specific locations of the input SQLite database and of
+# the directory that receives this script's JSON and Markdown reports.
+DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
+OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/'
+           'v4_big4/signature_level_diptest')
+# NOTE: executed at import time, so merely importing this module creates the
+# output directory (and any missing parents).
+OUT.mkdir(parents=True, exist_ok=True)
+
+# Firm names as stored in accountants.firm; passed as the four bound
+# parameters of the IN (...) filter in load_big4_signatures().
+BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合')
+# Anonymised labels used in place of the firm names in all console, JSON and
+# Markdown output.
+ALIAS = {'勤業眾信聯合': 'Firm A',
+         '安侯建業聯合': 'Firm B',
+         '資誠聯合': 'Firm C',
+         '安永聯合': 'Firm D'}
+# Default number of bootstrap replicates for the dip-test p-value.
+N_BOOT = 2000
+
+
+def load_big4_signatures():
+    conn = sqlite3.connect(DB)
+    cur = conn.cursor()
+    cur.execute('''
+        SELECT s.assigned_accountant, a.firm,
+               s.max_similarity_to_same_accountant,
+               CAST(s.min_dhash_independent AS REAL)
+        FROM signatures s
+        JOIN accountants a ON s.assigned_accountant = a.name
+        WHERE s.assigned_accountant IS NOT NULL
+          AND s.max_similarity_to_same_accountant IS NOT NULL
+          AND s.min_dhash_independent IS NOT NULL
+          AND a.firm IN (?, ?, ?, ?)
+    ''', BIG4)
+    rows = cur.fetchall()
+    conn.close()
+    return rows
+
+
+def kde_dip(values, n_boot=N_BOOT):
+    arr = np.asarray(values, dtype=float)
+    arr = arr[np.isfinite(arr)]
+    dip, pval = diptest.diptest(arr, boot_pval=True, n_boot=n_boot)
+    kde = stats.gaussian_kde(arr, bw_method='silverman')
+    xs = np.linspace(arr.min(), arr.max(), 2000)
+    density = kde(xs)
+    peaks, _ = find_peaks(density, prominence=density.max() * 0.02)
+    antimodes = []
+    for i in range(len(peaks) - 1):
+        seg = density[peaks[i]:peaks[i + 1]]
+        if not len(seg):
+            continue
+        local = peaks[i] + int(np.argmin(seg))
+        antimodes.append(float(xs[local]))
+    return {
+        'n': int(len(arr)),
+        'dip': float(dip),
+        'dip_pvalue': float(pval),
+        'unimodal_alpha05': bool(pval > 0.05),
+        'n_modes': int(len(peaks)),
+        'mode_locations': [float(xs[p]) for p in peaks],
+        'antimodes': antimodes,
+        'n_boot': int(n_boot),
+    }
+
+
+def _fmt_p(p):
+    if p == 0.0:
+        return '< 5e-4 (no bootstrap replicate exceeded observed dip)'
+    return f'{p:.4g}'
+
+
+def main():
+    print('=' * 72)
+    print('Script 39b: Signature-Level Dip Test')
+    print('=' * 72)
+    rows = load_big4_signatures()
+    cos_all = np.array([r[2] for r in rows], dtype=float)
+    dh_all = np.array([r[3] for r in rows], dtype=float)
+    firms = np.array([ALIAS[r[1]] for r in rows])
+    print(f'\nLoaded {len(rows):,} Big-4 signatures')
+    for f in sorted(set(firms)):
+        print(f'  {f}: {(firms == f).sum():,}')
+
+    results = {
+        'meta': {
+            'script': '39b',
+            'timestamp': datetime.now().isoformat(timespec='seconds'),
+            'n_total': int(len(rows)),
+            'n_boot': N_BOOT,
+            'note': ('Signature-level Hartigan dip test on Big-4 '
+                     '(cos, dh_indep) marginals; pooled and per-firm.'),
+        },
+        'pooled': {},
+        'per_firm': {},
+    }
+
+    # A. Pooled
+    print('\n[A] Pooled Big-4')
+    for desc, arr in [('cos', cos_all), ('dh_indep', dh_all)]:
+        r = kde_dip(arr)
+        results['pooled'][desc] = r
+        print(f'  {desc}: n={r["n"]:,}, dip={r["dip"]:.5f}, '
+              f'p={_fmt_p(r["dip_pvalue"])}, n_modes={r["n_modes"]}')
+
+    # B. Per-firm
+    print('\n[B] Per-firm')
+    for f in sorted(set(firms)):
+        mask = firms == f
+        results['per_firm'][f] = {}
+        for desc, arr in [('cos', cos_all[mask]), ('dh_indep', dh_all[mask])]:
+            r = kde_dip(arr)
+            results['per_firm'][f][desc] = r
+            print(f'  {f} {desc}: n={r["n"]:,}, dip={r["dip"]:.5f}, '
+                  f'p={_fmt_p(r["dip_pvalue"])}, n_modes={r["n_modes"]}')
+
+    json_path = OUT / 'sig_diptest_results.json'
+    json_path.write_text(json.dumps(results, indent=2, ensure_ascii=False),
+                         encoding='utf-8')
+    print(f'\n[json] {json_path}')
+
+    md = ['# Signature-Level Dip Test (Script 39b)',
+          '',
+          f'Generated: {results["meta"]["timestamp"]}',
+          f'Bootstrap replicates: {N_BOOT}',
+          '',
+          '## A. Pooled Big-4 signature cloud',
+          '',
+          f'n = {results["meta"]["n_total"]:,} signatures',
+          '',
+          '| Marginal | dip | p (boot) | n_modes | unimodal @0.05 |',
+          '|---|---|---|---|---|']
+    for desc in ['cos', 'dh_indep']:
+        r = results['pooled'][desc]
+        md.append(f'| {desc} | {r["dip"]:.5f} | {_fmt_p(r["dip_pvalue"])} | '
+                  f'{r["n_modes"]} | {r["unimodal_alpha05"]} |')
+
+    md += ['', '## B. Per-firm signature-level dip tests', '',
+           '| Firm | Marginal | n | dip | p (boot) | n_modes | unimodal @0.05 |',
+           '|---|---|---|---|---|---|---|']
+    for f in sorted(results['per_firm']):
+        for desc in ['cos', 'dh_indep']:
+            r = results['per_firm'][f][desc]
+            md.append(f'| {f} | {desc} | {r["n"]:,} | {r["dip"]:.5f} | '
+                      f'{_fmt_p(r["dip_pvalue"])} | {r["n_modes"]} | '
+                      f'{r["unimodal_alpha05"]} |')
+    md += ['',
+           '## Reading guide',
+           '',
+           ('A unimodality rejection at the signature level confirms '
+            'multimodal structure independent of accountant-level '
+            'aggregation. A within-firm rejection further indicates the '
+            'multimodality is not solely a between-firm artefact. A '
+            'within-firm non-rejection (e.g., Firm A) is consistent with '
+            'that firm being concentrated in a single mechanism corner.'),
+           '',
+           ('All thresholds and operational classifiers remain those of '
+            'v3.x §III-K and v4.0 §III-J; this script supplies diagnostic '
+            'evidence only.'),
+           '']
+    md_path = OUT / 'sig_diptest_report.md'
+    md_path.write_text('\n'.join(md), encoding='utf-8')
+    print(f'[md  ] {md_path}')
+
+
+# Run the analysis only when this file is executed as a script, not when it
+# is imported.
+if __name__ == '__main__':
+    main()