#!/usr/bin/env python3 """ Script 20: Three-Method Threshold Determination at the Accountant Level ======================================================================= Completes the three-method convergent framework at the analysis level where the mixture structure is statistically supported (per Script 15 dip test: accountant cos_mean p<0.001). Runs on the per-accountant aggregates (mean best-match cosine, mean independent minimum dHash) for 686 CPAs with >=10 signatures: Method 1: KDE antimode with Hartigan dip test (formal unimodality test) Method 2: Burgstahler-Dichev / McCrary discontinuity Method 3: 2-component Beta mixture via EM + parallel logit-GMM Also re-runs the accountant-level 2-component GMM crossings from Script 18 for completeness and side-by-side comparison. Output: reports/accountant_three_methods/accountant_three_methods_report.md reports/accountant_three_methods/accountant_three_methods_results.json reports/accountant_three_methods/accountant_cos_panel.png reports/accountant_three_methods/accountant_dhash_panel.png """ import sqlite3 import json import numpy as np import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt from pathlib import Path from datetime import datetime from scipy import stats from scipy.signal import find_peaks from scipy.optimize import brentq from sklearn.mixture import GaussianMixture import diptest DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db' OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/' 'accountant_three_methods') OUT.mkdir(parents=True, exist_ok=True) EPS = 1e-6 Z_CRIT = 1.96 MIN_SIGS = 10 def load_accountant_means(min_sigs=MIN_SIGS): conn = sqlite3.connect(DB) cur = conn.cursor() cur.execute(''' SELECT s.assigned_accountant, AVG(s.max_similarity_to_same_accountant) AS cos_mean, AVG(CAST(s.min_dhash_independent AS REAL)) AS dh_mean, COUNT(*) AS n FROM signatures s WHERE s.assigned_accountant IS NOT NULL AND s.max_similarity_to_same_accountant IS NOT NULL AND s.min_dhash_independent IS NOT NULL GROUP BY s.assigned_accountant HAVING n >= ? ''', (min_sigs,)) rows = cur.fetchall() conn.close() cos = np.array([r[1] for r in rows]) dh = np.array([r[2] for r in rows]) return cos, dh # ---------- Method 1: KDE antimode with dip test ---------- def method_kde_antimode(values, name): arr = np.asarray(values, dtype=float) arr = arr[np.isfinite(arr)] dip, pval = diptest.diptest(arr, boot_pval=True, n_boot=2000) kde = stats.gaussian_kde(arr, bw_method='silverman') xs = np.linspace(arr.min(), arr.max(), 2000) density = kde(xs) # Find modes (local maxima) and antimodes (local minima) peaks, _ = find_peaks(density, prominence=density.max() * 0.02) # Antimodes = local minima between peaks antimodes = [] for i in range(len(peaks) - 1): seg = density[peaks[i]:peaks[i + 1]] if len(seg) == 0: continue local = peaks[i] + int(np.argmin(seg)) antimodes.append(float(xs[local])) # Sensitivity analysis across bandwidth factors sens = {} for bwf in [0.5, 0.75, 1.0, 1.5, 2.0]: kde_s = stats.gaussian_kde(arr, bw_method=kde.factor * bwf) d_s = kde_s(xs) p_s, _ = find_peaks(d_s, prominence=d_s.max() * 0.02) sens[f'bw_x{bwf}'] = int(len(p_s)) return { 'name': name, 'n': int(len(arr)), 'dip': float(dip), 'dip_pvalue': float(pval), 'unimodal_alpha05': bool(pval > 0.05), 'kde_bandwidth_silverman': float(kde.factor), 'n_modes': int(len(peaks)), 'mode_locations': [float(xs[p]) for p in peaks], 'antimodes': antimodes, 'primary_antimode': (antimodes[0] if antimodes else None), 'bandwidth_sensitivity_n_modes': sens, } # ---------- Method 2: Burgstahler-Dichev / McCrary ---------- def method_bd_mccrary(values, bin_width, direction, name): arr = np.asarray(values, dtype=float) arr = arr[np.isfinite(arr)] lo = float(np.floor(arr.min() / bin_width) * bin_width) hi = float(np.ceil(arr.max() / bin_width) * bin_width) edges = np.arange(lo, hi + bin_width, bin_width) counts, _ = np.histogram(arr, bins=edges) centers = (edges[:-1] + edges[1:]) / 2.0 N = counts.sum() p = counts / N if N else counts.astype(float) n_bins = len(counts) z = np.full(n_bins, np.nan) expected = np.full(n_bins, np.nan) for i in range(1, n_bins - 1): p_lo = p[i - 1] p_hi = p[i + 1] exp_i = 0.5 * (counts[i - 1] + counts[i + 1]) var_i = (N * p[i] * (1 - p[i]) + 0.25 * N * (p_lo + p_hi) * (1 - p_lo - p_hi)) if var_i <= 0: continue z[i] = (counts[i] - exp_i) / np.sqrt(var_i) expected[i] = exp_i # Identify transitions transitions = [] for i in range(1, len(z)): if np.isnan(z[i - 1]) or np.isnan(z[i]): continue ok = False if direction == 'neg_to_pos' and z[i - 1] < -Z_CRIT and z[i] > Z_CRIT: ok = True elif direction == 'pos_to_neg' and z[i - 1] > Z_CRIT and z[i] < -Z_CRIT: ok = True if ok: transitions.append({ 'threshold_between': float((centers[i - 1] + centers[i]) / 2.0), 'z_before': float(z[i - 1]), 'z_after': float(z[i]), }) best = None if transitions: best = max(transitions, key=lambda t: abs(t['z_before']) + abs(t['z_after'])) return { 'name': name, 'n': int(len(arr)), 'bin_width': float(bin_width), 'direction': direction, 'n_transitions': len(transitions), 'transitions': transitions, 'best_transition': best, 'threshold': (best['threshold_between'] if best else None), 'bin_centers': [float(c) for c in centers], 'counts': [int(c) for c in counts], 'z_scores': [None if np.isnan(zi) else float(zi) for zi in z], } # ---------- Method 3: Beta mixture + logit-GMM ---------- def fit_beta_mixture_em(x, K=2, max_iter=300, tol=1e-6, seed=42): rng = np.random.default_rng(seed) x = np.clip(np.asarray(x, dtype=float), EPS, 1 - EPS) n = len(x) q = np.linspace(0, 1, K + 1) thresh = np.quantile(x, q[1:-1]) labels = np.digitize(x, thresh) resp = np.zeros((n, K)) resp[np.arange(n), labels] = 1.0 ll_hist = [] for it in range(max_iter): nk = resp.sum(axis=0) + 1e-12 weights = nk / nk.sum() mus = (resp * x[:, None]).sum(axis=0) / nk var_num = (resp * (x[:, None] - mus) ** 2).sum(axis=0) vars_ = var_num / nk upper = mus * (1 - mus) - 1e-9 vars_ = np.minimum(vars_, upper) vars_ = np.maximum(vars_, 1e-9) factor = np.maximum(mus * (1 - mus) / vars_ - 1, 1e-6) alphas = mus * factor betas = (1 - mus) * factor log_pdfs = np.column_stack([ stats.beta.logpdf(x, alphas[k], betas[k]) + np.log(weights[k]) for k in range(K) ]) m = log_pdfs.max(axis=1, keepdims=True) ll = (m.ravel() + np.log(np.exp(log_pdfs - m).sum(axis=1))).sum() ll_hist.append(float(ll)) new_resp = np.exp(log_pdfs - m) new_resp = new_resp / new_resp.sum(axis=1, keepdims=True) if it > 0 and abs(ll_hist[-1] - ll_hist[-2]) < tol: resp = new_resp break resp = new_resp order = np.argsort(mus) alphas, betas, weights, mus = alphas[order], betas[order], weights[order], mus[order] k_params = 3 * K - 1 ll_final = ll_hist[-1] return { 'K': K, 'alphas': [float(a) for a in alphas], 'betas': [float(b) for b in betas], 'weights': [float(w) for w in weights], 'mus': [float(m) for m in mus], 'log_likelihood': float(ll_final), 'aic': float(2 * k_params - 2 * ll_final), 'bic': float(k_params * np.log(n) - 2 * ll_final), 'n_iter': it + 1, } def beta_crossing(fit): if fit['K'] != 2: return None a1, b1, w1 = fit['alphas'][0], fit['betas'][0], fit['weights'][0] a2, b2, w2 = fit['alphas'][1], fit['betas'][1], fit['weights'][1] def diff(x): return (w2 * stats.beta.pdf(x, a2, b2) - w1 * stats.beta.pdf(x, a1, b1)) xs = np.linspace(EPS, 1 - EPS, 2000) ys = diff(xs) changes = np.where(np.diff(np.sign(ys)) != 0)[0] if not len(changes): return None mid = 0.5 * (fit['mus'][0] + fit['mus'][1]) crossings = [] for i in changes: try: crossings.append(brentq(diff, xs[i], xs[i + 1])) except ValueError: continue if not crossings: return None return float(min(crossings, key=lambda c: abs(c - mid))) def fit_logit_gmm(x, K=2, seed=42): x = np.clip(np.asarray(x, dtype=float), EPS, 1 - EPS) z = np.log(x / (1 - x)).reshape(-1, 1) gmm = GaussianMixture(n_components=K, random_state=seed, max_iter=500).fit(z) order = np.argsort(gmm.means_.ravel()) means = gmm.means_.ravel()[order] stds = np.sqrt(gmm.covariances_.ravel())[order] weights = gmm.weights_[order] crossing = None if K == 2: m1, s1, w1 = means[0], stds[0], weights[0] m2, s2, w2 = means[1], stds[1], weights[1] def diff(z0): return (w2 * stats.norm.pdf(z0, m2, s2) - w1 * stats.norm.pdf(z0, m1, s1)) zs = np.linspace(min(m1, m2) - 2, max(m1, m2) + 2, 2000) ys = diff(zs) ch = np.where(np.diff(np.sign(ys)) != 0)[0] if len(ch): try: z_cross = brentq(diff, zs[ch[0]], zs[ch[0] + 1]) crossing = float(1 / (1 + np.exp(-z_cross))) except ValueError: pass return { 'K': K, 'means_logit': [float(m) for m in means], 'stds_logit': [float(s) for s in stds], 'weights': [float(w) for w in weights], 'means_original_scale': [float(1 / (1 + np.exp(-m))) for m in means], 'aic': float(gmm.aic(z)), 'bic': float(gmm.bic(z)), 'crossing_original': crossing, } def method_beta_mixture(values, name, is_cosine=True): arr = np.asarray(values, dtype=float) arr = arr[np.isfinite(arr)] if not is_cosine: # normalize dHash into [0,1] by dividing by 64 (max Hamming) x = arr / 64.0 else: x = arr beta2 = fit_beta_mixture_em(x, K=2) beta3 = fit_beta_mixture_em(x, K=3) cross_beta2 = beta_crossing(beta2) # Transform back to original scale for dHash if not is_cosine and cross_beta2 is not None: cross_beta2 = cross_beta2 * 64.0 gmm2 = fit_logit_gmm(x, K=2) gmm3 = fit_logit_gmm(x, K=3) if not is_cosine and gmm2.get('crossing_original') is not None: gmm2['crossing_original'] = gmm2['crossing_original'] * 64.0 return { 'name': name, 'n': int(len(x)), 'scale_transform': ('identity' if is_cosine else 'dhash/64'), 'beta_2': beta2, 'beta_3': beta3, 'bic_preferred_K': (2 if beta2['bic'] < beta3['bic'] else 3), 'beta_2_crossing_original': cross_beta2, 'logit_gmm_2': gmm2, 'logit_gmm_3': gmm3, } # ---------- Plot helpers ---------- def plot_panel(values, methods, title, out_path, bin_width=None, is_cosine=True): arr = np.asarray(values, dtype=float) fig, axes = plt.subplots(2, 1, figsize=(11, 7), gridspec_kw={'height_ratios': [3, 1]}) ax = axes[0] if bin_width is None: bins = 40 else: lo = float(np.floor(arr.min() / bin_width) * bin_width) hi = float(np.ceil(arr.max() / bin_width) * bin_width) bins = np.arange(lo, hi + bin_width, bin_width) ax.hist(arr, bins=bins, density=True, alpha=0.55, color='steelblue', edgecolor='white') # KDE overlay kde = stats.gaussian_kde(arr, bw_method='silverman') xs = np.linspace(arr.min(), arr.max(), 500) ax.plot(xs, kde(xs), 'g-', lw=1.5, label='KDE (Silverman)') # Annotate thresholds from each method colors = {'kde': 'green', 'bd': 'red', 'beta': 'purple', 'gmm2': 'orange'} for key, (val, lbl) in methods.items(): if val is None: continue ax.axvline(val, color=colors.get(key, 'gray'), lw=2, ls='--', label=f'{lbl} = {val:.4f}') ax.set_xlabel(title + ' value') ax.set_ylabel('Density') ax.set_title(title) ax.legend(fontsize=8) ax2 = axes[1] ax2.set_title('Thresholds across methods') ax2.set_xlim(ax.get_xlim()) for i, (key, (val, lbl)) in enumerate(methods.items()): if val is None: continue ax2.scatter([val], [i], color=colors.get(key, 'gray'), s=100, zorder=3) ax2.annotate(f' {lbl}: {val:.4f}', (val, i), fontsize=8, va='center') ax2.set_yticks(range(len(methods))) ax2.set_yticklabels([m for m in methods.keys()]) ax2.set_xlabel(title + ' value') ax2.grid(alpha=0.3) plt.tight_layout() fig.savefig(out_path, dpi=150) plt.close() # ---------- GMM 2-comp crossing from Script 18 ---------- def marginal_2comp_crossing(X, dim): gmm = GaussianMixture(n_components=2, covariance_type='full', random_state=42, n_init=15, max_iter=500).fit(X) means = gmm.means_ covs = gmm.covariances_ weights = gmm.weights_ m1, m2 = means[0][dim], means[1][dim] s1 = np.sqrt(covs[0][dim, dim]) s2 = np.sqrt(covs[1][dim, dim]) w1, w2 = weights[0], weights[1] def diff(x): return (w2 * stats.norm.pdf(x, m2, s2) - w1 * stats.norm.pdf(x, m1, s1)) xs = np.linspace(float(X[:, dim].min()), float(X[:, dim].max()), 2000) ys = diff(xs) ch = np.where(np.diff(np.sign(ys)) != 0)[0] if not len(ch): return None mid = 0.5 * (m1 + m2) crossings = [] for i in ch: try: crossings.append(brentq(diff, xs[i], xs[i + 1])) except ValueError: continue if not crossings: return None return float(min(crossings, key=lambda c: abs(c - mid))) def main(): print('=' * 70) print('Script 20: Three-Method Threshold at Accountant Level') print('=' * 70) cos, dh = load_accountant_means() print(f'\nN accountants (>={MIN_SIGS} sigs) = {len(cos)}') results = {} for desc, arr, bin_width, direction, is_cosine in [ ('cos_mean', cos, 0.002, 'neg_to_pos', True), ('dh_mean', dh, 0.2, 'pos_to_neg', False), ]: print(f'\n[{desc}]') m1 = method_kde_antimode(arr, f'{desc} KDE') print(f' Method 1 (KDE + dip): dip={m1["dip"]:.4f} ' f'p={m1["dip_pvalue"]:.4f} ' f'n_modes={m1["n_modes"]} ' f'antimode={m1["primary_antimode"]}') m2 = method_bd_mccrary(arr, bin_width, direction, f'{desc} BD') print(f' Method 2 (BD/McCrary): {m2["n_transitions"]} transitions, ' f'threshold={m2["threshold"]}') m3 = method_beta_mixture(arr, f'{desc} Beta', is_cosine=is_cosine) print(f' Method 3 (Beta mixture): BIC-preferred K={m3["bic_preferred_K"]}, ' f'Beta-2 crossing={m3["beta_2_crossing_original"]}, ' f'LogGMM-2 crossing={m3["logit_gmm_2"].get("crossing_original")}') # GMM 2-comp crossing (for completeness / reproduce Script 18) X = np.column_stack([cos, dh]) dim = 0 if desc == 'cos_mean' else 1 gmm2_crossing = marginal_2comp_crossing(X, dim) print(f' (Script 18 2-comp GMM marginal crossing = {gmm2_crossing})') results[desc] = { 'method_1_kde_antimode': m1, 'method_2_bd_mccrary': m2, 'method_3_beta_mixture': m3, 'script_18_gmm_2comp_crossing': gmm2_crossing, } methods_for_plot = { 'kde': (m1.get('primary_antimode'), 'KDE antimode'), 'bd': (m2.get('threshold'), 'BD/McCrary'), 'beta': (m3.get('beta_2_crossing_original'), 'Beta-2 crossing'), 'gmm2': (gmm2_crossing, 'GMM 2-comp crossing'), } png = OUT / f'accountant_{desc}_panel.png' plot_panel(arr, methods_for_plot, f'Accountant-level {desc}: three-method thresholds', png, bin_width=bin_width, is_cosine=is_cosine) print(f' plot: {png}') # Write JSON with open(OUT / 'accountant_three_methods_results.json', 'w') as f: json.dump({'generated_at': datetime.now().isoformat(), 'n_accountants': int(len(cos)), 'min_signatures': MIN_SIGS, 'results': results}, f, indent=2, ensure_ascii=False) print(f'\nJSON: {OUT / "accountant_three_methods_results.json"}') # Markdown md = [ '# Accountant-Level Three-Method Threshold Report', f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", '', f'N accountants (>={MIN_SIGS} signatures): {len(cos)}', '', '## Accountant-level cosine mean', '', '| Method | Threshold | Supporting statistic |', '|--------|-----------|----------------------|', ] r = results['cos_mean'] md.append(f"| Method 1: KDE antimode (with dip test) | " f"{r['method_1_kde_antimode']['primary_antimode']} | " f"dip={r['method_1_kde_antimode']['dip']:.4f}, " f"p={r['method_1_kde_antimode']['dip_pvalue']:.4f} " f"({'unimodal' if r['method_1_kde_antimode']['unimodal_alpha05'] else 'multimodal'}) |") md.append(f"| Method 2: Burgstahler-Dichev / McCrary | " f"{r['method_2_bd_mccrary']['threshold']} | " f"{r['method_2_bd_mccrary']['n_transitions']} transition(s) " f"at α=0.05 |") md.append(f"| Method 3: 2-component Beta mixture | " f"{r['method_3_beta_mixture']['beta_2_crossing_original']} | " f"Beta-2 BIC={r['method_3_beta_mixture']['beta_2']['bic']:.2f}, " f"Beta-3 BIC={r['method_3_beta_mixture']['beta_3']['bic']:.2f} " f"(BIC-preferred K={r['method_3_beta_mixture']['bic_preferred_K']}) |") md.append(f"| Method 3': LogGMM-2 on logit-transformed | " f"{r['method_3_beta_mixture']['logit_gmm_2'].get('crossing_original')} | " f"White 1982 quasi-MLE robustness check |") md.append(f"| Script 18 GMM 2-comp marginal crossing | " f"{r['script_18_gmm_2comp_crossing']} | full 2D mixture |") md += ['', '## Accountant-level dHash mean', '', '| Method | Threshold | Supporting statistic |', '|--------|-----------|----------------------|'] r = results['dh_mean'] md.append(f"| Method 1: KDE antimode | " f"{r['method_1_kde_antimode']['primary_antimode']} | " f"dip={r['method_1_kde_antimode']['dip']:.4f}, " f"p={r['method_1_kde_antimode']['dip_pvalue']:.4f} |") md.append(f"| Method 2: BD/McCrary | " f"{r['method_2_bd_mccrary']['threshold']} | " f"{r['method_2_bd_mccrary']['n_transitions']} transition(s) |") md.append(f"| Method 3: 2-component Beta mixture | " f"{r['method_3_beta_mixture']['beta_2_crossing_original']} | " f"Beta-2 BIC={r['method_3_beta_mixture']['beta_2']['bic']:.2f}, " f"Beta-3 BIC={r['method_3_beta_mixture']['beta_3']['bic']:.2f} |") md.append(f"| Method 3': LogGMM-2 | " f"{r['method_3_beta_mixture']['logit_gmm_2'].get('crossing_original')} | |") md.append(f"| Script 18 GMM 2-comp crossing | " f"{r['script_18_gmm_2comp_crossing']} | |") (OUT / 'accountant_three_methods_report.md').write_text('\n'.join(md), encoding='utf-8') print(f'Report: {OUT / "accountant_three_methods_report.md"}') if __name__ == '__main__': main()