Files
pdf_signature_extraction/signature_analysis/20_accountant_level_three_methods.py
gbanyan 9d19ca5a31 Paper A v3.1: apply codex peer-review fixes + add Scripts 20/21
Major fixes per codex (gpt-5.4) review:

## Structural fixes
- Fixed three-method convergence overclaim: added Script 20 to run KDE
  antimode, BD/McCrary, and Beta mixture EM on accountant-level means.
  Accountant-level 1D convergence: KDE antimode=0.973, Beta-2=0.979,
  LogGMM-2=0.976 (within ~0.006). BD/McCrary finds no transition at
  accountant level (consistent with smooth clustering, not sharp
  discontinuity).
- Disambiguated Method 1: KDE crossover (between two labeled distributions,
  used at signature all-pairs level) vs KDE antimode (single-distribution
  local minimum, used at accountant level).
- Addressed Firm A circular validation: Script 21 adds CPA-level 70/30
  held-out fold. Calibration thresholds are derived from the 70% fold only;
  held-out rates are reported with Wilson 95% CIs (e.g. cos>0.95
  held-out rate = 93.61% [93.21%-93.98%]).
- Fixed 139+32 vs 180: the split is 139/32 of 171 Firm A CPAs with >=10
  signatures (9 CPAs excluded for insufficient sample). Reconciled across
  intro, results, discussion, conclusion.
- Added document-level classification aggregation rule (worst-case signature
  label determines document label).
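
  The worst-case aggregation rule can be sketched as below; the label names
  and severity ordering are hypothetical placeholders, not the paper's exact
  vocabulary:

  ```python
  # Illustrative only: label set and severity ranking are assumptions,
  # not the labels actually used in Paper A.
  SEVERITY = {'hand_signed': 0, 'uncertain': 1, 'replicated': 2}

  def document_label(signature_labels):
      """Worst-case rule: the most severe signature label wins."""
      return max(signature_labels, key=lambda lbl: SEVERITY[lbl])
  ```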

## Pixel-identity validation strengthened
- Script 21: built ~50,000-pair inter-CPA random negative anchor (replaces
  the original n=35 same-CPA low-similarity negative set, whose Wilson CIs
  were untenably wide).
- Added Wilson 95% CI for every FAR in Table X.
- Proper EER interpolation (FAR=FRR point) in Table X.
- Softened "conservative recall" claim to "non-generalizable subset"
  language per codex feedback (byte-identical positives are a subset, not
  a representative positive class).
- Added inter-CPA stats: mean=0.762, P95=0.884, P99=0.913.
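
  The Wilson 95% intervals used for the FAR and held-out rates above follow
  the standard Wilson score formula; a minimal sketch (counts here are
  illustrative, not the paper's):

  ```python
  import math

  def wilson_ci(k, n, z=1.96):
      """Wilson score interval for a binomial proportion k/n."""
      if n == 0:
          return (0.0, 1.0)
      p = k / n
      denom = 1 + z * z / n
      center = (p + z * z / (2 * n)) / denom
      half = (z / denom) * math.sqrt(p * (1 - p) / n + z * z / (4 * n * n))
      return (center - half, center + half)
  ```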

## Terminology & sentence-level fixes
- "statistically independent methods" -> "methodologically distinct methods"
  throughout (three diagnostics on the same sample are not independent).
- "formal bimodality check" -> "unimodality test" (dip test tests H0 of
  unimodality; rejection is consistent with but not a direct test of
  bimodality).
- "Firm A near-universally non-hand-signed" -> already corrected to
  "replication-dominated" in prior commit; this commit strengthens that
  framing with explicit held-out validation.
- "discrete-behavior regimes" -> "clustered accountant-level heterogeneity"
  (BD/McCrary non-transition at accountant level rules out sharp discrete
  boundaries; the defensible claim is clustered-but-smooth).
- Softened White 1982 quasi-MLE claim (no longer framed as a guarantee).
- Fixed VLM 1.2% FP overclaim (now acknowledges the 1.2% could be VLM FP
  or YOLO FN).
- Unified "310 byte-identical signatures" language across Abstract,
  Results, Discussion (previously alternated between pairs/signatures).
- Defined min_dhash_independent explicitly in Section III-G.
- Fixed table numbering (Table XI held-out added, classification moved to
  XII, ablation to XIII).
- Explained 84,386 vs 85,042 gap (656 docs have only one signature, no
  pairwise stat).
- Made Table IX explicitly a "consistency check" not "validation"; paired
  it with Table XI held-out rates as the genuine external check.
- Defined 0.941 threshold (calibration-fold Firm A cosine P5).
- Computed 0.945 Firm A rate exactly (94.52%) instead of interpolated.
- Fixed Ref [24] Qwen2.5-VL to full IEEE format (arXiv:2502.13923).

## New artifacts
- Script 20: accountant-level three-method threshold analysis
- Script 21: expanded validation (inter-CPA anchor, held-out Firm A 70/30)
- paper/codex_review_gpt54_v3.md: preserved review feedback

Output: Paper_A_IEEE_Access_Draft_v3.docx (391 KB, rebuilt from v3.1
markdown sources).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-21 01:11:51 +08:00
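
The proper EER interpolation noted above (locating the FAR=FRR point between
sampled thresholds) can be sketched as follows, with illustrative arrays
rather than the paper's actual FAR/FRR curves:

```python
import numpy as np

def interpolated_eer(thresholds, far, frr):
    """Linearly interpolate the threshold where FAR crosses FRR."""
    d = np.asarray(far, float) - np.asarray(frr, float)
    i = int(np.where(np.diff(np.sign(d)) != 0)[0][0])  # first sign change
    t = d[i] / (d[i] - d[i + 1])                       # interpolation weight
    eer = far[i] + t * (far[i + 1] - far[i])           # equals FRR here
    thr = thresholds[i] + t * (thresholds[i + 1] - thresholds[i])
    return float(eer), float(thr)
```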


#!/usr/bin/env python3
"""
Script 20: Three-Method Threshold Determination at the Accountant Level
=======================================================================
Completes the three-method convergent framework at the analysis level
where the mixture structure is statistically supported (per Script 15
dip test: accountant cos_mean p<0.001).
Runs on the per-accountant aggregates (mean best-match cosine, mean
independent minimum dHash) for 686 CPAs with >=10 signatures:
Method 1: KDE antimode with Hartigan dip test (formal unimodality test)
Method 2: Burgstahler-Dichev / McCrary discontinuity
Method 3: 2-component Beta mixture via EM + parallel logit-GMM
Also re-runs the accountant-level 2-component GMM crossings from
Script 18 for completeness and side-by-side comparison.
Output:
reports/accountant_three_methods/accountant_three_methods_report.md
reports/accountant_three_methods/accountant_three_methods_results.json
reports/accountant_three_methods/accountant_cos_mean_panel.png
reports/accountant_three_methods/accountant_dh_mean_panel.png
"""
import sqlite3
import json
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from pathlib import Path
from datetime import datetime
from scipy import stats
from scipy.signal import find_peaks
from scipy.optimize import brentq
from sklearn.mixture import GaussianMixture
import diptest
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/'
           'accountant_three_methods')
OUT.mkdir(parents=True, exist_ok=True)
EPS = 1e-6
Z_CRIT = 1.96
MIN_SIGS = 10
def load_accountant_means(min_sigs=MIN_SIGS):
    conn = sqlite3.connect(DB)
    cur = conn.cursor()
    cur.execute('''
        SELECT s.assigned_accountant,
               AVG(s.max_similarity_to_same_accountant) AS cos_mean,
               AVG(CAST(s.min_dhash_independent AS REAL)) AS dh_mean,
               COUNT(*) AS n
        FROM signatures s
        WHERE s.assigned_accountant IS NOT NULL
          AND s.max_similarity_to_same_accountant IS NOT NULL
          AND s.min_dhash_independent IS NOT NULL
        GROUP BY s.assigned_accountant
        HAVING n >= ?
    ''', (min_sigs,))
    rows = cur.fetchall()
    conn.close()
    cos = np.array([r[1] for r in rows])
    dh = np.array([r[2] for r in rows])
    return cos, dh
# ---------- Method 1: KDE antimode with dip test ----------
def method_kde_antimode(values, name):
    arr = np.asarray(values, dtype=float)
    arr = arr[np.isfinite(arr)]
    dip, pval = diptest.diptest(arr, boot_pval=True, n_boot=2000)
    kde = stats.gaussian_kde(arr, bw_method='silverman')
    xs = np.linspace(arr.min(), arr.max(), 2000)
    density = kde(xs)
    # Find modes (local maxima) and antimodes (local minima)
    peaks, _ = find_peaks(density, prominence=density.max() * 0.02)
    # Antimodes = local minima between consecutive peaks
    antimodes = []
    for i in range(len(peaks) - 1):
        seg = density[peaks[i]:peaks[i + 1]]
        if len(seg) == 0:
            continue
        local = peaks[i] + int(np.argmin(seg))
        antimodes.append(float(xs[local]))
    # Sensitivity analysis across bandwidth factors
    sens = {}
    for bwf in [0.5, 0.75, 1.0, 1.5, 2.0]:
        kde_s = stats.gaussian_kde(arr, bw_method=kde.factor * bwf)
        d_s = kde_s(xs)
        p_s, _ = find_peaks(d_s, prominence=d_s.max() * 0.02)
        sens[f'bw_x{bwf}'] = int(len(p_s))
    return {
        'name': name,
        'n': int(len(arr)),
        'dip': float(dip),
        'dip_pvalue': float(pval),
        'unimodal_alpha05': bool(pval > 0.05),
        'kde_bandwidth_silverman': float(kde.factor),
        'n_modes': int(len(peaks)),
        'mode_locations': [float(xs[p]) for p in peaks],
        'antimodes': antimodes,
        'primary_antimode': (antimodes[0] if antimodes else None),
        'bandwidth_sensitivity_n_modes': sens,
    }
# ---------- Method 2: Burgstahler-Dichev / McCrary ----------
def method_bd_mccrary(values, bin_width, direction, name):
    arr = np.asarray(values, dtype=float)
    arr = arr[np.isfinite(arr)]
    lo = float(np.floor(arr.min() / bin_width) * bin_width)
    hi = float(np.ceil(arr.max() / bin_width) * bin_width)
    edges = np.arange(lo, hi + bin_width, bin_width)
    counts, _ = np.histogram(arr, bins=edges)
    centers = (edges[:-1] + edges[1:]) / 2.0
    N = counts.sum()
    p = counts / N if N else counts.astype(float)
    n_bins = len(counts)
    z = np.full(n_bins, np.nan)
    expected = np.full(n_bins, np.nan)
    for i in range(1, n_bins - 1):
        p_lo = p[i - 1]
        p_hi = p[i + 1]
        exp_i = 0.5 * (counts[i - 1] + counts[i + 1])
        var_i = (N * p[i] * (1 - p[i])
                 + 0.25 * N * (p_lo + p_hi) * (1 - p_lo - p_hi))
        if var_i <= 0:
            continue
        z[i] = (counts[i] - exp_i) / np.sqrt(var_i)
        expected[i] = exp_i
    # Identify transitions where adjacent standardized bins flip sign
    transitions = []
    for i in range(1, len(z)):
        if np.isnan(z[i - 1]) or np.isnan(z[i]):
            continue
        ok = False
        if direction == 'neg_to_pos' and z[i - 1] < -Z_CRIT and z[i] > Z_CRIT:
            ok = True
        elif direction == 'pos_to_neg' and z[i - 1] > Z_CRIT and z[i] < -Z_CRIT:
            ok = True
        if ok:
            transitions.append({
                'threshold_between': float((centers[i - 1] + centers[i]) / 2.0),
                'z_before': float(z[i - 1]),
                'z_after': float(z[i]),
            })
    best = None
    if transitions:
        best = max(transitions,
                   key=lambda t: abs(t['z_before']) + abs(t['z_after']))
    return {
        'name': name,
        'n': int(len(arr)),
        'bin_width': float(bin_width),
        'direction': direction,
        'n_transitions': len(transitions),
        'transitions': transitions,
        'best_transition': best,
        'threshold': (best['threshold_between'] if best else None),
        'bin_centers': [float(c) for c in centers],
        'counts': [int(c) for c in counts],
        'z_scores': [None if np.isnan(zi) else float(zi) for zi in z],
    }
# ---------- Method 3: Beta mixture + logit-GMM ----------
def fit_beta_mixture_em(x, K=2, max_iter=300, tol=1e-6, seed=42):
    rng = np.random.default_rng(seed)
    x = np.clip(np.asarray(x, dtype=float), EPS, 1 - EPS)
    n = len(x)
    # Initialize responsibilities from quantile-based hard labels
    q = np.linspace(0, 1, K + 1)
    thresh = np.quantile(x, q[1:-1])
    labels = np.digitize(x, thresh)
    resp = np.zeros((n, K))
    resp[np.arange(n), labels] = 1.0
    ll_hist = []
    for it in range(max_iter):
        # M-step: method-of-moments Beta parameters from responsibilities
        nk = resp.sum(axis=0) + 1e-12
        weights = nk / nk.sum()
        mus = (resp * x[:, None]).sum(axis=0) / nk
        var_num = (resp * (x[:, None] - mus) ** 2).sum(axis=0)
        vars_ = var_num / nk
        upper = mus * (1 - mus) - 1e-9
        vars_ = np.minimum(vars_, upper)
        vars_ = np.maximum(vars_, 1e-9)
        factor = np.maximum(mus * (1 - mus) / vars_ - 1, 1e-6)
        alphas = mus * factor
        betas = (1 - mus) * factor
        # E-step: log responsibilities via log-sum-exp
        log_pdfs = np.column_stack([
            stats.beta.logpdf(x, alphas[k], betas[k]) + np.log(weights[k])
            for k in range(K)
        ])
        m = log_pdfs.max(axis=1, keepdims=True)
        ll = (m.ravel() + np.log(np.exp(log_pdfs - m).sum(axis=1))).sum()
        ll_hist.append(float(ll))
        new_resp = np.exp(log_pdfs - m)
        new_resp = new_resp / new_resp.sum(axis=1, keepdims=True)
        if it > 0 and abs(ll_hist[-1] - ll_hist[-2]) < tol:
            resp = new_resp
            break
        resp = new_resp
    # Sort components by mean for stable reporting
    order = np.argsort(mus)
    alphas, betas, weights, mus = (alphas[order], betas[order],
                                   weights[order], mus[order])
    k_params = 3 * K - 1
    ll_final = ll_hist[-1]
    return {
        'K': K,
        'alphas': [float(a) for a in alphas],
        'betas': [float(b) for b in betas],
        'weights': [float(w) for w in weights],
        'mus': [float(m) for m in mus],
        'log_likelihood': float(ll_final),
        'aic': float(2 * k_params - 2 * ll_final),
        'bic': float(k_params * np.log(n) - 2 * ll_final),
        'n_iter': it + 1,
    }
def beta_crossing(fit):
    if fit['K'] != 2:
        return None
    a1, b1, w1 = fit['alphas'][0], fit['betas'][0], fit['weights'][0]
    a2, b2, w2 = fit['alphas'][1], fit['betas'][1], fit['weights'][1]

    def diff(x):
        return (w2 * stats.beta.pdf(x, a2, b2)
                - w1 * stats.beta.pdf(x, a1, b1))

    xs = np.linspace(EPS, 1 - EPS, 2000)
    ys = diff(xs)
    changes = np.where(np.diff(np.sign(ys)) != 0)[0]
    if not len(changes):
        return None
    mid = 0.5 * (fit['mus'][0] + fit['mus'][1])
    crossings = []
    for i in changes:
        try:
            crossings.append(brentq(diff, xs[i], xs[i + 1]))
        except ValueError:
            continue
    if not crossings:
        return None
    return float(min(crossings, key=lambda c: abs(c - mid)))
def fit_logit_gmm(x, K=2, seed=42):
    x = np.clip(np.asarray(x, dtype=float), EPS, 1 - EPS)
    z = np.log(x / (1 - x)).reshape(-1, 1)
    gmm = GaussianMixture(n_components=K, random_state=seed,
                          max_iter=500).fit(z)
    order = np.argsort(gmm.means_.ravel())
    means = gmm.means_.ravel()[order]
    stds = np.sqrt(gmm.covariances_.ravel())[order]
    weights = gmm.weights_[order]
    crossing = None
    if K == 2:
        m1, s1, w1 = means[0], stds[0], weights[0]
        m2, s2, w2 = means[1], stds[1], weights[1]

        def diff(z0):
            return (w2 * stats.norm.pdf(z0, m2, s2)
                    - w1 * stats.norm.pdf(z0, m1, s1))

        zs = np.linspace(min(m1, m2) - 2, max(m1, m2) + 2, 2000)
        ys = diff(zs)
        ch = np.where(np.diff(np.sign(ys)) != 0)[0]
        if len(ch):
            try:
                z_cross = brentq(diff, zs[ch[0]], zs[ch[0] + 1])
                crossing = float(1 / (1 + np.exp(-z_cross)))
            except ValueError:
                pass
    return {
        'K': K,
        'means_logit': [float(m) for m in means],
        'stds_logit': [float(s) for s in stds],
        'weights': [float(w) for w in weights],
        'means_original_scale': [float(1 / (1 + np.exp(-m))) for m in means],
        'aic': float(gmm.aic(z)),
        'bic': float(gmm.bic(z)),
        'crossing_original': crossing,
    }
def method_beta_mixture(values, name, is_cosine=True):
    arr = np.asarray(values, dtype=float)
    arr = arr[np.isfinite(arr)]
    if not is_cosine:
        # Normalize dHash into [0, 1] by dividing by 64 (max Hamming distance)
        x = arr / 64.0
    else:
        x = arr
    beta2 = fit_beta_mixture_em(x, K=2)
    beta3 = fit_beta_mixture_em(x, K=3)
    cross_beta2 = beta_crossing(beta2)
    # Transform crossings back to the original scale for dHash
    if not is_cosine and cross_beta2 is not None:
        cross_beta2 = cross_beta2 * 64.0
    gmm2 = fit_logit_gmm(x, K=2)
    gmm3 = fit_logit_gmm(x, K=3)
    if not is_cosine and gmm2.get('crossing_original') is not None:
        gmm2['crossing_original'] = gmm2['crossing_original'] * 64.0
    return {
        'name': name,
        'n': int(len(x)),
        'scale_transform': ('identity' if is_cosine else 'dhash/64'),
        'beta_2': beta2,
        'beta_3': beta3,
        'bic_preferred_K': (2 if beta2['bic'] < beta3['bic'] else 3),
        'beta_2_crossing_original': cross_beta2,
        'logit_gmm_2': gmm2,
        'logit_gmm_3': gmm3,
    }
# ---------- Plot helpers ----------
def plot_panel(values, methods, title, out_path, bin_width=None,
               is_cosine=True):
    arr = np.asarray(values, dtype=float)
    fig, axes = plt.subplots(2, 1, figsize=(11, 7),
                             gridspec_kw={'height_ratios': [3, 1]})
    ax = axes[0]
    if bin_width is None:
        bins = 40
    else:
        lo = float(np.floor(arr.min() / bin_width) * bin_width)
        hi = float(np.ceil(arr.max() / bin_width) * bin_width)
        bins = np.arange(lo, hi + bin_width, bin_width)
    ax.hist(arr, bins=bins, density=True, alpha=0.55, color='steelblue',
            edgecolor='white')
    # KDE overlay
    kde = stats.gaussian_kde(arr, bw_method='silverman')
    xs = np.linspace(arr.min(), arr.max(), 500)
    ax.plot(xs, kde(xs), 'g-', lw=1.5, label='KDE (Silverman)')
    # Annotate thresholds from each method
    colors = {'kde': 'green', 'bd': 'red', 'beta': 'purple', 'gmm2': 'orange'}
    for key, (val, lbl) in methods.items():
        if val is None:
            continue
        ax.axvline(val, color=colors.get(key, 'gray'), lw=2, ls='--',
                   label=f'{lbl} = {val:.4f}')
    ax.set_xlabel(title + ' value')
    ax.set_ylabel('Density')
    ax.set_title(title)
    ax.legend(fontsize=8)
    ax2 = axes[1]
    ax2.set_title('Thresholds across methods')
    ax2.set_xlim(ax.get_xlim())
    for i, (key, (val, lbl)) in enumerate(methods.items()):
        if val is None:
            continue
        ax2.scatter([val], [i], color=colors.get(key, 'gray'), s=100, zorder=3)
        ax2.annotate(f' {lbl}: {val:.4f}', (val, i), fontsize=8,
                     va='center')
    ax2.set_yticks(range(len(methods)))
    ax2.set_yticklabels([m for m in methods.keys()])
    ax2.set_xlabel(title + ' value')
    ax2.grid(alpha=0.3)
    plt.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close()
# ---------- GMM 2-comp crossing from Script 18 ----------
def marginal_2comp_crossing(X, dim):
    gmm = GaussianMixture(n_components=2, covariance_type='full',
                          random_state=42, n_init=15, max_iter=500).fit(X)
    means = gmm.means_
    covs = gmm.covariances_
    weights = gmm.weights_
    m1, m2 = means[0][dim], means[1][dim]
    s1 = np.sqrt(covs[0][dim, dim])
    s2 = np.sqrt(covs[1][dim, dim])
    w1, w2 = weights[0], weights[1]

    def diff(x):
        return (w2 * stats.norm.pdf(x, m2, s2)
                - w1 * stats.norm.pdf(x, m1, s1))

    xs = np.linspace(float(X[:, dim].min()), float(X[:, dim].max()), 2000)
    ys = diff(xs)
    ch = np.where(np.diff(np.sign(ys)) != 0)[0]
    if not len(ch):
        return None
    mid = 0.5 * (m1 + m2)
    crossings = []
    for i in ch:
        try:
            crossings.append(brentq(diff, xs[i], xs[i + 1]))
        except ValueError:
            continue
    if not crossings:
        return None
    return float(min(crossings, key=lambda c: abs(c - mid)))
def main():
    print('=' * 70)
    print('Script 20: Three-Method Threshold at Accountant Level')
    print('=' * 70)
    cos, dh = load_accountant_means()
    print(f'\nN accountants (>={MIN_SIGS} sigs) = {len(cos)}')
    results = {}
    for desc, arr, bin_width, direction, is_cosine in [
        ('cos_mean', cos, 0.002, 'neg_to_pos', True),
        ('dh_mean', dh, 0.2, 'pos_to_neg', False),
    ]:
        print(f'\n[{desc}]')
        m1 = method_kde_antimode(arr, f'{desc} KDE')
        print(f' Method 1 (KDE + dip): dip={m1["dip"]:.4f} '
              f'p={m1["dip_pvalue"]:.4f} '
              f'n_modes={m1["n_modes"]} '
              f'antimode={m1["primary_antimode"]}')
        m2 = method_bd_mccrary(arr, bin_width, direction, f'{desc} BD')
        print(f' Method 2 (BD/McCrary): {m2["n_transitions"]} transitions, '
              f'threshold={m2["threshold"]}')
        m3 = method_beta_mixture(arr, f'{desc} Beta', is_cosine=is_cosine)
        print(f' Method 3 (Beta mixture): BIC-preferred K={m3["bic_preferred_K"]}, '
              f'Beta-2 crossing={m3["beta_2_crossing_original"]}, '
              f'LogGMM-2 crossing={m3["logit_gmm_2"].get("crossing_original")}')
        # GMM 2-comp crossing (for completeness / reproduce Script 18)
        X = np.column_stack([cos, dh])
        dim = 0 if desc == 'cos_mean' else 1
        gmm2_crossing = marginal_2comp_crossing(X, dim)
        print(f' (Script 18 2-comp GMM marginal crossing = {gmm2_crossing})')
        results[desc] = {
            'method_1_kde_antimode': m1,
            'method_2_bd_mccrary': m2,
            'method_3_beta_mixture': m3,
            'script_18_gmm_2comp_crossing': gmm2_crossing,
        }
        methods_for_plot = {
            'kde': (m1.get('primary_antimode'), 'KDE antimode'),
            'bd': (m2.get('threshold'), 'BD/McCrary'),
            'beta': (m3.get('beta_2_crossing_original'), 'Beta-2 crossing'),
            'gmm2': (gmm2_crossing, 'GMM 2-comp crossing'),
        }
        png = OUT / f'accountant_{desc}_panel.png'
        plot_panel(arr, methods_for_plot,
                   f'Accountant-level {desc}: three-method thresholds',
                   png, bin_width=bin_width, is_cosine=is_cosine)
        print(f' plot: {png}')
    # Write JSON
    with open(OUT / 'accountant_three_methods_results.json', 'w') as f:
        json.dump({'generated_at': datetime.now().isoformat(),
                   'n_accountants': int(len(cos)),
                   'min_signatures': MIN_SIGS,
                   'results': results}, f, indent=2, ensure_ascii=False)
    print(f'\nJSON: {OUT / "accountant_three_methods_results.json"}')
    # Markdown report
    md = [
        '# Accountant-Level Three-Method Threshold Report',
        f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        '',
        f'N accountants (>={MIN_SIGS} signatures): {len(cos)}',
        '',
        '## Accountant-level cosine mean',
        '',
        '| Method | Threshold | Supporting statistic |',
        '|--------|-----------|----------------------|',
    ]
    r = results['cos_mean']
    md.append(f"| Method 1: KDE antimode (with dip test) | "
              f"{r['method_1_kde_antimode']['primary_antimode']} | "
              f"dip={r['method_1_kde_antimode']['dip']:.4f}, "
              f"p={r['method_1_kde_antimode']['dip_pvalue']:.4f} "
              f"({'unimodal' if r['method_1_kde_antimode']['unimodal_alpha05'] else 'multimodal'}) |")
    md.append(f"| Method 2: Burgstahler-Dichev / McCrary | "
              f"{r['method_2_bd_mccrary']['threshold']} | "
              f"{r['method_2_bd_mccrary']['n_transitions']} transition(s) "
              f"at α=0.05 |")
    md.append(f"| Method 3: 2-component Beta mixture | "
              f"{r['method_3_beta_mixture']['beta_2_crossing_original']} | "
              f"Beta-2 BIC={r['method_3_beta_mixture']['beta_2']['bic']:.2f}, "
              f"Beta-3 BIC={r['method_3_beta_mixture']['beta_3']['bic']:.2f} "
              f"(BIC-preferred K={r['method_3_beta_mixture']['bic_preferred_K']}) |")
    md.append(f"| Method 3': LogGMM-2 on logit-transformed | "
              f"{r['method_3_beta_mixture']['logit_gmm_2'].get('crossing_original')} | "
              f"White 1982 quasi-MLE robustness check |")
    md.append(f"| Script 18 GMM 2-comp marginal crossing | "
              f"{r['script_18_gmm_2comp_crossing']} | full 2D mixture |")
    md += ['', '## Accountant-level dHash mean', '',
           '| Method | Threshold | Supporting statistic |',
           '|--------|-----------|----------------------|']
    r = results['dh_mean']
    md.append(f"| Method 1: KDE antimode | "
              f"{r['method_1_kde_antimode']['primary_antimode']} | "
              f"dip={r['method_1_kde_antimode']['dip']:.4f}, "
              f"p={r['method_1_kde_antimode']['dip_pvalue']:.4f} |")
    md.append(f"| Method 2: BD/McCrary | "
              f"{r['method_2_bd_mccrary']['threshold']} | "
              f"{r['method_2_bd_mccrary']['n_transitions']} transition(s) |")
    md.append(f"| Method 3: 2-component Beta mixture | "
              f"{r['method_3_beta_mixture']['beta_2_crossing_original']} | "
              f"Beta-2 BIC={r['method_3_beta_mixture']['beta_2']['bic']:.2f}, "
              f"Beta-3 BIC={r['method_3_beta_mixture']['beta_3']['bic']:.2f} |")
    md.append(f"| Method 3': LogGMM-2 | "
              f"{r['method_3_beta_mixture']['logit_gmm_2'].get('crossing_original')} | |")
    md.append(f"| Script 18 GMM 2-comp crossing | "
              f"{r['script_18_gmm_2comp_crossing']} | |")
    (OUT / 'accountant_three_methods_report.md').write_text('\n'.join(md),
                                                            encoding='utf-8')
    print(f'Report: {OUT / "accountant_three_methods_report.md"}')
if __name__ == '__main__':
    main()