#!/usr/bin/env python3
"""
Script 18: Accountant-Level 3-Component Gaussian Mixture
========================================================
Rebuild the GMM analysis from memory 2026-04-16: at the accountant level
(not signature level), the joint distribution of (cosine_mean, dhash_mean)
separates into three components corresponding to signing-behaviour
regimes:

    C1  High-replication      cos_mean ≈ 0.983, dh_mean ≈ 2.4,  ~20%, Deloitte-heavy
    C2  Middle band           cos_mean ≈ 0.954, dh_mean ≈ 7.0,  ~52%, KPMG/PwC/EY
    C3  Hand-signed tendency  cos_mean ≈ 0.928, dh_mean ≈ 11.2, ~28%, small firms

The script:
  1. Aggregates per-accountant means from the signature table.
  2. Fits 1- to 5-component 2D Gaussian mixtures and selects by BIC.
  3. Reports component parameters, cluster assignments, and per-firm
     breakdown.
  4. For the 2-component fit derives the natural threshold (crossing of
     marginal densities in cosine-mean and dhash-mean).

Output:
  reports/accountant_mixture/accountant_mixture_report.md
  reports/accountant_mixture/accountant_mixture_results.json
  reports/accountant_mixture/accountant_mixture_2d.png
  reports/accountant_mixture/accountant_mixture_marginals.png
  reports/accountant_mixture/accountant_clusters.csv
"""

import csv
import json
import sqlite3
from collections import Counter
from contextlib import closing
from datetime import datetime
from pathlib import Path

import numpy as np
import matplotlib
matplotlib.use('Agg')  # headless backend; must be selected before pyplot import
import matplotlib.pyplot as plt
from scipy import stats
from scipy.optimize import brentq
from sklearn.mixture import GaussianMixture

DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/'
           'accountant_mixture')
OUT.mkdir(parents=True, exist_ok=True)

# Accountants with fewer signatures than this are excluded from the fit.
MIN_SIGS = 10


def load_accountant_aggregates():
    """Load per-accountant mean cosine similarity and mean dHash.

    Queries the ``signatures`` table (left-joined to ``accountants`` for
    the firm name), keeping only rows where the accountant and both
    metrics are non-NULL, and only accountants with at least ``MIN_SIGS``
    such signatures.

    Returns:
        A list of dicts with keys ``accountant``, ``firm`` (``'(unknown)'``
        when the join finds no firm), ``cos_mean``, ``dh_mean`` and ``n``.
    """
    query = '''
        SELECT s.assigned_accountant,
               a.firm,
               AVG(s.max_similarity_to_same_accountant) AS cos_mean,
               AVG(CAST(s.min_dhash_independent AS REAL)) AS dh_mean,
               COUNT(*) AS n
        FROM signatures s
        LEFT JOIN accountants a ON s.assigned_accountant = a.name
        WHERE s.assigned_accountant IS NOT NULL
          AND s.max_similarity_to_same_accountant IS NOT NULL
          AND s.min_dhash_independent IS NOT NULL
        GROUP BY s.assigned_accountant
        HAVING n >= ?
    '''
    # closing() guarantees the connection is released even if the query
    # raises; sqlite3's own context manager only manages transactions.
    with closing(sqlite3.connect(DB)) as conn:
        rows = conn.execute(query, (MIN_SIGS,)).fetchall()
    return [
        {'accountant': r[0], 'firm': r[1] or '(unknown)',
         'cos_mean': float(r[2]), 'dh_mean': float(r[3]), 'n': int(r[4])}
        for r in rows
    ]


def fit_gmm_range(X, ks=(1, 2, 3, 4, 5), seed=42, n_init=10):
    """Fit a full-covariance GMM for each K in ``ks`` and track the best BIC.

    Args:
        X: Array of shape (n_samples, n_features) passed to
            ``GaussianMixture.fit``.
        ks: Component counts to try.
        seed: ``random_state`` for every fit.
        n_init: Number of initialisations per fit.

    Returns:
        ``(results, best)`` where ``results`` is a list of dicts
        (``k``, ``bic``, ``aic``, ``converged``, ``n_iter``), one per K in
        the order given, and ``best`` is the fitted model with the lowest
        BIC (``None`` if ``ks`` is empty).
    """
    results = []
    best_bic = np.inf
    best = None
    for k in ks:
        gmm = GaussianMixture(
            n_components=k, covariance_type='full',
            random_state=seed, n_init=n_init, max_iter=500,
        ).fit(X)
        bic = gmm.bic(X)
        aic = gmm.aic(X)
        results.append({
            'k': int(k), 'bic': float(bic), 'aic': float(aic),
            'converged': bool(gmm.converged_), 'n_iter': int(gmm.n_iter_),
        })
        if bic < best_bic:
            best_bic = bic
            best = gmm
    return results, best


def summarize_components(gmm, X, df):
    """Assign clusters, return per-component stats + per-firm breakdown.

    Components are re-ranked by DESCENDING cosine mean (column 0 of
    ``gmm.means_``) so that C1 is the high-replication regime, matching
    the convention in the module docstring.

    Args:
        gmm: A fitted ``GaussianMixture`` with ``covariance_type='full'``.
        X: The data the labels are predicted for.
        df: Sequence of per-accountant dicts aligned row-for-row with
            ``X``; only the ``'firm'`` key is read here.

    Returns:
        ``(components, new_labels)``: a list of per-component dicts in
        rank order (rank 1 first), and an integer array of 1-based ranked
        labels aligned with ``X``.
    """
    labels = gmm.predict(X)
    means = gmm.means_
    # order[rank-1] is the model's own component index for that rank.
    order = np.argsort(-means[:, 0])
    relabel = {int(old): new + 1 for new, old in enumerate(order)}
    new_labels = np.array([relabel[int(lab)] for lab in labels])

    components = []
    for rank, old_idx in enumerate(order, start=1):
        mu = means[old_idx]
        cov = gmm.covariances_[old_idx]
        w = gmm.weights_[old_idx]
        mask = new_labels == rank
        firms = Counter(
            row['firm'] for row, in_cluster in zip(df, mask) if in_cluster
        )
        # Stable sort: ties keep first-seen order.
        firms_sorted = sorted(firms.items(), key=lambda kv: -kv[1])
        components.append({
            'component': rank,
            'mu_cos': float(mu[0]),
            'mu_dh': float(mu[1]),
            'cov_00': float(cov[0, 0]),
            'cov_11': float(cov[1, 1]),
            'cov_01': float(cov[0, 1]),
            'corr': float(cov[0, 1] / np.sqrt(cov[0, 0] * cov[1, 1])),
            'weight': float(w),
            'n_accountants': int(mask.sum()),
            'top_firms': firms_sorted[:5],
        })
    return components, new_labels


def marginal_crossing(means, covs, weights, dim, search_lo, search_hi):
    """Find crossing of two weighted marginal Gaussians along dimension `dim`.

    Only the first two components of ``means``/``covs``/``weights`` are
    used. The weighted-density difference is sampled on a 2000-point grid
    over ``[search_lo, search_hi]``; each sign change is refined with
    ``brentq``.

    Returns:
        The crossing closest to the midpoint of the two component means,
        as a float, or ``None`` when no sign change is found in the
        search interval.
    """
    m1, m2 = means[0][dim], means[1][dim]
    s1 = np.sqrt(covs[0][dim, dim])
    s2 = np.sqrt(covs[1][dim, dim])
    w1, w2 = weights[0], weights[1]

    def diff(x):
        return (w2 * stats.norm.pdf(x, m2, s2)
                - w1 * stats.norm.pdf(x, m1, s1))

    xs = np.linspace(search_lo, search_hi, 2000)
    ys = diff(xs)
    changes = np.where(np.diff(np.sign(ys)) != 0)[0]
    if changes.size == 0:
        return None

    mid = 0.5 * (m1 + m2)
    crossings = []
    for i in changes:
        try:
            crossings.append(brentq(diff, xs[i], xs[i + 1]))
        except ValueError:
            # brentq requires opposite signs at the bracket ends; skip
            # grid cells that do not actually bracket a root.
            continue
    if not crossings:
        return None
    return float(min(crossings, key=lambda c: abs(c - mid)))


def plot_2d(df, labels, means, title, out_path):
    """Scatter accountants coloured by cluster and mark component means.

    Args:
        df: Per-accountant dicts (``cos_mean``, ``dh_mean`` are read).
        labels: 1-based cluster labels aligned with ``df``.
        means: Component means; row ``i`` is annotated as ``μ{i+1}``, so
            callers should pass them in the same rank order as ``labels``
            for the annotations to agree with the legend.
        title: Figure title.
        out_path: Destination image path.
    """
    colors = ['#d62728', '#1f77b4', '#2ca02c', '#9467bd', '#ff7f0e']
    fig, ax = plt.subplots(figsize=(9, 7))
    for k in sorted(set(labels)):
        mask = labels == k
        xs = [r['cos_mean'] for r, m in zip(df, mask) if m]
        ys = [r['dh_mean'] for r, m in zip(df, mask) if m]
        ax.scatter(xs, ys, s=20, alpha=0.55,
                   color=colors[(k - 1) % len(colors)],
                   label=f'C{k} (n={int(mask.sum())})')
    for i, mu in enumerate(means):
        ax.plot(mu[0], mu[1], 'k*', ms=18, mec='white', mew=1.5)
        ax.annotate(f' μ{i+1}', (mu[0], mu[1]), fontsize=10)
    ax.set_xlabel('Per-accountant mean cosine max-similarity')
    ax.set_ylabel('Per-accountant mean independent min dHash')
    ax.set_title(title)
    ax.legend()
    plt.tight_layout()
    fig.savefig(out_path, dpi=150)
    # Close this figure explicitly rather than "whatever is current".
    plt.close(fig)


def plot_marginals(df, labels, gmm_2, out_path, cos_cross=None, dh_cross=None):
    """Plot 1D histograms with the 2-component weighted marginal densities.

    Args:
        df: Per-accountant dicts (``cos_mean``, ``dh_mean`` are read).
        labels: Unused; kept so existing callers keep working.
        gmm_2: Fitted full-covariance ``GaussianMixture``; components are
            drawn in descending cosine-mean order (C1 first).
        out_path: Destination image path.
        cos_cross: Optional cosine threshold drawn as a vertical line.
        dh_cross: Optional dHash threshold drawn as a vertical line.
    """
    cos = np.array([r['cos_mean'] for r in df])
    dh = np.array([r['dh_mean'] for r in df])

    fig, axes = plt.subplots(1, 2, figsize=(13, 5))

    means_2 = gmm_2.means_
    covs_2 = gmm_2.covariances_
    weights_2 = gmm_2.weights_
    order = np.argsort(-means_2[:, 0])

    # Cosine marginal
    ax = axes[0]
    ax.hist(cos, bins=40, density=True, alpha=0.5, color='steelblue',
            edgecolor='white')
    xs = np.linspace(cos.min(), cos.max(), 400)
    for rank, i in enumerate(order, start=1):
        ys = weights_2[i] * stats.norm.pdf(xs, means_2[i, 0],
                                           np.sqrt(covs_2[i, 0, 0]))
        ax.plot(xs, ys, '--', label=f'C{rank} μ={means_2[i,0]:.3f}')
    if cos_cross is not None:
        ax.axvline(cos_cross, color='green', lw=2,
                   label=f'Crossing = {cos_cross:.4f}')
    ax.set_xlabel('Per-accountant mean cosine')
    ax.set_ylabel('Density')
    ax.set_title('Cosine marginal (2-component fit)')
    ax.legend(fontsize=8)

    # dHash marginal
    ax = axes[1]
    ax.hist(dh, bins=40, density=True, alpha=0.5, color='coral',
            edgecolor='white')
    xs = np.linspace(dh.min(), dh.max(), 400)
    for rank, i in enumerate(order, start=1):
        ys = weights_2[i] * stats.norm.pdf(xs, means_2[i, 1],
                                           np.sqrt(covs_2[i, 1, 1]))
        ax.plot(xs, ys, '--', label=f'C{rank} μ={means_2[i,1]:.2f}')
    if dh_cross is not None:
        ax.axvline(dh_cross, color='green', lw=2,
                   label=f'Crossing = {dh_cross:.4f}')
    ax.set_xlabel('Per-accountant mean dHash')
    ax.set_ylabel('Density')
    ax.set_title('dHash marginal (2-component fit)')
    ax.legend(fontsize=8)

    plt.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)


def main():
    """Run the full analysis and write the plots, CSV, JSON and report."""
    print('='*70)
    print('Script 18: Accountant-Level Gaussian Mixture')
    print('='*70)

    df = load_accountant_aggregates()
    print(f'\nAccountants with >= {MIN_SIGS} signatures: {len(df)}')

    ks = (1, 2, 3, 4, 5)
    # A K-component mixture cannot be fitted to fewer than K samples;
    # fail with a clear message instead of an opaque library error.
    if len(df) < max(ks):
        raise SystemExit(
            f'Need at least {max(ks)} accountants to fit K=1..{max(ks)}; '
            f'found {len(df)}.'
        )
    X = np.array([[r['cos_mean'], r['dh_mean']] for r in df])

    # Fit K=1..5
    print('\nFitting GMMs with K=1..5...')
    bic_results, _ = fit_gmm_range(X, ks=ks, seed=42, n_init=15)
    for r in bic_results:
        print(f"  K={r['k']}: BIC={r['bic']:.2f}  AIC={r['aic']:.2f}  "
              f"converged={r['converged']}")
    best_k = min(bic_results, key=lambda r: r['bic'])['k']
    print(f'\nBIC-best K = {best_k}')

    # Fit 3-component specifically (target)
    gmm_3 = GaussianMixture(n_components=3, covariance_type='full',
                            random_state=42, n_init=15, max_iter=500).fit(X)
    comps_3, labels_3 = summarize_components(gmm_3, X, df)

    print('\n--- 3-component summary ---')
    for c in comps_3:
        tops = ', '.join(f"{f}({n})" for f, n in c['top_firms'])
        print(f"  C{c['component']}: cos={c['mu_cos']:.3f}, "
              f"dh={c['mu_dh']:.2f}, w={c['weight']:.2f}, "
              f"n={c['n_accountants']} -> {tops}")

    # Fit 2-component for threshold derivation
    gmm_2 = GaussianMixture(n_components=2, covariance_type='full',
                            random_state=42, n_init=15, max_iter=500).fit(X)
    comps_2, labels_2 = summarize_components(gmm_2, X, df)

    # Crossings
    cos_cross = marginal_crossing(gmm_2.means_, gmm_2.covariances_,
                                  gmm_2.weights_, dim=0,
                                  search_lo=X[:, 0].min(),
                                  search_hi=X[:, 0].max())
    dh_cross = marginal_crossing(gmm_2.means_, gmm_2.covariances_,
                                 gmm_2.weights_, dim=1,
                                 search_lo=X[:, 1].min(),
                                 search_hi=X[:, 1].max())
    print(f'\n2-component crossings: cos={cos_cross}, dh={dh_cross}')

    # Plots. The means are passed in rank order (C1 first) so the μi star
    # annotations line up with the C{i} legend entries; gmm_3.means_ is in
    # the model's own arbitrary component order.
    means_3_ranked = np.array([[c['mu_cos'], c['mu_dh']] for c in comps_3])
    plot_2d(df, labels_3, means_3_ranked,
            '3-component accountant-level GMM',
            OUT / 'accountant_mixture_2d.png')
    plot_marginals(df, labels_2, gmm_2,
                   OUT / 'accountant_mixture_marginals.png',
                   cos_cross=cos_cross, dh_cross=dh_cross)

    # Per-accountant CSV (for downstream use). csv.writer quotes fields
    # containing commas/quotes, which hand-built rows would corrupt.
    csv_path = OUT / 'accountant_clusters.csv'
    with open(csv_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(['accountant', 'firm', 'n_signatures', 'cos_mean',
                         'dh_mean', 'cluster_k3', 'cluster_k2'])
        for r, k3, k2 in zip(df, labels_3, labels_2):
            writer.writerow([
                r['accountant'], r['firm'], r['n'],
                f"{r['cos_mean']:.6f}", f"{r['dh_mean']:.6f}",
                int(k3), int(k2),
            ])
    print(f'CSV: {csv_path}')

    # One timestamp so the JSON and the report agree.
    now = datetime.now()

    # Summary JSON
    summary = {
        'generated_at': now.isoformat(),
        'n_accountants': len(df),
        'min_signatures': MIN_SIGS,
        'bic_model_selection': bic_results,
        'best_k_by_bic': best_k,
        'gmm_3': {
            'components': comps_3,
            'aic': float(gmm_3.aic(X)),
            'bic': float(gmm_3.bic(X)),
            'log_likelihood': float(gmm_3.score(X) * len(X)),
        },
        'gmm_2': {
            'components': comps_2,
            'aic': float(gmm_2.aic(X)),
            'bic': float(gmm_2.bic(X)),
            'log_likelihood': float(gmm_2.score(X) * len(X)),
            'cos_crossing': cos_cross,
            'dh_crossing': dh_cross,
        },
    }
    json_path = OUT / 'accountant_mixture_results.json'
    # ensure_ascii=False emits raw non-ASCII text, so pin the encoding
    # instead of relying on the platform default.
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2, ensure_ascii=False)
    print(f'JSON: {OUT / "accountant_mixture_results.json"}')

    # Markdown
    md = [
        '# Accountant-Level Gaussian Mixture Report',
        f"Generated: {now.strftime('%Y-%m-%d %H:%M:%S')}",
        '',
        '## Data',
        '',
        f'* Per-accountant aggregates: mean cosine max-similarity, '
        f'mean independent min dHash.',
        f"* Minimum signatures per accountant: {MIN_SIGS}.",
        f'* Accountants included: **{len(df)}**.',
        '',
        '## Model selection (BIC)',
        '',
        '| K | BIC | AIC | Converged |',
        '|---|-----|-----|-----------|',
    ]
    for r in bic_results:
        mark = ' ←best' if r['k'] == best_k else ''
        md.append(
            f"| {r['k']} | {r['bic']:.2f} | {r['aic']:.2f} | "
            f"{r['converged']}{mark} |"
        )

    md += ['', '## 3-component fit', '',
           '| Component | cos_mean | dh_mean | weight | n_accountants | top firms |',
           '|-----------|----------|---------|--------|----------------|-----------|']
    for c in comps_3:
        tops = ', '.join(f"{f}:{n}" for f, n in c['top_firms'])
        md.append(
            f"| C{c['component']} | {c['mu_cos']:.3f} | {c['mu_dh']:.2f} | "
            f"{c['weight']:.3f} | {c['n_accountants']} | {tops} |"
        )

    md += ['', '## 2-component fit (threshold derivation)', '',
           '| Component | cos_mean | dh_mean | weight | n_accountants |',
           '|-----------|----------|---------|--------|----------------|']
    for c in comps_2:
        md.append(
            f"| C{c['component']} | {c['mu_cos']:.3f} | {c['mu_dh']:.2f} | "
            f"{c['weight']:.3f} | {c['n_accountants']} |"
        )

    # `is not None` rather than truthiness: a crossing at exactly 0.0 is
    # a valid threshold and must not be reported as missing.
    md += ['', '### Natural thresholds from 2-component crossings', '',
           f'* Cosine: **{cos_cross:.4f}**'
           if cos_cross is not None else '* Cosine: no crossing found',
           f'* dHash: **{dh_cross:.4f}**'
           if dh_cross is not None else '* dHash: no crossing found',
           '',
           '## Interpretation',
           '',
           'The accountant-level mixture separates signing-behaviour regimes,',
           'while the signature-level distribution is a continuous spectrum',
           '(see Scripts 15 and 17). The BIC-best model chooses how many',
           'discrete regimes the data supports. The 2-component crossings',
           'are the natural per-accountant thresholds for classifying a',
           "CPA's signing behaviour.",
           '',
           '## Artifacts',
           '',
           '* `accountant_mixture_2d.png` - 2D scatter with 3-component fit',
           '* `accountant_mixture_marginals.png` - 1D marginals with 2-component fit',
           '* `accountant_clusters.csv` - per-accountant cluster assignments',
           '* `accountant_mixture_results.json` - full numerical results',
           ]
    (OUT / 'accountant_mixture_report.md').write_text('\n'.join(md),
                                                      encoding='utf-8')
    print(f'Report: {OUT / "accountant_mixture_report.md"}')


if __name__ == '__main__':
    main()