#!/usr/bin/env python3
"""Script 32: Non-Firm-A Calibration Spike
========================================

Research question (branch ``from-outside-of-firmA``): If we throw away
Firm A entirely, can we still derive meaningful cosine / dHash thresholds
at the accountant level?

Three subset analyses (per user's "1. 我們可以分開做" clarification):

    Subset I   -- Big-4 minus Firm A: KPMG + PwC + EY pooled
    Subset II  -- All non-Firm-A firms: every firm except 勤業眾信聯合
    Subset III (baseline reference) -- Firm A only

Each subset is run through Script 20's three-method framework (KDE+dip,
BD/McCrary, 2-component Beta mixture + logit-GMM) plus the 2D-GMM 2-comp
marginal crossing from Script 18, on the per-accountant means of:

    * cos_mean = AVG(s.max_similarity_to_same_accountant)
    * dh_mean  = AVG(s.min_dhash_independent)

Time-stratified contingency analysis: If Subset I/II fail to expose
bimodality, we re-load each accountant's signatures stratified into
pre-2018 vs post-2020 sub-buckets (>=5 sigs per bucket required) and
re-run the three-method framework on the resulting bucket-level means.
This tests whether the time axis can substitute for the firm-anchor axis.

Verdict (A/B/C):

    A  Bimodal structure emerges in Subset I or II without time
       stratification, with crossings within +-0.02 (cos) / +-2.0 (dh) of
       Paper A baselines (0.945, 8.10) and dip-test multimodal at
       alpha=0.05.  -> "outside-Firm-A calibration is viable"
    B  Bimodal structure only emerges after time stratification.
       -> "time axis substitutes for firm anchor; v3.21 robustness or
          Paper C with time-stratified design"
    C  No bimodality in either; crossings are unstable / outside plausible
       range.
       -> "Firm A is required as anchor; this strengthens Paper A's framing"

Output:
    reports/non_firm_a_calibration/
        non_firm_a_calibration_results.json
        non_firm_a_calibration_report.md
        panel__.png
"""
import sqlite3
import json
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from pathlib import Path
from datetime import datetime
from scipy import stats
from scipy.signal import find_peaks
from scipy.optimize import brentq
from sklearn.mixture import GaussianMixture
import diptest

DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/'
           'non_firm_a_calibration')
OUT.mkdir(parents=True, exist_ok=True)

EPS = 1e-6
Z_CRIT = 1.96
MIN_SIGS = 10                 # pooled analysis: min signatures per accountant
MIN_SIGS_PER_BUCKET = 5       # time-stratified analysis: min sigs per bucket
FIRM_A = '勤業眾信聯合'  # Deloitte
BIG4_NON_A = ('安侯建業聯合', '資誠聯合', '安永聯合')  # KPMG, PwC, EY
PAPER_A_COS_BASELINE = 0.945
PAPER_A_DH_BASELINE = 8.10


# ---------- Loaders ----------


def _firm_filter(label):
    """Map a subset label to its SQL firm filter and bind parameters.

    Returns (filter_sql, params). Raises ValueError for unknown labels.
    """
    if label == 'big4_non_A':
        return 'AND a.firm IN (?, ?, ?)', list(BIG4_NON_A)
    if label == 'all_non_A':
        return 'AND a.firm IS NOT NULL AND a.firm != ?', [FIRM_A]
    if label == 'firm_A':
        return 'AND a.firm = ?', [FIRM_A]
    raise ValueError(label)


def _accountant_means_query(firm_filter_sql, params, time_filter_sql='',
                            min_sigs=MIN_SIGS):
    """Build the per-accountant means query.

    ``min_sigs`` generalizes the previous hard-coded MIN_SIGS so the
    time-stratified loader can reuse this builder with its smaller
    per-bucket requirement. Returns (sql, full_param_list).
    """
    sql = f'''
        SELECT s.assigned_accountant,
               AVG(s.max_similarity_to_same_accountant) AS cos_mean,
               AVG(CAST(s.min_dhash_independent AS REAL)) AS dh_mean,
               COUNT(*) AS n
        FROM signatures s
        JOIN accountants a ON s.assigned_accountant = a.name
        WHERE s.assigned_accountant IS NOT NULL
          AND s.max_similarity_to_same_accountant IS NOT NULL
          AND s.min_dhash_independent IS NOT NULL
          {firm_filter_sql}
          {time_filter_sql}
        GROUP BY s.assigned_accountant
        HAVING n >= ?
    '''
    return sql, params + [min_sigs]


def _fetch_means(sql, params):
    """Execute the means query and return (cos, dh, n_accountants, n_sigs)."""
    conn = sqlite3.connect(DB)
    try:
        rows = conn.execute(sql, params).fetchall()
    finally:
        conn.close()
    cos = np.array([r[1] for r in rows])
    dh = np.array([r[2] for r in rows])
    return cos, dh, len(rows), int(sum(r[3] for r in rows))


def load_subset(label):
    """Return (cos, dh, n_accountants, n_signatures) for a pooled subset."""
    firm_filter, params = _firm_filter(label)
    sql, p = _accountant_means_query(firm_filter, params)
    return _fetch_means(sql, p)


def load_subset_time_stratified(label, period):
    """Per-accountant means computed only from `period` signatures.

    period: 'pre_2018' (year_month < 2018-01) or 'post_2020' (>= 2020-01).
    Only the two non-Firm-A subsets are valid here (ValueError otherwise,
    matching the original behavior of rejecting 'firm_A').
    """
    if label not in ('big4_non_A', 'all_non_A'):
        raise ValueError(label)
    if period == 'pre_2018':
        time_filter = "AND s.year_month < '2018-01'"
    elif period == 'post_2020':
        time_filter = "AND s.year_month >= '2020-01'"
    else:
        raise ValueError(period)
    firm_filter, params = _firm_filter(label)
    sql, p = _accountant_means_query(firm_filter, params,
                                     time_filter_sql=time_filter,
                                     min_sigs=MIN_SIGS_PER_BUCKET)
    return _fetch_means(sql, p)


# ---------- Methods (lifted from Script 20) ----------


def method_kde_antimode(values):
    """Method 1: dip test + KDE antimode detection.

    Returns dip statistic/p-value, KDE mode/antimode locations, and a
    bandwidth-sensitivity sweep of the mode count.
    """
    arr = np.asarray(values, dtype=float)
    arr = arr[np.isfinite(arr)]
    if len(arr) < 8:
        return {'n': int(len(arr)), 'note': 'too few points'}
    dip, pval = diptest.diptest(arr, boot_pval=True, n_boot=2000)
    kde = stats.gaussian_kde(arr, bw_method='silverman')
    xs = np.linspace(arr.min(), arr.max(), 2000)
    density = kde(xs)
    # Prominence floor of 2% of the max density suppresses noise peaks.
    peaks, _ = find_peaks(density, prominence=density.max() * 0.02)
    antimodes = []
    for i in range(len(peaks) - 1):
        seg = density[peaks[i]:peaks[i + 1]]
        if not len(seg):
            continue
        local = peaks[i] + int(np.argmin(seg))
        antimodes.append(float(xs[local]))
    # Sensitivity check: does the mode count survive bandwidth scaling?
    sens = {}
    for bwf in [0.5, 0.75, 1.0, 1.5, 2.0]:
        kde_s = stats.gaussian_kde(arr, bw_method=kde.factor * bwf)
        d_s = kde_s(xs)
        p_s, _ = find_peaks(d_s, prominence=d_s.max() * 0.02)
        sens[f'bw_x{bwf}'] = int(len(p_s))
    return {
        'n': int(len(arr)),
        'dip': float(dip),
        'dip_pvalue': float(pval),
        'unimodal_alpha05': bool(pval > 0.05),
        'kde_bandwidth_silverman': float(kde.factor),
        'n_modes': int(len(peaks)),
        'mode_locations': [float(xs[p]) for p in peaks],
        'antimodes': antimodes,
        'primary_antimode': (antimodes[0] if antimodes else None),
        'bandwidth_sensitivity_n_modes': sens,
    }


def method_bd_mccrary(values, bin_width, direction):
    """Method 2: binned density z-score transition detection (BD/McCrary).

    Each interior bin gets a z-score of its count against the average of
    its two neighbors; a threshold is a sign flip of magnitude > Z_CRIT in
    the requested `direction` ('neg_to_pos' or 'pos_to_neg').
    """
    arr = np.asarray(values, dtype=float)
    arr = arr[np.isfinite(arr)]
    if len(arr) < 8:
        return {'n': int(len(arr)), 'note': 'too few points'}
    lo = float(np.floor(arr.min() / bin_width) * bin_width)
    hi = float(np.ceil(arr.max() / bin_width) * bin_width)
    edges = np.arange(lo, hi + bin_width, bin_width)
    counts, _ = np.histogram(arr, bins=edges)
    centers = (edges[:-1] + edges[1:]) / 2.0
    N = counts.sum()
    p = counts / N if N else counts.astype(float)
    n_bins = len(counts)
    z = np.full(n_bins, np.nan)
    for i in range(1, n_bins - 1):
        p_lo = p[i - 1]
        p_hi = p[i + 1]
        exp_i = 0.5 * (counts[i - 1] + counts[i + 1])
        # Binomial variance of the bin plus half the pooled neighbor variance.
        var_i = (N * p[i] * (1 - p[i])
                 + 0.25 * N * (p_lo + p_hi) * (1 - p_lo - p_hi))
        if var_i <= 0:
            continue
        z[i] = (counts[i] - exp_i) / np.sqrt(var_i)
    transitions = []
    for i in range(1, len(z)):
        if np.isnan(z[i - 1]) or np.isnan(z[i]):
            continue
        ok = ((direction == 'neg_to_pos'
               and z[i - 1] < -Z_CRIT and z[i] > Z_CRIT)
              or (direction == 'pos_to_neg'
                  and z[i - 1] > Z_CRIT and z[i] < -Z_CRIT))
        if ok:
            transitions.append({
                'threshold_between': float((centers[i - 1] + centers[i]) / 2.0),
                'z_before': float(z[i - 1]),
                'z_after': float(z[i]),
            })
    best = (max(transitions,
                key=lambda t: abs(t['z_before']) + abs(t['z_after']))
            if transitions else None)
    return {
        'n': int(len(arr)),
        'bin_width': float(bin_width),
        'direction': direction,
        'n_transitions': len(transitions),
        'transitions': transitions,
        'threshold': (best['threshold_between'] if best else None),
    }


def fit_beta_mixture_em(x, K=2, max_iter=300, tol=1e-6, seed=42):
    """Fit a K-component Beta mixture on (0, 1) data via EM.

    Initialization is deterministic (quantile split), so `seed` is unused;
    it is retained for signature compatibility with the Script 20 original.
    Returns a dict of sorted (by mean) component parameters plus AIC/BIC.
    """
    x = np.clip(np.asarray(x, dtype=float), EPS, 1 - EPS)
    n = len(x)
    # Hard-assign initial responsibilities by quantile bucket.
    q = np.linspace(0, 1, K + 1)
    thresh = np.quantile(x, q[1:-1])
    labels = np.digitize(x, thresh)
    resp = np.zeros((n, K))
    resp[np.arange(n), labels] = 1.0
    ll_hist = []
    for it in range(max_iter):
        # M-step: moment-matched Beta parameters per component.
        nk = resp.sum(axis=0) + 1e-12
        weights = nk / nk.sum()
        mus = (resp * x[:, None]).sum(axis=0) / nk
        var_num = (resp * (x[:, None] - mus) ** 2).sum(axis=0)
        vars_ = var_num / nk
        # Keep variance inside the feasible Beta range (0, mu*(1-mu)).
        upper = mus * (1 - mus) - 1e-9
        vars_ = np.minimum(vars_, upper)
        vars_ = np.maximum(vars_, 1e-9)
        factor = np.maximum(mus * (1 - mus) / vars_ - 1, 1e-6)
        alphas = mus * factor
        betas = (1 - mus) * factor
        # E-step in log space (log-sum-exp for stability).
        log_pdfs = np.column_stack([
            stats.beta.logpdf(x, alphas[k], betas[k]) + np.log(weights[k])
            for k in range(K)
        ])
        m = log_pdfs.max(axis=1, keepdims=True)
        ll = (m.ravel() + np.log(np.exp(log_pdfs - m).sum(axis=1))).sum()
        ll_hist.append(float(ll))
        new_resp = np.exp(log_pdfs - m)
        new_resp = new_resp / new_resp.sum(axis=1, keepdims=True)
        if it > 0 and abs(ll_hist[-1] - ll_hist[-2]) < tol:
            resp = new_resp
            break
        resp = new_resp
    # Report components sorted by mean so component 0 is always the low mode.
    order = np.argsort(mus)
    alphas = alphas[order]
    betas = betas[order]
    weights = weights[order]
    mus = mus[order]
    k_params = 3 * K - 1  # K alphas + K betas + (K-1) free weights
    ll_final = ll_hist[-1]
    return {
        'K': K,
        'alphas': [float(a) for a in alphas],
        'betas': [float(b) for b in betas],
        'weights': [float(w) for w in weights],
        'mus': [float(mu) for mu in mus],  # renamed loop var: no shadow of `m`
        'log_likelihood': ll_final,
        'aic': float(2 * k_params - 2 * ll_final),
        'bic': float(k_params * np.log(n) - 2 * ll_final),
        'n_iter': it + 1,
    }


def beta_crossing(fit):
    """Crossing point of the two weighted Beta densities of a K=2 fit.

    Among all sign changes of the weighted-density difference, returns the
    crossing nearest the midpoint of the two component means (the decision
    boundary between modes), or None if no crossing exists.
    """
    if fit['K'] != 2:
        return None
    a1, b1, w1 = fit['alphas'][0], fit['betas'][0], fit['weights'][0]
    a2, b2, w2 = fit['alphas'][1], fit['betas'][1], fit['weights'][1]

    def diff(x):
        return (w2 * stats.beta.pdf(x, a2, b2)
                - w1 * stats.beta.pdf(x, a1, b1))

    xs = np.linspace(EPS, 1 - EPS, 2000)
    ys = diff(xs)
    changes = np.where(np.diff(np.sign(ys)) != 0)[0]
    if not len(changes):
        return None
    mid = 0.5 * (fit['mus'][0] + fit['mus'][1])
    crossings = []
    for i in changes:
        try:
            crossings.append(brentq(diff, xs[i], xs[i + 1]))
        except ValueError:
            continue
    if not crossings:
        return None
    return float(min(crossings, key=lambda c: abs(c - mid)))


def fit_logit_gmm(x, K=2, seed=42):
    """Robustness check: GMM on logit-transformed (0,1) data.

    For K=2, also reports the first density crossing mapped back to the
    original scale via the inverse logit.
    """
    x = np.clip(np.asarray(x, dtype=float), EPS, 1 - EPS)
    z = np.log(x / (1 - x)).reshape(-1, 1)
    gmm = GaussianMixture(n_components=K, random_state=seed,
                          max_iter=500).fit(z)
    order = np.argsort(gmm.means_.ravel())
    means = gmm.means_.ravel()[order]
    stds = np.sqrt(gmm.covariances_.ravel())[order]
    weights = gmm.weights_[order]
    crossing = None
    if K == 2:
        m1, s1, w1 = means[0], stds[0], weights[0]
        m2, s2, w2 = means[1], stds[1], weights[1]

        def diff(z0):
            return (w2 * stats.norm.pdf(z0, m2, s2)
                    - w1 * stats.norm.pdf(z0, m1, s1))

        zs = np.linspace(min(m1, m2) - 2, max(m1, m2) + 2, 2000)
        ys = diff(zs)
        ch = np.where(np.diff(np.sign(ys)) != 0)[0]
        if len(ch):
            try:
                z_cross = brentq(diff, zs[ch[0]], zs[ch[0] + 1])
                crossing = float(1 / (1 + np.exp(-z_cross)))
            except ValueError:
                pass
    return {
        'K': K,
        'means_logit': [float(m) for m in means],
        'stds_logit': [float(s) for s in stds],
        'weights': [float(w) for w in weights],
        'means_original_scale': [float(1 / (1 + np.exp(-m))) for m in means],
        'aic': float(gmm.aic(z)),
        'bic': float(gmm.bic(z)),
        'crossing_original': crossing,
    }


def method_beta_mixture(values, is_cosine=True):
    """Method 3: Beta mixture (K=2 and K=3) plus logit-GMM robustness check.

    dHash values are mapped onto (0, 1) by dividing by 64 (64-bit hash
    distance); crossings are mapped back to the original scale.
    """
    arr = np.asarray(values, dtype=float)
    arr = arr[np.isfinite(arr)]
    if len(arr) < 8:
        return {'n': int(len(arr)), 'note': 'too few points'}
    x = arr if is_cosine else arr / 64.0
    beta2 = fit_beta_mixture_em(x, K=2)
    beta3 = fit_beta_mixture_em(x, K=3)
    cross_beta2 = beta_crossing(beta2)
    if not is_cosine and cross_beta2 is not None:
        cross_beta2 = cross_beta2 * 64.0
    gmm2 = fit_logit_gmm(x, K=2)
    gmm3 = fit_logit_gmm(x, K=3)
    if not is_cosine and gmm2.get('crossing_original') is not None:
        gmm2['crossing_original'] = gmm2['crossing_original'] * 64.0
    return {
        'n': int(len(x)),
        'scale_transform': ('identity' if is_cosine else 'dhash/64'),
        'beta_2': beta2,
        'beta_3': beta3,
        'bic_preferred_K': (2 if beta2['bic'] < beta3['bic'] else 3),
        'beta_2_crossing_original': cross_beta2,
        'logit_gmm_2': gmm2,
        'logit_gmm_3': gmm3,
    }


def gmm_2d_marginal_crossing(cos, dh, dim):
    """2-comp 2D GMM, then marginal crossing on the requested dim (0=cos, 1=dh).

    Uses each component's marginal mean/std on `dim`; returns the weighted
    marginal-density crossing nearest the midpoint of the two means.
    """
    X = np.column_stack([cos, dh])
    if len(X) < 8:
        return None
    gmm = GaussianMixture(n_components=2, covariance_type='full',
                          random_state=42, n_init=15, max_iter=500).fit(X)
    means = gmm.means_
    covs = gmm.covariances_
    weights = gmm.weights_
    m1, m2 = means[0][dim], means[1][dim]
    s1 = np.sqrt(covs[0][dim, dim])
    s2 = np.sqrt(covs[1][dim, dim])
    w1, w2 = weights[0], weights[1]

    def diff(x):
        return (w2 * stats.norm.pdf(x, m2, s2)
                - w1 * stats.norm.pdf(x, m1, s1))

    xs = np.linspace(float(X[:, dim].min()), float(X[:, dim].max()), 2000)
    ys = diff(xs)
    ch = np.where(np.diff(np.sign(ys)) != 0)[0]
    if not len(ch):
        return None
    mid = 0.5 * (m1 + m2)
    crossings = []
    for i in ch:
        try:
            crossings.append(brentq(diff, xs[i], xs[i + 1]))
        except ValueError:
            continue
    if not crossings:
        return None
    return float(min(crossings, key=lambda c: abs(c - mid)))


def gmm_2d_3comp_summary(cos, dh):
    """K=3 2D GMM for completeness; report component means + weights."""
    X = np.column_stack([cos, dh])
    if len(X) < 12:
        return None
    gmm = GaussianMixture(n_components=3, covariance_type='full',
                          random_state=42, n_init=15, max_iter=500).fit(X)
    order = np.argsort(gmm.means_[:, 0])  # sort by cosine ascending
    return {
        'means': [[float(m[0]), float(m[1])] for m in gmm.means_[order]],
        'weights': [float(w) for w in gmm.weights_[order]],
        'bic': float(gmm.bic(X)),
        'aic': float(gmm.aic(X)),
    }


# ---------- Driver ----------


def run_three_method(cos, dh, label):
    """Run all three methods (plus 2D-GMM crossings) for one subset.

    `label` is unused here but retained for signature compatibility with
    callers that pass the subset name.
    """
    results = {}
    for desc, arr, bin_width, direction, is_cos in [
        ('cos_mean', cos, 0.002, 'neg_to_pos', True),
        ('dh_mean', dh, 0.2, 'pos_to_neg', False),
    ]:
        m1 = method_kde_antimode(arr)
        m2 = method_bd_mccrary(arr, bin_width, direction)
        m3 = method_beta_mixture(arr, is_cosine=is_cos)
        gmm2_marginal = gmm_2d_marginal_crossing(
            cos, dh, dim=(0 if desc == 'cos_mean' else 1))
        results[desc] = {
            'method_1_kde_antimode': m1,
            'method_2_bd_mccrary': m2,
            'method_3_beta_mixture': m3,
            'gmm_2d_2comp_marginal_crossing': gmm2_marginal,
        }
    results['gmm_2d_3comp'] = gmm_2d_3comp_summary(cos, dh)
    return results


def plot_panel(values, methods, title, out_path, bin_width=None):
    """Two-row panel: histogram + KDE with threshold lines, and a strip
    showing where each method's threshold falls.

    `methods` maps a short key to (threshold_or_None, label).
    """
    arr = np.asarray(values, dtype=float)
    fig, axes = plt.subplots(2, 1, figsize=(11, 7),
                             gridspec_kw={'height_ratios': [3, 1]})
    ax = axes[0]
    if bin_width is None:
        bins = 40
    else:
        lo = float(np.floor(arr.min() / bin_width) * bin_width)
        hi = float(np.ceil(arr.max() / bin_width) * bin_width)
        bins = np.arange(lo, hi + bin_width, bin_width)
    ax.hist(arr, bins=bins, density=True, alpha=0.55,
            color='steelblue', edgecolor='white')
    kde = stats.gaussian_kde(arr, bw_method='silverman')
    xs = np.linspace(arr.min(), arr.max(), 500)
    ax.plot(xs, kde(xs), 'g-', lw=1.5, label='KDE (Silverman)')
    colors = {'kde': 'green', 'bd': 'red', 'beta': 'purple',
              'gmm2': 'orange', 'baseline': 'black'}
    for key, (val, lbl) in methods.items():
        if val is None:
            continue
        ls = ':' if key == 'baseline' else '--'
        ax.axvline(val, color=colors.get(key, 'gray'), lw=2, ls=ls,
                   label=f'{lbl} = {val:.4f}')
    ax.set_xlabel(title)
    ax.set_ylabel('Density')
    ax.set_title(title)
    ax.legend(fontsize=8)
    ax2 = axes[1]
    ax2.set_title('Thresholds across methods')
    ax2.set_xlim(ax.get_xlim())
    for i, (key, (val, lbl)) in enumerate(methods.items()):
        if val is None:
            continue
        ax2.scatter([val], [i], color=colors.get(key, 'gray'),
                    s=100, zorder=3)
        ax2.annotate(f' {lbl}: {val:.4f}', (val, i), fontsize=8, va='center')
    ax2.set_yticks(range(len(methods)))
    ax2.set_yticklabels([m for m in methods.keys()])
    ax2.grid(alpha=0.3)
    plt.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)  # close this figure explicitly, not just the current one


def _methods_for_plot(measure_results, baseline):
    """Collect the per-method thresholds for one measure into the
    {key: (value, label)} shape that plot_panel expects."""
    return {
        'kde': (measure_results['method_1_kde_antimode'].get(
            'primary_antimode'), 'KDE antimode'),
        'bd': (measure_results['method_2_bd_mccrary'].get('threshold'),
               'BD/McCrary'),
        'beta': (measure_results['method_3_beta_mixture'].get(
            'beta_2_crossing_original'), 'Beta-2 crossing'),
        'gmm2': (measure_results['gmm_2d_2comp_marginal_crossing'],
                 '2D GMM 2-comp'),
        'baseline': (baseline, 'Paper A baseline'),
    }


def emit_panel(subset_label, results):
    """Return plot-ready threshold dicts per measure, or None if neither
    measure has enough data.

    NOTE(review): the previous version returned from inside its loop after
    the first measure and never produced a plot; it was also unused by
    main(). Fixed to return {measure: methods_dict} via the shared
    _methods_for_plot helper. `subset_label` kept for signature compat.
    """
    panels = {}
    for desc in ('cos_mean', 'dh_mean'):
        if 'note' in results[desc]['method_1_kde_antimode']:
            continue
        baseline = (PAPER_A_COS_BASELINE if desc == 'cos_mean'
                    else PAPER_A_DH_BASELINE)
        panels[desc] = _methods_for_plot(results[desc], baseline)
    return panels or None


def classify_verdict(results_by_subset):
    """Return ('A'|'B'|'C', explanation) per the module docstring's rules."""

    def well_separated(res, baseline_cos, baseline_dh):
        # A measure "passes" when the Beta-2 crossing is near the Paper A
        # baseline AND the dip test rejects unimodality at alpha=0.05.
        cos_cross = res['cos_mean']['method_3_beta_mixture'].get(
            'beta_2_crossing_original')
        dh_cross = res['dh_mean']['method_3_beta_mixture'].get(
            'beta_2_crossing_original')
        cos_dip_p = res['cos_mean']['method_1_kde_antimode'].get('dip_pvalue')
        dh_dip_p = res['dh_mean']['method_1_kde_antimode'].get('dip_pvalue')
        cos_ok = (cos_cross is not None
                  and abs(cos_cross - baseline_cos) <= 0.02
                  and cos_dip_p is not None and cos_dip_p <= 0.05)
        dh_ok = (dh_cross is not None
                 and abs(dh_cross - baseline_dh) <= 2.0
                 and dh_dip_p is not None and dh_dip_p <= 0.05)
        return cos_ok, dh_ok

    # A: pooled non-Firm-A subset works on its own.
    for subset in ('big4_non_A', 'all_non_A'):
        res = results_by_subset.get(subset)
        if not res:
            continue
        cos_ok, dh_ok = well_separated(res, PAPER_A_COS_BASELINE,
                                       PAPER_A_DH_BASELINE)
        if cos_ok and dh_ok:
            return 'A', (f"Subset '{subset}' shows bimodal cos+dh with "
                         f"crossings within tolerance of Paper A baselines.")
    # B: time-stratified rescues it?
    for subset_period in ('big4_non_A_pre_2018', 'big4_non_A_post_2020',
                          'all_non_A_pre_2018', 'all_non_A_post_2020'):
        res = results_by_subset.get(subset_period)
        if not res:
            continue
        cos_ok, dh_ok = well_separated(res, PAPER_A_COS_BASELINE,
                                       PAPER_A_DH_BASELINE)
        if cos_ok and dh_ok:
            return 'B', (f"Time-stratified subset '{subset_period}' recovers "
                         f"separable bimodality.")
    return 'C', ("Neither pooled nor time-stratified non-Firm-A calibration "
                 "produces a baseline-consistent bimodal threshold.")


def render_report(results_by_subset, sample_sizes, verdict):
    """Render the markdown report body as a single string."""
    md = [
        '# Non-Firm-A Calibration Spike (Script 32)',
        f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        '',
        '## Research Question',
        '',
        ('If we exclude Firm A (Deloitte) from calibration, can the '
         'three-method framework still recover a meaningful '
         'cosine / dHash threshold at the accountant level?'),
        '',
        '## Sample Sizes',
        '',
        '| Subset | N accountants (>=10 sigs) | N signatures |',
        '|--------|---------------------------|--------------|',
    ]
    for label, (n_acc, n_sig) in sample_sizes.items():
        md.append(f'| `{label}` | {n_acc} | {n_sig} |')
    md += ['', '## Paper A Baselines (for comparison)', '',
           f'- Accountant-level 2D GMM 2-comp marginal crossings: '
           f'cos = **{PAPER_A_COS_BASELINE}**, '
           f'dHash = **{PAPER_A_DH_BASELINE}**', '']
    for label, results in results_by_subset.items():
        md += [f'## Subset: `{label}`', '']
        for measure, baseline in [('cos_mean', PAPER_A_COS_BASELINE),
                                  ('dh_mean', PAPER_A_DH_BASELINE)]:
            r = results[measure]
            md += [f'### {measure}', '',
                   '| Method | Threshold | Supporting statistic |',
                   '|--------|-----------|----------------------|']
            kde = r['method_1_kde_antimode']
            if 'note' in kde:
                md.append(f'| Method 1: KDE+dip | n/a | {kde["note"]} |')
            else:
                tag = 'unimodal' if kde['unimodal_alpha05'] else 'multimodal'
                md.append(
                    f'| Method 1: KDE antimode (dip test) | '
                    f'{kde["primary_antimode"]} | '
                    f'dip={kde["dip"]:.4f}, p={kde["dip_pvalue"]:.4f} '
                    f'({tag}); n_modes={kde["n_modes"]} |')
            bd = r['method_2_bd_mccrary']
            md.append(
                f'| Method 2: BD/McCrary | {bd.get("threshold")} | '
                f'{bd.get("n_transitions", 0)} transition(s) |')
            beta = r['method_3_beta_mixture']
            if 'note' in beta:
                md.append(f'| Method 3: Beta mixture | n/a | {beta["note"]} |')
            else:
                md.append(
                    f'| Method 3: 2-comp Beta mixture | '
                    f'{beta["beta_2_crossing_original"]} | '
                    f'Beta-2 BIC={beta["beta_2"]["bic"]:.2f}, '
                    f'Beta-3 BIC={beta["beta_3"]["bic"]:.2f} '
                    f'(K*={beta["bic_preferred_K"]}) |')
                md.append(
                    f'| Method 3\': LogGMM-2 | '
                    f'{beta["logit_gmm_2"].get("crossing_original")} | '
                    f'logit-Gaussian robustness check |')
            md.append(
                f'| 2D GMM 2-comp marginal crossing | '
                f'{r["gmm_2d_2comp_marginal_crossing"]} | '
                f'paired with Paper A baseline = {baseline} |')
            md.append('')
        if results.get('gmm_2d_3comp'):
            g3 = results['gmm_2d_3comp']
            md += ['### 2D GMM K=3 components (for completeness)', '',
                   '| Component | mean cos | mean dh | weight |',
                   '|-----------|----------|---------|--------|']
            for i, (m, w) in enumerate(zip(g3['means'], g3['weights'])):
                md.append(f'| C{i + 1} | {m[0]:.4f} | {m[1]:.4f} | {w:.3f} |')
            md.append('')
            md.append(f'BIC(K=3 2D)={g3["bic"]:.2f}, AIC={g3["aic"]:.2f}')
            md.append('')
    md += ['## Verdict', '',
           f'**{verdict[0]}** — {verdict[1]}', '',
           '### Verdict legend',
           '- **A**: outside-Firm-A calibration is viable in pooled form '
           '(crossings within +-0.02 cos / +-2.0 dh of Paper A baselines '
           'AND dip-test multimodal at alpha=0.05).',
           '- **B**: time-stratified subset recovers separable bimodality.',
           '- **C**: neither rescue works; Firm A remains required as anchor.',
           '']
    return '\n'.join(md)


def main():
    print('=' * 72)
    print('Script 32: Non-Firm-A Calibration Spike')
    print('=' * 72)
    sample_sizes = {}
    results_by_subset = {}
    arrays_by_subset = {}

    # --- Pooled subsets ---
    for label in ('big4_non_A', 'all_non_A', 'firm_A'):
        cos, dh, n_acc, n_sig = load_subset(label)
        sample_sizes[label] = (n_acc, n_sig)
        arrays_by_subset[label] = (cos, dh)
        print(f'\n[{label}] N accountants={n_acc}, N sigs={n_sig}')
        results_by_subset[label] = run_three_method(cos, dh, label)
        for desc in ('cos_mean', 'dh_mean'):
            r = results_by_subset[label][desc]
            kde = r['method_1_kde_antimode']
            beta = r['method_3_beta_mixture']
            print(f'  {desc}: dip p={kde.get("dip_pvalue")} '
                  f'(n_modes={kde.get("n_modes")}); '
                  f'Beta-2 cross={beta.get("beta_2_crossing_original")}; '
                  f'2D-GMM marginal={r["gmm_2d_2comp_marginal_crossing"]}')

    # --- Time-stratified secondary (run unconditionally; verdict logic
    # decides whether it matters) ---
    for label in ('big4_non_A', 'all_non_A'):
        for period in ('pre_2018', 'post_2020'):
            cos, dh, n_acc, n_sig = load_subset_time_stratified(label, period)
            key = f'{label}_{period}'
            sample_sizes[key] = (n_acc, n_sig)
            arrays_by_subset[key] = (cos, dh)
            print(f'\n[{key}] N accountants={n_acc}, N sigs={n_sig}')
            if n_acc < 8:
                print('  (skipped: too few accountants for analysis)')
                continue
            results_by_subset[key] = run_three_method(cos, dh, key)
            for desc in ('cos_mean', 'dh_mean'):
                r = results_by_subset[key][desc]
                kde = r['method_1_kde_antimode']
                beta = r['method_3_beta_mixture']
                print(f'  {desc}: dip p={kde.get("dip_pvalue")} '
                      f'(n_modes={kde.get("n_modes")}); '
                      f'Beta-2 cross={beta.get("beta_2_crossing_original")}; '
                      f'2D-GMM marginal='
                      f'{r["gmm_2d_2comp_marginal_crossing"]}')

    # --- Plots ---
    for label, results in results_by_subset.items():
        cos, dh = arrays_by_subset[label]
        for desc, arr, bin_width, baseline in [
            ('cos_mean', cos, 0.002, PAPER_A_COS_BASELINE),
            ('dh_mean', dh, 0.2, PAPER_A_DH_BASELINE),
        ]:
            r = results[desc]
            if 'note' in r['method_1_kde_antimode']:
                continue
            methods_for_plot = _methods_for_plot(r, baseline)
            png = OUT / f'panel_{label}_{desc}.png'
            plot_panel(arr, methods_for_plot,
                       f'{label} -- accountant-level {desc}', png,
                       bin_width=bin_width)
            print(f'  plot: {png}')

    # --- Verdict ---
    verdict = classify_verdict(results_by_subset)
    print(f'\nVerdict: {verdict[0]} -- {verdict[1]}')

    # --- Persist ---
    payload = {
        'generated_at': datetime.now().isoformat(),
        'min_sigs_per_accountant': MIN_SIGS,
        'min_sigs_per_bucket_time_stratified': MIN_SIGS_PER_BUCKET,
        'paper_a_baselines': {
            'cos': PAPER_A_COS_BASELINE,
            'dh': PAPER_A_DH_BASELINE,
        },
        'sample_sizes': {k: {'n_accountants': v[0], 'n_signatures': v[1]}
                         for k, v in sample_sizes.items()},
        'results': results_by_subset,
        'verdict': {'class': verdict[0], 'explanation': verdict[1]},
    }
    (OUT / 'non_firm_a_calibration_results.json').write_text(
        json.dumps(payload, indent=2, ensure_ascii=False), encoding='utf-8')
    print(f'\nJSON: {OUT / "non_firm_a_calibration_results.json"}')
    md = render_report(results_by_subset, sample_sizes, verdict)
    (OUT / 'non_firm_a_calibration_report.md').write_text(md,
                                                          encoding='utf-8')
    print(f'Report: {OUT / "non_firm_a_calibration_report.md"}')


if __name__ == '__main__':
    main()