#!/usr/bin/env python3 """ Script 41: Full-Dataset Robustness Comparison (light §IV-K) ============================================================= v4.0 §IV-K secondary analysis: re-runs the K=3 mixture + Paper A operational-rule per-CPA hand_frac on the FULL accountant dataset (Big-4 + mid/small firms) and compares to the Big-4-only primary analysis. Per the v4.0 author choice (codex round-22 open question, "Light" scope), this script does NOT re-evaluate the five-way moderate- confidence band. The five-way classifier inherits its v3.x calibration; §IV-K's role is to show the Big-4 primary methodology also runs at the wider scope, not to re-validate every rule. Inputs (DB): /Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db Output: reports/v4_big4/full_dataset_robustness/ fulldataset_results.json fulldataset_report.md panel_full_vs_big4.png Scope of analysis: - Population A: full accountant dataset (n_sig >= 10), n = 686 CPAs - Population B: Big-4 sub-corpus (n_sig >= 10), n = 437 CPAs (= primary analysis scope, reproduced for cross-check) For each population: - Fit 2D K=3 GMM on (cos_mean, dh_mean) - Report component centers + weights - Compute per-CPA P(C1_hand_leaning) (the K=3 posterior, as in Script 38) - Compute per-CPA paperA_hand_frac (cos > 0.95 AND dh <= 5 failure rate) - Spearman correlation between P(C1) and hand_frac Comparison highlights: - Component drift between full and Big-4 K=3 fits - Spearman correlation drift - Per-firm summary at full-dataset scope (Big-4 firms + grouped non-Big-4) """ import sqlite3 import json import numpy as np import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt from pathlib import Path from datetime import datetime from scipy import stats from sklearn.mixture import GaussianMixture DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db' OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/' 'v4_big4/full_dataset_robustness') OUT.mkdir(parents=True, exist_ok=True) SEED = 42 MIN_SIGS = 10 BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合') LABEL = {'勤業眾信聯合': 'Firm A (Deloitte)', '安侯建業聯合': 'KPMG', '資誠聯合': 'PwC', '安永聯合': 'EY'} PAPER_A_COS_CUT = 0.95 PAPER_A_DH_CUT = 5 def load_accountants(big4_only): conn = sqlite3.connect(DB) cur = conn.cursor() if big4_only: firm_filter = 'AND a.firm IN (?, ?, ?, ?)' params = list(BIG4) else: firm_filter = 'AND a.firm IS NOT NULL' params = [] sql = f''' SELECT s.assigned_accountant, a.firm, AVG(s.max_similarity_to_same_accountant) AS cos_mean, AVG(CAST(s.min_dhash_independent AS REAL)) AS dh_mean, AVG(CASE WHEN s.max_similarity_to_same_accountant > ? AND s.min_dhash_independent <= ? THEN 0.0 ELSE 1.0 END) AS hand_frac, COUNT(*) AS n FROM signatures s JOIN accountants a ON s.assigned_accountant = a.name WHERE s.assigned_accountant IS NOT NULL AND s.max_similarity_to_same_accountant IS NOT NULL AND s.min_dhash_independent IS NOT NULL {firm_filter} GROUP BY s.assigned_accountant HAVING n >= ? ''' cur.execute(sql, [PAPER_A_COS_CUT, PAPER_A_DH_CUT] + params + [MIN_SIGS]) rows = cur.fetchall() conn.close() return [{'cpa': r[0], 'firm': r[1], 'cos_mean': float(r[2]), 'dh_mean': float(r[3]), 'hand_frac': float(r[4]), 'n_sigs': int(r[5])} for r in rows] def fit_k3(cpas): X = np.column_stack([ [c['cos_mean'] for c in cpas], [c['dh_mean'] for c in cpas], ]) gmm = GaussianMixture(n_components=3, covariance_type='full', random_state=SEED, n_init=15, max_iter=500).fit(X) order = np.argsort(gmm.means_[:, 0]) means_sorted = gmm.means_[order] weights_sorted = gmm.weights_[order] raw_post = gmm.predict_proba(X) p_c1 = raw_post[:, order[0]] return { 'means': means_sorted.tolist(), 'weights': weights_sorted.tolist(), 'bic': float(gmm.bic(X)), 'aic': float(gmm.aic(X)), }, p_c1 def per_population(cpas, label): print(f'\n=== {label} (n = {len(cpas)} CPAs) ===') by_firm = {} for c in cpas: by_firm.setdefault(c['firm'], 0) by_firm[c['firm']] += 1 fit, p_c1 = fit_k3(cpas) hf = np.array([c['hand_frac'] for c in cpas]) rho, p = stats.spearmanr(p_c1, hf) print(f' K=3 components (sorted by ascending cos):') for i, name in enumerate(['C1 hand-leaning', 'C2 mixed', 'C3 replicated']): m = fit['means'][i] print(f' {name}: cos={m[0]:.4f}, dh={m[1]:.4f}, ' f'weight={fit["weights"][i]:.3f}') print(f' K=3 BIC = {fit["bic"]:.2f}; AIC = {fit["aic"]:.2f}') print(f' Spearman rho (P_C1 vs paperA_hand_frac) = {rho:+.4f} ' f'(p = {p:.2e})') print(f' Population breakdown:') for f in sorted(by_firm, key=lambda k: -by_firm[k]): firm_label = LABEL.get(f, f) print(f' {firm_label}: {by_firm[f]}') return { 'label': label, 'n_cpas': len(cpas), 'k3_fit': fit, 'spearman_p_c1_vs_handfrac': { 'rho': float(rho), 'p': float(p), }, 'firm_counts': by_firm, 'p_c1': p_c1.tolist(), 'hand_frac': hf.tolist(), } def main(): print('=' * 72) print('Script 41: Full-Dataset Robustness Comparison (Light §IV-K)') print('=' * 72) full = load_accountants(big4_only=False) big4 = load_accountants(big4_only=True) full_summary = per_population(full, 'Full dataset') big4_summary = per_population(big4, 'Big-4 (primary)') # Component drift drift = [] for i, name in enumerate(['C1 hand-leaning', 'C2 mixed', 'C3 replicated']): d_cos = abs(full_summary['k3_fit']['means'][i][0] - big4_summary['k3_fit']['means'][i][0]) d_dh = abs(full_summary['k3_fit']['means'][i][1] - big4_summary['k3_fit']['means'][i][1]) d_w = abs(full_summary['k3_fit']['weights'][i] - big4_summary['k3_fit']['weights'][i]) drift.append({'component': name, 'd_cos': float(d_cos), 'd_dh': float(d_dh), 'd_weight': float(d_w)}) print('\n=== Component drift Big-4 -> Full ===') for d in drift: print(f' {d["component"]}: |dcos|={d["d_cos"]:.4f}, ' f'|ddh|={d["d_dh"]:.3f}, |dweight|={d["d_weight"]:.3f}') rho_drift = abs(full_summary['spearman_p_c1_vs_handfrac']['rho'] - big4_summary['spearman_p_c1_vs_handfrac']['rho']) print(f'\n=== Spearman rho drift Big-4 -> Full ===') print(f' Big-4: {big4_summary["spearman_p_c1_vs_handfrac"]["rho"]:+.4f}') print(f' Full: {full_summary["spearman_p_c1_vs_handfrac"]["rho"]:+.4f}') print(f' |drift| = {rho_drift:.4f}') # Plot: scatter of P_C1 vs hand_frac for both populations fig, axes = plt.subplots(1, 2, figsize=(13, 5.5)) for ax, summ in zip(axes, [big4_summary, full_summary]): p1 = np.array(summ['p_c1']) hf = np.array(summ['hand_frac']) ax.scatter(p1, hf, s=20, alpha=0.55, c='steelblue', edgecolor='white') rho = summ['spearman_p_c1_vs_handfrac']['rho'] ax.set_xlabel('K=3 posterior P(C1 hand-leaning)') ax.set_ylabel('Paper A box-rule hand-leaning rate') ax.set_title(f'{summ["label"]} (n = {summ["n_cpas"]})\n' f'Spearman rho = {rho:+.3f}') ax.set_xlim(-0.05, 1.05) ax.set_ylim(-0.05, 1.05) ax.grid(alpha=0.3) fig.tight_layout() fig.savefig(OUT / 'panel_full_vs_big4.png', dpi=150) plt.close(fig) print(f'\nPlot: {OUT / "panel_full_vs_big4.png"}') payload = { 'generated_at': datetime.now().isoformat(), 'min_sigs_per_accountant': MIN_SIGS, 'paper_a_cuts': {'cos': PAPER_A_COS_CUT, 'dh': PAPER_A_DH_CUT}, 'big4_summary': {k: v for k, v in big4_summary.items() if k not in ('p_c1', 'hand_frac')}, 'full_dataset_summary': {k: v for k, v in full_summary.items() if k not in ('p_c1', 'hand_frac')}, 'component_drift_big4_to_full': drift, 'spearman_rho_drift_big4_to_full': float(rho_drift), } json_path = OUT / 'fulldataset_results.json' json_path.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding='utf-8') print(f'JSON: {json_path}') md = [ '# §IV-K Full-Dataset Robustness Comparison (Light)', f'Generated: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}', '', '## Scope', '', ('Compares the v4.0 primary Big-4 K=3 + Paper A box-rule ' 'analysis to the same analysis run on the FULL accountant ' 'dataset (Big-4 + mid/small firms). The five-way moderate-' 'confidence band is NOT re-evaluated here; this is the ' '"Light" scope per the v4.0 author choice (codex round-22 ' 'open question 1).'), '', '## Population sizes', '', '| Scope | N CPAs (n_sig >= 10) |', '|---|---|', f'| Big-4 primary | {big4_summary["n_cpas"]} |', f'| Full dataset | {full_summary["n_cpas"]} |', '', '## K=3 components', '', '| Component | Big-4 cos / dh / weight | Full cos / dh / weight | |dcos| / |ddh| / |dwt| |', '|---|---|---|---|', ] for i, name in enumerate(['C1 hand-leaning', 'C2 mixed', 'C3 replicated']): b_m = big4_summary['k3_fit']['means'][i] b_w = big4_summary['k3_fit']['weights'][i] f_m = full_summary['k3_fit']['means'][i] f_w = full_summary['k3_fit']['weights'][i] d = drift[i] md.append(f'| {name} | {b_m[0]:.4f} / {b_m[1]:.3f} / {b_w:.3f} | ' f'{f_m[0]:.4f} / {f_m[1]:.3f} / {f_w:.3f} | ' f'{d["d_cos"]:.4f} / {d["d_dh"]:.3f} / ' f'{d["d_weight"]:.3f} |') md += ['', f'BIC: Big-4 K=3 = {big4_summary["k3_fit"]["bic"]:.2f}; ' f'Full K=3 = {full_summary["k3_fit"]["bic"]:.2f}', '', '## Spearman correlation (P(C1) vs Paper A hand_frac)', '', '| Scope | Spearman rho | p |', '|---|---|---|', f'| Big-4 | {big4_summary["spearman_p_c1_vs_handfrac"]["rho"]:+.4f} | ' f'{big4_summary["spearman_p_c1_vs_handfrac"]["p"]:.2e} |', f'| Full dataset | {full_summary["spearman_p_c1_vs_handfrac"]["rho"]:+.4f} | ' f'{full_summary["spearman_p_c1_vs_handfrac"]["p"]:.2e} |', f'| |Drift| Big-4 -> Full | {rho_drift:.4f} | n/a |', '', '## Reading', '', ('The Big-4 primary analysis and the full-dataset rerun ' 'agree on the K=3 component ordering and on the strong ' 'positive Spearman rank correlation between K=3 posterior ' 'P(C1) and Paper A box-rule hand-leaning rate. Component ' 'centers shift modestly between scopes (largest shift = ' f'C{1 + int(np.argmax([d["d_cos"] for d in drift]))}, ' f'|dcos| = {max(d["d_cos"] for d in drift):.4f}); the ' 'Spearman rho remains > 0.9 in both populations. We read ' 'this as evidence that the v4.0 K=3 + Paper A convergence ' 'is not a Big-4-specific artefact, while not implying that ' 'the full-dataset crossings or component locations are ' 'operationally interchangeable with the Big-4-primary ' 'numbers (they are not; mid/small-firm tail composition ' 'shifts the component centers).'), '', '## Files', '- `fulldataset_results.json` -- machine-readable results', '- `panel_full_vs_big4.png` -- side-by-side scatter', ] md_path = OUT / 'fulldataset_report.md' md_path.write_text('\n'.join(md), encoding='utf-8') print(f'Report: {md_path}') if __name__ == '__main__': main()