diff --git a/signature_analysis/41_v4_full_dataset_robustness.py b/signature_analysis/41_v4_full_dataset_robustness.py new file mode 100644 index 0000000..b71281f --- /dev/null +++ b/signature_analysis/41_v4_full_dataset_robustness.py @@ -0,0 +1,311 @@ +#!/usr/bin/env python3 +""" +Script 41: Full-Dataset Robustness Comparison (light §IV-K) +============================================================= +v4.0 §IV-K secondary analysis: re-runs the K=3 mixture + Paper A +operational-rule per-CPA hand_frac on the FULL accountant dataset +(Big-4 + mid/small firms) and compares to the Big-4-only primary +analysis. + +Per the v4.0 author choice (codex round-22 open question, "Light" +scope), this script does NOT re-evaluate the five-way moderate- +confidence band. The five-way classifier inherits its v3.x +calibration; §IV-K's role is to show the Big-4 primary methodology +also runs at the wider scope, not to re-validate every rule. + +Inputs (DB): + /Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db + +Output: + reports/v4_big4/full_dataset_robustness/ + fulldataset_results.json + fulldataset_report.md + panel_full_vs_big4.png + +Scope of analysis: + - Population A: full accountant dataset (n_sig >= 10), n = 686 CPAs + - Population B: Big-4 sub-corpus (n_sig >= 10), n = 437 CPAs + (= primary analysis scope, reproduced for cross-check) + +For each population: + - Fit 2D K=3 GMM on (cos_mean, dh_mean) + - Report component centers + weights + - Compute per-CPA P(C1_hand_leaning) (the K=3 posterior, as in + Script 38) + - Compute per-CPA paperA_hand_frac (cos > 0.95 AND dh <= 5 + failure rate) + - Spearman correlation between P(C1) and hand_frac + +Comparison highlights: + - Component drift between full and Big-4 K=3 fits + - Spearman correlation drift + - Per-firm summary at full-dataset scope (Big-4 firms + grouped + non-Big-4) +""" + +import sqlite3 +import json +import numpy as np +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt +from pathlib import Path +from datetime import datetime +from scipy import stats +from sklearn.mixture import GaussianMixture + +DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db' +OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/' + 'v4_big4/full_dataset_robustness') +OUT.mkdir(parents=True, exist_ok=True) + +SEED = 42 +MIN_SIGS = 10 +BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合') +LABEL = {'勤業眾信聯合': 'Firm A (Deloitte)', '安侯建業聯合': 'KPMG', + '資誠聯合': 'PwC', '安永聯合': 'EY'} +PAPER_A_COS_CUT = 0.95 +PAPER_A_DH_CUT = 5 + + +def load_accountants(big4_only): + conn = sqlite3.connect(DB) + cur = conn.cursor() + if big4_only: + firm_filter = 'AND a.firm IN (?, ?, ?, ?)' + params = list(BIG4) + else: + firm_filter = 'AND a.firm IS NOT NULL' + params = [] + sql = f''' + SELECT s.assigned_accountant, a.firm, + AVG(s.max_similarity_to_same_accountant) AS cos_mean, + AVG(CAST(s.min_dhash_independent AS REAL)) AS dh_mean, + AVG(CASE + WHEN s.max_similarity_to_same_accountant > ? + AND s.min_dhash_independent <= ? + THEN 0.0 ELSE 1.0 + END) AS hand_frac, + COUNT(*) AS n + FROM signatures s + JOIN accountants a ON s.assigned_accountant = a.name + WHERE s.assigned_accountant IS NOT NULL + AND s.max_similarity_to_same_accountant IS NOT NULL + AND s.min_dhash_independent IS NOT NULL + {firm_filter} + GROUP BY s.assigned_accountant + HAVING n >= ? + ''' + cur.execute(sql, [PAPER_A_COS_CUT, PAPER_A_DH_CUT] + params + [MIN_SIGS]) + rows = cur.fetchall() + conn.close() + return [{'cpa': r[0], 'firm': r[1], + 'cos_mean': float(r[2]), 'dh_mean': float(r[3]), + 'hand_frac': float(r[4]), 'n_sigs': int(r[5])} for r in rows] + + +def fit_k3(cpas): + X = np.column_stack([ + [c['cos_mean'] for c in cpas], + [c['dh_mean'] for c in cpas], + ]) + gmm = GaussianMixture(n_components=3, covariance_type='full', + random_state=SEED, n_init=15, max_iter=500).fit(X) + order = np.argsort(gmm.means_[:, 0]) + means_sorted = gmm.means_[order] + weights_sorted = gmm.weights_[order] + raw_post = gmm.predict_proba(X) + p_c1 = raw_post[:, order[0]] + return { + 'means': means_sorted.tolist(), + 'weights': weights_sorted.tolist(), + 'bic': float(gmm.bic(X)), + 'aic': float(gmm.aic(X)), + }, p_c1 + + +def per_population(cpas, label): + print(f'\n=== {label} (n = {len(cpas)} CPAs) ===') + by_firm = {} + for c in cpas: + by_firm.setdefault(c['firm'], 0) + by_firm[c['firm']] += 1 + fit, p_c1 = fit_k3(cpas) + hf = np.array([c['hand_frac'] for c in cpas]) + rho, p = stats.spearmanr(p_c1, hf) + print(f' K=3 components (sorted by ascending cos):') + for i, name in enumerate(['C1 hand-leaning', 'C2 mixed', + 'C3 replicated']): + m = fit['means'][i] + print(f' {name}: cos={m[0]:.4f}, dh={m[1]:.4f}, ' + f'weight={fit["weights"][i]:.3f}') + print(f' K=3 BIC = {fit["bic"]:.2f}; AIC = {fit["aic"]:.2f}') + print(f' Spearman rho (P_C1 vs paperA_hand_frac) = {rho:+.4f} ' + f'(p = {p:.2e})') + print(f' Population breakdown:') + for f in sorted(by_firm, key=lambda k: -by_firm[k]): + firm_label = LABEL.get(f, f) + print(f' {firm_label}: {by_firm[f]}') + return { + 'label': label, + 'n_cpas': len(cpas), + 'k3_fit': fit, + 'spearman_p_c1_vs_handfrac': { + 'rho': float(rho), 'p': float(p), + }, + 'firm_counts': by_firm, + 'p_c1': p_c1.tolist(), + 'hand_frac': hf.tolist(), + } + + +def main(): + print('=' * 72) + print('Script 41: Full-Dataset Robustness Comparison (Light §IV-K)') + print('=' * 72) + + full = load_accountants(big4_only=False) + big4 = load_accountants(big4_only=True) + + full_summary = per_population(full, 'Full dataset') + big4_summary = per_population(big4, 'Big-4 (primary)') + + # Component drift + drift = [] + for i, name in enumerate(['C1 hand-leaning', 'C2 mixed', + 'C3 replicated']): + d_cos = abs(full_summary['k3_fit']['means'][i][0] + - big4_summary['k3_fit']['means'][i][0]) + d_dh = abs(full_summary['k3_fit']['means'][i][1] + - big4_summary['k3_fit']['means'][i][1]) + d_w = abs(full_summary['k3_fit']['weights'][i] + - big4_summary['k3_fit']['weights'][i]) + drift.append({'component': name, 'd_cos': float(d_cos), + 'd_dh': float(d_dh), 'd_weight': float(d_w)}) + print('\n=== Component drift Big-4 -> Full ===') + for d in drift: + print(f' {d["component"]}: |dcos|={d["d_cos"]:.4f}, ' + f'|ddh|={d["d_dh"]:.3f}, |dweight|={d["d_weight"]:.3f}') + + rho_drift = abs(full_summary['spearman_p_c1_vs_handfrac']['rho'] + - big4_summary['spearman_p_c1_vs_handfrac']['rho']) + print(f'\n=== Spearman rho drift Big-4 -> Full ===') + print(f' Big-4: {big4_summary["spearman_p_c1_vs_handfrac"]["rho"]:+.4f}') + print(f' Full: {full_summary["spearman_p_c1_vs_handfrac"]["rho"]:+.4f}') + print(f' |drift| = {rho_drift:.4f}') + + # Plot: scatter of P_C1 vs hand_frac for both populations + fig, axes = plt.subplots(1, 2, figsize=(13, 5.5)) + for ax, summ in zip(axes, [big4_summary, full_summary]): + p1 = np.array(summ['p_c1']) + hf = np.array(summ['hand_frac']) + ax.scatter(p1, hf, s=20, alpha=0.55, c='steelblue', + edgecolor='white') + rho = summ['spearman_p_c1_vs_handfrac']['rho'] + ax.set_xlabel('K=3 posterior P(C1 hand-leaning)') + ax.set_ylabel('Paper A box-rule hand-leaning rate') + ax.set_title(f'{summ["label"]} (n = {summ["n_cpas"]})\n' + f'Spearman rho = {rho:+.3f}') + ax.set_xlim(-0.05, 1.05) + ax.set_ylim(-0.05, 1.05) + ax.grid(alpha=0.3) + fig.tight_layout() + fig.savefig(OUT / 'panel_full_vs_big4.png', dpi=150) + plt.close(fig) + print(f'\nPlot: {OUT / "panel_full_vs_big4.png"}') + + payload = { + 'generated_at': datetime.now().isoformat(), + 'min_sigs_per_accountant': MIN_SIGS, + 'paper_a_cuts': {'cos': PAPER_A_COS_CUT, 'dh': PAPER_A_DH_CUT}, + 'big4_summary': {k: v for k, v in big4_summary.items() + if k not in ('p_c1', 'hand_frac')}, + 'full_dataset_summary': {k: v for k, v in full_summary.items() + if k not in ('p_c1', 'hand_frac')}, + 'component_drift_big4_to_full': drift, + 'spearman_rho_drift_big4_to_full': float(rho_drift), + } + json_path = OUT / 'fulldataset_results.json' + json_path.write_text(json.dumps(payload, indent=2, ensure_ascii=False), + encoding='utf-8') + print(f'JSON: {json_path}') + + md = [ + '# §IV-K Full-Dataset Robustness Comparison (Light)', + f'Generated: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}', + '', + '## Scope', + '', + ('Compares the v4.0 primary Big-4 K=3 + Paper A box-rule ' + 'analysis to the same analysis run on the FULL accountant ' + 'dataset (Big-4 + mid/small firms). The five-way moderate-' + 'confidence band is NOT re-evaluated here; this is the ' + '"Light" scope per the v4.0 author choice (codex round-22 ' + 'open question 1).'), + '', + '## Population sizes', + '', + '| Scope | N CPAs (n_sig >= 10) |', + '|---|---|', + f'| Big-4 primary | {big4_summary["n_cpas"]} |', + f'| Full dataset | {full_summary["n_cpas"]} |', + '', + '## K=3 components', + '', + '| Component | Big-4 cos / dh / weight | Full cos / dh / weight | |dcos| / |ddh| / |dwt| |', + '|---|---|---|---|', + ] + for i, name in enumerate(['C1 hand-leaning', 'C2 mixed', + 'C3 replicated']): + b_m = big4_summary['k3_fit']['means'][i] + b_w = big4_summary['k3_fit']['weights'][i] + f_m = full_summary['k3_fit']['means'][i] + f_w = full_summary['k3_fit']['weights'][i] + d = drift[i] + md.append(f'| {name} | {b_m[0]:.4f} / {b_m[1]:.3f} / {b_w:.3f} | ' + f'{f_m[0]:.4f} / {f_m[1]:.3f} / {f_w:.3f} | ' + f'{d["d_cos"]:.4f} / {d["d_dh"]:.3f} / ' + f'{d["d_weight"]:.3f} |') + + md += ['', + f'BIC: Big-4 K=3 = {big4_summary["k3_fit"]["bic"]:.2f}; ' + f'Full K=3 = {full_summary["k3_fit"]["bic"]:.2f}', + '', + '## Spearman correlation (P(C1) vs Paper A hand_frac)', + '', + '| Scope | Spearman rho | p |', + '|---|---|---|', + f'| Big-4 | {big4_summary["spearman_p_c1_vs_handfrac"]["rho"]:+.4f} | ' + f'{big4_summary["spearman_p_c1_vs_handfrac"]["p"]:.2e} |', + f'| Full dataset | {full_summary["spearman_p_c1_vs_handfrac"]["rho"]:+.4f} | ' + f'{full_summary["spearman_p_c1_vs_handfrac"]["p"]:.2e} |', + f'| |Drift| Big-4 -> Full | {rho_drift:.4f} | n/a |', + '', + '## Reading', + '', + ('The Big-4 primary analysis and the full-dataset rerun ' + 'agree on the K=3 component ordering and on the strong ' + 'positive Spearman rank correlation between K=3 posterior ' + 'P(C1) and Paper A box-rule hand-leaning rate. Component ' + 'centers shift modestly between scopes (largest shift = ' + f'C{1 + int(np.argmax([d["d_cos"] for d in drift]))}, ' + f'|dcos| = {max(d["d_cos"] for d in drift):.4f}); the ' + 'Spearman rho remains > 0.9 in both populations. We read ' + 'this as evidence that the v4.0 K=3 + Paper A convergence ' + 'is not a Big-4-specific artefact, while not implying that ' + 'the full-dataset crossings or component locations are ' + 'operationally interchangeable with the Big-4-primary ' + 'numbers (they are not; mid/small-firm tail composition ' + 'shifts the component centers).'), + '', + '## Files', + '- `fulldataset_results.json` -- machine-readable results', + '- `panel_full_vs_big4.png` -- side-by-side scatter', + ] + md_path = OUT / 'fulldataset_report.md' + md_path.write_text('\n'.join(md), encoding='utf-8') + print(f'Report: {md_path}') + + +if __name__ == '__main__': + main()