9392f30aef
Light §IV-K secondary analysis per v4.0 author choice (codex
round-22 open question 1). Reruns the K=3 mixture + Paper A
operational-rule per-CPA hand_frac on the full accountant dataset
(n = 686) and compares to the Big-4 primary scope (n = 437).
Results:
Component drift Big-4 -> Full:
C1 hand-leaning |dcos| = 0.018, |ddh| = 2.0, |dwt| = 0.14
C2 mixed |dcos| = 0.002, |ddh| = 0.3, |dwt| = 0.02
C3 replicated |dcos| = 0.000, |ddh| = 0.0, |dwt| = 0.12
Spearman rho (P_C1 vs paperA_hand_frac):
Big-4: +0.9627
Full dataset: +0.9558
|drift| = 0.0069
Reading: K=3 component ordering and Spearman convergence are
preserved at full scope, supporting the v4.0 reproducibility
claim. Component locations and weights shift modestly because
mid/small-firm composition broadens C1 (hand-leaning) and reduces
C3 weight; this is expected since mid/small firms include
hand-leaning CPAs that the Big-4-primary scope deliberately
excludes. Crossings and component locations are NOT operationally
interchangeable between scopes; §IV-K reports them only as a
robustness cross-check.
The five-way moderate-confidence band is NOT re-evaluated here
(Light scope); §IV-J flags it as inherited from v3.x calibration
without v4-specific recalibration.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
312 lines
12 KiB
Python
312 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Script 41: Full-Dataset Robustness Comparison (light §IV-K)
|
|
=============================================================
|
|
v4.0 §IV-K secondary analysis: re-runs the K=3 mixture + Paper A
|
|
operational-rule per-CPA hand_frac on the FULL accountant dataset
|
|
(Big-4 + mid/small firms) and compares to the Big-4-only primary
|
|
analysis.
|
|
|
|
Per the v4.0 author choice (codex round-22 open question, "Light"
|
|
scope), this script does NOT re-evaluate the five-way moderate-
|
|
confidence band. The five-way classifier inherits its v3.x
|
|
calibration; §IV-K's role is to show the Big-4 primary methodology
|
|
also runs at the wider scope, not to re-validate every rule.
|
|
|
|
Inputs (DB):
|
|
/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db
|
|
|
|
Output:
|
|
reports/v4_big4/full_dataset_robustness/
|
|
fulldataset_results.json
|
|
fulldataset_report.md
|
|
panel_full_vs_big4.png
|
|
|
|
Scope of analysis:
|
|
- Population A: full accountant dataset (n_sig >= 10), n = 686 CPAs
|
|
- Population B: Big-4 sub-corpus (n_sig >= 10), n = 437 CPAs
|
|
(= primary analysis scope, reproduced for cross-check)
|
|
|
|
For each population:
|
|
- Fit 2D K=3 GMM on (cos_mean, dh_mean)
|
|
- Report component centers + weights
|
|
- Compute per-CPA P(C1_hand_leaning) (the K=3 posterior, as in
|
|
Script 38)
|
|
- Compute per-CPA paperA_hand_frac (cos > 0.95 AND dh <= 5
|
|
failure rate)
|
|
- Spearman correlation between P(C1) and hand_frac
|
|
|
|
Comparison highlights:
|
|
- Component drift between full and Big-4 K=3 fits
|
|
- Spearman correlation drift
|
|
- Per-firm summary at full-dataset scope (Big-4 firms + grouped
|
|
non-Big-4)
|
|
"""
|
|
|
|
import sqlite3
|
|
import json
|
|
import numpy as np
|
|
import matplotlib
|
|
matplotlib.use('Agg')
|
|
import matplotlib.pyplot as plt
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
from scipy import stats
|
|
from sklearn.mixture import GaussianMixture
|
|
|
|
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
|
|
OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/'
|
|
'v4_big4/full_dataset_robustness')
|
|
OUT.mkdir(parents=True, exist_ok=True)
|
|
|
|
SEED = 42
|
|
MIN_SIGS = 10
|
|
BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合')
|
|
LABEL = {'勤業眾信聯合': 'Firm A (Deloitte)', '安侯建業聯合': 'KPMG',
|
|
'資誠聯合': 'PwC', '安永聯合': 'EY'}
|
|
PAPER_A_COS_CUT = 0.95
|
|
PAPER_A_DH_CUT = 5
|
|
|
|
|
|
def load_accountants(big4_only):
|
|
conn = sqlite3.connect(DB)
|
|
cur = conn.cursor()
|
|
if big4_only:
|
|
firm_filter = 'AND a.firm IN (?, ?, ?, ?)'
|
|
params = list(BIG4)
|
|
else:
|
|
firm_filter = 'AND a.firm IS NOT NULL'
|
|
params = []
|
|
sql = f'''
|
|
SELECT s.assigned_accountant, a.firm,
|
|
AVG(s.max_similarity_to_same_accountant) AS cos_mean,
|
|
AVG(CAST(s.min_dhash_independent AS REAL)) AS dh_mean,
|
|
AVG(CASE
|
|
WHEN s.max_similarity_to_same_accountant > ?
|
|
AND s.min_dhash_independent <= ?
|
|
THEN 0.0 ELSE 1.0
|
|
END) AS hand_frac,
|
|
COUNT(*) AS n
|
|
FROM signatures s
|
|
JOIN accountants a ON s.assigned_accountant = a.name
|
|
WHERE s.assigned_accountant IS NOT NULL
|
|
AND s.max_similarity_to_same_accountant IS NOT NULL
|
|
AND s.min_dhash_independent IS NOT NULL
|
|
{firm_filter}
|
|
GROUP BY s.assigned_accountant
|
|
HAVING n >= ?
|
|
'''
|
|
cur.execute(sql, [PAPER_A_COS_CUT, PAPER_A_DH_CUT] + params + [MIN_SIGS])
|
|
rows = cur.fetchall()
|
|
conn.close()
|
|
return [{'cpa': r[0], 'firm': r[1],
|
|
'cos_mean': float(r[2]), 'dh_mean': float(r[3]),
|
|
'hand_frac': float(r[4]), 'n_sigs': int(r[5])} for r in rows]
|
|
|
|
|
|
def fit_k3(cpas):
|
|
X = np.column_stack([
|
|
[c['cos_mean'] for c in cpas],
|
|
[c['dh_mean'] for c in cpas],
|
|
])
|
|
gmm = GaussianMixture(n_components=3, covariance_type='full',
|
|
random_state=SEED, n_init=15, max_iter=500).fit(X)
|
|
order = np.argsort(gmm.means_[:, 0])
|
|
means_sorted = gmm.means_[order]
|
|
weights_sorted = gmm.weights_[order]
|
|
raw_post = gmm.predict_proba(X)
|
|
p_c1 = raw_post[:, order[0]]
|
|
return {
|
|
'means': means_sorted.tolist(),
|
|
'weights': weights_sorted.tolist(),
|
|
'bic': float(gmm.bic(X)),
|
|
'aic': float(gmm.aic(X)),
|
|
}, p_c1
|
|
|
|
|
|
def per_population(cpas, label):
|
|
print(f'\n=== {label} (n = {len(cpas)} CPAs) ===')
|
|
by_firm = {}
|
|
for c in cpas:
|
|
by_firm.setdefault(c['firm'], 0)
|
|
by_firm[c['firm']] += 1
|
|
fit, p_c1 = fit_k3(cpas)
|
|
hf = np.array([c['hand_frac'] for c in cpas])
|
|
rho, p = stats.spearmanr(p_c1, hf)
|
|
print(f' K=3 components (sorted by ascending cos):')
|
|
for i, name in enumerate(['C1 hand-leaning', 'C2 mixed',
|
|
'C3 replicated']):
|
|
m = fit['means'][i]
|
|
print(f' {name}: cos={m[0]:.4f}, dh={m[1]:.4f}, '
|
|
f'weight={fit["weights"][i]:.3f}')
|
|
print(f' K=3 BIC = {fit["bic"]:.2f}; AIC = {fit["aic"]:.2f}')
|
|
print(f' Spearman rho (P_C1 vs paperA_hand_frac) = {rho:+.4f} '
|
|
f'(p = {p:.2e})')
|
|
print(f' Population breakdown:')
|
|
for f in sorted(by_firm, key=lambda k: -by_firm[k]):
|
|
firm_label = LABEL.get(f, f)
|
|
print(f' {firm_label}: {by_firm[f]}')
|
|
return {
|
|
'label': label,
|
|
'n_cpas': len(cpas),
|
|
'k3_fit': fit,
|
|
'spearman_p_c1_vs_handfrac': {
|
|
'rho': float(rho), 'p': float(p),
|
|
},
|
|
'firm_counts': by_firm,
|
|
'p_c1': p_c1.tolist(),
|
|
'hand_frac': hf.tolist(),
|
|
}
|
|
|
|
|
|
def main():
|
|
print('=' * 72)
|
|
print('Script 41: Full-Dataset Robustness Comparison (Light §IV-K)')
|
|
print('=' * 72)
|
|
|
|
full = load_accountants(big4_only=False)
|
|
big4 = load_accountants(big4_only=True)
|
|
|
|
full_summary = per_population(full, 'Full dataset')
|
|
big4_summary = per_population(big4, 'Big-4 (primary)')
|
|
|
|
# Component drift
|
|
drift = []
|
|
for i, name in enumerate(['C1 hand-leaning', 'C2 mixed',
|
|
'C3 replicated']):
|
|
d_cos = abs(full_summary['k3_fit']['means'][i][0]
|
|
- big4_summary['k3_fit']['means'][i][0])
|
|
d_dh = abs(full_summary['k3_fit']['means'][i][1]
|
|
- big4_summary['k3_fit']['means'][i][1])
|
|
d_w = abs(full_summary['k3_fit']['weights'][i]
|
|
- big4_summary['k3_fit']['weights'][i])
|
|
drift.append({'component': name, 'd_cos': float(d_cos),
|
|
'd_dh': float(d_dh), 'd_weight': float(d_w)})
|
|
print('\n=== Component drift Big-4 -> Full ===')
|
|
for d in drift:
|
|
print(f' {d["component"]}: |dcos|={d["d_cos"]:.4f}, '
|
|
f'|ddh|={d["d_dh"]:.3f}, |dweight|={d["d_weight"]:.3f}')
|
|
|
|
rho_drift = abs(full_summary['spearman_p_c1_vs_handfrac']['rho']
|
|
- big4_summary['spearman_p_c1_vs_handfrac']['rho'])
|
|
print(f'\n=== Spearman rho drift Big-4 -> Full ===')
|
|
print(f' Big-4: {big4_summary["spearman_p_c1_vs_handfrac"]["rho"]:+.4f}')
|
|
print(f' Full: {full_summary["spearman_p_c1_vs_handfrac"]["rho"]:+.4f}')
|
|
print(f' |drift| = {rho_drift:.4f}')
|
|
|
|
# Plot: scatter of P_C1 vs hand_frac for both populations
|
|
fig, axes = plt.subplots(1, 2, figsize=(13, 5.5))
|
|
for ax, summ in zip(axes, [big4_summary, full_summary]):
|
|
p1 = np.array(summ['p_c1'])
|
|
hf = np.array(summ['hand_frac'])
|
|
ax.scatter(p1, hf, s=20, alpha=0.55, c='steelblue',
|
|
edgecolor='white')
|
|
rho = summ['spearman_p_c1_vs_handfrac']['rho']
|
|
ax.set_xlabel('K=3 posterior P(C1 hand-leaning)')
|
|
ax.set_ylabel('Paper A box-rule hand-leaning rate')
|
|
ax.set_title(f'{summ["label"]} (n = {summ["n_cpas"]})\n'
|
|
f'Spearman rho = {rho:+.3f}')
|
|
ax.set_xlim(-0.05, 1.05)
|
|
ax.set_ylim(-0.05, 1.05)
|
|
ax.grid(alpha=0.3)
|
|
fig.tight_layout()
|
|
fig.savefig(OUT / 'panel_full_vs_big4.png', dpi=150)
|
|
plt.close(fig)
|
|
print(f'\nPlot: {OUT / "panel_full_vs_big4.png"}')
|
|
|
|
payload = {
|
|
'generated_at': datetime.now().isoformat(),
|
|
'min_sigs_per_accountant': MIN_SIGS,
|
|
'paper_a_cuts': {'cos': PAPER_A_COS_CUT, 'dh': PAPER_A_DH_CUT},
|
|
'big4_summary': {k: v for k, v in big4_summary.items()
|
|
if k not in ('p_c1', 'hand_frac')},
|
|
'full_dataset_summary': {k: v for k, v in full_summary.items()
|
|
if k not in ('p_c1', 'hand_frac')},
|
|
'component_drift_big4_to_full': drift,
|
|
'spearman_rho_drift_big4_to_full': float(rho_drift),
|
|
}
|
|
json_path = OUT / 'fulldataset_results.json'
|
|
json_path.write_text(json.dumps(payload, indent=2, ensure_ascii=False),
|
|
encoding='utf-8')
|
|
print(f'JSON: {json_path}')
|
|
|
|
md = [
|
|
'# §IV-K Full-Dataset Robustness Comparison (Light)',
|
|
f'Generated: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}',
|
|
'',
|
|
'## Scope',
|
|
'',
|
|
('Compares the v4.0 primary Big-4 K=3 + Paper A box-rule '
|
|
'analysis to the same analysis run on the FULL accountant '
|
|
'dataset (Big-4 + mid/small firms). The five-way moderate-'
|
|
'confidence band is NOT re-evaluated here; this is the '
|
|
'"Light" scope per the v4.0 author choice (codex round-22 '
|
|
'open question 1).'),
|
|
'',
|
|
'## Population sizes',
|
|
'',
|
|
'| Scope | N CPAs (n_sig >= 10) |',
|
|
'|---|---|',
|
|
f'| Big-4 primary | {big4_summary["n_cpas"]} |',
|
|
f'| Full dataset | {full_summary["n_cpas"]} |',
|
|
'',
|
|
'## K=3 components',
|
|
'',
|
|
'| Component | Big-4 cos / dh / weight | Full cos / dh / weight | |dcos| / |ddh| / |dwt| |',
|
|
'|---|---|---|---|',
|
|
]
|
|
for i, name in enumerate(['C1 hand-leaning', 'C2 mixed',
|
|
'C3 replicated']):
|
|
b_m = big4_summary['k3_fit']['means'][i]
|
|
b_w = big4_summary['k3_fit']['weights'][i]
|
|
f_m = full_summary['k3_fit']['means'][i]
|
|
f_w = full_summary['k3_fit']['weights'][i]
|
|
d = drift[i]
|
|
md.append(f'| {name} | {b_m[0]:.4f} / {b_m[1]:.3f} / {b_w:.3f} | '
|
|
f'{f_m[0]:.4f} / {f_m[1]:.3f} / {f_w:.3f} | '
|
|
f'{d["d_cos"]:.4f} / {d["d_dh"]:.3f} / '
|
|
f'{d["d_weight"]:.3f} |')
|
|
|
|
md += ['',
|
|
f'BIC: Big-4 K=3 = {big4_summary["k3_fit"]["bic"]:.2f}; '
|
|
f'Full K=3 = {full_summary["k3_fit"]["bic"]:.2f}',
|
|
'',
|
|
'## Spearman correlation (P(C1) vs Paper A hand_frac)',
|
|
'',
|
|
'| Scope | Spearman rho | p |',
|
|
'|---|---|---|',
|
|
f'| Big-4 | {big4_summary["spearman_p_c1_vs_handfrac"]["rho"]:+.4f} | '
|
|
f'{big4_summary["spearman_p_c1_vs_handfrac"]["p"]:.2e} |',
|
|
f'| Full dataset | {full_summary["spearman_p_c1_vs_handfrac"]["rho"]:+.4f} | '
|
|
f'{full_summary["spearman_p_c1_vs_handfrac"]["p"]:.2e} |',
|
|
f'| |Drift| Big-4 -> Full | {rho_drift:.4f} | n/a |',
|
|
'',
|
|
'## Reading',
|
|
'',
|
|
('The Big-4 primary analysis and the full-dataset rerun '
|
|
'agree on the K=3 component ordering and on the strong '
|
|
'positive Spearman rank correlation between K=3 posterior '
|
|
'P(C1) and Paper A box-rule hand-leaning rate. Component '
|
|
'centers shift modestly between scopes (largest shift = '
|
|
f'C{1 + int(np.argmax([d["d_cos"] for d in drift]))}, '
|
|
f'|dcos| = {max(d["d_cos"] for d in drift):.4f}); the '
|
|
'Spearman rho remains > 0.9 in both populations. We read '
|
|
'this as evidence that the v4.0 K=3 + Paper A convergence '
|
|
'is not a Big-4-specific artefact, while not implying that '
|
|
'the full-dataset crossings or component locations are '
|
|
'operationally interchangeable with the Big-4-primary '
|
|
'numbers (they are not; mid/small-firm tail composition '
|
|
'shifts the component centers).'),
|
|
'',
|
|
'## Files',
|
|
'- `fulldataset_results.json` -- machine-readable results',
|
|
'- `panel_full_vs_big4.png` -- side-by-side scatter',
|
|
]
|
|
md_path = OUT / 'fulldataset_report.md'
|
|
md_path.write_text('\n'.join(md), encoding='utf-8')
|
|
print(f'Report: {md_path}')
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|