Add scripts 34 + 35: Big-4-only calibration foundation
Scripts 34 and 35 produced the empirical foundation that triggers the
Paper A v4.0 Big-4 reframe.
Script 34 (Big-4-only pooled calibration):
Pool Firm A + KPMG + PwC + EY (437 CPAs); first time the
three-method framework yields dip-test multimodal results
(p<0.0001 on both cos and dh axes) anywhere in the analysis
family. 2D-GMM K=2 marginal crossings with bootstrap 95% CI
(n=500): cos = 0.9755 [0.974, 0.977], dh = 3.755 [3.48, 3.97].
Crossing offsets from Paper A v3.20.0 baseline (0.945, 8.10):
+0.030 (cos), -4.345 (dh) -- mid/small-firm tail had
substantially shifted the published threshold.
Script 35 (Big-4 K=3 cluster membership):
Hard-assigns each Big-4 CPA to one of the K=3 components.
Findings:
* Firm A (Deloitte): 0% in C1 (hand-sign-leaning),
17.5% in C2 (mixed), 82.5% in C3 (replicated).
* PwC has the strongest hand-sign tradition (24/102 = 23.5%
in C1), followed by EY (11.5%) and KPMG (8.9%).
* 40 CPAs total in C1 across KPMG/PwC/EY.
Implications confirmed by these scripts:
* Big-4-only scope is the methodologically defensible primary
analysis; the published 0.945/8.10 reflects between-firm
structure rather than within-pool mechanism boundary.
* Firm A's role pivots from "calibration anchor" to
"case study of the templated end of the Big-4."
* Paper A is being reframed as v4.0 on sub-branch
paper-a-v4-big4, following the direction Partner Jimmy
suggested earlier.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,218 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
Script 35: Big-4 K=3 Cluster Membership Inspection
====================================================

Companion to Script 34. Re-fits the Big-4-only 2D GMM with K=3
(Big-4 = Firm A + KPMG + PwC + EY) and hard-assigns each of the
437 CPAs to one of:

    C1 (~14% weight): cos~0.946, dh~9.17 -- hand-sign-leaning
    C2 (~54% weight): cos~0.956, dh~6.66 -- mixed / partial replication
    C3 (~32% weight): cos~0.983, dh~2.41 -- replicated (templated)

Output:
    reports/big4_k3_cluster_inspection/
        cluster_membership.csv           all 437 CPAs with cluster + posterior
        C1_handsign_leaning_members.csv  pretty-printed C1 list sorted by
                                         paperA_hand_frac descending
        cluster_by_firm.csv              firm x cluster cross-tab
        inspection_report.md
"""
|
||||
|
||||
import sqlite3
|
||||
import csv
|
||||
import json
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from sklearn.mixture import GaussianMixture
|
||||
|
||||
# SQLite database queried by load_big4_with_handfrac() (tables `signatures`
# and `accountants`).
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'

# Directory that receives every artefact written by this script.  It is
# created eagerly, at import time, so later writes never fail on a missing
# parent directory.
OUT = (Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports')
       / 'big4_k3_cluster_inspection')
OUT.mkdir(parents=True, exist_ok=True)

# Firm names exactly as matched against `accountants.firm`.
# NOTE(review): presumably ordered Firm A (Deloitte), KPMG, PwC, EY, matching
# the order given in the module docstring -- confirm against the database.
BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合')

# A CPA needs at least this many usable signatures to enter the analysis.
MIN_SIGS = 10

# Per-signature rule used for the hand-signed fraction: a signature with
# cosine > PAPER_A_COS_CUT AND dHash <= PAPER_A_DH_CUT contributes 0 to the
# fraction; any other signature contributes 1.
PAPER_A_COS_CUT = 0.95
PAPER_A_DH_CUT = 5
|
||||
|
||||
|
||||
def load_big4_with_handfrac(*, db_path=None, firms=None, min_sigs=None,
                            cos_cut=None, dh_cut=None):
    """Load one summary row per CPA for the selected firms.

    Every keyword argument is optional; when left as ``None`` the matching
    module-level constant is used (``DB``, ``BIG4``, ``MIN_SIGS``,
    ``PAPER_A_COS_CUT``, ``PAPER_A_DH_CUT``), so a bare call behaves exactly
    like the original zero-argument function.

    Args:
        db_path: Path of the SQLite database to query.
        firms: Iterable of firm names matched against ``accountants.firm``.
        min_sigs: Minimum number of usable signatures a CPA must have.
        cos_cut: Cosine cut-off of the per-signature rule (strict ``>``).
        dh_cut: dHash cut-off of the per-signature rule (``<=``).

    Returns:
        A list of tuples ``(assigned_accountant, firm, cos_mean, dh_mean,
        hand_frac, n)``.  ``hand_frac`` is the fraction of the CPA's
        signatures that do NOT satisfy ``cos > cos_cut AND dh <= dh_cut``;
        ``n`` is the number of signatures with both features present.

    Raises:
        sqlite3.Error: If the database cannot be opened or queried.  The
            connection is closed before the exception propagates.
    """
    db_path = DB if db_path is None else db_path
    firms = BIG4 if firms is None else tuple(firms)
    min_sigs = MIN_SIGS if min_sigs is None else min_sigs
    cos_cut = PAPER_A_COS_CUT if cos_cut is None else cos_cut
    dh_cut = PAPER_A_DH_CUT if dh_cut is None else dh_cut

    if not firms:
        # Nothing can match an empty firm list; skip the round-trip.
        return []

    # Only '?' placeholders are interpolated into the SQL text; every value
    # is still bound as a parameter.
    placeholders = ', '.join('?' * len(firms))
    # NOTE(review): a.firm is a bare column under GROUP BY assigned_accountant
    # and the JOIN is on accountants.name -- this presumably relies on names
    # being unique in `accountants`; confirm against the schema.
    query = f'''
        SELECT s.assigned_accountant,
               a.firm,
               AVG(s.max_similarity_to_same_accountant) AS cos_mean,
               AVG(CAST(s.min_dhash_independent AS REAL)) AS dh_mean,
               AVG(CASE
                       WHEN s.max_similarity_to_same_accountant > ?
                        AND s.min_dhash_independent <= ?
                       THEN 0.0 ELSE 1.0
                   END) AS hand_frac,
               COUNT(*) AS n
        FROM signatures s
        JOIN accountants a ON s.assigned_accountant = a.name
        WHERE s.assigned_accountant IS NOT NULL
          AND s.max_similarity_to_same_accountant IS NOT NULL
          AND s.min_dhash_independent IS NOT NULL
          AND a.firm IN ({placeholders})
        GROUP BY s.assigned_accountant
        HAVING n >= ?
    '''
    params = (cos_cut, dh_cut) + firms + (min_sigs,)

    conn = sqlite3.connect(db_path)
    try:
        return conn.execute(query, params).fetchall()
    finally:
        # Close even when the query raises, so the connection never leaks.
        conn.close()
|
||||
|
||||
|
||||
def _write_csv(path, header, records):
    """Write *header* followed by *records* to *path* as a UTF-8 CSV file."""
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(records)


def _crosstab_by_firm(rows, labels, n_components):
    """Return ``[(firm, counts, total, fractions)]`` for each firm in BIG4.

    ``counts[k]`` is the number of CPAs of that firm hard-assigned to
    component ``k``; ``fractions`` divides by the firm total (or by 1 when
    the firm has no CPA, so an empty firm yields zeros rather than 0/0).
    """
    by_firm_cluster = {}
    for row, lab in zip(rows, labels):
        by_firm_cluster.setdefault(row[1], [0] * n_components)[lab] += 1

    table = []
    for firm in BIG4:
        counts = by_firm_cluster.get(firm, [0] * n_components)
        total = sum(counts)
        denom = total or 1
        table.append((firm, counts, total, [c / denom for c in counts]))
    return table


def main():
    """Fit the Big-4 K=3 GMM, hard-assign every CPA and write the reports.

    Steps: load per-CPA (cos_mean, dh_mean) rows, fit a 3-component
    full-covariance Gaussian mixture, renumber the components by ascending
    mean cosine (C1 lowest), then write into ``OUT``:
    ``cluster_membership.csv``, ``C1_handsign_leaning_members.csv``,
    ``cluster_by_firm.csv`` and ``inspection_report.md``.  Progress and a
    preview of the C1 members are printed to stdout.

    Raises:
        SystemExit: If fewer CPAs than mixture components were loaded, since
            the mixture cannot be fitted in that case.
    """
    # One interpretation per component, in ascending-cosine order.
    interpretations = ('hand-sign-leaning',
                       'mixed / partial replication',
                       'replicated (templated)')
    n_components = len(interpretations)
    seed = 42
    n_init = 15
    preview_n = 30  # number of C1 members echoed to the console

    banner = '=' * 72
    print(banner)
    print('Script 35: Big-4 K=3 Cluster Membership Inspection')
    print(banner)
    rows = load_big4_with_handfrac()
    print(f'\nN Big-4 CPAs (n_sigs >= {MIN_SIGS}): {len(rows)}')

    if len(rows) < n_components:
        # Fail with a readable message instead of an estimator traceback.
        raise SystemExit(
            f'Need at least {n_components} CPAs to fit a K={n_components} '
            f'mixture; the query returned {len(rows)}.')

    # Idempotent; keeps main() usable even if OUT was removed after import.
    OUT.mkdir(parents=True, exist_ok=True)

    # Feature matrix: column 0 = mean cosine, column 1 = mean dHash.
    X = np.array([(r[2], r[3]) for r in rows], dtype=float)

    gmm = GaussianMixture(n_components=n_components, covariance_type='full',
                          random_state=seed, n_init=n_init,
                          max_iter=500).fit(X)

    # Sort components by ascending cos so cluster numbering is stable.
    order = np.argsort(gmm.means_[:, 0])
    means_sorted = gmm.means_[order]
    weights_sorted = gmm.weights_[order]

    # Inverse permutation: new_index[old component id] -> sorted position.
    new_index = np.empty_like(order)
    new_index[order] = np.arange(n_components)
    labels = new_index[gmm.predict(X)]
    post = gmm.predict_proba(X)[:, order]

    print('\nK=3 components (sorted by cos ascending):')
    for i in range(n_components):
        print(f' C{i+1}: cos={means_sorted[i,0]:.4f}, '
              f'dh={means_sorted[i,1]:.4f}, weight={weights_sorted[i]:.3f}')

    # Cross-tab firm x cluster, computed once and reused for every output.
    crosstab = _crosstab_by_firm(rows, labels, n_components)
    print('\nFirm x cluster cross-tab (counts):')
    print(f' {"Firm":<20} {"C1":>5} {"C2":>5} {"C3":>5} {"total":>7}')
    for firm, c, total, _ in crosstab:
        print(f' {firm:<20} {c[0]:>5} {c[1]:>5} {c[2]:>5} {total:>7}')

    # Full membership CSV: one row per CPA with its cluster and posteriors.
    members_csv = OUT / 'cluster_membership.csv'
    _write_csv(
        members_csv,
        ['cpa', 'firm', 'cos_mean', 'dh_mean', 'paperA_hand_frac',
         'n_signatures', 'cluster', 'p_C1', 'p_C2', 'p_C3'],
        ([name, firm, f'{cm:.4f}', f'{dm:.4f}', f'{hf:.4f}', n,
          f'C{lab+1}', f'{pp[0]:.4f}', f'{pp[1]:.4f}', f'{pp[2]:.4f}']
         for (name, firm, cm, dm, hf, n), lab, pp
         in zip(rows, labels, post)),
    )
    print(f'\nFull membership CSV: {members_csv}')

    # C1 (hand-sign-leaning) members, highest hand_frac first.  The sort is
    # stable, so ties keep the order in which the query returned them.
    c1_rows = sorted(
        ((name, firm, cm, dm, hf, n, pp[0])
         for (name, firm, cm, dm, hf, n), lab, pp
         in zip(rows, labels, post) if lab == 0),
        key=lambda r: r[4], reverse=True)
    c1_csv = OUT / 'C1_handsign_leaning_members.csv'
    _write_csv(
        c1_csv,
        ['rank', 'cpa', 'firm', 'cos_mean', 'dh_mean',
         'paperA_hand_frac', 'n_signatures', 'p_C1'],
        ([i, name, firm, f'{cm:.4f}', f'{dm:.4f}',
          f'{hf:.4f}', n, f'{pc1:.4f}']
         for i, (name, firm, cm, dm, hf, n, pc1) in enumerate(c1_rows, 1)),
    )
    print(f'C1 hand-sign-leaning CSV: {c1_csv}')

    # Console preview: first `preview_n` C1 members.
    print(f'\n--- C1 (hand-sign-leaning) members: {len(c1_rows)} CPAs ---')
    print(f'{"Rank":<5} {"CPA":<10} {"Firm":<22} '
          f'{"cos":>6} {"dh":>5} {"hand_frac":>9} {"n":>5} {"p_C1":>5}')
    for i, (name, firm, cm, dm, hf, n, pc1) in enumerate(
            c1_rows[:preview_n], 1):
        print(f'{i:<5} {name:<10} {firm:<22} '
              f'{cm:>6.3f} {dm:>5.2f} {hf:>9.3f} {n:>5} {pc1:>5.2f}')

    # Cross-tab CSV
    crosstab_csv = OUT / 'cluster_by_firm.csv'
    _write_csv(
        crosstab_csv,
        ['firm', 'C1_handsign_leaning', 'C2_mixed',
         'C3_replicated', 'total',
         'C1_pct', 'C2_pct', 'C3_pct'],
        ([firm, c[0], c[1], c[2], total,
          f'{frac[0]:.3f}', f'{frac[1]:.3f}', f'{frac[2]:.3f}']
         for firm, c, total, frac in crosstab),
    )
    print(f'Cross-tab CSV: {crosstab_csv}')

    # Markdown report
    md = [
        '# Big-4 K=3 Cluster Membership Inspection',
        f'Generated: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}',
        '',
        '## K=3 components (sorted by ascending cosine)',
        '',
        '| Component | mean cos | mean dh | weight | interpretation |',
        '|---|---|---|---|---|',
    ]
    md += [
        f'| C{i+1} | {means_sorted[i,0]:.4f} | {means_sorted[i,1]:.4f} | '
        f'{weights_sorted[i]:.3f} | {meaning} |'
        for i, meaning in enumerate(interpretations)
    ]
    md += [
        '',
        '## Firm x cluster cross-tab',
        '',
        '| Firm | C1 (hand) | C2 (mixed) | C3 (replicated) | total | C1% | C2% | C3% |',
        '|---|---|---|---|---|---|---|---|',
    ]
    md += [
        f'| {firm} | {c[0]} | {c[1]} | {c[2]} | {total} | '
        f'{frac[0]:.1%} | {frac[1]:.1%} | {frac[2]:.1%} |'
        for firm, c, total, frac in crosstab
    ]
    md += ['', f'## C1 hand-sign-leaning members ({len(c1_rows)} CPAs)',
           '',
           '| Rank | CPA | Firm | cos_mean | dh_mean | paperA_hand_frac | '
           'n_signatures | p_C1 |',
           '|---|---|---|---|---|---|---|---|']
    md += [
        f'| {i} | {name} | {firm} | {cm:.4f} | {dm:.4f} | '
        f'{hf:.4f} | {n} | {pc1:.4f} |'
        for i, (name, firm, cm, dm, hf, n, pc1) in enumerate(c1_rows, 1)
    ]

    # The rule thresholds and GMM hyperparameters are interpolated from the
    # values actually used, so the report text cannot drift from the code.
    md += ['',
           '## Reading guide',
           '',
           '- **C1 (hand-sign-leaning)**: low cosine + high dHash relative to '
           'the Big-4 reference; high posterior probability (p_C1 close to '
           '1.0) means a confident assignment.',
           '- **paperA_hand_frac**: per-CPA fraction of signatures that '
           f'fail Paper A operational rule (cos>{PAPER_A_COS_CUT} '
           f'AND dh<={PAPER_A_DH_CUT}). '
           'Independent label for cross-validation.',
           '- High agreement between cluster assignment and paperA_hand_frac '
           'within C1 indicates the Big-4 K=3 mixture is recovering the same '
           'sub-population that Paper A operationally calls hand-signed.',
           '',
           ('Note: cluster numbering is sorted by ascending cosine each '
            f'run; same hyperparameters (random_state={seed}, '
            f'n_init={n_init}) are used '
            'as in Scripts 32/34 for reproducibility.'),
           ]
    md_path = OUT / 'inspection_report.md'
    md_path.write_text('\n'.join(md), encoding='utf-8')
    print(f'\nReport: {md_path}')
|
||||
|
||||
|
||||
# Run the inspection only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user