#!/usr/bin/env python3
"""
Script 35: Big-4 K=3 Cluster Membership Inspection
====================================================
Companion to Script 34. Re-fits the Big-4-only 2D GMM with K=3
(Big-4 = Firm A + KPMG + PwC + EY) and hard-assigns each of the 437
CPAs to one of:

  C1 (~14% weight): cos~0.946, dh~9.17 -- hand-sign-leaning
  C2 (~54% weight): cos~0.956, dh~6.66 -- mixed / partial replication
  C3 (~32% weight): cos~0.983, dh~2.41 -- replicated (templated)

Output:
  reports/big4_k3_cluster_inspection/
    cluster_membership.csv           all 437 CPAs with cluster + posterior
    C1_handsign_leaning_members.csv  pretty-printed C1 list sorted by
                                     paperA_hand_frac descending
    cluster_by_firm.csv              firm x cluster cross-tab
    inspection_report.md
"""

import sqlite3
import csv
import json
import numpy as np
from pathlib import Path
from datetime import datetime
from sklearn.mixture import GaussianMixture

DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/'
           'big4_k3_cluster_inspection')
OUT.mkdir(parents=True, exist_ok=True)

# Firm names matched against accountants.firm in the query below.
BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合')
# Minimum number of signatures a CPA needs to be included (HAVING n >= ?).
MIN_SIGS = 10
# Paper A operational rule: a signature counts as replicated when
# cos > PAPER_A_COS_CUT AND dh <= PAPER_A_DH_CUT; everything else counts
# towards the per-CPA hand_frac.
PAPER_A_COS_CUT = 0.95
PAPER_A_DH_CUT = 5
# Number of C1 members echoed to the console preview.
C1_PREVIEW_ROWS = 30


def load_big4_with_handfrac():
    """Load per-CPA aggregates for Big-4 accountants from the SQLite DB.

    Signatures with a NULL accountant, cosine similarity, or dHash value
    are excluded, as are CPAs with fewer than ``MIN_SIGS`` qualifying
    signatures.

    Returns:
        A list of tuples
        ``(assigned_accountant, firm, cos_mean, dh_mean, hand_frac, n)``,
        one per CPA, where ``hand_frac`` is the fraction of that CPA's
        signatures that do NOT satisfy the Paper A rule
        (cos > PAPER_A_COS_CUT AND dh <= PAPER_A_DH_CUT).

    Raises:
        sqlite3.Error: if the database cannot be opened or queried.
    """
    conn = sqlite3.connect(DB)
    # A sqlite3 connection used as a context manager only manages the
    # transaction, it does not close the connection; use try/finally so
    # the handle is released even when the query fails.
    try:
        cur = conn.cursor()
        cur.execute('''
            SELECT s.assigned_accountant, a.firm,
                   AVG(s.max_similarity_to_same_accountant) AS cos_mean,
                   AVG(CAST(s.min_dhash_independent AS REAL)) AS dh_mean,
                   AVG(CASE WHEN s.max_similarity_to_same_accountant > ?
                             AND s.min_dhash_independent <= ?
                            THEN 0.0 ELSE 1.0 END) AS hand_frac,
                   COUNT(*) AS n
            FROM signatures s
            JOIN accountants a ON s.assigned_accountant = a.name
            WHERE s.assigned_accountant IS NOT NULL
              AND s.max_similarity_to_same_accountant IS NOT NULL
              AND s.min_dhash_independent IS NOT NULL
              AND a.firm IN (?, ?, ?, ?)
            GROUP BY s.assigned_accountant
            HAVING n >= ?
        ''', (PAPER_A_COS_CUT, PAPER_A_DH_CUT) + BIG4 + (MIN_SIGS,))
        return cur.fetchall()
    finally:
        conn.close()


def _fit_sorted_gmm(X, n_components=3):
    """Fit a full-covariance GMM on X and renumber components by cosine.

    Components are re-indexed so that index 0 has the lowest mean in
    column 0 of X (cos) and the last index the highest, which keeps the
    C1/C2/C3 numbering stable regardless of the fit's internal order.

    Returns:
        (means_sorted, weights_sorted, labels, post) where ``labels`` are
        hard assignments and ``post`` the posterior probabilities, both
        expressed in the sorted component numbering.
    """
    gmm = GaussianMixture(n_components=n_components, covariance_type='full',
                          random_state=42, n_init=15, max_iter=500).fit(X)

    # order[new] = old component index, sorted by ascending mean cos.
    order = np.argsort(gmm.means_[:, 0])
    # Inverse permutation: new_index[old] = new component index.
    new_index = np.empty_like(order)
    new_index[order] = np.arange(order.size)

    labels = new_index[gmm.predict(X)]
    post = gmm.predict_proba(X)[:, order]
    return gmm.means_[order], gmm.weights_[order], labels, post


def main():
    """Fit the Big-4 K=3 GMM and write membership CSVs and a report.

    Prints a console summary and writes four files under ``OUT``:
    cluster_membership.csv, C1_handsign_leaning_members.csv,
    cluster_by_firm.csv and inspection_report.md.

    Raises:
        ValueError: if fewer than 3 CPAs are returned by the query, since
            a 3-component mixture cannot be fitted on them.
    """
    print('=' * 72)
    print('Script 35: Big-4 K=3 Cluster Membership Inspection')
    print('=' * 72)

    rows = load_big4_with_handfrac()
    print(f'\nN Big-4 CPAs (n_sigs >= {MIN_SIGS}): {len(rows)}')
    if len(rows) < 3:
        # Fail with an actionable message instead of an opaque error from
        # inside the mixture fit.
        raise ValueError(
            f'Need at least 3 Big-4 CPAs with >= {MIN_SIGS} signatures to '
            f'fit a K=3 GMM, got {len(rows)}; check DB path ({DB}) and '
            f'the firm names in BIG4.')

    cos = np.array([r[2] for r in rows])
    dh = np.array([r[3] for r in rows])
    X = np.column_stack([cos, dh])

    # Components come back sorted by ascending cos so numbering is stable.
    means_sorted, weights_sorted, labels, post = _fit_sorted_gmm(X)

    print('\nK=3 components (sorted by cos ascending):')
    for i in range(3):
        print(f' C{i+1}: cos={means_sorted[i,0]:.4f}, '
              f'dh={means_sorted[i,1]:.4f}, weight={weights_sorted[i]:.3f}')

    # Cross-tab firm x cluster: firm -> [count_C1, count_C2, count_C3]
    by_firm_cluster = {}
    for row, lab in zip(rows, labels):
        by_firm_cluster.setdefault(row[1], [0, 0, 0])[lab] += 1

    print('\nFirm x cluster cross-tab (counts):')
    print(f' {"Firm":<20} {"C1":>5} {"C2":>5} {"C3":>5} {"total":>7}')
    for firm in BIG4:
        c = by_firm_cluster.get(firm, [0, 0, 0])
        total = sum(c)
        print(f' {firm:<20} {c[0]:>5} {c[1]:>5} {c[2]:>5} {total:>7}')

    # Write membership CSV
    members_csv = OUT / 'cluster_membership.csv'
    with open(members_csv, 'w', newline='', encoding='utf-8') as f:
        w = csv.writer(f)
        w.writerow(['cpa', 'firm', 'cos_mean', 'dh_mean',
                    'paperA_hand_frac', 'n_signatures',
                    'cluster', 'p_C1', 'p_C2', 'p_C3'])
        for (name, firm, cm, dm, hf, n), lab, pp in zip(rows, labels, post):
            w.writerow([name, firm, f'{cm:.4f}', f'{dm:.4f}', f'{hf:.4f}',
                        n, f'C{lab+1}',
                        f'{pp[0]:.4f}', f'{pp[1]:.4f}', f'{pp[2]:.4f}'])
    print(f'\nFull membership CSV: {members_csv}')

    # Write C1 (hand-sign-leaning) members sorted by hand_frac desc
    c1_rows = [(name, firm, cm, dm, hf, n, pp[0])
               for (name, firm, cm, dm, hf, n), lab, pp
               in zip(rows, labels, post) if lab == 0]
    c1_rows.sort(key=lambda r: -r[4])
    c1_csv = OUT / 'C1_handsign_leaning_members.csv'
    with open(c1_csv, 'w', newline='', encoding='utf-8') as f:
        w = csv.writer(f)
        w.writerow(['rank', 'cpa', 'firm', 'cos_mean', 'dh_mean',
                    'paperA_hand_frac', 'n_signatures', 'p_C1'])
        for i, (name, firm, cm, dm, hf, n, pc1) in enumerate(c1_rows, 1):
            w.writerow([i, name, firm, f'{cm:.4f}', f'{dm:.4f}',
                        f'{hf:.4f}', n, f'{pc1:.4f}'])
    print(f'C1 hand-sign-leaning CSV: {c1_csv}')

    # Console preview: first C1_PREVIEW_ROWS C1 members
    print(f'\n--- C1 (hand-sign-leaning) members: {len(c1_rows)} CPAs ---')
    print(f'{"Rank":<5} {"CPA":<10} {"Firm":<22} '
          f'{"cos":>6} {"dh":>5} {"hand_frac":>9} {"n":>5} {"p_C1":>5}')
    for i, (name, firm, cm, dm, hf, n, pc1) in enumerate(
            c1_rows[:C1_PREVIEW_ROWS], 1):
        print(f'{i:<5} {name:<10} {firm:<22} '
              f'{cm:>6.3f} {dm:>5.2f} {hf:>9.3f} {n:>5} {pc1:>5.2f}')

    # Cross-tab CSV
    crosstab_csv = OUT / 'cluster_by_firm.csv'
    with open(crosstab_csv, 'w', newline='', encoding='utf-8') as f:
        w = csv.writer(f)
        w.writerow(['firm', 'C1_handsign_leaning', 'C2_mixed',
                    'C3_replicated', 'total',
                    'C1_pct', 'C2_pct', 'C3_pct'])
        for firm in BIG4:
            c = by_firm_cluster.get(firm, [0, 0, 0])
            # "or 1" avoids dividing by zero for a firm with no CPAs; the
            # reported total column still uses the true sum.
            total = sum(c) or 1
            w.writerow([firm, c[0], c[1], c[2], sum(c),
                        f'{c[0]/total:.3f}', f'{c[1]/total:.3f}',
                        f'{c[2]/total:.3f}'])
    print(f'Cross-tab CSV: {crosstab_csv}')

    # Markdown report
    md = [
        '# Big-4 K=3 Cluster Membership Inspection',
        f'Generated: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}',
        '',
        '## K=3 components (sorted by ascending cosine)',
        '',
        '| Component | mean cos | mean dh | weight | interpretation |',
        '|---|---|---|---|---|',
        f'| C1 | {means_sorted[0,0]:.4f} | {means_sorted[0,1]:.4f} | '
        f'{weights_sorted[0]:.3f} | hand-sign-leaning |',
        f'| C2 | {means_sorted[1,0]:.4f} | {means_sorted[1,1]:.4f} | '
        f'{weights_sorted[1]:.3f} | mixed / partial replication |',
        f'| C3 | {means_sorted[2,0]:.4f} | {means_sorted[2,1]:.4f} | '
        f'{weights_sorted[2]:.3f} | replicated (templated) |',
        '',
        '## Firm x cluster cross-tab',
        '',
        '| Firm | C1 (hand) | C2 (mixed) | C3 (replicated) | total | C1% | C2% | C3% |',
        '|---|---|---|---|---|---|---|---|',
    ]
    for firm in BIG4:
        c = by_firm_cluster.get(firm, [0, 0, 0])
        total = sum(c) or 1
        md.append(f'| {firm} | {c[0]} | {c[1]} | {c[2]} | {sum(c)} | '
                  f'{c[0]/total:.1%} | {c[1]/total:.1%} | {c[2]/total:.1%} |')

    md += ['',
           f'## C1 hand-sign-leaning members ({len(c1_rows)} CPAs)',
           '',
           '| Rank | CPA | Firm | cos_mean | dh_mean | paperA_hand_frac | '
           'n_signatures | p_C1 |',
           '|---|---|---|---|---|---|---|---|']
    for i, (name, firm, cm, dm, hf, n, pc1) in enumerate(c1_rows, 1):
        md.append(f'| {i} | {name} | {firm} | {cm:.4f} | {dm:.4f} | '
                  f'{hf:.4f} | {n} | {pc1:.4f} |')

    md += ['',
           '## Reading guide',
           '',
           '- **C1 (hand-sign-leaning)**: low cosine + high dHash relative to '
           'the Big-4 reference; high posterior probability (p_C1 close to '
           '1.0) means a confident assignment.',
           '- **paperA_hand_frac**: per-CPA fraction of signatures that '
           'fail Paper A operational rule (cos>0.95 AND dh<=5). '
           'Independent label for cross-validation.',
           '- High agreement between cluster assignment and paperA_hand_frac '
           'within C1 indicates the Big-4 K=3 mixture is recovering the same '
           'sub-population that Paper A operationally calls hand-signed.',
           '',
           ('Note: cluster numbering is sorted by ascending cosine each '
            'run; same hyperparameters (random_state=42, n_init=15) are used '
            'as in Scripts 32/34 for reproducibility.'),
           ]

    md_path = OUT / 'inspection_report.md'
    md_path.write_text('\n'.join(md), encoding='utf-8')
    print(f'\nReport: {md_path}')


if __name__ == '__main__':
    main()