pdf_signature_extraction/signature_analysis/35_big4_k3_cluster_names.py

#!/usr/bin/env python3
"""
Script 35: Big-4 K=3 Cluster Membership Inspection
====================================================
Companion to Script 34. Re-fits the Big-4-only 2D GMM with K=3
(Big-4 = Firm A + KPMG + PwC + EY) and hard-assigns each of the
437 CPAs to one of:

  C1 (~14% weight): cos~0.946, dh~9.17  -- hand-sign-leaning
  C2 (~54% weight): cos~0.956, dh~6.66  -- mixed / partial replication
  C3 (~32% weight): cos~0.983, dh~2.41  -- replicated (templated)

Output:
  reports/big4_k3_cluster_inspection/
    cluster_membership.csv          all 437 CPAs with cluster + posterior
    C1_handsign_leaning_members.csv pretty-printed C1 list sorted by
                                    paperA_hand_frac descending
    cluster_by_firm.csv             firm x cluster cross-tab
    inspection_report.md
"""

import sqlite3
import csv
import json
import numpy as np
from pathlib import Path
from datetime import datetime
from sklearn.mixture import GaussianMixture

# Absolute path of the SQLite database that load_big4_with_handfrac() opens.
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
# Directory that receives the CSV and Markdown outputs written by main().
OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/'
           'big4_k3_cluster_inspection')
# Import-time side effect: create the output directory (and any missing
# parents); an already-existing directory is not an error.
OUT.mkdir(parents=True, exist_ok=True)

# Firm names matched against accountants.firm to restrict the query to Big-4.
# NOTE(review): the module docstring describes Big-4 as Firm A + KPMG + PwC +
# EY; presumably these four strings are those firms' names as stored in the
# database -- confirm the mapping and ordering against the accountants table.
BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合')
# Minimum number of qualifying signatures a CPA needs to be included
# (applied in the query's HAVING clause).
MIN_SIGS = 10
# Thresholds used for the per-CPA hand_frac column: a signature counts toward
# hand_frac unless cosine similarity > PAPER_A_COS_CUT AND
# dHash <= PAPER_A_DH_CUT.
PAPER_A_COS_CUT = 0.95
PAPER_A_DH_CUT = 5


def load_big4_with_handfrac():
    """Load per-CPA signature statistics for accountants at the Big-4 firms.

    Opens the SQLite database at ``DB``, joins ``signatures`` to
    ``accountants`` on the accountant name, and aggregates one row per
    ``assigned_accountant``.  Signatures with a NULL accountant, NULL
    ``max_similarity_to_same_accountant`` or NULL ``min_dhash_independent``
    are ignored, only firms listed in ``BIG4`` are kept, and CPAs with fewer
    than ``MIN_SIGS`` remaining signatures are dropped.

    Returns:
        A list of tuples ``(name, firm, cos_mean, dh_mean, hand_frac, n)``:

        * ``cos_mean``  -- mean of ``max_similarity_to_same_accountant``
        * ``dh_mean``   -- mean of ``min_dhash_independent`` (cast to REAL)
        * ``hand_frac`` -- fraction of the CPA's signatures that do NOT
          satisfy ``cos > PAPER_A_COS_CUT AND dh <= PAPER_A_DH_CUT``
        * ``n``         -- number of signatures aggregated

    Raises:
        sqlite3.Error: propagated unchanged if the database cannot be
            queried (for example when a table is missing).
    """
    query = '''
        SELECT s.assigned_accountant,
               a.firm,
               AVG(s.max_similarity_to_same_accountant) AS cos_mean,
               AVG(CAST(s.min_dhash_independent AS REAL)) AS dh_mean,
               AVG(CASE
                     WHEN s.max_similarity_to_same_accountant > ?
                          AND s.min_dhash_independent <= ?
                     THEN 0.0 ELSE 1.0
                   END) AS hand_frac,
               COUNT(*) AS n
        FROM signatures s
        JOIN accountants a ON s.assigned_accountant = a.name
        WHERE s.assigned_accountant IS NOT NULL
          AND s.max_similarity_to_same_accountant IS NOT NULL
          AND s.min_dhash_independent IS NOT NULL
          AND a.firm IN (?, ?, ?, ?)
        GROUP BY s.assigned_accountant
        HAVING n >= ?
    '''
    # Placeholder order: cosine cut, dHash cut, the four firm names, min count.
    params = (PAPER_A_COS_CUT, PAPER_A_DH_CUT) + BIG4 + (MIN_SIGS,)

    conn = sqlite3.connect(DB)
    try:
        return conn.execute(query, params).fetchall()
    finally:
        # Close the handle even when the query raises, so a failed run does
        # not leak the database connection.
        conn.close()


def _fit_sorted_gmm(X):
    """Fit the K=3 full-covariance GMM on X, ordered by ascending mean cosine.

    Returns ``(means, weights, labels, post)`` where component 0 is the one
    with the lowest mean in column 0 of X, ``labels`` are hard assignments in
    that sorted numbering and ``post`` holds the matching posterior columns.
    """
    gmm = GaussianMixture(n_components=3, covariance_type='full',
                          random_state=42, n_init=15, max_iter=500).fit(X)
    # Sort components by ascending cos so cluster numbering is stable.
    order = np.argsort(gmm.means_[:, 0])
    # Inverse permutation: new_index[old component id] -> sorted position.
    new_index = np.empty_like(order)
    new_index[order] = np.arange(order.size)
    labels = new_index[gmm.predict(X)]
    post = gmm.predict_proba(X)[:, order]
    return gmm.means_[order], gmm.weights_[order], labels, post


def _count_by_firm(rows, labels):
    """Return ``{firm: [n_C1, n_C2, n_C3]}`` counting CPAs per firm/cluster."""
    counts = {}
    for row, lab in zip(rows, labels):
        counts.setdefault(row[1], [0, 0, 0])[lab] += 1
    return counts


def _write_csv(path, header, body_rows):
    """Write ``header`` followed by ``body_rows`` to ``path`` as UTF-8 CSV."""
    with open(path, 'w', newline='', encoding='utf-8') as f:
        w = csv.writer(f)
        w.writerow(header)
        w.writerows(body_rows)


def _build_report(means_sorted, weights_sorted, by_firm_cluster, c1_rows):
    """Return the Markdown inspection report as a list of lines."""
    md = [
        '# Big-4 K=3 Cluster Membership Inspection',
        f'Generated: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}',
        '',
        '## K=3 components (sorted by ascending cosine)',
        '',
        '| Component | mean cos | mean dh | weight | interpretation |',
        '|---|---|---|---|---|',
        f'| C1 | {means_sorted[0,0]:.4f} | {means_sorted[0,1]:.4f} | '
        f'{weights_sorted[0]:.3f} | hand-sign-leaning |',
        f'| C2 | {means_sorted[1,0]:.4f} | {means_sorted[1,1]:.4f} | '
        f'{weights_sorted[1]:.3f} | mixed / partial replication |',
        f'| C3 | {means_sorted[2,0]:.4f} | {means_sorted[2,1]:.4f} | '
        f'{weights_sorted[2]:.3f} | replicated (templated) |',
        '',
        '## Firm x cluster cross-tab',
        '',
        '| Firm | C1 (hand) | C2 (mixed) | C3 (replicated) | total | C1% | C2% | C3% |',
        '|---|---|---|---|---|---|---|---|',
    ]
    for firm in BIG4:
        c = by_firm_cluster.get(firm, [0, 0, 0])
        # Avoid dividing by zero for a firm with no qualifying CPAs.
        total = sum(c) or 1
        md.append(f'| {firm} | {c[0]} | {c[1]} | {c[2]} | {sum(c)} | '
                  f'{c[0]/total:.1%} | {c[1]/total:.1%} | {c[2]/total:.1%} |')
    md += ['', f'## C1 hand-sign-leaning members ({len(c1_rows)} CPAs)',
           '',
           '| Rank | CPA | Firm | cos_mean | dh_mean | paperA_hand_frac | '
           'n_signatures | p_C1 |',
           '|---|---|---|---|---|---|---|---|']
    for i, (name, firm, cm, dm, hf, n, pc1) in enumerate(c1_rows, 1):
        md.append(f'| {i} | {name} | {firm} | {cm:.4f} | {dm:.4f} | '
                  f'{hf:.4f} | {n} | {pc1:.4f} |')

    md += ['',
           '## Reading guide',
           '',
           '- **C1 (hand-sign-leaning)**: low cosine + high dHash relative to '
           'the Big-4 reference; high posterior probability (p_C1 close to '
           '1.0) means a confident assignment.',
           '- **paperA_hand_frac**: per-CPA fraction of signatures that '
           'fail Paper A operational rule (cos>0.95 AND dh<=5).  '
           'Independent label for cross-validation.',
           '- High agreement between cluster assignment and paperA_hand_frac '
           'within C1 indicates the Big-4 K=3 mixture is recovering the same '
           'sub-population that Paper A operationally calls hand-signed.',
           '',
           ('Note: cluster numbering is sorted by ascending cosine each '
            'run; same hyperparameters (random_state=42, n_init=15) are used '
            'as in Scripts 32/34 for reproducibility.'),
           ]
    return md


def main():
    """Fit the Big-4 K=3 mixture and write the membership inspection outputs.

    Loads the per-CPA statistics, fits a 2D (cos_mean, dh_mean) Gaussian
    mixture with three components, hard-assigns every CPA to a component
    (numbered C1..C3 by ascending mean cosine) and writes, under ``OUT``:

    * ``cluster_membership.csv``          -- every CPA with cluster + posteriors
    * ``C1_handsign_leaning_members.csv`` -- C1 members, hand_frac descending
    * ``cluster_by_firm.csv``             -- firm x cluster cross-tab
    * ``inspection_report.md``            -- Markdown summary of the above

    Progress and a short preview are printed to stdout.

    Raises:
        ValueError: if fewer than three CPAs are loaded, since a
            three-component mixture cannot be fitted to them.
    """
    print('=' * 72)
    print('Script 35: Big-4 K=3 Cluster Membership Inspection')
    print('=' * 72)
    rows = load_big4_with_handfrac()
    print(f'\nN Big-4 CPAs (n_sigs >= {MIN_SIGS}): {len(rows)}')

    # Fail early with a readable message instead of an error raised from
    # inside the mixture fit when there are too few samples.
    if len(rows) < 3:
        raise ValueError(
            f'Need at least 3 Big-4 CPAs to fit a K=3 mixture, '
            f'got {len(rows)}')

    cos = np.array([r[2] for r in rows])
    dh = np.array([r[3] for r in rows])
    X = np.column_stack([cos, dh])

    means_sorted, weights_sorted, labels, post = _fit_sorted_gmm(X)

    print('\nK=3 components (sorted by cos ascending):')
    for i, (mean, weight) in enumerate(zip(means_sorted, weights_sorted), 1):
        print(f'  C{i}: cos={mean[0]:.4f}, '
              f'dh={mean[1]:.4f}, weight={weight:.3f}')

    # Cross-tab firm x cluster
    by_firm_cluster = _count_by_firm(rows, labels)
    print('\nFirm x cluster cross-tab (counts):')
    print(f'  {"Firm":<20} {"C1":>5} {"C2":>5} {"C3":>5} {"total":>7}')
    for firm in BIG4:
        c = by_firm_cluster.get(firm, [0, 0, 0])
        total = sum(c)
        print(f'  {firm:<20} {c[0]:>5} {c[1]:>5} {c[2]:>5} {total:>7}')

    # Write membership CSV
    members_csv = OUT / 'cluster_membership.csv'
    _write_csv(
        members_csv,
        ['cpa', 'firm', 'cos_mean', 'dh_mean', 'paperA_hand_frac',
         'n_signatures', 'cluster', 'p_C1', 'p_C2', 'p_C3'],
        ([name, firm, f'{cm:.4f}', f'{dm:.4f}',
          f'{hf:.4f}', n, f'C{lab+1}',
          f'{pp[0]:.4f}', f'{pp[1]:.4f}', f'{pp[2]:.4f}']
         for (name, firm, cm, dm, hf, n), lab, pp
         in zip(rows, labels, post)),
    )
    print(f'\nFull membership CSV: {members_csv}')

    # Write C1 (hand-sign-leaning) members sorted by hand_frac desc.
    # Each entry is the original row plus the CPA's posterior for C1.
    c1_rows = [(*row, pp[0])
               for row, lab, pp in zip(rows, labels, post) if lab == 0]
    c1_rows.sort(key=lambda r: -r[4])
    c1_csv = OUT / 'C1_handsign_leaning_members.csv'
    _write_csv(
        c1_csv,
        ['rank', 'cpa', 'firm', 'cos_mean', 'dh_mean',
         'paperA_hand_frac', 'n_signatures', 'p_C1'],
        ([i, name, firm, f'{cm:.4f}', f'{dm:.4f}',
          f'{hf:.4f}', n, f'{pc1:.4f}']
         for i, (name, firm, cm, dm, hf, n, pc1) in enumerate(c1_rows, 1)),
    )
    print(f'C1 hand-sign-leaning CSV: {c1_csv}')

    # Console preview: first 30 C1 members (the full list is in the CSV).
    preview_n = 30
    print(f'\n--- C1 (hand-sign-leaning) members: {len(c1_rows)} CPAs ---')
    print(f'{"Rank":<5} {"CPA":<10} {"Firm":<22} '
          f'{"cos":>6} {"dh":>5} {"hand_frac":>9} {"n":>5} {"p_C1":>5}')
    for i, (name, firm, cm, dm, hf, n, pc1) in enumerate(
            c1_rows[:preview_n], 1):
        print(f'{i:<5} {name:<10} {firm:<22} '
              f'{cm:>6.3f} {dm:>5.2f} {hf:>9.3f} {n:>5} {pc1:>5.2f}')

    # Cross-tab CSV
    crosstab_body = []
    for firm in BIG4:
        c = by_firm_cluster.get(firm, [0, 0, 0])
        # Avoid dividing by zero for a firm with no qualifying CPAs.
        total = sum(c) or 1
        crosstab_body.append([firm, c[0], c[1], c[2], sum(c),
                              f'{c[0]/total:.3f}', f'{c[1]/total:.3f}',
                              f'{c[2]/total:.3f}'])
    crosstab_csv = OUT / 'cluster_by_firm.csv'
    _write_csv(
        crosstab_csv,
        ['firm', 'C1_handsign_leaning', 'C2_mixed',
         'C3_replicated', 'total',
         'C1_pct', 'C2_pct', 'C3_pct'],
        crosstab_body,
    )
    print(f'Cross-tab CSV: {crosstab_csv}')

    # Markdown report
    md = _build_report(means_sorted, weights_sorted, by_firm_cluster, c1_rows)
    md_path = OUT / 'inspection_report.md'
    md_path.write_text('\n'.join(md), encoding='utf-8')
    print(f'\nReport: {md_path}')


# Run the inspection only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()