#!/usr/bin/env python3
"""
Script 22: Partner-Level Similarity Ranking (per Partner v4 Section F.3)
========================================================================
Rank engagement partners by their per-auditor-year mean of max cosine
similarity. Under Partner v4's benchmark validation argument, if
Deloitte Taiwan applies firm-wide stamping, Deloitte partners should
disproportionately occupy the upper ranks of the cosine distribution.

Note: the query below applies no firm filter, so every accountant with
enough signatures is ranked; accountants outside the four named firms are
reported under the 'Other / Non-Big-4' bucket.

Construction:
  - Unit of observation: auditor-year = (CPA name, fiscal year)
  - For each auditor-year compute:
      cos_auditor_year = mean(max_similarity_to_same_accountant)
      over that CPA's signatures in that year
  - Only include auditor-years with >= 5 signatures
  - Rank globally; compute per-firm share of top-K buckets
  - Report for the pooled 2013-2023 sample and year-by-year

Output:
  reports/partner_ranking/partner_ranking_report.md
  reports/partner_ranking/partner_ranking_results.json
  reports/partner_ranking/partner_rank_distribution.png
"""

import sqlite3
import json
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from pathlib import Path
from datetime import datetime
from collections import Counter, defaultdict
from contextlib import closing

DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/'
           'partner_ranking')
OUT.mkdir(parents=True, exist_ok=True)

BIG4 = ['勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合']
FIRM_A = '勤業眾信聯合'
MIN_SIGS_PER_AUDITOR_YEAR = 5

# Display labels used in every printout, table, and JSON key.
_DELOITTE = 'Deloitte (Firm A)'
_OTHER = 'Other / Non-Big-4'

# Raw firm name (as stored in accountants.firm) -> display label.
_FIRM_LABELS = {
    FIRM_A: _DELOITTE,
    '安侯建業聯合': 'KPMG',
    '資誠聯合': 'PwC',
    '安永聯合': 'EY',
}

# Fixed drawing order and colour per display label for the histogram panel.
_FIRM_COLORS = {
    _DELOITTE: '#d62728',
    'KPMG': '#1f77b4',
    'PwC': '#2ca02c',
    'EY': '#9467bd',
    _OTHER: '#7f7f7f',
}

_AUDITOR_YEAR_SQL = '''
    SELECT s.assigned_accountant,
           a.firm,
           substr(s.year_month, 1, 4) AS year,
           AVG(s.max_similarity_to_same_accountant) AS cos_mean,
           COUNT(*) AS n
    FROM signatures s
    LEFT JOIN accountants a ON s.assigned_accountant = a.name
    WHERE s.assigned_accountant IS NOT NULL
      AND s.max_similarity_to_same_accountant IS NOT NULL
      AND s.year_month IS NOT NULL
    GROUP BY s.assigned_accountant, year
    HAVING n >= ?
'''


def load_auditor_years():
    """Load one aggregated record per (accountant, year) from the database.

    Runs the aggregation query against ``DB`` keeping only groups with at
    least ``MIN_SIGS_PER_AUDITOR_YEAR`` signatures.

    Returns:
        A list of dicts with keys ``accountant``, ``firm`` (``'(unknown)'``
        when the joined firm is NULL or empty), ``year`` (int),
        ``cos_mean`` (float) and ``n`` (int, signatures in the group).

    Raises:
        sqlite3.Error: if the database cannot be opened or queried.
        ValueError: if the first four characters of ``year_month`` are not
            an integer.
    """
    # sqlite3's own context manager only commits/rolls back; closing()
    # guarantees the handle is released even when the query raises.
    with closing(sqlite3.connect(DB)) as conn:
        records = conn.execute(
            _AUDITOR_YEAR_SQL, (MIN_SIGS_PER_AUDITOR_YEAR,)
        ).fetchall()
    return [
        {
            'accountant': accountant,
            'firm': firm or '(unknown)',
            'year': int(year),
            'cos_mean': float(cos_mean),
            'n': int(n),
        }
        for accountant, firm, year, cos_mean, n in records
    ]


def firm_bucket(firm):
    """Map a raw firm name to its display label.

    Any name that is not one of the four firms in ``_FIRM_LABELS``
    (including ``'(unknown)'``) maps to ``'Other / Non-Big-4'``.
    """
    return _FIRM_LABELS.get(firm, _OTHER)


def _rank_by_similarity(rows):
    """Return rows sorted by descending ``cos_mean`` (ties keep input order)."""
    return sorted(rows, key=lambda r: -r['cos_mean'])


def top_decile_breakdown(rows, deciles=(10, 25, 50)):
    """For pooled or per-year rows, compute % of top-K positions by firm.

    Args:
        rows: auditor-year dicts carrying ``cos_mean`` and ``firm``.
        deciles: percentages of the ranked list to report.

    Returns:
        A dict keyed ``'top_<pct>pct'``; each value holds ``k`` (bucket
        size, never below 1 even for empty input), ``N_total``,
        ``by_firm`` (label -> count inside the bucket) and
        ``deloitte_share`` (Deloitte count divided by ``k``).
    """
    ranked = _rank_by_similarity(rows)
    total = len(ranked)
    results = {}
    for pct in deciles:
        # Floor at 1 so tiny samples still yield a bucket and the share
        # below never divides by zero.
        k = max(1, int(total * pct / 100))
        counts = Counter(firm_bucket(r['firm']) for r in ranked[:k])
        results[f'top_{pct}pct'] = {
            'k': k,
            'N_total': total,
            'by_firm': dict(counts),
            'deloitte_share': counts[_DELOITTE] / k,
        }
    return results


def _group_by_year(rows):
    """Group rows by fiscal year in a single pass, preserving row order."""
    grouped = defaultdict(list)
    for r in rows:
        grouped[r['year']].append(r)
    return grouped


def _baseline_counts(rows_by_year):
    """Return {year: (Deloitte auditor-years, all auditor-years)}."""
    return {
        year: (
            sum(1 for r in year_rows if firm_bucket(r['firm']) == _DELOITTE),
            len(year_rows),
        )
        for year, year_rows in rows_by_year.items()
    }


def _plot_rank_distribution(rows, per_year, baseline, path):
    """Draw the two-panel figure and save it to ``path``.

    Panel (a): overlaid density histograms of rank percentile per firm.
    Panel (b): Deloitte share of the yearly top-10% against its baseline
    share of all auditor-years.
    """
    ranked = _rank_by_similarity(rows)
    ranks_by_firm = defaultdict(list)
    for idx, r in enumerate(ranked):
        ranks_by_firm[firm_bucket(r['firm'])].append(idx / len(ranked))

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # (a) Overlaid density histograms of rank percentile by firm
    ax = axes[0]
    for firm, color in _FIRM_COLORS.items():
        pcts = ranks_by_firm.get(firm)
        if pcts:
            ax.hist(pcts, bins=40, alpha=0.55, density=True,
                    label=f'{firm} (n={len(pcts)})',
                    color=color)
    ax.set_xlabel('Rank percentile (0 = highest similarity)')
    ax.set_ylabel('Density')
    ax.set_title('Auditor-year rank distribution by firm (pooled 2013-2023)')
    ax.legend(fontsize=9)

    # (b) Deloitte share of top-10% per year
    ax = axes[1]
    years = sorted(per_year)
    shares = [per_year[y]['top_10pct']['deloitte_share'] * 100 for y in years]
    base_share = [100.0 * baseline[y][0] / baseline[y][1] for y in years]
    ax.plot(years, shares, 'o-', color='#d62728', lw=2,
            label='Deloitte share of top-10% similarity')
    ax.plot(years, base_share, 's--', color='gray', lw=1.5,
            label='Deloitte baseline share of auditor-years')
    ax.set_xlabel('Fiscal year')
    ax.set_ylabel('Share (%)')
    # An empty query result (or all-zero shares) must not crash max() or
    # collapse the axis to a zero-height range.
    peak = max(shares + base_share, default=0.0)
    ax.set_ylim(0, peak * 1.2 if peak > 0 else 1.0)
    ax.set_title('Deloitte concentration in top-similarity auditor-years')
    ax.legend(fontsize=9)
    ax.grid(alpha=0.3)

    plt.tight_layout()
    fig.savefig(path, dpi=150)
    # Close this specific figure rather than whatever pyplot deems current.
    plt.close(fig)


def _build_markdown(rows, firm_counts, pooled, per_year, baseline, now):
    """Assemble the Markdown report and return it as a list of lines."""
    md = [
        '# Partner-Level Similarity Ranking Report',
        f"Generated: {now.strftime('%Y-%m-%d %H:%M:%S')}",
        '',
        '## Method',
        '',
        f'* Unit of observation: auditor-year (CPA name, fiscal year) with '
        f'at least {MIN_SIGS_PER_AUDITOR_YEAR} signatures in that year.',
        '* Similarity statistic: mean of max_similarity_to_same_accountant',
        ' across signatures in the auditor-year.',
        '* Auditor-years ranked globally; per-firm share of top-K positions',
        ' reported for the pooled 2013-2023 sample and per fiscal year.',
        '',
        f'Total auditor-years analyzed: **{len(rows):,}**',
        '',
        '## Auditor-year counts by firm',
        '',
        '| Firm | N auditor-years |',
        '|------|-----------------|',
    ]
    for firm, count in sorted(firm_counts.items(), key=lambda x: -x[1]):
        md.append(f'| {firm} | {count} |')

    md += ['', '## Top-K concentration (pooled 2013-2023)', '',
           '| Top-K | N in bucket | Deloitte | KPMG | PwC | EY | Other | Deloitte share |',
           '|-------|-------------|----------|------|-----|-----|-------|----------------|']
    for key in ('top_10pct', 'top_25pct', 'top_50pct'):
        d = pooled[key]
        md.append(
            f"| {key.replace('top_', 'Top ').replace('pct', '%')} | "
            f"{d['k']} | "
            f"{d['by_firm'].get('Deloitte (Firm A)', 0)} | "
            f"{d['by_firm'].get('KPMG', 0)} | "
            f"{d['by_firm'].get('PwC', 0)} | "
            f"{d['by_firm'].get('EY', 0)} | "
            f"{d['by_firm'].get('Other / Non-Big-4', 0)} | "
            f"**{d['deloitte_share']*100:.1f}%** |"
        )

    md += ['', '## Per-year Deloitte share of top-10% similarity', '',
           '| Year | N auditor-years | Top-10% k | Deloitte in top-10% | '
           'Deloitte share | Deloitte baseline share |',
           '|------|-----------------|-----------|---------------------|'
           '----------------|-------------------------|']
    for y in sorted(per_year):
        d = per_year[y]['top_10pct']
        deloitte_n, total_n = baseline[y]
        base = deloitte_n / total_n
        md.append(
            f"| {y} | {d['N_total']} | {d['k']} | "
            f"{d['by_firm'].get('Deloitte (Firm A)', 0)} | "
            f"{d['deloitte_share']*100:.1f}% | "
            f"{base*100:.1f}% |"
        )

    md += [
        '',
        '## Interpretation',
        '',
        'If Deloitte Taiwan applies firm-wide stamping, Deloitte auditor-years',
        'should over-represent in the top of the similarity distribution relative',
        'to their baseline share of all auditor-years. The pooled top-10%',
        'Deloitte share divided by the baseline gives a concentration ratio',
        "that is informative about the firm's signing practice without",
        'requiring per-report ground-truth labels.',
        '',
        'Year-by-year stability of this concentration provides evidence about',
        'whether the stamping practice was maintained throughout 2013-2023 or',
        'changed in response to the industry-wide shift to electronic signing',
        'systems around 2020.',
    ]
    return md


def main():
    """Run the ranking analysis and write the figure, JSON and Markdown.

    Prints a console summary, then writes three files under ``OUT``:
    ``partner_rank_distribution.png``, ``partner_ranking_results.json``
    and ``partner_ranking_report.md``.
    """
    print('=' * 70)
    print('Script 22: Partner-Level Similarity Ranking')
    print('=' * 70)

    rows = load_auditor_years()
    print(f'\nN auditor-years (>= {MIN_SIGS_PER_AUDITOR_YEAR} sigs): {len(rows):,}')

    # Firm-level counts
    firm_counts = Counter(firm_bucket(r['firm']) for r in rows)
    print('\nAuditor-years by firm:')
    for firm, count in sorted(firm_counts.items(), key=lambda x: -x[1]):
        print(f' {firm}: {count}')

    # POOLED (2013-2023)
    print('\n--- POOLED 2013-2023 ---')
    pooled = top_decile_breakdown(rows)
    for bucket, data in pooled.items():
        print(f' {bucket} (top {data["k"]} of {data["N_total"]}): '
              f'Deloitte share = {data["deloitte_share"]*100:.1f}%')
        for firm, count in sorted(data['by_firm'].items(),
                                  key=lambda x: -x[1]):
            print(f' {firm}: {count}')

    # PER-YEAR (rows are grouped once instead of re-filtered per year)
    print('\n--- PER-YEAR TOP-10% DELOITTE SHARE ---')
    rows_by_year = _group_by_year(rows)
    baseline = _baseline_counts(rows_by_year)
    per_year = {}
    for year in sorted(rows_by_year):
        breakdown = top_decile_breakdown(rows_by_year[year])
        per_year[year] = breakdown
        top10 = breakdown['top_10pct']
        print(f' {year}: N={top10["N_total"]}, top-10% k={top10["k"]}, '
              f'Deloitte share = {top10["deloitte_share"]*100:.1f}%, '
              f'Deloitte count={top10["by_firm"].get("Deloitte (Firm A)",0)}')

    # Figure: partner rank distribution by firm
    figure_path = OUT / 'partner_rank_distribution.png'
    _plot_rank_distribution(rows, per_year, baseline, figure_path)
    print(f'\nFigure: {OUT / "partner_rank_distribution.png"}')

    # One timestamp so the JSON and the Markdown agree on generation time.
    now = datetime.now()

    # JSON
    summary = {
        'generated_at': now.isoformat(),
        'min_signatures_per_auditor_year': MIN_SIGS_PER_AUDITOR_YEAR,
        'n_auditor_years': len(rows),
        'firm_counts': dict(firm_counts),
        'pooled_deciles': pooled,
        'per_year': {int(k): v for k, v in per_year.items()},
    }
    # ensure_ascii=False can emit non-ASCII text, so pin the encoding
    # instead of relying on the platform default.
    with open(OUT / 'partner_ranking_results.json', 'w',
              encoding='utf-8') as f:
        json.dump(summary, f, indent=2, ensure_ascii=False)
    print(f'JSON: {OUT / "partner_ranking_results.json"}')

    # Markdown
    md = _build_markdown(rows, firm_counts, pooled, per_year, baseline, now)
    (OUT / 'partner_ranking_report.md').write_text('\n'.join(md),
                                                   encoding='utf-8')
    print(f'Report: {OUT / "partner_ranking_report.md"}')


if __name__ == '__main__':
    main()