pdf_signature_extraction/signature_analysis/22_partner_ranking.py

#!/usr/bin/env python3
"""
Script 22: Partner-Level Similarity Ranking (per Partner v4 Section F.3)
========================================================================
Rank engagement partners' auditor-years by the per-auditor-year mean of
max_similarity_to_same_accountant.  All firms in the database are ranked
together (non-Big-4 firms are pooled into one bucket).  Under Partner v4's
benchmark validation argument, if Deloitte Taiwan applies firm-wide stamping,
Deloitte partners should disproportionately occupy the upper ranks of the
cosine distribution.

Construction:
  - Unit of observation: auditor-year = (CPA name, fiscal year)
  - For each auditor-year compute:
        cos_auditor_year = mean(max_similarity_to_same_accountant)
                             over that CPA's signatures in that year
  - Only include auditor-years with >= 5 signatures
  - Rank globally; compute per-firm share of top-K buckets
  - Report for the pooled 2013-2023 sample and year-by-year

Output:
  reports/partner_ranking/partner_ranking_report.md
  reports/partner_ranking/partner_ranking_results.json
  reports/partner_ranking/partner_rank_distribution.png
"""

import sqlite3
import json
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from pathlib import Path
from datetime import datetime
from collections import defaultdict

# Path of the SQLite database opened by load_auditor_years().
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
# Directory that receives the figure, JSON and Markdown outputs.
OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/'
           'partner_ranking')
# NOTE: runs at import time, so importing this module creates the directory.
OUT.mkdir(parents=True, exist_ok=True)

# Firm names that firm_bucket() maps to Deloitte, KPMG, PwC and EY, in that
# order.  BIG4 and FIRM_A are not referenced by the code visible in this file.
BIG4 = ['勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合']
# Same string that firm_bucket() labels 'Deloitte (Firm A)'.
FIRM_A = '勤業眾信聯合'
# Minimum signature count for an auditor-year to be kept (SQL HAVING clause).
MIN_SIGS_PER_AUDITOR_YEAR = 5


def load_auditor_years(db_path=None, min_sigs=None):
    """Load one aggregated record per (accountant, year) from the database.

    Signatures with a NULL accountant, similarity or year_month are ignored.
    The year is the first four characters of ``year_month``.  Groups with
    fewer than ``min_sigs`` remaining signatures are dropped.

    Args:
        db_path: SQLite database to read.  Defaults to the module-level ``DB``.
        min_sigs: Minimum signatures per auditor-year.  Defaults to the
            module-level ``MIN_SIGS_PER_AUDITOR_YEAR``.

    Returns:
        A list of dicts with keys ``accountant``, ``firm`` (``'(unknown)'``
        when the join finds no firm), ``year`` (int), ``cos_mean`` (mean of
        ``max_similarity_to_same_accountant``) and ``n`` (signature count).

    NOTE(review): ``a.firm`` is not part of the GROUP BY, and the LEFT JOIN
    would multiply signature rows if ``accountants`` can hold several rows
    for one name -- confirm that ``accountants.name`` is unique.
    """
    if db_path is None:
        db_path = DB
    if min_sigs is None:
        min_sigs = MIN_SIGS_PER_AUDITOR_YEAR

    conn = sqlite3.connect(db_path)
    # sqlite3's own context manager only commits/rolls back; close explicitly
    # in a finally so the handle is released even when the query fails.
    try:
        cur = conn.cursor()
        cur.execute('''
        SELECT s.assigned_accountant, a.firm,
               substr(s.year_month, 1, 4) AS year,
               AVG(s.max_similarity_to_same_accountant)   AS cos_mean,
               COUNT(*)                                    AS n
        FROM signatures s
        LEFT JOIN accountants a ON s.assigned_accountant = a.name
        WHERE s.assigned_accountant IS NOT NULL
          AND s.max_similarity_to_same_accountant IS NOT NULL
          AND s.year_month IS NOT NULL
        GROUP BY s.assigned_accountant, year
        HAVING n >= ?
    ''', (min_sigs,))
        rows = cur.fetchall()
    finally:
        conn.close()

    return [{'accountant': accountant,
             'firm': firm or '(unknown)',
             'year': int(year),
             'cos_mean': float(cos_mean),
             'n': int(n)}
            for accountant, firm, year, cos_mean, n in rows]


def firm_bucket(firm):
    """Translate a firm name from the database into its report label.

    The four recognised names map to 'Deloitte (Firm A)', 'KPMG', 'PwC' and
    'EY'; every other value (including None) maps to 'Other / Non-Big-4'.
    """
    known_firms = (
        ('勤業眾信聯合', 'Deloitte (Firm A)'),
        ('安侯建業聯合', 'KPMG'),
        ('資誠聯合', 'PwC'),
        ('安永聯合', 'EY'),
    )
    for raw_name, label in known_firms:
        if firm == raw_name:
            return label
    return 'Other / Non-Big-4'


def top_decile_breakdown(rows, deciles=(10, 25, 50)):
    """Tally, per firm, who occupies the highest-similarity positions.

    ``rows`` are ordered by descending ``cos_mean`` (ties keep input order).
    For each percentage in ``deciles`` the leading ``k`` rows are taken, where
    ``k = max(1, int(N * pct / 100))``, so ``k`` is 1 even for empty input.

    Returns a dict keyed ``'top_<pct>pct'``; each value holds ``k``,
    ``N_total``, ``by_firm`` (label -> count, only labels that occur) and
    ``deloitte_share`` (Deloitte count divided by ``k``).
    """
    ranked = list(rows)
    ranked.sort(key=lambda row: -row['cos_mean'])
    total = len(ranked)

    breakdown = {}
    for pct in deciles:
        cutoff = max(1, int(total * pct / 100))
        tally = {}
        for row in ranked[:cutoff]:
            label = firm_bucket(row['firm'])
            tally[label] = tally.get(label, 0) + 1
        breakdown[f'top_{pct}pct'] = {
            'k': cutoff,
            'N_total': total,
            'by_firm': tally,
            'deloitte_share': tally.get('Deloitte (Firm A)', 0) / cutoff,
        }
    return breakdown


def _deloitte_year_counts(rows):
    """Count Deloitte and total auditor-years per year in a single pass.

    Returns ``{year: (deloitte_n, total_n)}``.
    """
    deloitte_n = defaultdict(int)
    total_n = defaultdict(int)
    for r in rows:
        total_n[r['year']] += 1
        if firm_bucket(r['firm']) == 'Deloitte (Firm A)':
            deloitte_n[r['year']] += 1
    return {y: (deloitte_n[y], total_n[y]) for y in total_n}


def _plot_rank_distribution(rows, per_year, year_counts, path):
    """Draw the two-panel figure and save it to ``path``.

    ``rows`` must be non-empty; ``per_year`` and ``year_counts`` must cover
    the same years.
    """
    ranked = sorted(rows, key=lambda r: -r['cos_mean'])
    ranks_by_firm = defaultdict(list)
    for idx, r in enumerate(ranked):
        ranks_by_firm[firm_bucket(r['firm'])].append(idx / len(ranked))

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # (a) Overlaid density histograms of rank percentile, one per firm
    ax = axes[0]
    colors = {'Deloitte (Firm A)': '#d62728', 'KPMG': '#1f77b4',
              'PwC': '#2ca02c', 'EY': '#9467bd',
              'Other / Non-Big-4': '#7f7f7f'}
    for firm, color in colors.items():
        pcts = ranks_by_firm.get(firm)
        if pcts:
            ax.hist(pcts, bins=40, alpha=0.55, density=True,
                    label=f'{firm} (n={len(pcts)})',
                    color=color)
    ax.set_xlabel('Rank percentile (0 = highest similarity)')
    ax.set_ylabel('Density')
    ax.set_title('Auditor-year rank distribution by firm (pooled 2013-2023)')
    ax.legend(fontsize=9)

    # (b) Deloitte share of top-10% per year
    ax = axes[1]
    years = sorted(per_year)
    shares = [per_year[y]['top_10pct']['deloitte_share'] * 100 for y in years]
    base_share = [100.0 * year_counts[y][0] / year_counts[y][1]
                  for y in years]
    ax.plot(years, shares, 'o-', color='#d62728', lw=2,
            label='Deloitte share of top-10% similarity')
    ax.plot(years, base_share, 's--', color='gray', lw=1.5,
            label='Deloitte baseline share of auditor-years')
    ax.set_xlabel('Fiscal year')
    ax.set_ylabel('Share (%)')
    # Avoid identical (0, 0) limits when no Deloitte auditor-year exists.
    y_top = max(max(shares), max(base_share)) * 1.2
    ax.set_ylim(0, y_top if y_top > 0 else 1.0)
    ax.set_title('Deloitte concentration in top-similarity auditor-years')
    ax.legend(fontsize=9)
    ax.grid(alpha=0.3)

    plt.tight_layout()
    fig.savefig(path, dpi=150)
    plt.close(fig)


def _build_report_lines(rows, firm_counts, pooled, per_year, year_counts,
                        generated):
    """Return the Markdown report as a list of lines."""
    md = [
        '# Partner-Level Similarity Ranking Report',
        f"Generated: {generated.strftime('%Y-%m-%d %H:%M:%S')}",
        '',
        '## Method',
        '',
        f'* Unit of observation: auditor-year (CPA name, fiscal year) with '
        f'at least {MIN_SIGS_PER_AUDITOR_YEAR} signatures in that year.',
        '* Similarity statistic: mean of max_similarity_to_same_accountant',
        '  across signatures in the auditor-year.',
        '* Auditor-years ranked globally; per-firm share of top-K positions',
        '  reported for the pooled 2013-2023 sample and per fiscal year.',
        '',
        f'Total auditor-years analyzed: **{len(rows):,}**',
        '',
        '## Auditor-year counts by firm',
        '',
        '| Firm | N auditor-years |',
        '|------|-----------------|',
    ]
    for firm, count in sorted(firm_counts.items(), key=lambda x: -x[1]):
        md.append(f'| {firm} | {count} |')

    md += ['', '## Top-K concentration (pooled 2013-2023)', '',
           '| Top-K | N in bucket | Deloitte | KPMG | PwC | EY | Other | Deloitte share |',
           '|-------|-------------|----------|------|-----|-----|-------|----------------|']
    for key in ('top_10pct', 'top_25pct', 'top_50pct'):
        d = pooled[key]
        md.append(
            f"| {key.replace('top_', 'Top ').replace('pct', '%')} | "
            f"{d['k']} | "
            f"{d['by_firm'].get('Deloitte (Firm A)', 0)} | "
            f"{d['by_firm'].get('KPMG', 0)} | "
            f"{d['by_firm'].get('PwC', 0)} | "
            f"{d['by_firm'].get('EY', 0)} | "
            f"{d['by_firm'].get('Other / Non-Big-4', 0)} | "
            f"**{d['deloitte_share']*100:.1f}%** |"
        )

    md += ['', '## Per-year Deloitte share of top-10% similarity', '',
           '| Year | N auditor-years | Top-10% k | Deloitte in top-10% | '
           'Deloitte share | Deloitte baseline share |',
           '|------|-----------------|-----------|---------------------|'
           '----------------|-------------------------|']
    for y in sorted(per_year):
        d = per_year[y]['top_10pct']
        deloitte_n, total_n = year_counts[y]
        baseline = deloitte_n / total_n
        md.append(
            f"| {y} | {d['N_total']} | {d['k']} | "
            f"{d['by_firm'].get('Deloitte (Firm A)', 0)} | "
            f"{d['deloitte_share']*100:.1f}% | "
            f"{baseline*100:.1f}% |"
        )

    md += [
        '',
        '## Interpretation',
        '',
        'If Deloitte Taiwan applies firm-wide stamping, Deloitte auditor-years',
        'should over-represent in the top of the similarity distribution relative',
        'to their baseline share of all auditor-years. The pooled top-10%',
        'Deloitte share divided by the baseline gives a concentration ratio',
        "that is informative about the firm's signing practice without",
        'requiring per-report ground-truth labels.',
        '',
        'Year-by-year stability of this concentration provides evidence about',
        'whether the stamping practice was maintained throughout 2013-2023 or',
        'changed in response to the industry-wide shift to electronic signing',
        'systems around 2020.',
    ]
    return md


def main():
    """Rank auditor-years by similarity and write figure, JSON and report.

    Loads the auditor-year records, prints pooled and per-year top-K firm
    breakdowns, then writes ``partner_rank_distribution.png``,
    ``partner_ranking_results.json`` and ``partner_ranking_report.md``
    into ``OUT``.

    Raises:
        SystemExit: when no auditor-year meets the signature threshold, since
            there is nothing to rank or plot.

    NOTE(review): labels say "2013-2023" but the query applies no year
    filter -- confirm the database only spans those years.
    """
    print('=' * 70)
    print('Script 22: Partner-Level Similarity Ranking')
    print('=' * 70)

    rows = load_auditor_years()
    print(f'\nN auditor-years (>= {MIN_SIGS_PER_AUDITOR_YEAR} sigs): {len(rows):,}')
    if not rows:
        # Fail with a clear message instead of an opaque max() ValueError
        # deep inside the plotting code.
        raise SystemExit('No auditor-years meet the minimum-signature '
                         'threshold; nothing to rank.')

    # Firm-level counts
    firm_counts = defaultdict(int)
    for r in rows:
        firm_counts[firm_bucket(r['firm'])] += 1
    print('\nAuditor-years by firm:')
    for firm, count in sorted(firm_counts.items(), key=lambda x: -x[1]):
        print(f'  {firm}: {count}')

    # POOLED (2013-2023)
    print('\n--- POOLED 2013-2023 ---')
    pooled = top_decile_breakdown(rows)
    for bucket, data in pooled.items():
        print(f'  {bucket} (top {data["k"]} of {data["N_total"]}): '
              f'Deloitte share = {data["deloitte_share"]*100:.1f}%')
        for firm, count in sorted(data['by_firm'].items(),
                                  key=lambda x: -x[1]):
            print(f'    {firm}: {count}')

    # PER-YEAR: group once rather than rescanning all rows for every year.
    print('\n--- PER-YEAR TOP-10% DELOITTE SHARE ---')
    rows_by_year = defaultdict(list)
    for r in rows:
        rows_by_year[r['year']].append(r)
    per_year = {}
    for year in sorted(rows_by_year):
        breakdown = top_decile_breakdown(rows_by_year[year])
        per_year[year] = breakdown
        top10 = breakdown['top_10pct']
        print(f'  {year}: N={top10["N_total"]}, top-10% k={top10["k"]}, '
              f'Deloitte share = {top10["deloitte_share"]*100:.1f}%, '
              f'Deloitte count={top10["by_firm"].get("Deloitte (Firm A)",0)}')

    # Baseline counts are shared by the figure and the Markdown table.
    year_counts = _deloitte_year_counts(rows)

    # Figure: partner rank distribution by firm
    fig_path = OUT / 'partner_rank_distribution.png'
    _plot_rank_distribution(rows, per_year, year_counts, fig_path)
    print(f'\nFigure: {fig_path}')

    # One timestamp so the JSON and the report agree.
    generated = datetime.now()

    # JSON
    summary = {
        'generated_at': generated.isoformat(),
        'min_signatures_per_auditor_year': MIN_SIGS_PER_AUDITOR_YEAR,
        'n_auditor_years': len(rows),
        'firm_counts': dict(firm_counts),
        'pooled_deciles': pooled,
        'per_year': {int(k): v for k, v in per_year.items()},
    }
    json_path = OUT / 'partner_ranking_results.json'
    # ensure_ascii=False can emit non-ASCII text, so pin the encoding instead
    # of relying on the platform default.
    with open(json_path, 'w', encoding='utf-8') as fh:
        json.dump(summary, fh, indent=2, ensure_ascii=False)
    print(f'JSON: {json_path}')

    # Markdown
    md = _build_report_lines(rows, firm_counts, pooled, per_year,
                             year_counts, generated)
    report_path = OUT / 'partner_ranking_report.md'
    report_path.write_text('\n'.join(md), encoding='utf-8')
    print(f'Report: {report_path}')


if __name__ == '__main__':
    main()