#!/usr/bin/env python3 """ Script 30: Yearly Per-Firm Cosine Similarity Comparison ======================================================== Generates the per-firm year-by-year per-signature best-match cosine distribution: Firm A (Deloitte), Firm B (KPMG), Firm C (PwC), Firm D (EY), Non-Big-4. The two-panel figure (mean cosine; share above 0.95) is the headline cross-firm visual requested in partner review of v3.19.1 (2026-04-27): five lines, X-axis 2013-2023, Firm A at the top. Outputs: reports/figures/fig_yearly_big4_comparison.png reports/figures/fig_yearly_big4_comparison.pdf reports/firm_yearly_comparison/firm_yearly_comparison.json reports/firm_yearly_comparison/firm_yearly_comparison.md """ import json import sqlite3 from datetime import datetime from pathlib import Path import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt import numpy as np DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db' FIG_OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/' 'figures') DATA_OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/' 'firm_yearly_comparison') FIG_OUT.mkdir(parents=True, exist_ok=True) DATA_OUT.mkdir(parents=True, exist_ok=True) FIRM_BUCKETS = [ ('Firm A', '勤業眾信聯合'), ('Firm B', '安侯建業聯合'), ('Firm C', '資誠聯合'), ('Firm D', '安永聯合'), ] FIRM_COLORS = { 'Firm A': '#d62728', 'Firm B': '#1f77b4', 'Firm C': '#2ca02c', 'Firm D': '#9467bd', 'Non-Big-4': '#7f7f7f', } FIRM_MARKERS = { 'Firm A': 'o', 'Firm B': 's', 'Firm C': '^', 'Firm D': 'D', 'Non-Big-4': 'v', } COSINE_CUT = 0.95 def firm_bucket(firm): for label, name in FIRM_BUCKETS: if firm == name: return label return 'Non-Big-4' def load_rows(conn): cur = conn.cursor() cur.execute(""" SELECT a.firm, CAST(substr(s.year_month, 1, 4) AS INTEGER) AS year, s.max_similarity_to_same_accountant FROM signatures s LEFT JOIN accountants a ON s.assigned_accountant = a.name WHERE s.max_similarity_to_same_accountant IS NOT NULL AND s.year_month IS NOT NULL AND s.assigned_accountant IS NOT NULL """) return cur.fetchall() def aggregate(rows): """Returns dict keyed by (firm_label, year) -> {n, mean_cos, share_ge_cut}.""" by_firm_year = {} for firm, year, cos in rows: if year is None or year < 2013 or year > 2023: continue label = firm_bucket(firm) key = (label, int(year)) by_firm_year.setdefault(key, []).append(float(cos)) summary = {} for (label, year), vals in by_firm_year.items(): arr = np.array(vals, dtype=float) summary[(label, year)] = { 'n': int(arr.size), 'mean_cos': float(arr.mean()), 'share_ge_cut': float(np.mean(arr >= COSINE_CUT)), } return summary def plot_figure(summary, years, firm_labels, fig_path_png, fig_path_pdf): fig, axes = plt.subplots(1, 2, figsize=(13, 5)) ax = axes[0] for label in firm_labels: ys = [summary[(label, y)]['mean_cos'] if (label, y) in summary else np.nan for y in years] ax.plot(years, ys, marker=FIRM_MARKERS[label], color=FIRM_COLORS[label], lw=2.0, ms=6, label=label, zorder=3 if label == 'Firm A' else 2) ax.set_xlabel('Fiscal year') ax.set_ylabel('Mean per-signature best-match cosine') ax.set_title('(a) Mean per-signature best-match cosine, by firm and year') ax.set_xticks(years) ax.tick_params(axis='x', rotation=0) ax.grid(True, ls=':', alpha=0.4) ax.legend(loc='lower right', framealpha=0.95) ax = axes[1] for label in firm_labels: ys = [100.0 * summary[(label, y)]['share_ge_cut'] if (label, y) in summary else np.nan for y in years] ax.plot(years, ys, marker=FIRM_MARKERS[label], color=FIRM_COLORS[label], lw=2.0, ms=6, label=label, zorder=3 if label == 'Firm A' else 2) ax.set_xlabel('Fiscal year') ax.set_ylabel(f'% signatures with best-match cosine $\\geq$ {COSINE_CUT}') ax.set_title(f'(b) Share with cosine $\\geq$ {COSINE_CUT}, ' 'by firm and year') ax.set_xticks(years) ax.tick_params(axis='x', rotation=0) ax.grid(True, ls=':', alpha=0.4) ax.legend(loc='lower right', framealpha=0.95) ax.set_ylim(0, 100) fig.suptitle('Per-firm yearly per-signature best-match cosine ' '(operational cut shown as 0.95)', fontsize=12, y=1.02) fig.tight_layout() fig.savefig(fig_path_png, dpi=200, bbox_inches='tight') fig.savefig(fig_path_pdf, bbox_inches='tight') plt.close(fig) def write_markdown(summary, years, firm_labels, md_path): lines = ['# Per-Firm Yearly Cosine Comparison', '', f"Generated: {datetime.now().isoformat(timespec='seconds')}", '', ('Per-signature best-match cosine ' '(`max_similarity_to_same_accountant`), aggregated by firm ' 'bucket and fiscal year. Firm bucket via CPA registry ' '(`accountants.firm`).'), ''] lines.append('## Mean per-signature best-match cosine') lines.append('') header = '| Year | ' + ' | '.join(firm_labels) + ' |' sep = '|------|' + '|'.join(['------'] * len(firm_labels)) + '|' lines.append(header) lines.append(sep) for y in years: row = f'| {y} | ' cells = [] for lab in firm_labels: if (lab, y) in summary: cells.append(f"{summary[(lab, y)]['mean_cos']:.4f}") else: cells.append('---') row += ' | '.join(cells) + ' |' lines.append(row) lines.append('') lines.append(f'## Share with cosine $\\geq$ {COSINE_CUT}') lines.append('') lines.append(header) lines.append(sep) for y in years: row = f'| {y} | ' cells = [] for lab in firm_labels: if (lab, y) in summary: cells.append(f"{100*summary[(lab, y)]['share_ge_cut']:.1f}%") else: cells.append('---') row += ' | '.join(cells) + ' |' lines.append(row) lines.append('') lines.append('## Per-firm signature counts') lines.append('') lines.append(header) lines.append(sep) for y in years: row = f'| {y} | ' cells = [] for lab in firm_labels: if (lab, y) in summary: cells.append(f"{summary[(lab, y)]['n']:,}") else: cells.append('---') row += ' | '.join(cells) + ' |' lines.append(row) md_path.write_text('\n'.join(lines) + '\n', encoding='utf-8') def main(): conn = sqlite3.connect(DB) try: rows = load_rows(conn) finally: conn.close() print(f'Loaded {len(rows):,} signatures with cosine + year + firm.') summary = aggregate(rows) years = sorted({y for (_, y) in summary}) firm_labels = ['Firm A', 'Firm B', 'Firm C', 'Firm D', 'Non-Big-4'] fig_png = FIG_OUT / 'fig_yearly_big4_comparison.png' fig_pdf = FIG_OUT / 'fig_yearly_big4_comparison.pdf' plot_figure(summary, years, firm_labels, fig_png, fig_pdf) print(f'Wrote {fig_png}') print(f'Wrote {fig_pdf}') payload = { 'generated_at': datetime.now().isoformat(timespec='seconds'), 'database_path': DB, 'cosine_cut': COSINE_CUT, 'firm_buckets': dict(FIRM_BUCKETS) | {'Non-Big-4': 'all other'}, 'years': years, 'rows': [ {'firm': lab, 'year': y, **summary[(lab, y)]} for lab in firm_labels for y in years if (lab, y) in summary ], } json_path = DATA_OUT / 'firm_yearly_comparison.json' json_path.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding='utf-8') print(f'Wrote {json_path}') md_path = DATA_OUT / 'firm_yearly_comparison.md' write_markdown(summary, years, firm_labels, md_path) print(f'Wrote {md_path}') if __name__ == '__main__': main()