#!/usr/bin/env python3
"""
Script 29: Firm A Per-Year Cosine Distribution (Table XIII)
============================================================

Generates the year-by-year Firm A per-signature best-match cosine
distribution reported as Table XIII in the manuscript. Codex / Gemini
round-19 review identified that this table previously had no dedicated
generating script (Appendix B incorrectly attributed it to Script 08,
which has no year_month extraction).

Definition:
  Firm A membership is via CPA registry (accountants.firm joined on
  signatures.assigned_accountant), matching the convention used by
  scripts 24 and 28.

For each fiscal year (substr(year_month, 1, 4)):
  - N signatures with non-null max_similarity_to_same_accountant
  - mean of max_similarity_to_same_accountant (the per-signature
    best-match cosine)
  - share with max_similarity_to_same_accountant < 0.95 (the left-tail
    rate cited in Section IV-G.1)

Output:
  reports/firm_a_yearly/firm_a_yearly_distribution.json
  reports/firm_a_yearly/firm_a_yearly_distribution.md
"""

import json
import sqlite3
from datetime import datetime
from pathlib import Path

DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/'
           'firm_a_yearly')
# NOTE: the output directory is created in main(), not at import time, so
# importing this module has no filesystem side effects.

FIRM_A = '勤業眾信聯合'


def yearly_distribution(conn):
    """Return one summary dict per year for Firm A signatures.

    Args:
        conn: an open sqlite3 connection exposing the ``signatures`` and
            ``accountants`` tables queried below.

    Returns:
        A list of dicts ordered by year, each with the keys ``year``,
        ``n_signatures``, ``mean_best_match_cosine`` (rounded to 4 places),
        ``n_below_cosine_095`` and ``pct_below_cosine_095`` (rounded to
        2 places). Rows with a NULL ``year_month`` or a NULL
        ``max_similarity_to_same_accountant`` are excluded by the query.
    """
    cur = conn.cursor()
    cur.execute("""
        SELECT substr(s.year_month, 1, 4) AS year,
               COUNT(*) AS n_sigs,
               AVG(s.max_similarity_to_same_accountant) AS mean_cos,
               SUM(CASE WHEN s.max_similarity_to_same_accountant < 0.95
                        THEN 1 ELSE 0 END) AS n_below_095
        FROM signatures s
        JOIN accountants a ON s.assigned_accountant = a.name
        WHERE a.firm = ?
          AND s.max_similarity_to_same_accountant IS NOT NULL
          AND s.year_month IS NOT NULL
        GROUP BY year
        ORDER BY year
    """, (FIRM_A,))
    # Every group holds at least one row, so n_sigs is never zero and the
    # NULL filter above guarantees mean_cos is a number.
    return [
        {
            'year': int(year),
            'n_signatures': n_sigs,
            'mean_best_match_cosine': round(mean_cos, 4),
            'n_below_cosine_095': n_below,
            'pct_below_cosine_095': round(100.0 * n_below / n_sigs, 2),
        }
        for year, n_sigs, mean_cos, n_below in cur.fetchall()
    ]


def write_markdown(payload, path):
    """Render the yearly table as Markdown and write it to ``path``.

    Args:
        payload: dict with ``generated_at`` and ``yearly_rows`` (the list
            produced by ``yearly_distribution``). If ``firm_a_label`` is
            present it is used in the membership note; otherwise the
            module-level ``FIRM_A`` is used.
        path: object providing ``write_text(text, encoding=...)``, e.g. a
            ``pathlib.Path``.
    """
    # Single source of truth for the firm label instead of a second
    # hard-coded copy of the string.
    firm_label = payload.get('firm_a_label', FIRM_A)

    header = [
        '# Firm A Per-Year Cosine Distribution (Table XIII)',
        '',
        f"Generated at: {payload['generated_at']}",
        '',
        'Firm A membership: CPA registry '
        f'(accountants.firm = "{firm_label}"). Per-signature '
        'best-match cosine = '
        'signatures.max_similarity_to_same_accountant.',
        '',
        '| Year | N sigs | mean best-match cosine | % below 0.95 |',
        '|------|--------|------------------------|--------------|',
    ]
    body = [
        f"| {r['year']} | {r['n_signatures']:,} | "
        f"{r['mean_best_match_cosine']:.4f} | "
        f"{r['pct_below_cosine_095']:.2f}% |"
        for r in payload['yearly_rows']
    ]
    path.write_text('\n'.join(header + body) + '\n', encoding='utf-8')


def main():
    """Query the database and write the JSON and Markdown reports.

    Raises:
        FileNotFoundError: if the database file at ``DB`` does not exist.
    """
    # sqlite3.connect() would silently create an empty database at a missing
    # path and the query would then fail with a misleading "no such table".
    if not Path(DB).is_file():
        raise FileNotFoundError(f'Signature database not found: {DB}')

    conn = sqlite3.connect(DB)
    try:
        payload = {
            'generated_at': datetime.now().isoformat(timespec='seconds'),
            'database_path': DB,
            'firm_a_label': FIRM_A,
            'firm_a_membership_definition': (
                'CPA registry: accountants.firm joined on '
                'signatures.assigned_accountant'
            ),
            'cosine_metric': 'signatures.max_similarity_to_same_accountant',
            'yearly_rows': yearly_distribution(conn),
        }
    finally:
        conn.close()

    OUT.mkdir(parents=True, exist_ok=True)

    json_path = OUT / 'firm_a_yearly_distribution.json'
    json_path.write_text(json.dumps(payload, indent=2, ensure_ascii=False),
                         encoding='utf-8')
    print(f'Wrote {json_path}')

    md_path = OUT / 'firm_a_yearly_distribution.md'
    write_markdown(payload, md_path)
    print(f'Wrote {md_path}')


if __name__ == '__main__':
    main()