3c7fcc010f
- Re-anchor inter-CPA coincidence-rate (ICCR) calibration on a normative non-Firm-A baseline (Firms B/C/D); Firm A held out as an out-of-sample target. Locked canonical numbers (codex-audited; Scripts 46/52/53): per-comparison HC 0.00014->0.000018, per-signature HC 0.0116, per-document HC+MC 0.34->0.1905; KDE crossover 0.837 retained corpus-wide. - Reposition as an operator-tunable, semi-automated screening/triage framework (title -> "Automated Screening..."): HC = high-specificity operating point; MC band demoted to low-specificity advisory; Firm A = demonstration that the screening surfaces a templated end, audit-quality implications deferred. - Apply codex prose-review fixes: triage-neutral five-way labels, soften mechanism/specificity wording, supersede MC claim-strength, update stale Appendix script references (40b/43/45 -> 46/52/53). - Trim pass: compress Sec. V discussion + Sec. III echoes (27.7k -> 26.8k words); no substantive content removed. - Add analysis scripts 45-53 (firm-year trends; BCD-only ICCR recompute; canonical-sampler locked numbers; Firm-A out-of-sample; BCD regression + cross-firm hit matrix). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
61 lines
2.5 KiB
Python
61 lines
2.5 KiB
Python
#!/usr/bin/env python3
|
||
"""Firm x year descriptor trends (B-gate diagnostic).
|
||
|
||
Plots, per firm and per year, the mean best-match cosine similarity, the mean
independent-min dHash, and the HC-box hit share (cosine > 0.95 and dHash <= 5)
to test whether Firms B/C/D show a 2020 structural break converging toward
Firm A.
Read-only against the production DB.
|
||
"""
|
||
import sqlite3
|
||
import matplotlib.pyplot as plt
|
||
|
||
# SQLite database file that holds the signature descriptors.
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'

# One entry per firm:
#   (firm name as matched against accountants.firm, legend label, line colour)
FIRMS = [
    ('勤業眾信聯合', 'Firm A (Deloitte)', '#d62728'),
    ('安侯建業聯合', 'Firm B (KPMG)', '#1f77b4'),
    ('資誠聯合', 'Firm C (PwC)', '#2ca02c'),
    ('安永聯合', 'Firm D (EY)', '#ff7f0e'),
]

# uri=True lets the "mode=ro" query parameter take effect, so the database
# is opened read-only and this script cannot modify it.
conn = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
cur = conn.cursor()
|
||
|
||
|
||
def series(firm_zh, cursor=None, cos_thresh=0.95, dhash_max=5):
    """Return yearly descriptor aggregates for one firm.

    Joins ``signatures`` to ``accountants`` on the assigned accountant's
    name, keeps the rows of the requested firm whose ``year_month`` and both
    descriptor columns are non-NULL, and groups them by the first four
    characters of ``year_month`` cast to an integer.

    Args:
        firm_zh: Firm name, matched exactly against ``accountants.firm``.
        cursor: DB-API cursor to run the query on. Defaults to the
            module-level ``cur``.
        cos_thresh: Strict lower bound on
            ``max_similarity_to_same_accountant`` for a row to count as an
            HC-box hit.
        dhash_max: Inclusive upper bound on ``min_dhash_independent`` for a
            row to count as an HC-box hit.

    Returns:
        A list of ``(year, mean_cosine, mean_dhash, hc_share, n_rows)``
        tuples in ascending year order; empty when no row matches.
    """
    if cursor is None:
        cursor = cur  # fall back to the script-wide read-only cursor
    # Placeholders are bound in textual order: the two HC-box thresholds in
    # the SELECT list come before the firm filter in the WHERE clause.
    cursor.execute(
        """
        SELECT CAST(substr(s.year_month,1,4) AS INT) AS yr,
               AVG(s.max_similarity_to_same_accountant),
               AVG(s.min_dhash_independent),
               AVG(CASE WHEN s.max_similarity_to_same_accountant>?
                         AND s.min_dhash_independent<=? THEN 1.0 ELSE 0.0 END),
               COUNT(*)
        FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
        WHERE a.firm=? AND s.year_month IS NOT NULL
          AND s.max_similarity_to_same_accountant IS NOT NULL
          AND s.min_dhash_independent IS NOT NULL
        GROUP BY yr ORDER BY yr""",
        (cos_thresh, dhash_max, firm_zh),
    )
    return cursor.fetchall()
|
||
|
||
|
||
# Fetch every firm's yearly series first, and release the database connection
# even if a query fails; nothing after this point needs the database.
try:
    firm_rows = [(label, color, series(firm_zh))
                 for firm_zh, label, color in FIRMS]
finally:
    conn.close()

# One panel per descriptor: mean cosine, mean dHash, HC-box hit share.
fig, axes = plt.subplots(1, 3, figsize=(16, 4.8))
for label, color, rows in firm_rows:
    yrs = [r[0] for r in rows]
    # Row columns 1..3 map onto the three panels in order; column 4 (the row
    # count) is not plotted.
    for ax, col in zip(axes, (1, 2, 3)):
        ax.plot(yrs, [r[col] for r in rows], 'o-', color=color, label=label)

for ax in axes:
    ax.axvline(2020, ls='--', color='grey', alpha=0.6)
    # Anchor the label in axes coordinates on the y axis (0 = bottom edge) so
    # it stays at the bottom even when the reference lines added below widen
    # the y-limits after this call.
    ax.text(2020.05, 0, ' 2020', color='grey', fontsize=8, va='bottom',
            transform=ax.get_xaxis_transform())
    ax.set_xlabel('Fiscal year')
    ax.grid(alpha=0.3)

# Dotted horizontal lines mark the HC-box thresholds on the first two panels.
axes[0].set_title('Mean best-match cosine')
axes[0].axhline(0.95, ls=':', color='k', alpha=0.4)
axes[1].set_title('Mean independent-min dHash')
axes[1].axhline(5, ls=':', color='k', alpha=0.4)
axes[2].set_title('HC-box share (cos>0.95 & dHash$\\leq$5)')
axes[0].legend(fontsize=8, loc='lower right')

# NOTE(review): the title states the conclusion ("no 2020 break, no
# convergence to A") as fixed text; it is not derived from the plotted data,
# so re-check it whenever the underlying database changes.
fig.suptitle('Big-4 descriptor trends 2013–2023 (2023 = partial, to Apr) — no 2020 break, no convergence to A',
             fontsize=11)
fig.tight_layout()

out = '/Volumes/NV2/pdf_recognize/signature_analysis/firm_year_trends.png'
fig.savefig(out, dpi=130, bbox_inches='tight')
print('saved', out)
|