"""Figure 3 (real data version): 2D density of the two measures over the five-region scheme. Replaces the earlier schematic with the actual distribution, with axis ticks and the rule cuts. Reproduces from signature_analysis.db; Big-4, is_valid=1, both measures present.""" import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt from matplotlib.colors import LogNorm from matplotlib.patches import Rectangle import numpy as np import sqlite3 DB = "/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db" BIG4 = ('勤業眾信聯合', '資誠聯合', '安侯建業聯合', '安永聯合') con = sqlite3.connect(DB) cur = con.cursor() cur.execute(f""" SELECT max_similarity_to_same_accountant, min_dhash_independent FROM signatures WHERE is_valid=1 AND max_similarity_to_same_accountant IS NOT NULL AND min_dhash_independent IS NOT NULL AND excel_firm IN ({','.join(['?']*4)}) """, BIG4) rows = cur.fetchall() con.close() cos = np.array([r[0] for r in rows], dtype=float) dh = np.array([r[1] for r in rows], dtype=float) n = len(cos) LO, HI = 0.8547, 0.95 DH1, DH2 = 5, 15 xmin, xmax = 0.70, 1.002 ymin, ymax = -0.5, 30 ycap = 30 # display cap; values above are piled into the top row for visibility dh_disp = np.minimum(dh, ycap - 0.5) fig, ax = plt.subplots(figsize=(5.6, 4.4)) # faint region tint behind the density ax.add_patch(Rectangle((xmin, ymin), LO - xmin, ymax - ymin, facecolor='#bdc3c7', alpha=0.12, zorder=0)) ax.add_patch(Rectangle((LO, ymin), HI - LO, ymax - ymin, facecolor='#f7dc6f', alpha=0.12, zorder=0)) ax.add_patch(Rectangle((HI, ymin), xmax - HI, DH1 - ymin, facecolor='#cb4335', alpha=0.14, zorder=0)) ax.add_patch(Rectangle((HI, DH1), xmax - HI, DH2 - DH1, facecolor='#eb984e', alpha=0.14, zorder=0)) ax.add_patch(Rectangle((HI, DH2), xmax - HI, ymax - DH2, facecolor='#aed6f1', alpha=0.14, zorder=0)) # real 2D density (log counts) xedges = np.linspace(xmin, xmax, 90) yedges = np.arange(-0.5, ycap + 0.5, 1.0) # integer dHash bins H, xe, ye = np.histogram2d(cos, dh_disp, bins=[xedges, yedges]) pcm = ax.pcolormesh(xe, ye, H.T, norm=LogNorm(vmin=1, vmax=H.max()), cmap='viridis', zorder=1, shading='flat') cb = fig.colorbar(pcm, ax=ax, pad=0.02) cb.set_label('signatures per cell (log scale)', fontsize=8) cb.ax.tick_params(labelsize=7) # cut lines ax.axvline(LO, color='gray', ls=':', lw=1.1, zorder=3) ax.axvline(HI, color='black', ls='--', lw=1.1, zorder=3) ax.plot([HI, xmax], [DH1, DH1], 'k--', lw=0.9, zorder=3) ax.plot([HI, xmax], [DH2, DH2], 'k--', lw=0.9, zorder=3) # region labels ax.text((xmin + LO) / 2, 24, 'LH', ha='center', fontsize=10, weight='bold', color='#34495e', zorder=4) ax.text((LO + HI) / 2, 24, 'UN', ha='center', fontsize=10, weight='bold', color='#7d6608', zorder=4) ax.text((HI + xmax) / 2, 2.2, 'HC', ha='center', fontsize=10, weight='bold', color='#cb4335', zorder=4) ax.text((HI + xmax) / 2, 9.7, 'MC', ha='center', fontsize=10, weight='bold', color='#a04000', zorder=4) ax.text((HI + xmax) / 2, 24, 'HSC', ha='center', fontsize=9, weight='bold', color='#21618c', zorder=4) ax.set_xlim(xmin, xmax) ax.set_ylim(ymin, ymax) ax.set_xticks([0.70, 0.75, 0.80, 0.8547, 0.90, 0.95, 1.00]) ax.set_xticklabels(['0.70', '0.75', '0.80', '0.855', '0.90', '0.95', '1.00'], fontsize=7.5) ax.set_yticks([0, 5, 10, 15, 20, 25, 30]) ax.set_yticklabels(['0', '5', '10', '15', '20', '25', '≥30'], fontsize=7.5) ax.set_xlabel('cosine similarity to same accountant (style)', fontsize=9) ax.set_ylabel('min dHash distance (structure)', fontsize=9) ax.set_title(f'Figure 3. Two-measure plane: real density over the five regions (Big-4, n={n:,})', fontsize=8.5) fig.tight_layout() out = '/Volumes/NV2/pdf_recognize/paper/v13_build/figures/fig3.png' fig.savefig(out, dpi=200, bbox_inches='tight') plt.close(fig) print(f'fig3 density OK: n={n:,}, dHash>=30 piled: {(dh>=ycap).sum()}, written {out}')