cb38d413ad
Pre-emptively address the three residual points from a hostile GPT-5.5 reviewer pass that rev8 had not fully closed (the rest of that review matched the already-applied fusion revision): - Sensitivity surface (Major 5): new Figure 6 maps the deployed rule over the full (cosine cut x dHash cut) plane - clean-group flag rate and the Firm A-minus-B/C/D contrast. Shows no cliff at (0.95, dHash<=5), contrast >45pp across a broad region (58pp at 0.97/dHash<=3), and that extending to the MC bound (dHash<=15) halves the contrast - so the thresholds are not cherry-picked and the weaker MC band is shown, not hidden. Reproducible via make_fig6_sensitivity.py (DB columns only). - Soften "reuse-dominated" (Major 1): the assertion that Firm A "is" a reuse-dominated population now reads "behaves in the screen as," explicitly resting on interviews + byte-identity rather than per-signature ground truth; two other uses made conditional/generic. - Shared-pipeline contamination of ICCR (Major 2): Sec III-E now names the shared within-firm imaging pipeline (scanners, PDF assembly, red-stamp removal) as a channel that can lift the inter-CPA rate above true chance, distinct from "shared template," supported by the Sec V-B pipeline audit; bias direction (higher floor) keeps the Firm-A contrast conservative. rev9 docx rebuilt (6 figures embedded). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> Claude-Session: https://claude.ai/code/session_01Qn59FdF9JMyfFg3sjcUNNG
62 lines
3.0 KiB
Python
62 lines
3.0 KiB
Python
"""Figure 6: two-measure sensitivity surface over the (cosine cut x dHash cut) plane.
|
||
Panel A: clean-group (B/C/D) flag rate -- how permissive the operating point is.
|
||
Panel B: Firm A minus B/C/D flag-rate contrast (pp) -- discrimination across the plane.
|
||
Shows the chosen HC point (0.95, dHash<=5) is not a cherry-picked threshold and exposes
|
||
the weaker MC band (dHash<=15). Reproduces from signature_analysis.db (DB columns only).
|
||
"""
|
||
import matplotlib
|
||
matplotlib.use('Agg')
|
||
import matplotlib.pyplot as plt
|
||
import numpy as np
|
||
import sqlite3
|
||
|
||
# Absolute path of the SQLite database that holds the per-signature measures.
DB = "/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db"
# Firms included in the surface.  The first entry is Firm A; the remaining
# firms form the comparison group.
BIG4 = ('勤業眾信聯合', '資誠聯合', '安侯建業聯合', '安永聯合')
FIRM_A = BIG4[0]

# One "?" per firm, so the IN (...) list always matches len(BIG4); the Firm A
# name is bound as a parameter as well instead of being repeated in the SQL.
placeholders = ','.join('?' * len(BIG4))
query = f"""SELECT CASE WHEN excel_firm=? THEN 1 ELSE 0 END isA,
       max_similarity_to_same_accountant c, min_dhash_independent d
FROM signatures WHERE is_valid=1 AND excel_firm IN ({placeholders})
AND max_similarity_to_same_accountant IS NOT NULL AND min_dhash_independent IS NOT NULL"""

con = sqlite3.connect(DB)
try:
    cur = con.cursor()
    cur.execute(query, (FIRM_A, *BIG4))
    rows = cur.fetchall()
finally:
    # Release the database handle even when the query itself fails.
    con.close()

# Column 0: 1 for Firm A rows, 0 otherwise.  Column 1: cosine similarity.
# Column 2: dHash value.
isA = np.array([row[0] for row in rows], bool)
c = np.array([row[1] for row in rows])
d = np.array([row[2] for row in rows])

# Both groups must be populated: the mean of an empty group is NaN, which
# would otherwise silently produce an empty-looking figure.
if not isA.any() or isA.all():
    raise RuntimeError(
        f"expected rows for both Firm A and the comparison firms in {DB}; "
        f"got {int(isA.sum())} Firm A and {int((~isA).sum())} other rows"
    )

cA, dA = c[isA], d[isA]
cB, dB = c[~isA], d[~isA]
|
||
|
||
def _flag_rate_surface(cos_vals, dh_vals, cos_grid, dh_grid):
    """Return the flag rate (%) for every (dHash cut, cosine cut) pair.

    A signature is flagged at a grid point when its cosine value is strictly
    greater than the cosine cut and its dHash value is at most the dHash cut.
    Rows of the result follow ``dh_grid``; columns follow ``cos_grid``.
    """
    surface = np.zeros((len(dh_grid), len(cos_grid)))
    for col, cos_cut in enumerate(cos_grid):
        # The cosine test does not depend on the dHash cut, so evaluate it
        # once per column rather than once per cell.
        above_cos = cos_vals > cos_cut
        for row, dh_cut in enumerate(dh_grid):
            surface[row, col] = 100 * np.mean(above_cos & (dh_vals <= dh_cut))
    return surface


# Threshold grid: cosine cuts in steps of 0.0025 and integer dHash cuts 0..20.
cos_cuts = np.arange(0.85, 0.9901, 0.0025)
dh_cuts = np.arange(0, 21, 1)

# A: Firm A flag rate; B: comparison-group flag rate; contrast: A - B in
# percentage points.
A = _flag_rate_surface(cA, dA, cos_cuts, dh_cuts)
B = _flag_rate_surface(cB, dB, cos_cuts, dh_cuts)
contrast = A - B
|
||
|
||
# imshow stretches the image so that its outer pixel edges sit on `extent`.
# Pad the grid limits by half a step on every side so each cell is centred on
# the cut value it represents; otherwise the cells drift against the axis
# ticks and against the markers drawn below.
half_cos = (cos_cuts[1] - cos_cuts[0]) / 2
half_dh = (dh_cuts[1] - dh_cuts[0]) / 2
extent = [cos_cuts[0] - half_cos, cos_cuts[-1] + half_cos,
          dh_cuts[0] - half_dh, dh_cuts[-1] + half_dh]
fig, axes = plt.subplots(1, 2, figsize=(10.5, 4.3))

# (axis, surface, panel title, colormap, colorbar label)
panels = [
    (axes[0], B, '(a) Clean group (B/C/D) flag rate', 'viridis', 'flag rate (%)'),
    (axes[1], contrast, '(b) Firm A − B/C/D contrast', 'magma', 'contrast (pp)'),
]

for ax, Z, title, cmap, lab in panels:
    # origin='lower' puts row 0 (the smallest dHash cut) at the bottom.
    im = ax.imshow(Z, origin='lower', aspect='auto', extent=extent, cmap=cmap)
    cb = fig.colorbar(im, ax=ax, pad=0.02)
    cb.set_label(lab, fontsize=8)
    cb.ax.tick_params(labelsize=7)

    # operating points
    ax.scatter([0.95], [5], marker='*', s=180, color='white', edgecolor='black', zorder=5,
               label='HC operating point (0.95, dHash≤5)')
    ax.axhline(15, color='white', ls=':', lw=1.0)
    ax.text(0.853, 15.4, 'MC upper bound (dHash≤15)', color='white', fontsize=6.5, va='bottom')

    ax.set_xlabel('cosine cut', fontsize=9)
    ax.set_ylabel('dHash cut (≤)', fontsize=9)
    ax.set_title(title, fontsize=9)
    ax.tick_params(labelsize=7.5)
    ax.legend(loc='lower left', fontsize=6.5, framealpha=0.85)

fig.suptitle(
    f'Figure 6. Sensitivity surface of the deployed rule over the two-measure threshold plane (Big-4, n={len(c)}).',
    fontsize=9, y=1.02)
fig.tight_layout()

# Destination of the rendered figure.
out = '/Volumes/NV2/pdf_recognize/paper/v13_build/figures/fig6.png'
fig.savefig(out, dpi=200, bbox_inches='tight')
plt.close(fig)
|
||
# Locate the HC operating point (cosine 0.95, dHash 5) by value on both axes.
# The cosine grid is built from floats, so the nearest grid value is used
# rather than an exact match; the dHash row is looked up the same way instead
# of assuming that row index 5 corresponds to dHash cut 5.
hc_row = int(np.argmin(np.abs(dh_cuts - 5)))
hc_col = int(np.argmin(np.abs(cos_cuts - 0.95)))
print(f"fig6 OK n={len(c)}; HC(0.95,5) contrast={contrast[hc_row, hc_col]:.1f}pp; written {out}")
|