Paper A v13 rev8: fusion-review revision (29 items) + verified data analysis

Address all 29 items from the fused reviewer report (Gemini 3.1 Pro + ChatGPT 5.5 + Opus 4.8): 3 fatal, 4 severe, arbitration A/B, 5 fusion-new, 15 minor. All new numbers computed from signature_analysis.db; nothing fabricated. Claim honesty (F1/F3/F4/F7/G3): - Retract all "139x the floor" comparisons; ICCR -> between-accountant specificity proxy throughout; state within-accountant FPR is not estimable and ICCR is not even a bound (anti-conservative direction). - Firm A reframed as quasi-positive known-positive benchmark (not blinded). - byte-identity recast as prevalence signal, not a recall/sanity check. - tunable -> single-direction conservativeness dial (no P-R frontier). New data analysis (verified, bit-reproducible via committed scripts): - F2/G1 (Sec V-B): 880-PDF imaging-pipeline audit (Table V) - plain scans 82% (2013) -> 1% (2021); producer strings name scanner hardware (Fuji Xerox D125 etc.); substrate transforms at 2020/21 = named confound. - F5 (Sec IV-C): four robustness checks - pool-size stratification, accountant-clustered bootstrap (gap 53.7pp [49.5,57.5]), firm+year FE logistic (B/C/D OR 0.06-0.12), leave-one-year-out (gap 53.1-54.9pp). - byte-identity era split: 30 scan-era (18 Firm A, pipeline-robust) vs 232 digital-era (detectability-inflated, hedged). - G5: archive-wide 888 expected chance HC flags [677,1098]. - M4: Figure 3 replaced with real 2D density (n=150,441). Structure/minor: abstract restructured (M1); operational definition (M2); interview disclaimer (M3); Threats to Validity subsection (M8); review protocol framed as design not evidence (M9); N reconciliations (M10/M11); Table II-c 2020-23 five-way (M12); Section refs, American spelling, notation table (M5/M13/M15); reference URLs verified (M14). Open (author-only): placeholders (M13), II-b/IV table merge (M15). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> Claude-Session: https://claude.ai/code/session_01Qn59FdF9JMyfFg3sjcUNNG
2026-06-23 14:36:51 +08:00
parent 61dd2dcaad
commit da455791de
7 changed files with 438 additions and 81 deletions
@@ -0,0 +1,63 @@
+"""F5 robustness: firm+year fixed-effects logistic regression and leave-one-year-out.
+Complements the pool-size stratification and accountant-clustered bootstrap (Section IV-C).
+Uses numpy+scipy only (no statsmodels). Reproduces from signature_analysis.db.
+"""
+import sqlite3, numpy as np
+from scipy.optimize import minimize
+
+# NOTE(review): absolute, machine-specific path; the script only runs where this volume is mounted.
+DB = "/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db"
+# excel_firm values kept in the analysis; bound twice into the query below (subquery + outer WHERE).
+BIG4 = ('勤業眾信聯合', '資誠聯合', '安侯建業聯合', '安永聯合')
+# excel_firm value -> firm letter (A-D) used in the design matrix and printed output.
+FM = {'勤業眾信聯合': 'A', '安侯建業聯合': 'B', '資誠聯合': 'C', '安永聯合': 'D'}
+
+con = sqlite3.connect(DB); cur = con.cursor()
+# One row per valid Big-4 signature with both measures present and a 6-character
+# year_month starting with '2': (excel_firm, 4-digit year, hc flag, psize).
+#   hc    = 1 when max_similarity_to_same_accountant > 0.95 AND min_dhash_independent <= 5.
+#   psize = number of that accountant's valid rows with both measures present.
+# NOTE(review): the psize subquery does not apply the year_month GLOB filter that the
+# outer query applies, so psize can count rows the outer query excludes -- confirm intended.
+cur.execute(f"""
+SELECT s.excel_firm, CAST(substr(s.year_month,1,4) AS INT) yr,
+  (CASE WHEN s.max_similarity_to_same_accountant>0.95 AND s.min_dhash_independent<=5 THEN 1 ELSE 0 END) hc,
+  p.psize
+FROM signatures s
+JOIN (SELECT accountant_id, COUNT(*) psize FROM signatures
+      WHERE is_valid=1 AND excel_firm IN ({','.join('?'*4)})
+        AND max_similarity_to_same_accountant IS NOT NULL AND min_dhash_independent IS NOT NULL
+      GROUP BY accountant_id) p ON s.accountant_id=p.accountant_id
+WHERE s.is_valid=1 AND s.excel_firm IN ({','.join('?'*4)})
+  AND s.max_similarity_to_same_accountant IS NOT NULL AND s.min_dhash_independent IS NOT NULL
+  AND s.year_month GLOB '2[0-9][0-9][0-9][0-9][0-9]'
+""", BIG4 + BIG4)
+rows = cur.fetchall(); con.close()
+# Column-wise arrays aligned by row index: firm letter, year, HC flag (float), pool size (float).
+firm = np.array([FM[r[0]] for r in rows]); yr = np.array([r[1] for r in rows])
+hc = np.array([r[2] for r in rows], float); pool = np.array([r[3] for r in rows], float)
+# n = number of signatures; years = sorted distinct years (the first one is the reference year).
+n = len(hc); years = sorted(set(yr.tolist()))
+
+# --- firm + year FE logistic (Firm A & first year = reference) ---
+cols = [np.ones(n)]; names = ['const']
+for f in ['B', 'C', 'D']:
+    cols.append((firm == f).astype(float)); names.append(f'firm_{f}')
+for y in years[1:]:
+    cols.append((yr == y).astype(float)); names.append(f'yr_{y}')
+lp = np.log(pool); lp = (lp - lp.mean()) / lp.std()
+cols.append(lp); names.append('logpool_z')
+X = np.column_stack(cols)
+
+def nll(b):
+    z = X @ b
+    return -np.sum(hc * z - np.logaddexp(0, z)) + 1e-6 * np.sum(b * b)
+def grad(b):
+    p = 1 / (1 + np.exp(-(X @ b)))
+    return -X.T @ (hc - p) + 2e-6 * b
+b = minimize(nll, np.zeros(X.shape[1]), jac=grad, method='L-BFGS-B').x
+print("Firm+Year FE logistic (Firm A & first year = ref):")
+for nm, bi in zip(names, b):
+    if nm.startswith('firm') or nm == 'logpool_z':
+        print(f"  {nm:11} coef={bi:7.3f}  OR={np.exp(bi):.4f}")
+
+# --- leave-one-year-out firm contrast ---
+grp = np.where(firm == 'A', 'A', 'BCD')
+def rate(mask, g):
+    m = mask & (grp == g); return 100 * hc[m].mean()
+print("\nLeave-one-year-out (Firm A minus B/C/D HC gap):")
+gaps = []
+for y in years:
+    keep = (yr != y); a = rate(keep, 'A'); bb = rate(keep, 'BCD'); gaps.append(a - bb)
+    print(f"  drop {y}: A={a:.1f}%  BCD={bb:.1f}%  gap={a-bb:.1f}pp")
+print(f"  full-sample gap={rate(np.ones(n, bool),'A')-rate(np.ones(n, bool),'BCD'):.1f}pp; "
+      f"LOYO range=[{min(gaps):.1f}, {max(gaps):.1f}]pp")
@@ -0,0 +1,84 @@
+"""Figure 3 (real data version): 2D density of the two measures over the five-region scheme.
+Replaces the earlier schematic with the actual distribution, with axis ticks and the rule cuts.
+Reproduces from signature_analysis.db; Big-4, is_valid=1, both measures present."""
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+from matplotlib.colors import LogNorm
+from matplotlib.patches import Rectangle
+import numpy as np
+import sqlite3
+
+DB = "/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db"
+BIG4 = ('勤業眾信聯合', '資誠聯合', '安侯建業聯合', '安永聯合')
+
+con = sqlite3.connect(DB)
+cur = con.cursor()
+cur.execute(f"""
+SELECT max_similarity_to_same_accountant, min_dhash_independent
+FROM signatures
+WHERE is_valid=1 AND max_similarity_to_same_accountant IS NOT NULL
+  AND min_dhash_independent IS NOT NULL
+  AND excel_firm IN ({','.join(['?']*4)})
+""", BIG4)
+rows = cur.fetchall()
+con.close()
+cos = np.array([r[0] for r in rows], dtype=float)
+dh = np.array([r[1] for r in rows], dtype=float)
+n = len(cos)
+
+LO, HI = 0.8547, 0.95
+DH1, DH2 = 5, 15
+xmin, xmax = 0.70, 1.002
+ymin, ymax = -0.5, 30
+ycap = 30  # display cap; values above are piled into the top row for visibility
+
+dh_disp = np.minimum(dh, ycap - 0.5)
+
+fig, ax = plt.subplots(figsize=(5.6, 4.4))
+
+# faint region tint behind the density
+ax.add_patch(Rectangle((xmin, ymin), LO - xmin, ymax - ymin, facecolor='#bdc3c7', alpha=0.12, zorder=0))
+ax.add_patch(Rectangle((LO, ymin), HI - LO, ymax - ymin, facecolor='#f7dc6f', alpha=0.12, zorder=0))
+ax.add_patch(Rectangle((HI, ymin), xmax - HI, DH1 - ymin, facecolor='#cb4335', alpha=0.14, zorder=0))
+ax.add_patch(Rectangle((HI, DH1), xmax - HI, DH2 - DH1, facecolor='#eb984e', alpha=0.14, zorder=0))
+ax.add_patch(Rectangle((HI, DH2), xmax - HI, ymax - DH2, facecolor='#aed6f1', alpha=0.14, zorder=0))
+
+# real 2D density (log counts)
+xedges = np.linspace(xmin, xmax, 90)
+yedges = np.arange(-0.5, ycap + 0.5, 1.0)  # integer dHash bins
+H, xe, ye = np.histogram2d(cos, dh_disp, bins=[xedges, yedges])
+pcm = ax.pcolormesh(xe, ye, H.T, norm=LogNorm(vmin=1, vmax=H.max()),
+                    cmap='viridis', zorder=1, shading='flat')
+cb = fig.colorbar(pcm, ax=ax, pad=0.02)
+cb.set_label('signatures per cell (log scale)', fontsize=8)
+cb.ax.tick_params(labelsize=7)
+
+# cut lines
+ax.axvline(LO, color='gray', ls=':', lw=1.1, zorder=3)
+ax.axvline(HI, color='black', ls='--', lw=1.1, zorder=3)
+ax.plot([HI, xmax], [DH1, DH1], 'k--', lw=0.9, zorder=3)
+ax.plot([HI, xmax], [DH2, DH2], 'k--', lw=0.9, zorder=3)
+
+# region labels
+ax.text((xmin + LO) / 2, 24, 'LH', ha='center', fontsize=10, weight='bold', color='#34495e', zorder=4)
+ax.text((LO + HI) / 2, 24, 'UN', ha='center', fontsize=10, weight='bold', color='#7d6608', zorder=4)
+ax.text((HI + xmax) / 2, 2.2, 'HC', ha='center', fontsize=10, weight='bold', color='#cb4335', zorder=4)
+ax.text((HI + xmax) / 2, 9.7, 'MC', ha='center', fontsize=10, weight='bold', color='#a04000', zorder=4)
+ax.text((HI + xmax) / 2, 24, 'HSC', ha='center', fontsize=9, weight='bold', color='#21618c', zorder=4)
+
+ax.set_xlim(xmin, xmax)
+ax.set_ylim(ymin, ymax)
+ax.set_xticks([0.70, 0.75, 0.80, 0.8547, 0.90, 0.95, 1.00])
+ax.set_xticklabels(['0.70', '0.75', '0.80', '0.855', '0.90', '0.95', '1.00'], fontsize=7.5)
+ax.set_yticks([0, 5, 10, 15, 20, 25, 30])
+ax.set_yticklabels(['0', '5', '10', '15', '20', '25', '≥30'], fontsize=7.5)
+ax.set_xlabel('cosine similarity to same accountant (style)', fontsize=9)
+ax.set_ylabel('min dHash distance (structure)', fontsize=9)
+ax.set_title(f'Figure 3. Two-measure plane: real density over the five regions (Big-4, n={n:,})',
+             fontsize=8.5)
+fig.tight_layout()
+out = '/Volumes/NV2/pdf_recognize/paper/v13_build/figures/fig3.png'
+fig.savefig(out, dpi=200, bbox_inches='tight')
+plt.close(fig)
+print(f'fig3 density OK: n={n:,}, dHash>=30 piled: {(dh>=ycap).sum()}, written {out}')
@@ -0,0 +1,63 @@
+"""Imaging-pipeline audit (Table V) + byte-identity era split (Section V-B).
+Classifies a firm-year-stratified sample of report PDFs as scanned / OCR'd / digital-native
+from an extractable-text heuristic (characters on the first four pages), tabulated by year.
+Also reports the scan-era vs digital-era split of the 262 byte-identical signatures.
+
+Requires: PyMuPDF (fitz); signature_analysis.db; original PDFs under total-pdf/.
+"""
+import fitz, os, glob, sqlite3
+from collections import defaultdict
+
+fitz.TOOLS.mupdf_display_errors(False)
+DB = "/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db"
+PDF_ROOT = "/Volumes/NV2/PDF-Processing/total-pdf"
+BIG4 = ('勤業眾信聯合', '資誠聯合', '安侯建業聯合', '安永聯合')
+FMAP = {'勤業眾信聯合': 'A', '安侯建業聯合': 'B', '資誠聯合': 'C', '安永聯合': 'D'}
+
+con = sqlite3.connect(DB); cur = con.cursor()
+
+# --- stratified sample: 20 distinct PDFs per firm-year ---
+cur.execute(f"""
+WITH d AS (SELECT DISTINCT excel_firm, substr(year_month,1,4) yr, source_pdf,
+  ROW_NUMBER() OVER (PARTITION BY excel_firm, substr(year_month,1,4) ORDER BY source_pdf) rn
+  FROM signatures WHERE excel_firm IN ({','.join(['?']*4)}) AND source_pdf IS NOT NULL)
+SELECT excel_firm, yr, source_pdf FROM d WHERE rn<=20 ORDER BY yr""", BIG4)
+rows = cur.fetchall()
+idx = {os.path.basename(p): p for p in glob.glob(PDF_ROOT + '/*/*.pdf')}
+
+def classify(path):
+    try:
+        doc = fitz.open(path)
+    except Exception:
+        return None
+    text = sum(len(doc[i].get_text().strip()) for i in range(min(len(doc), 4)))
+    doc.close()
+    return 'DIGITAL' if text > 2000 else ('OCR' if text > 200 else 'SCAN')
+
+byyear = defaultdict(lambda: defaultdict(int))
+for firm, yr, fn in rows:
+    p = idx.get(fn)
+    if not p:
+        continue
+    k = classify(p)
+    if k:
+        byyear[yr][k] += 1
+
+print("year | n | scan% | ocr% | digital%")
+for yr in sorted(byyear):
+    d = byyear[yr]; n = sum(d.values())
+    print(f"{yr} | {n} | {100*d['SCAN']//n} | {100*d['OCR']//n} | {100*d['DIGITAL']//n}")
+
+# --- byte-identity era split ---
+# Counts valid Big-4 rows with pixel_identical_to_closest=1, grouped by era and firm letter.
+# Era is decided by a text comparison: year_month < '202101' -> 'scan-era', anything
+# else -> 'digital-era'. Under SQL NULL semantics a NULL year_month falls through to
+# the ELSE branch and is counted as 'digital-era'.
+# NOTE(review): year_month format is not validated in this query -- confirm no NULL or
+# malformed values exist among these rows, otherwise they are assigned to an era by default.
+cur.execute(f"""
+SELECT CASE WHEN year_month<'202101' THEN 'scan-era' ELSE 'digital-era' END era,
+  CASE excel_firm WHEN '勤業眾信聯合' THEN 'A' WHEN '安侯建業聯合' THEN 'B'
+                  WHEN '資誠聯合' THEN 'C' WHEN '安永聯合' THEN 'D' END firm,
+  COUNT(*) n
+FROM signatures WHERE is_valid=1 AND pixel_identical_to_closest=1
+  AND excel_firm IN ({','.join(['?']*4)})
+GROUP BY era, firm ORDER BY era, firm""", BIG4)
+print("\nbyte-identical by era x firm:")
+# One output line per (era, firm) combination that has at least one row.
+for era, firm, n in cur.fetchall():
+    print(f"  {era} | {firm} | {n}")
+con.close()