diff --git a/paper/Paper_A_IEEE_Access_Draft_v3.docx b/paper/Paper_A_IEEE_Access_Draft_v3.docx
index 9605c47..d6299e9 100644
Binary files a/paper/Paper_A_IEEE_Access_Draft_v3.docx and b/paper/Paper_A_IEEE_Access_Draft_v3.docx differ
diff --git a/paper/export_v3.py b/paper/export_v3.py
index a5a68fe..6f89c87 100644
--- a/paper/export_v3.py
+++ b/paper/export_v3.py
@@ -5,9 +5,16 @@ from docx import Document
from docx.shared import Inches, Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from pathlib import Path
+import hashlib
import re
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+
PAPER_DIR = Path("/Volumes/NV2/pdf_recognize/paper")
+EQUATION_CACHE_DIR = PAPER_DIR / "equations"
+EQUATION_CACHE_DIR.mkdir(exist_ok=True)
FIG_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis/paper_figures")
EXTRA_FIG_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis/reports")
OUTPUT = PAPER_DIR / "Paper_A_IEEE_Access_Draft_v3.docx"
@@ -48,10 +55,10 @@ FIGURES = {
"Fig. 3. Firm A per-signature cosine and dHash distributions against the overall CPA population.",
3.5,
),
- "Fig. 4 visualizes the accountant-level clusters": (
- EXTRA_FIG_DIR / "accountant_mixture" / "accountant_mixture_2d.png",
- "Fig. 4. Accountant-level 3-component Gaussian mixture in the (cosine-mean, dHash-mean) plane.",
- 4.5,
+ "Fig. 4 summarises the per-firm yearly per-signature": (
+ EXTRA_FIG_DIR / "figures" / "fig_yearly_big4_comparison.png",
+ "Fig. 4. Per-firm yearly per-signature best-match cosine, 2013-2023. (a) Mean per-signature best-match cosine by firm bucket and fiscal year (threshold-free). (b) Share of per-signature best-match cosine ≥ 0.95 (operational cut of Section III-K). Five lines: Firm A, B, C, D, Non-Big-4. Firm A is above the other Big-4 firms in every year; Non-Big-4 is below all four Big-4 firms in every year.",
+ 6.5,
),
"conducted an ablation study comparing three": (
FIG_DIR / "fig4_ablation.png",
@@ -62,7 +69,321 @@ FIGURES = {
def strip_comments(text):
-    return re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
+ """Remove HTML comments, but UNWRAP comments whose first non-blank line
+ starts with `TABLE ` (or `TABLE\t`).
+
+ The v3 markdown sources wrap every numerical table in an HTML comment of
+ the form
+
+        <!-- TABLE V: Hartigan Dip Test Results
+        | ... markdown table rows ... |
+        -->
+
+    The caption (`TABLE V: Hartigan Dip Test Results`) is on the same line as
+    the opening `<!--`; a line containing only `-->` closes the block. The previous implementation wholesale-deleted
+ these comments, which silently dropped every table from the rendered
+ DOCX. We now (i) detect comments whose first non-empty line starts with
+    `TABLE `, (ii) emit a synthetic caption marker line
+    `__TABLE_CAPTION__:<caption>` so process_section can render the caption as a centered
+ bold paragraph above the table, and (iii) keep the table body so the
+ existing markdown-table detector picks it up. Non-TABLE comments
+ (figure placeholders, editorial notes) are stripped as before.
+ """
+ def _replace(match):
+ body = match.group(1)
+ # Find first non-blank line.
+ for line in body.splitlines():
+ stripped = line.strip()
+ if stripped:
+ first = stripped
+ break
+ else:
+ return ""
+ if not first.startswith("TABLE ") and not first.startswith("TABLE\t"):
+ return ""
+ # Split caption (first non-blank line) from the rest.
+ lines = body.splitlines()
+ # Find index of the first non-blank line and use everything after.
+ for idx, line in enumerate(lines):
+ if line.strip():
+ caption = line.strip()
+ rest = "\n".join(lines[idx + 1:])
+ break
+ else:
+ return ""
+ # Emit caption marker + body. Surround with blank lines so the
+ # paragraph/table detector treats the marker as its own paragraph.
+ return f"\n\n__TABLE_CAPTION__:{caption}\n{rest}\n"
+ # Non-greedy match across lines.
+    return re.sub(r"<!--(.*?)-->", _replace, text, flags=re.DOTALL)
+
+
+# ---------------------------------------------------------------------------
+# LaTeX → plain text + Unicode conversion
+# ---------------------------------------------------------------------------
+# The v3 markdown sources contain inline LaTeX ($...$) and a small number of
+# display-math blocks ($$...$$). Pandoc would render these natively; the
+# python-docx pipeline used here does not, so without preprocessing every
+# `\leq`, `\text{dHash}_\text{indep}`, `\Delta\text{BIC}`, `60{,}448`, etc.
+# leaks into the DOCX as raw LaTeX. The helpers below convert the common
+# inline cases to Unicode and split subscripts/superscripts into proper Word
+# runs. Display-math (rare; 3 equations in this paper) gets a best-effort
+# linearisation and is acceptable for a partner-handoff DOCX; final IEEE
+# typesetting is handled by the publisher's LaTeX/MathType pipeline.
+
+LATEX_TOKEN_REPLACEMENTS = [
+ # Greek letters (lower)
+ (r"\\alpha(?![A-Za-z])", "α"), (r"\\beta(?![A-Za-z])", "β"), (r"\\gamma(?![A-Za-z])", "γ"),
+ (r"\\delta(?![A-Za-z])", "δ"), (r"\\epsilon(?![A-Za-z])", "ε"), (r"\\zeta(?![A-Za-z])", "ζ"),
+ (r"\\eta(?![A-Za-z])", "η"), (r"\\theta(?![A-Za-z])", "θ"), (r"\\iota(?![A-Za-z])", "ι"),
+ (r"\\kappa(?![A-Za-z])", "κ"), (r"\\lambda(?![A-Za-z])", "λ"), (r"\\mu(?![A-Za-z])", "μ"),
+ (r"\\nu(?![A-Za-z])", "ν"), (r"\\xi(?![A-Za-z])", "ξ"), (r"\\pi(?![A-Za-z])", "π"),
+ (r"\\rho(?![A-Za-z])", "ρ"), (r"\\sigma(?![A-Za-z])", "σ"), (r"\\tau(?![A-Za-z])", "τ"),
+ (r"\\phi(?![A-Za-z])", "φ"), (r"\\chi(?![A-Za-z])", "χ"), (r"\\psi(?![A-Za-z])", "ψ"),
+ (r"\\omega(?![A-Za-z])", "ω"),
+ # Greek letters (upper, only those distinguishable from Latin)
+ (r"\\Gamma(?![A-Za-z])", "Γ"), (r"\\Delta(?![A-Za-z])", "Δ"), (r"\\Theta(?![A-Za-z])", "Θ"),
+ (r"\\Lambda(?![A-Za-z])", "Λ"), (r"\\Xi(?![A-Za-z])", "Ξ"), (r"\\Pi(?![A-Za-z])", "Π"),
+ (r"\\Sigma(?![A-Za-z])", "Σ"), (r"\\Phi(?![A-Za-z])", "Φ"), (r"\\Psi(?![A-Za-z])", "Ψ"),
+ (r"\\Omega(?![A-Za-z])", "Ω"),
+ # Relations / arrows
+ (r"\\leq(?![A-Za-z])", "≤"), (r"\\geq(?![A-Za-z])", "≥"),
+ (r"\\neq(?![A-Za-z])", "≠"), (r"\\approx(?![A-Za-z])", "≈"),
+ (r"\\equiv(?![A-Za-z])", "≡"), (r"\\sim(?![A-Za-z])", "~"),
+ (r"\\to(?![A-Za-z])", "→"), (r"\\rightarrow(?![A-Za-z])", "→"),
+ (r"\\leftarrow(?![A-Za-z])", "←"), (r"\\Rightarrow(?![A-Za-z])", "⇒"),
+ (r"\\Leftarrow(?![A-Za-z])", "⇐"),
+ # Binary operators
+ (r"\\times(?![A-Za-z])", "×"), (r"\\cdot(?![A-Za-z])", "·"),
+ (r"\\pm(?![A-Za-z])", "±"), (r"\\mp(?![A-Za-z])", "∓"),
+ (r"\\div(?![A-Za-z])", "÷"),
+ # Misc
+ (r"\\infty(?![A-Za-z])", "∞"), (r"\\partial(?![A-Za-z])", "∂"),
+ (r"\\sum(?![A-Za-z])", "∑"), (r"\\prod(?![A-Za-z])", "∏"),
+ (r"\\int(?![A-Za-z])", "∫"),
+ (r"\\ldots(?![A-Za-z])", "…"), (r"\\dots(?![A-Za-z])", "…"),
+ # Spacing commands (drop or replace with single space)
+ (r"\\,", " "), (r"\\;", " "), (r"\\:", " "),
+ (r"\\!", ""), (r"\\ ", " "),
+ (r"\\quad(?![A-Za-z])", " "), (r"\\qquad(?![A-Za-z])", " "),
+ # Escaped punctuation
+ (r"\\%", "%"), (r"\\#", "#"), (r"\\&", "&"),
+ (r"\\\$", "$"), (r"\\_", "_"),
+]
+
+
+def _unwrap_command(text, cmd):
+ """Repeatedly replace `\\cmd{X}` → `X` until stable."""
+ pat = re.compile(r"\\" + cmd + r"\{([^{}]*)\}")
+ prev = None
+ while prev != text:
+ prev = text
+ text = pat.sub(r"\1", text)
+ return text
+
+
+MATH_START = "\ue000"  # Private Use Area: XML-safe
+MATH_END = "\ue001"
+
+
+def latex_to_unicode(text):
+ """Convert a LaTeX-laced markdown paragraph into plain text.
+
+ Math context is preserved with private-use sentinel characters
+ (MATH_START / MATH_END) so the downstream run-splitter only treats
+ `_X` / `^X` as subscript / superscript inside math regions; in body
+ text underscores in identifiers like `signature_analysis` survive.
+ """
+ if "$" not in text and "\\" not in text:
+ return text
+
+ # 1. Strip display-math delimiters first (keep the inner content for
+ # best-effort linearisation), wrapping math regions with sentinels.
+ # Then strip inline math delimiters with the same sentinel wrapping.
+ text = re.sub(r"\$\$([\s\S]+?)\$\$",
+ lambda m: MATH_START + m.group(1) + MATH_END, text)
+ text = re.sub(r"\$([^$]+?)\$",
+ lambda m: MATH_START + m.group(1) + MATH_END, text)
+
+ # 2. Replace token-level commands with Unicode glyphs *before* unwrapping
+ # `\text{...}` and friends, so that `\Delta\text{BIC}` becomes
+ # `Δ\text{BIC}` (then `ΔBIC`) rather than `\DeltaBIC` which would be
+ # stripped wholesale by the cleanup pass.
+ for pat, repl in LATEX_TOKEN_REPLACEMENTS:
+ text = re.sub(pat, repl, text)
+
+ # 3. Unwrap formatting / text commands (innermost first via _unwrap loop).
+ for cmd in ("text", "mathbf", "mathit", "mathrm", "mathsf", "mathtt",
+ "operatorname", "emph", "textbf", "textit"):
+ text = _unwrap_command(text, cmd)
+
+ # 4. \frac{a}{b} → (a)/(b); \sqrt{x} → √(x). Apply repeatedly to handle
+ # one level of nesting; deeper nesting is rare in this paper.
+ for _ in range(3):
+ text = re.sub(
+ r"\\t?frac\{([^{}]+)\}\{([^{}]+)\}",
+ r"(\1)/(\2)",
+ text,
+ )
+ text = re.sub(r"\\sqrt\{([^{}]+)\}", r"√(\1)", text)
+
+ # 5. TeX braces used purely for spacing/grouping: K{=}3 → K=3,
+ # 60{,}448 → 60,448, 10{,}175 → 10,175.
+ text = re.sub(r"\{([=<>+\-,])\}", r"\1", text)
+
+ # 6. Strip any remaining `\cmd{...}` (best effort) and `\cmd ` tokens.
+ text = re.sub(r"\\[a-zA-Z]+\{([^{}]*)\}", r"\1", text)
+ text = re.sub(r"\\[a-zA-Z]+(?![A-Za-z])", "", text)
+
+ # 7. Collapse runs of whitespace introduced by command stripping.
+ text = re.sub(r"[ \t]{2,}", " ", text)
+ return text
+
+
+_SUBSUP_PATTERN = re.compile(
+ r"_\{([^{}]*)\}" # _{...}
+ r"|\^\{([^{}]*)\}" # ^{...}
+ r"|_([A-Za-z0-9+\-])" # _X (single token)
+ r"|\^([A-Za-z0-9+\-])" # ^X (single token)
+)
+
+
+def _emit_plain(paragraph, text, font_name, font_size, bold, italic):
+ if not text:
+ return
+ run = paragraph.add_run(text)
+ run.font.name = font_name
+ run.font.size = font_size
+ run.bold = bold
+ run.italic = italic
+
+
+def _emit_math(paragraph, text, font_name, font_size, bold, italic):
+ """Emit `text` from a math region: split on `_X` / `_{X}` / `^X` / `^{X}`
+ and render those as Word subscripts / superscripts."""
+ if "_" not in text and "^" not in text:
+ _emit_plain(paragraph, text, font_name, font_size, bold, italic)
+ return
+ pos = 0
+ for m in _SUBSUP_PATTERN.finditer(text):
+ if m.start() > pos:
+ _emit_plain(paragraph, text[pos:m.start()],
+ font_name, font_size, bold, italic)
+ sub_text = m.group(1) or m.group(3)
+ sup_text = m.group(2) or m.group(4)
+ if sub_text is not None:
+ run = paragraph.add_run(sub_text)
+ run.font.subscript = True
+ else:
+ run = paragraph.add_run(sup_text)
+ run.font.superscript = True
+ run.font.name = font_name
+ run.font.size = font_size
+ run.bold = bold
+ run.italic = italic
+ pos = m.end()
+ if pos < len(text):
+ _emit_plain(paragraph, text[pos:],
+ font_name, font_size, bold, italic)
+
+
+def add_text_with_subsup(paragraph, text, font_name="Times New Roman",
+ font_size=Pt(10), bold=False, italic=False):
+ """Add `text` to `paragraph`. Subscript/superscript handling is scoped to
+ math regions delimited by MATH_START / MATH_END sentinels (set up by
+ `latex_to_unicode`). Outside math regions, underscores and carets are
+ preserved literally so identifiers like `signature_analysis` and
+ `paper_a_results_v3.md` survive intact.
+ """
+ if MATH_START not in text:
+ _emit_math(paragraph, text, font_name, font_size, bold, italic) \
+ if False else \
+ _emit_plain(paragraph, text, font_name, font_size, bold, italic)
+ return
+
+ pos = 0
+ while pos < len(text):
+ s = text.find(MATH_START, pos)
+ if s == -1:
+ _emit_plain(paragraph, text[pos:],
+ font_name, font_size, bold, italic)
+ break
+ if s > pos:
+ _emit_plain(paragraph, text[pos:s],
+ font_name, font_size, bold, italic)
+ e = text.find(MATH_END, s + 1)
+ if e == -1:
+ # Unterminated math region — emit rest as plain.
+ _emit_plain(paragraph, text[s + 1:],
+ font_name, font_size, bold, italic)
+ break
+ math_body = text[s + 1:e]
+ _emit_math(paragraph, math_body, font_name, font_size, bold, italic)
+ pos = e + 1
+
+
+# ---------------------------------------------------------------------------
+# Display-equation rendering (matplotlib mathtext → PNG → embedded image)
+# ---------------------------------------------------------------------------
+
+# matplotlib mathtext is a subset of LaTeX. A few common TeX-only macros need
+# to be substituted with mathtext-supported equivalents before parsing.
+_MATHTEXT_SUBS = [
+ (re.compile(r"\\tfrac\b"), r"\\frac"), # text-frac → frac
+ (re.compile(r"\\dfrac\b"), r"\\frac"), # display-frac → frac
+ (re.compile(r"\\operatorname\{([^{}]+)\}"),
+ lambda m: r"\mathrm{" + m.group(1) + "}"), # operatorname → mathrm
+ (re.compile(r"\\,"), " "), # thin space
+ (re.compile(r"\\;"), " "),
+ (re.compile(r"\\!"), ""),
+]
+
+
+def _sanitise_for_mathtext(latex: str) -> str:
+ out = latex
+ for pat, repl in _MATHTEXT_SUBS:
+ out = pat.sub(repl, out)
+ return out
+
+
+def render_equation_png(latex: str, fontsize: int = 14) -> Path:
+ """Render a LaTeX math expression to a tightly-cropped PNG using
+ matplotlib mathtext, with content-addressed caching so a re-build only
+ re-renders changed equations. Returns the cached PNG path."""
+ sanitised = _sanitise_for_mathtext(latex.strip())
+ digest = hashlib.sha1(
+ (sanitised + f"|fs{fontsize}").encode("utf-8")).hexdigest()[:16]
+ out_path = EQUATION_CACHE_DIR / f"eq_{digest}.png"
+ if out_path.exists():
+ return out_path
+ fig = plt.figure(figsize=(8, 1.6))
+ fig.text(0.5, 0.5, f"${sanitised}$",
+ fontsize=fontsize, ha="center", va="center")
+ fig.savefig(str(out_path), dpi=220, bbox_inches="tight",
+ pad_inches=0.05)
+ plt.close(fig)
+ return out_path
+
+
+def add_equation_block(doc, latex: str, equation_number: int,
+ width_inches: float = 4.5):
+ """Insert a centered display equation (rendered as PNG) followed by
+ a right-aligned equation number `(N)`. Width keeps the equation
+ visually proportional within the IEEE Access body column."""
+ img_path = render_equation_png(latex)
+ p = doc.add_paragraph()
+ p.alignment = WD_ALIGN_PARAGRAPH.CENTER
+ p.paragraph_format.space_before = Pt(6)
+ p.paragraph_format.space_after = Pt(6)
+ run = p.add_run()
+ run.add_picture(str(img_path), width=Inches(width_inches))
+ # Equation number on the same paragraph, tab-aligned to the right.
+ num_run = p.add_run(f"\t({equation_number})")
+ num_run.font.name = "Times New Roman"
+ num_run.font.size = Pt(10)
def add_md_table(doc, table_lines):
@@ -79,14 +400,23 @@ def add_md_table(doc, table_lines):
for r_idx, row in enumerate(rows_data):
for c_idx in range(min(len(row), ncols)):
cell = table.rows[r_idx].cells[c_idx]
- cell.text = row[c_idx]
- for p in cell.paragraphs:
- p.alignment = WD_ALIGN_PARAGRAPH.CENTER
- for run in p.runs:
- run.font.size = Pt(8)
- run.font.name = "Times New Roman"
- if r_idx == 0:
- run.bold = True
+ raw = row[c_idx]
+ # Strip markdown emphasis markers; convert LaTeX before rendering.
+ raw = re.sub(r"\*\*\*(.+?)\*\*\*", r"\1", raw)
+ raw = re.sub(r"\*\*(.+?)\*\*", r"\1", raw)
+ raw = re.sub(r"\*(.+?)\*", r"\1", raw)
+ raw = re.sub(r"`(.+?)`", r"\1", raw)
+ cell_text = latex_to_unicode(raw)
+ # Replace the default empty paragraph with one we control.
+ cell.text = ""
+ cp = cell.paragraphs[0]
+ cp.alignment = WD_ALIGN_PARAGRAPH.CENTER
+ add_text_with_subsup(
+ cp, cell_text,
+ font_name="Times New Roman",
+ font_size=Pt(8),
+ bold=(r_idx == 0),
+ )
doc.add_paragraph()
@@ -105,10 +435,27 @@ def _insert_figures(doc, para_text):
cr.italic = True
-def process_section(doc, filepath):
+def process_section(doc, filepath, equation_counter=None):
+ """Process one v3 markdown section. `equation_counter` is a single-element
+ list (used as a mutable counter shared across sections) tracking the
+ running display-equation number."""
+ if equation_counter is None:
+ equation_counter = [0]
text = filepath.read_text(encoding="utf-8")
text = strip_comments(text)
lines = text.split("\n")
+ # Defensive blockquote handling: markdown blockquote lines (`> body`) are
+ # not rendered as Word callout blocks here, but stripping the leading
+ # `> ` keeps the body text from leaking the literal `>` and the empty
+ # `>` separator lines into the DOCX.
+ cleaned = []
+ for ln in lines:
+ s = ln.lstrip()
+ if s == ">" or s.startswith("> "):
+ cleaned.append(ln[ln.index(">") + 1:].lstrip() if "> " in ln else "")
+ else:
+ cleaned.append(ln)
+ lines = cleaned
i = 0
while i < len(lines):
line = lines[i]
@@ -117,23 +464,44 @@ def process_section(doc, filepath):
i += 1
continue
if stripped.startswith("# "):
- h = doc.add_heading(stripped[2:], level=1)
+ h = doc.add_heading(
+ latex_to_unicode(stripped[2:]).replace(MATH_START, "").replace(MATH_END, ""),
+ level=1)
for run in h.runs:
run.font.color.rgb = RGBColor(0, 0, 0)
i += 1
continue
if stripped.startswith("## "):
- h = doc.add_heading(stripped[3:], level=2)
+ h = doc.add_heading(
+ latex_to_unicode(stripped[3:]).replace(MATH_START, "").replace(MATH_END, ""),
+ level=2)
for run in h.runs:
run.font.color.rgb = RGBColor(0, 0, 0)
i += 1
continue
if stripped.startswith("### "):
- h = doc.add_heading(stripped[4:], level=3)
+ h = doc.add_heading(
+ latex_to_unicode(stripped[4:]).replace(MATH_START, "").replace(MATH_END, ""),
+ level=3)
for run in h.runs:
run.font.color.rgb = RGBColor(0, 0, 0)
i += 1
continue
+ if stripped.startswith("__TABLE_CAPTION__:"):
+ caption_text = stripped[len("__TABLE_CAPTION__:"):].strip()
+ caption_text = latex_to_unicode(caption_text)
+ cp = doc.add_paragraph()
+ cp.alignment = WD_ALIGN_PARAGRAPH.CENTER
+ cp.paragraph_format.space_before = Pt(6)
+ cp.paragraph_format.space_after = Pt(2)
+ add_text_with_subsup(
+ cp, caption_text,
+ font_name="Times New Roman",
+ font_size=Pt(9),
+ bold=True,
+ )
+ i += 1
+ continue
if "|" in stripped and i + 1 < len(lines) and re.match(r"\s*\|[-|: ]+\|", lines[i + 1]):
table_lines = []
while i < len(lines) and "|" in lines[i]:
@@ -141,22 +509,74 @@ def process_section(doc, filepath):
i += 1
add_md_table(doc, table_lines)
continue
+ # Display math: a line starting with `$$` is treated as a single-line
+ # equation block and rendered as an embedded mathtext PNG with an
+ # auto-incrementing equation number.
+ if stripped.startswith("$$"):
+ # Accumulate until a closing $$ is found (single line in our
+ # corpus, but defensively support multi-line just in case).
+ buf = [stripped]
+ if not (stripped.count("$$") >= 2 and stripped.endswith("$$")):
+ while i + 1 < len(lines):
+ i += 1
+ buf.append(lines[i])
+ if "$$" in lines[i]:
+ break
+ joined = "\n".join(buf).strip()
+ # Strip the leading and trailing $$ delimiters and any trailing
+ # punctuation (e.g. the `,` that some equation lines end with).
+ inner = joined
+ if inner.startswith("$$"):
+ inner = inner[2:]
+ if inner.endswith("$$"):
+ inner = inner[:-2]
+ inner = inner.rstrip(", ")
+ equation_counter[0] += 1
+ try:
+ add_equation_block(doc, inner, equation_counter[0])
+ except Exception as exc:
+ # Fallback: render as plain centered Times-Roman line so the
+ # build doesn't fail on a single un-renderable equation.
+ p = doc.add_paragraph()
+ p.alignment = WD_ALIGN_PARAGRAPH.CENTER
+ run = p.add_run(f"[equation render failed: {exc}] {inner}")
+ run.font.name = "Times New Roman"
+ run.font.size = Pt(10)
+ run.italic = True
+ i += 1
+ continue
if re.match(r"^\d+\.\s", stripped):
- p = doc.add_paragraph(style="List Number")
- content = re.sub(r"^\d+\.\s", "", stripped)
+ # Manual numbering: keep the number from the markdown source and
+ # apply a hanging-indent paragraph format. Avoids python-docx's
+ # `style='List Number'` which depends on a properly-set-up
+ # numbering definition that the default Document() lacks.
+ m = re.match(r"^(\d+)\.\s+(.*)$", stripped)
+ num, content = m.group(1), m.group(2)
+ p = doc.add_paragraph()
+ p.paragraph_format.left_indent = Inches(0.4)
+ p.paragraph_format.first_line_indent = Inches(-0.25)
+ p.paragraph_format.space_after = Pt(4)
+ content = re.sub(r"\*\*\*(.+?)\*\*\*", r"\1", content)
content = re.sub(r"\*\*(.+?)\*\*", r"\1", content)
- run = p.add_run(content)
- run.font.size = Pt(10)
- run.font.name = "Times New Roman"
+ content = re.sub(r"\*(.+?)\*", r"\1", content)
+ content = re.sub(r"`(.+?)`", r"\1", content)
+ content = latex_to_unicode(content)
+ add_text_with_subsup(p, f"{num}. {content}")
i += 1
continue
if stripped.startswith("- "):
- p = doc.add_paragraph(style="List Bullet")
+ # Manual bullets with hanging indent (same rationale as numbered).
+ p = doc.add_paragraph()
+ p.paragraph_format.left_indent = Inches(0.4)
+ p.paragraph_format.first_line_indent = Inches(-0.25)
+ p.paragraph_format.space_after = Pt(4)
content = stripped[2:]
+ content = re.sub(r"\*\*\*(.+?)\*\*\*", r"\1", content)
content = re.sub(r"\*\*(.+?)\*\*", r"\1", content)
- run = p.add_run(content)
- run.font.size = Pt(10)
- run.font.name = "Times New Roman"
+ content = re.sub(r"\*(.+?)\*", r"\1", content)
+ content = re.sub(r"`(.+?)`", r"\1", content)
+ content = latex_to_unicode(content)
+ add_text_with_subsup(p, f"• {content}")
i += 1
continue
# Regular paragraph
@@ -179,14 +599,12 @@ def process_section(doc, filepath):
para_text = re.sub(r"\*\*(.+?)\*\*", r"\1", para_text)
para_text = re.sub(r"\*(.+?)\*", r"\1", para_text)
para_text = re.sub(r"`(.+?)`", r"\1", para_text)
- para_text = para_text.replace("$$", "")
para_text = para_text.replace("---", "\u2014")
+ para_text = latex_to_unicode(para_text)
p = doc.add_paragraph()
p.paragraph_format.space_after = Pt(6)
- run = p.add_run(para_text)
- run.font.size = Pt(10)
- run.font.name = "Times New Roman"
+ add_text_with_subsup(p, para_text)
_insert_figures(doc, para_text)
@@ -234,15 +652,38 @@ def main():
run.font.size = Pt(10)
run.italic = True
+ equation_counter = [0]
for section_file in SECTIONS:
filepath = PAPER_DIR / section_file
if filepath.exists():
- process_section(doc, filepath)
+ process_section(doc, filepath, equation_counter=equation_counter)
else:
print(f"WARNING: missing section file: {filepath}")
doc.save(str(OUTPUT))
print(f"Saved: {OUTPUT}")
+ _run_linter()
+
+
+def _run_linter():
+ """Run the leak linter on the freshly built DOCX. Non-fatal: prints a
+ summary line. For full output run `python3 paper/lint_paper_v3.py`."""
+ try:
+ import lint_paper_v3 # local module
+ except Exception as exc: # pragma: no cover
+ print(f"(lint skipped: {exc})")
+ return
+ findings = lint_paper_v3.lint_docx(OUTPUT)
+ errors = sum(1 for f in findings if f.severity == "ERROR")
+ warns = sum(1 for f in findings if f.severity == "WARN")
+ infos = sum(1 for f in findings if f.severity == "INFO")
+ if errors:
+ print(f"\n[lint] {errors} ERROR finding(s) in DOCX — run "
+ f"`python3 paper/lint_paper_v3.py --docx` for details.")
+ elif warns or infos:
+ print(f"[lint] DOCX clean of ERRORs ({warns} WARN, {infos} INFO).")
+ else:
+ print("[lint] DOCX clean.")
if __name__ == "__main__":
diff --git a/paper/lint_paper_v3.py b/paper/lint_paper_v3.py
new file mode 100644
index 0000000..05e9eda
--- /dev/null
+++ b/paper/lint_paper_v3.py
@@ -0,0 +1,399 @@
+#!/usr/bin/env python3
+"""Paper A v3 markdown / DOCX leak linter.
+
+Runs two passes:
+
+ Source pass — scans the v3 markdown sources for syntax patterns that the
+ python-docx export pipeline does NOT render natively. Each finding is a
+ file:line:severity:message tuple. Severity is ERROR (will leak literal
+ syntax into Word), WARN (sometimes leaks), or INFO (style nits).
+
+ DOCX pass — opens the rendered DOCX and scans every paragraph and table
+ cell for known leak signatures. This is the authoritative check: even
+ if the source pass is clean, the DOCX pass tells you what your partner
+ will actually see. The DOCX pass currently checks for:
+
+ - leftover LaTeX commands (`\\cmd`)
+ - unstripped `$` math delimiters
+ - pandoc footnote markers (`[^name]`)
+ - markdown blockquote markers (lines starting with `> `)
+ - TeX brace tricks (`{=}`, `{,}`)
+ - PUA sentinels (`\\uE000`, `\\uE001`) leaking from the math-region
+ run-splitter
+ - the synthetic table-caption marker `__TABLE_CAPTION__:` if it ever
+ survives processing
+
+Exit code:
+ 0 clean
+ 1 WARN-level findings only (ship-able after review)
+ 2 ERROR-level findings (do NOT ship)
+
+Usage:
+ python3 paper/lint_paper_v3.py # both passes
+ python3 paper/lint_paper_v3.py --source # source-side only
+ python3 paper/lint_paper_v3.py --docx # DOCX-side only
+
+Designed to be run after `python3 export_v3.py` and before copying the
+DOCX to ~/Downloads.
+"""
+
+from __future__ import annotations
+
+import argparse
+import re
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+
+PAPER_DIR = Path(__file__).resolve().parent
+DOCX_PATH = PAPER_DIR / "Paper_A_IEEE_Access_Draft_v3.docx"
+
+V3_SOURCES = [
+ "paper_a_abstract_v3.md",
+ "paper_a_introduction_v3.md",
+ "paper_a_related_work_v3.md",
+ "paper_a_methodology_v3.md",
+ "paper_a_results_v3.md",
+ "paper_a_discussion_v3.md",
+ "paper_a_conclusion_v3.md",
+ "paper_a_appendix_v3.md",
+ "paper_a_declarations_v3.md",
+ "paper_a_references_v3.md",
+]
+
+
+# ---------------------------------------------------------------------------
+# Finding model + ANSI colour helpers
+# ---------------------------------------------------------------------------
+
+SEVERITY_RANK = {"ERROR": 2, "WARN": 1, "INFO": 0}
+COLOR = {
+ "ERROR": "\033[31m", # red
+ "WARN": "\033[33m", # yellow
+ "INFO": "\033[36m", # cyan
+ "RESET": "\033[0m",
+ "BOLD": "\033[1m",
+}
+
+
+@dataclass
+class Finding:
+ severity: str
+ rule: str
+ location: str # "file:line" or "DOCX:para 42" / "DOCX:table 6 row 3 col 2"
+ message: str
+ snippet: str = ""
+
+ def render(self, use_color: bool = True) -> str:
+ col = COLOR[self.severity] if use_color else ""
+ rst = COLOR["RESET"] if use_color else ""
+ bold = COLOR["BOLD"] if use_color else ""
+ head = f"{col}[{self.severity}]{rst} {bold}{self.rule}{rst} @ {self.location}"
+ body = f"\n {self.message}"
+ snip = f"\n > {self.snippet}" if self.snippet else ""
+ return head + body + snip
+
+
+# ---------------------------------------------------------------------------
+# Source-side rules
+# ---------------------------------------------------------------------------
+
+# Each rule: (pattern, severity, rule_id, message, predicate)
+# predicate(match, line) → bool: returns True to keep the finding (lets us
+# suppress matches that are inside HTML comments or fenced code blocks).
+
+def _outside_table_comment(match: re.Match, line: str, in_comment: bool, in_table: bool) -> bool:
+ """Suppress findings inside HTML comments (where they're allowed) or
+ inside markdown table rows (where they survive intact via add_md_table)."""
+ return not in_comment and not in_table
+
+
+def _always(match: re.Match, line: str, in_comment: bool, in_table: bool) -> bool:
+ return True
+
+
+SOURCE_RULES = [
+ # Pandoc footnote markers — leak as raw text in the DOCX.
+ (re.compile(r"\[\^[A-Za-z0-9_-]+\]"),
+ "ERROR", "pandoc-footnote",
+ "Pandoc-style footnote `[^name]` does not render in DOCX. "
+ "Inline the explanation as a parenthetical instead.",
+ _outside_table_comment),
+
+ # Markdown blockquote `> body` lines — exporter strips them defensively
+ # now, but flag for awareness so authors don't rely on them rendering.
+ (re.compile(r"^>\s"),
+ "WARN", "blockquote",
+ "Markdown blockquote `> ...` is stripped to plain paragraph in DOCX "
+ "(no quote-block formatting). If you intended a callout, use bold "
+ "lead-in instead.",
+ _always),
+
+ # Display-math fences `$$...$$` (only when the line itself starts with
+ # `$$`) — exporter does best-effort linearisation, but the result is
+ # ugly. Inline the equation as plain prose where possible.
+ (re.compile(r"^\$\$.+?\$\$\s*$|^\$\$\s*$"),
+ "WARN", "display-math",
+ "Display math `$$...$$` renders as a best-effort plain-text "
+ "linearisation in DOCX (no MathType/equation rendering). Consider "
+ "replacing with a numbered equation image or inline prose.",
+ _always),
+
+ # Inline math containing `\frac{...{...}...}` — nested braces in a
+ # frac argument are not handled by the exporter's regex.
+ (re.compile(r"\\t?frac\{[^{}]*\{[^{}]*\}[^{}]*\}\{|\\t?frac\{[^{}]+\}\{[^{}]*\{"),
+ "WARN", "nested-frac",
+ "Nested-brace `\\frac{...}{...}` may not linearise cleanly. Verify "
+ "the rendered DOCX paragraph or rewrite the math inline.",
+ _outside_table_comment),
+
+ # Setext-style headers (=== / ---) under a line of text — not handled.
+ (re.compile(r"^=+\s*$|^-{3,}\s*$"),
+ "INFO", "setext-header",
+ "Setext-style header (=== / ---) is not handled by the exporter; "
+ "use ATX (#, ##, ###) instead.",
+ _always),
+
+ # Pandoc fenced div `:::` — not handled.
+ (re.compile(r"^:::"),
+ "ERROR", "pandoc-fenced-div",
+ "Pandoc fenced div `:::` is not handled by the exporter and would "
+ "leak into the DOCX as plain text.",
+ _always),
+
+ # Pandoc bracketed-attribute spans `[text]{.class}` — not handled.
+ (re.compile(r"\][\{][^}]*[\}]"),
+ "WARN", "pandoc-attribute-span",
+ "Pandoc attribute span `[text]{.class}` is not parsed by the exporter "
+ "and the brace block will leak.",
+ _outside_table_comment),
+
+ # File paths in body text — Appendix B is the canonical home for
+ # script→artifact references.
+ (re.compile(r"`signature_analysis/\d+_[a-z_]+\.py`"),
+ "INFO", "script-path-in-body",
+ "Verbose script path in body text. Consider replacing with "
+ "'(reproduction artifact in Appendix B)' for body-prose tightness.",
+ _outside_table_comment),
+
+ # `reports/...json` paths in body text — same rationale.
+ (re.compile(r"`reports/[a-z_]+/[a-z_]+\.(?:json|md)`"),
+ "INFO", "report-path-in-body",
+ "Verbose report-artifact path in body text. Consider replacing with "
+ "'(see Appendix B provenance map)'.",
+ _outside_table_comment),
+
+ # Bare HTML comments that are NOT TABLE/FIGURE markers may indicate
+ # editorial residue. Stripped wholesale by exporter, so harmless, but
+ # worth visibility.
+ (re.compile(r"^" in line:
+ in_comment = False
+ return findings
+
+
+# ---------------------------------------------------------------------------
+# DOCX-side rules
+# ---------------------------------------------------------------------------
+
+DOCX_LEAK_PATTERNS = [
+ # (pattern, severity, rule_id, message)
+ (re.compile(r"\\[a-zA-Z]+(?:\{[^{}]*\})?"),
+ "ERROR", "leftover-latex-cmd",
+ "LaTeX command `\\cmd` leaked into DOCX. Either add a token rule to "
+ "`latex_to_unicode` in `export_v3.py` or rewrite the source as plain text."),
+
+ (re.compile(r"(?\s"),
+ "ERROR", "blockquote-leak",
+ "Markdown blockquote `> ...` leaked literal `>` into DOCX. The "
+ "exporter pre-pass should strip these — check `process_section`."),
+
+ (re.compile(r"\{[,=<>+\-]\}"),
+ "ERROR", "tex-brace-trick",
+ "TeX brace-trick `{=}` / `{,}` leaked. Should be stripped by "
+ "`latex_to_unicode`."),
+
+ (re.compile(r"[]"),
+ "ERROR", "pua-sentinel-leak",
+ "Math-region PUA sentinel (\\uE000 / \\uE001) leaked. A render path "
+ "is bypassing `add_text_with_subsup`; check headings / list items / "
+ "title-page paragraphs."),
+
+ (re.compile(r"__TABLE_CAPTION__"),
+ "ERROR", "table-caption-marker-leak",
+ "Synthetic `__TABLE_CAPTION__:` marker leaked. The marker is meant "
+ "to be consumed by `process_section` and rendered as a centered "
+ "bold caption paragraph."),
+
+ (re.compile(r"signature[a-z]+analysis/\d+[a-z_]+\.py"),
+ "ERROR", "underscore-eaten-path",
+ "Underscores eaten from a script path (e.g., "
+ "`signatureanalysis/28byteidentitydecomposition.py`). The "
+ "math-context-scoped subscript handler in `add_text_with_subsup` "
+ "should leave underscores intact in plain text."),
+
+ (re.compile(r"\b(\w+_\w+)+\b", flags=re.UNICODE),
+ "INFO", "underscore-identifier",
+ "Underscored identifier in body text (e.g., a code symbol or path). "
+ "Verify it renders with underscores intact, not as subscripts."),
+]
+
+
+def lint_docx(docx_path: Path = DOCX_PATH) -> list[Finding]:
+ try:
+ from docx import Document
+ except ImportError:
+ return [Finding("ERROR", "missing-dep",
+ "lint:docx",
+ "python-docx is not installed; cannot run DOCX pass.")]
+
+ if not docx_path.exists():
+ return [Finding("ERROR", "missing-docx",
+ str(docx_path),
+ "Built DOCX not found. Run `python3 export_v3.py` first.")]
+
+ doc = Document(str(docx_path))
+ findings: list[Finding] = []
+ seen_signatures = set() # dedupe identical leaks across paragraphs
+
+ def scan(text: str, location: str):
+ for pat, sev, rule, msg in DOCX_LEAK_PATTERNS:
+ for m in pat.finditer(text):
+ # Skip the INFO-level identifier rule unless it looks like
+ # an obvious math residue (e.g., dHash_indep or N_a).
+ if rule == "underscore-identifier":
+ sample = m.group(0)
+ # Only complain about identifiers that look like math
+ # residue: short, underscore-separated single-char tokens.
+ parts = sample.split("_")
+ if not all(len(p) <= 4 for p in parts):
+ continue
+ if not all(p.isalnum() and not p.isdigit() for p in parts):
+ continue
+ key = (rule, m.group(0))
+ if key in seen_signatures:
+ continue
+ seen_signatures.add(key)
+ findings.append(Finding(
+ severity=sev,
+ rule=rule,
+ location=location,
+ message=msg,
+ snippet=text[max(0, m.start() - 30):m.end() + 30].replace("\n", " ")[:140],
+ ))
+
+ for i, p in enumerate(doc.paragraphs):
+ if p.text:
+ scan(p.text, f"DOCX:para {i}")
+ for ti, t in enumerate(doc.tables):
+ for ri, row in enumerate(t.rows):
+ for ci, cell in enumerate(row.cells):
+ if cell.text:
+ scan(cell.text, f"DOCX:table {ti + 1} row {ri} col {ci}")
+
+ return findings
+
+
+# ---------------------------------------------------------------------------
+# Reporter
+# ---------------------------------------------------------------------------
+
+def summarise(findings: list[Finding], use_color: bool = True) -> int:
+ def c(key: str) -> str:
+ return COLOR[key] if use_color else ""
+
+ if not findings:
+ print(f"{c('BOLD')}{c('INFO')}clean — no leaks detected{c('RESET')}")
+ return 0
+ counts = {"ERROR": 0, "WARN": 0, "INFO": 0}
+ findings.sort(key=lambda f: (-SEVERITY_RANK[f.severity], f.location))
+ for f in findings:
+ counts[f.severity] += 1
+ print(f.render(use_color))
+ print()
+ print(f"{c('BOLD')}summary{c('RESET')}: "
+ f"{c('ERROR')}{counts['ERROR']} ERROR{c('RESET')} "
+ f"{c('WARN')}{counts['WARN']} WARN{c('RESET')} "
+ f"{c('INFO')}{counts['INFO']} INFO{c('RESET')}")
+ if counts["ERROR"]:
+ return 2
+ if counts["WARN"]:
+ return 1
+ return 0
+
+
+def main():
+ ap = argparse.ArgumentParser(
+ description="Lint Paper A v3 markdown sources and rendered DOCX for "
+ "syntax-leak issues.",
+ )
+ ap.add_argument("--source", action="store_true",
+ help="run only the markdown source pass")
+ ap.add_argument("--docx", action="store_true",
+ help="run only the rendered DOCX pass")
+ ap.add_argument("--no-color", action="store_true",
+ help="disable ANSI colour output")
+ args = ap.parse_args()
+
+ use_color = sys.stdout.isatty() and not args.no_color
+ findings: list[Finding] = []
+ if args.source or not (args.source or args.docx):
+ print(f"{COLOR['BOLD'] if use_color else ''}--- source pass "
+ f"({len(V3_SOURCES)} files) ---{COLOR['RESET'] if use_color else ''}")
+ findings.extend(lint_sources())
+ if args.docx or not (args.source or args.docx):
+ print(f"{COLOR['BOLD'] if use_color else ''}\n--- docx pass "
+ f"({DOCX_PATH.name}) ---{COLOR['RESET'] if use_color else ''}")
+ findings.extend(lint_docx())
+
+ print()
+ sys.exit(summarise(findings, use_color))
+
+
+if __name__ == "__main__":
+ main()
diff --git a/paper/paper_a_abstract_v3.md b/paper/paper_a_abstract_v3.md
index f5843fc..34b28b9 100644
--- a/paper/paper_a_abstract_v3.md
+++ b/paper/paper_a_abstract_v3.md
@@ -2,6 +2,6 @@
-Regulations require Certified Public Accountants (CPAs) to attest to each audit report by affixing a signature, but digitization makes reusing a stored signature image across reports---through administrative stamping or firm-level electronic signing---technically trivial and visually invisible to report users, undermining individualized attestation. We build an end-to-end pipeline that detects such *non-hand-signed* signatures at scale: a Vision-Language Model identifies signature pages, a YOLOv11 detector localizes signature regions, ResNet-50 supplies deep features, and a dual-descriptor verification layer combines deep-feature cosine similarity with perceptual hashing (difference hash, dHash) to separate *style consistency* (high cosine, divergent dHash) from *image reproduction* (high cosine, low dHash). The operational classifier outputs a five-way verdict per signature with a worst-case document-level aggregation; the cosine cut is anchored on a transparent whole-sample Firm A P7.5 percentile (cos $> 0.95$), and the dHash cuts on the same reference. Applied to 90,282 audit reports filed in Taiwan over 2013-2023 (182,328 signatures from 758 CPAs), the operational dual rule cos $> 0.95$ AND $\text{dHash}_\text{indep} \leq 8$ captures 89.95\% of Firm A and yields FAR $\leq$ 0.001 against a $\sim$50,000-pair inter-CPA negative anchor; intra-report agreement is 89.9\% at Firm A versus 62-67\% at the other Big-4 firms (a 23-28 percentage-point cross-firm gap). Validation uses three annotation-free anchors (310 byte-identical positives, $\sim$50,000 inter-CPA negatives, and a 70/30 held-out Firm A fold) reported with Wilson 95\% intervals. 
Three statistical diagnostics applied to the per-signature similarity distribution (Hartigan dip test, EM-fitted Beta mixture with logit-Gaussian robustness check, Burgstahler-Dichev / McCrary density-smoothness procedure) jointly characterise the distribution as a continuous quality spectrum, which motivates the percentile-based anchor and is itself a substantive finding for similarity-threshold selection in document forensics.
+Regulations require Certified Public Accountants (CPAs) to attest to each audit report by affixing a signature, but digitization makes reusing a stored signature image across reports---through administrative stamping or firm-level electronic signing---technically trivial and visually invisible to report users, undermining individualized attestation. We build an end-to-end pipeline that detects such *non-hand-signed* signatures at scale: a Vision-Language Model identifies signature pages, a YOLOv11 detector localizes signature regions, ResNet-50 supplies deep features, and a dual-descriptor verification layer combines deep-feature cosine similarity with perceptual hashing (difference hash, dHash) to separate *style consistency* (high cosine, divergent dHash) from *image reproduction* (high cosine, low dHash). The operational classifier outputs a five-way verdict per signature with a worst-case document-level aggregation; the cosine cut is anchored on a transparent whole-sample Firm A P7.5 percentile (cos $> 0.95$), and the dHash cuts on the same reference. Applied to 90,282 audit reports filed in Taiwan over 2013-2023 (182,328 signatures from 758 CPAs), the operational dual rule cos $> 0.95$ AND $\text{dHash}_\text{indep} \leq 15$ captures 92.46\% of Firm A and yields FAR = 0.0005 against a $\sim$50,000-pair inter-CPA negative anchor; intra-report agreement is 89.9\% at Firm A versus 62-67\% at the other Big-4 firms (a 23-28 percentage-point cross-firm gap). Validation uses three annotation-free anchors (310 byte-identical positives, $\sim$50,000 inter-CPA negatives, and a 70/30 held-out Firm A fold) reported with Wilson 95\% intervals. 
Three statistical diagnostics applied to the per-signature similarity distribution (Hartigan dip test, EM-fitted Beta mixture with logit-Gaussian robustness check, Burgstahler-Dichev / McCrary density-smoothness procedure) jointly characterise the distribution as a continuous quality spectrum, which motivates the percentile-based anchor and is itself a substantive finding for similarity-threshold selection in document forensics.
diff --git a/paper/paper_a_appendix_v3.md b/paper/paper_a_appendix_v3.md
index ed7ae2e..71881ea 100644
--- a/paper/paper_a_appendix_v3.md
+++ b/paper/paper_a_appendix_v3.md
@@ -49,7 +49,9 @@ For reproducibility, the following table maps each numerical table in Section IV
| Table X (cosine threshold sweep, FAR vs inter-CPA negatives) | `21_expanded_validation.py` | `reports/expanded_validation/expanded_validation_results.json` |
| Table XI (held-out vs calibration Firm A capture rates) | `24_validation_recalibration.py` | `reports/validation_recalibration/validation_recalibration.json` |
| Table XII (operational-cut sensitivity 0.95 vs 0.945) | `24_validation_recalibration.py` | `reports/validation_recalibration/validation_recalibration.json` |
+| Table XII-B (cosine-threshold tradeoff: capture vs inter-CPA FAR) | `21_expanded_validation.py` (FAR column; canonical 50k-pair anchor); inline computation in revision (Firm A and non-Firm-A capture columns) | `reports/expanded_validation/expanded_validation_results.json` |
| Table XIII (Firm A per-year cosine distribution) | `29_firm_a_yearly_distribution.py` | `reports/firm_a_yearly/firm_a_yearly_distribution.json` |
+| Fig. 4 (per-firm yearly best-match cosine, 2013-2023) | `30_yearly_big4_comparison.py` | `reports/figures/fig_yearly_big4_comparison.{png,pdf}`; `reports/firm_yearly_comparison/firm_yearly_comparison.{json,md}` |
| Tables XIV / XV (partner-level similarity ranking) | `22_partner_ranking.py` | `reports/partner_ranking/partner_ranking_results.json` |
| Table XVI (intra-report classification agreement) | `23_intra_report_consistency.py` | `reports/intra_report/intra_report_results.json` |
| Table XVII (document-level five-way classification) | `09_pdf_signature_verdict.py`; `12_generate_pdf_level_report.py` | `reports/pdf_signature_verdicts.json`; `reports/pdf_signature_verdict_report.md` (CSV / XLSX bulk reports also at `reports/`) |
diff --git a/paper/paper_a_conclusion_v3.md b/paper/paper_a_conclusion_v3.md
index b13de1d..359402b 100644
--- a/paper/paper_a_conclusion_v3.md
+++ b/paper/paper_a_conclusion_v3.md
@@ -25,7 +25,6 @@ An ablation study comparing ResNet-50, VGG-16 and EfficientNet-B0 confirmed that
Several directions merit further investigation.
Domain-adapted feature extractors, trained or fine-tuned on signature-specific datasets, may improve discriminative performance beyond the transferred ImageNet features used in this study.
-Extending the analysis to auditor-year units---computing per-signature statistics within each fiscal year and tracking how individual CPAs move across years---could reveal within-CPA transitions between hand-signing and non-hand-signing over the decade and is the natural next step beyond the cross-sectional analysis reported here.
The pipeline's applicability to other jurisdictions and document types (e.g., corporate filings in other countries, legal documents, medical records) warrants exploration.
The replication-dominated calibration strategy and the pixel-identity anchor technique are both generalizable to settings in which (i) a reference subpopulation has a known dominant mechanism and (ii) the target mechanism leaves a byte-level signature in the artifact itself, conditional on the availability of analogous anchors in the new domain and on artifact-generation physics that preserve the byte-level trace.
Finally, integration with regulatory monitoring systems and a larger negative-anchor study---for example drawing from inter-CPA pairs under explicit accountant-level blocking---would strengthen the practical deployment potential of this approach.
diff --git a/paper/paper_a_discussion_v3.md b/paper/paper_a_discussion_v3.md
index 8196ce3..e38287e 100644
--- a/paper/paper_a_discussion_v3.md
+++ b/paper/paper_a_discussion_v3.md
@@ -61,7 +61,7 @@ The dual-descriptor framework correctly identifies these cases as distinct from
The use of Firm A as a calibration reference addresses a fundamental challenge in document forensics: the scarcity of ground truth labels.
In most forensic applications, establishing ground truth requires expensive manual verification or access to privileged information about document provenance.
-Our approach leverages domain knowledge---the established prevalence of non-hand-signing at a specific firm---to create a naturally occurring reference population within the dataset.
+Our approach uses practitioner background---one Big-4 firm reportedly relies predominantly on stamping or e-signing workflows---only as a *motivation* for selecting that firm as a candidate reference population; the calibration role is then established from the audit-report images themselves (byte-identical same-CPA pairs, the Firm A per-signature similarity distribution, partner-ranking concentration, and intra-report consistency), so the calibration does not depend on the practitioner-background claim being externally verified (Section III-H).
This calibration strategy has broader applicability beyond signature analysis.
Any forensic detection system operating on real-world corpora can benefit from identifying subpopulations with known dominant characteristics (positive or negative) to anchor threshold selection, particularly when the distributions of interest are non-normal and non-parametric or mixture-based thresholds are preferred over parametric alternatives.
@@ -97,15 +97,12 @@ This effect would bias classification toward false negatives rather than false p
Fourth, scanning equipment, PDF generation software, and compression algorithms may have changed over the 10-year study period (2013--2023), potentially affecting similarity measurements.
While cosine similarity and dHash are designed to be robust to such variations, longitudinal confounds cannot be entirely excluded.
-Fifth, our cross-sectional analysis does not track individual CPAs longitudinally and therefore cannot confirm or rule out within-CPA mechanism transitions over the sample period (e.g., a CPA who hand-signed early in the sample and switched to firm-level e-signing later, or vice versa).
-Extending the analysis to *auditor-year* units---computing per-signature statistics within each fiscal year and observing how individual CPAs move across years---is the natural next step for resolving such within-CPA transitions and is left to future work.
-
-Sixth, the max/min detection logic treats both ends of a near-identical same-CPA pair as non-hand-signed.
+Fifth, the max/min detection logic treats both ends of a near-identical same-CPA pair as non-hand-signed.
In the rare case that one of the two documents contains a genuinely hand-signed exemplar that was subsequently reused as the stamping or e-signature template, the pair correctly identifies image reuse but misattributes the non-hand-signed status to the source exemplar.
This misattribution affects at most one source document per template variant per CPA (the exemplar from which the template was produced), is not expected to be common given that stored signature templates are typically generated in a separate acquisition step rather than extracted from submitted audit reports, and does not materially affect aggregate capture rates at the firm level.
-Seventh, our analyses remain at the signature level; we abstain from partner-level frequency inferences such as "X% of CPAs hand-sign in a given year."
-Per-signature labels in this paper are not translated to per-report or per-partner mechanism assignments, because making such a translation would require an assumption of within-year uniformity of signing mechanisms that we do not adopt: a CPA's signatures within a single fiscal year may reflect a single replication template, multiple templates used in parallel (e.g., for different engagement positions or reporting pipelines), within-year mechanism mixing, or a combination, and the data at hand do not disambiguate these possibilities (Section III-G).
+Sixth, our analyses remain at the signature level; we abstain from partner-level frequency inferences such as "X% of CPAs hand-sign in a given year."
+Per-signature labels in this paper are not translated to per-report or per-partner mechanism assignments (Section III-G).
The signature-level rates we report, including the 92.5% / 7.5% Firm A split and the year-by-year left-tail share of Section IV-G.1, should accordingly be read as signature-level quantities rather than partner-level frequencies.
Finally, the legal and regulatory implications of our findings depend on jurisdictional definitions of "signature" and "signing."
diff --git a/paper/paper_a_introduction_v3.md b/paper/paper_a_introduction_v3.md
index 19ec10a..b5b39b4 100644
--- a/paper/paper_a_introduction_v3.md
+++ b/paper/paper_a_introduction_v3.md
@@ -25,7 +25,7 @@ This detection problem differs fundamentally from forgery detection: while it do
A secondary methodological concern shapes the research design.
Many prior similarity-based classification studies rely on ad-hoc thresholds---declaring two images equivalent above a hand-picked cosine cutoff, for example---without principled statistical justification.
-Such thresholds are fragile and invite reviewer skepticism, particularly in an archival-data setting where the cost of misclassification propagates into downstream inference.
+Such thresholds are fragile in an archival-data setting where the cost of misclassification propagates into downstream inference.
A defensible approach requires (i) a transparent threshold anchored to an empirical reference population drawn from the target corpus; (ii) statistical diagnostics that characterise the *shape* of the underlying similarity distribution and so motivate the choice of anchor; and (iii) external validation against naturally-occurring anchor populations---byte-level identical pairs as a conservative gold positive subset and large random inter-CPA pairs as a gold negative population---reported with Wilson 95% confidence intervals on per-rule capture / FAR rates, since precision and $F_1$ are not meaningful when the positive and negative anchor populations are sampled from different units.
Despite the significance of the problem for audit quality and regulatory oversight, no prior work has specifically addressed non-hand-signing detection in financial audit documents at scale with these methodological safeguards.
diff --git a/paper/paper_a_methodology_v3.md b/paper/paper_a_methodology_v3.md
index dd9601b..e326f49 100644
--- a/paper/paper_a_methodology_v3.md
+++ b/paper/paper_a_methodology_v3.md
@@ -109,8 +109,22 @@ Non-hand-signing yields extreme similarity under *both* descriptors, since the u
Hand-signing, by contrast, yields high dHash similarity (the overall layout of a signature is preserved across writing occasions) but measurably lower cosine similarity (fine execution varies).
Convergence of the two descriptors is therefore a natural robustness check; when they disagree, the case is flagged as borderline.
-We specifically excluded SSIM (Structural Similarity Index) [30] after empirical testing showed it to be unreliable for scanned documents: the calibration firm (Section III-H) exhibited a mean SSIM of only 0.70 due to scan-induced pixel-level variations, despite near-identical visual content.
-Cosine similarity and dHash are both robust to the noise introduced by the print-scan cycle.
+We did not use SSIM (Structural Similarity Index) [30] or pixel-level comparison as primary descriptors, and the reasons are specific to what each of those measures was designed to do rather than to how either happened to perform on our corpus.
+
+SSIM was developed by Wang et al. [30] as a perceptual quality index for *natural images*, and it factorises local-window image statistics into three components---luminance, contrast, and structural correlation---combined multiplicatively over a sliding window.
+Each of these components is computed at the pixel level on the original-resolution image and is *designed to be sensitive* to small fluctuations in local luminance and local contrast, because that is what makes SSIM track human perception of natural-image quality.
+Applied to a binarised auditor's signature crop, exactly those design choices become liabilities: the JPEG block artifacts, scan-noise speckle, and faint scanner-rule ghosts that are routine in a print-scan cycle perturb local luminance and local contrast in every window they touch, and SSIM amplifies those perturbations in the structural-correlation product.
+A signature reproduced twice from the same stored image---the very case that defines our positive class---is therefore one in which SSIM is structurally guaranteed to penalise the easily perturbed margins around the strokes, even though the strokes themselves are identical up to rendering noise.
+This is a property of how SSIM is constructed, not a finding about how it scored on our data; the empirical observation that the calibration firm exhibits a mean SSIM of only $0.70$ in our corpus is a confirmation of the design-level prediction rather than the basis for the rejection.
+
+Pixel-level comparison---whether $L_1$, $L_2$, or pixel-identity counting---fails on a stricter design ground.
+Pixel-level distances are defined on geometrically aligned images at a common resolution, and they treat any sub-pixel translation, rotation, or rescale as a large perturbation by construction (a one-pixel uniform translation flips a fraction of foreground pixels on a thin-stroke signature crop and inflates the pixel $L_1$ distance to the same magnitude as for a different signer's signature).
+Two scans of the same physical document, however, do not share a common pixel grid: scanner DPI, paper-handling alignment, and PDF-page rasterisation each contribute random sub-pixel offsets, and the print-scan cycle that intervenes between the stored stamp image and the audit-report PDF additionally introduces resolution mismatch and small geometric drift.
+A pixel-level descriptor cannot therefore satisfy the basic stability requirement for our task: two presentations of the same stored image must score nearly identically.
+We retain pixel-identity counting only as a *threshold-free anchor* (Section III-J), because byte-identical pairs in our corpus are necessarily produced by literal file reuse rather than by repeated scanning, and so they do not interact with the alignment-fragility argument; they are not used as a primary similarity descriptor.
+
+Cosine similarity on deep embeddings and dHash, in contrast, both remain stable across the print-scan-rasterise cycle by design: cosine on L2-normalised pooled features is invariant to overall scale and bias and degrades gracefully under local-pixel noise that the convolutional backbone has been trained to absorb [14], [21], while dHash compresses the image to a $9 \times 8$ grayscale grid before computing horizontal-gradient signs, which removes the resolution and sub-pixel-alignment sensitivity that breaks pixel-level comparison [19], [27].
+Together they constitute the dual descriptor used throughout the rest of this paper.
## G. Unit of Analysis and Summary Statistics
@@ -144,11 +158,11 @@ A distinctive aspect of our methodology is the use of Firm A---a major Big-4 acc
Rather than treating Firm A as a synthetic or laboratory positive control, we treat it as a naturally occurring *replication-dominated population*: a CPA population whose aggregate signing behavior is dominated by non-hand-signing but is not a pure positive class.
Practitioner knowledge motivated treating Firm A as a candidate calibration reference: the firm is understood within the audit profession to reproduce a stored signature image for the majority of certifying partners---originally via administrative stamping workflows and later via firm-level electronic signing systems---while not ruling out that a minority of partners may continue to hand-sign some or all of their reports.
-This practitioner background is *non-load-bearing* in our analysis: the evidentiary basis used in this paper is the observable image evidence reported below---byte-identical same-CPA pairs, the Firm A per-signature similarity distribution, partner-ranking concentration, and intra-report consistency---which does not depend on any claim about signing practice beyond what the audit-report images themselves show.
+This practitioner background motivates Firm A's selection but is not used as evidence: the evidentiary basis in the analyses below---byte-identical same-CPA pairs, the Firm A per-signature similarity distribution, partner-ranking concentration, and intra-report consistency---is derived entirely from the audit-report images themselves and does not depend on any claim about firm-level signing practice.
We establish Firm A's replication-dominated status through two primary independent quantitative analyses plus a third strand comprising three complementary checks, each of which can be reproduced from the public audit-report corpus alone:
-First, *automated byte-level pair analysis* (Section IV-F.1; reproduced by `signature_analysis/28_byte_identity_decomposition.py` with output in `reports/byte_identity_decomp/byte_identity_decomposition.json`) identifies 145 Firm A signatures that are byte-identical to at least one other same-CPA signature from a different audit report, distributed across 50 distinct Firm A partners (of 180 registered); 35 of these byte-identical matches span different fiscal years.
+First, *automated byte-level pair analysis* (Section IV-F.1; reproduction artifact listed in Appendix B) identifies 145 Firm A signatures that are byte-identical to at least one other same-CPA signature from a different audit report, distributed across 50 distinct Firm A partners (of 180 registered); 35 of these byte-identical matches span different fiscal years.
Byte-identity implies pixel-identity by construction, and independent hand-signing cannot produce pixel-identical images across distinct reports---these pairs therefore establish image reuse as a concrete, threshold-free phenomenon within Firm A and confirm that replication is widespread (50 of 180 registered partners) rather than confined to a handful of CPAs.
Second, *signature-level distributional evidence*: Firm A's per-signature best-match cosine distribution fails to reject unimodality (Hartigan dip test $p = 0.17$, $N = 60{,}448$ Firm A signatures; Section IV-D) and exhibits a long left tail, consistent with a dominant high-similarity regime plus residual within-firm heterogeneity rather than two cleanly separated mechanisms.
@@ -160,10 +174,8 @@ Third, we additionally validate the Firm A benchmark through three complementary
(b) *Partner-level similarity ranking (Section IV-G.2).* When every auditor-year is ranked globally by its per-auditor-year mean best-match cosine (across all firms: Big-4 and Non-Big-4), Firm A auditor-years account for 95.9% of the top decile against a baseline share of 27.8% (a 3.5$\times$ concentration ratio), and this over-representation is stable across 2013-2023. This analysis uses only the ordinal ranking and is independent of any absolute cutoff.
(c) *Intra-report consistency (Section IV-G.3).* Because each Taiwanese statutory audit report is co-signed by two engagement partners, firm-wide stamping practice predicts that both signers on a given Firm A report should receive the same signature-level label under the classifier. Firm A exhibits 89.9% intra-report agreement against 62-67% at the other Big-4 firms. This test uses the operational classifier and is therefore a *consistency* check on the classifier's firm-level output rather than a threshold-free test; the cross-firm gap (not the absolute rate) is the substantive finding.
-We emphasize that the 92.5% figure is a within-sample consistency check rather than an independent validation of Firm A's status; the validation role is played by the byte-level pixel-identity evidence, the unimodal-long-tail dip-test result, the three complementary analyses above, and the held-out Firm A fold (described in Section III-J; fold-level rate differences are disclosed in Section IV-F.2).
-
-We emphasize that Firm A's replication-dominated status was *not* derived from the thresholds we calibrate against it.
-Its identification rests on the byte-level pair evidence and the dip-test-confirmed unimodal-long-tail shape, both of which are independent of any threshold choice.
+The 92.5% figure is a within-sample consistency check rather than an independent validation of Firm A's status; the validation role is played by the byte-level pixel-identity evidence, the unimodal-long-tail dip-test result, the three complementary analyses above, and the held-out Firm A fold (described in Section III-J; fold-level rate differences are disclosed in Section IV-F.2).
+Firm A's replication-dominated status itself was *not* derived from the thresholds we calibrate against it; it rests on the byte-level pair evidence and the dip-test-confirmed unimodal-long-tail shape, both of which are independent of any threshold choice.
The "replication-dominated, not pure" framing is important both for internal consistency---it predicts and explains the long left tail observed in Firm A's cosine distribution (Section IV-D)---and for avoiding overclaim in downstream inference.
## I. Signature-Level Threshold Characterisation
@@ -171,9 +183,9 @@ The "replication-dominated, not pure" framing is important both for internal con
This section describes how we set the operational classifier's similarity threshold and how we characterise the per-signature similarity distribution that supports it.
The two roles are kept separate by design.
-> **Operational threshold (used by the classifier).** The cosine cut is anchored on the whole-sample Firm A P7.5 percentile (cos $> 0.95$; Section III-K).
->
-> **Statistical characterisation (used to motivate the choice of anchor and to describe the distributional structure).** A Hartigan dip test, an EM-fitted Beta mixture (with logit-Gaussian robustness check), and a Burgstahler-Dichev / McCrary density-smoothness procedure---all applied at the per-signature level (Section IV-D).
+**Operational threshold (used by the classifier).** The cosine cut is anchored on the whole-sample Firm A P7.5 percentile (cos $> 0.95$; Section III-K).
+
+**Statistical characterisation (used to motivate the choice of anchor and to describe the distributional structure).** A Hartigan dip test, an EM-fitted Beta mixture (with logit-Gaussian robustness check), and a Burgstahler-Dichev / McCrary density-smoothness procedure---all applied at the per-signature level (Section IV-D).
The reason for the split is empirical.
The three statistical diagnostics jointly find that per-signature similarity forms a continuous quality spectrum (Section IV-D, summarised below): the dip test fails to reject unimodality for Firm A; BIC strongly prefers a 3-component over a 2-component Beta fit, so the 2-component crossing is a forced fit; and the BD/McCrary candidate transition lies inside the non-hand-signed mode rather than between modes (and is not bin-width-stable; Appendix A).
@@ -279,9 +291,14 @@ High feature-level similarity without structural corroboration---consistent with
5. **Likely hand-signed:** Cosine below the all-pairs KDE crossover threshold.
We note three conventions about the thresholds.
-First, the cosine cutoff $0.95$ corresponds to approximately the whole-sample Firm A P7.5 of the per-signature best-match cosine distribution---that is, 92.5% of whole-sample Firm A signatures exceed this cutoff and 7.5% fall at or below it (Section III-H)---chosen as a round-number lower-tail boundary whose complement (92.5% above) has a transparent interpretation in the whole-sample reference distribution; the cosine crossover $0.837$ is the all-pairs intra/inter KDE crossover; both are derived from whole-sample distributions rather than from the 70% calibration fold, so the classifier inherits its operational cosine cuts from the whole-sample Firm A and all-pairs distributions.
-Section IV-F.3 reports a sensitivity check confirming that replacing $0.95$ with the nearby rounded sensitivity cut $0.945$ (motivated by the calibration-fold P5 = 0.9407, see Section IV-F.2) shifts whole-Firm-A dual-rule capture by 1.19 percentage points, so the round-number heuristic is robust to nearby percentile-based alternatives.
-Section IV-F.2 reports both calibration-fold and held-out-fold capture rates for this classifier so that fold-level sampling variance is visible.
+First, the cosine cutoff $0.95$ is the *operating point* chosen for the five-way classifier from a small grid of candidate cuts, on the basis of an explicit capture-vs-FAR tradeoff against the inter-CPA negative anchor of Section III-J---*not* a discovered natural boundary in the per-signature distribution.
+The candidate grid spans the calibration-fold P5 (0.9407), its rounded value (0.945), the operational anchor (0.95), and two reference points drawn from the signature-level threshold-estimator outputs of Section IV-D (the Firm A Beta-2 forced-fit crossing 0.977 and the BD/McCrary candidate transition 0.985); for each grid point Section IV-F.3 reports the Firm A capture rate, the non-Firm-A capture rate, and the inter-CPA FAR with Wilson 95% CI (Table XII-B).
+Three considerations motivate the operating point at 0.95.
+(i) *Inter-CPA specificity.* At cosine $> 0.95$ the inter-CPA FAR against the 50,000-pair anchor of Section IV-F.1 is $0.0005$ (Wilson 95% CI $[0.0003, 0.0007]$): one in two thousand random cross-CPA pairs exceeds the cut, an order-of-magnitude margin against the working assumption that random cross-CPA pairs do not arise from image reuse.
+(ii) *Capture stability under nearby alternatives.* Moving the cut to $0.945$ raises Firm A capture by 1.51 percentage points (operational dual rule cos $> t$ AND $\text{dHash}_\text{indep} \leq 15$; Section IV-F.3) and inter-CPA FAR by $0.00032$, while moving it to the calibration-fold P5 of $0.9407$ raises Firm A capture by 2.63 percentage points and inter-CPA FAR by $0.00076$; in either direction the qualitative finding---Firm A is replication-dominated, non-Firm-A capture is much lower at the same cut, and the inter-CPA noise floor is small---is preserved.
+(iii) *Interpretive transparency.* The complement $7.5\%$ corresponds to the whole-sample Firm A P7.5 of the per-signature best-match cosine distribution---that is, $92.5\%$ of whole-sample Firm A signatures exceed this cutoff and $7.5\%$ fall at or below it (Section III-H)---which gives the operational cut a transparent reading in the replication-dominated reference population without requiring a parametric mixture fit that the data of Section IV-D do not support.
+The cosine crossover $0.837$ is the all-pairs intra/inter KDE crossover; both $0.95$ and $0.837$ are derived from whole-sample distributions rather than from the 70% calibration fold, so the classifier inherits its operational cosine cuts from the whole-sample Firm A and all-pairs distributions.
+Section IV-F.2 reports both calibration-fold and held-out-fold capture rates for this classifier so that fold-level sampling variance is visible; Section IV-F.3 (Table XII-B) reports the full capture-vs-FAR tradeoff at the candidate grid above.
Second, the dHash cutoffs $\leq 5$ and $> 15$ are chosen from the whole-sample Firm A $\text{dHash}_\text{indep}$ distribution: $\leq 5$ captures the upper tail of the high-similarity mode (whole-sample Firm A median $\text{dHash}_\text{indep} = 2$, P75 $\approx 4$, so $\leq 5$ is the band immediately above median), while $> 15$ marks the regime in which independent-minimum structural similarity is no longer indicative of image reproduction.
Third, the signature-level threshold-estimator outputs of Section IV-D (KDE antimode, Beta-mixture and logit-Gaussian crossings, BD/McCrary diagnostic) are *not* the operational thresholds of this classifier: they are descriptive characterisation of the per-signature similarity distribution, and Section IV-D shows they do not converge to a clean two-mechanism boundary at the per-signature level---which is why the operational cosine cut is anchored on the whole-sample Firm A percentile rather than on any mixture-fit crossing.
diff --git a/paper/paper_a_results_v3.md b/paper/paper_a_results_v3.md
index 4fefd83..cba9109 100644
--- a/paper/paper_a_results_v3.md
+++ b/paper/paper_a_results_v3.md
@@ -102,17 +102,30 @@ The three diagnostics agree that per-signature similarity does not form a clean
Table VI summarises the signature-level threshold-estimator outputs for cross-method comparison.
Non-hand-signed replication quality is therefore best read as a continuous spectrum produced by firm-specific reproduction technologies (administrative stamping in early years, firm-level e-signing later) acting on a common stored exemplar.
@@ -126,20 +139,30 @@ Table IX reports the proportion of Firm A signatures crossing each candidate thr
-Table IX is a whole-sample consistency check rather than an external validation: the thresholds 0.95, dHash median, and dHash 95th percentile are themselves anchored to the whole-sample Firm A distribution described in Section III-K (the 70/30 calibration-fold thresholds of Table XI are separate and slightly different, e.g., calibration-fold cosine P5 = 0.9407 rather than the whole-sample heuristic 0.95).
-The dual rule cosine $> 0.95$ AND dHash $\leq 8$ captures 89.95% of Firm A, a value that is consistent with the dip-test-confirmed unimodal-long-tail shape of Firm A's per-signature cosine distribution (Section IV-D.1) and the 92.5% / 7.5% signature-level split (Section III-H).
+Table IX is a whole-sample consistency check rather than an external validation: the cosine cut $0.95$ and the operational dHash band edges ($\leq 5$ high-confidence cap and $\leq 15$ style-consistency boundary) are themselves anchored to the whole-sample Firm A distribution described in Section III-K (the 70/30 calibration-fold thresholds of Table XI are separate and slightly different, e.g., calibration-fold cosine P5 = 0.9407 rather than the whole-sample heuristic 0.95).
+The operational dual rule used by the five-way classifier of Section III-K---cosine $> 0.95$ AND $\text{dHash}_\text{indep} \leq 15$ (the union of the high-confidence and moderate-confidence non-hand-signed buckets)---captures 92.46% of Firm A; the high-confidence component alone (cosine $> 0.95$ AND $\text{dHash}_\text{indep} \leq 5$) captures 81.70%.
+For continuity with prior calibration-fold reporting (Section IV-F.2 reports the calibration-fold rate at the calibration-fold-P95-adjacent cut $\text{dHash}_\text{indep} \leq 8$), Table IX also lists the cosine $> 0.95$ AND $\text{dHash}_\text{indep} \leq 8$ rate of 89.95%; this is *not* the operational classifier rule but a cross-reference value.
+Both operational rates are consistent with the dip-test-confirmed unimodal-long-tail shape of Firm A's per-signature cosine distribution (Section IV-D.1) and the 92.5% / 7.5% signature-level split (Section III-H).
Section IV-F.2 reports the corresponding rates on the 30% Firm A hold-out fold, which provides the external check these whole-sample rates cannot.
## F. Pixel-Identity, Inter-CPA, and Held-Out Firm A Validation
@@ -149,7 +172,7 @@ We report three validation analyses corresponding to the anchors of Section III-
### 1) Pixel-Identity Positive Anchor with Inter-CPA Negative Anchor
Of the 182,328 extracted signatures, 310 have a same-CPA nearest match that is byte-identical after crop and normalization (pixel-identical-to-closest = 1); these form the byte-identity positive anchor---a pair-level proof of image reuse that serves as conservative ground truth for non-hand-signed signatures, subject to the source-template edge case discussed in Section V-G.
-Within Firm A specifically, 145 of these byte-identical signatures are distributed across 50 distinct partners (of 180 registered Firm A partners), with 35 of the byte-identical pairs spanning different fiscal years; this Firm A decomposition is reproduced by `signature_analysis/28_byte_identity_decomposition.py` and reported in `reports/byte_identity_decomp/byte_identity_decomposition.json` (Appendix B).
+Within Firm A specifically, 145 of these byte-identical signatures are distributed across 50 distinct partners (of 180 registered Firm A partners), with 35 of the byte-identical pairs spanning different fiscal years; the reproduction artifact for this Firm A decomposition is listed in Appendix B.
As the gold-negative anchor we sample 50,000 i.i.d. random cross-CPA signature pairs from the full 168,755-signature matched corpus (inter-CPA cosine: mean $= 0.763$, $P_{95} = 0.886$, $P_{99} = 0.915$, max $= 0.992$).
Because the positive and negative anchor populations are constructed from different sampling units (byte-identical same-CPA pairs vs random inter-CPA pairs), their relative prevalence in the combined anchor set is arbitrary, and precision / $F_1$ / recall therefore have no meaningful population interpretation.
We accordingly report FAR with Wilson 95% confidence intervals against the large inter-CPA negative anchor in Table X.
@@ -163,8 +186,8 @@ We do not report an Equal Error Rate: EER is meaningful only when the positive a
| 0.900 | 0.0250 | [0.0237, 0.0264] |
| 0.945 (calibration-fold P5 rounded) | 0.0008 | [0.0006, 0.0011] |
| 0.950 (whole-sample Firm A P7.5; operational cut) | 0.0005 | [0.0003, 0.0007] |
-| 0.973 (signature-level Beta/KDE forced-fit reference) | 0.0002 | [0.0001, 0.0004] |
-| 0.979 (signature-level Beta-2 forced-fit crossing) | 0.0001 | [0.0001, 0.0003] |
+| 0.977 (Firm A Beta-2 forced-fit crossing; Section IV-D) | 0.00014 | [0.00007, 0.00029] |
+| 0.985 (BD/McCrary candidate transition; Appendix A) | 0.00004 | [0.00001, 0.00015] |
Table note: We do not include FRR against the byte-identical positive anchor as a column here: the byte-identical subset has cosine $\approx 1$ by construction, so FRR against that subset is trivially $0$ at every threshold below $1$ and carries no biometric information beyond verifying that the threshold does not exceed $1$. The conservative-subset FRR role of the byte-identical anchor is instead discussed qualitatively in Section V-F.
-->
@@ -193,17 +216,17 @@ Table XI reports both calibration-fold and held-out-fold capture rates with Wils
| dHash_indep ≤ 8 | 94.84% [94.63%, 95.04%] | 96.13% [95.82%, 96.43%] | -6.45 | <0.001 | 42,788/45,116 | 14,739/15,332 |
| dHash_indep ≤ 9 (calib-fold P95) | 96.65% [96.48%, 96.81%] | 97.48% [97.22%, 97.71%] | -5.07 | <0.001 | 43,604/45,116 | 14,945/15,332 |
| dHash_indep ≤ 15 | 99.83% [99.79%, 99.87%] | 99.84% [99.77%, 99.89%] | -0.31 | 0.754 n.s. | 45,040/45,116 | 15,308/15,332 |
-| cosine > 0.95 AND dHash_indep ≤ 8 | 89.40% [89.12%, 89.68%] | 91.54% [91.09%, 91.97%] | -7.60 | <0.001 | 40,335/45,116 | 14,035/15,332 |
+| cosine > 0.95 AND dHash_indep ≤ 8 (calibration-fold P95-adjacent reference; P95 = 9) | 89.40% [89.12%, 89.68%] | 91.54% [91.09%, 91.97%] | -7.60 | <0.001 | 40,335/45,116 | 14,035/15,332 |
+| cosine > 0.95 AND dHash_indep ≤ 15 (operational classifier rule, Section III-K) | 92.09% [91.84%, 92.34%] | 93.56% [93.16%, 93.93%] | -5.93 | <0.001 | 41,548/45,116 | 14,344/15,332 |
Calibration-fold thresholds: Firm A cosine median = 0.9862, P1 = 0.9067, P5 = 0.9407; dHash_indep median = 2, P95 = 9. Counts and z/p values are reproducible from the supplementary materials (fixed random seed).
-->
-Table XI reports both calibration-fold and held-out-fold capture rates with Wilson 95% CIs and a two-proportion $z$-test.
We report fold-versus-fold comparisons rather than fold-versus-whole-sample comparisons, because the whole-sample rate is a weighted average of the two folds and therefore cannot, in general, fall inside the Wilson CI of either fold when the folds differ in rate; the correct generalization reference is the calibration fold, which produced the thresholds.
Under this proper test the two extreme rules agree across folds (cosine $> 0.837$ and $\text{dHash}_\text{indep} \leq 15$; both $p > 0.7$).
The operationally relevant rules in the 85–95% capture band differ between folds by 1–5 percentage points ($p < 0.001$ given the $n \approx 45\text{k}/15\text{k}$ fold sizes).
-Both folds nevertheless sit in the same replication-dominated regime: every calibration-fold rate in the 85–99% range has a held-out counterpart in the 87–99% range, and the operational dual rule cosine $> 0.95$ AND $\text{dHash}_\text{indep} \leq 8$ captures 89.40% of the calibration fold and 91.54% of the held-out fold.
+Both folds nevertheless sit in the same replication-dominated regime: every calibration-fold rate in the 85–99% range has a held-out counterpart in the 87–99% range, and the calibration-fold-adjacent reference rule cosine $> 0.95$ AND $\text{dHash}_\text{indep} \leq 8$ (the integer cut immediately below the calibration-fold dHash P95 of 9) captures 89.40% of the calibration fold and 91.54% of the held-out fold; the operational classifier rule cosine $> 0.95$ AND $\text{dHash}_\text{indep} \leq 15$ used by the five-way classifier of Section III-K captures still higher rates in both folds (calibration 92.09%, 41,548 / 45,116; held-out 93.56%, 14,344 / 15,332).
The modest fold gap is consistent with within-Firm-A heterogeneity in replication intensity: the random 30% CPA sample evidently contained proportionally more high-replication CPAs.
We therefore interpret the held-out fold as confirming the qualitative finding (Firm A is strongly replication-dominated across both folds) while cautioning that exact rates carry fold-level sampling noise that a single 30% split cannot eliminate; the threshold-independent partner-ranking analysis (Section IV-G.2) is the cross-check that is robust to this fold variance.
@@ -214,25 +237,79 @@ We report a sensitivity check in which this round-number cut is replaced by the
Table XII reports the five-way classifier output under each cut.
-At the aggregate firm-level, the operational dual rule cos $> 0.95$ AND $\text{dHash}_\text{indep} \leq 8$ captures 89.95% of whole Firm A under the 0.95 cut and 91.14% under the 0.945 cut---a shift of 1.19 percentage points.
+At the aggregate firm-level, the calibration-fold-adjacent reference dual rule cos $> 0.95$ AND $\text{dHash}_\text{indep} \leq 8$ captures 89.95% of whole Firm A under the 0.95 cut and 91.14% under the 0.945 cut---a shift of 1.19 percentage points.
+The operational classifier rule cos $> 0.95$ AND $\text{dHash}_\text{indep} \leq 15$ used by the five-way classifier of Section III-K captures 92.46% under the 0.95 cut and 93.97% under the 0.945 cut---a shift of 1.51 percentage points.
+
+Reading the wider grid in Table XII: the High-confidence and Moderate-confidence shares shift by less than 5 percentage points across the 0.940-0.950 neighbourhood, while pushing the cosine cut to 0.977 or 0.985 produces qualitatively different classifier behaviour (Moderate-confidence collapses from 26.02% at $0.95$ to 8.81% at $0.977$ and 1.32% at $0.985$, with the displaced mass landing in Uncertain rather than reclassifying out of the corpus).
+The classifier output is therefore robust to small (~0.005-cosine) perturbations of the operational cut but not to wholesale reanchoring at the threshold-estimator outputs of Section IV-D, which is consistent with our reading that those outputs are not classifier thresholds.
At the per-signature categorization level, replacing 0.95 by 0.945 reclassifies 8,508 signatures (5.04% of the corpus) out of the Uncertain band; 6,095 of them migrate to Moderate-confidence non-hand-signed, 2,294 to High-confidence non-hand-signed, and 119 to High style consistency.
The Likely-hand-signed category is unaffected because it depends only on the fixed all-pairs KDE crossover cosine $= 0.837$.
The High-confidence non-hand-signed share grows from 45.62% to 46.98%.
We interpret this sensitivity pattern as indicating that the classifier's aggregate and high-confidence output is robust to the choice of operational cut within a 0.005-cosine neighbourhood of the Firm A P7.5 anchor, and that the movement is concentrated at the Uncertain/Moderate-confidence boundary.
-The paper therefore retains cos $> 0.95$ as the primary operational cut for transparency (round-number P7.5 of the whole-sample Firm A reference distribution) and reports the 0.945 results as a sensitivity check rather than as a deployed alternative.
+
+To make the operating-point selection (Section III-K) auditable, rather than presenting it as a single fixed value, Table XII-B reports the capture-vs-FAR tradeoff over the candidate threshold grid spanning the calibration-fold P5 (0.9407), its rounded value (0.945), the operational anchor (0.95), the Firm A Beta-2 forced-fit crossing from Section IV-D.3 (0.977), and the BD/McCrary candidate transition from Section IV-D.2 (0.985).
+For each grid point we report Firm A capture (under both the cosine-only marginal and the operational dual rule cos $> t$ AND $\text{dHash}_\text{indep} \leq 15$ used by the five-way classifier of Section III-K), non-Firm-A capture (the cosine-only marginal in the 108,292 non-Firm-A matched signatures), and inter-CPA FAR with Wilson 95% CI against the 50,000-pair anchor of Section IV-F.1.
+
+
+
+Reading Table XII-B, three patterns motivate the choice of $0.95$ as the operating point.
+First, *Firm A capture* on the operational dual rule decays smoothly from 95.09% at $t = 0.9407$ to 55.26% at $t = 0.985$.
+Relaxing the cut from $0.95$ to $0.945$ buys 1.51 percentage points of additional Firm A capture, and to $0.9407$ buys 2.63 percentage points; tightening from $0.95$ to $0.977$ costs 17.96 percentage points and to $0.985$ costs 37.20 percentage points.
+The selected cut at $0.95$ is the strictest cut on this grid at which Firm A capture remains above $90\%$ on the operational dual rule.
+Second, *inter-CPA FAR* is small in absolute terms across the entire candidate grid ($0.00126$ at $0.9407$, falling to $0.00004$ at $0.985$): under any of these operating points the classifier's specificity against random cross-CPA pairs is in the per-mille range or better, so FAR alone does not determine the choice.
+The marginal FAR cost of relaxing from $0.95$ to $0.945$ is $+0.00032$ ($25 \to 41$ false positives per 50,000 pairs) and to $0.9407$ is $+0.00076$ ($25 \to 63$); the marginal FAR savings from tightening to $0.977$ and $0.985$ are $-0.00036$ and $-0.00046$ respectively.
+The FAR savings from going stricter are small in absolute terms compared with the corresponding Firm A capture loss, which makes $0.95$ a balanced operating point on this grid rather than a uniquely optimal one.
+Third, *non-Firm-A capture* (the cosine-only marginal in the 108,292 non-Firm-A signatures) decays from 67.51% at $0.945$ to 60.50% at $0.95$, 13.14% at $0.977$, and 5.73% at $0.985$.
+The Firm-A-minus-non-Firm-A gap widens with strictness through $0.977$ and then contracts (22.41 percentage points at $0.9407$; 26.46 at $0.945$; 31.97 at $0.95$; 61.36 at $0.977$; 49.54 at $0.985$): on the $0.95 \to 0.977$ segment non-Firm-A capture falls faster than Firm A capture in absolute terms ($-47.35$ vs $-17.96$ percentage points), so the widening is dominated by non-Firm-A removal rather than by an intrinsic property of Firm A; on the $0.977 \to 0.985$ segment Firm A capture falls faster than non-Firm-A's already-low residual, so the gap contracts.
+We do *not* read the gap pattern as evidence for a particular cut; it is reported here as cross-firm replication heterogeneity rather than as a selection criterion.
+The operating point at $0.95$ is therefore a defensible---not unique---selection in this neighbourhood, motivated by (i) keeping Firm A capture above $90\%$ on the operational dual rule, (ii) achieving an FAR of $0.0005$ at which marginal further savings from tightening are small relative to the corresponding capture loss, and (iii) preserving the interpretive transparency of the whole-sample Firm A P7.5 reading.
+It is *not* derived from the threshold-estimator outputs of Section IV-D, which the data do not support as classifier thresholds.
+
+The paper therefore retains cos $> 0.95$ as the primary operational cut and reports the 0.945 result of Table XII as a sensitivity check rather than as a deployed alternative; downstream document-level rates (Table XVII) and intra-report agreement (Table XVI) are robust to moderate cutoff shifts within the 0.945--0.95 neighbourhood as long as the same cutoff is applied uniformly across firms.
## G. Additional Firm A Benchmark Validation
+Before presenting the three threshold-robust analyses, Fig. 4 summarises the per-firm yearly per-signature best-match cosine distribution that motivates them.
+The left panel reports the mean per-signature best-match cosine within each firm bucket and fiscal year (a threshold-free statistic); the right panel reports the share of each firm-bucket-year with per-signature best-match cosine $\geq 0.95$ (the operational cut of Section III-K).
+Both panels show Firm A above the other Big-4 firms in every year of the 2013-2023 sample, with non-Big-4 firms below all four Big-4 firms throughout, and the cross-firm ordering is stable across the sample period.
+The mean-cosine separation between Firm A and the other Big-4 firms is on the order of 0.02-0.04 throughout the sample (e.g., 2013: Firm A $0.9733$ vs Firm B $0.9498$, Firm C $0.9464$, Firm D $0.9395$, Non-Big-4 $0.9227$; 2023: $0.9860$ vs $0.9668$, $0.9662$, $0.9525$, $0.9346$); the share-above-0.95 separation is wider (2013: Firm A $87.2\%$ vs $61.8\%$, $56.2\%$, $38.5\%$, $27.5\%$).
+This visual is the most direct cross-firm evidence in the paper that Firm A's high-similarity behaviour is firm-specific rather than corpus-wide; the three subsections below decompose this gap along three threshold-free or threshold-robust dimensions.
+
+
+
The capture rates of Section IV-E are an *internal* consistency check: they ask "how much of Firm A does our threshold capture?", but the threshold was itself derived from Firm A's percentiles, so a high capture rate is not surprising.
To go beyond this circular check, we report three further analyses, each chosen so that the *informative quantity* does not depend on the threshold's absolute value:
@@ -277,33 +354,38 @@ We test this prediction directly.
For each auditor-year (CPA $\times$ fiscal year) with at least 5 signatures we compute the mean best-match cosine similarity across the year's signatures, yielding 4,629 auditor-years across 2013-2023.
Firm A accounts for 1,287 of these (27.8% baseline share).
Table XIV reports per-firm occupancy of the top $K\%$ of the ranked distribution.
-The per-signature best-match cosine underlying each auditor-year mean is taken over the full same-CPA pool (Section III-G) and may match against signatures from other fiscal years, so the auditor-year mean reflects the year's signatures' position within the CPA's full-sample similarity structure rather than purely within-year similarity; a within-year-restricted sensitivity replication is a natural robustness check and is left to future work.
+The per-signature best-match cosine underlying each auditor-year mean is taken over the full same-CPA pool (Section III-G), consistent with the unit-of-analysis framing in Section III-G.
-Firm A occupies 95.9% of the top 10% and 90.1% of the top 25% of auditor-years by similarity, against its baseline share of 27.8%---a concentration ratio of 3.5$\times$ at the top decile and 3.2$\times$ at the top quartile.
+Firm A occupies 95.9% of the top 10%, 94.8% of the top 20%, 90.1% of the top 25%, and 81.3% of the top 30% of auditor-years by similarity, against its baseline share of 27.8%---a concentration ratio of $3.5\times$ at the top decile, $3.4\times$ at the top quintile, and $2.9\times$ at the top tercile.
+Firm A's share decays monotonically as the bracket widens (95.9% $\to$ 94.8% $\to$ 90.1% $\to$ 81.3% $\to$ 52.7% across top-10/20/25/30/50%), and only at the top 50% does its share approach its baseline; the over-representation is therefore concentrated in the very top of the distribution rather than spread uniformly through the upper half.
Year-by-year (Table XV), the top-10% Firm A share ranges from 88.4% (2020) to 100% (2013, 2014, 2017, 2018, 2019), showing that the concentration is stable across the sample period.
-
This over-representation is consistent with firm-wide non-hand-signing practice at Firm A and is not derived from any threshold we subsequently calibrate.
@@ -339,8 +421,7 @@ We note that this test uses the calibrated classifier of Section III-K rather th
## H. Classification Results
-Table XVII presents the final classification results under the dual-descriptor framework with Firm A-calibrated thresholds for 84,386 documents.
-The document count (84,386) differs from the 85,042 documents with any YOLO detection (Table III) because 656 documents have no signature whose extracted handwriting could be matched to a registered CPA name (every such signature has `assigned_accountant IS NULL` in the database, typically because the auditor's report page deviates from the standard two-signature layout or the OCRed printed CPA name was not present in the registry); the per-document classifier requires at least one CPA-matched signature so that a same-CPA best-match similarity exists, and these documents are therefore excluded from the classification reported here.
+Table XVII presents the final classification results under the dual-descriptor framework with Firm A-calibrated thresholds for 84,386 documents (656 documents excluded from the 85,042-document YOLO-detection cohort because no signature on the document could be matched to a registered CPA; see Table XVII note).
We emphasize that the document-level proportions below reflect the *worst-case aggregation rule* of Section III-K: a report carrying one stamped signature and one hand-signed signature is labeled with the most-replication-consistent of the two signature-level verdicts.
Document-level rates therefore represent the share of reports in which *at least one* signature is non-hand-signed rather than the share in which *both* are; the intra-report agreement analysis of Section IV-G.3 (Table XVI) reports how frequently the two co-signers share the same signature-level label within each firm, so that readers can judge what fraction of the non-hand-signed document-level share corresponds to fully non-hand-signed reports versus mixed reports.
@@ -354,6 +435,7 @@ Document-level rates therefore represent the share of reports in which *at least
| Likely hand-signed | 47 | 0.1% | 4 | 0.0% |
Per the worst-case aggregation rule of Section III-K, a document with two signatures inherits the most-replication-consistent of the two signature-level labels.
+The 84,386-document cohort excludes 656 documents (relative to the 85,042 YOLO-detected cohort of Table III) for which no signature could be matched to a registered CPA: the per-document classifier requires at least one CPA-matched signature so that a same-CPA best-match similarity is defined. The exclusion is definitional rather than discretionary; typical causes are auditor's-report-page formats deviating from the standard two-signature layout, or OCR returning a printed CPA name not present in the registry.
-->
Within the 71,656 documents exceeding cosine $0.95$, the dHash dimension stratifies them into three distinct populations:
@@ -366,7 +448,7 @@ A cosine-only classifier would treat all 71,656 identically; the dual-descriptor
96.9% of Firm A's documents fall into the high- or moderate-confidence non-hand-signed categories, 0.6% into high-style-consistency, and 2.5% into uncertain.
This pattern is consistent with the replication-dominated framing: the large majority is captured by non-hand-signed rules, while the small residual is consistent with the within-firm heterogeneity implied by the dip-test-confirmed unimodal-long-tail shape of Firm A's per-signature cosine distribution (Section IV-D.1) and the 7.5% signature-level left tail (Section III-H).
-The near-zero "likely hand-signed" rate (4 of 30,226 Firm A documents, 0.013%; the 30,226 count here is documents with at least one Firm A signer under the 84,386-document classification cohort, which differs from the 30,222 single-firm two-signer subset in Table XVI by 4 reports) indicates that the within-firm heterogeneity implied by the 7.5% signature-level left tail (Section IV-D) does not project into the lowest-cosine document-level category under the dual-descriptor rules; it is absorbed instead into the uncertain or high-style-consistency categories at this threshold set.
+The near-zero "likely hand-signed" rate (4 of 30,226 Firm A documents, 0.013%; the 30,226 denominator is documents with at least one Firm A signer under the 84,386-document classification cohort, which differs from the 30,222 single-firm two-signer subset of Table XVI by 4 mixed-firm reports excluded from the firm-level intra-report comparison) indicates that the within-firm heterogeneity implied by the 7.5% signature-level left tail (Section IV-D) does not project into the lowest-cosine document-level category under the dual-descriptor rules; it is absorbed instead into the uncertain or high-style-consistency categories at this threshold set.
We note that because the non-hand-signed thresholds are themselves calibrated to Firm A's empirical percentiles (Section III-H), these rates are an internal consistency check rather than an external validation; the held-out Firm A validation of Section IV-F.2 is the corresponding external check.
### 2) Cross-Firm Comparison of Dual-Descriptor Convergence
@@ -374,7 +456,7 @@ We note that because the non-hand-signed thresholds are themselves calibrated to
Among the 65,514 non-Firm-A signatures with per-signature best-match cosine $> 0.95$, 42.12% have $\text{dHash}_\text{indep} \leq 5$, compared to 88.32% of the 55,922 Firm A signatures meeting the same cosine condition---a $\sim 2.1\times$ difference that the structural-verification layer makes visible.
The Firm A denominator (55,922) matches Table IX exactly: both Table IX and the cross-firm decomposition define Firm A membership via the CPA registry (`accountants.firm`), and the cross-firm analysis additionally requires a non-null independent-min dHash record, which all 55,922 Firm A cosine-eligible signatures have in the current database.
This cross-firm gap is consistent with firm-wide non-hand-signing practice at Firm A versus partner-specific or per-engagement replication at other firms; it complements the partner-level ranking (Section IV-G.2) and intra-report consistency (Section IV-G.3) findings.
-Counts and percentages are reproduced by `signature_analysis/28_byte_identity_decomposition.py` and reported in `reports/byte_identity_decomp/byte_identity_decomposition.json` (see Appendix B for the table-to-script provenance map).
+The reproduction artifact for these counts is listed in Appendix B.
## I. Ablation Study: Feature Backbone Comparison
diff --git a/signature_analysis/21_expanded_validation.py b/signature_analysis/21_expanded_validation.py
index 5aa37da..39dea77 100644
--- a/signature_analysis/21_expanded_validation.py
+++ b/signature_analysis/21_expanded_validation.py
@@ -286,7 +286,8 @@ def main():
print(f" threshold={eer['threshold']:.4f}, EER={eer['eer']:.4f}")
# Canonical threshold evaluations with Wilson CIs
canonical = {}
- for tt in [0.70, 0.80, 0.837, 0.90, 0.945, 0.95, 0.973, 0.979]:
+ for tt in [0.70, 0.80, 0.837, 0.90, 0.9407, 0.945, 0.95, 0.973, 0.977,
+ 0.979, 0.985]:
y_pred = (scores > tt).astype(int)
m = classification_metrics(y, y_pred)
m['threshold'] = float(tt)
diff --git a/signature_analysis/30_yearly_big4_comparison.py b/signature_analysis/30_yearly_big4_comparison.py
new file mode 100644
index 0000000..7364924
--- /dev/null
+++ b/signature_analysis/30_yearly_big4_comparison.py
@@ -0,0 +1,255 @@
+#!/usr/bin/env python3
+"""
+Script 30: Yearly Per-Firm Cosine Similarity Comparison
+========================================================
+Generates the per-firm year-by-year per-signature best-match cosine
+distribution: Firm A (Deloitte), Firm B (KPMG), Firm C (PwC),
+Firm D (EY), Non-Big-4. The two-panel figure (mean cosine; share above
+0.95) is the headline cross-firm visual requested in partner review of
+v3.19.1 (2026-04-27): five lines, X-axis 2013-2023, Firm A at the top.
+
+Outputs:
+ reports/figures/fig_yearly_big4_comparison.png
+ reports/figures/fig_yearly_big4_comparison.pdf
+ reports/firm_yearly_comparison/firm_yearly_comparison.json
+ reports/firm_yearly_comparison/firm_yearly_comparison.md
+"""
+
+import json
+import sqlite3
+from datetime import datetime
+from pathlib import Path
+
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+import numpy as np
+
+DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
+FIG_OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/'
+ 'figures')
+DATA_OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/'
+ 'firm_yearly_comparison')
+FIG_OUT.mkdir(parents=True, exist_ok=True)
+DATA_OUT.mkdir(parents=True, exist_ok=True)
+
+# Anonymised firm label -> CPA-registry firm name (`accountants.firm`).
+# Any firm value not listed here falls into the 'Non-Big-4' bucket
+# (see firm_bucket()).
+FIRM_BUCKETS = [
+    ('Firm A', '勤業眾信聯合'),
+    ('Firm B', '安侯建業聯合'),
+    ('Firm C', '資誠聯合'),
+    ('Firm D', '安永聯合'),
+]
+
+# Fixed per-bucket plot colours so both panels (and re-runs) stay consistent.
+FIRM_COLORS = {
+    'Firm A': '#d62728',
+    'Firm B': '#1f77b4',
+    'Firm C': '#2ca02c',
+    'Firm D': '#9467bd',
+    'Non-Big-4': '#7f7f7f',
+}
+# Per-bucket line markers for the two panels.
+FIRM_MARKERS = {
+    'Firm A': 'o',
+    'Firm B': 's',
+    'Firm C': '^',
+    'Firm D': 'D',
+    'Non-Big-4': 'v',
+}
+# Operational cosine cut: panel (b) plots the share of per-signature
+# best-match cosines at or above this value.
+COSINE_CUT = 0.95
+
+
+def firm_bucket(firm):
+ for label, name in FIRM_BUCKETS:
+ if firm == name:
+ return label
+ return 'Non-Big-4'
+
+
+def load_rows(conn):
+    """Fetch (firm, fiscal_year, best-match cosine) for every eligible signature.
+
+    LEFT JOIN to the CPA registry on the assigned accountant's name, so
+    signatures whose accountant has no registry row get firm=NULL (these
+    bucket as 'Non-Big-4' downstream). Year is the first four characters
+    of `year_month` cast to INTEGER. Rows missing the cosine, year_month,
+    or an assigned accountant are excluded.
+    """
+    cur = conn.cursor()
+    cur.execute("""
+        SELECT a.firm,
+               CAST(substr(s.year_month, 1, 4) AS INTEGER) AS year,
+               s.max_similarity_to_same_accountant
+        FROM signatures s
+        LEFT JOIN accountants a ON s.assigned_accountant = a.name
+        WHERE s.max_similarity_to_same_accountant IS NOT NULL
+          AND s.year_month IS NOT NULL
+          AND s.assigned_accountant IS NOT NULL
+    """)
+    return cur.fetchall()
+
+
+def aggregate(rows):
+ """Returns dict keyed by (firm_label, year) -> {n, mean_cos, share_ge_cut}."""
+ by_firm_year = {}
+ for firm, year, cos in rows:
+ if year is None or year < 2013 or year > 2023:
+ continue
+ label = firm_bucket(firm)
+ key = (label, int(year))
+ by_firm_year.setdefault(key, []).append(float(cos))
+
+ summary = {}
+ for (label, year), vals in by_firm_year.items():
+ arr = np.array(vals, dtype=float)
+ summary[(label, year)] = {
+ 'n': int(arr.size),
+ 'mean_cos': float(arr.mean()),
+ 'share_ge_cut': float(np.mean(arr >= COSINE_CUT)),
+ }
+ return summary
+
+
+def plot_figure(summary, years, firm_labels, fig_path_png, fig_path_pdf):
+ fig, axes = plt.subplots(1, 2, figsize=(13, 5))
+
+ ax = axes[0]
+ for label in firm_labels:
+ ys = [summary[(label, y)]['mean_cos']
+ if (label, y) in summary else np.nan
+ for y in years]
+ ax.plot(years, ys,
+ marker=FIRM_MARKERS[label], color=FIRM_COLORS[label],
+ lw=2.0, ms=6, label=label,
+ zorder=3 if label == 'Firm A' else 2)
+ ax.set_xlabel('Fiscal year')
+ ax.set_ylabel('Mean per-signature best-match cosine')
+ ax.set_title('(a) Mean per-signature best-match cosine, by firm and year')
+ ax.set_xticks(years)
+ ax.tick_params(axis='x', rotation=0)
+ ax.grid(True, ls=':', alpha=0.4)
+ ax.legend(loc='lower right', framealpha=0.95)
+
+ ax = axes[1]
+ for label in firm_labels:
+ ys = [100.0 * summary[(label, y)]['share_ge_cut']
+ if (label, y) in summary else np.nan
+ for y in years]
+ ax.plot(years, ys,
+ marker=FIRM_MARKERS[label], color=FIRM_COLORS[label],
+ lw=2.0, ms=6, label=label,
+ zorder=3 if label == 'Firm A' else 2)
+ ax.set_xlabel('Fiscal year')
+ ax.set_ylabel(f'% signatures with best-match cosine $\\geq$ {COSINE_CUT}')
+ ax.set_title(f'(b) Share with cosine $\\geq$ {COSINE_CUT}, '
+ 'by firm and year')
+ ax.set_xticks(years)
+ ax.tick_params(axis='x', rotation=0)
+ ax.grid(True, ls=':', alpha=0.4)
+ ax.legend(loc='lower right', framealpha=0.95)
+ ax.set_ylim(0, 100)
+
+ fig.suptitle('Per-firm yearly per-signature best-match cosine '
+ '(operational cut shown as 0.95)',
+ fontsize=12, y=1.02)
+ fig.tight_layout()
+ fig.savefig(fig_path_png, dpi=200, bbox_inches='tight')
+ fig.savefig(fig_path_pdf, bbox_inches='tight')
+ plt.close(fig)
+
+
+def write_markdown(summary, years, firm_labels, md_path):
+ lines = ['# Per-Firm Yearly Cosine Comparison',
+ '',
+ f"Generated: {datetime.now().isoformat(timespec='seconds')}",
+ '',
+ ('Per-signature best-match cosine '
+ '(`max_similarity_to_same_accountant`), aggregated by firm '
+ 'bucket and fiscal year. Firm bucket via CPA registry '
+ '(`accountants.firm`).'),
+ '']
+
+ lines.append('## Mean per-signature best-match cosine')
+ lines.append('')
+ header = '| Year | ' + ' | '.join(firm_labels) + ' |'
+ sep = '|------|' + '|'.join(['------'] * len(firm_labels)) + '|'
+ lines.append(header)
+ lines.append(sep)
+ for y in years:
+ row = f'| {y} | '
+ cells = []
+ for lab in firm_labels:
+ if (lab, y) in summary:
+ cells.append(f"{summary[(lab, y)]['mean_cos']:.4f}")
+ else:
+ cells.append('---')
+ row += ' | '.join(cells) + ' |'
+ lines.append(row)
+
+ lines.append('')
+ lines.append(f'## Share with cosine $\\geq$ {COSINE_CUT}')
+ lines.append('')
+ lines.append(header)
+ lines.append(sep)
+ for y in years:
+ row = f'| {y} | '
+ cells = []
+ for lab in firm_labels:
+ if (lab, y) in summary:
+ cells.append(f"{100*summary[(lab, y)]['share_ge_cut']:.1f}%")
+ else:
+ cells.append('---')
+ row += ' | '.join(cells) + ' |'
+ lines.append(row)
+
+ lines.append('')
+ lines.append('## Per-firm signature counts')
+ lines.append('')
+ lines.append(header)
+ lines.append(sep)
+ for y in years:
+ row = f'| {y} | '
+ cells = []
+ for lab in firm_labels:
+ if (lab, y) in summary:
+ cells.append(f"{summary[(lab, y)]['n']:,}")
+ else:
+ cells.append('---')
+ row += ' | '.join(cells) + ' |'
+ lines.append(row)
+
+ md_path.write_text('\n'.join(lines) + '\n', encoding='utf-8')
+
+
+def main():
+    """Entry point: load rows, aggregate, then write figure + JSON + markdown."""
+    conn = sqlite3.connect(DB)
+    try:
+        rows = load_rows(conn)
+    finally:
+        conn.close()
+    print(f'Loaded {len(rows):,} signatures with cosine + year + firm.')
+
+    summary = aggregate(rows)
+    # Years actually present after aggregate()'s 2013-2023 filter.
+    years = sorted({y for (_, y) in summary})
+    firm_labels = ['Firm A', 'Firm B', 'Firm C', 'Firm D', 'Non-Big-4']
+
+    fig_png = FIG_OUT / 'fig_yearly_big4_comparison.png'
+    fig_pdf = FIG_OUT / 'fig_yearly_big4_comparison.pdf'
+    plot_figure(summary, years, firm_labels, fig_png, fig_pdf)
+    print(f'Wrote {fig_png}')
+    print(f'Wrote {fig_pdf}')
+
+    payload = {
+        'generated_at': datetime.now().isoformat(timespec='seconds'),
+        'database_path': DB,
+        'cosine_cut': COSINE_CUT,
+        # Dict union (Python 3.9+): bucket mapping plus the catch-all label.
+        'firm_buckets': dict(FIRM_BUCKETS) | {'Non-Big-4': 'all other'},
+        'years': years,
+        'rows': [
+            {'firm': lab, 'year': y, **summary[(lab, y)]}
+            for lab in firm_labels for y in years
+            if (lab, y) in summary
+        ],
+    }
+    json_path = DATA_OUT / 'firm_yearly_comparison.json'
+    # ensure_ascii=False keeps the CJK firm names readable in the JSON.
+    json_path.write_text(json.dumps(payload, indent=2, ensure_ascii=False),
+                         encoding='utf-8')
+    print(f'Wrote {json_path}')
+
+    md_path = DATA_OUT / 'firm_yearly_comparison.md'
+    write_markdown(summary, years, firm_labels, md_path)
+    print(f'Wrote {md_path}')
+
+
+# Script entry point: generate the figure, JSON payload, and markdown report.
+if __name__ == '__main__':
+    main()
diff --git a/signature_analysis/31_within_year_ranking_robustness.py b/signature_analysis/31_within_year_ranking_robustness.py
new file mode 100644
index 0000000..1a759b8
--- /dev/null
+++ b/signature_analysis/31_within_year_ranking_robustness.py
@@ -0,0 +1,249 @@
+#!/usr/bin/env python3
+"""
+Script 31: Within-Year Same-CPA Ranking Robustness Check
+==========================================================
+Recomputes the per-auditor-year mean cosine ranking of Table XIV using
+within-year same-CPA matching only (instead of cross-year same-CPA pool
+which Table XIV uses by construction). Reports pooled top-10/20/30%
+Firm A share under the within-year restriction so the partner-level
+ranking finding can be checked against the cross-year aggregation
+choice flagged in Section IV-G.2.
+
+Definition (within-year statistic):
+ For each signature s, with CPA = c, year = y:
+ cos_within(s) = max cosine(s, s') over s' != s, CPA(s')=c, year(s')=y
+ If a (CPA, year) block has only one signature, cos_within is undefined
+ and that signature is dropped from the auditor-year aggregation
+ (matching the same-CPA pair-existence requirement of Section III-G).
+
+Outputs:
+ reports/within_year_ranking/within_year_ranking.json
+ reports/within_year_ranking/within_year_ranking.md
+"""
+
+import json
+import sqlite3
+from collections import defaultdict
+from datetime import datetime
+from pathlib import Path
+
+import numpy as np
+
+# Source SQLite database (signatures table + CPA registry).
+DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
+# Output directory for the robustness-check JSON/markdown artifacts.
+OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/'
+           'within_year_ranking')
+OUT.mkdir(parents=True, exist_ok=True)
+
+# CPA-registry name of Firm A.
+FIRM_A = '勤業眾信聯合'
+# Minimum within-year-paired signatures for an auditor-year to enter the
+# ranking (mirrors the same-CPA pair-existence requirement, per docstring).
+MIN_SIGS_PER_AUDITOR_YEAR = 5
+
+
+def firm_bucket(firm):
+ if firm == '勤業眾信聯合':
+ return 'Firm A'
+ if firm == '安侯建業聯合':
+ return 'Firm B'
+ if firm == '資誠聯合':
+ return 'Firm C'
+ if firm == '安永聯合':
+ return 'Firm D'
+ return 'Non-Big-4'
+
+
+def load_signatures():
+ conn = sqlite3.connect(DB)
+ cur = conn.cursor()
+ cur.execute("""
+ SELECT s.signature_id, s.assigned_accountant, a.firm,
+ CAST(substr(s.year_month, 1, 4) AS INTEGER) AS year,
+ s.feature_vector
+ FROM signatures s
+ LEFT JOIN accountants a ON s.assigned_accountant = a.name
+ WHERE s.feature_vector IS NOT NULL
+ AND s.assigned_accountant IS NOT NULL
+ AND s.year_month IS NOT NULL
+ """)
+ rows = cur.fetchall()
+ conn.close()
+ return rows
+
+
+def compute_within_year_max(rows):
+    """Group by (CPA, year); for each signature in a block of >= 2,
+    compute the max cosine to the *other* signatures of the same block.
+
+    Returns (sig_max_within, sig_meta):
+      sig_max_within: sig_id -> max within-year same-CPA cosine
+      sig_meta:       sig_id -> (cpa, year, firm)
+    Singleton (CPA, year) blocks are skipped entirely, so their
+    signatures appear in neither returned dict.
+    """
+    blocks = defaultdict(list)  # (cpa, year) -> [(sig_id, feat, firm)]
+    for sig_id, cpa, firm, year, blob in rows:
+        if year is None:
+            continue
+        # NOTE(review): assumes feature_vector blobs are raw float32 --
+        # confirm against the pipeline stage that writes them.
+        feat = np.frombuffer(blob, dtype=np.float32)
+        blocks[(cpa, int(year))].append((sig_id, feat, firm))
+
+    sig_max_within = {}  # sig_id -> max within-year same-CPA cosine
+    sig_meta = {}  # sig_id -> (cpa, year, firm)
+    for (cpa, year), entries in blocks.items():
+        if len(entries) < 2:
+            continue  # singleton: max-within is undefined
+        feats = np.stack([e[1] for e in entries])  # (n, d) feature matrix
+        # Dot products; these equal cosine similarities only if the
+        # stored vectors are L2-normalised -- presumed, verify upstream.
+        sims = feats @ feats.T  # (n, n)
+        # -inf on the diagonal excludes each signature's self-match.
+        np.fill_diagonal(sims, -np.inf)
+        maxs = sims.max(axis=1)
+        for i, (sig_id, _, firm) in enumerate(entries):
+            sig_max_within[sig_id] = float(maxs[i])
+            sig_meta[sig_id] = (cpa, year, firm)
+    return sig_max_within, sig_meta
+
+
+def auditor_year_aggregation(sig_max_within, sig_meta):
+ by_ay = defaultdict(list) # (cpa, year) -> list of cos
+ for sig_id, cos in sig_max_within.items():
+ cpa, year, firm = sig_meta[sig_id]
+ by_ay[(cpa, year)].append(cos)
+ rows = []
+ for (cpa, year), vals in by_ay.items():
+ if len(vals) < MIN_SIGS_PER_AUDITOR_YEAR:
+ continue
+ firm = sig_meta[next(s for s in sig_max_within
+ if sig_meta[s][0] == cpa
+ and sig_meta[s][1] == year)][2]
+ rows.append({
+ 'acct': cpa,
+ 'year': year,
+ 'firm': firm,
+ 'cos_mean_within_year': float(np.mean(vals)),
+ 'n': len(vals),
+ })
+ return rows
+
+
+def top_k_breakdown(rows, k_pcts=(10, 20, 25, 30, 50)):
+ sorted_rows = sorted(rows, key=lambda r: -r['cos_mean_within_year'])
+ N = len(sorted_rows)
+ out = {}
+ for k_pct in k_pcts:
+ k = max(1, int(N * k_pct / 100))
+ top = sorted_rows[:k]
+ counts = defaultdict(int)
+ for r in top:
+ counts[firm_bucket(r['firm'])] += 1
+ out[f'top_{k_pct}pct'] = {
+ 'k': k,
+ 'firm_counts': dict(counts),
+ 'firm_a_share': counts['Firm A'] / k,
+ }
+ return out
+
+
+def per_year_top_k(rows, k_pcts=(10, 20, 30)):
+ years = sorted(set(r['year'] for r in rows))
+ out = {}
+ for y in years:
+ yr = [r for r in rows if r['year'] == y]
+ if not yr:
+ continue
+ sr = sorted(yr, key=lambda r: -r['cos_mean_within_year'])
+ n_y = len(sr)
+ n_a = sum(1 for r in sr if r['firm'] == FIRM_A)
+ per = {'n_auditor_years': n_y,
+ 'firm_a_baseline_share': n_a / n_y,
+ 'top_k': {}}
+ for kp in k_pcts:
+ k = max(1, int(n_y * kp / 100))
+ n_a_top = sum(1 for r in sr[:k] if r['firm'] == FIRM_A)
+ per['top_k'][f'top_{kp}pct'] = {
+ 'k': k,
+ 'firm_a_in_top': n_a_top,
+ 'firm_a_share': n_a_top / k,
+ }
+ out[y] = per
+ return out
+
+
+def main():
+    """Run the within-year robustness check; write JSON + markdown to OUT."""
+    print('Loading signatures + features...')
+    rows = load_signatures()
+    print(f'  loaded {len(rows):,}')
+
+    print('Computing within-year same-CPA max cosine...')
+    sig_max_within, sig_meta = compute_within_year_max(rows)
+    print(f'  signatures with within-year pair: {len(sig_max_within):,}')
+    n_dropped = len(rows) - len(sig_max_within)
+    print(f'  dropped (singleton within year): {n_dropped:,}')
+
+    ay_rows = auditor_year_aggregation(sig_max_within, sig_meta)
+    print(f'  auditor-years (>={MIN_SIGS_PER_AUDITOR_YEAR} sigs '
+          f'with within-year pair): {len(ay_rows):,}')
+
+    pooled = top_k_breakdown(ay_rows)
+    yearly = per_year_top_k(ay_rows)
+
+    payload = {
+        'generated_at': datetime.now().isoformat(timespec='seconds'),
+        'n_signatures_loaded': len(rows),
+        'n_signatures_with_within_year_pair': len(sig_max_within),
+        'n_singleton_dropped': n_dropped,
+        'min_sigs_per_auditor_year': MIN_SIGS_PER_AUDITOR_YEAR,
+        'n_auditor_years': len(ay_rows),
+        'n_firm_a_auditor_years': sum(1 for r in ay_rows
+                                      if r['firm'] == FIRM_A),
+        'pooled_top_k': pooled,
+        'yearly_top_k': yearly,
+    }
+    json_path = OUT / 'within_year_ranking.json'
+    # ensure_ascii=False keeps the CJK firm names readable in the JSON.
+    json_path.write_text(json.dumps(payload, indent=2, ensure_ascii=False),
+                         encoding='utf-8')
+    print(f'\nWrote {json_path}')
+
+    # Markdown
+    # NOTE(review): the baseline line below divides by len(ay_rows); an
+    # empty auditor-year set would raise ZeroDivisionError -- consider a
+    # guard if the database can be empty.
+    md = ['# Within-Year Same-CPA Ranking Robustness',
+          '',
+          f"Generated: {payload['generated_at']}",
+          '',
+          ('Per-signature best-match cosine recomputed using within-year '
+           'same-CPA matching only. See Script 31 docstring for the '
+           'precise definition.'),
+          '',
+          f"- Signatures loaded: {len(rows):,}",
+          f"- Signatures with at least one within-year same-CPA pair: "
+          f"{len(sig_max_within):,}",
+          f"- Singletons dropped (no within-year pair): {n_dropped:,}",
+          f"- Auditor-years with >= {MIN_SIGS_PER_AUDITOR_YEAR} sigs: "
+          f"{len(ay_rows):,}",
+          f"- Firm A auditor-years: {payload['n_firm_a_auditor_years']:,} "
+          f"({100*payload['n_firm_a_auditor_years']/len(ay_rows):.1f}% baseline)",
+          '',
+          '## Pooled (2013-2023) top-K Firm A share',
+          '',
+          '| Top-K | k | Firm A share | A | B | C | D | NB4 |',
+          '|-------|---|--------------|---|---|---|---|-----|']
+    for kp in [10, 20, 25, 30, 50]:
+        d = pooled[f'top_{kp}pct']
+        c = d['firm_counts']
+        md.append(f"| {kp}% | {d['k']:,} | "
+                  f"{100*d['firm_a_share']:.1f}% | "
+                  f"{c.get('Firm A', 0)} | {c.get('Firm B', 0)} | "
+                  f"{c.get('Firm C', 0)} | {c.get('Firm D', 0)} | "
+                  f"{c.get('Non-Big-4', 0)} |")
+
+    md.extend(['',
+               '## Year-by-year top-K Firm A share',
+               '',
+               '| Year | n AY | Top-10% share | Top-20% share | '
+               'Top-30% share | A baseline |',
+               '|------|------|---------------|---------------|'
+               '---------------|------------|'])
+    for y in sorted(yearly):
+        per = yearly[y]
+        line = (f"| {y} | {per['n_auditor_years']:,} ")
+        for kp in [10, 20, 30]:
+            d = per['top_k'][f'top_{kp}pct']
+            line += (f"| {100*d['firm_a_share']:.1f}% "
+                     f"({d['firm_a_in_top']}/{d['k']}) ")
+        line += f"| {100*per['firm_a_baseline_share']:.1f}% |"
+        md.append(line)
+
+    md_path = OUT / 'within_year_ranking.md'
+    md_path.write_text('\n'.join(md) + '\n', encoding='utf-8')
+    print(f'Wrote {md_path}')
+
+
+# Script entry point: run the robustness check and write the artifacts.
+if __name__ == '__main__':
+    main()