Files
pdf_signature_extraction/paper/export_v3.py
T
gbanyan 3672c9343e Phase 6 round-6: soften firm-heterogeneity framing, fix DOCX table render
Framing softening (per partner tone decision: own the limitation rather
than defend the strong claim).

Abstract: "Firm heterogeneity is decisive ... consistent with firm-level
template-like reuse" -> "The framework surfaces pronounced firm-level
heterogeneity ... consistent with firm-level template-like reuse but not
independently diagnostic, since descriptor-only data cannot separate
reuse from digitisation-pipeline or signing-style homogeneity within a
firm; we report it as a scope limitation rather than a mechanism finding."

S V-H Limitations: new bullet "Mechanism attribution for the firm-level
heterogeneity is not identifiable from descriptor-only data." enumerates
three non-mutually-exclusive firm-level mechanisms (template-like reuse /
digitisation-pipeline homogeneity / signing-style homogeneity), notes the
(cosine, dHash) descriptor pair is by construction indifferent to which
mechanism generated a near-identical pair, and lists what additional data
would be needed for attribution.

S VI Conclusion items (3) and (4): "firm heterogeneity quantification" ->
"firm-level heterogeneity surfaced by the framework ... reported as a
framework-discriminative observation rather than a mechanism finding";
item (4) expanded from template/stamp/document-production reuse alone to
the three-mechanism scope, with explicit "not independently establishing"
and S V-H cross-reference.

DOCX export fix (export_v3.py): add missing LaTeX-to-Unicode tokens
(\checkmark, \lvert/\rvert, \lVert/\rVert, \in, \notin, \max, \min,
\log, \ln, \exp, \bullet) that were silently dropping content from
Table III rows 2-4 (integer-jitter robustness check marks empty) and
Table XVIII drift column (|Delta| empty).

Rebuild Paper_A_IEEE_Access_Draft_v3.docx via export_v3.py and install
copy as Paper_A_IEEE_Access_Draft_v4.0_20260515.docx (replaces prior
pandoc-built v4 DOCX which had empty cells in every table header with
LaTeX math and inconsistent column widths). All 43 tables now have
non-empty cells with sub/superscript runs.

Mirrored in paper_a_v4_combined.md for consistency with the single-file
combined source.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-16 02:16:05 +08:00

701 lines
29 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""Export Paper A v3 (IEEE Access target) to Word, reading from v3 md section files."""
from docx import Document
from docx.shared import Inches, Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from pathlib import Path
import hashlib
import re
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
# Directory holding the v3 markdown section files; the generated DOCX and the
# equation cache live under it as well.
PAPER_DIR = Path("/Volumes/NV2/pdf_recognize/paper")
# Cache directory for rendered display-equation PNGs (see render_equation_png).
EQUATION_CACHE_DIR = PAPER_DIR / "equations"
# NOTE: this runs at import time. `parents` is not set, so PAPER_DIR itself
# must already exist or the import raises FileNotFoundError.
EQUATION_CACHE_DIR.mkdir(exist_ok=True)
# Directories the FIGURES table below resolves image files against.
FIG_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis/paper_figures")
EXTRA_FIG_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis/reports")
# Path the assembled document is saved to by main().
OUTPUT = PAPER_DIR / "Paper_A_IEEE_Access_Draft_v3.docx"
# Section files, relative to PAPER_DIR, processed by main() in list order.
SECTIONS = [
    "paper_a_abstract_v3.md",
    # paper_a_impact_statement_v3.md removed: not a standard IEEE Access
    # Regular Paper section. Content folded into cover letter / abstract.
    "paper_a_introduction_v3.md",
    "paper_a_related_work_v3.md",
    "paper_a_methodology_v3.md",
    "paper_a_results_v3.md",
    "paper_a_discussion_v3.md",
    "paper_a_conclusion_v3.md",
    # Appendix A: BD/McCrary bin-width sensitivity (see v3.7 notes).
    "paper_a_appendix_v3.md",
    # Declarations (COI / data availability / funding) before References,
    # per IEEE Access convention.
    "paper_a_declarations_v3.md",
    "paper_a_references_v3.md",
]
# Figure insertion hooks (trigger phrase -> (file, caption, width inches)).
# New figures for v3: dip test, BD/McCrary overlays, accountant GMM 2D + marginals.
# _insert_figures() does a plain substring test of each trigger phrase against
# the rendered paragraph text and appends the figure after that paragraph.
# Note the caption numbering is independent of the file names (the file
# `fig4_ablation.png` is captioned "Fig. 5").
FIGURES = {
    "Fig. 1 illustrates": (
        FIG_DIR / "fig1_pipeline.png",
        "Fig. 1. Pipeline architecture for automated non-hand-signed signature detection.",
        6.5,
    ),
    "Fig. 2 presents the cosine similarity distributions for intra-class": (
        FIG_DIR / "fig2_intra_inter_kde.png",
        "Fig. 2. Cosine similarity distributions: intra-class vs. inter-class with KDE crossover at 0.837.",
        3.5,
    ),
    "Fig. 3 presents the per-signature cosine and dHash distributions of Firm A": (
        FIG_DIR / "fig3_firm_a_calibration.png",
        "Fig. 3. Firm A per-signature cosine and dHash distributions against the overall CPA population.",
        3.5,
    ),
    "Fig. 4 summarises the per-firm yearly per-signature": (
        EXTRA_FIG_DIR / "figures" / "fig_yearly_big4_comparison.png",
        "Fig. 4. Per-firm yearly per-signature best-match cosine, 2013-2023. (a) Mean per-signature best-match cosine by firm bucket and fiscal year (threshold-free). (b) Share of per-signature best-match cosine ≥ 0.95 (operational cut of Section III-K). Five lines: Firm A, B, C, D, Non-Big-4. Firm A is above the other Big-4 firms in every year; Non-Big-4 is below all four Big-4 firms in every year.",
        6.5,
    ),
    "conducted an ablation study comparing three": (
        FIG_DIR / "fig4_ablation.png",
        "Fig. 5. Ablation study comparing three feature extraction backbones.",
        6.5,
    ),
}
def strip_comments(text):
    """Strip HTML comments from `text`, unwrapping the ones that hold a table.

    A comment whose first non-blank line begins with `TABLE ` (or `TABLE`
    followed by a tab) is not deleted. That first line, stripped, is taken as
    the caption and re-emitted as a marker line

        __TABLE_CAPTION__:<caption>

    preceded by two newlines; every line after it inside the comment (the
    markdown table body in the v3 sources) is kept verbatim below the marker
    and followed by one newline. process_section recognises the marker and
    renders the caption above the table.

    Every other comment, including empty or whitespace-only ones, is replaced
    by the empty string. Matching is non-greedy and spans newlines.
    """
    def _rewrite(comment):
        inner_lines = comment.group(1).splitlines()
        # Index of the first line carrying any non-whitespace content.
        caption_at = next(
            (n for n, ln in enumerate(inner_lines) if ln.strip()), None)
        if caption_at is None:
            return ""
        caption = inner_lines[caption_at].strip()
        if not caption.startswith(("TABLE ", "TABLE\t")):
            return ""
        remainder = "\n".join(inner_lines[caption_at + 1:])
        # Leading blank lines make the marker start its own paragraph.
        return f"\n\n__TABLE_CAPTION__:{caption}\n{remainder}\n"

    return re.sub(r"<!--(.*?)-->", _rewrite, text, flags=re.DOTALL)
# ---------------------------------------------------------------------------
# LaTeX → plain text + Unicode conversion
# ---------------------------------------------------------------------------
# The v3 markdown sources contain inline LaTeX ($...$) and a small number of
# display-math blocks ($$...$$). Pandoc would render these natively; the
# python-docx pipeline used here does not, so without preprocessing every
# `\leq`, `\text{dHash}_\text{indep}`, `\Delta\text{BIC}`, `60{,}448`, etc.
# leaks into the DOCX as raw LaTeX. The helpers below convert the common
# inline cases to Unicode and split subscripts/superscripts into proper Word
# runs. Display-math (rare; 3 equations in this paper) gets a best-effort
# linearisation and is acceptable for a partner-handoff DOCX; final IEEE
# typesetting is handled by the publisher's LaTeX/MathType pipeline.
# (regex, replacement) pairs applied in order by latex_to_unicode. Every
# alphabetic command carries a `(?![A-Za-z])` lookahead so that a short name
# never fires inside a longer one (`\in` vs `\infty`, `\to` vs `\top`).
# Glyphs outside Latin-1/Greek are written as `\uXXXX` escapes so they cannot
# be lost to an editor, web view or copy/paste that drops "ambiguous"
# characters; an empty replacement silently deletes the symbol from the DOCX.
LATEX_TOKEN_REPLACEMENTS = [
    # Greek letters (lower)
    (r"\\alpha(?![A-Za-z])", "α"), (r"\\beta(?![A-Za-z])", "β"), (r"\\gamma(?![A-Za-z])", "γ"),
    (r"\\delta(?![A-Za-z])", "δ"), (r"\\epsilon(?![A-Za-z])", "ε"), (r"\\zeta(?![A-Za-z])", "ζ"),
    (r"\\eta(?![A-Za-z])", "η"), (r"\\theta(?![A-Za-z])", "θ"), (r"\\iota(?![A-Za-z])", "ι"),
    (r"\\kappa(?![A-Za-z])", "κ"), (r"\\lambda(?![A-Za-z])", "λ"), (r"\\mu(?![A-Za-z])", "μ"),
    (r"\\nu(?![A-Za-z])", "ν"), (r"\\xi(?![A-Za-z])", "ξ"), (r"\\pi(?![A-Za-z])", "π"),
    (r"\\rho(?![A-Za-z])", "ρ"), (r"\\sigma(?![A-Za-z])", "σ"), (r"\\tau(?![A-Za-z])", "τ"),
    (r"\\phi(?![A-Za-z])", "φ"), (r"\\chi(?![A-Za-z])", "χ"), (r"\\psi(?![A-Za-z])", "ψ"),
    (r"\\omega(?![A-Za-z])", "ω"),
    # Greek letters (upper, only those distinguishable from Latin)
    (r"\\Gamma(?![A-Za-z])", "Γ"), (r"\\Delta(?![A-Za-z])", "Δ"), (r"\\Theta(?![A-Za-z])", "Θ"),
    (r"\\Lambda(?![A-Za-z])", "Λ"), (r"\\Xi(?![A-Za-z])", "Ξ"), (r"\\Pi(?![A-Za-z])", "Π"),
    (r"\\Sigma(?![A-Za-z])", "Σ"), (r"\\Phi(?![A-Za-z])", "Φ"), (r"\\Psi(?![A-Za-z])", "Ψ"),
    (r"\\Omega(?![A-Za-z])", "Ω"),
    # Relations / arrows
    (r"\\leq(?![A-Za-z])", "\u2264"),         # less-than or equal to
    (r"\\geq(?![A-Za-z])", "\u2265"),         # greater-than or equal to
    (r"\\neq(?![A-Za-z])", "\u2260"),         # not equal to
    (r"\\approx(?![A-Za-z])", "\u2248"),      # almost equal to
    (r"\\equiv(?![A-Za-z])", "\u2261"),       # identical to
    (r"\\sim(?![A-Za-z])", "~"),
    (r"\\in(?![A-Za-z])", "\u2208"),          # element of
    (r"\\notin(?![A-Za-z])", "\u2209"),       # not an element of
    (r"\\to(?![A-Za-z])", "\u2192"),          # rightwards arrow
    (r"\\rightarrow(?![A-Za-z])", "\u2192"),  # rightwards arrow
    (r"\\leftarrow(?![A-Za-z])", "\u2190"),   # leftwards arrow
    (r"\\Rightarrow(?![A-Za-z])", "\u21d2"),  # rightwards double arrow
    (r"\\Leftarrow(?![A-Za-z])", "\u21d0"),   # leftwards double arrow
    # Delimiters spelled as commands
    (r"\\lvert(?![A-Za-z])", "|"), (r"\\rvert(?![A-Za-z])", "|"),
    (r"\\lVert(?![A-Za-z])", "\u2016"),       # double vertical line
    (r"\\rVert(?![A-Za-z])", "\u2016"),       # double vertical line
    # Symbols / operators that linearise to text
    (r"\\checkmark(?![A-Za-z])", "\u2713"),   # check mark
    (r"\\bullet(?![A-Za-z])", "\u2022"),      # bullet
    (r"\\max(?![A-Za-z])", "max"), (r"\\min(?![A-Za-z])", "min"),
    (r"\\log(?![A-Za-z])", "log"), (r"\\ln(?![A-Za-z])", "ln"),
    (r"\\exp(?![A-Za-z])", "exp"),
    # Binary operators
    (r"\\times(?![A-Za-z])", "×"), (r"\\cdot(?![A-Za-z])", "·"),
    (r"\\pm(?![A-Za-z])", "±"),
    (r"\\mp(?![A-Za-z])", "\u2213"),          # minus-or-plus sign
    (r"\\div(?![A-Za-z])", "÷"),
    # Misc
    (r"\\infty(?![A-Za-z])", "\u221e"),       # infinity
    (r"\\partial(?![A-Za-z])", "\u2202"),     # partial differential
    (r"\\sum(?![A-Za-z])", "\u2211"),         # n-ary summation
    (r"\\prod(?![A-Za-z])", "\u220f"),        # n-ary product
    (r"\\int(?![A-Za-z])", "\u222b"),         # integral
    (r"\\ldots(?![A-Za-z])", "\u2026"),       # horizontal ellipsis
    (r"\\dots(?![A-Za-z])", "\u2026"),        # horizontal ellipsis
    # Spacing commands (drop or replace with single space)
    (r"\\,", " "), (r"\\;", " "), (r"\\:", " "),
    (r"\\!", ""), (r"\\ ", " "),
    (r"\\quad(?![A-Za-z])", " "), (r"\\qquad(?![A-Za-z])", " "),
    # Escaped punctuation
    (r"\\%", "%"), (r"\\#", "#"), (r"\\&", "&"),
    (r"\\\$", "$"), (r"\\_", "_"),
]


def _unwrap_command(text, cmd):
    """Repeatedly replace `\\cmd{X}` with `X` until the text stops changing.

    The argument pattern excludes braces, so nested uses are peeled from the
    innermost outwards, one layer per pass.
    """
    pat = re.compile(r"\\" + cmd + r"\{([^{}]*)\}")
    prev = None
    while prev != text:
        prev = text
        text = pat.sub(r"\1", text)
    return text


# Single-character sentinels that bracket math regions in converted text.
# Private Use Area code points: XML-safe, and never present in normal prose.
# They are spelled as escapes because the literal characters are invisible and
# easily lost; an empty sentinel would make math-region detection meaningless.
MATH_START = "\ue000"
MATH_END = "\ue001"


def latex_to_unicode(text):
    """Convert a LaTeX-laced markdown paragraph into plain text.

    Math context is preserved with private-use sentinel characters
    (MATH_START / MATH_END) so the downstream run-splitter only treats
    `_X` / `^X` as subscript / superscript inside math regions; in body
    text underscores in identifiers like `signature_analysis` survive.

    Text containing neither `$` nor a backslash is returned unchanged.
    A backslash-escaped dollar sign is a literal dollar, never a math
    delimiter.
    """
    if "$" not in text and "\\" not in text:
        return text
    # 1. Strip display-math delimiters first (keep the inner content for
    #    best-effort linearisation), wrapping math regions with sentinels.
    #    Then strip inline math delimiters with the same sentinel wrapping.
    #    The `(?<!\\)` lookbehinds keep `\$` from opening or closing a region,
    #    so "costs \$5 and \$6" is not mistaken for inline math.
    text = re.sub(r"(?<!\\)\$\$([\s\S]+?)\$\$",
                  lambda m: MATH_START + m.group(1) + MATH_END, text)
    text = re.sub(r"(?<!\\)\$([^$]+?)(?<!\\)\$",
                  lambda m: MATH_START + m.group(1) + MATH_END, text)
    # 2. Replace token-level commands with Unicode glyphs *before* unwrapping
    #    `\text{...}` and friends, so that `\Delta\text{BIC}` becomes
    #    `Δ\text{BIC}` (then `ΔBIC`) rather than `\DeltaBIC` which would be
    #    stripped wholesale by the cleanup pass.
    for pat, repl in LATEX_TOKEN_REPLACEMENTS:
        text = re.sub(pat, repl, text)
    # 3. Unwrap formatting / text commands (innermost first via _unwrap loop).
    for cmd in ("text", "mathbf", "mathit", "mathrm", "mathsf", "mathtt",
                "operatorname", "emph", "textbf", "textit"):
        text = _unwrap_command(text, cmd)
    # 4. \frac{a}{b} → (a)/(b); \sqrt{x} → √(x). Apply repeatedly to handle
    #    one level of nesting; deeper nesting is rare in this paper.
    #    \tfrac and \dfrac are accepted as spellings of \frac.
    for _ in range(3):
        text = re.sub(
            r"\\[td]?frac\{([^{}]+)\}\{([^{}]+)\}",
            r"(\1)/(\2)",
            text,
        )
        text = re.sub(r"\\sqrt\{([^{}]+)\}", r"√(\1)", text)
    # 5. TeX braces used purely for spacing/grouping: K{=}3 → K=3,
    #    60{,}448 → 60,448, 10{,}175 → 10,175.
    text = re.sub(r"\{([=<>+\-,])\}", r"\1", text)
    # 6. Strip any remaining `\cmd{...}` (best effort) and `\cmd ` tokens.
    text = re.sub(r"\\[a-zA-Z]+\{([^{}]*)\}", r"\1", text)
    text = re.sub(r"\\[a-zA-Z]+(?![A-Za-z])", "", text)
    # 7. Collapse runs of whitespace introduced by command stripping.
    text = re.sub(r"[ \t]{2,}", " ", text)
    return text
# Matches one subscript / superscript marker. Exactly one of the four groups
# participates in any match: groups 1 and 3 capture subscript text (braced and
# single-character forms), groups 2 and 4 capture superscript text. _emit_math
# relies on this numbering. Braced arguments may be empty and cannot nest.
_SUBSUP_PATTERN = re.compile(
    r"_\{([^{}]*)\}"          # _{...}
    r"|\^\{([^{}]*)\}"        # ^{...}
    r"|_([A-Za-z0-9+\-])"     # _X  (single token)
    r"|\^([A-Za-z0-9+\-])"    # ^X  (single token)
)
def _emit_plain(paragraph, text, font_name, font_size, bold, italic):
    """Append `text` to `paragraph` as a single run with the given font
    name, size, bold and italic settings. Empty text adds no run."""
    if text:
        run = paragraph.add_run(text)
        font = run.font
        font.name = font_name
        font.size = font_size
        run.bold = bold
        run.italic = italic
def _emit_math(paragraph, text, font_name, font_size, bold, italic):
    """Emit `text` from a math region: split on `_X` / `_{X}` / `^X` / `^{X}`
    and render those as Word subscripts / superscripts.

    Text between markers is emitted as ordinary runs via _emit_plain. A marker
    with an empty braced argument (`_{}` / `^{}`) is consumed without adding a
    run.
    """
    if "_" not in text and "^" not in text:
        _emit_plain(paragraph, text, font_name, font_size, bold, italic)
        return
    pos = 0
    for m in _SUBSUP_PATTERN.finditer(text):
        if m.start() > pos:
            _emit_plain(paragraph, text[pos:m.start()],
                        font_name, font_size, bold, italic)
        # Exactly one alternative of the pattern participates per match, so
        # `lastindex` names it: groups 1/3 are subscripts, 2/4 superscripts.
        # Dispatching on the group index (rather than on the truthiness of
        # the captured text) keeps an empty `_{}` from being misread as a
        # superscript with no text.
        script = m.group(m.lastindex)
        if script:
            run = paragraph.add_run(script)
            if m.lastindex in (1, 3):
                run.font.subscript = True
            else:
                run.font.superscript = True
            run.font.name = font_name
            run.font.size = font_size
            run.bold = bold
            run.italic = italic
        pos = m.end()
    if pos < len(text):
        _emit_plain(paragraph, text[pos:],
                    font_name, font_size, bold, italic)
def add_text_with_subsup(paragraph, text, font_name="Times New Roman",
                         font_size=Pt(10), bold=False, italic=False):
    """Add `text` to `paragraph`. Subscript/superscript handling is scoped to
    math regions delimited by MATH_START / MATH_END sentinels (set up by
    `latex_to_unicode`). Outside math regions, underscores and carets are
    preserved literally so identifiers like `signature_analysis` and
    `paper_a_results_v3.md` survive intact.

    A MATH_START without a matching MATH_END is treated as unterminated: the
    remainder of the text is emitted as plain text.
    """
    # A zero-length sentinel would "match" at every offset and the scan below
    # could not make progress, so in that case (and when the text simply has
    # no math region) everything is emitted as plain text.
    if not MATH_START or not MATH_END or MATH_START not in text:
        _emit_plain(paragraph, text, font_name, font_size, bold, italic)
        return
    start_len = len(MATH_START)
    end_len = len(MATH_END)
    pos = 0
    while pos < len(text):
        s = text.find(MATH_START, pos)
        if s == -1:
            _emit_plain(paragraph, text[pos:],
                        font_name, font_size, bold, italic)
            break
        if s > pos:
            _emit_plain(paragraph, text[pos:s],
                        font_name, font_size, bold, italic)
        body_from = s + start_len
        e = text.find(MATH_END, body_from)
        if e == -1:
            # Unterminated math region — emit rest as plain.
            _emit_plain(paragraph, text[body_from:],
                        font_name, font_size, bold, italic)
            break
        _emit_math(paragraph, text[body_from:e],
                   font_name, font_size, bold, italic)
        pos = e + end_len
# ---------------------------------------------------------------------------
# Display-equation rendering (matplotlib mathtext → PNG → embedded image)
# ---------------------------------------------------------------------------
# matplotlib mathtext is a subset of LaTeX. A few common TeX-only macros need
# to be substituted with mathtext-supported equivalents before parsing.
def _operatorname_to_mathrm(match):
    """Rewrite one `\\operatorname{X}` match as `\\mathrm{X}`."""
    return r"\mathrm{" + match.group(1) + "}"


# (compiled pattern, replacement) pairs applied in order by
# _sanitise_for_mathtext.
_MATHTEXT_SUBS = [
    (re.compile(source), replacement)
    for source, replacement in (
        (r"\\tfrac\b", r"\\frac"),                               # text-frac → frac
        (r"\\dfrac\b", r"\\frac"),                               # display-frac → frac
        (r"\\operatorname\{([^{}]+)\}", _operatorname_to_mathrm),  # operatorname → mathrm
        (r"\\,", " "),                                           # thin space
        (r"\\;", " "),
        (r"\\!", ""),
    )
]


def _sanitise_for_mathtext(latex: str) -> str:
    """Return `latex` with every _MATHTEXT_SUBS substitution applied in turn."""
    sanitised = latex
    for regex, substitute in _MATHTEXT_SUBS:
        sanitised = regex.sub(substitute, sanitised)
    return sanitised
def render_equation_png(latex: str, fontsize: int = 14) -> Path:
    """Render a LaTeX math expression to a tightly-cropped PNG using
    matplotlib mathtext, with content-addressed caching so a re-build only
    re-renders changed equations. Returns the cached PNG path.

    The cache key is a truncated SHA-1 of the sanitised expression plus the
    font size; it is used purely as a file name, not for security. Any
    exception raised while rendering propagates to the caller.
    """
    sanitised = _sanitise_for_mathtext(latex.strip())
    digest = hashlib.sha1(
        (sanitised + f"|fs{fontsize}").encode("utf-8")).hexdigest()[:16]
    out_path = EQUATION_CACHE_DIR / f"eq_{digest}.png"
    if out_path.exists():
        return out_path
    fig = plt.figure(figsize=(8, 1.6))
    try:
        fig.text(0.5, 0.5, f"${sanitised}$",
                 fontsize=fontsize, ha="center", va="center")
        fig.savefig(str(out_path), dpi=220, bbox_inches="tight",
                    pad_inches=0.05)
    except Exception:
        # Defensive: if a file was created before the failure, remove it so
        # the next build does not treat a bad PNG as a cache hit.
        if out_path.exists():
            out_path.unlink()
        raise
    finally:
        # Always release the figure; pyplot keeps figures alive until closed,
        # so an equation that fails to render would otherwise leak one.
        plt.close(fig)
    return out_path
def add_equation_block(doc, latex: str, equation_number: int,
                       width_inches: float = 4.5):
    """Append a display equation to `doc` as a centered paragraph.

    The equation is rendered to a PNG by render_equation_png and inserted
    `width_inches` wide, with 6 pt of space before and after the paragraph.
    The label `(N)` is appended to the same paragraph after a tab character;
    no tab stop is configured here.
    """
    png_path = render_equation_png(latex)
    para = doc.add_paragraph()
    para.alignment = WD_ALIGN_PARAGRAPH.CENTER
    spacing = para.paragraph_format
    spacing.space_before = Pt(6)
    spacing.space_after = Pt(6)
    para.add_run().add_picture(str(png_path), width=Inches(width_inches))
    # Equation number on the same paragraph, separated by a tab.
    label = para.add_run(f"\t({equation_number})")
    label.font.name = "Times New Roman"
    label.font.size = Pt(10)
def add_md_table(doc, table_lines):
    """Render a markdown pipe table as a "Table Grid" Word table.

    `table_lines` are the raw source lines of one table (header, delimiter
    row, body rows). Delimiter rows are discarded. The column count comes
    from the first remaining row; cells beyond that count in later rows are
    ignored and short rows leave trailing cells empty. Nothing is added when
    fewer than two rows remain. Cell text has markdown emphasis markers
    removed and LaTeX converted; the first row is bold. An empty paragraph
    is appended after the table as spacing.
    """
    rows_data = []
    for line in table_lines:
        # Strip surrounding whitespace *before* the outer pipes: otherwise an
        # indented line (or one with trailing spaces) keeps its outer pipe
        # and produces a phantom empty first/last column.
        cells = [c.strip() for c in line.strip().strip("|").split("|")]
        # A delimiter row consists solely of dash cells (optionally with
        # alignment colons). Checking every cell, not just the first, keeps
        # data rows whose first cell is a lone "-" placeholder.
        if all(re.fullmatch(r":?-+:?", c) for c in cells):
            continue
        rows_data.append(cells)
    if len(rows_data) < 2:
        return
    ncols = len(rows_data[0])
    table = doc.add_table(rows=len(rows_data), cols=ncols)
    table.style = "Table Grid"
    for r_idx, row in enumerate(rows_data):
        for c_idx in range(min(len(row), ncols)):
            cell = table.rows[r_idx].cells[c_idx]
            raw = row[c_idx]
            # Strip markdown emphasis markers; convert LaTeX before rendering.
            raw = re.sub(r"\*\*\*(.+?)\*\*\*", r"\1", raw)
            raw = re.sub(r"\*\*(.+?)\*\*", r"\1", raw)
            raw = re.sub(r"\*(.+?)\*", r"\1", raw)
            raw = re.sub(r"`(.+?)`", r"\1", raw)
            cell_text = latex_to_unicode(raw)
            # Replace the default empty paragraph with one we control.
            cell.text = ""
            cp = cell.paragraphs[0]
            cp.alignment = WD_ALIGN_PARAGRAPH.CENTER
            add_text_with_subsup(
                cp, cell_text,
                font_name="Times New Roman",
                font_size=Pt(8),
                bold=(r_idx == 0),
            )
    doc.add_paragraph()
def _insert_figures(doc, para_text):
    """Append every figure whose trigger phrase occurs in `para_text`.

    For each FIGURES entry whose key is a substring of `para_text`, a centered
    picture paragraph and a centered italic 9 pt caption paragraph are added
    to `doc`. If the image file does not exist the figure is skipped and a
    warning is printed, so a missing figure is visible in the build output
    instead of silently absent from the DOCX.
    """
    for trigger, (fig_path, caption, width) in FIGURES.items():
        if trigger not in para_text:
            continue
        if not Path(fig_path).exists():
            print(f"WARNING: missing figure file: {fig_path}")
            continue
        fp = doc.add_paragraph()
        fp.alignment = WD_ALIGN_PARAGRAPH.CENTER
        fr = fp.add_run()
        fr.add_picture(str(fig_path), width=Inches(width))
        cp = doc.add_paragraph()
        cp.alignment = WD_ALIGN_PARAGRAPH.CENTER
        cr = cp.add_run(caption)
        cr.font.size = Pt(9)
        cr.font.name = "Times New Roman"
        cr.italic = True
def _strip_inline_markdown(text):
    """Remove markdown emphasis / code markers (`***x***`, `**x**`, `*x*`,
    and backtick spans), keeping the enclosed text."""
    text = re.sub(r"\*\*\*(.+?)\*\*\*", r"\1", text)
    text = re.sub(r"\*\*(.+?)\*\*", r"\1", text)
    text = re.sub(r"\*(.+?)\*", r"\1", text)
    return re.sub(r"`(.+?)`", r"\1", text)


def _add_hanging_item(doc, text):
    """Append a list-item paragraph (0.4 in left indent, -0.25 in first-line
    indent, 4 pt after) containing `text`."""
    p = doc.add_paragraph()
    p.paragraph_format.left_indent = Inches(0.4)
    p.paragraph_format.first_line_indent = Inches(-0.25)
    p.paragraph_format.space_after = Pt(4)
    add_text_with_subsup(p, text)


def process_section(doc, filepath, equation_counter=None):
    """Process one v3 markdown section. `equation_counter` is a single-element
    list (used as a mutable counter shared across sections) tracking the
    running display-equation number.

    The file is read as UTF-8, HTML comments are handled by strip_comments,
    and the text is walked line by line. Recognised constructs, in the order
    they are tested: `#` / `##` / `###` headings, table-caption marker lines,
    pipe tables, `$$` display equations, numbered list items, `- ` bullets;
    anything else starts a regular paragraph that runs until a blank line or
    a line starting a heading, table row, bullet or numbered item. After each
    regular paragraph, matching figures are inserted.
    """
    if equation_counter is None:
        equation_counter = [0]
    text = filepath.read_text(encoding="utf-8")
    text = strip_comments(text)
    lines = text.split("\n")
    # Defensive blockquote handling: markdown blockquote lines (`> body`) are
    # not rendered as Word callout blocks here, but stripping the leading
    # `> ` keeps the body text from leaking the literal `>` and the empty
    # `>` separator lines into the DOCX.
    cleaned = []
    for ln in lines:
        s = ln.lstrip()
        if s == ">" or s.startswith("> "):
            cleaned.append(ln[ln.index(">") + 1:].lstrip() if "> " in ln else "")
        else:
            cleaned.append(ln)
    lines = cleaned
    i = 0
    while i < len(lines):
        stripped = lines[i].strip()
        if not stripped:
            i += 1
            continue
        # Headings, levels 1-3 only: one to three `#` followed by a space.
        heading = re.match(r"(#{1,3}) ", stripped)
        if heading:
            level = len(heading.group(1))
            title = latex_to_unicode(stripped[level + 1:])
            # Headings are added as a single run, so drop the math sentinels.
            title = title.replace(MATH_START, "").replace(MATH_END, "")
            h = doc.add_heading(title, level=level)
            for run in h.runs:
                run.font.color.rgb = RGBColor(0, 0, 0)
            i += 1
            continue
        if stripped.startswith("__TABLE_CAPTION__:"):
            caption_text = stripped[len("__TABLE_CAPTION__:"):].strip()
            caption_text = latex_to_unicode(caption_text)
            cp = doc.add_paragraph()
            cp.alignment = WD_ALIGN_PARAGRAPH.CENTER
            cp.paragraph_format.space_before = Pt(6)
            cp.paragraph_format.space_after = Pt(2)
            add_text_with_subsup(
                cp, caption_text,
                font_name="Times New Roman",
                font_size=Pt(9),
                bold=True,
            )
            i += 1
            continue
        if "|" in stripped and i + 1 < len(lines) and re.match(r"\s*\|[-|: ]+\|", lines[i + 1]):
            table_lines = []
            while i < len(lines) and "|" in lines[i]:
                table_lines.append(lines[i])
                i += 1
            add_md_table(doc, table_lines)
            continue
        # Display math: a line starting with `$$` is treated as an equation
        # block and rendered as an embedded mathtext PNG with an
        # auto-incrementing equation number.
        if stripped.startswith("$$"):
            # Sentence punctuation may follow the closing delimiter
            # (`$$ ... $$,`). Ignore it when deciding whether the equation
            # closes on this line; otherwise such a line would be taken for
            # the opening of a multi-line block and swallow the lines after.
            head = stripped.rstrip(" ,.;")
            single_line = head.count("$$") >= 2 and head.endswith("$$")
            # Accumulate until a closing $$ is found (single line in our
            # corpus, but defensively support multi-line just in case).
            buf = [stripped]
            if not single_line:
                while i + 1 < len(lines):
                    i += 1
                    buf.append(lines[i])
                    if "$$" in lines[i]:
                        break
            joined = "\n".join(buf).strip()
            # Drop punctuation trailing the closing `$$`, but only when that
            # actually exposes the delimiter.
            trimmed = joined.rstrip(" ,.;")
            if trimmed.endswith("$$"):
                joined = trimmed
            # Strip the leading and trailing $$ delimiters and any trailing
            # punctuation (e.g. the `,` that some equation lines end with).
            inner = joined
            if inner.startswith("$$"):
                inner = inner[2:]
            if inner.endswith("$$"):
                inner = inner[:-2]
            inner = inner.rstrip(", ")
            equation_counter[0] += 1
            try:
                add_equation_block(doc, inner, equation_counter[0])
            except Exception as exc:
                # Fallback: render as plain centered Times-Roman line so the
                # build doesn't fail on a single un-renderable equation.
                p = doc.add_paragraph()
                p.alignment = WD_ALIGN_PARAGRAPH.CENTER
                run = p.add_run(f"[equation render failed: {exc}] {inner}")
                run.font.name = "Times New Roman"
                run.font.size = Pt(10)
                run.italic = True
            i += 1
            continue
        numbered = re.match(r"^(\d+)\.\s+(.*)$", stripped)
        if numbered:
            # Manual numbering: keep the number from the markdown source and
            # apply a hanging-indent paragraph format. Avoids python-docx's
            # `style='List Number'` which depends on a properly-set-up
            # numbering definition that the default Document() lacks.
            num, content = numbered.group(1), numbered.group(2)
            content = latex_to_unicode(_strip_inline_markdown(content))
            _add_hanging_item(doc, f"{num}. {content}")
            i += 1
            continue
        if stripped.startswith("- "):
            # Manual bullets with hanging indent (same rationale as numbered).
            content = latex_to_unicode(_strip_inline_markdown(stripped[2:]))
            _add_hanging_item(doc, f"{content}")
            i += 1
            continue
        # Regular paragraph
        para_lines = [stripped]
        i += 1
        while i < len(lines):
            nxt = lines[i].strip()
            if (
                not nxt
                or nxt.startswith("#")
                or nxt.startswith("|")
                or nxt.startswith("- ")
                or re.match(r"^\d+\.\s", nxt)
            ):
                break
            para_lines.append(nxt)
            i += 1
        para_text = " ".join(para_lines)
        para_text = _strip_inline_markdown(para_text)
        para_text = para_text.replace("---", "\u2014")
        para_text = latex_to_unicode(para_text)
        p = doc.add_paragraph()
        p.paragraph_format.space_after = Pt(6)
        add_text_with_subsup(p, para_text)
        _insert_figures(doc, para_text)
def main():
    """Build the DOCX: title page, then every section file that exists, then
    save to OUTPUT and run the linter."""
    doc = Document()
    normal = doc.styles["Normal"]
    normal.font.name = "Times New Roman"
    normal.font.size = Pt(10)

    def centered_run(text, gap_after):
        # New centered paragraph with `gap_after` points of trailing space;
        # returns its single run so the caller can style it.
        para = doc.add_paragraph()
        para.alignment = WD_ALIGN_PARAGRAPH.CENTER
        para.paragraph_format.space_after = Pt(gap_after)
        return para.add_run(text)

    # Title page
    title = centered_run(
        "Automated Identification of Non-Hand-Signed Auditor Signatures\n"
        "in Large-Scale Financial Audit Reports:\n"
        "A Dual-Descriptor Framework with Replication-Dominated Calibration",
        12,
    )
    title.font.size = Pt(16)
    title.font.name = "Times New Roman"
    title.bold = True

    # IEEE Access uses single-anonymized review: author / affiliation
    # / corresponding-author block must appear on the title page in the
    # final submission. Fill these placeholders with real metadata
    # before submitting the generated DOCX.
    authors = centered_run("[AUTHOR NAMES — fill in before submission]", 6)
    authors.font.size = Pt(11)

    affiliations = centered_run(
        "[Affiliations and corresponding-author email — fill in before submission]", 6)
    affiliations.font.size = Pt(10)
    affiliations.italic = True

    target = centered_run(
        "Target journal: IEEE Access (Regular Paper, single-anonymized review)", 20)
    target.font.size = Pt(10)
    target.italic = True

    # One counter shared by all sections keeps equation numbers running.
    equation_counter = [0]
    for section_file in SECTIONS:
        filepath = PAPER_DIR / section_file
        if not filepath.exists():
            print(f"WARNING: missing section file: {filepath}")
            continue
        process_section(doc, filepath, equation_counter=equation_counter)

    doc.save(str(OUTPUT))
    print(f"Saved: {OUTPUT}")
    _run_linter()
def _run_linter():
    """Lint the freshly built DOCX and print a one-line summary. Non-fatal
    with respect to the import: if `lint_paper_v3` cannot be imported the
    lint is skipped with a notice. For full output run
    `python3 paper/lint_paper_v3.py`."""
    try:
        import lint_paper_v3  # local module
    except Exception as exc:  # pragma: no cover
        print(f"(lint skipped: {exc})")
        return
    findings = lint_paper_v3.lint_docx(OUTPUT)

    def tally(level):
        # Number of findings reported at the given severity.
        return sum(1 for f in findings if f.severity == level)

    errors = tally("ERROR")
    warns = tally("WARN")
    infos = tally("INFO")
    if errors:
        print(f"\n[lint] {errors} ERROR finding(s) in DOCX — run "
              f"`python3 paper/lint_paper_v3.py --docx` for details.")
        return
    if warns or infos:
        print(f"[lint] DOCX clean of ERRORs ({warns} WARN, {infos} INFO).")
        return
    print("[lint] DOCX clean.")
# Build the document only when executed as a script, not when imported.
if __name__ == "__main__":
    main()