#!/usr/bin/env python3
"""Export Paper A v3 (IEEE Access target) to Word, reading from v3 md section files."""
from docx import Document
from docx.shared import Inches, Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from pathlib import Path
import hashlib
import re
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
# Directory the markdown section files are read from (see main()); the
# generated DOCX is written here as well.
PAPER_DIR = Path("/Volumes/NV2/pdf_recognize/paper")
# Cache directory for display-equation PNGs produced by render_equation_png.
EQUATION_CACHE_DIR = PAPER_DIR / "equations"
# NOTE: runs at import time. `parents` is not set, so this raises
# FileNotFoundError when PAPER_DIR itself does not exist.
EQUATION_CACHE_DIR.mkdir(exist_ok=True)
# Figure image directories referenced by the FIGURES table.
FIG_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis/paper_figures")
EXTRA_FIG_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis/reports")
# Destination of the generated Word document (saved by main()).
OUTPUT = PAPER_DIR / "Paper_A_IEEE_Access_Draft_v3.docx"
# Markdown section files, in the order main() renders them into the DOCX.
# Each name is resolved against PAPER_DIR; a missing file only produces a
# warning and is skipped.
SECTIONS = [
    "paper_a_abstract_v3.md",
    # paper_a_impact_statement_v3.md removed: not a standard IEEE Access
    # Regular Paper section. Content folded into cover letter / abstract.
    "paper_a_introduction_v3.md",
    "paper_a_related_work_v3.md",
    "paper_a_methodology_v3.md",
    "paper_a_results_v3.md",
    "paper_a_discussion_v3.md",
    "paper_a_conclusion_v3.md",
    # Appendix A: BD/McCrary bin-width sensitivity (see v3.7 notes).
    "paper_a_appendix_v3.md",
    # Declarations (COI / data availability / funding) before References,
    # per IEEE Access convention.
    "paper_a_declarations_v3.md",
    "paper_a_references_v3.md",
]
# Figure insertion hooks (trigger phrase -> (file, caption, width inches)).
# New figures for v3: dip test, BD/McCrary overlays, accountant GMM 2D + marginals.
# _insert_figures adds the picture and its caption directly after any regular
# paragraph whose converted text contains the trigger phrase, so each trigger
# must match the section text verbatim (after emphasis/LaTeX stripping).
FIGURES = {
    "Fig. 1 illustrates": (
        FIG_DIR / "fig1_pipeline.png",
        "Fig. 1. Pipeline architecture for automated non-hand-signed signature detection.",
        6.5,
    ),
    "Fig. 2 presents the cosine similarity distributions for intra-class": (
        FIG_DIR / "fig2_intra_inter_kde.png",
        "Fig. 2. Cosine similarity distributions: intra-class vs. inter-class with KDE crossover at 0.837.",
        3.5,
    ),
    "Fig. 3 presents the per-signature cosine and dHash distributions of Firm A": (
        FIG_DIR / "fig3_firm_a_calibration.png",
        "Fig. 3. Firm A per-signature cosine and dHash distributions against the overall CPA population.",
        3.5,
    ),
    "Fig. 4 summarises the per-firm yearly per-signature": (
        EXTRA_FIG_DIR / "figures" / "fig_yearly_big4_comparison.png",
        "Fig. 4. Per-firm yearly per-signature best-match cosine, 2013-2023. (a) Mean per-signature best-match cosine by firm bucket and fiscal year (threshold-free). (b) Share of per-signature best-match cosine ≥ 0.95 (operational cut of Section III-K). Five lines: Firm A, B, C, D, Non-Big-4. Firm A is above the other Big-4 firms in every year; Non-Big-4 is below all four Big-4 firms in every year.",
        6.5,
    ),
    # NOTE(review): the image file is named fig4_* but the caption says
    # Fig. 5 — presumably the figure was renumbered; confirm the file is
    # the intended one.
    "conducted an ablation study comparing three": (
        FIG_DIR / "fig4_ablation.png",
        "Fig. 5. Ablation study comparing three feature extraction backbones.",
        6.5,
    ),
}
def strip_comments(text):
    """Remove HTML comments, but UNWRAP comments whose first non-blank line
    starts with `TABLE ` (or `TABLE` followed by a tab).

    The v3 markdown sources wrap every numerical table in an HTML comment:
    the caption (e.g. `TABLE V: Hartigan Dip Test Results`) sits on the same
    line as the comment opener, the markdown table rows follow on the next
    lines, and the comment closer ends the block. Deleting such comments
    wholesale would silently drop every table from the rendered DOCX, so for
    these comments the function

    (i)   detects that the first non-empty line starts with `TABLE `,
    (ii)  emits a synthetic marker line `__TABLE_CAPTION__:` followed by the
          caption, which process_section renders as a centered bold
          paragraph above the table, and
    (iii) keeps the table body so the markdown-table detector picks it up.

    All other comments (figure placeholders, editorial notes) are removed.

    Args:
        text: Raw markdown source of one section.

    Returns:
        The markdown with plain comments deleted and TABLE comments replaced
        by a caption marker line plus the table rows.
    """
    def _replace(match):
        body_lines = match.group(1).splitlines()
        # Locate the first non-blank line; it decides the comment's fate.
        for idx, line in enumerate(body_lines):
            caption = line.strip()
            if caption:
                break
        else:
            return ""  # comment holds nothing but whitespace
        if not caption.startswith(("TABLE ", "TABLE\t")):
            return ""
        rest = "\n".join(body_lines[idx + 1:])
        # Emit caption marker + body. Surround with blank lines so the
        # paragraph/table detector treats the marker as its own paragraph.
        return f"\n\n__TABLE_CAPTION__:{caption}\n{rest}\n"

    # The delimiters are assembled from pieces on purpose: written out
    # literally they form an HTML comment, which comment-stripping tooling
    # can remove from this very file and leave an empty, group-less pattern.
    opener = "<!" + "--"
    closer = "--" + ">"
    # Non-greedy match across lines.
    return re.sub(opener + r"(.*?)" + closer, _replace, text, flags=re.DOTALL)
# ---------------------------------------------------------------------------
# LaTeX → plain text + Unicode conversion
# ---------------------------------------------------------------------------
# The v3 markdown sources contain inline LaTeX ($...$) and a small number of
# display-math blocks ($$...$$). Pandoc would render these natively; the
# python-docx pipeline used here does not, so without preprocessing every
# `\leq`, `\text{dHash}_\text{indep}`, `\Delta\text{BIC}`, `60{,}448`, etc.
# leaks into the DOCX as raw LaTeX. The helpers below convert the common
# inline cases to Unicode and split subscripts/superscripts into proper Word
# runs. Display-math (rare; 3 equations in this paper) gets a best-effort
# linearisation and is acceptable for a partner-handoff DOCX; final IEEE
# typesetting is handled by the publisher's LaTeX/MathType pipeline.
# Ordered (regex, replacement) pairs applied by latex_to_unicode. Every
# alphabetic command carries a `(?![A-Za-z])` lookahead so a short command
# never matches the prefix of a longer one (`\le` vs `\leq`, `\in` vs `\int`).
# Commands missing from this table are deleted by the later cleanup pass, so
# the short aliases `\le`, `\ge`, `\ne`, `\in` and `\varepsilon` are listed
# explicitly rather than being dropped silently.
LATEX_TOKEN_REPLACEMENTS = [
    # Greek letters (lower)
    (r"\\alpha(?![A-Za-z])", "α"), (r"\\beta(?![A-Za-z])", "β"), (r"\\gamma(?![A-Za-z])", "γ"),
    (r"\\delta(?![A-Za-z])", "δ"), (r"\\epsilon(?![A-Za-z])", "ε"), (r"\\zeta(?![A-Za-z])", "ζ"),
    (r"\\eta(?![A-Za-z])", "η"), (r"\\theta(?![A-Za-z])", "θ"), (r"\\iota(?![A-Za-z])", "ι"),
    (r"\\kappa(?![A-Za-z])", "κ"), (r"\\lambda(?![A-Za-z])", "λ"), (r"\\mu(?![A-Za-z])", "μ"),
    (r"\\nu(?![A-Za-z])", "ν"), (r"\\xi(?![A-Za-z])", "ξ"), (r"\\pi(?![A-Za-z])", "π"),
    (r"\\rho(?![A-Za-z])", "ρ"), (r"\\sigma(?![A-Za-z])", "σ"), (r"\\tau(?![A-Za-z])", "τ"),
    (r"\\phi(?![A-Za-z])", "φ"), (r"\\chi(?![A-Za-z])", "χ"), (r"\\psi(?![A-Za-z])", "ψ"),
    (r"\\omega(?![A-Za-z])", "ω"), (r"\\varepsilon(?![A-Za-z])", "ε"),
    # Greek letters (upper, only those distinguishable from Latin)
    (r"\\Gamma(?![A-Za-z])", "Γ"), (r"\\Delta(?![A-Za-z])", "Δ"), (r"\\Theta(?![A-Za-z])", "Θ"),
    (r"\\Lambda(?![A-Za-z])", "Λ"), (r"\\Xi(?![A-Za-z])", "Ξ"), (r"\\Pi(?![A-Za-z])", "Π"),
    (r"\\Sigma(?![A-Za-z])", "Σ"), (r"\\Phi(?![A-Za-z])", "Φ"), (r"\\Psi(?![A-Za-z])", "Ψ"),
    (r"\\Omega(?![A-Za-z])", "Ω"),
    # Relations / arrows
    (r"\\leq(?![A-Za-z])", "≤"), (r"\\geq(?![A-Za-z])", "≥"),
    (r"\\le(?![A-Za-z])", "≤"), (r"\\ge(?![A-Za-z])", "≥"),
    (r"\\neq(?![A-Za-z])", "≠"), (r"\\ne(?![A-Za-z])", "≠"),
    (r"\\approx(?![A-Za-z])", "≈"),
    (r"\\equiv(?![A-Za-z])", "≡"), (r"\\sim(?![A-Za-z])", "~"),
    (r"\\in(?![A-Za-z])", "∈"),
    (r"\\to(?![A-Za-z])", "→"), (r"\\rightarrow(?![A-Za-z])", "→"),
    (r"\\leftarrow(?![A-Za-z])", "←"), (r"\\Rightarrow(?![A-Za-z])", "⇒"),
    (r"\\Leftarrow(?![A-Za-z])", "⇐"),
    # Binary operators
    (r"\\times(?![A-Za-z])", "×"), (r"\\cdot(?![A-Za-z])", "·"),
    (r"\\pm(?![A-Za-z])", "±"), (r"\\mp(?![A-Za-z])", "∓"),
    (r"\\div(?![A-Za-z])", "÷"),
    # Misc
    (r"\\infty(?![A-Za-z])", "∞"), (r"\\partial(?![A-Za-z])", "∂"),
    (r"\\sum(?![A-Za-z])", "∑"), (r"\\prod(?![A-Za-z])", "∏"),
    (r"\\int(?![A-Za-z])", "∫"),
    (r"\\ldots(?![A-Za-z])", "…"), (r"\\dots(?![A-Za-z])", "…"),
    # Spacing commands (drop or replace with single space)
    (r"\\,", " "), (r"\\;", " "), (r"\\:", " "),
    (r"\\!", ""), (r"\\ ", " "),
    (r"\\quad(?![A-Za-z])", " "), (r"\\qquad(?![A-Za-z])", " "),
    # Escaped punctuation
    (r"\\%", "%"), (r"\\#", "#"), (r"\\&", "&"),
    (r"\\\$", "$"), (r"\\_", "_"),
]
def _unwrap_command(text, cmd):
    """Strip every `\\cmd{X}` wrapper from `text`, keeping only `X`.

    A single substitution pass only unwraps brace groups that contain no
    further braces, so the pass is repeated until the text stops changing;
    nested wrappers are thereby peeled innermost-first.
    """
    wrapper = re.compile(r"\\" + cmd + r"\{([^{}]*)\}")
    while True:
        unwrapped = wrapper.sub(r"\1", text)
        if unwrapped == text:
            return unwrapped
        text = unwrapped
# Sentinels that bracket math regions between latex_to_unicode and the run
# splitter. They are written as explicit escapes rather than literal
# characters: Private Use Area code points are invisible in most editors and
# easy to lose in a copy/paste or re-encoding, and an empty sentinel breaks
# the region scanning in add_text_with_subsup. Both must stay exactly one
# character long because the scanner advances by one past each sentinel.
MATH_START = "\uE000"  # Private Use Area: XML-safe
MATH_END = "\uE001"
def latex_to_unicode(text):
    """Convert a LaTeX-laced markdown paragraph into plain text.

    Math context is preserved with private-use sentinel characters
    (MATH_START / MATH_END) so the downstream run-splitter only treats
    `_X` / `^X` as subscript / superscript inside math regions; in body
    text underscores in identifiers like `signature_analysis` survive.

    An escaped dollar sign (backslash + `$`) is never taken as a math
    delimiter; it is turned into a literal `$` by the token table.

    Args:
        text: Markdown text that may contain `$...$`, `$$...$$` and LaTeX
            commands.

    Returns:
        The text with commands replaced by Unicode / plain text and math
        regions wrapped in MATH_START / MATH_END. Text containing neither
        `$` nor a backslash is returned unchanged.
    """
    if "$" not in text and "\\" not in text:
        return text

    def _mark_math(match):
        return MATH_START + match.group(1) + MATH_END

    # 1. Strip display-math delimiters first (keep the inner content for
    #    best-effort linearisation), wrapping math regions with sentinels.
    #    Then strip inline math delimiters with the same sentinel wrapping.
    #    The lookbehind keeps an escaped dollar from opening a region, and
    #    the inline body consumes backslash pairs as units so an escaped
    #    dollar cannot close one either.
    text = re.sub(r"(?<!\\)\$\$([\s\S]+?)\$\$", _mark_math, text)
    text = re.sub(r"(?<!\\)\$((?:\\[\s\S]|[^$\\])+?)\$", _mark_math, text)
    # 2. Replace token-level commands with Unicode glyphs *before* unwrapping
    #    `\text{...}` and friends, so that `\Delta\text{BIC}` becomes
    #    `Δ\text{BIC}` (then `ΔBIC`) rather than `\DeltaBIC` which would be
    #    stripped wholesale by the cleanup pass.
    for pat, repl in LATEX_TOKEN_REPLACEMENTS:
        text = re.sub(pat, repl, text)
    # 3. Unwrap formatting / text commands (innermost first via _unwrap loop).
    for cmd in ("text", "mathbf", "mathit", "mathrm", "mathsf", "mathtt",
                "operatorname", "emph", "textbf", "textit"):
        text = _unwrap_command(text, cmd)
    # 4. \frac{a}{b} → (a)/(b); \sqrt{x} → √(x). Apply repeatedly to handle
    #    one level of nesting; deeper nesting is rare in this paper.
    for _ in range(3):
        text = re.sub(
            r"\\t?frac\{([^{}]+)\}\{([^{}]+)\}",
            r"(\1)/(\2)",
            text,
        )
        text = re.sub(r"\\sqrt\{([^{}]+)\}", r"√(\1)", text)
    # 5. TeX braces used purely for spacing/grouping: K{=}3 → K=3,
    #    60{,}448 → 60,448, 10{,}175 → 10,175.
    text = re.sub(r"\{([=<>+\-,])\}", r"\1", text)
    # 6. Strip any remaining `\cmd{...}` (best effort) and `\cmd ` tokens.
    text = re.sub(r"\\[a-zA-Z]+\{([^{}]*)\}", r"\1", text)
    text = re.sub(r"\\[a-zA-Z]+(?![A-Za-z])", "", text)
    # 7. Collapse runs of whitespace introduced by command stripping.
    text = re.sub(r"[ \t]{2,}", " ", text)
    return text
# Matches one subscript or superscript marker inside a math region. Exactly
# one of the four groups is set per match: groups 1 and 3 carry subscript
# text, groups 2 and 4 carry superscript text (_emit_math relies on this).
# Braced forms may be empty and cannot contain nested braces; the bare forms
# take a single letter, digit, `+` or `-`.
_SUBSUP_PATTERN = re.compile(
    r"_\{([^{}]*)\}"  # _{...}
    r"|\^\{([^{}]*)\}"  # ^{...}
    r"|_([A-Za-z0-9+\-])"  # _X (single token)
    r"|\^([A-Za-z0-9+\-])"  # ^X (single token)
)
def _emit_plain(paragraph, text, font_name, font_size, bold, italic):
    """Append `text` to `paragraph` as a single run with the given font
    name, size, bold and italic settings. Empty text adds no run."""
    if text:
        plain_run = paragraph.add_run(text)
        font = plain_run.font
        font.name = font_name
        font.size = font_size
        plain_run.bold = bold
        plain_run.italic = italic
def _emit_math(paragraph, text, font_name, font_size, bold, italic):
    """Emit `text` from a math region: split on `_X` / `_{X}` / `^X` / `^{X}`
    and render those as Word subscripts / superscripts.

    Text between markers is emitted as plain runs with the same font
    settings. Whether a marker is a subscript or a superscript is decided by
    which pattern group matched, not by truthiness of the captured text, so
    an empty `_{}` / `^{}` is simply dropped instead of being misread as a
    superscript and passed to `add_run` as `None`.
    """
    if "_" not in text and "^" not in text:
        _emit_plain(paragraph, text, font_name, font_size, bold, italic)
        return
    pos = 0
    for m in _SUBSUP_PATTERN.finditer(text):
        if m.start() > pos:
            _emit_plain(paragraph, text[pos:m.start()],
                        font_name, font_size, bold, italic)
        pos = m.end()
        # Groups: 1 = _{...}, 2 = ^{...}, 3 = _X, 4 = ^X; exactly one is set.
        braced_sub, braced_sup, bare_sub, bare_sup = m.groups()
        is_subscript = braced_sub is not None or bare_sub is not None
        script = next(g for g in (braced_sub, braced_sup, bare_sub, bare_sup)
                      if g is not None)
        if not script:
            continue  # empty braces: nothing to render
        run = paragraph.add_run(script)
        if is_subscript:
            run.font.subscript = True
        else:
            run.font.superscript = True
        run.font.name = font_name
        run.font.size = font_size
        run.bold = bold
        run.italic = italic
    if pos < len(text):
        _emit_plain(paragraph, text[pos:],
                    font_name, font_size, bold, italic)
def add_text_with_subsup(paragraph, text, font_name="Times New Roman",
                         font_size=Pt(10), bold=False, italic=False):
    """Add `text` to `paragraph`. Subscript/superscript handling is scoped to
    math regions delimited by MATH_START / MATH_END sentinels (set up by
    `latex_to_unicode`). Outside math regions, underscores and carets are
    preserved literally so identifiers like `signature_analysis` and
    `paper_a_results_v3.md` survive intact.

    Args:
        paragraph: Paragraph object the runs are appended to.
        text: Text, possibly containing sentinel-delimited math regions.
        font_name, font_size, bold, italic: Formatting applied to every run.
    """
    if MATH_START not in text:
        # No math region at all: everything is literal text.
        _emit_plain(paragraph, text, font_name, font_size, bold, italic)
        return
    pos = 0
    while pos < len(text):
        start = text.find(MATH_START, pos)
        if start == -1:
            _emit_plain(paragraph, text[pos:],
                        font_name, font_size, bold, italic)
            break
        if start > pos:
            _emit_plain(paragraph, text[pos:start],
                        font_name, font_size, bold, italic)
        # The sentinels are single characters, hence the `+ 1` offsets.
        end = text.find(MATH_END, start + 1)
        if end == -1:
            # Unterminated math region — emit rest as plain.
            _emit_plain(paragraph, text[start + 1:],
                        font_name, font_size, bold, italic)
            break
        _emit_math(paragraph, text[start + 1:end],
                   font_name, font_size, bold, italic)
        pos = end + 1
# ---------------------------------------------------------------------------
# Display-equation rendering (matplotlib mathtext → PNG → embedded image)
# ---------------------------------------------------------------------------
# matplotlib mathtext is a subset of LaTeX. A few common TeX-only macros need
# to be substituted with mathtext-supported equivalents before parsing.
# (compiled pattern, replacement) pairs applied in order to a display-math
# expression before it is handed to matplotlib mathtext.
_MATHTEXT_SUBS = [
    # \tfrac / \dfrac are rewritten to plain \frac.
    (re.compile(r"\\tfrac\b"), r"\\frac"),
    (re.compile(r"\\dfrac\b"), r"\\frac"),
    # \operatorname{X} is rewritten to \mathrm{X}.
    (re.compile(r"\\operatorname\{([^{}]+)\}"), r"\\mathrm{\1}"),
    # Spacing macros: `\,` and `\;` become a plain space, `\!` is dropped.
    (re.compile(r"\\,"), " "),
    (re.compile(r"\\;"), " "),
    (re.compile(r"\\!"), ""),
]


def _sanitise_for_mathtext(latex: str) -> str:
    """Return `latex` with every substitution in _MATHTEXT_SUBS applied, in
    table order."""
    sanitised = latex
    for pattern, replacement in _MATHTEXT_SUBS:
        sanitised = pattern.sub(replacement, sanitised)
    return sanitised
def render_equation_png(latex: str, fontsize: int = 14) -> Path:
    """Render a LaTeX math expression to a tightly-cropped PNG using
    matplotlib mathtext, with content-addressed caching so a re-build only
    re-renders changed equations.

    Args:
        latex: Math expression without the surrounding `$` delimiters.
        fontsize: Font size passed to matplotlib; part of the cache key.

    Returns:
        Path of the PNG inside EQUATION_CACHE_DIR.

    Raises:
        Whatever matplotlib raises when the expression cannot be rendered
        or the file cannot be written. In that case the figure is still
        closed and no output file is left behind.
    """
    sanitised = _sanitise_for_mathtext(latex.strip())
    # The digest only names the cache file; it has no security role.
    digest = hashlib.sha1(
        (sanitised + f"|fs{fontsize}").encode("utf-8")).hexdigest()[:16]
    out_path = EQUATION_CACHE_DIR / f"eq_{digest}.png"
    if out_path.exists():
        return out_path
    fig = plt.figure(figsize=(8, 1.6))
    try:
        fig.text(0.5, 0.5, f"${sanitised}$",
                 fontsize=fontsize, ha="center", va="center")
        fig.savefig(str(out_path), dpi=220, bbox_inches="tight",
                    pad_inches=0.05)
    except Exception:
        # A partially written file would be returned as a cache hit on the
        # next build, so remove it before propagating the error.
        if out_path.exists():
            out_path.unlink()
        raise
    finally:
        # Always release the figure; callers catch render errors and carry
        # on, so an unclosed figure would accumulate per failed equation.
        plt.close(fig)
    return out_path
def add_equation_block(doc, latex: str, equation_number: int,
                       width_inches: float = 4.5):
    """Append a display equation to `doc`.

    The expression is rendered to a PNG first (so a render failure leaves
    the document untouched), then placed in a new centered paragraph with
    6 pt spacing before and after, scaled to `width_inches`. The equation
    number `(N)` follows the picture in the same paragraph, separated by a
    tab character; no tab stop is configured here.
    """
    picture_path = render_equation_png(latex)

    para = doc.add_paragraph()
    para.alignment = WD_ALIGN_PARAGRAPH.CENTER
    spacing = para.paragraph_format
    spacing.space_before = Pt(6)
    spacing.space_after = Pt(6)

    para.add_run().add_picture(str(picture_path), width=Inches(width_inches))

    # Equation number shares the picture's paragraph, after a tab.
    label = para.add_run(f"\t({equation_number})")
    label.font.name = "Times New Roman"
    label.font.size = Pt(10)
def _split_md_table_row(line):
    """Split one markdown table line into stripped cell strings."""
    row = line.strip()
    # Remove exactly one framing pipe per side. Stripping all leading or
    # trailing pipes would also swallow empty first/last cells, and not
    # trimming whitespace first would turn an indented row's leading pipe
    # into a phantom empty column.
    if row.startswith("|"):
        row = row[1:]
    if row.endswith("|"):
        row = row[:-1]
    return [cell.strip() for cell in row.split("|")]


def _is_md_separator_row(cells):
    """True when every cell is a delimiter cell such as `---`, `:--`, `:-:`."""
    return bool(cells) and all(
        re.match(r"^:?-+:?$", cell) for cell in cells)


def add_md_table(doc, table_lines):
    """Render a markdown pipe table as a "Table Grid" Word table.

    The first remaining row is treated as the header (bold) and fixes the
    column count; longer rows are truncated to it. Cell text has markdown
    emphasis markers stripped and LaTeX converted, is centered, and is set
    in 8 pt Times New Roman. An empty paragraph is appended after the table.

    Delimiter rows are recognised by looking at every cell, so a data row
    that merely starts with a dash (e.g. `-` for a missing value) is kept.

    Args:
        doc: Document the table is appended to.
        table_lines: Raw markdown lines making up the table.

    Returns:
        None. Nothing is added when fewer than two non-delimiter rows exist.
    """
    rows_data = []
    for line in table_lines:
        cells = _split_md_table_row(line)
        if not _is_md_separator_row(cells):
            rows_data.append(cells)
    if len(rows_data) < 2:
        return
    ncols = len(rows_data[0])
    table = doc.add_table(rows=len(rows_data), cols=ncols)
    table.style = "Table Grid"
    for r_idx, row in enumerate(rows_data):
        for c_idx in range(min(len(row), ncols)):
            cell = table.rows[r_idx].cells[c_idx]
            raw = row[c_idx]
            # Strip markdown emphasis markers; convert LaTeX before rendering.
            raw = re.sub(r"\*\*\*(.+?)\*\*\*", r"\1", raw)
            raw = re.sub(r"\*\*(.+?)\*\*", r"\1", raw)
            raw = re.sub(r"\*(.+?)\*", r"\1", raw)
            raw = re.sub(r"`(.+?)`", r"\1", raw)
            cell_text = latex_to_unicode(raw)
            # Replace the default empty paragraph with one we control.
            cell.text = ""
            cp = cell.paragraphs[0]
            cp.alignment = WD_ALIGN_PARAGRAPH.CENTER
            add_text_with_subsup(
                cp, cell_text,
                font_name="Times New Roman",
                font_size=Pt(8),
                bold=(r_idx == 0),
            )
    doc.add_paragraph()
def _insert_figures(doc, para_text):
    """Append every figure whose trigger phrase occurs in `para_text`.

    For each matching FIGURES entry a centered picture paragraph is added,
    followed by a centered caption in 9 pt italic Times New Roman. A
    triggered figure whose image file does not exist is reported on stdout
    and skipped, so a missing figure no longer disappears from the document
    without a trace.
    """
    for trigger, (fig_path, caption, width) in FIGURES.items():
        if trigger not in para_text:
            continue
        if not Path(fig_path).exists():
            print(f"WARNING: missing figure file: {fig_path}")
            continue
        fp = doc.add_paragraph()
        fp.alignment = WD_ALIGN_PARAGRAPH.CENTER
        fr = fp.add_run()
        fr.add_picture(str(fig_path), width=Inches(width))
        cp = doc.add_paragraph()
        cp.alignment = WD_ALIGN_PARAGRAPH.CENTER
        cr = cp.add_run(caption)
        cr.font.size = Pt(9)
        cr.font.name = "Times New Roman"
        cr.italic = True
_SECTION_HEADING_RE = re.compile(r"^(#{1,3}) (.*)$")
_SECTION_NUMBERED_RE = re.compile(r"^(\d+)\.\s+(.*)$")
_SECTION_CAPTION_MARKER = "__TABLE_CAPTION__:"


def _strip_md_inline(text):
    """Drop markdown emphasis (`***`, `**`, `*`) and code-span backticks."""
    text = re.sub(r"\*\*\*(.+?)\*\*\*", r"\1", text)
    text = re.sub(r"\*\*(.+?)\*\*", r"\1", text)
    text = re.sub(r"\*(.+?)\*", r"\1", text)
    return re.sub(r"`(.+?)`", r"\1", text)


def _unquote_blockquote(line):
    """Return a blockquote line (`>` or `> body`) without its marker."""
    body = line.lstrip()
    if body == ">" or body.startswith("> "):
        return body[1:].lstrip()
    return line


def _collect_display_math(lines, start):
    """Read the `$$` equation starting at `lines[start]`.

    Returns `(latex, next_index)`, where `next_index` is the first line
    after the equation.
    """
    first = lines[start].strip()
    buf = [first]
    last = start
    # A line holding both delimiters is a complete equation even when
    # punctuation follows the closing `$$` (e.g. `$$ x = y $$,`). Only a
    # line with a lone opening delimiter continues onto later lines.
    if first.count("$$") < 2:
        while last + 1 < len(lines):
            last += 1
            buf.append(lines[last])
            if "$$" in lines[last]:
                break
    inner = "\n".join(buf).strip()[2:]
    close = inner.find("$$")
    if close != -1:
        # Anything after the closing delimiter is dropped.
        inner = inner[:close]
    # Drop the trailing `,` that some equation lines end with.
    return inner.strip().rstrip(", "), last + 1


def _add_hanging_item(doc, label, content):
    """Add one list item with a manual label and a hanging indent."""
    p = doc.add_paragraph()
    p.paragraph_format.left_indent = Inches(0.4)
    p.paragraph_format.first_line_indent = Inches(-0.25)
    p.paragraph_format.space_after = Pt(4)
    content = latex_to_unicode(_strip_md_inline(content))
    add_text_with_subsup(p, f"{label} {content}")


def process_section(doc, filepath, equation_counter=None):
    """Render one v3 markdown section file into `doc`.

    Handles, line by line: `#`/`##`/`###` headings, table caption markers
    emitted by strip_comments, pipe tables, `$$` display equations, numbered
    and bulleted list items, and regular paragraphs (after which matching
    figures are inserted). Blockquote markers are stripped up front.

    Args:
        doc: Document the content is appended to.
        filepath: Path of the markdown file (read as UTF-8).
        equation_counter: Single-element list used as a mutable counter
            shared across sections; holds the running display-equation
            number. A fresh `[0]` is used when omitted.
    """
    if equation_counter is None:
        equation_counter = [0]
    text = strip_comments(filepath.read_text(encoding="utf-8"))
    # Defensive blockquote handling: blockquotes are not rendered as Word
    # callouts, but removing the marker keeps literal `>` characters and
    # empty `>` separator lines out of the DOCX.
    lines = [_unquote_blockquote(ln) for ln in text.split("\n")]

    i = 0
    while i < len(lines):
        stripped = lines[i].strip()
        if not stripped:
            i += 1
            continue

        heading = _SECTION_HEADING_RE.match(stripped)
        if heading:
            title = latex_to_unicode(heading.group(2))
            title = title.replace(MATH_START, "").replace(MATH_END, "")
            h = doc.add_heading(title, level=len(heading.group(1)))
            for run in h.runs:
                run.font.color.rgb = RGBColor(0, 0, 0)
            i += 1
            continue

        if stripped.startswith(_SECTION_CAPTION_MARKER):
            caption_text = stripped[len(_SECTION_CAPTION_MARKER):].strip()
            caption_text = latex_to_unicode(caption_text)
            cp = doc.add_paragraph()
            cp.alignment = WD_ALIGN_PARAGRAPH.CENTER
            cp.paragraph_format.space_before = Pt(6)
            cp.paragraph_format.space_after = Pt(2)
            add_text_with_subsup(
                cp, caption_text,
                font_name="Times New Roman",
                font_size=Pt(9),
                bold=True,
            )
            i += 1
            continue

        # A table starts on a line with a pipe that is directly followed by
        # a delimiter row, and runs while lines keep containing a pipe.
        if ("|" in stripped and i + 1 < len(lines)
                and re.match(r"\s*\|[-|: ]+\|", lines[i + 1])):
            table_lines = []
            while i < len(lines) and "|" in lines[i]:
                table_lines.append(lines[i])
                i += 1
            add_md_table(doc, table_lines)
            continue

        # Display math: rendered as an embedded mathtext PNG with an
        # auto-incrementing equation number.
        if stripped.startswith("$$"):
            inner, i = _collect_display_math(lines, i)
            equation_counter[0] += 1
            try:
                add_equation_block(doc, inner, equation_counter[0])
            except Exception as exc:
                # Deliberately broad: fall back to a plain centered line so
                # the build doesn't fail on a single un-renderable equation.
                p = doc.add_paragraph()
                p.alignment = WD_ALIGN_PARAGRAPH.CENTER
                run = p.add_run(f"[equation render failed: {exc}] {inner}")
                run.font.name = "Times New Roman"
                run.font.size = Pt(10)
                run.italic = True
            continue

        # Manual numbering / bullets: keep the label from the markdown
        # source and apply a hanging indent. Avoids python-docx's list
        # styles, which depend on a numbering definition that the default
        # Document() lacks.
        numbered = _SECTION_NUMBERED_RE.match(stripped)
        if numbered:
            _add_hanging_item(doc, f"{numbered.group(1)}.", numbered.group(2))
            i += 1
            continue
        if stripped.startswith("- "):
            _add_hanging_item(doc, "•", stripped[2:])
            i += 1
            continue

        # Regular paragraph: join following lines until a blank line or the
        # start of another construct. A `$$` line also ends the paragraph so
        # an equation placed directly under text is still rendered as an
        # equation block rather than being merged into the prose.
        para_lines = [stripped]
        i += 1
        while i < len(lines):
            nxt = lines[i].strip()
            if (
                not nxt
                or nxt.startswith(("#", "|", "- ", "$$"))
                or re.match(r"^\d+\.\s", nxt)
            ):
                break
            para_lines.append(nxt)
            i += 1
        para_text = _strip_md_inline(" ".join(para_lines))
        para_text = para_text.replace("---", "\u2014")
        para_text = latex_to_unicode(para_text)
        p = doc.add_paragraph()
        p.paragraph_format.space_after = Pt(6)
        add_text_with_subsup(p, para_text)
        _insert_figures(doc, para_text)
def main():
    """Build the DOCX: title page, then every file listed in SECTIONS, then
    save to OUTPUT and run the linter on the result."""
    doc = Document()
    normal = doc.styles["Normal"]
    normal.font.name = "Times New Roman"
    normal.font.size = Pt(10)

    def centered_run(text, size_pt, space_after_pt):
        # One centered title-page paragraph holding a single run; the run is
        # returned so the caller can add bold / italic / font name.
        para = doc.add_paragraph()
        para.alignment = WD_ALIGN_PARAGRAPH.CENTER
        para.paragraph_format.space_after = Pt(space_after_pt)
        new_run = para.add_run(text)
        new_run.font.size = Pt(size_pt)
        return new_run

    # Title page
    title = centered_run(
        "Automated Identification of Non-Hand-Signed Auditor Signatures\n"
        "in Large-Scale Financial Audit Reports:\n"
        "A Dual-Descriptor Framework with Replication-Dominated Calibration",
        16, 12,
    )
    title.font.name = "Times New Roman"
    title.bold = True

    # IEEE Access uses single-anonymized review: author / affiliation
    # / corresponding-author block must appear on the title page in the
    # final submission. Fill these placeholders with real metadata
    # before submitting the generated DOCX.
    centered_run("[AUTHOR NAMES — fill in before submission]", 11, 6)
    affiliations = centered_run(
        "[Affiliations and corresponding-author email — fill in before submission]",
        10, 6,
    )
    affiliations.italic = True
    target = centered_run(
        "Target journal: IEEE Access (Regular Paper, single-anonymized review)",
        10, 20,
    )
    target.italic = True

    # One counter for the whole document so equation numbers keep running
    # across section files.
    equation_counter = [0]
    for section_file in SECTIONS:
        section_path = PAPER_DIR / section_file
        if not section_path.exists():
            print(f"WARNING: missing section file: {section_path}")
            continue
        process_section(doc, section_path, equation_counter=equation_counter)

    doc.save(str(OUTPUT))
    print(f"Saved: {OUTPUT}")
    _run_linter()
def _run_linter():
    """Lint the freshly written DOCX and print a one-line summary.

    Never fatal: when the local `lint_paper_v3` module cannot be imported
    the lint is skipped with a message. For full output run
    `python3 paper/lint_paper_v3.py`.
    """
    try:
        import lint_paper_v3  # local module
    except Exception as exc:  # pragma: no cover
        print(f"(lint skipped: {exc})")
        return

    errors = warns = infos = 0
    for finding in lint_paper_v3.lint_docx(OUTPUT):
        level = finding.severity
        if level == "ERROR":
            errors += 1
        elif level == "WARN":
            warns += 1
        elif level == "INFO":
            infos += 1

    if errors:
        print(f"\n[lint] {errors} ERROR finding(s) in DOCX — run "
              f"`python3 paper/lint_paper_v3.py --docx` for details.")
        return
    if warns or infos:
        print(f"[lint] DOCX clean of ERRORs ({warns} WARN, {infos} INFO).")
        return
    print("[lint] DOCX clean.")
# Script entry point: build the DOCX only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()