Files
pdf_signature_extraction/paper/export_v3.py
gbanyan 53125d11d9 Paper A v3.20.0: partner Jimmy 2026-04-27 review + DOCX rendering overhaul
Substantive content (addresses partner Jimmy's 2026-04-27 review of v3.19.1):

Must-fix items (6/6):
- §III-F SSIM/pixel rejection rewritten from first principles (design-level
  argument from luminance/contrast/structure local-window product, not the
  prior empirical 0.70 result)
- Table VI restructured by population × method; added missing Firm A
  logit-Gaussian-2 0.999 row; KDE marked undefined (unimodal), BD/McCrary
  marked bin-unstable (Appendix A)
- Tables IX / XI / §IV-F.3 dHash 5/8/15 inconsistency resolved: ≤8 demoted
  from "operational dual" to "calibration-fold-adjacent reference"; the
  actual classifier rule cos>0.95 AND dH≤15 = 92.46% added throughout
- New Fig. 4 (yearly per-firm best-match cosine, 5 lines, 2013-2023, Firm A
  on top); script 30_yearly_big4_comparison.py
- Tables XIV / XV extended with top-20% (94.8%) and top-30% (81.3%) brackets
- §III-K reframed P7.5 from "round-number lower-tail boundary" to operating
  point; new Table XII-B (cosine-FAR-capture tradeoff at 5 thresholds:
  0.9407 / 0.945 / 0.95 / 0.977 / 0.985)

Nice-to-have items (3/3):
- Table XII expanded to 6-cut classifier sensitivity grid (0.940-0.985)
- Defensive parentheticals (84,386 vs 85,042; 30,226 vs 30,222) moved to
  table notes; cut "invite reviewer skepticism" and "non-load-bearing"

Codex 3-pass verification cleanup:
- Stale 0.973/0.977/0.979 references unified on canonical 0.977 (Firm A
  Beta-2 forced-fit crossing from beta_mixture_results.json)
- dHash≤8 wording corrected to P95-adjacent (P95 = 9, ≤8 is the integer
  immediately below) instead of misleading "rounded down"
- Table XII-B prose corrected: per-segment qualification of "non-Firm-A
  capture falls faster" (true on 0.95→0.977 segment but contracts on
  0.977→0.985 segment); arithmetic now from exact counts

Within-year analyses removed:
- Within-year ranking robustness check (Class A) was added in nice-to-have
  pass but contradicts v3.14 A2-removal stance; removed from §IV-G.2 + the
  Appendix B provenance row
- Within-CPA future-work disclosures (Class B) removed from Discussion
  limitation #5 and Conclusion future-work paragraph; subsequent limitations
  renumbered Sixth → Fifth, Seventh → Sixth

DOCX rendering pipeline overhaul (paper/export_v3.py):

Critical fix - every v3 DOCX since v3.0 was shipping WITHOUT TABLES:
strip_comments() was wholesale-deleting HTML comments, but every numerical
table is wrapped in <!-- TABLE X: ... -->, so the table body was deleted
alongside the wrapper. Now unwraps TABLE comments (emit synthetic
__TABLE_CAPTION__: marker + table body) while still stripping non-TABLE
editorial comments. Result: 19 tables now render in the DOCX.
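
The unwrap behaviour can be sanity-checked in isolation with a minimal sketch that mirrors the strip_comments() logic in export_v3.py (sample text and function name here are illustrative):

```python
import re

def unwrap_table_comments(text):
    """Unwrap <!-- TABLE ... --> comments into a synthetic caption marker
    plus the table body; delete every other HTML comment."""
    def _replace(match):
        lines = match.group(1).splitlines()
        for idx, line in enumerate(lines):
            if line.strip():
                if not line.strip().startswith("TABLE"):
                    return ""  # non-TABLE comment: strip wholesale
                caption = line.strip()
                body = "\n".join(lines[idx + 1:])
                return f"\n\n__TABLE_CAPTION__:{caption}\n{body}\n"
        return ""  # empty comment
    return re.sub(r"<!--(.*?)-->", _replace, text, flags=re.DOTALL)

sample = (
    "Intro.\n"
    "<!-- TABLE V: Hartigan Dip Test Results\n"
    "| Distribution | N |\n"
    "|--------------|---|\n"
    "-->\n"
    "<!-- editorial: tighten this paragraph -->\n"
    "Outro.\n"
)
out = unwrap_table_comments(sample)
# The caption marker and table body survive; the editorial comment is gone.
```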

Other rendering fixes:
- LaTeX → Unicode conversion (50+ token replacements: Greek alphabet, ≤≥,
  ×·≈, →↔⇒, etc.); \frac/\sqrt linearisation; TeX brace tricks ({=}, {,})
- Math-context-scoped sub/superscript via PUA sentinels: no more
  underscore-eating in identifiers like signature_analysis
- Display equations rendered via matplotlib mathtext to PNG (3 equations:
  cosine sim, mixture crossing, BD/McCrary Z statistic), embedded as
  numbered equation blocks (1), (2), (3); content-addressed cache at
  paper/equations/ (gitignored, regenerable)
- Manual numbered/bulleted list rendering with hanging indent (replaces
  python-docx style="List Number" which silently drops the number prefix
  when no numbering definition is bound)
- Markdown blockquote (> ...) defensively stripped
- Pandoc footnote ([^name]) markers no longer leak (inlined at source)
- Heading text cleaned of LaTeX residue + PUA sentinels
- File paths in body text (signature_analysis/X.py, reports/Y.json)
  trimmed to "(reproduction artifact in Appendix B)" pointers
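
The token-replacement step can be sketched as follows (a cut-down table; the shipped LATEX_TOKEN_REPLACEMENTS list in export_v3.py covers 50+ tokens and runs before \text{...} unwrapping for exactly the reason shown):

```python
import re

# Cut-down replacement table. The (?![A-Za-z]) lookahead stops \leq from
# also matching the prefix of a longer command name.
TOKENS = [
    (r"\\Delta(?![A-Za-z])", "Δ"),
    (r"\\leq(?![A-Za-z])", "≤"),
    (r"\\times(?![A-Za-z])", "×"),
    (r"\\approx(?![A-Za-z])", "≈"),
]

def replace_tokens(s):
    for pat, repl in TOKENS:
        s = re.sub(pat, repl, s)
    # Unwrap \text{...} only after glyph substitution, so \Delta\text{BIC}
    # becomes ΔBIC rather than the unsalvageable \DeltaBIC.
    return re.sub(r"\\text\{([^{}]*)\}", r"\1", s)

print(replace_tokens(r"\Delta\text{BIC} \leq 10 \approx 3 \times 3"))
# → ΔBIC ≤ 10 ≈ 3 × 3
```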

New leak linter: paper/lint_paper_v3.py - two-pass markdown source +
rendered DOCX leak detector; auto-runs at end of export_v3.py.
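
A minimal sketch of the kind of source-side checks such a linter performs (the rule names, patterns, and sample below are illustrative, not lint_paper_v3's actual rule set):

```python
import re

# Illustrative leak patterns: raw LaTeX commands, unstripped HTML comments,
# and pandoc footnote markers that should never survive into rendered text.
LEAK_PATTERNS = [
    ("latex-command", re.compile(r"\\[a-zA-Z]+\{")),
    ("html-comment", re.compile(r"<!--")),
    ("footnote-marker", re.compile(r"\[\^[A-Za-z0-9_-]+\]")),
]

def find_leaks(text):
    """Return (rule_name, line_number) pairs for every leak hit."""
    hits = []
    for lineno, line in enumerate(text.splitlines(), start=1):
        for name, pat in LEAK_PATTERNS:
            if pat.search(line):
                hits.append((name, lineno))
    return hits

sample = "clean line\n\\frac{a}{b} leaked\nsee note[^bd-mccrary]\n"
```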

Script changes:
- 21_expanded_validation.py: added 0.9407, 0.977, 0.985 to canonical FAR
  threshold list so Table XII-B is reproducible from persisted JSON
- 30_yearly_big4_comparison.py: NEW; generates Fig. 4 + per-firm yearly
  data (writes to reports/figures/ and reports/firm_yearly_comparison/)
- 31_within_year_ranking_robustness.py: NEW; supports the within-year
  robustness check (no longer cited in paper but kept as repo-internal
  due-diligence artifact)

Partner handoff DOCX shipped to
~/Downloads/Paper_A_IEEE_Access_Draft_v3.20.0_20260505.docx (536 KB:
19 tables + 4 figures + 3 equation images).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-06 13:44:49 +08:00

#!/usr/bin/env python3
"""Export Paper A v3 (IEEE Access target) to Word, reading from v3 md section files."""
from docx import Document
from docx.shared import Inches, Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from pathlib import Path
import hashlib
import re
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
PAPER_DIR = Path("/Volumes/NV2/pdf_recognize/paper")
EQUATION_CACHE_DIR = PAPER_DIR / "equations"
EQUATION_CACHE_DIR.mkdir(exist_ok=True)
FIG_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis/paper_figures")
EXTRA_FIG_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis/reports")
OUTPUT = PAPER_DIR / "Paper_A_IEEE_Access_Draft_v3.docx"
SECTIONS = [
    "paper_a_abstract_v3.md",
    # paper_a_impact_statement_v3.md removed: not a standard IEEE Access
    # Regular Paper section. Content folded into cover letter / abstract.
    "paper_a_introduction_v3.md",
    "paper_a_related_work_v3.md",
    "paper_a_methodology_v3.md",
    "paper_a_results_v3.md",
    "paper_a_discussion_v3.md",
    "paper_a_conclusion_v3.md",
    # Appendix A: BD/McCrary bin-width sensitivity (see v3.7 notes).
    "paper_a_appendix_v3.md",
    # Declarations (COI / data availability / funding) before References,
    # per IEEE Access convention.
    "paper_a_declarations_v3.md",
    "paper_a_references_v3.md",
]
# Figure insertion hooks (trigger phrase -> (file, caption, width inches)).
# New figures for v3: dip test, BD/McCrary overlays, accountant GMM 2D + marginals.
FIGURES = {
    "Fig. 1 illustrates": (
        FIG_DIR / "fig1_pipeline.png",
        "Fig. 1. Pipeline architecture for automated non-hand-signed signature detection.",
        6.5,
    ),
    "Fig. 2 presents the cosine similarity distributions for intra-class": (
        FIG_DIR / "fig2_intra_inter_kde.png",
        "Fig. 2. Cosine similarity distributions: intra-class vs. inter-class with KDE crossover at 0.837.",
        3.5,
    ),
    "Fig. 3 presents the per-signature cosine and dHash distributions of Firm A": (
        FIG_DIR / "fig3_firm_a_calibration.png",
        "Fig. 3. Firm A per-signature cosine and dHash distributions against the overall CPA population.",
        3.5,
    ),
    "Fig. 4 summarises the per-firm yearly per-signature": (
        EXTRA_FIG_DIR / "figures" / "fig_yearly_big4_comparison.png",
        "Fig. 4. Per-firm yearly per-signature best-match cosine, 2013-2023. "
        "(a) Mean per-signature best-match cosine by firm bucket and fiscal year "
        "(threshold-free). (b) Share of per-signature best-match cosine ≥ 0.95 "
        "(operational cut of Section III-K). Five lines: Firm A, B, C, D, "
        "Non-Big-4. Firm A is above the other Big-4 firms in every year; "
        "Non-Big-4 is below all four Big-4 firms in every year.",
        6.5,
    ),
    "conducted an ablation study comparing three": (
        FIG_DIR / "fig4_ablation.png",
        "Fig. 5. Ablation study comparing three feature extraction backbones.",
        6.5,
    ),
}
def strip_comments(text):
    """Remove HTML comments, but UNWRAP comments whose first non-blank line
    starts with `TABLE ` (or `TABLE\t`).

    The v3 markdown sources wrap every numerical table in an HTML comment of
    the form

        <!-- TABLE V: Hartigan Dip Test Results
        | Distribution | N | ... |
        |--------------|---|-----|
        | ... | … | ... |
        -->

    The caption (`TABLE V: Hartigan Dip Test Results`) is on the same line as
    the opening `<!--`, the markdown table body is on the lines following,
    and `-->` closes the block. The previous implementation wholesale-deleted
    these comments, which silently dropped every table from the rendered
    DOCX. We now (i) detect comments whose first non-empty line starts with
    `TABLE `, (ii) emit a synthetic caption marker line `__TABLE_CAPTION__:
    <caption>` so process_section can render the caption as a centered
    bold paragraph above the table, and (iii) keep the table body so the
    existing markdown-table detector picks it up. Non-TABLE comments
    (figure placeholders, editorial notes) are stripped as before.
    """
    def _replace(match):
        body = match.group(1)
        # Find first non-blank line.
        for line in body.splitlines():
            stripped = line.strip()
            if stripped:
                first = stripped
                break
        else:
            return ""
        if not first.startswith("TABLE ") and not first.startswith("TABLE\t"):
            return ""
        # Split caption (first non-blank line) from the rest.
        lines = body.splitlines()
        # Find index of the first non-blank line and use everything after.
        for idx, line in enumerate(lines):
            if line.strip():
                caption = line.strip()
                rest = "\n".join(lines[idx + 1:])
                break
        else:
            return ""
        # Emit caption marker + body. Surround with blank lines so the
        # paragraph/table detector treats the marker as its own paragraph.
        return f"\n\n__TABLE_CAPTION__:{caption}\n{rest}\n"

    # Non-greedy match across lines.
    return re.sub(r"<!--(.*?)-->", _replace, text, flags=re.DOTALL)

# ---------------------------------------------------------------------------
# LaTeX → plain text + Unicode conversion
# ---------------------------------------------------------------------------
# The v3 markdown sources contain inline LaTeX ($...$) and a small number of
# display-math blocks ($$...$$). Pandoc would render these natively; the
# python-docx pipeline used here does not, so without preprocessing every
# `\leq`, `\text{dHash}_\text{indep}`, `\Delta\text{BIC}`, `60{,}448`, etc.
# leaks into the DOCX as raw LaTeX. The helpers below convert the common
# inline cases to Unicode and split subscripts/superscripts into proper Word
# runs. Display-math (rare; 3 equations in this paper) gets a best-effort
# linearisation and is acceptable for a partner-handoff DOCX; final IEEE
# typesetting is handled by the publisher's LaTeX/MathType pipeline.
LATEX_TOKEN_REPLACEMENTS = [
    # Greek letters (lower)
    (r"\\alpha(?![A-Za-z])", "α"), (r"\\beta(?![A-Za-z])", "β"), (r"\\gamma(?![A-Za-z])", "γ"),
    (r"\\delta(?![A-Za-z])", "δ"), (r"\\epsilon(?![A-Za-z])", "ε"), (r"\\zeta(?![A-Za-z])", "ζ"),
    (r"\\eta(?![A-Za-z])", "η"), (r"\\theta(?![A-Za-z])", "θ"), (r"\\iota(?![A-Za-z])", "ι"),
    (r"\\kappa(?![A-Za-z])", "κ"), (r"\\lambda(?![A-Za-z])", "λ"), (r"\\mu(?![A-Za-z])", "μ"),
    (r"\\nu(?![A-Za-z])", "ν"), (r"\\xi(?![A-Za-z])", "ξ"), (r"\\pi(?![A-Za-z])", "π"),
    (r"\\rho(?![A-Za-z])", "ρ"), (r"\\sigma(?![A-Za-z])", "σ"), (r"\\tau(?![A-Za-z])", "τ"),
    (r"\\phi(?![A-Za-z])", "φ"), (r"\\chi(?![A-Za-z])", "χ"), (r"\\psi(?![A-Za-z])", "ψ"),
    (r"\\omega(?![A-Za-z])", "ω"),
    # Greek letters (upper, only those distinguishable from Latin)
    (r"\\Gamma(?![A-Za-z])", "Γ"), (r"\\Delta(?![A-Za-z])", "Δ"), (r"\\Theta(?![A-Za-z])", "Θ"),
    (r"\\Lambda(?![A-Za-z])", "Λ"), (r"\\Xi(?![A-Za-z])", "Ξ"), (r"\\Pi(?![A-Za-z])", "Π"),
    (r"\\Sigma(?![A-Za-z])", "Σ"), (r"\\Phi(?![A-Za-z])", "Φ"), (r"\\Psi(?![A-Za-z])", "Ψ"),
    (r"\\Omega(?![A-Za-z])", "Ω"),
    # Relations / arrows
    (r"\\leq(?![A-Za-z])", "≤"), (r"\\geq(?![A-Za-z])", "≥"),
    (r"\\neq(?![A-Za-z])", "≠"), (r"\\approx(?![A-Za-z])", "≈"),
    (r"\\equiv(?![A-Za-z])", "≡"), (r"\\sim(?![A-Za-z])", "~"),
    (r"\\to(?![A-Za-z])", "→"), (r"\\rightarrow(?![A-Za-z])", "→"),
    (r"\\leftarrow(?![A-Za-z])", "←"), (r"\\Rightarrow(?![A-Za-z])", "⇒"),
    (r"\\Leftarrow(?![A-Za-z])", "⇐"),
    # Binary operators
    (r"\\times(?![A-Za-z])", "×"), (r"\\cdot(?![A-Za-z])", "·"),
    (r"\\pm(?![A-Za-z])", "±"), (r"\\mp(?![A-Za-z])", "∓"),
    (r"\\div(?![A-Za-z])", "÷"),
    # Misc
    (r"\\infty(?![A-Za-z])", "∞"), (r"\\partial(?![A-Za-z])", "∂"),
    (r"\\sum(?![A-Za-z])", "∑"), (r"\\prod(?![A-Za-z])", "∏"),
    (r"\\int(?![A-Za-z])", "∫"),
    (r"\\ldots(?![A-Za-z])", "…"), (r"\\dots(?![A-Za-z])", "…"),
    # Spacing commands (drop or replace with single space)
    (r"\\,", " "), (r"\\;", " "), (r"\\:", " "),
    (r"\\!", ""), (r"\\ ", " "),
    (r"\\quad(?![A-Za-z])", " "), (r"\\qquad(?![A-Za-z])", " "),
    # Escaped punctuation
    (r"\\%", "%"), (r"\\#", "#"), (r"\\&", "&"),
    (r"\\\$", "$"), (r"\\_", "_"),
]
def _unwrap_command(text, cmd):
    """Repeatedly replace `\\cmd{X}` → `X` until stable."""
    pat = re.compile(r"\\" + cmd + r"\{([^{}]*)\}")
    prev = None
    while prev != text:
        prev = text
        text = pat.sub(r"\1", text)
    return text

MATH_START = "\ue000"  # U+E000: Private Use Area, XML-safe
MATH_END = "\ue001"    # U+E001
def latex_to_unicode(text):
    """Convert a LaTeX-laced markdown paragraph into plain text.

    Math context is preserved with private-use sentinel characters
    (MATH_START / MATH_END) so the downstream run-splitter only treats
    `_X` / `^X` as subscript / superscript inside math regions; in body
    text underscores in identifiers like `signature_analysis` survive.
    """
    if "$" not in text and "\\" not in text:
        return text
    # 1. Strip display-math delimiters first (keep the inner content for
    #    best-effort linearisation), wrapping math regions with sentinels.
    #    Then strip inline math delimiters with the same sentinel wrapping.
    text = re.sub(r"\$\$([\s\S]+?)\$\$",
                  lambda m: MATH_START + m.group(1) + MATH_END, text)
    text = re.sub(r"\$([^$]+?)\$",
                  lambda m: MATH_START + m.group(1) + MATH_END, text)
    # 2. Replace token-level commands with Unicode glyphs *before* unwrapping
    #    `\text{...}` and friends, so that `\Delta\text{BIC}` becomes
    #    `Δ\text{BIC}` (then `ΔBIC`) rather than `\DeltaBIC` which would be
    #    stripped wholesale by the cleanup pass.
    for pat, repl in LATEX_TOKEN_REPLACEMENTS:
        text = re.sub(pat, repl, text)
    # 3. Unwrap formatting / text commands (innermost first via _unwrap loop).
    for cmd in ("text", "mathbf", "mathit", "mathrm", "mathsf", "mathtt",
                "operatorname", "emph", "textbf", "textit"):
        text = _unwrap_command(text, cmd)
    # 4. \frac{a}{b} → (a)/(b); \sqrt{x} → √(x). Apply repeatedly to handle
    #    one level of nesting; deeper nesting is rare in this paper.
    for _ in range(3):
        text = re.sub(
            r"\\t?frac\{([^{}]+)\}\{([^{}]+)\}",
            r"(\1)/(\2)",
            text,
        )
        text = re.sub(r"\\sqrt\{([^{}]+)\}", r"√(\1)", text)
    # 5. TeX braces used purely for spacing/grouping: K{=}3 → K=3,
    #    60{,}448 → 60,448, 10{,}175 → 10,175.
    text = re.sub(r"\{([=<>+\-,])\}", r"\1", text)
    # 6. Strip any remaining `\cmd{...}` (best effort) and `\cmd ` tokens.
    text = re.sub(r"\\[a-zA-Z]+\{([^{}]*)\}", r"\1", text)
    text = re.sub(r"\\[a-zA-Z]+(?![A-Za-z])", "", text)
    # 7. Collapse runs of whitespace introduced by command stripping.
    text = re.sub(r"[ \t]{2,}", " ", text)
    return text

_SUBSUP_PATTERN = re.compile(
    r"_\{([^{}]*)\}"        # _{...}
    r"|\^\{([^{}]*)\}"      # ^{...}
    r"|_([A-Za-z0-9+\-])"   # _X (single token)
    r"|\^([A-Za-z0-9+\-])"  # ^X (single token)
)

def _emit_plain(paragraph, text, font_name, font_size, bold, italic):
    if not text:
        return
    run = paragraph.add_run(text)
    run.font.name = font_name
    run.font.size = font_size
    run.bold = bold
    run.italic = italic


def _emit_math(paragraph, text, font_name, font_size, bold, italic):
    """Emit `text` from a math region: split on `_X` / `_{X}` / `^X` / `^{X}`
    and render those as Word subscripts / superscripts."""
    if "_" not in text and "^" not in text:
        _emit_plain(paragraph, text, font_name, font_size, bold, italic)
        return
    pos = 0
    for m in _SUBSUP_PATTERN.finditer(text):
        if m.start() > pos:
            _emit_plain(paragraph, text[pos:m.start()],
                        font_name, font_size, bold, italic)
        sub_text = m.group(1) or m.group(3)
        sup_text = m.group(2) or m.group(4)
        if sub_text is not None:
            run = paragraph.add_run(sub_text)
            run.font.subscript = True
        else:
            run = paragraph.add_run(sup_text)
            run.font.superscript = True
        run.font.name = font_name
        run.font.size = font_size
        run.bold = bold
        run.italic = italic
        pos = m.end()
    if pos < len(text):
        _emit_plain(paragraph, text[pos:],
                    font_name, font_size, bold, italic)

def add_text_with_subsup(paragraph, text, font_name="Times New Roman",
                         font_size=Pt(10), bold=False, italic=False):
    """Add `text` to `paragraph`. Subscript/superscript handling is scoped to
    math regions delimited by MATH_START / MATH_END sentinels (set up by
    `latex_to_unicode`). Outside math regions, underscores and carets are
    preserved literally so identifiers like `signature_analysis` and
    `paper_a_results_v3.md` survive intact.
    """
    if MATH_START not in text:
        # No math region: everything is plain body text.
        _emit_plain(paragraph, text, font_name, font_size, bold, italic)
        return
    pos = 0
    while pos < len(text):
        s = text.find(MATH_START, pos)
        if s == -1:
            _emit_plain(paragraph, text[pos:],
                        font_name, font_size, bold, italic)
            break
        if s > pos:
            _emit_plain(paragraph, text[pos:s],
                        font_name, font_size, bold, italic)
        e = text.find(MATH_END, s + 1)
        if e == -1:
            # Unterminated math region — emit rest as plain.
            _emit_plain(paragraph, text[s + 1:],
                        font_name, font_size, bold, italic)
            break
        math_body = text[s + 1:e]
        _emit_math(paragraph, math_body, font_name, font_size, bold, italic)
        pos = e + 1

# ---------------------------------------------------------------------------
# Display-equation rendering (matplotlib mathtext → PNG → embedded image)
# ---------------------------------------------------------------------------
# matplotlib mathtext is a subset of LaTeX. A few common TeX-only macros need
# to be substituted with mathtext-supported equivalents before parsing.
_MATHTEXT_SUBS = [
    (re.compile(r"\\tfrac\b"), r"\\frac"),   # text-frac → frac
    (re.compile(r"\\dfrac\b"), r"\\frac"),   # display-frac → frac
    (re.compile(r"\\operatorname\{([^{}]+)\}"),
     lambda m: r"\mathrm{" + m.group(1) + "}"),  # operatorname → mathrm
    (re.compile(r"\\,"), " "),  # thin space
    (re.compile(r"\\;"), " "),
    (re.compile(r"\\!"), ""),
]


def _sanitise_for_mathtext(latex: str) -> str:
    out = latex
    for pat, repl in _MATHTEXT_SUBS:
        out = pat.sub(repl, out)
    return out

def render_equation_png(latex: str, fontsize: int = 14) -> Path:
    """Render a LaTeX math expression to a tightly-cropped PNG using
    matplotlib mathtext, with content-addressed caching so a re-build only
    re-renders changed equations. Returns the cached PNG path."""
    sanitised = _sanitise_for_mathtext(latex.strip())
    digest = hashlib.sha1(
        (sanitised + f"|fs{fontsize}").encode("utf-8")).hexdigest()[:16]
    out_path = EQUATION_CACHE_DIR / f"eq_{digest}.png"
    if out_path.exists():
        return out_path
    fig = plt.figure(figsize=(8, 1.6))
    fig.text(0.5, 0.5, f"${sanitised}$",
             fontsize=fontsize, ha="center", va="center")
    fig.savefig(str(out_path), dpi=220, bbox_inches="tight",
                pad_inches=0.05)
    plt.close(fig)
    return out_path

def add_equation_block(doc, latex: str, equation_number: int,
                       width_inches: float = 4.5):
    """Insert a centered display equation (rendered as PNG) followed by
    a right-aligned equation number `(N)`. Width keeps the equation
    visually proportional within the IEEE Access body column."""
    img_path = render_equation_png(latex)
    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p.paragraph_format.space_before = Pt(6)
    p.paragraph_format.space_after = Pt(6)
    run = p.add_run()
    run.add_picture(str(img_path), width=Inches(width_inches))
    # Equation number on the same paragraph, tab-aligned to the right.
    num_run = p.add_run(f"\t({equation_number})")
    num_run.font.name = "Times New Roman"
    num_run.font.size = Pt(10)

def add_md_table(doc, table_lines):
    rows_data = []
    for line in table_lines:
        cells = [c.strip() for c in line.strip("|").split("|")]
        if not re.match(r"^[-: ]+$", cells[0]):
            rows_data.append(cells)
    if len(rows_data) < 2:
        return
    ncols = len(rows_data[0])
    table = doc.add_table(rows=len(rows_data), cols=ncols)
    table.style = "Table Grid"
    for r_idx, row in enumerate(rows_data):
        for c_idx in range(min(len(row), ncols)):
            cell = table.rows[r_idx].cells[c_idx]
            raw = row[c_idx]
            # Strip markdown emphasis markers; convert LaTeX before rendering.
            raw = re.sub(r"\*\*\*(.+?)\*\*\*", r"\1", raw)
            raw = re.sub(r"\*\*(.+?)\*\*", r"\1", raw)
            raw = re.sub(r"\*(.+?)\*", r"\1", raw)
            raw = re.sub(r"`(.+?)`", r"\1", raw)
            cell_text = latex_to_unicode(raw)
            # Replace the default empty paragraph with one we control.
            cell.text = ""
            cp = cell.paragraphs[0]
            cp.alignment = WD_ALIGN_PARAGRAPH.CENTER
            add_text_with_subsup(
                cp, cell_text,
                font_name="Times New Roman",
                font_size=Pt(8),
                bold=(r_idx == 0),
            )
    doc.add_paragraph()

def _insert_figures(doc, para_text):
    for trigger, (fig_path, caption, width) in FIGURES.items():
        if trigger in para_text and Path(fig_path).exists():
            fp = doc.add_paragraph()
            fp.alignment = WD_ALIGN_PARAGRAPH.CENTER
            fr = fp.add_run()
            fr.add_picture(str(fig_path), width=Inches(width))
            cp = doc.add_paragraph()
            cp.alignment = WD_ALIGN_PARAGRAPH.CENTER
            cr = cp.add_run(caption)
            cr.font.size = Pt(9)
            cr.font.name = "Times New Roman"
            cr.italic = True

def process_section(doc, filepath, equation_counter=None):
    """Process one v3 markdown section. `equation_counter` is a single-element
    list (used as a mutable counter shared across sections) tracking the
    running display-equation number."""
    if equation_counter is None:
        equation_counter = [0]
    text = filepath.read_text(encoding="utf-8")
    text = strip_comments(text)
    lines = text.split("\n")
    # Defensive blockquote handling: markdown blockquote lines (`> body`) are
    # not rendered as Word callout blocks here, but stripping the leading
    # `> ` keeps the body text from leaking the literal `>` and the empty
    # `>` separator lines into the DOCX.
    cleaned = []
    for ln in lines:
        s = ln.lstrip()
        if s == ">" or s.startswith("> "):
            cleaned.append(ln[ln.index(">") + 1:].lstrip() if "> " in ln else "")
        else:
            cleaned.append(ln)
    lines = cleaned
    i = 0
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()
        if not stripped:
            i += 1
            continue
        if stripped.startswith("# "):
            h = doc.add_heading(
                latex_to_unicode(stripped[2:]).replace(MATH_START, "").replace(MATH_END, ""),
                level=1)
            for run in h.runs:
                run.font.color.rgb = RGBColor(0, 0, 0)
            i += 1
            continue
        if stripped.startswith("## "):
            h = doc.add_heading(
                latex_to_unicode(stripped[3:]).replace(MATH_START, "").replace(MATH_END, ""),
                level=2)
            for run in h.runs:
                run.font.color.rgb = RGBColor(0, 0, 0)
            i += 1
            continue
        if stripped.startswith("### "):
            h = doc.add_heading(
                latex_to_unicode(stripped[4:]).replace(MATH_START, "").replace(MATH_END, ""),
                level=3)
            for run in h.runs:
                run.font.color.rgb = RGBColor(0, 0, 0)
            i += 1
            continue
        if stripped.startswith("__TABLE_CAPTION__:"):
            caption_text = stripped[len("__TABLE_CAPTION__:"):].strip()
            caption_text = latex_to_unicode(caption_text)
            cp = doc.add_paragraph()
            cp.alignment = WD_ALIGN_PARAGRAPH.CENTER
            cp.paragraph_format.space_before = Pt(6)
            cp.paragraph_format.space_after = Pt(2)
            add_text_with_subsup(
                cp, caption_text,
                font_name="Times New Roman",
                font_size=Pt(9),
                bold=True,
            )
            i += 1
            continue
        if "|" in stripped and i + 1 < len(lines) and re.match(r"\s*\|[-|: ]+\|", lines[i + 1]):
            table_lines = []
            while i < len(lines) and "|" in lines[i]:
                table_lines.append(lines[i])
                i += 1
            add_md_table(doc, table_lines)
            continue
        # Display math: a line starting with `$$` is treated as a single-line
        # equation block and rendered as an embedded mathtext PNG with an
        # auto-incrementing equation number.
        if stripped.startswith("$$"):
            # Accumulate until a closing $$ is found (single line in our
            # corpus, but defensively support multi-line just in case).
            buf = [stripped]
            if not (stripped.count("$$") >= 2 and stripped.endswith("$$")):
                while i + 1 < len(lines):
                    i += 1
                    buf.append(lines[i])
                    if "$$" in lines[i]:
                        break
            joined = "\n".join(buf).strip()
            # Strip the leading and trailing $$ delimiters and any trailing
            # punctuation (e.g. the `,` that some equation lines end with).
            inner = joined
            if inner.startswith("$$"):
                inner = inner[2:]
            if inner.endswith("$$"):
                inner = inner[:-2]
            inner = inner.rstrip(", ")
            equation_counter[0] += 1
            try:
                add_equation_block(doc, inner, equation_counter[0])
            except Exception as exc:
                # Fallback: render as plain centered Times-Roman line so the
                # build doesn't fail on a single un-renderable equation.
                p = doc.add_paragraph()
                p.alignment = WD_ALIGN_PARAGRAPH.CENTER
                run = p.add_run(f"[equation render failed: {exc}] {inner}")
                run.font.name = "Times New Roman"
                run.font.size = Pt(10)
                run.italic = True
            i += 1
            continue
        if re.match(r"^\d+\.\s", stripped):
            # Manual numbering: keep the number from the markdown source and
            # apply a hanging-indent paragraph format. Avoids python-docx's
            # `style='List Number'` which depends on a properly-set-up
            # numbering definition that the default Document() lacks.
            m = re.match(r"^(\d+)\.\s+(.*)$", stripped)
            num, content = m.group(1), m.group(2)
            p = doc.add_paragraph()
            p.paragraph_format.left_indent = Inches(0.4)
            p.paragraph_format.first_line_indent = Inches(-0.25)
            p.paragraph_format.space_after = Pt(4)
            content = re.sub(r"\*\*\*(.+?)\*\*\*", r"\1", content)
            content = re.sub(r"\*\*(.+?)\*\*", r"\1", content)
            content = re.sub(r"\*(.+?)\*", r"\1", content)
            content = re.sub(r"`(.+?)`", r"\1", content)
            content = latex_to_unicode(content)
            add_text_with_subsup(p, f"{num}. {content}")
            i += 1
            continue
        if stripped.startswith("- "):
            # Manual bullets with hanging indent (same rationale as numbered).
            p = doc.add_paragraph()
            p.paragraph_format.left_indent = Inches(0.4)
            p.paragraph_format.first_line_indent = Inches(-0.25)
            p.paragraph_format.space_after = Pt(4)
            content = stripped[2:]
            content = re.sub(r"\*\*\*(.+?)\*\*\*", r"\1", content)
            content = re.sub(r"\*\*(.+?)\*\*", r"\1", content)
            content = re.sub(r"\*(.+?)\*", r"\1", content)
            content = re.sub(r"`(.+?)`", r"\1", content)
            content = latex_to_unicode(content)
            add_text_with_subsup(p, content)
            i += 1
            continue
        # Regular paragraph
        para_lines = [stripped]
        i += 1
        while i < len(lines):
            nxt = lines[i].strip()
            if (
                not nxt
                or nxt.startswith("#")
                or nxt.startswith("|")
                or nxt.startswith("- ")
                or re.match(r"^\d+\.\s", nxt)
            ):
                break
            para_lines.append(nxt)
            i += 1
        para_text = " ".join(para_lines)
        para_text = re.sub(r"\*\*\*(.+?)\*\*\*", r"\1", para_text)
        para_text = re.sub(r"\*\*(.+?)\*\*", r"\1", para_text)
        para_text = re.sub(r"\*(.+?)\*", r"\1", para_text)
        para_text = re.sub(r"`(.+?)`", r"\1", para_text)
        para_text = para_text.replace("---", "\u2014")
        para_text = latex_to_unicode(para_text)
        p = doc.add_paragraph()
        p.paragraph_format.space_after = Pt(6)
        add_text_with_subsup(p, para_text)
        _insert_figures(doc, para_text)

def main():
    doc = Document()
    style = doc.styles["Normal"]
    style.font.name = "Times New Roman"
    style.font.size = Pt(10)

    # Title page
    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p.paragraph_format.space_after = Pt(12)
    run = p.add_run(
        "Automated Identification of Non-Hand-Signed Auditor Signatures\n"
        "in Large-Scale Financial Audit Reports:\n"
        "A Dual-Descriptor Framework with Replication-Dominated Calibration"
    )
    run.font.size = Pt(16)
    run.font.name = "Times New Roman"
    run.bold = True

    # IEEE Access uses single-anonymized review: author / affiliation
    # / corresponding-author block must appear on the title page in the
    # final submission. Fill these placeholders with real metadata
    # before submitting the generated DOCX.
    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p.paragraph_format.space_after = Pt(6)
    run = p.add_run("[AUTHOR NAMES — fill in before submission]")
    run.font.size = Pt(11)

    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p.paragraph_format.space_after = Pt(6)
    run = p.add_run("[Affiliations and corresponding-author email — fill in before submission]")
    run.font.size = Pt(10)
    run.italic = True

    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p.paragraph_format.space_after = Pt(20)
    run = p.add_run("Target journal: IEEE Access (Regular Paper, single-anonymized review)")
    run.font.size = Pt(10)
    run.italic = True

    equation_counter = [0]
    for section_file in SECTIONS:
        filepath = PAPER_DIR / section_file
        if filepath.exists():
            process_section(doc, filepath, equation_counter=equation_counter)
        else:
            print(f"WARNING: missing section file: {filepath}")
    doc.save(str(OUTPUT))
    print(f"Saved: {OUTPUT}")
    _run_linter()

def _run_linter():
    """Run the leak linter on the freshly built DOCX. Non-fatal: prints a
    summary line. For full output run `python3 paper/lint_paper_v3.py`."""
    try:
        import lint_paper_v3  # local module
    except Exception as exc:  # pragma: no cover
        print(f"(lint skipped: {exc})")
        return
    findings = lint_paper_v3.lint_docx(OUTPUT)
    errors = sum(1 for f in findings if f.severity == "ERROR")
    warns = sum(1 for f in findings if f.severity == "WARN")
    infos = sum(1 for f in findings if f.severity == "INFO")
    if errors:
        print(f"\n[lint] {errors} ERROR finding(s) in DOCX — run "
              f"`python3 paper/lint_paper_v3.py --docx` for details.")
    elif warns or infos:
        print(f"[lint] DOCX clean of ERRORs ({warns} WARN, {infos} INFO).")
    else:
        print("[lint] DOCX clean.")


if __name__ == "__main__":
    main()