pdf_signature_extraction/paper/export_v3.py
gbanyan 552b6b80d4 Paper A v3.7: demote BD/McCrary to density-smoothness diagnostic; add Appendix A
Implements codex gpt-5.4 recommendation (paper/codex_bd_mccrary_opinion.md,
"option (c) hybrid"): demote BD/McCrary in the main text from a co-equal
threshold estimator to a density-smoothness diagnostic, and add a
bin-width sensitivity appendix as an audit trail.

Why: the bin-width sweep (Script 25) confirms that at the signature
level the BD transition drifts monotonically with bin width (Firm A
cosine: 0.987 -> 0.985 -> 0.980 -> 0.975 as bin width widens 0.003 ->
0.015; full-sample dHash transitions drift from 2 to 10 to 9 across
bin widths 1 / 2 / 3) and Z statistics inflate superlinearly with bin
width, both characteristic of a histogram-resolution artifact. At the
accountant level the BD null is robust across the sweep. The paper's
earlier "three methodologically distinct estimators" framing therefore
could not be defended to an IEEE Access reviewer once the sweep was
run.
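The inflation pattern can be illustrated with a toy sweep. This is a minimal sketch on synthetic data with a deliberately simplified adjacent-bin Z statistic (`transition_z` is hypothetical; Script 25's actual BD/McCrary estimator is more involved): on a perfectly smooth density, widening the bins still changes the statistic, which is exactly why a bin-width-dependent Z is treated as a resolution artifact rather than a real discontinuity.

```python
import numpy as np

def transition_z(values, threshold, bin_width):
    """Simplified density-discontinuity Z at `threshold`: compare counts
    in the single bin just below vs. just above, treating each bin count
    as Poisson (normal approximation for the difference)."""
    lo = values[(values >= threshold - bin_width) & (values < threshold)]
    hi = values[(values >= threshold) & (values < threshold + bin_width)]
    n_lo, n_hi = len(lo), len(hi)
    if n_lo + n_hi == 0:
        return 0.0
    # Under a smooth density, adjacent bins have (nearly) equal expected
    # counts, so a large |Z| that drifts with bin width is suspect.
    return (n_hi - n_lo) / np.sqrt(n_lo + n_hi)

rng = np.random.default_rng(0)
smooth = rng.beta(8, 2, size=20_000)  # smooth right-skewed similarity scores
for w in (0.003, 0.006, 0.010, 0.015):
    z = transition_z(smooth, threshold=0.975, bin_width=w)
    print(f"bin width {w:.3f}: Z = {z:+.2f}")
```

On this smooth Beta sample, |Z| grows as the bins widen even though there is no discontinuity at 0.975: wider bins integrate more of the density's local slope into the count difference, mimicking the superlinear inflation seen in the real sweep.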

Added
- signature_analysis/25_bd_mccrary_sensitivity.py: bin-width sweep
  across 6 variants (Firm A / full-sample / accountant-level, each
  cosine + dHash_indep) and 3-4 bin widths per variant. Reports
  Z_below, Z_above, p-values, and number of significant transitions
  per cell. Writes reports/bd_sensitivity/bd_sensitivity.{json,md}.
- paper/paper_a_appendix_v3.md: new "Appendix A. BD/McCrary Bin-Width
  Sensitivity" with Table A.I (all 20 sensitivity cells) and
  interpretation linking the empirical pattern to the main-text
  framing decision.
- export_v3.py: appendix inserted into SECTIONS between conclusion
  and references.
- paper/codex_bd_mccrary_opinion.md: codex gpt-5.4 recommendation
  captured verbatim for audit trail.
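The per-cell report shape can be sketched roughly as follows (the `sweep` data, variant names, and `two_sided_p` helper are hypothetical illustrations; the real Script 25 derives Z_below/Z_above from the BD/McCrary estimator and writes both JSON and Markdown):

```python
import json
import math

def two_sided_p(z):
    """Two-sided p-value for a Z statistic under a standard normal null."""
    return math.erfc(abs(z) / math.sqrt(2))

# Hypothetical sweep results: variant -> bin width -> Z per detected transition.
sweep = {
    "firm_a_cosine": {0.003: [2.1], 0.015: [4.8, 3.3]},
    "accountant_cosine": {0.003: [0.4], 0.015: [0.9]},
}

# One cell per (variant, bin width): Z values, p-values, and the count of
# transitions significant at the 5% level.
report = {
    variant: {
        str(width): {
            "z": zs,
            "p": [round(two_sided_p(z), 4) for z in zs],
            "n_significant": sum(two_sided_p(z) < 0.05 for z in zs),
        }
        for width, zs in by_width.items()
    }
    for variant, by_width in sweep.items()
}
print(json.dumps(report, indent=2))
```

A cell whose `n_significant` holds steady across bin widths (as in the accountant-level null) supports robustness; a cell whose count grows with bin width is the artifact signature the appendix documents.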

Main-text reframing
- Abstract: "three methodologically distinct estimators" ->
  "two estimators plus a Burgstahler-Dichev/McCrary density-
  smoothness diagnostic". Trimmed to 243 words.
- Introduction: related-work summary, pipeline step 5, accountant-
  level convergence sentence, contribution 4, and section-outline
  line all updated. Contribution 4 renamed to "Convergent threshold
  framework with a smoothness diagnostic".
- Methodology III-I: section renamed to "Convergent Threshold
  Determination with a Density-Smoothness Diagnostic". "Method 2:
  BD/McCrary Discontinuity" converted to "Density-Smoothness
  Diagnostic" in a new subsection; Method 3 (Beta mixture) renumbered
  to Method 2. Subsections 4 and 5 updated to refer to "two threshold
  estimators" with BD as diagnostic.
- Methodology III-A pipeline overview: "three methodologically
  distinct statistical methods" -> "two methodologically distinct
  threshold estimators complemented by a density-smoothness
  diagnostic".
- Methodology III-L: "three-method analysis" -> "accountant-level
  threshold analysis (KDE antimode, Beta-2 crossing, logit-Gaussian
  robustness crossing)".
- Results IV-D.1 heading: "BD/McCrary Discontinuity" ->
  "BD/McCrary Density-Smoothness Diagnostic". Prose now notes the
  Appendix-A bin-width instability explicitly.
- Results IV-E: Table VIII restructured to label BD rows
  "(diagnostic only; bin-unstable)" and "(diagnostic; null across
  Appendix A)". Summary sentence rewritten to frame BD null as
  evidence for clustered-but-smoothly-mixed rather than as a
  convergence failure. Table cosine P5 row corrected from 0.941 to
  0.9407 to match III-K.
- Results IV-G.3 and IV-I.2: "three-method convergence/thresholds"
  -> "accountant-level convergent thresholds" (clarifies the 3
  converging estimates are KDE antimode, Beta-2, logit-Gaussian,
  not KDE/BD/Beta).
- Discussion V-B: "three-method framework" -> "convergent threshold
  framework".
- Conclusion: "three methodologically distinct methods" -> "two
  threshold estimators and a density-smoothness diagnostic";
  contribution 3 restated; future-work sentence updated.
- Impact Statement (archived): "three methodologically distinct
  threshold-selection methods" -> "two methodologically distinct
  threshold estimators plus a density-smoothness diagnostic" so the
  archived text is internally consistent if reused.

Discussion V-B / V-G already framed BD as a diagnostic in v3.5
(unchanged in this commit). The reframing therefore brings Abstract /
Introduction / Methodology / Results / Conclusion into alignment with
the Discussion framing that codex had already endorsed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-21 14:32:50 +08:00


#!/usr/bin/env python3
"""Export Paper A v3 (IEEE Access target) to Word, reading from v3 md section files."""
from docx import Document
from docx.shared import Inches, Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from pathlib import Path
import re
PAPER_DIR = Path("/Volumes/NV2/pdf_recognize/paper")
FIG_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis/paper_figures")
EXTRA_FIG_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis/reports")
OUTPUT = PAPER_DIR / "Paper_A_IEEE_Access_Draft_v3.docx"
SECTIONS = [
    "paper_a_abstract_v3.md",
    # paper_a_impact_statement_v3.md removed: not a standard IEEE Access
    # Regular Paper section. Content folded into cover letter / abstract.
    "paper_a_introduction_v3.md",
    "paper_a_related_work_v3.md",
    "paper_a_methodology_v3.md",
    "paper_a_results_v3.md",
    "paper_a_discussion_v3.md",
    "paper_a_conclusion_v3.md",
    # Appendix A: BD/McCrary bin-width sensitivity (see v3.7 notes).
    "paper_a_appendix_v3.md",
    "paper_a_references_v3.md",
]
# Figure insertion hooks (trigger phrase -> (file, caption, width inches)).
# New figures for v3: dip test, BD/McCrary overlays, accountant GMM 2D + marginals.
FIGURES = {
    "Fig. 1 illustrates": (
        FIG_DIR / "fig1_pipeline.png",
        "Fig. 1. Pipeline architecture for automated non-hand-signed signature detection.",
        6.5,
    ),
    "Fig. 2 presents the cosine similarity distributions for intra-class": (
        FIG_DIR / "fig2_intra_inter_kde.png",
        "Fig. 2. Cosine similarity distributions: intra-class vs. inter-class with KDE crossover at 0.837.",
        3.5,
    ),
    "Fig. 3 presents the per-signature cosine and dHash distributions of Firm A": (
        FIG_DIR / "fig3_firm_a_calibration.png",
        "Fig. 3. Firm A per-signature cosine and dHash distributions against the overall CPA population.",
        3.5,
    ),
    "Fig. 4 visualizes the accountant-level clusters": (
        EXTRA_FIG_DIR / "accountant_mixture" / "accountant_mixture_2d.png",
        "Fig. 4. Accountant-level 3-component Gaussian mixture in the (cosine-mean, dHash-mean) plane.",
        4.5,
    ),
    "conducted an ablation study comparing three": (
        FIG_DIR / "fig4_ablation.png",
        "Fig. 5. Ablation study comparing three feature extraction backbones.",
        6.5,
    ),
}
def strip_comments(text):
    """Remove HTML comments (used for editorial notes in the md sources)."""
    return re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
def add_md_table(doc, table_lines):
    """Convert a block of markdown table lines into a styled Word table."""
    rows_data = []
    for line in table_lines:
        cells = [c.strip() for c in line.strip("|").split("|")]
        # Skip the markdown header-separator row (e.g. |---|---|).
        if not re.match(r"^[-: ]+$", cells[0]):
            rows_data.append(cells)
    if len(rows_data) < 2:
        return
    ncols = len(rows_data[0])
    table = doc.add_table(rows=len(rows_data), cols=ncols)
    table.style = "Table Grid"
    for r_idx, row in enumerate(rows_data):
        for c_idx in range(min(len(row), ncols)):
            cell = table.rows[r_idx].cells[c_idx]
            cell.text = row[c_idx]
            for p in cell.paragraphs:
                p.alignment = WD_ALIGN_PARAGRAPH.CENTER
                for run in p.runs:
                    run.font.size = Pt(8)
                    run.font.name = "Times New Roman"
                    if r_idx == 0:
                        run.bold = True
    doc.add_paragraph()
def _insert_figures(doc, para_text):
    """Insert any figure whose trigger phrase appears in the paragraph."""
    for trigger, (fig_path, caption, width) in FIGURES.items():
        if trigger in para_text and Path(fig_path).exists():
            fp = doc.add_paragraph()
            fp.alignment = WD_ALIGN_PARAGRAPH.CENTER
            fr = fp.add_run()
            fr.add_picture(str(fig_path), width=Inches(width))
            cp = doc.add_paragraph()
            cp.alignment = WD_ALIGN_PARAGRAPH.CENTER
            cr = cp.add_run(caption)
            cr.font.size = Pt(9)
            cr.font.name = "Times New Roman"
            cr.italic = True
def process_section(doc, filepath):
    """Parse one markdown section file and append it to the document."""
    text = filepath.read_text(encoding="utf-8")
    text = strip_comments(text)
    lines = text.split("\n")
    i = 0
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()
        if not stripped:
            i += 1
            continue
        if stripped.startswith("# "):
            h = doc.add_heading(stripped[2:], level=1)
            for run in h.runs:
                run.font.color.rgb = RGBColor(0, 0, 0)
            i += 1
            continue
        if stripped.startswith("## "):
            h = doc.add_heading(stripped[3:], level=2)
            for run in h.runs:
                run.font.color.rgb = RGBColor(0, 0, 0)
            i += 1
            continue
        if stripped.startswith("### "):
            h = doc.add_heading(stripped[4:], level=3)
            for run in h.runs:
                run.font.color.rgb = RGBColor(0, 0, 0)
            i += 1
            continue
        # Markdown table: current line has pipes, next line is the separator.
        if "|" in stripped and i + 1 < len(lines) and re.match(r"\s*\|[-|: ]+\|", lines[i + 1]):
            table_lines = []
            while i < len(lines) and "|" in lines[i]:
                table_lines.append(lines[i])
                i += 1
            add_md_table(doc, table_lines)
            continue
        if re.match(r"^\d+\.\s", stripped):
            p = doc.add_paragraph(style="List Number")
            content = re.sub(r"^\d+\.\s", "", stripped)
            content = re.sub(r"\*\*(.+?)\*\*", r"\1", content)
            run = p.add_run(content)
            run.font.size = Pt(10)
            run.font.name = "Times New Roman"
            i += 1
            continue
        if stripped.startswith("- "):
            p = doc.add_paragraph(style="List Bullet")
            content = stripped[2:]
            content = re.sub(r"\*\*(.+?)\*\*", r"\1", content)
            run = p.add_run(content)
            run.font.size = Pt(10)
            run.font.name = "Times New Roman"
            i += 1
            continue
        # Regular paragraph: join consecutive non-blank, non-block lines.
        para_lines = [stripped]
        i += 1
        while i < len(lines):
            nxt = lines[i].strip()
            if (
                not nxt
                or nxt.startswith("#")
                or nxt.startswith("|")
                or nxt.startswith("- ")
                or re.match(r"^\d+\.\s", nxt)
            ):
                break
            para_lines.append(nxt)
            i += 1
        para_text = " ".join(para_lines)
        # Strip inline markdown emphasis/code; triple-star first so the
        # bold/italic passes do not leave stray asterisks behind.
        para_text = re.sub(r"\*\*\*(.+?)\*\*\*", r"\1", para_text)
        para_text = re.sub(r"\*\*(.+?)\*\*", r"\1", para_text)
        para_text = re.sub(r"\*(.+?)\*", r"\1", para_text)
        para_text = re.sub(r"`(.+?)`", r"\1", para_text)
        para_text = para_text.replace("$$", "")
        para_text = para_text.replace("---", "\u2014")
        p = doc.add_paragraph()
        p.paragraph_format.space_after = Pt(6)
        run = p.add_run(para_text)
        run.font.size = Pt(10)
        run.font.name = "Times New Roman"
        _insert_figures(doc, para_text)
def main():
    doc = Document()
    style = doc.styles["Normal"]
    style.font.name = "Times New Roman"
    style.font.size = Pt(10)
    # Title page
    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p.paragraph_format.space_after = Pt(12)
    run = p.add_run(
        "Automated Identification of Non-Hand-Signed Auditor Signatures\n"
        "in Large-Scale Financial Audit Reports:\n"
        "A Dual-Descriptor Framework with Three-Method Convergent Thresholding"
    )
    run.font.size = Pt(16)
    run.font.name = "Times New Roman"
    run.bold = True
    # IEEE Access uses single-anonymized review: author / affiliation
    # / corresponding-author block must appear on the title page in the
    # final submission. Fill these placeholders with real metadata
    # before submitting the generated DOCX.
    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p.paragraph_format.space_after = Pt(6)
    run = p.add_run("[AUTHOR NAMES — fill in before submission]")
    run.font.size = Pt(11)
    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p.paragraph_format.space_after = Pt(6)
    run = p.add_run("[Affiliations and corresponding-author email — fill in before submission]")
    run.font.size = Pt(10)
    run.italic = True
    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p.paragraph_format.space_after = Pt(20)
    run = p.add_run("Target journal: IEEE Access (Regular Paper, single-anonymized review)")
    run.font.size = Pt(10)
    run.italic = True
    for section_file in SECTIONS:
        filepath = PAPER_DIR / section_file
        if filepath.exists():
            process_section(doc, filepath)
        else:
            print(f"WARNING: missing section file: {filepath}")
    doc.save(str(OUTPUT))
    print(f"Saved: {OUTPUT}")


if __name__ == "__main__":
    main()