Paper A v3: full rewrite for IEEE Access with three-method convergence

Major changes from v2: Terminology: - "digitally replicated" -> "non-hand-signed" throughout (per partner v3 feedback and to avoid implicit accusation) - "Firm A near-universal non-hand-signing" -> "replication-dominated" (per interview nuance: most but not all Firm A partners use replication) Target journal: IEEE TAI -> IEEE Access (per NCKU CSIE list) New methodological sections (III.G-III.L + IV.D-IV.G): - Three convergent threshold methods (KDE antimode + Hartigan dip test / Burgstahler-Dichev McCrary / EM-fitted Beta mixture + logit-GMM robustness check) - Explicit unit-of-analysis discussion (signature vs accountant) - Accountant-level 2D Gaussian mixture (BIC-best K=3 found empirically) - Pixel-identity validation anchor (no manual annotation needed) - Low-similarity negative anchor + Firm A replication-dominated anchor New empirical findings integrated: - Firm A signature cosine UNIMODAL (dip p=0.17) - long left tail = minority hand-signers - Full-sample cosine MULTIMODAL but not cleanly bimodal (BIC prefers 3-comp mixture) - signature-level is continuous quality spectrum - Accountant-level mixture trimodal (C1 Deloitte-heavy 139/141, C2 other Big-4, C3 smaller firms). 2-comp crossings cos=0.945, dh=8.10 - Pixel-identity anchor (310 pairs) gives perfect recall at all cosine thresholds - Firm A anchor rates: cos>0.95=92.5%, dual-rule cos>0.95 AND dh<=8=89.95% New discussion section V.B: "Continuous-quality spectrum vs discrete-behavior regimes" - the core interpretive contribution of v3. References added: Hartigan & Hartigan 1985, Burgstahler & Dichev 1997, McCrary 2008, Dempster-Laird-Rubin 1977, White 1982 (refs 37-41). export_v3.py builds Paper_A_IEEE_Access_Draft_v3.docx (462 KB, +40% vs v2 from expanded methodology + results sections). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-21 00:14:47 +08:00
parent 68689c9f9b
commit 9b11f03548
11 changed files with 1148 additions and 0 deletions
@@ -0,0 +1,233 @@
+#!/usr/bin/env python3
+"""Export Paper A v3 (IEEE Access target) to Word, reading from v3 md section files."""
+
+from docx import Document
+from docx.shared import Inches, Pt, RGBColor
+from docx.enum.text import WD_ALIGN_PARAGRAPH
+from pathlib import Path
+import re
+
+# Directory holding the v3 markdown section files; OUTPUT is written here too.
+PAPER_DIR = Path("/Volumes/NV2/pdf_recognize/paper")
+# Figure directories.
+# NOTE(review): both sit under "PDF-Processing" while PAPER_DIR sits under
+# "pdf_recognize" -- confirm that the two separate trees are intentional.
+FIG_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis/paper_figures")
+EXTRA_FIG_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis/reports")
+# Destination path of the generated Word document.
+OUTPUT = PAPER_DIR / "Paper_A_IEEE_Access_Draft_v3.docx"
+
+# Markdown section files, listed in the order they are appended to the
+# document. Each entry is a file name resolved against PAPER_DIR.
+SECTIONS = [
+    "paper_a_abstract_v3.md",
+    "paper_a_impact_statement_v3.md",
+    "paper_a_introduction_v3.md",
+    "paper_a_related_work_v3.md",
+    "paper_a_methodology_v3.md",
+    "paper_a_results_v3.md",
+    "paper_a_discussion_v3.md",
+    "paper_a_conclusion_v3.md",
+    "paper_a_references_v3.md",
+]
+
+# Figure insertion hooks (trigger phrase -> (file, caption, width inches)).
+# The trigger is matched as a plain substring of a paragraph's text, so it
+# must reproduce the wording in the markdown exactly.
+# NOTE(review): of the figures described as new for v3 (dip test, BD/McCrary
+# overlays, accountant GMM 2D + marginals) only the accountant GMM 2D plot
+# is wired in below -- confirm whether the others still need hooks.
+FIGURES = {
+    "Fig. 1 illustrates": (
+        FIG_DIR / "fig1_pipeline.png",
+        "Fig. 1. Pipeline architecture for automated non-hand-signed signature detection.",
+        6.5,
+    ),
+    "Fig. 2 presents the cosine similarity distributions for intra-class": (
+        FIG_DIR / "fig2_intra_inter_kde.png",
+        "Fig. 2. Cosine similarity distributions: intra-class vs. inter-class with KDE crossover at 0.837.",
+        3.5,
+    ),
+    "Fig. 3 presents the per-signature cosine and dHash distributions of Firm A": (
+        FIG_DIR / "fig3_firm_a_calibration.png",
+        "Fig. 3. Firm A per-signature cosine and dHash distributions against the overall CPA population.",
+        3.5,
+    ),
+    # The only entry sourced from EXTRA_FIG_DIR rather than FIG_DIR.
+    "Fig. 4 visualizes the accountant-level clusters": (
+        EXTRA_FIG_DIR / "accountant_mixture" / "accountant_mixture_2d.png",
+        "Fig. 4. Accountant-level 3-component Gaussian mixture in the (cosine-mean, dHash-mean) plane.",
+        4.5,
+    ),
+    # Captioned "Fig. 5" although the image file is named fig4_ablation.png
+    # (presumably the file keeps an earlier figure number -- TODO confirm).
+    "conducted an ablation study comparing three": (
+        FIG_DIR / "fig4_ablation.png",
+        "Fig. 5. Ablation study comparing three feature extraction backbones.",
+        6.5,
+    ),
+}
+
+
+def strip_comments(text):
+    return re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
+
+
+def add_md_table(doc, table_lines):
+    rows_data = []
+    for line in table_lines:
+        cells = [c.strip() for c in line.strip("|").split("|")]
+        if not re.match(r"^[-: ]+$", cells[0]):
+            rows_data.append(cells)
+    if len(rows_data) < 2:
+        return
+    ncols = len(rows_data[0])
+    table = doc.add_table(rows=len(rows_data), cols=ncols)
+    table.style = "Table Grid"
+    for r_idx, row in enumerate(rows_data):
+        for c_idx in range(min(len(row), ncols)):
+            cell = table.rows[r_idx].cells[c_idx]
+            cell.text = row[c_idx]
+            for p in cell.paragraphs:
+                p.alignment = WD_ALIGN_PARAGRAPH.CENTER
+                for run in p.runs:
+                    run.font.size = Pt(8)
+                    run.font.name = "Times New Roman"
+                    if r_idx == 0:
+                        run.bold = True
+    doc.add_paragraph()
+
+
+def _insert_figures(doc, para_text):
+    for trigger, (fig_path, caption, width) in FIGURES.items():
+        if trigger in para_text and Path(fig_path).exists():
+            fp = doc.add_paragraph()
+            fp.alignment = WD_ALIGN_PARAGRAPH.CENTER
+            fr = fp.add_run()
+            fr.add_picture(str(fig_path), width=Inches(width))
+            cp = doc.add_paragraph()
+            cp.alignment = WD_ALIGN_PARAGRAPH.CENTER
+            cr = cp.add_run(caption)
+            cr.font.size = Pt(9)
+            cr.font.name = "Times New Roman"
+            cr.italic = True
+
+
+def process_section(doc, filepath):
+    text = filepath.read_text(encoding="utf-8")
+    text = strip_comments(text)
+    lines = text.split("\n")
+    i = 0
+    while i < len(lines):
+        line = lines[i]
+        stripped = line.strip()
+        if not stripped:
+            i += 1
+            continue
+        if stripped.startswith("# "):
+            h = doc.add_heading(stripped[2:], level=1)
+            for run in h.runs:
+                run.font.color.rgb = RGBColor(0, 0, 0)
+            i += 1
+            continue
+        if stripped.startswith("## "):
+            h = doc.add_heading(stripped[3:], level=2)
+            for run in h.runs:
+                run.font.color.rgb = RGBColor(0, 0, 0)
+            i += 1
+            continue
+        if stripped.startswith("### "):
+            h = doc.add_heading(stripped[4:], level=3)
+            for run in h.runs:
+                run.font.color.rgb = RGBColor(0, 0, 0)
+            i += 1
+            continue
+        if "|" in stripped and i + 1 < len(lines) and re.match(r"\s*\|[-|: ]+\|", lines[i + 1]):
+            table_lines = []
+            while i < len(lines) and "|" in lines[i]:
+                table_lines.append(lines[i])
+                i += 1
+            add_md_table(doc, table_lines)
+            continue
+        if re.match(r"^\d+\.\s", stripped):
+            p = doc.add_paragraph(style="List Number")
+            content = re.sub(r"^\d+\.\s", "", stripped)
+            content = re.sub(r"\*\*(.+?)\*\*", r"\1", content)
+            run = p.add_run(content)
+            run.font.size = Pt(10)
+            run.font.name = "Times New Roman"
+            i += 1
+            continue
+        if stripped.startswith("- "):
+            p = doc.add_paragraph(style="List Bullet")
+            content = stripped[2:]
+            content = re.sub(r"\*\*(.+?)\*\*", r"\1", content)
+            run = p.add_run(content)
+            run.font.size = Pt(10)
+            run.font.name = "Times New Roman"
+            i += 1
+            continue
+        # Regular paragraph
+        para_lines = [stripped]
+        i += 1
+        while i < len(lines):
+            nxt = lines[i].strip()
+            if (
+                not nxt
+                or nxt.startswith("#")
+                or nxt.startswith("|")
+                or nxt.startswith("- ")
+                or re.match(r"^\d+\.\s", nxt)
+            ):
+                break
+            para_lines.append(nxt)
+            i += 1
+        para_text = " ".join(para_lines)
+        para_text = re.sub(r"\*\*\*(.+?)\*\*\*", r"\1", para_text)
+        para_text = re.sub(r"\*\*(.+?)\*\*", r"\1", para_text)
+        para_text = re.sub(r"\*(.+?)\*", r"\1", para_text)
+        para_text = re.sub(r"`(.+?)`", r"\1", para_text)
+        para_text = para_text.replace("$$", "")
+        para_text = para_text.replace("---", "\u2014")
+
+        p = doc.add_paragraph()
+        p.paragraph_format.space_after = Pt(6)
+        run = p.add_run(para_text)
+        run.font.size = Pt(10)
+        run.font.name = "Times New Roman"
+
+        _insert_figures(doc, para_text)
+
+
+def main():
+    doc = Document()
+    style = doc.styles["Normal"]
+    style.font.name = "Times New Roman"
+    style.font.size = Pt(10)
+
+    # Title page
+    p = doc.add_paragraph()
+    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
+    p.paragraph_format.space_after = Pt(12)
+    run = p.add_run(
+        "Automated Identification of Non-Hand-Signed Auditor Signatures\n"
+        "in Large-Scale Financial Audit Reports:\n"
+        "A Dual-Descriptor Framework with Three-Method Convergent Thresholding"
+    )
+    run.font.size = Pt(16)
+    run.font.name = "Times New Roman"
+    run.bold = True
+
+    p = doc.add_paragraph()
+    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
+    p.paragraph_format.space_after = Pt(6)
+    run = p.add_run("[Authors removed for double-blind review]")
+    run.font.size = Pt(10)
+    run.italic = True
+
+    p = doc.add_paragraph()
+    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
+    p.paragraph_format.space_after = Pt(20)
+    run = p.add_run("Target journal: IEEE Access (Regular Paper)")
+    run.font.size = Pt(10)
+    run.italic = True
+
+    for section_file in SECTIONS:
+        filepath = PAPER_DIR / section_file
+        if filepath.exists():
+            process_section(doc, filepath)
+        else:
+            print(f"WARNING: missing section file: {filepath}")
+
+    doc.save(str(OUTPUT))
+    print(f"Saved: {OUTPUT}")
+
+
+if __name__ == "__main__":
+    main()