# pdf_signature_extraction/paper/export_v2.py

#!/usr/bin/env python3
"""Export Paper A v2 to Word, reading from md section files."""

from docx import Document
from docx.shared import Inches, Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from pathlib import Path
import re

# Directory holding the markdown section files; the .docx is written here too.
PAPER_DIR = Path("/Volumes/NV2/pdf_recognize/paper")
# Directory holding the figure image files referenced by FIGURES.
FIG_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis/paper_figures")
OUTPUT = PAPER_DIR.joinpath("Paper_A_IEEE_TAI_Draft_v2.docx")

# Section files, listed in the order they are appended to the document.
SECTIONS = [
    "paper_a_abstract.md",
    "paper_a_impact_statement.md",
    "paper_a_introduction.md",
    "paper_a_related_work.md",
    "paper_a_methodology.md",
    "paper_a_results.md",
    "paper_a_discussion.md",
    "paper_a_conclusion.md",
    "paper_a_references.md",
]

# Maps a trigger phrase to (image file name, caption, width in inches).
# A figure is inserted after any body paragraph containing its trigger phrase.
FIGURES = {
    "Fig. 1 illustrates": (
        "fig1_pipeline.png",
        "Fig. 1. Pipeline architecture for automated signature replication detection.",
        6.5,
    ),
    "Fig. 2 presents": (
        "fig2_intra_inter_kde.png",
        "Fig. 2. Cosine similarity distributions: intra-class vs. inter-class with KDE crossover at 0.837.",
        3.5,
    ),
    "Fig. 3 presents": (
        "fig3_firm_a_calibration.png",
        "Fig. 3. Per-signature best-match cosine similarity: Firm A (known replication) vs. other CPAs.",
        3.5,
    ),
    "conducted an ablation study comparing three": (
        "fig4_ablation.png",
        "Fig. 4. Ablation study comparing three feature extraction backbones.",
        6.5,
    ),
}


def strip_comments(text):
    """Return ``text`` with every ``<!-- ... -->`` HTML comment deleted.

    Matching is non-greedy and spans newlines, so a multi-line comment is
    removed whole while text between two separate comments is preserved.
    An opening ``<!--`` with no closing ``-->`` is left untouched.
    """
    comment_pattern = re.compile(r'<!--.*?-->', re.DOTALL)
    return comment_pattern.sub('', text)


def extract_tables(text):
    """Locate the markdown tables contained in ``text``.

    A table starts at a line containing ``|`` that is immediately followed by
    a separator row (for example ``|---|:--:|``) and runs through every
    following line that still contains ``|``.

    Returns:
        A list of ``(start, table_lines)`` tuples in document order, where
        ``start`` is the 0-based index of the table's first line and
        ``table_lines`` is the list of raw lines that make up the table.
    """
    rows = text.split('\n')
    total = len(rows)
    separator_row = re.compile(r'\s*\|[-|: ]+\|')
    found = []
    pos = 0
    while pos < total:
        starts_table = (
            '|' in rows[pos]
            and pos + 1 < total
            and separator_row.match(rows[pos + 1]) is not None
        )
        if not starts_table:
            pos += 1
            continue
        # Consume the header, the separator and all contiguous pipe lines.
        first = pos
        while pos < total and '|' in rows[pos]:
            pos += 1
        found.append((first, rows[first:pos]))
    return found


def add_md_table(doc, table_lines):
    """Append a markdown table to ``doc`` as a bordered Word table.

    Args:
        doc: The python-docx document to append to (must provide
            ``add_table`` and ``add_paragraph``).
        table_lines: Raw markdown lines of a single table: header row,
            separator row, then body rows. Lines may be indented or carry
            trailing whitespace.

    Returns:
        None. Nothing is added when fewer than two non-separator rows are
        present (i.e. a header without any body row).

    The first row fixes the column count; wider rows are truncated and
    narrower rows leave their trailing cells empty. All cell text is centred
    in 8 pt Times New Roman and the header row is bold. An empty paragraph is
    appended after the table as spacing.
    """
    rows_data = []
    for line in table_lines:
        # Trim whitespace *before* the outer pipes; otherwise an indented or
        # space-terminated line keeps its edge pipe and yields a phantom
        # empty column (and its separator row is no longer recognised).
        cells = [c.strip() for c in line.strip().strip('|').split('|')]
        # A separator row consists solely of cells like ---, :--- or :---:.
        # Checking every cell (not just the first) keeps body rows whose
        # first cell happens to be a lone dash.
        if all(re.fullmatch(r':?-+:?', c) for c in cells):
            continue
        rows_data.append(cells)

    if len(rows_data) < 2:
        return

    ncols = len(rows_data[0])
    table = doc.add_table(rows=len(rows_data), cols=ncols)
    table.style = 'Table Grid'

    for r_idx, row in enumerate(rows_data):
        is_header = r_idx == 0
        # Fetch the row's cells once instead of once per column.
        row_cells = table.rows[r_idx].cells
        for c_idx in range(min(len(row), ncols)):
            cell = row_cells[c_idx]
            cell.text = row[c_idx]
            for p in cell.paragraphs:
                p.alignment = WD_ALIGN_PARAGRAPH.CENTER
                for run in p.runs:
                    run.font.size = Pt(8)
                    run.font.name = 'Times New Roman'
                    if is_header:
                        run.bold = True

    doc.add_paragraph()


def _clean_inline_markdown(text):
    """Strip inline markdown markers from ``text`` and return plain text."""
    text = re.sub(r'\*\*\*(.+?)\*\*\*', r'\1', text)  # bold italic
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)  # bold
    text = re.sub(r'\*(.+?)\*', r'\1', text)  # italic
    text = re.sub(r'`(.+?)`', r'\1', text)  # code
    text = text.replace('$$', '')  # LaTeX delimiters
    return text.replace('---', '\u2014')  # em dash


def _add_body_run(paragraph, text):
    """Add ``text`` to ``paragraph`` as a 10 pt Times New Roman run."""
    run = paragraph.add_run(text)
    run.font.size = Pt(10)
    run.font.name = 'Times New Roman'
    return run


def _add_black_heading(doc, text, level):
    """Add a heading of the given level and force its runs to black."""
    heading = doc.add_heading(text, level=level)
    for run in heading.runs:
        run.font.color.rgb = RGBColor(0, 0, 0)


def _insert_triggered_figures(doc, para_text):
    """Insert every FIGURES entry whose trigger phrase occurs in ``para_text``."""
    for trigger, (fig_file, caption, width) in FIGURES.items():
        if trigger not in para_text:
            continue
        fig_path = FIG_DIR / fig_file
        if not fig_path.exists():
            # Report instead of silently dropping the figure from the paper.
            print(f"WARNING: figure not found, skipped: {fig_path}")
            continue

        fp = doc.add_paragraph()
        fp.alignment = WD_ALIGN_PARAGRAPH.CENTER
        fp.add_run().add_picture(str(fig_path), width=Inches(width))

        cp = doc.add_paragraph()
        cp.alignment = WD_ALIGN_PARAGRAPH.CENTER
        cr = cp.add_run(caption)
        cr.font.size = Pt(9)
        cr.font.name = 'Times New Roman'
        cr.italic = True


def process_section(doc, filepath):
    """Append the contents of one markdown section file to ``doc``.

    Args:
        doc: The python-docx document being built.
        filepath: ``pathlib.Path`` of the markdown file, read as UTF-8.

    HTML comments are removed first; the remaining text is handled line by
    line:

    * blank lines are skipped;
    * ``#`` to ``######`` headings become Word headings of the same level,
      coloured black;
    * a line containing ``|`` followed by a separator row starts a table,
      which is passed to ``add_md_table``;
    * ``1. `` items use the 'List Number' style and ``- `` items use
      'List Bullet';
    * anything else starts a paragraph that absorbs the following lines
      until a blank line, heading, table row or list item.

    Inline markdown markers (bold, italic, code, ``$$``) are stripped from
    list items and paragraphs alike, and ``---`` becomes an em dash. After
    each paragraph, any figure whose trigger phrase (see ``FIGURES``) occurs
    in the cleaned paragraph text is inserted with its caption; a missing
    image file is reported on stdout and skipped.
    """
    text = strip_comments(filepath.read_text(encoding='utf-8'))

    lines = text.split('\n')
    n_lines = len(lines)
    i = 0
    while i < n_lines:
        stripped = lines[i].strip()

        # Skip empty lines
        if not stripped:
            i += 1
            continue

        # Headings: the number of leading '#' gives the heading level.
        heading = re.match(r'(#{1,6}) (.*)$', stripped)
        if heading:
            _add_black_heading(doc, heading.group(2), len(heading.group(1)))
            i += 1
            continue

        # Markdown table: header line followed by a separator row.
        if '|' in stripped and i + 1 < n_lines and re.match(r'\s*\|[-|: ]+\|', lines[i + 1]):
            table_lines = []
            while i < n_lines and '|' in lines[i]:
                table_lines.append(lines[i])
                i += 1
            add_md_table(doc, table_lines)
            continue

        # Numbered list
        numbered = re.match(r'^\d+\.\s', stripped)
        if numbered:
            p = doc.add_paragraph(style='List Number')
            _add_body_run(p, _clean_inline_markdown(stripped[numbered.end():]))
            i += 1
            continue

        # Bullet list
        if stripped.startswith('- '):
            p = doc.add_paragraph(style='List Bullet')
            _add_body_run(p, _clean_inline_markdown(stripped[2:]))
            i += 1
            continue

        # Regular paragraph - collect continuation lines
        para_lines = [stripped]
        i += 1
        while i < n_lines:
            next_line = lines[i].strip()
            if (
                not next_line
                or next_line.startswith(('#', '|', '- '))
                or re.match(r'^\d+\.\s', next_line)
            ):
                break
            para_lines.append(next_line)
            i += 1

        para_text = _clean_inline_markdown(' '.join(para_lines))

        p = doc.add_paragraph()
        p.paragraph_format.space_after = Pt(6)
        _add_body_run(p, para_text)

        # Check if we should insert a figure after this paragraph
        _insert_triggered_figures(doc, para_text)


def main():
    """Build the Word draft and save it to ``OUTPUT``.

    Creates a new document with 10 pt Times New Roman as the default font,
    writes the centred title and the anonymised author line, appends every
    file listed in ``SECTIONS`` (in order) via ``process_section``, and saves
    the result. A section file that does not exist is reported on stdout and
    skipped, so an incomplete export is visible rather than silent.
    """
    doc = Document()

    # Set default font
    normal_font = doc.styles['Normal'].font
    normal_font.name = 'Times New Roman'
    normal_font.size = Pt(10)

    # Title page
    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p.paragraph_format.space_after = Pt(12)
    run = p.add_run("Automated Detection of Digitally Replicated Signatures\nin Large-Scale Financial Audit Reports")
    run.font.size = Pt(16)
    run.font.name = 'Times New Roman'
    run.bold = True

    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p.paragraph_format.space_after = Pt(20)
    run = p.add_run("[Authors removed for double-blind review]")
    run.font.size = Pt(10)
    run.italic = True

    # Process each section
    for section_file in SECTIONS:
        filepath = PAPER_DIR / section_file
        if not filepath.exists():
            # Keep going (best effort), but make the gap in the paper visible.
            print(f"WARNING: section file not found, skipped: {filepath}")
            continue
        process_section(doc, filepath)

    doc.save(str(OUTPUT))
    print(f"Saved: {OUTPUT}")


# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()