#!/usr/bin/env python3
"""Export Paper A v2 to Word, reading from md section files."""

import re
import sys
from pathlib import Path

from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Inches, Pt, RGBColor

PAPER_DIR = Path("/Volumes/NV2/pdf_recognize/paper")
FIG_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis/paper_figures")
OUTPUT = PAPER_DIR / "Paper_A_IEEE_TAI_Draft_v2.docx"

# Markdown section files, concatenated into the document in this order.
SECTIONS = [
    "paper_a_abstract.md",
    "paper_a_impact_statement.md",
    "paper_a_introduction.md",
    "paper_a_related_work.md",
    "paper_a_methodology.md",
    "paper_a_results.md",
    "paper_a_discussion.md",
    "paper_a_conclusion.md",
    "paper_a_references.md",
]

# Maps a trigger phrase to (image file name, caption, width in inches).
# A figure is inserted after every regular paragraph whose cleaned text
# contains the trigger phrase.
FIGURES = {
    "Fig. 1 illustrates": (
        "fig1_pipeline.png",
        "Fig. 1. Pipeline architecture for automated signature replication detection.",
        6.5,
    ),
    "Fig. 2 presents": (
        "fig2_intra_inter_kde.png",
        "Fig. 2. Cosine similarity distributions: intra-class vs. inter-class with KDE crossover at 0.837.",
        3.5,
    ),
    "Fig. 3 presents": (
        "fig3_firm_a_calibration.png",
        "Fig. 3. Per-signature best-match cosine similarity: Firm A (known replication) vs. other CPAs.",
        3.5,
    ),
    "conducted an ablation study comparing three": (
        "fig4_ablation.png",
        "Fig. 4. Ablation study comparing three feature extraction backbones.",
        6.5,
    ),
}

_BODY_FONT = 'Times New Roman'

# Non-greedy match of one HTML comment block; DOTALL lets it span lines.
# Built from adjacent literals so the comment delimiters never appear verbatim
# in this file: the previous pattern was an empty string (a no-op), presumably
# erased by comment-stripping tooling -- TODO confirm.
_HTML_COMMENT_RE = re.compile(r'<' r'!--.*?--' r'>', re.DOTALL)

# Second line of a markdown table, e.g. "|---|:---:|".
_TABLE_SEPARATOR_RE = re.compile(r'\s*\|[-|: ]+\|')
# One cell of that separator row after trimming, e.g. "---", ":---", ":-:".
_ALIGNMENT_CELL_RE = re.compile(r':?-+:?')
_HEADING_RE = re.compile(r'^(#{1,3}) ')
_NUMBERED_ITEM_RE = re.compile(r'^\d+\.\s')
_BOLD_RE = re.compile(r'\*\*(.+?)\*\*')


def strip_comments(text):
    """Remove HTML comment blocks (including multi-line ones) from markdown."""
    return _HTML_COMMENT_RE.sub('', text)


def _starts_table(lines, index):
    """Return True if lines[index] is a table row followed by a separator row."""
    return (
        '|' in lines[index]
        and index + 1 < len(lines)
        and _TABLE_SEPARATOR_RE.match(lines[index + 1]) is not None
    )


def extract_tables(text):
    """Find markdown tables in *text*.

    Returns a list of ``(start_index, table_lines)`` tuples, where
    ``start_index`` is the 0-based index of the table's first line within
    ``text.split('\\n')`` and ``table_lines`` is the list of consecutive lines
    containing a pipe, starting at that index.
    """
    lines = text.split('\n')
    tables = []
    i = 0
    while i < len(lines):
        if _starts_table(lines, i):
            start = i
            while i < len(lines) and '|' in lines[i]:
                i += 1
            tables.append((start, lines[start:i]))
        else:
            i += 1
    return tables


def _split_table_row(line):
    """Split one markdown table line into trimmed cell strings."""
    # Trim whitespace first: otherwise indentation or trailing blanks keep the
    # outer pipes in place and produce phantom empty columns.
    return [cell.strip() for cell in line.strip().strip('|').split('|')]


def _is_separator_row(cells):
    """Return True if every cell is an alignment marker such as '---' or ':-:'."""
    # Checking all cells (not only the first) keeps data rows whose first
    # cell is just a dash.
    return all(_ALIGNMENT_CELL_RE.fullmatch(cell) for cell in cells)


def add_md_table(doc, table_lines):
    """Convert markdown table lines to a docx table appended to *doc*.

    The separator row is dropped. The first remaining row fixes the column
    count and is rendered bold; longer rows are truncated and shorter rows
    leave their trailing cells empty. Nothing is added when fewer than two
    rows remain. An empty paragraph is appended after the table.
    """
    rows_data = [
        cells
        for cells in map(_split_table_row, table_lines)
        if not _is_separator_row(cells)
    ]
    if len(rows_data) < 2:
        return

    ncols = len(rows_data[0])
    table = doc.add_table(rows=len(rows_data), cols=ncols)
    table.style = 'Table Grid'

    for r_idx, row in enumerate(rows_data):
        is_header = r_idx == 0
        # zip stops at the shorter of the two, so ragged rows are safe.
        for cell, value in zip(table.rows[r_idx].cells, row):
            cell.text = value
            for p in cell.paragraphs:
                p.alignment = WD_ALIGN_PARAGRAPH.CENTER
                for run in p.runs:
                    run.font.size = Pt(8)
                    run.font.name = _BODY_FONT
                    if is_header:
                        run.bold = True

    # Spacer paragraph after the table.
    doc.add_paragraph()


def _add_body_run(paragraph, text):
    """Append *text* to *paragraph* as a 10 pt body-font run and return the run."""
    run = paragraph.add_run(text)
    run.font.size = Pt(10)
    run.font.name = _BODY_FONT
    return run


def _add_heading(doc, text, level):
    """Add a heading at *level* with its runs coloured black."""
    heading = doc.add_heading(text, level=level)
    for run in heading.runs:
        run.font.color.rgb = RGBColor(0, 0, 0)


def _add_list_item(doc, content, style):
    """Add one list paragraph in *style*, with bold markers stripped."""
    paragraph = doc.add_paragraph(style=style)
    _add_body_run(paragraph, _BOLD_RE.sub(r'\1', content))


def _clean_inline_markdown(text):
    """Strip inline markdown markers from paragraph text."""
    # Order matters: the longest asterisk marker must be removed first.
    text = re.sub(r'\*\*\*(.+?)\*\*\*', r'\1', text)  # bold italic
    text = _BOLD_RE.sub(r'\1', text)                   # bold
    text = re.sub(r'\*(.+?)\*', r'\1', text)           # italic
    text = re.sub(r'`(.+?)`', r'\1', text)             # code
    text = text.replace('$$', '')                      # LaTeX delimiters
    return text.replace('---', '\u2014')               # em dash


def _insert_triggered_figures(doc, para_text):
    """Insert each figure in FIGURES whose trigger phrase occurs in *para_text*."""
    for trigger, (fig_file, caption, width) in FIGURES.items():
        if trigger not in para_text:
            continue
        fig_path = FIG_DIR / fig_file
        if not fig_path.exists():
            # Still best-effort, but no longer silent.
            print(f"Warning: figure not found, skipped: {fig_path}", file=sys.stderr)
            continue

        picture_par = doc.add_paragraph()
        picture_par.alignment = WD_ALIGN_PARAGRAPH.CENTER
        picture_par.add_run().add_picture(str(fig_path), width=Inches(width))

        caption_par = doc.add_paragraph()
        caption_par.alignment = WD_ALIGN_PARAGRAPH.CENTER
        caption_run = caption_par.add_run(caption)
        caption_run.font.size = Pt(9)
        caption_run.font.name = _BODY_FONT
        caption_run.italic = True


def process_section(doc, filepath):
    """Process a markdown section file into docx.

    Reads *filepath* as UTF-8, removes HTML comments, then walks the lines and
    appends to *doc*: headings (levels 1-3), pipe tables, numbered and bullet
    list items, and regular paragraphs. Consecutive non-blank lines that do
    not start another construct are joined with spaces into one paragraph.
    After each regular paragraph, any figure whose trigger phrase it contains
    is inserted.
    """
    text = strip_comments(filepath.read_text(encoding='utf-8'))
    lines = text.split('\n')
    i = 0
    while i < len(lines):
        stripped = lines[i].strip()

        # Skip empty lines
        if not stripped:
            i += 1
            continue

        # Headings: one to three '#' characters followed by a space.
        heading = _HEADING_RE.match(stripped)
        if heading:
            _add_heading(doc, stripped[heading.end():], level=len(heading.group(1)))
            i += 1
            continue

        # Markdown table: consume every consecutive line containing a pipe.
        if _starts_table(lines, i):
            start = i
            while i < len(lines) and '|' in lines[i]:
                i += 1
            add_md_table(doc, lines[start:i])
            continue

        # Numbered list
        numbered = _NUMBERED_ITEM_RE.match(stripped)
        if numbered:
            _add_list_item(doc, stripped[numbered.end():], 'List Number')
            i += 1
            continue

        # Bullet list
        if stripped.startswith('- '):
            _add_list_item(doc, stripped[2:], 'List Bullet')
            i += 1
            continue

        # Regular paragraph - collect continuation lines
        para_lines = [stripped]
        i += 1
        while i < len(lines):
            next_line = lines[i].strip()
            if (
                not next_line
                or next_line.startswith(('#', '|', '- '))
                or _NUMBERED_ITEM_RE.match(next_line)
            ):
                break
            para_lines.append(next_line)
            i += 1

        para_text = _clean_inline_markdown(' '.join(para_lines))

        paragraph = doc.add_paragraph()
        paragraph.paragraph_format.space_after = Pt(6)
        _add_body_run(paragraph, para_text)

        # Check if we should insert a figure after this paragraph
        _insert_triggered_figures(doc, para_text)


def main():
    """Build the Word document from the section files and save it to OUTPUT."""
    doc = Document()

    # Set default font
    style = doc.styles['Normal']
    style.font.name = _BODY_FONT
    style.font.size = Pt(10)

    # Title page
    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p.paragraph_format.space_after = Pt(12)
    run = p.add_run("Automated Detection of Digitally Replicated Signatures\nin Large-Scale Financial Audit Reports")
    run.font.size = Pt(16)
    run.font.name = _BODY_FONT
    run.bold = True

    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p.paragraph_format.space_after = Pt(20)
    run = p.add_run("[Authors removed for double-blind review]")
    run.font.size = Pt(10)
    run.italic = True

    # Process each section
    for section_file in SECTIONS:
        filepath = PAPER_DIR / section_file
        if filepath.exists():
            process_section(doc, filepath)
        else:
            # Keep going, but make an incomplete export visible.
            print(f"Warning: section file not found, skipped: {filepath}", file=sys.stderr)

    doc.save(str(OUTPUT))
    print(f"Saved: {OUTPUT}")


if __name__ == "__main__":
    main()