Add Paper A (IEEE TAI) complete draft with Firm A-calibrated dual-method classification

Paper draft includes all sections (Abstract through Conclusion), 36 references, and supporting scripts.

Key methodology: Cosine similarity + dHash dual-method verification with thresholds calibrated against a known-replication firm (Firm A).

Includes:
- 8 section markdown files (paper_a_*.md)
- Ablation study script (ResNet-50 vs VGG-16 vs EfficientNet-B0)
- Recalibrated classification script (84,386 PDFs, 5-tier system)
- Figure generation and Word export scripts
- Citation renumbering script ([1]-[36])
- Signature analysis pipeline (12 steps)
- YOLO extraction scripts

Three rounds of AI review completed (GPT-5.4, Claude Opus 4.6, Gemini 3 Pro).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-06 23:05:33 +08:00
parent 21df0ff387
commit 939a348da4
33 changed files with 9315 additions and 0 deletions
@@ -0,0 +1,231 @@
+#!/usr/bin/env python3
+"""Export Paper A v2 to Word, reading from md section files."""
+
+from docx import Document
+from docx.shared import Inches, Pt, RGBColor
+from docx.enum.text import WD_ALIGN_PARAGRAPH
+from pathlib import Path
+import re
+
+# Directory that holds the markdown section files; the exported .docx is
+# written into the same directory (see OUTPUT).
+# NOTE(review): absolute, machine-specific path (presumably an external
+# volume) -- confirm or override before running on another machine.
+PAPER_DIR = Path("/Volumes/NV2/pdf_recognize/paper")
+# Directory searched for the image files named in FIGURES.
+FIG_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis/paper_figures")
+# Destination path of the generated Word document.
+OUTPUT = PAPER_DIR / "Paper_A_IEEE_TAI_Draft_v2.docx"
+
+# Section files under PAPER_DIR, listed in the order they are appended to
+# the document.
+SECTIONS = [
+    "paper_a_abstract.md",
+    "paper_a_impact_statement.md",
+    "paper_a_introduction.md",
+    "paper_a_related_work.md",
+    "paper_a_methodology.md",
+    "paper_a_results.md",
+    "paper_a_discussion.md",
+    "paper_a_conclusion.md",
+    "paper_a_references.md",
+]
+
+# Maps a trigger phrase to (image file name, caption text, width in inches).
+# A figure is inserted after any regular paragraph whose cleaned text
+# contains the trigger phrase, so the phrases must match the section prose
+# exactly.
+FIGURES = {
+    "Fig. 1 illustrates": ("fig1_pipeline.png", "Fig. 1. Pipeline architecture for automated signature replication detection.", 6.5),
+    "Fig. 2 presents": ("fig2_intra_inter_kde.png", "Fig. 2. Cosine similarity distributions: intra-class vs. inter-class with KDE crossover at 0.837.", 3.5),
+    "Fig. 3 presents": ("fig3_firm_a_calibration.png", "Fig. 3. Per-signature best-match cosine similarity: Firm A (known replication) vs. other CPAs.", 3.5),
+    "conducted an ablation study comparing three": ("fig4_ablation.png", "Fig. 4. Ablation study comparing three feature extraction backbones.", 6.5),
+}
+
+
+def strip_comments(text):
+    """Return *text* with every ``<!-- ... -->`` span deleted.
+
+    DOTALL lets a single comment run across several lines, and the lazy
+    quantifier ends each match at the nearest ``-->`` so neighbouring
+    comments are removed independently.
+    """
+    html_comment = re.compile(r'<!--.*?-->', re.DOTALL)
+    return html_comment.sub('', text)
+
+
+def extract_tables(text):
+    """Locate markdown tables in *text*.
+
+    A table begins on a line containing ``|`` that is immediately followed
+    by a separator line (``|---|---|`` style, leading pipe required) and
+    extends over every following line that still contains ``|``.
+
+    Returns a list of ``(start_index, table_lines)`` tuples, where
+    ``start_index`` is the 0-based index of the header line within
+    ``text.split('\\n')`` and ``table_lines`` is the list of raw lines
+    (header, separator and body rows) that make up the table.
+    """
+    separator = re.compile(r'\s*\|[-|: ]+\|')
+    rows = text.split('\n')
+    total = len(rows)
+    found = []
+    pos = 0
+    while pos < total:
+        is_header = (
+            '|' in rows[pos]
+            and pos + 1 < total
+            and separator.match(rows[pos + 1])
+        )
+        if not is_header:
+            pos += 1
+            continue
+        # Consume the header and every consecutive line that contains a pipe.
+        end = pos
+        while end < total and '|' in rows[end]:
+            end += 1
+        found.append((pos, rows[pos:end]))
+        pos = end
+    return found
+
+
+def add_md_table(doc, table_lines):
+    """Append a markdown table to *doc* as a Word table.
+
+    Args:
+        doc: The document object the table is added to.
+        table_lines: Raw markdown lines of one table (header, separator
+            and body rows), as collected by the caller.
+
+    The first parsed row defines the column count and is rendered bold.
+    Longer rows are truncated to that count; shorter rows leave the
+    remaining cells empty. Nothing is added when fewer than two rows
+    remain after the separator row is removed. A blank paragraph is
+    appended after the table as spacing.
+    """
+    rows_data = []
+    for line in table_lines:
+        # Trim surrounding whitespace before removing the outer pipes;
+        # otherwise an indented line or one with trailing spaces keeps a
+        # pipe and produces a spurious empty column.
+        cells = [c.strip() for c in line.strip().strip('|').split('|')]
+        # A separator row consists solely of dash runs with optional
+        # alignment colons. Checking every cell (not just the first) keeps
+        # data rows whose first cell is a lone "-" placeholder.
+        if all(re.match(r'^:?-+:?$', c) for c in cells):
+            continue
+        rows_data.append(cells)
+
+    if len(rows_data) < 2:
+        return
+
+    ncols = len(rows_data[0])
+    table = doc.add_table(rows=len(rows_data), cols=ncols)
+    table.style = 'Table Grid'
+
+    for r_idx, row in enumerate(rows_data):
+        is_header = r_idx == 0
+        # zip stops at the shorter sequence, so over-long rows are
+        # truncated to the header's column count.
+        for cell, value in zip(table.rows[r_idx].cells, row):
+            cell.text = value
+            for p in cell.paragraphs:
+                p.alignment = WD_ALIGN_PARAGRAPH.CENTER
+                for run in p.runs:
+                    run.font.size = Pt(8)
+                    run.font.name = 'Times New Roman'
+                    if is_header:
+                        run.bold = True
+
+    doc.add_paragraph()
+
+
+def _clean_inline_markdown(text):
+    """Remove inline markdown markers that Word would show literally."""
+    text = re.sub(r'\*\*\*(.+?)\*\*\*', r'\1', text)  # bold italic
+    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)  # bold
+    text = re.sub(r'\*(.+?)\*', r'\1', text)  # italic
+    text = re.sub(r'`(.+?)`', r'\1', text)  # code
+    text = text.replace('$$', '')  # LaTeX delimiters
+    return text.replace('---', '\u2014')  # em dash
+
+
+def _add_body_paragraph(doc, text, style=None):
+    """Append a 10 pt Times New Roman paragraph with *text* and return it."""
+    p = doc.add_paragraph(style=style)
+    run = p.add_run(text)
+    run.font.size = Pt(10)
+    run.font.name = 'Times New Roman'
+    return p
+
+
+def _add_figure(doc, fig_file, caption, width):
+    """Append a centered picture plus italic caption, or warn if it is missing."""
+    fig_path = FIG_DIR / fig_file
+    if not fig_path.exists():
+        # Report the gap instead of silently producing a draft without it.
+        print(f"Warning: figure not found, skipped: {fig_path}")
+        return
+
+    fp = doc.add_paragraph()
+    fp.alignment = WD_ALIGN_PARAGRAPH.CENTER
+    fp.add_run().add_picture(str(fig_path), width=Inches(width))
+
+    cp = doc.add_paragraph()
+    cp.alignment = WD_ALIGN_PARAGRAPH.CENTER
+    cr = cp.add_run(caption)
+    cr.font.size = Pt(9)
+    cr.font.name = 'Times New Roman'
+    cr.italic = True
+
+
+def process_section(doc, filepath):
+    """Convert one markdown section file into Word content appended to *doc*.
+
+    Args:
+        doc: The document object that receives the content.
+        filepath: Path of the UTF-8 markdown file to read.
+
+    HTML comments are removed first. The remaining lines are handled as
+    headings (``#`` to ``######``), pipe tables, numbered lists, bullet
+    lists, or regular paragraphs; consecutive plain lines are joined with
+    spaces into a single paragraph. Inline emphasis/code markers, ``$$``
+    and ``---`` are cleaned from list items and paragraphs alike. After a
+    regular paragraph, every FIGURES entry whose trigger phrase occurs in
+    the cleaned paragraph text is inserted as a figure.
+    """
+    text = strip_comments(filepath.read_text(encoding='utf-8'))
+
+    lines = text.split('\n')
+    i = 0
+    while i < len(lines):
+        stripped = lines[i].strip()
+
+        # Skip empty lines
+        if not stripped:
+            i += 1
+            continue
+
+        # Headings: the number of leading '#' characters is the level.
+        heading = re.match(r'^(#{1,6}) (.+)$', stripped)
+        if heading:
+            h = doc.add_heading(heading.group(2), level=len(heading.group(1)))
+            for run in h.runs:
+                # Override the themed heading colour with plain black.
+                run.font.color.rgb = RGBColor(0, 0, 0)
+            i += 1
+            continue
+
+        # Markdown table: a pipe line directly followed by a separator row.
+        if '|' in stripped and i + 1 < len(lines) and re.match(r'\s*\|[-|: ]+\|', lines[i+1]):
+            table_lines = []
+            while i < len(lines) and '|' in lines[i]:
+                table_lines.append(lines[i])
+                i += 1
+            add_md_table(doc, table_lines)
+            continue
+
+        # Numbered list
+        numbered = re.match(r'^\d+\.\s', stripped)
+        if numbered:
+            content = _clean_inline_markdown(stripped[numbered.end():])
+            _add_body_paragraph(doc, content, style='List Number')
+            i += 1
+            continue
+
+        # Bullet list
+        if stripped.startswith('- '):
+            content = _clean_inline_markdown(stripped[2:])
+            _add_body_paragraph(doc, content, style='List Bullet')
+            i += 1
+            continue
+
+        # Regular paragraph - collect continuation lines until a blank line
+        # or the start of another block type.
+        para_lines = [stripped]
+        i += 1
+        while i < len(lines):
+            next_line = lines[i].strip()
+            if not next_line or next_line.startswith('#') or next_line.startswith('|') or \
+               next_line.startswith('- ') or re.match(r'^\d+\.\s', next_line):
+                break
+            para_lines.append(next_line)
+            i += 1
+
+        para_text = _clean_inline_markdown(' '.join(para_lines))
+        p = _add_body_paragraph(doc, para_text)
+        p.paragraph_format.space_after = Pt(6)
+
+        # Insert every figure whose trigger phrase occurs in this paragraph.
+        for trigger, (fig_file, caption, width) in FIGURES.items():
+            if trigger in para_text:
+                _add_figure(doc, fig_file, caption, width)
+
+
+def main():
+    """Build the Word draft: title page, then each section file, then save.
+
+    Section files listed in SECTIONS are read from PAPER_DIR in order. A
+    file that does not exist is skipped with a warning on stdout so an
+    incomplete draft is never produced silently. The result is written to
+    OUTPUT.
+    """
+    doc = Document()
+
+    # Set default font
+    style = doc.styles['Normal']
+    style.font.name = 'Times New Roman'
+    style.font.size = Pt(10)
+
+    # Title page
+    p = doc.add_paragraph()
+    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
+    p.paragraph_format.space_after = Pt(12)
+    run = p.add_run("Automated Detection of Digitally Replicated Signatures\nin Large-Scale Financial Audit Reports")
+    run.font.size = Pt(16)
+    run.font.name = 'Times New Roman'
+    run.bold = True
+
+    p = doc.add_paragraph()
+    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
+    p.paragraph_format.space_after = Pt(20)
+    run = p.add_run("[Authors removed for double-blind review]")
+    run.font.size = Pt(10)
+    run.italic = True
+
+    # Process each section, reporting any file that is not present
+    for section_file in SECTIONS:
+        filepath = PAPER_DIR / section_file
+        if filepath.exists():
+            process_section(doc, filepath)
+        else:
+            print(f"Warning: section file not found, skipped: {filepath}")
+
+    doc.save(str(OUTPUT))
+    print(f"Saved: {OUTPUT}")
+
+
+if __name__ == "__main__":
+    main()