939a348da4
Paper draft includes all sections (Abstract through Conclusion), 36 references, and supporting scripts. Key methodology: Cosine similarity + dHash dual-method verification with thresholds calibrated against known-replication firm (Firm A). Includes: - 8 section markdown files (paper_a_*.md) - Ablation study script (ResNet-50 vs VGG-16 vs EfficientNet-B0) - Recalibrated classification script (84,386 PDFs, 5-tier system) - Figure generation and Word export scripts - Citation renumbering script ([1]-[36]) - Signature analysis pipeline (12 steps) - YOLO extraction scripts Three rounds of AI review completed (GPT-5.4, Claude Opus 4.6, Gemini 3 Pro). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
232 lines
7.8 KiB
Python
232 lines
7.8 KiB
Python
#!/usr/bin/env python3
|
|
"""Export Paper A v2 to Word, reading from md section files."""
|
|
|
|
from docx import Document
|
|
from docx.shared import Inches, Pt, RGBColor
|
|
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
|
from pathlib import Path
|
|
import re
|
|
|
|
# Directory holding the markdown section files; the exported .docx is written
# here as well (see OUTPUT).
PAPER_DIR = Path("/Volumes/NV2/pdf_recognize/paper")
# Directory the figure image files are loaded from.
FIG_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis/paper_figures")
# Destination path of the generated Word document.
OUTPUT = PAPER_DIR / "Paper_A_IEEE_TAI_Draft_v2.docx"

# Section files under PAPER_DIR, processed in this order by main().
SECTIONS = [
    "paper_a_abstract.md",
    "paper_a_impact_statement.md",
    "paper_a_introduction.md",
    "paper_a_related_work.md",
    "paper_a_methodology.md",
    "paper_a_results.md",
    "paper_a_discussion.md",
    "paper_a_conclusion.md",
    "paper_a_references.md",
]

# Maps a trigger substring to (image file name in FIG_DIR, caption text,
# picture width in inches). process_section() inserts the figure and its
# caption directly after any body paragraph that contains the trigger.
FIGURES = {
    "Fig. 1 illustrates": ("fig1_pipeline.png", "Fig. 1. Pipeline architecture for automated signature replication detection.", 6.5),
    "Fig. 2 presents": ("fig2_intra_inter_kde.png", "Fig. 2. Cosine similarity distributions: intra-class vs. inter-class with KDE crossover at 0.837.", 3.5),
    "Fig. 3 presents": ("fig3_firm_a_calibration.png", "Fig. 3. Per-signature best-match cosine similarity: Firm A (known replication) vs. other CPAs.", 3.5),
    "conducted an ablation study comparing three": ("fig4_ablation.png", "Fig. 4. Ablation study comparing three feature extraction backbones.", 6.5),
}
|
|
|
|
|
|
def strip_comments(text):
    """Return *text* with every ``<!-- ... -->`` HTML comment deleted.

    Comments may span several lines. An opening ``<!--`` that is never
    closed is left in place.
    """
    # DOTALL lets '.' cross newlines so multi-line comments are matched too;
    # the lazy quantifier stops at the first closing marker.
    comment_pattern = re.compile(r'<!--.*?-->', re.DOTALL)
    return comment_pattern.sub('', text)
|
|
|
|
|
|
def extract_tables(text):
    """Locate markdown tables in *text*.

    A table starts at a line containing ``|`` whose following line looks like
    a separator row (``|---|---|``), and extends over every consecutive line
    that contains ``|``.

    Returns:
        A list of ``(start_index, table_lines)`` pairs, where ``start_index``
        is the 0-based index of the table's first line within
        ``text.split('\\n')`` and ``table_lines`` is the list of raw lines
        making up the table.
    """
    rows = text.split('\n')
    total = len(rows)
    separator = re.compile(r'\s*\|[-|: ]+\|')

    found = []
    pos = 0
    while pos < total:
        starts_table = (
            '|' in rows[pos]
            and pos + 1 < total
            and separator.match(rows[pos + 1]) is not None
        )
        if not starts_table:
            pos += 1
            continue

        # Walk forward to the first line without a pipe: that ends the table.
        end = pos
        while end < total and '|' in rows[end]:
            end += 1
        found.append((pos, rows[pos:end]))
        pos = end
    return found
|
|
|
|
|
|
def _parse_md_table_rows(table_lines):
    """Split markdown table lines into lists of cell strings, dropping separator rows."""
    rows = []
    for line in table_lines:
        # Strip surrounding whitespace first: otherwise the outer pipes of an
        # indented line (or one with trailing spaces) are not removed and the
        # row gains phantom empty cells.
        cells = [cell.strip() for cell in line.strip().strip('|').split('|')]
        # A separator row consists solely of cells like '---', ':---' or
        # '---:'. Checking every cell (not just the first) keeps body rows
        # whose first cell happens to be a dash.
        if all(re.fullmatch(r':?-+:?', cell) for cell in cells):
            continue
        rows.append(cells)
    return rows


def add_md_table(doc, table_lines):
    """Convert a markdown table into a docx table appended to *doc*.

    Args:
        doc: python-docx document the table is added to.
        table_lines: raw markdown lines of a single table (header row,
            separator row, body rows).

    Returns:
        None. Nothing is added to *doc* when fewer than two non-separator
        rows remain after parsing.

    The column count is taken from the first (header) row; extra cells in
    longer rows are not written, and shorter rows leave trailing cells empty.
    Every cell is centred and set in 8 pt Times New Roman, with the header
    row in bold. An empty paragraph is appended after the table as spacing.
    """
    rows_data = _parse_md_table_rows(table_lines)
    if len(rows_data) < 2:
        return

    ncols = len(rows_data[0])
    table = doc.add_table(rows=len(rows_data), cols=ncols)
    table.style = 'Table Grid'

    for r_idx, row in enumerate(rows_data):
        row_cells = table.rows[r_idx].cells
        is_header = r_idx == 0
        for c_idx, value in enumerate(row[:ncols]):
            cell = row_cells[c_idx]
            cell.text = value
            for p in cell.paragraphs:
                p.alignment = WD_ALIGN_PARAGRAPH.CENTER
                for run in p.runs:
                    run.font.size = Pt(8)
                    run.font.name = 'Times New Roman'
                    if is_header:
                        run.bold = True

    doc.add_paragraph()
|
|
|
|
|
|
def _clean_inline_markdown(text):
    """Return *text* with inline markdown markers removed and '---' turned into an em dash."""
    text = re.sub(r'\*\*\*(.+?)\*\*\*', r'\1', text)  # bold italic
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)  # bold
    text = re.sub(r'\*(.+?)\*', r'\1', text)  # italic
    text = re.sub(r'`(.+?)`', r'\1', text)  # code
    text = text.replace('$$', '')  # LaTeX delimiters
    return text.replace('---', '\u2014')  # em dash


def _add_body_run(paragraph, text):
    """Append *text* to *paragraph* as a 10 pt Times New Roman run."""
    run = paragraph.add_run(text)
    run.font.size = Pt(10)
    run.font.name = 'Times New Roman'
    return run


def _add_black_heading(doc, text, level):
    """Add a heading of the given level and force its runs to black."""
    heading = doc.add_heading(text, level=level)
    for run in heading.runs:
        run.font.color.rgb = RGBColor(0, 0, 0)


def _insert_triggered_figures(doc, para_text):
    """Insert every figure from FIGURES whose trigger substring occurs in *para_text*."""
    for trigger, (fig_file, caption, width) in FIGURES.items():
        if trigger not in para_text:
            continue
        fig_path = FIG_DIR / fig_file
        if not fig_path.exists():
            # Keep the export best-effort, but make the gap visible.
            print(f"Warning: figure not found, skipped: {fig_path}")
            continue

        fp = doc.add_paragraph()
        fp.alignment = WD_ALIGN_PARAGRAPH.CENTER
        fp.add_run().add_picture(str(fig_path), width=Inches(width))

        cp = doc.add_paragraph()
        cp.alignment = WD_ALIGN_PARAGRAPH.CENTER
        cr = cp.add_run(caption)
        cr.font.size = Pt(9)
        cr.font.name = 'Times New Roman'
        cr.italic = True


def process_section(doc, filepath):
    """Append the content of one markdown section file to *doc*.

    Args:
        doc: python-docx document being built.
        filepath: path object of the markdown file; read as UTF-8.

    HTML comments are removed first. The remaining lines are then handled
    one construct at a time:

    * ``#`` to ``######`` headings become docx headings of level 1-6 with
      black text (heading text is used as written, without inline cleanup);
    * a line containing ``|`` that is followed by a separator row starts a
      table, which is passed to ``add_md_table``;
    * ``1. `` style lines become 'List Number' items and ``- `` lines become
      'List Bullet' items;
    * anything else starts a regular paragraph; following non-empty lines are
      joined to it with spaces until a blank line, heading, table row or list
      item is reached.

    List items and paragraphs have inline markdown markers stripped. After
    each regular paragraph, any figure in FIGURES whose trigger text occurs in
    the cleaned paragraph is inserted with its caption; a missing image file
    is reported with a warning and skipped.
    """
    text = strip_comments(filepath.read_text(encoding='utf-8'))
    lines = text.split('\n')

    i = 0
    while i < len(lines):
        stripped = lines[i].strip()

        # Skip empty lines
        if not stripped:
            i += 1
            continue

        # Headings: one to six '#' followed by a single space.
        heading = re.match(r'^(#{1,6}) (.*)$', stripped)
        if heading:
            _add_black_heading(doc, heading.group(2), len(heading.group(1)))
            i += 1
            continue

        # Markdown table: consume every consecutive line containing a pipe.
        if '|' in stripped and i + 1 < len(lines) and re.match(r'\s*\|[-|: ]+\|', lines[i + 1]):
            table_lines = []
            while i < len(lines) and '|' in lines[i]:
                table_lines.append(lines[i])
                i += 1
            add_md_table(doc, table_lines)
            continue

        # Numbered list
        numbered = re.match(r'^\d+\.\s', stripped)
        if numbered:
            p = doc.add_paragraph(style='List Number')
            _add_body_run(p, _clean_inline_markdown(stripped[numbered.end():]))
            i += 1
            continue

        # Bullet list
        if stripped.startswith('- '):
            p = doc.add_paragraph(style='List Bullet')
            _add_body_run(p, _clean_inline_markdown(stripped[2:]))
            i += 1
            continue

        # Regular paragraph - collect continuation lines
        para_lines = [stripped]
        i += 1
        while i < len(lines):
            next_line = lines[i].strip()
            if (not next_line
                    or next_line.startswith(('#', '|', '- '))
                    or re.match(r'^\d+\.\s', next_line)):
                break
            para_lines.append(next_line)
            i += 1

        para_text = _clean_inline_markdown(' '.join(para_lines))

        p = doc.add_paragraph()
        p.paragraph_format.space_after = Pt(6)
        _add_body_run(p, para_text)

        # Check if we should insert a figure after this paragraph
        _insert_triggered_figures(doc, para_text)
|
|
|
|
|
|
def main():
    """Build the Word draft from the markdown section files and save it to OUTPUT.

    Creates a new document, sets the 'Normal' style to 10 pt Times New Roman,
    writes the title and the anonymised author line, then appends each file
    listed in SECTIONS (looked up under PAPER_DIR) in order. A section file
    that does not exist is reported with a warning and skipped, so the export
    still completes. Finally the document is saved and the path is printed.
    """
    doc = Document()

    # Set default font
    normal_style = doc.styles['Normal']
    normal_style.font.name = 'Times New Roman'
    normal_style.font.size = Pt(10)

    # Title page
    title_par = doc.add_paragraph()
    title_par.alignment = WD_ALIGN_PARAGRAPH.CENTER
    title_par.paragraph_format.space_after = Pt(12)
    title_run = title_par.add_run("Automated Detection of Digitally Replicated Signatures\nin Large-Scale Financial Audit Reports")
    title_run.font.size = Pt(16)
    title_run.font.name = 'Times New Roman'
    title_run.bold = True

    author_par = doc.add_paragraph()
    author_par.alignment = WD_ALIGN_PARAGRAPH.CENTER
    author_par.paragraph_format.space_after = Pt(20)
    author_run = author_par.add_run("[Authors removed for double-blind review]")
    author_run.font.size = Pt(10)
    author_run.italic = True

    # Process each section
    for section_file in SECTIONS:
        filepath = PAPER_DIR / section_file
        if not filepath.exists():
            # Stay best-effort, but do not produce an incomplete paper silently.
            print(f"Warning: section file not found, skipped: {filepath}")
            continue
        process_section(doc, filepath)

    doc.save(str(OUTPUT))
    print(f"Saved: {OUTPUT}")
|
|
|
|
|
|
# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|