Files
pdf_signature_extraction/paper/export_v2.py
T
gbanyan 939a348da4 Add Paper A (IEEE TAI) complete draft with Firm A-calibrated dual-method classification
Paper draft includes all sections (Abstract through Conclusion), 36 references,
and supporting scripts. Key methodology: Cosine similarity + dHash dual-method
verification with thresholds calibrated against known-replication firm (Firm A).

Includes:
- 8 section markdown files (paper_a_*.md)
- Ablation study script (ResNet-50 vs VGG-16 vs EfficientNet-B0)
- Recalibrated classification script (84,386 PDFs, 5-tier system)
- Figure generation and Word export scripts
- Citation renumbering script ([1]-[36])
- Signature analysis pipeline (12 steps)
- YOLO extraction scripts

Three rounds of AI review completed (GPT-5.4, Claude Opus 4.6, Gemini 3 Pro).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-06 23:05:33 +08:00

232 lines
7.8 KiB
Python

#!/usr/bin/env python3
"""Export Paper A v2 to Word, reading from md section files."""
from docx import Document
from docx.shared import Inches, Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from pathlib import Path
import re
# Directory containing the section markdown files listed in SECTIONS; the
# exported .docx is written into this directory as well (see OUTPUT).
# NOTE: hard-coded absolute path -- adjust when running on another machine.
PAPER_DIR = Path("/Volumes/NV2/pdf_recognize/paper")
# Directory in which the figure image files named in FIGURES are looked up.
FIG_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis/paper_figures")
# Destination path of the generated Word document.
OUTPUT = PAPER_DIR / "Paper_A_IEEE_TAI_Draft_v2.docx"
# Section files, processed in exactly this order to build the document.
SECTIONS = [
"paper_a_abstract.md",
"paper_a_impact_statement.md",
"paper_a_introduction.md",
"paper_a_related_work.md",
"paper_a_methodology.md",
"paper_a_results.md",
"paper_a_discussion.md",
"paper_a_conclusion.md",
"paper_a_references.md",
]
# Figure placement table: trigger substring -> (image file name under FIG_DIR,
# caption text, picture width in inches). A figure is inserted right after any
# body paragraph whose cleaned text contains the trigger substring.
FIGURES = {
"Fig. 1 illustrates": ("fig1_pipeline.png", "Fig. 1. Pipeline architecture for automated signature replication detection.", 6.5),
"Fig. 2 presents": ("fig2_intra_inter_kde.png", "Fig. 2. Cosine similarity distributions: intra-class vs. inter-class with KDE crossover at 0.837.", 3.5),
"Fig. 3 presents": ("fig3_firm_a_calibration.png", "Fig. 3. Per-signature best-match cosine similarity: Firm A (known replication) vs. other CPAs.", 3.5),
"conducted an ablation study comparing three": ("fig4_ablation.png", "Fig. 4. Ablation study comparing three feature extraction backbones.", 6.5),
}
def strip_comments(text):
    """Return *text* with every ``<!-- ... -->`` HTML comment deleted.

    Matching is non-greedy and spans newlines, so multi-line comments are
    removed and two separate comments never swallow the text between them.
    """
    html_comment = re.compile(r'<!--.*?-->', re.DOTALL)
    return html_comment.sub('', text)
def extract_tables(text):
    """Locate markdown tables in *text*.

    A table starts at a line containing ``|`` that is immediately followed by
    a separator row (pipes, dashes, colons and spaces) and extends over every
    consecutive line that still contains ``|``.

    Returns a list of ``(start_index, table_lines)`` tuples, where
    ``start_index`` is the 0-based index of the header line within
    ``text.split('\\n')`` and ``table_lines`` is the list of raw table lines.
    """
    all_lines = text.split('\n')
    total = len(all_lines)
    found = []
    pos = 0
    while pos < total:
        starts_table = (
            '|' in all_lines[pos]
            and pos + 1 < total
            and re.match(r'\s*\|[-|: ]+\|', all_lines[pos + 1]) is not None
        )
        if not starts_table:
            pos += 1
            continue
        # Consume the header, the separator and every following pipe line.
        end = pos
        while end < total and '|' in all_lines[end]:
            end += 1
        found.append((pos, all_lines[pos:end]))
        pos = end
    return found
def add_md_table(doc, table_lines):
    """Append a markdown table to *doc* as a 'Table Grid' docx table.

    Args:
        doc: python-docx document object the table is appended to.
        table_lines: raw markdown lines of one table (header row, separator
            row, data rows). Lines may carry leading/trailing whitespace.

    The separator row is dropped; the first remaining row is treated as the
    header and rendered bold. Every cell is centered, 8 pt Times New Roman.
    The column count is taken from the header row; surplus cells in longer
    rows are ignored and shorter rows leave their trailing cells empty.
    Nothing is added when fewer than two rows remain (no header + data).
    An empty paragraph is appended after the table as vertical spacing.
    """
    rows_data = []
    for line in table_lines:
        # Trim whitespace *before* the pipes: otherwise an indented line keeps
        # its outer pipes and produces phantom empty first/last columns.
        cells = [c.strip() for c in line.strip().strip('|').split('|')]
        # A separator row consists solely of cells like '---', ':--', '--:'
        # or ':-:'. Checking every cell (not just the first) keeps data rows
        # whose first cell happens to be a dash placeholder such as '-'.
        if all(re.fullmatch(r':?-+:?', c) for c in cells):
            continue
        rows_data.append(cells)
    if len(rows_data) < 2:
        return
    ncols = len(rows_data[0])
    table = doc.add_table(rows=len(rows_data), cols=ncols)
    table.style = 'Table Grid'
    for r_idx, row in enumerate(rows_data):
        row_cells = table.rows[r_idx].cells
        # zip() stops at the shorter side, so over-long rows are truncated to
        # ncols and short rows simply leave the remaining cells untouched.
        for cell, value in zip(row_cells, row):
            cell.text = value
            for p in cell.paragraphs:
                p.alignment = WD_ALIGN_PARAGRAPH.CENTER
                for run in p.runs:
                    run.font.size = Pt(8)
                    run.font.name = 'Times New Roman'
                    if r_idx == 0:
                        run.bold = True
    # Blank paragraph so the following text does not touch the table.
    doc.add_paragraph()
# Markdown heading prefixes and the docx heading level each one maps to.
_HEADING_PREFIXES = (('# ', 1), ('## ', 2), ('### ', 3))
# Separator row that must follow a table header line.
_TABLE_SEPARATOR_RE = re.compile(r'\s*\|[-|: ]+\|')
# Leading "1. " marker of a numbered list item.
_NUMBERED_ITEM_RE = re.compile(r'^\d+\.\s')


def _clean_inline(text):
    """Return *text* with inline markdown/LaTeX markers removed."""
    text = re.sub(r'\*\*\*(.+?)\*\*\*', r'\1', text)  # bold italic
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)  # bold
    text = re.sub(r'\*(.+?)\*', r'\1', text)  # italic
    text = re.sub(r'`(.+?)`', r'\1', text)  # code
    text = text.replace('$$', '')  # LaTeX delimiters
    text = text.replace('---', '\u2014')  # em dash
    return text


def _add_body_run(paragraph, text):
    """Add *text* to *paragraph* as a 10 pt Times New Roman run."""
    run = paragraph.add_run(text)
    run.font.size = Pt(10)
    run.font.name = 'Times New Roman'
    return run


def _insert_triggered_figures(doc, para_text):
    """Insert every FIGURES entry whose trigger substring occurs in *para_text*."""
    for trigger, (fig_file, caption, width) in FIGURES.items():
        if trigger not in para_text:
            continue
        fig_path = FIG_DIR / fig_file
        if not fig_path.exists():
            # Stay best-effort (the export continues), but make the gap visible.
            print(f"Warning: figure not found, skipped: {fig_path}")
            continue
        picture_par = doc.add_paragraph()
        picture_par.alignment = WD_ALIGN_PARAGRAPH.CENTER
        picture_par.add_run().add_picture(str(fig_path), width=Inches(width))
        caption_par = doc.add_paragraph()
        caption_par.alignment = WD_ALIGN_PARAGRAPH.CENTER
        caption_run = caption_par.add_run(caption)
        caption_run.font.size = Pt(9)
        caption_run.font.name = 'Times New Roman'
        caption_run.italic = True


def process_section(doc, filepath):
    """Render one markdown section file into *doc*.

    Args:
        doc: python-docx document object that receives the content.
        filepath: ``pathlib.Path`` of the markdown file (read as UTF-8).

    HTML comments are removed first, then the file is walked line by line:

    * ``#``/``##``/``###`` lines become level 1-3 headings in black.
    * A line containing ``|`` followed by a separator row starts a table that
      runs over all consecutive lines containing ``|``.
    * ``N. `` lines become 'List Number' items, ``- `` lines 'List Bullet'.
    * Anything else starts a paragraph; following lines are joined with a
      space until a blank line, heading, table row or list item is reached.
      After each such paragraph, matching figures from FIGURES are inserted.

    Inline markers (bold/italic/code, ``$$``, ``---``) are stripped from list
    items and paragraphs alike, so both are cleaned by the same rules.
    """
    text = strip_comments(filepath.read_text(encoding='utf-8'))
    lines = text.split('\n')
    i = 0
    while i < len(lines):
        stripped = lines[i].strip()

        # Skip empty lines
        if not stripped:
            i += 1
            continue

        # Headings ('# ' cannot match '## ...', so the order is unambiguous)
        level = next(
            (lvl for prefix, lvl in _HEADING_PREFIXES if stripped.startswith(prefix)),
            None,
        )
        if level is not None:
            # The prefix is `level` hashes plus one space.
            heading = doc.add_heading(stripped[level + 1:], level=level)
            for run in heading.runs:
                run.font.color.rgb = RGBColor(0, 0, 0)
            i += 1
            continue

        # Markdown table
        if ('|' in stripped and i + 1 < len(lines)
                and _TABLE_SEPARATOR_RE.match(lines[i + 1])):
            start = i
            while i < len(lines) and '|' in lines[i]:
                i += 1
            add_md_table(doc, lines[start:i])
            continue

        # Numbered list
        if _NUMBERED_ITEM_RE.match(stripped):
            item = doc.add_paragraph(style='List Number')
            _add_body_run(item, _clean_inline(_NUMBERED_ITEM_RE.sub('', stripped)))
            i += 1
            continue

        # Bullet list
        if stripped.startswith('- '):
            item = doc.add_paragraph(style='List Bullet')
            _add_body_run(item, _clean_inline(stripped[2:]))
            i += 1
            continue

        # Regular paragraph - collect continuation lines
        para_lines = [stripped]
        i += 1
        while i < len(lines):
            next_line = lines[i].strip()
            if (not next_line
                    or next_line.startswith(('#', '|', '- '))
                    or _NUMBERED_ITEM_RE.match(next_line)):
                break
            para_lines.append(next_line)
            i += 1
        para_text = _clean_inline(' '.join(para_lines))

        p = doc.add_paragraph()
        p.paragraph_format.space_after = Pt(6)
        _add_body_run(p, para_text)

        # Check if we should insert a figure after this paragraph
        _insert_triggered_figures(doc, para_text)
def main():
    """Build the Word draft from the section files and save it to OUTPUT.

    Sets the document's Normal style to 10 pt Times New Roman, writes a
    centered title block (title plus anonymised author line), renders every
    file in SECTIONS in order via process_section(), then saves the document
    and prints the output path. Section files that do not exist are skipped
    with a warning so an incomplete export is noticeable.
    """
    doc = Document()

    # Set default font
    normal_style = doc.styles['Normal']
    normal_style.font.name = 'Times New Roman'
    normal_style.font.size = Pt(10)

    # Title page
    title_par = doc.add_paragraph()
    title_par.alignment = WD_ALIGN_PARAGRAPH.CENTER
    title_par.paragraph_format.space_after = Pt(12)
    title_run = title_par.add_run(
        "Automated Detection of Digitally Replicated Signatures\n"
        "in Large-Scale Financial Audit Reports"
    )
    title_run.font.size = Pt(16)
    title_run.font.name = 'Times New Roman'
    title_run.bold = True

    authors_par = doc.add_paragraph()
    authors_par.alignment = WD_ALIGN_PARAGRAPH.CENTER
    authors_par.paragraph_format.space_after = Pt(20)
    authors_run = authors_par.add_run("[Authors removed for double-blind review]")
    authors_run.font.size = Pt(10)
    authors_run.italic = True

    # Process each section
    for section_file in SECTIONS:
        filepath = PAPER_DIR / section_file
        if not filepath.exists():
            # Keep the export best-effort, but never drop a section silently.
            print(f"Warning: section file not found, skipped: {filepath}")
            continue
        process_section(doc, filepath)

    doc.save(str(OUTPUT))
    print(f"Saved: {OUTPUT}")


if __name__ == "__main__":
    main()