Add Paper A (IEEE TAI) complete draft with Firm A-calibrated dual-method classification
Paper draft includes all sections (Abstract through Conclusion), 36 references, and supporting scripts.

Key methodology: cosine similarity + dHash dual-method verification, with thresholds calibrated against a known-replication firm (Firm A).

Includes:
- 8 section markdown files (paper_a_*.md)
- Ablation study script (ResNet-50 vs VGG-16 vs EfficientNet-B0)
- Recalibrated classification script (84,386 PDFs, 5-tier system)
- Figure generation and Word export scripts
- Citation renumbering script ([1]-[36])
- Signature analysis pipeline (12 steps)
- YOLO extraction scripts

Three rounds of AI review completed (GPT-5.4, Claude Opus 4.6, Gemini 3 Pro).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,231 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Export Paper A v2 to Word, reading from md section files."""
|
||||
|
||||
from docx import Document
|
||||
from docx.shared import Inches, Pt, RGBColor
|
||||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||||
from pathlib import Path
|
||||
import re
|
||||
|
||||
# Directory holding the markdown section files; the exported .docx is
# written into the same directory.
PAPER_DIR = Path("/Volumes/NV2/pdf_recognize/paper")

# Directory the figure image files are loaded from.
FIG_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis/paper_figures")

# Destination of the generated Word document.
OUTPUT = PAPER_DIR.joinpath("Paper_A_IEEE_TAI_Draft_v2.docx")

# Section files (relative to PAPER_DIR), in the order they are exported.
SECTIONS = [
    "paper_a_abstract.md",
    "paper_a_impact_statement.md",
    "paper_a_introduction.md",
    "paper_a_related_work.md",
    "paper_a_methodology.md",
    "paper_a_results.md",
    "paper_a_discussion.md",
    "paper_a_conclusion.md",
    "paper_a_references.md",
]

# Trigger phrase -> (image file name under FIG_DIR, caption, width in inches).
# A figure is inserted after any body paragraph containing its trigger phrase.
FIGURES = {
    "Fig. 1 illustrates": (
        "fig1_pipeline.png",
        "Fig. 1. Pipeline architecture for automated signature replication detection.",
        6.5,
    ),
    "Fig. 2 presents": (
        "fig2_intra_inter_kde.png",
        "Fig. 2. Cosine similarity distributions: intra-class vs. inter-class with KDE crossover at 0.837.",
        3.5,
    ),
    "Fig. 3 presents": (
        "fig3_firm_a_calibration.png",
        "Fig. 3. Per-signature best-match cosine similarity: Firm A (known replication) vs. other CPAs.",
        3.5,
    ),
    "conducted an ablation study comparing three": (
        "fig4_ablation.png",
        "Fig. 4. Ablation study comparing three feature extraction backbones.",
        6.5,
    ),
}
|
||||
|
||||
|
||||
def strip_comments(text):
    """Return *text* with every ``<!-- ... -->`` HTML comment deleted.

    Matching is non-greedy and spans newlines, so a multi-line comment is
    removed as one unit and neighbouring comments are removed separately.
    """
    html_comment = re.compile(r'<!--.*?-->', flags=re.DOTALL)
    return html_comment.sub('', text)
|
||||
|
||||
|
||||
def extract_tables(text):
    """Locate markdown tables in *text*.

    A table starts at a line containing ``|`` that is immediately followed
    by a separator row (for example ``|---|:--:|``) and extends over every
    subsequent line that still contains ``|``.

    Args:
        text: Markdown source as a single string.

    Returns:
        A list of ``(start, table_lines)`` tuples. ``start`` is the 0-based
        index of the table's header line within the newline-split text, and
        ``table_lines`` is the list of raw lines that make up the table
        (header, separator and body rows). The list is empty when no table
        is found.
    """
    lines = text.split('\n')
    tables = []
    i = 0
    while i < len(lines):
        # A header row only counts when the very next line is a separator.
        followed_by_separator = (
            i + 1 < len(lines)
            and re.match(r'\s*\|[-|: ]+\|', lines[i + 1])
        )
        if '|' in lines[i] and followed_by_separator:
            start = i
            # Consume the whole run of pipe-bearing lines as one table.
            while i < len(lines) and '|' in lines[i]:
                i += 1
            tables.append((start, lines[start:i]))
        else:
            i += 1
    return tables
|
||||
|
||||
|
||||
def add_md_table(doc, table_lines):
    """Render one markdown table as a grid-styled docx table appended to *doc*.

    Args:
        doc: Document object; ``doc.add_table`` and ``doc.add_paragraph``
            are called on it.
        table_lines: Raw markdown lines of a single table (header,
            separator and body rows).

    The first non-separator row fixes the column count. Surplus cells in
    later rows are dropped and rows with fewer cells leave the remaining
    table cells untouched. Nothing is added to *doc* when fewer than two
    non-separator rows are found.
    """
    rows_data = []
    for line in table_lines:
        # Trim surrounding whitespace *before* removing the outer pipes.
        # Otherwise an indented row keeps its leading pipe, gains a phantom
        # empty first cell, and the separator row slips past the check below.
        cells = [c.strip() for c in line.strip().strip('|').split('|')]
        # A separator row consists solely of dash cells, optionally with
        # alignment colons. Testing every cell (not only the first) keeps
        # data rows whose first cell happens to be a "-" placeholder.
        if all(re.fullmatch(r':?-+:?', c) for c in cells):
            continue
        rows_data.append(cells)

    if len(rows_data) < 2:
        return

    ncols = len(rows_data[0])
    table = doc.add_table(rows=len(rows_data), cols=ncols)
    table.style = 'Table Grid'

    for r_idx, row in enumerate(rows_data):
        is_header = r_idx == 0
        for c_idx, value in enumerate(row[:ncols]):
            cell = table.rows[r_idx].cells[c_idx]
            cell.text = value
            for p in cell.paragraphs:
                p.alignment = WD_ALIGN_PARAGRAPH.CENTER
                for run in p.runs:
                    run.font.size = Pt(8)
                    run.font.name = 'Times New Roman'
                    if is_header:
                        run.bold = True

    # Empty paragraph so following content does not touch the table.
    doc.add_paragraph()
|
||||
|
||||
|
||||
def _clean_inline_markdown(text):
    """Strip inline markdown markers that would otherwise show up literally."""
    text = re.sub(r'\*\*\*(.+?)\*\*\*', r'\1', text)  # bold italic
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)  # bold
    text = re.sub(r'\*(.+?)\*', r'\1', text)  # italic
    text = re.sub(r'`(.+?)`', r'\1', text)  # inline code
    text = text.replace('$$', '')  # LaTeX delimiters
    return text.replace('---', '\u2014')  # em dash


def _add_body_run(paragraph, text):
    """Append *text* to *paragraph* as a 10 pt Times New Roman run."""
    run = paragraph.add_run(text)
    run.font.size = Pt(10)
    run.font.name = 'Times New Roman'
    return run


def _add_black_heading(doc, text, level):
    """Add a heading of the given level with its runs forced to black."""
    heading = doc.add_heading(text, level=level)
    for run in heading.runs:
        run.font.color.rgb = RGBColor(0, 0, 0)
    return heading


def _insert_triggered_figures(doc, para_text):
    """Append each figure (picture + caption) whose trigger is in *para_text*."""
    for trigger, (fig_file, caption, width) in FIGURES.items():
        if trigger not in para_text:
            continue
        fig_path = FIG_DIR / fig_file
        if not fig_path.exists():
            # Keep exporting, but make the gap visible instead of silent.
            print(f"Warning: figure not found, skipped: {fig_path}")
            continue

        picture_par = doc.add_paragraph()
        picture_par.alignment = WD_ALIGN_PARAGRAPH.CENTER
        picture_par.add_run().add_picture(str(fig_path), width=Inches(width))

        caption_par = doc.add_paragraph()
        caption_par.alignment = WD_ALIGN_PARAGRAPH.CENTER
        caption_run = caption_par.add_run(caption)
        caption_run.font.size = Pt(9)
        caption_run.font.name = 'Times New Roman'
        caption_run.italic = True


def process_section(doc, filepath):
    """Append the content of one markdown section file to *doc*.

    Args:
        doc: Document object that receives headings, tables, list items,
            paragraphs and figures.
        filepath: ``pathlib.Path`` of the markdown file, read as UTF-8.

    HTML comments are removed first. The remaining text is then walked
    line by line and each non-blank line is dispatched as one of:

    * ``#`` .. ``######`` heading -> docx heading of the same level,
      coloured black;
    * markdown table (a ``|`` line followed by a separator row) -> handed
      to ``add_md_table``;
    * ``1.`` numbered item -> ``List Number`` paragraph;
    * ``- `` bullet item -> ``List Bullet`` paragraph;
    * anything else -> body paragraph, joined with its continuation lines
      by single spaces.

    List items and paragraphs have inline markdown markers stripped. After
    each body paragraph, every figure in ``FIGURES`` whose trigger phrase
    occurs in the cleaned paragraph text is inserted with its caption.
    """
    text = strip_comments(filepath.read_text(encoding='utf-8'))
    lines = text.split('\n')
    i = 0
    while i < len(lines):
        stripped = lines[i].strip()

        # Skip empty lines
        if not stripped:
            i += 1
            continue

        # Headings: one pattern covers every level, so "####" and deeper are
        # no longer emitted as body text with literal hashes.
        heading = re.match(r'(#{1,6})[ \t]+(.+)', stripped)
        if heading:
            _add_black_heading(doc, heading.group(2), len(heading.group(1)))
            i += 1
            continue

        # Markdown table
        if ('|' in stripped and i + 1 < len(lines)
                and re.match(r'\s*\|[-|: ]+\|', lines[i + 1])):
            table_lines = []
            while i < len(lines) and '|' in lines[i]:
                table_lines.append(lines[i])
                i += 1
            add_md_table(doc, table_lines)
            continue

        # Numbered list
        numbered = re.match(r'^\d+\.\s', stripped)
        if numbered:
            p = doc.add_paragraph(style='List Number')
            _add_body_run(p, _clean_inline_markdown(stripped[numbered.end():]))
            i += 1
            continue

        # Bullet list
        if stripped.startswith('- '):
            p = doc.add_paragraph(style='List Bullet')
            _add_body_run(p, _clean_inline_markdown(stripped[2:]))
            i += 1
            continue

        # Regular paragraph - collect continuation lines until a blank line
        # or the start of another construct.
        para_lines = [stripped]
        i += 1
        while i < len(lines):
            next_line = lines[i].strip()
            if (not next_line
                    or next_line.startswith(('#', '|', '- '))
                    or re.match(r'^\d+\.\s', next_line)):
                break
            para_lines.append(next_line)
            i += 1

        para_text = _clean_inline_markdown(' '.join(para_lines))

        p = doc.add_paragraph()
        p.paragraph_format.space_after = Pt(6)
        _add_body_run(p, para_text)

        # Check if we should insert a figure after this paragraph
        _insert_triggered_figures(doc, para_text)
|
||||
|
||||
|
||||
def main():
    """Build the Word draft: title page, then every section file, then save.

    Sections listed in ``SECTIONS`` are read from ``PAPER_DIR`` in order.
    A section file that does not exist is skipped, and all skipped files
    are reported after saving so an incomplete export does not go unnoticed.
    The document is written to ``OUTPUT``.
    """
    doc = Document()

    # Set default font
    normal_style = doc.styles['Normal']
    normal_style.font.name = 'Times New Roman'
    normal_style.font.size = Pt(10)

    # Title page
    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p.paragraph_format.space_after = Pt(12)
    run = p.add_run("Automated Detection of Digitally Replicated Signatures\nin Large-Scale Financial Audit Reports")
    run.font.size = Pt(16)
    run.font.name = 'Times New Roman'
    run.bold = True

    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p.paragraph_format.space_after = Pt(20)
    run = p.add_run("[Authors removed for double-blind review]")
    run.font.size = Pt(10)
    run.italic = True

    # Process each section, remembering any file that is not there.
    missing = []
    for section_file in SECTIONS:
        filepath = PAPER_DIR / section_file
        if filepath.exists():
            process_section(doc, filepath)
        else:
            missing.append(section_file)

    doc.save(str(OUTPUT))
    print(f"Saved: {OUTPUT}")
    if missing:
        print(f"Warning: {len(missing)} section file(s) not found and skipped: {', '.join(missing)}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user