#!/usr/bin/env python3
"""Paper A v3 markdown / DOCX leak linter.

Runs two passes:

Source pass — scans the v3 markdown sources for syntax patterns that the
python-docx export pipeline does NOT render natively. Each finding is a
file:line:severity:message tuple. Severity is ERROR (will leak literal
syntax into Word), WARN (sometimes leaks), or INFO (style nits).

DOCX pass — opens the rendered DOCX and scans every paragraph and table
cell for known leak signatures. This is the authoritative check: even if
the source pass is clean, the DOCX pass tells you what your partner will
actually see. The DOCX pass currently checks for:
  - leftover LaTeX commands (`\\cmd`)
  - unstripped `$` math delimiters
  - pandoc footnote markers (`[^name]`)
  - markdown blockquote markers (lines starting with `> `)
  - TeX brace tricks (`{=}`, `{,}`)
  - PUA sentinels (`\\uE000`, `\\uE001`) leaking from the math-region
    run-splitter
  - the synthetic table-caption marker `__TABLE_CAPTION__:` if it ever
    survives processing

Exit code:
  0  clean
  1  WARN-level findings only (ship-able after review)
  2  ERROR-level findings (do NOT ship)

Usage:
  python3 paper/lint_paper_v3.py            # both passes
  python3 paper/lint_paper_v3.py --source   # source-side only
  python3 paper/lint_paper_v3.py --docx     # DOCX-side only

Designed to be run after `python3 export_v3.py` and before copying the
DOCX to ~/Downloads.
"""
from __future__ import annotations

import argparse
import re
import sys
from dataclasses import dataclass
from pathlib import Path

# Paths are resolved relative to this script so the linter works from any CWD.
PAPER_DIR = Path(__file__).resolve().parent
DOCX_PATH = PAPER_DIR / "Paper_A_IEEE_Access_Draft_v3.docx"

# The v3 markdown sources, in paper order; each is scanned by the source pass.
V3_SOURCES = [
    "paper_a_abstract_v3.md",
    "paper_a_introduction_v3.md",
    "paper_a_related_work_v3.md",
    "paper_a_methodology_v3.md",
    "paper_a_results_v3.md",
    "paper_a_discussion_v3.md",
    "paper_a_conclusion_v3.md",
    "paper_a_appendix_v3.md",
    "paper_a_declarations_v3.md",
    "paper_a_references_v3.md",
]

# ---------------------------------------------------------------------------
# Finding model + ANSI colour helpers
# ---------------------------------------------------------------------------

# Higher rank = more severe; used for sorting and exit-code selection.
SEVERITY_RANK = {"ERROR": 2, "WARN": 1, "INFO": 0}

COLOR = {
    "ERROR": "\033[31m",  # red
    "WARN": "\033[33m",   # yellow
    "INFO": "\033[36m",   # cyan
    "RESET": "\033[0m",
    "BOLD": "\033[1m",
}


@dataclass
class Finding:
    """One lint finding from either pass."""

    severity: str
    rule: str
    location: str  # "file:line" or "DOCX:para 42" / "DOCX:table 6 row 3 col 2"
    message: str
    snippet: str = ""

    def render(self, use_color: bool = True) -> str:
        """Format the finding for terminal output, optionally colourised."""
        col = COLOR[self.severity] if use_color else ""
        rst = COLOR["RESET"] if use_color else ""
        bold = COLOR["BOLD"] if use_color else ""
        head = f"{col}[{self.severity}]{rst} {bold}{self.rule}{rst} @ {self.location}"
        body = f"\n {self.message}"
        snip = f"\n > {self.snippet}" if self.snippet else ""
        return head + body + snip


# ---------------------------------------------------------------------------
# Source-side rules
# ---------------------------------------------------------------------------
# Each rule: (pattern, severity, rule_id, message, predicate)
# predicate(match, line) → bool: returns True to keep the finding (lets us
# suppress matches that are inside HTML comments or fenced code blocks).
def _outside_table_comment(match: re.Match, line: str, in_comment: bool, in_table: bool) -> bool:
    """Suppress findings inside HTML comments (where they're allowed) or
    inside markdown table rows (where they survive intact via add_md_table)."""
    return not in_comment and not in_table


def _always(match: re.Match, line: str, in_comment: bool, in_table: bool) -> bool:
    """Keep every match regardless of comment/table context."""
    return True


SOURCE_RULES = [
    # Pandoc footnote markers — leak as raw text in the DOCX.
    (re.compile(r"\[\^[A-Za-z0-9_-]+\]"),
     "ERROR", "pandoc-footnote",
     "Pandoc-style footnote `[^name]` does not render in DOCX. "
     "Inline the explanation as a parenthetical instead.",
     _outside_table_comment),
    # Markdown blockquote `> body` lines — exporter strips them defensively
    # now, but flag for awareness so authors don't rely on them rendering.
    (re.compile(r"^>\s"),
     "WARN", "blockquote",
     "Markdown blockquote `> ...` is stripped to plain paragraph in DOCX "
     "(no quote-block formatting). If you intended a callout, use bold "
     "lead-in instead.",
     _always),
    # Display-math fences `$$...$$` (only when the line itself starts with
    # `$$`) — exporter does best-effort linearisation, but the result is
    # ugly. Inline the equation as plain prose where possible.
    (re.compile(r"^\$\$.+?\$\$\s*$|^\$\$\s*$"),
     "WARN", "display-math",
     "Display math `$$...$$` renders as a best-effort plain-text "
     "linearisation in DOCX (no MathType/equation rendering). Consider "
     "replacing with a numbered equation image or inline prose.",
     _always),
    # Inline math containing `\frac{...{...}...}` — nested braces in a
    # frac argument are not handled by the exporter's regex.
    (re.compile(r"\\t?frac\{[^{}]*\{[^{}]*\}[^{}]*\}\{|\\t?frac\{[^{}]+\}\{[^{}]*\{"),
     "WARN", "nested-frac",
     "Nested-brace `\\frac{...}{...}` may not linearise cleanly. Verify "
     "the rendered DOCX paragraph or rewrite the math inline.",
     _outside_table_comment),
    # Setext-style headers (=== / ---) under a line of text — not handled.
(re.compile(r"^=+\s*$|^-{3,}\s*$"), "INFO", "setext-header", "Setext-style header (=== / ---) is not handled by the exporter; " "use ATX (#, ##, ###) instead.", _always), # Pandoc fenced div `:::` — not handled. (re.compile(r"^:::"), "ERROR", "pandoc-fenced-div", "Pandoc fenced div `:::` is not handled by the exporter and would " "leak into the DOCX as plain text.", _always), # Pandoc bracketed-attribute spans `[text]{.class}` — not handled. (re.compile(r"\][\{][^}]*[\}]"), "WARN", "pandoc-attribute-span", "Pandoc attribute span `[text]{.class}` is not parsed by the exporter " "and the brace block will leak.", _outside_table_comment), # File paths in body text — Appendix B is the canonical home for # script→artifact references. (re.compile(r"`signature_analysis/\d+_[a-z_]+\.py`"), "INFO", "script-path-in-body", "Verbose script path in body text. Consider replacing with " "'(reproduction artifact in Appendix B)' for body-prose tightness.", _outside_table_comment), # `reports/...json` paths in body text — same rationale. (re.compile(r"`reports/[a-z_]+/[a-z_]+\.(?:json|md)`"), "INFO", "report-path-in-body", "Verbose report-artifact path in body text. Consider replacing with " "'(see Appendix B provenance map)'.", _outside_table_comment), # Bare HTML comments that are NOT TABLE/FIGURE markers may indicate # editorial residue. Stripped wholesale by exporter, so harmless, but # worth visibility. (re.compile(r"^" in line: in_comment = False return findings # --------------------------------------------------------------------------- # DOCX-side rules # --------------------------------------------------------------------------- DOCX_LEAK_PATTERNS = [ # (pattern, severity, rule_id, message) (re.compile(r"\\[a-zA-Z]+(?:\{[^{}]*\})?"), "ERROR", "leftover-latex-cmd", "LaTeX command `\\cmd` leaked into DOCX. 
Either add a token rule to " "`latex_to_unicode` in `export_v3.py` or rewrite the source as plain text."), (re.compile(r"(?\s"), "ERROR", "blockquote-leak", "Markdown blockquote `> ...` leaked literal `>` into DOCX. The " "exporter pre-pass should strip these — check `process_section`."), (re.compile(r"\{[,=<>+\-]\}"), "ERROR", "tex-brace-trick", "TeX brace-trick `{=}` / `{,}` leaked. Should be stripped by " "`latex_to_unicode`."), (re.compile(r"[]"), "ERROR", "pua-sentinel-leak", "Math-region PUA sentinel (\\uE000 / \\uE001) leaked. A render path " "is bypassing `add_text_with_subsup`; check headings / list items / " "title-page paragraphs."), (re.compile(r"__TABLE_CAPTION__"), "ERROR", "table-caption-marker-leak", "Synthetic `__TABLE_CAPTION__:` marker leaked. The marker is meant " "to be consumed by `process_section` and rendered as a centered " "bold caption paragraph."), (re.compile(r"signature[a-z]+analysis/\d+[a-z_]+\.py"), "ERROR", "underscore-eaten-path", "Underscores eaten from a script path (e.g., " "`signatureanalysis/28byteidentitydecomposition.py`). The " "math-context-scoped subscript handler in `add_text_with_subsup` " "should leave underscores intact in plain text."), (re.compile(r"\b(\w+_\w+)+\b", flags=re.UNICODE), "INFO", "underscore-identifier", "Underscored identifier in body text (e.g., a code symbol or path). " "Verify it renders with underscores intact, not as subscripts."), ] def lint_docx(docx_path: Path = DOCX_PATH) -> list[Finding]: try: from docx import Document except ImportError: return [Finding("ERROR", "missing-dep", "lint:docx", "python-docx is not installed; cannot run DOCX pass.")] if not docx_path.exists(): return [Finding("ERROR", "missing-docx", str(docx_path), "Built DOCX not found. 
Run `python3 export_v3.py` first.")] doc = Document(str(docx_path)) findings: list[Finding] = [] seen_signatures = set() # dedupe identical leaks across paragraphs def scan(text: str, location: str): for pat, sev, rule, msg in DOCX_LEAK_PATTERNS: for m in pat.finditer(text): # Skip the INFO-level identifier rule unless it looks like # an obvious math residue (e.g., dHash_indep or N_a). if rule == "underscore-identifier": sample = m.group(0) # Only complain about identifiers that look like math # residue: short, underscore-separated single-char tokens. parts = sample.split("_") if not all(len(p) <= 4 for p in parts): continue if not all(p.isalnum() and not p.isdigit() for p in parts): continue key = (rule, m.group(0)) if key in seen_signatures: continue seen_signatures.add(key) findings.append(Finding( severity=sev, rule=rule, location=location, message=msg, snippet=text[max(0, m.start() - 30):m.end() + 30].replace("\n", " ")[:140], )) for i, p in enumerate(doc.paragraphs): if p.text: scan(p.text, f"DOCX:para {i}") for ti, t in enumerate(doc.tables): for ri, row in enumerate(t.rows): for ci, cell in enumerate(row.cells): if cell.text: scan(cell.text, f"DOCX:table {ti + 1} row {ri} col {ci}") return findings # --------------------------------------------------------------------------- # Reporter # --------------------------------------------------------------------------- def summarise(findings: list[Finding], use_color: bool = True) -> int: def c(key: str) -> str: return COLOR[key] if use_color else "" if not findings: print(f"{c('BOLD')}{c('INFO')}clean — no leaks detected{c('RESET')}") return 0 counts = {"ERROR": 0, "WARN": 0, "INFO": 0} findings.sort(key=lambda f: (-SEVERITY_RANK[f.severity], f.location)) for f in findings: counts[f.severity] += 1 print(f.render(use_color)) print() print(f"{c('BOLD')}summary{c('RESET')}: " f"{c('ERROR')}{counts['ERROR']} ERROR{c('RESET')} " f"{c('WARN')}{counts['WARN']} WARN{c('RESET')} " f"{c('INFO')}{counts['INFO']} 
INFO{c('RESET')}") if counts["ERROR"]: return 2 if counts["WARN"]: return 1 return 0 def main(): ap = argparse.ArgumentParser( description="Lint Paper A v3 markdown sources and rendered DOCX for " "syntax-leak issues.", ) ap.add_argument("--source", action="store_true", help="run only the markdown source pass") ap.add_argument("--docx", action="store_true", help="run only the rendered DOCX pass") ap.add_argument("--no-color", action="store_true", help="disable ANSI colour output") args = ap.parse_args() use_color = sys.stdout.isatty() and not args.no_color findings: list[Finding] = [] if args.source or not (args.source or args.docx): print(f"{COLOR['BOLD'] if use_color else ''}--- source pass " f"({len(V3_SOURCES)} files) ---{COLOR['RESET'] if use_color else ''}") findings.extend(lint_sources()) if args.docx or not (args.source or args.docx): print(f"{COLOR['BOLD'] if use_color else ''}\n--- docx pass " f"({DOCX_PATH.name}) ---{COLOR['RESET'] if use_color else ''}") findings.extend(lint_docx()) print() sys.exit(summarise(findings, use_color)) if __name__ == "__main__": main()