53125d11d9
Substantive content (addresses partner Jimmy's 2026-04-27 review of v3.19.1):

Must-fix items (6/6):
- §III-F SSIM/pixel rejection rewritten from first principles (design-level argument from the luminance/contrast/structure local-window product, not the prior empirical 0.70 result)
- Table VI restructured by population × method; added missing Firm A logit-Gaussian-2 0.999 row; KDE marked undefined (unimodal), BD/McCrary marked bin-unstable (Appendix A)
- Tables IX / XI / §IV-F.3 dHash 5/8/15 inconsistency resolved: ≤8 demoted from "operational dual" to "calibration-fold-adjacent reference"; the actual classifier rule (cos > 0.95 AND dH ≤ 15 = 92.46%) added throughout
- New Fig. 4 (yearly per-firm best-match cosine, 5 lines, 2013-2023, Firm A on top); script 30_yearly_big4_comparison.py
- Tables XIV / XV extended with top-20% (94.8%) and top-30% (81.3%) brackets
- §III-K reframed P7.5 from "round-number lower-tail boundary" to operating point; new Table XII-B (cosine-FAR-capture tradeoff at 5 thresholds: 0.9407 / 0.945 / 0.95 / 0.977 / 0.985)

Nice-to-have items (3/3):
- Table XII expanded to 6-cut classifier sensitivity grid (0.940-0.985)
- Defensive parentheticals (84,386 vs 85,042; 30,226 vs 30,222) moved to table notes; cut "invite reviewer skepticism" and "non-load-bearing"

Codex 3-pass verification cleanup:
- Stale 0.973/0.977/0.979 references unified on the canonical 0.977 (Firm A Beta-2 forced-fit crossing from beta_mixture_results.json)
- dHash ≤ 8 wording corrected to P95-adjacent (P95 = 9; ≤8 is the integer immediately below) instead of the misleading "rounded down"
- Table XII-B prose corrected: "non-Firm-A capture falls faster" now qualified per segment (true on the 0.95→0.977 segment but contracts on the 0.977→0.985 segment); arithmetic now derived from exact counts

Within-year analyses removed:
- Within-year ranking robustness check (Class A) was added in the nice-to-have pass but contradicts the v3.14 A2-removal stance; removed from §IV-G.2 and the Appendix B provenance row
- Within-CPA future-work disclosures (Class B) removed from Discussion limitation #5 and the Conclusion future-work paragraph; subsequent limitations renumbered Sixth → Fifth, Seventh → Sixth

DOCX rendering pipeline overhaul (paper/export_v3.py):

Critical fix: every v3 DOCX since v3.0 was shipping WITHOUT TABLES. strip_comments() was wholesale-deleting HTML comments, but every numerical table is wrapped in <!-- TABLE X: ... -->, so the table body was deleted alongside the wrapper. The exporter now unwraps TABLE comments (emitting a synthetic __TABLE_CAPTION__: marker plus the table body) while still stripping non-TABLE editorial comments. Result: 19 tables now render in the DOCX.

Other rendering fixes:
- LaTeX → Unicode conversion (50+ token replacements: Greek alphabet, ≤ ≥, × · ≈, → ↔ ⇒, etc.); \frac/\sqrt linearisation; TeX brace tricks ({=}, {,})
- Math-context-scoped sub/superscripts via PUA sentinels (\uE000/\uE001): no more underscore-eating in identifiers like signature_analysis
- Display equations rendered via matplotlib mathtext to PNG (3 equations: cosine similarity, mixture crossing, BD/McCrary Z statistic), embedded as numbered equation blocks (1), (2), (3); content-addressed cache at paper/equations/ (gitignored, regenerable)
- Manual numbered/bulleted list rendering with hanging indent (replaces python-docx style="List Number", which silently drops the number prefix when no numbering definition is bound)
- Markdown blockquotes (> ...) defensively stripped
- Pandoc footnote markers ([^name]) no longer leak (inlined at source)
- Heading text cleaned of LaTeX residue and PUA sentinels
- File paths in body text (signature_analysis/X.py, reports/Y.json) trimmed to "(reproduction artifact in Appendix B)" pointers

New leak linter: paper/lint_paper_v3.py - two-pass markdown source + rendered DOCX leak detector; auto-runs at the end of export_v3.py.
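The TABLE-comment unwrapping described above can be sketched roughly as follows. This is an illustrative minimal version only: the real export_v3.py implementation, its exact wrapper format, and the helper name strip_comments are assumptions here.

```python
import re

# Hypothetical sketch, not the shipped export_v3.py code. TABLE comments are
# unwrapped (caption re-emitted as a synthetic __TABLE_CAPTION__: marker,
# table body kept); all other HTML comments are stripped wholesale.
TABLE_COMMENT = re.compile(r"<!--\s*(TABLE[^\n]*)\n(.*?)-->", flags=re.DOTALL)
OTHER_COMMENT = re.compile(r"<!--.*?-->", flags=re.DOTALL)


def strip_comments(md: str) -> str:
    def unwrap(m: re.Match) -> str:
        caption, body = m.group(1).strip(), m.group(2).strip()
        return f"__TABLE_CAPTION__: {caption}\n{body}"

    # Unwrap TABLE comments first, then drop the remaining editorial ones.
    return OTHER_COMMENT.sub("", TABLE_COMMENT.sub(unwrap, md))
```

The key point of the fix is ordering: the table-preserving substitution must run before the wholesale comment strip, otherwise the table body disappears with its wrapper.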
Script changes:
- 21_expanded_validation.py: added 0.9407, 0.977, 0.985 to the canonical FAR threshold list so Table XII-B is reproducible from persisted JSON
- 30_yearly_big4_comparison.py: NEW; generates Fig. 4 plus per-firm yearly data (writes to reports/figures/ and reports/firm_yearly_comparison/)
- 31_within_year_ranking_robustness.py: NEW; supports the within-year robustness check (no longer cited in the paper but kept as a repo-internal due-diligence artifact)

Partner handoff DOCX shipped to ~/Downloads/Paper_A_IEEE_Access_Draft_v3.20.0_20260505.docx (536 KB: 19 tables + 4 figures + 3 equation images).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
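The math-context PUA-sentinel scoping mentioned in the rendering fixes (the mechanism whose leakage the linter's pua-sentinel-leak rule guards against) could look roughly like this. A simplified sketch under assumed names, not the exporter's actual run-splitter:

```python
import re

# Sketch only: wrap $...$ regions in Private Use Area sentinels so that a
# downstream run-splitter applies sub/superscript styling ONLY inside math,
# leaving plain-text identifiers like signature_analysis untouched.
MATH_OPEN, MATH_CLOSE = "\uE000", "\uE001"


def mark_math_regions(text: str) -> str:
    """Replace $...$ delimiters with invisible PUA sentinel characters."""
    return re.sub(r"\$([^$]+)\$",
                  lambda m: MATH_OPEN + m.group(1) + MATH_CLOSE, text)


def split_runs(text: str):
    """Yield (fragment, is_math) pairs; only math fragments get _/^ styling."""
    for i, part in enumerate(re.split(f"[{MATH_OPEN}{MATH_CLOSE}]", text)):
        if part:
            yield part, i % 2 == 1
```

Because the sentinels come from the Unicode Private Use Area, they cannot collide with legitimate document text, which is what makes the linter's "any PUA character in the DOCX is a leak" check sound.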
400 lines
15 KiB
Python
#!/usr/bin/env python3
"""Paper A v3 markdown / DOCX leak linter.

Runs two passes:

Source pass — scans the v3 markdown sources for syntax patterns that the
python-docx export pipeline does NOT render natively. Each finding is a
file:line:severity:message tuple. Severity is ERROR (will leak literal
syntax into Word), WARN (sometimes leaks), or INFO (style nits).

DOCX pass — opens the rendered DOCX and scans every paragraph and table
cell for known leak signatures. This is the authoritative check: even
if the source pass is clean, the DOCX pass tells you what your partner
will actually see. The DOCX pass currently checks for:

- leftover LaTeX commands (`\\cmd`)
- unstripped `$` math delimiters
- pandoc footnote markers (`[^name]`)
- markdown blockquote markers (lines starting with `> `)
- TeX brace tricks (`{=}`, `{,}`)
- PUA sentinels (`\\uE000`, `\\uE001`) leaking from the math-region
  run-splitter
- the synthetic table-caption marker `__TABLE_CAPTION__:` if it ever
  survives processing

Exit code:
  0  clean
  1  WARN-level findings only (ship-able after review)
  2  ERROR-level findings (do NOT ship)

Usage:
  python3 paper/lint_paper_v3.py            # both passes
  python3 paper/lint_paper_v3.py --source   # source-side only
  python3 paper/lint_paper_v3.py --docx     # DOCX-side only

Designed to be run after `python3 export_v3.py` and before copying the
DOCX to ~/Downloads.
"""

from __future__ import annotations

import argparse
import re
import sys
from dataclasses import dataclass
from pathlib import Path

PAPER_DIR = Path(__file__).resolve().parent
DOCX_PATH = PAPER_DIR / "Paper_A_IEEE_Access_Draft_v3.docx"

V3_SOURCES = [
    "paper_a_abstract_v3.md",
    "paper_a_introduction_v3.md",
    "paper_a_related_work_v3.md",
    "paper_a_methodology_v3.md",
    "paper_a_results_v3.md",
    "paper_a_discussion_v3.md",
    "paper_a_conclusion_v3.md",
    "paper_a_appendix_v3.md",
    "paper_a_declarations_v3.md",
    "paper_a_references_v3.md",
]


# ---------------------------------------------------------------------------
# Finding model + ANSI colour helpers
# ---------------------------------------------------------------------------

SEVERITY_RANK = {"ERROR": 2, "WARN": 1, "INFO": 0}
COLOR = {
    "ERROR": "\033[31m",  # red
    "WARN": "\033[33m",   # yellow
    "INFO": "\033[36m",   # cyan
    "RESET": "\033[0m",
    "BOLD": "\033[1m",
}


@dataclass
class Finding:
    severity: str
    rule: str
    location: str  # "file:line" or "DOCX:para 42" / "DOCX:table 6 row 3 col 2"
    message: str
    snippet: str = ""

    def render(self, use_color: bool = True) -> str:
        col = COLOR[self.severity] if use_color else ""
        rst = COLOR["RESET"] if use_color else ""
        bold = COLOR["BOLD"] if use_color else ""
        head = f"{col}[{self.severity}]{rst} {bold}{self.rule}{rst} @ {self.location}"
        body = f"\n  {self.message}"
        snip = f"\n  > {self.snippet}" if self.snippet else ""
        return head + body + snip


# ---------------------------------------------------------------------------
# Source-side rules
# ---------------------------------------------------------------------------

# Each rule: (pattern, severity, rule_id, message, predicate)
# predicate(match, line, in_comment, in_table) → bool: returns True to keep
# the finding (lets us suppress matches that are inside HTML comments or
# markdown table rows).


def _outside_table_comment(match: re.Match, line: str, in_comment: bool, in_table: bool) -> bool:
    """Suppress findings inside HTML comments (where they're allowed) or
    inside markdown table rows (where they survive intact via add_md_table)."""
    return not in_comment and not in_table


def _always(match: re.Match, line: str, in_comment: bool, in_table: bool) -> bool:
    return True


SOURCE_RULES = [
    # Pandoc footnote markers — leak as raw text in the DOCX.
    (re.compile(r"\[\^[A-Za-z0-9_-]+\]"),
     "ERROR", "pandoc-footnote",
     "Pandoc-style footnote `[^name]` does not render in DOCX. "
     "Inline the explanation as a parenthetical instead.",
     _outside_table_comment),

    # Markdown blockquote `> body` lines — the exporter strips them
    # defensively now, but flag for awareness so authors don't rely on
    # them rendering.
    (re.compile(r"^>\s"),
     "WARN", "blockquote",
     "Markdown blockquote `> ...` is stripped to a plain paragraph in DOCX "
     "(no quote-block formatting). If you intended a callout, use a bold "
     "lead-in instead.",
     _always),

    # Display-math fences `$$...$$` (only when the line itself starts with
    # `$$`) — the exporter does a best-effort linearisation, but the result
    # is ugly. Inline the equation as plain prose where possible.
    (re.compile(r"^\$\$.+?\$\$\s*$|^\$\$\s*$"),
     "WARN", "display-math",
     "Display math `$$...$$` renders as a best-effort plain-text "
     "linearisation in DOCX (no MathType/equation rendering). Consider "
     "replacing it with a numbered equation image or inline prose.",
     _always),

    # Inline math containing `\frac{...{...}...}` — nested braces in a
    # frac argument are not handled by the exporter's regex.
    (re.compile(r"\\t?frac\{[^{}]*\{[^{}]*\}[^{}]*\}\{|\\t?frac\{[^{}]+\}\{[^{}]*\{"),
     "WARN", "nested-frac",
     "Nested-brace `\\frac{...}{...}` may not linearise cleanly. Verify "
     "the rendered DOCX paragraph or rewrite the math inline.",
     _outside_table_comment),

    # Setext-style headers (=== / ---) under a line of text — not handled.
    (re.compile(r"^=+\s*$|^-{3,}\s*$"),
     "INFO", "setext-header",
     "Setext-style header (=== / ---) is not handled by the exporter; "
     "use ATX (#, ##, ###) instead.",
     _always),

    # Pandoc fenced div `:::` — not handled.
    (re.compile(r"^:::"),
     "ERROR", "pandoc-fenced-div",
     "Pandoc fenced div `:::` is not handled by the exporter and would "
     "leak into the DOCX as plain text.",
     _always),

    # Pandoc bracketed-attribute spans `[text]{.class}` — not handled.
    (re.compile(r"\]\{[^}]*\}"),
     "WARN", "pandoc-attribute-span",
     "Pandoc attribute span `[text]{.class}` is not parsed by the exporter "
     "and the brace block will leak.",
     _outside_table_comment),

    # File paths in body text — Appendix B is the canonical home for
    # script→artifact references.
    (re.compile(r"`signature_analysis/\d+_[a-z_]+\.py`"),
     "INFO", "script-path-in-body",
     "Verbose script path in body text. Consider replacing it with "
     "'(reproduction artifact in Appendix B)' for body-prose tightness.",
     _outside_table_comment),

    # `reports/...json` paths in body text — same rationale.
    (re.compile(r"`reports/[a-z_]+/[a-z_]+\.(?:json|md)`"),
     "INFO", "report-path-in-body",
     "Verbose report-artifact path in body text. Consider replacing it with "
     "'(see Appendix B provenance map)'.",
     _outside_table_comment),

    # Bare HTML comments that are NOT TABLE/FIGURE markers may indicate
    # editorial residue. Stripped wholesale by the exporter, so harmless,
    # but worth visibility.
    (re.compile(r"^<!--\s*$|^<!-- (?!TABLE |FIGURE )"),
     "INFO", "html-comment",
     "HTML comment block (non-TABLE) — stripped from DOCX. Keep for "
     "editorial notes or remove for tidiness.",
     _always),
]


def lint_sources() -> list[Finding]:
    findings: list[Finding] = []
    for src in V3_SOURCES:
        path = PAPER_DIR / src
        if not path.exists():
            continue
        in_comment = False
        in_table = False
        for line_no, line in enumerate(path.read_text(encoding="utf-8").splitlines(), 1):
            # Track HTML-comment context (multi-line aware).
            if "<!--" in line:
                in_comment = True
            stripped = line.strip()
            in_table = stripped.startswith("|") and stripped.endswith("|")
            for pat, sev, rule, msg, predicate in SOURCE_RULES:
                for m in pat.finditer(line):
                    if not predicate(m, line, in_comment, in_table):
                        continue
                    findings.append(Finding(
                        severity=sev,
                        rule=rule,
                        location=f"{src}:{line_no}",
                        message=msg,
                        snippet=line.rstrip()[:120],
                    ))
            if "-->" in line:
                in_comment = False
    return findings


# ---------------------------------------------------------------------------
# DOCX-side rules
# ---------------------------------------------------------------------------

DOCX_LEAK_PATTERNS = [
    # (pattern, severity, rule_id, message)
    (re.compile(r"\\[a-zA-Z]+(?:\{[^{}]*\})?"),
     "ERROR", "leftover-latex-cmd",
     "LaTeX command `\\cmd` leaked into DOCX. Either add a token rule to "
     "`latex_to_unicode` in `export_v3.py` or rewrite the source as plain text."),

    (re.compile(r"(?<!\\)\$[^$\s][^$]*\$"),
     "ERROR", "unstripped-dollar-math",
     "Inline math `$...$` was not stripped. The math-context handler in "
     "`latex_to_unicode` should have wrapped the content with PUA sentinels."),

    (re.compile(r"\[\^[A-Za-z0-9_-]+\]"),
     "ERROR", "pandoc-footnote-leak",
     "Pandoc footnote marker leaked into DOCX. Inline the footnote body "
     "as a parenthetical at the source."),

    (re.compile(r"^>\s"),
     "ERROR", "blockquote-leak",
     "Markdown blockquote `> ...` leaked a literal `>` into DOCX. The "
     "exporter pre-pass should strip these — check `process_section`."),

    (re.compile(r"\{[,=<>+\-]\}"),
     "ERROR", "tex-brace-trick",
     "TeX brace-trick `{=}` / `{,}` leaked. Should be stripped by "
     "`latex_to_unicode`."),

    (re.compile("[\uE000\uE001]"),
     "ERROR", "pua-sentinel-leak",
     "Math-region PUA sentinel (\\uE000 / \\uE001) leaked. A render path "
     "is bypassing `add_text_with_subsup`; check headings / list items / "
     "title-page paragraphs."),

    (re.compile(r"__TABLE_CAPTION__"),
     "ERROR", "table-caption-marker-leak",
     "Synthetic `__TABLE_CAPTION__:` marker leaked. The marker is meant "
     "to be consumed by `process_section` and rendered as a centered "
     "bold caption paragraph."),

    (re.compile(r"signatureanalysis/\d+[a-z]+\.py"),
     "ERROR", "underscore-eaten-path",
     "Underscores eaten from a script path (e.g., "
     "`signatureanalysis/28byteidentitydecomposition.py`). The "
     "math-context-scoped subscript handler in `add_text_with_subsup` "
     "should leave underscores intact in plain text."),

    (re.compile(r"\b(\w+_\w+)+\b", flags=re.UNICODE),
     "INFO", "underscore-identifier",
     "Underscored identifier in body text (e.g., a code symbol or path). "
     "Verify it renders with underscores intact, not as subscripts."),
]


def lint_docx(docx_path: Path = DOCX_PATH) -> list[Finding]:
    try:
        from docx import Document
    except ImportError:
        return [Finding("ERROR", "missing-dep",
                        "lint:docx",
                        "python-docx is not installed; cannot run DOCX pass.")]

    if not docx_path.exists():
        return [Finding("ERROR", "missing-docx",
                        str(docx_path),
                        "Built DOCX not found. Run `python3 export_v3.py` first.")]

    doc = Document(str(docx_path))
    findings: list[Finding] = []
    seen_signatures = set()  # dedupe identical leaks across paragraphs

    def scan(text: str, location: str):
        for pat, sev, rule, msg in DOCX_LEAK_PATTERNS:
            for m in pat.finditer(text):
                # Skip the INFO-level identifier rule unless it looks like
                # obvious math residue (e.g., dHash_indep or N_a).
                if rule == "underscore-identifier":
                    sample = m.group(0)
                    # Only complain about identifiers that look like math
                    # residue: short, underscore-separated alphanumeric tokens.
                    parts = sample.split("_")
                    if not all(len(p) <= 4 for p in parts):
                        continue
                    if not all(p.isalnum() and not p.isdigit() for p in parts):
                        continue
                key = (rule, m.group(0))
                if key in seen_signatures:
                    continue
                seen_signatures.add(key)
                findings.append(Finding(
                    severity=sev,
                    rule=rule,
                    location=location,
                    message=msg,
                    snippet=text[max(0, m.start() - 30):m.end() + 30].replace("\n", " ")[:140],
                ))

    for i, p in enumerate(doc.paragraphs):
        if p.text:
            scan(p.text, f"DOCX:para {i}")
    for ti, t in enumerate(doc.tables):
        for ri, row in enumerate(t.rows):
            for ci, cell in enumerate(row.cells):
                if cell.text:
                    scan(cell.text, f"DOCX:table {ti + 1} row {ri} col {ci}")

    return findings


# ---------------------------------------------------------------------------
# Reporter
# ---------------------------------------------------------------------------

def summarise(findings: list[Finding], use_color: bool = True) -> int:
    def c(key: str) -> str:
        return COLOR[key] if use_color else ""

    if not findings:
        print(f"{c('BOLD')}{c('INFO')}clean — no leaks detected{c('RESET')}")
        return 0

    counts = {"ERROR": 0, "WARN": 0, "INFO": 0}
    findings.sort(key=lambda f: (-SEVERITY_RANK[f.severity], f.location))
    for f in findings:
        counts[f.severity] += 1
        print(f.render(use_color))
    print()
    print(f"{c('BOLD')}summary{c('RESET')}: "
          f"{c('ERROR')}{counts['ERROR']} ERROR{c('RESET')} "
          f"{c('WARN')}{counts['WARN']} WARN{c('RESET')} "
          f"{c('INFO')}{counts['INFO']} INFO{c('RESET')}")
    if counts["ERROR"]:
        return 2
    if counts["WARN"]:
        return 1
    return 0
def main():
    ap = argparse.ArgumentParser(
        description="Lint Paper A v3 markdown sources and rendered DOCX for "
                    "syntax-leak issues.",
    )
    ap.add_argument("--source", action="store_true",
                    help="run only the markdown source pass")
    ap.add_argument("--docx", action="store_true",
                    help="run only the rendered DOCX pass")
    ap.add_argument("--no-color", action="store_true",
                    help="disable ANSI colour output")
    args = ap.parse_args()

    use_color = sys.stdout.isatty() and not args.no_color
    findings: list[Finding] = []
    if args.source or not (args.source or args.docx):
        print(f"{COLOR['BOLD'] if use_color else ''}--- source pass "
              f"({len(V3_SOURCES)} files) ---{COLOR['RESET'] if use_color else ''}")
        findings.extend(lint_sources())
    if args.docx or not (args.source or args.docx):
        print(f"{COLOR['BOLD'] if use_color else ''}\n--- docx pass "
              f"({DOCX_PATH.name}) ---{COLOR['RESET'] if use_color else ''}")
        findings.extend(lint_docx())

    print()
    sys.exit(summarise(findings, use_color))


if __name__ == "__main__":
    main()