# pdf_signature_extraction/paper/v13_build/scripts/pipeline_audit.py

"""Imaging-pipeline audit (Table V) + byte-identity era split (Section V-B).

Classifies a per-firm-year stratified sample of Big-4 report PDFs as scanned /
OCR'd / digital-native using an extractable-text heuristic (number of text
characters found on the first four pages), and tabulates the class shares by year.
Also reports the scan-era vs digital-era split (cut at 2021-01), broken down by
firm, of the 262 byte-identical signatures.

Requires: PyMuPDF (fitz); signature_analysis.db; original PDFs under total-pdf/.
"""
import fitz, os, glob, sqlite3
from collections import defaultdict

# Turn off MuPDF's own error/warning output for the whole run.
fitz.TOOLS.mupdf_display_errors(False)

# Input locations: the signature database and the root directory of the PDFs.
DB = "/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db"
PDF_ROOT = "/Volumes/NV2/PDF-Processing/total-pdf"

# Firm names as matched against signatures.excel_firm in the queries below.
BIG4 = (
    '勤業眾信聯合',
    '資誠聯合',
    '安侯建業聯合',
    '安永聯合',
)
# Firm name -> single-letter firm code (same mapping as the CASE expression in
# the era-split query).
FMAP = {
    '勤業眾信聯合': 'A',
    '安侯建業聯合': 'B',
    '資誠聯合': 'C',
    '安永聯合': 'D',
}

# One connection/cursor pair is shared by every query in this script.
con = sqlite3.connect(DB)
cur = con.cursor()

# --- stratified sample: 20 distinct PDFs per firm-year ---
# SQL evaluates window functions before DISTINCT, so numbering rows in the same
# SELECT that carries DISTINCT numbers every *signature* row: a PDF with several
# signatures would occupy several of the 20 slots and be returned (and later
# tallied) more than once. De-duplicate to one row per (firm, year, PDF) first,
# then number the de-duplicated rows.
placeholders = ','.join('?' * len(BIG4))
cur.execute(f"""
WITH u AS (SELECT DISTINCT excel_firm, substr(year_month,1,4) yr, source_pdf
  FROM signatures WHERE excel_firm IN ({placeholders}) AND source_pdf IS NOT NULL),
d AS (SELECT excel_firm, yr, source_pdf,
  ROW_NUMBER() OVER (PARTITION BY excel_firm, yr ORDER BY source_pdf) rn
  FROM u)
SELECT excel_firm, yr, source_pdf FROM d WHERE rn<=20 ORDER BY yr""", BIG4)
rows = cur.fetchall()
# Basename -> full path for every PDF exactly one directory level below
# PDF_ROOT; sampled rows are resolved to files through this index.
idx = {os.path.basename(p): p for p in glob.glob(PDF_ROOT + '/*/*.pdf')}

def classify(path, max_pages=4, ocr_min=200, digital_min=2000):
    """Label a PDF by how much extractable text its leading pages contain.

    The whitespace-stripped text length of each of the first ``max_pages``
    pages is summed and compared against two thresholds.

    Args:
        path: Filesystem path of the PDF to inspect.
        max_pages: Number of leading pages whose text is measured.
        ocr_min: A total strictly above this many characters is at least 'OCR'.
        digital_min: A total strictly above this many characters is 'DIGITAL'.

    Returns:
        'DIGITAL', 'OCR' or 'SCAN', or None when the file cannot be opened.

    Errors raised while extracting page text are not caught here; the
    document is still closed before they propagate.
    """
    try:
        doc = fitz.open(path)
    except Exception:
        # Best-effort: an unopenable file is reported as unclassifiable.
        return None
    try:
        page_count = min(len(doc), max_pages)
        text = sum(len(doc[i].get_text().strip()) for i in range(page_count))
    finally:
        # Release the document handle even if text extraction fails.
        doc.close()
    if text > digital_min:
        return 'DIGITAL'
    if text > ocr_min:
        return 'OCR'
    return 'SCAN'

# year -> {label -> count}. Sampled rows whose PDF is not in the index, or
# whose PDF could not be classified, contribute nothing.
byyear = defaultdict(lambda: defaultdict(int))
for _firm, year, pdf_name in rows:
    pdf_path = idx.get(pdf_name)
    label = classify(pdf_path) if pdf_path else None
    if label:
        byyear[year][label] += 1

print("year | n | scan% | ocr% | digital%")
for year in sorted(byyear):
    counts = byyear[year]
    total = sum(counts.values())
    # Integer (floored) percentages of the classified PDFs for this year.
    scan_pct, ocr_pct, digital_pct = (
        100 * counts[label] // total for label in ('SCAN', 'OCR', 'DIGITAL')
    )
    print(f"{year} | {total} | {scan_pct} | {ocr_pct} | {digital_pct}")

# --- byte-identity era split ---
# Counts valid signatures flagged pixel_identical_to_closest, grouped by era
# (year_month before '202101' is 'scan-era', everything else 'digital-era')
# and by single-letter firm code.
era_query = f"""
SELECT CASE WHEN year_month<'202101' THEN 'scan-era' ELSE 'digital-era' END era,
  CASE excel_firm WHEN '勤業眾信聯合' THEN 'A' WHEN '安侯建業聯合' THEN 'B'
                  WHEN '資誠聯合' THEN 'C' WHEN '安永聯合' THEN 'D' END firm,
  COUNT(*) n
FROM signatures WHERE is_valid=1 AND pixel_identical_to_closest=1
  AND excel_firm IN ({','.join(['?']*4)})
GROUP BY era, firm ORDER BY era, firm"""
cur.execute(era_query, BIG4)
era_rows = cur.fetchall()

print("\nbyte-identical by era x firm:")
for record in era_rows:
    era, firm, n = record
    print(f"  {era} | {firm} | {n}")

# Last query of the script: release the database connection.
con.close()