"""Imaging-pipeline audit (Table V) + byte-identity era split (Section V-B).

Classifies a stratified sample of report PDFs (up to 20 distinct PDFs per
firm-year) as scanned / OCR'd / digital-native using an extractable-text
heuristic, and tabulates the result by year.  Embedded PDF metadata is not
consulted by this script.

Also reports the scan-era vs digital-era split of the 262 byte-identical
signatures, broken down by anonymised firm letter.

Requires: PyMuPDF (fitz); signature_analysis.db; original PDFs under
total-pdf/.
"""
import glob
import os
import sqlite3
import sys
from collections import defaultdict

import fitz

fitz.TOOLS.mupdf_display_errors(False)

DB = "/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db"
PDF_ROOT = "/Volumes/NV2/PDF-Processing/total-pdf"
BIG4 = ('勤業眾信聯合', '資誠聯合', '安侯建業聯合', '安永聯合')
FMAP = {'勤業眾信聯合': 'A', '安侯建業聯合': 'B', '資誠聯合': 'C', '安永聯合': 'D'}

# Sampling and classification parameters.
PDFS_PER_STRATUM = 20      # distinct PDFs drawn per (firm, year)
PAGES_SAMPLED = 4          # leading pages whose text length is measured
DIGITAL_MIN_CHARS = 2000   # strictly more than this -> 'DIGITAL'
OCR_MIN_CHARS = 200        # strictly more than this -> 'OCR', else 'SCAN'
ERA_CUTOFF = '202101'      # year_month strictly below this is 'scan-era'

# One '?' per firm so the IN (...) list always matches BIG4.
_FIRM_PLACEHOLDERS = ','.join('?' * len(BIG4))


def classify(path):
    """Label a PDF by how much text its first pages yield.

    Sums the stripped text length of the first ``PAGES_SAMPLED`` pages.

    Args:
        path: filesystem path of the PDF.

    Returns:
        'DIGITAL' if the total exceeds ``DIGITAL_MIN_CHARS``, 'OCR' if it
        exceeds ``OCR_MIN_CHARS``, otherwise 'SCAN'.  Returns None when the
        file cannot be opened or its text cannot be extracted.
    """
    try:
        # The context manager closes the document even if extraction fails.
        with fitz.open(path) as doc:
            text = sum(
                len(doc[i].get_text().strip())
                for i in range(min(len(doc), PAGES_SAMPLED))
            )
    except Exception:
        # Deliberate best-effort: one damaged PDF must not abort the audit.
        # The caller counts None results and reports them.
        return None
    if text > DIGITAL_MIN_CHARS:
        return 'DIGITAL'
    return 'OCR' if text > OCR_MIN_CHARS else 'SCAN'


con = sqlite3.connect(DB)
try:
    cur = con.cursor()

    # --- stratified sample: 20 distinct PDFs per firm-year ---
    # De-duplicate to one row per (firm, year, PDF) BEFORE ranking.  Window
    # functions are evaluated before DISTINCT, so ranking the raw signature
    # rows would number every signature and a PDF carrying several
    # signatures would occupy several of the 20 slots.
    cur.execute(f"""
        WITH pdfs AS (
            SELECT DISTINCT excel_firm,
                            substr(year_month,1,4) AS yr,
                            source_pdf
            FROM signatures
            WHERE excel_firm IN ({_FIRM_PLACEHOLDERS})
              AND source_pdf IS NOT NULL
        ),
        ranked AS (
            SELECT excel_firm, yr, source_pdf,
                   ROW_NUMBER() OVER (PARTITION BY excel_firm, yr
                                      ORDER BY source_pdf) AS rn
            FROM pdfs
        )
        SELECT excel_firm, yr, source_pdf
        FROM ranked
        WHERE rn <= ?
        ORDER BY yr""", (*BIG4, PDFS_PER_STRATUM))
    rows = cur.fetchall()

    # Map bare file names (as stored in source_pdf) to on-disk paths.
    idx = {os.path.basename(p): p for p in glob.glob(PDF_ROOT + '/*/*.pdf')}

    byyear = defaultdict(lambda: defaultdict(int))
    missing = unreadable = 0
    for _firm, yr, fn in rows:
        p = idx.get(fn)
        if not p:
            missing += 1
            continue
        k = classify(p)
        if k is None:
            unreadable += 1
            continue
        byyear[yr][k] += 1

    print("year | n | scan% | ocr% | digital%")
    for yr in sorted(byyear):
        d = byyear[yr]
        n = sum(d.values())  # >= 1: a year only exists once a PDF is counted
        print(f"{yr} | {n} | {100*d['SCAN']//n} | {100*d['OCR']//n} | {100*d['DIGITAL']//n}")

    if missing or unreadable:
        # Diagnostics go to stderr so the table on stdout keeps its format.
        print(f"skipped {missing} sampled PDFs not found under {PDF_ROOT} "
              f"and {unreadable} unreadable", file=sys.stderr)

    # --- byte-identity era split ---
    cur.execute(f"""
        SELECT CASE WHEN year_month < ? THEN 'scan-era'
                    ELSE 'digital-era' END era,
               excel_firm,
               COUNT(*) n
        FROM signatures
        WHERE is_valid=1 AND pixel_identical_to_closest=1
          AND excel_firm IN ({_FIRM_PLACEHOLDERS})
        GROUP BY era, excel_firm""", (ERA_CUTOFF, *BIG4))
    # Anonymise through FMAP (single source of truth for the letters) and
    # order by era, then firm letter.
    era_rows = sorted((era, FMAP[firm], n) for era, firm, n in cur.fetchall())

    print("\nbyte-identical by era x firm:")
    for era, firm, n in era_rows:
        print(f" {era} | {firm} | {n}")
finally:
    con.close()