#!/usr/bin/env python3
"""
Script 23: Intra-Report Consistency Check (per Partner v4 Section F.4)
======================================================================
Taiwanese statutory audit reports are co-signed by two engagement partners
(primary + secondary). Under firm-wide stamping practice, both signatures
on the same report should be classified as non-hand-signed.

This script:
  1. Identifies reports with exactly 2 signatures in the DB (counting only
     signatures that have a same-accountant similarity score).
  2. Classifies each signature using the dual-descriptor thresholds of the
     paper (cosine > 0.95 AND dHash_indep <= 5 = high-confidence
     replication).
  3. Reports intra-report agreement per firm.
  4. Flags disagreement cases for sensitivity analysis.

Output:
  reports/intra_report/intra_report_report.md
  reports/intra_report/intra_report_results.json
  reports/intra_report/intra_report_disagreements.csv
"""

import csv
import json
import sqlite3
from collections import defaultdict
from contextlib import closing
from datetime import datetime
from pathlib import Path

import numpy as np  # noqa: F401  (unused here; kept from the original script)

DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/'
           'intra_report')

BIG4 = ['勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合']

# Classification thresholds used by classify_signature().
COS_REPLICATION_MIN = 0.95   # cos strictly above this -> inspect dHash
COS_HAND_SIGNED_MAX = 0.837  # cos at or below this -> likely hand-signed
DHASH_HIGH_CONF_MAX = 5      # dHash at or below this -> high confidence
DHASH_MODERATE_MAX = 15      # dHash at or below this -> moderate

# Firm name as stored in the DB -> label used in the reports.
_FIRM_LABELS = {
    '勤業眾信聯合': 'Deloitte (Firm A)',
    '安侯建業聯合': 'KPMG',
    '資誠聯合': 'PwC',
    '安永聯合': 'EY',
}

# Fine-grained label -> coarse bucket; anything not listed is 'uncertain'.
_COARSE_BUCKETS = {
    'high_conf_non_hand_signed': 'non_hand_signed',
    'moderate_non_hand_signed': 'non_hand_signed',
    'likely_hand_signed': 'hand_signed',
    'style_consistency': 'style_consistency',
}

# Per-firm counters for reports whose two signatures share a coarse bucket.
_AGREEMENT_KEYS = (
    'both_non_hand_signed',
    'both_hand_signed',
    'both_style_consistency',
    'both_uncertain',
)


def classify_signature(cos, dhash_indep) -> str:
    """Classify one signature from its cosine similarity and dHash distance.

    Args:
        cos: Cosine similarity score, or None when missing.
        dhash_indep: dHash distance, or None when missing.

    Returns:
        One of 'high_conf_non_hand_signed', 'moderate_non_hand_signed',
        'style_consistency', 'uncertain', 'likely_hand_signed', or
        'unknown'. 'unknown' is returned when ``cos`` is None, when ``cos``
        exceeds 0.95 but ``dhash_indep`` is None, or when no comparison
        matches (e.g. a NaN input).
    """
    if cos is None:
        return 'unknown'
    if cos > COS_REPLICATION_MIN:
        if dhash_indep is None:
            return 'unknown'
        if dhash_indep <= DHASH_HIGH_CONF_MAX:
            return 'high_conf_non_hand_signed'
        if dhash_indep <= DHASH_MODERATE_MAX:
            return 'moderate_non_hand_signed'
        # Explicit comparison (not a bare fall-through) so that a NaN
        # distance ends up as 'unknown' rather than 'style_consistency'.
        if dhash_indep > DHASH_MODERATE_MAX:
            return 'style_consistency'
        return 'unknown'
    if cos > COS_HAND_SIGNED_MAX:
        return 'uncertain'
    if cos <= COS_HAND_SIGNED_MAX:
        return 'likely_hand_signed'
    # Only reachable when every comparison above is False (NaN cos).
    return 'unknown'


def binary_bucket(label) -> str:
    """Collapse a fine-grained label into a coarse agreement bucket.

    Returns 'non_hand_signed' (high-confidence and moderate combined),
    'hand_signed', 'style_consistency', or 'uncertain' for every other
    label, including 'unknown'.
    """
    return _COARSE_BUCKETS.get(label, 'uncertain')


def firm_bucket(firm) -> str:
    """Map a DB firm name to its report label.

    The four names listed in BIG4 map to their English labels; any other
    value (including None) maps to 'Other / Non-Big-4'.
    """
    return _FIRM_LABELS.get(firm, 'Other / Non-Big-4')


def load_two_signer_reports() -> list:
    """Load signature rows belonging to reports with exactly two signatures.

    Only signatures with a non-NULL ``max_similarity_to_same_accountant``
    are counted and returned, so a report qualifies when exactly two of its
    signatures have that score.

    Returns:
        A list of tuples ``(source_pdf, signature_id, assigned_accountant,
        firm, max_similarity_to_same_accountant, min_dhash_independent,
        sig_index, year_month)`` ordered by ``source_pdf`` then
        ``sig_index``. ``firm`` is NULL/None when the accountant has no
        match in ``accountants`` (LEFT JOIN).

    Raises:
        sqlite3.Error: If the database cannot be opened or queried.
    """
    # sqlite3's own context manager only manages transactions; closing()
    # guarantees the connection is released even if the query raises.
    with closing(sqlite3.connect(DB)) as conn:
        cur = conn.cursor()
        # Select reports that have exactly 2 signatures with complete data
        cur.execute('''
            WITH report_counts AS (
                SELECT source_pdf, COUNT(*) AS n_sigs
                FROM signatures
                WHERE max_similarity_to_same_accountant IS NOT NULL
                GROUP BY source_pdf
            )
            SELECT s.source_pdf, s.signature_id, s.assigned_accountant,
                   a.firm, s.max_similarity_to_same_accountant,
                   s.min_dhash_independent, s.sig_index, s.year_month
            FROM signatures s
            LEFT JOIN accountants a ON s.assigned_accountant = a.name
            JOIN report_counts rc ON rc.source_pdf = s.source_pdf
            WHERE rc.n_sigs = 2
              AND s.max_similarity_to_same_accountant IS NOT NULL
            ORDER BY s.source_pdf, s.sig_index
        ''')
        return cur.fetchall()


def _group_reports(rows) -> list:
    """Group signature rows by PDF and keep reports with exactly two rows."""
    by_pdf = defaultdict(list)
    for r in rows:
        by_pdf[r[0]].append({
            'sig_id': r[1],
            'accountant': r[2],
            'firm': r[3] or '(unknown)',
            'cos': r[4],
            'dhash': r[5],
            'sig_index': r[6],
            'year_month': r[7],
        })
    # Re-check the pair size: the join can yield a different number of rows
    # per PDF than the SQL count (e.g. several accountants rows per name).
    return [{'pdf': pdf, 'sigs': sigs}
            for pdf, sigs in by_pdf.items() if len(sigs) == 2]


def _new_firm_tally() -> dict:
    """Return a zeroed per-firm counter record."""
    tally = {'total': 0}
    for key in _AGREEMENT_KEYS:
        tally[key] = 0
    tally['mixed'] = 0
    tally['mixed_details'] = defaultdict(int)
    return tally


def _agreement_count(tally) -> int:
    """Return how many of a firm's reports have both signers in one bucket."""
    return sum(tally[key] for key in _AGREEMENT_KEYS)


def _tally_agreement(reports):
    """Classify both signatures of each report and tally agreement per firm.

    Returns:
        ``(by_firm, disagreements)``: ``by_firm`` maps a firm label to its
        counter record, and ``disagreements`` lists one record per report
        whose two signatures fall in different coarse buckets.
    """
    by_firm = defaultdict(_new_firm_tally)
    disagreements = []
    for rep in reports:
        s1, s2 = rep['sigs']
        l1 = classify_signature(s1['cos'], s1['dhash'])
        l2 = classify_signature(s2['cos'], s2['dhash'])
        b1, b2 = binary_bucket(l1), binary_bucket(l2)

        # Determine report-level firm (usually both signers from same firm).
        # For a cross-firm report the key keeps the signers' sig_index order.
        firm1 = firm_bucket(s1['firm'])
        firm2 = firm_bucket(s2['firm'])
        firm = firm1 if firm1 == firm2 else f'{firm1}+{firm2}'

        tally = by_firm[firm]
        tally['total'] += 1
        if b1 == b2:
            # binary_bucket() only returns the four names that have a
            # matching 'both_<bucket>' counter.
            tally['both_' + b1] += 1
            continue

        tally['mixed'] += 1
        combo = tuple(sorted([b1, b2]))
        tally['mixed_details'][str(combo)] += 1
        disagreements.append({
            'pdf': rep['pdf'],
            'firm': firm,
            'sig1': {'accountant': s1['accountant'], 'cos': s1['cos'],
                     'dhash': s1['dhash'], 'label': l1},
            'sig2': {'accountant': s2['accountant'], 'cos': s2['cos'],
                     'dhash': s2['dhash'], 'label': l2},
            'year_month': s1['year_month'],
        })
    return by_firm, disagreements


def _write_disagreements_csv(disagreements, csv_path) -> None:
    """Write disagreement records to ``csv_path`` as UTF-8 CSV.

    csv.writer quotes any field containing a comma, quote or newline, so
    PDF paths and accountant names cannot break the column layout. A
    missing dHash (None) is written as an empty field.
    """
    # newline='' plus an explicit lineterminator keeps '\n' line endings.
    with open(csv_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(['pdf', 'firm', 'year_month',
                         'acc1', 'cos1', 'dhash1', 'label1',
                         'acc2', 'cos2', 'dhash2', 'label2'])
        for d in disagreements:
            row = [d['pdf'], d['firm'], d['year_month']]
            for sig in (d['sig1'], d['sig2']):
                row += [sig['accountant'], f"{sig['cos']:.4f}",
                        sig['dhash'], sig['label']]
            writer.writerow(row)


def _build_summary(by_firm, n_reports, n_disagreements) -> dict:
    """Build the JSON-serialisable summary of the per-firm tallies."""
    summary = {
        'generated_at': datetime.now().isoformat(),
        'total_reports': n_reports,
        'total_disagreements': n_disagreements,
        'by_firm': {},
    }
    for firm, d in by_firm.items():
        agree = _agreement_count(d)
        summary['by_firm'][firm] = {
            'total': d['total'],
            'both_non_hand_signed': d['both_non_hand_signed'],
            'both_hand_signed': d['both_hand_signed'],
            'both_style_consistency': d['both_style_consistency'],
            'both_uncertain': d['both_uncertain'],
            'mixed': d['mixed'],
            'agreement_rate': float(agree / d['total']) if d['total'] else 0,
            'mixed_details': dict(d['mixed_details']),
        }
    return summary


def _build_markdown(summary, n_reports) -> str:
    """Render the Markdown report (method, per-firm table, interpretation)."""
    md = [
        '# Intra-Report Consistency Report',
        f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        '',
        '## Method',
        '',
        '* 2-signer reports (primary + secondary engagement partner).',
        '* Each signature classified using the dual-descriptor rules of the',
        ' paper (cos > 0.95 AND dHash_indep ≤ 5 = high-confidence replication;',
        ' dHash 6-15 = moderate; > 15 = style consistency; cos ≤ 0.837 = likely',
        ' hand-signed; otherwise uncertain).',
        '* For each report, both signature-level labels are compared.',
        ' A report is "in agreement" if both fall in the same coarse bucket',
        ' (non-hand-signed = high+moderate combined, style_consistency,',
        ' uncertain, or hand-signed); otherwise "mixed".',
        '',
        f'Total 2-signer reports analyzed: **{n_reports:,}**',
        '',
        '## Per-firm agreement',
        '',
        '| Firm | Total | Both non-hand-signed | Both style | Both uncertain | Both hand-signed | Mixed | Agreement rate |',  # noqa: E501
        '|------|-------|----------------------|------------|----------------|------------------|-------|----------------|',  # noqa: E501
    ]
    # Largest firms first.
    for firm, d in sorted(summary['by_firm'].items(),
                          key=lambda x: -x[1]['total']):
        md.append(
            f"| {firm} | {d['total']} | {d['both_non_hand_signed']} | "
            f"{d['both_style_consistency']} | {d['both_uncertain']} | "
            f"{d['both_hand_signed']} | {d['mixed']} | "
            f"**{d['agreement_rate']*100:.2f}%** |"
        )
    md += [
        '',
        '## Interpretation',
        '',
        'Under firmwide stamping practice the two engagement partners on a',
        'given report should both exhibit high-confidence non-hand-signed',
        'classifications. High intra-report agreement at Firm A (Deloitte) is',
        'consistent with uniform firm-level stamping; declining agreement at',
        'the other Big-4 firms reflects the interview evidence that stamping',
        'was applied only to a subset of partners.',
        '',
        'Mixed-classification reports (one signer non-hand-signed, the other',
        'hand-signed or style-consistent) are flagged for sensitivity review.',
        'Absent firmwide homogeneity, one would expect substantial mixed-rate',
        'contamination even at Firm A; the observed Firm A mixed rate is a',
        'direct empirical check on the identification assumption used in the',
        'threshold calibration.',
    ]
    return '\n'.join(md)


def main() -> None:
    """Run the intra-report consistency check and write all three outputs.

    Loads the two-signer reports from DB, prints a per-firm agreement
    summary, and writes the disagreement CSV (first 500 rows), the JSON
    summary and the Markdown report into OUT.
    """
    # Created here rather than at import time so the classification helpers
    # can be imported on machines where the output volume is absent.
    OUT.mkdir(parents=True, exist_ok=True)

    print('=' * 70)
    print('Script 23: Intra-Report Consistency Check')
    print('=' * 70)

    rows = load_two_signer_reports()
    print(f'\nLoaded {len(rows):,} signatures from 2-signer reports')

    # Group by source_pdf
    reports = _group_reports(rows)
    print(f'Total 2-signer reports: {len(reports):,}')

    # Classify each signature and check agreement
    by_firm, disagreements = _tally_agreement(reports)

    # Print summary
    print('\n--- Per-firm agreement ---')
    for firm, d in sorted(by_firm.items(), key=lambda x: -x[1]['total']):
        agree = _agreement_count(d)
        rate = agree / d['total'] if d['total'] else 0
        print(f' {firm}: total={d["total"]:,}, agree={agree} '
              f'({rate*100:.2f}%), mixed={d["mixed"]}')
        print(f' both_non_hand_signed={d["both_non_hand_signed"]}, '
              f'both_uncertain={d["both_uncertain"]}, '
              f'both_style_consistency={d["both_style_consistency"]}, '
              f'both_hand_signed={d["both_hand_signed"]}')

    # Write disagreements CSV (first 500)
    csv_path = OUT / 'intra_report_disagreements.csv'
    _write_disagreements_csv(disagreements[:500], csv_path)
    print(f'\nCSV: {csv_path} (first 500 of {len(disagreements)} '
          f'disagreements)')

    # Convert for JSON
    summary = _build_summary(by_firm, len(reports), len(disagreements))
    json_path = OUT / 'intra_report_results.json'
    # Explicit UTF-8: ensure_ascii=False may emit non-ASCII text, and the
    # other two outputs are written as UTF-8 as well.
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2, ensure_ascii=False)
    print(f'JSON: {OUT / "intra_report_results.json"}')

    # Markdown
    (OUT / 'intra_report_report.md').write_text(
        _build_markdown(summary, len(reports)), encoding='utf-8')
    print(f'Report: {OUT / "intra_report_report.md"}')


if __name__ == '__main__':
    main()