# pdf_signature_extraction/signature_analysis/23_intra_report_consistency.py

#!/usr/bin/env python3
"""
Script 23: Intra-Report Consistency Check (per Partner v4 Section F.4)
======================================================================
Taiwanese statutory audit reports are co-signed by two engagement partners
(primary + secondary).  Under firm-wide stamping practice, both signatures
on the same report should be classified as non-hand-signed.

This script:
  1. Identifies reports with exactly 2 signatures in the DB.
  2. Classifies each signature using the dual-descriptor thresholds of the
     paper (cosine > 0.95 AND dHash_indep <= 5 = high-confidence replication).
  3. Reports intra-report agreement per firm.
  4. Flags disagreement cases for sensitivity analysis.

Output:
  reports/intra_report/intra_report_report.md
  reports/intra_report/intra_report_results.json
  reports/intra_report/intra_report_disagreements.csv
"""

import csv
import json
import sqlite3
from collections import defaultdict
from datetime import datetime
from pathlib import Path

import numpy as np

# Absolute path of the SQLite database opened by load_two_signer_reports().
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
# Directory that receives the Markdown / JSON / CSV outputs written by main().
OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/'
           'intra_report')
# NOTE: executed at import time, so importing this module creates the
# directory (and any missing parents) as a side effect.
OUT.mkdir(parents=True, exist_ok=True)

# Firm names as stored in the DB; the same four strings that firm_bucket()
# maps to display labels.  Not referenced elsewhere in the visible code.
BIG4 = ['勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合']


def classify_signature(cos, dhash_indep, *, cos_high=0.95, cos_low=0.837,
                       dhash_high_conf=5, dhash_moderate=15):
    """Assign a fine-grained label to one signature from its two descriptors.

    Args:
        cos: Cosine similarity score of the signature, or None if missing.
        dhash_indep: Independent dHash distance, or None if missing.
        cos_high: Cosine value that must be exceeded for the replication
            labels to apply.
        cos_low: Cosine value at or below which the signature is labelled
            'likely_hand_signed'.
        dhash_high_conf: Largest dHash distance still labelled
            'high_conf_non_hand_signed'.
        dhash_moderate: Largest dHash distance still labelled
            'moderate_non_hand_signed'; larger distances give
            'style_consistency'.

    Returns:
        One of 'high_conf_non_hand_signed', 'moderate_non_hand_signed',
        'style_consistency', 'uncertain', 'likely_hand_signed' or 'unknown'.
        'unknown' is returned when ``cos`` is None, when ``cos`` exceeds
        ``cos_high`` but ``dhash_indep`` is None, and when a value does not
        satisfy any comparison (e.g. NaN).

    The defaults reproduce the fixed thresholds this script has always used;
    the keyword-only parameters exist so sensitivity runs can vary them
    without editing the function.
    """
    if cos is None:
        return 'unknown'

    if cos > cos_high:
        # Above the cosine cutoff the dHash distance decides the label; with
        # no distance available the signature cannot be placed.
        if dhash_indep is None:
            return 'unknown'
        if dhash_indep <= dhash_high_conf:
            return 'high_conf_non_hand_signed'
        if dhash_indep <= dhash_moderate:
            return 'moderate_non_hand_signed'
        if dhash_indep > dhash_moderate:
            return 'style_consistency'
        # Only reachable when every comparison is False (NaN distance).
        return 'unknown'

    if cos > cos_low:
        return 'uncertain'
    if cos <= cos_low:
        return 'likely_hand_signed'
    # Only reachable when every comparison is False (NaN similarity).
    return 'unknown'


def binary_bucket(label):
    """Map a fine-grained signature label onto its coarse comparison bucket.

    Despite the name, four buckets are produced:
      * 'non_hand_signed'   - high-confidence and moderate labels combined
      * 'hand_signed'       - 'likely_hand_signed'
      * 'style_consistency' - passed through unchanged
      * 'uncertain'         - everything else, including 'unknown'
    """
    replicated = ('high_conf_non_hand_signed', 'moderate_non_hand_signed')
    if label == 'style_consistency':
        return 'style_consistency'
    if label == 'likely_hand_signed':
        return 'hand_signed'
    return 'non_hand_signed' if label in replicated else 'uncertain'


def firm_bucket(firm):
    """Translate a firm name as stored in the DB into its report label.

    The four names listed below map to their display labels; any other
    value is reported as 'Other / Non-Big-4'.
    """
    display_names = {
        '勤業眾信聯合': 'Deloitte (Firm A)',
        '安侯建業聯合': 'KPMG',
        '資誠聯合': 'PwC',
        '安永聯合': 'EY',
    }
    return display_names.get(firm, 'Other / Non-Big-4')


def load_two_signer_reports():
    """Fetch the signature rows of every report that has exactly two scored
    signatures.

    A signature counts as "scored" when its
    ``max_similarity_to_same_accountant`` is not NULL; a PDF qualifies when
    it has exactly two such rows, regardless of how many unscored rows it
    has.

    Returns:
        A list of tuples ordered by ``source_pdf`` then ``sig_index``, each
        laid out as::

            (source_pdf, signature_id, assigned_accountant, firm,
             max_similarity_to_same_accountant, min_dhash_independent,
             sig_index, year_month)

        ``firm`` comes from a LEFT JOIN on ``accountants.name`` and is None
        when the assigned accountant has no matching row.
    """
    # Select reports that have exactly 2 signatures with complete data
    query = '''
        WITH report_counts AS (
            SELECT source_pdf, COUNT(*) AS n_sigs
            FROM signatures
            WHERE max_similarity_to_same_accountant IS NOT NULL
            GROUP BY source_pdf
        )
        SELECT s.source_pdf, s.signature_id, s.assigned_accountant, a.firm,
               s.max_similarity_to_same_accountant,
               s.min_dhash_independent, s.sig_index, s.year_month
        FROM signatures s
        LEFT JOIN accountants a ON s.assigned_accountant = a.name
        JOIN report_counts rc ON rc.source_pdf = s.source_pdf
        WHERE rc.n_sigs = 2
          AND s.max_similarity_to_same_accountant IS NOT NULL
        ORDER BY s.source_pdf, s.sig_index
    '''
    conn = sqlite3.connect(DB)
    try:
        cur = conn.cursor()
        cur.execute(query)
        return cur.fetchall()
    finally:
        # Release the connection even when the query raises.
        conn.close()


def _agreement_count(firm_stats):
    """Return how many of a firm's reports have both signers in one bucket."""
    return (firm_stats['both_non_hand_signed'] + firm_stats['both_hand_signed']
            + firm_stats['both_style_consistency']
            + firm_stats['both_uncertain'])


def _disagreement_row(entry):
    """Flatten one disagreement record into the CSV column order."""
    row = [entry['pdf'], entry['firm'], entry['year_month']]
    for key in ('sig1', 'sig2'):
        sig = entry[key]
        row += [sig['accountant'], f"{sig['cos']:.4f}", sig['dhash'],
                sig['label']]
    # str() keeps the previous rendering of missing values ("None"); quoting
    # of commas/quotes/newlines is left to csv.writer.
    return [str(value) for value in row]


def main():
    """Run the intra-report consistency check and write all outputs.

    Steps:
      1. Load rows via load_two_signer_reports() and group them by source
         PDF, keeping only PDFs that ended up with exactly two rows.
      2. Label both signatures with classify_signature(), collapse the
         labels with binary_bucket(), and tally agreement per firm label.
      3. Print a per-firm summary and write, under OUT, the disagreement CSV
         (first 500 mixed reports), the JSON summary and the Markdown report.
    """
    print('=' * 70)
    print('Script 23: Intra-Report Consistency Check')
    print('=' * 70)

    rows = load_two_signer_reports()
    print(f'\nLoaded {len(rows):,} signatures from 2-signer reports')

    # Group by source_pdf
    by_pdf = defaultdict(list)
    for r in rows:
        by_pdf[r[0]].append({
            'sig_id': r[1], 'accountant': r[2], 'firm': r[3] or '(unknown)',
            'cos': r[4], 'dhash': r[5], 'sig_index': r[6], 'year_month': r[7],
        })

    # The length filter drops any PDF whose row count is not exactly two
    # after the join (e.g. if the accountant join produced extra rows).
    reports = [{'pdf': pdf, 'sigs': sigs}
               for pdf, sigs in by_pdf.items() if len(sigs) == 2]
    print(f'Total 2-signer reports: {len(reports):,}')

    # Per-firm counters, created lazily the first time a firm label is seen.
    by_firm = defaultdict(lambda: {
        'total': 0,
        'both_non_hand_signed': 0,
        'both_hand_signed': 0,
        'both_style_consistency': 0,
        'both_uncertain': 0,
        'mixed': 0,
        'mixed_details': defaultdict(int),
    })

    # Classify each signature and check agreement
    disagreements = []
    for rep in reports:
        s1, s2 = rep['sigs']
        l1 = classify_signature(s1['cos'], s1['dhash'])
        l2 = classify_signature(s2['cos'], s2['dhash'])
        b1, b2 = binary_bucket(l1), binary_bucket(l2)

        # Determine report-level firm (usually both signers from same firm);
        # cross-firm reports get a combined "A+B" label of their own.
        firm1 = firm_bucket(s1['firm'])
        firm2 = firm_bucket(s2['firm'])
        firm = firm1 if firm1 == firm2 else f'{firm1}+{firm2}'

        bucket = by_firm[firm]
        bucket['total'] += 1

        if b1 == b2 == 'non_hand_signed':
            bucket['both_non_hand_signed'] += 1
        elif b1 == b2 == 'hand_signed':
            bucket['both_hand_signed'] += 1
        elif b1 == b2 == 'style_consistency':
            bucket['both_style_consistency'] += 1
        elif b1 == b2 == 'uncertain':
            bucket['both_uncertain'] += 1
        else:
            bucket['mixed'] += 1
            # Sort so (a, b) and (b, a) are counted under the same key.
            combo = tuple(sorted([b1, b2]))
            bucket['mixed_details'][str(combo)] += 1
            disagreements.append({
                'pdf': rep['pdf'],
                'firm': firm,
                'sig1': {'accountant': s1['accountant'], 'cos': s1['cos'],
                         'dhash': s1['dhash'], 'label': l1},
                'sig2': {'accountant': s2['accountant'], 'cos': s2['cos'],
                         'dhash': s2['dhash'], 'label': l2},
                'year_month': s1['year_month'],
            })

    # Print summary
    print('\n--- Per-firm agreement ---')
    for firm, d in sorted(by_firm.items(), key=lambda x: -x[1]['total']):
        agree = _agreement_count(d)
        rate = agree / d['total'] if d['total'] else 0
        print(f'  {firm}: total={d["total"]:,}, agree={agree} '
              f'({rate*100:.2f}%), mixed={d["mixed"]}')
        print(f'    both_non_hand_signed={d["both_non_hand_signed"]}, '
              f'both_uncertain={d["both_uncertain"]}, '
              f'both_style_consistency={d["both_style_consistency"]}, '
              f'both_hand_signed={d["both_hand_signed"]}')

    # Write disagreements CSV (first 500).  csv.writer quotes fields that
    # contain commas, quotes or newlines (e.g. in PDF paths), which plain
    # string concatenation would turn into malformed rows.
    csv_path = OUT / 'intra_report_disagreements.csv'
    with open(csv_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(['pdf', 'firm', 'year_month',
                         'acc1', 'cos1', 'dhash1', 'label1',
                         'acc2', 'cos2', 'dhash2', 'label2'])
        writer.writerows(_disagreement_row(d) for d in disagreements[:500])
    print(f'\nCSV: {csv_path} (first 500 of {len(disagreements)} disagreements)')

    # Convert for JSON (plain dicts instead of defaultdicts)
    summary = {
        'generated_at': datetime.now().isoformat(),
        'total_reports': len(reports),
        'total_disagreements': len(disagreements),
        'by_firm': {},
    }
    for firm, d in by_firm.items():
        agree = _agreement_count(d)
        summary['by_firm'][firm] = {
            'total': d['total'],
            'both_non_hand_signed': d['both_non_hand_signed'],
            'both_hand_signed': d['both_hand_signed'],
            'both_style_consistency': d['both_style_consistency'],
            'both_uncertain': d['both_uncertain'],
            'mixed': d['mixed'],
            'agreement_rate': float(agree / d['total']) if d['total'] else 0,
            'mixed_details': dict(d['mixed_details']),
        }
    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # is pinned instead of relying on the platform default.
    with open(OUT / 'intra_report_results.json', 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2, ensure_ascii=False)
    print(f'JSON: {OUT / "intra_report_results.json"}')

    # Markdown
    md = [
        '# Intra-Report Consistency Report',
        f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        '',
        '## Method',
        '',
        '* 2-signer reports (primary + secondary engagement partner).',
        '* Each signature classified using the dual-descriptor rules of the',
        '  paper (cos > 0.95 AND dHash_indep ≤ 5 = high-confidence replication;',
        '  dHash 6-15 = moderate; > 15 = style consistency; cos ≤ 0.837 = likely',
        '  hand-signed; otherwise uncertain).',
        '* For each report, both signature-level labels are compared.',
        '  A report is "in agreement" if both fall in the same coarse bucket',
        '  (non-hand-signed = high+moderate combined, style_consistency,',
        '  uncertain, or hand-signed); otherwise "mixed".',
        '',
        f'Total 2-signer reports analyzed: **{len(reports):,}**',
        '',
        '## Per-firm agreement',
        '',
        '| Firm | Total | Both non-hand-signed | Both style | Both uncertain | Both hand-signed | Mixed | Agreement rate |',
        '|------|-------|----------------------|------------|----------------|------------------|-------|----------------|',
    ]
    for firm, d in sorted(summary['by_firm'].items(),
                          key=lambda x: -x[1]['total']):
        md.append(
            f"| {firm} | {d['total']} | {d['both_non_hand_signed']} | "
            f"{d['both_style_consistency']} | {d['both_uncertain']} | "
            f"{d['both_hand_signed']} | {d['mixed']} | "
            f"**{d['agreement_rate']*100:.2f}%** |"
        )

    md += [
        '',
        '## Interpretation',
        '',
        'Under firmwide stamping practice the two engagement partners on a',
        'given report should both exhibit high-confidence non-hand-signed',
        'classifications. High intra-report agreement at Firm A (Deloitte) is',
        'consistent with uniform firm-level stamping; declining agreement at',
        'the other Big-4 firms reflects the interview evidence that stamping',
        'was applied only to a subset of partners.',
        '',
        'Mixed-classification reports (one signer non-hand-signed, the other',
        'hand-signed or style-consistent) are flagged for sensitivity review.',
        'Absent firmwide homogeneity, one would expect substantial mixed-rate',
        'contamination even at Firm A; the observed Firm A mixed rate is a',
        'direct empirical check on the identification assumption used in the',
        'threshold calibration.',
    ]
    (OUT / 'intra_report_report.md').write_text('\n'.join(md), encoding='utf-8')
    print(f'Report: {OUT / "intra_report_report.md"}')


# Run the analysis only when this file is executed as a script, not when it
# is imported.
if __name__ == '__main__':
    main()