pdf_signature_extraction/signature_analysis/40b_inter_cpa_far_sweep.py
gbanyan d4f370bd5e Add Scripts 39b/c/d/e + 40b + 43: anchor-based FAR diagnostics
Spike checkpoint in response to codex rounds 28-30 review:

- 39b/c: signature-level dip test on Big-4 and non-Big-4 marginals
- 39d: dHash discrete-value robustness (raw vs jittered + histogram
  valleys + firm residualization); confirms within-firm dHash dip
  rejection is integer-mass-point artefact
- 39e: dHash firm-residualized + jittered 2x2 factorial decomposition;
  confirms Big-4 pooled dh "multimodality" is composition + integer
  artefact (centered + jittered p=0.35, 0/5 seeds reject)
- 40b: inter-CPA per-pair FAR sweep (cos + dh marginal + joint +
  conditional); replicates v3 cos>0.95 FAR=0.0006 and provides
  v4-new dh FAR curve
- 43: pool-normalized per-signature FAR (codex round-30 fix for
  per-pair vs per-signature conflation); per-sig FAR for deployed
  any-pair rule = 11.02%, per-firm structure shows Firm A 20% vs
  B/C/D <1%

These scripts replace the distributional path (K=3 mixture / dip /
antimode) with anchor-based threshold derivation. Companion
artefacts in reports/v4_big4/{signature_level_diptest,
midsmall_signature_diptest, dhash_discrete_robustness,
inter_cpa_far_sweep, pool_normalized_far}/.
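The anchor-based derivation reduces to inverting an empirical FAR curve: sweep candidate thresholds over negative-anchor (inter-CPA) pairs, then pick the loosest threshold whose measured FAR stays at or below the target. A minimal sketch, with made-up curve values rather than the measured ones:

```python
def invert_far(curve, target):
    """curve: (threshold, far) pairs from a negative-anchor sweep.
    Return the loosest admissible operating point -- the entry with
    the largest empirical FAR still <= target -- or None if the
    target is unachievable on this curve."""
    admissible = [e for e in curve if e[1] <= target]
    return max(admissible, key=lambda e: e[1]) if admissible else None

# Hypothetical cosine FAR curve (stricter threshold -> lower FAR):
cos_curve = [(0.90, 0.0100), (0.93, 0.0030), (0.95, 0.0006), (0.97, 0.0001)]
print(invert_far(cos_curve, 0.001))    # -> (0.95, 0.0006)
print(invert_far(cos_curve, 0.00005))  # -> None
```

The resulting thresholds are specificity-anchored operating points rather than distributional antimodes, which is the whole point of the pivot.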

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 14:08:49 +08:00

414 lines
16 KiB
Python
#!/usr/bin/env python3
"""
Script 40b: Inter-CPA FAR Sweep for cos and dHash (joint + marginal)
=====================================================================
After codex round-29 destroyed the distributional path to thresholds
(K=3 mixture / dip / antimode shown composition-driven by Scripts
39b-39e), v4.0 pivots to an anchor-based threshold framework:
empirically derived from inter-CPA negative anchor specificity.
Inter-CPA pairs (different CPAs, all-firm) are the negative anchor:
they are by definition not same-CPA replications, and the user's
within-CPA mechanism-transition concern (a CPA might switch from
hand-sign to template mid-career) does not enter the inter-CPA
calibration because each sampled pair crosses CPA boundaries.
This script samples a large number of inter-CPA pairs and computes
both descriptors per pair (cosine via feature_vector dot product;
Hamming distance via dhash_vector XOR). It then sweeps:
1. FAR(cos > k) across k in [0.80, 0.99]
2. FAR(dHash <= k) across k in [0, 20]
3. Joint FAR(cos > 0.95 AND dHash <= k) for k in [0, 20]
4. Conditional FAR(dHash <= k | cos > 0.95) -- the v3 inherited
   rule's marginal specificity contribution from dHash

Outputs:
    reports/v4_big4/inter_cpa_far_sweep/
        far_sweep_results.json
        far_sweep_report.md
Sample size: 500,000 inter-CPA pairs (matches v3 Script 10
convention). Big-4-only and full-corpus variants both reported.
"""
import json
import sqlite3
import numpy as np
from pathlib import Path
from datetime import datetime
from collections import defaultdict
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/'
           'v4_big4/inter_cpa_far_sweep')
OUT.mkdir(parents=True, exist_ok=True)

BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合')
ALIAS = {'勤業眾信聯合': 'Firm A',
         '安侯建業聯合': 'Firm B',
         '資誠聯合': 'Firm C',
         '安永聯合': 'Firm D'}

N_PAIRS = 500_000
SEED = 42
COS_GRID = [0.80, 0.83, 0.85, 0.87, 0.89, 0.90, 0.91, 0.92, 0.93, 0.94,
            0.945, 0.95, 0.955, 0.96, 0.965, 0.97, 0.975, 0.98, 0.985,
            0.99]
DH_GRID = list(range(0, 21))


def hamming_64bit(a_bytes, b_bytes):
    """Hamming distance between two 8-byte (64-bit) dHash byte strings."""
    a = int.from_bytes(a_bytes, 'big')
    b = int.from_bytes(b_bytes, 'big')
    return (a ^ b).bit_count()


def load_signatures():
    conn = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
    cur = conn.cursor()
    cur.execute('''
        SELECT s.signature_id, s.assigned_accountant, a.firm,
               s.feature_vector, s.dhash_vector
        FROM signatures s
        JOIN accountants a ON s.assigned_accountant = a.name
        WHERE s.assigned_accountant IS NOT NULL
          AND s.feature_vector IS NOT NULL
          AND s.dhash_vector IS NOT NULL
          AND a.firm IS NOT NULL
    ''')
    rows = cur.fetchall()
    conn.close()
    return rows


def sample_inter_cpa_pairs(rows, n_pairs, seed, restrict_to_big4=False):
    """Sample inter-CPA pairs and compute (cos, dh) for each."""
    rng = np.random.default_rng(seed)
    if restrict_to_big4:
        rows = [r for r in rows if r[2] in BIG4]
        scope = 'big4_only'
    else:
        scope = 'all_firms'
    print(f' [{scope}] {len(rows):,} signatures available')
    by_acct = defaultdict(list)
    for r in rows:
        by_acct[r[1]].append(r)
    accountants = list(by_acct.keys())
    n_acct = len(accountants)
    print(f' [{scope}] {n_acct} accountants')
    # Pre-stack per-accountant feature matrices for fast row indexing.
    features = {a: np.stack(
        [np.frombuffer(r[3], dtype=np.float32) for r in by_acct[a]]
    ) for a in accountants}
    dhashes = {a: [r[4] for r in by_acct[a]] for a in accountants}
    cos_vals = np.empty(n_pairs, dtype=np.float32)
    dh_vals = np.empty(n_pairs, dtype=np.int32)
    for idx in range(n_pairs):
        i, j = rng.choice(n_acct, 2, replace=False)
        a1, a2 = accountants[i], accountants[j]
        k1 = int(rng.integers(0, len(by_acct[a1])))
        k2 = int(rng.integers(0, len(by_acct[a2])))
        f1 = features[a1][k1]
        f2 = features[a2][k2]
        cos_vals[idx] = float(f1 @ f2)
        dh_vals[idx] = hamming_64bit(dhashes[a1][k1], dhashes[a2][k2])
    return scope, cos_vals, dh_vals


def wilson_ci(k, n, z=1.96):
    """95% Wilson score interval for a binomial proportion k/n."""
    if n == 0:
        return (None, None)
    phat = k / n
    denom = 1 + z * z / n
    centre = (phat + z * z / (2 * n)) / denom
    half = z * np.sqrt(phat * (1 - phat) / n + z * z / (4 * n * n)) / denom
    return (max(0.0, centre - half), min(1.0, centre + half))


def far_at_cos(cos_vals, k):
    n = len(cos_vals)
    hits = int((cos_vals > k).sum())
    lo, hi = wilson_ci(hits, n)
    return {'k': float(k), 'n': n, 'hits': hits,
            'far': hits / n, 'ci95_lo': lo, 'ci95_hi': hi}


def far_at_dh_le(dh_vals, k):
    n = len(dh_vals)
    hits = int((dh_vals <= k).sum())
    lo, hi = wilson_ci(hits, n)
    return {'k': int(k), 'n': n, 'hits': hits,
            'far': hits / n, 'ci95_lo': lo, 'ci95_hi': hi}


def joint_far(cos_vals, dh_vals, cos_k, dh_k):
    n = len(cos_vals)
    hits = int(((cos_vals > cos_k) & (dh_vals <= dh_k)).sum())
    lo, hi = wilson_ci(hits, n)
    return {'cos_k': float(cos_k), 'dh_k': int(dh_k),
            'n': n, 'hits': hits,
            'far': hits / n, 'ci95_lo': lo, 'ci95_hi': hi}


def cond_far(cos_vals, dh_vals, cos_k, dh_k):
    """FAR(dh<=k | cos>cos_k)"""
    cos_mask = cos_vals > cos_k
    n_cond = int(cos_mask.sum())
    if n_cond == 0:
        return {'cos_k': float(cos_k), 'dh_k': int(dh_k),
                'n_cond': 0, 'hits': 0,
                'cond_far': None, 'ci95_lo': None, 'ci95_hi': None}
    hits = int(((dh_vals <= dh_k) & cos_mask).sum())
    lo, hi = wilson_ci(hits, n_cond)
    return {'cos_k': float(cos_k), 'dh_k': int(dh_k),
            'n_cond': n_cond, 'hits': hits,
            'cond_far': hits / n_cond, 'ci95_lo': lo, 'ci95_hi': hi}


def invert_far_target(curve_entries, target, key='far'):
    """Return the loosest admissible entry: the one with the largest
    FAR still <= target. Returns None if no entry meets the target."""
    admissible = [e for e in curve_entries if e[key] <= target]
    if not admissible:
        return None
    return max(admissible, key=lambda e: e[key])


def _fmt(x, fmt='.5f'):
    return 'None' if x is None else format(x, fmt)


def run_scope(rows, scope_name, restrict_to_big4):
    print(f'\n== Scope: {scope_name} ==')
    scope_label, cos_vals, dh_vals = sample_inter_cpa_pairs(
        rows, N_PAIRS, SEED, restrict_to_big4=restrict_to_big4)
    print(f' Sampled {len(cos_vals):,} inter-CPA pairs')
    print(f' cos: mean={cos_vals.mean():.4f}, '
          f'median={np.median(cos_vals):.4f}, '
          f'std={cos_vals.std():.4f}')
    print(f' dh : mean={dh_vals.mean():.4f}, '
          f'median={np.median(dh_vals):.4f}, '
          f'std={dh_vals.std():.4f}')
    cos_curve = [far_at_cos(cos_vals, k) for k in COS_GRID]
    dh_curve = [far_at_dh_le(dh_vals, k) for k in DH_GRID]
    joint_curve_95 = [joint_far(cos_vals, dh_vals, 0.95, k) for k in DH_GRID]
    cond_curve_95 = [cond_far(cos_vals, dh_vals, 0.95, k) for k in DH_GRID]
    print('\n [Cos FAR sweep]')
    for e in cos_curve:
        print(f' cos > {e["k"]:.3f}: FAR={_fmt(e["far"])}, '
              f'CI=[{_fmt(e["ci95_lo"])}, {_fmt(e["ci95_hi"])}], '
              f'hits={e["hits"]}/{e["n"]}')
    print('\n [dHash FAR sweep]')
    for e in dh_curve:
        print(f' dh <= {e["k"]:2d}: FAR={_fmt(e["far"])}, '
              f'CI=[{_fmt(e["ci95_lo"])}, {_fmt(e["ci95_hi"])}], '
              f'hits={e["hits"]}/{e["n"]}')
    print('\n [Joint FAR (cos > 0.95 AND dh <= k)]')
    for e in joint_curve_95:
        print(f' dh <= {e["dh_k"]:2d}: FAR={_fmt(e["far"])}, '
              f'hits={e["hits"]}/{e["n"]}')
    print('\n [Conditional FAR(dh <= k | cos > 0.95)]')
    for e in cond_curve_95:
        cf = e['cond_far']
        print(f' dh <= {e["dh_k"]:2d}: P(dh<=k | cos>0.95)='
              f'{_fmt(cf) if cf is not None else "n/a"}, '
              f'hits={e["hits"]}/{e["n_cond"]}')
    targets = [0.005, 0.001, 0.0005, 0.0001]
    inv = {}
    for t in targets:
        inv[f'cos_far_<=_{t}'] = invert_far_target(cos_curve, t, 'far')
        inv[f'dh_far_<=_{t}'] = invert_far_target(dh_curve, t, 'far')
        inv[f'joint_at_cos95_far_<=_{t}'] = invert_far_target(
            joint_curve_95, t, 'far')
    print('\n [Threshold inversion]')
    for tgt in targets:
        e = inv[f'cos_far_<=_{tgt}']
        if e is not None:
            print(f' FAR <= {tgt}: min cos threshold with FAR<=tgt is '
                  f'cos > {e["k"]:.3f} (FAR={e["far"]:.5f})')
        e = inv[f'dh_far_<=_{tgt}']
        if e is not None:
            print(f' FAR <= {tgt}: max dh threshold with FAR<=tgt is '
                  f'dh <= {e["k"]} (FAR={e["far"]:.5f})')
        e = inv[f'joint_at_cos95_far_<=_{tgt}']
        if e is not None:
            print(f' FAR <= {tgt}: under cos>0.95, max dh threshold '
                  f'with joint FAR<=tgt is dh <= {e["dh_k"]} '
                  f'(joint FAR={e["far"]:.5f})')
    return {
        'scope': scope_label,
        'n_pairs': int(len(cos_vals)),
        'cos_summary': {
            'mean': float(cos_vals.mean()),
            'median': float(np.median(cos_vals)),
            'std': float(cos_vals.std()),
            'p99': float(np.percentile(cos_vals, 99)),
            'p999': float(np.percentile(cos_vals, 99.9)),
            'max': float(cos_vals.max()),
        },
        'dh_summary': {
            'mean': float(dh_vals.mean()),
            'median': float(np.median(dh_vals)),
            'std': float(dh_vals.std()),
            'p01': float(np.percentile(dh_vals, 1)),
            'p001': float(np.percentile(dh_vals, 0.1)),
            'min': int(dh_vals.min()),
        },
        'cos_far_curve': cos_curve,
        'dh_far_curve': dh_curve,
        'joint_far_at_cos95_curve': joint_curve_95,
        'cond_far_at_cos95_curve': cond_curve_95,
        'threshold_inversions': inv,
    }


def main():
    print('=' * 72)
    print('Script 40b: Inter-CPA FAR Sweep (cos + dHash, joint + marginal)')
    print('=' * 72)
    rows = load_signatures()
    print(f'\nLoaded {len(rows):,} signatures (full corpus)')
    results = {
        'meta': {
            'script': '40b',
            'timestamp': datetime.now().isoformat(timespec='seconds'),
            'n_pairs_sampled': N_PAIRS,
            'seed': SEED,
            'note': ('Inter-CPA pair-level FAR sweep for cos and dHash. '
                     'Anchor-based threshold derivation; replaces '
                     'distributional path attacked in codex round-29.'),
        },
        'scopes': {},
    }
    results['scopes']['big4_only'] = run_scope(
        rows, 'Big-4 only', restrict_to_big4=True)
    results['scopes']['all_firms'] = run_scope(
        rows, 'All firms', restrict_to_big4=False)
    json_path = OUT / 'far_sweep_results.json'
    json_path.write_text(json.dumps(results, indent=2, ensure_ascii=False),
                         encoding='utf-8')
    print(f'\n[json] {json_path}')
    md = [
        '# Inter-CPA FAR Sweep (Script 40b)',
        '',
        f'Generated: {results["meta"]["timestamp"]}',
        f'Inter-CPA pair samples per scope: {N_PAIRS:,}; seed: {SEED}',
        '',
        ('Anchor-based threshold derivation. For each scope (Big-4 only '
         'or all firms), sample random inter-CPA pairs and compute '
         'cosine + Hamming distance per pair. Report False Acceptance '
         'Rates (FAR) at various thresholds; invert FAR targets to '
         'derive thresholds with empirical specificity guarantees.'),
        '',
    ]
    for scope in ['big4_only', 'all_firms']:
        s = results['scopes'][scope]
        md += [f'## Scope: {scope} ({s["n_pairs"]:,} pairs)', '',
               '### Cosine FAR curve', '',
               '| cos > k | FAR | 95% CI | hits / n |',
               '|---|---|---|---|']
        for e in s['cos_far_curve']:
            md.append(f'| {e["k"]:.3f} | {_fmt(e["far"])} | '
                      f'[{_fmt(e["ci95_lo"])}, {_fmt(e["ci95_hi"])}] | '
                      f'{e["hits"]:,} / {e["n"]:,} |')
        md += ['', '### dHash FAR curve', '',
               '| dh <= k | FAR | 95% CI | hits / n |',
               '|---|---|---|---|']
        for e in s['dh_far_curve']:
            md.append(f'| {e["k"]:2d} | {_fmt(e["far"])} | '
                      f'[{_fmt(e["ci95_lo"])}, {_fmt(e["ci95_hi"])}] | '
                      f'{e["hits"]:,} / {e["n"]:,} |')
        md += ['', '### Joint FAR (cos > 0.95 AND dh <= k)', '',
               '| dh <= k | Joint FAR | hits / n |',
               '|---|---|---|']
        for e in s['joint_far_at_cos95_curve']:
            md.append(f'| {e["dh_k"]:2d} | {_fmt(e["far"])} | '
                      f'{e["hits"]:,} / {e["n"]:,} |')
        md += ['',
               '### Conditional FAR(dh <= k | cos > 0.95)',
               '',
               'Among inter-CPA pairs that already exceed cos > 0.95, '
               'what fraction also have dh <= k? This quantifies '
               "dHash's marginal specificity contribution given the cos "
               'gate is already applied.',
               '',
               '| dh <= k | Conditional FAR | hits / n_cond |',
               '|---|---|---|']
        for e in s['cond_far_at_cos95_curve']:
            cf = e['cond_far']
            md.append(f'| {e["dh_k"]:2d} | '
                      f'{_fmt(cf) if cf is not None else "n/a"} | '
                      f'{e["hits"]:,} / {e["n_cond"]:,} |')
        md += ['', '### Threshold inversion', '',
               '| FAR target | cos thresh | dh thresh | joint dh thresh '
               '(under cos>0.95) |',
               '|---|---|---|---|']
        for tgt in [0.005, 0.001, 0.0005, 0.0001]:
            e_c = s['threshold_inversions'].get(f'cos_far_<=_{tgt}')
            e_d = s['threshold_inversions'].get(f'dh_far_<=_{tgt}')
            e_j = s['threshold_inversions'].get(
                f'joint_at_cos95_far_<=_{tgt}')
            c_str = (f'cos > {e_c["k"]:.3f} (FAR={e_c["far"]:.5f})'
                     if e_c else 'unachievable')
            d_str = (f'dh <= {e_d["k"]} (FAR={e_d["far"]:.5f})'
                     if e_d else 'unachievable')
            j_str = (f'dh <= {e_j["dh_k"]} (FAR={e_j["far"]:.5f})'
                     if e_j else 'unachievable')
            md.append(f'| {tgt} | {c_str} | {d_str} | {j_str} |')
        md.append('')
    md += [
        '## Interpretation',
        '',
        ('- The cosine FAR curve replicates and extends v3.x §IV-I '
         'Table X (which reported FAR=0.0005 at cos>0.95 from a '
         'similar but smaller-sample inter-CPA negative anchor).'),
        ('- The dHash FAR curve is the v4 contribution: prior v3.x '
         'work used dh<=5 by convention without an empirical '
         'specificity derivation. This script derives a specificity '
         'target → dh threshold mapping.'),
        ('- The conditional FAR(dh<=k | cos>0.95) curve tells us '
         'whether dHash adds specificity given the cos gate. If the '
         'conditional FAR at dh<=5 is meaningfully lower than 1.0, '
         'dHash is providing additional specificity. If it is near '
         '1.0, dHash is largely redundant given cos>0.95 and the '
         'five-way rule should be simplified.'),
        ('- Thresholds derived by inverting FAR targets are '
         'specificity-anchored operating points, not distributional '
         'antimodes. They are robust to the integer-mass-point and '
         'between-firm-composition artefacts identified in Scripts '
         '39b-39e.'),
        '',
    ]
    md_path = OUT / 'far_sweep_report.md'
    md_path.write_text('\n'.join(md), encoding='utf-8')
    print(f'[md ] {md_path}')


if __name__ == '__main__':
    main()