pdf_signature_extraction/signature_analysis/33_reverse_anchor_spike.py

#!/usr/bin/env python3
"""
Script 33: Reverse-Anchor Spike
================================
Follow-up to Script 32 verdict C.

Hypothesis:
    Instead of using Firm A as the "hand-signed anchor" (Paper A's
    framing), use the non-Firm-A population as the
    "fully-replicated reference" and detect hand-signed CPAs by
    their deviation from that reference.

Why this might be better:
  * Reference population is 3x larger (515 vs 171 accountants)
  * Removes the "why is Firm A ground truth?" reviewer attack
  * Firm A becomes a validation target, not the calibration anchor

Pipeline:
  1. Build 2D Gaussian reference from all_non_A accountant means
     (cos_mean, dh_mean), with robust covariance estimate.
  2. Score every Firm A accountant by:
       * Mahalanobis distance to the reference center
       * Log-likelihood under the 2D Gaussian reference
       * Tail percentile in the marginal cosine direction
         (low = more hand-signed-like)
  3. Cross-validate against Paper A's existing per-CPA hand-sign
     proxy: fraction of that CPA's signatures with
       (cos < 0.95) OR (dh > 5)
     This is the same operational rule used in Paper A v3.20.0
     (cos>0.95 AND dh<=5 -> non-hand-signed) inverted to a hand-sign
     fraction.
  4. Verdict on Paper C viability (uses the directional metric
     -cos_left_tail_pct as primary; symmetric Mahalanobis confounds
     "more-replicated" and "more-hand-signed" anomaly directions):
       PAPER_C_STRONG    Spearman rho_directional >= 0.70 and p < 0.001
       PAPER_C_PARTIAL   rho_directional >= 0.40 and p < 0.05 (not STRONG)
       PAPER_C_WEAK      otherwise, OR n_firmA < 30
     A large |rho_mahalanobis| with opposite sign is reported as
     "two-sided anomaly" diagnostic (Firm A bifurcates into both
     extreme-replicated and hand-signed sub-populations).

Output:
  reports/reverse_anchor_spike/
    reverse_anchor_results.json
    reverse_anchor_report.md
    scatter_anomaly_vs_paperA.png
    overlay_2d_reference_vs_firmA.png
    ranked_firmA_cpas.csv
"""

import sqlite3
import json
import csv
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from pathlib import Path
from datetime import datetime
from scipy import stats
from sklearn.covariance import MinCovDet

DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/'
           'reverse_anchor_spike')
OUT.mkdir(parents=True, exist_ok=True)

FIRM_A = '勤業眾信聯合'  # Deloitte
MIN_SIGS = 10

# Paper A v3.20.0 operational signature-level rule (non-hand-signed):
#   cos > 0.95 AND dh_indep <= 5
# Hand-sign fraction = 1 - (fraction passing this rule)
PAPER_A_COS_CUT = 0.95
PAPER_A_DH_CUT = 5


def load_accountant_table(firm_filter_sql, params):
    """Return list of (name, cos_mean, dh_mean, hand_frac, n)."""
    conn = sqlite3.connect(DB)
    cur = conn.cursor()
    sql = f'''
        SELECT s.assigned_accountant,
               AVG(s.max_similarity_to_same_accountant) AS cos_mean,
               AVG(CAST(s.min_dhash_independent AS REAL)) AS dh_mean,
               AVG(CASE
                     WHEN s.max_similarity_to_same_accountant > ?
                          AND s.min_dhash_independent <= ?
                     THEN 0.0 ELSE 1.0
                   END) AS hand_frac,
               COUNT(*) AS n
        FROM signatures s
        JOIN accountants a ON s.assigned_accountant = a.name
        WHERE s.assigned_accountant IS NOT NULL
          AND s.max_similarity_to_same_accountant IS NOT NULL
          AND s.min_dhash_independent IS NOT NULL
          {firm_filter_sql}
        GROUP BY s.assigned_accountant
        HAVING n >= ?
    '''
    cur.execute(sql, [PAPER_A_COS_CUT, PAPER_A_DH_CUT] + params + [MIN_SIGS])
    rows = cur.fetchall()
    conn.close()
    return [(r[0], float(r[1]), float(r[2]), float(r[3]), int(r[4]))
            for r in rows]


def fit_reference_gaussian(points):
    """Fit a 2D Gaussian to the reference population using MCD for
    robustness against the small handful of non-Firm-A CPAs that may
    themselves contain hand-signed contamination.
    """
    X = np.asarray(points, dtype=float)
    mcd = MinCovDet(random_state=42, support_fraction=0.85).fit(X)
    return {
        'mean': mcd.location_,
        'cov': mcd.covariance_,
        'cov_inv': np.linalg.inv(mcd.covariance_),
        'support_fraction': 0.85,
        'n_reference': int(len(X)),
    }


def score_under_reference(point, ref):
    """Return (mahalanobis_distance, log_likelihood, tail_percentile_cos).

    tail_percentile_cos: P(reference cosine <= point_cos) -- a small
    value means the point sits in the LEFT tail of the reference
    cosine distribution (lower than typical replicated population),
    which is the direction we expect for hand-signed CPAs.
    """
    diff = np.asarray(point, dtype=float) - ref['mean']
    md_sq = float(diff @ ref['cov_inv'] @ diff)
    md = float(np.sqrt(max(md_sq, 0.0)))
    # Multivariate normal log-likelihood (kernel only matters for ranking)
    sign, logdet = np.linalg.slogdet(ref['cov'])
    ll = float(-0.5 * (md_sq + logdet + 2 * np.log(2 * np.pi)))
    # Marginal cosine tail percentile under reference Gaussian
    mu_c = ref['mean'][0]
    sd_c = float(np.sqrt(ref['cov'][0, 0]))
    tail = float(stats.norm.cdf(point[0], loc=mu_c, scale=sd_c))
    return md, ll, tail


def render_scatter(firmA_data, ref, out_path):
    """Anomaly score (Mahalanobis) vs Paper A hand-sign fraction."""
    md = np.array([d['mahalanobis'] for d in firmA_data])
    hf = np.array([d['paperA_hand_frac'] for d in firmA_data])
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(md, hf, s=40, alpha=0.6, color='steelblue', edgecolor='white')
    rho, p = stats.spearmanr(md, hf)
    pearson_r, pearson_p = stats.pearsonr(md, hf)
    ax.set_xlabel('Mahalanobis distance to non-Firm-A reference '
                  '(higher = more anomalous)')
    ax.set_ylabel('Paper A signature-level hand-sign fraction\n'
                  '(NOT [cos>0.95 AND dh<=5])')
    ax.set_title(f'Firm A CPAs: reverse-anchor anomaly vs Paper A label\n'
                 f'Spearman rho={rho:.3f} (p={p:.2e}); '
                 f'Pearson r={pearson_r:.3f}')
    ax.grid(alpha=0.3)
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)
    return float(rho), float(p), float(pearson_r), float(pearson_p)


def render_2d_overlay(ref_points, firmA_points, ref, out_path):
    """2D scatter of both populations + reference center + 1/2/3-sigma
    Mahalanobis ellipses."""
    fig, ax = plt.subplots(figsize=(9, 7))
    ax.scatter(ref_points[:, 0], ref_points[:, 1], s=18, alpha=0.4,
               color='gray', label=f'Non-Firm-A CPAs (n={len(ref_points)})')
    ax.scatter(firmA_points[:, 0], firmA_points[:, 1], s=42, alpha=0.85,
               color='crimson', edgecolor='white',
               label=f'Firm A CPAs (n={len(firmA_points)})')
    # Reference Gaussian ellipses
    eigvals, eigvecs = np.linalg.eigh(ref['cov'])
    angle = float(np.degrees(np.arctan2(eigvecs[1, 0], eigvecs[0, 0])))
    from matplotlib.patches import Ellipse
    for k_sigma, ls in [(1, '-'), (2, '--'), (3, ':')]:
        width = 2 * k_sigma * float(np.sqrt(eigvals[0]))
        height = 2 * k_sigma * float(np.sqrt(eigvals[1]))
        e = Ellipse(xy=ref['mean'], width=width, height=height, angle=angle,
                    fill=False, edgecolor='black', lw=1.4, ls=ls,
                    label=f'{k_sigma}-sigma reference contour')
        ax.add_patch(e)
    ax.scatter([ref['mean'][0]], [ref['mean'][1]], marker='+', s=160,
               color='black', label='Reference center (MCD)')
    ax.set_xlabel('Accountant cos_mean')
    ax.set_ylabel('Accountant dh_mean')
    ax.set_title('Reverse-anchor: non-Firm-A reference Gaussian + Firm A overlay')
    ax.legend(fontsize=8, loc='upper right')
    ax.grid(alpha=0.3)
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)


def classify_verdict(rho_directional, p_directional, rho_mahalanobis,
                     n_firmA):
    bifurcation = (
        f'(diagnostic: rho_mahalanobis={rho_mahalanobis:.3f} -- a large '
        f'magnitude with opposite sign indicates Firm A bifurcates into '
        f'BOTH ultra-replicated and hand-signed sub-populations relative '
        f'to the non-Firm-A reference center, rather than only deviating '
        f'in the hand-sign direction.)')
    if n_firmA < 30:
        return 'PAPER_C_WEAK', (
            f'Only {n_firmA} Firm A CPAs meet n>=10 -- statistical '
            f'underpowering precludes a reliable correlation.')
    if rho_directional >= 0.70 and p_directional < 0.001:
        return 'PAPER_C_STRONG', (
            f'Directional Spearman rho={rho_directional:.3f} '
            f'(p={p_directional:.2e}) -- reverse-anchor with directional '
            f'cosine-left-tail score recovers Paper A label; Paper C '
            f'viable. {bifurcation}')
    if rho_directional >= 0.40 and p_directional < 0.05:
        return 'PAPER_C_PARTIAL', (
            f'Directional Spearman rho={rho_directional:.3f} '
            f'(p={p_directional:.2e}) -- moderate directional alignment; '
            f'reverse-anchor captures part of the signal. {bifurcation}')
    return 'PAPER_C_WEAK', (
        f'Directional Spearman rho={rho_directional:.3f} '
        f'(p={p_directional:.2e}) -- reverse-anchor diverges from Paper '
        f'A label even in the directional formulation. {bifurcation}')


def main():
    print('=' * 72)
    print('Script 33: Reverse-Anchor Spike')
    print('=' * 72)

    # 1. Reference: all_non_A
    ref_rows = load_accountant_table(
        'AND a.firm IS NOT NULL AND a.firm != ?', [FIRM_A])
    print(f'\nReference population (all_non_A): {len(ref_rows)} CPAs')
    ref_points = np.array([[r[1], r[2]] for r in ref_rows])
    ref = fit_reference_gaussian(ref_points)
    print(f'  Reference center (MCD): cos={ref["mean"][0]:.4f}, '
          f'dh={ref["mean"][1]:.4f}')
    print(f'  Reference cov diag: var(cos)={ref["cov"][0,0]:.5f}, '
          f'var(dh)={ref["cov"][1,1]:.4f}, '
          f'cov(cos,dh)={ref["cov"][0,1]:.5f}')

    # 2. Score: Firm A
    firmA_rows = load_accountant_table('AND a.firm = ?', [FIRM_A])
    print(f'\nTarget population (Firm A): {len(firmA_rows)} CPAs')
    firmA_points = np.array([[r[1], r[2]] for r in firmA_rows])

    firmA_data = []
    for (name, cos_m, dh_m, hand_frac, n_sig) in firmA_rows:
        md, ll, tail_cos = score_under_reference([cos_m, dh_m], ref)
        firmA_data.append({
            'cpa': name,
            'n_signatures': n_sig,
            'cos_mean': cos_m,
            'dh_mean': dh_m,
            'paperA_hand_frac': hand_frac,
            'mahalanobis': md,
            'log_likelihood': ll,
            'cos_left_tail_pct': tail_cos,
        })

    # 3. Scatter + correlation
    scatter_png = OUT / 'scatter_anomaly_vs_paperA.png'
    rho, rho_p, pearson_r, pearson_p = render_scatter(
        firmA_data, ref, scatter_png)
    print(f'\nSpearman rho (Mahalanobis vs Paper A hand_frac) = '
          f'{rho:.4f} (p={rho_p:.2e})')
    print(f'Pearson  r              = {pearson_r:.4f} (p={pearson_p:.2e})')

    # Also Spearman for log-likelihood (negated, since higher LL = less anomalous)
    md_arr = np.array([d['mahalanobis'] for d in firmA_data])
    ll_arr = np.array([d['log_likelihood'] for d in firmA_data])
    tail_arr = np.array([d['cos_left_tail_pct'] for d in firmA_data])
    hf_arr = np.array([d['paperA_hand_frac'] for d in firmA_data])
    rho_ll, p_ll = stats.spearmanr(-ll_arr, hf_arr)
    rho_tail, p_tail = stats.spearmanr(-tail_arr, hf_arr)  # negated: small tail = high hand_frac expected
    print(f'Spearman rho (-log-likelihood vs hand_frac) = '
          f'{rho_ll:.4f} (p={p_ll:.2e})')
    print(f'Spearman rho (-cos_left_tail_pct vs hand_frac) = '
          f'{rho_tail:.4f} (p={p_tail:.2e})')

    # 2D overlay
    overlay_png = OUT / 'overlay_2d_reference_vs_firmA.png'
    render_2d_overlay(ref_points, firmA_points, ref, overlay_png)
    print(f'\nPlots: {scatter_png}, {overlay_png}')

    # 4. Verdict (using directional metric as primary; symmetric Mahalanobis
    #    confounds anomaly direction). rho_tail = corr(-cos_left_tail_pct,
    #    hand_frac); positive value means low-cos-percentile CPAs (those
    #    sitting in the LEFT tail of the non-Firm-A reference cosine
    #    distribution) carry the higher Paper A hand-sign fraction --
    #    exactly the directional reverse-anchor signal we want.
    rho_directional = float(rho_tail)
    p_directional = float(p_tail)
    verdict_class, verdict_msg = classify_verdict(
        rho_directional, p_directional, float(rho), len(firmA_data))
    print(f'\nVerdict: {verdict_class} -- {verdict_msg}')

    # Persist ranked CSV
    csv_path = OUT / 'ranked_firmA_cpas.csv'
    with open(csv_path, 'w', newline='', encoding='utf-8') as f:
        w = csv.writer(f)
        w.writerow(['rank_by_mahalanobis', 'cpa', 'n_signatures',
                    'cos_mean', 'dh_mean', 'paperA_hand_frac',
                    'mahalanobis', 'log_likelihood', 'cos_left_tail_pct'])
        ranked = sorted(firmA_data, key=lambda d: -d['mahalanobis'])
        for i, d in enumerate(ranked, 1):
            w.writerow([i, d['cpa'], d['n_signatures'],
                        f'{d["cos_mean"]:.4f}', f'{d["dh_mean"]:.4f}',
                        f'{d["paperA_hand_frac"]:.4f}',
                        f'{d["mahalanobis"]:.4f}',
                        f'{d["log_likelihood"]:.4f}',
                        f'{d["cos_left_tail_pct"]:.4f}'])
    print(f'CSV: {csv_path}')

    # JSON
    payload = {
        'generated_at': datetime.now().isoformat(),
        'paper_a_operational_cuts': {'cos': PAPER_A_COS_CUT,
                                      'dh': PAPER_A_DH_CUT},
        'min_signatures_per_accountant': MIN_SIGS,
        'reference': {
            'population': 'all_non_A',
            'n_cpas': int(len(ref_rows)),
            'mean': [float(x) for x in ref['mean']],
            'cov': [[float(x) for x in row] for row in ref['cov']],
            'mcd_support_fraction': ref['support_fraction'],
        },
        'firm_a': {
            'n_cpas': int(len(firmA_data)),
            'records': firmA_data,
        },
        'correlations': {
            'spearman_mahalanobis_vs_handfrac': {
                'rho': float(rho), 'p': float(rho_p),
            },
            'pearson_mahalanobis_vs_handfrac': {
                'r': float(pearson_r), 'p': float(pearson_p),
            },
            'spearman_neglogL_vs_handfrac': {
                'rho': float(rho_ll), 'p': float(p_ll),
            },
            'spearman_negcostail_vs_handfrac': {
                'rho': float(rho_tail), 'p': float(p_tail),
            },
        },
        'verdict': {'class': verdict_class, 'explanation': verdict_msg},
    }
    json_path = OUT / 'reverse_anchor_results.json'
    json_path.write_text(json.dumps(payload, indent=2, ensure_ascii=False),
                         encoding='utf-8')
    print(f'JSON: {json_path}')

    # Markdown
    md = [
        '# Reverse-Anchor Spike (Script 33)',
        f'Generated: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}',
        '',
        '## Hypothesis',
        '',
        ('Use the non-Firm-A population (n=515 CPAs) as a "fully-replicated '
         'reference" and detect hand-signed CPAs by deviation from that '
         'reference, instead of using Firm A as the hand-signed anchor.'),
        '',
        '## Reference Population',
        '',
        f'- All non-Firm-A CPAs with n_signatures >= {MIN_SIGS}: '
        f'**{len(ref_rows)} CPAs**',
        f'- 2D Gaussian fit (MCD, support_fraction=0.85) to '
        f'(cos_mean, dh_mean):',
        f'  - center: cos = **{ref["mean"][0]:.4f}**, dh = '
        f'**{ref["mean"][1]:.4f}**',
        f'  - var(cos) = {ref["cov"][0,0]:.5f}, var(dh) = '
        f'{ref["cov"][1,1]:.4f}, cov(cos,dh) = {ref["cov"][0,1]:.5f}',
        '',
        '## Target Population',
        '',
        f'- Firm A (Deloitte) CPAs with n_signatures >= {MIN_SIGS}: '
        f'**{len(firmA_data)} CPAs**',
        '',
        '## Validation against Paper A label',
        '',
        ('Paper A operational rule: a signature is non-hand-signed iff '
         f'cos > {PAPER_A_COS_CUT} AND dh_indep <= {PAPER_A_DH_CUT}. '
         'For each CPA we compute hand_frac = 1 - mean(rule passes).'),
        '',
        '| Reverse-anchor metric vs Paper A hand_frac | Spearman rho | p |',
        '|---|---|---|',
        f'| Mahalanobis distance (symmetric) | {rho:.4f} | {rho_p:.2e} |',
        f'| -log-likelihood (symmetric) | {rho_ll:.4f} | {p_ll:.2e} |',
        f'| -cos_left_tail_percentile (**directional**) | '
        f'**{rho_tail:.4f}** | {p_tail:.2e} |',
        f'| Pearson(Mahalanobis, hand_frac) | {pearson_r:.4f} (r) | '
        f'{pearson_p:.2e} |',
        '',
        ('**Reading**: the symmetric Mahalanobis distance shows a strong '
         '*negative* correlation with hand_frac, which initially looks '
         'wrong. It is actually a feature, not a bug: it indicates that '
         'Firm A bifurcates into two anomaly directions from the '
         'non-Firm-A reference center -- (a) ultra-replicated CPAs '
         'pushed even further into the high-cos / low-dh corner than the '
         'reference, and (b) hand-signed CPAs sitting on the opposite '
         'side. Mahalanobis distance lumps both into a single positive '
         'magnitude. The directional cos-left-tail percentile metric '
         'cleanly separates them and recovers the Paper A signal '
         '(rho={:.3f}).').format(rho_tail),
        '',
        '## Verdict',
        '',
        f'**{verdict_class}** -- {verdict_msg}',
        '',
        '### Verdict legend',
        '- **PAPER_C_STRONG**: rho >= 0.70, p < 0.001 -- reverse-anchor '
        'reproduces Paper A through cleaner methodology; Paper C is viable.',
        '- **PAPER_C_PARTIAL**: 0.40 <= rho < 0.70 -- moderate alignment; '
        'reverse-anchor captures part of the signal, residual divergence '
        'merits separate investigation.',
        '- **PAPER_C_WEAK**: rho < 0.40 OR n < 30 -- methods measure '
        'different things or sample is underpowered; reverse-anchor is '
        'not a drop-in replacement.',
        '',
        '## Files',
        '',
        f'- Scatter: `{scatter_png.name}`',
        f'- 2D overlay: `{overlay_png.name}`',
        f'- Ranked CPAs CSV: `{csv_path.name}`',
        f'- Full JSON: `{json_path.name}`',
        '',
    ]
    md_path = OUT / 'reverse_anchor_report.md'
    md_path.write_text('\n'.join(md), encoding='utf-8')
    print(f'Report: {md_path}')


if __name__ == '__main__':
    main()