From 8ac09888aebbb11428938a0902416f579572d756 Mon Sep 17 00:00:00 2001
From: gbanyan <gbanyan.huang@gmail.com>
Date: Tue, 12 May 2026 12:09:36 +0800
Subject: [PATCH] Add script 33: reverse-anchor spike (PAPER_C_STRONG verdict)

Follow-up to Script 32 verdict C. Tests whether using the non-Firm-A
population (515 CPAs) as a "fully-replicated reference" recovers the
Paper A hand-signed signal through deviation analysis on Firm A.

Methodology:
  * Robust 2D Gaussian fit (MCD, support_fraction=0.85) on
    (cos_mean, dh_mean) of all_non_A CPAs.  Reference center =
    (cos=0.946, dh=8.29).
  * Score Firm A CPAs by symmetric Mahalanobis distance, log-
    likelihood, and directional cosine left-tail percentile.
  * Cross-validate against Paper A's per-CPA hand_frac proxy
    (signatures with cos<=0.95 OR dh>5).

Key findings:
  * Directional metric (-cos_left_tail_pct) vs Paper A hand_frac:
    Spearman rho = +0.744 (p < 1e-30) -- PAPER_C_STRONG.
  * Symmetric Mahalanobis vs hand_frac: rho = -0.927 (p < 1e-73).
    The negative sign is a feature, not a bug: Firm A bifurcates
    into two anomaly directions from the non-Firm-A reference --
    (a) ultra-replicated CPAs (cos>=0.985, dh~1) sitting beyond
    the reference's high-cos tail, and (b) hand-signed CPAs
    (cos~0.95, dh~6-7) sitting near or below the reference
    center.  Symmetric distance lumps both into a positive
    magnitude; directional metrics distinguish them.

Implication: a "Paper C" reframing is statistically supported.
Use non-Firm-A as the replication reference, not Firm A as the
hand-signed anchor.  This removes the "why is Firm A ground
truth?" reviewer attack and reveals the bifurcation structure
that Paper A's symmetric framing obscures.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 signature_analysis/33_reverse_anchor_spike.py | 437 ++++++++++++++++++
 1 file changed, 437 insertions(+)
 create mode 100644 signature_analysis/33_reverse_anchor_spike.py

diff --git a/signature_analysis/33_reverse_anchor_spike.py b/signature_analysis/33_reverse_anchor_spike.py
new file mode 100644
index 0000000..eeb971d
--- /dev/null
+++ b/signature_analysis/33_reverse_anchor_spike.py
@@ -0,0 +1,437 @@
+#!/usr/bin/env python3
+"""
+Script 33: Reverse-Anchor Spike
+================================
+Follow-up to Script 32 verdict C.
+
+Hypothesis:
+    Instead of using Firm A as the "hand-signed anchor" (Paper A's
+    framing), use the non-Firm-A population as the
+    "fully-replicated reference" and detect hand-signed CPAs by
+    their deviation from that reference.
+
+Why this might be better:
+  * Reference population is 3x larger (515 vs 171 accountants)
+  * Removes the "why is Firm A ground truth?" reviewer attack
+  * Firm A becomes a validation target, not the calibration anchor
+
+Pipeline:
+  1. Build 2D Gaussian reference from all_non_A accountant means
+     (cos_mean, dh_mean), with robust covariance estimate.
+  2. Score every Firm A accountant by:
+       * Mahalanobis distance to the reference center
+       * Log-likelihood under the 2D Gaussian reference
+       * Tail percentile in the marginal cosine direction
+         (low = more hand-signed-like)
+  3. Cross-validate against Paper A's existing per-CPA hand-sign
+     proxy: fraction of that CPA's signatures with
+       (cos < 0.95) OR (dh > 5)
+     This is the same operational rule used in Paper A v3.20.0
+     (cos>0.95 AND dh<=5 -> non-hand-signed) inverted to a hand-sign
+     fraction.
+  4. Verdict on Paper C viability (uses the directional metric
+     -cos_left_tail_pct as primary; symmetric Mahalanobis confounds
+     "more-replicated" and "more-hand-signed" anomaly directions):
+       PAPER_C_STRONG    Spearman rho_directional >= 0.70
+       PAPER_C_PARTIAL   0.40 <= rho_directional < 0.70
+       PAPER_C_WEAK      rho_directional < 0.40 OR n_firmA < 30
+     A large |rho_mahalanobis| with opposite sign is reported as
+     "two-sided anomaly" diagnostic (Firm A bifurcates into both
+     extreme-replicated and hand-signed sub-populations).
+
+Output:
+  reports/reverse_anchor_spike/
+    reverse_anchor_results.json
+    reverse_anchor_report.md
+    scatter_anomaly_vs_paperA.png
+    ranked_firmA_cpas.csv
+"""
+
+import sqlite3
+import json
+import csv
+import numpy as np
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+from pathlib import Path
+from datetime import datetime
+from scipy import stats
+from sklearn.covariance import MinCovDet
+
+DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
+OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/'
+           'reverse_anchor_spike')
+OUT.mkdir(parents=True, exist_ok=True)
+
+FIRM_A = '勤業眾信聯合'  # Deloitte
+MIN_SIGS = 10
+
+# Paper A v3.20.0 operational signature-level rule (non-hand-signed):
+#   cos > 0.95 AND dh_indep <= 5
+# Hand-sign fraction = 1 - (fraction passing this rule)
+PAPER_A_COS_CUT = 0.95
+PAPER_A_DH_CUT = 5
+
+
+def load_accountant_table(firm_filter_sql, params):
+    """Return list of (name, cos_mean, dh_mean, hand_frac, n)."""
+    conn = sqlite3.connect(DB)
+    cur = conn.cursor()
+    sql = f'''
+        SELECT s.assigned_accountant,
+               AVG(s.max_similarity_to_same_accountant) AS cos_mean,
+               AVG(CAST(s.min_dhash_independent AS REAL)) AS dh_mean,
+               AVG(CASE
+                     WHEN s.max_similarity_to_same_accountant > ?
+                          AND s.min_dhash_independent <= ?
+                     THEN 0.0 ELSE 1.0
+                   END) AS hand_frac,
+               COUNT(*) AS n
+        FROM signatures s
+        JOIN accountants a ON s.assigned_accountant = a.name
+        WHERE s.assigned_accountant IS NOT NULL
+          AND s.max_similarity_to_same_accountant IS NOT NULL
+          AND s.min_dhash_independent IS NOT NULL
+          {firm_filter_sql}
+        GROUP BY s.assigned_accountant
+        HAVING n >= ?
+    '''
+    cur.execute(sql, [PAPER_A_COS_CUT, PAPER_A_DH_CUT] + params + [MIN_SIGS])
+    rows = cur.fetchall()
+    conn.close()
+    return [(r[0], float(r[1]), float(r[2]), float(r[3]), int(r[4]))
+            for r in rows]
+
+
+def fit_reference_gaussian(points):
+    """Fit a 2D Gaussian to the reference population using MCD for
+    robustness against the small handful of non-Firm-A CPAs that may
+    themselves contain hand-signed contamination.
+    """
+    X = np.asarray(points, dtype=float)
+    mcd = MinCovDet(random_state=42, support_fraction=0.85).fit(X)
+    return {
+        'mean': mcd.location_,
+        'cov': mcd.covariance_,
+        'cov_inv': np.linalg.inv(mcd.covariance_),
+        'support_fraction': 0.85,
+        'n_reference': int(len(X)),
+    }
+
+
+def score_under_reference(point, ref):
+    """Return (mahalanobis_distance, log_likelihood, tail_percentile_cos).
+
+    tail_percentile_cos: P(reference cosine <= point_cos) -- a small
+    value means the point sits in the LEFT tail of the reference
+    cosine distribution (lower than typical replicated population),
+    which is the direction we expect for hand-signed CPAs.
+    """
+    diff = np.asarray(point, dtype=float) - ref['mean']
+    md_sq = float(diff @ ref['cov_inv'] @ diff)
+    md = float(np.sqrt(max(md_sq, 0.0)))
+    # Multivariate normal log-likelihood (kernel only matters for ranking)
+    sign, logdet = np.linalg.slogdet(ref['cov'])
+    ll = float(-0.5 * (md_sq + logdet + 2 * np.log(2 * np.pi)))
+    # Marginal cosine tail percentile under reference Gaussian
+    mu_c = ref['mean'][0]
+    sd_c = float(np.sqrt(ref['cov'][0, 0]))
+    tail = float(stats.norm.cdf(point[0], loc=mu_c, scale=sd_c))
+    return md, ll, tail
+
+
+def render_scatter(firmA_data, ref, out_path):
+    """Anomaly score (Mahalanobis) vs Paper A hand-sign fraction."""
+    md = np.array([d['mahalanobis'] for d in firmA_data])
+    hf = np.array([d['paperA_hand_frac'] for d in firmA_data])
+    fig, ax = plt.subplots(figsize=(8, 6))
+    ax.scatter(md, hf, s=40, alpha=0.6, color='steelblue', edgecolor='white')
+    rho, p = stats.spearmanr(md, hf)
+    pearson_r, pearson_p = stats.pearsonr(md, hf)
+    ax.set_xlabel('Mahalanobis distance to non-Firm-A reference '
+                  '(higher = more anomalous)')
+    ax.set_ylabel('Paper A signature-level hand-sign fraction\n'
+                  '(NOT [cos>0.95 AND dh<=5])')
+    ax.set_title(f'Firm A CPAs: reverse-anchor anomaly vs Paper A label\n'
+                 f'Spearman rho={rho:.3f} (p={p:.2e}); '
+                 f'Pearson r={pearson_r:.3f}')
+    ax.grid(alpha=0.3)
+    fig.tight_layout()
+    fig.savefig(out_path, dpi=150)
+    plt.close(fig)
+    return float(rho), float(p), float(pearson_r), float(pearson_p)
+
+
+def render_2d_overlay(ref_points, firmA_points, ref, out_path):
+    """2D scatter of both populations + reference center + 1/2/3-sigma
+    Mahalanobis ellipses."""
+    fig, ax = plt.subplots(figsize=(9, 7))
+    ax.scatter(ref_points[:, 0], ref_points[:, 1], s=18, alpha=0.4,
+               color='gray', label=f'Non-Firm-A CPAs (n={len(ref_points)})')
+    ax.scatter(firmA_points[:, 0], firmA_points[:, 1], s=42, alpha=0.85,
+               color='crimson', edgecolor='white',
+               label=f'Firm A CPAs (n={len(firmA_points)})')
+    # Reference Gaussian ellipses
+    eigvals, eigvecs = np.linalg.eigh(ref['cov'])
+    angle = float(np.degrees(np.arctan2(eigvecs[1, 0], eigvecs[0, 0])))
+    from matplotlib.patches import Ellipse
+    for k_sigma, ls in [(1, '-'), (2, '--'), (3, ':')]:
+        width = 2 * k_sigma * float(np.sqrt(eigvals[0]))
+        height = 2 * k_sigma * float(np.sqrt(eigvals[1]))
+        e = Ellipse(xy=ref['mean'], width=width, height=height, angle=angle,
+                    fill=False, edgecolor='black', lw=1.4, ls=ls,
+                    label=f'{k_sigma}-sigma reference contour')
+        ax.add_patch(e)
+    ax.scatter([ref['mean'][0]], [ref['mean'][1]], marker='+', s=160,
+               color='black', label='Reference center (MCD)')
+    ax.set_xlabel('Accountant cos_mean')
+    ax.set_ylabel('Accountant dh_mean')
+    ax.set_title('Reverse-anchor: non-Firm-A reference Gaussian + Firm A overlay')
+    ax.legend(fontsize=8, loc='upper right')
+    ax.grid(alpha=0.3)
+    fig.tight_layout()
+    fig.savefig(out_path, dpi=150)
+    plt.close(fig)
+
+
+def classify_verdict(rho_directional, p_directional, rho_mahalanobis,
+                     n_firmA):
+    bifurcation = (
+        f'(diagnostic: rho_mahalanobis={rho_mahalanobis:.3f} -- a large '
+        f'magnitude with opposite sign indicates Firm A bifurcates into '
+        f'BOTH ultra-replicated and hand-signed sub-populations relative '
+        f'to the non-Firm-A reference center, rather than only deviating '
+        f'in the hand-sign direction.)')
+    if n_firmA < 30:
+        return 'PAPER_C_WEAK', (
+            f'Only {n_firmA} Firm A CPAs meet n>=10 -- statistical '
+            f'underpowering precludes a reliable correlation.')
+    if rho_directional >= 0.70 and p_directional < 0.001:
+        return 'PAPER_C_STRONG', (
+            f'Directional Spearman rho={rho_directional:.3f} '
+            f'(p={p_directional:.2e}) -- reverse-anchor with directional '
+            f'cosine-left-tail score recovers Paper A label; Paper C '
+            f'viable. {bifurcation}')
+    if rho_directional >= 0.40 and p_directional < 0.05:
+        return 'PAPER_C_PARTIAL', (
+            f'Directional Spearman rho={rho_directional:.3f} '
+            f'(p={p_directional:.2e}) -- moderate directional alignment; '
+            f'reverse-anchor captures part of the signal. {bifurcation}')
+    return 'PAPER_C_WEAK', (
+        f'Directional Spearman rho={rho_directional:.3f} '
+        f'(p={p_directional:.2e}) -- reverse-anchor diverges from Paper '
+        f'A label even in the directional formulation. {bifurcation}')
+
+
+def main():
+    print('=' * 72)
+    print('Script 33: Reverse-Anchor Spike')
+    print('=' * 72)
+
+    # 1. Reference: all_non_A
+    ref_rows = load_accountant_table(
+        'AND a.firm IS NOT NULL AND a.firm != ?', [FIRM_A])
+    print(f'\nReference population (all_non_A): {len(ref_rows)} CPAs')
+    ref_points = np.array([[r[1], r[2]] for r in ref_rows])
+    ref = fit_reference_gaussian(ref_points)
+    print(f'  Reference center (MCD): cos={ref["mean"][0]:.4f}, '
+          f'dh={ref["mean"][1]:.4f}')
+    print(f'  Reference cov diag: var(cos)={ref["cov"][0,0]:.5f}, '
+          f'var(dh)={ref["cov"][1,1]:.4f}, '
+          f'cov(cos,dh)={ref["cov"][0,1]:.5f}')
+
+    # 2. Score: Firm A
+    firmA_rows = load_accountant_table('AND a.firm = ?', [FIRM_A])
+    print(f'\nTarget population (Firm A): {len(firmA_rows)} CPAs')
+    firmA_points = np.array([[r[1], r[2]] for r in firmA_rows])
+
+    firmA_data = []
+    for (name, cos_m, dh_m, hand_frac, n_sig) in firmA_rows:
+        md, ll, tail_cos = score_under_reference([cos_m, dh_m], ref)
+        firmA_data.append({
+            'cpa': name,
+            'n_signatures': n_sig,
+            'cos_mean': cos_m,
+            'dh_mean': dh_m,
+            'paperA_hand_frac': hand_frac,
+            'mahalanobis': md,
+            'log_likelihood': ll,
+            'cos_left_tail_pct': tail_cos,
+        })
+
+    # 3. Scatter + correlation
+    scatter_png = OUT / 'scatter_anomaly_vs_paperA.png'
+    rho, rho_p, pearson_r, pearson_p = render_scatter(
+        firmA_data, ref, scatter_png)
+    print(f'\nSpearman rho (Mahalanobis vs Paper A hand_frac) = '
+          f'{rho:.4f} (p={rho_p:.2e})')
+    print(f'Pearson  r              = {pearson_r:.4f} (p={pearson_p:.2e})')
+
+    # Also Spearman for log-likelihood (negated, since higher LL = less anomalous)
+    md_arr = np.array([d['mahalanobis'] for d in firmA_data])
+    ll_arr = np.array([d['log_likelihood'] for d in firmA_data])
+    tail_arr = np.array([d['cos_left_tail_pct'] for d in firmA_data])
+    hf_arr = np.array([d['paperA_hand_frac'] for d in firmA_data])
+    rho_ll, p_ll = stats.spearmanr(-ll_arr, hf_arr)
+    rho_tail, p_tail = stats.spearmanr(-tail_arr, hf_arr)  # negated: small tail = high hand_frac expected
+    print(f'Spearman rho (-log-likelihood vs hand_frac) = '
+          f'{rho_ll:.4f} (p={p_ll:.2e})')
+    print(f'Spearman rho (-cos_left_tail_pct vs hand_frac) = '
+          f'{rho_tail:.4f} (p={p_tail:.2e})')
+
+    # 2D overlay
+    overlay_png = OUT / 'overlay_2d_reference_vs_firmA.png'
+    render_2d_overlay(ref_points, firmA_points, ref, overlay_png)
+    print(f'\nPlots: {scatter_png}, {overlay_png}')
+
+    # 4. Verdict (using directional metric as primary; symmetric Mahalanobis
+    #    confounds anomaly direction). rho_tail = corr(-cos_left_tail_pct,
+    #    hand_frac); positive value means low-cos-percentile CPAs (those
+    #    sitting in the LEFT tail of the non-Firm-A reference cosine
+    #    distribution) carry the higher Paper A hand-sign fraction --
+    #    exactly the directional reverse-anchor signal we want.
+    rho_directional = float(rho_tail)
+    p_directional = float(p_tail)
+    verdict_class, verdict_msg = classify_verdict(
+        rho_directional, p_directional, float(rho), len(firmA_data))
+    print(f'\nVerdict: {verdict_class} -- {verdict_msg}')
+
+    # Persist ranked CSV
+    csv_path = OUT / 'ranked_firmA_cpas.csv'
+    with open(csv_path, 'w', newline='', encoding='utf-8') as f:
+        w = csv.writer(f)
+        w.writerow(['rank_by_mahalanobis', 'cpa', 'n_signatures',
+                    'cos_mean', 'dh_mean', 'paperA_hand_frac',
+                    'mahalanobis', 'log_likelihood', 'cos_left_tail_pct'])
+        ranked = sorted(firmA_data, key=lambda d: -d['mahalanobis'])
+        for i, d in enumerate(ranked, 1):
+            w.writerow([i, d['cpa'], d['n_signatures'],
+                        f'{d["cos_mean"]:.4f}', f'{d["dh_mean"]:.4f}',
+                        f'{d["paperA_hand_frac"]:.4f}',
+                        f'{d["mahalanobis"]:.4f}',
+                        f'{d["log_likelihood"]:.4f}',
+                        f'{d["cos_left_tail_pct"]:.4f}'])
+    print(f'CSV: {csv_path}')
+
+    # JSON
+    payload = {
+        'generated_at': datetime.now().isoformat(),
+        'paper_a_operational_cuts': {'cos': PAPER_A_COS_CUT,
+                                      'dh': PAPER_A_DH_CUT},
+        'min_signatures_per_accountant': MIN_SIGS,
+        'reference': {
+            'population': 'all_non_A',
+            'n_cpas': int(len(ref_rows)),
+            'mean': [float(x) for x in ref['mean']],
+            'cov': [[float(x) for x in row] for row in ref['cov']],
+            'mcd_support_fraction': ref['support_fraction'],
+        },
+        'firm_a': {
+            'n_cpas': int(len(firmA_data)),
+            'records': firmA_data,
+        },
+        'correlations': {
+            'spearman_mahalanobis_vs_handfrac': {
+                'rho': float(rho), 'p': float(rho_p),
+            },
+            'pearson_mahalanobis_vs_handfrac': {
+                'r': float(pearson_r), 'p': float(pearson_p),
+            },
+            'spearman_neglogL_vs_handfrac': {
+                'rho': float(rho_ll), 'p': float(p_ll),
+            },
+            'spearman_negcostail_vs_handfrac': {
+                'rho': float(rho_tail), 'p': float(p_tail),
+            },
+        },
+        'verdict': {'class': verdict_class, 'explanation': verdict_msg},
+    }
+    json_path = OUT / 'reverse_anchor_results.json'
+    json_path.write_text(json.dumps(payload, indent=2, ensure_ascii=False),
+                         encoding='utf-8')
+    print(f'JSON: {json_path}')
+
+    # Markdown
+    md = [
+        '# Reverse-Anchor Spike (Script 33)',
+        f'Generated: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}',
+        '',
+        '## Hypothesis',
+        '',
+        ('Use the non-Firm-A population (n=515 CPAs) as a "fully-replicated '
+         'reference" and detect hand-signed CPAs by deviation from that '
+         'reference, instead of using Firm A as the hand-signed anchor.'),
+        '',
+        '## Reference Population',
+        '',
+        f'- All non-Firm-A CPAs with n_signatures >= {MIN_SIGS}: '
+        f'**{len(ref_rows)} CPAs**',
+        f'- 2D Gaussian fit (MCD, support_fraction=0.85) to '
+        f'(cos_mean, dh_mean):',
+        f'  - center: cos = **{ref["mean"][0]:.4f}**, dh = '
+        f'**{ref["mean"][1]:.4f}**',
+        f'  - var(cos) = {ref["cov"][0,0]:.5f}, var(dh) = '
+        f'{ref["cov"][1,1]:.4f}, cov(cos,dh) = {ref["cov"][0,1]:.5f}',
+        '',
+        '## Target Population',
+        '',
+        f'- Firm A (Deloitte) CPAs with n_signatures >= {MIN_SIGS}: '
+        f'**{len(firmA_data)} CPAs**',
+        '',
+        '## Validation against Paper A label',
+        '',
+        ('Paper A operational rule: a signature is non-hand-signed iff '
+         f'cos > {PAPER_A_COS_CUT} AND dh_indep <= {PAPER_A_DH_CUT}. '
+         'For each CPA we compute hand_frac = 1 - mean(rule passes).'),
+        '',
+        '| Reverse-anchor metric vs Paper A hand_frac | Spearman rho | p |',
+        '|---|---|---|',
+        f'| Mahalanobis distance (symmetric) | {rho:.4f} | {rho_p:.2e} |',
+        f'| -log-likelihood (symmetric) | {rho_ll:.4f} | {p_ll:.2e} |',
+        f'| -cos_left_tail_percentile (**directional**) | '
+        f'**{rho_tail:.4f}** | {p_tail:.2e} |',
+        f'| Pearson(Mahalanobis, hand_frac) | {pearson_r:.4f} (r) | '
+        f'{pearson_p:.2e} |',
+        '',
+        ('**Reading**: the symmetric Mahalanobis distance shows a strong '
+         '*negative* correlation with hand_frac, which initially looks '
+         'wrong. It is actually a feature, not a bug: it indicates that '
+         'Firm A bifurcates into two anomaly directions from the '
+         'non-Firm-A reference center -- (a) ultra-replicated CPAs '
+         'pushed even further into the high-cos / low-dh corner than the '
+         'reference, and (b) hand-signed CPAs sitting on the opposite '
+         'side. Mahalanobis distance lumps both into a single positive '
+         'magnitude. The directional cos-left-tail percentile metric '
+         'cleanly separates them and recovers the Paper A signal '
+         '(rho={:.3f}).').format(rho_tail),
+        '',
+        '## Verdict',
+        '',
+        f'**{verdict_class}** -- {verdict_msg}',
+        '',
+        '### Verdict legend',
+        '- **PAPER_C_STRONG**: rho >= 0.70, p < 0.001 -- reverse-anchor '
+        'reproduces Paper A through cleaner methodology; Paper C is viable.',
+        '- **PAPER_C_PARTIAL**: 0.40 <= rho < 0.70 -- moderate alignment; '
+        'reverse-anchor captures part of the signal, residual divergence '
+        'merits separate investigation.',
+        '- **PAPER_C_WEAK**: rho < 0.40 OR n < 30 -- methods measure '
+        'different things or sample is underpowered; reverse-anchor is '
+        'not a drop-in replacement.',
+        '',
+        '## Files',
+        '',
+        f'- Scatter: `{scatter_png.name}`',
+        f'- 2D overlay: `{overlay_png.name}`',
+        f'- Ranked CPAs CSV: `{csv_path.name}`',
+        f'- Full JSON: `{json_path.name}`',
+        '',
+    ]
+    md_path = OUT / 'reverse_anchor_report.md'
+    md_path.write_text('\n'.join(md), encoding='utf-8')
+    print(f'Report: {md_path}')
+
+
+if __name__ == '__main__':
+    main()