pdf_signature_extraction/signature_analysis/16_bd_mccrary_discontinuity.py

#!/usr/bin/env python3
"""
Script 16: Burgstahler-Dichev / McCrary Discontinuity Test
==========================================================
Tests for a discontinuity in the empirical density of similarity scores,
following:
  - Burgstahler & Dichev (1997) - earnings-management style smoothness test
  - McCrary (2008)             - rigorous density-discontinuity asymptotics

Idea:
  Discretize the distribution into equal-width bins. For each bin i compute
  the standardized deviation Z_i between observed count and the smooth
  expectation (average of neighbours). Under H0 (distributional smoothness),
  Z_i ~ N(0,1). A threshold is identified at the transition where Z_{i-1}
  is significantly negative (below expectation) next to Z_i significantly
  positive (above expectation) -- marking the boundary between two
  generative mechanisms (hand-signed vs non-hand-signed).

Inputs:
  - Firm A cosine max-similarity and independent min dHash
  - Full-sample cosine and dHash (for comparison)

Output:
  reports/bd_mccrary/bd_mccrary_report.md
  reports/bd_mccrary/bd_mccrary_results.json
  reports/bd_mccrary/bd_mccrary_<variant>.png (overlay plots)
"""

import sqlite3
import json
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from pathlib import Path
from datetime import datetime

DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/bd_mccrary')
OUT.mkdir(parents=True, exist_ok=True)

FIRM_A = '勤業眾信聯合'

# BD/McCrary critical values (two-sided, alpha=0.05)
Z_CRIT = 1.96


def bd_mccrary(values, bin_width, lo=None, hi=None):
    """
    Compute Burgstahler-Dichev standardized deviations per bin.

    For each bin i with count n_i:
      expected = 0.5 * (n_{i-1} + n_{i+1})
      SE       = sqrt(N*p_i*(1-p_i) + 0.25*N*(p_{i-1}+p_{i+1})*(1-p_{i-1}-p_{i+1}))
      Z_i      = (n_i - expected) / SE

    Returns arrays of (bin_centers, counts, z_scores, expected).
    """
    arr = np.asarray(values, dtype=float)
    arr = arr[~np.isnan(arr)]
    if lo is None:
        lo = float(np.floor(arr.min() / bin_width) * bin_width)
    if hi is None:
        hi = float(np.ceil(arr.max() / bin_width) * bin_width)
    edges = np.arange(lo, hi + bin_width, bin_width)
    counts, _ = np.histogram(arr, bins=edges)
    centers = (edges[:-1] + edges[1:]) / 2.0

    N = counts.sum()
    p = counts / N if N else counts.astype(float)

    n_bins = len(counts)
    z = np.full(n_bins, np.nan)
    expected = np.full(n_bins, np.nan)

    for i in range(1, n_bins - 1):
        p_lo = p[i - 1]
        p_hi = p[i + 1]
        exp_i = 0.5 * (counts[i - 1] + counts[i + 1])
        var_i = (N * p[i] * (1 - p[i])
                 + 0.25 * N * (p_lo + p_hi) * (1 - p_lo - p_hi))
        if var_i <= 0:
            continue
        se = np.sqrt(var_i)
        z[i] = (counts[i] - exp_i) / se
        expected[i] = exp_i

    return centers, counts, z, expected


def find_transition(centers, z, direction='neg_to_pos'):
    """
    Find the first bin pair where Z_{i-1} significantly negative and
    Z_i significantly positive (or vice versa).

    direction='neg_to_pos' -> threshold where hand-signed density drops
                              (below expectation) and non-hand-signed
                              density rises (above expectation). For
                              cosine similarity, this transition is
                              expected around the separation point, so
                              the threshold sits between centers[i-1]
                              and centers[i].
    """
    transitions = []
    for i in range(1, len(z)):
        if np.isnan(z[i - 1]) or np.isnan(z[i]):
            continue
        if direction == 'neg_to_pos':
            if z[i - 1] < -Z_CRIT and z[i] > Z_CRIT:
                transitions.append({
                    'idx': int(i),
                    'threshold_between': float(
                        (centers[i - 1] + centers[i]) / 2.0),
                    'z_below': float(z[i - 1]),
                    'z_above': float(z[i]),
                    'left_center': float(centers[i - 1]),
                    'right_center': float(centers[i]),
                })
        else:  # pos_to_neg
            if z[i - 1] > Z_CRIT and z[i] < -Z_CRIT:
                transitions.append({
                    'idx': int(i),
                    'threshold_between': float(
                        (centers[i - 1] + centers[i]) / 2.0),
                    'z_above': float(z[i - 1]),
                    'z_below': float(z[i]),
                    'left_center': float(centers[i - 1]),
                    'right_center': float(centers[i]),
                })
    return transitions


def plot_bd(centers, counts, z, expected, title, out_path, threshold=None):
    fig, axes = plt.subplots(2, 1, figsize=(11, 7), sharex=True)

    ax = axes[0]
    ax.bar(centers, counts, width=(centers[1] - centers[0]) * 0.9,
           color='steelblue', alpha=0.6, edgecolor='white', label='Observed')
    mask = ~np.isnan(expected)
    ax.plot(centers[mask], expected[mask], 'r-', lw=1.5,
            label='Expected (smooth null)')
    ax.set_ylabel('Count')
    ax.set_title(title)
    ax.legend()
    if threshold is not None:
        ax.axvline(threshold, color='green', ls='--', lw=2,
                   label=f'Threshold≈{threshold:.4f}')

    ax = axes[1]
    ax.axhline(0, color='black', lw=0.5)
    ax.axhline(Z_CRIT, color='red', ls=':', alpha=0.7,
               label=f'±{Z_CRIT} critical')
    ax.axhline(-Z_CRIT, color='red', ls=':', alpha=0.7)
    colors = ['coral' if zi > Z_CRIT else 'steelblue' if zi < -Z_CRIT
              else 'lightgray' for zi in z]
    ax.bar(centers, z, width=(centers[1] - centers[0]) * 0.9, color=colors,
           edgecolor='black', lw=0.3)
    ax.set_xlabel('Value')
    ax.set_ylabel('Z statistic')
    ax.legend()
    if threshold is not None:
        ax.axvline(threshold, color='green', ls='--', lw=2)

    plt.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close()


def fetch(label):
    conn = sqlite3.connect(DB)
    cur = conn.cursor()
    if label == 'firm_a_cosine':
        cur.execute('''
            SELECT s.max_similarity_to_same_accountant
            FROM signatures s
            JOIN accountants a ON s.assigned_accountant = a.name
            WHERE a.firm = ? AND s.max_similarity_to_same_accountant IS NOT NULL
        ''', (FIRM_A,))
    elif label == 'firm_a_dhash':
        cur.execute('''
            SELECT s.min_dhash_independent
            FROM signatures s
            JOIN accountants a ON s.assigned_accountant = a.name
            WHERE a.firm = ? AND s.min_dhash_independent IS NOT NULL
        ''', (FIRM_A,))
    elif label == 'full_cosine':
        cur.execute('''
            SELECT max_similarity_to_same_accountant FROM signatures
            WHERE max_similarity_to_same_accountant IS NOT NULL
        ''')
    elif label == 'full_dhash':
        cur.execute('''
            SELECT min_dhash_independent FROM signatures
            WHERE min_dhash_independent IS NOT NULL
        ''')
    else:
        raise ValueError(label)
    vals = [r[0] for r in cur.fetchall() if r[0] is not None]
    conn.close()
    return np.array(vals, dtype=float)


def main():
    print('='*70)
    print('Script 16: Burgstahler-Dichev / McCrary Discontinuity Test')
    print('='*70)

    cases = [
        ('firm_a_cosine', 0.005, 'Firm A cosine max-similarity', 'neg_to_pos'),
        ('firm_a_dhash', 1.0, 'Firm A independent min dHash', 'pos_to_neg'),
        ('full_cosine', 0.005, 'Full-sample cosine max-similarity',
         'neg_to_pos'),
        ('full_dhash', 1.0, 'Full-sample independent min dHash', 'pos_to_neg'),
    ]

    all_results = {}
    for key, bw, label, direction in cases:
        print(f'\n[{label}] bin width={bw}')
        arr = fetch(key)
        print(f'  N = {len(arr):,}')
        centers, counts, z, expected = bd_mccrary(arr, bw)
        transitions = find_transition(centers, z, direction=direction)

        # Summarize
        if transitions:
            # Choose the most extreme (highest |z_above * z_below|) transition
            best = max(transitions,
                       key=lambda t: abs(t.get('z_above', 0))
                                     + abs(t.get('z_below', 0)))
            threshold = best['threshold_between']
            print(f'  {len(transitions)} candidate transition(s); '
                  f'best at {threshold:.4f}')
        else:
            best = None
            threshold = None
            print('  No significant transition detected (no Z^- next to Z^+)')

        # Plot
        png = OUT / f'bd_mccrary_{key}.png'
        plot_bd(centers, counts, z, expected, label, png, threshold=threshold)
        print(f'  plot: {png}')

        all_results[key] = {
            'label': label,
            'n': int(len(arr)),
            'bin_width': float(bw),
            'direction': direction,
            'n_bins': int(len(centers)),
            'bin_centers': [float(c) for c in centers],
            'counts': [int(c) for c in counts],
            'z_scores': [None if np.isnan(zi) else float(zi) for zi in z],
            'transitions': transitions,
            'best_transition': best,
            'threshold': threshold,
        }

    # Write JSON
    json_path = OUT / 'bd_mccrary_results.json'
    with open(json_path, 'w') as f:
        json.dump({
            'generated_at': datetime.now().isoformat(),
            'z_critical': Z_CRIT,
            'results': all_results,
        }, f, indent=2, ensure_ascii=False)
    print(f'\nJSON: {json_path}')

    # Markdown
    md = [
        '# Burgstahler-Dichev / McCrary Discontinuity Test Report',
        f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        '',
        '## Method',
        '',
        'For each bin i of width δ, under the null of distributional',
        'smoothness the expected count is the average of neighbours,',
        'and the standardized deviation',
        '',
        '  Z_i = (n_i - 0.5*(n_{i-1}+n_{i+1})) / SE',
        '',
        'is approximately N(0,1). We flag a transition when Z_{i-1} < -1.96',
        'and Z_i > 1.96 (or reversed, depending on the scale direction).',
        'The threshold is taken at the midpoint of the two bin centres.',
        '',
        '## Results',
        '',
        '| Test | N | bin width | Transitions | Threshold |',
        '|------|---|-----------|-------------|-----------|',
    ]
    for r in all_results.values():
        thr = (f"{r['threshold']:.4f}" if r['threshold'] is not None
               else '—')
        md.append(
            f"| {r['label']} | {r['n']:,} | {r['bin_width']} | "
            f"{len(r['transitions'])} | {thr} |"
        )
    md += [
        '',
        '## Notes',
        '',
        '* For cosine (direction `neg_to_pos`), the transition marks the',
        "  boundary below which hand-signed dominates and above which",
        '  non-hand-signed replication dominates.',
        '* For dHash (direction `pos_to_neg`), the transition marks the',
        "  boundary below which replication dominates (small distances)",
        '  and above which hand-signed variation dominates.',
        '* Multiple candidate transitions are ranked by total |Z| magnitude',
        '  on both sides of the boundary; the strongest is reported.',
        '* Absence of a significant transition is itself informative: it',
        '  is consistent with a single dominant generative mechanism (e.g.',
        '  Firm A, a replication-dominated population per interviews with',
        '  multiple Firm A accountants -- most use replication, a minority',
        '  may hand-sign).',
    ]
    md_path = OUT / 'bd_mccrary_report.md'
    md_path.write_text('\n'.join(md), encoding='utf-8')
    print(f'Report: {md_path}')


if __name__ == '__main__':
    main()