#!/usr/bin/env python3 """ Script 16: Burgstahler-Dichev / McCrary Discontinuity Test ========================================================== Tests for a discontinuity in the empirical density of similarity scores, following: - Burgstahler & Dichev (1997) - earnings-management style smoothness test - McCrary (2008) - rigorous density-discontinuity asymptotics Idea: Discretize the distribution into equal-width bins. For each bin i compute the standardized deviation Z_i between observed count and the smooth expectation (average of neighbours). Under H0 (distributional smoothness), Z_i ~ N(0,1). A threshold is identified at the transition where Z_{i-1} is significantly negative (below expectation) next to Z_i significantly positive (above expectation) -- marking the boundary between two generative mechanisms (hand-signed vs non-hand-signed). Inputs: - Firm A cosine max-similarity and independent min dHash - Full-sample cosine and dHash (for comparison) Output: reports/bd_mccrary/bd_mccrary_report.md reports/bd_mccrary/bd_mccrary_results.json reports/bd_mccrary/bd_mccrary_.png (overlay plots) """ import sqlite3 import json import numpy as np import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt from pathlib import Path from datetime import datetime DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db' OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/bd_mccrary') OUT.mkdir(parents=True, exist_ok=True) FIRM_A = '勤業眾信聯合' # BD/McCrary critical values (two-sided, alpha=0.05) Z_CRIT = 1.96 def bd_mccrary(values, bin_width, lo=None, hi=None): """ Compute Burgstahler-Dichev standardized deviations per bin. For each bin i with count n_i: expected = 0.5 * (n_{i-1} + n_{i+1}) SE = sqrt(N*p_i*(1-p_i) + 0.25*N*(p_{i-1}+p_{i+1})*(1-p_{i-1}-p_{i+1})) Z_i = (n_i - expected) / SE Returns arrays of (bin_centers, counts, z_scores, expected). """ arr = np.asarray(values, dtype=float) arr = arr[~np.isnan(arr)] if lo is None: lo = float(np.floor(arr.min() / bin_width) * bin_width) if hi is None: hi = float(np.ceil(arr.max() / bin_width) * bin_width) edges = np.arange(lo, hi + bin_width, bin_width) counts, _ = np.histogram(arr, bins=edges) centers = (edges[:-1] + edges[1:]) / 2.0 N = counts.sum() p = counts / N if N else counts.astype(float) n_bins = len(counts) z = np.full(n_bins, np.nan) expected = np.full(n_bins, np.nan) for i in range(1, n_bins - 1): p_lo = p[i - 1] p_hi = p[i + 1] exp_i = 0.5 * (counts[i - 1] + counts[i + 1]) var_i = (N * p[i] * (1 - p[i]) + 0.25 * N * (p_lo + p_hi) * (1 - p_lo - p_hi)) if var_i <= 0: continue se = np.sqrt(var_i) z[i] = (counts[i] - exp_i) / se expected[i] = exp_i return centers, counts, z, expected def find_transition(centers, z, direction='neg_to_pos'): """ Find the first bin pair where Z_{i-1} significantly negative and Z_i significantly positive (or vice versa). direction='neg_to_pos' -> threshold where hand-signed density drops (below expectation) and non-hand-signed density rises (above expectation). For cosine similarity, this transition is expected around the separation point, so the threshold sits between centers[i-1] and centers[i]. """ transitions = [] for i in range(1, len(z)): if np.isnan(z[i - 1]) or np.isnan(z[i]): continue if direction == 'neg_to_pos': if z[i - 1] < -Z_CRIT and z[i] > Z_CRIT: transitions.append({ 'idx': int(i), 'threshold_between': float( (centers[i - 1] + centers[i]) / 2.0), 'z_below': float(z[i - 1]), 'z_above': float(z[i]), 'left_center': float(centers[i - 1]), 'right_center': float(centers[i]), }) else: # pos_to_neg if z[i - 1] > Z_CRIT and z[i] < -Z_CRIT: transitions.append({ 'idx': int(i), 'threshold_between': float( (centers[i - 1] + centers[i]) / 2.0), 'z_above': float(z[i - 1]), 'z_below': float(z[i]), 'left_center': float(centers[i - 1]), 'right_center': float(centers[i]), }) return transitions def plot_bd(centers, counts, z, expected, title, out_path, threshold=None): fig, axes = plt.subplots(2, 1, figsize=(11, 7), sharex=True) ax = axes[0] ax.bar(centers, counts, width=(centers[1] - centers[0]) * 0.9, color='steelblue', alpha=0.6, edgecolor='white', label='Observed') mask = ~np.isnan(expected) ax.plot(centers[mask], expected[mask], 'r-', lw=1.5, label='Expected (smooth null)') ax.set_ylabel('Count') ax.set_title(title) ax.legend() if threshold is not None: ax.axvline(threshold, color='green', ls='--', lw=2, label=f'Threshold≈{threshold:.4f}') ax = axes[1] ax.axhline(0, color='black', lw=0.5) ax.axhline(Z_CRIT, color='red', ls=':', alpha=0.7, label=f'±{Z_CRIT} critical') ax.axhline(-Z_CRIT, color='red', ls=':', alpha=0.7) colors = ['coral' if zi > Z_CRIT else 'steelblue' if zi < -Z_CRIT else 'lightgray' for zi in z] ax.bar(centers, z, width=(centers[1] - centers[0]) * 0.9, color=colors, edgecolor='black', lw=0.3) ax.set_xlabel('Value') ax.set_ylabel('Z statistic') ax.legend() if threshold is not None: ax.axvline(threshold, color='green', ls='--', lw=2) plt.tight_layout() fig.savefig(out_path, dpi=150) plt.close() def fetch(label): conn = sqlite3.connect(DB) cur = conn.cursor() if label == 'firm_a_cosine': cur.execute(''' SELECT s.max_similarity_to_same_accountant FROM signatures s JOIN accountants a ON s.assigned_accountant = a.name WHERE a.firm = ? AND s.max_similarity_to_same_accountant IS NOT NULL ''', (FIRM_A,)) elif label == 'firm_a_dhash': cur.execute(''' SELECT s.min_dhash_independent FROM signatures s JOIN accountants a ON s.assigned_accountant = a.name WHERE a.firm = ? AND s.min_dhash_independent IS NOT NULL ''', (FIRM_A,)) elif label == 'full_cosine': cur.execute(''' SELECT max_similarity_to_same_accountant FROM signatures WHERE max_similarity_to_same_accountant IS NOT NULL ''') elif label == 'full_dhash': cur.execute(''' SELECT min_dhash_independent FROM signatures WHERE min_dhash_independent IS NOT NULL ''') else: raise ValueError(label) vals = [r[0] for r in cur.fetchall() if r[0] is not None] conn.close() return np.array(vals, dtype=float) def main(): print('='*70) print('Script 16: Burgstahler-Dichev / McCrary Discontinuity Test') print('='*70) cases = [ ('firm_a_cosine', 0.005, 'Firm A cosine max-similarity', 'neg_to_pos'), ('firm_a_dhash', 1.0, 'Firm A independent min dHash', 'pos_to_neg'), ('full_cosine', 0.005, 'Full-sample cosine max-similarity', 'neg_to_pos'), ('full_dhash', 1.0, 'Full-sample independent min dHash', 'pos_to_neg'), ] all_results = {} for key, bw, label, direction in cases: print(f'\n[{label}] bin width={bw}') arr = fetch(key) print(f' N = {len(arr):,}') centers, counts, z, expected = bd_mccrary(arr, bw) transitions = find_transition(centers, z, direction=direction) # Summarize if transitions: # Choose the most extreme (highest |z_above * z_below|) transition best = max(transitions, key=lambda t: abs(t.get('z_above', 0)) + abs(t.get('z_below', 0))) threshold = best['threshold_between'] print(f' {len(transitions)} candidate transition(s); ' f'best at {threshold:.4f}') else: best = None threshold = None print(' No significant transition detected (no Z^- next to Z^+)') # Plot png = OUT / f'bd_mccrary_{key}.png' plot_bd(centers, counts, z, expected, label, png, threshold=threshold) print(f' plot: {png}') all_results[key] = { 'label': label, 'n': int(len(arr)), 'bin_width': float(bw), 'direction': direction, 'n_bins': int(len(centers)), 'bin_centers': [float(c) for c in centers], 'counts': [int(c) for c in counts], 'z_scores': [None if np.isnan(zi) else float(zi) for zi in z], 'transitions': transitions, 'best_transition': best, 'threshold': threshold, } # Write JSON json_path = OUT / 'bd_mccrary_results.json' with open(json_path, 'w') as f: json.dump({ 'generated_at': datetime.now().isoformat(), 'z_critical': Z_CRIT, 'results': all_results, }, f, indent=2, ensure_ascii=False) print(f'\nJSON: {json_path}') # Markdown md = [ '# Burgstahler-Dichev / McCrary Discontinuity Test Report', f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", '', '## Method', '', 'For each bin i of width δ, under the null of distributional', 'smoothness the expected count is the average of neighbours,', 'and the standardized deviation', '', ' Z_i = (n_i - 0.5*(n_{i-1}+n_{i+1})) / SE', '', 'is approximately N(0,1). We flag a transition when Z_{i-1} < -1.96', 'and Z_i > 1.96 (or reversed, depending on the scale direction).', 'The threshold is taken at the midpoint of the two bin centres.', '', '## Results', '', '| Test | N | bin width | Transitions | Threshold |', '|------|---|-----------|-------------|-----------|', ] for r in all_results.values(): thr = (f"{r['threshold']:.4f}" if r['threshold'] is not None else '—') md.append( f"| {r['label']} | {r['n']:,} | {r['bin_width']} | " f"{len(r['transitions'])} | {thr} |" ) md += [ '', '## Notes', '', '* For cosine (direction `neg_to_pos`), the transition marks the', " boundary below which hand-signed dominates and above which", ' non-hand-signed replication dominates.', '* For dHash (direction `pos_to_neg`), the transition marks the', " boundary below which replication dominates (small distances)", ' and above which hand-signed variation dominates.', '* Multiple candidate transitions are ranked by total |Z| magnitude', ' on both sides of the boundary; the strongest is reported.', '* Absence of a significant transition is itself informative: it', ' is consistent with a single generative mechanism (e.g. Firm A', ' which is near-universally non-hand-signed).', ] md_path = OUT / 'bd_mccrary_report.md' md_path.write_text('\n'.join(md), encoding='utf-8') print(f'Report: {md_path}') if __name__ == '__main__': main()