68689c9f9b
Interview evidence from multiple Firm A accountants confirms that MOST use replication (stamping / firm-level e-signing) but a MINORITY may still hand-sign. Firm A is therefore a "replication-dominated" population, not a "pure" one. This framing is consistent with: - 92.5% of Firm A signatures exceed cosine 0.95 (majority replication) - The long left tail (~7%) captures the minority hand-signers, not scan noise or preprocessing artifacts - Hartigan dip test: Firm A cosine unimodal long-tail (p=0.17) - Accountant-level GMM: of 180 Firm A accountants, 139 cluster in C1 (high-replication) and 32 in C2 (middle band = minority hand-signers) Updates docstrings and report text in Scripts 15, 16, 18, 19 to match. Partner v3's "near-universal non-hand-signing" language corrected. Script 19 regenerated with the updated text. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
321 lines
12 KiB
Python
321 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Script 16: Burgstahler-Dichev / McCrary Discontinuity Test
|
|
==========================================================
|
|
Tests for a discontinuity in the empirical density of similarity scores,
|
|
following:
|
|
- Burgstahler & Dichev (1997) - earnings-management style smoothness test
|
|
- McCrary (2008) - rigorous density-discontinuity asymptotics
|
|
|
|
Idea:
|
|
Discretize the distribution into equal-width bins. For each bin i compute
|
|
the standardized deviation Z_i between observed count and the smooth
|
|
expectation (average of neighbours). Under H0 (distributional smoothness),
|
|
Z_i ~ N(0,1). A threshold is identified at the transition where Z_{i-1}
|
|
is significantly negative (below expectation) next to Z_i significantly
|
|
positive (above expectation) -- marking the boundary between two
|
|
generative mechanisms (hand-signed vs non-hand-signed).
|
|
|
|
Inputs:
|
|
- Firm A cosine max-similarity and independent min dHash
|
|
- Full-sample cosine and dHash (for comparison)
|
|
|
|
Output:
|
|
reports/bd_mccrary/bd_mccrary_report.md
|
|
reports/bd_mccrary/bd_mccrary_results.json
|
|
reports/bd_mccrary/bd_mccrary_<variant>.png (overlay plots)
|
|
"""
|
|
|
|
import sqlite3
|
|
import json
|
|
import numpy as np
|
|
import matplotlib
|
|
matplotlib.use('Agg')
|
|
import matplotlib.pyplot as plt
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
|
|
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
|
|
OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/bd_mccrary')
|
|
OUT.mkdir(parents=True, exist_ok=True)
|
|
|
|
FIRM_A = '勤業眾信聯合'
|
|
|
|
# BD/McCrary critical values (two-sided, alpha=0.05)
|
|
Z_CRIT = 1.96
|
|
|
|
|
|
def bd_mccrary(values, bin_width, lo=None, hi=None):
|
|
"""
|
|
Compute Burgstahler-Dichev standardized deviations per bin.
|
|
|
|
For each bin i with count n_i:
|
|
expected = 0.5 * (n_{i-1} + n_{i+1})
|
|
SE = sqrt(N*p_i*(1-p_i) + 0.25*N*(p_{i-1}+p_{i+1})*(1-p_{i-1}-p_{i+1}))
|
|
Z_i = (n_i - expected) / SE
|
|
|
|
Returns arrays of (bin_centers, counts, z_scores, expected).
|
|
"""
|
|
arr = np.asarray(values, dtype=float)
|
|
arr = arr[~np.isnan(arr)]
|
|
if lo is None:
|
|
lo = float(np.floor(arr.min() / bin_width) * bin_width)
|
|
if hi is None:
|
|
hi = float(np.ceil(arr.max() / bin_width) * bin_width)
|
|
edges = np.arange(lo, hi + bin_width, bin_width)
|
|
counts, _ = np.histogram(arr, bins=edges)
|
|
centers = (edges[:-1] + edges[1:]) / 2.0
|
|
|
|
N = counts.sum()
|
|
p = counts / N if N else counts.astype(float)
|
|
|
|
n_bins = len(counts)
|
|
z = np.full(n_bins, np.nan)
|
|
expected = np.full(n_bins, np.nan)
|
|
|
|
for i in range(1, n_bins - 1):
|
|
p_lo = p[i - 1]
|
|
p_hi = p[i + 1]
|
|
exp_i = 0.5 * (counts[i - 1] + counts[i + 1])
|
|
var_i = (N * p[i] * (1 - p[i])
|
|
+ 0.25 * N * (p_lo + p_hi) * (1 - p_lo - p_hi))
|
|
if var_i <= 0:
|
|
continue
|
|
se = np.sqrt(var_i)
|
|
z[i] = (counts[i] - exp_i) / se
|
|
expected[i] = exp_i
|
|
|
|
return centers, counts, z, expected
|
|
|
|
|
|
def find_transition(centers, z, direction='neg_to_pos'):
|
|
"""
|
|
Find the first bin pair where Z_{i-1} significantly negative and
|
|
Z_i significantly positive (or vice versa).
|
|
|
|
direction='neg_to_pos' -> threshold where hand-signed density drops
|
|
(below expectation) and non-hand-signed
|
|
density rises (above expectation). For
|
|
cosine similarity, this transition is
|
|
expected around the separation point, so
|
|
the threshold sits between centers[i-1]
|
|
and centers[i].
|
|
"""
|
|
transitions = []
|
|
for i in range(1, len(z)):
|
|
if np.isnan(z[i - 1]) or np.isnan(z[i]):
|
|
continue
|
|
if direction == 'neg_to_pos':
|
|
if z[i - 1] < -Z_CRIT and z[i] > Z_CRIT:
|
|
transitions.append({
|
|
'idx': int(i),
|
|
'threshold_between': float(
|
|
(centers[i - 1] + centers[i]) / 2.0),
|
|
'z_below': float(z[i - 1]),
|
|
'z_above': float(z[i]),
|
|
'left_center': float(centers[i - 1]),
|
|
'right_center': float(centers[i]),
|
|
})
|
|
else: # pos_to_neg
|
|
if z[i - 1] > Z_CRIT and z[i] < -Z_CRIT:
|
|
transitions.append({
|
|
'idx': int(i),
|
|
'threshold_between': float(
|
|
(centers[i - 1] + centers[i]) / 2.0),
|
|
'z_above': float(z[i - 1]),
|
|
'z_below': float(z[i]),
|
|
'left_center': float(centers[i - 1]),
|
|
'right_center': float(centers[i]),
|
|
})
|
|
return transitions
|
|
|
|
|
|
def plot_bd(centers, counts, z, expected, title, out_path, threshold=None):
|
|
fig, axes = plt.subplots(2, 1, figsize=(11, 7), sharex=True)
|
|
|
|
ax = axes[0]
|
|
ax.bar(centers, counts, width=(centers[1] - centers[0]) * 0.9,
|
|
color='steelblue', alpha=0.6, edgecolor='white', label='Observed')
|
|
mask = ~np.isnan(expected)
|
|
ax.plot(centers[mask], expected[mask], 'r-', lw=1.5,
|
|
label='Expected (smooth null)')
|
|
ax.set_ylabel('Count')
|
|
ax.set_title(title)
|
|
ax.legend()
|
|
if threshold is not None:
|
|
ax.axvline(threshold, color='green', ls='--', lw=2,
|
|
label=f'Threshold≈{threshold:.4f}')
|
|
|
|
ax = axes[1]
|
|
ax.axhline(0, color='black', lw=0.5)
|
|
ax.axhline(Z_CRIT, color='red', ls=':', alpha=0.7,
|
|
label=f'±{Z_CRIT} critical')
|
|
ax.axhline(-Z_CRIT, color='red', ls=':', alpha=0.7)
|
|
colors = ['coral' if zi > Z_CRIT else 'steelblue' if zi < -Z_CRIT
|
|
else 'lightgray' for zi in z]
|
|
ax.bar(centers, z, width=(centers[1] - centers[0]) * 0.9, color=colors,
|
|
edgecolor='black', lw=0.3)
|
|
ax.set_xlabel('Value')
|
|
ax.set_ylabel('Z statistic')
|
|
ax.legend()
|
|
if threshold is not None:
|
|
ax.axvline(threshold, color='green', ls='--', lw=2)
|
|
|
|
plt.tight_layout()
|
|
fig.savefig(out_path, dpi=150)
|
|
plt.close()
|
|
|
|
|
|
def fetch(label):
|
|
conn = sqlite3.connect(DB)
|
|
cur = conn.cursor()
|
|
if label == 'firm_a_cosine':
|
|
cur.execute('''
|
|
SELECT s.max_similarity_to_same_accountant
|
|
FROM signatures s
|
|
JOIN accountants a ON s.assigned_accountant = a.name
|
|
WHERE a.firm = ? AND s.max_similarity_to_same_accountant IS NOT NULL
|
|
''', (FIRM_A,))
|
|
elif label == 'firm_a_dhash':
|
|
cur.execute('''
|
|
SELECT s.min_dhash_independent
|
|
FROM signatures s
|
|
JOIN accountants a ON s.assigned_accountant = a.name
|
|
WHERE a.firm = ? AND s.min_dhash_independent IS NOT NULL
|
|
''', (FIRM_A,))
|
|
elif label == 'full_cosine':
|
|
cur.execute('''
|
|
SELECT max_similarity_to_same_accountant FROM signatures
|
|
WHERE max_similarity_to_same_accountant IS NOT NULL
|
|
''')
|
|
elif label == 'full_dhash':
|
|
cur.execute('''
|
|
SELECT min_dhash_independent FROM signatures
|
|
WHERE min_dhash_independent IS NOT NULL
|
|
''')
|
|
else:
|
|
raise ValueError(label)
|
|
vals = [r[0] for r in cur.fetchall() if r[0] is not None]
|
|
conn.close()
|
|
return np.array(vals, dtype=float)
|
|
|
|
|
|
def main():
|
|
print('='*70)
|
|
print('Script 16: Burgstahler-Dichev / McCrary Discontinuity Test')
|
|
print('='*70)
|
|
|
|
cases = [
|
|
('firm_a_cosine', 0.005, 'Firm A cosine max-similarity', 'neg_to_pos'),
|
|
('firm_a_dhash', 1.0, 'Firm A independent min dHash', 'pos_to_neg'),
|
|
('full_cosine', 0.005, 'Full-sample cosine max-similarity',
|
|
'neg_to_pos'),
|
|
('full_dhash', 1.0, 'Full-sample independent min dHash', 'pos_to_neg'),
|
|
]
|
|
|
|
all_results = {}
|
|
for key, bw, label, direction in cases:
|
|
print(f'\n[{label}] bin width={bw}')
|
|
arr = fetch(key)
|
|
print(f' N = {len(arr):,}')
|
|
centers, counts, z, expected = bd_mccrary(arr, bw)
|
|
transitions = find_transition(centers, z, direction=direction)
|
|
|
|
# Summarize
|
|
if transitions:
|
|
# Choose the most extreme (highest |z_above * z_below|) transition
|
|
best = max(transitions,
|
|
key=lambda t: abs(t.get('z_above', 0))
|
|
+ abs(t.get('z_below', 0)))
|
|
threshold = best['threshold_between']
|
|
print(f' {len(transitions)} candidate transition(s); '
|
|
f'best at {threshold:.4f}')
|
|
else:
|
|
best = None
|
|
threshold = None
|
|
print(' No significant transition detected (no Z^- next to Z^+)')
|
|
|
|
# Plot
|
|
png = OUT / f'bd_mccrary_{key}.png'
|
|
plot_bd(centers, counts, z, expected, label, png, threshold=threshold)
|
|
print(f' plot: {png}')
|
|
|
|
all_results[key] = {
|
|
'label': label,
|
|
'n': int(len(arr)),
|
|
'bin_width': float(bw),
|
|
'direction': direction,
|
|
'n_bins': int(len(centers)),
|
|
'bin_centers': [float(c) for c in centers],
|
|
'counts': [int(c) for c in counts],
|
|
'z_scores': [None if np.isnan(zi) else float(zi) for zi in z],
|
|
'transitions': transitions,
|
|
'best_transition': best,
|
|
'threshold': threshold,
|
|
}
|
|
|
|
# Write JSON
|
|
json_path = OUT / 'bd_mccrary_results.json'
|
|
with open(json_path, 'w') as f:
|
|
json.dump({
|
|
'generated_at': datetime.now().isoformat(),
|
|
'z_critical': Z_CRIT,
|
|
'results': all_results,
|
|
}, f, indent=2, ensure_ascii=False)
|
|
print(f'\nJSON: {json_path}')
|
|
|
|
# Markdown
|
|
md = [
|
|
'# Burgstahler-Dichev / McCrary Discontinuity Test Report',
|
|
f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
|
|
'',
|
|
'## Method',
|
|
'',
|
|
'For each bin i of width δ, under the null of distributional',
|
|
'smoothness the expected count is the average of neighbours,',
|
|
'and the standardized deviation',
|
|
'',
|
|
' Z_i = (n_i - 0.5*(n_{i-1}+n_{i+1})) / SE',
|
|
'',
|
|
'is approximately N(0,1). We flag a transition when Z_{i-1} < -1.96',
|
|
'and Z_i > 1.96 (or reversed, depending on the scale direction).',
|
|
'The threshold is taken at the midpoint of the two bin centres.',
|
|
'',
|
|
'## Results',
|
|
'',
|
|
'| Test | N | bin width | Transitions | Threshold |',
|
|
'|------|---|-----------|-------------|-----------|',
|
|
]
|
|
for r in all_results.values():
|
|
thr = (f"{r['threshold']:.4f}" if r['threshold'] is not None
|
|
else '—')
|
|
md.append(
|
|
f"| {r['label']} | {r['n']:,} | {r['bin_width']} | "
|
|
f"{len(r['transitions'])} | {thr} |"
|
|
)
|
|
md += [
|
|
'',
|
|
'## Notes',
|
|
'',
|
|
'* For cosine (direction `neg_to_pos`), the transition marks the',
|
|
" boundary below which hand-signed dominates and above which",
|
|
' non-hand-signed replication dominates.',
|
|
'* For dHash (direction `pos_to_neg`), the transition marks the',
|
|
" boundary below which replication dominates (small distances)",
|
|
' and above which hand-signed variation dominates.',
|
|
'* Multiple candidate transitions are ranked by total |Z| magnitude',
|
|
' on both sides of the boundary; the strongest is reported.',
|
|
'* Absence of a significant transition is itself informative: it',
|
|
' is consistent with a single dominant generative mechanism (e.g.',
|
|
' Firm A, a replication-dominated population per interviews with',
|
|
' multiple Firm A accountants -- most use replication, a minority',
|
|
' may hand-sign).',
|
|
]
|
|
md_path = OUT / 'bd_mccrary_report.md'
|
|
md_path.write_text('\n'.join(md), encoding='utf-8')
|
|
print(f'Report: {md_path}')
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|