Add script 32: non-Firm-A calibration spike (verdict C with twist)
Spike for the from-outside-of-firmA branch. Runs the three-method threshold framework (KDE+dip, BD/McCrary, Beta mixture / logit-GMM, 2D-GMM) on three subsets: Subset I big4_non_A KPMG+PwC+EY pooled (266 CPAs, 89.9k sigs) Subset II all_non_A every firm except Firm A (515 CPAs, 108k sigs) Subset III firm_A reference baseline (171 CPAs, 60.4k sigs) Plus pre_2018 / post_2020 time-stratified secondary on subsets I and II. Result: verdict C -- every subset is unimodal at the dip-test level (dip p > 0.76 across the board), including Firm A itself. Time stratification does not recover bimodality. Cross-subset Beta-2 cosine crossings: Firm A 0.977, big4_non_A 0.930, all_non_A 0.938; Paper A's published 0.945 sits between the two mass centers, indicating the published "natural threshold" is effectively a between-firm separator rather than a within-pool mechanism boundary. This finding motivates a follow-up reverse-anchor spike (script 33). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,778 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Script 32: Non-Firm-A Calibration Spike
|
||||
========================================
|
||||
Research question (branch ``from-outside-of-firmA``):
|
||||
If we throw away Firm A entirely, can we still derive meaningful
|
||||
cosine / dHash thresholds at the accountant level?
|
||||
|
||||
Three subset analyses (per user's "1. 我們可以分開做" clarification):
|
||||
|
||||
Subset I — Big-4 minus Firm A: KPMG + PwC + EY pooled
|
||||
Subset II — All non-Firm-A firms: every firm except 勤業眾信聯合
|
||||
Subset III (baseline reference) — Firm A only
|
||||
|
||||
Each subset is run through Script 20's three-method framework
|
||||
(KDE+dip, BD/McCrary, 2-component Beta mixture + logit-GMM) plus the
|
||||
2D-GMM 2-comp marginal crossing from Script 18, on the
|
||||
per-accountant means of:
|
||||
* cos_mean = AVG(s.max_similarity_to_same_accountant)
|
||||
* dh_mean = AVG(s.min_dhash_independent)
|
||||
|
||||
Time-stratified contingency analysis:
|
||||
If Subset I/II fail to expose bimodality, we re-load each
|
||||
accountant's signatures stratified into pre-2018 vs post-2020
|
||||
sub-buckets (>=5 sigs per bucket required) and re-run the
|
||||
three-method framework on the resulting bucket-level means.
|
||||
This tests whether the time axis can substitute for the
|
||||
firm-anchor axis.
|
||||
|
||||
Verdict (A/B/C):
|
||||
A Bimodal structure emerges in Subset I or II without time
|
||||
stratification, with crossings within +-0.02 (cos) / +-2.0 (dh)
|
||||
of Paper A baselines (0.945, 8.10) and dip-test multimodal at
|
||||
alpha=0.05. -> "outside-Firm-A calibration is viable"
|
||||
B Bimodal structure only emerges after time stratification.
|
||||
-> "time axis substitutes for firm anchor; v3.21 robustness or
|
||||
Paper C with time-stratified design"
|
||||
C No bimodality in either; crossings are unstable / outside
|
||||
plausible range. -> "Firm A is required as anchor; this
|
||||
strengthens Paper A's framing"
|
||||
|
||||
Output:
|
||||
reports/non_firm_a_calibration/
|
||||
non_firm_a_calibration_results.json
|
||||
non_firm_a_calibration_report.md
|
||||
panel_<subset>_<measure>.png
|
||||
"""
|
||||
|
||||
import sqlite3
|
||||
import json
|
||||
import numpy as np
|
||||
import matplotlib
|
||||
matplotlib.use('Agg')
|
||||
import matplotlib.pyplot as plt
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from scipy import stats
|
||||
from scipy.signal import find_peaks
|
||||
from scipy.optimize import brentq
|
||||
from sklearn.mixture import GaussianMixture
|
||||
import diptest
|
||||
|
||||
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
|
||||
OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/'
|
||||
'non_firm_a_calibration')
|
||||
OUT.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
EPS = 1e-6
|
||||
Z_CRIT = 1.96
|
||||
MIN_SIGS = 10
|
||||
MIN_SIGS_PER_BUCKET = 5
|
||||
|
||||
FIRM_A = '勤業眾信聯合' # Deloitte
|
||||
BIG4_NON_A = ('安侯建業聯合', '資誠聯合', '安永聯合') # KPMG, PwC, EY
|
||||
|
||||
PAPER_A_COS_BASELINE = 0.945
|
||||
PAPER_A_DH_BASELINE = 8.10
|
||||
|
||||
|
||||
# ---------- Loaders ----------
|
||||
def _accountant_means_query(firm_filter_sql, params, time_filter_sql=''):
    """Build the per-accountant means query with optional extra filters.

    Parameters
    ----------
    firm_filter_sql : extra ``AND ...`` clause restricting ``a.firm``.
    params : positional parameters consumed by ``firm_filter_sql``.
    time_filter_sql : optional extra ``AND ...`` clause on ``s.year_month``.

    Returns
    -------
    (sql, bound_params) where bound_params is ``params`` followed by the
    MIN_SIGS value bound to the ``HAVING n >= ?`` placeholder.
    """
    query = f'''
        SELECT s.assigned_accountant,
               AVG(s.max_similarity_to_same_accountant) AS cos_mean,
               AVG(CAST(s.min_dhash_independent AS REAL)) AS dh_mean,
               COUNT(*) AS n
        FROM signatures s
        JOIN accountants a ON s.assigned_accountant = a.name
        WHERE s.assigned_accountant IS NOT NULL
          AND s.max_similarity_to_same_accountant IS NOT NULL
          AND s.min_dhash_independent IS NOT NULL
          {firm_filter_sql}
          {time_filter_sql}
        GROUP BY s.assigned_accountant
        HAVING n >= ?
    '''
    return query, [*params, MIN_SIGS]
|
||||
|
||||
|
||||
def load_subset(label):
    """Load per-accountant mean arrays for one analysis subset.

    Parameters
    ----------
    label : 'big4_non_A' (KPMG+PwC+EY pooled), 'all_non_A' (every firm
        except Firm A), or 'firm_A' (the Firm A reference baseline).

    Returns
    -------
    (cos, dh, n_accountants, n_signatures) where cos/dh are per-accountant
    means over signatures (>= MIN_SIGS per accountant enforced in SQL).

    Raises
    ------
    ValueError for an unknown label.
    """
    # Validate the label and build the filter BEFORE opening the DB, so an
    # invalid label cannot leak an open connection.
    if label == 'big4_non_A':
        firm_filter = 'AND a.firm IN (?, ?, ?)'
        params = list(BIG4_NON_A)
    elif label == 'all_non_A':
        firm_filter = 'AND a.firm IS NOT NULL AND a.firm != ?'
        params = [FIRM_A]
    elif label == 'firm_A':
        firm_filter = 'AND a.firm = ?'
        params = [FIRM_A]
    else:
        raise ValueError(label)
    sql, p = _accountant_means_query(firm_filter, params)
    conn = sqlite3.connect(DB)
    try:
        cur = conn.cursor()
        cur.execute(sql, p)
        rows = cur.fetchall()
    finally:
        # Close even if execute() raises (previously leaked on error).
        conn.close()
    cos = np.array([r[1] for r in rows])
    dh = np.array([r[2] for r in rows])
    n_sigs = int(sum(r[3] for r in rows))
    return cos, dh, len(rows), n_sigs
|
||||
|
||||
|
||||
def load_subset_time_stratified(label, period):
    """Per-accountant means computed only from `period` signatures.

    Parameters
    ----------
    label : 'big4_non_A' or 'all_non_A' (firm_A has no stratified run).
    period : 'pre_2018' (year_month < 2018-01) or
        'post_2020' (year_month >= 2020-01).

    Returns
    -------
    (cos, dh, n_accountants, n_signatures); accountants need only
    MIN_SIGS_PER_BUCKET signatures inside the bucket (looser than the
    pooled MIN_SIGS threshold).

    Raises
    ------
    ValueError for an unknown label or period.
    """
    # Validate everything before touching the DB so bad arguments cannot
    # leak an open connection.
    if period == 'pre_2018':
        time_filter = "AND s.year_month < '2018-01'"
    elif period == 'post_2020':
        time_filter = "AND s.year_month >= '2020-01'"
    else:
        raise ValueError(period)
    if label == 'big4_non_A':
        firm_filter = 'AND a.firm IN (?, ?, ?)'
        params = list(BIG4_NON_A)
    elif label == 'all_non_A':
        firm_filter = 'AND a.firm IS NOT NULL AND a.firm != ?'
        params = [FIRM_A]
    else:
        raise ValueError(label)
    # Reuse the shared query builder (its time_filter_sql hook exists for
    # exactly this case) instead of duplicating the SQL text; the only
    # difference is the HAVING threshold, so rebind the parameters with
    # MIN_SIGS_PER_BUCKET instead of the helper's MIN_SIGS.
    sql, _ = _accountant_means_query(firm_filter, params, time_filter)
    conn = sqlite3.connect(DB)
    try:
        cur = conn.cursor()
        cur.execute(sql, params + [MIN_SIGS_PER_BUCKET])
        rows = cur.fetchall()
    finally:
        # Close even if execute() raises (previously leaked on error).
        conn.close()
    cos = np.array([r[1] for r in rows])
    dh = np.array([r[2] for r in rows])
    return cos, dh, len(rows), int(sum(r[3] for r in rows))
|
||||
|
||||
|
||||
# ---------- Methods (lifted from Script 20) ----------
|
||||
def method_kde_antimode(values):
    """Method 1: Hartigan dip test plus KDE mode/antimode detection.

    Runs a bootstrapped dip test, locates KDE peaks (Silverman bandwidth,
    2% prominence), records the density minimum between each consecutive
    peak pair as an antimode, and sweeps bandwidth multipliers to check
    how stable the mode count is. Returns {'n', 'note'} below 8 points.
    """
    data = np.asarray(values, dtype=float)
    data = data[np.isfinite(data)]
    if data.size < 8:
        return {'n': int(data.size), 'note': 'too few points'}
    dip_stat, dip_p = diptest.diptest(data, boot_pval=True, n_boot=2000)
    kde = stats.gaussian_kde(data, bw_method='silverman')
    grid = np.linspace(data.min(), data.max(), 2000)
    dens = kde(grid)
    modes, _ = find_peaks(dens, prominence=dens.max() * 0.02)
    antimodes = []
    # Antimode = density minimum strictly between two adjacent peaks.
    for left, right in zip(modes[:-1], modes[1:]):
        segment = dens[left:right]
        if segment.size == 0:
            continue
        antimodes.append(float(grid[left + int(np.argmin(segment))]))
    # Bandwidth robustness sweep: does the mode count survive re-smoothing?
    sensitivity = {}
    for mult in (0.5, 0.75, 1.0, 1.5, 2.0):
        alt = stats.gaussian_kde(data, bw_method=kde.factor * mult)
        alt_dens = alt(grid)
        alt_modes, _ = find_peaks(alt_dens, prominence=alt_dens.max() * 0.02)
        sensitivity[f'bw_x{mult}'] = int(len(alt_modes))
    return {
        'n': int(data.size),
        'dip': float(dip_stat),
        'dip_pvalue': float(dip_p),
        'unimodal_alpha05': bool(dip_p > 0.05),
        'kde_bandwidth_silverman': float(kde.factor),
        'n_modes': int(len(modes)),
        'mode_locations': [float(grid[m]) for m in modes],
        'antimodes': antimodes,
        'primary_antimode': antimodes[0] if antimodes else None,
        'bandwidth_sensitivity_n_modes': sensitivity,
    }
|
||||
|
||||
|
||||
def method_bd_mccrary(values, bin_width, direction):
    """Method 2: histogram discontinuity scan (BD/McCrary style).

    Bins the data at `bin_width`, z-scores each interior bin's count
    against the average of its two neighbours, then reports consecutive
    z-sign flips beyond +-Z_CRIT in the requested `direction`
    ('neg_to_pos' or 'pos_to_neg') as candidate thresholds. The strongest
    flip (largest |z_before| + |z_after|) is promoted to 'threshold'.
    Returns {'n', 'note'} below 8 points.
    """
    data = np.asarray(values, dtype=float)
    data = data[np.isfinite(data)]
    if data.size < 8:
        return {'n': int(data.size), 'note': 'too few points'}
    lo = float(np.floor(data.min() / bin_width) * bin_width)
    hi = float(np.ceil(data.max() / bin_width) * bin_width)
    edges = np.arange(lo, hi + bin_width, bin_width)
    counts, _ = np.histogram(data, bins=edges)
    centers = 0.5 * (edges[:-1] + edges[1:])
    total = counts.sum()
    props = counts / total if total else counts.astype(float)
    zscores = np.full(len(counts), np.nan)
    for i in range(1, len(counts) - 1):
        prop_left, prop_right = props[i - 1], props[i + 1]
        expected = 0.5 * (counts[i - 1] + counts[i + 1])
        variance = (total * props[i] * (1 - props[i])
                    + 0.25 * total * (prop_left + prop_right)
                    * (1 - prop_left - prop_right))
        if variance <= 0:
            continue
        zscores[i] = (counts[i] - expected) / np.sqrt(variance)
    transitions = []
    for i in range(1, len(zscores)):
        z_prev, z_curr = zscores[i - 1], zscores[i]
        if np.isnan(z_prev) or np.isnan(z_curr):
            continue
        flips = ((direction == 'neg_to_pos'
                  and z_prev < -Z_CRIT and z_curr > Z_CRIT)
                 or (direction == 'pos_to_neg'
                     and z_prev > Z_CRIT and z_curr < -Z_CRIT))
        if flips:
            transitions.append({
                'threshold_between': float(0.5 * (centers[i - 1] + centers[i])),
                'z_before': float(z_prev),
                'z_after': float(z_curr),
            })
    strongest = None
    if transitions:
        strongest = max(transitions,
                        key=lambda t: abs(t['z_before']) + abs(t['z_after']))
    return {
        'n': int(data.size),
        'bin_width': float(bin_width),
        'direction': direction,
        'n_transitions': len(transitions),
        'transitions': transitions,
        'threshold': strongest['threshold_between'] if strongest else None,
    }
|
||||
|
||||
|
||||
def fit_beta_mixture_em(x, K=2, max_iter=300, tol=1e-6, seed=42):
    """Fit a K-component Beta mixture by EM with method-of-moments M-steps.

    Parameters
    ----------
    x : array-like of values expected in (0, 1); clipped to [EPS, 1 - EPS].
    K : number of mixture components.
    max_iter : EM iteration cap.
    tol : absolute log-likelihood change below which EM stops.
    seed : accepted for interface symmetry with fit_logit_gmm; the
        quantile-based initialization here is deterministic, so this
        argument is currently unused.

    Returns
    -------
    dict with component-mean-sorted 'alphas', 'betas', 'weights', 'mus',
    plus 'log_likelihood', 'aic', 'bic' and 'n_iter'.
    """
    x = np.clip(np.asarray(x, dtype=float), EPS, 1 - EPS)
    n = len(x)
    # Hard-assignment initialization: split the sample at its K-quantiles
    # so each component starts from one chunk of the sorted data.
    q = np.linspace(0, 1, K + 1)
    thresh = np.quantile(x, q[1:-1])
    labels = np.digitize(x, thresh)
    resp = np.zeros((n, K))
    resp[np.arange(n), labels] = 1.0
    ll_hist = []
    for it in range(max_iter):
        # M-step: weights and per-component moments from responsibilities.
        nk = resp.sum(axis=0) + 1e-12  # guard against empty components
        weights = nk / nk.sum()
        mus = (resp * x[:, None]).sum(axis=0) / nk
        var_num = (resp * (x[:, None] - mus) ** 2).sum(axis=0)
        vars_ = var_num / nk
        # A Beta variance must lie in (0, mu*(1-mu)); clamp into range.
        upper = mus * (1 - mus) - 1e-9
        vars_ = np.minimum(vars_, upper)
        vars_ = np.maximum(vars_, 1e-9)
        # Method of moments: alpha + beta = mu*(1-mu)/var - 1.
        factor = np.maximum(mus * (1 - mus) / vars_ - 1, 1e-6)
        alphas = mus * factor
        betas = (1 - mus) * factor
        # E-step in log space; log-sum-exp below for numerical stability.
        log_pdfs = np.column_stack([
            stats.beta.logpdf(x, alphas[k], betas[k]) + np.log(weights[k])
            for k in range(K)
        ])
        m = log_pdfs.max(axis=1, keepdims=True)
        ll = (m.ravel() + np.log(np.exp(log_pdfs - m).sum(axis=1))).sum()
        ll_hist.append(float(ll))
        new_resp = np.exp(log_pdfs - m)
        new_resp = new_resp / new_resp.sum(axis=1, keepdims=True)
        # Stop once the log-likelihood improvement falls below tol
        # (never on the first iteration, which has no predecessor).
        if it > 0 and abs(ll_hist[-1] - ll_hist[-2]) < tol:
            resp = new_resp
            break
        resp = new_resp
    # Canonical ordering: component 0 has the smallest mean.
    order = np.argsort(mus)
    alphas = alphas[order]
    betas = betas[order]
    weights = weights[order]
    mus = mus[order]
    # Free parameters: (alpha, beta) per component + K-1 free weights.
    k_params = 3 * K - 1
    ll_final = ll_hist[-1]
    return {
        'K': K,
        'alphas': [float(a) for a in alphas],
        'betas': [float(b) for b in betas],
        'weights': [float(w) for w in weights],
        'mus': [float(m) for m in mus],
        'log_likelihood': ll_final,
        'aic': float(2 * k_params - 2 * ll_final),
        'bic': float(k_params * np.log(n) - 2 * ll_final),
        'n_iter': it + 1,
    }
|
||||
|
||||
|
||||
def beta_crossing(fit):
    """Return the weighted-pdf crossing point of a 2-component Beta fit.

    Scans w2*pdf2 - w1*pdf1 over (0, 1), root-solves each sign change
    with brentq, and returns the root closest to the midpoint of the two
    component means. None when the fit is not K=2 or no crossing exists.
    """
    if fit['K'] != 2:
        return None
    a1, a2 = fit['alphas']
    b1, b2 = fit['betas']
    w1, w2 = fit['weights']

    def gap(t):
        return w2 * stats.beta.pdf(t, a2, b2) - w1 * stats.beta.pdf(t, a1, b1)

    grid = np.linspace(EPS, 1 - EPS, 2000)
    flips = np.where(np.diff(np.sign(gap(grid))) != 0)[0]
    if len(flips) == 0:
        return None
    midpoint = 0.5 * (fit['mus'][0] + fit['mus'][1])
    roots = []
    for idx in flips:
        try:
            roots.append(brentq(gap, grid[idx], grid[idx + 1]))
        except ValueError:
            continue
    if not roots:
        return None
    # Prefer the crossing between the two mass centers, not a tail artifact.
    return float(min(roots, key=lambda r: abs(r - midpoint)))
|
||||
|
||||
|
||||
def fit_logit_gmm(x, K=2, seed=42):
    """Fit a K-component 1-D Gaussian mixture in logit space.

    Data are clipped to (0, 1) and logit-transformed before fitting; for
    K=2 the pdf crossing between the two (mean-sorted) components is
    solved with brentq and mapped back through the sigmoid into
    'crossing_original'.
    """
    clipped = np.clip(np.asarray(x, dtype=float), EPS, 1 - EPS)
    z = np.log(clipped / (1 - clipped)).reshape(-1, 1)
    gmm = GaussianMixture(n_components=K, random_state=seed,
                          max_iter=500).fit(z)
    rank = np.argsort(gmm.means_.ravel())
    means = gmm.means_.ravel()[rank]
    stds = np.sqrt(gmm.covariances_.ravel())[rank]
    weights = gmm.weights_[rank]
    crossing = None
    if K == 2:
        def gap(t):
            return (weights[1] * stats.norm.pdf(t, means[1], stds[1])
                    - weights[0] * stats.norm.pdf(t, means[0], stds[0]))

        grid = np.linspace(min(means[0], means[1]) - 2,
                           max(means[0], means[1]) + 2, 2000)
        flips = np.where(np.diff(np.sign(gap(grid))) != 0)[0]
        if len(flips):
            try:
                root = brentq(gap, grid[flips[0]], grid[flips[0] + 1])
                crossing = float(1 / (1 + np.exp(-root)))  # inverse logit
            except ValueError:
                pass
    return {
        'K': K,
        'means_logit': [float(m) for m in means],
        'stds_logit': [float(s) for s in stds],
        'weights': [float(w) for w in weights],
        'means_original_scale': [float(1 / (1 + np.exp(-m))) for m in means],
        'aic': float(gmm.aic(z)),
        'bic': float(gmm.bic(z)),
        'crossing_original': crossing,
    }
|
||||
|
||||
|
||||
def method_beta_mixture(values, is_cosine=True):
    """Method 3: Beta-mixture fits (K=2, K=3) plus a logit-GMM cross-check.

    dHash values are rescaled into [0, 1] by dividing by 64 before
    fitting; crossings are mapped back to the original scale in the
    returned dict. Returns {'n', 'note'} below 8 points.
    """
    data = np.asarray(values, dtype=float)
    data = data[np.isfinite(data)]
    if len(data) < 8:
        return {'n': int(len(data)), 'note': 'too few points'}
    scaled = data if is_cosine else data / 64.0
    beta2 = fit_beta_mixture_em(scaled, K=2)
    beta3 = fit_beta_mixture_em(scaled, K=3)
    crossing = beta_crossing(beta2)
    gmm2 = fit_logit_gmm(scaled, K=2)
    gmm3 = fit_logit_gmm(scaled, K=3)
    if not is_cosine:
        # Undo the /64 rescale so thresholds are on the dHash scale.
        if crossing is not None:
            crossing = crossing * 64.0
        if gmm2.get('crossing_original') is not None:
            gmm2['crossing_original'] = gmm2['crossing_original'] * 64.0
    return {
        'n': int(len(scaled)),
        'scale_transform': 'identity' if is_cosine else 'dhash/64',
        'beta_2': beta2,
        'beta_3': beta3,
        'bic_preferred_K': 2 if beta2['bic'] < beta3['bic'] else 3,
        'beta_2_crossing_original': crossing,
        'logit_gmm_2': gmm2,
        'logit_gmm_3': gmm3,
    }
|
||||
|
||||
|
||||
def gmm_2d_marginal_crossing(cos, dh, dim):
    """Fit a 2-component full-covariance 2D GMM on (cos, dh) and return
    the weighted marginal-pdf crossing along axis `dim` (0 = cos, 1 = dh).

    Returns None with fewer than 8 points or when no sign change of the
    marginal pdf difference is found; otherwise the root closest to the
    midpoint of the two component means on that axis.
    """
    X = np.column_stack([cos, dh])
    if len(X) < 8:
        return None
    gmm = GaussianMixture(n_components=2, covariance_type='full',
                          random_state=42, n_init=15, max_iter=500).fit(X)
    mu_a, mu_b = gmm.means_[0][dim], gmm.means_[1][dim]
    sd_a = np.sqrt(gmm.covariances_[0][dim, dim])
    sd_b = np.sqrt(gmm.covariances_[1][dim, dim])
    w_a, w_b = gmm.weights_[0], gmm.weights_[1]

    def gap(t):
        return (w_b * stats.norm.pdf(t, mu_b, sd_b)
                - w_a * stats.norm.pdf(t, mu_a, sd_a))

    axis_vals = X[:, dim]
    grid = np.linspace(float(axis_vals.min()), float(axis_vals.max()), 2000)
    flips = np.where(np.diff(np.sign(gap(grid))) != 0)[0]
    if len(flips) == 0:
        return None
    center = 0.5 * (mu_a + mu_b)
    roots = []
    for idx in flips:
        try:
            roots.append(brentq(gap, grid[idx], grid[idx + 1]))
        except ValueError:
            continue
    if not roots:
        return None
    return float(min(roots, key=lambda r: abs(r - center)))
|
||||
|
||||
|
||||
def gmm_2d_3comp_summary(cos, dh):
    """K=3 full-covariance 2D GMM for completeness.

    Returns component means and weights sorted by ascending cosine mean,
    plus BIC/AIC; None with fewer than 12 points.
    """
    X = np.column_stack([cos, dh])
    if len(X) < 12:
        return None
    gmm = GaussianMixture(n_components=3, covariance_type='full',
                          random_state=42, n_init=15, max_iter=500).fit(X)
    rank = np.argsort(gmm.means_[:, 0])  # sort by cosine ascending
    return {
        'means': [[float(c), float(d)] for c, d in gmm.means_[rank]],
        'weights': [float(w) for w in gmm.weights_[rank]],
        'bic': float(gmm.bic(X)),
        'aic': float(gmm.aic(X)),
    }
|
||||
|
||||
|
||||
# ---------- Driver ----------
|
||||
def run_three_method(cos, dh, label):
    """Run methods 1-3 and the 2D-GMM marginal crossing on both measures.

    `label` is accepted for call-site symmetry but not used here.
    Returns {'cos_mean': {...}, 'dh_mean': {...}, 'gmm_2d_3comp': {...}}
    with one sub-dict of the four method outputs per measure.
    """
    specs = {
        'cos_mean': (cos, 0.002, 'neg_to_pos', True, 0),
        'dh_mean': (dh, 0.2, 'pos_to_neg', False, 1),
    }
    results = {}
    for measure, (arr, width, direction, is_cos, dim) in specs.items():
        results[measure] = {
            'method_1_kde_antimode': method_kde_antimode(arr),
            'method_2_bd_mccrary': method_bd_mccrary(arr, width, direction),
            'method_3_beta_mixture': method_beta_mixture(arr, is_cosine=is_cos),
            'gmm_2d_2comp_marginal_crossing':
                gmm_2d_marginal_crossing(cos, dh, dim=dim),
        }
    results['gmm_2d_3comp'] = gmm_2d_3comp_summary(cos, dh)
    return results
|
||||
|
||||
|
||||
def plot_panel(values, methods, title, out_path, bin_width=None):
    """Render a two-row diagnostic panel to `out_path` (PNG).

    Top row: density histogram + Silverman KDE with one vertical line per
    method threshold. Bottom row: the same thresholds as a labelled dot
    strip sharing the x-axis.

    Parameters
    ----------
    values : 1-D array-like of the measure being plotted.
    methods : dict mapping method key -> (threshold_or_None, label);
        entries whose threshold is None are skipped.
    title : used as figure title and x-axis label.
    out_path : destination file path.
    bin_width : fixed histogram bin width; None falls back to 40 bins.
    """
    arr = np.asarray(values, dtype=float)
    fig, axes = plt.subplots(2, 1, figsize=(11, 7),
                             gridspec_kw={'height_ratios': [3, 1]})
    ax = axes[0]
    if bin_width is None:
        bins = 40
    else:
        # Snap bin edges to multiples of bin_width covering the data range.
        lo = float(np.floor(arr.min() / bin_width) * bin_width)
        hi = float(np.ceil(arr.max() / bin_width) * bin_width)
        bins = np.arange(lo, hi + bin_width, bin_width)
    ax.hist(arr, bins=bins, density=True, alpha=0.55, color='steelblue',
            edgecolor='white')
    kde = stats.gaussian_kde(arr, bw_method='silverman')
    xs = np.linspace(arr.min(), arr.max(), 500)
    ax.plot(xs, kde(xs), 'g-', lw=1.5, label='KDE (Silverman)')
    colors = {'kde': 'green', 'bd': 'red', 'beta': 'purple',
              'gmm2': 'orange', 'baseline': 'black'}
    for key, (val, lbl) in methods.items():
        if val is None:
            continue
        # Dotted for the external baseline so it reads as a reference,
        # dashed for thresholds estimated from this subset.
        ls = ':' if key == 'baseline' else '--'
        ax.axvline(val, color=colors.get(key, 'gray'), lw=2, ls=ls,
                   label=f'{lbl} = {val:.4f}')
    ax.set_xlabel(title)
    ax.set_ylabel('Density')
    ax.set_title(title)
    ax.legend(fontsize=8)

    ax2 = axes[1]
    ax2.set_title('Thresholds across methods')
    ax2.set_xlim(ax.get_xlim())  # keep the strip aligned with the histogram
    for i, (key, (val, lbl)) in enumerate(methods.items()):
        if val is None:
            continue
        ax2.scatter([val], [i], color=colors.get(key, 'gray'), s=100, zorder=3)
        ax2.annotate(f' {lbl}: {val:.4f}', (val, i), fontsize=8, va='center')
    ax2.set_yticks(range(len(methods)))
    ax2.set_yticklabels(list(methods.keys()))
    ax2.grid(alpha=0.3)
    plt.tight_layout()
    fig.savefig(out_path, dpi=150)
    # Close this specific figure: plt.close() with no argument closes the
    # *current* figure, which is not guaranteed to be `fig` if callers
    # create figures of their own.
    plt.close(fig)
|
||||
|
||||
|
||||
def emit_panel(subset_label, results):
    """Build the plot-spec dict for the first plottable measure in `results`.

    Walks cos_mean then dh_mean, skips any measure whose KDE result
    carries a 'too few points' note, and returns a dict mapping method
    key -> (threshold_or_None, label) suitable for plot_panel(). Returns
    None when neither measure is plottable.

    NOTE(review): this helper predates the inline plotting loop in main()
    and is not called there; it only ever covers the first eligible
    measure. `subset_label` is kept for interface compatibility but is
    currently unused.
    """
    for desc in ('cos_mean', 'dh_mean'):
        if 'note' in results[desc]['method_1_kde_antimode']:
            continue
        baseline = (PAPER_A_COS_BASELINE if desc == 'cos_mean'
                    else PAPER_A_DH_BASELINE)
        return {
            'kde': (results[desc]['method_1_kde_antimode'].get(
                'primary_antimode'), 'KDE antimode'),
            'bd': (results[desc]['method_2_bd_mccrary'].get('threshold'),
                   'BD/McCrary'),
            'beta': (results[desc]['method_3_beta_mixture'].get(
                'beta_2_crossing_original'), 'Beta-2 crossing'),
            'gmm2': (results[desc]['gmm_2d_2comp_marginal_crossing'],
                     '2D GMM 2-comp'),
            'baseline': (baseline, 'Paper A baseline'),
        }
    return None
|
||||
|
||||
|
||||
def classify_verdict(results_by_subset):
    """Return ('A'|'B'|'C', explanation)."""

    def _both_separated(res, baseline_cos, baseline_dh):
        """Per-measure check: Beta-2 crossing inside the tolerance band
        around the Paper A baseline AND dip test multimodal (p <= .05)."""
        verdicts = []
        for measure, baseline, tol in (('cos_mean', baseline_cos, 0.02),
                                       ('dh_mean', baseline_dh, 2.0)):
            cross = res[measure]['method_3_beta_mixture'].get(
                'beta_2_crossing_original')
            dip_p = res[measure]['method_1_kde_antimode'].get('dip_pvalue')
            verdicts.append(cross is not None
                            and abs(cross - baseline) <= tol
                            and dip_p is not None and dip_p <= 0.05)
        return verdicts[0], verdicts[1]

    # A: a pooled non-Firm-A subset already separates on both measures.
    for subset in ('big4_non_A', 'all_non_A'):
        res = results_by_subset.get(subset)
        if not res:
            continue
        cos_ok, dh_ok = _both_separated(res, PAPER_A_COS_BASELINE,
                                        PAPER_A_DH_BASELINE)
        if cos_ok and dh_ok:
            return 'A', (f"Subset '{subset}' shows bimodal cos+dh with "
                         f"crossings within tolerance of Paper A baselines.")
    # B: only a time-stratified bucket rescues the separation.
    for subset_period in ('big4_non_A_pre_2018',
                          'big4_non_A_post_2020',
                          'all_non_A_pre_2018',
                          'all_non_A_post_2020'):
        res = results_by_subset.get(subset_period)
        if not res:
            continue
        cos_ok, dh_ok = _both_separated(res, PAPER_A_COS_BASELINE,
                                        PAPER_A_DH_BASELINE)
        if cos_ok and dh_ok:
            return 'B', (f"Time-stratified subset '{subset_period}' recovers "
                         f"separable bimodality.")
    # C: neither rescue works.
    return 'C', ("Neither pooled nor time-stratified non-Firm-A calibration "
                 "produces a baseline-consistent bimodal threshold.")
|
||||
|
||||
|
||||
def render_report(results_by_subset, sample_sizes, verdict):
    """Render the Markdown report for all subsets.

    Parameters
    ----------
    results_by_subset : dict of subset label -> run_three_method() output.
    sample_sizes : dict of subset label -> (n_accountants, n_signatures).
    verdict : (class_letter, explanation) pair from classify_verdict().

    Returns
    -------
    The full report as a single Markdown string.
    """
    # Header, research question and sample-size table.
    md = [
        '# Non-Firm-A Calibration Spike (Script 32)',
        f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        '',
        '## Research Question',
        '',
        ('If we exclude Firm A (Deloitte) from calibration, can the '
         'three-method framework still recover a meaningful '
         'cosine / dHash threshold at the accountant level?'),
        '',
        '## Sample Sizes',
        '',
        '| Subset | N accountants (>=10 sigs) | N signatures |',
        '|--------|---------------------------|--------------|',
    ]
    for label, (n_acc, n_sig) in sample_sizes.items():
        md.append(f'| `{label}` | {n_acc} | {n_sig} |')
    md += ['',
           '## Paper A Baselines (for comparison)',
           '',
           f'- Accountant-level 2D GMM 2-comp marginal crossings: '
           f'cos = **{PAPER_A_COS_BASELINE}**, dHash = **{PAPER_A_DH_BASELINE}**',
           '']

    # One section per subset: a per-measure method table, then the
    # optional K=3 2D-GMM component summary.
    for label, results in results_by_subset.items():
        md += [f'## Subset: `{label}`', '']
        for measure, baseline in [('cos_mean', PAPER_A_COS_BASELINE),
                                  ('dh_mean', PAPER_A_DH_BASELINE)]:
            r = results[measure]
            md += [f'### {measure}', '',
                   '| Method | Threshold | Supporting statistic |',
                   '|--------|-----------|----------------------|']
            kde = r['method_1_kde_antimode']
            # Methods degrade to a 'note' row when the subset was too small.
            if 'note' in kde:
                md.append(f'| Method 1: KDE+dip | n/a | {kde["note"]} |')
            else:
                tag = 'unimodal' if kde['unimodal_alpha05'] else 'multimodal'
                md.append(
                    f'| Method 1: KDE antimode (dip test) | '
                    f'{kde["primary_antimode"]} | '
                    f'dip={kde["dip"]:.4f}, p={kde["dip_pvalue"]:.4f} '
                    f'({tag}); n_modes={kde["n_modes"]} |')
            bd = r['method_2_bd_mccrary']
            md.append(
                f'| Method 2: BD/McCrary | {bd.get("threshold")} | '
                f'{bd.get("n_transitions", 0)} transition(s) |')
            beta = r['method_3_beta_mixture']
            if 'note' in beta:
                md.append(f'| Method 3: Beta mixture | n/a | {beta["note"]} |')
            else:
                md.append(
                    f'| Method 3: 2-comp Beta mixture | '
                    f'{beta["beta_2_crossing_original"]} | '
                    f'Beta-2 BIC={beta["beta_2"]["bic"]:.2f}, '
                    f'Beta-3 BIC={beta["beta_3"]["bic"]:.2f} '
                    f'(K*={beta["bic_preferred_K"]}) |')
                # Logit-GMM row only exists when the Beta fit succeeded.
                md.append(
                    f'| Method 3\': LogGMM-2 | '
                    f'{beta["logit_gmm_2"].get("crossing_original")} | '
                    f'logit-Gaussian robustness check |')
            md.append(
                f'| 2D GMM 2-comp marginal crossing | '
                f'{r["gmm_2d_2comp_marginal_crossing"]} | '
                f'paired with Paper A baseline = {baseline} |')
            md.append('')
        if results.get('gmm_2d_3comp'):
            g3 = results['gmm_2d_3comp']
            md += ['### 2D GMM K=3 components (for completeness)',
                   '',
                   '| Component | mean cos | mean dh | weight |',
                   '|-----------|----------|---------|--------|']
            for i, (m, w) in enumerate(zip(g3['means'], g3['weights'])):
                md.append(f'| C{i + 1} | {m[0]:.4f} | {m[1]:.4f} | {w:.3f} |')
            md.append('')
            md.append(f'BIC(K=3 2D)={g3["bic"]:.2f}, AIC={g3["aic"]:.2f}')
        md.append('')

    # Verdict section with the A/B/C legend.
    md += ['## Verdict',
           '',
           f'**{verdict[0]}** — {verdict[1]}',
           '',
           '### Verdict legend',
           '- **A**: outside-Firm-A calibration is viable in pooled form '
           '(crossings within +-0.02 cos / +-2.0 dh of Paper A baselines '
           'AND dip-test multimodal at alpha=0.05).',
           '- **B**: time-stratified subset recovers separable bimodality.',
           '- **C**: neither rescue works; Firm A remains required as anchor.',
           '']
    return '\n'.join(md)
|
||||
|
||||
|
||||
def main():
    """Run the full spike: pooled subsets, time-stratified buckets,
    per-subset panel plots, verdict classification, and JSON + Markdown
    output under OUT."""
    print('=' * 72)
    print('Script 32: Non-Firm-A Calibration Spike')
    print('=' * 72)

    sample_sizes = {}       # label -> (n_accountants, n_signatures)
    results_by_subset = {}  # label -> run_three_method() output
    arrays_by_subset = {}   # label -> (cos, dh) raw arrays for plotting

    # --- Pooled subsets ---
    for label in ('big4_non_A', 'all_non_A', 'firm_A'):
        cos, dh, n_acc, n_sig = load_subset(label)
        sample_sizes[label] = (n_acc, n_sig)
        arrays_by_subset[label] = (cos, dh)
        print(f'\n[{label}] N accountants={n_acc}, N sigs={n_sig}')
        results_by_subset[label] = run_three_method(cos, dh, label)
        # Console summary of the headline statistics per measure.
        for desc in ('cos_mean', 'dh_mean'):
            r = results_by_subset[label][desc]
            kde = r['method_1_kde_antimode']
            beta = r['method_3_beta_mixture']
            print(f'  {desc}: dip p={kde.get("dip_pvalue")} '
                  f'(n_modes={kde.get("n_modes")}); '
                  f'Beta-2 cross={beta.get("beta_2_crossing_original")}; '
                  f'2D-GMM marginal={r["gmm_2d_2comp_marginal_crossing"]}')

    # --- Time-stratified secondary (run unconditionally; verdict logic decides) ---
    for label in ('big4_non_A', 'all_non_A'):
        for period in ('pre_2018', 'post_2020'):
            cos, dh, n_acc, n_sig = load_subset_time_stratified(label, period)
            key = f'{label}_{period}'
            sample_sizes[key] = (n_acc, n_sig)
            arrays_by_subset[key] = (cos, dh)
            print(f'\n[{key}] N accountants={n_acc}, N sigs={n_sig}')
            # The methods themselves need >= 8 points; skip tiny buckets.
            if n_acc < 8:
                print(f'    (skipped: too few accountants for analysis)')
                continue
            results_by_subset[key] = run_three_method(cos, dh, key)
            for desc in ('cos_mean', 'dh_mean'):
                r = results_by_subset[key][desc]
                kde = r['method_1_kde_antimode']
                beta = r['method_3_beta_mixture']
                print(f'  {desc}: dip p={kde.get("dip_pvalue")} '
                      f'(n_modes={kde.get("n_modes")}); '
                      f'Beta-2 cross={beta.get("beta_2_crossing_original")}; '
                      f'2D-GMM marginal={r["gmm_2d_2comp_marginal_crossing"]}')

    # --- Plots ---
    # One panel per (subset, measure); measures with a 'too few points'
    # note are skipped since they have no thresholds to draw.
    for label, results in results_by_subset.items():
        cos, dh = arrays_by_subset[label]
        for desc, arr, bin_width, baseline in [
            ('cos_mean', cos, 0.002, PAPER_A_COS_BASELINE),
            ('dh_mean', dh, 0.2, PAPER_A_DH_BASELINE),
        ]:
            r = results[desc]
            if 'note' in r['method_1_kde_antimode']:
                continue
            methods_for_plot = {
                'kde': (r['method_1_kde_antimode'].get('primary_antimode'),
                        'KDE antimode'),
                'bd': (r['method_2_bd_mccrary'].get('threshold'),
                       'BD/McCrary'),
                'beta': (r['method_3_beta_mixture'].get(
                    'beta_2_crossing_original'), 'Beta-2 crossing'),
                'gmm2': (r['gmm_2d_2comp_marginal_crossing'],
                         '2D GMM 2-comp'),
                'baseline': (baseline, 'Paper A baseline'),
            }
            png = OUT / f'panel_{label}_{desc}.png'
            plot_panel(arr, methods_for_plot,
                       f'{label} -- accountant-level {desc}',
                       png, bin_width=bin_width)
            print(f'  plot: {png}')

    # --- Verdict ---
    verdict = classify_verdict(results_by_subset)
    print(f'\nVerdict: {verdict[0]} -- {verdict[1]}')

    # --- Persist ---
    # JSON payload mirrors the console output plus all run parameters so
    # the spike is reproducible from the artifact alone.
    payload = {
        'generated_at': datetime.now().isoformat(),
        'min_sigs_per_accountant': MIN_SIGS,
        'min_sigs_per_bucket_time_stratified': MIN_SIGS_PER_BUCKET,
        'paper_a_baselines': {
            'cos': PAPER_A_COS_BASELINE,
            'dh': PAPER_A_DH_BASELINE,
        },
        'sample_sizes': {k: {'n_accountants': v[0], 'n_signatures': v[1]}
                         for k, v in sample_sizes.items()},
        'results': results_by_subset,
        'verdict': {'class': verdict[0], 'explanation': verdict[1]},
    }
    (OUT / 'non_firm_a_calibration_results.json').write_text(
        json.dumps(payload, indent=2, ensure_ascii=False), encoding='utf-8')
    print(f'\nJSON: {OUT / "non_firm_a_calibration_results.json"}')

    md = render_report(results_by_subset, sample_sizes, verdict)
    (OUT / 'non_firm_a_calibration_report.md').write_text(md, encoding='utf-8')
    print(f'Report: {OUT / "non_firm_a_calibration_report.md"}')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Reference in New Issue
Block a user