Add Scripts 39b/c/d/e + 40b + 43: anchor-based FAR diagnostics
Spike checkpoint in response to codex rounds 28-30 review:
- 39b/c: signature-level dip test on Big-4 and non-Big-4 marginals
- 39d: dHash discrete-value robustness (raw vs jittered + histogram
valleys + firm residualization); confirms within-firm dHash dip
rejection is integer-mass-point artefact
- 39e: dHash firm-residualized + jittered 2x2 factorial decomposition;
confirms Big-4 pooled dh "multimodality" is composition + integer
artefact (centered + jittered p=0.35, 0/5 seeds reject)
- 40b: inter-CPA per-pair FAR sweep (cos + dh marginal + joint +
conditional); replicates v3 cos>0.95 FAR=0.0006 and provides
v4-new dh FAR curve
- 43: pool-normalized per-signature FAR (codex round-30 fix for
per-pair vs per-signature conflation); per-sig FAR for deployed
any-pair rule = 11.02%, per-firm structure shows Firm A 20% vs
B/C/D <1%
These scripts replace the distributional path (K=3 mixture / dip /
antimode) with anchor-based threshold derivation. Companion
artefacts in reports/v4_big4/{signature_level_diptest,
midsmall_signature_diptest, dhash_discrete_robustness,
inter_cpa_far_sweep, pool_normalized_far}/.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,250 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Script 39e: dHash Firm-Residualized + Jittered Dip (final test)
|
||||
================================================================
|
||||
Script 39d showed:
|
||||
- Within-firm dh dip rejections all vanish after jitter (integer
|
||||
artifact)
|
||||
- Big-4 pooled dh dip survives jitter (p_median=0 over 5 seeds)
|
||||
|
||||
But Firm A mean dh = 2.73 vs Firms B/C/D ~6.5-7.4 -- a large
|
||||
between-firm location shift, analogous to the cosine case where
|
||||
firm-mean centering eliminated rejection.
|
||||
|
||||
This script applies BOTH corrections simultaneously:
|
||||
1. Firm-mean centering (remove between-firm location shifts)
|
||||
2. Uniform jitter in [-0.5, +0.5] (remove integer ties)
|
||||
|
||||
If the doubly-corrected dh distribution rejects unimodality, the
|
||||
Big-4 pooled multimodality is a genuine within-population, continuous
|
||||
phenomenon. If it fails to reject, dh "multimodality" is fully
explained by between-firm composition plus integer mass points (the
cosine case reached the same conclusion with firm-mean centering alone).
|
||||
|
||||
Multi-seed (5 seeds) for robustness.
|
||||
|
||||
Outputs:
|
||||
reports/v4_big4/dhash_discrete_robustness/
|
||||
dhash_residualized_jittered_results.json
|
||||
dhash_residualized_jittered_report.md
|
||||
"""
|
||||
|
||||
import json
|
||||
import sqlite3
|
||||
import numpy as np
|
||||
import diptest
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
# Source database (opened read-only by load_signatures) and the report
# directory this script writes its JSON / Markdown artefacts into.
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
OUT = (Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports')
       / 'v4_big4'
       / 'dhash_discrete_robustness')
# The output directory is created at import time, before any analysis runs.
OUT.mkdir(parents=True, exist_ok=True)

# Firm names as stored in the database, in the order Firm A .. Firm D.
BIG4 = (
    '勤業眾信聯合',
    '安侯建業聯合',
    '資誠聯合',
    '安永聯合',
)
# Anonymised display label for each Big-4 firm (positional match with BIG4).
ALIAS = dict(zip(BIG4, ('Firm A', 'Firm B', 'Firm C', 'Firm D')))

# Bootstrap replicates used for each dip-test p-value.
N_BOOT = 2000
# Seeds for the repeated jittered runs.
SEEDS = [42, 43, 44, 45, 46]
|
||||
|
||||
|
||||
def load_signatures():
    """Fetch ``(firm, min dHash)`` pairs for every usable signature.

    Opens the SQLite database at ``DB`` in read-only mode, joins each
    signature to its assigned accountant, and keeps only rows where the
    assigned accountant, the same-accountant similarity, the independent
    minimum dHash and the accountant's firm are all non-NULL.

    Returns:
        list[tuple]: one ``(a.firm, min_dhash_independent)`` tuple per
        signature, with the dHash value cast to REAL (a Python float).

    Raises:
        sqlite3.Error: if the database cannot be opened or queried.
    """
    conn = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
    # A sqlite3 connection used as a context manager only commits/rolls
    # back; it does not close. try/finally guarantees the handle is
    # released even when the query raises.
    try:
        cur = conn.cursor()
        cur.execute('''
            SELECT a.firm, CAST(s.min_dhash_independent AS REAL)
            FROM signatures s
            JOIN accountants a ON s.assigned_accountant = a.name
            WHERE s.assigned_accountant IS NOT NULL
              AND s.max_similarity_to_same_accountant IS NOT NULL
              AND s.min_dhash_independent IS NOT NULL
              AND a.firm IS NOT NULL
        ''')
        return cur.fetchall()
    finally:
        conn.close()
|
||||
|
||||
|
||||
def firm_residualize(values, firm_labels):
    """Remove between-firm location shifts while keeping the grand mean.

    Every observation has its own firm's mean subtracted and the pooled
    (grand) mean added back, so all firms end up centred on the same
    location and the overall mean is unchanged.

    Args:
        values: numeric observations, converted to a float array.
        firm_labels: firm label for each observation, same length as
            ``values``.

    Returns:
        numpy.ndarray: a new float array of re-centred values; the input
        is not modified.
    """
    data = np.asarray(values, dtype=float)
    labels = np.asarray(firm_labels)
    grand_mean = float(np.mean(data))
    recentred = data.copy()
    for label in np.unique(labels):
        members = labels == label
        group = data[members]
        # (x - firm mean) + grand mean, evaluated in that order.
        recentred[members] = group - float(np.mean(group)) + grand_mean
    return recentred
|
||||
|
||||
|
||||
def dip_multi(values, seeds, with_jitter, n_boot=N_BOOT):
    """Run the dip test once per seed, optionally on jittered data.

    Non-finite values are dropped first. With ``with_jitter`` true, each
    seed draws its own uniform noise in [-0.5, 0.5) and adds it to the
    data before testing. With ``with_jitter`` false the data are tested
    as-is and only the first seed is used, because further seeds would
    test the same sample again.

    The seed is also forwarded to ``diptest.diptest`` so that the
    bootstrap p-value is reproducible for a given seed, not only the
    jitter draw.

    Args:
        values: numeric observations.
        seeds: iterable of integer seeds.
        with_jitter: whether to add uniform jitter before testing.
        n_boot: number of bootstrap replicates for the p-value.

    Returns:
        list[dict]: one ``{'seed', 'dip', 'p'}`` entry per run.
    """
    arr = np.asarray(values, dtype=float)
    arr = arr[np.isfinite(arr)]
    results = []
    for seed in seeds:
        if with_jitter:
            rng = np.random.default_rng(seed)
            sample = arr + rng.uniform(-0.5, 0.5, len(arr))
        else:
            sample = arr
        # NOTE(review): assumes the installed diptest accepts a `seed`
        # keyword for its bootstrap RNG -- confirm against the pinned
        # version. Without it the p-value varies between runs even
        # though the jitter is seeded.
        d, p = diptest.diptest(sample, boot_pval=True, n_boot=n_boot,
                               seed=seed)
        results.append({'seed': seed, 'dip': float(d), 'p': float(p)})
        if not with_jitter:
            break  # identical sample for every seed; one run is enough
    return results
|
||||
|
||||
|
||||
def _fmt_p(p):
|
||||
return '< 5e-4' if p == 0.0 else f'{p:.4g}'
|
||||
|
||||
|
||||
def summarize(name, results):
    """Condense per-seed dip-test results into one summary record.

    Args:
        name: label stored under the ``'name'`` key.
        results: non-empty list of ``{'seed', 'dip', 'p'}`` dicts.

    Returns:
        dict: the label, the number of runs, min / max / median of the
        dip statistic and of the p-value, the number of runs with
        ``p <= 0.05``, and the untouched per-seed list.
    """
    dips = [entry['dip'] for entry in results]
    pvals = [entry['p'] for entry in results]

    summary = {'name': name, 'n_seeds': len(results)}
    # Same min / max / median triple for both series, dip first.
    for prefix, series in (('dip', dips), ('p', pvals)):
        summary[f'{prefix}_min'] = min(series)
        summary[f'{prefix}_max'] = max(series)
        summary[f'{prefix}_median'] = float(np.median(series))
    summary['reject_at_05_count'] = int(sum(p <= 0.05 for p in pvals))
    summary['per_seed'] = results
    return summary
|
||||
|
||||
|
||||
def _firm_dh_stats(dh_values, firm_labels):
    """Describe the dh distribution of each firm, keyed in sorted label order."""
    stats = {}
    for firm in sorted(set(firm_labels)):
        v = dh_values[firm_labels == firm]
        stats[firm] = {
            'n': int(len(v)),
            'mean': float(v.mean()),
            'median': float(np.median(v)),
            'sd': float(v.std()),
            'p25': float(np.percentile(v, 25)),
            'p75': float(np.percentile(v, 75)),
            'pct_le_5': float(np.mean(v <= 5)),
            'pct_gt_15': float(np.mean(v > 15)),
        }
    return stats


def _print_multi_seed(panel):
    """Print the median p-value and rejection count of a multi-seed panel."""
    print(f' p_median={panel["p_median"]:.4g}, '
          f'reject@.05 in '
          f'{panel["reject_at_05_count"]}/{panel["n_seeds"]} seeds')


def _render_report(meta, panels, firm_stats):
    """Build the Markdown report text from the computed results."""
    jitter = panels['jitter_only']
    both = panels['centered_jittered']
    md = [
        '# dHash Firm-Residualized + Jittered Dip (Script 39e)',
        '', f'Generated: {meta["timestamp"]}',
        f'Bootstrap replicates: {meta["n_boot"]}; '
        f'jitter seeds: {meta["seeds"]}',
        '',
        '## Per-firm Big-4 dh summary',
        '', '| Firm | n | mean | median | sd | P25 | P75 | %<=5 | %>15 |',
        '|---|---|---|---|---|---|---|---|---|',
    ]
    for f, s in firm_stats.items():
        md.append(f'| {f} | {s["n"]:,} | {s["mean"]:.3f} | '
                  f'{s["median"]:.1f} | {s["sd"]:.3f} | '
                  f'{s["p25"]:.1f} | {s["p75"]:.1f} | '
                  f'{s["pct_le_5"]:.3f} | {s["pct_gt_15"]:.3f} |')
    md += [
        '',
        '## Dip test under four conditions (Big-4 pooled, sig-level)',
        '',
        '| Condition | dip | p (or p_median) | reject@.05 (seeds) |',
        '|---|---|---|---|',
        f'| 1. Raw (integer values) | {panels["raw"]["dip_median"]:.5f} '
        f'| {_fmt_p(panels["raw"]["p_median"])} | n/a (1 seed) |',
        f'| 2. Firm-mean centered, no jitter '
        f'| {panels["centered_only"]["dip_median"]:.5f} '
        f'| {_fmt_p(panels["centered_only"]["p_median"])} | n/a (1 seed) |',
        # Seed counts come from each panel so the report stays correct
        # when SEEDS is changed.
        f'| 3. Jittered only ({jitter["n_seeds"]} seeds) '
        f'| median {jitter["dip_median"]:.5f} '
        f'| median {_fmt_p(jitter["p_median"])} '
        f'| {jitter["reject_at_05_count"]}/{jitter["n_seeds"]} |',
        f'| 4. **Centered + jittered ({both["n_seeds"]} seeds)** '
        f'| median {both["dip_median"]:.5f} '
        f'| median {_fmt_p(both["p_median"])} '
        f'| {both["reject_at_05_count"]}/{both["n_seeds"]} |',
        '',
        '## Interpretation',
        '',
        ('If Condition 4 still rejects unimodality, Big-4 dh has '
         'genuine within-population continuous multimodality '
         'independent of both between-firm location shifts and '
         'integer mass points. If Condition 4 fails to reject, the '
         'Big-4 pooled dh multimodality is fully explained by '
         '(between-firm mean shift) + (integer mass points). In the '
         'latter case, the dh axis carries no independent within-firm '
         'regime evidence beyond the cos axis.'),
        '',
    ]
    return '\n'.join(md)


def main():
    """Run the 2x2 (centering x jitter) dip-test decomposition on Big-4 dh.

    Loads signature-level dh values, keeps the Big-4 firms, and runs the
    dip test under four conditions: raw, firm-mean centered, jittered,
    and centered + jittered. Progress is printed to stdout and the
    results are written to ``OUT`` as
    ``dhash_residualized_jittered_results.json`` and
    ``dhash_residualized_jittered_report.md``.

    Raises:
        SystemExit: if the query returns no rows or no Big-4 signatures.
    """
    print('=' * 72)
    print('Script 39e: dHash Firm-Residualized + Jittered Dip')
    print('=' * 72)
    rows = load_signatures()
    if not rows:
        raise SystemExit('No signatures returned by load_signatures(); '
                         'nothing to test.')
    firms_raw = np.array([r[0] for r in rows])
    dh = np.array([r[1] for r in rows], dtype=float)
    is_big4 = np.isin(firms_raw, BIG4)
    big4_dh = dh[is_big4]
    big4_firms = np.array([ALIAS[f] for f in firms_raw[is_big4]])

    print(f'\nLoaded {len(rows):,} signatures; Big-4 {is_big4.sum():,}')
    if not is_big4.any():
        raise SystemExit('No Big-4 signatures found; nothing to test.')

    # Per-firm dh stats (re-confirm Firm A shift); computed once and used
    # for both the console summary and the report.
    firm_stats = _firm_dh_stats(big4_dh, big4_firms)
    print('\nPer-firm Big-4 dh summary:')
    for f, s in firm_stats.items():
        print(f' {f}: n={s["n"]:,} mean={s["mean"]:.3f} '
              f'median={s["median"]:.1f} sd={s["sd"]:.3f}')

    # ---- Test conditions, all on Big-4 signature-level dh ----
    panels = {}
    n_seeds = len(SEEDS)

    # 1. Raw (no centering, no jitter)
    print('\n[1] Raw dh')
    r = dip_multi(big4_dh, [42], with_jitter=False)
    panels['raw'] = summarize('raw', r)
    print(f' dip={r[0]["dip"]:.5f}, p={_fmt_p(r[0]["p"])}')

    # 2. Centered only (no jitter; integer values preserved)
    print('\n[2] Firm-mean centered, no jitter')
    centered = firm_residualize(big4_dh, big4_firms)
    r = dip_multi(centered, [42], with_jitter=False)
    panels['centered_only'] = summarize('centered_only', r)
    print(f' dip={r[0]["dip"]:.5f}, p={_fmt_p(r[0]["p"])}')

    # 3. Jittered only (no centering)
    print(f'\n[3] Jittered ({n_seeds} seeds), no centering')
    r = dip_multi(big4_dh, SEEDS, with_jitter=True)
    panels['jitter_only'] = summarize('jitter_only', r)
    _print_multi_seed(panels['jitter_only'])

    # 4. Centered + jittered (THE key test)
    print(f'\n[4] Firm-mean centered + jittered ({n_seeds} seeds) '
          f'-- KEY TEST')
    r = dip_multi(centered, SEEDS, with_jitter=True)
    panels['centered_jittered'] = summarize('centered_jittered', r)
    _print_multi_seed(panels['centered_jittered'])
    for s in r:
        print(f' seed {s["seed"]}: dip={s["dip"]:.5f}, p={_fmt_p(s["p"])}')

    results = {
        'meta': {
            'script': '39e',
            'timestamp': datetime.now().isoformat(timespec='seconds'),
            'n_big4_signatures': int(big4_dh.size),
            'n_boot': N_BOOT,
            'seeds': SEEDS,
            'note': ('Final test: does Big-4 pooled dh multimodality '
                     'survive BOTH firm-mean centering and integer-tie '
                     'jitter?'),
        },
        'panels': panels,
        'per_firm_dh_stats': firm_stats,
    }

    json_path = OUT / 'dhash_residualized_jittered_results.json'
    json_path.write_text(json.dumps(results, indent=2, ensure_ascii=False),
                         encoding='utf-8')
    print(f'\n[json] {json_path}')

    md_path = OUT / 'dhash_residualized_jittered_report.md'
    md_path.write_text(_render_report(results['meta'], panels, firm_stats),
                       encoding='utf-8')
    print(f'[md ] {md_path}')


if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user