diff --git a/signature_analysis/39b_v4_signature_level_diptest.py b/signature_analysis/39b_v4_signature_level_diptest.py
new file mode 100644
index 0000000..bc24531
--- /dev/null
+++ b/signature_analysis/39b_v4_signature_level_diptest.py
@@ -0,0 +1,195 @@
+#!/usr/bin/env python3
+"""
+Script 39b: Signature-Level Dip Test (multimodality at the signature cloud)
+============================================================================
+Phase 5 pre-emptive evidence. Script 34 / 36 already report Hartigan
+dip tests on the 437 accountant-level (cos_mean, dh_mean) means and
+both marginals reject unimodality at p < 5e-4. Reviewers may ask
+whether the same multimodality is detectable at the signature level
+itself (n = 150,442 Big-4 signatures) and whether the multimodality
+is a within-firm or only a between-firm phenomenon.
+
+This script supplies the missing dip evidence on the raw signature
+cloud. It is a *diagnostic* in the same role as Scripts 34/36 dip
+tests: it does not derive an operational threshold; it characterises
+the marginal distributions of (cos, dh_indep) at the signature level.
+
+Outputs:
+  reports/v4_big4/signature_level_diptest/
+      sig_diptest_results.json
+      sig_diptest_report.md
+
+Tests performed:
+  A. Pooled Big-4 marginals (cos, dh_indep), n = 150,442
+  B. Per-firm marginals (Firm A / B / C / D separately)
+"""
+
+import json
+import sqlite3
+import numpy as np
+import diptest
+from pathlib import Path
+from datetime import datetime
+from scipy import stats
+from scipy.signal import find_peaks
+
+DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
+OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/'
+           'v4_big4/signature_level_diptest')
+OUT.mkdir(parents=True, exist_ok=True)
+
+BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合')
+ALIAS = {'勤業眾信聯合': 'Firm A',
+         '安侯建業聯合': 'Firm B',
+         '資誠聯合': 'Firm C',
+         '安永聯合': 'Firm D'}
+N_BOOT = 2000
+
+
+def load_big4_signatures():
+    conn = sqlite3.connect(DB)
+    cur = conn.cursor()
+    cur.execute('''
+        SELECT s.assigned_accountant, a.firm,
+               s.max_similarity_to_same_accountant,
+               CAST(s.min_dhash_independent AS REAL)
+        FROM signatures s
+        JOIN accountants a ON s.assigned_accountant = a.name
+        WHERE s.assigned_accountant IS NOT NULL
+          AND s.max_similarity_to_same_accountant IS NOT NULL
+          AND s.min_dhash_independent IS NOT NULL
+          AND a.firm IN (?, ?, ?, ?)
+    ''', BIG4)
+    rows = cur.fetchall()
+    conn.close()
+    return rows
+
+
+def kde_dip(values, n_boot=N_BOOT):
+    arr = np.asarray(values, dtype=float)
+    arr = arr[np.isfinite(arr)]
+    dip, pval = diptest.diptest(arr, boot_pval=True, n_boot=n_boot)
+    kde = stats.gaussian_kde(arr, bw_method='silverman')
+    xs = np.linspace(arr.min(), arr.max(), 2000)
+    density = kde(xs)
+    peaks, _ = find_peaks(density, prominence=density.max() * 0.02)
+    antimodes = []
+    for i in range(len(peaks) - 1):
+        seg = density[peaks[i]:peaks[i + 1]]
+        if not len(seg):
+            continue
+        local = peaks[i] + int(np.argmin(seg))
+        antimodes.append(float(xs[local]))
+    return {
+        'n': int(len(arr)),
+        'dip': float(dip),
+        'dip_pvalue': float(pval),
+        'unimodal_alpha05': bool(pval > 0.05),
+        'n_modes': int(len(peaks)),
+        'mode_locations': [float(xs[p]) for p in peaks],
+        'antimodes': antimodes,
+        'n_boot': int(n_boot),
+    }
+
+
+def _fmt_p(p):
+    if p == 0.0:
+        return '< 5e-4 (no bootstrap replicate exceeded observed dip)'
+    return f'{p:.4g}'
+
+
+def main():
+    print('=' * 72)
+    print('Script 39b: Signature-Level Dip Test')
+    print('=' * 72)
+    rows = load_big4_signatures()
+    cos_all = np.array([r[2] for r in rows], dtype=float)
+    dh_all = np.array([r[3] for r in rows], dtype=float)
+    firms = np.array([ALIAS[r[1]] for r in rows])
+    print(f'\nLoaded {len(rows):,} Big-4 signatures')
+    for f in sorted(set(firms)):
+        print(f'  {f}: {(firms == f).sum():,}')
+
+    results = {
+        'meta': {
+            'script': '39b',
+            'timestamp': datetime.now().isoformat(timespec='seconds'),
+            'n_total': int(len(rows)),
+            'n_boot': N_BOOT,
+            'note': ('Signature-level Hartigan dip test on Big-4 '
+                     '(cos, dh_indep) marginals; pooled and per-firm.'),
+        },
+        'pooled': {},
+        'per_firm': {},
+    }
+
+    # A. Pooled
+    print('\n[A] Pooled Big-4')
+    for desc, arr in [('cos', cos_all), ('dh_indep', dh_all)]:
+        r = kde_dip(arr)
+        results['pooled'][desc] = r
+        print(f'  {desc}: n={r["n"]:,}, dip={r["dip"]:.5f}, '
+              f'p={_fmt_p(r["dip_pvalue"])}, n_modes={r["n_modes"]}')
+
+    # B. Per-firm
+    print('\n[B] Per-firm')
+    for f in sorted(set(firms)):
+        mask = firms == f
+        results['per_firm'][f] = {}
+        for desc, arr in [('cos', cos_all[mask]), ('dh_indep', dh_all[mask])]:
+            r = kde_dip(arr)
+            results['per_firm'][f][desc] = r
+            print(f'  {f} {desc}: n={r["n"]:,}, dip={r["dip"]:.5f}, '
+                  f'p={_fmt_p(r["dip_pvalue"])}, n_modes={r["n_modes"]}')
+
+    json_path = OUT / 'sig_diptest_results.json'
+    json_path.write_text(json.dumps(results, indent=2, ensure_ascii=False),
+                         encoding='utf-8')
+    print(f'\n[json] {json_path}')
+
+    md = ['# Signature-Level Dip Test (Script 39b)',
+          '',
+          f'Generated: {results["meta"]["timestamp"]}',
+          f'Bootstrap replicates: {N_BOOT}',
+          '',
+          '## A. Pooled Big-4 signature cloud',
+          '',
+          f'n = {results["meta"]["n_total"]:,} signatures',
+          '',
+          '| Marginal | dip | p (boot) | n_modes | unimodal @0.05 |',
+          '|---|---|---|---|---|']
+    for desc in ['cos', 'dh_indep']:
+        r = results['pooled'][desc]
+        md.append(f'| {desc} | {r["dip"]:.5f} | {_fmt_p(r["dip_pvalue"])} | '
+                  f'{r["n_modes"]} | {r["unimodal_alpha05"]} |')
+
+    md += ['', '## B. Per-firm signature-level dip tests', '',
+           '| Firm | Marginal | n | dip | p (boot) | n_modes | unimodal @0.05 |',
+           '|---|---|---|---|---|---|---|']
+    for f in sorted(results['per_firm']):
+        for desc in ['cos', 'dh_indep']:
+            r = results['per_firm'][f][desc]
+            md.append(f'| {f} | {desc} | {r["n"]:,} | {r["dip"]:.5f} | '
+                      f'{_fmt_p(r["dip_pvalue"])} | {r["n_modes"]} | '
+                      f'{r["unimodal_alpha05"]} |')
+    md += ['',
+           '## Reading guide',
+           '',
+           ('A unimodality rejection at the signature level confirms '
+            'multimodal structure independent of accountant-level '
+            'aggregation. A within-firm rejection further indicates the '
+            'multimodality is not solely a between-firm artefact. A '
+            'within-firm non-rejection (e.g., Firm A) is consistent with '
+            'that firm being concentrated in a single mechanism corner.'),
+           '',
+           ('All thresholds and operational classifiers remain those of '
+            'v3.x §III-K and v4.0 §III-J; this script supplies diagnostic '
+            'evidence only.'),
+           '']
+    md_path = OUT / 'sig_diptest_report.md'
+    md_path.write_text('\n'.join(md), encoding='utf-8')
+    print(f'[md  ] {md_path}')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/signature_analysis/39c_v4_midsmall_signature_diptest.py b/signature_analysis/39c_v4_midsmall_signature_diptest.py
new file mode 100644
index 0000000..aeb793a
--- /dev/null
+++ b/signature_analysis/39c_v4_midsmall_signature_diptest.py
@@ -0,0 +1,214 @@
+#!/usr/bin/env python3
+"""
+Script 39c: Mid/Small-Firm Signature-Level Dip Test
+====================================================
+Companion to Script 39b. 39b showed every Big-4 firm rejects
+unimodality on the dHash signature marginal (p < 5e-4 in each
+of A/B/C/D) while every Big-4 firm fails to reject unimodality
+on the cosine marginal. This script asks the same questions of
+the mid/small-firm population (non-Big-4):
+
+  1. Does the pooled mid/small-firm signature cloud show the same
+     dHash multimodality?
+  2. Within individual mid/small firms (those with enough
+     signatures to support the test), does the dHash multimodality
+     hold firm-internally as it does in Big-4?
+
+If yes, the dHash signature-level multimodality is corpus-universal
+and the Big-4 scope restriction of v4.0 is not necessary on dHash
+grounds (cf §III-G item 2 which currently rests on Big-4-level
+multimodality). The cosine axis is reported alongside for
+completeness, but no v4.0 claim turns on cosine multimodality
+outside Big-4.
+
+Outputs:
+  reports/v4_big4/midsmall_signature_diptest/
+      midsmall_diptest_results.json
+      midsmall_diptest_report.md
+"""
+
+import json
+import sqlite3
+import numpy as np
+import diptest
+from pathlib import Path
+from datetime import datetime
+from scipy import stats
+from scipy.signal import find_peaks
+
+DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
+OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/'
+           'v4_big4/midsmall_signature_diptest')
+OUT.mkdir(parents=True, exist_ok=True)
+
+BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合')
+N_BOOT = 2000
+SINGLE_FIRM_MIN_SIG = 500  # minimum signature count to run a per-firm dip test
+
+
+def load_non_big4_signatures():
+    conn = sqlite3.connect(DB)
+    cur = conn.cursor()
+    cur.execute('''
+        SELECT a.firm,
+               s.max_similarity_to_same_accountant,
+               CAST(s.min_dhash_independent AS REAL)
+        FROM signatures s
+        JOIN accountants a ON s.assigned_accountant = a.name
+        WHERE s.assigned_accountant IS NOT NULL
+          AND s.max_similarity_to_same_accountant IS NOT NULL
+          AND s.min_dhash_independent IS NOT NULL
+          AND a.firm IS NOT NULL
+          AND a.firm NOT IN (?, ?, ?, ?)
+    ''', BIG4)
+    rows = cur.fetchall()
+    conn.close()
+    return rows
+
+
+def kde_dip(values, n_boot=N_BOOT):
+    arr = np.asarray(values, dtype=float)
+    arr = arr[np.isfinite(arr)]
+    if len(arr) < 10:
+        return {'n': int(len(arr)), 'skipped': 'too few points'}
+    dip, pval = diptest.diptest(arr, boot_pval=True, n_boot=n_boot)
+    kde = stats.gaussian_kde(arr, bw_method='silverman')
+    xs = np.linspace(arr.min(), arr.max(), 2000)
+    density = kde(xs)
+    peaks, _ = find_peaks(density, prominence=density.max() * 0.02)
+    antimodes = []
+    for i in range(len(peaks) - 1):
+        seg = density[peaks[i]:peaks[i + 1]]
+        if not len(seg):
+            continue
+        local = peaks[i] + int(np.argmin(seg))
+        antimodes.append(float(xs[local]))
+    return {
+        'n': int(len(arr)),
+        'dip': float(dip),
+        'dip_pvalue': float(pval),
+        'unimodal_alpha05': bool(pval > 0.05),
+        'n_modes': int(len(peaks)),
+        'mode_locations': [float(xs[p]) for p in peaks],
+        'antimodes': antimodes,
+        'n_boot': int(n_boot),
+    }
+
+
+def _fmt_p(p):
+    if p == 0.0:
+        return '< 5e-4'
+    return f'{p:.4g}'
+
+
+def main():
+    print('=' * 72)
+    print('Script 39c: Mid/Small-Firm Signature-Level Dip Test')
+    print('=' * 72)
+    rows = load_non_big4_signatures()
+    cos_all = np.array([r[1] for r in rows], dtype=float)
+    dh_all = np.array([r[2] for r in rows], dtype=float)
+    firms = np.array([r[0] for r in rows])
+    n_total = len(rows)
+    print(f'\nLoaded {n_total:,} non-Big-4 signatures across '
+          f'{len(set(firms))} firms')
+
+    # Firm size table
+    firm_counts = {}
+    for f in firms:
+        firm_counts[f] = firm_counts.get(f, 0) + 1
+    top = sorted(firm_counts.items(), key=lambda x: -x[1])
+    print('\nTop firms by signature count:')
+    for f, n in top[:10]:
+        print(f'  {f}: {n:,}')
+
+    results = {
+        'meta': {
+            'script': '39c',
+            'timestamp': datetime.now().isoformat(timespec='seconds'),
+            'n_total': int(n_total),
+            'n_firms': int(len(firm_counts)),
+            'n_boot': N_BOOT,
+            'single_firm_min_sig': SINGLE_FIRM_MIN_SIG,
+        },
+        'pooled': {},
+        'per_firm_eligible': {},
+        'firm_counts': dict(firm_counts),
+    }
+
+    # A. Pooled non-Big-4
+    print('\n[A] Pooled non-Big-4')
+    for desc, arr in [('cos', cos_all), ('dh_indep', dh_all)]:
+        r = kde_dip(arr)
+        results['pooled'][desc] = r
+        print(f'  {desc}: n={r["n"]:,}, dip={r["dip"]:.5f}, '
+              f'p={_fmt_p(r["dip_pvalue"])}, n_modes={r["n_modes"]}')
+
+    # B. Per-firm (only firms with >= SINGLE_FIRM_MIN_SIG signatures)
+    eligible = [f for f, n in firm_counts.items() if n >= SINGLE_FIRM_MIN_SIG]
+    print(f'\n[B] Per-firm dip test '
+          f'(firms with >= {SINGLE_FIRM_MIN_SIG} signatures: {len(eligible)})')
+    for f in sorted(eligible, key=lambda x: -firm_counts[x]):
+        mask = firms == f
+        results['per_firm_eligible'][f] = {'n': int(mask.sum())}
+        for desc, arr in [('cos', cos_all[mask]), ('dh_indep', dh_all[mask])]:
+            r = kde_dip(arr)
+            results['per_firm_eligible'][f][desc] = r
+            print(f'  {f[:20]:<22s} {desc}: n={r["n"]:,}, dip={r["dip"]:.5f}, '
+                  f'p={_fmt_p(r["dip_pvalue"])}, n_modes={r["n_modes"]}')
+
+    json_path = OUT / 'midsmall_diptest_results.json'
+    json_path.write_text(json.dumps(results, indent=2, ensure_ascii=False),
+                         encoding='utf-8')
+    print(f'\n[json] {json_path}')
+
+    md = ['# Mid/Small-Firm Signature-Level Dip Test (Script 39c)',
+          '',
+          f'Generated: {results["meta"]["timestamp"]}',
+          f'Bootstrap replicates: {N_BOOT}',
+          '',
+          '## A. Pooled non-Big-4 signature cloud',
+          '',
+          f'n = {n_total:,} signatures across '
+          f'{results["meta"]["n_firms"]} firms',
+          '',
+          '| Marginal | dip | p (boot) | n_modes | unimodal @0.05 |',
+          '|---|---|---|---|---|']
+    for desc in ['cos', 'dh_indep']:
+        r = results['pooled'][desc]
+        md.append(f'| {desc} | {r["dip"]:.5f} | {_fmt_p(r["dip_pvalue"])} | '
+                  f'{r["n_modes"]} | {r["unimodal_alpha05"]} |')
+
+    md += ['', f'## B. Single mid/small firms (>= {SINGLE_FIRM_MIN_SIG} '
+           f'signatures), {len(eligible)} qualify', '',
+           '| Firm | Marginal | n | dip | p (boot) | n_modes | unimodal @0.05 |',
+           '|---|---|---|---|---|---|---|']
+    for f in sorted(eligible, key=lambda x: -firm_counts[x]):
+        for desc in ['cos', 'dh_indep']:
+            r = results['per_firm_eligible'][f][desc]
+            md.append(f'| {f[:20]} | {desc} | {r["n"]:,} | {r["dip"]:.5f} | '
+                      f'{_fmt_p(r["dip_pvalue"])} | {r["n_modes"]} | '
+                      f'{r["unimodal_alpha05"]} |')
+
+    md += ['',
+           '## Reading guide',
+           '',
+           ('If the pooled-non-Big-4 dHash marginal rejects unimodality '
+            'AND the qualifying individual mid/small firms also reject, '
+            'the dHash within-firm replication regime structure is '
+            'corpus-universal and not Big-4-specific. In that case the '
+            'Big-4 scope of v4.0 is justified on cosine-axis grounds '
+            '(Firm-A composition; §III-G item 1) and accountant-level '
+            'LOOO reproducibility (§III-G item 3), but not on dHash '
+            'multimodality grounds (§III-G item 2 should be re-scoped or '
+            'qualified). If the per-firm dHash tests instead fail to '
+            'reject inside mid/small firms, the dHash multimodality is '
+            'Big-4-specific and §III-G item 2 holds as stated.'),
+           '']
+    md_path = OUT / 'midsmall_diptest_report.md'
+    md_path.write_text('\n'.join(md), encoding='utf-8')
+    print(f'[md  ] {md_path}')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/signature_analysis/39d_dhash_discrete_robustness.py b/signature_analysis/39d_dhash_discrete_robustness.py
new file mode 100644
index 0000000..d91e7fe
--- /dev/null
+++ b/signature_analysis/39d_dhash_discrete_robustness.py
@@ -0,0 +1,446 @@
+#!/usr/bin/env python3
+"""
+Script 39d: dHash Discrete-Value Robustness Diagnostics
+========================================================
+Codex (gpt-5.5 xhigh) attack on Script 39b/39c findings revealed that
+the within-firm dHash dip-test rejections are driven by integer mass
+points (dHash takes integer values 0..64).
+A uniform jitter of [-0.5, +0.5] eliminates dip rejection in every firm tested. This
+script consolidates that finding into a permanent diagnostic and adds:
+
+  1. Raw vs jittered dip with multi-seed robustness (5 seeds)
+  2. Integer-histogram valley analysis: locate local minima between
+     adjacent peaks in the binned integer distribution; report whether
+     any valley centers near dh = 5
+  3. Firm-residualized dip on dHash (analog of cosine firm-mean
+     centering that confirmed the cosine reframe)
+  4. Pairwise pair-coincidence: does the same same-CPA pair achieve
+     both max cosine and min dHash, or are the two descriptors
+     attached to different pairs? Foundation for "is (cos, dh) a
+     joint signature regime descriptor or two parallel descriptors"
+
+This script does not derive operational thresholds; it characterises
+whether the v4.0 K=3 mixture and v3.x cos>0.95 AND dh<=5 rule are
+robustly supported once integer-discreteness artifacts are removed.
+
+Outputs:
+  reports/v4_big4/dhash_discrete_robustness/
+      dhash_discrete_results.json
+      dhash_discrete_report.md
+"""
+
+import json
+import sqlite3
+import numpy as np
+import diptest
+from pathlib import Path
+from datetime import datetime
+
+DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
+OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/'
+           'v4_big4/dhash_discrete_robustness')
+OUT.mkdir(parents=True, exist_ok=True)
+
+BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合')
+ALIAS = {'勤業眾信聯合': 'Firm A',
+         '安侯建業聯合': 'Firm B',
+         '資誠聯合': 'Firm C',
+         '安永聯合': 'Firm D'}
+N_BOOT = 2000
+JITTER_SEEDS = [42, 43, 44, 45, 46]
+SINGLE_FIRM_MIN_SIG = 500
+
+
+def load_signatures():
+    conn = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
+    cur = conn.cursor()
+    cur.execute('''
+        SELECT a.firm, s.assigned_accountant,
+               s.max_similarity_to_same_accountant,
+               CAST(s.min_dhash_independent AS REAL)
+        FROM signatures s
+        JOIN accountants a ON s.assigned_accountant = a.name
+        WHERE s.assigned_accountant IS NOT NULL
+          AND s.max_similarity_to_same_accountant IS NOT NULL
+          AND s.min_dhash_independent IS NOT NULL
+          AND a.firm IS NOT NULL
+    ''')
+    rows = cur.fetchall()
+    conn.close()
+    return rows
+
+
+def dip(values, n_boot=N_BOOT):
+    arr = np.asarray(values, dtype=float)
+    arr = arr[np.isfinite(arr)]
+    d, p = diptest.diptest(arr, boot_pval=True, n_boot=n_boot)
+    return float(d), float(p)
+
+
+def multi_seed_jitter_dip(values, seeds=JITTER_SEEDS, n_boot=N_BOOT):
+    """Compute dip stat + p-value across seeds; return distribution."""
+    arr = np.asarray(values, dtype=float)
+    arr = arr[np.isfinite(arr)]
+    stats = []
+    for seed in seeds:
+        rng = np.random.default_rng(seed)
+        j = arr + rng.uniform(-0.5, 0.5, len(arr))
+        d, p = diptest.diptest(j, boot_pval=True, n_boot=n_boot)
+        stats.append({'seed': seed, 'dip': float(d), 'p': float(p)})
+    return {
+        'n_seeds': len(seeds),
+        'p_min': min(s['p'] for s in stats),
+        'p_max': max(s['p'] for s in stats),
+        'p_median': float(np.median([s['p'] for s in stats])),
+        'dip_min': min(s['dip'] for s in stats),
+        'dip_max': max(s['dip'] for s in stats),
+        'reject_at_05_count': int(sum(1 for s in stats if s['p'] <= 0.05)),
+        'per_seed': stats,
+    }
+
+
+def integer_histogram_valleys(values, max_bin=20):
+    """For integer-valued data, locate local minima in the count
+    histogram on bins 0..max_bin. Returns valley positions and depths
+    relative to flanking peaks."""
+    arr = np.asarray(values, dtype=float)
+    arr = arr[np.isfinite(arr)]
+    bins = np.arange(0, max_bin + 2)  # 0, 1, ..., max_bin+1
+    counts, edges = np.histogram(arr, bins=bins)
+    centers = (edges[:-1] + edges[1:]) / 2.0
+    valleys = []
+    for i in range(1, len(counts) - 1):
+        if counts[i] < counts[i - 1] and counts[i] < counts[i + 1]:
+            left_peak = counts[i - 1]
+            right_peak = counts[i + 1]
+            min_peak = min(left_peak, right_peak)
+            depth_rel = (min_peak - counts[i]) / min_peak if min_peak else 0
+            valleys.append({
+                'bin_center': float(centers[i]),
+                'count': int(counts[i]),
+                'left_peak_bin': int(centers[i - 1]),
+                'left_peak_count': int(left_peak),
+                'right_peak_bin': int(centers[i + 1]),
+                'right_peak_count': int(right_peak),
+                'depth_rel': float(depth_rel),
+            })
+    return {
+        'histogram_bins_0_to_max': counts[:max_bin + 1].tolist(),
+        'valleys': valleys,
+        'note': ('valleys are bins where count < both neighbours; '
+                 'depth_rel = (min(neighbour) - bin) / min(neighbour). '
+                 'A genuine antimode would have a deep, stable valley '
+                 'with depth_rel > 0.1.'),
+    }
+
+
+def firm_residualized(values, firm_labels):
+    """Return values with firm means subtracted (centered to grand mean
+    over firms). Used to test whether residual within-firm structure
+    rejects unimodality."""
+    arr = np.asarray(values, dtype=float)
+    firms = np.asarray(firm_labels)
+    out = arr.copy()
+    grand = float(np.mean(arr))
+    for f in np.unique(firms):
+        m = firms == f
+        out[m] = arr[m] - float(np.mean(arr[m])) + grand
+    return out
+
+
+def pair_coincidence_rate():
+    """Fraction of signatures whose max-cosine partner equals the
+    min-dHash partner within the same-CPA cross-year pool."""
+    conn = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
+    cur = conn.cursor()
+    cur.execute('''
+        SELECT COUNT(*) AS n_total,
+               SUM(CASE WHEN max_cosine_pair_id IS NOT NULL
+                         AND min_dhash_pair_id IS NOT NULL
+                         AND max_cosine_pair_id = min_dhash_pair_id
+                        THEN 1 ELSE 0 END) AS n_same_pair,
+               SUM(CASE WHEN max_cosine_pair_id IS NOT NULL
+                         AND min_dhash_pair_id IS NOT NULL
+                         AND max_cosine_pair_id != min_dhash_pair_id
+                        THEN 1 ELSE 0 END) AS n_diff_pair,
+               SUM(CASE WHEN max_cosine_pair_id IS NULL
+                         OR min_dhash_pair_id IS NULL
+                        THEN 1 ELSE 0 END) AS n_null
+        FROM signatures
+    ''')
+    row = cur.fetchone()
+    conn.close()
+    n_total, n_same, n_diff, n_null = row
+    n_with_both = (n_same or 0) + (n_diff or 0)
+    return {
+        'n_total': int(n_total or 0),
+        'n_with_both_pair_ids': int(n_with_both),
+        'n_same_pair': int(n_same or 0),
+        'n_diff_pair': int(n_diff or 0),
+        'n_null': int(n_null or 0),
+        'same_pair_rate': (float(n_same) / n_with_both
+                           if n_with_both else None),
+        'note': ('rate computed over signatures where both '
+                 'max_cosine_pair_id and min_dhash_pair_id are present'),
+    }
+
+
+def _fmt_p(p):
+    return '< 5e-4' if p == 0.0 else f'{p:.4g}'
+
+
+def main():
+    print('=' * 72)
+    print('Script 39d: dHash Discrete-Value Robustness Diagnostics')
+    print('=' * 72)
+    rows = load_signatures()
+    firms_raw = np.array([r[0] for r in rows])
+    cos = np.array([r[2] for r in rows], dtype=float)
+    dh = np.array([r[3] for r in rows], dtype=float)
+    is_big4 = np.isin(firms_raw, BIG4)
+    n = len(rows)
+    print(f'\nLoaded {n:,} signatures; Big-4 {is_big4.sum():,}, '
+          f'non-Big-4 {(~is_big4).sum():,}')
+
+    results = {
+        'meta': {
+            'script': '39d',
+            'timestamp': datetime.now().isoformat(timespec='seconds'),
+            'n_total_signatures': int(n),
+            'n_big4': int(is_big4.sum()),
+            'n_non_big4': int((~is_big4).sum()),
+            'n_boot': N_BOOT,
+            'jitter_seeds': JITTER_SEEDS,
+            'note': ('Diagnostic for dHash integer-mass-point artifact '
+                     'in dip test; codex round-29 attack on Script 39b/c'),
+        },
+    }
+
+    # ---- A. Raw vs multi-seed jittered dip ----
+    print('\n[A] Raw vs jittered dip (5 seeds, n_boot=2000)')
+    panels = {}
+    # Big-4 pooled
+    print('  Big-4 pooled:')
+    raw_d, raw_p = dip(dh[is_big4])
+    j = multi_seed_jitter_dip(dh[is_big4])
+    panels['big4_pooled'] = {
+        'n': int(is_big4.sum()),
+        'raw': {'dip': raw_d, 'p': raw_p},
+        'jittered': j,
+    }
+    print(f'    raw: dip={raw_d:.5f}, p={_fmt_p(raw_p)}')
+    print(f'    jitter: p_median={j["p_median"]:.4g}, '
+          f'p_range=[{j["p_min"]:.4g}, {j["p_max"]:.4g}], '
+          f'reject@.05 in {j["reject_at_05_count"]}/5 seeds')
+    # Each Big-4 firm
+    for f in BIG4:
+        mask = firms_raw == f
+        if mask.sum() == 0:
+            continue
+        raw_d, raw_p = dip(dh[mask])
+        j = multi_seed_jitter_dip(dh[mask])
+        panels[ALIAS[f]] = {
+            'n': int(mask.sum()),
+            'raw': {'dip': raw_d, 'p': raw_p},
+            'jittered': j,
+        }
+        print(f'  {ALIAS[f]} (n={mask.sum():,}):')
+        print(f'    raw: dip={raw_d:.5f}, p={_fmt_p(raw_p)}')
+        print(f'    jitter: p_median={j["p_median"]:.4g}, '
+              f'reject@.05 in {j["reject_at_05_count"]}/5 seeds')
+    # Non-Big-4 pooled
+    print('  Non-Big-4 pooled:')
+    raw_d, raw_p = dip(dh[~is_big4])
+    j = multi_seed_jitter_dip(dh[~is_big4])
+    panels['non_big4_pooled'] = {
+        'n': int((~is_big4).sum()),
+        'raw': {'dip': raw_d, 'p': raw_p},
+        'jittered': j,
+    }
+    print(f'    raw: dip={raw_d:.5f}, p={_fmt_p(raw_p)}')
+    print(f'    jitter: p_median={j["p_median"]:.4g}, '
+          f'reject@.05 in {j["reject_at_05_count"]}/5 seeds')
+    results['raw_vs_jittered_dip'] = panels
+
+    # ---- B. Integer-histogram valley analysis ----
+    print('\n[B] Integer-histogram valley analysis (bins 0..20)')
+    valleys = {}
+    valleys['big4_pooled'] = integer_histogram_valleys(dh[is_big4])
+    print(f'  Big-4 pooled: {len(valleys["big4_pooled"]["valleys"])} valleys')
+    for v in valleys['big4_pooled']['valleys']:
+        print(f'    bin {v["bin_center"]:.1f}: count={v["count"]}, '
+              f'depth_rel={v["depth_rel"]:.3f}')
+    for f in BIG4:
+        mask = firms_raw == f
+        if mask.sum() == 0:
+            continue
+        valleys[ALIAS[f]] = integer_histogram_valleys(dh[mask])
+        print(f'  {ALIAS[f]}: '
+              f'{len(valleys[ALIAS[f]]["valleys"])} valleys')
+        for v in valleys[ALIAS[f]]['valleys']:
+            print(f'    bin {v["bin_center"]:.1f}: count={v["count"]}, '
+                  f'depth_rel={v["depth_rel"]:.3f}')
+    valleys['non_big4_pooled'] = integer_histogram_valleys(dh[~is_big4])
+    print(f'  Non-Big-4 pooled: '
+          f'{len(valleys["non_big4_pooled"]["valleys"])} valleys')
+    for v in valleys['non_big4_pooled']['valleys']:
+        print(f'    bin {v["bin_center"]:.1f}: count={v["count"]}, '
+              f'depth_rel={v["depth_rel"]:.3f}')
+    results['integer_histogram_valleys'] = valleys
+
+    # ---- C. Firm-residualized dip on dHash, signature level ----
+    print('\n[C] Firm-residualized dHash dip (signature level)')
+    firm_labels = np.array([
+        ALIAS[f] if f in ALIAS else f'M:{f}'
+        for f in firms_raw
+    ])
+    # Big-4 only residualized over A/B/C/D
+    dh_resid_big4 = firm_residualized(dh[is_big4], firm_labels[is_big4])
+    raw_d, raw_p = dip(dh[is_big4])
+    res_d, res_p = dip(dh_resid_big4)
+    print(f'  Big-4 raw: dip={raw_d:.5f}, p={_fmt_p(raw_p)}')
+    print(f'  Big-4 residualized: dip={res_d:.5f}, p={_fmt_p(res_p)}')
+    # Also non-Big-4 residualized over their firms
+    dh_resid_nbig4 = firm_residualized(dh[~is_big4], firm_labels[~is_big4])
+    raw_d_n, raw_p_n = dip(dh[~is_big4])
+    res_d_n, res_p_n = dip(dh_resid_nbig4)
+    print(f'  Non-Big-4 raw: dip={raw_d_n:.5f}, p={_fmt_p(raw_p_n)}')
+    print(f'  Non-Big-4 residualized: dip={res_d_n:.5f}, p={_fmt_p(res_p_n)}')
+    results['firm_residualized_dh_dip'] = {
+        'big4': {
+            'raw': {'dip': raw_d, 'p': raw_p},
+            'firm_residualized': {'dip': res_d, 'p': res_p},
+        },
+        'non_big4': {
+            'raw': {'dip': raw_d_n, 'p': raw_p_n},
+            'firm_residualized': {'dip': res_d_n, 'p': res_p_n},
+        },
+        'note': ('Residualization subtracts each firm mean dh and adds '
+                 'back the grand mean. If residual dip rejects, there is '
+                 'genuine within-firm dh multimodality independent of '
+                 'between-firm mean shifts. If residual fails to reject, '
+                 'all dh "multimodality" was between-firm composition.'),
+    }
+
+    # ---- D. Pair-coincidence rate ----
+    print('\n[D] Pair-coincidence rate (max-cos pair vs min-dh pair)')
+    try:
+        pc = pair_coincidence_rate()
+        if pc['same_pair_rate'] is not None:
+            print(f'  n_with_both: {pc["n_with_both_pair_ids"]:,}, '
+                  f'same-pair rate: {pc["same_pair_rate"]:.4f}')
+        else:
+            print('  Pair IDs not stored in signatures table (skipped)')
+        results['pair_coincidence'] = pc
+    except sqlite3.OperationalError as e:
+        print(f'  SQL error (pair_id columns may not exist): {e}')
+        results['pair_coincidence'] = {
+            'error': str(e),
+            'note': ('signatures table lacks max_cosine_pair_id / '
+                     'min_dhash_pair_id columns; analysis skipped'),
+        }
+
+    json_path = OUT / 'dhash_discrete_results.json'
+    json_path.write_text(json.dumps(results, indent=2, ensure_ascii=False),
+                         encoding='utf-8')
+    print(f'\n[json] {json_path}')
+
+    # ---- Report markdown ----
+    md = ['# dHash Discrete-Value Robustness Diagnostics (Script 39d)',
+          '', f'Generated: {results["meta"]["timestamp"]}',
+          f'Bootstrap replicates: {N_BOOT}; jitter seeds: {JITTER_SEEDS}',
+          '',
+          '## A. Raw vs jittered dHash dip (signature level)',
+          '',
+          ('dHash is integer-valued in [0, 64]. A raw dip test on '
+           'integer mass points may reject unimodality due to discrete '
+           'spikes rather than a continuous bimodal density. We add '
+           'uniform jitter in [-0.5, +0.5] over 5 seeds and re-test.'),
+          '',
+          '| Scope | n | raw dip | raw p | jitter p median | jitter reject@.05 / 5 seeds |',
+          '|---|---|---|---|---|---|']
+    for key, label in [('big4_pooled', 'Big-4 pooled')] + \
+                      [(ALIAS[f], ALIAS[f]) for f in BIG4] + \
+                      [('non_big4_pooled', 'Non-Big-4 pooled')]:
+        if key in panels:
+            p = panels[key]
+            md.append(f'| {label} | {p["n"]:,} | '
+                      f'{p["raw"]["dip"]:.5f} | '
+                      f'{_fmt_p(p["raw"]["p"])} | '
+                      f'{p["jittered"]["p_median"]:.4g} | '
+                      f'{p["jittered"]["reject_at_05_count"]}/5 |')
+    md += ['',
+           '**Interpretation.** If jittered dip ceases to reject in all '
+           'panels, the raw-data rejection was driven by integer ties '
+           'rather than a continuous bimodal density. Codex round-29 '
+           'observed this pattern; this script confirms with multi-seed '
+           'robustness.',
+           '',
+           '## B. Integer-histogram valley locations (bins 0..20)',
+           '',
+           ('For each scope, list bins where count is strictly less '
+            'than both neighbours, with relative depth '
+            '(min(neighbour) - bin) / min(neighbour). A genuine '
+            'antimode would show a deep, stable valley; integer-noise '
+            'valleys are shallow and inconsistent across firms.'),
+           '']
+    for key, label in [('big4_pooled', 'Big-4 pooled')] + \
+                      [(ALIAS[f], ALIAS[f]) for f in BIG4] + \
+                      [('non_big4_pooled', 'Non-Big-4 pooled')]:
+        if key in valleys:
+            v_list = valleys[key]['valleys']
+            if not v_list:
+                md.append(f'- **{label}**: no integer-histogram valleys '
+                          f'in 0..20')
+            else:
+                desc = ', '.join(
+                    f'dh={v["bin_center"]:.0f} (depth_rel={v["depth_rel"]:.3f})'
+                    for v in v_list)
+                md.append(f'- **{label}**: {desc}')
+    md += ['',
+           '## C. Firm-residualized dHash dip',
+           '',
+           ('Subtract each firm mean dHash; add back grand mean. If '
+            'residual rejects, within-firm multimodality is genuine. '
+            'If residual fails to reject, all dh "multimodality" was '
+            'between-firm composition.'),
+           '',
+           '| Scope | raw dip | raw p | residualized dip | residualized p |',
+           '|---|---|---|---|---|']
+    fr = results['firm_residualized_dh_dip']
+    md += [f'| Big-4 | {fr["big4"]["raw"]["dip"]:.5f} | '
+           f'{_fmt_p(fr["big4"]["raw"]["p"])} | '
+           f'{fr["big4"]["firm_residualized"]["dip"]:.5f} | '
+           f'{_fmt_p(fr["big4"]["firm_residualized"]["p"])} |',
+           f'| Non-Big-4 | {fr["non_big4"]["raw"]["dip"]:.5f} | '
+           f'{_fmt_p(fr["non_big4"]["raw"]["p"])} | '
+           f'{fr["non_big4"]["firm_residualized"]["dip"]:.5f} | '
+           f'{_fmt_p(fr["non_big4"]["firm_residualized"]["p"])} |']
+    md += ['',
+           '## D. Max-cos pair vs min-dh pair coincidence',
+           '']
+    pc = results.get('pair_coincidence', {})
+    if 'same_pair_rate' in pc and pc['same_pair_rate'] is not None:
+        md += [f'- n_signatures with both pair IDs: '
+               f'{pc["n_with_both_pair_ids"]:,}',
+               f'- same-pair rate: {pc["same_pair_rate"]:.4f} '
+               f'({pc["n_same_pair"]:,} of '
+               f'{pc["n_with_both_pair_ids"]:,})',
+               '',
+               ('A high rate (>0.8) supports a single-pair regime '
+                'descriptor language (cos and dh attached to the same '
+                'partner). A low rate indicates the two descriptors '
+                'attach to different partners and should be discussed '
+                'as parallel-but-different evidence.')]
+    elif 'error' in pc:
+        md += [f'- column not present in DB: {pc["error"]}',
+               ('- note: schema-dependent; pair IDs not currently stored '
+                'in signatures table.')]
+    md.append('')
+    md_path = OUT / 'dhash_discrete_report.md'
+    md_path.write_text('\n'.join(md), encoding='utf-8')
+    print(f'[md  ] {md_path}')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/signature_analysis/39e_dhash_residualized_jittered.py b/signature_analysis/39e_dhash_residualized_jittered.py
new file mode 100644
index 0000000..6d3cf99
--- /dev/null
+++ b/signature_analysis/39e_dhash_residualized_jittered.py
@@ -0,0 +1,250 @@
+#!/usr/bin/env python3
+"""
+Script 39e: dHash Firm-Residualized + Jittered Dip (final test)
+================================================================
+Script 39d showed:
+  - Within-firm dh dip rejections all vanish after jitter (integer
+    artifact)
+  - Big-4 pooled dh dip survives jitter (p_median=0 over 5 seeds)
+
+But Firm A mean dh = 2.73 vs Firms B/C/D ~6.5-7.4 -- a large
+between-firm location shift, analogous to the cosine case where
+firm-mean centering eliminated rejection.
+
+This script applies BOTH corrections simultaneously:
+  1. Firm-mean centering (remove between-firm location shifts)
+  2. Uniform jitter in [-0.5, +0.5] (remove integer ties)
+
+If the doubly-corrected dh distribution rejects unimodality, the
+Big-4 pooled multimodality is a genuine within-population, continuous
+phenomenon. If it fails to reject, dh "multimodality" is fully
+explained by between-firm composition (same conclusion as cosine).
+
+Multi-seed (5 seeds) for robustness.
+ +Outputs: + reports/v4_big4/dhash_discrete_robustness/ + dhash_residualized_jittered_results.json + dhash_residualized_jittered_report.md +""" + +import json +import sqlite3 +import numpy as np +import diptest +from pathlib import Path +from datetime import datetime + +DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db' +OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/' + 'v4_big4/dhash_discrete_robustness') +OUT.mkdir(parents=True, exist_ok=True) + +BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合') +ALIAS = {'勤業眾信聯合': 'Firm A', + '安侯建業聯合': 'Firm B', + '資誠聯合': 'Firm C', + '安永聯合': 'Firm D'} +N_BOOT = 2000 +SEEDS = [42, 43, 44, 45, 46] + + +def load_signatures(): + conn = sqlite3.connect(f'file:{DB}?mode=ro', uri=True) + cur = conn.cursor() + cur.execute(''' + SELECT a.firm, CAST(s.min_dhash_independent AS REAL) + FROM signatures s + JOIN accountants a ON s.assigned_accountant = a.name + WHERE s.assigned_accountant IS NOT NULL + AND s.max_similarity_to_same_accountant IS NOT NULL + AND s.min_dhash_independent IS NOT NULL + AND a.firm IS NOT NULL + ''') + rows = cur.fetchall() + conn.close() + return rows + + +def firm_residualize(values, firm_labels): + arr = np.asarray(values, dtype=float) + firms = np.asarray(firm_labels) + out = arr.copy() + grand = float(np.mean(arr)) + for f in np.unique(firms): + m = firms == f + out[m] = arr[m] - float(np.mean(arr[m])) + grand + return out + + +def dip_multi(values, seeds, with_jitter, n_boot=N_BOOT): + arr = np.asarray(values, dtype=float) + arr = arr[np.isfinite(arr)] + results = [] + for seed in seeds: + rng = np.random.default_rng(seed) + v = arr + rng.uniform(-0.5, 0.5, len(arr)) if with_jitter else arr + d, p = diptest.diptest(v, boot_pval=True, n_boot=n_boot) + results.append({'seed': seed, 'dip': float(d), 'p': float(p)}) + if not with_jitter: + break # without jitter the seed is irrelevant + return results + + +def _fmt_p(p): + return '< 5e-4' if p == 0.0 else f'{p:.4g}' + + +def 
summarize(name, results):
    """Aggregate per-seed dip results into min/max/median dip and p plus
    a count of seeds rejecting unimodality at alpha = 0.05."""
    ps = [r['p'] for r in results]
    ds = [r['dip'] for r in results]
    return {
        'name': name,
        'n_seeds': len(results),
        'dip_min': min(ds), 'dip_max': max(ds), 'dip_median': float(np.median(ds)),
        'p_min': min(ps), 'p_max': max(ps), 'p_median': float(np.median(ps)),
        'reject_at_05_count': int(sum(1 for p in ps if p <= 0.05)),
        'per_seed': results,
    }


def main():
    """Run the 39e dip-test conditions on Big-4 signature-level dHash:
    raw, firm-mean centered, jittered, and centered+jittered."""
    print('=' * 72)
    print('Script 39e: dHash Firm-Residualized + Jittered Dip')
    print('=' * 72)
    rows = load_signatures()
    firms_raw = np.array([r[0] for r in rows])
    dh = np.array([r[1] for r in rows], dtype=float)
    is_big4 = np.isin(firms_raw, BIG4)
    big4_dh = dh[is_big4]
    big4_firms = np.array([ALIAS[f] for f in firms_raw[is_big4]])

    print(f'\nLoaded {len(rows):,} signatures; Big-4 {is_big4.sum():,}')
    print('\nPer-firm Big-4 dh summary:')
    for f in sorted(set(big4_firms)):
        v = big4_dh[big4_firms == f]
        print(f' {f}: n={len(v):,} mean={v.mean():.3f} '
              f'median={np.median(v):.1f} sd={v.std():.3f}')

    # ---- Test conditions, all on Big-4 signature-level dh ----
    panels = {}

    # 1. Raw (no centering, no jitter)
    # Single seed: dip_multi ignores further seeds when with_jitter=False.
    print('\n[1] Raw dh')
    r = dip_multi(big4_dh, [42], with_jitter=False)
    panels['raw'] = summarize('raw', r)
    print(f' dip={r[0]["dip"]:.5f}, p={_fmt_p(r[0]["p"])}')

    # 2. Centered only (no jitter; integer values preserved)
    print('\n[2] Firm-mean centered, no jitter')
    centered = firm_residualize(big4_dh, big4_firms)
    r = dip_multi(centered, [42], with_jitter=False)
    panels['centered_only'] = summarize('centered_only', r)
    print(f' dip={r[0]["dip"]:.5f}, p={_fmt_p(r[0]["p"])}')

    # 3. Jittered only (no centering)
    print('\n[3] Jittered (5 seeds), no centering')
    r = dip_multi(big4_dh, SEEDS, with_jitter=True)
    panels['jitter_only'] = summarize('jitter_only', r)
    print(f' p_median={panels["jitter_only"]["p_median"]:.4g}, '
          f'reject@.05 in '
          f'{panels["jitter_only"]["reject_at_05_count"]}/5 seeds')

    # 4. 
Centered + jittered (THE key test) + print('\n[4] Firm-mean centered + jittered (5 seeds) -- KEY TEST') + r = dip_multi(centered, SEEDS, with_jitter=True) + panels['centered_jittered'] = summarize('centered_jittered', r) + print(f' p_median={panels["centered_jittered"]["p_median"]:.4g}, ' + f'reject@.05 in ' + f'{panels["centered_jittered"]["reject_at_05_count"]}/5 seeds') + for s in r: + print(f' seed {s["seed"]}: dip={s["dip"]:.5f}, p={_fmt_p(s["p"])}') + + # Per-firm dh stats (re-confirm Firm A shift) + firm_stats = {} + for f in sorted(set(big4_firms)): + v = big4_dh[big4_firms == f] + firm_stats[f] = { + 'n': int(len(v)), + 'mean': float(v.mean()), + 'median': float(np.median(v)), + 'sd': float(v.std()), + 'p25': float(np.percentile(v, 25)), + 'p75': float(np.percentile(v, 75)), + 'pct_le_5': float(np.mean(v <= 5)), + 'pct_gt_15': float(np.mean(v > 15)), + } + + results = { + 'meta': { + 'script': '39e', + 'timestamp': datetime.now().isoformat(timespec='seconds'), + 'n_big4_signatures': int(big4_dh.size), + 'n_boot': N_BOOT, + 'seeds': SEEDS, + 'note': ('Final test: does Big-4 pooled dh multimodality ' + 'survive BOTH firm-mean centering and integer-tie ' + 'jitter?'), + }, + 'panels': panels, + 'per_firm_dh_stats': firm_stats, + } + + json_path = OUT / 'dhash_residualized_jittered_results.json' + json_path.write_text(json.dumps(results, indent=2, ensure_ascii=False), + encoding='utf-8') + print(f'\n[json] {json_path}') + + md = [ + '# dHash Firm-Residualized + Jittered Dip (Script 39e)', + '', f'Generated: {results["meta"]["timestamp"]}', + f'Bootstrap replicates: {N_BOOT}; jitter seeds: {SEEDS}', + '', + '## Per-firm Big-4 dh summary', + '', '| Firm | n | mean | median | sd | P25 | P75 | %<=5 | %>15 |', + '|---|---|---|---|---|---|---|---|---|', + ] + for f, s in firm_stats.items(): + md.append(f'| {f} | {s["n"]:,} | {s["mean"]:.3f} | ' + f'{s["median"]:.1f} | {s["sd"]:.3f} | ' + f'{s["p25"]:.1f} | {s["p75"]:.1f} | ' + f'{s["pct_le_5"]:.3f} | 
{s["pct_gt_15"]:.3f} |') + md += [ + '', + '## Dip test under four conditions (Big-4 pooled, sig-level)', + '', + '| Condition | dip | p (or p_median) | reject@.05 (seeds) |', + '|---|---|---|---|', + f'| 1. Raw (integer values) | {panels["raw"]["dip_median"]:.5f} ' + f'| {_fmt_p(panels["raw"]["p_median"])} | n/a (1 seed) |', + f'| 2. Firm-mean centered, no jitter ' + f'| {panels["centered_only"]["dip_median"]:.5f} ' + f'| {_fmt_p(panels["centered_only"]["p_median"])} | n/a (1 seed) |', + f'| 3. Jittered only (5 seeds) ' + f'| median {panels["jitter_only"]["dip_median"]:.5f} ' + f'| median {_fmt_p(panels["jitter_only"]["p_median"])} ' + f'| {panels["jitter_only"]["reject_at_05_count"]}/5 |', + f'| 4. **Centered + jittered (5 seeds)** ' + f'| median {panels["centered_jittered"]["dip_median"]:.5f} ' + f'| median {_fmt_p(panels["centered_jittered"]["p_median"])} ' + f'| {panels["centered_jittered"]["reject_at_05_count"]}/5 |', + '', + '## Interpretation', + '', + ('If Condition 4 still rejects unimodality, Big-4 dh has ' + 'genuine within-population continuous multimodality ' + 'independent of both between-firm location shifts and ' + 'integer mass points. If Condition 4 fails to reject, the ' + 'Big-4 pooled dh multimodality is fully explained by ' + '(between-firm mean shift) + (integer mass points). 
In the ' + 'latter case, the dh axis carries no independent within-firm ' + 'regime evidence beyond the cos axis.'), + '', + ] + md_path = OUT / 'dhash_residualized_jittered_report.md' + md_path.write_text('\n'.join(md), encoding='utf-8') + print(f'[md ] {md_path}') + + +if __name__ == '__main__': + main() diff --git a/signature_analysis/40b_inter_cpa_far_sweep.py b/signature_analysis/40b_inter_cpa_far_sweep.py new file mode 100644 index 0000000..997be2d --- /dev/null +++ b/signature_analysis/40b_inter_cpa_far_sweep.py @@ -0,0 +1,413 @@ +#!/usr/bin/env python3 +""" +Script 40b: Inter-CPA FAR Sweep for cos and dHash (joint + marginal) +===================================================================== +After codex round-29 destroyed the distributional path to thresholds +(K=3 mixture / dip / antimode shown composition-driven by Scripts +39b–39e), v4.0 pivots to an anchor-based threshold framework: +empirically derived from inter-CPA negative anchor specificity. + +Inter-CPA pairs (different CPAs, all-firm) are the negative anchor: +they are by definition not same-CPA replications, and the user's +within-CPA mechanism-transition concern (a CPA might switch from +hand-sign to template mid-career) does not enter the inter-CPA +calibration because each sampled pair crosses CPA boundaries. + +This script samples a large number of inter-CPA pairs and computes +both descriptors per pair (cosine via feature_vector dot product; +Hamming distance via dhash_vector XOR). It then sweeps: + + 1. FAR(cos > k) across k in [0.80, 0.99] + 2. FAR(dHash <= k) across k in [0, 20] + 3. Joint FAR(cos > 0.95 AND dHash <= k) for k in [0, 20] + 4. Conditional FAR(dHash <= k | cos > 0.95) -- the v3 inherited + rule's marginal specificity contribution from dHash + +Outputs: + reports/v4_big4/inter_cpa_far_sweep/ + far_sweep_results.json + far_sweep_report.md + +Sample size: 500,000 inter-CPA pairs (matches v3 Script 10 +convention). Big-4-only and full-corpus variants both reported. 
+""" + +import json +import sqlite3 +import numpy as np +from pathlib import Path +from datetime import datetime +from collections import defaultdict + +DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db' +OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/' + 'v4_big4/inter_cpa_far_sweep') +OUT.mkdir(parents=True, exist_ok=True) + +BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合') +ALIAS = {'勤業眾信聯合': 'Firm A', + '安侯建業聯合': 'Firm B', + '資誠聯合': 'Firm C', + '安永聯合': 'Firm D'} +N_PAIRS = 500_000 +SEED = 42 + +COS_GRID = [0.80, 0.83, 0.85, 0.87, 0.89, 0.90, 0.91, 0.92, 0.93, 0.94, + 0.945, 0.95, 0.955, 0.96, 0.965, 0.97, 0.975, 0.98, 0.985, + 0.99] +DH_GRID = list(range(0, 21)) + + +def hamming_64bit(a_bytes, b_bytes): + """Hamming distance between two 8-byte (64-bit) dHash byte strings.""" + a = int.from_bytes(a_bytes, 'big') + b = int.from_bytes(b_bytes, 'big') + return (a ^ b).bit_count() + + +def load_signatures(): + conn = sqlite3.connect(f'file:{DB}?mode=ro', uri=True) + cur = conn.cursor() + cur.execute(''' + SELECT s.signature_id, s.assigned_accountant, a.firm, + s.feature_vector, s.dhash_vector + FROM signatures s + JOIN accountants a ON s.assigned_accountant = a.name + WHERE s.assigned_accountant IS NOT NULL + AND s.feature_vector IS NOT NULL + AND s.dhash_vector IS NOT NULL + AND a.firm IS NOT NULL + ''') + rows = cur.fetchall() + conn.close() + return rows + + +def sample_inter_cpa_pairs(rows, n_pairs, seed, restrict_to_big4=False): + """Sample inter-CPA pairs and compute (cos, dh) for each.""" + rng = np.random.default_rng(seed) + if restrict_to_big4: + rows = [r for r in rows if r[2] in BIG4] + scope = 'big4_only' + else: + scope = 'all_firms' + print(f' [{scope}] {len(rows):,} signatures available') + + by_acct = defaultdict(list) + for r in rows: + by_acct[r[1]].append(r) + accountants = list(by_acct.keys()) + n_acct = len(accountants) + print(f' [{scope}] {n_acct} accountants') + + features = {a: np.stack( + 
[np.frombuffer(r[3], dtype=np.float32) for r in by_acct[a]] + ) for a in accountants} + dhashes = {a: [r[4] for r in by_acct[a]] for a in accountants} + + cos_vals = np.empty(n_pairs, dtype=np.float32) + dh_vals = np.empty(n_pairs, dtype=np.int32) + n_done = 0 + for _ in range(n_pairs): + i, j = rng.choice(n_acct, 2, replace=False) + a1, a2 = accountants[i], accountants[j] + n1, n2 = len(by_acct[a1]), len(by_acct[a2]) + k1 = int(rng.integers(0, n1)) + k2 = int(rng.integers(0, n2)) + f1 = features[a1][k1] + f2 = features[a2][k2] + cos = float(f1 @ f2) + d = hamming_64bit(dhashes[a1][k1], dhashes[a2][k2]) + cos_vals[n_done] = cos + dh_vals[n_done] = d + n_done += 1 + return scope, cos_vals, dh_vals + + +def wilson_ci(k, n, z=1.96): + if n == 0: + return (None, None) + phat = k / n + denom = 1 + z * z / n + centre = (phat + z * z / (2 * n)) / denom + half = z * np.sqrt(phat * (1 - phat) / n + z * z / (4 * n * n)) / denom + return (max(0.0, centre - half), min(1.0, centre + half)) + + +def far_at_cos(cos_vals, k): + n = len(cos_vals) + hits = int((cos_vals > k).sum()) + lo, hi = wilson_ci(hits, n) + return {'k': float(k), 'n': n, 'hits': hits, + 'far': hits / n, 'ci95_lo': lo, 'ci95_hi': hi} + + +def far_at_dh_le(dh_vals, k): + n = len(dh_vals) + hits = int((dh_vals <= k).sum()) + lo, hi = wilson_ci(hits, n) + return {'k': int(k), 'n': n, 'hits': hits, + 'far': hits / n, 'ci95_lo': lo, 'ci95_hi': hi} + + +def joint_far(cos_vals, dh_vals, cos_k, dh_k): + n = len(cos_vals) + hits = int(((cos_vals > cos_k) & (dh_vals <= dh_k)).sum()) + lo, hi = wilson_ci(hits, n) + return {'cos_k': float(cos_k), 'dh_k': int(dh_k), + 'n': n, 'hits': hits, + 'far': hits / n, 'ci95_lo': lo, 'ci95_hi': hi} + + +def cond_far(cos_vals, dh_vals, cos_k, dh_k): + """FAR(dh<=k | cos>cos_k)""" + cos_mask = cos_vals > cos_k + n_cond = int(cos_mask.sum()) + if n_cond == 0: + return {'cos_k': float(cos_k), 'dh_k': int(dh_k), + 'n_cond': 0, 'hits': 0, + 'cond_far': None, 'ci95_lo': None, 'ci95_hi': 
None} + hits = int(((dh_vals <= dh_k) & cos_mask).sum()) + lo, hi = wilson_ci(hits, n_cond) + return {'cos_k': float(cos_k), 'dh_k': int(dh_k), + 'n_cond': n_cond, 'hits': hits, + 'cond_far': hits / n_cond, 'ci95_lo': lo, 'ci95_hi': hi} + + +def invert_far_target(curve_entries, target, key='far'): + """Return the entries bracketing the target FAR (linear scan).""" + sorted_e = sorted(curve_entries, key=lambda e: e[key]) + for e in sorted_e: + if e[key] <= target: + best = e + else: + break + return best if sorted_e and sorted_e[0][key] <= target else None + + +def _fmt(x, fmt='.5f'): + return 'None' if x is None else format(x, fmt) + + +def run_scope(rows, scope_name, restrict_to_big4): + print(f'\n== Scope: {scope_name} ==') + scope_label, cos_vals, dh_vals = sample_inter_cpa_pairs( + rows, N_PAIRS, SEED, restrict_to_big4=restrict_to_big4) + print(f' Sampled {len(cos_vals):,} inter-CPA pairs') + print(f' cos: mean={cos_vals.mean():.4f}, ' + f'median={np.median(cos_vals):.4f}, ' + f'std={cos_vals.std():.4f}') + print(f' dh : mean={dh_vals.mean():.4f}, ' + f'median={np.median(dh_vals):.4f}, ' + f'std={dh_vals.std():.4f}') + + cos_curve = [far_at_cos(cos_vals, k) for k in COS_GRID] + dh_curve = [far_at_dh_le(dh_vals, k) for k in DH_GRID] + joint_curve_95 = [joint_far(cos_vals, dh_vals, 0.95, k) for k in DH_GRID] + cond_curve_95 = [cond_far(cos_vals, dh_vals, 0.95, k) for k in DH_GRID] + + print('\n [Cos FAR sweep]') + for e in cos_curve: + print(f' cos > {e["k"]:.3f}: FAR={_fmt(e["far"])}, ' + f'CI=[{_fmt(e["ci95_lo"])}, {_fmt(e["ci95_hi"])}], ' + f'hits={e["hits"]}/{e["n"]}') + + print('\n [dHash FAR sweep]') + for e in dh_curve: + print(f' dh <= {e["k"]:2d}: FAR={_fmt(e["far"])}, ' + f'CI=[{_fmt(e["ci95_lo"])}, {_fmt(e["ci95_hi"])}], ' + f'hits={e["hits"]}/{e["n"]}') + + print('\n [Joint FAR (cos > 0.95 AND dh <= k)]') + for e in joint_curve_95: + print(f' dh <= {e["dh_k"]:2d}: FAR={_fmt(e["far"])}, ' + f'hits={e["hits"]}/{e["n"]}') + + print('\n [Conditional 
FAR(dh <= k | cos > 0.95)]') + for e in cond_curve_95: + cf = e['cond_far'] + print(f' dh <= {e["dh_k"]:2d}: P(dh<=k | cos>0.95)=' + f'{_fmt(cf) if cf is not None else "n/a"}, ' + f'hits={e["hits"]}/{e["n_cond"]}') + + targets = [0.005, 0.001, 0.0005, 0.0001] + inv = {} + for t in targets: + inv[f'cos_far_<=_{t}'] = invert_far_target(cos_curve, t, 'far') + inv[f'dh_far_<=_{t}'] = invert_far_target(dh_curve, t, 'far') + inv[f'joint_at_cos95_far_<=_{t}'] = invert_far_target( + joint_curve_95, t, 'far') + + print('\n [Threshold inversion]') + for tgt in targets: + e = inv[f'cos_far_<=_{tgt}'] + if e is not None: + print(f' FAR <= {tgt}: max cos threshold with FAR<=tgt is ' + f'cos > {e["k"]:.3f} (FAR={e["far"]:.5f})') + e = inv[f'dh_far_<=_{tgt}'] + if e is not None: + print(f' FAR <= {tgt}: max dh threshold with FAR<=tgt is ' + f'dh <= {e["k"]} (FAR={e["far"]:.5f})') + e = inv[f'joint_at_cos95_far_<=_{tgt}'] + if e is not None: + print(f' FAR <= {tgt}: under cos>0.95, max dh threshold ' + f'with joint FAR<=tgt is dh <= {e["dh_k"]} ' + f'(joint FAR={e["far"]:.5f})') + + return { + 'scope': scope_label, + 'n_pairs': int(len(cos_vals)), + 'cos_summary': { + 'mean': float(cos_vals.mean()), + 'median': float(np.median(cos_vals)), + 'std': float(cos_vals.std()), + 'p99': float(np.percentile(cos_vals, 99)), + 'p999': float(np.percentile(cos_vals, 99.9)), + 'max': float(cos_vals.max()), + }, + 'dh_summary': { + 'mean': float(dh_vals.mean()), + 'median': float(np.median(dh_vals)), + 'std': float(dh_vals.std()), + 'p01': float(np.percentile(dh_vals, 1)), + 'p001': float(np.percentile(dh_vals, 0.1)), + 'min': int(dh_vals.min()), + }, + 'cos_far_curve': cos_curve, + 'dh_far_curve': dh_curve, + 'joint_far_at_cos95_curve': joint_curve_95, + 'cond_far_at_cos95_curve': cond_curve_95, + 'threshold_inversions': inv, + } + + +def main(): + print('=' * 72) + print('Script 40b: Inter-CPA FAR Sweep (cos + dHash, joint + marginal)') + print('=' * 72) + rows = load_signatures() + 
print(f'\nLoaded {len(rows):,} signatures (full corpus)') + + results = { + 'meta': { + 'script': '40b', + 'timestamp': datetime.now().isoformat(timespec='seconds'), + 'n_pairs_sampled': N_PAIRS, + 'seed': SEED, + 'note': ('Inter-CPA pair-level FAR sweep for cos and dHash. ' + 'Anchor-based threshold derivation; replaces ' + 'distributional path attacked in codex round-29.'), + }, + 'scopes': {}, + } + + results['scopes']['big4_only'] = run_scope( + rows, 'Big-4 only', restrict_to_big4=True) + results['scopes']['all_firms'] = run_scope( + rows, 'All firms', restrict_to_big4=False) + + json_path = OUT / 'far_sweep_results.json' + json_path.write_text(json.dumps(results, indent=2, ensure_ascii=False), + encoding='utf-8') + print(f'\n[json] {json_path}') + + md = [ + '# Inter-CPA FAR Sweep (Script 40b)', + '', + f'Generated: {results["meta"]["timestamp"]}', + f'Inter-CPA pair samples per scope: {N_PAIRS:,}; seed: {SEED}', + '', + ('Anchor-based threshold derivation. For each scope (Big-4 only ' + 'or all firms), sample random inter-CPA pairs and compute ' + 'cosine + Hamming distance per pair. 
Report False Acceptance ' + 'Rates (FAR) at various thresholds; invert FAR target to ' + 'derive thresholds with empirical specificity guarantees.'), + '', + ] + + for scope in ['big4_only', 'all_firms']: + s = results['scopes'][scope] + md += [f'## Scope: {scope} ({s["n_pairs"]:,} pairs)', '', + '### Cosine FAR curve', '', + '| cos > k | FAR | 95% CI | hits / n |', + '|---|---|---|---|'] + for e in s['cos_far_curve']: + md.append(f'| {e["k"]:.3f} | {_fmt(e["far"])} | ' + f'[{_fmt(e["ci95_lo"])}, {_fmt(e["ci95_hi"])}] | ' + f'{e["hits"]:,} / {e["n"]:,} |') + md += ['', '### dHash FAR curve', '', + '| dh <= k | FAR | 95% CI | hits / n |', + '|---|---|---|---|'] + for e in s['dh_far_curve']: + md.append(f'| {e["k"]:2d} | {_fmt(e["far"])} | ' + f'[{_fmt(e["ci95_lo"])}, {_fmt(e["ci95_hi"])}] | ' + f'{e["hits"]:,} / {e["n"]:,} |') + md += ['', '### Joint FAR (cos > 0.95 AND dh <= k)', '', + '| dh <= k | Joint FAR | hits / n |', + '|---|---|---|'] + for e in s['joint_far_at_cos95_curve']: + md.append(f'| {e["dh_k"]:2d} | {_fmt(e["far"])} | ' + f'{e["hits"]:,} / {e["n"]:,} |') + md += ['', + '### Conditional FAR(dh <= k | cos > 0.95)', + '', + 'Among inter-CPA pairs that already exceed cos > 0.95, ' + 'what fraction also have dh <= k? 
This quantifies ' + "dHash's marginal specificity contribution given the cos " + "gate is already applied.", + '', + '| dh <= k | Conditional FAR | hits / n_cond |', + '|---|---|---|'] + for e in s['cond_far_at_cos95_curve']: + cf = e['cond_far'] + md.append(f'| {e["dh_k"]:2d} | ' + f'{_fmt(cf) if cf is not None else "n/a"} | ' + f'{e["hits"]:,} / {e["n_cond"]:,} |') + md += ['', '### Threshold inversion', '', + '| FAR target | cos thresh | dh thresh | joint dh thresh ' + '(under cos>0.95) |', + '|---|---|---|---|'] + for tgt in [0.005, 0.001, 0.0005, 0.0001]: + e_c = s['threshold_inversions'].get(f'cos_far_<=_{tgt}') + e_d = s['threshold_inversions'].get(f'dh_far_<=_{tgt}') + e_j = s['threshold_inversions'].get( + f'joint_at_cos95_far_<=_{tgt}') + c_str = (f'cos > {e_c["k"]:.3f} (FAR={e_c["far"]:.5f})' + if e_c else 'unachievable') + d_str = (f'dh <= {e_d["k"]} (FAR={e_d["far"]:.5f})' + if e_d else 'unachievable') + j_str = (f'dh <= {e_j["dh_k"]} (FAR={e_j["far"]:.5f})' + if e_j else 'unachievable') + md.append(f'| {tgt} | {c_str} | {d_str} | {j_str} |') + md.append('') + + md += [ + '## Interpretation', + '', + ('- The cosine FAR curve replicates and extends v3.x §IV-I ' + 'Table X (which reported FAR=0.0005 at cos>0.95 from a ' + 'similar but smaller-sample inter-CPA negative anchor).'), + ('- The dHash FAR curve is the v4 contribution: prior v3.x ' + 'work used dh<=5 by convention without an empirical ' + 'specificity derivation. This script derives a specificity ' + "target → dh threshold mapping."), + ('- The conditional FAR(dh<=k | cos>0.95) curve tells us ' + 'whether dHash adds specificity given the cos gate. If the ' + "conditional FAR at dh<=5 is meaningfully lower than 1.0, " + 'dHash is providing additional specificity. 
If it is near ' + '1.0, dHash is largely redundant given cos>0.95 and the ' + 'five-way rule should be simplified.'), + ('- Thresholds derived by inverting FAR targets are ' + 'specificity-anchored operating points, not distributional ' + 'antimodes. They are robust to the integer-mass-point and ' + 'between-firm-composition artefacts identified in Scripts ' + '39b–39e.'), + '', + ] + md_path = OUT / 'far_sweep_report.md' + md_path.write_text('\n'.join(md), encoding='utf-8') + print(f'[md ] {md_path}') + + +if __name__ == '__main__': + main() diff --git a/signature_analysis/43_pool_normalized_per_signature_far.py b/signature_analysis/43_pool_normalized_per_signature_far.py new file mode 100644 index 0000000..5e73183 --- /dev/null +++ b/signature_analysis/43_pool_normalized_per_signature_far.py @@ -0,0 +1,568 @@ +#!/usr/bin/env python3 +""" +Script 43: Pool-Normalized Per-Signature FAR (anchor-based calibration) +======================================================================== +Codex round-30 verdict on Script 40b: per-pair FAR (~0.00060 at +cos>0.95) is NOT the per-signature classifier specificity. The +deployed classifier uses max-cosine and min-dHash over each CPA's +same-CPA pool, so the inter-CPA-equivalent specificity for a +signature with pool size n is approximately 1 - (1 - pair_FAR)^n, +which for Big-4 median pool ~280 is several percent, not 0.00014. + +This script computes pool-normalized per-signature FAR by drawing, +for each source signature s, a random inter-CPA candidate pool of +size n_pool(s) (= same-CPA pool size of s), and computing the +deployed descriptors against the random pool. The fraction of +source signatures whose max-cosine exceeds k (and/or min-dHash <= k) +is the per-signature FAR at that operating point. 
+ +We also report: + - "Any-pair" joint FAR: max_cos > c AND min_dh <= d (descriptors + may come from different candidates) + - "Same-pair" joint FAR: at least one candidate has both + cos > c AND dh <= d + - Per-firm and pool-size-decile stratification + - CPA-block bootstrap CI on key FAR points + - Threshold inversion for target per-signature FAR + +Inputs: full Big-4 sub-corpus (n=150,453 sigs / 468 CPAs). +Random pool draws use one realisation per source signature, with +seed control. CPA-block bootstrap quantifies sampling noise. + +Outputs: + reports/v4_big4/pool_normalized_far/ + pool_normalized_results.json + pool_normalized_report.md +""" + +import json +import sqlite3 +import numpy as np +from pathlib import Path +from datetime import datetime +from collections import defaultdict + +DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db' +OUT = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/' + 'v4_big4/pool_normalized_far') +OUT.mkdir(parents=True, exist_ok=True) + +BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合') +ALIAS = {'勤業眾信聯合': 'Firm A', + '安侯建業聯合': 'Firm B', + '資誠聯合': 'Firm C', + '安永聯合': 'Firm D'} +SEED = 42 +BATCH = 200 # source signatures per batch +N_BOOT_CPA = 1000 # CPA-block bootstrap replicates + +COS_KS = [0.90, 0.92, 0.93, 0.94, 0.945, 0.95, 0.955, 0.96, 0.97, 0.98] +DH_KS = [2, 3, 4, 5, 6, 8, 10, 15] + + +def load_big4(): + conn = sqlite3.connect(f'file:{DB}?mode=ro', uri=True) + cur = conn.cursor() + cur.execute(''' + SELECT s.signature_id, s.assigned_accountant, a.firm, s.source_pdf, + s.feature_vector, s.dhash_vector + FROM signatures s + JOIN accountants a ON s.assigned_accountant = a.name + WHERE s.assigned_accountant IS NOT NULL + AND s.feature_vector IS NOT NULL + AND s.dhash_vector IS NOT NULL + AND a.firm IN (?, ?, ?, ?) 
    ''', BIG4)
    rows = cur.fetchall()
    conn.close()
    return rows


def hamming_vec(query_bytes, cand_bytes_array):
    """Hamming between one 8-byte hash and an array of 8-byte hashes.

    Python-level loop over candidates; requires int.bit_count (3.10+).
    """
    q = int.from_bytes(query_bytes, 'big')
    out = np.empty(len(cand_bytes_array), dtype=np.int32)
    for i, c in enumerate(cand_bytes_array):
        c_int = int.from_bytes(c, 'big')
        out[i] = (q ^ c_int).bit_count()
    return out


def wilson_ci(k, n, z=1.96):
    """Wilson score interval (default z=1.96, ~95%) for proportion k/n;
    returns (lo, hi) clipped to [0, 1], or (None, None) when n == 0."""
    if n == 0:
        return (None, None)
    phat = k / n
    denom = 1 + z * z / n
    centre = (phat + z * z / (2 * n)) / denom
    half = z * np.sqrt(phat * (1 - phat) / n + z * z / (4 * n * n)) / denom
    return (max(0.0, centre - half), min(1.0, centre + half))


def main():
    """Compute pool-normalized per-signature FAR for the Big-4 corpus."""
    print('=' * 72)
    print('Script 43: Pool-Normalized Per-Signature FAR')
    print('=' * 72)
    rows = load_big4()
    n_sigs = len(rows)
    print(f'\nLoaded {n_sigs:,} Big-4 signatures')

    # Build index arrays
    sig_ids = np.array([r[0] for r in rows], dtype=np.int64)
    cpas = np.array([r[1] for r in rows])
    firms = np.array([ALIAS[r[2]] for r in rows])
    source_pdfs = np.array([r[3] for r in rows])

    # Feature matrix (float32 blobs from the DB)
    feats = np.stack([np.frombuffer(r[4], dtype=np.float32)
                      for r in rows]).astype(np.float32)
    print(f' Feature matrix: {feats.shape}, '
          f'{feats.nbytes / 1e9:.2f} GB')
    # L2-normalize so dot products below are cosine similarities;
    # zero-norm rows are left unscaled to avoid division by zero.
    norms = np.linalg.norm(feats, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    feats = feats / norms

    # dHash bytes (raw 8-byte blobs)
    dhashes = [r[5] for r in rows]

    # CPA -> signature index arrays
    cpa_to_idx = defaultdict(list)
    for i, c in enumerate(cpas):
        cpa_to_idx[c].append(i)
    cpa_to_idx = {c: np.array(v, dtype=np.int64)
                  for c, v in cpa_to_idx.items()}
    n_cpas = len(cpa_to_idx)
    # pool size = same-CPA pool excluding the source signature itself
    pool_sizes = {c: len(v) - 1 for c, v in cpa_to_idx.items()}
    print(f' CPAs: {n_cpas}; pool-size summary: '
          f'min={min(pool_sizes.values())}, '
          f'median={int(np.median(list(pool_sizes.values())))}, '
          f'max={max(pool_sizes.values())}')

    # Pre-compute: for sampling non-same-CPA 
candidates, we need fast + # index sampling. The total available pool for each source sig is + # all_indices \ same_cpa_indices. + all_idx = np.arange(n_sigs, dtype=np.int64) + + # ── Per-source-signature simulation ───────────────────── + print('\nSimulating per-source-signature inter-CPA-equivalent pool...') + rng = np.random.default_rng(SEED) + + # Per-signature stored statistics + max_cos = np.zeros(n_sigs, dtype=np.float32) + min_dh = np.zeros(n_sigs, dtype=np.int32) + cos_at_min_dh = np.zeros(n_sigs, dtype=np.float32) + dh_at_max_cos = np.zeros(n_sigs, dtype=np.int32) + pool_size_arr = np.zeros(n_sigs, dtype=np.int32) + + # For each source signature, we also record indicator for same-pair + # joint event at (cos>0.95, dh<=5) -- the headline operational rule. + # This requires keeping per-signature any() flag for that pair. + headline_same_pair_95_5 = np.zeros(n_sigs, dtype=bool) + headline_same_pair_95_4 = np.zeros(n_sigs, dtype=bool) + headline_same_pair_95_3 = np.zeros(n_sigs, dtype=bool) + + # process batches of source signatures + for batch_start in range(0, n_sigs, BATCH): + batch_end = min(batch_start + BATCH, n_sigs) + if batch_start % 5000 == 0: + pct = batch_start / n_sigs * 100 + print(f' {batch_start:,}/{n_sigs:,} ({pct:.1f}%)') + + for si in range(batch_start, batch_end): + s_cpa = cpas[si] + n_pool = pool_sizes[s_cpa] + pool_size_arr[si] = n_pool + + if n_pool <= 0: + max_cos[si] = 0.0 + min_dh[si] = 64 + continue + + # Sample n_pool candidates from non-same-CPA indices + same_cpa = cpa_to_idx[s_cpa] + # Use random.choice over all_idx excluding same_cpa is slow; + # instead reject-sample from all_idx + need = n_pool + cand_indices = [] + attempts = 0 + while need > 0 and attempts < 10: + draw = rng.choice(n_sigs, size=need * 2, replace=True) + # filter out same_cpa + same_mask = np.isin(draw, same_cpa) + ok = draw[~same_mask] + cand_indices.extend(ok[:need].tolist()) + need -= len(ok[:need]) + attempts += 1 + if need > 0: + # fallback: 
deterministic sample without same-CPA + pool_mask = np.ones(n_sigs, dtype=bool) + pool_mask[same_cpa] = False + pool_idx = all_idx[pool_mask] + fb = rng.choice(pool_idx, size=need, replace=False) + cand_indices.extend(fb.tolist()) + cand_indices = np.array(cand_indices[:n_pool], dtype=np.int64) + + # Cosine: source feat @ cand feats + cos_vec = feats[cand_indices] @ feats[si] + # dHash + dh_vec = hamming_vec(dhashes[si], + [dhashes[c] for c in cand_indices]) + + mc_idx = int(np.argmax(cos_vec)) + md_idx = int(np.argmin(dh_vec)) + max_cos[si] = float(cos_vec[mc_idx]) + min_dh[si] = int(dh_vec[md_idx]) + dh_at_max_cos[si] = int(dh_vec[mc_idx]) + cos_at_min_dh[si] = float(cos_vec[md_idx]) + + # Same-pair joint indicators + cos_gt = cos_vec > 0.95 + if cos_gt.any(): + dh_under_5 = dh_vec <= 5 + dh_under_4 = dh_vec <= 4 + dh_under_3 = dh_vec <= 3 + headline_same_pair_95_5[si] = bool((cos_gt & dh_under_5).any()) + headline_same_pair_95_4[si] = bool((cos_gt & dh_under_4).any()) + headline_same_pair_95_3[si] = bool((cos_gt & dh_under_3).any()) + + print(f' Done.') + + # ── Aggregate ────────────────────────────────────────── + print('\nAggregating per-signature FAR statistics...') + + def far_marginal_cos(k): + hits = int((max_cos > k).sum()) + lo, hi = wilson_ci(hits, n_sigs) + return {'k': k, 'hits': hits, 'n': n_sigs, + 'far': hits / n_sigs, + 'ci95_lo': lo, 'ci95_hi': hi} + + def far_marginal_dh(k): + hits = int((min_dh <= k).sum()) + lo, hi = wilson_ci(hits, n_sigs) + return {'k': k, 'hits': hits, 'n': n_sigs, + 'far': hits / n_sigs, + 'ci95_lo': lo, 'ci95_hi': hi} + + def far_any_pair_joint(cos_k, dh_k): + hits = int(((max_cos > cos_k) & (min_dh <= dh_k)).sum()) + lo, hi = wilson_ci(hits, n_sigs) + return {'cos_k': cos_k, 'dh_k': dh_k, + 'hits': hits, 'n': n_sigs, + 'far': hits / n_sigs, + 'ci95_lo': lo, 'ci95_hi': hi} + + def far_same_pair_joint(cos_k, dh_k, indicator): + hits = int(indicator.sum()) + lo, hi = wilson_ci(hits, n_sigs) + return {'cos_k': cos_k, 
'dh_k': dh_k, + 'hits': hits, 'n': n_sigs, + 'far': hits / n_sigs, + 'ci95_lo': lo, 'ci95_hi': hi} + + cos_curve = [far_marginal_cos(k) for k in COS_KS] + dh_curve = [far_marginal_dh(k) for k in DH_KS] + any_pair_curve = [far_any_pair_joint(0.95, k) for k in DH_KS] + same_pair_curve = [ + far_same_pair_joint(0.95, 5, headline_same_pair_95_5), + far_same_pair_joint(0.95, 4, headline_same_pair_95_4), + far_same_pair_joint(0.95, 3, headline_same_pair_95_3), + ] + + print('\n[Per-signature marginal cos FAR]') + for e in cos_curve: + print(f' max-cos > {e["k"]:.3f}: FAR={e["far"]:.4f}, ' + f'CI=[{e["ci95_lo"]:.4f}, {e["ci95_hi"]:.4f}], ' + f'hits={e["hits"]}/{e["n"]:,}') + + print('\n[Per-signature marginal dh FAR]') + for e in dh_curve: + print(f' min-dh <= {e["k"]:2d}: FAR={e["far"]:.4f}, ' + f'CI=[{e["ci95_lo"]:.4f}, {e["ci95_hi"]:.4f}], ' + f'hits={e["hits"]}/{e["n"]:,}') + + print('\n[Per-signature any-pair joint FAR (cos>0.95 AND dh<=k)]') + for e in any_pair_curve: + print(f' dh <= {e["dh_k"]:2d}: FAR={e["far"]:.4f}, ' + f'hits={e["hits"]}/{e["n"]:,}') + + print('\n[Per-signature SAME-pair joint FAR]') + for e in same_pair_curve: + print(f' cos>0.95 AND dh<={e["dh_k"]}: FAR={e["far"]:.4f}, ' + f'CI=[{e["ci95_lo"]:.4f}, {e["ci95_hi"]:.4f}], ' + f'hits={e["hits"]}/{e["n"]:,}') + + # Per-firm and per-pool-decile stratification + print('\n[Per-firm headline FAR (any-pair, cos>0.95 AND dh<=5)]') + per_firm = {} + for f in sorted(set(firms)): + mask = firms == f + n_f = int(mask.sum()) + hits_anypair = int(((max_cos[mask] > 0.95) & + (min_dh[mask] <= 5)).sum()) + hits_samepair = int(headline_same_pair_95_5[mask].sum()) + per_firm[f] = { + 'n': n_f, + 'any_pair_far': hits_anypair / n_f, + 'same_pair_far': hits_samepair / n_f, + } + print(f' {f}: n={n_f:,} ' + f'any-pair FAR={hits_anypair/n_f:.4f}, ' + f'same-pair FAR={hits_samepair/n_f:.4f}') + + print('\n[Pool-size decile × headline FAR]') + pool_arr = pool_size_arr + deciles = np.percentile(pool_arr, np.arange(0, 110, 
10)) + per_decile = {} + for d in range(10): + lo, hi = deciles[d], deciles[d + 1] + mask = (pool_arr >= lo) & (pool_arr <= hi if d == 9 + else pool_arr < hi) + n_d = int(mask.sum()) + if n_d == 0: + continue + hits_any = int(((max_cos[mask] > 0.95) & + (min_dh[mask] <= 5)).sum()) + hits_same = int(headline_same_pair_95_5[mask].sum()) + per_decile[f'decile_{d+1}'] = { + 'pool_range': [float(lo), float(hi)], + 'n': n_d, + 'any_pair_far': hits_any / n_d, + 'same_pair_far': hits_same / n_d, + } + print(f' Decile {d+1} (pool {lo:.0f}-{hi:.0f}): n={n_d:,} ' + f'any-FAR={hits_any/n_d:.4f}, ' + f'same-FAR={hits_same/n_d:.4f}') + + # CPA bootstrap on headline (cos>0.95 AND dh<=5, same-pair) + print(f'\n[CPA-block bootstrap {N_BOOT_CPA} replicates]') + rng_b = np.random.default_rng(SEED + 1) + all_cpa_list = list(cpa_to_idx.keys()) + boot_anypair = np.zeros(N_BOOT_CPA) + boot_samepair = np.zeros(N_BOOT_CPA) + for b in range(N_BOOT_CPA): + cpas_b = rng_b.choice(all_cpa_list, size=len(all_cpa_list), + replace=True) + idx_b = np.concatenate([cpa_to_idx[c] for c in cpas_b]) + n_b = len(idx_b) + boot_anypair[b] = ((max_cos[idx_b] > 0.95) & + (min_dh[idx_b] <= 5)).mean() + boot_samepair[b] = headline_same_pair_95_5[idx_b].mean() + boot_anypair_ci = (float(np.percentile(boot_anypair, 2.5)), + float(np.percentile(boot_anypair, 97.5))) + boot_samepair_ci = (float(np.percentile(boot_samepair, 2.5)), + float(np.percentile(boot_samepair, 97.5))) + print(f' any-pair FAR boot mean={boot_anypair.mean():.4f}, ' + f'95% CI={boot_anypair_ci}') + print(f' same-pair FAR boot mean={boot_samepair.mean():.4f}, ' + f'95% CI={boot_samepair_ci}') + + # Document-level aggregation: a document is flagged if any of its + # signatures has max_cos > 0.95 AND min_dh <= 5 (the worst-case rule) + print('\n[Document-level aggregation]') + doc_idx = defaultdict(list) + for i, pdf in enumerate(source_pdfs): + doc_idx[pdf].append(i) + n_docs = len(doc_idx) + doc_anypair_flag = 0 + doc_samepair_flag = 0 + for 
pdf, idxs in doc_idx.items(): + idxs_a = np.array(idxs, dtype=np.int64) + if ((max_cos[idxs_a] > 0.95) & (min_dh[idxs_a] <= 5)).any(): + doc_anypair_flag += 1 + if headline_same_pair_95_5[idxs_a].any(): + doc_samepair_flag += 1 + print(f' n_documents: {n_docs:,}') + print(f' doc-level any-pair FAR (any sig flagged) = ' + f'{doc_anypair_flag/n_docs:.4f} ({doc_anypair_flag}/{n_docs})') + print(f' doc-level same-pair FAR = ' + f'{doc_samepair_flag/n_docs:.4f} ({doc_samepair_flag}/{n_docs})') + + # Threshold inversion: find cos and dh thresholds that hit per-sig + # FAR targets at the marginal level + print('\n[Per-signature marginal threshold inversion]') + inversions = {} + for tgt in [0.10, 0.05, 0.02, 0.01, 0.005]: + c_pick = None + for e in cos_curve: + if e['far'] <= tgt: + c_pick = e + break + d_pick = None + for e in dh_curve: + if e['far'] <= tgt: + d_pick = e + break + any_pick = None + for e in any_pair_curve: + if e['far'] <= tgt: + any_pick = e + break + same_pick = None + for e in same_pair_curve: + if e['far'] <= tgt: + same_pick = e + break + inversions[f'per_sig_far_<=_{tgt}'] = { + 'marginal_cos': c_pick, 'marginal_dh': d_pick, + 'any_pair_joint': any_pick, 'same_pair_joint': same_pick, + } + print(f' per-sig FAR <= {tgt}:') + if c_pick: + print(f' marginal cos: cos > {c_pick["k"]} ' + f'(FAR={c_pick["far"]:.4f})') + if d_pick: + print(f' marginal dh: dh <= {d_pick["k"]} ' + f'(FAR={d_pick["far"]:.4f})') + if any_pick: + print(f' any-pair joint: dh <= {any_pick["dh_k"]} ' + f'(FAR={any_pick["far"]:.4f})') + if same_pick: + print(f' same-pair joint: dh <= {same_pick["dh_k"]} ' + f'(FAR={same_pick["far"]:.4f})') + + results = { + 'meta': { + 'script': '43', + 'timestamp': datetime.now().isoformat(timespec='seconds'), + 'n_signatures': n_sigs, + 'n_cpas': n_cpas, + 'n_boot_cpa': N_BOOT_CPA, + 'seed': SEED, + 'note': ('Pool-normalized per-signature FAR. 
For each ' + 'source signature, simulate inter-CPA candidate ' + 'pool of size n_pool(s); compute deployed max-cos ' + 'and min-dh; aggregate per-signature FAR.'), + }, + 'marginal_cos_curve': cos_curve, + 'marginal_dh_curve': dh_curve, + 'any_pair_joint_curve': any_pair_curve, + 'same_pair_joint': same_pair_curve, + 'per_firm_headline': per_firm, + 'per_pool_decile_headline': per_decile, + 'cpa_bootstrap_headline': { + 'any_pair_mean': float(boot_anypair.mean()), + 'any_pair_ci95': boot_anypair_ci, + 'same_pair_mean': float(boot_samepair.mean()), + 'same_pair_ci95': boot_samepair_ci, + }, + 'document_level_headline': { + 'n_docs': n_docs, + 'any_pair_far': doc_anypair_flag / n_docs, + 'same_pair_far': doc_samepair_flag / n_docs, + }, + 'threshold_inversions': inversions, + } + + json_path = OUT / 'pool_normalized_results.json' + json_path.write_text(json.dumps(results, indent=2, ensure_ascii=False), + encoding='utf-8') + print(f'\n[json] {json_path}') + + md = ['# Pool-Normalized Per-Signature FAR (Script 43)', + '', f'Generated: {results["meta"]["timestamp"]}', + (f'Big-4 source signatures: {n_sigs:,} across {n_cpas} CPAs; ' + f'pool-size median={int(np.median(list(pool_sizes.values())))}, ' + f'max={max(pool_sizes.values())}'), + (f'CPA-block bootstrap: {N_BOOT_CPA} replicates. 
Per source ' + 'signature, one realisation of n_pool(s)-sized random ' + 'inter-CPA candidate pool.'), + '', + '## Headline (cos>0.95 AND dh<=5)', + '', + '| Variant | per-sig FAR | 95% Wilson CI | CPA-bootstrap 95% CI |', + '|---|---|---|---|'] + md.append(f'| any-pair joint | ' + f'{((max_cos > 0.95) & (min_dh <= 5)).mean():.4f} | ' + f'see JSON | [{boot_anypair_ci[0]:.4f}, ' + f'{boot_anypair_ci[1]:.4f}] |') + md.append(f'| same-pair joint | ' + f'{headline_same_pair_95_5.mean():.4f} | ' + f'see JSON | [{boot_samepair_ci[0]:.4f}, ' + f'{boot_samepair_ci[1]:.4f}] |') + md += [ + '', + '## Marginal cos FAR (per-signature)', + '', + '| max-cos > k | FAR | 95% CI | hits / n |', + '|---|---|---|---|'] + for e in cos_curve: + md.append(f'| {e["k"]} | {e["far"]:.4f} | ' + f'[{e["ci95_lo"]:.4f}, {e["ci95_hi"]:.4f}] | ' + f'{e["hits"]} / {e["n"]:,} |') + md += ['', '## Marginal dh FAR (per-signature)', '', + '| min-dh <= k | FAR | 95% CI | hits / n |', + '|---|---|---|---|'] + for e in dh_curve: + md.append(f'| {e["k"]} | {e["far"]:.4f} | ' + f'[{e["ci95_lo"]:.4f}, {e["ci95_hi"]:.4f}] | ' + f'{e["hits"]} / {e["n"]:,} |') + md += ['', + '## Any-pair joint FAR (cos>0.95 AND dh<=k)', + '', + '| dh <= k | FAR | hits / n |', + '|---|---|---|'] + for e in any_pair_curve: + md.append(f'| {e["dh_k"]} | {e["far"]:.4f} | ' + f'{e["hits"]} / {e["n"]:,} |') + md += ['', + '## Same-pair joint FAR (one candidate satisfies both)', + '', + '| cos>0.95 AND dh<=k | FAR | 95% CI | hits / n |', + '|---|---|---|---|'] + for e in same_pair_curve: + md.append(f'| dh <= {e["dh_k"]} | {e["far"]:.4f} | ' + f'[{e["ci95_lo"]:.4f}, {e["ci95_hi"]:.4f}] | ' + f'{e["hits"]} / {e["n"]:,} |') + md += ['', '## Per-firm headline', '', + '| Firm | n | any-pair FAR | same-pair FAR |', + '|---|---|---|---|'] + for f, s in per_firm.items(): + md.append(f'| {f} | {s["n"]:,} | {s["any_pair_far"]:.4f} | ' + f'{s["same_pair_far"]:.4f} |') + md += ['', '## Per-pool-decile headline', '', + '| Decile | pool range | n 
| any-pair FAR | same-pair FAR |', + '|---|---|---|---|---|'] + for k, s in per_decile.items(): + md.append(f'| {k} | {s["pool_range"][0]:.0f}-' + f'{s["pool_range"][1]:.0f} | {s["n"]:,} | ' + f'{s["any_pair_far"]:.4f} | ' + f'{s["same_pair_far"]:.4f} |') + md += ['', '## Document-level', + '', + f'- n_documents: {n_docs:,}', + f'- any-pair FAR (any sig flagged): ' + f'{doc_anypair_flag/n_docs:.4f} ' + f'({doc_anypair_flag}/{n_docs})', + f'- same-pair FAR: {doc_samepair_flag/n_docs:.4f} ' + f'({doc_samepair_flag}/{n_docs})', + '', + '## Threshold inversion (per-signature FAR targets)', + '', + '| target | marginal cos | marginal dh | any-pair joint ' + '| same-pair joint |', + '|---|---|---|---|---|'] + for tgt in [0.10, 0.05, 0.02, 0.01, 0.005]: + inv = inversions[f'per_sig_far_<=_{tgt}'] + c = inv['marginal_cos'] + d = inv['marginal_dh'] + a = inv['any_pair_joint'] + s = inv['same_pair_joint'] + cs = (f'cos > {c["k"]} (FAR={c["far"]:.4f})' + if c else 'unachievable') + ds = (f'dh <= {d["k"]} (FAR={d["far"]:.4f})' + if d else 'unachievable') + as_ = (f'dh <= {a["dh_k"]} (FAR={a["far"]:.4f})' + if a else 'unachievable') + ss = (f'dh <= {s["dh_k"]} (FAR={s["far"]:.4f})' + if s else 'unachievable') + md.append(f'| {tgt} | {cs} | {ds} | {as_} | {ss} |') + md.append('') + + md_path = OUT / 'pool_normalized_report.md' + md_path.write_text('\n'.join(md), encoding='utf-8') + print(f'[md ] {md_path}') + + +if __name__ == '__main__': + main()