Paper A v13: filled submission draft (rev7) + reproducible build bundle
Fill all 18 placeholders in the condensed v13 submission draft with data verified against the analysis DB and LOCKED canonical scripts; close 12/13 co-author review items (only #8b protocol first-run open).

Key changes (need co-author sign-off; see handoff doc):
- Firm A out-of-sample HC 0.01% -> 0.42% (buggy 0.0001 from Script 49 same-pair bug, propagated v4.2->v13; never reuse 0.0001)
- §III-D empty cell ~=0 -> 7,681 honest reframe (not degenerate crops)
- low cosine cut 0.837 -> 0.8547 primary (BCD 2013-2019 closed-world, held-out discipline; 0.8489 confirmed = BCD all-period); HC/MC/HSC unchanged, UN/LH move <=0.4pp

Adds Figures 1-5 (real-data plots + schematics), full references, Appendix A/B, UN/HSC ICCR, n-reconciliation, #13 MOPS-metadata survival verification, "參" set-level feasibility probe (negative). Two codex (gpt-5.5) adversarial rounds applied; no fabrication found.

Bundle: paper/v13_build/ (markdown source, harvest/figure scripts, figures) for reproducibility. Handoff note for co-author included.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,79 @@
|
||||
import sqlite3, numpy as np
|
||||
from collections import defaultdict
|
||||
from scipy.stats import gaussian_kde
|
||||
import os

# Location of the SQLite analysis database that load() opens read-only.
# The default path is specific to one machine, so the SIGNATURE_ANALYSIS_DB
# environment variable may override it; when the variable is unset or empty
# the original hard-coded path is used unchanged.
DB = os.environ.get('SIGNATURE_ANALYSIS_DB') or (
    '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
)

# Firm treated as "Firm A"; it is also the first member of BIG4.
FIRM_A = '勤業眾信聯合'
BIG4 = (FIRM_A, '安侯建業聯合', '資誠聯合', '安永聯合')

# Seed passed to every np.random.default_rng(...) call in this script.
SEED = 42

# POP[b] is the number of set bits in the byte value b (0..255).  Indexing POP
# with XOR-ed dHash bytes and summing gives a bitwise Hamming distance.
POP = np.array([bin(byte).count('1') for byte in range(256)], dtype=np.uint8)
|
||||
def load():
    """Fetch every usable signature row from the analysis database.

    Opens DB read-only (SQLite URI with ``mode=ro``) and joins ``signatures``
    to ``accountants`` on the accountant name, keeping only rows whose
    assigned accountant, firm, feature vector and dHash vector are all
    non-NULL.

    Returns:
        list[tuple]: one tuple per signature, in column order
        (assigned_accountant, firm, source_pdf, feature_vector, dhash_vector,
        year), where year is the first four characters of ``year_month`` cast
        to an integer by SQLite.
    """
    conn = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
    try:
        return conn.execute(
            """SELECT s.assigned_accountant,a.firm,s.source_pdf,s.feature_vector,s.dhash_vector,
            CAST(substr(s.year_month,1,4) AS INT) FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
            WHERE s.assigned_accountant IS NOT NULL AND a.firm IS NOT NULL AND s.feature_vector IS NOT NULL AND s.dhash_vector IS NOT NULL"""
        ).fetchall()
    finally:
        # Close the handle even when the query raises, so a failed run does
        # not leave the database connection open.
        conn.close()
|
||||
def crossover(keep, label, M=100_000):
    """Print and return where the intra- and inter-accountant cosine KDEs cross.

    Rows are grouped by accountant (``row[0]``) and accountants with fewer
    than three signatures are dropped.  ``M`` same-accountant pairs and ``M``
    different-accountant pairs are drawn from a generator seeded with SEED.
    Their cosine similarities (dot products of L2-normalised ``row[3]``
    vectors) are smoothed with ``gaussian_kde``, and every sign change of
    ``kde_intra - kde_inter`` on a 10,000-point grid over [0.3, 1.0] that lies
    strictly inside (0.6, 0.99) is reported.

    Args:
        keep: rows as returned by load(); ``row[0]`` is the accountant and
            ``row[3]`` holds the feature vector as float32 bytes.
        label: tag echoed in the printed line.
        M: number of pairs sampled for each distribution.  The default equals
            the value that was previously hard-coded, so a default call draws
            from the random stream exactly as before.

    Returns:
        list[float]: the grid point immediately left of each sign change
        (not interpolated).

    Raises:
        ValueError: if fewer than two accountants have at least three
            signatures, because no inter-accountant pair can be drawn.
    """
    cpas = np.array([r[0] for r in keep])
    groups = defaultdict(list)
    for idx, cpa in enumerate(cpas):
        groups[cpa].append(idx)
    by = {cpa: np.array(members) for cpa, members in groups.items() if len(members) >= 3}
    accts = list(by)
    if len(accts) < 2:
        # Fail with an explicit message instead of an opaque NumPy error
        # from the sampling calls further down.
        raise ValueError(
            f'crossover[{label}]: need at least two accountants with >=3 '
            f'signatures, found {len(accts)}'
        )

    # np.stack already returns a fresh float32 array, so it can be normalised
    # in place without an extra copy.
    feats = np.stack([np.frombuffer(r[3], np.float32) for r in keep])
    feats /= np.clip(np.linalg.norm(feats, axis=1, keepdims=True), 1e-9, None)

    # Weight each accountant by its number of unordered signature pairs so
    # that intra pairs are sampled uniformly over all same-accountant pairs.
    pw = np.array([len(by[c]) * (len(by[c]) - 1) / 2 for c in accts], float)
    pw /= pw.sum()

    # NOTE: the order of the rng calls below fixes the reported numbers for a
    # given SEED; do not reorder them.
    rng = np.random.default_rng(SEED)
    intra = np.empty(M, np.float32)
    ci = rng.choice(len(accts), M, p=pw)
    for t in range(M):
        a, b = rng.choice(by[accts[ci[t]]], 2, replace=False)
        intra[t] = feats[a] @ feats[b]

    inter = np.empty(M, np.float32)
    for t in range(M):
        i, j = rng.choice(len(accts), 2, replace=False)
        left = rng.choice(by[accts[i]])
        right = rng.choice(by[accts[j]])
        inter[t] = feats[left] @ feats[right]

    xs = np.linspace(0.3, 1.0, 10000)
    diff = gaussian_kde(intra)(xs) - gaussian_kde(inter)(xs)
    flips = np.where(np.diff(np.sign(diff)))[0]
    cr = [float(x) for x in xs[flips] if 0.6 < x < 0.99]

    shown = [f"{x:.4f}" for x in cr]
    print(f' [{label}] crossover {shown} (n={len(keep)}, accts>=3={len(accts)})')
    return cr
|
||||
def percomp_bands(keep, label, M=500_000, low_cut=0.837):
    """Print per-comparison band rates over random different-accountant pairs.

    Draws ``2 * M`` index pairs with replacement from a generator seeded with
    SEED, keeps the pairs whose accountants (``row[0]``) differ, and truncates
    to the first ``M`` of them.  Fewer than ``M`` pairs may survive; the
    printed ``M=`` is the number actually used.  For each pair it computes the
    cosine similarity of the L2-normalised ``row[3]`` vectors and the bitwise
    Hamming distance ``d`` between the ``row[4]`` dHash bytes, then prints the
    fraction of pairs in each band:

        HC   cos > 0.95 and d <= 5
        MC   cos > 0.95 and 5 < d <= 15
        HSC  cos > 0.95 and d > 15
        UN   low_cut < cos <= 0.95
        LH   cos <= low_cut

    Args:
        keep: rows as returned by load().
        label: tag echoed in the printed line.
        M: maximum number of cross-accountant pairs to evaluate.
        low_cut: cosine boundary between the LH and UN bands; the default is
            the value that was previously hard-coded.
    """
    # np.stack returns a fresh array, so in-place normalisation is safe.
    feats = np.stack([np.frombuffer(r[3], np.float32) for r in keep])
    feats /= np.clip(np.linalg.norm(feats, axis=1, keepdims=True), 1e-9, None)
    dh = np.stack([np.frombuffer(r[4], np.uint8) for r in keep])
    cpas = np.array([r[0] for r in keep])

    rng = np.random.default_rng(SEED)
    n = len(keep)
    ii = rng.integers(0, n, M * 2)
    jj = rng.integers(0, n, M * 2)

    # Oversample by 2x, then discard same-accountant pairs (which also removes
    # every pair of a signature with itself).
    differ = cpas[ii] != cpas[jj]
    ii = ii[differ][:M]
    jj = jj[differ][:M]

    cos = np.einsum('ij,ij->i', feats[ii], feats[jj])
    d = POP[dh[ii] ^ dh[jj]].sum(1)

    high = cos > 0.95
    hc = high & (d <= 5)
    mc = high & (d > 5) & (d <= 15)
    hsc = high & (d > 15)
    un = (cos > low_cut) & (cos <= 0.95)
    lh = cos <= low_cut
    print(f' [{label}] per-COMPARISON ICCR (M={len(ii)}): HC {hc.mean():.6f} MC {mc.mean():.6f} HSC {hsc.mean():.6f} UN {un.mean():.4f} LH {lh.mean():.4f}')
|
||||
def persig_perdoc_bands(keep, label, low_cut=0.837):
    """Print per-signature and per-report UN / HSC rates against random other-accountant pools.

    For every signature whose accountant has ``p + 1`` signatures (``p > 0``),
    up to ``p`` signatures belonging to other accountants are drawn by
    rejection sampling (at most 10 rounds, generator seeded with SEED).  The
    signature's score is the maximum cosine similarity and the minimum bitwise
    dHash Hamming distance over that pool.  Signatures without a pool keep the
    initial values (cosine 0, distance 64) and therefore fall in neither band,
    but still count in the denominators.

    Bands:
        UN   low_cut < max cosine <= 0.95
        HSC  max cosine > 0.95 and min distance > 15

    A report (``row[2]``, the source PDF) is in a band when any of its
    signatures is.

    Args:
        keep: rows as returned by load().
        label: tag echoed in the printed lines.
        low_cut: lower cosine boundary of the UN band; the default is the
            value that was previously hard-coded.
    """
    n = len(keep)
    # np.stack returns a fresh array, so in-place normalisation is safe.
    feats = np.stack([np.frombuffer(r[3], np.float32) for r in keep])
    feats /= np.clip(np.linalg.norm(feats, axis=1, keepdims=True), 1e-9, None)
    dh = np.stack([np.frombuffer(r[4], np.uint8) for r in keep])
    cpas = np.array([r[0] for r in keep])
    docs = np.array([r[2] for r in keep])

    members = defaultdict(list)
    for idx, cpa in enumerate(cpas):
        members[cpa].append(idx)
    ci = {cpa: np.array(v) for cpa, v in members.items()}

    rng = np.random.default_rng(SEED)
    max_cos = np.zeros(n, np.float32)
    min_dh = np.full(n, 64, np.int32)

    for si in range(n):
        same = ci[cpas[si]]
        p = len(same) - 1  # pool size = number of same-accountant peers
        if p <= 0:
            continue

        # Rejection-sample indices outside this accountant's own signatures.
        # NOTE: the rng call pattern fixes the reported numbers for a given
        # SEED; do not reorder it.
        need, cand, att = p, [], 0
        while need > 0 and att < 10:
            dr = rng.choice(n, size=need * 2, replace=True)
            ok = dr[~np.isin(dr, same)][:need]
            cand.extend(ok.tolist())
            need -= len(ok)
            att += 1

        if not cand:
            # No other-accountant signature was found (e.g. every row belongs
            # to one accountant); max()/min() on an empty pool would raise, so
            # leave the initial values in place instead.
            continue

        cand = np.array(cand[:p], dtype=np.int64)
        max_cos[si] = (feats[cand] @ feats[si]).max()
        min_dh[si] = int(POP[dh[cand] ^ dh[si]].sum(1).min())

    un = (max_cos > low_cut) & (max_cos <= 0.95)
    hsc = (max_cos > 0.95) & (min_dh > 15)

    # per-doc: any signature in band
    dd = defaultdict(list)
    for idx, doc in enumerate(docs):
        dd[doc].append(idx)
    docs_un = np.mean([un[v].any() for v in dd.values()])
    docs_hsc = np.mean([hsc[v].any() for v in dd.values()])

    print(f' [{label}] per-SIGNATURE ICCR: UN {un.mean():.4f} HSC {hsc.mean():.6f}')
    print(f' [{label}] per-REPORT ICCR: UN {docs_un:.4f} HSC {docs_hsc:.6f} (n_doc={len(dd)})')
|
||||
|
||||
# Load every usable signature row once; the subsets below are filtered from it.
rows = load()

# Big-4 rows excluding Firm A (row[1] is the firm), over all years loaded.
bcd_all = [row for row in rows if row[1] != FIRM_A and row[1] in BIG4]
# The same firms restricted to rows whose year (row[5]) is 2013..2019 inclusive.
bcd_19 = list(filter(lambda row: 2013 <= row[5] <= 2019, bcd_all))

print("=== ITEM 11: KDE crossover (verify corpus 0.837 / BCD-all 0.8489, then closed-world 2013-2019) ===")
for _subset, _tag in (
    (rows, 'corpus-wide (verify ~0.8367)'),
    (bcd_all, 'BCD-only ALL period (verify 0.8489)'),
    (bcd_19, 'BCD 2013-2019 CLOSED-WORLD (NEW primary candidate)'),
):
    crossover(_subset, _tag)

print("\n=== ITEM 3: UN / HSC full ICCR on BCD 2013-2019 ===")
for _report in (percomp_bands, persig_perdoc_bands):
    _report(bcd_19, 'BCD 2013-2019')

print("\n=== ITEM 12: n reconciliation ===")
n_bcd_all = len(bcd_all)
n_bcd_19 = len(bcd_19)
print(f" BCD full-period (2013-2023) signatures = {n_bcd_all} <- Script53 logged n=89,994")
print(f" BCD 2013-2019 signatures = {n_bcd_19} <- headline ICCR base (reproduces 0.0059)")
|
||||
Reference in New Issue
Block a user