Files
pdf_signature_extraction/paper/v13_build/scripts/close_items.py
T
gbanyan 66c9194fcf Paper A v13: filled submission draft (rev7) + reproducible build bundle
Fill all 18 placeholders in the condensed v13 submission draft with
data verified against the analysis DB and LOCKED canonical scripts;
close 12/13 co-author review items (only #8b protocol first-run open).

Key changes (need co-author sign-off; see handoff doc):
- Firm A out-of-sample HC 0.01% -> 0.42% (buggy 0.0001 from Script 49
  same-pair bug, propagated v4.2->v13; never reuse 0.0001)
- §III-D empty cell ~=0 -> 7,681 honest reframe (not degenerate crops)
- low cosine cut 0.837 -> 0.8547 primary (BCD 2013-2019 closed-world,
  held-out discipline; 0.8489 confirmed = BCD all-period); HC/MC/HSC
  unchanged, UN/LH move <=0.4pp

Adds Figures 1-5 (real-data plots + schematics), full references,
Appendix A/B, UN/HSC ICCR, n-reconciliation, #13 MOPS-metadata
survival verification, "參" set-level feasibility probe (negative).
Two codex (gpt-5.5) adversarial rounds applied; no fabrication found.

Bundle: paper/v13_build/ (markdown source, harvest/figure scripts,
figures) for reproducibility. Handoff note for co-author included.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-15 03:24:50 +08:00

80 lines
5.1 KiB
Python

import sqlite3, numpy as np
from collections import defaultdict
from scipy.stats import gaussian_kde
# SQLite database that load() opens read-only (mode=ro).
DB = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'

# Firm names as stored in accountants.firm. FIRM_A is the member of BIG4 that
# the driver at the bottom of this file excludes to form the "BCD" subsets.
FIRM_A = '勤業眾信聯合'
BIG4 = ('勤業眾信聯合', '安侯建業聯合', '資誠聯合', '安永聯合')

# Fixed seed so that every sampling routine draws the same pairs on each run.
SEED = 42

# Popcount lookup table: POP[b] is the number of set bits in byte value b
# (0..255). Indexing it with XOR-ed uint8 arrays yields per-byte Hamming
# distances.
POP = np.fromiter(
    (bin(byte).count('1') for byte in range(256)),
    dtype=np.uint8,
    count=256,
)
def load():
    """Fetch every usable signature row from the analysis database.

    The database at ``DB`` is opened read-only (``mode=ro`` URI). Rows are
    restricted to signatures that have an assigned accountant, a firm for
    that accountant, a feature vector and a dHash vector.

    Returns:
        list[tuple]: one tuple per signature, in the column order of the
        SELECT: ``(assigned_accountant, firm, source_pdf, feature_vector,
        dhash_vector, year)``. ``year`` is the first four characters of
        ``year_month`` cast to INT, so it is ``None`` when ``year_month``
        is NULL.

    Raises:
        sqlite3.Error: if the database cannot be opened or queried.
    """
    c = sqlite3.connect(f'file:{DB}?mode=ro', uri=True)
    try:
        # NOTE(review): the query has no ORDER BY, so row order is whatever
        # SQLite returns; the seeded sampling downstream depends on it.
        return c.execute("""SELECT s.assigned_accountant,a.firm,s.source_pdf,s.feature_vector,s.dhash_vector,
CAST(substr(s.year_month,1,4) AS INT) FROM signatures s JOIN accountants a ON s.assigned_accountant=a.name
WHERE s.assigned_accountant IS NOT NULL AND a.firm IS NOT NULL AND s.feature_vector IS NOT NULL AND s.dhash_vector IS NOT NULL""").fetchall()
    finally:
        # Close even when execute()/fetchall() raises; the original only
        # closed on the success path and leaked the handle on error.
        c.close()
def crossover(keep, label):
    """Print where the intra-CPA and inter-CPA cosine KDEs cross.

    Args:
        keep: rows as returned by ``load()``; ``row[0]`` is the accountant
            name and ``row[3]`` the feature vector stored as a float32 blob.
        label: tag echoed at the start of the printed line.

    Only accountants with at least three signatures take part. 100,000
    same-accountant pairs and 100,000 different-accountant pairs are drawn
    with a generator seeded by ``SEED``. A Gaussian KDE is fitted to each
    cosine sample, and every sign change of their difference on a
    10,000-point grid over [0.3, 1.0] is reported when it lies strictly
    between 0.6 and 0.99.

    Prints one line; returns ``None``.
    """
    n_pairs = 100_000

    # Decode the blobs and L2-normalise, so a dot product is a cosine.
    vectors = np.stack(
        [np.frombuffer(row[3], np.float32) for row in keep]
    ).astype(np.float32)
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    vectors /= np.clip(norms, 1e-9, None)  # clip guards against zero vectors

    # Row indices per accountant, keeping only accountants with >= 3 rows.
    cpas = np.array([row[0] for row in keep])
    grouped = defaultdict(list)
    for idx, cpa in enumerate(cpas):
        grouped[cpa].append(idx)
    members = {}
    for cpa, idxs in grouped.items():
        if len(idxs) >= 3:
            members[cpa] = np.array(idxs)
    accts = list(members.keys())

    # Weight each accountant by its number of unordered signature pairs, so
    # intra pairs are drawn in proportion to how many pairs exist.
    weights = np.array(
        [len(members[c]) * (len(members[c]) - 1) / 2 for c in accts], float
    )
    weights /= weights.sum()

    # The order of RNG draws below is part of the reproducible result.
    rng = np.random.default_rng(SEED)
    intra = np.empty(n_pairs, np.float32)
    chosen = rng.choice(len(accts), n_pairs, p=weights)
    for t in range(n_pairs):
        first, second = rng.choice(members[accts[chosen[t]]], 2, replace=False)
        intra[t] = vectors[first] @ vectors[second]

    inter = np.empty(n_pairs, np.float32)
    for t in range(n_pairs):
        # Two distinct accountants (uniform), then one signature from each.
        i, j = rng.choice(len(accts), 2, replace=False)
        left = rng.choice(members[accts[i]])
        right = rng.choice(members[accts[j]])
        inter[t] = vectors[left] @ vectors[right]

    grid = np.linspace(0.3, 1.0, 10000)
    gap = gaussian_kde(intra)(grid) - gaussian_kde(inter)(grid)
    # A non-zero first difference of sign(gap) marks a density crossing.
    flips = np.where(np.diff(np.sign(gap)))[0]
    cr = [float(x) for x in grid[flips] if 0.6 < x < 0.99]
    print(f' [{label}] crossover {[f"{x:.4f}" for x in cr]} (n={len(keep)}, accts>=3={len(accts)})')
def percomp_bands(keep, label, M=500_000, low_cut=0.837, high_cut=0.95,
                  dh_hc=5, dh_mc=15):
    """Print per-comparison band rates over random different-CPA pairs.

    Args:
        keep: rows as returned by ``load()``; ``row[0]`` is the accountant
            name, ``row[3]`` the float32 feature blob and ``row[4]`` the
            dHash blob read as uint8 bytes.
        label: tag echoed at the start of the printed line.
        M: maximum number of different-accountant pairs to evaluate.
        low_cut: cosine boundary between the LH and UN bands.
        high_cut: cosine boundary above which a pair is HC, MC or HSC.
        dh_hc: largest dHash Hamming distance still counted as HC.
        dh_mc: largest dHash Hamming distance still counted as MC; anything
            above it (with cosine > ``high_cut``) is HSC.

    ``2 * M`` index pairs are drawn uniformly with replacement, pairs whose
    two signatures share an accountant are dropped, and the first ``M``
    survivors are used. Fewer than ``M`` may survive; the printed ``M=``
    value is the number actually used.

    The defaults reproduce the previously hard-coded thresholds, so existing
    calls print exactly the same line. Prints one line; returns ``None``.
    """
    feats = np.stack(
        [np.frombuffer(r[3], np.float32) for r in keep]
    ).astype(np.float32)
    # L2-normalise so the row-wise dot product below is a cosine.
    feats /= np.clip(np.linalg.norm(feats, axis=1, keepdims=True), 1e-9, None)
    dh = np.stack([np.frombuffer(r[4], np.uint8) for r in keep])
    cpas = np.array([r[0] for r in keep])

    rng = np.random.default_rng(SEED)
    n = len(keep)
    # Oversample by 2x so that enough pairs remain after the same-CPA filter.
    ii = rng.integers(0, n, M * 2)
    jj = rng.integers(0, n, M * 2)
    keepm = cpas[ii] != cpas[jj]
    ii = ii[keepm][:M]
    jj = jj[keepm][:M]

    cos = np.einsum('ij,ij->i', feats[ii], feats[jj])
    # Hamming distance: popcount of the XOR-ed bytes, summed per pair.
    d = POP[dh[ii] ^ dh[jj]].sum(1)

    above = cos > high_cut
    hc = above & (d <= dh_hc)
    mc = above & (d > dh_hc) & (d <= dh_mc)
    hsc = above & (d > dh_mc)
    un = (cos > low_cut) & (cos <= high_cut)
    lh = cos <= low_cut
    print(f' [{label}] per-COMPARISON ICCR (M={len(ii)}): HC {hc.mean():.6f} MC {mc.mean():.6f} HSC {hsc.mean():.6f} UN {un.mean():.4f} LH {lh.mean():.4f}')
def persig_perdoc_bands(keep, label, low_cut=0.837, high_cut=0.95, dh_far=15):
    """Print per-signature and per-report UN / HSC rates.

    Args:
        keep: rows as returned by ``load()``; ``row[0]`` is the accountant
            name, ``row[2]`` the source PDF, ``row[3]`` the float32 feature
            blob and ``row[4]`` the dHash blob read as uint8 bytes.
        label: tag echoed at the start of both printed lines.
        low_cut: lower cosine bound (exclusive) of the UN band.
        high_cut: upper cosine bound (inclusive) of the UN band and lower
            bound (exclusive) of the HSC band.
        dh_far: HSC requires a minimum dHash distance strictly above this.

    For each signature whose accountant has ``p + 1`` signatures
    (``p >= 1``), up to ``p`` signatures of *other* accountants are drawn at
    random (with replacement, at most 10 rounds of rejection sampling). The
    maximum cosine and the minimum dHash Hamming distance against those
    candidates are recorded, and the signature is banded on that pair of
    values. A report (``source_pdf``) is in a band if any of its signatures
    is.

    Signatures with no same-accountant peer, or for which no other-CPA
    candidate could be drawn, keep the defaults (max cosine 0, distance 64)
    and therefore fall in neither band, but still count in the denominators.

    The defaults reproduce the previously hard-coded thresholds. Prints two
    lines; returns ``None``.
    """
    n = len(keep)
    feats = np.stack(
        [np.frombuffer(r[3], np.float32) for r in keep]
    ).astype(np.float32)
    # L2-normalise so feats[cand] @ feats[si] yields cosines.
    feats /= np.clip(np.linalg.norm(feats, axis=1, keepdims=True), 1e-9, None)
    dh = np.stack([np.frombuffer(r[4], np.uint8) for r in keep])
    cpas = np.array([r[0] for r in keep])
    docs = np.array([r[2] for r in keep])

    # Row indices of every accountant's signatures.
    by_cpa = defaultdict(list)
    for i, c in enumerate(cpas):
        by_cpa[c].append(i)
    by_cpa = {c: np.array(v) for c, v in by_cpa.items()}

    rng = np.random.default_rng(SEED)
    max_cos = np.zeros(n, np.float32)
    # 64 is the "nothing compared" distance default.
    # NOTE(review): presumably the dHash is 64 bits wide -- confirm.
    min_dh = np.full(n, 64, np.int32)

    for si in range(n):
        same = by_cpa[cpas[si]]
        pool = len(same) - 1  # number of same-CPA peers of this signature
        if pool <= 0:
            continue
        need = pool
        cand = []
        attempts = 0
        while need > 0 and attempts < 10:
            # Draw 2x what is still needed, then reject same-CPA rows.
            dr = rng.choice(n, size=need * 2, replace=True)
            ok = dr[~np.isin(dr, same)][:need]
            cand.extend(ok.tolist())
            need -= len(ok)
            attempts += 1
        if not cand:
            # No other-CPA signature was drawn (e.g. `keep` holds a single
            # accountant). Reducing over an empty array would raise, so
            # leave the defaults in place for this signature.
            continue
        cand = np.array(cand[:pool], dtype=np.int64)
        max_cos[si] = (feats[cand] @ feats[si]).max()
        min_dh[si] = int(POP[dh[cand] ^ dh[si]].sum(1).min())

    un = (max_cos > low_cut) & (max_cos <= high_cut)
    hsc = (max_cos > high_cut) & (min_dh > dh_far)

    # Per report: a document is in a band if any of its signatures is.
    dd = defaultdict(list)
    for i, doc in enumerate(docs):
        dd[doc].append(i)
    docs_un = np.mean([un[v].any() for v in dd.values()])
    docs_hsc = np.mean([hsc[v].any() for v in dd.values()])
    print(f' [{label}] per-SIGNATURE ICCR: UN {un.mean():.4f} HSC {hsc.mean():.6f}')
    print(f' [{label}] per-REPORT ICCR: UN {docs_un:.4f} HSC {docs_hsc:.6f} (n_doc={len(dd)})')
# Driver: load all rows once, derive the two "BCD" subsets (BIG4 firms other
# than FIRM_A), then run the three review items in order.
rows = load()
bcd_all = [r for r in rows if r[1] in BIG4 and r[1] != FIRM_A]
# r[5] is CAST(substr(year_month,1,4) AS INT) and load() does not filter NULL
# year_month, so it can be None. Comparing None with an int raises TypeError,
# so such rows are left out of the year-bounded subset explicitly.
bcd_19 = [r for r in bcd_all if r[5] is not None and 2013 <= r[5] <= 2019]
n_no_year = sum(1 for r in bcd_all if r[5] is None)

print("=== ITEM 11: KDE crossover (verify corpus 0.837 / BCD-all 0.8489, then closed-world 2013-2019) ===")
crossover(rows, 'corpus-wide (verify ~0.8367)')
crossover(bcd_all, 'BCD-only ALL period (verify 0.8489)')
crossover(bcd_19, 'BCD 2013-2019 CLOSED-WORLD (NEW primary candidate)')

print("\n=== ITEM 3: UN / HSC full ICCR on BCD 2013-2019 ===")
percomp_bands(bcd_19, 'BCD 2013-2019')
persig_perdoc_bands(bcd_19, 'BCD 2013-2019')

print("\n=== ITEM 12: n reconciliation ===")
print(f" BCD full-period (2013-2023) signatures = {len(bcd_all)} <- Script53 logged n=89,994")
print(f" BCD 2013-2019 signatures = {len(bcd_19)} <- headline ICCR base (reproduces 0.0059)")
if n_no_year:
    # Printed only when year-less rows exist, so normal output is unchanged.
    print(f" BCD signatures with no year (excluded from 2013-2019) = {n_no_year}")