"""F5 robustness: firm+year fixed-effects logistic regression and leave-one-year-out. Complements the pool-size stratification and accountant-clustered bootstrap (Section IV-C). Uses numpy+scipy only (no statsmodels). Reproduces from signature_analysis.db. """ import sqlite3, numpy as np from scipy.optimize import minimize DB = "/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db" BIG4 = ('勤業眾信聯合', '資誠聯合', '安侯建業聯合', '安永聯合') FM = {'勤業眾信聯合': 'A', '安侯建業聯合': 'B', '資誠聯合': 'C', '安永聯合': 'D'} con = sqlite3.connect(DB); cur = con.cursor() cur.execute(f""" SELECT s.excel_firm, CAST(substr(s.year_month,1,4) AS INT) yr, (CASE WHEN s.max_similarity_to_same_accountant>0.95 AND s.min_dhash_independent<=5 THEN 1 ELSE 0 END) hc, p.psize FROM signatures s JOIN (SELECT accountant_id, COUNT(*) psize FROM signatures WHERE is_valid=1 AND excel_firm IN ({','.join('?'*4)}) AND max_similarity_to_same_accountant IS NOT NULL AND min_dhash_independent IS NOT NULL GROUP BY accountant_id) p ON s.accountant_id=p.accountant_id WHERE s.is_valid=1 AND s.excel_firm IN ({','.join('?'*4)}) AND s.max_similarity_to_same_accountant IS NOT NULL AND s.min_dhash_independent IS NOT NULL AND s.year_month GLOB '2[0-9][0-9][0-9][0-9][0-9]' """, BIG4 + BIG4) rows = cur.fetchall(); con.close() firm = np.array([FM[r[0]] for r in rows]); yr = np.array([r[1] for r in rows]) hc = np.array([r[2] for r in rows], float); pool = np.array([r[3] for r in rows], float) n = len(hc); years = sorted(set(yr.tolist())) # --- firm + year FE logistic (Firm A & first year = reference) --- cols = [np.ones(n)]; names = ['const'] for f in ['B', 'C', 'D']: cols.append((firm == f).astype(float)); names.append(f'firm_{f}') for y in years[1:]: cols.append((yr == y).astype(float)); names.append(f'yr_{y}') lp = np.log(pool); lp = (lp - lp.mean()) / lp.std() cols.append(lp); names.append('logpool_z') X = np.column_stack(cols) def nll(b): z = X @ b return -np.sum(hc * z - np.logaddexp(0, z)) + 1e-6 * np.sum(b * b) def grad(b): p = 1 / (1 + np.exp(-(X @ b))) return -X.T @ (hc - p) + 2e-6 * b b = minimize(nll, np.zeros(X.shape[1]), jac=grad, method='L-BFGS-B').x print("Firm+Year FE logistic (Firm A & first year = ref):") for nm, bi in zip(names, b): if nm.startswith('firm') or nm == 'logpool_z': print(f" {nm:11} coef={bi:7.3f} OR={np.exp(bi):.4f}") # --- leave-one-year-out firm contrast --- grp = np.where(firm == 'A', 'A', 'BCD') def rate(mask, g): m = mask & (grp == g); return 100 * hc[m].mean() print("\nLeave-one-year-out (Firm A minus B/C/D HC gap):") gaps = [] for y in years: keep = (yr != y); a = rate(keep, 'A'); bb = rate(keep, 'BCD'); gaps.append(a - bb) print(f" drop {y}: A={a:.1f}% BCD={bb:.1f}% gap={a-bb:.1f}pp") print(f" full-sample gap={rate(np.ones(n, bool),'A')-rate(np.ones(n, bool),'BCD'):.1f}pp; " f"LOYO range=[{min(gaps):.1f}, {max(gaps):.1f}]pp")