#!/usr/bin/env python3
"""
Ablation Study: Backbone Comparison for Signature Feature Extraction
====================================================================
Compares ResNet-50 vs VGG-16 vs EfficientNet-B0 on:
1. Feature extraction speed
2. Intra/Inter class cosine similarity separation (Cohen's d)
3. KDE crossover point
4. Firm A (known replication) distribution

Usage:
    python ablation_backbone_comparison.py            # Run all backbones
    python ablation_backbone_comparison.py --extract  # Feature extraction only
    python ablation_backbone_comparison.py --analyze  # Analysis only (features must exist)
"""

import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import numpy as np
import sqlite3
import time
import argparse
import json
from pathlib import Path
from collections import defaultdict
from tqdm import tqdm
import warnings

warnings.filterwarnings('ignore')

# === Configuration ===
# Absolute paths to the external volume holding images, cached features, and the DB.
IMAGES_DIR = Path("/Volumes/NV2/PDF-Processing/yolo-signatures/images")
FEATURES_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis/features")
DB_PATH = Path("/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db")
OUTPUT_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis/ablation")
# Text file with one image filename per line; defines the canonical row order
# of every features_*.npy array produced below.
FILENAMES_PATH = FEATURES_DIR / "signature_filenames.txt"

BATCH_SIZE = 64
NUM_WORKERS = 4
# Prefer Apple MPS, then CUDA, then CPU.
DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")

# Sampling for analysis
INTER_CLASS_SAMPLE_SIZE = 500_000       # number of random cross-accountant pairs sampled
INTRA_CLASS_MIN_SIGNATURES = 3          # accountants with fewer signatures are skipped for intra-class stats
RANDOM_SEED = 42

# Known replication firm (Deloitte Taiwan = 勤業眾信)
FIRM_A_NAME = "勤業眾信聯合"

# Registry of candidate backbones. "model_fn" is lazy so weights are only
# downloaded/loaded for backbones that actually run; "feature_dim" is the
# expected embedding width after build_feature_extractor() strips the head.
BACKBONES = {
    "resnet50": {
        "model_fn": lambda: models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2),
        "feature_dim": 2048,
        "description": "ResNet-50 (ImageNet1K_V2)",
    },
    "vgg16": {
        "model_fn": lambda: models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1),
        "feature_dim": 4096,
        "description": "VGG-16 (ImageNet1K_V1)",
    },
    "efficientnet_b0": {
        "model_fn": lambda: models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.IMAGENET1K_V1),
        "feature_dim": 1280,
        "description": "EfficientNet-B0 (ImageNet1K_V1)",
    },
}


class SignatureDataset(Dataset):
    """Dataset of signature crop images, letterboxed to 224x224 on white.

    Unreadable images are replaced by a plain white 224x224 placeholder so a
    bad file cannot abort a long extraction run.
    """

    def __init__(self, image_paths, transform=None):
        # image_paths: sequence of pathlib.Path objects to image files.
        # transform: optional torchvision transform applied to the HWC uint8 array.
        self.image_paths = image_paths
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        # cv2 imported lazily here so DataLoader worker processes import it
        # themselves (and the module is not required unless extraction runs).
        import cv2
        img_path = self.image_paths[idx]
        img = cv2.imread(str(img_path))
        if img is None:
            # Unreadable/missing file -> white placeholder, already 224x224.
            img = np.ones((224, 224, 3), dtype=np.uint8) * 255
        else:
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        # Letterbox to the ImageNet input size (no-op for the placeholder).
        img = self._resize_with_padding(img, 224, 224)
        if self.transform:
            img = self.transform(img)
        return img, str(img_path.name)

    @staticmethod
    def _resize_with_padding(img, target_w, target_h):
        # Aspect-preserving resize, then center on a white canvas.
        h, w = img.shape[:2]
        scale = min(target_w / w, target_h / h)
        new_w, new_h = int(w * scale), int(h * scale)
        import cv2
        resized = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
        canvas = np.ones((target_h, target_w, 3), dtype=np.uint8) * 255
        x_off = (target_w - new_w) // 2
        y_off = (target_h - new_h) // 2
        canvas[y_off:y_off+new_h, x_off:x_off+new_w] = resized
        return canvas


def build_feature_extractor(backbone_name):
    """Build a feature extractor for the given backbone.

    Strips the classification head of a pretrained torchvision model so the
    forward pass yields a flat embedding (width per BACKBONES[...]["feature_dim"]).
    Returns the model moved to DEVICE and set to eval mode.
    """
    config = BACKBONES[backbone_name]
    model = config["model_fn"]()
    if backbone_name == "vgg16":
        features_part = model.features
        avgpool = model.avgpool
        # Drop last Linear (classifier) to get 4096-dim output
        classifier_part = nn.Sequential(*list(model.classifier.children())[:-1])

        class VGGFeatureExtractor(nn.Module):
            # Re-wires VGG conv trunk + avgpool + truncated classifier so the
            # flatten step between avgpool and the FC layers is preserved.
            def __init__(self, features, avgpool, classifier):
                super().__init__()
                self.features = features
                self.avgpool = avgpool
                self.classifier = classifier

            def forward(self, x):
                x = self.features(x)
                x = self.avgpool(x)
                x = torch.flatten(x, 1)
                x = self.classifier(x)
                return x

        model = VGGFeatureExtractor(features_part, avgpool, classifier_part)
    elif backbone_name == "resnet50":
        # Drop the final fc layer; output is the 2048-dim pooled feature.
        model = nn.Sequential(*list(model.children())[:-1])
    elif backbone_name == "efficientnet_b0":
        # Replace the classifier head; avgpool output (1280-dim) passes through.
        model.classifier = nn.Identity()
    model = model.to(DEVICE)
    model.eval()
    return model


def extract_features(backbone_name):
    """Extract features for all signatures using the given backbone.

    Reads the filename manifest, runs batched inference, L2-normalizes each
    embedding (so later dot products are cosine similarities), and caches the
    result to OUTPUT_DIR/features_<backbone>.npy. If the cache file already
    exists it is loaded and returned without re-extraction.

    Returns:
        np.ndarray of shape (num_images, feature_dim), rows in manifest order.
    """
    print(f"\n{'='*60}")
    print(f"Extracting features: {BACKBONES[backbone_name]['description']}")
    print(f"{'='*60}")
    output_path = OUTPUT_DIR / f"features_{backbone_name}.npy"
    if output_path.exists():
        print(f" Features already exist: {output_path}")
        print(f" Skipping extraction. Delete file to re-extract.")
        return np.load(output_path)
    # Load filenames
    with open(FILENAMES_PATH) as f:
        filenames = [line.strip() for line in f if line.strip()]
    print(f" Images: {len(filenames):,}")
    image_paths = [IMAGES_DIR / fn for fn in filenames]
    # Build model
    model = build_feature_extractor(backbone_name)
    transform = transforms.Compose([
        transforms.ToTensor(),
        # ImageNet normalization, matching the pretrained weights.
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    dataset = SignatureDataset(image_paths, transform=transform)
    # shuffle=False keeps feature rows aligned with the filename manifest.
    # NOTE(review): pin_memory is a CUDA optimization; harmless but inert on MPS/CPU.
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False,
                            num_workers=NUM_WORKERS, pin_memory=True)
    all_features = []
    start_time = time.time()
    with torch.no_grad():
        for images, _ in tqdm(dataloader, desc=f" {backbone_name}"):
            images = images.to(DEVICE)
            feats = model(images)
            feats = feats.view(feats.size(0), -1)  # flatten
            feats = nn.functional.normalize(feats, p=2, dim=1)  # L2 normalize
            all_features.append(feats.cpu().numpy())
    elapsed = time.time() - start_time
    all_features = np.vstack(all_features)
    print(f" Feature shape: {all_features.shape}")
    print(f" Time: {elapsed:.1f}s ({elapsed/60:.1f}min)")
    print(f" Speed: {len(filenames)/elapsed:.1f} images/sec")
    # Save
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    np.save(output_path, all_features)
    print(f" Saved: {output_path} ({all_features.nbytes / 1e9:.2f} GB)")
    return all_features


def load_accountant_data():
    """Load accountant assignments and firm info from DB.

    Returns:
        (filename_to_acct, acct_firm):
            filename_to_acct: dict mapping image filename -> assigned accountant
                (only signatures whose feature_vector is populated).
            acct_firm: dict mapping accountant name -> firm name.
    """
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()
    cur.execute('''
        SELECT image_filename, assigned_accountant
        FROM signatures
        WHERE feature_vector IS NOT NULL
        ORDER BY signature_id
    ''')
    sig_rows = cur.fetchall()
    cur.execute('SELECT name, firm FROM accountants')
    acct_firm = {r[0]: r[1] for r in cur.fetchall()}
    conn.close()
    filename_to_acct = {r[0]: r[1] for r in sig_rows}
    return filename_to_acct, acct_firm


def analyze_backbone(backbone_name, features, filenames, filename_to_acct, acct_firm):
    """Compute intra/inter class stats for a backbone's features.

    Args:
        backbone_name: key into BACKBONES.
        features: (N, D) L2-normalized embeddings, rows aligned with filenames.
        filenames: manifest list; filenames[i] corresponds to features[i].
        filename_to_acct: filename -> accountant mapping from the DB.
        acct_firm: accountant -> firm mapping from the DB.

    Returns:
        dict with intra/inter/firm_a distribution stats, Cohen's d, and the
        KDE crossover point (or None where not computable).
    """
    print(f"\n{'='*60}")
    print(f"Analyzing: {BACKBONES[backbone_name]['description']}")
    print(f"{'='*60}")
    # Fixed seed so the 500k inter-class pair sample is reproducible and
    # identical across backbones.
    np.random.seed(RANDOM_SEED)
    # Map features to accountants; drop rows with no DB assignment.
    accountants = []
    valid_indices = []
    for i, fn in enumerate(filenames):
        acct = filename_to_acct.get(fn)
        if acct:
            accountants.append(acct)
            valid_indices.append(i)
    valid_features = features[valid_indices]
    print(f" Valid signatures with accountant: {len(valid_indices):,}")
    # Group by accountant (indices into valid_features).
    acct_groups = defaultdict(list)
    for i, acct in enumerate(accountants):
        acct_groups[acct].append(i)
    # --- Intra-class ---
    # All unique same-accountant pairs; vectors are L2-normalized, so the
    # Gram matrix entries are cosine similarities.
    print(" Computing intra-class similarities...")
    intra_sims = []
    for acct, indices in tqdm(acct_groups.items(), desc=" Intra-class", leave=False):
        if len(indices) < INTRA_CLASS_MIN_SIGNATURES:
            continue
        vecs = valid_features[indices]
        sim_matrix = vecs @ vecs.T
        n = len(indices)
        triu_idx = np.triu_indices(n, k=1)  # upper triangle: each pair once, no self-pairs
        intra_sims.extend(sim_matrix[triu_idx].tolist())
    intra_sims = np.array(intra_sims)
    print(f" Intra-class pairs: {len(intra_sims):,}")
    # --- Inter-class ---
    # Monte-Carlo sample of cross-accountant pairs (exhaustive would be O(N^2)).
    print(" Computing inter-class similarities...")
    all_acct_list = list(acct_groups.keys())
    inter_sims = []
    for _ in range(INTER_CLASS_SAMPLE_SIZE):
        # Two distinct accountants, then one random signature from each.
        a1, a2 = np.random.choice(len(all_acct_list), 2, replace=False)
        i1 = np.random.choice(acct_groups[all_acct_list[a1]])
        i2 = np.random.choice(acct_groups[all_acct_list[a2]])
        sim = float(valid_features[i1] @ valid_features[i2])
        inter_sims.append(sim)
    inter_sims = np.array(inter_sims)
    print(f" Inter-class pairs: {len(inter_sims):,}")
    # --- Firm A (known replication) ---
    # Same intra-class computation restricted to accountants of the known
    # replication firm; min pair count is 2 here (not INTRA_CLASS_MIN_SIGNATURES).
    print(f" Computing Firm A ({FIRM_A_NAME}) distribution...")
    firm_a_accts = [acct for acct in acct_groups if acct_firm.get(acct) == FIRM_A_NAME]
    firm_a_sims = []
    for acct in firm_a_accts:
        indices = acct_groups[acct]
        if len(indices) < 2:
            continue
        vecs = valid_features[indices]
        sim_matrix = vecs @ vecs.T
        n = len(indices)
        triu_idx = np.triu_indices(n, k=1)
        firm_a_sims.extend(sim_matrix[triu_idx].tolist())
    firm_a_sims = np.array(firm_a_sims) if firm_a_sims else np.array([])
    print(f" Firm A accountants: {len(firm_a_accts)}, pairs: {len(firm_a_sims):,}")

    # --- Statistics ---
    def dist_stats(arr, name):
        # Summary statistics of one similarity distribution (JSON-serializable floats).
        return {
            "name": name,
            "n": len(arr),
            "mean": float(np.mean(arr)),
            "std": float(np.std(arr)),
            "median": float(np.median(arr)),
            "p1": float(np.percentile(arr, 1)),
            "p5": float(np.percentile(arr, 5)),
            "p25": float(np.percentile(arr, 25)),
            "p75": float(np.percentile(arr, 75)),
            "p95": float(np.percentile(arr, 95)),
            "p99": float(np.percentile(arr, 99)),
            "min": float(np.min(arr)),
            "max": float(np.max(arr)),
        }

    intra_stats = dist_stats(intra_sims, "intra")
    inter_stats = dist_stats(inter_sims, "inter")
    firm_a_stats = dist_stats(firm_a_sims, "firm_a") if len(firm_a_sims) > 0 else None
    # Cohen's d: standardized separation between intra and inter means,
    # using the pooled standard deviation.
    pooled_std = np.sqrt((intra_stats["std"]**2 + inter_stats["std"]**2) / 2)
    cohens_d = (intra_stats["mean"] - inter_stats["mean"]) / pooled_std if pooled_std > 0 else 0
    # KDE crossover: point where the intra-class density overtakes the
    # inter-class density — used as a decision threshold. Only crossings in
    # (0.5, 1.0) are considered; the last one is taken.
    try:
        from scipy.stats import gaussian_kde
        x_grid = np.linspace(0, 1, 1000)
        kde_intra = gaussian_kde(intra_sims)
        kde_inter = gaussian_kde(inter_sims)
        diff = kde_intra(x_grid) - kde_inter(x_grid)
        sign_changes = np.where(np.diff(np.sign(diff)))[0]
        crossovers = x_grid[sign_changes]
        valid_crossovers = crossovers[(crossovers > 0.5) & (crossovers < 1.0)]
        kde_crossover = float(valid_crossovers[-1]) if len(valid_crossovers) > 0 else None
    except Exception as e:
        # Best-effort: KDE can fail (e.g. scipy missing or degenerate data);
        # the rest of the analysis is still reported.
        print(f" KDE crossover computation failed: {e}")
        kde_crossover = None
    results = {
        "backbone": backbone_name,
        "description": BACKBONES[backbone_name]["description"],
        "feature_dim": BACKBONES[backbone_name]["feature_dim"],
        "intra": intra_stats,
        "inter": inter_stats,
        "firm_a": firm_a_stats,
        "cohens_d": float(cohens_d),
        "kde_crossover": kde_crossover,
    }
    # Print summary
    print(f"\n --- {backbone_name} Summary ---")
    print(f" Feature dim: {results['feature_dim']}")
    print(f" Intra mean: {intra_stats['mean']:.4f} +/- {intra_stats['std']:.4f}")
    print(f" Inter mean: {inter_stats['mean']:.4f} +/- {inter_stats['std']:.4f}")
    print(f" Cohen's d: {cohens_d:.4f}")
    print(f" KDE crossover: {kde_crossover}")
    if firm_a_stats:
        print(f" Firm A mean: {firm_a_stats['mean']:.4f} +/- {firm_a_stats['std']:.4f}")
        print(f" Firm A 1st pct: {firm_a_stats['p1']:.4f}")
    return results


def generate_comparison_table(all_results):
    """Generate a markdown comparison table.

    Prints the table, writes OUTPUT_DIR/ablation_comparison.md and
    OUTPUT_DIR/ablation_results.json, and returns the table string.
    Missing backbones/metrics render as "---".
    """
    print(f"\n{'='*60}")
    print("COMPARISON TABLE")
    print(f"{'='*60}\n")
    results_by_name = {r["backbone"]: r for r in all_results}

    def get_val(backbone, key, sub=None):
        # Safe lookup: returns None if the backbone, sub-section, or key is absent
        # (e.g. firm_a is None when no Firm A pairs existed).
        r = results_by_name.get(backbone)
        if not r:
            return None
        if sub:
            section = r.get(sub)
            if isinstance(section, dict):
                return section.get(key)
            return None
        return r.get(key)

    def fmt(val, fmt_str=".4f"):
        # Format a metric cell; None -> "---", ints verbatim, floats per fmt_str.
        if val is None:
            return "---"
        if isinstance(val, int):
            return str(val)
        return f"{val:{fmt_str}}"

    # NOTE(review): 'names' is unused below — candidate for removal.
    names = ["resnet50", "vgg16", "efficientnet_b0"]
    header = "| Metric | ResNet-50 | VGG-16 | EfficientNet-B0 |"
    sep = "|--------|-----------|--------|-----------------|"
    rows = [
        f"| Feature dim | {fmt(get_val('resnet50','feature_dim'),'')} | {fmt(get_val('vgg16','feature_dim'),'')} | {fmt(get_val('efficientnet_b0','feature_dim'),'')} |",
        f"| Intra mean | {fmt(get_val('resnet50','mean','intra'))} | {fmt(get_val('vgg16','mean','intra'))} | {fmt(get_val('efficientnet_b0','mean','intra'))} |",
        f"| Intra std | {fmt(get_val('resnet50','std','intra'))} | {fmt(get_val('vgg16','std','intra'))} | {fmt(get_val('efficientnet_b0','std','intra'))} |",
        f"| Inter mean | {fmt(get_val('resnet50','mean','inter'))} | {fmt(get_val('vgg16','mean','inter'))} | {fmt(get_val('efficientnet_b0','mean','inter'))} |",
        f"| Inter std | {fmt(get_val('resnet50','std','inter'))} | {fmt(get_val('vgg16','std','inter'))} | {fmt(get_val('efficientnet_b0','std','inter'))} |",
        f"| **Cohen's d** | **{fmt(get_val('resnet50','cohens_d'))}** | **{fmt(get_val('vgg16','cohens_d'))}** | **{fmt(get_val('efficientnet_b0','cohens_d'))}** |",
        f"| KDE crossover | {fmt(get_val('resnet50','kde_crossover'))} | {fmt(get_val('vgg16','kde_crossover'))} | {fmt(get_val('efficientnet_b0','kde_crossover'))} |",
        f"| Firm A mean | {fmt(get_val('resnet50','mean','firm_a'))} | {fmt(get_val('vgg16','mean','firm_a'))} | {fmt(get_val('efficientnet_b0','mean','firm_a'))} |",
        f"| Firm A 1st pct | {fmt(get_val('resnet50','p1','firm_a'))} | {fmt(get_val('vgg16','p1','firm_a'))} | {fmt(get_val('efficientnet_b0','p1','firm_a'))} |",
    ]
    table = "\n".join([header, sep] + rows)
    print(table)
    # Save report
    report_path = OUTPUT_DIR / "ablation_comparison.md"
    with open(report_path, 'w') as f:
        f.write("# Ablation Study: Backbone Comparison\n\n")
        f.write(f"Date: {time.strftime('%Y-%m-%d %H:%M')}\n\n")
        f.write("## Comparison Table\n\n")
        f.write(table + "\n\n")
        f.write("## Interpretation\n\n")
        f.write("- **Cohen's d**: Higher = better separation between same-CPA and different-CPA signatures\n")
        f.write("- **KDE crossover**: The Bayes-optimal decision boundary (higher = easier to classify)\n")
        f.write("- **Firm A**: Known replication firm; expect very high mean similarity\n")
        f.write("- **Firm A 1st percentile**: Lower bound of known-replication similarity\n")
    # Raw results as JSON (ensure_ascii=False keeps the CJK firm name readable).
    json_path = OUTPUT_DIR / "ablation_results.json"
    with open(json_path, 'w') as f:
        json.dump(all_results, f, indent=2, ensure_ascii=False)
    print(f"\n Report saved: {report_path}")
    print(f" Raw data saved: {json_path}")
    return table


def main():
    """CLI entry point.

    Phases:
      1. Feature extraction (skipped with --analyze). ResNet-50 features are
         copied from the main pipeline's cache when available instead of
         re-extracting.
      2. Analysis (skipped with --extract): per-backbone stats and, when more
         than one backbone was analyzed, the markdown comparison table.
    """
    parser = argparse.ArgumentParser(description="Ablation: backbone comparison")
    parser.add_argument("--extract", action="store_true", help="Feature extraction only")
    parser.add_argument("--analyze", action="store_true", help="Analysis only")
    parser.add_argument("--backbone", type=str, help="Run single backbone (resnet50/vgg16/efficientnet_b0)")
    args = parser.parse_args()
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    # Load filenames — the row order shared by every cached feature array.
    with open(FILENAMES_PATH) as f:
        filenames = [line.strip() for line in f if line.strip()]
    backbones_to_run = [args.backbone] if args.backbone else list(BACKBONES.keys())
    if not args.analyze:
        # === Phase 1: Feature Extraction ===
        print("\n" + "=" * 60)
        print("PHASE 1: FEATURE EXTRACTION")
        print("=" * 60)
        # For ResNet-50, copy existing features instead of re-extracting
        # NOTE(review): assumes the existing array's rows match FILENAMES_PATH order — verify upstream pipeline.
        resnet_ablation_path = OUTPUT_DIR / "features_resnet50.npy"
        resnet_existing_path = FEATURES_DIR / "signature_features.npy"
        if "resnet50" in backbones_to_run and not resnet_ablation_path.exists() and resnet_existing_path.exists():
            print(f"\nCopying existing ResNet-50 features...")
            import shutil
            resnet_ablation_path.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(resnet_existing_path, resnet_ablation_path)
            print(f" Copied: {resnet_ablation_path}")
        for name in backbones_to_run:
            if name == "resnet50" and resnet_ablation_path.exists():
                continue
            extract_features(name)
        if args.extract:
            print("\nFeature extraction complete. Run with --analyze to compute statistics.")
            return
    # === Phase 2: Analysis ===
    print("\n" + "=" * 60)
    print("PHASE 2: ANALYSIS")
    print("=" * 60)
    filename_to_acct, acct_firm = load_accountant_data()
    all_results = []
    for name in backbones_to_run:
        feat_path = OUTPUT_DIR / f"features_{name}.npy"
        if not feat_path.exists():
            print(f"\n WARNING: {feat_path} not found, skipping {name}")
            continue
        features = np.load(feat_path)
        results = analyze_backbone(name, features, filenames, filename_to_acct, acct_firm)
        all_results.append(results)
    if len(all_results) > 1:
        generate_comparison_table(all_results)
    elif len(all_results) == 1:
        print(f"\nOnly one backbone analyzed. Run all three for comparison table.")
    print("\nDone!")


if __name__ == "__main__":
    main()