939a348da4
Paper draft includes all sections (Abstract through Conclusion), 36 references, and supporting scripts. Key methodology: Cosine similarity + dHash dual-method verification with thresholds calibrated against known-replication firm (Firm A). Includes: - 8 section markdown files (paper_a_*.md) - Ablation study script (ResNet-50 vs VGG-16 vs EfficientNet-B0) - Recalibrated classification script (84,386 PDFs, 5-tier system) - Figure generation and Word export scripts - Citation renumbering script ([1]-[36]) - Signature analysis pipeline (12 steps) - YOLO extraction scripts Three rounds of AI review completed (GPT-5.4, Claude Opus 4.6, Gemini 3 Pro). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
494 lines
18 KiB
Python
494 lines
18 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Ablation Study: Backbone Comparison for Signature Feature Extraction
|
|
====================================================================
|
|
Compares ResNet-50 vs VGG-16 vs EfficientNet-B0 on:
|
|
1. Feature extraction speed
|
|
2. Intra/Inter class cosine similarity separation (Cohen's d)
|
|
3. KDE crossover point
|
|
4. Firm A (known replication) distribution
|
|
|
|
Usage:
|
|
python ablation_backbone_comparison.py # Run all backbones
|
|
python ablation_backbone_comparison.py --extract # Feature extraction only
|
|
python ablation_backbone_comparison.py --analyze # Analysis only (features must exist)
|
|
"""
|
|
|
|
import torch
|
|
import torch.nn as nn
|
|
import torchvision.models as models
|
|
import torchvision.transforms as transforms
|
|
from torch.utils.data import Dataset, DataLoader
|
|
import numpy as np
|
|
import sqlite3
|
|
import time
|
|
import argparse
|
|
import json
|
|
from pathlib import Path
|
|
from collections import defaultdict
|
|
from tqdm import tqdm
|
|
import warnings
|
|
warnings.filterwarnings('ignore')
|
|
|
|
# === Configuration ===
# All inputs/outputs live on the external NV2 volume used by the pipeline.
IMAGES_DIR = Path("/Volumes/NV2/PDF-Processing/yolo-signatures/images")
FEATURES_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis/features")
DB_PATH = Path("/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db")
OUTPUT_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis/ablation")
# One image filename per line; row order must match cached feature matrices.
FILENAMES_PATH = FEATURES_DIR / "signature_filenames.txt"

BATCH_SIZE = 64
NUM_WORKERS = 4
# Device preference: Apple MPS, then CUDA, then CPU fallback.
DEVICE = torch.device("mps" if torch.backends.mps.is_available() else
                      "cuda" if torch.cuda.is_available() else "cpu")

# Sampling for analysis
INTER_CLASS_SAMPLE_SIZE = 500_000  # number of random cross-accountant pairs
INTRA_CLASS_MIN_SIGNATURES = 3     # skip accountants with fewer signatures
RANDOM_SEED = 42                   # makes inter-class sampling reproducible

# Known replication firm (Deloitte Taiwan = 勤業眾信)
FIRM_A_NAME = "勤業眾信聯合"

# Candidate backbones: torchvision factory + penultimate feature dimension
# (dimension of the vector produced once the classification head is removed).
BACKBONES = {
    "resnet50": {
        "model_fn": lambda: models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2),
        "feature_dim": 2048,
        "description": "ResNet-50 (ImageNet1K_V2)",
    },
    "vgg16": {
        "model_fn": lambda: models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1),
        "feature_dim": 4096,
        "description": "VGG-16 (ImageNet1K_V1)",
    },
    "efficientnet_b0": {
        "model_fn": lambda: models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.IMAGENET1K_V1),
        "feature_dim": 1280,
        "description": "EfficientNet-B0 (ImageNet1K_V1)",
    },
}
|
|
|
|
|
|
class SignatureDataset(Dataset):
    """Dataset yielding (image, filename) pairs for signature crops.

    Each image is letterboxed onto a white 224x224 canvas; unreadable files
    are replaced by a blank white canvas so the batch pipeline never stalls
    on a corrupt image.
    """

    def __init__(self, image_paths, transform=None):
        self.image_paths = image_paths
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        # cv2 is imported lazily so each DataLoader worker process pulls it
        # in on first use rather than at module import time.
        import cv2
        path = self.image_paths[idx]
        bgr = cv2.imread(str(path))
        if bgr is None:
            # Unreadable file -> plain white 224x224 placeholder.
            img = np.full((224, 224, 3), 255, dtype=np.uint8)
        else:
            rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
            img = self._resize_with_padding(rgb, 224, 224)
        if self.transform:
            img = self.transform(img)
        return img, str(path.name)

    @staticmethod
    def _resize_with_padding(img, target_w, target_h):
        """Scale *img* to fit (target_w, target_h) and center it on white."""
        import cv2
        h, w = img.shape[:2]
        ratio = min(target_w / w, target_h / h)
        new_w, new_h = int(w * ratio), int(h * ratio)
        scaled = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
        canvas = np.full((target_h, target_w, 3), 255, dtype=np.uint8)
        left = (target_w - new_w) // 2
        top = (target_h - new_h) // 2
        canvas[top:top + new_h, left:left + new_w] = scaled
        return canvas
|
|
|
|
|
|
def build_feature_extractor(backbone_name):
    """Return an eval-mode feature extractor on DEVICE for *backbone_name*.

    The classification head of the torchvision model is stripped so the
    forward pass yields the penultimate feature vector whose size is
    recorded in BACKBONES[backbone_name]["feature_dim"].
    """
    base = BACKBONES[backbone_name]["model_fn"]()
    extractor = base

    if backbone_name == "vgg16":
        # Conv stack -> avgpool -> flatten -> classifier MLP minus its final
        # Linear layer, giving the 4096-dim penultimate activation.
        head = nn.Sequential(*list(base.classifier.children())[:-1])
        extractor = nn.Sequential(base.features, base.avgpool, nn.Flatten(), head)
    elif backbone_name == "resnet50":
        # Drop the final fc layer; global-avgpool output is 2048-dim.
        extractor = nn.Sequential(*list(base.children())[:-1])
    elif backbone_name == "efficientnet_b0":
        # Identity classifier -> pooled 1280-dim features pass straight through.
        base.classifier = nn.Identity()

    extractor = extractor.to(DEVICE)
    extractor.eval()
    return extractor
|
|
|
|
|
|
def extract_features(backbone_name):
    """Run *backbone_name* over every signature image; return an (N, D) array.

    Features are L2-normalized row-wise and cached at
    OUTPUT_DIR/features_<backbone>.npy. If the cache exists it is loaded and
    extraction is skipped entirely.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"Extracting features: {BACKBONES[backbone_name]['description']}")
    print(banner)

    cache_path = OUTPUT_DIR / f"features_{backbone_name}.npy"
    if cache_path.exists():
        print(f" Features already exist: {cache_path}")
        print(" Skipping extraction. Delete file to re-extract.")
        return np.load(cache_path)

    # Load the canonical filename list; row order of the output matches it.
    with open(FILENAMES_PATH) as fh:
        filenames = [ln.strip() for ln in fh if ln.strip()]
    print(f" Images: {len(filenames):,}")

    paths = [IMAGES_DIR / name for name in filenames]

    model = build_feature_extractor(backbone_name)

    # Standard ImageNet normalization to match the pretrained weights.
    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    loader = DataLoader(
        SignatureDataset(paths, transform=preprocess),
        batch_size=BATCH_SIZE,
        shuffle=False,  # keep batch order aligned with `filenames`
        num_workers=NUM_WORKERS,
        pin_memory=True,
    )

    chunks = []
    t0 = time.time()

    with torch.no_grad():
        for batch, _ in tqdm(loader, desc=f" {backbone_name}"):
            out = model(batch.to(DEVICE))
            out = out.view(out.size(0), -1)                 # flatten
            out = nn.functional.normalize(out, p=2, dim=1)  # L2 normalize
            chunks.append(out.cpu().numpy())

    elapsed = time.time() - t0
    feats = np.vstack(chunks)

    print(f" Feature shape: {feats.shape}")
    print(f" Time: {elapsed:.1f}s ({elapsed/60:.1f}min)")
    print(f" Speed: {len(filenames)/elapsed:.1f} images/sec")

    # Persist the cache for later --analyze runs.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    np.save(cache_path, feats)
    print(f" Saved: {cache_path} ({feats.nbytes / 1e9:.2f} GB)")

    return feats
|
|
|
|
|
|
def load_accountant_data():
    """Load accountant assignments and firm info from the SQLite DB.

    Returns:
        tuple (filename_to_acct, acct_firm):
            filename_to_acct: image filename -> assigned accountant, limited
                to signatures whose feature vector has been extracted.
            acct_firm: accountant name -> firm name.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cur = conn.cursor()

        cur.execute('''
            SELECT image_filename, assigned_accountant
            FROM signatures
            WHERE feature_vector IS NOT NULL
            ORDER BY signature_id
        ''')
        filename_to_acct = {row[0]: row[1] for row in cur.fetchall()}

        cur.execute('SELECT name, firm FROM accountants')
        acct_firm = {row[0]: row[1] for row in cur.fetchall()}
    finally:
        # Close the connection even if a query raises (e.g. schema mismatch);
        # the original code leaked the handle on any exception.
        conn.close()

    return filename_to_acct, acct_firm
|
|
|
|
|
|
def analyze_backbone(backbone_name, features, filenames, filename_to_acct, acct_firm):
    """Compute intra/inter class stats for a backbone's features.

    Args:
        backbone_name: key into BACKBONES.
        features: (N, D) array of L2-normalized feature vectors, row-aligned
            with `filenames` (so a dot product is cosine similarity).
        filenames: image filenames in feature-row order.
        filename_to_acct: filename -> accountant name (may miss some files).
        acct_firm: accountant name -> firm name.

    Returns:
        dict with intra/inter/firm_a distribution stats, Cohen's d, and the
        KDE crossover point (None if it could not be computed).

    NOTE: the inter-class sampling below consumes the seeded NumPy RNG in a
    fixed statement order; reordering these calls would change the sample.
    """
    print(f"\n{'='*60}")
    print(f"Analyzing: {BACKBONES[backbone_name]['description']}")
    print(f"{'='*60}")

    # Reseed so the random inter-class sample is reproducible per backbone.
    np.random.seed(RANDOM_SEED)

    # Map features to accountants
    accountants = []
    valid_indices = []
    for i, fn in enumerate(filenames):
        acct = filename_to_acct.get(fn)
        if acct:
            accountants.append(acct)
            valid_indices.append(i)

    # Keep only rows with a known accountant; indices below refer to this
    # filtered matrix, not the original `features`.
    valid_features = features[valid_indices]
    print(f" Valid signatures with accountant: {len(valid_indices):,}")

    # Group by accountant
    acct_groups = defaultdict(list)
    for i, acct in enumerate(accountants):
        acct_groups[acct].append(i)

    # --- Intra-class ---
    # All unique same-accountant pairs (upper triangle, k=1 excludes self-pairs)
    # for accountants with at least INTRA_CLASS_MIN_SIGNATURES signatures.
    print(" Computing intra-class similarities...")
    intra_sims = []
    for acct, indices in tqdm(acct_groups.items(), desc=" Intra-class", leave=False):
        if len(indices) < INTRA_CLASS_MIN_SIGNATURES:
            continue
        vecs = valid_features[indices]
        # Rows are L2-normalized upstream, so the Gram matrix is cosine sim.
        sim_matrix = vecs @ vecs.T
        n = len(indices)
        triu_idx = np.triu_indices(n, k=1)
        intra_sims.extend(sim_matrix[triu_idx].tolist())

    intra_sims = np.array(intra_sims)
    print(f" Intra-class pairs: {len(intra_sims):,}")

    # --- Inter-class ---
    # Monte Carlo sample of cross-accountant pairs: pick two distinct
    # accountants, then one signature from each. Pairs may repeat across
    # iterations (sampling with replacement at the pair level).
    print(" Computing inter-class similarities...")
    all_acct_list = list(acct_groups.keys())
    inter_sims = []
    for _ in range(INTER_CLASS_SAMPLE_SIZE):
        a1, a2 = np.random.choice(len(all_acct_list), 2, replace=False)
        i1 = np.random.choice(acct_groups[all_acct_list[a1]])
        i2 = np.random.choice(acct_groups[all_acct_list[a2]])
        sim = float(valid_features[i1] @ valid_features[i2])
        inter_sims.append(sim)
    inter_sims = np.array(inter_sims)
    print(f" Inter-class pairs: {len(inter_sims):,}")

    # --- Firm A (known replication) ---
    # Same intra-class computation, restricted to accountants of the known
    # replication firm; minimum of 2 signatures (not the global threshold).
    print(f" Computing Firm A ({FIRM_A_NAME}) distribution...")
    firm_a_accts = [acct for acct in acct_groups if acct_firm.get(acct) == FIRM_A_NAME]
    firm_a_sims = []
    for acct in firm_a_accts:
        indices = acct_groups[acct]
        if len(indices) < 2:
            continue
        vecs = valid_features[indices]
        sim_matrix = vecs @ vecs.T
        n = len(indices)
        triu_idx = np.triu_indices(n, k=1)
        firm_a_sims.extend(sim_matrix[triu_idx].tolist())
    firm_a_sims = np.array(firm_a_sims) if firm_a_sims else np.array([])
    print(f" Firm A accountants: {len(firm_a_accts)}, pairs: {len(firm_a_sims):,}")

    # --- Statistics ---
    def dist_stats(arr, name):
        # Summary statistics for one similarity distribution.
        return {
            "name": name,
            "n": len(arr),
            "mean": float(np.mean(arr)),
            "std": float(np.std(arr)),
            "median": float(np.median(arr)),
            "p1": float(np.percentile(arr, 1)),
            "p5": float(np.percentile(arr, 5)),
            "p25": float(np.percentile(arr, 25)),
            "p75": float(np.percentile(arr, 75)),
            "p95": float(np.percentile(arr, 95)),
            "p99": float(np.percentile(arr, 99)),
            "min": float(np.min(arr)),
            "max": float(np.max(arr)),
        }

    intra_stats = dist_stats(intra_sims, "intra")
    inter_stats = dist_stats(inter_sims, "inter")
    firm_a_stats = dist_stats(firm_a_sims, "firm_a") if len(firm_a_sims) > 0 else None

    # Cohen's d
    # Effect size of intra vs inter separation with a pooled (equal-weight)
    # standard deviation; 0 if both distributions are degenerate.
    pooled_std = np.sqrt((intra_stats["std"]**2 + inter_stats["std"]**2) / 2)
    cohens_d = (intra_stats["mean"] - inter_stats["mean"]) / pooled_std if pooled_std > 0 else 0

    # KDE crossover
    # Rightmost point in (0.5, 1.0) where the intra- and inter-class density
    # estimates cross; used as a decision-threshold estimate.
    try:
        from scipy.stats import gaussian_kde
        x_grid = np.linspace(0, 1, 1000)
        kde_intra = gaussian_kde(intra_sims)
        kde_inter = gaussian_kde(inter_sims)
        diff = kde_intra(x_grid) - kde_inter(x_grid)
        sign_changes = np.where(np.diff(np.sign(diff)))[0]
        crossovers = x_grid[sign_changes]
        valid_crossovers = crossovers[(crossovers > 0.5) & (crossovers < 1.0)]
        kde_crossover = float(valid_crossovers[-1]) if len(valid_crossovers) > 0 else None
    except Exception as e:
        # Best-effort: scipy missing or KDE failure degrades to None rather
        # than aborting the whole analysis.
        print(f" KDE crossover computation failed: {e}")
        kde_crossover = None

    results = {
        "backbone": backbone_name,
        "description": BACKBONES[backbone_name]["description"],
        "feature_dim": BACKBONES[backbone_name]["feature_dim"],
        "intra": intra_stats,
        "inter": inter_stats,
        "firm_a": firm_a_stats,
        "cohens_d": float(cohens_d),
        "kde_crossover": kde_crossover,
    }

    # Print summary
    print(f"\n --- {backbone_name} Summary ---")
    print(f" Feature dim: {results['feature_dim']}")
    print(f" Intra mean: {intra_stats['mean']:.4f} +/- {intra_stats['std']:.4f}")
    print(f" Inter mean: {inter_stats['mean']:.4f} +/- {inter_stats['std']:.4f}")
    print(f" Cohen's d: {cohens_d:.4f}")
    print(f" KDE crossover: {kde_crossover}")
    if firm_a_stats:
        print(f" Firm A mean: {firm_a_stats['mean']:.4f} +/- {firm_a_stats['std']:.4f}")
        print(f" Firm A 1st pct: {firm_a_stats['p1']:.4f}")

    return results
|
|
|
|
|
|
def generate_comparison_table(all_results):
    """Print a markdown comparison table and persist it plus raw JSON results.

    Args:
        all_results: list of per-backbone result dicts (as produced by
            analyze_backbone()).

    Returns:
        The markdown table as a single string.
    """
    print(f"\n{'='*60}")
    print("COMPARISON TABLE")
    print(f"{'='*60}\n")

    results_by_name = {r["backbone"]: r for r in all_results}

    def get_val(backbone, key, sub=None):
        # Fetch a metric for one backbone; None when the backbone was skipped
        # or the sub-section is absent (e.g. firm_a is None).
        r = results_by_name.get(backbone)
        if not r:
            return None
        if sub:
            section = r.get(sub)
            if isinstance(section, dict):
                return section.get(key)
            return None
        return r.get(key)

    def fmt(val, fmt_str=".4f"):
        # "---" marks missing values; ints (e.g. feature_dim) print verbatim.
        if val is None:
            return "---"
        if isinstance(val, int):
            return str(val)
        return f"{val:{fmt_str}}"

    names = ["resnet50", "vgg16", "efficientnet_b0"]

    def row(label, key, sub=None, fmt_str=".4f", bold=False):
        # One markdown row: the metric label followed by a cell per backbone.
        # (The original built each row by hand and left `names` unused.)
        cells = [fmt(get_val(name, key, sub), fmt_str) for name in names]
        if bold:
            label = f"**{label}**"
            cells = [f"**{c}**" for c in cells]
        return "| " + " | ".join([label] + cells) + " |"

    header = "| Metric | ResNet-50 | VGG-16 | EfficientNet-B0 |"
    sep = "|--------|-----------|--------|-----------------|"

    rows = [
        row("Feature dim", "feature_dim", fmt_str=""),
        row("Intra mean", "mean", "intra"),
        row("Intra std", "std", "intra"),
        row("Inter mean", "mean", "inter"),
        row("Inter std", "std", "inter"),
        row("Cohen's d", "cohens_d", bold=True),
        row("KDE crossover", "kde_crossover"),
        row("Firm A mean", "mean", "firm_a"),
        row("Firm A 1st pct", "p1", "firm_a"),
    ]

    table = "\n".join([header, sep] + rows)
    print(table)

    # Save report. Explicit UTF-8: json.dump uses ensure_ascii=False and the
    # pipeline handles non-ASCII (Chinese) names, so relying on the platform
    # default encoding could raise UnicodeEncodeError on non-UTF-8 locales.
    report_path = OUTPUT_DIR / "ablation_comparison.md"
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write("# Ablation Study: Backbone Comparison\n\n")
        f.write(f"Date: {time.strftime('%Y-%m-%d %H:%M')}\n\n")
        f.write("## Comparison Table\n\n")
        f.write(table + "\n\n")
        f.write("## Interpretation\n\n")
        f.write("- **Cohen's d**: Higher = better separation between same-CPA and different-CPA signatures\n")
        f.write("- **KDE crossover**: The Bayes-optimal decision boundary (higher = easier to classify)\n")
        f.write("- **Firm A**: Known replication firm; expect very high mean similarity\n")
        f.write("- **Firm A 1st percentile**: Lower bound of known-replication similarity\n")

    json_path = OUTPUT_DIR / "ablation_results.json"
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, indent=2, ensure_ascii=False)

    print(f"\n Report saved: {report_path}")
    print(f" Raw data saved: {json_path}")

    return table
|
|
|
|
|
|
def main():
    """CLI entry point: feature extraction and/or backbone comparison.

    Flags:
        --extract   stop after the extraction phase.
        --analyze   skip extraction and analyze cached feature files.
        --backbone  restrict both phases to a single backbone.
    """
    parser = argparse.ArgumentParser(description="Ablation: backbone comparison")
    parser.add_argument("--extract", action="store_true", help="Feature extraction only")
    parser.add_argument("--analyze", action="store_true", help="Analysis only")
    # choices= rejects unknown backbones up front with a clear argparse error
    # instead of a KeyError deep inside extraction.
    parser.add_argument("--backbone", type=str, choices=list(BACKBONES.keys()),
                        help="Run single backbone (resnet50/vgg16/efficientnet_b0)")
    args = parser.parse_args()

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Load filenames (row order must match the feature matrices).
    with open(FILENAMES_PATH) as f:
        filenames = [line.strip() for line in f if line.strip()]

    backbones_to_run = [args.backbone] if args.backbone else list(BACKBONES.keys())

    if not args.analyze:
        # === Phase 1: Feature Extraction ===
        print("\n" + "=" * 60)
        print("PHASE 1: FEATURE EXTRACTION")
        print("=" * 60)

        # For ResNet-50, copy existing features instead of re-extracting:
        # the main pipeline already produced them with the same backbone.
        resnet_ablation_path = OUTPUT_DIR / "features_resnet50.npy"
        resnet_existing_path = FEATURES_DIR / "signature_features.npy"
        if "resnet50" in backbones_to_run and not resnet_ablation_path.exists() and resnet_existing_path.exists():
            print("\nCopying existing ResNet-50 features...")
            import shutil
            resnet_ablation_path.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(resnet_existing_path, resnet_ablation_path)
            print(f" Copied: {resnet_ablation_path}")

        for name in backbones_to_run:
            if name == "resnet50" and resnet_ablation_path.exists():
                continue
            extract_features(name)

    if args.extract:
        print("\nFeature extraction complete. Run with --analyze to compute statistics.")
        return

    # === Phase 2: Analysis ===
    print("\n" + "=" * 60)
    print("PHASE 2: ANALYSIS")
    print("=" * 60)

    filename_to_acct, acct_firm = load_accountant_data()

    all_results = []
    for name in backbones_to_run:
        feat_path = OUTPUT_DIR / f"features_{name}.npy"
        if not feat_path.exists():
            print(f"\n WARNING: {feat_path} not found, skipping {name}")
            continue
        features = np.load(feat_path)
        results = analyze_backbone(name, features, filenames, filename_to_acct, acct_firm)
        all_results.append(results)

    if len(all_results) > 1:
        generate_comparison_table(all_results)
    elif len(all_results) == 1:
        print("\nOnly one backbone analyzed. Run all three for comparison table.")

    print("\nDone!")


if __name__ == "__main__":
    main()
|