Files
pdf_signature_extraction/paper/ablation_backbone_comparison.py
T
gbanyan 939a348da4 Add Paper A (IEEE TAI) complete draft with Firm A-calibrated dual-method classification
Paper draft includes all sections (Abstract through Conclusion), 36 references,
and supporting scripts. Key methodology: Cosine similarity + dHash dual-method
verification with thresholds calibrated against known-replication firm (Firm A).

Includes:
- 8 section markdown files (paper_a_*.md)
- Ablation study script (ResNet-50 vs VGG-16 vs EfficientNet-B0)
- Recalibrated classification script (84,386 PDFs, 5-tier system)
- Figure generation and Word export scripts
- Citation renumbering script ([1]-[36])
- Signature analysis pipeline (12 steps)
- YOLO extraction scripts

Three rounds of AI review completed (GPT-5.4, Claude Opus 4.6, Gemini 3 Pro).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-06 23:05:33 +08:00

494 lines
18 KiB
Python

#!/usr/bin/env python3
"""
Ablation Study: Backbone Comparison for Signature Feature Extraction
====================================================================
Compares ResNet-50 vs VGG-16 vs EfficientNet-B0 on:
1. Feature extraction speed
2. Intra/Inter class cosine similarity separation (Cohen's d)
3. KDE crossover point
4. Firm A (known replication) distribution
Usage:
python ablation_backbone_comparison.py # Run all backbones
python ablation_backbone_comparison.py --extract # Feature extraction only
python ablation_backbone_comparison.py --analyze # Analysis only (features must exist)
"""
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import numpy as np
import sqlite3
import time
import argparse
import json
from pathlib import Path
from collections import defaultdict
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
# === Configuration ===
# All paths point at an external volume holding the signature-analysis
# pipeline artifacts (cropped signature crops, cached features, SQLite DB).
IMAGES_DIR = Path("/Volumes/NV2/PDF-Processing/yolo-signatures/images")
FEATURES_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis/features")
DB_PATH = Path("/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db")
OUTPUT_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis/ablation")
# One image filename per line; defines the row order of every feature matrix.
FILENAMES_PATH = FEATURES_DIR / "signature_filenames.txt"
BATCH_SIZE = 64
NUM_WORKERS = 4
# Prefer Apple MPS, then CUDA, then CPU.
DEVICE = torch.device("mps" if torch.backends.mps.is_available() else
                      "cuda" if torch.cuda.is_available() else "cpu")
# Sampling for analysis
INTER_CLASS_SAMPLE_SIZE = 500_000   # random cross-accountant pairs to sample
INTRA_CLASS_MIN_SIGNATURES = 3      # skip accountants with fewer signatures
RANDOM_SEED = 42                    # makes inter-class sampling reproducible
# Known replication firm (Deloitte Taiwan = 勤業眾信)
FIRM_A_NAME = "勤業眾信聯合"
# Backbone registry: lazy constructors (weights download on first call),
# the flattened feature dimensionality each one emits, and a display label.
BACKBONES = {
    "resnet50": {
        "model_fn": lambda: models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2),
        "feature_dim": 2048,
        "description": "ResNet-50 (ImageNet1K_V2)",
    },
    "vgg16": {
        "model_fn": lambda: models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1),
        "feature_dim": 4096,
        "description": "VGG-16 (ImageNet1K_V1)",
    },
    "efficientnet_b0": {
        "model_fn": lambda: models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.IMAGENET1K_V1),
        "feature_dim": 1280,
        "description": "EfficientNet-B0 (ImageNet1K_V1)",
    },
}
class SignatureDataset(Dataset):
    """Dataset yielding (image, filename) pairs for cropped signature images.

    Each image is loaded with OpenCV, converted to RGB, and letterboxed onto
    a white 224x224 canvas. An unreadable file yields an all-white canvas so
    a single corrupt image never aborts a long extraction run.
    """

    def __init__(self, image_paths, transform=None):
        self.image_paths = image_paths
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        # cv2 is imported lazily so each DataLoader worker process binds it
        # in its own interpreter.
        import cv2
        path = self.image_paths[idx]
        raw = cv2.imread(str(path))
        if raw is None:
            # Unreadable file: fall back to a blank white RGB canvas.
            image = np.full((224, 224, 3), 255, dtype=np.uint8)
        else:
            rgb = cv2.cvtColor(raw, cv2.COLOR_BGR2RGB)
            image = self._resize_with_padding(rgb, 224, 224)
        if self.transform:
            image = self.transform(image)
        return image, str(path.name)

    @staticmethod
    def _resize_with_padding(img, target_w, target_h):
        """Aspect-preserving resize of img, centered on a white canvas."""
        import cv2
        h, w = img.shape[:2]
        scale = min(target_w / w, target_h / h)
        new_w = int(w * scale)
        new_h = int(h * scale)
        shrunk = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
        canvas = np.full((target_h, target_w, 3), 255, dtype=np.uint8)
        left = (target_w - new_w) // 2
        top = (target_h - new_h) // 2
        canvas[top:top + new_h, left:left + new_w] = shrunk
        return canvas
def build_feature_extractor(backbone_name):
    """Build a feature extractor for the given backbone.

    Loads the pretrained network named by `backbone_name` (a BACKBONES key),
    strips its classification head so the forward pass emits the penultimate
    feature vector, moves it to DEVICE, and switches it to eval mode.
    """
    base = BACKBONES[backbone_name]["model_fn"]()
    if backbone_name == "resnet50":
        # Drop the final fc layer; global average pool output remains.
        extractor = nn.Sequential(*list(base.children())[:-1])
    elif backbone_name == "vgg16":
        # Keep the classifier up to (but excluding) the last Linear so the
        # output is the 4096-dim penultimate activation. Dropout layers stay
        # but are inert under eval().
        head = nn.Sequential(*list(base.classifier.children())[:-1])
        extractor = nn.Sequential(base.features, base.avgpool, nn.Flatten(1), head)
    elif backbone_name == "efficientnet_b0":
        # Replace the classifier in place; the built-in pooling remains.
        base.classifier = nn.Identity()
        extractor = base
    else:
        extractor = base
    extractor = extractor.to(DEVICE)
    extractor.eval()
    return extractor
def extract_features(backbone_name):
    """Extract features for all signatures using the given backbone.

    Reads the filename list from FILENAMES_PATH, runs batched inference on
    DEVICE, L2-normalizes every feature vector (so later dot products equal
    cosine similarities), and caches the result as
    OUTPUT_DIR/features_<backbone>.npy. If that cache file already exists it
    is loaded and returned without re-extracting.

    Args:
        backbone_name: key into BACKBONES ("resnet50" / "vgg16" /
            "efficientnet_b0").

    Returns:
        np.ndarray of shape (num_images, feature_dim) with L2-normalized rows.
    """
    print(f"\n{'='*60}")
    print(f"Extracting features: {BACKBONES[backbone_name]['description']}")
    print(f"{'='*60}")
    output_path = OUTPUT_DIR / f"features_{backbone_name}.npy"
    # Reuse cached features from a previous run.
    if output_path.exists():
        print(f" Features already exist: {output_path}")
        print(f" Skipping extraction. Delete file to re-extract.")
        return np.load(output_path)
    # Load filenames
    with open(FILENAMES_PATH) as f:
        filenames = [line.strip() for line in f if line.strip()]
    print(f" Images: {len(filenames):,}")
    image_paths = [IMAGES_DIR / fn for fn in filenames]
    # Build model
    model = build_feature_extractor(backbone_name)
    # Standard ImageNet normalization, matching the pretrained weights.
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    dataset = SignatureDataset(image_paths, transform=transform)
    # shuffle=False keeps feature rows aligned with the filename list order.
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False,
                            num_workers=NUM_WORKERS, pin_memory=True)
    all_features = []
    start_time = time.time()
    with torch.no_grad():
        for images, _ in tqdm(dataloader, desc=f" {backbone_name}"):
            images = images.to(DEVICE)
            feats = model(images)
            feats = feats.view(feats.size(0), -1)  # flatten
            feats = nn.functional.normalize(feats, p=2, dim=1)  # L2 normalize
            all_features.append(feats.cpu().numpy())
    elapsed = time.time() - start_time
    all_features = np.vstack(all_features)
    print(f" Feature shape: {all_features.shape}")
    print(f" Time: {elapsed:.1f}s ({elapsed/60:.1f}min)")
    print(f" Speed: {len(filenames)/elapsed:.1f} images/sec")
    # Save
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    np.save(output_path, all_features)
    print(f" Saved: {output_path} ({all_features.nbytes / 1e9:.2f} GB)")
    return all_features
def load_accountant_data(db_path=None):
    """Load accountant assignments and firm info from DB.

    The connection is closed in a finally block so a failing query cannot
    leak the SQLite handle.

    Args:
        db_path: optional path to a SQLite database with `signatures` and
            `accountants` tables; defaults to the module-level DB_PATH.

    Returns:
        (filename_to_acct, acct_firm): filename_to_acct maps a signature
        image filename to its assigned accountant (only rows whose
        feature_vector is already populated); acct_firm maps accountant
        name -> firm name.
    """
    conn = sqlite3.connect(DB_PATH if db_path is None else db_path)
    try:
        cur = conn.cursor()
        cur.execute('''
            SELECT image_filename, assigned_accountant
            FROM signatures
            WHERE feature_vector IS NOT NULL
            ORDER BY signature_id
        ''')
        filename_to_acct = {r[0]: r[1] for r in cur.fetchall()}
        cur.execute('SELECT name, firm FROM accountants')
        acct_firm = {r[0]: r[1] for r in cur.fetchall()}
    finally:
        conn.close()
    return filename_to_acct, acct_firm
def analyze_backbone(backbone_name, features, filenames, filename_to_acct, acct_firm):
    """Compute intra/inter class stats for a backbone's features.

    Args:
        backbone_name: key into BACKBONES.
        features: (num_images, feature_dim) array, one row per entry of
            `filenames` in the same order; rows are assumed L2-normalized so
            dot products equal cosine similarities (extract_features
            guarantees this — confirm for features produced elsewhere).
        filenames: image filename for each row of `features`.
        filename_to_acct: filename -> assigned accountant name.
        acct_firm: accountant name -> firm name.

    Returns:
        dict with per-distribution stats ("intra", "inter", "firm_a"),
        Cohen's d between intra and inter means, and the KDE crossover
        point (or None when it cannot be computed).
    """
    print(f"\n{'='*60}")
    print(f"Analyzing: {BACKBONES[backbone_name]['description']}")
    print(f"{'='*60}")
    # Fixed seed so the inter-class pair sampling below is reproducible.
    np.random.seed(RANDOM_SEED)
    # Map features to accountants; rows without an assignment are dropped.
    accountants = []
    valid_indices = []
    for i, fn in enumerate(filenames):
        acct = filename_to_acct.get(fn)
        if acct:
            accountants.append(acct)
            valid_indices.append(i)
    valid_features = features[valid_indices]
    print(f" Valid signatures with accountant: {len(valid_indices):,}")
    # Group row indices (into valid_features) by accountant.
    acct_groups = defaultdict(list)
    for i, acct in enumerate(accountants):
        acct_groups[acct].append(i)
    # --- Intra-class ---
    # All same-accountant pairs (upper triangle of each Gram matrix).
    print(" Computing intra-class similarities...")
    intra_sims = []
    for acct, indices in tqdm(acct_groups.items(), desc=" Intra-class", leave=False):
        if len(indices) < INTRA_CLASS_MIN_SIGNATURES:
            continue
        vecs = valid_features[indices]
        # With L2-normalized rows the Gram matrix holds cosine similarities.
        sim_matrix = vecs @ vecs.T
        n = len(indices)
        triu_idx = np.triu_indices(n, k=1)
        intra_sims.extend(sim_matrix[triu_idx].tolist())
    intra_sims = np.array(intra_sims)
    print(f" Intra-class pairs: {len(intra_sims):,}")
    # --- Inter-class ---
    # Monte Carlo sample of cross-accountant pairs (exhaustive would be huge).
    print(" Computing inter-class similarities...")
    all_acct_list = list(acct_groups.keys())
    inter_sims = []
    for _ in range(INTER_CLASS_SAMPLE_SIZE):
        # Two distinct accountants, then one signature from each.
        a1, a2 = np.random.choice(len(all_acct_list), 2, replace=False)
        i1 = np.random.choice(acct_groups[all_acct_list[a1]])
        i2 = np.random.choice(acct_groups[all_acct_list[a2]])
        sim = float(valid_features[i1] @ valid_features[i2])
        inter_sims.append(sim)
    inter_sims = np.array(inter_sims)
    print(f" Inter-class pairs: {len(inter_sims):,}")
    # --- Firm A (known replication) ---
    # Intra-accountant pairs restricted to the known-replication firm; the
    # cutoff here is 2 (any pair), looser than INTRA_CLASS_MIN_SIGNATURES.
    print(f" Computing Firm A ({FIRM_A_NAME}) distribution...")
    firm_a_accts = [acct for acct in acct_groups if acct_firm.get(acct) == FIRM_A_NAME]
    firm_a_sims = []
    for acct in firm_a_accts:
        indices = acct_groups[acct]
        if len(indices) < 2:
            continue
        vecs = valid_features[indices]
        sim_matrix = vecs @ vecs.T
        n = len(indices)
        triu_idx = np.triu_indices(n, k=1)
        firm_a_sims.extend(sim_matrix[triu_idx].tolist())
    firm_a_sims = np.array(firm_a_sims) if firm_a_sims else np.array([])
    print(f" Firm A accountants: {len(firm_a_accts)}, pairs: {len(firm_a_sims):,}")
    # --- Statistics ---
    def dist_stats(arr, name):
        # Summary statistics for one similarity distribution.
        return {
            "name": name,
            "n": len(arr),
            "mean": float(np.mean(arr)),
            "std": float(np.std(arr)),
            "median": float(np.median(arr)),
            "p1": float(np.percentile(arr, 1)),
            "p5": float(np.percentile(arr, 5)),
            "p25": float(np.percentile(arr, 25)),
            "p75": float(np.percentile(arr, 75)),
            "p95": float(np.percentile(arr, 95)),
            "p99": float(np.percentile(arr, 99)),
            "min": float(np.min(arr)),
            "max": float(np.max(arr)),
        }
    intra_stats = dist_stats(intra_sims, "intra")
    inter_stats = dist_stats(inter_sims, "inter")
    firm_a_stats = dist_stats(firm_a_sims, "firm_a") if len(firm_a_sims) > 0 else None
    # Cohen's d: standardized gap between intra and inter mean similarity.
    pooled_std = np.sqrt((intra_stats["std"]**2 + inter_stats["std"]**2) / 2)
    cohens_d = (intra_stats["mean"] - inter_stats["mean"]) / pooled_std if pooled_std > 0 else 0
    # KDE crossover: intersection of the intra/inter density estimates;
    # keeps the highest crossover inside (0.5, 1.0).
    try:
        from scipy.stats import gaussian_kde
        x_grid = np.linspace(0, 1, 1000)
        kde_intra = gaussian_kde(intra_sims)
        kde_inter = gaussian_kde(intra_sims) if False else gaussian_kde(inter_sims)
        diff = kde_intra(x_grid) - kde_inter(x_grid)
        sign_changes = np.where(np.diff(np.sign(diff)))[0]
        crossovers = x_grid[sign_changes]
        valid_crossovers = crossovers[(crossovers > 0.5) & (crossovers < 1.0)]
        kde_crossover = float(valid_crossovers[-1]) if len(valid_crossovers) > 0 else None
    except Exception as e:
        # Degenerate data or missing scipy: report without the crossover.
        print(f" KDE crossover computation failed: {e}")
        kde_crossover = None
    results = {
        "backbone": backbone_name,
        "description": BACKBONES[backbone_name]["description"],
        "feature_dim": BACKBONES[backbone_name]["feature_dim"],
        "intra": intra_stats,
        "inter": inter_stats,
        "firm_a": firm_a_stats,
        "cohens_d": float(cohens_d),
        "kde_crossover": kde_crossover,
    }
    # Print summary
    print(f"\n --- {backbone_name} Summary ---")
    print(f" Feature dim: {results['feature_dim']}")
    print(f" Intra mean: {intra_stats['mean']:.4f} +/- {intra_stats['std']:.4f}")
    print(f" Inter mean: {inter_stats['mean']:.4f} +/- {inter_stats['std']:.4f}")
    print(f" Cohen's d: {cohens_d:.4f}")
    print(f" KDE crossover: {kde_crossover}")
    if firm_a_stats:
        print(f" Firm A mean: {firm_a_stats['mean']:.4f} +/- {firm_a_stats['std']:.4f}")
        print(f" Firm A 1st pct: {firm_a_stats['p1']:.4f}")
    return results
def generate_comparison_table(all_results):
    """Generate a markdown comparison table.

    Prints the table, writes ablation_comparison.md and ablation_results.json
    into OUTPUT_DIR, and returns the markdown table string.

    Args:
        all_results: list of per-backbone result dicts from analyze_backbone.

    Returns:
        The markdown table (header, separator, and metric rows) as a string.
    """
    print(f"\n{'='*60}")
    print("COMPARISON TABLE")
    print(f"{'='*60}\n")
    results_by_name = {r["backbone"]: r for r in all_results}

    def get_val(backbone, key, sub=None):
        # Fetch a top-level metric, or one nested under "intra"/"inter"/"firm_a".
        # Returns None when the backbone or metric is missing.
        r = results_by_name.get(backbone)
        if not r:
            return None
        if sub:
            section = r.get(sub)
            if isinstance(section, dict):
                return section.get(key)
            return None
        return r.get(key)

    def fmt(val, fmt_str=".4f"):
        # "---" for missing values; ints verbatim; everything else via fmt_str.
        if val is None:
            return "---"
        if isinstance(val, int):
            return str(val)
        return f"{val:{fmt_str}}"

    names = ["resnet50", "vgg16", "efficientnet_b0"]
    header = "| Metric | ResNet-50 | VGG-16 | EfficientNet-B0 |"
    sep = "|--------|-----------|--------|-----------------|"
    # (row label, metric key, nested section or None, format spec, bold cells).
    # Generated in a loop instead of nine hand-written f-strings, which also
    # fixes `names` being defined but never used.
    row_specs = [
        ("Feature dim", "feature_dim", None, "", False),
        ("Intra mean", "mean", "intra", ".4f", False),
        ("Intra std", "std", "intra", ".4f", False),
        ("Inter mean", "mean", "inter", ".4f", False),
        ("Inter std", "std", "inter", ".4f", False),
        ("**Cohen's d**", "cohens_d", None, ".4f", True),
        ("KDE crossover", "kde_crossover", None, ".4f", False),
        ("Firm A mean", "mean", "firm_a", ".4f", False),
        ("Firm A 1st pct", "p1", "firm_a", ".4f", False),
    ]
    rows = []
    for label, key, sub, fmt_str, bold in row_specs:
        cells = [fmt(get_val(name, key, sub), fmt_str) for name in names]
        if bold:
            cells = [f"**{c}**" for c in cells]
        rows.append(f"| {label} | {cells[0]} | {cells[1]} | {cells[2]} |")
    table = "\n".join([header, sep] + rows)
    print(table)
    # Save report
    report_path = OUTPUT_DIR / "ablation_comparison.md"
    with open(report_path, 'w') as f:
        f.write("# Ablation Study: Backbone Comparison\n\n")
        f.write(f"Date: {time.strftime('%Y-%m-%d %H:%M')}\n\n")
        f.write("## Comparison Table\n\n")
        f.write(table + "\n\n")
        f.write("## Interpretation\n\n")
        f.write("- **Cohen's d**: Higher = better separation between same-CPA and different-CPA signatures\n")
        f.write("- **KDE crossover**: The Bayes-optimal decision boundary (higher = easier to classify)\n")
        f.write("- **Firm A**: Known replication firm; expect very high mean similarity\n")
        f.write("- **Firm A 1st percentile**: Lower bound of known-replication similarity\n")
    json_path = OUTPUT_DIR / "ablation_results.json"
    with open(json_path, 'w') as f:
        json.dump(all_results, f, indent=2, ensure_ascii=False)
    print(f"\n Report saved: {report_path}")
    print(f" Raw data saved: {json_path}")
    return table
def main():
    """CLI entry point: run feature extraction and/or analysis per backbone.

    Phase 1 (skipped with --analyze) extracts features for each requested
    backbone, reusing an existing ResNet-50 feature file when available.
    Phase 2 (skipped with --extract) loads the cached features and computes
    the comparison statistics.
    """
    parser = argparse.ArgumentParser(description="Ablation: backbone comparison")
    parser.add_argument("--extract", action="store_true", help="Feature extraction only")
    parser.add_argument("--analyze", action="store_true", help="Analysis only")
    # choices= rejects typos at parse time instead of raising KeyError later
    # inside extract_features/analyze_backbone.
    parser.add_argument("--backbone", type=str, choices=list(BACKBONES.keys()),
                        help="Run single backbone (resnet50/vgg16/efficientnet_b0)")
    args = parser.parse_args()
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    # Load filenames (one per line; blank lines skipped). Row order of every
    # feature matrix follows this list.
    with open(FILENAMES_PATH) as f:
        filenames = [line.strip() for line in f if line.strip()]
    backbones_to_run = [args.backbone] if args.backbone else list(BACKBONES.keys())
    if not args.analyze:
        # === Phase 1: Feature Extraction ===
        print("\n" + "=" * 60)
        print("PHASE 1: FEATURE EXTRACTION")
        print("=" * 60)
        # For ResNet-50, copy existing features instead of re-extracting
        resnet_ablation_path = OUTPUT_DIR / "features_resnet50.npy"
        resnet_existing_path = FEATURES_DIR / "signature_features.npy"
        if "resnet50" in backbones_to_run and not resnet_ablation_path.exists() and resnet_existing_path.exists():
            print(f"\nCopying existing ResNet-50 features...")
            import shutil
            resnet_ablation_path.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(resnet_existing_path, resnet_ablation_path)
            print(f" Copied: {resnet_ablation_path}")
        for name in backbones_to_run:
            if name == "resnet50" and resnet_ablation_path.exists():
                continue  # already copied or extracted
            extract_features(name)
        if args.extract:
            print("\nFeature extraction complete. Run with --analyze to compute statistics.")
            return
    # === Phase 2: Analysis ===
    print("\n" + "=" * 60)
    print("PHASE 2: ANALYSIS")
    print("=" * 60)
    filename_to_acct, acct_firm = load_accountant_data()
    all_results = []
    for name in backbones_to_run:
        feat_path = OUTPUT_DIR / f"features_{name}.npy"
        if not feat_path.exists():
            print(f"\n WARNING: {feat_path} not found, skipping {name}")
            continue
        features = np.load(feat_path)
        results = analyze_backbone(name, features, filenames, filename_to_acct, acct_firm)
        all_results.append(results)
    if len(all_results) > 1:
        generate_comparison_table(all_results)
    elif len(all_results) == 1:
        print(f"\nOnly one backbone analyzed. Run all three for comparison table.")
    print("\nDone!")


if __name__ == "__main__":
    main()