Add Paper A (IEEE TAI) complete draft with Firm A-calibrated dual-method classification
Paper draft includes all sections (Abstract through Conclusion), 36 references, and supporting scripts. Key methodology: Cosine similarity + dHash dual-method verification with thresholds calibrated against known-replication firm (Firm A). Includes: - 8 section markdown files (paper_a_*.md) - Ablation study script (ResNet-50 vs VGG-16 vs EfficientNet-B0) - Recalibrated classification script (84,386 PDFs, 5-tier system) - Figure generation and Word export scripts - Citation renumbering script ([1]-[36]) - Signature analysis pipeline (12 steps) - YOLO extraction scripts Three rounds of AI review completed (GPT-5.4, Claude Opus 4.6, Gemini 3 Pro). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,493 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Ablation Study: Backbone Comparison for Signature Feature Extraction
|
||||
====================================================================
|
||||
Compares ResNet-50 vs VGG-16 vs EfficientNet-B0 on:
|
||||
1. Feature extraction speed
|
||||
2. Intra/Inter class cosine similarity separation (Cohen's d)
|
||||
3. KDE crossover point
|
||||
4. Firm A (known replication) distribution
|
||||
|
||||
Usage:
|
||||
python ablation_backbone_comparison.py # Run all backbones
|
||||
python ablation_backbone_comparison.py --extract # Feature extraction only
|
||||
python ablation_backbone_comparison.py --analyze # Analysis only (features must exist)
|
||||
"""
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torchvision.models as models
|
||||
import torchvision.transforms as transforms
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
import numpy as np
|
||||
import sqlite3
|
||||
import time
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
from tqdm import tqdm
|
||||
import warnings
|
||||
warnings.filterwarnings('ignore')
|
||||
|
||||
# === Configuration ===
# Input/output locations on the external processing volume.
IMAGES_DIR = Path("/Volumes/NV2/PDF-Processing/yolo-signatures/images")
FEATURES_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis/features")
DB_PATH = Path("/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db")
OUTPUT_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis/ablation")
# One image filename per line; row order defines feature-matrix row order.
FILENAMES_PATH = FEATURES_DIR / "signature_filenames.txt"

BATCH_SIZE = 64
NUM_WORKERS = 4
# Device preference: Apple Metal (MPS) first, then CUDA, then CPU fallback.
DEVICE = torch.device("mps" if torch.backends.mps.is_available() else
                      "cuda" if torch.cuda.is_available() else "cpu")

# Sampling for analysis
INTER_CLASS_SAMPLE_SIZE = 500_000   # random cross-accountant pairs to draw
INTRA_CLASS_MIN_SIGNATURES = 3      # skip accountants with fewer signatures
RANDOM_SEED = 42                    # fixed seed for reproducible sampling

# Known replication firm (Deloitte Taiwan = 勤業眾信)
FIRM_A_NAME = "勤業眾信聯合"

# Backbone registry: pretrained-model factory, raw feature dimensionality
# (after the classifier head is removed), and a display label.
BACKBONES = {
    "resnet50": {
        "model_fn": lambda: models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2),
        "feature_dim": 2048,
        "description": "ResNet-50 (ImageNet1K_V2)",
    },
    "vgg16": {
        "model_fn": lambda: models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1),
        "feature_dim": 4096,
        "description": "VGG-16 (ImageNet1K_V1)",
    },
    "efficientnet_b0": {
        "model_fn": lambda: models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.IMAGENET1K_V1),
        "feature_dim": 1280,
        "description": "EfficientNet-B0 (ImageNet1K_V1)",
    },
}
||||
class SignatureDataset(Dataset):
    """Dataset yielding (image, filename) pairs for signature crops.

    Images are loaded with OpenCV, converted to RGB, letterboxed onto a
    white 224x224 canvas, then passed through `transform` if given.
    Unreadable files are replaced by a blank white canvas so a bad image
    never aborts a batch.
    """

    def __init__(self, image_paths, transform=None):
        self.image_paths = image_paths
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        import cv2
        path = self.image_paths[idx]
        raw = cv2.imread(str(path))
        if raw is None:
            # Unreadable / corrupt file: substitute an all-white placeholder.
            image = np.full((224, 224, 3), 255, dtype=np.uint8)
        else:
            rgb = cv2.cvtColor(raw, cv2.COLOR_BGR2RGB)
            image = self._resize_with_padding(rgb, 224, 224)
        if self.transform:
            image = self.transform(image)
        return image, str(path.name)

    @staticmethod
    def _resize_with_padding(img, target_w, target_h):
        # Aspect-preserving resize, centered on a white canvas.
        import cv2
        h, w = img.shape[:2]
        ratio = min(target_w / w, target_h / h)
        rw, rh = int(w * ratio), int(h * ratio)
        shrunk = cv2.resize(img, (rw, rh), interpolation=cv2.INTER_AREA)
        canvas = np.full((target_h, target_w, 3), 255, dtype=np.uint8)
        left = (target_w - rw) // 2
        top = (target_h - rh) // 2
        canvas[top:top + rh, left:left + rw] = shrunk
        return canvas
||||
def build_feature_extractor(backbone_name):
    """Build a feature extractor for the given backbone.

    Loads the pretrained model from BACKBONES and strips its classifier so
    the forward pass emits raw feature vectors, then moves it to DEVICE in
    eval mode.

    Args:
        backbone_name: One of the keys of BACKBONES.

    Returns:
        An `nn.Module` producing per-image feature tensors.
    """
    net = BACKBONES[backbone_name]["model_fn"]()

    if backbone_name == "vgg16":
        # Keep the conv stack, pooling, and all classifier layers except the
        # final Linear, so the output is the 4096-dim penultimate activation.
        trimmed_classifier = nn.Sequential(*list(net.classifier.children())[:-1])
        net = nn.Sequential(
            net.features,
            net.avgpool,
            nn.Flatten(1),  # same as torch.flatten(x, 1) before the classifier
            trimmed_classifier,
        )
    elif backbone_name == "resnet50":
        # Drop the final fc layer; global avgpool output is 2048-dim.
        net = nn.Sequential(*list(net.children())[:-1])
    elif backbone_name == "efficientnet_b0":
        # Replace the classifier head with identity; output is 1280-dim.
        net.classifier = nn.Identity()

    net = net.to(DEVICE)
    net.eval()
    return net
||||
def extract_features(backbone_name):
    """Extract features for all signatures using the given backbone.

    Reads the filename list from FILENAMES_PATH, runs batched inference,
    L2-normalizes each feature vector, and caches the (N, D) matrix as
    features_<backbone>.npy under OUTPUT_DIR. If the cache file already
    exists it is loaded and returned without re-extracting.

    Args:
        backbone_name: One of the keys of BACKBONES.

    Returns:
        np.ndarray of shape (num_images, feature_dim), rows L2-normalized.
    """
    print(f"\n{'='*60}")
    print(f"Extracting features: {BACKBONES[backbone_name]['description']}")
    print(f"{'='*60}")

    output_path = OUTPUT_DIR / f"features_{backbone_name}.npy"
    if output_path.exists():
        print(f" Features already exist: {output_path}")
        print(f" Skipping extraction. Delete file to re-extract.")
        return np.load(output_path)

    # Filename order defines the feature-matrix row order.
    with open(FILENAMES_PATH) as handle:
        filenames = [row.strip() for row in handle if row.strip()]
    print(f" Images: {len(filenames):,}")

    paths = [IMAGES_DIR / name for name in filenames]

    model = build_feature_extractor(backbone_name)

    # Standard ImageNet normalization (matches pretrained-weight statistics).
    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    loader = DataLoader(
        SignatureDataset(paths, transform=preprocess),
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=NUM_WORKERS,
        pin_memory=True,
    )

    chunks = []
    t0 = time.time()
    with torch.no_grad():
        for batch, _ in tqdm(loader, desc=f" {backbone_name}"):
            out = model(batch.to(DEVICE))
            out = out.view(out.size(0), -1)                  # flatten to (B, D)
            out = nn.functional.normalize(out, p=2, dim=1)   # unit-length rows
            chunks.append(out.cpu().numpy())

    elapsed = time.time() - t0
    matrix = np.vstack(chunks)

    print(f" Feature shape: {matrix.shape}")
    print(f" Time: {elapsed:.1f}s ({elapsed/60:.1f}min)")
    print(f" Speed: {len(filenames)/elapsed:.1f} images/sec")

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    np.save(output_path, matrix)
    print(f" Saved: {output_path} ({matrix.nbytes / 1e9:.2f} GB)")

    return matrix
||||
def load_accountant_data():
    """Load accountant assignments and firm info from DB.

    Returns:
        Tuple of:
        - filename_to_acct: dict mapping image filename -> assigned
          accountant name (only signatures with a stored feature vector).
        - acct_firm: dict mapping accountant name -> firm name.

    Raises:
        sqlite3.Error: On connection or query failure.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cur = conn.cursor()

        cur.execute('''
            SELECT image_filename, assigned_accountant
            FROM signatures
            WHERE feature_vector IS NOT NULL
            ORDER BY signature_id
        ''')
        filename_to_acct = {r[0]: r[1] for r in cur.fetchall()}

        cur.execute('SELECT name, firm FROM accountants')
        acct_firm = {r[0]: r[1] for r in cur.fetchall()}
    finally:
        # Close even when a query raises, so the connection never leaks.
        conn.close()

    return filename_to_acct, acct_firm
||||
def analyze_backbone(backbone_name, features, filenames, filename_to_acct, acct_firm):
    """Compute intra/inter class stats for a backbone's features.

    Args:
        backbone_name: Key into BACKBONES (used for labels in output).
        features: (N, D) feature matrix, row-aligned with `filenames`;
            rows are L2-normalized upstream, so dot products are cosine
            similarities.
        filenames: Image filenames in feature-row order.
        filename_to_acct: Mapping image filename -> assigned accountant.
        acct_firm: Mapping accountant name -> firm name.

    Returns:
        dict with distribution statistics for intra-class, inter-class and
        Firm A similarities, plus Cohen's d and the KDE crossover point.
    """
    print(f"\n{'='*60}")
    print(f"Analyzing: {BACKBONES[backbone_name]['description']}")
    print(f"{'='*60}")

    # Fixed seed: the exact sequence of np.random calls below determines
    # which inter-class pairs are sampled, keeping runs reproducible.
    np.random.seed(RANDOM_SEED)

    # Keep only signatures that have a known accountant assignment.
    accountants = []
    valid_indices = []
    for i, fn in enumerate(filenames):
        acct = filename_to_acct.get(fn)
        if acct:
            accountants.append(acct)
            valid_indices.append(i)

    valid_features = features[valid_indices]
    print(f" Valid signatures with accountant: {len(valid_indices):,}")

    # Group row indices (into valid_features) by accountant.
    acct_groups = defaultdict(list)
    for i, acct in enumerate(accountants):
        acct_groups[acct].append(i)

    # --- Intra-class ---
    # All pairwise similarities within each accountant's own signatures.
    print(" Computing intra-class similarities...")
    intra_sims = []
    for acct, indices in tqdm(acct_groups.items(), desc=" Intra-class", leave=False):
        if len(indices) < INTRA_CLASS_MIN_SIGNATURES:
            continue
        vecs = valid_features[indices]
        sim_matrix = vecs @ vecs.T
        n = len(indices)
        # Strict upper triangle: each unordered pair once, no self-pairs.
        triu_idx = np.triu_indices(n, k=1)
        intra_sims.extend(sim_matrix[triu_idx].tolist())

    intra_sims = np.array(intra_sims)
    print(f" Intra-class pairs: {len(intra_sims):,}")

    # --- Inter-class ---
    # Monte Carlo sample of cross-accountant pairs; the full cross product
    # would be far too large to enumerate.
    print(" Computing inter-class similarities...")
    all_acct_list = list(acct_groups.keys())
    inter_sims = []
    for _ in range(INTER_CLASS_SAMPLE_SIZE):
        # Two distinct accountants, then one random signature from each.
        a1, a2 = np.random.choice(len(all_acct_list), 2, replace=False)
        i1 = np.random.choice(acct_groups[all_acct_list[a1]])
        i2 = np.random.choice(acct_groups[all_acct_list[a2]])
        sim = float(valid_features[i1] @ valid_features[i2])
        inter_sims.append(sim)
    inter_sims = np.array(inter_sims)
    print(f" Inter-class pairs: {len(inter_sims):,}")

    # --- Firm A (known replication) ---
    # Within-accountant similarities restricted to the known-replication firm.
    print(f" Computing Firm A ({FIRM_A_NAME}) distribution...")
    firm_a_accts = [acct for acct in acct_groups if acct_firm.get(acct) == FIRM_A_NAME]
    firm_a_sims = []
    for acct in firm_a_accts:
        indices = acct_groups[acct]
        if len(indices) < 2:
            continue
        vecs = valid_features[indices]
        sim_matrix = vecs @ vecs.T
        n = len(indices)
        triu_idx = np.triu_indices(n, k=1)
        firm_a_sims.extend(sim_matrix[triu_idx].tolist())
    firm_a_sims = np.array(firm_a_sims) if firm_a_sims else np.array([])
    print(f" Firm A accountants: {len(firm_a_accts)}, pairs: {len(firm_a_sims):,}")

    # --- Statistics ---
    def dist_stats(arr, name):
        # Summary statistics of a similarity distribution.
        return {
            "name": name,
            "n": len(arr),
            "mean": float(np.mean(arr)),
            "std": float(np.std(arr)),
            "median": float(np.median(arr)),
            "p1": float(np.percentile(arr, 1)),
            "p5": float(np.percentile(arr, 5)),
            "p25": float(np.percentile(arr, 25)),
            "p75": float(np.percentile(arr, 75)),
            "p95": float(np.percentile(arr, 95)),
            "p99": float(np.percentile(arr, 99)),
            "min": float(np.min(arr)),
            "max": float(np.max(arr)),
        }

    intra_stats = dist_stats(intra_sims, "intra")
    inter_stats = dist_stats(inter_sims, "inter")
    firm_a_stats = dist_stats(firm_a_sims, "firm_a") if len(firm_a_sims) > 0 else None

    # Cohen's d: standardized separation between intra and inter means
    # (pooled-SD form); 0 when the pooled SD degenerates.
    pooled_std = np.sqrt((intra_stats["std"]**2 + inter_stats["std"]**2) / 2)
    cohens_d = (intra_stats["mean"] - inter_stats["mean"]) / pooled_std if pooled_std > 0 else 0

    # KDE crossover: where the intra and inter density estimates intersect;
    # restricted to (0.5, 1.0), taking the last crossing as the boundary.
    try:
        from scipy.stats import gaussian_kde
        x_grid = np.linspace(0, 1, 1000)
        kde_intra = gaussian_kde(intra_sims)
        kde_inter = gaussian_kde(inter_sims)
        diff = kde_intra(x_grid) - kde_inter(x_grid)
        sign_changes = np.where(np.diff(np.sign(diff)))[0]
        crossovers = x_grid[sign_changes]
        valid_crossovers = crossovers[(crossovers > 0.5) & (crossovers < 1.0)]
        kde_crossover = float(valid_crossovers[-1]) if len(valid_crossovers) > 0 else None
    except Exception as e:
        # Best-effort metric: e.g. gaussian_kde fails on degenerate samples.
        print(f" KDE crossover computation failed: {e}")
        kde_crossover = None

    results = {
        "backbone": backbone_name,
        "description": BACKBONES[backbone_name]["description"],
        "feature_dim": BACKBONES[backbone_name]["feature_dim"],
        "intra": intra_stats,
        "inter": inter_stats,
        "firm_a": firm_a_stats,
        "cohens_d": float(cohens_d),
        "kde_crossover": kde_crossover,
    }

    # Print summary
    print(f"\n --- {backbone_name} Summary ---")
    print(f" Feature dim: {results['feature_dim']}")
    print(f" Intra mean: {intra_stats['mean']:.4f} +/- {intra_stats['std']:.4f}")
    print(f" Inter mean: {inter_stats['mean']:.4f} +/- {inter_stats['std']:.4f}")
    print(f" Cohen's d: {cohens_d:.4f}")
    print(f" KDE crossover: {kde_crossover}")
    if firm_a_stats:
        print(f" Firm A mean: {firm_a_stats['mean']:.4f} +/- {firm_a_stats['std']:.4f}")
        print(f" Firm A 1st pct: {firm_a_stats['p1']:.4f}")

    return results
||||
def generate_comparison_table(all_results):
    """Generate a markdown comparison table.

    Prints the table, writes a markdown report and a JSON dump of the raw
    results to OUTPUT_DIR.

    Args:
        all_results: List of result dicts as returned by analyze_backbone().

    Returns:
        The markdown table as a string.
    """
    print(f"\n{'='*60}")
    print("COMPARISON TABLE")
    print(f"{'='*60}\n")

    results_by_name = {r["backbone"]: r for r in all_results}

    def get_val(backbone, key, sub=None):
        # Fetch a metric from one backbone's results. `sub` selects a nested
        # section ("intra"/"inter"/"firm_a"). Returns None when the backbone
        # was skipped or the section is absent (e.g. firm_a is None).
        r = results_by_name.get(backbone)
        if not r:
            return None
        if sub:
            section = r.get(sub)
            if isinstance(section, dict):
                return section.get(key)
            return None
        return r.get(key)

    def fmt(val, fmt_str=".4f"):
        # Render a table cell; "---" marks missing values, ints print as-is.
        if val is None:
            return "---"
        if isinstance(val, int):
            return str(val)
        return f"{val:{fmt_str}}"

    header = "| Metric | ResNet-50 | VGG-16 | EfficientNet-B0 |"
    sep = "|--------|-----------|--------|-----------------|"

    rows = [
        f"| Feature dim | {fmt(get_val('resnet50','feature_dim'),'')} | {fmt(get_val('vgg16','feature_dim'),'')} | {fmt(get_val('efficientnet_b0','feature_dim'),'')} |",
        f"| Intra mean | {fmt(get_val('resnet50','mean','intra'))} | {fmt(get_val('vgg16','mean','intra'))} | {fmt(get_val('efficientnet_b0','mean','intra'))} |",
        f"| Intra std | {fmt(get_val('resnet50','std','intra'))} | {fmt(get_val('vgg16','std','intra'))} | {fmt(get_val('efficientnet_b0','std','intra'))} |",
        f"| Inter mean | {fmt(get_val('resnet50','mean','inter'))} | {fmt(get_val('vgg16','mean','inter'))} | {fmt(get_val('efficientnet_b0','mean','inter'))} |",
        f"| Inter std | {fmt(get_val('resnet50','std','inter'))} | {fmt(get_val('vgg16','std','inter'))} | {fmt(get_val('efficientnet_b0','std','inter'))} |",
        f"| **Cohen's d** | **{fmt(get_val('resnet50','cohens_d'))}** | **{fmt(get_val('vgg16','cohens_d'))}** | **{fmt(get_val('efficientnet_b0','cohens_d'))}** |",
        f"| KDE crossover | {fmt(get_val('resnet50','kde_crossover'))} | {fmt(get_val('vgg16','kde_crossover'))} | {fmt(get_val('efficientnet_b0','kde_crossover'))} |",
        f"| Firm A mean | {fmt(get_val('resnet50','mean','firm_a'))} | {fmt(get_val('vgg16','mean','firm_a'))} | {fmt(get_val('efficientnet_b0','mean','firm_a'))} |",
        f"| Firm A 1st pct | {fmt(get_val('resnet50','p1','firm_a'))} | {fmt(get_val('vgg16','p1','firm_a'))} | {fmt(get_val('efficientnet_b0','p1','firm_a'))} |",
    ]

    table = "\n".join([header, sep] + rows)
    print(table)

    # Save report. Explicit UTF-8: the JSON dump uses ensure_ascii=False and
    # the data contains non-ASCII firm/accountant names, which would fail on
    # a non-UTF-8 platform default encoding.
    report_path = OUTPUT_DIR / "ablation_comparison.md"
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write("# Ablation Study: Backbone Comparison\n\n")
        f.write(f"Date: {time.strftime('%Y-%m-%d %H:%M')}\n\n")
        f.write("## Comparison Table\n\n")
        f.write(table + "\n\n")
        f.write("## Interpretation\n\n")
        f.write("- **Cohen's d**: Higher = better separation between same-CPA and different-CPA signatures\n")
        f.write("- **KDE crossover**: The Bayes-optimal decision boundary (higher = easier to classify)\n")
        f.write("- **Firm A**: Known replication firm; expect very high mean similarity\n")
        f.write("- **Firm A 1st percentile**: Lower bound of known-replication similarity\n")

    json_path = OUTPUT_DIR / "ablation_results.json"
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, indent=2, ensure_ascii=False)

    print(f"\n Report saved: {report_path}")
    print(f" Raw data saved: {json_path}")

    return table
||||
def main():
    """CLI entry point: run feature extraction and/or analysis phases."""
    parser = argparse.ArgumentParser(description="Ablation: backbone comparison")
    parser.add_argument("--extract", action="store_true", help="Feature extraction only")
    parser.add_argument("--analyze", action="store_true", help="Analysis only")
    parser.add_argument("--backbone", type=str, help="Run single backbone (resnet50/vgg16/efficientnet_b0)")
    args = parser.parse_args()

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Filename order defines feature-matrix row order everywhere downstream.
    with open(FILENAMES_PATH) as handle:
        filenames = [entry.strip() for entry in handle if entry.strip()]

    selected = [args.backbone] if args.backbone else list(BACKBONES.keys())

    if not args.analyze:
        # === Phase 1: Feature Extraction ===
        print("\n" + "=" * 60)
        print("PHASE 1: FEATURE EXTRACTION")
        print("=" * 60)

        # For ResNet-50, reuse previously extracted features when available
        # instead of re-running inference.
        ablation_resnet = OUTPUT_DIR / "features_resnet50.npy"
        existing_resnet = FEATURES_DIR / "signature_features.npy"
        if ("resnet50" in selected
                and not ablation_resnet.exists()
                and existing_resnet.exists()):
            print(f"\nCopying existing ResNet-50 features...")
            import shutil
            ablation_resnet.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(existing_resnet, ablation_resnet)
            print(f" Copied: {ablation_resnet}")

        for backbone in selected:
            if backbone == "resnet50" and ablation_resnet.exists():
                continue
            extract_features(backbone)

        if args.extract:
            print("\nFeature extraction complete. Run with --analyze to compute statistics.")
            return

    # === Phase 2: Analysis ===
    print("\n" + "=" * 60)
    print("PHASE 2: ANALYSIS")
    print("=" * 60)

    filename_to_acct, acct_firm = load_accountant_data()

    collected = []
    for backbone in selected:
        feat_path = OUTPUT_DIR / f"features_{backbone}.npy"
        if not feat_path.exists():
            print(f"\n WARNING: {feat_path} not found, skipping {backbone}")
            continue
        matrix = np.load(feat_path)
        collected.append(
            analyze_backbone(backbone, matrix, filenames, filename_to_acct, acct_firm))

    if len(collected) > 1:
        generate_comparison_table(collected)
    elif len(collected) == 1:
        print(f"\nOnly one backbone analyzed. Run all three for comparison table.")

    print("\nDone!")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user