Add Paper A (IEEE TAI) complete draft with Firm A-calibrated dual-method classification
Paper draft includes all sections (Abstract through Conclusion), 36 references, and supporting scripts. Key methodology: Cosine similarity + dHash dual-method verification with thresholds calibrated against known-replication firm (Firm A). Includes: - 8 section markdown files (paper_a_*.md) - Ablation study script (ResNet-50 vs VGG-16 vs EfficientNet-B0) - Recalibrated classification script (84,386 PDFs, 5-tier system) - Figure generation and Word export scripts - Citation renumbering script ([1]-[36]) - Signature analysis pipeline (12 steps) - YOLO extraction scripts Three rounds of AI review completed (GPT-5.4, Claude Opus 4.6, Gemini 3 Pro). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,319 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Compute SSIM and pHash for all signature pairs (closest match per accountant).
|
||||
Uses multiprocessing for parallel image loading and computation.
|
||||
Saves results to database and outputs complete CSV.
|
||||
"""
|
||||
|
||||
import sqlite3
|
||||
import numpy as np
|
||||
import cv2
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import csv
|
||||
import time
|
||||
from datetime import datetime
|
||||
from collections import defaultdict
|
||||
from multiprocessing import Pool, cpu_count
|
||||
from pathlib import Path
|
||||
|
||||
# --- Configuration -------------------------------------------------------
# SQLite database holding signature metadata and extracted feature vectors.
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
# Directory of cropped signature images (filenames referenced by the DB).
IMAGE_DIR = '/Volumes/NV2/PDF-Processing/yolo-signatures/images'
# Final per-signature report produced in step 4 of main().
OUTPUT_CSV = '/Volumes/NV2/PDF-Processing/signature-analysis/reports/complete_pdf_report.csv'
# JSON checkpoint so an interrupted run can resume without recomputing pairs.
CHECKPOINT_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/ssim_checkpoint.json'
NUM_WORKERS = max(1, cpu_count() - 2)  # Leave 2 cores free
# Results are flushed to the DB (and the checkpoint) every BATCH_SIZE pairs.
BATCH_SIZE = 1000
|
||||
|
||||
def compute_phash(img, hash_size=8):
    """Compute a difference hash (dHash) of a grayscale image.

    NOTE: despite the function name, this is a dHash, not a DCT-based
    pHash — each bit records whether a pixel is brighter than its left
    neighbour after downscaling to (hash_size + 1) x hash_size.

    Args:
        img: 2-D grayscale image (numpy array) as produced by cv2.imread
            with IMREAD_GRAYSCALE.
        hash_size: Side length of the hash grid; the hash has
            hash_size ** 2 bits.

    Returns:
        Flat boolean numpy array of hash_size ** 2 bits.  Compare two
        hashes with ``int(np.sum(h1 != h2))`` (Hamming distance).
    """
    # Width hash_size + 1 so each row yields hash_size horizontal
    # left/right comparisons.
    resized = cv2.resize(img, (hash_size + 1, hash_size))
    diff = resized[:, 1:] > resized[:, :-1]
    return diff.flatten()
|
||||
|
||||
|
||||
def compute_pair_ssim(args):
    """Compute image-level similarity metrics for one signature pair.

    Designed to run inside ``Pool.imap_unordered``: it takes a single
    tuple argument and never raises — on any failure the affected
    metrics are simply left at None/False.

    Args:
        args: Tuple ``(sig_id, file1, file2, cosine_sim)`` where the
            files are names under IMAGE_DIR and ``cosine_sim`` is the
            precomputed feature-vector similarity for the pair.

    Returns:
        Dict with keys ``signature_id``, ``match_file``,
        ``cosine_similarity``, ``ssim``, ``phash_distance``,
        ``histogram_corr``, ``pixel_identical``.
    """
    sig_id, file1, file2, cosine_sim = args

    path1 = os.path.join(IMAGE_DIR, file1)
    path2 = os.path.join(IMAGE_DIR, file2)

    # Start with all metrics unset; they are filled in progressively so
    # a failure partway through still returns whatever was computed.
    result = {
        'signature_id': sig_id,
        'match_file': file2,
        'cosine_similarity': cosine_sim,
        'ssim': None,
        'phash_distance': None,
        'histogram_corr': None,
        'pixel_identical': False,
    }

    try:
        img1 = cv2.imread(path1, cv2.IMREAD_GRAYSCALE)
        img2 = cv2.imread(path2, cv2.IMREAD_GRAYSCALE)

        if img1 is None or img2 is None:
            return result  # missing/unreadable image: leave metrics unset

        # Resize both images to the smaller common dimensions so every
        # metric compares pixel grids of identical shape.
        h = min(img1.shape[0], img2.shape[0])
        w = min(img1.shape[1], img2.shape[1])
        if h < 3 or w < 3:
            return result  # too small for a meaningful comparison

        img1_r = cv2.resize(img1, (w, h))
        img2_r = cv2.resize(img2, (w, h))

        # Exact byte-for-byte match after resizing — the strongest
        # possible evidence of a duplicated image.
        result['pixel_identical'] = bool(np.array_equal(img1_r, img2_r))

        # SSIM (optional: skipped cleanly when scikit-image is absent
        # or the images are too small for a valid window).
        try:
            from skimage.metrics import structural_similarity as ssim
            # SSIM needs an odd window no larger than the image.
            win_size = min(7, min(h, w))
            if win_size % 2 == 0:
                win_size -= 1
            if win_size >= 3:
                result['ssim'] = float(ssim(img1_r, img2_r, win_size=win_size))
            else:
                result['ssim'] = None
        except Exception:
            result['ssim'] = None

        # Grayscale histogram correlation (HISTCMP_CORREL: 1.0 = identical).
        hist1 = cv2.calcHist([img1_r], [0], None, [256], [0, 256])
        hist2 = cv2.calcHist([img2_r], [0], None, [256], [0, 256])
        result['histogram_corr'] = float(cv2.compareHist(hist1, hist2, cv2.HISTCMP_CORREL))

        # dHash Hamming distance (0 = identical hashes).
        h1 = compute_phash(img1_r)
        h2 = compute_phash(img2_r)
        result['phash_distance'] = int(np.sum(h1 != h2))

    except Exception:
        # Deliberate best-effort: a worker must never crash the pool.
        # None-valued metrics signal the failure to downstream analysis.
        pass

    return result
|
||||
|
||||
|
||||
def load_checkpoint():
    """Load the set of already-processed signature IDs, if any.

    Returns:
        Set of signature IDs previously recorded by save_checkpoint();
        an empty set when no checkpoint exists or the file is corrupt
        (e.g. a previous run was killed mid-write).
    """
    if os.path.exists(CHECKPOINT_PATH):
        try:
            with open(CHECKPOINT_PATH, 'r') as f:
                data = json.load(f)
            return set(data.get('processed_ids', []))
        except (json.JSONDecodeError, OSError):
            # Corrupt or unreadable checkpoint: recompute from scratch
            # rather than aborting the whole run.
            return set()
    return set()
|
||||
|
||||
|
||||
def save_checkpoint(processed_ids):
    """Atomically persist the set of processed signature IDs.

    Writes to a temp file and then os.replace()s it over
    CHECKPOINT_PATH, so a crash mid-write can never leave a truncated
    JSON checkpoint behind.

    Args:
        processed_ids: Iterable of signature IDs already saved to the DB.
    """
    tmp_path = CHECKPOINT_PATH + '.tmp'
    with open(tmp_path, 'w') as f:
        json.dump({'processed_ids': list(processed_ids), 'timestamp': str(datetime.now())}, f)
    os.replace(tmp_path, CHECKPOINT_PATH)  # atomic rename on POSIX
|
||||
|
||||
|
||||
def main():
    """Run the full SSIM/pHash pipeline: load, match, compute, export.

    Steps:
      1. Load signature feature vectors from the SQLite database.
      2. For each signature, find its closest cosine match among the
         other signatures assigned to the same accountant.
      3. Compute SSIM/pHash/histogram metrics for every pair in a
         multiprocessing pool, flushing results to the DB and to a
         resume checkpoint every BATCH_SIZE pairs.
      4. Export the complete per-signature report to OUTPUT_CSV, then
         delete the checkpoint.
    """
    start_time = time.time()
    print("=" * 70)
    print("SSIM & pHash Computation for All Signature Pairs")
    print(f"Workers: {NUM_WORKERS}")
    print("=" * 70)

    # --- Step 1: Load data ---
    print("\n[1/4] Loading data from database...")
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()

    cur.execute('''
        SELECT signature_id, image_filename, assigned_accountant, feature_vector
        FROM signatures
        WHERE feature_vector IS NOT NULL AND assigned_accountant IS NOT NULL
    ''')
    rows = cur.fetchall()

    sig_ids = []
    filenames = []
    accountants = []
    features = []

    for row in rows:
        sig_ids.append(row[0])
        filenames.append(row[1])
        accountants.append(row[2])
        # Feature vectors are stored as raw float32 bytes.
        features.append(np.frombuffer(row[3], dtype=np.float32))

    features = np.array(features)
    print(f" Loaded {len(sig_ids)} signatures")

    # --- Step 2: Find closest match per signature ---
    print("\n[2/4] Finding closest match per signature (same accountant)...")
    acct_groups = defaultdict(list)
    for i, acct in enumerate(accountants):
        acct_groups[acct].append(i)

    # Load checkpoint so a restarted run skips finished signatures.
    processed_ids = load_checkpoint()
    print(f" Checkpoint: {len(processed_ids)} already processed")

    # Prepare tasks: one (sig, closest same-accountant sig) pair each.
    tasks = []
    for acct, indices in acct_groups.items():
        if len(indices) < 2:
            continue  # need at least two signatures to form a pair
        vecs = features[indices]
        # Dot product used as cosine similarity — assumes the stored
        # feature vectors are L2-normalized; TODO confirm upstream.
        sim_matrix = vecs @ vecs.T
        np.fill_diagonal(sim_matrix, -1)  # Exclude self

        for local_i, global_i in enumerate(indices):
            if sig_ids[global_i] in processed_ids:
                continue
            best_local = np.argmax(sim_matrix[local_i])
            best_global = indices[best_local]
            best_sim = float(sim_matrix[local_i, best_local])
            tasks.append((
                sig_ids[global_i],
                filenames[global_i],
                filenames[best_global],
                best_sim
            ))

    print(f" Tasks to process: {len(tasks)}")

    # --- Step 3: Compute SSIM/pHash in parallel ---
    print(f"\n[3/4] Computing SSIM & pHash ({len(tasks)} pairs, {NUM_WORKERS} workers)...")

    # Add result columns to the signatures table if missing.  ALTER
    # TABLE raises sqlite3.OperationalError when the column already
    # exists, which we treat as "already migrated".
    new_columns = (
        'ssim_to_closest REAL',
        'phash_distance_to_closest INTEGER',
        'histogram_corr_to_closest REAL',
        'pixel_identical_to_closest INTEGER',
        'closest_match_file TEXT',
    )
    for col_def in new_columns:
        try:
            cur.execute(f'ALTER TABLE signatures ADD COLUMN {col_def}')
        except sqlite3.OperationalError:
            pass  # column already exists
    conn.commit()

    total = len(tasks)
    done = 0
    batch_results = []

    with Pool(NUM_WORKERS) as pool:
        for result in pool.imap_unordered(compute_pair_ssim, tasks, chunksize=50):
            batch_results.append(result)
            done += 1

            if done % BATCH_SIZE == 0 or done == total:
                # Save batch to database
                for r in batch_results:
                    cur.execute('''
                        UPDATE signatures SET
                            ssim_to_closest = ?,
                            phash_distance_to_closest = ?,
                            histogram_corr_to_closest = ?,
                            pixel_identical_to_closest = ?,
                            closest_match_file = ?
                        WHERE signature_id = ?
                    ''', (
                        r['ssim'],
                        r['phash_distance'],
                        r['histogram_corr'],
                        1 if r['pixel_identical'] else 0,
                        r['match_file'],
                        r['signature_id']
                    ))
                    processed_ids.add(r['signature_id'])
                conn.commit()
                save_checkpoint(processed_ids)
                batch_results = []

                # Progress/ETA, printed once per flushed batch.
                elapsed = time.time() - start_time
                rate = done / elapsed
                eta = (total - done) / rate if rate > 0 else 0
                print(f" {done:,}/{total:,} ({100*done/total:.1f}%) "
                      f"| {rate:.1f} pairs/s | ETA: {eta/60:.1f} min")

    # --- Step 4: Generate complete CSV ---
    print(f"\n[4/4] Generating complete CSV...")

    cur.execute('''
        SELECT
            s.source_pdf,
            s.year_month,
            s.serial_number,
            s.doc_type,
            s.page_number,
            s.sig_index,
            s.image_filename,
            s.assigned_accountant,
            s.excel_accountant1,
            s.excel_accountant2,
            s.excel_firm,
            s.detection_confidence,
            s.signature_verdict,
            s.max_similarity_to_same_accountant,
            s.ssim_to_closest,
            s.phash_distance_to_closest,
            s.histogram_corr_to_closest,
            s.pixel_identical_to_closest,
            s.closest_match_file,
            a.risk_level,
            a.mean_similarity as acct_mean_similarity,
            a.ratio_gt_95 as acct_ratio_gt_95
        FROM signatures s
        LEFT JOIN accountants a ON s.assigned_accountant = a.name
        ORDER BY s.source_pdf, s.sig_index
    ''')

    # CSV header — order must match the SELECT column order above.
    columns = [
        'source_pdf', 'year_month', 'serial_number', 'doc_type',
        'page_number', 'sig_index', 'image_filename',
        'assigned_accountant', 'excel_accountant1', 'excel_accountant2', 'excel_firm',
        'detection_confidence', 'signature_verdict',
        'max_cosine_similarity', 'ssim_to_closest', 'phash_distance_to_closest',
        'histogram_corr_to_closest', 'pixel_identical_to_closest', 'closest_match_file',
        'accountant_risk_level', 'accountant_mean_similarity', 'accountant_ratio_gt_95'
    ]

    with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(columns)
        # Stream rows straight from the cursor to keep memory flat.
        for row in cur:
            writer.writerow(row)

    # Count rows for the summary.
    cur.execute('SELECT COUNT(*) FROM signatures')
    total_sigs = cur.fetchone()[0]
    cur.execute('SELECT COUNT(DISTINCT source_pdf) FROM signatures')
    total_pdfs = cur.fetchone()[0]

    conn.close()

    elapsed = time.time() - start_time
    print(f"\n{'='*70}")
    print(f"Complete!")
    print(f" Total signatures: {total_sigs:,}")
    print(f" Total PDFs: {total_pdfs:,}")
    print(f" Output: {OUTPUT_CSV}")
    print(f" Time: {elapsed/60:.1f} minutes")
    print(f"{'='*70}")

    # Clean up checkpoint — the run finished, so resume data is stale.
    if os.path.exists(CHECKPOINT_PATH):
        os.remove(CHECKPOINT_PATH)
|
||||
|
||||
|
||||
# Entry point: run the full pipeline only when executed as a script.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user