Add Deloitte distribution & independent dHash analysis scripts
- Script 13: Firm A normality/multimodality analysis (Shapiro-Wilk, Anderson-Darling, KDE, per-accountant ANOVA, Beta/Gamma fitting)
- Script 14: Independent min-dHash computation across all pairs per accountant (not just the cosine-nearest pair)
- THRESHOLD_VALIDATION_OPTIONS: 2026-01 discussion doc on threshold validation approaches
- .gitignore: exclude model weights, node artifacts, and xlsx data

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,293 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Compute independent min dHash for all signatures.
|
||||
===================================================
|
||||
Currently phash_distance_to_closest is conditional on cosine-nearest pair.
|
||||
This script computes an INDEPENDENT min dHash: for each signature, find the
|
||||
pair within the same accountant that has the smallest dHash distance,
|
||||
regardless of cosine similarity.
|
||||
|
||||
Three metrics after this script:
|
||||
1. max_similarity_to_same_accountant (max cosine) — primary classifier
|
||||
2. min_dhash_independent (independent min) — independent 2nd classifier
|
||||
3. phash_distance_to_closest (conditional) — diagnostic tool
|
||||
|
||||
Phase 1: Compute dHash vector for each image, store as BLOB in DB
|
||||
Phase 2: All-pairs hamming distance within same accountant, store min
|
||||
"""
|
||||
|
||||
import sqlite3
|
||||
import numpy as np
|
||||
import cv2
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from multiprocessing import Pool, cpu_count
|
||||
from pathlib import Path
|
||||
|
||||
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
|
||||
IMAGE_DIR = '/Volumes/NV2/PDF-Processing/yolo-signatures/images'
|
||||
NUM_WORKERS = max(1, cpu_count() - 2)
|
||||
BATCH_SIZE = 5000
|
||||
HASH_SIZE = 8 # 9x8 -> 8x8 = 64-bit hash
|
||||
|
||||
|
||||
# ── Phase 1: Compute dHash per image ─────────────────────────────────
|
||||
|
||||
def compute_dhash_for_file(args):
|
||||
"""Compute dHash for a single image file. Returns (sig_id, hash_bytes) or (sig_id, None)."""
|
||||
sig_id, filename = args
|
||||
path = os.path.join(IMAGE_DIR, filename)
|
||||
try:
|
||||
img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
|
||||
if img is None:
|
||||
return (sig_id, None)
|
||||
resized = cv2.resize(img, (HASH_SIZE + 1, HASH_SIZE))
|
||||
diff = resized[:, 1:] > resized[:, :-1] # 8x8 = 64 bits
|
||||
return (sig_id, np.packbits(diff.flatten()).tobytes())
|
||||
except Exception:
|
||||
return (sig_id, None)
|
||||
|
||||
|
||||
def phase1_compute_hashes():
|
||||
"""Compute and store dHash for all signatures."""
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cur = conn.cursor()
|
||||
|
||||
# Add columns if not exist
|
||||
for col in ['dhash_vector BLOB', 'min_dhash_independent INTEGER',
|
||||
'min_dhash_independent_match TEXT']:
|
||||
try:
|
||||
cur.execute(f'ALTER TABLE signatures ADD COLUMN {col}')
|
||||
except sqlite3.OperationalError:
|
||||
pass
|
||||
conn.commit()
|
||||
|
||||
# Check which signatures already have dhash_vector
|
||||
cur.execute('''
|
||||
SELECT signature_id, image_filename
|
||||
FROM signatures
|
||||
WHERE feature_vector IS NOT NULL
|
||||
AND assigned_accountant IS NOT NULL
|
||||
AND dhash_vector IS NULL
|
||||
''')
|
||||
todo = cur.fetchall()
|
||||
|
||||
if not todo:
|
||||
# Check total with dhash
|
||||
cur.execute('SELECT COUNT(*) FROM signatures WHERE dhash_vector IS NOT NULL')
|
||||
n_done = cur.fetchone()[0]
|
||||
print(f" Phase 1 already complete ({n_done:,} hashes in DB)")
|
||||
conn.close()
|
||||
return
|
||||
|
||||
print(f" Computing dHash for {len(todo):,} images ({NUM_WORKERS} workers)...")
|
||||
t0 = time.time()
|
||||
|
||||
processed = 0
|
||||
for batch_start in range(0, len(todo), BATCH_SIZE):
|
||||
batch = todo[batch_start:batch_start + BATCH_SIZE]
|
||||
|
||||
with Pool(NUM_WORKERS) as pool:
|
||||
results = pool.map(compute_dhash_for_file, batch)
|
||||
|
||||
updates = [(dhash, sid) for sid, dhash in results if dhash is not None]
|
||||
cur.executemany('UPDATE signatures SET dhash_vector = ? WHERE signature_id = ?', updates)
|
||||
conn.commit()
|
||||
|
||||
processed += len(batch)
|
||||
elapsed = time.time() - t0
|
||||
rate = processed / elapsed
|
||||
eta = (len(todo) - processed) / rate if rate > 0 else 0
|
||||
print(f" {processed:,}/{len(todo):,} ({rate:.0f}/s, ETA {eta:.0f}s)")
|
||||
|
||||
conn.close()
|
||||
elapsed = time.time() - t0
|
||||
print(f" Phase 1 done: {processed:,} hashes in {elapsed:.1f}s")
|
||||
|
||||
|
||||
# ── Phase 2: All-pairs min dHash within same accountant ──────────────
|
||||
|
||||
def hamming_distance(h1_bytes, h2_bytes):
|
||||
"""Hamming distance between two packed dHash byte strings."""
|
||||
a = np.frombuffer(h1_bytes, dtype=np.uint8)
|
||||
b = np.frombuffer(h2_bytes, dtype=np.uint8)
|
||||
xor = np.bitwise_xor(a, b)
|
||||
return sum(bin(byte).count('1') for byte in xor)
|
||||
|
||||
|
||||
def phase2_compute_min_dhash():
|
||||
"""For each accountant group, find the min dHash pair per signature."""
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cur = conn.cursor()
|
||||
|
||||
# Load all signatures with dhash
|
||||
cur.execute('''
|
||||
SELECT s.signature_id, s.assigned_accountant, s.dhash_vector, s.image_filename
|
||||
FROM signatures s
|
||||
WHERE s.dhash_vector IS NOT NULL
|
||||
AND s.assigned_accountant IS NOT NULL
|
||||
''')
|
||||
rows = cur.fetchall()
|
||||
print(f" Loaded {len(rows):,} signatures with dHash")
|
||||
|
||||
# Group by accountant
|
||||
acct_groups = {}
|
||||
for sig_id, acct, dhash, filename in rows:
|
||||
acct_groups.setdefault(acct, []).append((sig_id, dhash, filename))
|
||||
|
||||
# Filter out singletons
|
||||
acct_groups = {k: v for k, v in acct_groups.items() if len(v) >= 2}
|
||||
total_sigs = sum(len(v) for v in acct_groups.values())
|
||||
total_pairs = sum(len(v) * (len(v) - 1) // 2 for v in acct_groups.values())
|
||||
print(f" {len(acct_groups)} accountants, {total_sigs:,} signatures, {total_pairs:,} pairs")
|
||||
|
||||
t0 = time.time()
|
||||
updates = []
|
||||
accts_done = 0
|
||||
|
||||
for acct, sigs in acct_groups.items():
|
||||
n = len(sigs)
|
||||
sig_ids = [s[0] for s in sigs]
|
||||
hashes = [s[1] for s in sigs]
|
||||
filenames = [s[2] for s in sigs]
|
||||
|
||||
# Unpack all hashes to bit arrays for vectorized hamming
|
||||
bits = np.array([np.unpackbits(np.frombuffer(h, dtype=np.uint8)) for h in hashes],
|
||||
dtype=np.uint8) # shape: (n, 64)
|
||||
|
||||
# Pairwise hamming via XOR + sum
|
||||
# For groups up to ~2000, direct matrix computation is fine
|
||||
# hamming_matrix[i,j] = number of differing bits between i and j
|
||||
xor_matrix = bits[:, None, :] ^ bits[None, :, :] # (n, n, 64)
|
||||
hamming_matrix = xor_matrix.sum(axis=2) # (n, n)
|
||||
np.fill_diagonal(hamming_matrix, 999) # exclude self
|
||||
|
||||
# For each signature, find min
|
||||
min_indices = np.argmin(hamming_matrix, axis=1)
|
||||
min_distances = hamming_matrix[np.arange(n), min_indices]
|
||||
|
||||
for i in range(n):
|
||||
updates.append((
|
||||
int(min_distances[i]),
|
||||
filenames[min_indices[i]],
|
||||
sig_ids[i]
|
||||
))
|
||||
|
||||
accts_done += 1
|
||||
if accts_done % 100 == 0:
|
||||
elapsed = time.time() - t0
|
||||
print(f" {accts_done}/{len(acct_groups)} accountants ({elapsed:.0f}s)")
|
||||
|
||||
# Write to DB
|
||||
print(f" Writing {len(updates):,} results to DB...")
|
||||
cur.executemany('''
|
||||
UPDATE signatures
|
||||
SET min_dhash_independent = ?, min_dhash_independent_match = ?
|
||||
WHERE signature_id = ?
|
||||
''', updates)
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
elapsed = time.time() - t0
|
||||
print(f" Phase 2 done: {len(updates):,} signatures in {elapsed:.1f}s")
|
||||
|
||||
|
||||
# ── Phase 3: Summary statistics ──────────────────────────────────────
|
||||
|
||||
def print_summary():
|
||||
"""Print summary comparing conditional vs independent dHash."""
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cur = conn.cursor()
|
||||
|
||||
# Overall stats
|
||||
cur.execute('''
|
||||
SELECT
|
||||
COUNT(*) as n,
|
||||
AVG(phash_distance_to_closest) as cond_mean,
|
||||
AVG(min_dhash_independent) as indep_mean
|
||||
FROM signatures
|
||||
WHERE min_dhash_independent IS NOT NULL
|
||||
AND phash_distance_to_closest IS NOT NULL
|
||||
''')
|
||||
n, cond_mean, indep_mean = cur.fetchone()
|
||||
|
||||
print(f"\n{'='*65}")
|
||||
print(f" COMPARISON: Conditional vs Independent dHash")
|
||||
print(f"{'='*65}")
|
||||
print(f" N = {n:,}")
|
||||
print(f" Conditional dHash (cosine-nearest pair): mean = {cond_mean:.2f}")
|
||||
print(f" Independent dHash (all-pairs min): mean = {indep_mean:.2f}")
|
||||
|
||||
# Percentiles
|
||||
cur.execute('''
|
||||
SELECT phash_distance_to_closest, min_dhash_independent
|
||||
FROM signatures
|
||||
WHERE min_dhash_independent IS NOT NULL
|
||||
AND phash_distance_to_closest IS NOT NULL
|
||||
''')
|
||||
rows = cur.fetchall()
|
||||
cond = np.array([r[0] for r in rows])
|
||||
indep = np.array([r[1] for r in rows])
|
||||
|
||||
print(f"\n {'Percentile':<12} {'Conditional':>12} {'Independent':>12} {'Diff':>8}")
|
||||
print(f" {'-'*44}")
|
||||
for p in [1, 5, 10, 25, 50, 75, 90, 95, 99]:
|
||||
cv = np.percentile(cond, p)
|
||||
iv = np.percentile(indep, p)
|
||||
print(f" P{p:<10d} {cv:>12.1f} {iv:>12.1f} {iv-cv:>+8.1f}")
|
||||
|
||||
# Agreement analysis
|
||||
print(f"\n Agreement analysis (both ≤ threshold):")
|
||||
for t in [5, 10, 15, 21]:
|
||||
both = np.sum((cond <= t) & (indep <= t))
|
||||
cond_only = np.sum((cond <= t) & (indep > t))
|
||||
indep_only = np.sum((cond > t) & (indep <= t))
|
||||
neither = np.sum((cond > t) & (indep > t))
|
||||
agree_pct = (both + neither) / len(cond) * 100
|
||||
print(f" θ={t:>2d}: both={both:,}, cond_only={cond_only:,}, "
|
||||
f"indep_only={indep_only:,}, neither={neither:,} (agree={agree_pct:.1f}%)")
|
||||
|
||||
# Firm A specific
|
||||
cur.execute('''
|
||||
SELECT s.phash_distance_to_closest, s.min_dhash_independent
|
||||
FROM signatures s
|
||||
LEFT JOIN accountants a ON s.assigned_accountant = a.name
|
||||
WHERE a.firm = '勤業眾信聯合'
|
||||
AND s.min_dhash_independent IS NOT NULL
|
||||
AND s.phash_distance_to_closest IS NOT NULL
|
||||
''')
|
||||
rows = cur.fetchall()
|
||||
if rows:
|
||||
cond_a = np.array([r[0] for r in rows])
|
||||
indep_a = np.array([r[1] for r in rows])
|
||||
print(f"\n Firm A (勤業眾信) — N={len(rows):,}:")
|
||||
print(f" {'Percentile':<12} {'Conditional':>12} {'Independent':>12}")
|
||||
print(f" {'-'*36}")
|
||||
for p in [50, 75, 90, 95, 99]:
|
||||
print(f" P{p:<10d} {np.percentile(cond_a, p):>12.1f} {np.percentile(indep_a, p):>12.1f}")
|
||||
|
||||
conn.close()
|
||||
|
||||
|
||||
def main():
|
||||
t_start = time.time()
|
||||
print("=" * 65)
|
||||
print(" Independent Min dHash Computation")
|
||||
print("=" * 65)
|
||||
|
||||
print(f"\n[Phase 1] Computing dHash vectors...")
|
||||
phase1_compute_hashes()
|
||||
|
||||
print(f"\n[Phase 2] Computing all-pairs min dHash per accountant...")
|
||||
phase2_compute_min_dhash()
|
||||
|
||||
print(f"\n[Phase 3] Summary...")
|
||||
print_summary()
|
||||
|
||||
elapsed = time.time() - t_start
|
||||
print(f"\nTotal time: {elapsed:.0f}s ({elapsed/60:.1f} min)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user