a261a22bd2
- Script 13: Firm A normality/multimodality analysis (Shapiro-Wilk, Anderson-Darling, KDE, per-accountant ANOVA, Beta/Gamma fitting) - Script 14: Independent min-dHash computation across all pairs per accountant (not just cosine-nearest pair) - THRESHOLD_VALIDATION_OPTIONS: 2026-01 discussion doc on threshold validation approaches - .gitignore: exclude model weights, node artifacts, and xlsx data Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
294 lines
10 KiB
Python
294 lines
10 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Compute independent min dHash for all signatures.
|
|
===================================================
|
|
Currently phash_distance_to_closest is conditional on cosine-nearest pair.
|
|
This script computes an INDEPENDENT min dHash: for each signature, find the
|
|
pair within the same accountant that has the smallest dHash distance,
|
|
regardless of cosine similarity.
|
|
|
|
Three metrics after this script:
|
|
1. max_similarity_to_same_accountant (max cosine) — primary classifier
|
|
2. min_dhash_independent (independent min) — independent 2nd classifier
|
|
3. phash_distance_to_closest (conditional) — diagnostic tool
|
|
|
|
Phase 1: Compute dHash vector for each image, store as BLOB in DB
|
|
Phase 2: All-pairs hamming distance within same accountant, store min
|
|
"""
|
|
|
|
import sqlite3
|
|
import numpy as np
|
|
import cv2
|
|
import os
|
|
import sys
|
|
import time
|
|
from multiprocessing import Pool, cpu_count
|
|
from pathlib import Path
|
|
|
|
# Paths to the SQLite analysis database and the directory of signature crops.
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
IMAGE_DIR = '/Volumes/NV2/PDF-Processing/yolo-signatures/images'

# Leave two cores free for the OS and the SQLite writer process.
NUM_WORKERS = max(1, cpu_count() - 2)
# Images hashed (and rows written back) per DB round trip.
BATCH_SIZE = 5000
HASH_SIZE = 8  # 9x8 -> 8x8 = 64-bit hash
|
|
|
|
|
|
# ── Phase 1: Compute dHash per image ─────────────────────────────────
|
|
|
|
def compute_dhash_for_file(args):
    """Compute the 64-bit dHash of a single signature image.

    Parameters
    ----------
    args : tuple
        ``(sig_id, filename)`` — one work item from the queue (a tuple so
        the function can be used directly with ``Pool.map``).

    Returns ``(sig_id, packed_hash_bytes)`` on success, or ``(sig_id, None)``
    when the image cannot be read or hashed.
    """
    sig_id, filename = args
    image_path = os.path.join(IMAGE_DIR, filename)
    try:
        gray = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if gray is None:
            # cv2.imread returns None for missing/corrupt files.
            return (sig_id, None)
        # Shrink to 9x8 so each row yields 8 left-vs-right comparisons.
        small = cv2.resize(gray, (HASH_SIZE + 1, HASH_SIZE))
        gradient = small[:, 1:] > small[:, :-1]  # 8x8 booleans = 64 bits
        packed = np.packbits(gradient.flatten()).tobytes()
        return (sig_id, packed)
    except Exception:
        # Best-effort worker: any decode/resize failure yields no hash.
        return (sig_id, None)
|
|
|
|
|
|
def phase1_compute_hashes():
    """Compute and store dHash for all signatures.

    Ensures the dhash-related columns exist on ``signatures``, selects the
    rows that have a feature vector and an assigned accountant but no dhash
    yet, hashes the images in a multiprocessing pool, and writes results
    back in batches with progress/ETA reporting.
    """
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()

    # Add columns if not exist (ALTER TABLE raises OperationalError when the
    # column is already present — that is the "already done" signal here).
    for col in ['dhash_vector BLOB', 'min_dhash_independent INTEGER',
                'min_dhash_independent_match TEXT']:
        try:
            cur.execute(f'ALTER TABLE signatures ADD COLUMN {col}')
        except sqlite3.OperationalError:
            pass
    conn.commit()

    # Check which signatures already have dhash_vector
    cur.execute('''
        SELECT signature_id, image_filename
        FROM signatures
        WHERE feature_vector IS NOT NULL
          AND assigned_accountant IS NOT NULL
          AND dhash_vector IS NULL
    ''')
    todo = cur.fetchall()

    if not todo:
        # Check total with dhash
        cur.execute('SELECT COUNT(*) FROM signatures WHERE dhash_vector IS NOT NULL')
        n_done = cur.fetchone()[0]
        print(f" Phase 1 already complete ({n_done:,} hashes in DB)")
        conn.close()
        return

    print(f" Computing dHash for {len(todo):,} images ({NUM_WORKERS} workers)...")
    t0 = time.time()

    processed = 0
    # FIX: create the worker pool ONCE and reuse it across batches.
    # Previously the pool lived inside the batch loop, paying process
    # spawn/teardown cost for every BATCH_SIZE chunk.
    with Pool(NUM_WORKERS) as pool:
        for batch_start in range(0, len(todo), BATCH_SIZE):
            batch = todo[batch_start:batch_start + BATCH_SIZE]

            results = pool.map(compute_dhash_for_file, batch)

            # Skip failed reads (dhash is None); write the rest in one batch.
            updates = [(dhash, sid) for sid, dhash in results if dhash is not None]
            cur.executemany('UPDATE signatures SET dhash_vector = ? WHERE signature_id = ?', updates)
            conn.commit()

            processed += len(batch)
            elapsed = time.time() - t0
            rate = processed / elapsed
            eta = (len(todo) - processed) / rate if rate > 0 else 0
            print(f" {processed:,}/{len(todo):,} ({rate:.0f}/s, ETA {eta:.0f}s)")

    conn.close()
    elapsed = time.time() - t0
    print(f" Phase 1 done: {processed:,} hashes in {elapsed:.1f}s")
|
|
|
|
|
|
# ── Phase 2: All-pairs min dHash within same accountant ──────────────
|
|
|
|
def hamming_distance(h1_bytes, h2_bytes):
    """Hamming distance between two packed dHash byte strings.

    Returns the number of bit positions at which the two hashes differ.
    """
    first = np.frombuffer(h1_bytes, dtype=np.uint8)
    second = np.frombuffer(h2_bytes, dtype=np.uint8)
    # Unpack the XOR into individual bits and count the set ones.
    return int(np.unpackbits(np.bitwise_xor(first, second)).sum())
|
|
|
|
|
|
def phase2_compute_min_dhash():
    """For each accountant group, find the min dHash pair per signature.

    Loads every signature that has a dhash and an assigned accountant,
    groups them by accountant, computes the full pairwise Hamming-distance
    matrix within each group (vectorized via XOR on unpacked bits), and for
    each signature stores the smallest distance plus the filename of the
    matching signature back into the DB.
    """
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()

    # Load all signatures with dhash
    cur.execute('''
        SELECT s.signature_id, s.assigned_accountant, s.dhash_vector, s.image_filename
        FROM signatures s
        WHERE s.dhash_vector IS NOT NULL
          AND s.assigned_accountant IS NOT NULL
    ''')
    rows = cur.fetchall()
    print(f" Loaded {len(rows):,} signatures with dHash")

    # Group by accountant
    acct_groups = {}
    for sig_id, acct, dhash, filename in rows:
        acct_groups.setdefault(acct, []).append((sig_id, dhash, filename))

    # Filter out singletons — a group needs >= 2 members to form a pair.
    acct_groups = {k: v for k, v in acct_groups.items() if len(v) >= 2}
    total_sigs = sum(len(v) for v in acct_groups.values())
    total_pairs = sum(len(v) * (len(v) - 1) // 2 for v in acct_groups.values())
    print(f" {len(acct_groups)} accountants, {total_sigs:,} signatures, {total_pairs:,} pairs")

    t0 = time.time()
    updates = []
    accts_done = 0

    for acct, sigs in acct_groups.items():
        n = len(sigs)
        sig_ids = [s[0] for s in sigs]
        hashes = [s[1] for s in sigs]
        filenames = [s[2] for s in sigs]

        # Unpack all hashes to bit arrays for vectorized hamming
        bits = np.array([np.unpackbits(np.frombuffer(h, dtype=np.uint8)) for h in hashes],
                        dtype=np.uint8)  # shape: (n, 64)

        # Pairwise hamming via XOR + sum.
        # For groups up to ~2000, direct matrix computation is fine
        # (the intermediate XOR tensor is n*n*64 bytes).
        # hamming_matrix[i,j] = number of differing bits between i and j
        xor_matrix = bits[:, None, :] ^ bits[None, :, :]  # (n, n, 64)
        hamming_matrix = xor_matrix.sum(axis=2)  # (n, n)
        # 999 > 64 (the max possible distance), so a signature never
        # matches itself in the argmin below.
        np.fill_diagonal(hamming_matrix, 999)  # exclude self

        # For each signature, find min distance and which neighbor attains it.
        min_indices = np.argmin(hamming_matrix, axis=1)
        min_distances = hamming_matrix[np.arange(n), min_indices]

        for i in range(n):
            updates.append((
                int(min_distances[i]),       # min_dhash_independent
                filenames[min_indices[i]],   # min_dhash_independent_match
                sig_ids[i]                   # WHERE signature_id = ?
            ))

        accts_done += 1
        if accts_done % 100 == 0:
            elapsed = time.time() - t0
            print(f" {accts_done}/{len(acct_groups)} accountants ({elapsed:.0f}s)")

    # Write to DB in one batched statement.
    print(f" Writing {len(updates):,} results to DB...")
    cur.executemany('''
        UPDATE signatures
        SET min_dhash_independent = ?, min_dhash_independent_match = ?
        WHERE signature_id = ?
    ''', updates)
    conn.commit()
    conn.close()

    elapsed = time.time() - t0
    print(f" Phase 2 done: {len(updates):,} signatures in {elapsed:.1f}s")
|
|
|
|
|
|
# ── Phase 3: Summary statistics ──────────────────────────────────────
|
|
|
|
def print_summary():
    """Print summary comparing conditional vs independent dHash.

    Reports overall means, side-by-side percentiles, threshold-agreement
    counts, and a Firm-A-specific percentile table. Read-only: no writes.
    """
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()

    # Overall stats
    cur.execute('''
        SELECT
            COUNT(*) as n,
            AVG(phash_distance_to_closest) as cond_mean,
            AVG(min_dhash_independent) as indep_mean
        FROM signatures
        WHERE min_dhash_independent IS NOT NULL
          AND phash_distance_to_closest IS NOT NULL
    ''')
    n, cond_mean, indep_mean = cur.fetchone()

    # FIX: with zero qualifying rows, the AVG() aggregates are NULL (None)
    # and the {:.2f} formats below would raise TypeError; the later
    # division by len(cond) would also fail. Bail out early instead.
    if not n:
        print("\n No signatures have both dHash metrics; nothing to summarize.")
        conn.close()
        return

    print(f"\n{'='*65}")
    print(f" COMPARISON: Conditional vs Independent dHash")
    print(f"{'='*65}")
    print(f" N = {n:,}")
    print(f" Conditional dHash (cosine-nearest pair): mean = {cond_mean:.2f}")
    print(f" Independent dHash (all-pairs min): mean = {indep_mean:.2f}")

    # Percentiles
    cur.execute('''
        SELECT phash_distance_to_closest, min_dhash_independent
        FROM signatures
        WHERE min_dhash_independent IS NOT NULL
          AND phash_distance_to_closest IS NOT NULL
    ''')
    rows = cur.fetchall()
    cond = np.array([r[0] for r in rows])
    indep = np.array([r[1] for r in rows])

    print(f"\n {'Percentile':<12} {'Conditional':>12} {'Independent':>12} {'Diff':>8}")
    print(f" {'-'*44}")
    for p in [1, 5, 10, 25, 50, 75, 90, 95, 99]:
        cv = np.percentile(cond, p)
        iv = np.percentile(indep, p)
        print(f" P{p:<10d} {cv:>12.1f} {iv:>12.1f} {iv-cv:>+8.1f}")

    # Agreement analysis: do the two metrics classify the same signatures
    # as "suspicious" (distance <= threshold)?
    print(f"\n Agreement analysis (both ≤ threshold):")
    for t in [5, 10, 15, 21]:
        both = np.sum((cond <= t) & (indep <= t))
        cond_only = np.sum((cond <= t) & (indep > t))
        indep_only = np.sum((cond > t) & (indep <= t))
        neither = np.sum((cond > t) & (indep > t))
        agree_pct = (both + neither) / len(cond) * 100
        print(f" θ={t:>2d}: both={both:,}, cond_only={cond_only:,}, "
              f"indep_only={indep_only:,}, neither={neither:,} (agree={agree_pct:.1f}%)")

    # Firm A specific
    cur.execute('''
        SELECT s.phash_distance_to_closest, s.min_dhash_independent
        FROM signatures s
        LEFT JOIN accountants a ON s.assigned_accountant = a.name
        WHERE a.firm = '勤業眾信聯合'
          AND s.min_dhash_independent IS NOT NULL
          AND s.phash_distance_to_closest IS NOT NULL
    ''')
    rows = cur.fetchall()
    if rows:
        cond_a = np.array([r[0] for r in rows])
        indep_a = np.array([r[1] for r in rows])
        print(f"\n Firm A (勤業眾信) — N={len(rows):,}:")
        print(f" {'Percentile':<12} {'Conditional':>12} {'Independent':>12}")
        print(f" {'-'*36}")
        for p in [50, 75, 90, 95, 99]:
            print(f" P{p:<10d} {np.percentile(cond_a, p):>12.1f} {np.percentile(indep_a, p):>12.1f}")

    conn.close()
|
|
|
|
|
|
def main():
    """Run all three phases (hashing, all-pairs min, summary) in order."""
    started = time.time()

    banner = "=" * 65
    print(banner)
    print(" Independent Min dHash Computation")
    print(banner)

    # Label/callable pairs keep the phase sequence in one place.
    phases = [
        ("[Phase 1] Computing dHash vectors...", phase1_compute_hashes),
        ("[Phase 2] Computing all-pairs min dHash per accountant...", phase2_compute_min_dhash),
        ("[Phase 3] Summary...", print_summary),
    ]
    for label, run_phase in phases:
        print(f"\n{label}")
        run_phase()

    total = time.time() - started
    print(f"\nTotal time: {total:.0f}s ({total/60:.1f} min)")


if __name__ == "__main__":
    main()
|