Files
pdf_signature_extraction/signature_analysis/14_compute_independent_dhash.py
T
gbanyan a261a22bd2 Add Deloitte distribution & independent dHash analysis scripts
- Script 13: Firm A normality/multimodality analysis (Shapiro-Wilk, Anderson-Darling, KDE, per-accountant ANOVA, Beta/Gamma fitting)
- Script 14: Independent min-dHash computation across all pairs per accountant (not just cosine-nearest pair)
- THRESHOLD_VALIDATION_OPTIONS: 2026-01 discussion doc on threshold validation approaches
- .gitignore: exclude model weights, node artifacts, and xlsx data

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 21:34:24 +08:00

294 lines
10 KiB
Python

#!/usr/bin/env python3
"""
Compute independent min dHash for all signatures.
===================================================
Currently phash_distance_to_closest is conditional on cosine-nearest pair.
This script computes an INDEPENDENT min dHash: for each signature, find the
pair within the same accountant that has the smallest dHash distance,
regardless of cosine similarity.
Three metrics after this script:
1. max_similarity_to_same_accountant (max cosine) — primary classifier
2. min_dhash_independent (independent min) — independent 2nd classifier
3. phash_distance_to_closest (conditional) — diagnostic tool
Phase 1: Compute dHash vector for each image, store as BLOB in DB
Phase 2: All-pairs hamming distance within same accountant, store min
"""
import sqlite3
import numpy as np
import cv2
import os
import sys
import time
from multiprocessing import Pool, cpu_count
from pathlib import Path
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
IMAGE_DIR = '/Volumes/NV2/PDF-Processing/yolo-signatures/images'
NUM_WORKERS = max(1, cpu_count() - 2)
BATCH_SIZE = 5000
HASH_SIZE = 8 # 9x8 -> 8x8 = 64-bit hash
# ── Phase 1: Compute dHash per image ─────────────────────────────────
def compute_dhash_for_file(args):
    """Load one grayscale image and return its packed 64-bit dHash.

    Args:
        args: (sig_id, filename) tuple; filename is relative to IMAGE_DIR.

    Returns:
        (sig_id, hash_bytes) on success, or (sig_id, None) when the image
        is missing, unreadable, or any error occurs while hashing.
    """
    sig_id, filename = args
    full_path = os.path.join(IMAGE_DIR, filename)
    try:
        gray = cv2.imread(full_path, cv2.IMREAD_GRAYSCALE)
        if gray is None:
            return (sig_id, None)
        # Shrink to 9x8 so each row yields 8 left-vs-right comparisons: 64 bits total.
        small = cv2.resize(gray, (HASH_SIZE + 1, HASH_SIZE))
        gradient = small[:, 1:] > small[:, :-1]
        return (sig_id, np.packbits(gradient.flatten()).tobytes())
    except Exception:
        return (sig_id, None)
def phase1_compute_hashes():
    """Compute and store a 64-bit dHash for every signature that needs one.

    Adds the dhash_vector / min_dhash_independent(_match) columns if they do
    not exist yet, then hashes every image that has a feature vector and an
    assigned accountant but no stored dHash (so the phase is resumable).
    Results are committed per batch, so an interruption loses at most
    BATCH_SIZE images of work.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cur = conn.cursor()
        # Add columns if not exist (ALTER TABLE raises when the column is already there).
        for col in ['dhash_vector BLOB', 'min_dhash_independent INTEGER',
                    'min_dhash_independent_match TEXT']:
            try:
                cur.execute(f'ALTER TABLE signatures ADD COLUMN {col}')
            except sqlite3.OperationalError:
                pass  # column already exists
        conn.commit()
        # Check which signatures already have dhash_vector
        cur.execute('''
            SELECT signature_id, image_filename
            FROM signatures
            WHERE feature_vector IS NOT NULL
              AND assigned_accountant IS NOT NULL
              AND dhash_vector IS NULL
        ''')
        todo = cur.fetchall()
        if not todo:
            # Nothing left to hash; report how many are already done.
            cur.execute('SELECT COUNT(*) FROM signatures WHERE dhash_vector IS NOT NULL')
            n_done = cur.fetchone()[0]
            print(f" Phase 1 already complete ({n_done:,} hashes in DB)")
            return
        print(f" Computing dHash for {len(todo):,} images ({NUM_WORKERS} workers)...")
        t0 = time.time()
        processed = 0
        # One long-lived pool for all batches: the previous version spawned a
        # fresh Pool per 5000-image batch, paying worker startup/teardown cost
        # roughly len(todo)/BATCH_SIZE times for no benefit.
        with Pool(NUM_WORKERS) as pool:
            for batch_start in range(0, len(todo), BATCH_SIZE):
                batch = todo[batch_start:batch_start + BATCH_SIZE]
                results = pool.map(compute_dhash_for_file, batch)
                updates = [(dhash, sid) for sid, dhash in results if dhash is not None]
                cur.executemany(
                    'UPDATE signatures SET dhash_vector = ? WHERE signature_id = ?',
                    updates)
                conn.commit()  # commit per batch so progress survives interruption
                processed += len(batch)
                elapsed = time.time() - t0
                rate = processed / elapsed
                eta = (len(todo) - processed) / rate if rate > 0 else 0
                print(f" {processed:,}/{len(todo):,} ({rate:.0f}/s, ETA {eta:.0f}s)")
        elapsed = time.time() - t0
        print(f" Phase 1 done: {processed:,} hashes in {elapsed:.1f}s")
    finally:
        conn.close()
# ── Phase 2: All-pairs min dHash within same accountant ──────────────
def hamming_distance(h1_bytes, h2_bytes):
"""Hamming distance between two packed dHash byte strings."""
a = np.frombuffer(h1_bytes, dtype=np.uint8)
b = np.frombuffer(h2_bytes, dtype=np.uint8)
xor = np.bitwise_xor(a, b)
return sum(bin(byte).count('1') for byte in xor)
def phase2_compute_min_dhash():
    """For every signature, find its closest same-accountant neighbour by dHash.

    Loads all hashed signatures, buckets them by assigned accountant
    (singleton groups are dropped — a minimum needs at least two members),
    builds the full pairwise Hamming-distance matrix per group, and stores
    each signature's minimum distance together with the filename of the
    matching signature.
    """
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()
    # Load all signatures with dhash
    cur.execute('''
        SELECT s.signature_id, s.assigned_accountant, s.dhash_vector, s.image_filename
        FROM signatures s
        WHERE s.dhash_vector IS NOT NULL
          AND s.assigned_accountant IS NOT NULL
    ''')
    rows = cur.fetchall()
    print(f" Loaded {len(rows):,} signatures with dHash")
    # Bucket signatures per accountant.
    groups = {}
    for sig_id, acct, dhash, fname in rows:
        groups.setdefault(acct, []).append((sig_id, dhash, fname))
    # Drop singleton groups: no pair exists to take a minimum over.
    groups = {acct: members for acct, members in groups.items() if len(members) >= 2}
    total_sigs = sum(len(members) for members in groups.values())
    total_pairs = sum(len(members) * (len(members) - 1) // 2 for members in groups.values())
    print(f" {len(groups)} accountants, {total_sigs:,} signatures, {total_pairs:,} pairs")
    t0 = time.time()
    updates = []
    for done, members in enumerate(groups.values(), start=1):
        n = len(members)
        ids = [m[0] for m in members]
        fnames = [m[2] for m in members]
        # Expand each packed hash into a 64-element bit row -> (n, 64) matrix.
        bit_rows = np.array(
            [np.unpackbits(np.frombuffer(m[1], dtype=np.uint8)) for m in members],
            dtype=np.uint8)
        # Full pairwise Hamming distances via broadcasted XOR + bit count:
        # (n, n, 64) intermediate, fine for group sizes up to a few thousand.
        dist = (bit_rows[:, None, :] ^ bit_rows[None, :, :]).sum(axis=2)
        np.fill_diagonal(dist, 999)  # a signature must never match itself
        nearest = np.argmin(dist, axis=1)
        best = dist[np.arange(n), nearest]
        updates.extend(
            (int(best[i]), fnames[nearest[i]], ids[i]) for i in range(n))
        if done % 100 == 0:
            elapsed = time.time() - t0
            print(f" {done}/{len(groups)} accountants ({elapsed:.0f}s)")
    # Write to DB
    print(f" Writing {len(updates):,} results to DB...")
    cur.executemany('''
        UPDATE signatures
        SET min_dhash_independent = ?, min_dhash_independent_match = ?
        WHERE signature_id = ?
    ''', updates)
    conn.commit()
    conn.close()
    elapsed = time.time() - t0
    print(f" Phase 2 done: {len(updates):,} signatures in {elapsed:.1f}s")
# ── Phase 3: Summary statistics ──────────────────────────────────────
def print_summary():
    """Print summary comparing conditional vs independent dHash.

    Reads both metrics back from the DB and reports: overall means,
    percentile tables, threshold agreement counts, and a per-firm breakdown
    for Firm A (勤業眾信).  Read-only: no writes are performed.
    """
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()
    # Overall stats — restricted to signatures where BOTH metrics exist.
    cur.execute('''
        SELECT
            COUNT(*) as n,
            AVG(phash_distance_to_closest) as cond_mean,
            AVG(min_dhash_independent) as indep_mean
        FROM signatures
        WHERE min_dhash_independent IS NOT NULL
          AND phash_distance_to_closest IS NOT NULL
    ''')
    n, cond_mean, indep_mean = cur.fetchone()
    print(f"\n{'='*65}")
    print(f" COMPARISON: Conditional vs Independent dHash")
    print(f"{'='*65}")
    print(f" N = {n:,}")
    print(f" Conditional dHash (cosine-nearest pair): mean = {cond_mean:.2f}")
    print(f" Independent dHash (all-pairs min): mean = {indep_mean:.2f}")
    # Percentiles — pull raw values so numpy can compute quantiles.
    cur.execute('''
        SELECT phash_distance_to_closest, min_dhash_independent
        FROM signatures
        WHERE min_dhash_independent IS NOT NULL
          AND phash_distance_to_closest IS NOT NULL
    ''')
    rows = cur.fetchall()
    cond = np.array([r[0] for r in rows])
    indep = np.array([r[1] for r in rows])
    print(f"\n {'Percentile':<12} {'Conditional':>12} {'Independent':>12} {'Diff':>8}")
    print(f" {'-'*44}")
    for p in [1, 5, 10, 25, 50, 75, 90, 95, 99]:
        cv = np.percentile(cond, p)
        iv = np.percentile(indep, p)
        print(f" P{p:<10d} {cv:>12.1f} {iv:>12.1f} {iv-cv:>+8.1f}")
    # Agreement analysis: 2x2 contingency counts at each candidate threshold —
    # how often the two metrics would classify a signature the same way.
    print(f"\n Agreement analysis (both ≤ threshold):")
    for t in [5, 10, 15, 21]:
        both = np.sum((cond <= t) & (indep <= t))
        cond_only = np.sum((cond <= t) & (indep > t))
        indep_only = np.sum((cond > t) & (indep <= t))
        neither = np.sum((cond > t) & (indep > t))
        agree_pct = (both + neither) / len(cond) * 100
        print(f" θ={t:>2d}: both={both:,}, cond_only={cond_only:,}, "
              f"indep_only={indep_only:,}, neither={neither:,} (agree={agree_pct:.1f}%)")
    # Firm A specific breakdown (matched via the accountants table by firm name).
    cur.execute('''
        SELECT s.phash_distance_to_closest, s.min_dhash_independent
        FROM signatures s
        LEFT JOIN accountants a ON s.assigned_accountant = a.name
        WHERE a.firm = '勤業眾信聯合'
          AND s.min_dhash_independent IS NOT NULL
          AND s.phash_distance_to_closest IS NOT NULL
    ''')
    rows = cur.fetchall()
    if rows:
        cond_a = np.array([r[0] for r in rows])
        indep_a = np.array([r[1] for r in rows])
        print(f"\n Firm A (勤業眾信) — N={len(rows):,}:")
        print(f" {'Percentile':<12} {'Conditional':>12} {'Independent':>12}")
        print(f" {'-'*36}")
        for p in [50, 75, 90, 95, 99]:
            print(f" P{p:<10d} {np.percentile(cond_a, p):>12.1f} {np.percentile(indep_a, p):>12.1f}")
    conn.close()
def main():
    """Run all three phases in order and report total wall-clock time."""
    started = time.time()
    print("=" * 65)
    print(" Independent Min dHash Computation")
    print("=" * 65)
    print("\n[Phase 1] Computing dHash vectors...")
    phase1_compute_hashes()
    print("\n[Phase 2] Computing all-pairs min dHash per accountant...")
    phase2_compute_min_dhash()
    print("\n[Phase 3] Summary...")
    print_summary()
    total = time.time() - started
    print(f"\nTotal time: {total:.0f}s ({total/60:.1f} min)")


if __name__ == "__main__":
    main()