#!/usr/bin/env python3
"""
Compute SSIM and pHash for all signature pairs (closest match per accountant).

Uses multiprocessing for parallel image loading and computation.
Saves results to database and outputs complete CSV.
"""
import sqlite3
import numpy as np
import cv2
import os
import sys
import json
import csv
import time
from datetime import datetime
from collections import defaultdict
from multiprocessing import Pool, cpu_count
from pathlib import Path

# Absolute paths for this pipeline's database, source images and outputs.
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
IMAGE_DIR = '/Volumes/NV2/PDF-Processing/yolo-signatures/images'
OUTPUT_CSV = '/Volumes/NV2/PDF-Processing/signature-analysis/reports/complete_pdf_report.csv'
CHECKPOINT_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/ssim_checkpoint.json'

NUM_WORKERS = max(1, cpu_count() - 2)  # Leave 2 cores free
BATCH_SIZE = 1000  # DB-commit / checkpoint frequency, in processed pairs


def compute_phash(img, hash_size=8):
    """Compute a difference hash (dHash) for a grayscale image.

    Despite the generic name, this is specifically a dHash: the image is
    resized to (hash_size+1, hash_size) and each pixel is compared with its
    horizontal neighbour.

    Args:
        img: 2-D grayscale image (numpy array).
        hash_size: side length of the hash grid; the result has
            hash_size * hash_size bits.

    Returns:
        Flat boolean numpy array of hash_size * hash_size bits.
    """
    resized = cv2.resize(img, (hash_size + 1, hash_size))
    diff = resized[:, 1:] > resized[:, :-1]
    return diff.flatten()


def compute_pair_ssim(args):
    """Compute SSIM, pHash distance and histogram correlation for one pair.

    Designed as a multiprocessing worker: best-effort, never raises.

    Args:
        args: tuple (sig_id, file1, file2, cosine_sim); filenames are
            relative to IMAGE_DIR.

    Returns:
        dict with keys signature_id, match_file, cosine_similarity, ssim,
        phash_distance, histogram_corr, pixel_identical. Metric fields stay
        None (pixel_identical stays False) when an image is unreadable,
        too small, or a metric fails.
    """
    sig_id, file1, file2, cosine_sim = args
    path1 = os.path.join(IMAGE_DIR, file1)
    path2 = os.path.join(IMAGE_DIR, file2)
    result = {
        'signature_id': sig_id,
        'match_file': file2,
        'cosine_similarity': cosine_sim,
        'ssim': None,
        'phash_distance': None,
        'histogram_corr': None,
        'pixel_identical': False,
    }
    try:
        img1 = cv2.imread(path1, cv2.IMREAD_GRAYSCALE)
        img2 = cv2.imread(path2, cv2.IMREAD_GRAYSCALE)
        if img1 is None or img2 is None:
            return result

        # Resize both images to the smaller common dimensions so the
        # pixel-wise metrics compare like with like.
        h = min(img1.shape[0], img2.shape[0])
        w = min(img1.shape[1], img2.shape[1])
        if h < 3 or w < 3:
            return result  # too small for any meaningful comparison
        img1_r = cv2.resize(img1, (w, h))
        img2_r = cv2.resize(img2, (w, h))

        # Exact pixel-for-pixel duplicate check (after resize).
        result['pixel_identical'] = bool(np.array_equal(img1_r, img2_r))

        # SSIM: win_size must be odd and no larger than min(h, w).
        try:
            from skimage.metrics import structural_similarity as ssim
            win_size = min(7, min(h, w))
            if win_size % 2 == 0:
                win_size -= 1
            if win_size >= 3:
                result['ssim'] = float(ssim(img1_r, img2_r, win_size=win_size))
        except Exception:
            # skimage missing or SSIM computation failed; leave None.
            result['ssim'] = None

        # Grayscale histogram correlation (1.0 = identical distributions).
        hist1 = cv2.calcHist([img1_r], [0], None, [256], [0, 256])
        hist2 = cv2.calcHist([img2_r], [0], None, [256], [0, 256])
        result['histogram_corr'] = float(
            cv2.compareHist(hist1, hist2, cv2.HISTCMP_CORREL))

        # Hamming distance between the two difference hashes.
        h1 = compute_phash(img1_r)
        h2 = compute_phash(img2_r)
        result['phash_distance'] = int(np.sum(h1 != h2))
    except Exception:
        # Deliberate best-effort: a corrupt image must not kill the worker
        # pool; the partially-filled result is returned as-is.
        pass
    return result


def load_checkpoint():
    """Return the set of already-processed signature IDs (empty if none)."""
    if os.path.exists(CHECKPOINT_PATH):
        with open(CHECKPOINT_PATH, 'r') as f:
            data = json.load(f)
        return set(data.get('processed_ids', []))
    return set()


def save_checkpoint(processed_ids):
    """Persist the processed-ID set so an interrupted run can resume."""
    with open(CHECKPOINT_PATH, 'w') as f:
        json.dump({'processed_ids': list(processed_ids),
                   'timestamp': str(datetime.now())}, f)


def _load_signatures(cur):
    """Load every signature with a feature vector and an assigned accountant.

    Returns (sig_ids, filenames, accountants, features) where the first
    three are parallel lists and features is a 2-D float32 matrix.
    """
    cur.execute('''
        SELECT signature_id, image_filename, assigned_accountant, feature_vector
        FROM signatures
        WHERE feature_vector IS NOT NULL AND assigned_accountant IS NOT NULL
    ''')
    sig_ids, filenames, accountants, features = [], [], [], []
    for sig_id, filename, accountant, blob in cur.fetchall():
        sig_ids.append(sig_id)
        filenames.append(filename)
        accountants.append(accountant)
        features.append(np.frombuffer(blob, dtype=np.float32))
    return sig_ids, filenames, accountants, np.array(features)


def _build_tasks(sig_ids, filenames, accountants, features, processed_ids):
    """Pair each unprocessed signature with its closest same-accountant match.

    Similarity is the dot product of feature vectors — this equals cosine
    similarity only if the vectors are L2-normalized upstream (presumably
    they are; TODO confirm against the feature-extraction step).
    """
    acct_groups = defaultdict(list)
    for i, acct in enumerate(accountants):
        acct_groups[acct].append(i)

    tasks = []
    for acct, indices in acct_groups.items():
        if len(indices) < 2:
            continue  # nothing to compare against
        vecs = features[indices]
        sim_matrix = vecs @ vecs.T
        np.fill_diagonal(sim_matrix, -1)  # Exclude self
        for local_i, global_i in enumerate(indices):
            if sig_ids[global_i] in processed_ids:
                continue
            best_local = np.argmax(sim_matrix[local_i])
            best_global = indices[best_local]
            best_sim = float(sim_matrix[local_i, best_local])
            tasks.append((
                sig_ids[global_i],
                filenames[global_i],
                filenames[best_global],
                best_sim,
            ))
    return tasks


def _ensure_result_columns(cur, conn):
    """Add the result columns to `signatures`, ignoring ones that exist.

    sqlite3 raises OperationalError for a duplicate column, so that is the
    only exception swallowed here.
    """
    new_columns = (
        ('ssim_to_closest', 'REAL'),
        ('phash_distance_to_closest', 'INTEGER'),
        ('histogram_corr_to_closest', 'REAL'),
        ('pixel_identical_to_closest', 'INTEGER'),
        ('closest_match_file', 'TEXT'),
    )
    for name, col_type in new_columns:
        try:
            cur.execute(f'ALTER TABLE signatures ADD COLUMN {name} {col_type}')
        except sqlite3.OperationalError:
            pass  # column already exists (e.g. resumed run)
    conn.commit()


def _flush_batch(cur, conn, batch_results, processed_ids):
    """Write one batch of worker results to the DB and update the checkpoint."""
    for r in batch_results:
        cur.execute('''
            UPDATE signatures
            SET ssim_to_closest = ?,
                phash_distance_to_closest = ?,
                histogram_corr_to_closest = ?,
                pixel_identical_to_closest = ?,
                closest_match_file = ?
            WHERE signature_id = ?
        ''', (
            r['ssim'],
            r['phash_distance'],
            r['histogram_corr'],
            1 if r['pixel_identical'] else 0,
            r['match_file'],
            r['signature_id'],
        ))
        processed_ids.add(r['signature_id'])
    conn.commit()
    save_checkpoint(processed_ids)


def _export_csv(cur):
    """Dump the full per-signature report (joined with accountant stats) to CSV."""
    cur.execute('''
        SELECT s.source_pdf, s.year_month, s.serial_number, s.doc_type,
               s.page_number, s.sig_index, s.image_filename,
               s.assigned_accountant, s.excel_accountant1, s.excel_accountant2,
               s.excel_firm, s.detection_confidence, s.signature_verdict,
               s.max_similarity_to_same_accountant,
               s.ssim_to_closest, s.phash_distance_to_closest,
               s.histogram_corr_to_closest, s.pixel_identical_to_closest,
               s.closest_match_file,
               a.risk_level, a.mean_similarity as acct_mean_similarity,
               a.ratio_gt_95 as acct_ratio_gt_95
        FROM signatures s
        LEFT JOIN accountants a ON s.assigned_accountant = a.name
        ORDER BY s.source_pdf, s.sig_index
    ''')
    columns = [
        'source_pdf', 'year_month', 'serial_number', 'doc_type',
        'page_number', 'sig_index', 'image_filename',
        'assigned_accountant', 'excel_accountant1', 'excel_accountant2',
        'excel_firm', 'detection_confidence', 'signature_verdict',
        'max_cosine_similarity', 'ssim_to_closest', 'phash_distance_to_closest',
        'histogram_corr_to_closest', 'pixel_identical_to_closest',
        'closest_match_file', 'accountant_risk_level',
        'accountant_mean_similarity', 'accountant_ratio_gt_95',
    ]
    with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(columns)
        for row in cur:
            writer.writerow(row)


def main():
    """Run the full pipeline: load, match, compute metrics, export CSV."""
    start_time = time.time()
    print("=" * 70)
    print("SSIM & pHash Computation for All Signature Pairs")
    print(f"Workers: {NUM_WORKERS}")
    print("=" * 70)

    # --- Step 1: Load data ---
    print("\n[1/4] Loading data from database...")
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()
    sig_ids, filenames, accountants, features = _load_signatures(cur)
    print(f"  Loaded {len(sig_ids)} signatures")

    # --- Step 2: Find closest match per signature ---
    print("\n[2/4] Finding closest match per signature (same accountant)...")
    processed_ids = load_checkpoint()
    print(f"  Checkpoint: {len(processed_ids)} already processed")
    tasks = _build_tasks(sig_ids, filenames, accountants, features,
                         processed_ids)
    print(f"  Tasks to process: {len(tasks)}")

    # --- Step 3: Compute SSIM/pHash in parallel ---
    print(f"\n[3/4] Computing SSIM & pHash ({len(tasks)} pairs, "
          f"{NUM_WORKERS} workers)...")
    _ensure_result_columns(cur, conn)

    total = len(tasks)
    done = 0
    batch_results = []
    with Pool(NUM_WORKERS) as pool:
        for result in pool.imap_unordered(compute_pair_ssim, tasks,
                                          chunksize=50):
            batch_results.append(result)
            done += 1
            if done % BATCH_SIZE == 0 or done == total:
                _flush_batch(cur, conn, batch_results, processed_ids)
                batch_results = []
                elapsed = time.time() - start_time
                # Guard against a zero elapsed time on very fast first batches.
                rate = done / elapsed if elapsed > 0 else 0.0
                eta = (total - done) / rate if rate > 0 else 0
                print(f"  {done:,}/{total:,} ({100*done/total:.1f}%) "
                      f"| {rate:.1f} pairs/s | ETA: {eta/60:.1f} min")

    # --- Step 4: Generate complete CSV ---
    print(f"\n[4/4] Generating complete CSV...")
    _export_csv(cur)

    # Summary counts for the final report.
    cur.execute('SELECT COUNT(*) FROM signatures')
    total_sigs = cur.fetchone()[0]
    cur.execute('SELECT COUNT(DISTINCT source_pdf) FROM signatures')
    total_pdfs = cur.fetchone()[0]
    conn.close()

    elapsed = time.time() - start_time
    print(f"\n{'='*70}")
    print(f"Complete!")
    print(f"  Total signatures: {total_sigs:,}")
    print(f"  Total PDFs: {total_pdfs:,}")
    print(f"  Output: {OUTPUT_CSV}")
    print(f"  Time: {elapsed/60:.1f} minutes")
    print(f"{'='*70}")

    # Clean up checkpoint: the run finished, so a resume point is no longer
    # needed.
    if os.path.exists(CHECKPOINT_PATH):
        os.remove(CHECKPOINT_PATH)


if __name__ == '__main__':
    main()