939a348da4
Paper draft includes all sections (Abstract through Conclusion), 36 references, and supporting scripts. Key methodology: Cosine similarity + dHash dual-method verification with thresholds calibrated against known-replication firm (Firm A). Includes: - 8 section markdown files (paper_a_*.md) - Ablation study script (ResNet-50 vs VGG-16 vs EfficientNet-B0) - Recalibrated classification script (84,386 PDFs, 5-tier system) - Figure generation and Word export scripts - Citation renumbering script ([1]-[36]) - Signature analysis pipeline (12 steps) - YOLO extraction scripts Three rounds of AI review completed (GPT-5.4, Claude Opus 4.6, Gemini 3 Pro). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
320 lines
10 KiB
Python
320 lines
10 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Compute SSIM and pHash for all signature pairs (closest match per accountant).
|
|
Uses multiprocessing for parallel image loading and computation.
|
|
Saves results to database and outputs complete CSV.
|
|
"""
|
|
|
|
import sqlite3
|
|
import numpy as np
|
|
import cv2
|
|
import os
|
|
import sys
|
|
import json
|
|
import csv
|
|
import time
|
|
from datetime import datetime
|
|
from collections import defaultdict
|
|
from multiprocessing import Pool, cpu_count
|
|
from pathlib import Path
|
|
|
|
# SQLite database holding the `signatures` and `accountants` tables.
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
# Directory of cropped signature images referenced by `image_filename`.
IMAGE_DIR = '/Volumes/NV2/PDF-Processing/yolo-signatures/images'
# Destination for the final per-signature CSV report.
OUTPUT_CSV = '/Volumes/NV2/PDF-Processing/signature-analysis/reports/complete_pdf_report.csv'
# JSON checkpoint of already-processed signature IDs (lets a run resume).
CHECKPOINT_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/ssim_checkpoint.json'
NUM_WORKERS = max(1, cpu_count() - 2)  # Leave 2 cores free
# Results accumulated before each DB commit + checkpoint save.
BATCH_SIZE = 1000
|
|
|
|
|
|
def compute_phash(img, hash_size=8):
    """Return the difference hash (dHash) of a grayscale image.

    Despite the name, this is a dHash rather than a DCT-based pHash: the
    image is shrunk to (hash_size + 1) x hash_size and each pixel is
    compared to its horizontal neighbour, yielding a flat boolean array of
    hash_size**2 bits.
    """
    small = cv2.resize(img, (hash_size + 1, hash_size))
    bits = small[:, 1:] > small[:, :-1]
    return bits.flatten()
|
|
|
|
|
|
def compute_pair_ssim(args):
    """Compute similarity metrics (SSIM, dHash distance, histogram
    correlation, exact-duplicate flag) for one pair of signature images.

    Parameters
    ----------
    args : tuple
        ``(signature_id, file1, file2, cosine_similarity)``; the filenames
        are relative to IMAGE_DIR.  Packed as a single tuple so the
        function can be mapped directly by ``multiprocessing.Pool``.

    Returns
    -------
    dict
        Keys: signature_id, match_file, cosine_similarity, ssim,
        phash_distance, histogram_corr, pixel_identical.  Metric fields
        stay ``None`` (pixel_identical stays ``False``) when an image is
        unreadable, smaller than 3x3 px, or a metric fails: the worker is
        deliberately best-effort so one corrupt image cannot abort the
        whole pool run.
    """
    sig_id, file1, file2, cosine_sim = args

    path1 = os.path.join(IMAGE_DIR, file1)
    path2 = os.path.join(IMAGE_DIR, file2)

    result = {
        'signature_id': sig_id,
        'match_file': file2,
        'cosine_similarity': cosine_sim,
        'ssim': None,
        'phash_distance': None,
        'histogram_corr': None,
        'pixel_identical': False,
    }

    try:
        img1 = cv2.imread(path1, cv2.IMREAD_GRAYSCALE)
        img2 = cv2.imread(path2, cv2.IMREAD_GRAYSCALE)

        if img1 is None or img2 is None:
            return result  # missing or unreadable image

        # Resize both to the smaller common dimensions so every metric
        # compares pixel grids of identical shape.
        h = min(img1.shape[0], img2.shape[0])
        w = min(img1.shape[1], img2.shape[1])
        if h < 3 or w < 3:
            return result  # too small for a meaningful comparison

        img1_r = cv2.resize(img1, (w, h))
        img2_r = cv2.resize(img2, (w, h))

        # Exact-duplicate check (after the common resize).
        result['pixel_identical'] = bool(np.array_equal(img1_r, img2_r))

        # SSIM -- imported lazily so scikit-image stays an optional
        # dependency; any failure simply leaves result['ssim'] as None.
        try:
            from skimage.metrics import structural_similarity as ssim
            # Window must be odd and no larger than the smaller dimension.
            win_size = min(7, min(h, w))
            if win_size % 2 == 0:
                win_size -= 1
            if win_size >= 3:
                result['ssim'] = float(ssim(img1_r, img2_r, win_size=win_size))
        except Exception:
            pass  # best-effort: ssim stays None

        # Histogram correlation over 256 grayscale bins.
        hist1 = cv2.calcHist([img1_r], [0], None, [256], [0, 256])
        hist2 = cv2.calcHist([img2_r], [0], None, [256], [0, 256])
        result['histogram_corr'] = float(cv2.compareHist(hist1, hist2, cv2.HISTCMP_CORREL))

        # Hamming distance between difference hashes (dHash, despite the
        # "phash" field name).
        h1 = compute_phash(img1_r)
        h2 = compute_phash(img2_r)
        result['phash_distance'] = int(np.sum(h1 != h2))

    except Exception:
        # Best-effort: return whatever metrics were computed before the
        # failure instead of crashing the worker pool.
        pass

    return result
|
|
|
|
|
|
def load_checkpoint():
    """Return the set of signature IDs recorded in the checkpoint file.

    A missing checkpoint yields an empty set, so a fresh run simply
    starts from scratch.
    """
    if not os.path.exists(CHECKPOINT_PATH):
        return set()
    with open(CHECKPOINT_PATH, 'r') as f:
        payload = json.load(f)
    return set(payload.get('processed_ids', []))
|
|
|
|
|
|
def save_checkpoint(processed_ids):
    """Write the processed signature IDs (plus a timestamp) as JSON."""
    state = {
        'processed_ids': list(processed_ids),
        'timestamp': str(datetime.now()),
    }
    with open(CHECKPOINT_PATH, 'w') as f:
        json.dump(state, f)
|
|
|
|
|
|
def _ensure_result_columns(cur):
    """Add the metric result columns to `signatures` if they are missing.

    SQLite has no ``ADD COLUMN IF NOT EXISTS``, so each ALTER is attempted
    individually and the OperationalError raised for an already-existing
    column is ignored.  Any other database error still propagates.
    """
    for column_def in (
        'ssim_to_closest REAL',
        'phash_distance_to_closest INTEGER',
        'histogram_corr_to_closest REAL',
        'pixel_identical_to_closest INTEGER',
        'closest_match_file TEXT',
    ):
        try:
            cur.execute(f'ALTER TABLE signatures ADD COLUMN {column_def}')
        except sqlite3.OperationalError:
            pass  # column already exists


def main():
    """Run the full pipeline.

    1. Load feature vectors from the database.
    2. For each signature, find its most cosine-similar signature from the
       same accountant.
    3. Compute SSIM / dHash / histogram metrics for each pair in a worker
       pool, writing results back in batches (checkpointed for resume).
    4. Export the complete per-signature CSV report.
    """
    start_time = time.time()
    print("=" * 70)
    print("SSIM & pHash Computation for All Signature Pairs")
    print(f"Workers: {NUM_WORKERS}")
    print("=" * 70)

    # --- Step 1: Load data ---
    print("\n[1/4] Loading data from database...")
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()

    cur.execute('''
        SELECT signature_id, image_filename, assigned_accountant, feature_vector
        FROM signatures
        WHERE feature_vector IS NOT NULL AND assigned_accountant IS NOT NULL
    ''')
    rows = cur.fetchall()

    sig_ids = [row[0] for row in rows]
    filenames = [row[1] for row in rows]
    accountants = [row[2] for row in rows]
    # Feature vectors are stored as raw little-endian float32 bytes.
    features = np.array([np.frombuffer(row[3], dtype=np.float32) for row in rows])
    print(f" Loaded {len(sig_ids)} signatures")

    # --- Step 2: Find closest match per signature ---
    print("\n[2/4] Finding closest match per signature (same accountant)...")
    acct_groups = defaultdict(list)
    for i, acct in enumerate(accountants):
        acct_groups[acct].append(i)

    # Resume support: skip signatures already processed in a prior run.
    processed_ids = load_checkpoint()
    print(f" Checkpoint: {len(processed_ids)} already processed")

    # Build one task per unprocessed signature: pair it with the most
    # similar signature of the same accountant.
    tasks = []
    for acct, indices in acct_groups.items():
        if len(indices) < 2:
            continue  # nothing to compare against
        vecs = features[indices]
        # NOTE(review): the dot product is treated as cosine similarity,
        # which assumes the stored vectors are L2-normalised -- confirm
        # against the feature-extraction step.
        sim_matrix = vecs @ vecs.T
        np.fill_diagonal(sim_matrix, -1)  # Exclude self

        for local_i, global_i in enumerate(indices):
            if sig_ids[global_i] in processed_ids:
                continue
            best_local = np.argmax(sim_matrix[local_i])
            best_global = indices[best_local]
            tasks.append((
                sig_ids[global_i],
                filenames[global_i],
                filenames[best_global],
                float(sim_matrix[local_i, best_local]),
            ))

    print(f" Tasks to process: {len(tasks)}")

    # --- Step 3: Compute SSIM/pHash in parallel ---
    print(f"\n[3/4] Computing SSIM & pHash ({len(tasks)} pairs, {NUM_WORKERS} workers)...")

    _ensure_result_columns(cur)
    conn.commit()

    total = len(tasks)
    done = 0
    batch_results = []

    with Pool(NUM_WORKERS) as pool:
        for result in pool.imap_unordered(compute_pair_ssim, tasks, chunksize=50):
            batch_results.append(result)
            done += 1

            # Flush every BATCH_SIZE results (and at the end) so an
            # interrupted run loses at most one batch.
            if done % BATCH_SIZE == 0 or done == total:
                for r in batch_results:
                    cur.execute('''
                        UPDATE signatures SET
                            ssim_to_closest = ?,
                            phash_distance_to_closest = ?,
                            histogram_corr_to_closest = ?,
                            pixel_identical_to_closest = ?,
                            closest_match_file = ?
                        WHERE signature_id = ?
                    ''', (
                        r['ssim'],
                        r['phash_distance'],
                        r['histogram_corr'],
                        1 if r['pixel_identical'] else 0,
                        r['match_file'],
                        r['signature_id'],
                    ))
                    processed_ids.add(r['signature_id'])
                conn.commit()
                save_checkpoint(processed_ids)
                batch_results = []

                elapsed = time.time() - start_time
                rate = done / elapsed
                eta = (total - done) / rate if rate > 0 else 0
                print(f" {done:,}/{total:,} ({100*done/total:.1f}%) "
                      f"| {rate:.1f} pairs/s | ETA: {eta/60:.1f} min")

    # --- Step 4: Generate complete CSV ---
    print("\n[4/4] Generating complete CSV...")

    cur.execute('''
        SELECT
            s.source_pdf,
            s.year_month,
            s.serial_number,
            s.doc_type,
            s.page_number,
            s.sig_index,
            s.image_filename,
            s.assigned_accountant,
            s.excel_accountant1,
            s.excel_accountant2,
            s.excel_firm,
            s.detection_confidence,
            s.signature_verdict,
            s.max_similarity_to_same_accountant,
            s.ssim_to_closest,
            s.phash_distance_to_closest,
            s.histogram_corr_to_closest,
            s.pixel_identical_to_closest,
            s.closest_match_file,
            a.risk_level,
            a.mean_similarity as acct_mean_similarity,
            a.ratio_gt_95 as acct_ratio_gt_95
        FROM signatures s
        LEFT JOIN accountants a ON s.assigned_accountant = a.name
        ORDER BY s.source_pdf, s.sig_index
    ''')

    # CSV header names mirror the SELECT column order above.
    columns = [
        'source_pdf', 'year_month', 'serial_number', 'doc_type',
        'page_number', 'sig_index', 'image_filename',
        'assigned_accountant', 'excel_accountant1', 'excel_accountant2', 'excel_firm',
        'detection_confidence', 'signature_verdict',
        'max_cosine_similarity', 'ssim_to_closest', 'phash_distance_to_closest',
        'histogram_corr_to_closest', 'pixel_identical_to_closest', 'closest_match_file',
        'accountant_risk_level', 'accountant_mean_similarity', 'accountant_ratio_gt_95'
    ]

    with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(columns)
        # Stream rows straight from the cursor to avoid materialising the
        # whole result set in memory.
        for row in cur:
            writer.writerow(row)

    # Summary counts for the final report.
    cur.execute('SELECT COUNT(*) FROM signatures')
    total_sigs = cur.fetchone()[0]
    cur.execute('SELECT COUNT(DISTINCT source_pdf) FROM signatures')
    total_pdfs = cur.fetchone()[0]

    conn.close()

    elapsed = time.time() - start_time
    print(f"\n{'='*70}")
    print(f"Complete!")
    print(f" Total signatures: {total_sigs:,}")
    print(f" Total PDFs: {total_pdfs:,}")
    print(f" Output: {OUTPUT_CSV}")
    print(f" Time: {elapsed/60:.1f} minutes")
    print(f"{'='*70}")

    # Clean up checkpoint: the run completed, so a later run starts fresh.
    if os.path.exists(CHECKPOINT_PATH):
        os.remove(CHECKPOINT_PATH)


if __name__ == '__main__':
    main()
|