Files
pdf_signature_extraction/signature_analysis/11_compute_ssim_phash.py
T
gbanyan 939a348da4 Add Paper A (IEEE TAI) complete draft with Firm A-calibrated dual-method classification
Paper draft includes all sections (Abstract through Conclusion), 36 references,
and supporting scripts. Key methodology: Cosine similarity + dHash dual-method
verification with thresholds calibrated against known-replication firm (Firm A).

Includes:
- 8 section markdown files (paper_a_*.md)
- Ablation study script (ResNet-50 vs VGG-16 vs EfficientNet-B0)
- Recalibrated classification script (84,386 PDFs, 5-tier system)
- Figure generation and Word export scripts
- Citation renumbering script ([1]-[36])
- Signature analysis pipeline (12 steps)
- YOLO extraction scripts

Three rounds of AI review completed (GPT-5.4, Claude Opus 4.6, Gemini 3 Pro).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-06 23:05:33 +08:00

320 lines
10 KiB
Python

#!/usr/bin/env python3
"""
Compute SSIM and pHash for all signature pairs (closest match per accountant).
Uses multiprocessing for parallel image loading and computation.
Saves results to database and outputs complete CSV.
"""
import sqlite3
import numpy as np
import cv2
import os
import sys
import json
import csv
import time
from datetime import datetime
from collections import defaultdict
from multiprocessing import Pool, cpu_count
from pathlib import Path
# --- Configuration: all paths live on the external "NV2" volume; the
# --- pipeline assumes it is mounted before the script runs.
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'  # SQLite DB with signature records + feature vectors
IMAGE_DIR = '/Volumes/NV2/PDF-Processing/yolo-signatures/images'  # cropped signature images, referenced by filename
OUTPUT_CSV = '/Volumes/NV2/PDF-Processing/signature-analysis/reports/complete_pdf_report.csv'  # final per-signature report
CHECKPOINT_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/ssim_checkpoint.json'  # resume file: list of processed signature IDs
NUM_WORKERS = max(1, cpu_count() - 2) # Leave 2 cores free
BATCH_SIZE = 1000  # pairs per DB commit / checkpoint flush
def compute_phash(img, hash_size=8):
    """Compute a difference hash (dHash) of a grayscale image.

    NOTE(review): despite the name, this is a dHash, not a DCT-based pHash —
    the image is resized to (hash_size+1) x hash_size and adjacent-column
    brightness gradients are compared.

    Returns a flat boolean array of hash_size*hash_size bits; compare two
    hashes with a Hamming distance (count of differing bits).
    """
    # hash_size+1 columns so that horizontal neighbours yield hash_size diffs per row.
    resized = cv2.resize(img, (hash_size + 1, hash_size))
    diff = resized[:, 1:] > resized[:, :-1]
    return diff.flatten()
def compute_pair_ssim(args):
    """Compute image-similarity metrics for one signature pair (pool worker).

    Args:
        args: tuple ``(sig_id, file1, file2, cosine_sim)``. Filenames are
            relative to IMAGE_DIR; ``cosine_sim`` is the precomputed feature
            similarity and is passed through to the result unchanged.

    Returns:
        dict with keys ``signature_id``, ``match_file``, ``cosine_similarity``,
        ``ssim``, ``phash_distance``, ``histogram_corr``, ``pixel_identical``.
        Metric values stay ``None`` (and ``pixel_identical`` stays ``False``)
        when an image is missing, unreadable, or too small to compare.
        Best-effort by design: one bad pair must never crash the pool run.
    """
    sig_id, file1, file2, cosine_sim = args
    result = {
        'signature_id': sig_id,
        'match_file': file2,
        'cosine_similarity': cosine_sim,
        'ssim': None,
        'phash_distance': None,
        'histogram_corr': None,
        'pixel_identical': False,
    }
    try:
        img1 = cv2.imread(os.path.join(IMAGE_DIR, file1), cv2.IMREAD_GRAYSCALE)
        img2 = cv2.imread(os.path.join(IMAGE_DIR, file2), cv2.IMREAD_GRAYSCALE)
        if img1 is None or img2 is None:
            return result
        # Compare at the smaller common size so SSIM/dHash are well-defined.
        h = min(img1.shape[0], img2.shape[0])
        w = min(img1.shape[1], img2.shape[1])
        if h < 3 or w < 3:  # too small for any meaningful comparison window
            return result
        img1_r = cv2.resize(img1, (w, h))
        img2_r = cv2.resize(img2, (w, h))
        # Exact-duplicate check (after resize, so it flags same-content crops).
        result['pixel_identical'] = bool(np.array_equal(img1_r, img2_r))
        # SSIM — skimage is imported lazily inside the worker so the pipeline
        # still runs (with ssim=None) when scikit-image is not installed.
        try:
            from skimage.metrics import structural_similarity as ssim
            # SSIM window must be odd and no larger than the image.
            win_size = min(7, min(h, w))
            if win_size % 2 == 0:
                win_size -= 1
            if win_size >= 3:
                result['ssim'] = float(ssim(img1_r, img2_r, win_size=win_size))
        except Exception:
            result['ssim'] = None  # missing dependency or SSIM failure: leave unset
        # Grayscale histogram correlation (256 bins).
        hist1 = cv2.calcHist([img1_r], [0], None, [256], [0, 256])
        hist2 = cv2.calcHist([img2_r], [0], None, [256], [0, 256])
        result['histogram_corr'] = float(cv2.compareHist(hist1, hist2, cv2.HISTCMP_CORREL))
        # Hamming distance between difference hashes (compute_phash is a dHash).
        h1 = compute_phash(img1_r)
        h2 = compute_phash(img2_r)
        result['phash_distance'] = int(np.sum(h1 != h2))
    except Exception:
        # Deliberate best-effort: any unexpected OpenCV/IO failure leaves the
        # metrics as None rather than killing the worker.
        pass
    return result
def load_checkpoint(path=None):
    """Return the set of signature IDs already processed in a previous run.

    Args:
        path: checkpoint JSON file to read; defaults to CHECKPOINT_PATH.
            (Parameter added for testability/reuse; existing callers that
            pass nothing keep the original behavior.)

    Returns:
        set of processed signature IDs; an empty set when no checkpoint
        file exists yet.
    """
    if path is None:
        path = CHECKPOINT_PATH
    if not os.path.exists(path):
        return set()
    with open(path, 'r') as f:
        data = json.load(f)
    return set(data.get('processed_ids', []))
def save_checkpoint(processed_ids, path=None):
    """Persist the set of processed signature IDs so a run can be resumed.

    Args:
        processed_ids: iterable of signature IDs completed so far.
        path: destination JSON file; defaults to CHECKPOINT_PATH.
            (Parameter added for testability/reuse; existing callers that
            pass nothing keep the original behavior.)

    Side effects:
        Overwrites ``path`` with ``{"processed_ids": [...], "timestamp": ...}``.
    """
    if path is None:
        path = CHECKPOINT_PATH
    payload = {'processed_ids': list(processed_ids), 'timestamp': str(datetime.now())}
    with open(path, 'w') as f:
        json.dump(payload, f)
def main():
    """End-to-end driver for the SSIM/pHash pass.

    For every signature with a stored feature vector: find its closest
    same-accountant match by feature similarity, compute image-level metrics
    (SSIM, dHash Hamming distance, histogram correlation, pixel identity)
    for that pair in a multiprocessing pool, write the metrics back to the
    ``signatures`` table, and export the complete per-signature CSV report.

    Resumable: processed signature IDs are checkpointed every BATCH_SIZE
    pairs; the checkpoint file is removed after a fully successful run.
    """
    start_time = time.time()
    print("=" * 70)
    print("SSIM & pHash Computation for All Signature Pairs")
    print(f"Workers: {NUM_WORKERS}")
    print("=" * 70)

    # --- Step 1: Load data ---
    print("\n[1/4] Loading data from database...")
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()
    cur.execute('''
        SELECT signature_id, image_filename, assigned_accountant, feature_vector
        FROM signatures
        WHERE feature_vector IS NOT NULL AND assigned_accountant IS NOT NULL
    ''')
    rows = cur.fetchall()
    sig_ids = []
    filenames = []
    accountants = []
    features = []
    for row in rows:
        sig_ids.append(row[0])
        filenames.append(row[1])
        accountants.append(row[2])
        # Feature vectors are stored as raw float32 bytes.
        features.append(np.frombuffer(row[3], dtype=np.float32))
    features = np.array(features)
    print(f" Loaded {len(sig_ids)} signatures")

    # --- Step 2: Find closest match per signature ---
    print("\n[2/4] Finding closest match per signature (same accountant)...")
    acct_groups = defaultdict(list)
    for i, acct in enumerate(accountants):
        acct_groups[acct].append(i)

    # Load checkpoint so an interrupted run resumes where it stopped.
    processed_ids = load_checkpoint()
    print(f" Checkpoint: {len(processed_ids)} already processed")

    # One task per unprocessed signature: its best same-accountant match.
    tasks = []
    for acct, indices in acct_groups.items():
        if len(indices) < 2:
            continue  # a pair needs at least two signatures
        vecs = features[indices]
        # NOTE(review): treating vecs @ vecs.T as cosine similarity assumes
        # the feature vectors were L2-normalized upstream — confirm.
        sim_matrix = vecs @ vecs.T
        np.fill_diagonal(sim_matrix, -1)  # Exclude self
        for local_i, global_i in enumerate(indices):
            if sig_ids[global_i] in processed_ids:
                continue  # already computed in a previous (checkpointed) run
            best_local = np.argmax(sim_matrix[local_i])
            best_global = indices[best_local]
            best_sim = float(sim_matrix[local_i, best_local])
            tasks.append((
                sig_ids[global_i],
                filenames[global_i],
                filenames[best_global],
                best_sim
            ))
    print(f" Tasks to process: {len(tasks)}")

    # --- Step 3: Compute SSIM/pHash in parallel ---
    print(f"\n[3/4] Computing SSIM & pHash ({len(tasks)} pairs, {NUM_WORKERS} workers)...")
    # SQLite has no "ADD COLUMN IF NOT EXISTS": a duplicate-column ALTER
    # raises sqlite3.OperationalError, which we treat as "already present".
    # (Was five bare `except:` clauses, which would also swallow
    # KeyboardInterrupt/SystemExit.)
    for column_def in (
        'ssim_to_closest REAL',
        'phash_distance_to_closest INTEGER',
        'histogram_corr_to_closest REAL',
        'pixel_identical_to_closest INTEGER',
        'closest_match_file TEXT',
    ):
        try:
            cur.execute(f'ALTER TABLE signatures ADD COLUMN {column_def}')
        except sqlite3.OperationalError:
            pass  # column already exists from a previous run
    conn.commit()

    total = len(tasks)
    done = 0
    batch_results = []
    with Pool(NUM_WORKERS) as pool:
        # imap_unordered keeps memory flat and lets fast pairs finish first.
        for result in pool.imap_unordered(compute_pair_ssim, tasks, chunksize=50):
            batch_results.append(result)
            done += 1
            if done % BATCH_SIZE == 0 or done == total:
                # Flush the batch to the database, then checkpoint progress.
                for r in batch_results:
                    cur.execute('''
                        UPDATE signatures SET
                            ssim_to_closest = ?,
                            phash_distance_to_closest = ?,
                            histogram_corr_to_closest = ?,
                            pixel_identical_to_closest = ?,
                            closest_match_file = ?
                        WHERE signature_id = ?
                    ''', (
                        r['ssim'],
                        r['phash_distance'],
                        r['histogram_corr'],
                        1 if r['pixel_identical'] else 0,  # SQLite has no BOOLEAN type
                        r['match_file'],
                        r['signature_id']
                    ))
                    processed_ids.add(r['signature_id'])
                conn.commit()
                save_checkpoint(processed_ids)
                batch_results = []
                elapsed = time.time() - start_time
                rate = done / elapsed
                eta = (total - done) / rate if rate > 0 else 0
                print(f" {done:,}/{total:,} ({100*done/total:.1f}%) "
                      f"| {rate:.1f} pairs/s | ETA: {eta/60:.1f} min")

    # --- Step 4: Generate complete CSV ---
    print(f"\n[4/4] Generating complete CSV...")
    cur.execute('''
        SELECT
            s.source_pdf,
            s.year_month,
            s.serial_number,
            s.doc_type,
            s.page_number,
            s.sig_index,
            s.image_filename,
            s.assigned_accountant,
            s.excel_accountant1,
            s.excel_accountant2,
            s.excel_firm,
            s.detection_confidence,
            s.signature_verdict,
            s.max_similarity_to_same_accountant,
            s.ssim_to_closest,
            s.phash_distance_to_closest,
            s.histogram_corr_to_closest,
            s.pixel_identical_to_closest,
            s.closest_match_file,
            a.risk_level,
            a.mean_similarity as acct_mean_similarity,
            a.ratio_gt_95 as acct_ratio_gt_95
        FROM signatures s
        LEFT JOIN accountants a ON s.assigned_accountant = a.name
        ORDER BY s.source_pdf, s.sig_index
    ''')
    # CSV headers mirror the SELECT order above (max_similarity_to_same_accountant
    # is exported as max_cosine_similarity).
    columns = [
        'source_pdf', 'year_month', 'serial_number', 'doc_type',
        'page_number', 'sig_index', 'image_filename',
        'assigned_accountant', 'excel_accountant1', 'excel_accountant2', 'excel_firm',
        'detection_confidence', 'signature_verdict',
        'max_cosine_similarity', 'ssim_to_closest', 'phash_distance_to_closest',
        'histogram_corr_to_closest', 'pixel_identical_to_closest', 'closest_match_file',
        'accountant_risk_level', 'accountant_mean_similarity', 'accountant_ratio_gt_95'
    ]
    with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(columns)
        writer.writerows(cur)  # stream rows straight from the cursor

    # Summary counts for the final report.
    cur.execute('SELECT COUNT(*) FROM signatures')
    total_sigs = cur.fetchone()[0]
    cur.execute('SELECT COUNT(DISTINCT source_pdf) FROM signatures')
    total_pdfs = cur.fetchone()[0]
    conn.close()

    elapsed = time.time() - start_time
    print(f"\n{'='*70}")
    print(f"Complete!")
    print(f" Total signatures: {total_sigs:,}")
    print(f" Total PDFs: {total_pdfs:,}")
    print(f" Output: {OUTPUT_CSV}")
    print(f" Time: {elapsed/60:.1f} minutes")
    print(f"{'='*70}")

    # A fully completed run makes the checkpoint stale — remove it.
    if os.path.exists(CHECKPOINT_PATH):
        os.remove(CHECKPOINT_PATH)
# Entry-point guard: required so multiprocessing workers (spawn start method,
# e.g. on macOS/Windows) can import this module without re-running the pipeline.
if __name__ == '__main__':
    main()