Add Paper A (IEEE TAI) complete draft with Firm A-calibrated dual-method classification
Paper draft includes all sections (Abstract through Conclusion), 36 references, and supporting scripts. Key methodology: Cosine similarity + dHash dual-method verification with thresholds calibrated against known-replication firm (Firm A). Includes: - 8 section markdown files (paper_a_*.md) - Ablation study script (ResNet-50 vs VGG-16 vs EfficientNet-B0) - Recalibrated classification script (84,386 PDFs, 5-tier system) - Figure generation and Word export scripts - Citation renumbering script ([1]-[36]) - Signature analysis pipeline (12 steps) - YOLO extraction scripts Three rounds of AI review completed (GPT-5.4, Claude Opus 4.6, Gemini 3 Pro). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,319 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Compute SSIM and pHash for all signature pairs (closest match per accountant).
|
||||
Uses multiprocessing for parallel image loading and computation.
|
||||
Saves results to database and outputs complete CSV.
|
||||
"""
|
||||
|
||||
import sqlite3
|
||||
import numpy as np
|
||||
import cv2
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import csv
|
||||
import time
|
||||
from datetime import datetime
|
||||
from collections import defaultdict
|
||||
from multiprocessing import Pool, cpu_count
|
||||
from pathlib import Path
|
||||
|
||||
# --- Configuration -------------------------------------------------------
# SQLite database holding signature metadata and extracted feature vectors.
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
# Directory of cropped signature images (filenames referenced by the DB).
IMAGE_DIR = '/Volumes/NV2/PDF-Processing/yolo-signatures/images'
# Final per-signature report produced in step 4 of main().
OUTPUT_CSV = '/Volumes/NV2/PDF-Processing/signature-analysis/reports/complete_pdf_report.csv'
# JSON checkpoint so an interrupted run can resume without recomputing pairs.
CHECKPOINT_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/ssim_checkpoint.json'
NUM_WORKERS = max(1, cpu_count() - 2)  # Leave 2 cores free
# Results are flushed to the DB (and the checkpoint) every BATCH_SIZE pairs.
BATCH_SIZE = 1000
|
||||
|
||||
def compute_phash(img, hash_size=8):
    """Compute a difference hash (dHash) of a grayscale image.

    NOTE: despite the function name, this is a dHash, not a DCT-based
    pHash — each bit records whether a pixel is brighter than its left
    neighbour after downscaling to (hash_size + 1) x hash_size.

    Args:
        img: 2-D grayscale image (numpy array) as produced by cv2.imread
            with IMREAD_GRAYSCALE.
        hash_size: Side length of the hash grid; the hash has
            hash_size ** 2 bits.

    Returns:
        Flat boolean numpy array of hash_size ** 2 bits.  Compare two
        hashes with ``int(np.sum(h1 != h2))`` (Hamming distance).
    """
    # Width hash_size + 1 so each row yields hash_size horizontal
    # left/right comparisons.
    resized = cv2.resize(img, (hash_size + 1, hash_size))
    diff = resized[:, 1:] > resized[:, :-1]
    return diff.flatten()
|
||||
|
||||
|
||||
def compute_pair_ssim(args):
    """Compute image-level similarity metrics for one signature pair.

    Designed to run inside ``Pool.imap_unordered``: it takes a single
    tuple argument and never raises — on any failure the affected
    metrics are simply left at None/False.

    Args:
        args: Tuple ``(sig_id, file1, file2, cosine_sim)`` where the
            files are names under IMAGE_DIR and ``cosine_sim`` is the
            precomputed feature-vector similarity for the pair.

    Returns:
        Dict with keys ``signature_id``, ``match_file``,
        ``cosine_similarity``, ``ssim``, ``phash_distance``,
        ``histogram_corr``, ``pixel_identical``.
    """
    sig_id, file1, file2, cosine_sim = args

    path1 = os.path.join(IMAGE_DIR, file1)
    path2 = os.path.join(IMAGE_DIR, file2)

    # Start with all metrics unset; they are filled in progressively so
    # a failure partway through still returns whatever was computed.
    result = {
        'signature_id': sig_id,
        'match_file': file2,
        'cosine_similarity': cosine_sim,
        'ssim': None,
        'phash_distance': None,
        'histogram_corr': None,
        'pixel_identical': False,
    }

    try:
        img1 = cv2.imread(path1, cv2.IMREAD_GRAYSCALE)
        img2 = cv2.imread(path2, cv2.IMREAD_GRAYSCALE)

        if img1 is None or img2 is None:
            return result  # missing/unreadable image: leave metrics unset

        # Resize both images to the smaller common dimensions so every
        # metric compares pixel grids of identical shape.
        h = min(img1.shape[0], img2.shape[0])
        w = min(img1.shape[1], img2.shape[1])
        if h < 3 or w < 3:
            return result  # too small for a meaningful comparison

        img1_r = cv2.resize(img1, (w, h))
        img2_r = cv2.resize(img2, (w, h))

        # Exact byte-for-byte match after resizing — the strongest
        # possible evidence of a duplicated image.
        result['pixel_identical'] = bool(np.array_equal(img1_r, img2_r))

        # SSIM (optional: skipped cleanly when scikit-image is absent
        # or the images are too small for a valid window).
        try:
            from skimage.metrics import structural_similarity as ssim
            # SSIM needs an odd window no larger than the image.
            win_size = min(7, min(h, w))
            if win_size % 2 == 0:
                win_size -= 1
            if win_size >= 3:
                result['ssim'] = float(ssim(img1_r, img2_r, win_size=win_size))
            else:
                result['ssim'] = None
        except Exception:
            result['ssim'] = None

        # Grayscale histogram correlation (HISTCMP_CORREL: 1.0 = identical).
        hist1 = cv2.calcHist([img1_r], [0], None, [256], [0, 256])
        hist2 = cv2.calcHist([img2_r], [0], None, [256], [0, 256])
        result['histogram_corr'] = float(cv2.compareHist(hist1, hist2, cv2.HISTCMP_CORREL))

        # dHash Hamming distance (0 = identical hashes).
        h1 = compute_phash(img1_r)
        h2 = compute_phash(img2_r)
        result['phash_distance'] = int(np.sum(h1 != h2))

    except Exception:
        # Deliberate best-effort: a worker must never crash the pool.
        # None-valued metrics signal the failure to downstream analysis.
        pass

    return result
|
||||
|
||||
|
||||
def load_checkpoint():
    """Load the set of already-processed signature IDs, if any.

    Returns:
        Set of signature IDs previously recorded by save_checkpoint();
        an empty set when no checkpoint exists or the file is corrupt
        (e.g. a previous run was killed mid-write).
    """
    if os.path.exists(CHECKPOINT_PATH):
        try:
            with open(CHECKPOINT_PATH, 'r') as f:
                data = json.load(f)
            return set(data.get('processed_ids', []))
        except (json.JSONDecodeError, OSError):
            # Corrupt or unreadable checkpoint: recompute from scratch
            # rather than aborting the whole run.
            return set()
    return set()
|
||||
|
||||
|
||||
def save_checkpoint(processed_ids):
    """Atomically persist the set of processed signature IDs.

    Writes to a temp file and then os.replace()s it over
    CHECKPOINT_PATH, so a crash mid-write can never leave a truncated
    JSON checkpoint behind.

    Args:
        processed_ids: Iterable of signature IDs already saved to the DB.
    """
    tmp_path = CHECKPOINT_PATH + '.tmp'
    with open(tmp_path, 'w') as f:
        json.dump({'processed_ids': list(processed_ids), 'timestamp': str(datetime.now())}, f)
    os.replace(tmp_path, CHECKPOINT_PATH)  # atomic rename on POSIX
|
||||
|
||||
|
||||
def main():
    """Run the full SSIM/pHash pipeline: load, match, compute, export.

    Steps:
      1. Load signature feature vectors from the SQLite database.
      2. For each signature, find its closest cosine match among the
         other signatures assigned to the same accountant.
      3. Compute SSIM/pHash/histogram metrics for every pair in a
         multiprocessing pool, flushing results to the DB and to a
         resume checkpoint every BATCH_SIZE pairs.
      4. Export the complete per-signature report to OUTPUT_CSV, then
         delete the checkpoint.
    """
    start_time = time.time()
    print("=" * 70)
    print("SSIM & pHash Computation for All Signature Pairs")
    print(f"Workers: {NUM_WORKERS}")
    print("=" * 70)

    # --- Step 1: Load data ---
    print("\n[1/4] Loading data from database...")
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()

    cur.execute('''
        SELECT signature_id, image_filename, assigned_accountant, feature_vector
        FROM signatures
        WHERE feature_vector IS NOT NULL AND assigned_accountant IS NOT NULL
    ''')
    rows = cur.fetchall()

    sig_ids = []
    filenames = []
    accountants = []
    features = []

    for row in rows:
        sig_ids.append(row[0])
        filenames.append(row[1])
        accountants.append(row[2])
        # Feature vectors are stored as raw float32 bytes.
        features.append(np.frombuffer(row[3], dtype=np.float32))

    features = np.array(features)
    print(f" Loaded {len(sig_ids)} signatures")

    # --- Step 2: Find closest match per signature ---
    print("\n[2/4] Finding closest match per signature (same accountant)...")
    acct_groups = defaultdict(list)
    for i, acct in enumerate(accountants):
        acct_groups[acct].append(i)

    # Load checkpoint so a restarted run skips finished signatures.
    processed_ids = load_checkpoint()
    print(f" Checkpoint: {len(processed_ids)} already processed")

    # Prepare tasks: one (sig, closest same-accountant sig) pair each.
    tasks = []
    for acct, indices in acct_groups.items():
        if len(indices) < 2:
            continue  # need at least two signatures to form a pair
        vecs = features[indices]
        # Dot product used as cosine similarity — assumes the stored
        # feature vectors are L2-normalized; TODO confirm upstream.
        sim_matrix = vecs @ vecs.T
        np.fill_diagonal(sim_matrix, -1)  # Exclude self

        for local_i, global_i in enumerate(indices):
            if sig_ids[global_i] in processed_ids:
                continue
            best_local = np.argmax(sim_matrix[local_i])
            best_global = indices[best_local]
            best_sim = float(sim_matrix[local_i, best_local])
            tasks.append((
                sig_ids[global_i],
                filenames[global_i],
                filenames[best_global],
                best_sim
            ))

    print(f" Tasks to process: {len(tasks)}")

    # --- Step 3: Compute SSIM/pHash in parallel ---
    print(f"\n[3/4] Computing SSIM & pHash ({len(tasks)} pairs, {NUM_WORKERS} workers)...")

    # Add result columns to the signatures table if missing.  ALTER
    # TABLE raises sqlite3.OperationalError when the column already
    # exists, which we treat as "already migrated".
    new_columns = (
        'ssim_to_closest REAL',
        'phash_distance_to_closest INTEGER',
        'histogram_corr_to_closest REAL',
        'pixel_identical_to_closest INTEGER',
        'closest_match_file TEXT',
    )
    for col_def in new_columns:
        try:
            cur.execute(f'ALTER TABLE signatures ADD COLUMN {col_def}')
        except sqlite3.OperationalError:
            pass  # column already exists
    conn.commit()

    total = len(tasks)
    done = 0
    batch_results = []

    with Pool(NUM_WORKERS) as pool:
        for result in pool.imap_unordered(compute_pair_ssim, tasks, chunksize=50):
            batch_results.append(result)
            done += 1

            if done % BATCH_SIZE == 0 or done == total:
                # Save batch to database
                for r in batch_results:
                    cur.execute('''
                        UPDATE signatures SET
                            ssim_to_closest = ?,
                            phash_distance_to_closest = ?,
                            histogram_corr_to_closest = ?,
                            pixel_identical_to_closest = ?,
                            closest_match_file = ?
                        WHERE signature_id = ?
                    ''', (
                        r['ssim'],
                        r['phash_distance'],
                        r['histogram_corr'],
                        1 if r['pixel_identical'] else 0,
                        r['match_file'],
                        r['signature_id']
                    ))
                    processed_ids.add(r['signature_id'])
                conn.commit()
                save_checkpoint(processed_ids)
                batch_results = []

                # Progress/ETA, printed once per flushed batch.
                elapsed = time.time() - start_time
                rate = done / elapsed
                eta = (total - done) / rate if rate > 0 else 0
                print(f" {done:,}/{total:,} ({100*done/total:.1f}%) "
                      f"| {rate:.1f} pairs/s | ETA: {eta/60:.1f} min")

    # --- Step 4: Generate complete CSV ---
    print(f"\n[4/4] Generating complete CSV...")

    cur.execute('''
        SELECT
            s.source_pdf,
            s.year_month,
            s.serial_number,
            s.doc_type,
            s.page_number,
            s.sig_index,
            s.image_filename,
            s.assigned_accountant,
            s.excel_accountant1,
            s.excel_accountant2,
            s.excel_firm,
            s.detection_confidence,
            s.signature_verdict,
            s.max_similarity_to_same_accountant,
            s.ssim_to_closest,
            s.phash_distance_to_closest,
            s.histogram_corr_to_closest,
            s.pixel_identical_to_closest,
            s.closest_match_file,
            a.risk_level,
            a.mean_similarity as acct_mean_similarity,
            a.ratio_gt_95 as acct_ratio_gt_95
        FROM signatures s
        LEFT JOIN accountants a ON s.assigned_accountant = a.name
        ORDER BY s.source_pdf, s.sig_index
    ''')

    # CSV header — order must match the SELECT column order above.
    columns = [
        'source_pdf', 'year_month', 'serial_number', 'doc_type',
        'page_number', 'sig_index', 'image_filename',
        'assigned_accountant', 'excel_accountant1', 'excel_accountant2', 'excel_firm',
        'detection_confidence', 'signature_verdict',
        'max_cosine_similarity', 'ssim_to_closest', 'phash_distance_to_closest',
        'histogram_corr_to_closest', 'pixel_identical_to_closest', 'closest_match_file',
        'accountant_risk_level', 'accountant_mean_similarity', 'accountant_ratio_gt_95'
    ]

    with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(columns)
        # Stream rows straight from the cursor to keep memory flat.
        for row in cur:
            writer.writerow(row)

    # Count rows for the summary.
    cur.execute('SELECT COUNT(*) FROM signatures')
    total_sigs = cur.fetchone()[0]
    cur.execute('SELECT COUNT(DISTINCT source_pdf) FROM signatures')
    total_pdfs = cur.fetchone()[0]

    conn.close()

    elapsed = time.time() - start_time
    print(f"\n{'='*70}")
    print(f"Complete!")
    print(f" Total signatures: {total_sigs:,}")
    print(f" Total PDFs: {total_pdfs:,}")
    print(f" Output: {OUTPUT_CSV}")
    print(f" Time: {elapsed/60:.1f} minutes")
    print(f"{'='*70}")

    # Clean up checkpoint — the run finished, so resume data is stale.
    if os.path.exists(CHECKPOINT_PATH):
        os.remove(CHECKPOINT_PATH)
|
||||
|
||||
|
||||
# Entry point: run the full pipeline only when executed as a script.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user