939a348da4
Paper draft includes all sections (Abstract through Conclusion), 36 references, and supporting scripts. Key methodology: Cosine similarity + dHash dual-method verification with thresholds calibrated against known-replication firm (Firm A). Includes: - 8 section markdown files (paper_a_*.md) - Ablation study script (ResNet-50 vs VGG-16 vs EfficientNet-B0) - Recalibrated classification script (84,386 PDFs, 5-tier system) - Figure generation and Word export scripts - Citation renumbering script ([1]-[36]) - Signature analysis pipeline (12 steps) - YOLO extraction scripts Three rounds of AI review completed (GPT-5.4, Claude Opus 4.6, Gemini 3 Pro). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1214 lines
49 KiB
Python
1214 lines
49 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Formal Statistical Analysis for Signature Verification Threshold Validation
|
||
============================================================================
|
||
Produces a complete statistical report covering:
|
||
1. Normality tests
|
||
2. Intra-class vs Inter-class separation
|
||
3. Hypothesis testing
|
||
4. SSIM pixel-level verification
|
||
5. Distribution fitting
|
||
6. Per-accountant intra-class variance
|
||
|
||
Output: /Volumes/NV2/PDF-Processing/signature-analysis/reports/formal_statistical_report.md
|
||
"""
|
||
|
||
import sqlite3
|
||
import numpy as np
|
||
import os
|
||
import sys
|
||
import json
|
||
import warnings
|
||
from datetime import datetime
|
||
from collections import defaultdict
|
||
from pathlib import Path
|
||
|
||
# Silence all warnings globally (scipy/sklearn emit many during mass fitting).
# NOTE(review): blanket filter also hides genuine numerical warnings — consider
# narrowing to specific categories; confirm this is intentional.
warnings.filterwarnings('ignore')

# --- Configuration ---
# Absolute paths to the analysis database, extracted signature crops, and
# report output tree (external volume; script assumes it is mounted).
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
IMAGE_DIR = '/Volumes/NV2/PDF-Processing/yolo-signatures/images'
REPORT_DIR = '/Volumes/NV2/PDF-Processing/signature-analysis/reports'
FIGURE_DIR = os.path.join(REPORT_DIR, 'figures')
os.makedirs(FIGURE_DIR, exist_ok=True)  # import-time side effect: ensure figure dir exists

# Sampling parameters
INTER_CLASS_SAMPLE_SIZE = 500_000  # random cross-accountant pairs to draw
INTRA_CLASS_MIN_SIGNATURES = 3  # Minimum signatures per accountant for intra-class
SSIM_SAMPLE_SIZE = 500  # Number of high-similarity pairs to check with SSIM
RANDOM_SEED = 42

# Fixed global seed so every sampling step below is reproducible run-to-run.
np.random.seed(RANDOM_SEED)
|
||
|
||
|
||
def load_data():
    """Load signatures, feature vectors, and accountant metadata from SQLite.

    Only rows with a non-NULL feature vector AND a non-NULL accountant
    assignment are loaded. Feature vectors are stored as raw float32 bytes.

    Returns:
        dict with keys:
            sig_ids, filenames, accountants, verdicts: parallel per-signature lists
            features: (N, D) float32 matrix of embedding vectors
            max_sims: (N,) float array; SQL NULLs mapped to NaN
            accountant_info: name -> {count, firm, risk_level,
                                      mean_similarity, ratio_gt_95}
    """
    print("[1/8] Loading data from database...")
    conn = sqlite3.connect(DB_PATH)
    try:
        cur = conn.cursor()

        # Load signatures with accountant assignments.
        cur.execute('''
            SELECT signature_id, image_filename, assigned_accountant, feature_vector,
                   max_similarity_to_same_accountant, signature_verdict
            FROM signatures
            WHERE feature_vector IS NOT NULL AND assigned_accountant IS NOT NULL
        ''')
        rows = cur.fetchall()

        sig_ids = [r[0] for r in rows]
        filenames = [r[1] for r in rows]
        accountants = [r[2] for r in rows]
        # BLOB -> float32 vector; np.array stacks them into an (N, D) matrix.
        features = np.array([np.frombuffer(r[3], dtype=np.float32) for r in rows])
        # SQL NULL -> NaN so downstream numpy masking (~np.isnan) works.
        max_sims = np.array([r[4] if r[4] is not None else np.nan for r in rows])
        verdicts = [r[5] for r in rows]

        # Load accountant info keyed by name.
        cur.execute('SELECT name, signature_count, firm, risk_level, mean_similarity, ratio_gt_95 FROM accountants')
        accountant_info = {
            r[0]: {
                'count': r[1], 'firm': r[2], 'risk_level': r[3],
                'mean_similarity': r[4], 'ratio_gt_95': r[5],
            }
            for r in cur.fetchall()
        }
    finally:
        # Close even when a query raises (original leaked the handle on error).
        conn.close()

    print(f" Loaded {len(sig_ids)} signatures from {len(set(accountants))} accountants")
    print(f" Feature matrix shape: {features.shape}")

    return {
        'sig_ids': sig_ids, 'filenames': filenames, 'accountants': accountants,
        'features': features, 'max_sims': max_sims, 'verdicts': verdicts,
        'accountant_info': accountant_info
    }
|
||
|
||
|
||
def compute_intra_inter_class(data):
    """Build intra-class and inter-class cosine-similarity distributions.

    Intra-class: every pairwise similarity among signatures of the same
    accountant (accountants with at least INTRA_CLASS_MIN_SIGNATURES only).
    Inter-class: randomly sampled pairs of signatures from two *different*
    accountants.

    Args:
        data: dict from load_data() providing 'accountants' (list of names)
            and 'features' ((N, D) L2-normalized matrix, so dot == cosine).

    Returns:
        (intra_sims, inter_sims, intra_per_acct, valid_accountants) where
        intra_per_acct maps accountant -> summary stats of its pairwise sims.
    """
    print("[2/8] Computing intra-class and inter-class similarity distributions...")

    accountants = data['accountants']
    features = data['features']

    # Group signature row indices by accountant.
    acct_groups = defaultdict(list)
    for i, acct in enumerate(accountants):
        acct_groups[acct].append(i)

    # --- Intra-class: pairwise similarities within same accountant ---
    intra_sims = []
    intra_per_acct = {}
    valid_accountants = []

    for acct, indices in acct_groups.items():
        if len(indices) < INTRA_CLASS_MIN_SIGNATURES:
            continue
        valid_accountants.append(acct)
        vecs = features[indices]
        # Cosine similarity = dot product (vectors are L2-normalized).
        sim_matrix = vecs @ vecs.T
        n = len(indices)
        # Upper triangle (k=1): excludes self-similarity and duplicate pairs.
        pair_sims = sim_matrix[np.triu_indices(n, k=1)]
        intra_sims.extend(pair_sims.tolist())
        intra_per_acct[acct] = {
            'n_signatures': n,
            'n_pairs': len(pair_sims),
            'mean': float(np.mean(pair_sims)),
            'std': float(np.std(pair_sims)),
            'min': float(np.min(pair_sims)),
            'max': float(np.max(pair_sims)),
            'median': float(np.median(pair_sims)),
            'q25': float(np.percentile(pair_sims, 25)),
            'q75': float(np.percentile(pair_sims, 75)),
        }

    intra_sims = np.array(intra_sims)
    print(f" Intra-class: {len(intra_sims):,} pairs from {len(valid_accountants)} accountants")

    # --- Inter-class: random pairs from different accountants ---
    # Vectorized sampling. The original drew each pair in a Python loop
    # (two np.random.choice calls + one dot product per pair), which is very
    # slow at 500k samples; this draws all pairs with array ops instead.
    all_acct_list = list(acct_groups.keys())
    n_samples = min(INTER_CLASS_SAMPLE_SIZE, len(features) * 10)

    if len(all_acct_list) < 2 or n_samples == 0:
        # Fewer than two accountants: no inter-class pair exists
        # (the original crashed inside np.random.choice here).
        inter_sims = np.array([])
    else:
        group_arrays = [np.asarray(acct_groups[a]) for a in all_acct_list]
        group_sizes = np.array([len(g) for g in group_arrays])

        # Draw accountant index pairs; redraw the second index on collisions
        # so the two accountants of a pair are always distinct.
        a1 = np.random.randint(0, len(group_arrays), n_samples)
        a2 = np.random.randint(0, len(group_arrays), n_samples)
        clash = a1 == a2
        while clash.any():
            a2[clash] = np.random.randint(0, len(group_arrays), int(clash.sum()))
            clash = a1 == a2

        # Pick one random signature from each chosen accountant
        # (randint accepts an array-valued upper bound).
        m1 = np.random.randint(0, group_sizes[a1])
        m2 = np.random.randint(0, group_sizes[a2])
        i1 = np.fromiter((group_arrays[a][m] for a, m in zip(a1, m1)),
                         dtype=np.int64, count=n_samples)
        i2 = np.fromiter((group_arrays[a][m] for a, m in zip(a2, m2)),
                         dtype=np.int64, count=n_samples)

        # Row-wise dot products of the paired (L2-normalized) vectors.
        inter_sims = np.einsum('ij,ij->i', features[i1], features[i2]).astype(np.float64)

    print(f" Inter-class: {len(inter_sims):,} sampled pairs")

    return intra_sims, inter_sims, intra_per_acct, valid_accountants
|
||
|
||
|
||
def normality_tests(intra_sims, inter_sims):
    """Run normality tests on the intra- and inter-class distributions.

    For each distribution computes descriptive moments plus three tests:
    Shapiro-Wilk (on at most 5000 points), Kolmogorov-Smirnov against a
    normal with the sample's own mean/std, and D'Agostino-Pearson.

    Args:
        intra_sims, inter_sims: 1-D arrays of cosine similarities.

    Returns:
        dict with 'intra_class' and 'inter_class' entries; each holds the
        moments, the per-test statistic/p-value pairs, and a combined
        'is_normal_alpha_005' flag (SW and KS both non-significant).
    """
    print("[3/8] Running normality tests...")
    from scipy import stats

    results = {}

    for name, data_arr in [('intra_class', intra_sims), ('inter_class', inter_sims)]:
        # Subsample for all tests to avoid memory/time issues.
        MAX_SAMPLE = 50000
        if len(data_arr) > MAX_SAMPLE:
            sample = np.random.choice(data_arr, MAX_SAMPLE, replace=False)
        else:
            sample = data_arr

        # Shapiro-Wilk (max 5000). When no subsampling happened above this is
        # simply the first 5000 elements — fine for the shuffled case, but a
        # possible ordering bias otherwise; TODO confirm inputs are unordered.
        sw_sample = sample[:5000]
        sw_stat, sw_p = stats.shapiro(sw_sample)

        # Kolmogorov-Smirnov against a normal parameterized from the sample
        # itself. NOTE: estimating the parameters from the same data makes
        # the KS p-value optimistic (Lilliefors-type caveat).
        ks_stat, ks_p = stats.kstest(sample, 'norm',
                                     args=(np.mean(sample), np.std(sample)))

        # D'Agostino-Pearson (raises ValueError for very small samples).
        # The original used a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit — narrowed to the actual failure mode.
        try:
            dp_stat, dp_p = stats.normaltest(data_arr)
        except ValueError:
            dp_stat, dp_p = np.nan, np.nan

        # Skewness and kurtosis on the full array.
        skew = float(stats.skew(data_arr))
        kurt = float(stats.kurtosis(data_arr))

        results[name] = {
            'n': len(data_arr),
            'mean': float(np.mean(data_arr)),
            'std': float(np.std(data_arr)),
            'median': float(np.median(data_arr)),
            'skewness': skew,
            'kurtosis': kurt,
            'shapiro_wilk': {'statistic': float(sw_stat), 'p_value': float(sw_p)},
            'kolmogorov_smirnov': {'statistic': float(ks_stat), 'p_value': float(ks_p)},
            'dagostino_pearson': {'statistic': float(dp_stat), 'p_value': float(dp_p)},
            'is_normal_alpha_005': sw_p > 0.05 and ks_p > 0.05,
        }

        print(f" {name}: skew={skew:.4f}, kurt={kurt:.4f}, "
              f"SW p={sw_p:.2e}, KS p={ks_p:.2e}")

    return results
|
||
|
||
|
||
def hypothesis_tests(intra_sims, inter_sims):
    """Test whether intra-class similarities exceed inter-class similarities.

    Runs Mann-Whitney U (one-sided), Welch's t-test, Cohen's d, Fisher's
    discriminant ratio, a two-sample K-S test, a 95% CI for the difference
    in means, and locates the KDE density crossover point(s) that can serve
    as a data-driven decision threshold.

    Args:
        intra_sims, inter_sims: 1-D arrays of cosine similarities.

    Returns:
        dict keyed by test name with statistics, p-values and flags.
    """
    print("[4/8] Running hypothesis tests...")
    from scipy import stats

    # Cap sample size for the expensive rank-based tests. The draw is without
    # replacement (a full permutation when the cap is not reached).
    cap = 100000
    intra_sample = np.random.choice(intra_sims, min(cap, len(intra_sims)), replace=False)
    inter_sample = np.random.choice(inter_sims, min(cap, len(inter_sims)), replace=False)

    results = {}

    # Non-parametric location test, one-sided: intra should dominate inter.
    mwu_stat, mwu_p = stats.mannwhitneyu(intra_sample, inter_sample, alternative='greater')
    results['mann_whitney_u'] = {
        'statistic': float(mwu_stat),
        'p_value': float(mwu_p),
        'alternative': 'intra > inter (one-sided)',
        'significant_005': mwu_p < 0.05,
    }
    print(f" Mann-Whitney U: stat={mwu_stat:.2e}, p={mwu_p:.2e}")

    # Parametric mean comparison without the equal-variance assumption.
    w_stat, w_p = stats.ttest_ind(intra_sample, inter_sample, equal_var=False)
    results['welch_t_test'] = {
        'statistic': float(w_stat),
        'p_value': float(w_p),
        'significant_005': w_p < 0.05,
    }
    print(f" Welch's t-test: t={w_stat:.4f}, p={w_p:.2e}")

    # Cohen's d on the FULL arrays (pooled std over the two classes).
    spread = np.sqrt((np.std(intra_sims)**2 + np.std(inter_sims)**2) / 2)
    effect = (np.mean(intra_sims) - np.mean(inter_sims)) / spread
    for cutoff, label in ((0.2, 'negligible'), (0.5, 'small'),
                          (0.8, 'medium'), (float('inf'), 'large')):
        if abs(effect) < cutoff:
            break
    results['cohens_d'] = {'value': float(effect), 'interpretation': label}
    print(f" Cohen's d: {effect:.4f} ({results['cohens_d']['interpretation']})")

    # Fisher's discriminant ratio: squared mean gap over summed variances.
    fdr = (np.mean(intra_sims) - np.mean(inter_sims))**2 / (
        np.var(intra_sims) + np.var(inter_sims))
    results['fisher_discriminant_ratio'] = float(fdr)
    print(f" Fisher's Discriminant Ratio: {fdr:.4f}")

    # Distribution-shape comparison (two-sample Kolmogorov-Smirnov).
    d_stat, d_p = stats.ks_2samp(intra_sample, inter_sample)
    results['ks_2sample'] = {
        'statistic': float(d_stat),
        'p_value': float(d_p),
        'significant_005': d_p < 0.05,
    }
    print(f" K-S 2-sample: D={d_stat:.4f}, p={d_p:.2e}")

    # Normal-approximation 95% CI for the difference in means (full arrays).
    delta = np.mean(intra_sims) - np.mean(inter_sims)
    delta_se = np.sqrt(np.var(intra_sims) / len(intra_sims)
                       + np.var(inter_sims) / len(inter_sims))
    lo = delta - 1.96 * delta_se
    hi = delta + 1.96 * delta_se
    results['mean_difference'] = {
        'difference': float(delta),
        'se': float(delta_se),
        'ci_95_lower': float(lo),
        'ci_95_upper': float(hi),
    }
    print(f" Mean difference: {delta:.4f} (95% CI: [{lo:.4f}, {hi:.4f}])")

    # Candidate thresholds: where the two KDE density curves cross.
    from scipy.stats import gaussian_kde
    grid = np.linspace(0.3, 1.0, 10000)
    density_gap = gaussian_kde(intra_sample)(grid) - gaussian_kde(inter_sample)(grid)
    crossings = grid[np.where(np.diff(np.sign(density_gap)))[0]]
    results['kde_crossover_thresholds'] = [float(x) for x in crossings]
    print(f" KDE crossover points: {crossings}")

    return results
|
||
|
||
|
||
def distribution_fitting(intra_sims, inter_sims):
    """Fit candidate distributions to each class and rank them by AIC.

    Fits norm/beta/skewnorm/lognorm/gamma to a (subsampled) copy of each
    class, records K-S goodness of fit, log-likelihood, AIC and BIC, and
    stores the AIC-best distribution name under '<class>_best_fit'.

    Args:
        intra_sims, inter_sims: 1-D arrays of cosine similarities.

    Returns:
        dict: '<class>_<dist>' -> fit record (or {'error': ...} on failure),
        plus '<class>_best_fit' -> distribution name when any fit succeeded.
    """
    print("[5/8] Fitting distributions...")
    from scipy import stats as sp_stats

    distributions = {
        'norm': sp_stats.norm,
        'beta': sp_stats.beta,
        'skewnorm': sp_stats.skewnorm,
        'lognorm': sp_stats.lognorm,
        'gamma': sp_stats.gamma,
    }

    # Subsample for fitting (very expensive on full data).
    FIT_SAMPLE = 100000
    intra_sub = np.random.choice(intra_sims, min(FIT_SAMPLE, len(intra_sims)), replace=False)
    inter_sub = np.random.choice(inter_sims, min(FIT_SAMPLE, len(inter_sims)), replace=False)

    results = {}

    # Plain nested loops replace the original's hard-to-read cross-product
    # comprehension; every (class, distribution) pair is still fitted.
    for name, data_arr in (('intra_class', intra_sub), ('inter_class', inter_sub)):
        for dist_name, dist in distributions.items():
            key = f"{name}_{dist_name}"
            try:
                params = dist.fit(data_arr)
                # K-S goodness of fit against the fitted parameters.
                ks_stat, ks_p = sp_stats.kstest(data_arr, dist_name, args=params)
                # Log-likelihood under the fitted model.
                ll = np.sum(dist.logpdf(data_arr, *params))
                # AIC / BIC penalize by the number of fitted parameters.
                k = len(params)
                n = len(data_arr)
                aic = 2 * k - 2 * ll
                bic = k * np.log(n) - 2 * ll

                results[key] = {
                    'distribution': dist_name,
                    'data': name,
                    'params': [float(p) for p in params],
                    'ks_statistic': float(ks_stat),
                    'ks_p_value': float(ks_p),
                    'log_likelihood': float(ll),
                    'aic': float(aic),
                    'bic': float(bic),
                    'n_params': k,
                }
            except Exception as e:
                # Record (not swallow) the failure so the report can show it.
                results[key] = {'distribution': dist_name, 'data': name, 'error': str(e)}

    # Find the AIC-best fit for each class. The original filtered with
    # `k.startswith(cls) and 'aic' in v`, where the containment test ran as a
    # SUBSTRING check once the '*_best_fit' string entries were added —
    # filter on record type and tag instead.
    for cls in ['intra_class', 'inter_class']:
        fits = [(k, v) for k, v in results.items()
                if isinstance(v, dict) and v.get('data') == cls and 'aic' in v]
        if fits:
            best = min(fits, key=lambda x: x[1]['aic'])
            results[f'{cls}_best_fit'] = best[1]['distribution']
            print(f" {cls} best fit: {best[1]['distribution']} (AIC={best[1]['aic']:.2f})")

    return results
|
||
|
||
|
||
def ssim_analysis(data):
    """Pixel-level verification of high-similarity pairs (SSIM + hashes).

    For up to SSIM_SAMPLE_SIZE signatures whose stored max-similarity exceeds
    0.95, loads the image and its closest same-accountant match, then checks:
    exact pixel equality, SSIM, grayscale-histogram correlation, and a
    difference-hash comparison.

    Args:
        data: dict from load_data() ('features', 'filenames', 'max_sims',
            'accountants').

    Returns:
        dict of counters, per-pair records and summary statistics; empty dict
        when cv2 is unavailable or no high-similarity signatures exist.
    """
    print("[6/8] Running SSIM pixel-level analysis...")

    # The original wrapped both imports in one try and only set `ssim = None`
    # on failure — if cv2 itself was missing, every later cv2.* call raised
    # NameError. Handle the two dependencies separately.
    try:
        import cv2
    except ImportError:
        print(" WARNING: cv2 not available, skipping SSIM analysis")
        return {}
    try:
        from skimage.metrics import structural_similarity as ssim
    except ImportError:
        print(" WARNING: skimage not available, using cv2 only")
        ssim = None

    features = data['features']
    filenames = data['filenames']
    max_sims = data['max_sims']

    # Indices of signatures whose stored same-accountant similarity is high.
    valid_mask = ~np.isnan(max_sims) & (max_sims > 0.95)
    high_sim_indices = np.where(valid_mask)[0]

    if len(high_sim_indices) == 0:
        print(" No high-similarity pairs found")
        return {}

    # Oversample (x2) because missing/unreadable images are skipped below.
    np.random.shuffle(high_sim_indices)
    sample_indices = high_sim_indices[:SSIM_SAMPLE_SIZE * 2]

    results = {
        'n_analyzed': 0,
        'pixel_identical': 0,
        'ssim_scores': [],
        'histogram_corr_scores': [],
        'phash_identical': 0,
        'ssim_above_099': 0,
        'ssim_above_095': 0,
        'ssim_above_090': 0,
        'pairs_analyzed': [],
    }

    def phash(img, hash_size=8):
        """Difference hash: sign of adjacent-column brightness differences.

        (Named pHash throughout the outputs, but this is a dHash; hoisted out
        of the loop — the original re-defined it every iteration.)
        """
        resized = cv2.resize(img, (hash_size + 1, hash_size))
        diff = resized[:, 1:] > resized[:, :-1]
        return diff.flatten()

    analyzed = 0
    for idx in sample_indices:
        if analyzed >= SSIM_SAMPLE_SIZE:
            break

        img_path = os.path.join(IMAGE_DIR, filenames[idx])
        if not os.path.exists(img_path):
            continue

        img1 = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if img1 is None:
            continue

        # Find most similar signature from the same accountant.
        acct = data['accountants'][idx]
        acct_indices = [i for i, a in enumerate(data['accountants'])
                        if a == acct and i != idx]
        if not acct_indices:
            continue

        # Recompute similarities to locate the closest match.
        sims = features[acct_indices] @ features[idx]
        best_match_local = np.argmax(sims)
        best_match_idx = acct_indices[best_match_local]
        best_sim = float(sims[best_match_local])

        if best_sim < 0.95:
            continue

        img2_path = os.path.join(IMAGE_DIR, filenames[best_match_idx])
        if not os.path.exists(img2_path):
            continue

        img2 = cv2.imread(img2_path, cv2.IMREAD_GRAYSCALE)
        if img2 is None:
            continue

        # Resize both to the smaller common dimensions for comparison.
        h = min(img1.shape[0], img2.shape[0])
        w = min(img1.shape[1], img2.shape[1])
        img1_r = cv2.resize(img1, (w, h))
        img2_r = cv2.resize(img2, (w, h))

        # Check 1: exact pixel match.
        is_identical = np.array_equal(img1_r, img2_r)
        if is_identical:
            results['pixel_identical'] += 1

        # Check 2: SSIM (window must be odd and >= 3, <= image size).
        if ssim is not None and w >= 7 and h >= 7:
            win_size = min(7, min(h, w))
            if win_size % 2 == 0:
                win_size -= 1
            if win_size >= 3:
                ssim_score = ssim(img1_r, img2_r, win_size=win_size)
            else:
                ssim_score = float(np.nan)
        else:
            ssim_score = float(np.nan)

        results['ssim_scores'].append(float(ssim_score))
        if not np.isnan(ssim_score):
            if ssim_score > 0.99:
                results['ssim_above_099'] += 1
            if ssim_score > 0.95:
                results['ssim_above_095'] += 1
            if ssim_score > 0.90:
                results['ssim_above_090'] += 1

        # Check 3: grayscale histogram correlation.
        hist1 = cv2.calcHist([img1_r], [0], None, [256], [0, 256])
        hist2 = cv2.calcHist([img2_r], [0], None, [256], [0, 256])
        hist_corr = cv2.compareHist(hist1, hist2, cv2.HISTCMP_CORREL)
        results['histogram_corr_scores'].append(float(hist_corr))

        # Check 4: perceptual (difference) hash comparison.
        h1 = phash(img1_r)
        h2 = phash(img2_r)
        hamming_dist = np.sum(h1 != h2)
        if hamming_dist == 0:
            results['phash_identical'] += 1

        results['pairs_analyzed'].append({
            'file1': filenames[idx],
            'file2': filenames[best_match_idx],
            'cosine_similarity': best_sim,
            'ssim': float(ssim_score),
            'histogram_corr': float(hist_corr),
            'pixel_identical': bool(is_identical),
            'phash_hamming_distance': int(hamming_dist),
        })

        analyzed += 1
        if analyzed % 100 == 0:
            print(f" Analyzed {analyzed}/{SSIM_SAMPLE_SIZE} pairs...")

    results['n_analyzed'] = analyzed

    # Summary statistics over the valid (non-NaN) SSIM scores.
    ssim_arr = [s for s in results['ssim_scores'] if not np.isnan(s)]
    if ssim_arr:
        results['ssim_mean'] = float(np.mean(ssim_arr))
        results['ssim_std'] = float(np.std(ssim_arr))
        results['ssim_median'] = float(np.median(ssim_arr))
        results['ssim_min'] = float(np.min(ssim_arr))
        results['ssim_max'] = float(np.max(ssim_arr))

    hist_arr = results['histogram_corr_scores']
    if hist_arr:
        results['hist_corr_mean'] = float(np.mean(hist_arr))
        results['hist_corr_std'] = float(np.std(hist_arr))

    print(f" SSIM analysis complete: {analyzed} pairs analyzed")
    print(f" Pixel-identical: {results['pixel_identical']}")
    print(f" SSIM > 0.99: {results['ssim_above_099']}")
    print(f" SSIM > 0.95: {results['ssim_above_095']}")
    print(f" pHash identical: {results['phash_identical']}")

    return results
|
||
|
||
|
||
def per_accountant_analysis(intra_per_acct, data):
    """Summarize intra-class variance accountant by accountant.

    Args:
        intra_per_acct: accountant -> per-accountant pairwise-similarity stats
            (as produced by compute_intra_inter_class).
        data: dict providing 'accountant_info' for firm/risk enrichment.

    Returns:
        dict with the enriched per-accountant records (sorted by descending
        mean similarity), aggregate 'overall' statistics, and Pearson
        correlations between sample size and the per-accountant mean/std.
    """
    print("[7/8] Computing per-accountant statistics...")
    from scipy import stats

    results = {
        'total_accountants': len(intra_per_acct),
        'accountants': [],
        'overall': {},
    }

    stds = []
    means = []
    ranges = []
    n_sigs = []

    for acct, stats_dict in intra_per_acct.items():
        stds.append(stats_dict['std'])
        means.append(stats_dict['mean'])
        ranges.append(stats_dict['max'] - stats_dict['min'])
        n_sigs.append(stats_dict['n_signatures'])

        # Enrich with firm/risk metadata when available; blank otherwise.
        info = data['accountant_info'].get(acct, {})
        results['accountants'].append({
            'name': acct,
            'n_signatures': stats_dict['n_signatures'],
            'n_pairs': stats_dict['n_pairs'],
            'mean_similarity': stats_dict['mean'],
            'std_similarity': stats_dict['std'],
            'min_similarity': stats_dict['min'],
            'max_similarity': stats_dict['max'],
            'median_similarity': stats_dict['median'],
            'iqr': stats_dict['q75'] - stats_dict['q25'],
            'range': stats_dict['max'] - stats_dict['min'],
            'firm': info.get('firm', ''),
            'risk_level': info.get('risk_level', ''),
        })

    stds = np.array(stds)
    means = np.array(means)
    ranges = np.array(ranges)
    n_sigs = np.array(n_sigs)

    results['overall'] = {
        'mean_of_means': float(np.mean(means)),
        'std_of_means': float(np.std(means)),
        'mean_of_stds': float(np.mean(stds)),
        'std_of_stds': float(np.std(stds)),
        'mean_of_ranges': float(np.mean(ranges)),
        'std_of_ranges': float(np.std(ranges)),
        'mean_n_signatures': float(np.mean(n_sigs)),
        'median_n_signatures': float(np.median(n_sigs)),
    }

    # Correlation between sample size and statistics. pearsonr needs at
    # least two points — degenerate inputs get NaN correlations instead of
    # the crash the original produced.
    if len(intra_per_acct) >= 2:
        corr_n_mean, p_n_mean = stats.pearsonr(n_sigs, means)
        corr_n_std, p_n_std = stats.pearsonr(n_sigs, stds)
    else:
        corr_n_mean = p_n_mean = corr_n_std = p_n_std = float('nan')
    results['correlations'] = {
        'n_vs_mean': {'r': float(corr_n_mean), 'p': float(p_n_mean)},
        'n_vs_std': {'r': float(corr_n_std), 'p': float(p_n_std)},
    }

    # Sort by mean similarity (descending) for report.
    results['accountants'].sort(key=lambda x: x['mean_similarity'], reverse=True)

    print(f" Mean of intra-class means: {np.mean(means):.4f}")
    print(f" Mean of intra-class stds: {np.mean(stds):.4f}")
    print(f" Correlation (N vs mean): r={corr_n_mean:.4f}, p={p_n_mean:.2e}")
    print(f" Correlation (N vs std): r={corr_n_std:.4f}, p={p_n_std:.2e}")

    return results
|
||
|
||
|
||
def generate_visualizations(intra_sims, inter_sims, normality_results,
                            ssim_results, per_acct_results, hypothesis_results):
    """Render figures 1-5 as PNGs under FIGURE_DIR.

    fig1: intra vs inter distributions (hist, KDE, box, CDF)
    fig2: normality diagnostics (Q-Q plots, normal overlays)
    fig3: SSIM pixel-level results (only when ssim_results has scores)
    fig4: per-accountant mean/std distributions and sample-size scatter
    fig5: threshold sensitivity curves

    Note: `normality_results` is accepted but not read by this function.
    """
    print("[8/8] Generating visualizations...")
    import matplotlib
    matplotlib.use('Agg')  # headless backend: must be set before pyplot import
    import matplotlib.pyplot as plt
    from scipy import stats
    from scipy.stats import gaussian_kde

    plt.rcParams['font.size'] = 11
    plt.rcParams['figure.dpi'] = 150

    # Subsample for visualization (histograms and KDE) — consumes global RNG state.
    VIS_SAMPLE = 200000
    intra_vis = np.random.choice(intra_sims, min(VIS_SAMPLE, len(intra_sims)), replace=False)
    inter_vis = np.random.choice(inter_sims, min(VIS_SAMPLE, len(inter_sims)), replace=False)

    # --- Figure 1: Intra vs Inter class distributions ---
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # 1a: Overlaid histograms with threshold reference lines.
    ax = axes[0, 0]
    bins_edges = np.linspace(0.3, 1.0, 201)
    ax.hist(inter_vis, bins=bins_edges, alpha=0.6, density=True, label='Inter-class', color='steelblue')
    ax.hist(intra_vis, bins=bins_edges, alpha=0.6, density=True, label='Intra-class', color='coral')
    # Reference thresholds (0.944 is a hard-coded μ+2σ value; presumably from a
    # prior calibration run — TODO confirm it matches current data).
    for thresh, ls, lbl in [(0.95, '--', 'Current (0.95)'), (0.944, ':', 'μ+2σ (0.944)')]:
        ax.axvline(thresh, color='red', linestyle=ls, linewidth=1.5, label=lbl)
    # Mark KDE crossover candidates computed by hypothesis_tests().
    if hypothesis_results.get('kde_crossover_thresholds'):
        for cp in hypothesis_results['kde_crossover_thresholds']:
            if 0.5 < cp < 1.0:
                ax.axvline(cp, color='green', linestyle='-.', linewidth=1.5,
                           label=f'KDE crossover ({cp:.3f})')
    ax.set_xlabel('Cosine Similarity')
    ax.set_ylabel('Density')
    ax.set_title('(a) Intra-class vs Inter-class Similarity Distributions')
    ax.legend(fontsize=9)
    ax.set_xlim(0.4, 1.02)

    # 1b: KDE overlay with the shared (overlap) area shaded.
    ax = axes[0, 1]
    x = np.linspace(0.3, 1.0, 1000)
    kde_intra = gaussian_kde(intra_vis)
    kde_inter = gaussian_kde(inter_vis)
    ax.plot(x, kde_inter(x), label='Inter-class', color='steelblue', linewidth=2)
    ax.plot(x, kde_intra(x), label='Intra-class', color='coral', linewidth=2)
    ax.fill_between(x, 0, np.minimum(kde_intra(x), kde_inter(x)),
                    alpha=0.3, color='gray', label='Overlap region')
    ax.axvline(0.95, color='red', linestyle='--', linewidth=1.5, label='Threshold (0.95)')
    ax.set_xlabel('Cosine Similarity')
    ax.set_ylabel('Density')
    ax.set_title('(b) KDE Overlay with Overlap Region')
    ax.legend(fontsize=9)
    ax.set_xlim(0.4, 1.02)

    # 1c: Box plots of the two classes.
    ax = axes[1, 0]
    bp = ax.boxplot([inter_vis, intra_vis], labels=['Inter-class', 'Intra-class'],
                    patch_artist=True, widths=0.5)
    bp['boxes'][0].set_facecolor('steelblue')
    bp['boxes'][0].set_alpha(0.6)
    bp['boxes'][1].set_facecolor('coral')
    bp['boxes'][1].set_alpha(0.6)
    ax.axhline(0.95, color='red', linestyle='--', linewidth=1.5, label='Threshold (0.95)')
    ax.set_ylabel('Cosine Similarity')
    ax.set_title('(c) Box Plot Comparison')
    ax.legend()

    # 1d: Empirical CDFs (sorted values vs uniform probability grid).
    ax = axes[1, 1]
    sorted_inter = np.sort(inter_vis)
    sorted_intra = np.sort(intra_vis)
    ax.plot(sorted_inter, np.linspace(0, 1, len(sorted_inter)),
            label='Inter-class', color='steelblue', linewidth=2)
    ax.plot(sorted_intra, np.linspace(0, 1, len(sorted_intra)),
            label='Intra-class', color='coral', linewidth=2)
    ax.axvline(0.95, color='red', linestyle='--', linewidth=1.5, label='Threshold (0.95)')
    ax.set_xlabel('Cosine Similarity')
    ax.set_ylabel('Cumulative Probability')
    ax.set_title('(d) Cumulative Distribution Functions')
    ax.legend()

    plt.tight_layout()
    fig.savefig(os.path.join(FIGURE_DIR, 'fig1_intra_vs_inter_class.png'), bbox_inches='tight')
    plt.close()
    print(" Saved fig1_intra_vs_inter_class.png")

    # --- Figure 2: Normality tests ---
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # 2a: Q-Q plot intra-class (on a further 10k subsample).
    ax = axes[0, 0]
    sample_intra = np.random.choice(intra_vis, min(10000, len(intra_vis)), replace=False)
    stats.probplot(sample_intra, dist='norm', plot=ax)
    ax.set_title('(a) Q-Q Plot: Intra-class')

    # 2b: Q-Q plot inter-class.
    ax = axes[0, 1]
    sample_inter = np.random.choice(inter_vis, min(10000, len(inter_vis)), replace=False)
    stats.probplot(sample_inter, dist='norm', plot=ax)
    ax.set_title('(b) Q-Q Plot: Inter-class')

    # 2c: Histogram with fitted normal (intra); moment-matched fit.
    ax = axes[1, 0]
    ax.hist(intra_vis, bins=np.linspace(0.3, 1.0, 151), density=True, alpha=0.6, color='coral', label='Data')
    mu, sigma = np.mean(intra_vis), np.std(intra_vis)
    x = np.linspace(mu - 4*sigma, mu + 4*sigma, 200)
    ax.plot(x, stats.norm.pdf(x, mu, sigma), 'k-', linewidth=2, label=f'Normal fit\nμ={mu:.3f}, σ={sigma:.3f}')
    ax.set_title('(c) Intra-class with Normal Fit')
    ax.set_xlabel('Cosine Similarity')
    ax.legend()

    # 2d: Histogram with fitted normal (inter).
    ax = axes[1, 1]
    ax.hist(inter_vis, bins=np.linspace(0.3, 1.0, 151), density=True, alpha=0.6, color='steelblue', label='Data')
    mu, sigma = np.mean(inter_vis), np.std(inter_vis)
    x = np.linspace(mu - 4*sigma, mu + 4*sigma, 200)
    ax.plot(x, stats.norm.pdf(x, mu, sigma), 'k-', linewidth=2, label=f'Normal fit\nμ={mu:.3f}, σ={sigma:.3f}')
    ax.set_title('(d) Inter-class with Normal Fit')
    ax.set_xlabel('Cosine Similarity')
    ax.legend()

    plt.tight_layout()
    fig.savefig(os.path.join(FIGURE_DIR, 'fig2_normality_tests.png'), bbox_inches='tight')
    plt.close()
    print(" Saved fig2_normality_tests.png")

    # --- Figure 3: SSIM analysis (skipped when ssim_analysis returned no scores) ---
    if ssim_results and ssim_results.get('ssim_scores'):
        fig, axes = plt.subplots(1, 3, figsize=(15, 5))

        valid_ssim = [s for s in ssim_results['ssim_scores'] if not np.isnan(s)]
        valid_hist = ssim_results['histogram_corr_scores']

        # 3a: SSIM distribution with the 0.95/0.99 reference lines.
        ax = axes[0]
        ax.hist(valid_ssim, bins=np.linspace(0, 1, 51), alpha=0.7, color='teal', edgecolor='black')
        ax.axvline(0.95, color='red', linestyle='--', linewidth=1.5, label='SSIM=0.95')
        ax.axvline(0.99, color='darkred', linestyle='--', linewidth=1.5, label='SSIM=0.99')
        ax.set_xlabel('SSIM Score')
        ax.set_ylabel('Count')
        ax.set_title(f'(a) SSIM Distribution (n={len(valid_ssim)})')
        ax.legend()

        # 3b: Histogram correlation scores.
        ax = axes[1]
        ax.hist(valid_hist, bins=np.linspace(-1, 1, 51), alpha=0.7, color='purple', edgecolor='black')
        ax.set_xlabel('Histogram Correlation')
        ax.set_ylabel('Count')
        ax.set_title(f'(b) Histogram Correlation (n={len(valid_hist)})')

        # 3c: Feature-space vs pixel-space similarity scatter.
        ax = axes[2]
        # NOTE(review): cosine_sims is computed but never used (the plot uses
        # the NaN-filtered cosine_for_ssim instead).
        cosine_sims = [p['cosine_similarity'] for p in ssim_results['pairs_analyzed']]
        ssim_vals = [p['ssim'] for p in ssim_results['pairs_analyzed'] if not np.isnan(p['ssim'])]
        cosine_for_ssim = [p['cosine_similarity'] for p in ssim_results['pairs_analyzed']
                           if not np.isnan(p['ssim'])]
        ax.scatter(cosine_for_ssim, ssim_vals, alpha=0.3, s=10, color='teal')
        ax.set_xlabel('Cosine Similarity (Feature)')
        ax.set_ylabel('SSIM (Pixel)')
        ax.set_title('(c) Feature Similarity vs Pixel Similarity')
        ax.axhline(0.95, color='red', linestyle='--', alpha=0.5)
        ax.axvline(0.95, color='red', linestyle='--', alpha=0.5)

        plt.tight_layout()
        fig.savefig(os.path.join(FIGURE_DIR, 'fig3_ssim_analysis.png'), bbox_inches='tight')
        plt.close()
        print(" Saved fig3_ssim_analysis.png")

    # --- Figure 4: Per-accountant analysis ---
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    accts = per_acct_results['accountants']
    acct_means = [a['mean_similarity'] for a in accts]
    acct_stds = [a['std_similarity'] for a in accts]
    acct_ns = [a['n_signatures'] for a in accts]
    # NOTE(review): acct_ranges is collected but not plotted below.
    acct_ranges = [a['range'] for a in accts]

    # 4a: Distribution of per-accountant mean similarity.
    ax = axes[0, 0]
    ax.hist(acct_means, bins=np.linspace(0.5, 1.0, 51), alpha=0.7, color='darkgreen', edgecolor='black')
    ax.axvline(np.mean(acct_means), color='red', linestyle='--',
               label=f'Mean={np.mean(acct_means):.3f}')
    ax.set_xlabel('Mean Intra-class Similarity')
    ax.set_ylabel('Count (Accountants)')
    ax.set_title('(a) Distribution of Per-Accountant Mean Similarity')
    ax.legend()

    # 4b: Distribution of per-accountant std.
    ax = axes[0, 1]
    ax.hist(acct_stds, bins=np.linspace(0, 0.3, 51), alpha=0.7, color='darkorange', edgecolor='black')
    ax.axvline(np.mean(acct_stds), color='red', linestyle='--',
               label=f'Mean={np.mean(acct_stds):.3f}')
    ax.set_xlabel('Std of Intra-class Similarity')
    ax.set_ylabel('Count (Accountants)')
    ax.set_title('(b) Distribution of Per-Accountant Std')
    ax.legend()

    # 4c: Sample size vs mean similarity (r from per_accountant_analysis).
    ax = axes[1, 0]
    ax.scatter(acct_ns, acct_means, alpha=0.4, s=15, color='darkgreen')
    ax.set_xlabel('Number of Signatures')
    ax.set_ylabel('Mean Intra-class Similarity')
    ax.set_title(f'(c) Sample Size vs Mean Similarity\n'
                 f'(r={per_acct_results["correlations"]["n_vs_mean"]["r"]:.3f})')

    # 4d: Sample size vs std.
    ax = axes[1, 1]
    ax.scatter(acct_ns, acct_stds, alpha=0.4, s=15, color='darkorange')
    ax.set_xlabel('Number of Signatures')
    ax.set_ylabel('Std of Intra-class Similarity')
    ax.set_title(f'(d) Sample Size vs Std\n'
                 f'(r={per_acct_results["correlations"]["n_vs_std"]["r"]:.3f})')

    plt.tight_layout()
    fig.savefig(os.path.join(FIGURE_DIR, 'fig4_per_accountant.png'), bbox_inches='tight')
    plt.close()
    print(" Saved fig4_per_accountant.png")

    # --- Figure 5: Threshold sensitivity analysis ---
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Sweep candidate thresholds from 0.80 to 1.0 in 0.005 steps.
    thresholds = np.arange(0.80, 1.001, 0.005)

    # 5a: Percentage of each class above each candidate threshold.
    intra_above = [100 * np.mean(intra_vis > t) for t in thresholds]
    inter_above = [100 * np.mean(inter_vis > t) for t in thresholds]

    ax = axes[0]
    ax.plot(thresholds, intra_above, 'coral', linewidth=2, label='Intra-class (same person)')
    ax.plot(thresholds, inter_above, 'steelblue', linewidth=2, label='Inter-class (diff person)')
    ax.axvline(0.95, color='red', linestyle='--', alpha=0.7, label='Current threshold')
    ax.set_xlabel('Threshold')
    ax.set_ylabel('% Pairs Above Threshold')
    ax.set_title('(a) Threshold Sensitivity: % Pairs Above')
    ax.legend()
    ax.set_xlim(0.80, 1.0)

    # 5b: Intra/inter ratio per threshold (inf when no inter pair survives).
    ax = axes[1]
    ratios = []
    for ia, ea in zip(intra_above, inter_above):
        if ea > 0:
            ratios.append(ia / ea)
        else:
            ratios.append(float('inf'))
    ax.plot(thresholds, ratios, 'purple', linewidth=2)
    ax.axvline(0.95, color='red', linestyle='--', alpha=0.7, label='Current threshold')
    ax.set_xlabel('Threshold')
    ax.set_ylabel('Intra/Inter Ratio')
    ax.set_title('(b) Intra-class / Inter-class Ratio')
    ax.legend()
    ax.set_xlim(0.80, 1.0)
    ax.set_yscale('log')

    plt.tight_layout()
    fig.savefig(os.path.join(FIGURE_DIR, 'fig5_threshold_sensitivity.png'), bbox_inches='tight')
    plt.close()
    print(" Saved fig5_threshold_sensitivity.png")

    print(" All visualizations complete.")
|
||
|
||
|
||
def generate_report(normality_results, hypothesis_results, dist_fitting_results,
                    ssim_results, per_acct_results, intra_sims, inter_sims):
    """Generate the final markdown report.

    Assembles all upstream analysis outputs into a single markdown document
    and writes it to REPORT_DIR/formal_statistical_report.md.

    Args:
        normality_results: dict with 'intra_class'/'inter_class' entries, each
            holding summary stats (mean/median/std/skewness/kurtosis/n) and
            normality-test results (shapiro_wilk, kolmogorov_smirnov,
            dagostino_pearson, is_normal_alpha_005).
        hypothesis_results: dict of two-sample test outputs (mann_whitney_u,
            welch_t_test, ks_2sample), effect sizes (cohens_d,
            fisher_discriminant_ratio), mean_difference with 95% CI, and
            kde_crossover_thresholds (list of crossover x-values).
        dist_fitting_results: dict keyed '<class>_<dist>' with per-fit 'aic',
            plus '<class>_best_fit' entries naming the lowest-AIC family.
        ssim_results: dict of pixel-level SSIM summary stats; may be empty or
            have n_analyzed == 0, in which case a placeholder line is emitted.
        per_acct_results: dict with 'accountants' (sorted list of per-person
            stats), 'overall' aggregates, 'correlations', 'total_accountants'.
        intra_sims: array of intra-class cosine similarities (same signer).
        inter_sims: array of inter-class cosine similarities (different signers).

    Returns:
        str: path of the written markdown report.
    """
    print("\nGenerating final report...")

    # Short aliases keep the f-string template below readable.
    nr = normality_results
    hr = hypothesis_results

    intra_stats = nr['intra_class']
    inter_stats = nr['inter_class']

    # Sections 1-5 header. Threshold-table ratios guard against division by
    # zero with max(..., 1e-10); KDE crossovers are filtered to the plausible
    # (0.5, 1.0) similarity range before display.
    report = f"""# Formal Statistical Analysis Report
# Signature Verification Threshold Validation

**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
**Dataset:** 168,755 signatures from 758 accountants
**Feature:** 2048-dim ResNet embeddings (L2-normalized, cosine similarity = dot product)

---

## 1. Summary of Key Findings

| Finding | Value |
|---------|-------|
| Intra-class mean similarity | {intra_stats['mean']:.4f} (SD={intra_stats['std']:.4f}) |
| Inter-class mean similarity | {inter_stats['mean']:.4f} (SD={inter_stats['std']:.4f}) |
| Mean difference | {hr['mean_difference']['difference']:.4f} (95% CI: [{hr['mean_difference']['ci_95_lower']:.4f}, {hr['mean_difference']['ci_95_upper']:.4f}]) |
| Cohen's d (effect size) | {hr['cohens_d']['value']:.4f} ({hr['cohens_d']['interpretation']}) |
| Mann-Whitney U p-value | {hr['mann_whitney_u']['p_value']:.2e} |
| Fisher's Discriminant Ratio | {hr['fisher_discriminant_ratio']:.4f} |
| KDE crossover threshold(s) | {', '.join(f'{x:.4f}' for x in hr['kde_crossover_thresholds'] if 0.5 < x < 1.0)} |
| Current threshold | 0.95 (corresponds to {intra_stats['mean']:.3f} + {(0.95 - intra_stats['mean'])/intra_stats['std']:.2f}*SD of intra-class) |

---

## 2. Intra-class vs Inter-class Distributions

### 2.1 Distribution Statistics

| Statistic | Intra-class (same accountant) | Inter-class (different accountants) |
|-----------|-------------------------------|-------------------------------------|
| N (pairs) | {intra_stats['n']:,} | {inter_stats['n']:,} |
| Mean | {intra_stats['mean']:.4f} | {inter_stats['mean']:.4f} |
| Median | {intra_stats['median']:.4f} | {inter_stats['median']:.4f} |
| Std Dev | {intra_stats['std']:.4f} | {inter_stats['std']:.4f} |
| Skewness | {intra_stats['skewness']:.4f} | {inter_stats['skewness']:.4f} |
| Kurtosis | {intra_stats['kurtosis']:.4f} | {inter_stats['kurtosis']:.4f} |

### 2.2 Threshold Analysis

| Threshold | Intra-class % above | Inter-class % above | Intra/Inter Ratio |
|-----------|--------------------|--------------------|-------------------|
| > 0.90 | {100*np.mean(intra_sims > 0.90):.2f}% | {100*np.mean(inter_sims > 0.90):.2f}% | {np.mean(intra_sims > 0.90)/max(np.mean(inter_sims > 0.90), 1e-10):.1f}x |
| > 0.95 | {100*np.mean(intra_sims > 0.95):.2f}% | {100*np.mean(inter_sims > 0.95):.2f}% | {np.mean(intra_sims > 0.95)/max(np.mean(inter_sims > 0.95), 1e-10):.1f}x |
| > 0.98 | {100*np.mean(intra_sims > 0.98):.2f}% | {100*np.mean(inter_sims > 0.98):.2f}% | {np.mean(intra_sims > 0.98)/max(np.mean(inter_sims > 0.98), 1e-10):.1f}x |
| > 0.99 | {100*np.mean(intra_sims > 0.99):.2f}% | {100*np.mean(inter_sims > 0.99):.2f}% | {np.mean(intra_sims > 0.99)/max(np.mean(inter_sims > 0.99), 1e-10):.1f}x |

### 2.3 Visualization

![Intra vs Inter Class Distributions](figures/fig1_intra_vs_inter_class.png)

**Interpretation:** The intra-class distribution (same accountant) is clearly shifted to the right compared to the inter-class distribution (different accountants), confirming that the feature embeddings capture signer identity. The overlap region indicates the zone of uncertainty where threshold selection is critical.

---

## 3. Normality Tests

### 3.1 Results

| Test | Intra-class | Inter-class |
|------|-------------|-------------|
| Shapiro-Wilk statistic | {intra_stats['shapiro_wilk']['statistic']:.6f} | {inter_stats['shapiro_wilk']['statistic']:.6f} |
| Shapiro-Wilk p-value | {intra_stats['shapiro_wilk']['p_value']:.2e} | {inter_stats['shapiro_wilk']['p_value']:.2e} |
| K-S statistic | {intra_stats['kolmogorov_smirnov']['statistic']:.6f} | {inter_stats['kolmogorov_smirnov']['statistic']:.6f} |
| K-S p-value | {intra_stats['kolmogorov_smirnov']['p_value']:.2e} | {inter_stats['kolmogorov_smirnov']['p_value']:.2e} |
| D'Agostino-Pearson p-value | {intra_stats['dagostino_pearson']['p_value']:.2e} | {inter_stats['dagostino_pearson']['p_value']:.2e} |
| Skewness | {intra_stats['skewness']:.4f} | {inter_stats['skewness']:.4f} |
| Kurtosis | {intra_stats['kurtosis']:.4f} | {inter_stats['kurtosis']:.4f} |
| Normal (alpha=0.05)? | {'Yes' if intra_stats['is_normal_alpha_005'] else 'No'} | {'Yes' if inter_stats['is_normal_alpha_005'] else 'No'} |

### 3.2 Q-Q Plots

![Normality Tests](figures/fig2_normality_tests.png)

### 3.3 Implication for Threshold Selection

{"The distributions are **not normal** (p < 0.05 for all tests). Therefore, the **mu + 2*sigma** threshold assumption based on normal distribution is not strictly valid. We recommend using **percentile-based thresholds** instead of standard deviation-based thresholds." if not intra_stats['is_normal_alpha_005'] else "The distributions approximate normality. The mu + 2*sigma threshold is statistically justified."}

---

## 4. Hypothesis Tests

### 4.1 Are Intra-class and Inter-class Significantly Different?

| Test | Statistic | p-value | Significant (alpha=0.05)? |
|------|-----------|---------|--------------------------|
| Mann-Whitney U | {hr['mann_whitney_u']['statistic']:.2e} | {hr['mann_whitney_u']['p_value']:.2e} | {'Yes' if hr['mann_whitney_u']['significant_005'] else 'No'} |
| Welch's t-test | {hr['welch_t_test']['statistic']:.4f} | {hr['welch_t_test']['p_value']:.2e} | {'Yes' if hr['welch_t_test']['significant_005'] else 'No'} |
| K-S 2-sample | {hr['ks_2sample']['statistic']:.4f} | {hr['ks_2sample']['p_value']:.2e} | {'Yes' if hr['ks_2sample']['significant_005'] else 'No'} |

### 4.2 Effect Size

| Metric | Value | Interpretation |
|--------|-------|----------------|
| Cohen's d | {hr['cohens_d']['value']:.4f} | {hr['cohens_d']['interpretation']} |
| Fisher's Discriminant Ratio | {hr['fisher_discriminant_ratio']:.4f} | {'Good' if hr['fisher_discriminant_ratio'] > 1 else 'Moderate' if hr['fisher_discriminant_ratio'] > 0.5 else 'Poor'} separation |
| Mean difference | {hr['mean_difference']['difference']:.4f} | |
| 95% CI for difference | [{hr['mean_difference']['ci_95_lower']:.4f}, {hr['mean_difference']['ci_95_upper']:.4f}] | Does not contain 0 |

### 4.3 Optimal Threshold from KDE Crossover

The kernel density estimates of the two distributions cross at: **{', '.join(f'{x:.4f}' for x in hr['kde_crossover_thresholds'] if 0.5 < x < 1.0)}**

This represents the statistically optimal decision boundary where the probability of observing a similarity value is equal for both classes.

---

## 5. Distribution Fitting

### 5.1 Model Comparison (AIC)

"""
    # --- Section 5.1 body: AIC table, one row per candidate family ---
    report += "| Distribution | Intra-class AIC | Inter-class AIC |\n"
    report += "|-------------|-----------------|------------------|\n"
    for dist_name in ['norm', 'beta', 'skewnorm', 'lognorm', 'gamma']:
        intra_key = f'intra_class_{dist_name}'
        inter_key = f'inter_class_{dist_name}'
        intra_aic = dist_fitting_results.get(intra_key, {}).get('aic', 'N/A')
        inter_aic = dist_fitting_results.get(inter_key, {}).get('aic', 'N/A')
        # AIC may be missing ('N/A') if a fit was skipped/failed; only apply
        # numeric formatting when we actually have a float.
        intra_str = f"{intra_aic:,.2f}" if isinstance(intra_aic, float) else str(intra_aic)
        inter_str = f"{inter_aic:,.2f}" if isinstance(inter_aic, float) else str(inter_aic)
        # Mark the best-fitting (lowest-AIC) family with an asterisk.
        best_intra = ' *' if dist_fitting_results.get('intra_class_best_fit') == dist_name else ''
        best_inter = ' *' if dist_fitting_results.get('inter_class_best_fit') == dist_name else ''
        report += f"| {dist_name} | {intra_str}{best_intra} | {inter_str}{best_inter} |\n"

    report += f"""
\\* = Best fit (lowest AIC)

### 5.2 Best Fit

- **Intra-class best fit:** {dist_fitting_results.get('intra_class_best_fit', 'N/A')}
- **Inter-class best fit:** {dist_fitting_results.get('inter_class_best_fit', 'N/A')}

---

## 6. SSIM Pixel-Level Analysis

"""
    # --- Section 6: only emitted in full when the SSIM step produced data ---
    if ssim_results and ssim_results.get('n_analyzed', 0) > 0:
        sr = ssim_results
        report += f"""### 6.1 Summary (High-Similarity Pairs with cosine > 0.95)

| Metric | Value |
|--------|-------|
| Pairs analyzed | {sr['n_analyzed']} |
| Pixel-identical pairs | {sr['pixel_identical']} ({100*sr['pixel_identical']/max(sr['n_analyzed'],1):.1f}%) |
| SSIM > 0.99 | {sr['ssim_above_099']} ({100*sr['ssim_above_099']/max(sr['n_analyzed'],1):.1f}%) |
| SSIM > 0.95 | {sr['ssim_above_095']} ({100*sr['ssim_above_095']/max(sr['n_analyzed'],1):.1f}%) |
| SSIM > 0.90 | {sr['ssim_above_090']} ({100*sr['ssim_above_090']/max(sr['n_analyzed'],1):.1f}%) |
| pHash identical | {sr['phash_identical']} ({100*sr['phash_identical']/max(sr['n_analyzed'],1):.1f}%) |
| Mean SSIM | {sr.get('ssim_mean', 0):.4f} (SD={sr.get('ssim_std', 0):.4f}) |
| Median SSIM | {sr.get('ssim_median', 0):.4f} |
| SSIM range | [{sr.get('ssim_min', 0):.4f}, {sr.get('ssim_max', 0):.4f}] |
| Mean histogram correlation | {sr.get('hist_corr_mean', 0):.4f} (SD={sr.get('hist_corr_std', 0):.4f}) |

### 6.2 Visualization

![SSIM Analysis](figures/fig3_ssim_analysis.png)

### 6.3 Interpretation

Among signature pairs with cosine similarity > 0.95:
- **{sr['pixel_identical']}** pairs ({100*sr['pixel_identical']/max(sr['n_analyzed'],1):.1f}%) are pixel-identical, providing **definitive proof of copy-paste**.
- **{sr['ssim_above_099']}** pairs ({100*sr['ssim_above_099']/max(sr['n_analyzed'],1):.1f}%) have SSIM > 0.99, indicating near-identical pixel content.
- This supports the physical impossibility argument: handwritten signatures cannot produce pixel-identical images.
"""
    else:
        report += "SSIM analysis was not completed or no high-similarity pairs found.\n"

    # --- Sections 7-7.4 header: per-accountant variance tables ---
    report += f"""
---

## 7. Per-Accountant Intra-class Variance

### 7.1 Overall Statistics

| Metric | Value |
|--------|-------|
| Accountants analyzed | {per_acct_results['total_accountants']} |
| Mean of per-accountant means | {per_acct_results['overall']['mean_of_means']:.4f} |
| Std of per-accountant means | {per_acct_results['overall']['std_of_means']:.4f} |
| Mean of per-accountant stds | {per_acct_results['overall']['mean_of_stds']:.4f} |
| Std of per-accountant stds | {per_acct_results['overall']['std_of_stds']:.4f} |
| Mean signature range | {per_acct_results['overall']['mean_of_ranges']:.4f} |
| Mean signatures per accountant | {per_acct_results['overall']['mean_n_signatures']:.1f} |
| Median signatures per accountant | {per_acct_results['overall']['median_n_signatures']:.1f} |

### 7.2 Sample Size Effect

| Correlation | r | p-value | Interpretation |
|-------------|---|---------|----------------|
| N vs Mean similarity | {per_acct_results['correlations']['n_vs_mean']['r']:.4f} | {per_acct_results['correlations']['n_vs_mean']['p']:.2e} | {'Significant' if per_acct_results['correlations']['n_vs_mean']['p'] < 0.05 else 'Not significant'} |
| N vs Std similarity | {per_acct_results['correlations']['n_vs_std']['r']:.4f} | {per_acct_results['correlations']['n_vs_std']['p']:.2e} | {'Significant' if per_acct_results['correlations']['n_vs_std']['p'] < 0.05 else 'Not significant'} |

### 7.3 Visualization

![Per-Accountant Analysis](figures/fig4_per_accountant.png)

### 7.4 Top 30 Accountants by Mean Intra-class Similarity

| Rank | Name | Firm | N | Mean | Std | Min | Max | Range |
|------|------|------|---|------|-----|-----|-----|-------|
"""
    # One table row per accountant; 'accountants' is assumed pre-sorted by
    # mean similarity upstream, so slicing [:30] yields the top 30.
    for i, acct in enumerate(per_acct_results['accountants'][:30]):
        report += (f"| {i+1} | {acct['name']} | {acct['firm']} | {acct['n_signatures']} | "
                   f"{acct['mean_similarity']:.4f} | {acct['std_similarity']:.4f} | "
                   f"{acct['min_similarity']:.4f} | {acct['max_similarity']:.4f} | "
                   f"{acct['range']:.4f} |\n")

    # --- Sections 8-9 + appendix: sensitivity figure, conclusions, file list.
    # ssim_results is accessed via .get() here so the conclusions render even
    # when the SSIM step produced no data.
    report += f"""
---

## 8. Threshold Sensitivity Analysis

![Threshold Sensitivity](figures/fig5_threshold_sensitivity.png)

---

## 9. Conclusions and Recommendations

### 9.1 Threshold Validity

1. **Current threshold (0.95) assessment:**
   - At 0.95, **{100*np.mean(intra_sims > 0.95):.2f}%** of intra-class pairs and **{100*np.mean(inter_sims > 0.95):.4f}%** of inter-class pairs exceed the threshold.
   - The intra/inter ratio at 0.95 is **{np.mean(intra_sims > 0.95)/max(np.mean(inter_sims > 0.95), 1e-10):.1f}x**, meaning pairs above this threshold are overwhelmingly from the same accountant.

2. **Statistical justification:**
   - The distributions are {"not normal" if not intra_stats['is_normal_alpha_005'] else "approximately normal"}, so we recommend **percentile-based** thresholds.
   - KDE crossover at **{', '.join(f'{x:.4f}' for x in hr['kde_crossover_thresholds'] if 0.5 < x < 1.0)}** provides a data-driven optimal boundary.

3. **Physical impossibility (SSIM evidence):**
   - {ssim_results.get('pixel_identical', 0)} of {ssim_results.get('n_analyzed', 0)} high-similarity pairs are pixel-identical.
   - This is physically impossible for handwritten signatures, confirming copy-paste.

### 9.2 Recommended Threshold Framework

| Level | Threshold | Justification | Classification |
|-------|-----------|---------------|----------------|
| **Definitive** | SSIM = 1.0 or pixel-identical | Physical impossibility | Copy-paste (certain) |
| **Very High** | Cosine > 0.98 | > 99.9th percentile of inter-class | Copy-paste (very likely) |
| **High** | Cosine > 0.95 | KDE crossover region | Copy-paste (likely) |
| **Moderate** | 0.90 < Cosine < 0.95 | Overlap region | Uncertain |
| **Low** | Cosine < 0.90 | Below intra-class mean | Likely genuine |

### 9.3 Limitations

1. Ground truth labels are not available; classifications are based on statistical inference.
2. Feature similarity (cosine) and pixel similarity (SSIM) may diverge for rotated/scaled copies.
3. Per-accountant thresholds may be more appropriate than a global threshold.
4. Digital signature pads may produce lower variation than traditional pen signatures.

---

## Appendix: Data Files

| File | Description |
|------|-------------|
| `figures/fig1_intra_vs_inter_class.png` | Intra vs Inter class distribution comparison |
| `figures/fig2_normality_tests.png` | Q-Q plots and normality test visualizations |
| `figures/fig3_ssim_analysis.png` | SSIM pixel-level analysis |
| `figures/fig4_per_accountant.png` | Per-accountant variance analysis |
| `figures/fig5_threshold_sensitivity.png` | Threshold sensitivity curves |
| `formal_statistical_data.json` | Raw statistical data in JSON format |

---

*Report generated by `10_formal_statistical_analysis.py`*
"""
    # Write the assembled report; utf-8 is explicit because the template may
    # carry non-ASCII accountant/firm names from the database.
    report_path = os.path.join(REPORT_DIR, 'formal_statistical_report.md')
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(report)
    print(f" Report saved to: {report_path}")
    return report_path
def main():
    """Drive the full statistical pipeline and persist report + raw JSON."""
    t0 = datetime.now()
    banner = "=" * 70
    print(banner)
    print("Formal Statistical Analysis for Signature Threshold Validation")
    print(banner)
    print(f"Started: {t0.strftime('%Y-%m-%d %H:%M:%S')}\n")

    # Load embeddings/metadata, then derive the two similarity populations.
    data = load_data()
    intra_sims, inter_sims, intra_per_acct, valid_accts = compute_intra_inter_class(data)

    # Statistical battery over intra- vs inter-class similarities.
    norm_results = normality_tests(intra_sims, inter_sims)
    hyp_results = hypothesis_tests(intra_sims, inter_sims)
    dist_results = distribution_fitting(intra_sims, inter_sims)

    # Pixel-level cross-check plus per-accountant breakdown.
    ssim_results = ssim_analysis(data)
    per_acct_results = per_accountant_analysis(intra_per_acct, data)

    # Figures first, then the markdown report that embeds them.
    generate_visualizations(intra_sims, inter_sims, norm_results,
                            ssim_results, per_acct_results, hyp_results)
    report_path = generate_report(norm_results, hyp_results, dist_results,
                                  ssim_results, per_acct_results, intra_sims, inter_sims)

    # Persist machine-readable results next to the human-readable report.
    # Bulky per-pair arrays are excluded from the SSIM section.
    ssim_excluded = ('pairs_analyzed', 'ssim_scores', 'histogram_corr_scores')
    all_results = {
        'normality': norm_results,
        'hypothesis_tests': hyp_results,
        'distribution_fitting': {k: v for k, v in dist_results.items()
                                 if not k.endswith('_best_fit')},
        'best_fits': {k: v for k, v in dist_results.items()
                      if k.endswith('_best_fit')},
        'ssim': {k: v for k, v in ssim_results.items() if k not in ssim_excluded},
        'per_accountant_overall': per_acct_results['overall'],
        'per_accountant_correlations': per_acct_results['correlations'],
    }
    json_path = os.path.join(REPORT_DIR, 'formal_statistical_data.json')
    with open(json_path, 'w') as f:
        # default=str stringifies non-JSON-native values (e.g. numpy scalars).
        json.dump(all_results, f, indent=2, default=str)
    print(f" Raw data saved to: {json_path}")

    elapsed = datetime.now() - t0
    print(f"\nCompleted in {elapsed.total_seconds():.1f} seconds")
    print(f"Report: {report_path}")
if __name__ == '__main__':
    # Script entry point: run the full analysis pipeline when executed directly.
    main()