Add Deloitte distribution & independent dHash analysis scripts
- Script 13: Firm A normality/multimodality analysis (Shapiro-Wilk, Anderson-Darling, KDE, per-accountant ANOVA, Beta/Gamma fitting) - Script 14: Independent min-dHash computation across all pairs per accountant (not just cosine-nearest pair) - THRESHOLD_VALIDATION_OPTIONS: 2026-01 discussion doc on threshold validation approaches - .gitignore: exclude model weights, node artifacts, and xlsx data Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+13
@@ -48,3 +48,16 @@ Thumbs.db
|
|||||||
# Temporary files
|
# Temporary files
|
||||||
*.tmp
|
*.tmp
|
||||||
*.bak
|
*.bak
|
||||||
|
|
||||||
|
# Model weights (too large for git)
|
||||||
|
models/
|
||||||
|
*.pt
|
||||||
|
*.pth
|
||||||
|
|
||||||
|
# Node.js shells (accidentally created)
|
||||||
|
package.json
|
||||||
|
package-lock.json
|
||||||
|
node_modules/
|
||||||
|
|
||||||
|
# Sensitive/large data
|
||||||
|
*.xlsx
|
||||||
|
|||||||
@@ -0,0 +1,430 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Deloitte (勤業眾信) Signature Similarity Distribution Analysis
|
||||||
|
==============================================================
|
||||||
|
Evaluate whether Firm A's max_similarity values follow a normal distribution
|
||||||
|
or contain subgroups (e.g., genuinely hand-signed vs digitally stamped).
|
||||||
|
|
||||||
|
Tests:
|
||||||
|
1. Descriptive statistics & percentiles
|
||||||
|
2. Normality tests (Shapiro-Wilk, D'Agostino-Pearson, Anderson-Darling, KS)
|
||||||
|
3. Histogram + KDE + fitted normal overlay
|
||||||
|
4. Q-Q plot
|
||||||
|
5. Multimodality check (Hartigan's dip test approximation)
|
||||||
|
6. Outlier identification (signatures with unusually low similarity)
|
||||||
|
7. dHash distance distribution for Firm A
|
||||||
|
|
||||||
|
Output: figures + report to console
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sqlite3
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib
|
||||||
|
matplotlib.use('Agg')
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
from scipy import stats
|
||||||
|
from pathlib import Path
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
|
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
|
||||||
|
OUTPUT_DIR = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/deloitte_distribution')
|
||||||
|
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
FIRM_A = '勤業眾信聯合'
|
||||||
|
|
||||||
|
|
||||||
|
def load_firm_a_data():
    """Fetch every Firm A signature row that has a cosine similarity score.

    Returns a list of dicts with keys:
        sig_id     -- signature primary key
        filename   -- extracted image filename
        accountant -- assigned accountant name
        cosine     -- max cosine similarity within the same accountant
        phash      -- dHash distance of the cosine-nearest pair (may be None)
    """
    query = '''
        SELECT s.signature_id, s.image_filename, s.assigned_accountant,
               s.max_similarity_to_same_accountant,
               s.phash_distance_to_closest
        FROM signatures s
        LEFT JOIN accountants a ON s.assigned_accountant = a.name
        WHERE a.firm = ?
          AND s.max_similarity_to_same_accountant IS NOT NULL
    '''
    conn = sqlite3.connect(DB_PATH)
    try:
        rows = conn.execute(query, (FIRM_A,)).fetchall()
    finally:
        conn.close()

    keys = ('sig_id', 'filename', 'accountant', 'cosine', 'phash')
    return [dict(zip(keys, row)) for row in rows]
|
||||||
|
|
||||||
|
|
||||||
|
def descriptive_stats(cosines, label="Firm A Cosine Similarity"):
    """Print a full descriptive summary: moments, spread, and percentiles."""
    arr = np.asarray(cosines)
    rule = '=' * 65
    q1, q3 = np.percentile(arr, 25), np.percentile(arr, 75)

    print(f"\n{rule}")
    print(f" {label}")
    print(f"{rule}")
    print(f" N = {len(arr):,}")
    print(f" Mean = {np.mean(arr):.6f}")
    print(f" Median = {np.median(arr):.6f}")
    print(f" Std Dev = {np.std(arr):.6f}")
    print(f" Variance = {np.var(arr):.8f}")
    print(f" Min = {np.min(arr):.6f}")
    print(f" Max = {np.max(arr):.6f}")
    print(f" Range = {np.ptp(arr):.6f}")
    print(f" Skewness = {stats.skew(arr):.4f}")
    print(f" Kurtosis = {stats.kurtosis(arr):.4f} (excess)")
    print(f" IQR = {q3 - q1:.6f}")
    print()
    print(f" Percentiles:")
    for pct in (1, 5, 10, 25, 50, 75, 90, 95, 99):
        print(f" P{pct:<3d} = {np.percentile(arr, pct):.6f}")
|
||||||
|
|
||||||
|
|
||||||
|
def normality_tests(cosines):
    """Run four normality tests on the similarity sample and print verdicts.

    Tests: Shapiro-Wilk (subsampled above 5000 points), D'Agostino-Pearson
    (n >= 20 only), Anderson-Darling, and Kolmogorov-Smirnov against a
    normal fitted to the sample.

    Returns:
        (mu, sigma): sample mean and population std (ddof=0), reused by
        the plotting code for the fitted-normal overlay.
    """
    cosines = np.asarray(cosines)
    print(f"\n{'='*65}")
    print(f" NORMALITY TESTS")
    print(f"{'='*65}")

    # Shapiro-Wilk (max 5000 samples). Seeded RNG makes the subsample —
    # and thus the printed report — reproducible across runs (previously
    # np.random.choice used unseeded global RNG state).
    if len(cosines) > 5000:
        sample = np.random.default_rng(0).choice(cosines, 5000, replace=False)
        stat, p = stats.shapiro(sample)
        print(f"\n Shapiro-Wilk (n=5000 subsample):")
    else:
        stat, p = stats.shapiro(cosines)
        print(f"\n Shapiro-Wilk (n={len(cosines)}):")
    print(f" W = {stat:.6f}, p = {p:.2e}")
    print(f" → {'Normal' if p > 0.05 else 'NOT normal'} at α=0.05")

    # D'Agostino-Pearson — requires n >= 20 for the kurtosis component.
    if len(cosines) >= 20:
        stat, p = stats.normaltest(cosines)
        print(f"\n D'Agostino-Pearson:")
        print(f" K² = {stat:.4f}, p = {p:.2e}")
        print(f" → {'Normal' if p > 0.05 else 'NOT normal'} at α=0.05")

    # Anderson-Darling: compare the statistic to each tabulated critical value.
    result = stats.anderson(cosines, dist='norm')
    print(f"\n Anderson-Darling:")
    print(f" A² = {result.statistic:.4f}")
    for sl, cv in zip(result.significance_level, result.critical_values):
        reject = "REJECT" if result.statistic > cv else "accept"
        print(f" {sl}%: critical={cv:.4f} → {reject}")

    # Kolmogorov-Smirnov against the fitted normal.
    # NOTE: estimating (mu, sigma) from the same sample biases this p-value
    # toward normality; a Lilliefors-corrected test would be needed for rigor.
    mu, sigma = np.mean(cosines), np.std(cosines)
    stat, p = stats.kstest(cosines, 'norm', args=(mu, sigma))
    print(f"\n Kolmogorov-Smirnov (vs fitted normal):")
    print(f" D = {stat:.6f}, p = {p:.2e}")
    print(f" → {'Normal' if p > 0.05 else 'NOT normal'} at α=0.05")

    return mu, sigma
|
||||||
|
|
||||||
|
|
||||||
|
def test_alternative_distributions(cosines):
|
||||||
|
"""Fit alternative distributions and compare."""
|
||||||
|
print(f"\n{'='*65}")
|
||||||
|
print(f" DISTRIBUTION FITTING (AIC comparison)")
|
||||||
|
print(f"{'='*65}")
|
||||||
|
|
||||||
|
distributions = {
|
||||||
|
'norm': stats.norm,
|
||||||
|
'skewnorm': stats.skewnorm,
|
||||||
|
'beta': stats.beta,
|
||||||
|
'lognorm': stats.lognorm,
|
||||||
|
'gamma': stats.gamma,
|
||||||
|
}
|
||||||
|
|
||||||
|
results = []
|
||||||
|
for name, dist in distributions.items():
|
||||||
|
try:
|
||||||
|
params = dist.fit(cosines)
|
||||||
|
log_likelihood = np.sum(dist.logpdf(cosines, *params))
|
||||||
|
k = len(params)
|
||||||
|
aic = 2 * k - 2 * log_likelihood
|
||||||
|
bic = k * np.log(len(cosines)) - 2 * log_likelihood
|
||||||
|
results.append((name, aic, bic, params, log_likelihood))
|
||||||
|
except Exception as e:
|
||||||
|
print(f" {name}: fit failed ({e})")
|
||||||
|
|
||||||
|
results.sort(key=lambda x: x[1]) # sort by AIC
|
||||||
|
print(f"\n {'Distribution':<15} {'AIC':>12} {'BIC':>12} {'LogLik':>12}")
|
||||||
|
print(f" {'-'*51}")
|
||||||
|
for name, aic, bic, params, ll in results:
|
||||||
|
marker = " ←best" if name == results[0][0] else ""
|
||||||
|
print(f" {name:<15} {aic:>12.1f} {bic:>12.1f} {ll:>12.1f}{marker}")
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
def per_accountant_analysis(data):
    """Summarize each accountant's cosine distribution within Firm A.

    Prints a per-accountant table sorted by ascending mean similarity,
    then (when at least two groups have >= 5 samples) a one-way ANOVA and
    Levene's test across accountants.

    Returns:
        List of per-accountant stat dicts (accountant, n, mean, std, min,
        values) in the same sorted order as the printed table.
    """
    rule = '=' * 65
    print(f"\n{rule}")
    print(f" PER-ACCOUNTANT ANALYSIS (within Firm A)")
    print(f"{rule}")

    by_acct = {}
    for rec in data:
        by_acct.setdefault(rec['accountant'], []).append(rec['cosine'])

    print(f"\n {'Accountant':<20} {'N':>6} {'Mean':>8} {'Std':>8} {'Min':>8} {'P5':>8} {'P50':>8}")
    print(f" {'-'*66}")

    acct_stats = []
    ordered = sorted(by_acct.items(), key=lambda item: np.mean(item[1]))
    for acct, vals in ordered:
        v = np.array(vals)
        print(f" {acct:<20} {len(v):>6} {v.mean():>8.4f} {v.std():>8.4f} "
              f"{v.min():>8.4f} {np.percentile(v, 5):>8.4f} {np.median(v):>8.4f}")
        acct_stats.append({
            'accountant': acct,
            'n': len(v),
            'mean': float(v.mean()),
            'std': float(v.std()),
            'min': float(v.min()),
            'values': v,
        })

    # Homogeneity tests only make sense with >= 2 sufficiently large groups.
    if len(by_acct) >= 2:
        groups = [np.array(v) for v in by_acct.values() if len(v) >= 5]
        if len(groups) >= 2:
            f_stat, p_val = stats.f_oneway(*groups)
            print(f"\n One-way ANOVA across accountants:")
            print(f" F = {f_stat:.4f}, p = {p_val:.2e}")
            print(f" → {'Homogeneous' if p_val > 0.05 else 'Significantly different means'} at α=0.05")

            lev_stat, lev_p = stats.levene(*groups)
            print(f"\n Levene's test (variance homogeneity):")
            print(f" W = {lev_stat:.4f}, p = {lev_p:.2e}")
            print(f" → {'Homogeneous variance' if lev_p > 0.05 else 'Heterogeneous variance'} at α=0.05")

    return acct_stats
|
||||||
|
|
||||||
|
|
||||||
|
def identify_outliers(data, cosines):
    """Report Firm A signatures with unusually low similarity (Tukey IQR rule)."""
    rule = '=' * 65
    print(f"\n{rule}")
    print(f" OUTLIER ANALYSIS (low-similarity Firm A signatures)")
    print(f"{rule}")

    q1, q3 = np.percentile(cosines, 25), np.percentile(cosines, 75)
    iqr = q3 - q1
    lower_fence = q1 - 1.5 * iqr    # Tukey "mild" outlier fence
    lower_extreme = q1 - 3.0 * iqr  # Tukey "extreme" outlier fence

    print(f" IQR method: Q1={q1:.4f}, Q3={q3:.4f}, IQR={iqr:.4f}")
    print(f" Lower fence (mild): {lower_fence:.4f}")
    print(f" Lower fence (extreme): {lower_extreme:.4f}")

    outliers = [rec for rec in data if rec['cosine'] < lower_fence]
    extreme_outliers = [rec for rec in data if rec['cosine'] < lower_extreme]

    print(f"\n Mild outliers (< {lower_fence:.4f}): {len(outliers)}")
    print(f" Extreme outliers (< {lower_extreme:.4f}): {len(extreme_outliers)}")

    if outliers:
        print(f"\n Bottom 20 by cosine similarity:")
        for rec in sorted(outliers, key=lambda r: r['cosine'])[:20]:
            phash_str = f"pHash={rec['phash']}" if rec['phash'] is not None else "pHash=N/A"
            print(f" cosine={rec['cosine']:.4f} {phash_str} {rec['accountant']} {rec['filename']}")

    # Also show count below various thresholds
    print(f"\n Signatures below key thresholds:")
    for thresh in (0.95, 0.90, 0.85, 0.837, 0.80):
        n_below = sum(1 for c in cosines if c < thresh)
        print(f" < {thresh:.3f}: {n_below:,} ({100*n_below/len(cosines):.2f}%)")
|
||||||
|
|
||||||
|
|
||||||
|
def plot_histogram_kde(cosines, mu, sigma):
    """Plot histogram with KDE and fitted-normal overlay, plus a Q-Q plot.

    Saves firm_a_cosine_distribution.png to OUTPUT_DIR.
    """
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Left: Full histogram
    ax = axes[0]
    ax.hist(cosines, bins=80, density=True, alpha=0.6, color='steelblue',
            edgecolor='white', linewidth=0.5, label='Observed')

    # Fitted normal
    x = np.linspace(cosines.min() - 0.02, cosines.max() + 0.02, 300)
    ax.plot(x, stats.norm.pdf(x, mu, sigma), 'r-', lw=2,
            label=f'Normal fit (μ={mu:.4f}, σ={sigma:.4f})')

    # KDE
    kde = stats.gaussian_kde(cosines)
    ax.plot(x, kde(x), 'g--', lw=2, label='KDE')

    # Threshold reference lines — drawn BEFORE legend() so their labels
    # render (previously legend() preceded these axvlines, so the
    # 'θ=0.95' and 'KDE crossover' entries never appeared in the legend).
    ax.axvline(0.95, color='orange', ls=':', alpha=0.7, label='θ=0.95')
    ax.axvline(0.837, color='purple', ls=':', alpha=0.7, label='KDE crossover')

    ax.set_xlabel('Max Cosine Similarity')
    ax.set_ylabel('Density')
    ax.set_title(f'Firm A (勤業眾信) Cosine Similarity Distribution (N={len(cosines):,})')
    ax.legend(fontsize=9)

    # Right: Q-Q plot
    ax2 = axes[1]
    stats.probplot(cosines, dist='norm', plot=ax2)
    ax2.set_title('Q-Q Plot (vs Normal)')
    ax2.get_lines()[0].set_markersize(2)

    plt.tight_layout()
    fig.savefig(OUTPUT_DIR / 'firm_a_cosine_distribution.png', dpi=150)
    print(f"\n Saved: {OUTPUT_DIR / 'firm_a_cosine_distribution.png'}")
    plt.close()
|
||||||
|
|
||||||
|
|
||||||
|
def plot_per_accountant(acct_stats):
    """Horizontal box plot of per-accountant similarity distributions.

    Note: sorts *acct_stats* in place by ascending mean (side effect kept
    from the original implementation). Saves
    firm_a_per_accountant_boxplot.png to OUTPUT_DIR.
    """
    acct_stats.sort(key=lambda a: a['mean'])  # in-place, ascending mean

    n_accts = len(acct_stats)
    fig, ax = plt.subplots(figsize=(12, max(5, n_accts * 0.4)))
    positions = range(n_accts)
    tick_labels = [f"{a['accountant']} (n={a['n']})" for a in acct_stats]
    samples = [a['values'] for a in acct_stats]

    bp = ax.boxplot(samples, positions=positions, vert=False, widths=0.6,
                    patch_artist=True, showfliers=True,
                    flierprops=dict(marker='.', markersize=3, alpha=0.5))
    for box in bp['boxes']:
        box.set_facecolor('lightsteelblue')

    ax.set_yticks(positions)
    ax.set_yticklabels(tick_labels, fontsize=8)
    ax.set_xlabel('Max Cosine Similarity')
    ax.set_title('Per-Accountant Similarity Distribution (Firm A)')
    # Same threshold reference lines as the histogram figure.
    ax.axvline(0.95, color='orange', ls=':', alpha=0.7)
    ax.axvline(0.837, color='purple', ls=':', alpha=0.7)

    plt.tight_layout()
    fig.savefig(OUTPUT_DIR / 'firm_a_per_accountant_boxplot.png', dpi=150)
    print(f" Saved: {OUTPUT_DIR / 'firm_a_per_accountant_boxplot.png'}")
    plt.close()
|
||||||
|
|
||||||
|
|
||||||
|
def plot_phash_distribution(data):
    """Histogram of Firm A dHash distances with threshold markers.

    No-op (with a console note) when no signature carries a pHash value.
    Saves firm_a_dhash_distribution.png to OUTPUT_DIR.
    """
    phash_vals = [rec['phash'] for rec in data if rec['phash'] is not None]
    if not phash_vals:
        print(" No pHash data available.")
        return

    phash_arr = np.array(phash_vals)

    fig, ax = plt.subplots(figsize=(10, 5))
    # One unit-wide bin per integer distance, capped at the 64-bit maximum.
    max_val = min(int(phash_arr.max()) + 2, 65)
    bin_edges = np.arange(-0.5, max_val + 0.5, 1)
    ax.hist(phash_arr, bins=bin_edges, alpha=0.7, color='coral', edgecolor='white')
    ax.set_xlabel('dHash Distance')
    ax.set_ylabel('Count')
    ax.set_title(f'Firm A dHash Distance Distribution (N={len(phash_vals):,})')
    ax.axvline(5, color='green', ls='--', label='θ=5 (high conf.)')
    ax.axvline(15, color='orange', ls='--', label='θ=15 (moderate)')
    ax.legend()

    plt.tight_layout()
    fig.savefig(OUTPUT_DIR / 'firm_a_dhash_distribution.png', dpi=150)
    print(f" Saved: {OUTPUT_DIR / 'firm_a_dhash_distribution.png'}")
    plt.close()
|
||||||
|
|
||||||
|
|
||||||
|
def multimodality_test(cosines):
    """Look for multiple modes in the similarity KDE.

    Detects local maxima of a Silverman-bandwidth KDE, reports mode
    positions and separations, then repeats peak detection across a range
    of bandwidth multiples as a sensitivity check.
    """
    rule = '=' * 65
    print(f"\n{rule}")
    print(f" MULTIMODALITY ANALYSIS")
    print(f"{rule}")

    kde = stats.gaussian_kde(cosines, bw_method='silverman')
    grid = np.linspace(cosines.min(), cosines.max(), 1000)
    density = kde(grid)

    # Local maxima of the estimated density = candidate modes.
    from scipy.signal import find_peaks
    peaks, _ = find_peaks(density, prominence=0.01)
    peak_positions = grid[peaks]
    peak_heights = density[peaks]

    print(f" KDE bandwidth (Silverman): {kde.factor:.6f}")
    print(f" Number of detected modes: {len(peaks)}")
    for i, (pos, h) in enumerate(zip(peak_positions, peak_heights)):
        print(f" Mode {i+1}: position={pos:.4f}, density={h:.2f}")

    if len(peaks) == 1:
        print(f"\n → Distribution appears UNIMODAL")
        print(f" Single peak at {peak_positions[0]:.4f}")
    elif len(peaks) > 1:
        print(f"\n → Distribution appears MULTIMODAL ({len(peaks)} modes)")
        print(f" This suggests subgroups may exist within Firm A")
        # Quantify how well-separated each adjacent pair of modes is.
        for i in range(len(peaks) - 1):
            sep = peak_positions[i + 1] - peak_positions[i]
            valley_region = density[peaks[i]:peaks[i + 1]]
            valley_depth = peak_heights[i:i + 2].min() - valley_region.min()
            print(f" Separation {i+1}-{i+2}: Δ={sep:.4f}, valley depth={valley_depth:.2f}")

    # The detected mode count depends on bandwidth; rescan at several multiples.
    print(f"\n Sensitivity analysis (bandwidth variation):")
    for bw_factor in (0.5, 0.75, 1.0, 1.5, 2.0):
        bw = kde.factor * bw_factor
        density_alt = stats.gaussian_kde(cosines, bw_method=bw)(grid)
        peaks_alt, _ = find_peaks(density_alt, prominence=0.005)
        print(f" bw={bw:.4f} (×{bw_factor:.1f}): {len(peaks_alt)} mode(s)")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    # Orchestrates the full Firm A report: load → stats → tests → figures → summary.
    print("Loading Firm A (勤業眾信) signature data...")
    data = load_firm_a_data()
    print(f"Total Firm A signatures: {len(data):,}")

    cosines = np.array([d['cosine'] for d in data])

    # 1. Descriptive statistics
    descriptive_stats(cosines)

    # 2. Normality tests (also yields the fitted-normal mu/sigma used for plotting)
    mu, sigma = normality_tests(cosines)

    # 3. Alternative distribution fitting (AIC/BIC comparison)
    test_alternative_distributions(cosines)

    # 4. Per-accountant analysis (returns stats reused by the box plot)
    acct_stats = per_accountant_analysis(data)

    # 5. Outlier analysis (IQR-based low-similarity report)
    identify_outliers(data, cosines)

    # 6. Multimodality test (KDE peak detection)
    multimodality_test(cosines)

    # 7. Generate plots
    print(f"\n{'='*65}")
    print(f" GENERATING FIGURES")
    print(f"{'='*65}")
    plot_histogram_kde(cosines, mu, sigma)
    plot_per_accountant(acct_stats)
    plot_phash_distribution(data)

    # Summary: headline counts below the two candidate thresholds
    # (0.95 = current copy-paste cut; 0.837 = KDE crossover point).
    print(f"\n{'='*65}")
    print(f" SUMMARY")
    print(f"{'='*65}")
    below_95 = sum(1 for c in cosines if c < 0.95)
    below_kde = sum(1 for c in cosines if c < 0.837)
    print(f" Firm A signatures: {len(cosines):,}")
    print(f" Below 0.95 threshold: {below_95:,} ({100*below_95/len(cosines):.1f}%)")
    print(f" Below KDE crossover (0.837): {below_kde:,} ({100*below_kde/len(cosines):.1f}%)")
    print(f" If distribution is NOT normal → subgroups may exist")
    print(f" If multimodal → some signatures may be genuinely hand-signed")
    print(f"\n Output directory: {OUTPUT_DIR}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -0,0 +1,293 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Compute independent min dHash for all signatures.
|
||||||
|
===================================================
|
||||||
|
Currently phash_distance_to_closest is conditional on cosine-nearest pair.
|
||||||
|
This script computes an INDEPENDENT min dHash: for each signature, find the
|
||||||
|
pair within the same accountant that has the smallest dHash distance,
|
||||||
|
regardless of cosine similarity.
|
||||||
|
|
||||||
|
Three metrics after this script:
|
||||||
|
1. max_similarity_to_same_accountant (max cosine) — primary classifier
|
||||||
|
2. min_dhash_independent (independent min) — independent 2nd classifier
|
||||||
|
3. phash_distance_to_closest (conditional) — diagnostic tool
|
||||||
|
|
||||||
|
Phase 1: Compute dHash vector for each image, store as BLOB in DB
|
||||||
|
Phase 2: All-pairs hamming distance within same accountant, store min
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sqlite3
|
||||||
|
import numpy as np
|
||||||
|
import cv2
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from multiprocessing import Pool, cpu_count
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
|
||||||
|
IMAGE_DIR = '/Volumes/NV2/PDF-Processing/yolo-signatures/images'
|
||||||
|
NUM_WORKERS = max(1, cpu_count() - 2)
|
||||||
|
BATCH_SIZE = 5000
|
||||||
|
HASH_SIZE = 8 # 9x8 -> 8x8 = 64-bit hash
|
||||||
|
|
||||||
|
|
||||||
|
# ── Phase 1: Compute dHash per image ─────────────────────────────────
|
||||||
|
|
||||||
|
def compute_dhash_for_file(args):
    """Worker: load one image and compute its 64-bit dHash.

    *args* is a (sig_id, filename) tuple (packed for Pool.map). Returns
    (sig_id, packed_hash_bytes), or (sig_id, None) when the image is
    missing or unreadable.
    """
    sig_id, filename = args
    try:
        img = cv2.imread(os.path.join(IMAGE_DIR, filename), cv2.IMREAD_GRAYSCALE)
        if img is None:
            return (sig_id, None)
        small = cv2.resize(img, (HASH_SIZE + 1, HASH_SIZE))
        gradient = small[:, 1:] > small[:, :-1]  # 8x8 = 64 horizontal-gradient bits
        return (sig_id, np.packbits(gradient.flatten()).tobytes())
    except Exception:
        return (sig_id, None)
|
||||||
|
|
||||||
|
|
||||||
|
def phase1_compute_hashes():
    """Compute and store dHash for all signatures.

    Idempotent/resumable: only rows with a NULL dhash_vector are
    processed, so a partial run can simply be restarted.
    """
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()

    # Add columns if not exist. ALTER TABLE raises OperationalError when
    # the column is already present, which we treat as "already migrated".
    for col in ['dhash_vector BLOB', 'min_dhash_independent INTEGER',
                'min_dhash_independent_match TEXT']:
        try:
            cur.execute(f'ALTER TABLE signatures ADD COLUMN {col}')
        except sqlite3.OperationalError:
            pass
    conn.commit()

    # Check which signatures already have dhash_vector (resume support)
    cur.execute('''
        SELECT signature_id, image_filename
        FROM signatures
        WHERE feature_vector IS NOT NULL
          AND assigned_accountant IS NOT NULL
          AND dhash_vector IS NULL
    ''')
    todo = cur.fetchall()

    if not todo:
        # Check total with dhash
        cur.execute('SELECT COUNT(*) FROM signatures WHERE dhash_vector IS NOT NULL')
        n_done = cur.fetchone()[0]
        print(f" Phase 1 already complete ({n_done:,} hashes in DB)")
        conn.close()
        return

    print(f" Computing dHash for {len(todo):,} images ({NUM_WORKERS} workers)...")
    t0 = time.time()

    processed = 0
    for batch_start in range(0, len(todo), BATCH_SIZE):
        batch = todo[batch_start:batch_start + BATCH_SIZE]

        # NOTE(review): a new Pool is created per batch — presumably to bound
        # worker memory; a single Pool around the loop would avoid repeated
        # fork overhead. Verify before changing.
        with Pool(NUM_WORKERS) as pool:
            results = pool.map(compute_dhash_for_file, batch)

        # Unreadable/missing images return a None hash and are skipped here.
        updates = [(dhash, sid) for sid, dhash in results if dhash is not None]
        cur.executemany('UPDATE signatures SET dhash_vector = ? WHERE signature_id = ?', updates)
        conn.commit()

        processed += len(batch)
        elapsed = time.time() - t0
        rate = processed / elapsed
        eta = (len(todo) - processed) / rate if rate > 0 else 0
        print(f" {processed:,}/{len(todo):,} ({rate:.0f}/s, ETA {eta:.0f}s)")

    conn.close()
    elapsed = time.time() - t0
    print(f" Phase 1 done: {processed:,} hashes in {elapsed:.1f}s")
|
||||||
|
|
||||||
|
|
||||||
|
# ── Phase 2: All-pairs min dHash within same accountant ──────────────
|
||||||
|
|
||||||
|
def hamming_distance(h1_bytes, h2_bytes):
    """Hamming distance between two packed dHash byte strings.

    Both arguments must be equal-length byte strings; the result is the
    number of differing bits.
    """
    a = np.frombuffer(h1_bytes, dtype=np.uint8)
    b = np.frombuffer(h2_bytes, dtype=np.uint8)
    # Unpack to bits and sum at C speed instead of the previous
    # Python-level bin(byte).count('1') loop over every XOR byte.
    return int(np.unpackbits(a ^ b).sum())
|
||||||
|
|
||||||
|
|
||||||
|
def phase2_compute_min_dhash():
    """For each accountant group, find the min dHash pair per signature.

    Writes min_dhash_independent (smallest all-pairs Hamming distance)
    and min_dhash_independent_match (filename of the matching signature)
    back to the signatures table.
    """
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()

    # Load all signatures with dhash
    cur.execute('''
        SELECT s.signature_id, s.assigned_accountant, s.dhash_vector, s.image_filename
        FROM signatures s
        WHERE s.dhash_vector IS NOT NULL
          AND s.assigned_accountant IS NOT NULL
    ''')
    rows = cur.fetchall()
    print(f" Loaded {len(rows):,} signatures with dHash")

    # Group by accountant
    acct_groups = {}
    for sig_id, acct, dhash, filename in rows:
        acct_groups.setdefault(acct, []).append((sig_id, dhash, filename))

    # Filter out singletons — a lone signature has no same-accountant pair.
    acct_groups = {k: v for k, v in acct_groups.items() if len(v) >= 2}
    total_sigs = sum(len(v) for v in acct_groups.values())
    total_pairs = sum(len(v) * (len(v) - 1) // 2 for v in acct_groups.values())
    print(f" {len(acct_groups)} accountants, {total_sigs:,} signatures, {total_pairs:,} pairs")

    t0 = time.time()
    updates = []
    accts_done = 0

    for acct, sigs in acct_groups.items():
        n = len(sigs)
        sig_ids = [s[0] for s in sigs]
        hashes = [s[1] for s in sigs]
        filenames = [s[2] for s in sigs]

        # Unpack all hashes to bit arrays for vectorized hamming
        bits = np.array([np.unpackbits(np.frombuffer(h, dtype=np.uint8)) for h in hashes],
                        dtype=np.uint8)  # shape: (n, 64)

        # Pairwise hamming via XOR + sum
        # For groups up to ~2000, direct matrix computation is fine
        # NOTE(review): the broadcast XOR tensor is (n, n, 64) bytes ≈ n²·64 B
        # (~256 MB at n=2000); confirm the largest accountant group stays
        # within that bound before raising the limit.
        # hamming_matrix[i,j] = number of differing bits between i and j
        xor_matrix = bits[:, None, :] ^ bits[None, :, :]  # (n, n, 64)
        hamming_matrix = xor_matrix.sum(axis=2)  # (n, n)
        np.fill_diagonal(hamming_matrix, 999)  # exclude self (real max is 64)

        # For each signature, find min
        min_indices = np.argmin(hamming_matrix, axis=1)
        min_distances = hamming_matrix[np.arange(n), min_indices]

        for i in range(n):
            updates.append((
                int(min_distances[i]),
                filenames[min_indices[i]],
                sig_ids[i]
            ))

        accts_done += 1
        if accts_done % 100 == 0:
            elapsed = time.time() - t0
            print(f" {accts_done}/{len(acct_groups)} accountants ({elapsed:.0f}s)")

    # Write to DB in a single batch after all groups are processed
    print(f" Writing {len(updates):,} results to DB...")
    cur.executemany('''
        UPDATE signatures
        SET min_dhash_independent = ?, min_dhash_independent_match = ?
        WHERE signature_id = ?
    ''', updates)
    conn.commit()
    conn.close()

    elapsed = time.time() - t0
    print(f" Phase 2 done: {len(updates):,} signatures in {elapsed:.1f}s")
|
||||||
|
|
||||||
|
|
||||||
|
# ── Phase 3: Summary statistics ──────────────────────────────────────
|
||||||
|
|
||||||
|
def print_summary():
    """Print summary comparing conditional vs independent dHash.

    "Conditional" = phash_distance_to_closest (dHash of the cosine-nearest
    pair); "independent" = min_dhash_independent (all-pairs minimum).
    Reports overall means, percentile tables, per-threshold agreement
    counts, and a Firm A-specific breakdown.
    """
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()

    # Overall stats — restricted to rows where BOTH metrics exist so the
    # comparison is over the same population.
    cur.execute('''
        SELECT
            COUNT(*) as n,
            AVG(phash_distance_to_closest) as cond_mean,
            AVG(min_dhash_independent) as indep_mean
        FROM signatures
        WHERE min_dhash_independent IS NOT NULL
          AND phash_distance_to_closest IS NOT NULL
    ''')
    n, cond_mean, indep_mean = cur.fetchone()

    print(f"\n{'='*65}")
    print(f" COMPARISON: Conditional vs Independent dHash")
    print(f"{'='*65}")
    print(f" N = {n:,}")
    print(f" Conditional dHash (cosine-nearest pair): mean = {cond_mean:.2f}")
    print(f" Independent dHash (all-pairs min): mean = {indep_mean:.2f}")

    # Percentiles
    cur.execute('''
        SELECT phash_distance_to_closest, min_dhash_independent
        FROM signatures
        WHERE min_dhash_independent IS NOT NULL
          AND phash_distance_to_closest IS NOT NULL
    ''')
    rows = cur.fetchall()
    cond = np.array([r[0] for r in rows])
    indep = np.array([r[1] for r in rows])

    print(f"\n {'Percentile':<12} {'Conditional':>12} {'Independent':>12} {'Diff':>8}")
    print(f" {'-'*44}")
    for p in [1, 5, 10, 25, 50, 75, 90, 95, 99]:
        cv = np.percentile(cond, p)
        iv = np.percentile(indep, p)
        print(f" P{p:<10d} {cv:>12.1f} {iv:>12.1f} {iv-cv:>+8.1f}")

    # Agreement analysis: do the two metrics classify the same signatures
    # as "close" at each candidate threshold?
    print(f"\n Agreement analysis (both ≤ threshold):")
    for t in [5, 10, 15, 21]:
        both = np.sum((cond <= t) & (indep <= t))
        cond_only = np.sum((cond <= t) & (indep > t))
        indep_only = np.sum((cond > t) & (indep <= t))
        neither = np.sum((cond > t) & (indep > t))
        agree_pct = (both + neither) / len(cond) * 100
        print(f" θ={t:>2d}: both={both:,}, cond_only={cond_only:,}, "
              f"indep_only={indep_only:,}, neither={neither:,} (agree={agree_pct:.1f}%)")

    # Firm A specific
    cur.execute('''
        SELECT s.phash_distance_to_closest, s.min_dhash_independent
        FROM signatures s
        LEFT JOIN accountants a ON s.assigned_accountant = a.name
        WHERE a.firm = '勤業眾信聯合'
          AND s.min_dhash_independent IS NOT NULL
          AND s.phash_distance_to_closest IS NOT NULL
    ''')
    rows = cur.fetchall()
    if rows:
        cond_a = np.array([r[0] for r in rows])
        indep_a = np.array([r[1] for r in rows])
        print(f"\n Firm A (勤業眾信) — N={len(rows):,}:")
        print(f" {'Percentile':<12} {'Conditional':>12} {'Independent':>12}")
        print(f" {'-'*36}")
        for p in [50, 75, 90, 95, 99]:
            print(f" P{p:<10d} {np.percentile(cond_a, p):>12.1f} {np.percentile(indep_a, p):>12.1f}")

    conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    # Three-phase pipeline: hash computation → all-pairs min → comparison summary.
    t_start = time.time()
    print("=" * 65)
    print(" Independent Min dHash Computation")
    print("=" * 65)

    print(f"\n[Phase 1] Computing dHash vectors...")
    phase1_compute_hashes()

    print(f"\n[Phase 2] Computing all-pairs min dHash per accountant...")
    phase2_compute_min_dhash()

    print(f"\n[Phase 3] Summary...")
    print_summary()

    elapsed = time.time() - t_start
    print(f"\nTotal time: {elapsed:.0f}s ({elapsed/60:.1f} min)")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -0,0 +1,534 @@
|
|||||||
|
# Signature Verification Threshold Validation Options
|
||||||
|
|
||||||
|
**Report Date:** 2026-01-14
|
||||||
|
**Purpose:** Discussion document for research partners on threshold selection methodology
|
||||||
|
**Context:** Validating copy-paste detection thresholds for accountant signature analysis
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Table of Contents
|
||||||
|
|
||||||
|
1. [Current Findings Summary](#1-current-findings-summary)
|
||||||
|
2. [The Core Problem](#2-the-core-problem)
|
||||||
|
3. [Key Metrics Explained](#3-key-metrics-explained)
|
||||||
|
4. [Validation Options](#4-validation-options)
|
||||||
|
5. [Academic References](#5-academic-references)
|
||||||
|
6. [Recommendations](#6-recommendations)
|
||||||
|
7. [Next Steps for Discussion](#7-next-steps-for-discussion)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Current Findings Summary
|
||||||
|
|
||||||
|
Our YOLO-based signature extraction and similarity analysis produced the following results:
|
||||||
|
|
||||||
|
| Metric | Value |
|
||||||
|
|--------|-------|
|
||||||
|
| Total PDFs analyzed | 84,386 |
|
||||||
|
| Total signatures extracted | 168,755 |
|
||||||
|
| High similarity pairs (>0.95) | 659,111 |
|
||||||
|
| Classified as "copy-paste" | 71,656 PDFs (84.9%) |
|
||||||
|
| Classified as "authentic" | 76 PDFs (0.1%) |
|
||||||
|
| Uncertain | 12,651 PDFs (15.0%) |
|
||||||
|
|
||||||
|
**Current threshold used:**
|
||||||
|
- Copy-paste: similarity ≥ 0.95
|
||||||
|
- Authentic: similarity ≤ 0.85
|
||||||
|
- Uncertain: 0.85 < similarity < 0.95
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. The Core Problem
|
||||||
|
|
||||||
|
### 2.1 What is Ground Truth?
|
||||||
|
|
||||||
|
**Ground truth labels** are pre-verified classifications that serve as the "correct answer" for machine learning evaluation. For signature verification:
|
||||||
|
|
||||||
|
| Label | Meaning | How to Obtain |
|
||||||
|
|-------|---------|---------------|
|
||||||
|
| **Genuine** | Physically hand-signed by the accountant | Expert forensic examination |
|
||||||
|
| **Copy-paste/Forged** | Digitally copied from another document | Pixel-level analysis or expert verification |
|
||||||
|
|
||||||
|
### 2.2 Why We Need Ground Truth
|
||||||
|
|
||||||
|
To calculate rigorous metrics like EER (Equal Error Rate), we need labeled data:
|
||||||
|
|
||||||
|
```
|
||||||
|
EER Calculation requires:
|
||||||
|
├── Known genuine signatures → Calculate FRR at each threshold
|
||||||
|
├── Known forged signatures → Calculate FAR at each threshold
|
||||||
|
└── Find threshold where FAR = FRR → This is EER
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2.3 Our Current Limitation
|
||||||
|
|
||||||
|
We do not have pre-labeled ground truth data. Our current classification is based on:
|
||||||
|
- **Domain assumption**: Identical handwritten signatures are physically impossible
|
||||||
|
- **Similarity threshold**: Arbitrarily selected at 0.95
|
||||||
|
|
||||||
|
This approach is reasonable but may be challenged in academic peer review without additional validation.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Key Metrics Explained
|
||||||
|
|
||||||
|
### 3.1 Error Rate Metrics
|
||||||
|
|
||||||
|
| Metric | Full Name | Formula | Interpretation |
|
||||||
|
|--------|-----------|---------|----------------|
|
||||||
|
| **FAR** | False Acceptance Rate | Forgeries Accepted / Total Forgeries | Security risk |
|
||||||
|
| **FRR** | False Rejection Rate | Genuine Rejected / Total Genuine | Usability risk |
|
||||||
|
| **EER** | Equal Error Rate | Point where FAR = FRR | Overall performance |
|
||||||
|
| **AER** | Average Error Rate | (FAR + FRR) / 2 | Combined error |
|
||||||
|
|
||||||
|
### 3.2 Visual Representation of EER
|
||||||
|
|
||||||
|
```
|
||||||
|
100% ┌─────────────────────────────────────┐
|
||||||
|
│ FRR │
|
||||||
|
│ \ │
|
||||||
|
│ \ │
|
||||||
|
Rate │ \ ╳ ← EER point │
|
||||||
|
│ \ / │
|
||||||
|
│ \ / │
|
||||||
|
│ \ / FAR │
|
||||||
|
0% │────────\/──────────────────────────│
|
||||||
|
└─────────────────────────────────────┘
|
||||||
|
Low ←──── Threshold ────→ High
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3.3 Benchmark Performance (from Literature)
|
||||||
|
|
||||||
|
| System | Dataset | EER | Reference |
|
||||||
|
|--------|---------|-----|-----------|
|
||||||
|
| SigNet (Siamese CNN) | GPDS-300 | 3.92% | Dey et al., 2017 |
|
||||||
|
| Consensus-Threshold | GPDS-300 | 1.27% FAR | arXiv:2401.03085 |
|
||||||
|
| Type-2 Neutrosophic | Custom | 98% accuracy | IASC 2024 |
|
||||||
|
| InceptionV3 Transfer | CEDAR | 99.10% accuracy | Springer 2024 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. Validation Options
|
||||||
|
|
||||||
|
### Option 1: Manual Ground Truth Creation (Most Rigorous)
|
||||||
|
|
||||||
|
**Description:**
|
||||||
|
Manually verify a subset of signatures with human expert examination.
|
||||||
|
|
||||||
|
**Methodology:**
|
||||||
|
1. Randomly sample ~100-200 signature pairs from different similarity ranges
|
||||||
|
2. Expert examines original PDF documents for:
|
||||||
|
- Scan artifact variations (genuine scans have unique noise)
|
||||||
|
- Pixel-perfect alignment (copy-paste is exact)
|
||||||
|
- Ink pressure and stroke variations
|
||||||
|
- Document metadata (creation dates, software used)
|
||||||
|
3. Label each pair as "genuine" or "copy-paste"
|
||||||
|
4. Calculate EER, FAR, FRR at various thresholds
|
||||||
|
5. Select optimal threshold based on EER
|
||||||
|
|
||||||
|
**Pros:**
|
||||||
|
- Academically rigorous
|
||||||
|
- Enables standard metric calculation (EER, FAR, FRR)
|
||||||
|
- Defensible in peer review
|
||||||
|
|
||||||
|
**Cons:**
|
||||||
|
- Time-consuming (estimated 20-40 hours for 200 samples)
|
||||||
|
- Requires forensic document expertise
|
||||||
|
- Subjective in edge cases
|
||||||
|
|
||||||
|
**Academic Support:**
|
||||||
|
> "The final verification results can be obtained by the voting method with different thresholds and can be adjusted according to different types of application requirements."
|
||||||
|
> — Hadjadj et al., Applied Sciences, 2020 [[1]](#ref1)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Option 2: Statistical Distribution-Based Threshold (No Labels Needed)
|
||||||
|
|
||||||
|
**Description:**
|
||||||
|
Use the statistical distribution of similarity scores to define outliers.
|
||||||
|
|
||||||
|
**Methodology:**
|
||||||
|
1. Calculate mean (μ) and standard deviation (σ) of all similarity scores
|
||||||
|
2. Define thresholds based on standard deviations:
|
||||||
|
|
||||||
|
| Threshold | Formula | Percentile (one-sided, under normality) | Classification |
|-----------|---------|-----------------------------------------|----------------|
| Very High | > μ + 3σ | above 99.87th | Definite copy-paste |
| High | > μ + 2σ | above 97.7th | Likely copy-paste |
| Normal | μ ± 2σ | 2.3rd – 97.7th | Uncertain |
| Low | < μ - 2σ | below 2.3rd | Likely genuine |

(Note: the familiar "95% / 99.7%" figures describe two-sided coverage of μ ± 2σ / μ ± 3σ; the one-sided exceedance percentiles used for these cut-offs are 97.7th and 99.87th.)
|
||||||
|
|
||||||
|
**Your Data:**
|
||||||
|
```
|
||||||
|
Mean similarity (μ) = 0.7608
|
||||||
|
Std deviation (σ) = 0.0916
|
||||||
|
|
||||||
|
Thresholds:
|
||||||
|
- μ + 2σ = 0.944 (95th percentile)
|
||||||
|
- μ + 3σ = 1.035 (≈99.87th percentile one-sided under normality; capped at 1.0)
|
||||||
|
|
||||||
|
Your current 0.95 threshold ≈ μ + 2.07σ (96th percentile)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Pros:**
|
||||||
|
- No manual labeling required
|
||||||
|
- Statistically defensible
|
||||||
|
- Based on actual data distribution
|
||||||
|
|
||||||
|
**Cons:**
|
||||||
|
- Assumes normal distribution (may not hold)
|
||||||
|
- Does not provide FAR/FRR metrics
|
||||||
|
- Less intuitive for non-statistical audiences
|
||||||
|
|
||||||
|
**Academic Support:**
|
||||||
|
> "Keypoint-based detection methods employ statistical thresholds derived from feature distributions to identify anomalous similarity patterns."
|
||||||
|
> — Copy-Move Forgery Detection Survey, Multimedia Tools & Applications, 2024 [[2]](#ref2)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Option 3: Physical Impossibility Argument (Domain Knowledge)
|
||||||
|
|
||||||
|
**Description:**
|
||||||
|
Use the physical impossibility of identical handwritten signatures as justification.
|
||||||
|
|
||||||
|
**Methodology:**
|
||||||
|
1. Define threshold based on handwriting science:
|
||||||
|
|
||||||
|
| Similarity | Physical Interpretation | Classification |
|
||||||
|
|------------|------------------------|----------------|
|
||||||
|
| = 1.0 | Pixel-identical; physically impossible for handwriting | **Definite copy** |
|
||||||
|
| > 0.98 | Near-identical; extremely improbable naturally | **Very likely copy** |
|
||||||
|
| 0.90 - 0.98 | Highly similar; unusual but possible | **Suspicious** |
|
||||||
|
| 0.80 - 0.90 | Similar; consistent with same signer | **Uncertain** |
|
||||||
|
| < 0.80 | Different; normal variation | **Likely genuine** |
|
||||||
|
|
||||||
|
2. Cite forensic document examination literature on signature variability
|
||||||
|
|
||||||
|
**Pros:**
|
||||||
|
- Intuitive and explainable
|
||||||
|
- Based on established forensic principles
|
||||||
|
- Does not require labeled data
|
||||||
|
|
||||||
|
**Cons:**
|
||||||
|
- Thresholds are somewhat arbitrary
|
||||||
|
- May not account for digital signature pads (lower variation)
|
||||||
|
- Requires supporting citations
|
||||||
|
|
||||||
|
**Academic Support:**
|
||||||
|
> "Signature verification presents several unique difficulties: high intra-class variability (an individual's signature may vary greatly day-to-day), large temporal variation (signature may change completely over time), and high inter-class similarity (forgeries attempt to be indistinguishable)."
|
||||||
|
> — Stanford CS231n Report, 2016 [[3]](#ref3)
|
||||||
|
|
||||||
|
> "A genuine signer's signature is naturally unstable even at short time-intervals, presenting inherent variation that digital copies lack."
|
||||||
|
> — Consensus-Threshold Criterion, arXiv:2401.03085, 2024 [[4]](#ref4)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Option 4: Pixel-Level Copy Detection (Technical Verification)
|
||||||
|
|
||||||
|
**Description:**
|
||||||
|
Detect exact copies through pixel-level analysis, independent of feature similarity.
|
||||||
|
|
||||||
|
**Methodology:**
|
||||||
|
1. For high-similarity pairs (>0.95), perform additional checks:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Check 1: Exact pixel match
|
||||||
|
if np.array_equal(image1, image2):
|
||||||
|
return "DEFINITE_COPY"
|
||||||
|
|
||||||
|
# Check 2: Structural Similarity Index (SSIM)
|
||||||
|
ssim_score = structural_similarity(image1, image2)
|
||||||
|
if ssim_score > 0.999:
|
||||||
|
return "DEFINITE_COPY"
|
||||||
|
|
||||||
|
# Check 3: Histogram correlation
|
||||||
|
hist_corr = cv2.compareHist(hist1, hist2, cv2.HISTCMP_CORREL)
|
||||||
|
if hist_corr > 0.999:
|
||||||
|
return "LIKELY_COPY"
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Use copy-move forgery detection (CMFD) techniques from image forensics
|
||||||
|
|
||||||
|
**Pros:**
|
||||||
|
- Technical proof of copying
|
||||||
|
- Not dependent on threshold selection
|
||||||
|
- Provides definitive evidence for exact copies
|
||||||
|
|
||||||
|
**Cons:**
|
||||||
|
- Only detects exact copies (not scaled/rotated)
|
||||||
|
- Requires additional processing
|
||||||
|
- May miss high-quality forgeries
|
||||||
|
|
||||||
|
**Academic Support:**
|
||||||
|
> "Block-based methods segment an image into overlapping blocks and extract features. The forgery regions are determined by computing the similarity between block features using DCT (Discrete Cosine Transform) or SIFT (Scale-Invariant Feature Transform)."
|
||||||
|
> — Copy-Move Forgery Detection Survey, 2024 [[2]](#ref2)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Option 5: Siamese Network with Learned Threshold (Advanced)
|
||||||
|
|
||||||
|
**Description:**
|
||||||
|
Train a Siamese neural network on signature pairs to learn optimal decision boundaries.
|
||||||
|
|
||||||
|
**Methodology:**
|
||||||
|
1. Collect training data:
|
||||||
|
- Positive pairs: Same accountant, different documents
|
||||||
|
- Negative pairs: Different accountants
|
||||||
|
2. Train Siamese network with contrastive or triplet loss
|
||||||
|
3. Network learns embedding space where:
|
||||||
|
- Same-person signatures cluster together
|
||||||
|
- Different-person signatures separate
|
||||||
|
4. Threshold is learned during training, not manually set
|
||||||
|
|
||||||
|
**Architecture:**
|
||||||
|
```
|
||||||
|
┌──────────────┐ ┌──────────────┐
|
||||||
|
│ Signature 1 │ │ Signature 2 │
|
||||||
|
└──────┬───────┘ └──────┬───────┘
|
||||||
|
│ │
|
||||||
|
▼ ▼
|
||||||
|
┌──────────────┐ ┌──────────────┐
|
||||||
|
│ CNN │ │ CNN │ (Shared weights)
|
||||||
|
│ Encoder │ │ Encoder │
|
||||||
|
└──────┬───────┘ └──────┬───────┘
|
||||||
|
│ │
|
||||||
|
▼ ▼
|
||||||
|
┌──────────────┐ ┌──────────────┐
|
||||||
|
│ Embedding │ │ Embedding │
|
||||||
|
│ Vector │ │ Vector │
|
||||||
|
└──────┬───────┘ └──────┬───────┘
|
||||||
|
│ │
|
||||||
|
└────────┬───────────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌───────────────┐
|
||||||
|
│ Distance │
|
||||||
|
│ Metric │
|
||||||
|
└───────┬───────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌───────────────┐
|
||||||
|
│ Same/Different│
|
||||||
|
└───────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
**Pros:**
|
||||||
|
- Learns optimal threshold from data
|
||||||
|
- State-of-the-art performance
|
||||||
|
- Handles complex variations
|
||||||
|
|
||||||
|
**Cons:**
|
||||||
|
- Requires substantial training data
|
||||||
|
- Computationally expensive
|
||||||
|
- May overfit to specific accountant styles
|
||||||
|
|
||||||
|
**Academic Support:**
|
||||||
|
> "SigNet provided better results than the state-of-the-art results on most of the benchmark signature datasets by learning a feature space where similar observations are placed in proximity."
|
||||||
|
> — SigNet, arXiv:1707.02131, 2017 [[5]](#ref5)
|
||||||
|
|
||||||
|
> "Among various distance measures employed in the t-Siamese similarity network, the Manhattan distance technique emerged as the most effective."
|
||||||
|
> — Triplet Siamese Similarity Networks, Mathematics, 2024 [[6]](#ref6)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Academic References
|
||||||
|
|
||||||
|
<a name="ref1"></a>
|
||||||
|
### [1] Single Known Sample Verification (MDPI 2020)
|
||||||
|
**Title:** An Offline Signature Verification and Forgery Detection Method Based on a Single Known Sample and an Explainable Deep Learning Approach
|
||||||
|
**Authors:** Hadjadj, I. et al.
|
||||||
|
**Journal:** Applied Sciences, 10(11), 3716
|
||||||
|
**Year:** 2020
|
||||||
|
**URL:** https://www.mdpi.com/2076-3417/10/11/3716
|
||||||
|
**Key Findings:**
|
||||||
|
- Accuracy: 94.37% - 99.96%
|
||||||
|
- FRR: 0% - 5.88%
|
||||||
|
- FAR: 0.22% - 5.34%
|
||||||
|
- Voting method with adjustable thresholds
|
||||||
|
|
||||||
|
<a name="ref2"></a>
|
||||||
|
### [2] Copy-Move Forgery Detection Survey (Springer 2024)
|
||||||
|
**Title:** Copy-move forgery detection in digital image forensics: A survey
|
||||||
|
**Journal:** Multimedia Tools and Applications
|
||||||
|
**Year:** 2024
|
||||||
|
**URL:** https://link.springer.com/article/10.1007/s11042-024-18399-2
|
||||||
|
**Key Findings:**
|
||||||
|
- Block-based, keypoint-based, and deep learning methods reviewed
|
||||||
|
- DCT and SIFT for feature extraction
|
||||||
|
- Statistical thresholds for anomaly detection
|
||||||
|
|
||||||
|
<a name="ref3"></a>
|
||||||
|
### [3] Stanford CS231n Signature Verification Report
|
||||||
|
**Title:** Offline Signature Verification with Convolutional Neural Networks
|
||||||
|
**Institution:** Stanford University
|
||||||
|
**Year:** 2016
|
||||||
|
**URL:** https://cs231n.stanford.edu/reports/2016/pdfs/276_Report.pdf
|
||||||
|
**Key Findings:**
|
||||||
|
- High intra-class variability challenge
|
||||||
|
- Low inter-class similarity for skilled forgeries
|
||||||
|
- CNN-based feature extraction
|
||||||
|
|
||||||
|
<a name="ref4"></a>
|
||||||
|
### [4] Consensus-Threshold Criterion (arXiv 2024)
|
||||||
|
**Title:** Consensus-Threshold Criterion for Offline Signature Verification using Convolutional Neural Network Learned Representations
|
||||||
|
**Year:** 2024
|
||||||
|
**URL:** https://arxiv.org/abs/2401.03085
|
||||||
|
**Key Findings:**
|
||||||
|
- Achieved 1.27% FAR (vs 8.73% and 17.31% in prior work)
|
||||||
|
- Consensus-threshold distance-based classifier
|
||||||
|
- Uses SigNet and SigNet-F features
|
||||||
|
|
||||||
|
<a name="ref5"></a>
|
||||||
|
### [5] SigNet: Siamese Network for Signature Verification (arXiv 2017)
|
||||||
|
**Title:** SigNet: Convolutional Siamese Network for Writer Independent Offline Signature Verification
|
||||||
|
**Authors:** Dey, S. et al.
|
||||||
|
**Year:** 2017
|
||||||
|
**URL:** https://arxiv.org/abs/1707.02131
|
||||||
|
**Key Findings:**
|
||||||
|
- Siamese architecture with shared weights
|
||||||
|
- Euclidean distance minimization for genuine pairs
|
||||||
|
- State-of-the-art on GPDS, CEDAR, MCYT datasets
|
||||||
|
|
||||||
|
<a name="ref6"></a>
|
||||||
|
### [6] Triplet Siamese Similarity Networks (MDPI 2024)
|
||||||
|
**Title:** Enhancing Signature Verification Using Triplet Siamese Similarity Networks in Digital Documents
|
||||||
|
**Journal:** Mathematics, 12(17), 2757
|
||||||
|
**Year:** 2024
|
||||||
|
**URL:** https://www.mdpi.com/2227-7390/12/17/2757
|
||||||
|
**Key Findings:**
|
||||||
|
- Manhattan distance outperforms Euclidean and Minkowski
|
||||||
|
- Triplet loss for inter-class/intra-class optimization
|
||||||
|
- Tested on 4NSigComp2012, SigComp2011, BHSig260
|
||||||
|
|
||||||
|
<a name="ref7"></a>
|
||||||
|
### [7] Original Siamese Network Paper (NeurIPS 1993)
|
||||||
|
**Title:** Signature Verification using a "Siamese" Time Delay Neural Network
|
||||||
|
**Authors:** Bromley, J. et al.
|
||||||
|
**Conference:** NeurIPS 1993
|
||||||
|
**URL:** https://papers.neurips.cc/paper/1993/file/288cc0ff022877bd3df94bc9360b9c5d-Paper.pdf
|
||||||
|
**Key Findings:**
|
||||||
|
- Introduced Siamese architecture for signature verification
|
||||||
|
- Cosine similarity = 1.0 for genuine pairs
|
||||||
|
- Foundational work for modern approaches
|
||||||
|
|
||||||
|
<a name="ref8"></a>
|
||||||
|
### [8] Australian Journal of Forensic Sciences (2024)
|
||||||
|
**Title:** Handling high level of uncertainty in forensic signature examination
|
||||||
|
**Journal:** Australian Journal of Forensic Sciences, 57(5)
|
||||||
|
**Year:** 2024
|
||||||
|
**URL:** https://www.tandfonline.com/doi/full/10.1080/00450618.2024.2410044
|
||||||
|
**Key Findings:**
|
||||||
|
- Type-2 Neutrosophic similarity measure
|
||||||
|
- 98% accuracy (vs 95% for Type-1)
|
||||||
|
- Addresses ambiguity in forensic analysis
|
||||||
|
|
||||||
|
<a name="ref9"></a>
|
||||||
|
### [9] Benchmark Datasets
|
||||||
|
**CEDAR Dataset:**
|
||||||
|
- 55 signers × 24 genuine + 24 forged signatures
|
||||||
|
- URL: https://paperswithcode.com/dataset/cedar-signature
|
||||||
|
|
||||||
|
**GPDS-960 Corpus:**
|
||||||
|
- 960 writers × 24 genuine + 30 forgeries
|
||||||
|
- 600 dpi grayscale scans
|
||||||
|
- URL: https://www.researchgate.net/publication/220860371
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. Recommendations
|
||||||
|
|
||||||
|
### For Academic Publication
|
||||||
|
|
||||||
|
| Priority | Option | Effort | Rigor | Recommendation |
|
||||||
|
|----------|--------|--------|-------|----------------|
|
||||||
|
| 1 | **Option 1 + Option 2** | High | Very High | Create small labeled dataset + validate statistical threshold |
|
||||||
|
| 2 | **Option 2 + Option 3** | Low | Medium | Statistical threshold + physical impossibility argument |
|
||||||
|
| 3 | **Option 4** | Medium | High | Add pixel-level verification for definitive cases |
|
||||||
|
|
||||||
|
### Suggested Approach
|
||||||
|
|
||||||
|
1. **Primary method:** Use statistical threshold (Option 2)
|
||||||
|
- Report threshold as μ + 2σ ≈ 0.944 (close to your current 0.95)
|
||||||
|
- Statistically defensible without ground truth
|
||||||
|
|
||||||
|
2. **Supporting evidence:** Physical impossibility argument (Option 3)
|
||||||
|
- Cite forensic literature on signature variability
|
||||||
|
- Emphasize that identical signatures are physically impossible
|
||||||
|
|
||||||
|
3. **Validation (if time permits):** Small labeled subset (Option 1)
|
||||||
|
- Manually verify 100-200 samples
|
||||||
|
- Calculate EER to validate threshold choice
|
||||||
|
|
||||||
|
4. **Technical proof:** Pixel-level analysis (Option 4)
|
||||||
|
- Add SSIM analysis for high-similarity pairs
|
||||||
|
- Report exact copy counts separately
|
||||||
|
|
||||||
|
### Suggested Report Language
|
||||||
|
|
||||||
|
> "We adopt a similarity threshold of 0.95 (approximately μ + 2σ, representing the 96th percentile of our similarity distribution) to classify signatures as potential copy-paste instances. This threshold is supported by: (1) statistical outlier detection principles, (2) the physical impossibility of pixel-identical handwritten signatures, and (3) alignment with forensic document examination literature [cite: Hadjadj 2020, arXiv:2401.03085]."
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. Next Steps for Discussion
|
||||||
|
|
||||||
|
### Questions for Research Partners
|
||||||
|
|
||||||
|
1. **Data availability:** Do we have access to any documents with known authentic signatures for validation?
|
||||||
|
|
||||||
|
2. **Expert resources:** Can we involve a forensic document examiner for ground truth labeling?
|
||||||
|
|
||||||
|
3. **Scope decision:** Should we focus on statistical validation (faster) or pursue full EER analysis (more rigorous)?
|
||||||
|
|
||||||
|
4. **Publication target:** What level of rigor does the target journal require?
|
||||||
|
|
||||||
|
5. **Time constraints:** How much time can we allocate to validation before submission?
|
||||||
|
|
||||||
|
### Proposed Action Items
|
||||||
|
|
||||||
|
| Task | Owner | Deadline | Notes |
|
||||||
|
|------|-------|----------|-------|
|
||||||
|
| Review this document | All partners | TBD | Discuss options |
|
||||||
|
| Select validation approach | Team decision | TBD | Based on resources |
|
||||||
|
| Implement selected approach | TBD | TBD | After decision |
|
||||||
|
| Update threshold if needed | TBD | TBD | Based on validation |
|
||||||
|
| Draft methodology section | TBD | TBD | For paper |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Appendix: Code for Statistical Threshold Calculation
|
||||||
|
|
||||||
|
```python
|
||||||
|
import numpy as np
|
||||||
|
from scipy import stats
|
||||||
|
|
||||||
|
# Your similarity data
|
||||||
|
similarities = np.asarray([...])  # Load from your analysis (array form so the vectorized comparison below works)
|
||||||
|
|
||||||
|
# Calculate statistics
|
||||||
|
mean_sim = np.mean(similarities)
|
||||||
|
std_sim = np.std(similarities)
|
||||||
|
percentiles = np.percentile(similarities, [90, 95, 99, 99.7])
|
||||||
|
|
||||||
|
print(f"Mean (μ): {mean_sim:.4f}")
|
||||||
|
print(f"Std (σ): {std_sim:.4f}")
|
||||||
|
print(f"μ + 2σ: {mean_sim + 2*std_sim:.4f}")
|
||||||
|
print(f"μ + 3σ: {mean_sim + 3*std_sim:.4f}")
|
||||||
|
print(f"Percentiles: 90%={percentiles[0]:.4f}, 95%={percentiles[1]:.4f}, "
|
||||||
|
f"99%={percentiles[2]:.4f}, 99.7%={percentiles[3]:.4f}")
|
||||||
|
|
||||||
|
# Threshold recommendations
|
||||||
|
thresholds = {
|
||||||
|
"Conservative (μ+3σ)": min(1.0, mean_sim + 3*std_sim),
|
||||||
|
"Standard (μ+2σ)": mean_sim + 2*std_sim,
|
||||||
|
"Liberal (95th percentile)": percentiles[1],
|
||||||
|
}
|
||||||
|
|
||||||
|
for name, thresh in thresholds.items():
|
||||||
|
count_above = np.sum(similarities > thresh)
|
||||||
|
pct_above = 100 * count_above / len(similarities)
|
||||||
|
print(f"{name}: {thresh:.4f} → {count_above} pairs ({pct_above:.2f}%)")
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
*Document prepared for research discussion. Please share feedback and questions with the team.*
|
||||||
Reference in New Issue
Block a user