Add Deloitte distribution & independent dHash analysis scripts

- Script 13: Firm A normality/multimodality analysis (Shapiro-Wilk, Anderson-Darling, KDE, per-accountant ANOVA, Beta/Gamma fitting)
- Script 14: Independent min-dHash computation across all pairs per accountant (not just cosine-nearest pair)
- THRESHOLD_VALIDATION_OPTIONS: 2026-01 discussion doc on threshold validation approaches
- .gitignore: exclude model weights, node artifacts, and xlsx data

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-20 21:34:24 +08:00
parent 939a348da4
commit a261a22bd2
4 changed files with 1270 additions and 0 deletions
@@ -0,0 +1,430 @@
#!/usr/bin/env python3
"""
Deloitte (勤業眾信) Signature Similarity Distribution Analysis
==============================================================
Evaluate whether Firm A's max_similarity values follow a normal distribution
or contain subgroups (e.g., genuinely hand-signed vs digitally stamped).
Tests:
1. Descriptive statistics & percentiles
2. Normality tests (Shapiro-Wilk, D'Agostino-Pearson, Anderson-Darling, KS)
3. Histogram + KDE + fitted normal overlay
4. Q-Q plot
5. Multimodality check (Hartigan's dip test approximation)
6. Outlier identification (signatures with unusually low similarity)
7. dHash distance distribution for Firm A
Output: figures + report to console
"""
import sqlite3
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from scipy import stats
from pathlib import Path
from collections import Counter
# Absolute path to the SQLite database written by the signature-analysis pipeline.
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
# All figures produced by this script are saved here; created eagerly at import time.
OUTPUT_DIR = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/deloitte_distribution')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Firm name prefix for Deloitte (勤業眾信) as stored in accountants.firm.
FIRM_A = '勤業眾信聯合'
def load_firm_a_data():
    """Fetch every Firm A signature row that has a cosine-similarity score.

    Returns:
        list[dict]: one dict per signature with keys ``sig_id``, ``filename``,
        ``accountant``, ``cosine`` (max similarity to same accountant) and
        ``phash`` (dHash distance to closest match; may be None).
    """
    conn = sqlite3.connect(DB_PATH)
    rows = conn.execute('''
        SELECT s.signature_id, s.image_filename, s.assigned_accountant,
               s.max_similarity_to_same_accountant,
               s.phash_distance_to_closest
        FROM signatures s
        LEFT JOIN accountants a ON s.assigned_accountant = a.name
        WHERE a.firm = ?
          AND s.max_similarity_to_same_accountant IS NOT NULL
    ''', (FIRM_A,)).fetchall()
    conn.close()
    # Zip each tuple against the fixed column aliases used downstream.
    keys = ('sig_id', 'filename', 'accountant', 'cosine', 'phash')
    return [dict(zip(keys, row)) for row in rows]
def descriptive_stats(cosines, label="Firm A Cosine Similarity"):
    """Print a stdout summary of *cosines*: moments, spread and percentiles.

    Args:
        cosines: array-like of similarity values.
        label: heading printed above the table.
    """
    rule = '=' * 65
    p25, p75 = np.percentile(cosines, 25), np.percentile(cosines, 75)
    # (name, pre-formatted value) pairs, printed in order below.
    summary = (
        ('N', f"{len(cosines):,}"),
        ('Mean', f"{np.mean(cosines):.6f}"),
        ('Median', f"{np.median(cosines):.6f}"),
        ('Std Dev', f"{np.std(cosines):.6f}"),
        ('Variance', f"{np.var(cosines):.8f}"),
        ('Min', f"{np.min(cosines):.6f}"),
        ('Max', f"{np.max(cosines):.6f}"),
        ('Range', f"{np.ptp(cosines):.6f}"),
        ('Skewness', f"{stats.skew(cosines):.4f}"),
        ('Kurtosis', f"{stats.kurtosis(cosines):.4f} (excess)"),
        ('IQR', f"{p75 - p25:.6f}"),
    )
    print(f"\n{rule}")
    print(f" {label}")
    print(f"{rule}")
    for name, value in summary:
        print(f" {name} = {value}")
    print()
    print(f" Percentiles:")
    for pct in (1, 5, 10, 25, 50, 75, 90, 95, 99):
        print(f" P{pct:<3d} = {np.percentile(cosines, pct):.6f}")
def normality_tests(cosines):
    """Run four normality tests on *cosines* and print each verdict.

    Tests: Shapiro-Wilk (on a reproducible 5000-point subsample when the
    input is larger), D'Agostino-Pearson (needs n >= 20), Anderson-Darling,
    and Kolmogorov-Smirnov against a moment-matched normal.

    Returns:
        tuple[float, float]: (mu, sigma) of the fitted normal — mean and
        population std dev (ddof=0) — reused by the plotting code.
    """
    print(f"\n{'='*65}")
    print(f" NORMALITY TESTS")
    print(f"{'='*65}")
    # Shapiro-Wilk is unreliable/slow beyond ~5000 samples; subsample with a
    # FIXED seed so repeated runs print identical results (the original used
    # an unseeded np.random.choice, making the report irreproducible).
    if len(cosines) > 5000:
        rng = np.random.default_rng(0)
        sample = rng.choice(cosines, 5000, replace=False)
        stat, p = stats.shapiro(sample)
        print(f"\n Shapiro-Wilk (n=5000 subsample):")
    else:
        stat, p = stats.shapiro(cosines)
        print(f"\n Shapiro-Wilk (n={len(cosines)}):")
    print(f" W = {stat:.6f}, p = {p:.2e}")
    print(f"{'Normal' if p > 0.05 else 'NOT normal'} at α=0.05")
    # D'Agostino-Pearson combines skew and kurtosis tests; scipy warns for
    # small samples, so only run it with at least 20 observations.
    if len(cosines) >= 20:
        stat, p = stats.normaltest(cosines)
        print(f"\n D'Agostino-Pearson:")
        print(f" K² = {stat:.4f}, p = {p:.2e}")
        print(f"{'Normal' if p > 0.05 else 'NOT normal'} at α=0.05")
    # Anderson-Darling: compare the statistic against each critical value.
    result = stats.anderson(cosines, dist='norm')
    print(f"\n Anderson-Darling:")
    print(f" A² = {result.statistic:.4f}")
    for sl, cv in zip(result.significance_level, result.critical_values):
        reject = "REJECT" if result.statistic > cv else "accept"
        # BUGFIX: original had no separator between the critical value and
        # the verdict, printing e.g. "critical=0.7870REJECT".
        print(f" {sl}%: critical={cv:.4f} {reject}")
    # KS vs a normal whose parameters are estimated from the SAME data —
    # note this makes the p-value optimistic (Lilliefors caveat).
    mu, sigma = np.mean(cosines), np.std(cosines)
    stat, p = stats.kstest(cosines, 'norm', args=(mu, sigma))
    print(f"\n Kolmogorov-Smirnov (vs fitted normal):")
    print(f" D = {stat:.6f}, p = {p:.2e}")
    print(f"{'Normal' if p > 0.05 else 'NOT normal'} at α=0.05")
    return mu, sigma
def test_alternative_distributions(cosines):
"""Fit alternative distributions and compare."""
print(f"\n{'='*65}")
print(f" DISTRIBUTION FITTING (AIC comparison)")
print(f"{'='*65}")
distributions = {
'norm': stats.norm,
'skewnorm': stats.skewnorm,
'beta': stats.beta,
'lognorm': stats.lognorm,
'gamma': stats.gamma,
}
results = []
for name, dist in distributions.items():
try:
params = dist.fit(cosines)
log_likelihood = np.sum(dist.logpdf(cosines, *params))
k = len(params)
aic = 2 * k - 2 * log_likelihood
bic = k * np.log(len(cosines)) - 2 * log_likelihood
results.append((name, aic, bic, params, log_likelihood))
except Exception as e:
print(f" {name}: fit failed ({e})")
results.sort(key=lambda x: x[1]) # sort by AIC
print(f"\n {'Distribution':<15} {'AIC':>12} {'BIC':>12} {'LogLik':>12}")
print(f" {'-'*51}")
for name, aic, bic, params, ll in results:
marker = " ←best" if name == results[0][0] else ""
print(f" {name:<15} {aic:>12.1f} {bic:>12.1f} {ll:>12.1f}{marker}")
return results
def per_accountant_analysis(data):
    """Print per-accountant similarity stats within Firm A and return them.

    Groups rows by ``accountant``, prints one summary line per accountant
    (sorted by ascending mean), then runs a one-way ANOVA and Levene's test
    across accountants with at least 5 signatures.

    Returns:
        list[dict]: per-accountant stats ({accountant, n, mean, std, min,
        values}) in ascending-mean order.
    """
    print(f"\n{'='*65}")
    print(f" PER-ACCOUNTANT ANALYSIS (within Firm A)")
    print(f"{'='*65}")
    grouped = {}
    for row in data:
        grouped.setdefault(row['accountant'], []).append(row['cosine'])
    print(f"\n {'Accountant':<20} {'N':>6} {'Mean':>8} {'Std':>8} {'Min':>8} {'P5':>8} {'P50':>8}")
    print(f" {'-'*66}")
    acct_stats = []
    for name, scores in sorted(grouped.items(), key=lambda kv: np.mean(kv[1])):
        arr = np.asarray(scores)
        print(f" {name:<20} {len(arr):>6} {arr.mean():>8.4f} {arr.std():>8.4f} "
              f"{arr.min():>8.4f} {np.percentile(arr, 5):>8.4f} {np.median(arr):>8.4f}")
        acct_stats.append({
            'accountant': name,
            'n': len(arr),
            'mean': float(arr.mean()),
            'std': float(arr.std()),
            'min': float(arr.min()),
            'values': arr,
        })
    # Homogeneity checks only make sense with >= 2 sufficiently large groups.
    big_groups = [np.asarray(scores) for scores in grouped.values() if len(scores) >= 5]
    if len(grouped) >= 2 and len(big_groups) >= 2:
        f_stat, p_val = stats.f_oneway(*big_groups)
        print(f"\n One-way ANOVA across accountants:")
        print(f" F = {f_stat:.4f}, p = {p_val:.2e}")
        print(f"{'Homogeneous' if p_val > 0.05 else 'Significantly different means'} at α=0.05")
        # Levene is robust to non-normality when checking equal variances.
        lev_stat, lev_p = stats.levene(*big_groups)
        print(f"\n Levene's test (variance homogeneity):")
        print(f" W = {lev_stat:.4f}, p = {lev_p:.2e}")
        print(f"{'Homogeneous variance' if lev_p > 0.05 else 'Heterogeneous variance'} at α=0.05")
    return acct_stats
def identify_outliers(data, cosines):
    """Print Firm A signatures with unusually low similarity (Tukey fences).

    Uses Q1 - 1.5*IQR as the mild fence and Q1 - 3.0*IQR as the extreme
    fence, lists the 20 lowest outliers, then tallies counts below several
    fixed decision thresholds.
    """
    print(f"\n{'='*65}")
    print(f" OUTLIER ANALYSIS (low-similarity Firm A signatures)")
    print(f"{'='*65}")
    q1, q3 = np.percentile(cosines, 25), np.percentile(cosines, 75)
    iqr = q3 - q1
    mild_fence = q1 - 1.5 * iqr
    extreme_fence = q1 - 3.0 * iqr
    print(f" IQR method: Q1={q1:.4f}, Q3={q3:.4f}, IQR={iqr:.4f}")
    print(f" Lower fence (mild): {mild_fence:.4f}")
    print(f" Lower fence (extreme): {extreme_fence:.4f}")
    mild = [row for row in data if row['cosine'] < mild_fence]
    extreme = [row for row in data if row['cosine'] < extreme_fence]
    print(f"\n Mild outliers (< {mild_fence:.4f}): {len(mild)}")
    print(f" Extreme outliers (< {extreme_fence:.4f}): {len(extreme)}")
    if mild:
        print(f"\n Bottom 20 by cosine similarity:")
        for row in sorted(mild, key=lambda r: r['cosine'])[:20]:
            phash_str = f"pHash={row['phash']}" if row['phash'] is not None else "pHash=N/A"
            print(f" cosine={row['cosine']:.4f} {phash_str} {row['accountant']} {row['filename']}")
    # Count how many signatures fall below each operating threshold.
    print(f"\n Signatures below key thresholds:")
    for thresh in (0.95, 0.90, 0.85, 0.837, 0.80):
        n_below = int(np.count_nonzero(np.asarray(cosines) < thresh))
        print(f" < {thresh:.3f}: {n_below:,} ({100*n_below/len(cosines):.2f}%)")
def plot_histogram_kde(cosines, mu, sigma):
    """Plot histogram + KDE + fitted-normal overlay (left) and Q-Q plot (right).

    Args:
        cosines: np.ndarray of max cosine similarities.
        mu, sigma: fitted-normal parameters from normality_tests().

    Saves ``firm_a_cosine_distribution.png`` into OUTPUT_DIR.
    """
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    # Left: Full histogram
    ax = axes[0]
    ax.hist(cosines, bins=80, density=True, alpha=0.6, color='steelblue',
            edgecolor='white', linewidth=0.5, label='Observed')
    # Fitted normal
    x = np.linspace(cosines.min() - 0.02, cosines.max() + 0.02, 300)
    ax.plot(x, stats.norm.pdf(x, mu, sigma), 'r-', lw=2,
            label=f'Normal fit (μ={mu:.4f}, σ={sigma:.4f})')
    # KDE
    kde = stats.gaussian_kde(cosines)
    ax.plot(x, kde(x), 'g--', lw=2, label='KDE')
    # BUGFIX: draw the threshold lines BEFORE calling legend(); the original
    # added them after ax.legend(), so their labels never appeared.
    ax.axvline(0.95, color='orange', ls=':', alpha=0.7, label='θ=0.95')
    ax.axvline(0.837, color='purple', ls=':', alpha=0.7, label='KDE crossover')
    ax.set_xlabel('Max Cosine Similarity')
    ax.set_ylabel('Density')
    ax.set_title(f'Firm A (勤業眾信) Cosine Similarity Distribution (N={len(cosines):,})')
    ax.legend(fontsize=9)
    # Right: Q-Q plot
    ax2 = axes[1]
    stats.probplot(cosines, dist='norm', plot=ax2)
    ax2.set_title('Q-Q Plot (vs Normal)')
    ax2.get_lines()[0].set_markersize(2)
    plt.tight_layout()
    fig.savefig(OUTPUT_DIR / 'firm_a_cosine_distribution.png', dpi=150)
    print(f"\n Saved: {OUTPUT_DIR / 'firm_a_cosine_distribution.png'}")
    # Close this specific figure (plt.close() only closed the current one).
    plt.close(fig)
def plot_per_accountant(acct_stats):
    """Horizontal box plot of per-accountant similarity, sorted by mean.

    NOTE: sorts the caller's ``acct_stats`` list in place (ascending mean).
    Saves ``firm_a_per_accountant_boxplot.png`` into OUTPUT_DIR.
    """
    acct_stats.sort(key=lambda entry: entry['mean'])
    n_accts = len(acct_stats)
    # Grow the figure with the accountant count so labels stay readable.
    fig, ax = plt.subplots(figsize=(12, max(5, n_accts * 0.4)))
    positions = range(n_accts)
    labels = [f"{entry['accountant']} (n={entry['n']})" for entry in acct_stats]
    bp = ax.boxplot([entry['values'] for entry in acct_stats],
                    positions=positions, vert=False, widths=0.6,
                    patch_artist=True, showfliers=True,
                    flierprops=dict(marker='.', markersize=3, alpha=0.5))
    for box in bp['boxes']:
        box.set_facecolor('lightsteelblue')
    ax.set_yticks(positions)
    ax.set_yticklabels(labels, fontsize=8)
    ax.set_xlabel('Max Cosine Similarity')
    ax.set_title('Per-Accountant Similarity Distribution (Firm A)')
    for thresh, color in ((0.95, 'orange'), (0.837, 'purple')):
        ax.axvline(thresh, color=color, ls=':', alpha=0.7)
    plt.tight_layout()
    fig.savefig(OUTPUT_DIR / 'firm_a_per_accountant_boxplot.png', dpi=150)
    print(f" Saved: {OUTPUT_DIR / 'firm_a_per_accountant_boxplot.png'}")
    plt.close()
def plot_phash_distribution(data):
    """Histogram of Firm A dHash distances with decision thresholds marked.

    Rows whose ``phash`` is None are skipped; prints a notice and returns
    early if no dHash values exist. Saves ``firm_a_dhash_distribution.png``.
    """
    distances = np.array([row['phash'] for row in data if row['phash'] is not None])
    if distances.size == 0:
        print(" No pHash data available.")
        return
    fig, ax = plt.subplots(figsize=(10, 5))
    # Unit-width bins centered on integer distances; upper edge capped at 65.
    upper = min(int(distances.max()) + 2, 65)
    edges = np.arange(-0.5, upper + 0.5, 1)
    ax.hist(distances, bins=edges, alpha=0.7, color='coral', edgecolor='white')
    ax.set_xlabel('dHash Distance')
    ax.set_ylabel('Count')
    ax.set_title(f'Firm A dHash Distance Distribution (N={distances.size:,})')
    ax.axvline(5, color='green', ls='--', label='θ=5 (high conf.)')
    ax.axvline(15, color='orange', ls='--', label='θ=15 (moderate)')
    ax.legend()
    plt.tight_layout()
    fig.savefig(OUTPUT_DIR / 'firm_a_dhash_distribution.png', dpi=150)
    print(f" Saved: {OUTPUT_DIR / 'firm_a_dhash_distribution.png'}")
    plt.close()
def multimodality_test(cosines):
    """Detect KDE modes in *cosines*, then re-check under several bandwidths.

    A Silverman-bandwidth Gaussian KDE is evaluated on a 1000-point grid;
    local maxima (scipy find_peaks, prominence 0.01) are reported as modes.
    Multiple modes suggest subgroups within Firm A. A sensitivity sweep
    repeats the count at 0.5x-2.0x the Silverman bandwidth.
    """
    from scipy.signal import find_peaks
    print(f"\n{'='*65}")
    print(f" MULTIMODALITY ANALYSIS")
    print(f"{'='*65}")
    kde = stats.gaussian_kde(cosines, bw_method='silverman')
    grid = np.linspace(cosines.min(), cosines.max(), 1000)
    density = kde(grid)
    peaks, _ = find_peaks(density, prominence=0.01)
    mode_x = grid[peaks]
    mode_y = density[peaks]
    print(f" KDE bandwidth (Silverman): {kde.factor:.6f}")
    print(f" Number of detected modes: {len(peaks)}")
    for i, (pos, height) in enumerate(zip(mode_x, mode_y)):
        print(f" Mode {i+1}: position={pos:.4f}, density={height:.2f}")
    if len(peaks) == 1:
        print(f"\n → Distribution appears UNIMODAL")
        print(f" Single peak at {mode_x[0]:.4f}")
    elif len(peaks) > 1:
        print(f"\n → Distribution appears MULTIMODAL ({len(peaks)} modes)")
        print(f" This suggests subgroups may exist within Firm A")
        # Report gap and valley depth between each adjacent mode pair.
        for i in range(len(peaks) - 1):
            sep = mode_x[i + 1] - mode_x[i]
            between = density[peaks[i]:peaks[i + 1]]
            valley_depth = mode_y[i:i + 2].min() - between.min()
            print(f" Separation {i+1}-{i+2}: Δ={sep:.4f}, valley depth={valley_depth:.2f}")
    # Mode count can be bandwidth-sensitive; sweep a few multipliers.
    print(f"\n Sensitivity analysis (bandwidth variation):")
    for mult in (0.5, 0.75, 1.0, 1.5, 2.0):
        bw = kde.factor * mult
        alt_density = stats.gaussian_kde(cosines, bw_method=bw)(grid)
        alt_peaks, _ = find_peaks(alt_density, prominence=0.005)
        print(f" bw={bw:.4f} (×{mult:.1f}): {len(alt_peaks)} mode(s)")
def main():
    """Run the full Firm A distribution analysis: stats, tests, figures, summary."""
    print("Loading Firm A (勤業眾信) signature data...")
    data = load_firm_a_data()
    print(f"Total Firm A signatures: {len(data):,}")
    cosines = np.array([row['cosine'] for row in data])
    descriptive_stats(cosines)                  # 1. descriptive statistics
    mu, sigma = normality_tests(cosines)        # 2. normality tests
    test_alternative_distributions(cosines)     # 3. alternative distribution fits
    acct_stats = per_accountant_analysis(data)  # 4. per-accountant view
    identify_outliers(data, cosines)            # 5. outlier analysis
    multimodality_test(cosines)                 # 6. multimodality check
    # 7. figures
    banner = '=' * 65
    print(f"\n{banner}")
    print(f" GENERATING FIGURES")
    print(f"{banner}")
    plot_histogram_kde(cosines, mu, sigma)
    plot_per_accountant(acct_stats)
    plot_phash_distribution(data)
    # Final tally against the two operating thresholds used throughout.
    print(f"\n{banner}")
    print(f" SUMMARY")
    print(f"{banner}")
    below_95 = int(np.count_nonzero(cosines < 0.95))
    below_kde = int(np.count_nonzero(cosines < 0.837))
    print(f" Firm A signatures: {len(cosines):,}")
    print(f" Below 0.95 threshold: {below_95:,} ({100*below_95/len(cosines):.1f}%)")
    print(f" Below KDE crossover (0.837): {below_kde:,} ({100*below_kde/len(cosines):.1f}%)")
    print(f" If distribution is NOT normal → subgroups may exist")
    print(f" If multimodal → some signatures may be genuinely hand-signed")
    print(f"\n Output directory: {OUTPUT_DIR}")


if __name__ == "__main__":
    main()