Add Deloitte distribution & independent dHash analysis scripts

- Script 13: Firm A normality/multimodality analysis (Shapiro-Wilk, Anderson-Darling, KDE, per-accountant ANOVA, Beta/Gamma fitting)
- Script 14: Independent min-dHash computation across all pairs per accountant (not just cosine-nearest pair)
- THRESHOLD_VALIDATION_OPTIONS: 2026-01 discussion doc on threshold validation approaches
- .gitignore: exclude model weights, node artifacts, and xlsx data

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-20 21:34:24 +08:00
parent 939a348da4
commit a261a22bd2
4 changed files with 1270 additions and 0 deletions
@@ -0,0 +1,430 @@
#!/usr/bin/env python3
"""
Deloitte (勤業眾信) Signature Similarity Distribution Analysis
==============================================================
Evaluate whether Firm A's max_similarity values follow a normal distribution
or contain subgroups (e.g., genuinely hand-signed vs digitally stamped).
Tests:
1. Descriptive statistics & percentiles
2. Normality tests (Shapiro-Wilk, D'Agostino-Pearson, Anderson-Darling, KS)
3. Histogram + KDE + fitted normal overlay
4. Q-Q plot
5. Multimodality check (Hartigan's dip test approximation)
6. Outlier identification (signatures with unusually low similarity)
7. dHash distance distribution for Firm A
Output: figures + report to console
"""
import sqlite3
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from scipy import stats
from pathlib import Path
from collections import Counter
# Absolute path to the SQLite database written by the signature-analysis pipeline.
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
# All figures produced by this script are saved here; created eagerly at import time.
OUTPUT_DIR = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/deloitte_distribution')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Firm name prefix for Deloitte (勤業眾信) as stored in accountants.firm.
FIRM_A = '勤業眾信聯合'
def load_firm_a_data():
    """Fetch every Firm A signature row that has a cosine-similarity score.

    Returns:
        list[dict]: one dict per signature with keys ``sig_id``, ``filename``,
        ``accountant``, ``cosine`` (max similarity to same accountant) and
        ``phash`` (dHash distance to closest match; may be None).
    """
    conn = sqlite3.connect(DB_PATH)
    rows = conn.execute('''
        SELECT s.signature_id, s.image_filename, s.assigned_accountant,
               s.max_similarity_to_same_accountant,
               s.phash_distance_to_closest
        FROM signatures s
        LEFT JOIN accountants a ON s.assigned_accountant = a.name
        WHERE a.firm = ?
          AND s.max_similarity_to_same_accountant IS NOT NULL
    ''', (FIRM_A,)).fetchall()
    conn.close()
    # Zip each tuple against the fixed column aliases used downstream.
    keys = ('sig_id', 'filename', 'accountant', 'cosine', 'phash')
    return [dict(zip(keys, row)) for row in rows]
def descriptive_stats(cosines, label="Firm A Cosine Similarity"):
    """Print a stdout summary of *cosines*: moments, spread and percentiles.

    Args:
        cosines: array-like of similarity values.
        label: heading printed above the table.
    """
    rule = '=' * 65
    p25, p75 = np.percentile(cosines, 25), np.percentile(cosines, 75)
    # (name, pre-formatted value) pairs, printed in order below.
    summary = (
        ('N', f"{len(cosines):,}"),
        ('Mean', f"{np.mean(cosines):.6f}"),
        ('Median', f"{np.median(cosines):.6f}"),
        ('Std Dev', f"{np.std(cosines):.6f}"),
        ('Variance', f"{np.var(cosines):.8f}"),
        ('Min', f"{np.min(cosines):.6f}"),
        ('Max', f"{np.max(cosines):.6f}"),
        ('Range', f"{np.ptp(cosines):.6f}"),
        ('Skewness', f"{stats.skew(cosines):.4f}"),
        ('Kurtosis', f"{stats.kurtosis(cosines):.4f} (excess)"),
        ('IQR', f"{p75 - p25:.6f}"),
    )
    print(f"\n{rule}")
    print(f" {label}")
    print(f"{rule}")
    for name, value in summary:
        print(f" {name} = {value}")
    print()
    print(f" Percentiles:")
    for pct in (1, 5, 10, 25, 50, 75, 90, 95, 99):
        print(f" P{pct:<3d} = {np.percentile(cosines, pct):.6f}")
def normality_tests(cosines):
    """Run four normality tests on *cosines* and print each verdict.

    Tests: Shapiro-Wilk (on a reproducible 5000-point subsample when the
    input is larger), D'Agostino-Pearson (needs n >= 20), Anderson-Darling,
    and Kolmogorov-Smirnov against a moment-matched normal.

    Returns:
        tuple[float, float]: (mu, sigma) of the fitted normal — mean and
        population std dev (ddof=0) — reused by the plotting code.
    """
    print(f"\n{'='*65}")
    print(f" NORMALITY TESTS")
    print(f"{'='*65}")
    # Shapiro-Wilk is unreliable/slow beyond ~5000 samples; subsample with a
    # FIXED seed so repeated runs print identical results (the original used
    # an unseeded np.random.choice, making the report irreproducible).
    if len(cosines) > 5000:
        rng = np.random.default_rng(0)
        sample = rng.choice(cosines, 5000, replace=False)
        stat, p = stats.shapiro(sample)
        print(f"\n Shapiro-Wilk (n=5000 subsample):")
    else:
        stat, p = stats.shapiro(cosines)
        print(f"\n Shapiro-Wilk (n={len(cosines)}):")
    print(f" W = {stat:.6f}, p = {p:.2e}")
    print(f"{'Normal' if p > 0.05 else 'NOT normal'} at α=0.05")
    # D'Agostino-Pearson combines skew and kurtosis tests; scipy warns for
    # small samples, so only run it with at least 20 observations.
    if len(cosines) >= 20:
        stat, p = stats.normaltest(cosines)
        print(f"\n D'Agostino-Pearson:")
        print(f" K² = {stat:.4f}, p = {p:.2e}")
        print(f"{'Normal' if p > 0.05 else 'NOT normal'} at α=0.05")
    # Anderson-Darling: compare the statistic against each critical value.
    result = stats.anderson(cosines, dist='norm')
    print(f"\n Anderson-Darling:")
    print(f" A² = {result.statistic:.4f}")
    for sl, cv in zip(result.significance_level, result.critical_values):
        reject = "REJECT" if result.statistic > cv else "accept"
        # BUGFIX: original had no separator between the critical value and
        # the verdict, printing e.g. "critical=0.7870REJECT".
        print(f" {sl}%: critical={cv:.4f} {reject}")
    # KS vs a normal whose parameters are estimated from the SAME data —
    # note this makes the p-value optimistic (Lilliefors caveat).
    mu, sigma = np.mean(cosines), np.std(cosines)
    stat, p = stats.kstest(cosines, 'norm', args=(mu, sigma))
    print(f"\n Kolmogorov-Smirnov (vs fitted normal):")
    print(f" D = {stat:.6f}, p = {p:.2e}")
    print(f"{'Normal' if p > 0.05 else 'NOT normal'} at α=0.05")
    return mu, sigma
def test_alternative_distributions(cosines):
"""Fit alternative distributions and compare."""
print(f"\n{'='*65}")
print(f" DISTRIBUTION FITTING (AIC comparison)")
print(f"{'='*65}")
distributions = {
'norm': stats.norm,
'skewnorm': stats.skewnorm,
'beta': stats.beta,
'lognorm': stats.lognorm,
'gamma': stats.gamma,
}
results = []
for name, dist in distributions.items():
try:
params = dist.fit(cosines)
log_likelihood = np.sum(dist.logpdf(cosines, *params))
k = len(params)
aic = 2 * k - 2 * log_likelihood
bic = k * np.log(len(cosines)) - 2 * log_likelihood
results.append((name, aic, bic, params, log_likelihood))
except Exception as e:
print(f" {name}: fit failed ({e})")
results.sort(key=lambda x: x[1]) # sort by AIC
print(f"\n {'Distribution':<15} {'AIC':>12} {'BIC':>12} {'LogLik':>12}")
print(f" {'-'*51}")
for name, aic, bic, params, ll in results:
marker = " ←best" if name == results[0][0] else ""
print(f" {name:<15} {aic:>12.1f} {bic:>12.1f} {ll:>12.1f}{marker}")
return results
def per_accountant_analysis(data):
    """Print per-accountant similarity stats within Firm A and return them.

    Groups rows by ``accountant``, prints one summary line per accountant
    (sorted by ascending mean), then runs a one-way ANOVA and Levene's test
    across accountants with at least 5 signatures.

    Returns:
        list[dict]: per-accountant stats ({accountant, n, mean, std, min,
        values}) in ascending-mean order.
    """
    print(f"\n{'='*65}")
    print(f" PER-ACCOUNTANT ANALYSIS (within Firm A)")
    print(f"{'='*65}")
    grouped = {}
    for row in data:
        grouped.setdefault(row['accountant'], []).append(row['cosine'])
    print(f"\n {'Accountant':<20} {'N':>6} {'Mean':>8} {'Std':>8} {'Min':>8} {'P5':>8} {'P50':>8}")
    print(f" {'-'*66}")
    acct_stats = []
    for name, scores in sorted(grouped.items(), key=lambda kv: np.mean(kv[1])):
        arr = np.asarray(scores)
        print(f" {name:<20} {len(arr):>6} {arr.mean():>8.4f} {arr.std():>8.4f} "
              f"{arr.min():>8.4f} {np.percentile(arr, 5):>8.4f} {np.median(arr):>8.4f}")
        acct_stats.append({
            'accountant': name,
            'n': len(arr),
            'mean': float(arr.mean()),
            'std': float(arr.std()),
            'min': float(arr.min()),
            'values': arr,
        })
    # Homogeneity checks only make sense with >= 2 sufficiently large groups.
    big_groups = [np.asarray(scores) for scores in grouped.values() if len(scores) >= 5]
    if len(grouped) >= 2 and len(big_groups) >= 2:
        f_stat, p_val = stats.f_oneway(*big_groups)
        print(f"\n One-way ANOVA across accountants:")
        print(f" F = {f_stat:.4f}, p = {p_val:.2e}")
        print(f"{'Homogeneous' if p_val > 0.05 else 'Significantly different means'} at α=0.05")
        # Levene is robust to non-normality when checking equal variances.
        lev_stat, lev_p = stats.levene(*big_groups)
        print(f"\n Levene's test (variance homogeneity):")
        print(f" W = {lev_stat:.4f}, p = {lev_p:.2e}")
        print(f"{'Homogeneous variance' if lev_p > 0.05 else 'Heterogeneous variance'} at α=0.05")
    return acct_stats
def identify_outliers(data, cosines):
    """Print Firm A signatures with unusually low similarity (Tukey fences).

    Uses Q1 - 1.5*IQR as the mild fence and Q1 - 3.0*IQR as the extreme
    fence, lists the 20 lowest outliers, then tallies counts below several
    fixed decision thresholds.
    """
    print(f"\n{'='*65}")
    print(f" OUTLIER ANALYSIS (low-similarity Firm A signatures)")
    print(f"{'='*65}")
    q1, q3 = np.percentile(cosines, 25), np.percentile(cosines, 75)
    iqr = q3 - q1
    mild_fence = q1 - 1.5 * iqr
    extreme_fence = q1 - 3.0 * iqr
    print(f" IQR method: Q1={q1:.4f}, Q3={q3:.4f}, IQR={iqr:.4f}")
    print(f" Lower fence (mild): {mild_fence:.4f}")
    print(f" Lower fence (extreme): {extreme_fence:.4f}")
    mild = [row for row in data if row['cosine'] < mild_fence]
    extreme = [row for row in data if row['cosine'] < extreme_fence]
    print(f"\n Mild outliers (< {mild_fence:.4f}): {len(mild)}")
    print(f" Extreme outliers (< {extreme_fence:.4f}): {len(extreme)}")
    if mild:
        print(f"\n Bottom 20 by cosine similarity:")
        for row in sorted(mild, key=lambda r: r['cosine'])[:20]:
            phash_str = f"pHash={row['phash']}" if row['phash'] is not None else "pHash=N/A"
            print(f" cosine={row['cosine']:.4f} {phash_str} {row['accountant']} {row['filename']}")
    # Count how many signatures fall below each operating threshold.
    print(f"\n Signatures below key thresholds:")
    for thresh in (0.95, 0.90, 0.85, 0.837, 0.80):
        n_below = int(np.count_nonzero(np.asarray(cosines) < thresh))
        print(f" < {thresh:.3f}: {n_below:,} ({100*n_below/len(cosines):.2f}%)")
def plot_histogram_kde(cosines, mu, sigma):
    """Plot histogram + KDE + fitted-normal overlay (left) and Q-Q plot (right).

    Args:
        cosines: np.ndarray of max cosine similarities.
        mu, sigma: fitted-normal parameters from normality_tests().

    Saves ``firm_a_cosine_distribution.png`` into OUTPUT_DIR.
    """
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    # Left: Full histogram
    ax = axes[0]
    ax.hist(cosines, bins=80, density=True, alpha=0.6, color='steelblue',
            edgecolor='white', linewidth=0.5, label='Observed')
    # Fitted normal
    x = np.linspace(cosines.min() - 0.02, cosines.max() + 0.02, 300)
    ax.plot(x, stats.norm.pdf(x, mu, sigma), 'r-', lw=2,
            label=f'Normal fit (μ={mu:.4f}, σ={sigma:.4f})')
    # KDE
    kde = stats.gaussian_kde(cosines)
    ax.plot(x, kde(x), 'g--', lw=2, label='KDE')
    # BUGFIX: draw the threshold lines BEFORE calling legend(); the original
    # added them after ax.legend(), so their labels never appeared.
    ax.axvline(0.95, color='orange', ls=':', alpha=0.7, label='θ=0.95')
    ax.axvline(0.837, color='purple', ls=':', alpha=0.7, label='KDE crossover')
    ax.set_xlabel('Max Cosine Similarity')
    ax.set_ylabel('Density')
    ax.set_title(f'Firm A (勤業眾信) Cosine Similarity Distribution (N={len(cosines):,})')
    ax.legend(fontsize=9)
    # Right: Q-Q plot
    ax2 = axes[1]
    stats.probplot(cosines, dist='norm', plot=ax2)
    ax2.set_title('Q-Q Plot (vs Normal)')
    ax2.get_lines()[0].set_markersize(2)
    plt.tight_layout()
    fig.savefig(OUTPUT_DIR / 'firm_a_cosine_distribution.png', dpi=150)
    print(f"\n Saved: {OUTPUT_DIR / 'firm_a_cosine_distribution.png'}")
    # Close this specific figure (plt.close() only closed the current one).
    plt.close(fig)
def plot_per_accountant(acct_stats):
    """Horizontal box plot of per-accountant similarity, sorted by mean.

    NOTE: sorts the caller's ``acct_stats`` list in place (ascending mean).
    Saves ``firm_a_per_accountant_boxplot.png`` into OUTPUT_DIR.
    """
    acct_stats.sort(key=lambda entry: entry['mean'])
    n_accts = len(acct_stats)
    # Grow the figure with the accountant count so labels stay readable.
    fig, ax = plt.subplots(figsize=(12, max(5, n_accts * 0.4)))
    positions = range(n_accts)
    labels = [f"{entry['accountant']} (n={entry['n']})" for entry in acct_stats]
    bp = ax.boxplot([entry['values'] for entry in acct_stats],
                    positions=positions, vert=False, widths=0.6,
                    patch_artist=True, showfliers=True,
                    flierprops=dict(marker='.', markersize=3, alpha=0.5))
    for box in bp['boxes']:
        box.set_facecolor('lightsteelblue')
    ax.set_yticks(positions)
    ax.set_yticklabels(labels, fontsize=8)
    ax.set_xlabel('Max Cosine Similarity')
    ax.set_title('Per-Accountant Similarity Distribution (Firm A)')
    for thresh, color in ((0.95, 'orange'), (0.837, 'purple')):
        ax.axvline(thresh, color=color, ls=':', alpha=0.7)
    plt.tight_layout()
    fig.savefig(OUTPUT_DIR / 'firm_a_per_accountant_boxplot.png', dpi=150)
    print(f" Saved: {OUTPUT_DIR / 'firm_a_per_accountant_boxplot.png'}")
    plt.close()
def plot_phash_distribution(data):
    """Histogram of Firm A dHash distances with decision thresholds marked.

    Rows whose ``phash`` is None are skipped; prints a notice and returns
    early if no dHash values exist. Saves ``firm_a_dhash_distribution.png``.
    """
    distances = np.array([row['phash'] for row in data if row['phash'] is not None])
    if distances.size == 0:
        print(" No pHash data available.")
        return
    fig, ax = plt.subplots(figsize=(10, 5))
    # Unit-width bins centered on integer distances; upper edge capped at 65.
    upper = min(int(distances.max()) + 2, 65)
    edges = np.arange(-0.5, upper + 0.5, 1)
    ax.hist(distances, bins=edges, alpha=0.7, color='coral', edgecolor='white')
    ax.set_xlabel('dHash Distance')
    ax.set_ylabel('Count')
    ax.set_title(f'Firm A dHash Distance Distribution (N={distances.size:,})')
    ax.axvline(5, color='green', ls='--', label='θ=5 (high conf.)')
    ax.axvline(15, color='orange', ls='--', label='θ=15 (moderate)')
    ax.legend()
    plt.tight_layout()
    fig.savefig(OUTPUT_DIR / 'firm_a_dhash_distribution.png', dpi=150)
    print(f" Saved: {OUTPUT_DIR / 'firm_a_dhash_distribution.png'}")
    plt.close()
def multimodality_test(cosines):
    """Detect KDE modes in *cosines*, then re-check under several bandwidths.

    A Silverman-bandwidth Gaussian KDE is evaluated on a 1000-point grid;
    local maxima (scipy find_peaks, prominence 0.01) are reported as modes.
    Multiple modes suggest subgroups within Firm A. A sensitivity sweep
    repeats the count at 0.5x-2.0x the Silverman bandwidth.
    """
    from scipy.signal import find_peaks
    print(f"\n{'='*65}")
    print(f" MULTIMODALITY ANALYSIS")
    print(f"{'='*65}")
    kde = stats.gaussian_kde(cosines, bw_method='silverman')
    grid = np.linspace(cosines.min(), cosines.max(), 1000)
    density = kde(grid)
    peaks, _ = find_peaks(density, prominence=0.01)
    mode_x = grid[peaks]
    mode_y = density[peaks]
    print(f" KDE bandwidth (Silverman): {kde.factor:.6f}")
    print(f" Number of detected modes: {len(peaks)}")
    for i, (pos, height) in enumerate(zip(mode_x, mode_y)):
        print(f" Mode {i+1}: position={pos:.4f}, density={height:.2f}")
    if len(peaks) == 1:
        print(f"\n → Distribution appears UNIMODAL")
        print(f" Single peak at {mode_x[0]:.4f}")
    elif len(peaks) > 1:
        print(f"\n → Distribution appears MULTIMODAL ({len(peaks)} modes)")
        print(f" This suggests subgroups may exist within Firm A")
        # Report gap and valley depth between each adjacent mode pair.
        for i in range(len(peaks) - 1):
            sep = mode_x[i + 1] - mode_x[i]
            between = density[peaks[i]:peaks[i + 1]]
            valley_depth = mode_y[i:i + 2].min() - between.min()
            print(f" Separation {i+1}-{i+2}: Δ={sep:.4f}, valley depth={valley_depth:.2f}")
    # Mode count can be bandwidth-sensitive; sweep a few multipliers.
    print(f"\n Sensitivity analysis (bandwidth variation):")
    for mult in (0.5, 0.75, 1.0, 1.5, 2.0):
        bw = kde.factor * mult
        alt_density = stats.gaussian_kde(cosines, bw_method=bw)(grid)
        alt_peaks, _ = find_peaks(alt_density, prominence=0.005)
        print(f" bw={bw:.4f} (×{mult:.1f}): {len(alt_peaks)} mode(s)")
def main():
    """Run the full Firm A distribution analysis: stats, tests, figures, summary."""
    print("Loading Firm A (勤業眾信) signature data...")
    data = load_firm_a_data()
    print(f"Total Firm A signatures: {len(data):,}")
    cosines = np.array([row['cosine'] for row in data])
    descriptive_stats(cosines)                  # 1. descriptive statistics
    mu, sigma = normality_tests(cosines)        # 2. normality tests
    test_alternative_distributions(cosines)     # 3. alternative distribution fits
    acct_stats = per_accountant_analysis(data)  # 4. per-accountant view
    identify_outliers(data, cosines)            # 5. outlier analysis
    multimodality_test(cosines)                 # 6. multimodality check
    # 7. figures
    banner = '=' * 65
    print(f"\n{banner}")
    print(f" GENERATING FIGURES")
    print(f"{banner}")
    plot_histogram_kde(cosines, mu, sigma)
    plot_per_accountant(acct_stats)
    plot_phash_distribution(data)
    # Final tally against the two operating thresholds used throughout.
    print(f"\n{banner}")
    print(f" SUMMARY")
    print(f"{banner}")
    below_95 = int(np.count_nonzero(cosines < 0.95))
    below_kde = int(np.count_nonzero(cosines < 0.837))
    print(f" Firm A signatures: {len(cosines):,}")
    print(f" Below 0.95 threshold: {below_95:,} ({100*below_95/len(cosines):.1f}%)")
    print(f" Below KDE crossover (0.837): {below_kde:,} ({100*below_kde/len(cosines):.1f}%)")
    print(f" If distribution is NOT normal → subgroups may exist")
    print(f" If multimodal → some signatures may be genuinely hand-signed")
    print(f"\n Output directory: {OUTPUT_DIR}")


if __name__ == "__main__":
    main()