Add Deloitte distribution & independent dHash analysis scripts
- Script 13: Firm A normality/multimodality analysis (Shapiro-Wilk, Anderson-Darling, KDE, per-accountant ANOVA, Beta/Gamma fitting) - Script 14: Independent min-dHash computation across all pairs per accountant (not just cosine-nearest pair) - THRESHOLD_VALIDATION_OPTIONS: 2026-01 discussion doc on threshold validation approaches - .gitignore: exclude model weights, node artifacts, and xlsx data Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+13
@@ -48,3 +48,16 @@ Thumbs.db
|
|||||||
# Temporary files
|
# Temporary files
|
||||||
*.tmp
|
*.tmp
|
||||||
*.bak
|
*.bak
|
||||||
|
|
||||||
|
# Model weights (too large for git)
|
||||||
|
models/
|
||||||
|
*.pt
|
||||||
|
*.pth
|
||||||
|
|
||||||
|
# Node.js shells (accidentally created)
|
||||||
|
package.json
|
||||||
|
package-lock.json
|
||||||
|
node_modules/
|
||||||
|
|
||||||
|
# Sensitive/large data
|
||||||
|
*.xlsx
|
||||||
|
|||||||
@@ -0,0 +1,430 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Deloitte (勤業眾信) Signature Similarity Distribution Analysis
|
||||||
|
==============================================================
|
||||||
|
Evaluate whether Firm A's max_similarity values follow a normal distribution
|
||||||
|
or contain subgroups (e.g., genuinely hand-signed vs digitally stamped).
|
||||||
|
|
||||||
|
Tests:
|
||||||
|
1. Descriptive statistics & percentiles
|
||||||
|
2. Normality tests (Shapiro-Wilk, D'Agostino-Pearson, Anderson-Darling, KS)
|
||||||
|
3. Histogram + KDE + fitted normal overlay
|
||||||
|
4. Q-Q plot
|
||||||
|
5. Multimodality check (Hartigan's dip test approximation)
|
||||||
|
6. Outlier identification (signatures with unusually low similarity)
|
||||||
|
7. dHash distance distribution for Firm A
|
||||||
|
|
||||||
|
Output: figures + report to console
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sqlite3
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib
|
||||||
|
matplotlib.use('Agg')
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
from scipy import stats
|
||||||
|
from pathlib import Path
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
|
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
|
||||||
|
OUTPUT_DIR = Path('/Volumes/NV2/PDF-Processing/signature-analysis/reports/deloitte_distribution')
|
||||||
|
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
FIRM_A = '勤業眾信聯合'
|
||||||
|
|
||||||
|
|
||||||
|
def load_firm_a_data():
    """Fetch every Firm A signature row that has a cosine similarity score.

    Returns a list of dicts with keys:
        sig_id     -- signature primary key
        filename   -- extracted image filename
        accountant -- assigned accountant name
        cosine     -- max cosine similarity within the same accountant
        phash      -- dHash distance of the cosine-nearest pair (may be None)
    """
    query = '''
        SELECT s.signature_id, s.image_filename, s.assigned_accountant,
               s.max_similarity_to_same_accountant,
               s.phash_distance_to_closest
        FROM signatures s
        LEFT JOIN accountants a ON s.assigned_accountant = a.name
        WHERE a.firm = ?
          AND s.max_similarity_to_same_accountant IS NOT NULL
    '''
    conn = sqlite3.connect(DB_PATH)
    try:
        rows = conn.execute(query, (FIRM_A,)).fetchall()
    finally:
        conn.close()

    keys = ('sig_id', 'filename', 'accountant', 'cosine', 'phash')
    return [dict(zip(keys, row)) for row in rows]
|
||||||
|
|
||||||
|
|
||||||
|
def descriptive_stats(cosines, label="Firm A Cosine Similarity"):
    """Print a full descriptive summary: moments, spread, and percentiles."""
    arr = np.asarray(cosines)
    rule = '=' * 65
    q1, q3 = np.percentile(arr, 25), np.percentile(arr, 75)

    print(f"\n{rule}")
    print(f" {label}")
    print(f"{rule}")
    print(f" N = {len(arr):,}")
    print(f" Mean = {np.mean(arr):.6f}")
    print(f" Median = {np.median(arr):.6f}")
    print(f" Std Dev = {np.std(arr):.6f}")
    print(f" Variance = {np.var(arr):.8f}")
    print(f" Min = {np.min(arr):.6f}")
    print(f" Max = {np.max(arr):.6f}")
    print(f" Range = {np.ptp(arr):.6f}")
    print(f" Skewness = {stats.skew(arr):.4f}")
    print(f" Kurtosis = {stats.kurtosis(arr):.4f} (excess)")
    print(f" IQR = {q3 - q1:.6f}")
    print()
    print(f" Percentiles:")
    for pct in (1, 5, 10, 25, 50, 75, 90, 95, 99):
        print(f" P{pct:<3d} = {np.percentile(arr, pct):.6f}")
|
||||||
|
|
||||||
|
|
||||||
|
def normality_tests(cosines):
    """Run four normality tests on the similarity sample and print verdicts.

    Tests: Shapiro-Wilk (subsampled above 5000 points), D'Agostino-Pearson
    (n >= 20 only), Anderson-Darling, and Kolmogorov-Smirnov against a
    normal fitted to the sample.

    Returns:
        (mu, sigma): sample mean and population std (ddof=0), reused by
        the plotting code for the fitted-normal overlay.
    """
    cosines = np.asarray(cosines)
    print(f"\n{'='*65}")
    print(f" NORMALITY TESTS")
    print(f"{'='*65}")

    # Shapiro-Wilk (max 5000 samples). Seeded RNG makes the subsample —
    # and thus the printed report — reproducible across runs (previously
    # np.random.choice used unseeded global RNG state).
    if len(cosines) > 5000:
        sample = np.random.default_rng(0).choice(cosines, 5000, replace=False)
        stat, p = stats.shapiro(sample)
        print(f"\n Shapiro-Wilk (n=5000 subsample):")
    else:
        stat, p = stats.shapiro(cosines)
        print(f"\n Shapiro-Wilk (n={len(cosines)}):")
    print(f" W = {stat:.6f}, p = {p:.2e}")
    print(f" → {'Normal' if p > 0.05 else 'NOT normal'} at α=0.05")

    # D'Agostino-Pearson — requires n >= 20 for the kurtosis component.
    if len(cosines) >= 20:
        stat, p = stats.normaltest(cosines)
        print(f"\n D'Agostino-Pearson:")
        print(f" K² = {stat:.4f}, p = {p:.2e}")
        print(f" → {'Normal' if p > 0.05 else 'NOT normal'} at α=0.05")

    # Anderson-Darling: compare the statistic to each tabulated critical value.
    result = stats.anderson(cosines, dist='norm')
    print(f"\n Anderson-Darling:")
    print(f" A² = {result.statistic:.4f}")
    for sl, cv in zip(result.significance_level, result.critical_values):
        reject = "REJECT" if result.statistic > cv else "accept"
        print(f" {sl}%: critical={cv:.4f} → {reject}")

    # Kolmogorov-Smirnov against the fitted normal.
    # NOTE: estimating (mu, sigma) from the same sample biases this p-value
    # toward normality; a Lilliefors-corrected test would be needed for rigor.
    mu, sigma = np.mean(cosines), np.std(cosines)
    stat, p = stats.kstest(cosines, 'norm', args=(mu, sigma))
    print(f"\n Kolmogorov-Smirnov (vs fitted normal):")
    print(f" D = {stat:.6f}, p = {p:.2e}")
    print(f" → {'Normal' if p > 0.05 else 'NOT normal'} at α=0.05")

    return mu, sigma
|
||||||
|
|
||||||
|
|
||||||
|
def test_alternative_distributions(cosines):
|
||||||
|
"""Fit alternative distributions and compare."""
|
||||||
|
print(f"\n{'='*65}")
|
||||||
|
print(f" DISTRIBUTION FITTING (AIC comparison)")
|
||||||
|
print(f"{'='*65}")
|
||||||
|
|
||||||
|
distributions = {
|
||||||
|
'norm': stats.norm,
|
||||||
|
'skewnorm': stats.skewnorm,
|
||||||
|
'beta': stats.beta,
|
||||||
|
'lognorm': stats.lognorm,
|
||||||
|
'gamma': stats.gamma,
|
||||||
|
}
|
||||||
|
|
||||||
|
results = []
|
||||||
|
for name, dist in distributions.items():
|
||||||
|
try:
|
||||||
|
params = dist.fit(cosines)
|
||||||
|
log_likelihood = np.sum(dist.logpdf(cosines, *params))
|
||||||
|
k = len(params)
|
||||||
|
aic = 2 * k - 2 * log_likelihood
|
||||||
|
bic = k * np.log(len(cosines)) - 2 * log_likelihood
|
||||||
|
results.append((name, aic, bic, params, log_likelihood))
|
||||||
|
except Exception as e:
|
||||||
|
print(f" {name}: fit failed ({e})")
|
||||||
|
|
||||||
|
results.sort(key=lambda x: x[1]) # sort by AIC
|
||||||
|
print(f"\n {'Distribution':<15} {'AIC':>12} {'BIC':>12} {'LogLik':>12}")
|
||||||
|
print(f" {'-'*51}")
|
||||||
|
for name, aic, bic, params, ll in results:
|
||||||
|
marker = " ←best" if name == results[0][0] else ""
|
||||||
|
print(f" {name:<15} {aic:>12.1f} {bic:>12.1f} {ll:>12.1f}{marker}")
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
def per_accountant_analysis(data):
    """Summarize each accountant's cosine distribution within Firm A.

    Prints a per-accountant table sorted by ascending mean similarity,
    then (when at least two groups have >= 5 samples) a one-way ANOVA and
    Levene's test across accountants.

    Returns:
        List of per-accountant stat dicts (accountant, n, mean, std, min,
        values) in the same sorted order as the printed table.
    """
    rule = '=' * 65
    print(f"\n{rule}")
    print(f" PER-ACCOUNTANT ANALYSIS (within Firm A)")
    print(f"{rule}")

    by_acct = {}
    for rec in data:
        by_acct.setdefault(rec['accountant'], []).append(rec['cosine'])

    print(f"\n {'Accountant':<20} {'N':>6} {'Mean':>8} {'Std':>8} {'Min':>8} {'P5':>8} {'P50':>8}")
    print(f" {'-'*66}")

    acct_stats = []
    ordered = sorted(by_acct.items(), key=lambda item: np.mean(item[1]))
    for acct, vals in ordered:
        v = np.array(vals)
        print(f" {acct:<20} {len(v):>6} {v.mean():>8.4f} {v.std():>8.4f} "
              f"{v.min():>8.4f} {np.percentile(v, 5):>8.4f} {np.median(v):>8.4f}")
        acct_stats.append({
            'accountant': acct,
            'n': len(v),
            'mean': float(v.mean()),
            'std': float(v.std()),
            'min': float(v.min()),
            'values': v,
        })

    # Homogeneity tests only make sense with >= 2 sufficiently large groups.
    if len(by_acct) >= 2:
        groups = [np.array(v) for v in by_acct.values() if len(v) >= 5]
        if len(groups) >= 2:
            f_stat, p_val = stats.f_oneway(*groups)
            print(f"\n One-way ANOVA across accountants:")
            print(f" F = {f_stat:.4f}, p = {p_val:.2e}")
            print(f" → {'Homogeneous' if p_val > 0.05 else 'Significantly different means'} at α=0.05")

            lev_stat, lev_p = stats.levene(*groups)
            print(f"\n Levene's test (variance homogeneity):")
            print(f" W = {lev_stat:.4f}, p = {lev_p:.2e}")
            print(f" → {'Homogeneous variance' if lev_p > 0.05 else 'Heterogeneous variance'} at α=0.05")

    return acct_stats
|
||||||
|
|
||||||
|
|
||||||
|
def identify_outliers(data, cosines):
    """Report Firm A signatures with unusually low similarity (Tukey IQR rule)."""
    rule = '=' * 65
    print(f"\n{rule}")
    print(f" OUTLIER ANALYSIS (low-similarity Firm A signatures)")
    print(f"{rule}")

    q1, q3 = np.percentile(cosines, 25), np.percentile(cosines, 75)
    iqr = q3 - q1
    lower_fence = q1 - 1.5 * iqr    # Tukey "mild" outlier fence
    lower_extreme = q1 - 3.0 * iqr  # Tukey "extreme" outlier fence

    print(f" IQR method: Q1={q1:.4f}, Q3={q3:.4f}, IQR={iqr:.4f}")
    print(f" Lower fence (mild): {lower_fence:.4f}")
    print(f" Lower fence (extreme): {lower_extreme:.4f}")

    outliers = [rec for rec in data if rec['cosine'] < lower_fence]
    extreme_outliers = [rec for rec in data if rec['cosine'] < lower_extreme]

    print(f"\n Mild outliers (< {lower_fence:.4f}): {len(outliers)}")
    print(f" Extreme outliers (< {lower_extreme:.4f}): {len(extreme_outliers)}")

    if outliers:
        print(f"\n Bottom 20 by cosine similarity:")
        for rec in sorted(outliers, key=lambda r: r['cosine'])[:20]:
            phash_str = f"pHash={rec['phash']}" if rec['phash'] is not None else "pHash=N/A"
            print(f" cosine={rec['cosine']:.4f} {phash_str} {rec['accountant']} {rec['filename']}")

    # Also show count below various thresholds
    print(f"\n Signatures below key thresholds:")
    for thresh in (0.95, 0.90, 0.85, 0.837, 0.80):
        n_below = sum(1 for c in cosines if c < thresh)
        print(f" < {thresh:.3f}: {n_below:,} ({100*n_below/len(cosines):.2f}%)")
|
||||||
|
|
||||||
|
|
||||||
|
def plot_histogram_kde(cosines, mu, sigma):
    """Plot histogram with KDE and fitted-normal overlay, plus a Q-Q plot.

    Saves firm_a_cosine_distribution.png to OUTPUT_DIR.
    """
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Left: Full histogram
    ax = axes[0]
    ax.hist(cosines, bins=80, density=True, alpha=0.6, color='steelblue',
            edgecolor='white', linewidth=0.5, label='Observed')

    # Fitted normal
    x = np.linspace(cosines.min() - 0.02, cosines.max() + 0.02, 300)
    ax.plot(x, stats.norm.pdf(x, mu, sigma), 'r-', lw=2,
            label=f'Normal fit (μ={mu:.4f}, σ={sigma:.4f})')

    # KDE
    kde = stats.gaussian_kde(cosines)
    ax.plot(x, kde(x), 'g--', lw=2, label='KDE')

    # Threshold reference lines — drawn BEFORE legend() so their labels
    # render (previously legend() preceded these axvlines, so the
    # 'θ=0.95' and 'KDE crossover' entries never appeared in the legend).
    ax.axvline(0.95, color='orange', ls=':', alpha=0.7, label='θ=0.95')
    ax.axvline(0.837, color='purple', ls=':', alpha=0.7, label='KDE crossover')

    ax.set_xlabel('Max Cosine Similarity')
    ax.set_ylabel('Density')
    ax.set_title(f'Firm A (勤業眾信) Cosine Similarity Distribution (N={len(cosines):,})')
    ax.legend(fontsize=9)

    # Right: Q-Q plot
    ax2 = axes[1]
    stats.probplot(cosines, dist='norm', plot=ax2)
    ax2.set_title('Q-Q Plot (vs Normal)')
    ax2.get_lines()[0].set_markersize(2)

    plt.tight_layout()
    fig.savefig(OUTPUT_DIR / 'firm_a_cosine_distribution.png', dpi=150)
    print(f"\n Saved: {OUTPUT_DIR / 'firm_a_cosine_distribution.png'}")
    plt.close()
|
||||||
|
|
||||||
|
|
||||||
|
def plot_per_accountant(acct_stats):
    """Horizontal box plot of per-accountant similarity distributions.

    Note: sorts *acct_stats* in place by ascending mean (side effect kept
    from the original implementation). Saves
    firm_a_per_accountant_boxplot.png to OUTPUT_DIR.
    """
    acct_stats.sort(key=lambda a: a['mean'])  # in-place, ascending mean

    n_accts = len(acct_stats)
    fig, ax = plt.subplots(figsize=(12, max(5, n_accts * 0.4)))
    positions = range(n_accts)
    tick_labels = [f"{a['accountant']} (n={a['n']})" for a in acct_stats]
    samples = [a['values'] for a in acct_stats]

    bp = ax.boxplot(samples, positions=positions, vert=False, widths=0.6,
                    patch_artist=True, showfliers=True,
                    flierprops=dict(marker='.', markersize=3, alpha=0.5))
    for box in bp['boxes']:
        box.set_facecolor('lightsteelblue')

    ax.set_yticks(positions)
    ax.set_yticklabels(tick_labels, fontsize=8)
    ax.set_xlabel('Max Cosine Similarity')
    ax.set_title('Per-Accountant Similarity Distribution (Firm A)')
    # Same threshold reference lines as the histogram figure.
    ax.axvline(0.95, color='orange', ls=':', alpha=0.7)
    ax.axvline(0.837, color='purple', ls=':', alpha=0.7)

    plt.tight_layout()
    fig.savefig(OUTPUT_DIR / 'firm_a_per_accountant_boxplot.png', dpi=150)
    print(f" Saved: {OUTPUT_DIR / 'firm_a_per_accountant_boxplot.png'}")
    plt.close()
|
||||||
|
|
||||||
|
|
||||||
|
def plot_phash_distribution(data):
    """Histogram of Firm A dHash distances with threshold markers.

    No-op (with a console note) when no signature carries a pHash value.
    Saves firm_a_dhash_distribution.png to OUTPUT_DIR.
    """
    phash_vals = [rec['phash'] for rec in data if rec['phash'] is not None]
    if not phash_vals:
        print(" No pHash data available.")
        return

    phash_arr = np.array(phash_vals)

    fig, ax = plt.subplots(figsize=(10, 5))
    # One unit-wide bin per integer distance, capped at the 64-bit maximum.
    max_val = min(int(phash_arr.max()) + 2, 65)
    bin_edges = np.arange(-0.5, max_val + 0.5, 1)
    ax.hist(phash_arr, bins=bin_edges, alpha=0.7, color='coral', edgecolor='white')
    ax.set_xlabel('dHash Distance')
    ax.set_ylabel('Count')
    ax.set_title(f'Firm A dHash Distance Distribution (N={len(phash_vals):,})')
    ax.axvline(5, color='green', ls='--', label='θ=5 (high conf.)')
    ax.axvline(15, color='orange', ls='--', label='θ=15 (moderate)')
    ax.legend()

    plt.tight_layout()
    fig.savefig(OUTPUT_DIR / 'firm_a_dhash_distribution.png', dpi=150)
    print(f" Saved: {OUTPUT_DIR / 'firm_a_dhash_distribution.png'}")
    plt.close()
|
||||||
|
|
||||||
|
|
||||||
|
def multimodality_test(cosines):
    """Look for multiple modes in the similarity KDE.

    Detects local maxima of a Silverman-bandwidth KDE, reports mode
    positions and separations, then repeats peak detection across a range
    of bandwidth multiples as a sensitivity check.
    """
    rule = '=' * 65
    print(f"\n{rule}")
    print(f" MULTIMODALITY ANALYSIS")
    print(f"{rule}")

    kde = stats.gaussian_kde(cosines, bw_method='silverman')
    grid = np.linspace(cosines.min(), cosines.max(), 1000)
    density = kde(grid)

    # Local maxima of the estimated density = candidate modes.
    from scipy.signal import find_peaks
    peaks, _ = find_peaks(density, prominence=0.01)
    peak_positions = grid[peaks]
    peak_heights = density[peaks]

    print(f" KDE bandwidth (Silverman): {kde.factor:.6f}")
    print(f" Number of detected modes: {len(peaks)}")
    for i, (pos, h) in enumerate(zip(peak_positions, peak_heights)):
        print(f" Mode {i+1}: position={pos:.4f}, density={h:.2f}")

    if len(peaks) == 1:
        print(f"\n → Distribution appears UNIMODAL")
        print(f" Single peak at {peak_positions[0]:.4f}")
    elif len(peaks) > 1:
        print(f"\n → Distribution appears MULTIMODAL ({len(peaks)} modes)")
        print(f" This suggests subgroups may exist within Firm A")
        # Quantify how well-separated each adjacent pair of modes is.
        for i in range(len(peaks) - 1):
            sep = peak_positions[i + 1] - peak_positions[i]
            valley_region = density[peaks[i]:peaks[i + 1]]
            valley_depth = peak_heights[i:i + 2].min() - valley_region.min()
            print(f" Separation {i+1}-{i+2}: Δ={sep:.4f}, valley depth={valley_depth:.2f}")

    # The detected mode count depends on bandwidth; rescan at several multiples.
    print(f"\n Sensitivity analysis (bandwidth variation):")
    for bw_factor in (0.5, 0.75, 1.0, 1.5, 2.0):
        bw = kde.factor * bw_factor
        density_alt = stats.gaussian_kde(cosines, bw_method=bw)(grid)
        peaks_alt, _ = find_peaks(density_alt, prominence=0.005)
        print(f" bw={bw:.4f} (×{bw_factor:.1f}): {len(peaks_alt)} mode(s)")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    # Orchestrates the full Firm A report: load → stats → tests → figures → summary.
    print("Loading Firm A (勤業眾信) signature data...")
    data = load_firm_a_data()
    print(f"Total Firm A signatures: {len(data):,}")

    cosines = np.array([d['cosine'] for d in data])

    # 1. Descriptive statistics
    descriptive_stats(cosines)

    # 2. Normality tests (also yields the fitted-normal mu/sigma used for plotting)
    mu, sigma = normality_tests(cosines)

    # 3. Alternative distribution fitting (AIC/BIC comparison)
    test_alternative_distributions(cosines)

    # 4. Per-accountant analysis (returns stats reused by the box plot)
    acct_stats = per_accountant_analysis(data)

    # 5. Outlier analysis (IQR-based low-similarity report)
    identify_outliers(data, cosines)

    # 6. Multimodality test (KDE peak detection)
    multimodality_test(cosines)

    # 7. Generate plots
    print(f"\n{'='*65}")
    print(f" GENERATING FIGURES")
    print(f"{'='*65}")
    plot_histogram_kde(cosines, mu, sigma)
    plot_per_accountant(acct_stats)
    plot_phash_distribution(data)

    # Summary: headline counts below the two candidate thresholds
    # (0.95 = current copy-paste cut; 0.837 = KDE crossover point).
    print(f"\n{'='*65}")
    print(f" SUMMARY")
    print(f"{'='*65}")
    below_95 = sum(1 for c in cosines if c < 0.95)
    below_kde = sum(1 for c in cosines if c < 0.837)
    print(f" Firm A signatures: {len(cosines):,}")
    print(f" Below 0.95 threshold: {below_95:,} ({100*below_95/len(cosines):.1f}%)")
    print(f" Below KDE crossover (0.837): {below_kde:,} ({100*below_kde/len(cosines):.1f}%)")
    print(f" If distribution is NOT normal → subgroups may exist")
    print(f" If multimodal → some signatures may be genuinely hand-signed")
    print(f"\n Output directory: {OUTPUT_DIR}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -0,0 +1,293 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Compute independent min dHash for all signatures.
|
||||||
|
===================================================
|
||||||
|
Currently phash_distance_to_closest is conditional on cosine-nearest pair.
|
||||||
|
This script computes an INDEPENDENT min dHash: for each signature, find the
|
||||||
|
pair within the same accountant that has the smallest dHash distance,
|
||||||
|
regardless of cosine similarity.
|
||||||
|
|
||||||
|
Three metrics after this script:
|
||||||
|
1. max_similarity_to_same_accountant (max cosine) — primary classifier
|
||||||
|
2. min_dhash_independent (independent min) — independent 2nd classifier
|
||||||
|
3. phash_distance_to_closest (conditional) — diagnostic tool
|
||||||
|
|
||||||
|
Phase 1: Compute dHash vector for each image, store as BLOB in DB
|
||||||
|
Phase 2: All-pairs hamming distance within same accountant, store min
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sqlite3
|
||||||
|
import numpy as np
|
||||||
|
import cv2
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from multiprocessing import Pool, cpu_count
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
|
||||||
|
IMAGE_DIR = '/Volumes/NV2/PDF-Processing/yolo-signatures/images'
|
||||||
|
NUM_WORKERS = max(1, cpu_count() - 2)
|
||||||
|
BATCH_SIZE = 5000
|
||||||
|
HASH_SIZE = 8 # 9x8 -> 8x8 = 64-bit hash
|
||||||
|
|
||||||
|
|
||||||
|
# ── Phase 1: Compute dHash per image ─────────────────────────────────
|
||||||
|
|
||||||
|
def compute_dhash_for_file(args):
    """Worker: load one image and compute its 64-bit dHash.

    *args* is a (sig_id, filename) tuple (packed for Pool.map). Returns
    (sig_id, packed_hash_bytes), or (sig_id, None) when the image is
    missing or unreadable.
    """
    sig_id, filename = args
    try:
        img = cv2.imread(os.path.join(IMAGE_DIR, filename), cv2.IMREAD_GRAYSCALE)
        if img is None:
            return (sig_id, None)
        small = cv2.resize(img, (HASH_SIZE + 1, HASH_SIZE))
        gradient = small[:, 1:] > small[:, :-1]  # 8x8 = 64 horizontal-gradient bits
        return (sig_id, np.packbits(gradient.flatten()).tobytes())
    except Exception:
        return (sig_id, None)
|
||||||
|
|
||||||
|
|
||||||
|
def phase1_compute_hashes():
    """Compute and store dHash for all signatures.

    Idempotent/resumable: only rows with a NULL dhash_vector are
    processed, so a partial run can simply be restarted.
    """
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()

    # Add columns if not exist. ALTER TABLE raises OperationalError when
    # the column is already present, which we treat as "already migrated".
    for col in ['dhash_vector BLOB', 'min_dhash_independent INTEGER',
                'min_dhash_independent_match TEXT']:
        try:
            cur.execute(f'ALTER TABLE signatures ADD COLUMN {col}')
        except sqlite3.OperationalError:
            pass
    conn.commit()

    # Check which signatures already have dhash_vector (resume support)
    cur.execute('''
        SELECT signature_id, image_filename
        FROM signatures
        WHERE feature_vector IS NOT NULL
          AND assigned_accountant IS NOT NULL
          AND dhash_vector IS NULL
    ''')
    todo = cur.fetchall()

    if not todo:
        # Check total with dhash
        cur.execute('SELECT COUNT(*) FROM signatures WHERE dhash_vector IS NOT NULL')
        n_done = cur.fetchone()[0]
        print(f" Phase 1 already complete ({n_done:,} hashes in DB)")
        conn.close()
        return

    print(f" Computing dHash for {len(todo):,} images ({NUM_WORKERS} workers)...")
    t0 = time.time()

    processed = 0
    for batch_start in range(0, len(todo), BATCH_SIZE):
        batch = todo[batch_start:batch_start + BATCH_SIZE]

        # NOTE(review): a new Pool is created per batch — presumably to bound
        # worker memory; a single Pool around the loop would avoid repeated
        # fork overhead. Verify before changing.
        with Pool(NUM_WORKERS) as pool:
            results = pool.map(compute_dhash_for_file, batch)

        # Unreadable/missing images return a None hash and are skipped here.
        updates = [(dhash, sid) for sid, dhash in results if dhash is not None]
        cur.executemany('UPDATE signatures SET dhash_vector = ? WHERE signature_id = ?', updates)
        conn.commit()

        processed += len(batch)
        elapsed = time.time() - t0
        rate = processed / elapsed
        eta = (len(todo) - processed) / rate if rate > 0 else 0
        print(f" {processed:,}/{len(todo):,} ({rate:.0f}/s, ETA {eta:.0f}s)")

    conn.close()
    elapsed = time.time() - t0
    print(f" Phase 1 done: {processed:,} hashes in {elapsed:.1f}s")
|
||||||
|
|
||||||
|
|
||||||
|
# ── Phase 2: All-pairs min dHash within same accountant ──────────────
|
||||||
|
|
||||||
|
def hamming_distance(h1_bytes, h2_bytes):
    """Hamming distance between two packed dHash byte strings.

    Both arguments must be equal-length byte strings; the result is the
    number of differing bits.
    """
    a = np.frombuffer(h1_bytes, dtype=np.uint8)
    b = np.frombuffer(h2_bytes, dtype=np.uint8)
    # Unpack to bits and sum at C speed instead of the previous
    # Python-level bin(byte).count('1') loop over every XOR byte.
    return int(np.unpackbits(a ^ b).sum())
|
||||||
|
|
||||||
|
|
||||||
|
def phase2_compute_min_dhash():
    """For each accountant group, find the min dHash pair per signature.

    Writes min_dhash_independent (smallest all-pairs Hamming distance)
    and min_dhash_independent_match (filename of the matching signature)
    back to the signatures table.
    """
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()

    # Load all signatures with dhash
    cur.execute('''
        SELECT s.signature_id, s.assigned_accountant, s.dhash_vector, s.image_filename
        FROM signatures s
        WHERE s.dhash_vector IS NOT NULL
          AND s.assigned_accountant IS NOT NULL
    ''')
    rows = cur.fetchall()
    print(f" Loaded {len(rows):,} signatures with dHash")

    # Group by accountant
    acct_groups = {}
    for sig_id, acct, dhash, filename in rows:
        acct_groups.setdefault(acct, []).append((sig_id, dhash, filename))

    # Filter out singletons — a lone signature has no same-accountant pair.
    acct_groups = {k: v for k, v in acct_groups.items() if len(v) >= 2}
    total_sigs = sum(len(v) for v in acct_groups.values())
    total_pairs = sum(len(v) * (len(v) - 1) // 2 for v in acct_groups.values())
    print(f" {len(acct_groups)} accountants, {total_sigs:,} signatures, {total_pairs:,} pairs")

    t0 = time.time()
    updates = []
    accts_done = 0

    for acct, sigs in acct_groups.items():
        n = len(sigs)
        sig_ids = [s[0] for s in sigs]
        hashes = [s[1] for s in sigs]
        filenames = [s[2] for s in sigs]

        # Unpack all hashes to bit arrays for vectorized hamming
        bits = np.array([np.unpackbits(np.frombuffer(h, dtype=np.uint8)) for h in hashes],
                        dtype=np.uint8)  # shape: (n, 64)

        # Pairwise hamming via XOR + sum
        # For groups up to ~2000, direct matrix computation is fine
        # NOTE(review): the broadcast XOR tensor is (n, n, 64) bytes ≈ n²·64 B
        # (~256 MB at n=2000); confirm the largest accountant group stays
        # within that bound before raising the limit.
        # hamming_matrix[i,j] = number of differing bits between i and j
        xor_matrix = bits[:, None, :] ^ bits[None, :, :]  # (n, n, 64)
        hamming_matrix = xor_matrix.sum(axis=2)  # (n, n)
        np.fill_diagonal(hamming_matrix, 999)  # exclude self (real max is 64)

        # For each signature, find min
        min_indices = np.argmin(hamming_matrix, axis=1)
        min_distances = hamming_matrix[np.arange(n), min_indices]

        for i in range(n):
            updates.append((
                int(min_distances[i]),
                filenames[min_indices[i]],
                sig_ids[i]
            ))

        accts_done += 1
        if accts_done % 100 == 0:
            elapsed = time.time() - t0
            print(f" {accts_done}/{len(acct_groups)} accountants ({elapsed:.0f}s)")

    # Write to DB in a single batch after all groups are processed
    print(f" Writing {len(updates):,} results to DB...")
    cur.executemany('''
        UPDATE signatures
        SET min_dhash_independent = ?, min_dhash_independent_match = ?
        WHERE signature_id = ?
    ''', updates)
    conn.commit()
    conn.close()

    elapsed = time.time() - t0
    print(f" Phase 2 done: {len(updates):,} signatures in {elapsed:.1f}s")
|
||||||
|
|
||||||
|
|
||||||
|
# ── Phase 3: Summary statistics ──────────────────────────────────────
|
||||||
|
|
||||||
|
def print_summary():
    """Print summary comparing conditional vs independent dHash.

    "Conditional" = phash_distance_to_closest (dHash of the cosine-nearest
    pair); "independent" = min_dhash_independent (all-pairs minimum).
    Reports overall means, percentile tables, per-threshold agreement
    counts, and a Firm A-specific breakdown.
    """
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()

    # Overall stats — restricted to rows where BOTH metrics exist so the
    # comparison is over the same population.
    cur.execute('''
        SELECT
            COUNT(*) as n,
            AVG(phash_distance_to_closest) as cond_mean,
            AVG(min_dhash_independent) as indep_mean
        FROM signatures
        WHERE min_dhash_independent IS NOT NULL
          AND phash_distance_to_closest IS NOT NULL
    ''')
    n, cond_mean, indep_mean = cur.fetchone()

    print(f"\n{'='*65}")
    print(f" COMPARISON: Conditional vs Independent dHash")
    print(f"{'='*65}")
    print(f" N = {n:,}")
    print(f" Conditional dHash (cosine-nearest pair): mean = {cond_mean:.2f}")
    print(f" Independent dHash (all-pairs min): mean = {indep_mean:.2f}")

    # Percentiles
    cur.execute('''
        SELECT phash_distance_to_closest, min_dhash_independent
        FROM signatures
        WHERE min_dhash_independent IS NOT NULL
          AND phash_distance_to_closest IS NOT NULL
    ''')
    rows = cur.fetchall()
    cond = np.array([r[0] for r in rows])
    indep = np.array([r[1] for r in rows])

    print(f"\n {'Percentile':<12} {'Conditional':>12} {'Independent':>12} {'Diff':>8}")
    print(f" {'-'*44}")
    for p in [1, 5, 10, 25, 50, 75, 90, 95, 99]:
        cv = np.percentile(cond, p)
        iv = np.percentile(indep, p)
        print(f" P{p:<10d} {cv:>12.1f} {iv:>12.1f} {iv-cv:>+8.1f}")

    # Agreement analysis: do the two metrics classify the same signatures
    # as "close" at each candidate threshold?
    print(f"\n Agreement analysis (both ≤ threshold):")
    for t in [5, 10, 15, 21]:
        both = np.sum((cond <= t) & (indep <= t))
        cond_only = np.sum((cond <= t) & (indep > t))
        indep_only = np.sum((cond > t) & (indep <= t))
        neither = np.sum((cond > t) & (indep > t))
        agree_pct = (both + neither) / len(cond) * 100
        print(f" θ={t:>2d}: both={both:,}, cond_only={cond_only:,}, "
              f"indep_only={indep_only:,}, neither={neither:,} (agree={agree_pct:.1f}%)")

    # Firm A specific
    cur.execute('''
        SELECT s.phash_distance_to_closest, s.min_dhash_independent
        FROM signatures s
        LEFT JOIN accountants a ON s.assigned_accountant = a.name
        WHERE a.firm = '勤業眾信聯合'
          AND s.min_dhash_independent IS NOT NULL
          AND s.phash_distance_to_closest IS NOT NULL
    ''')
    rows = cur.fetchall()
    if rows:
        cond_a = np.array([r[0] for r in rows])
        indep_a = np.array([r[1] for r in rows])
        print(f"\n Firm A (勤業眾信) — N={len(rows):,}:")
        print(f" {'Percentile':<12} {'Conditional':>12} {'Independent':>12}")
        print(f" {'-'*36}")
        for p in [50, 75, 90, 95, 99]:
            print(f" P{p:<10d} {np.percentile(cond_a, p):>12.1f} {np.percentile(indep_a, p):>12.1f}")

    conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    # Three-phase pipeline: hash computation → all-pairs min → comparison summary.
    t_start = time.time()
    print("=" * 65)
    print(" Independent Min dHash Computation")
    print("=" * 65)

    print(f"\n[Phase 1] Computing dHash vectors...")
    phase1_compute_hashes()

    print(f"\n[Phase 2] Computing all-pairs min dHash per accountant...")
    phase2_compute_min_dhash()

    print(f"\n[Phase 3] Summary...")
    print_summary()

    elapsed = time.time() - t_start
    print(f"\nTotal time: {elapsed:.0f}s ({elapsed/60:.1f} min)")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -0,0 +1,534 @@
|
|||||||
|
# Signature Verification Threshold Validation Options
|
||||||
|
|
||||||
|
**Report Date:** 2026-01-14
|
||||||
|
**Purpose:** Discussion document for research partners on threshold selection methodology
|
||||||
|
**Context:** Validating copy-paste detection thresholds for accountant signature analysis
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Table of Contents
|
||||||
|
|
||||||
|
1. [Current Findings Summary](#1-current-findings-summary)
|
||||||
|
2. [The Core Problem](#2-the-core-problem)
|
||||||
|
3. [Key Metrics Explained](#3-key-metrics-explained)
|
||||||
|
4. [Validation Options](#4-validation-options)
|
||||||
|
5. [Academic References](#5-academic-references)
|
||||||
|
6. [Recommendations](#6-recommendations)
|
||||||
|
7. [Next Steps for Discussion](#7-next-steps-for-discussion)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Current Findings Summary
|
||||||
|
|
||||||
|
Our YOLO-based signature extraction and similarity analysis produced the following results:
|
||||||
|
|
||||||
|
| Metric | Value |
|
||||||
|
|--------|-------|
|
||||||
|
| Total PDFs analyzed | 84,386 |
|
||||||
|
| Total signatures extracted | 168,755 |
|
||||||
|
| High similarity pairs (>0.95) | 659,111 |
|
||||||
|
| Classified as "copy-paste" | 71,656 PDFs (84.9%) |
|
||||||
|
| Classified as "authentic" | 76 PDFs (0.1%) |
|
||||||
|
| Uncertain | 12,651 PDFs (15.0%) |
|
||||||
|
|
||||||
|
**Current threshold used:**
|
||||||
|
- Copy-paste: similarity ≥ 0.95
|
||||||
|
- Authentic: similarity ≤ 0.85
|
||||||
|
- Uncertain: 0.85 < similarity < 0.95
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. The Core Problem
|
||||||
|
|
||||||
|
### 2.1 What is Ground Truth?
|
||||||
|
|
||||||
|
**Ground truth labels** are pre-verified classifications that serve as the "correct answer" for machine learning evaluation. For signature verification:
|
||||||
|
|
||||||
|
| Label | Meaning | How to Obtain |
|
||||||
|
|-------|---------|---------------|
|
||||||
|
| **Genuine** | Physically hand-signed by the accountant | Expert forensic examination |
|
||||||
|
| **Copy-paste/Forged** | Digitally copied from another document | Pixel-level analysis or expert verification |
|
||||||
|
|
||||||
|
### 2.2 Why We Need Ground Truth
|
||||||
|
|
||||||
|
To calculate rigorous metrics like EER (Equal Error Rate), we need labeled data:
|
||||||
|
|
||||||
|
```
|
||||||
|
EER Calculation requires:
|
||||||
|
├── Known genuine signatures → Calculate FRR at each threshold
|
||||||
|
├── Known forged signatures → Calculate FAR at each threshold
|
||||||
|
└── Find threshold where FAR = FRR → This is EER
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2.3 Our Current Limitation
|
||||||
|
|
||||||
|
We do not have pre-labeled ground truth data. Our current classification is based on:
|
||||||
|
- **Domain assumption**: Identical handwritten signatures are physically impossible
|
||||||
|
- **Similarity threshold**: Arbitrarily selected at 0.95
|
||||||
|
|
||||||
|
This approach is reasonable but may be challenged in academic peer review without additional validation.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Key Metrics Explained
|
||||||
|
|
||||||
|
### 3.1 Error Rate Metrics
|
||||||
|
|
||||||
|
| Metric | Full Name | Formula | Interpretation |
|
||||||
|
|--------|-----------|---------|----------------|
|
||||||
|
| **FAR** | False Acceptance Rate | Forgeries Accepted / Total Forgeries | Security risk |
|
||||||
|
| **FRR** | False Rejection Rate | Genuine Rejected / Total Genuine | Usability risk |
|
||||||
|
| **EER** | Equal Error Rate | Point where FAR = FRR | Overall performance |
|
||||||
|
| **AER** | Average Error Rate | (FAR + FRR) / 2 | Combined error |
|
||||||
|
|
||||||
|
### 3.2 Visual Representation of EER
|
||||||
|
|
||||||
|
```
|
||||||
|
100% ┌─────────────────────────────────────┐
|
||||||
|
│ FRR │
|
||||||
|
│ \ │
|
||||||
|
│ \ │
|
||||||
|
Rate │ \ ╳ ← EER point │
|
||||||
|
│ \ / │
|
||||||
|
│ \ / │
|
||||||
|
│ \ / FAR │
|
||||||
|
0% │────────\/──────────────────────────│
|
||||||
|
└─────────────────────────────────────┘
|
||||||
|
Low ←──── Threshold ────→ High
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3.3 Benchmark Performance (from Literature)
|
||||||
|
|
||||||
|
| System | Dataset | EER | Reference |
|
||||||
|
|--------|---------|-----|-----------|
|
||||||
|
| SigNet (Siamese CNN) | GPDS-300 | 3.92% | Dey et al., 2017 |
|
||||||
|
| Consensus-Threshold | GPDS-300 | 1.27% FAR | arXiv:2401.03085 |
|
||||||
|
| Type-2 Neutrosophic | Custom | 98% accuracy | IASC 2024 |
|
||||||
|
| InceptionV3 Transfer | CEDAR | 99.10% accuracy | Springer 2024 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. Validation Options
|
||||||
|
|
||||||
|
### Option 1: Manual Ground Truth Creation (Most Rigorous)
|
||||||
|
|
||||||
|
**Description:**
|
||||||
|
Manually verify a subset of signatures with human expert examination.
|
||||||
|
|
||||||
|
**Methodology:**
|
||||||
|
1. Randomly sample ~100-200 signature pairs from different similarity ranges
|
||||||
|
2. Expert examines original PDF documents for:
|
||||||
|
- Scan artifact variations (genuine scans have unique noise)
|
||||||
|
- Pixel-perfect alignment (copy-paste is exact)
|
||||||
|
- Ink pressure and stroke variations
|
||||||
|
- Document metadata (creation dates, software used)
|
||||||
|
3. Label each pair as "genuine" or "copy-paste"
|
||||||
|
4. Calculate EER, FAR, FRR at various thresholds
|
||||||
|
5. Select optimal threshold based on EER
|
||||||
|
|
||||||
|
**Pros:**
|
||||||
|
- Academically rigorous
|
||||||
|
- Enables standard metric calculation (EER, FAR, FRR)
|
||||||
|
- Defensible in peer review
|
||||||
|
|
||||||
|
**Cons:**
|
||||||
|
- Time-consuming (estimated 20-40 hours for 200 samples)
|
||||||
|
- Requires forensic document expertise
|
||||||
|
- Subjective in edge cases
|
||||||
|
|
||||||
|
**Academic Support:**
|
||||||
|
> "The final verification results can be obtained by the voting method with different thresholds and can be adjusted according to different types of application requirements."
|
||||||
|
> — Hadjadj et al., Applied Sciences, 2020 [[1]](#ref1)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Option 2: Statistical Distribution-Based Threshold (No Labels Needed)
|
||||||
|
|
||||||
|
**Description:**
|
||||||
|
Use the statistical distribution of similarity scores to define outliers.
|
||||||
|
|
||||||
|
**Methodology:**
|
||||||
|
1. Calculate mean (μ) and standard deviation (σ) of all similarity scores
|
||||||
|
2. Define thresholds based on standard deviations:
|
||||||
|
|
||||||
|
| Threshold | Formula | Percentile (one-sided, under normality) | Classification |
|-----------|---------|-----------------------------------------|----------------|
| Very High | > μ + 3σ | above 99.87th | Definite copy-paste |
| High | > μ + 2σ | above 97.7th | Likely copy-paste |
| Normal | μ ± 2σ | 2.3rd – 97.7th | Uncertain |
| Low | < μ - 2σ | below 2.3rd | Likely genuine |

(Note: the familiar "95% / 99.7%" figures describe two-sided coverage of μ ± 2σ / μ ± 3σ; the one-sided exceedance percentiles used for these cut-offs are 97.7th and 99.87th.)
|
||||||
|
|
||||||
|
**Your Data:**
|
||||||
|
```
|
||||||
|
Mean similarity (μ) = 0.7608
|
||||||
|
Std deviation (σ) = 0.0916
|
||||||
|
|
||||||
|
Thresholds:
|
||||||
|
- μ + 2σ = 0.944 (95th percentile)
|
||||||
|
- μ + 3σ = 1.035 (≈99.87th percentile one-sided under normality; capped at 1.0)
|
||||||
|
|
||||||
|
Your current 0.95 threshold ≈ μ + 2.07σ (96th percentile)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Pros:**
|
||||||
|
- No manual labeling required
|
||||||
|
- Statistically defensible
|
||||||
|
- Based on actual data distribution
|
||||||
|
|
||||||
|
**Cons:**
|
||||||
|
- Assumes normal distribution (may not hold)
|
||||||
|
- Does not provide FAR/FRR metrics
|
||||||
|
- Less intuitive for non-statistical audiences
|
||||||
|
|
||||||
|
**Academic Support:**
|
||||||
|
> "Keypoint-based detection methods employ statistical thresholds derived from feature distributions to identify anomalous similarity patterns."
|
||||||
|
> — Copy-Move Forgery Detection Survey, Multimedia Tools & Applications, 2024 [[2]](#ref2)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Option 3: Physical Impossibility Argument (Domain Knowledge)
|
||||||
|
|
||||||
|
**Description:**
|
||||||
|
Use the physical impossibility of identical handwritten signatures as justification.
|
||||||
|
|
||||||
|
**Methodology:**
|
||||||
|
1. Define threshold based on handwriting science:
|
||||||
|
|
||||||
|
| Similarity | Physical Interpretation | Classification |
|
||||||
|
|------------|------------------------|----------------|
|
||||||
|
| = 1.0 | Pixel-identical; physically impossible for handwriting | **Definite copy** |
|
||||||
|
| > 0.98 | Near-identical; extremely improbable naturally | **Very likely copy** |
|
||||||
|
| 0.90 - 0.98 | Highly similar; unusual but possible | **Suspicious** |
|
||||||
|
| 0.80 - 0.90 | Similar; consistent with same signer | **Uncertain** |
|
||||||
|
| < 0.80 | Different; normal variation | **Likely genuine** |
|
||||||
|
|
||||||
|
2. Cite forensic document examination literature on signature variability
|
||||||
|
|
||||||
|
**Pros:**
|
||||||
|
- Intuitive and explainable
|
||||||
|
- Based on established forensic principles
|
||||||
|
- Does not require labeled data
|
||||||
|
|
||||||
|
**Cons:**
|
||||||
|
- Thresholds are somewhat arbitrary
|
||||||
|
- May not account for digital signature pads (lower variation)
|
||||||
|
- Requires supporting citations
|
||||||
|
|
||||||
|
**Academic Support:**
|
||||||
|
> "Signature verification presents several unique difficulties: high intra-class variability (an individual's signature may vary greatly day-to-day), large temporal variation (signature may change completely over time), and high inter-class similarity (forgeries attempt to be indistinguishable)."
|
||||||
|
> — Stanford CS231n Report, 2016 [[3]](#ref3)
|
||||||
|
|
||||||
|
> "A genuine signer's signature is naturally unstable even at short time-intervals, presenting inherent variation that digital copies lack."
|
||||||
|
> — Consensus-Threshold Criterion, arXiv:2401.03085, 2024 [[4]](#ref4)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Option 4: Pixel-Level Copy Detection (Technical Verification)
|
||||||
|
|
||||||
|
**Description:**
|
||||||
|
Detect exact copies through pixel-level analysis, independent of feature similarity.
|
||||||
|
|
||||||
|
**Methodology:**
|
||||||
|
1. For high-similarity pairs (>0.95), perform additional checks:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Check 1: Exact pixel match
|
||||||
|
if np.array_equal(image1, image2):
|
||||||
|
return "DEFINITE_COPY"
|
||||||
|
|
||||||
|
# Check 2: Structural Similarity Index (SSIM)
|
||||||
|
ssim_score = structural_similarity(image1, image2)
|
||||||
|
if ssim_score > 0.999:
|
||||||
|
return "DEFINITE_COPY"
|
||||||
|
|
||||||
|
# Check 3: Histogram correlation
|
||||||
|
hist_corr = cv2.compareHist(hist1, hist2, cv2.HISTCMP_CORREL)
|
||||||
|
if hist_corr > 0.999:
|
||||||
|
return "LIKELY_COPY"
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Use copy-move forgery detection (CMFD) techniques from image forensics
|
||||||
|
|
||||||
|
**Pros:**
|
||||||
|
- Technical proof of copying
|
||||||
|
- Not dependent on threshold selection
|
||||||
|
- Provides definitive evidence for exact copies
|
||||||
|
|
||||||
|
**Cons:**
|
||||||
|
- Only detects exact copies (not scaled/rotated)
|
||||||
|
- Requires additional processing
|
||||||
|
- May miss high-quality forgeries
|
||||||
|
|
||||||
|
**Academic Support:**
|
||||||
|
> "Block-based methods segment an image into overlapping blocks and extract features. The forgery regions are determined by computing the similarity between block features using DCT (Discrete Cosine Transform) or SIFT (Scale-Invariant Feature Transform)."
|
||||||
|
> — Copy-Move Forgery Detection Survey, 2024 [[2]](#ref2)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Option 5: Siamese Network with Learned Threshold (Advanced)
|
||||||
|
|
||||||
|
**Description:**
|
||||||
|
Train a Siamese neural network on signature pairs to learn optimal decision boundaries.
|
||||||
|
|
||||||
|
**Methodology:**
|
||||||
|
1. Collect training data:
|
||||||
|
- Positive pairs: Same accountant, different documents
|
||||||
|
- Negative pairs: Different accountants
|
||||||
|
2. Train Siamese network with contrastive or triplet loss
|
||||||
|
3. Network learns embedding space where:
|
||||||
|
- Same-person signatures cluster together
|
||||||
|
- Different-person signatures separate
|
||||||
|
4. Threshold is learned during training, not manually set
|
||||||
|
|
||||||
|
**Architecture:**
|
||||||
|
```
|
||||||
|
┌──────────────┐ ┌──────────────┐
|
||||||
|
│ Signature 1 │ │ Signature 2 │
|
||||||
|
└──────┬───────┘ └──────┬───────┘
|
||||||
|
│ │
|
||||||
|
▼ ▼
|
||||||
|
┌──────────────┐ ┌──────────────┐
|
||||||
|
│ CNN │ │ CNN │ (Shared weights)
|
||||||
|
│ Encoder │ │ Encoder │
|
||||||
|
└──────┬───────┘ └──────┬───────┘
|
||||||
|
│ │
|
||||||
|
▼ ▼
|
||||||
|
┌──────────────┐ ┌──────────────┐
|
||||||
|
│ Embedding │ │ Embedding │
|
||||||
|
│ Vector │ │ Vector │
|
||||||
|
└──────┬───────┘ └──────┬───────┘
|
||||||
|
│ │
|
||||||
|
└────────┬───────────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌───────────────┐
|
||||||
|
│ Distance │
|
||||||
|
│ Metric │
|
||||||
|
└───────┬───────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌───────────────┐
|
||||||
|
│ Same/Different│
|
||||||
|
└───────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
**Pros:**
|
||||||
|
- Learns optimal threshold from data
|
||||||
|
- State-of-the-art performance
|
||||||
|
- Handles complex variations
|
||||||
|
|
||||||
|
**Cons:**
|
||||||
|
- Requires substantial training data
|
||||||
|
- Computationally expensive
|
||||||
|
- May overfit to specific accountant styles
|
||||||
|
|
||||||
|
**Academic Support:**
|
||||||
|
> "SigNet provided better results than the state-of-the-art results on most of the benchmark signature datasets by learning a feature space where similar observations are placed in proximity."
|
||||||
|
> — SigNet, arXiv:1707.02131, 2017 [[5]](#ref5)
|
||||||
|
|
||||||
|
> "Among various distance measures employed in the t-Siamese similarity network, the Manhattan distance technique emerged as the most effective."
|
||||||
|
> — Triplet Siamese Similarity Networks, Mathematics, 2024 [[6]](#ref6)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Academic References
|
||||||
|
|
||||||
|
<a name="ref1"></a>
|
||||||
|
### [1] Single Known Sample Verification (MDPI 2020)
|
||||||
|
**Title:** An Offline Signature Verification and Forgery Detection Method Based on a Single Known Sample and an Explainable Deep Learning Approach
|
||||||
|
**Authors:** Hadjadj, I. et al.
|
||||||
|
**Journal:** Applied Sciences, 10(11), 3716
|
||||||
|
**Year:** 2020
|
||||||
|
**URL:** https://www.mdpi.com/2076-3417/10/11/3716
|
||||||
|
**Key Findings:**
|
||||||
|
- Accuracy: 94.37% - 99.96%
|
||||||
|
- FRR: 0% - 5.88%
|
||||||
|
- FAR: 0.22% - 5.34%
|
||||||
|
- Voting method with adjustable thresholds
|
||||||
|
|
||||||
|
<a name="ref2"></a>
|
||||||
|
### [2] Copy-Move Forgery Detection Survey (Springer 2024)
|
||||||
|
**Title:** Copy-move forgery detection in digital image forensics: A survey
|
||||||
|
**Journal:** Multimedia Tools and Applications
|
||||||
|
**Year:** 2024
|
||||||
|
**URL:** https://link.springer.com/article/10.1007/s11042-024-18399-2
|
||||||
|
**Key Findings:**
|
||||||
|
- Block-based, keypoint-based, and deep learning methods reviewed
|
||||||
|
- DCT and SIFT for feature extraction
|
||||||
|
- Statistical thresholds for anomaly detection
|
||||||
|
|
||||||
|
<a name="ref3"></a>
|
||||||
|
### [3] Stanford CS231n Signature Verification Report
|
||||||
|
**Title:** Offline Signature Verification with Convolutional Neural Networks
|
||||||
|
**Institution:** Stanford University
|
||||||
|
**Year:** 2016
|
||||||
|
**URL:** https://cs231n.stanford.edu/reports/2016/pdfs/276_Report.pdf
|
||||||
|
**Key Findings:**
|
||||||
|
- High intra-class variability challenge
|
||||||
|
- Low inter-class similarity for skilled forgeries
|
||||||
|
- CNN-based feature extraction
|
||||||
|
|
||||||
|
<a name="ref4"></a>
|
||||||
|
### [4] Consensus-Threshold Criterion (arXiv 2024)
|
||||||
|
**Title:** Consensus-Threshold Criterion for Offline Signature Verification using Convolutional Neural Network Learned Representations
|
||||||
|
**Year:** 2024
|
||||||
|
**URL:** https://arxiv.org/abs/2401.03085
|
||||||
|
**Key Findings:**
|
||||||
|
- Achieved 1.27% FAR (vs 8.73% and 17.31% in prior work)
|
||||||
|
- Consensus-threshold distance-based classifier
|
||||||
|
- Uses SigNet and SigNet-F features
|
||||||
|
|
||||||
|
<a name="ref5"></a>
|
||||||
|
### [5] SigNet: Siamese Network for Signature Verification (arXiv 2017)
|
||||||
|
**Title:** SigNet: Convolutional Siamese Network for Writer Independent Offline Signature Verification
|
||||||
|
**Authors:** Dey, S. et al.
|
||||||
|
**Year:** 2017
|
||||||
|
**URL:** https://arxiv.org/abs/1707.02131
|
||||||
|
**Key Findings:**
|
||||||
|
- Siamese architecture with shared weights
|
||||||
|
- Euclidean distance minimization for genuine pairs
|
||||||
|
- State-of-the-art on GPDS, CEDAR, MCYT datasets
|
||||||
|
|
||||||
|
<a name="ref6"></a>
|
||||||
|
### [6] Triplet Siamese Similarity Networks (MDPI 2024)
|
||||||
|
**Title:** Enhancing Signature Verification Using Triplet Siamese Similarity Networks in Digital Documents
|
||||||
|
**Journal:** Mathematics, 12(17), 2757
|
||||||
|
**Year:** 2024
|
||||||
|
**URL:** https://www.mdpi.com/2227-7390/12/17/2757
|
||||||
|
**Key Findings:**
|
||||||
|
- Manhattan distance outperforms Euclidean and Minkowski
|
||||||
|
- Triplet loss for inter-class/intra-class optimization
|
||||||
|
- Tested on 4NSigComp2012, SigComp2011, BHSig260
|
||||||
|
|
||||||
|
<a name="ref7"></a>
|
||||||
|
### [7] Original Siamese Network Paper (NeurIPS 1993)
|
||||||
|
**Title:** Signature Verification using a "Siamese" Time Delay Neural Network
|
||||||
|
**Authors:** Bromley, J. et al.
|
||||||
|
**Conference:** NeurIPS 1993
|
||||||
|
**URL:** https://papers.neurips.cc/paper/1993/file/288cc0ff022877bd3df94bc9360b9c5d-Paper.pdf
|
||||||
|
**Key Findings:**
|
||||||
|
- Introduced Siamese architecture for signature verification
|
||||||
|
- Cosine similarity = 1.0 for genuine pairs
|
||||||
|
- Foundational work for modern approaches
|
||||||
|
|
||||||
|
<a name="ref8"></a>
|
||||||
|
### [8] Australian Journal of Forensic Sciences (2024)
|
||||||
|
**Title:** Handling high level of uncertainty in forensic signature examination
|
||||||
|
**Journal:** Australian Journal of Forensic Sciences, 57(5)
|
||||||
|
**Year:** 2024
|
||||||
|
**URL:** https://www.tandfonline.com/doi/full/10.1080/00450618.2024.2410044
|
||||||
|
**Key Findings:**
|
||||||
|
- Type-2 Neutrosophic similarity measure
|
||||||
|
- 98% accuracy (vs 95% for Type-1)
|
||||||
|
- Addresses ambiguity in forensic analysis
|
||||||
|
|
||||||
|
<a name="ref9"></a>
|
||||||
|
### [9] Benchmark Datasets
|
||||||
|
**CEDAR Dataset:**
|
||||||
|
- 55 signers × 24 genuine + 24 forged signatures
|
||||||
|
- URL: https://paperswithcode.com/dataset/cedar-signature
|
||||||
|
|
||||||
|
**GPDS-960 Corpus:**
|
||||||
|
- 960 writers × 24 genuine + 30 forgeries
|
||||||
|
- 600 dpi grayscale scans
|
||||||
|
- URL: https://www.researchgate.net/publication/220860371
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. Recommendations
|
||||||
|
|
||||||
|
### For Academic Publication
|
||||||
|
|
||||||
|
| Priority | Option | Effort | Rigor | Recommendation |
|
||||||
|
|----------|--------|--------|-------|----------------|
|
||||||
|
| 1 | **Option 1 + Option 2** | High | Very High | Create small labeled dataset + validate statistical threshold |
|
||||||
|
| 2 | **Option 2 + Option 3** | Low | Medium | Statistical threshold + physical impossibility argument |
|
||||||
|
| 3 | **Option 4** | Medium | High | Add pixel-level verification for definitive cases |
|
||||||
|
|
||||||
|
### Suggested Approach
|
||||||
|
|
||||||
|
1. **Primary method:** Use statistical threshold (Option 2)
|
||||||
|
- Report threshold as μ + 2σ ≈ 0.944 (close to your current 0.95)
|
||||||
|
- Statistically defensible without ground truth
|
||||||
|
|
||||||
|
2. **Supporting evidence:** Physical impossibility argument (Option 3)
|
||||||
|
- Cite forensic literature on signature variability
|
||||||
|
- Emphasize that identical signatures are physically impossible
|
||||||
|
|
||||||
|
3. **Validation (if time permits):** Small labeled subset (Option 1)
|
||||||
|
- Manually verify 100-200 samples
|
||||||
|
- Calculate EER to validate threshold choice
|
||||||
|
|
||||||
|
4. **Technical proof:** Pixel-level analysis (Option 4)
|
||||||
|
- Add SSIM analysis for high-similarity pairs
|
||||||
|
- Report exact copy counts separately
|
||||||
|
|
||||||
|
### Suggested Report Language
|
||||||
|
|
||||||
|
> "We adopt a similarity threshold of 0.95 (approximately μ + 2σ, representing the 96th percentile of our similarity distribution) to classify signatures as potential copy-paste instances. This threshold is supported by: (1) statistical outlier detection principles, (2) the physical impossibility of pixel-identical handwritten signatures, and (3) alignment with forensic document examination literature [cite: Hadjadj 2020, arXiv:2401.03085]."
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. Next Steps for Discussion
|
||||||
|
|
||||||
|
### Questions for Research Partners
|
||||||
|
|
||||||
|
1. **Data availability:** Do we have access to any documents with known authentic signatures for validation?
|
||||||
|
|
||||||
|
2. **Expert resources:** Can we involve a forensic document examiner for ground truth labeling?
|
||||||
|
|
||||||
|
3. **Scope decision:** Should we focus on statistical validation (faster) or pursue full EER analysis (more rigorous)?
|
||||||
|
|
||||||
|
4. **Publication target:** What level of rigor does the target journal require?
|
||||||
|
|
||||||
|
5. **Time constraints:** How much time can we allocate to validation before submission?
|
||||||
|
|
||||||
|
### Proposed Action Items
|
||||||
|
|
||||||
|
| Task | Owner | Deadline | Notes |
|
||||||
|
|------|-------|----------|-------|
|
||||||
|
| Review this document | All partners | TBD | Discuss options |
|
||||||
|
| Select validation approach | Team decision | TBD | Based on resources |
|
||||||
|
| Implement selected approach | TBD | TBD | After decision |
|
||||||
|
| Update threshold if needed | TBD | TBD | Based on validation |
|
||||||
|
| Draft methodology section | TBD | TBD | For paper |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Appendix: Code for Statistical Threshold Calculation
|
||||||
|
|
||||||
|
```python
|
||||||
|
import numpy as np
|
||||||
|
from scipy import stats
|
||||||
|
|
||||||
|
# Your similarity data
|
||||||
|
similarities = np.asarray([...])  # Load from your analysis (array form so the vectorized comparison below works)
|
||||||
|
|
||||||
|
# Calculate statistics
|
||||||
|
mean_sim = np.mean(similarities)
|
||||||
|
std_sim = np.std(similarities)
|
||||||
|
percentiles = np.percentile(similarities, [90, 95, 99, 99.7])
|
||||||
|
|
||||||
|
print(f"Mean (μ): {mean_sim:.4f}")
|
||||||
|
print(f"Std (σ): {std_sim:.4f}")
|
||||||
|
print(f"μ + 2σ: {mean_sim + 2*std_sim:.4f}")
|
||||||
|
print(f"μ + 3σ: {mean_sim + 3*std_sim:.4f}")
|
||||||
|
print(f"Percentiles: 90%={percentiles[0]:.4f}, 95%={percentiles[1]:.4f}, "
|
||||||
|
f"99%={percentiles[2]:.4f}, 99.7%={percentiles[3]:.4f}")
|
||||||
|
|
||||||
|
# Threshold recommendations
|
||||||
|
thresholds = {
|
||||||
|
"Conservative (μ+3σ)": min(1.0, mean_sim + 3*std_sim),
|
||||||
|
"Standard (μ+2σ)": mean_sim + 2*std_sim,
|
||||||
|
"Liberal (95th percentile)": percentiles[1],
|
||||||
|
}
|
||||||
|
|
||||||
|
for name, thresh in thresholds.items():
|
||||||
|
count_above = np.sum(similarities > thresh)
|
||||||
|
pct_above = 100 * count_above / len(similarities)
|
||||||
|
print(f"{name}: {thresh:.4f} → {count_above} pairs ({pct_above:.2f}%)")
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
*Document prepared for research discussion. Please share feedback and questions with the team.*
|
||||||
Reference in New Issue
Block a user