#!/usr/bin/env python3
"""
Recalibrate classification using Firm A as ground truth.
Dual-method only: Cosine + dHash (drops SSIM and pixel-identical).

Approach:
1. Load per-signature best-match cosine + pHash from DB
2. Use Firm A (勤業眾信聯合) as known-positive calibration set
3. Analyze 2D distribution (cosine × pHash) for Firm A vs others
4. Determine calibrated thresholds
5. Reclassify all PDFs
6. Output new Table VII
"""
import sqlite3
import numpy as np
from collections import defaultdict
from pathlib import Path
import json

DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
OUTPUT_DIR = Path('/Volumes/NV2/PDF-Processing/signature-analysis/recalibrated')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Firm used as the known-positive calibration set.
FIRM_A = '勤業眾信聯合'
# Cosine value where the intra-/inter-accountant KDE curves cross
# (from a prior intra/inter analysis; treated as the genuine/uncertain boundary).
KDE_CROSSOVER = 0.837


def load_data():
    """Load per-signature data with cosine and pHash.

    Returns:
        list[dict]: one dict per assigned signature with keys
        ``sig_id``, ``filename``, ``accountant``, ``cosine``,
        ``phash`` (may be None when no pHash was computed) and ``firm``.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cur = conn.cursor()
        cur.execute('''
            SELECT s.signature_id, s.image_filename, s.assigned_accountant,
                   s.max_similarity_to_same_accountant,
                   s.phash_distance_to_closest, a.firm
            FROM signatures s
            LEFT JOIN accountants a ON s.assigned_accountant = a.name
            WHERE s.assigned_accountant IS NOT NULL
              AND s.max_similarity_to_same_accountant IS NOT NULL
        ''')
        rows = cur.fetchall()
    finally:
        conn.close()

    data = [
        {
            'sig_id': sig_id,
            'filename': filename,
            'accountant': accountant,
            'cosine': cosine,
            'phash': phash,  # may be None
            'firm': firm,
        }
        for sig_id, filename, accountant, cosine, phash, firm in rows
    ]
    print(f"Loaded {len(data):,} signatures")
    return data


def analyze_firm_a(data):
    """Analyze Firm A's dual-method distribution to calibrate thresholds.

    Prints cosine and pHash summary statistics for Firm A vs all other
    firms, then cross-tabulates the 2D (cosine x pHash) distribution at
    candidate thresholds.

    Args:
        data: output of :func:`load_data`.

    Returns:
        tuple[list, list]: (Firm A pHash values, other-firm pHash values),
        each restricted to signatures where pHash is available.
    """
    firm_a = [d for d in data if d['firm'] == FIRM_A]
    others = [d for d in data if d['firm'] != FIRM_A]

    print(f"\n{'='*60}")
    print(f"FIRM A CALIBRATION ANALYSIS")
    print(f"{'='*60}")
    print(f"Firm A signatures: {len(firm_a):,}")
    print(f"Other signatures: {len(others):,}")

    # Firm A cosine distribution
    fa_cosine = np.array([d['cosine'] for d in firm_a])
    ot_cosine = np.array([d['cosine'] for d in others])
    print(f"\n--- Cosine Similarity ---")
    print(f"Firm A: mean={fa_cosine.mean():.4f}, std={fa_cosine.std():.4f}, "
          f"p1={np.percentile(fa_cosine,1):.4f}, p5={np.percentile(fa_cosine,5):.4f}")
    print(f"Others: mean={ot_cosine.mean():.4f}, std={ot_cosine.std():.4f}")

    # Firm A pHash distribution (only where available)
    fa_phash = [d['phash'] for d in firm_a if d['phash'] is not None]
    ot_phash = [d['phash'] for d in others if d['phash'] is not None]
    print(f"\n--- pHash (dHash) Distance ---")
    print(f"Firm A with pHash: {len(fa_phash):,}")
    print(f"Others with pHash: {len(ot_phash):,}")

    if fa_phash:
        fa_ph = np.array(fa_phash)
        print(f"Firm A: mean={fa_ph.mean():.2f}, median={np.median(fa_ph):.0f}, "
              f"p95={np.percentile(fa_ph,95):.0f}")
        print(f" pHash=0: {(fa_ph==0).sum():,} ({100*(fa_ph==0).mean():.1f}%)")
        print(f" pHash<=2: {(fa_ph<=2).sum():,} ({100*(fa_ph<=2).mean():.1f}%)")
        print(f" pHash<=5: {(fa_ph<=5).sum():,} ({100*(fa_ph<=5).mean():.1f}%)")
        print(f" pHash<=10:{(fa_ph<=10).sum():,} ({100*(fa_ph<=10).mean():.1f}%)")
        print(f" pHash<=15:{(fa_ph<=15).sum():,} ({100*(fa_ph<=15).mean():.1f}%)")
        print(f" pHash>15: {(fa_ph>15).sum():,} ({100*(fa_ph>15).mean():.1f}%)")

    if ot_phash:
        ot_ph = np.array(ot_phash)
        print(f"\nOthers: mean={ot_ph.mean():.2f}, median={np.median(ot_ph):.0f}")
        print(f" pHash=0: {(ot_ph==0).sum():,} ({100*(ot_ph==0).mean():.1f}%)")
        print(f" pHash<=5: {(ot_ph<=5).sum():,} ({100*(ot_ph<=5).mean():.1f}%)")
        print(f" pHash<=10:{(ot_ph<=10).sum():,} ({100*(ot_ph<=10).mean():.1f}%)")
        print(f" pHash>15: {(ot_ph>15).sum():,} ({100*(ot_ph>15).mean():.1f}%)")

    # 2D analysis: cosine × pHash for Firm A — how much of the known-positive
    # set each candidate threshold pair would capture.
    print(f"\n--- 2D Analysis: Cosine × pHash (Firm A) ---")
    fa_both = [(d['cosine'], d['phash']) for d in firm_a if d['phash'] is not None]
    if fa_both:
        cosines, phashes = zip(*fa_both)
        cosines = np.array(cosines)
        phashes = np.array(phashes)
        # Cross-tabulate
        for cos_thresh in [0.95, 0.90, KDE_CROSSOVER]:
            for ph_thresh in [5, 10, 15]:
                match = ((cosines > cos_thresh) & (phashes <= ph_thresh)).sum()
                total = len(cosines)
                print(f" Cosine>{cos_thresh:.3f} AND pHash<={ph_thresh}: "
                      f"{match:,}/{total:,} ({100*match/total:.1f}%)")

    # Same for others, restricted to the high-cosine subset — estimates how
    # often non-calibration firms would also be flagged at each pHash cut.
    print(f"\n--- 2D Analysis: Cosine × pHash (Others, cosine > 0.95 only) ---")
    ot_both_high = [(d['cosine'], d['phash']) for d in others
                    if d['phash'] is not None and d['cosine'] > 0.95]
    if ot_both_high:
        cosines_o, phashes_o = zip(*ot_both_high)
        phashes_o = np.array(phashes_o)
        print(f" N (others with cosine>0.95 and pHash): {len(ot_both_high):,}")
        for ph_thresh in [5, 10, 15]:
            match = (phashes_o <= ph_thresh).sum()
            print(f" pHash<={ph_thresh}: {match:,}/{len(phashes_o):,} ({100*match/len(phashes_o):.1f}%)")

    return fa_phash, ot_phash


def _pdf_key(filename):
    """Derive the source-PDF identifier from a signature image filename.

    Filenames follow ``{pdfname}_page{N}_sig{M}.png``: strip the
    ``_sig{M}`` suffix first (falling back to dropping the extension),
    then strip the ``_page{N}`` suffix.
    """
    parts = filename.rsplit('_sig', 1)
    key = parts[0] if len(parts) > 1 else filename.rsplit('.', 1)[0]
    page_parts = key.rsplit('_page', 1)
    return page_parts[0] if len(page_parts) > 1 else key


def reclassify_pdfs(data):
    """
    Reclassify all PDFs using calibrated dual-method thresholds.

    New classification (cosine + dHash only):
    1. High-confidence replication: cosine > 0.95 AND pHash ≤ 5
    2. Moderate-confidence replication: cosine > 0.95 AND pHash 6-15
    3. High style consistency: cosine > 0.95 AND (pHash > 15 OR pHash unavailable)
    4. Uncertain: cosine between KDE_CROSSOVER and 0.95
    5. Likely genuine: cosine < KDE_CROSSOVER

    Args:
        data: output of :func:`load_data` (used directly — no second DB query).

    Returns:
        dict: the results structure also written to
        ``recalibrated_results.json`` in ``OUTPUT_DIR``.
    """
    # Group signatures by source PDF, derived from the image filename.
    # NOTE: previously this re-ran the same SQL as load_data(); the passed-in
    # rows already carry every field needed, so we group those instead.
    pdf_sigs = defaultdict(list)
    for d in data:
        pdf_sigs[_pdf_key(d['filename'])].append({
            'cosine': d['cosine'],
            'phash': d['phash'],
            'firm': d['firm'],
            'accountant': d['accountant'],
        })

    print(f"\n{'='*60}")
    print(f"RECLASSIFICATION (Dual-Method: Cosine + dHash)")
    print(f"{'='*60}")
    print(f"Total PDFs: {len(pdf_sigs):,}")

    # Classify each PDF based on its signatures
    verdicts = defaultdict(int)
    firm_a_verdicts = defaultdict(int)

    for pdf_key, sigs in pdf_sigs.items():
        # Use the signature with the highest cosine as the representative
        best_sig = max(sigs, key=lambda s: s['cosine'])
        cosine = best_sig['cosine']
        is_firm_a = best_sig['firm'] == FIRM_A

        # Use the lowest pHash across ALL signatures in this PDF: a single
        # near-duplicate signature is enough to flag the document.
        available = [s['phash'] for s in sigs if s['phash'] is not None]
        min_phash = min(available) if available else None

        # Classification
        if cosine > 0.95 and min_phash is not None and min_phash <= 5:
            verdict = 'high_confidence_replication'
        elif cosine > 0.95 and min_phash is not None and min_phash <= 15:
            verdict = 'moderate_confidence_replication'
        elif cosine > 0.95:
            verdict = 'high_style_consistency'
        elif cosine > KDE_CROSSOVER:
            verdict = 'uncertain'
        else:
            verdict = 'likely_genuine'

        verdicts[verdict] += 1
        if is_firm_a:
            firm_a_verdicts[verdict] += 1

    total = sum(verdicts.values())
    firm_a_total = sum(firm_a_verdicts.values())

    # Print results
    print(f"\n--- New Classification Results ---")
    print(f"{'Verdict':<35} {'Count':>8} {'%':>7} | {'Firm A':>8} {'%':>7}")
    print("-" * 75)

    order = ['high_confidence_replication', 'moderate_confidence_replication',
             'high_style_consistency', 'uncertain', 'likely_genuine']
    labels = {
        'high_confidence_replication': 'High-conf. replication',
        'moderate_confidence_replication': 'Moderate-conf. replication',
        'high_style_consistency': 'High style consistency',
        'uncertain': 'Uncertain',
        'likely_genuine': 'Likely genuine',
    }

    for v in order:
        n = verdicts.get(v, 0)
        fa = firm_a_verdicts.get(v, 0)
        pct = 100 * n / total if total > 0 else 0
        fa_pct = 100 * fa / firm_a_total if firm_a_total > 0 else 0
        print(f" {labels.get(v, v):<33} {n:>8,} {pct:>6.1f}% | {fa:>8,} {fa_pct:>6.1f}%")

    print("-" * 75)
    print(f" {'Total':<33} {total:>8,} {'100.0%':>7} | {firm_a_total:>8,} {'100.0%':>7}")

    # Precision/Recall using Firm A as positive set.
    # Guard against firm_a_total == 0 (same guard as the table above).
    print(f"\n--- Firm A Capture Rate (Calibration Validation) ---")
    fa_replication = firm_a_verdicts.get('high_confidence_replication', 0) + \
                     firm_a_verdicts.get('moderate_confidence_replication', 0)
    fa_rep_pct = 100 * fa_replication / firm_a_total if firm_a_total > 0 else 0
    print(f" Firm A classified as replication (high+moderate): {fa_replication:,}/{firm_a_total:,} "
          f"({fa_rep_pct:.1f}%)")
    fa_high = firm_a_verdicts.get('high_confidence_replication', 0)
    fa_high_pct = 100 * fa_high / firm_a_total if firm_a_total > 0 else 0
    print(f" Firm A classified as high-confidence: {fa_high:,}/{firm_a_total:,} "
          f"({fa_high_pct:.1f}%)")

    # Save results
    results = {
        'classification': {v: verdicts.get(v, 0) for v in order},
        'firm_a': {v: firm_a_verdicts.get(v, 0) for v in order},
        'total_pdfs': total,
        'firm_a_pdfs': firm_a_total,
        'thresholds': {
            'cosine_high': 0.95,
            'kde_crossover': KDE_CROSSOVER,
            'phash_high_confidence': 5,
            'phash_moderate_confidence': 15,
        },
    }
    with open(OUTPUT_DIR / 'recalibrated_results.json', 'w') as f:
        json.dump(results, f, indent=2)
    print(f"\nResults saved: {OUTPUT_DIR / 'recalibrated_results.json'}")

    return results


def main():
    """Run the full pipeline: load, calibrate on Firm A, reclassify."""
    data = load_data()
    analyze_firm_a(data)
    reclassify_pdfs(data)


if __name__ == "__main__":
    main()