#!/usr/bin/env python3
"""
Generate complete PDF-level Excel report with Firm A-calibrated dual-method
classification.

Output: One row per PDF with identification, CPA info, detection stats,
cosine similarity, dHash distance, and new dual-method verdicts.
"""
import sqlite3
import numpy as np
import openpyxl
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
from collections import defaultdict
from pathlib import Path
from datetime import datetime

DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
OUTPUT_DIR = Path('/Volumes/NV2/PDF-Processing/signature-analysis/recalibrated')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_PATH = OUTPUT_DIR / 'pdf_level_recalibrated_report.xlsx'

# Calibration constants. Firm A is the reference firm whose dHash distance
# distribution (median=5, p95=15) anchors the replication thresholds.
FIRM_A = '勤業眾信聯合'
KDE_CROSSOVER = 0.837    # cosine KDE crossover between the two populations
COSINE_HIGH = 0.95       # cosine threshold for high style consistency
PHASH_HIGH_CONF = 5      # dHash distance <= Firm A median -> high confidence
PHASH_MOD_CONF = 15      # dHash distance <= Firm A p95 -> moderate confidence


def load_all_data():
    """Load all signature rows from SQLite and group them by source PDF.

    Returns:
        dict mapping pdf_key -> {'signatures': [sig dicts],
        'accountants': set of CPA names, 'firms': set of firm names}.
        The pdf_key is the image filename with its trailing ``_page{N}`` /
        ``_sig{M}`` suffixes (or extension) stripped.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cur = conn.cursor()
        # One pass: per-signature stats, the assigned accountant's profile,
        # and the YOLO detection confidence (previously a second full-table
        # query keyed on filename).
        cur.execute('''
            SELECT s.signature_id, s.image_filename, s.assigned_accountant,
                   s.max_similarity_to_same_accountant,
                   s.phash_distance_to_closest, s.ssim_to_closest,
                   s.signature_verdict,
                   a.firm, a.risk_level, a.mean_similarity, a.ratio_gt_95,
                   a.signature_count,
                   s.detection_confidence
            FROM signatures s
            LEFT JOIN accountants a ON s.assigned_accountant = a.name
            WHERE s.assigned_accountant IS NOT NULL
        ''')
        rows = cur.fetchall()
    finally:
        conn.close()

    pdf_data = defaultdict(lambda: {
        'signatures': [],
        'accountants': set(),
        'firms': set(),
    })

    for r in rows:
        (sig_id, filename, accountant, cosine, phash, ssim, verdict,
         firm, risk, mean_sim, ratio95, sig_count, det_conf) = r

        # Extract PDF key from filename.
        # Format: {company}_{year}_{type}_page{N}_sig{M}.png or similar
        parts = filename.rsplit('_sig', 1)
        pdf_key = parts[0] if len(parts) > 1 else filename.rsplit('.', 1)[0]
        page_parts = pdf_key.rsplit('_page', 1)
        pdf_key = page_parts[0] if len(page_parts) > 1 else pdf_key

        pdf_data[pdf_key]['signatures'].append({
            'sig_id': sig_id,
            'filename': filename,
            'accountant': accountant,
            'cosine': cosine,
            'phash': phash,
            'ssim': ssim,
            'old_verdict': verdict,
            'firm': firm,
            'risk_level': risk,
            'acct_mean_sim': mean_sim,
            'acct_ratio_95': ratio95,
            'acct_sig_count': sig_count,
            'detection_conf': det_conf,
        })
        if accountant:
            pdf_data[pdf_key]['accountants'].add(accountant)
        if firm:
            pdf_data[pdf_key]['firms'].add(firm)

    print(f"Loaded {sum(len(v['signatures']) for v in pdf_data.values()):,} "
          f"signatures across {len(pdf_data):,} PDFs")
    return pdf_data


def classify_dual_method(max_cosine, min_phash):
    """New dual-method classification with Firm A-calibrated thresholds.

    Args:
        max_cosine: PDF-level max cosine similarity (None if no data).
        min_phash: PDF-level min dHash distance (None if no data).

    Returns:
        (verdict, confidence) string pair.
    """
    if max_cosine is None:
        return 'unknown', 'none'
    if max_cosine > COSINE_HIGH:
        # High cosine alone only proves style consistency; the dHash
        # distance decides whether it looks like pixel-level replication.
        if min_phash is not None and min_phash <= PHASH_HIGH_CONF:
            return 'high_confidence_replication', 'high'
        elif min_phash is not None and min_phash <= PHASH_MOD_CONF:
            return 'moderate_confidence_replication', 'medium'
        else:
            return 'high_style_consistency', 'low'
    elif max_cosine > KDE_CROSSOVER:
        return 'uncertain', 'low'
    else:
        return 'likely_genuine', 'medium'


def _cosine_verdict(cos):
    """Per-signature cosine-only verdict ('high' / 'uncertain' / 'low')."""
    if cos is None:
        return None
    if cos > COSINE_HIGH:
        return 'high'
    if cos > KDE_CROSSOVER:
        return 'uncertain'
    return 'low'


def build_report(pdf_data):
    """Build the Excel report (main sheet + summary sheet) and save it."""
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = "PDF-Level Report"

    # Define columns, grouped by color-coded category.
    columns = [
        # Group A: PDF Identification (Blue)
        ('pdf_key', 'PDF Key'),
        ('n_signatures', '# Signatures'),
        # Group B: CPA Info (Green)
        ('accountant_1', 'CPA 1 Name'),
        ('accountant_2', 'CPA 2 Name'),
        ('firm_1', 'Firm 1'),
        ('firm_2', 'Firm 2'),
        ('is_firm_a', 'Is Firm A'),
        # Group C: Detection (Yellow)
        ('avg_detection_conf', 'Avg Detection Conf'),
        # Group D: Cosine Similarity - Sig 1 (Red)
        ('sig1_cosine', 'Sig1 Max Cosine'),
        ('sig1_cosine_verdict', 'Sig1 Cosine Verdict'),
        ('sig1_acct_mean', 'Sig1 CPA Mean Sim'),
        ('sig1_acct_ratio95', 'Sig1 CPA >0.95 Ratio'),
        ('sig1_acct_count', 'Sig1 CPA Sig Count'),
        # Group E: Cosine Similarity - Sig 2 (Purple)
        ('sig2_cosine', 'Sig2 Max Cosine'),
        ('sig2_cosine_verdict', 'Sig2 Cosine Verdict'),
        ('sig2_acct_mean', 'Sig2 CPA Mean Sim'),
        ('sig2_acct_ratio95', 'Sig2 CPA >0.95 Ratio'),
        ('sig2_acct_count', 'Sig2 CPA Sig Count'),
        # Group F: dHash Distance (Orange)
        ('min_phash', 'Min dHash Distance'),
        ('max_phash', 'Max dHash Distance'),
        ('avg_phash', 'Avg dHash Distance'),
        ('sig1_phash', 'Sig1 dHash Distance'),
        ('sig2_phash', 'Sig2 dHash Distance'),
        # Group G: SSIM (for reference only) (Gray)
        ('max_ssim', 'Max SSIM'),
        ('avg_ssim', 'Avg SSIM'),
        # Group H: Dual-Method Classification (Dark Blue)
        ('dual_verdict', 'Dual-Method Verdict'),
        ('dual_confidence', 'Confidence Level'),
        ('max_cosine', 'PDF Max Cosine'),
        ('pdf_min_phash', 'PDF Min dHash'),
        # Group I: CPA Risk (Teal)
        ('sig1_risk', 'Sig1 CPA Risk Level'),
        ('sig2_risk', 'Sig2 CPA Risk Level'),
    ]
    col_keys = [c[0] for c in columns]
    col_names = [c[1] for c in columns]

    # Header styles
    header_fill = PatternFill(start_color='1F4E79', end_color='1F4E79',
                              fill_type='solid')
    header_font = Font(name='Arial', size=9, bold=True, color='FFFFFF')
    data_font = Font(name='Arial', size=9)
    thin_border = Border(
        left=Side(style='thin'),
        right=Side(style='thin'),
        top=Side(style='thin'),
        bottom=Side(style='thin'),
    )

    # Group fill colors, and half-open [start, end) 0-based column ranges.
    group_colors = {
        'A': 'D6E4F0',  # Blue - PDF ID
        'B': 'D9E2D0',  # Green - CPA
        'C': 'FFF2CC',  # Yellow - Detection
        'D': 'F4CCCC',  # Red - Cosine Sig1
        'E': 'E1D5E7',  # Purple - Cosine Sig2
        'F': 'FFE0B2',  # Orange - dHash
        'G': 'E0E0E0',  # Gray - SSIM
        'H': 'B3D4FC',  # Dark Blue - Dual method
        'I': 'B2DFDB',  # Teal - Risk
    }
    group_ranges = {
        'A': (0, 2),
        'B': (2, 7),
        'C': (7, 8),
        'D': (8, 13),
        'E': (13, 18),
        'F': (18, 23),
        'G': (23, 25),
        'H': (25, 29),
        'I': (29, 31),
    }

    # Write header
    for col_idx, name in enumerate(col_names, 1):
        cell = ws.cell(row=1, column=col_idx, value=name)
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = Alignment(horizontal='center', wrap_text=True)
        cell.border = thin_border

    # Process PDFs
    row_idx = 2
    verdict_counts = defaultdict(int)
    firm_a_counts = defaultdict(int)

    for pdf_key, pdata in sorted(pdf_data.items()):
        sigs = pdata['signatures']
        if not sigs:
            continue

        # Sort signatures by filename so sig1/sig2 ordering is deterministic.
        sigs_sorted = sorted(sigs, key=lambda s: s['filename'])
        sig1 = sigs_sorted[0] if len(sigs_sorted) > 0 else None
        sig2 = sigs_sorted[1] if len(sigs_sorted) > 1 else None

        # Compute PDF-level aggregates, skipping NULLs.
        cosines = [s['cosine'] for s in sigs if s['cosine'] is not None]
        phashes = [s['phash'] for s in sigs if s['phash'] is not None]
        ssims = [s['ssim'] for s in sigs if s['ssim'] is not None]
        confs = [s['detection_conf'] for s in sigs
                 if s['detection_conf'] is not None]

        max_cosine = max(cosines) if cosines else None
        min_phash = min(phashes) if phashes else None
        max_phash = max(phashes) if phashes else None
        # float() so openpyxl stores native Python floats, not numpy scalars.
        avg_phash = float(np.mean(phashes)) if phashes else None
        max_ssim = max(ssims) if ssims else None
        avg_ssim = float(np.mean(ssims)) if ssims else None
        avg_conf = float(np.mean(confs)) if confs else None

        is_firm_a = FIRM_A in pdata['firms']

        # Dual-method classification
        verdict, confidence = classify_dual_method(max_cosine, min_phash)
        verdict_counts[verdict] += 1
        if is_firm_a:
            firm_a_counts[verdict] += 1

        # NOTE: explicit `is not None` checks below — a legitimate value of
        # 0.0 must be reported, not silently collapsed to an empty cell.
        row_data = {
            'pdf_key': pdf_key,
            'n_signatures': len(sigs),
            'accountant_1': sig1['accountant'] if sig1 else None,
            'accountant_2': sig2['accountant'] if sig2 else None,
            'firm_1': sig1['firm'] if sig1 else None,
            'firm_2': sig2['firm'] if sig2 else None,
            'is_firm_a': 'Yes' if is_firm_a else 'No',
            'avg_detection_conf': round(avg_conf, 4)
                if avg_conf is not None else None,
            'sig1_cosine': round(sig1['cosine'], 4)
                if sig1 and sig1['cosine'] is not None else None,
            'sig1_cosine_verdict': _cosine_verdict(sig1['cosine'])
                if sig1 else None,
            'sig1_acct_mean': round(sig1['acct_mean_sim'], 4)
                if sig1 and sig1['acct_mean_sim'] is not None else None,
            'sig1_acct_ratio95': round(sig1['acct_ratio_95'], 4)
                if sig1 and sig1['acct_ratio_95'] is not None else None,
            'sig1_acct_count': sig1['acct_sig_count'] if sig1 else None,
            'sig2_cosine': round(sig2['cosine'], 4)
                if sig2 and sig2['cosine'] is not None else None,
            'sig2_cosine_verdict': _cosine_verdict(sig2['cosine'])
                if sig2 else None,
            'sig2_acct_mean': round(sig2['acct_mean_sim'], 4)
                if sig2 and sig2['acct_mean_sim'] is not None else None,
            'sig2_acct_ratio95': round(sig2['acct_ratio_95'], 4)
                if sig2 and sig2['acct_ratio_95'] is not None else None,
            'sig2_acct_count': sig2['acct_sig_count'] if sig2 else None,
            'min_phash': min_phash,
            'max_phash': max_phash,
            'avg_phash': round(avg_phash, 2) if avg_phash is not None else None,
            'sig1_phash': sig1['phash'] if sig1 else None,
            'sig2_phash': sig2['phash'] if sig2 else None,
            'max_ssim': round(max_ssim, 4) if max_ssim is not None else None,
            'avg_ssim': round(avg_ssim, 4) if avg_ssim is not None else None,
            'dual_verdict': verdict,
            'dual_confidence': confidence,
            'max_cosine': round(max_cosine, 4)
                if max_cosine is not None else None,
            'pdf_min_phash': min_phash,
            'sig1_risk': sig1['risk_level'] if sig1 else None,
            'sig2_risk': sig2['risk_level'] if sig2 else None,
        }

        for col_idx, key in enumerate(col_keys, 1):
            val = row_data.get(key)
            cell = ws.cell(row=row_idx, column=col_idx, value=val)
            cell.font = data_font
            cell.border = thin_border

            # Background color by column group
            for group, (start, end) in group_ranges.items():
                if start <= col_idx - 1 < end:
                    cell.fill = PatternFill(start_color=group_colors[group],
                                            end_color=group_colors[group],
                                            fill_type='solid')
                    break

            # Highlight the 'Is Firm A' cell (column 7) on Firm A rows
            if is_firm_a and col_idx == 7:
                cell.font = Font(name='Arial', size=9, bold=True,
                                 color='CC0000')

            # Color-code the dual-method verdict text
            if key == 'dual_verdict':
                colors = {
                    'high_confidence_replication': 'FF0000',
                    'moderate_confidence_replication': 'FF6600',
                    'high_style_consistency': '009900',
                    'uncertain': 'FF9900',
                    'likely_genuine': '006600',
                }
                if val in colors:
                    cell.font = Font(name='Arial', size=9, bold=True,
                                     color=colors[val])

        row_idx += 1

    # Fixed column widths
    for col_idx in range(1, len(col_keys) + 1):
        ws.column_dimensions[
            openpyxl.utils.get_column_letter(col_idx)].width = 15

    # Freeze header and enable filtering
    ws.freeze_panes = 'A2'
    ws.auto_filter.ref = (
        f"A1:{openpyxl.utils.get_column_letter(len(col_keys))}{row_idx-1}")

    # === Summary Sheet ===
    ws2 = wb.create_sheet("Summary")
    ws2.cell(row=1, column=1,
             value="Dual-Method Classification Summary").font = Font(
                 size=14, bold=True)
    ws2.cell(row=2, column=1,
             value=f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}")
    ws2.cell(row=3, column=1,
             value="Calibration: Firm A (dHash median=5, p95=15)")

    ws2.cell(row=5, column=1, value="Verdict").font = Font(bold=True)
    ws2.cell(row=5, column=2, value="Count").font = Font(bold=True)
    ws2.cell(row=5, column=3, value="%").font = Font(bold=True)
    ws2.cell(row=5, column=4, value="Firm A").font = Font(bold=True)
    ws2.cell(row=5, column=5, value="Firm A %").font = Font(bold=True)

    total = sum(verdict_counts.values())
    fa_total = sum(firm_a_counts.values())
    order = ['high_confidence_replication', 'moderate_confidence_replication',
             'high_style_consistency', 'uncertain', 'likely_genuine',
             'unknown']
    for i, v in enumerate(order):
        n = verdict_counts.get(v, 0)
        fa = firm_a_counts.get(v, 0)
        ws2.cell(row=6+i, column=1, value=v)
        ws2.cell(row=6+i, column=2, value=n)
        ws2.cell(row=6+i, column=3,
                 value=f"{100*n/total:.1f}%" if total > 0 else "0%")
        ws2.cell(row=6+i, column=4, value=fa)
        ws2.cell(row=6+i, column=5,
                 value=f"{100*fa/fa_total:.1f}%" if fa_total > 0 else "0%")

    ws2.cell(row=6+len(order), column=1, value="Total").font = Font(bold=True)
    ws2.cell(row=6+len(order), column=2, value=total)
    ws2.cell(row=6+len(order), column=4, value=fa_total)

    # Thresholds
    ws2.cell(row=15, column=1,
             value="Thresholds Used").font = Font(size=12, bold=True)
    ws2.cell(row=16, column=1, value="Cosine high threshold")
    ws2.cell(row=16, column=2, value=COSINE_HIGH)
    ws2.cell(row=17, column=1, value="KDE crossover")
    ws2.cell(row=17, column=2, value=KDE_CROSSOVER)
    ws2.cell(row=18, column=1, value="dHash high-confidence (Firm A median)")
    ws2.cell(row=18, column=2, value=PHASH_HIGH_CONF)
    ws2.cell(row=19, column=1, value="dHash moderate-confidence (Firm A p95)")
    ws2.cell(row=19, column=2, value=PHASH_MOD_CONF)

    for col in range(1, 6):
        ws2.column_dimensions[
            openpyxl.utils.get_column_letter(col)].width = 30

    # Save
    wb.save(str(OUTPUT_PATH))
    print(f"\nSaved: {OUTPUT_PATH}")
    print(f"Total PDFs: {total:,}")
    print(f"Firm A PDFs: {fa_total:,}")

    # Console summary table
    print(f"\n{'Verdict':<35} {'Count':>8} {'%':>7} | {'Firm A':>8} {'%':>7}")
    print("-" * 70)
    for v in order:
        n = verdict_counts.get(v, 0)
        fa = firm_a_counts.get(v, 0)
        if n > 0:
            if fa_total > 0:
                print(f" {v:<33} {n:>8,} {100*n/total:>6.1f}% "
                      f"| {fa:>8,} {100*fa/fa_total:>6.1f}%")
            else:
                print(f" {v:<33} {n:>8,} {100*n/total:>6.1f}%")
    print("-" * 70)
    print(f" {'Total':<33} {total:>8,} | {fa_total:>8,}")


def main():
    print("=" * 60)
    print("Generating Recalibrated PDF-Level Report")
    print(f"Calibration: Firm A ({FIRM_A})")
    print("Method: Dual (Cosine + dHash)")
    print("=" * 60)
    pdf_data = load_all_data()
    build_report(pdf_data)


if __name__ == "__main__":
    main()