939a348da4
Paper draft includes all sections (Abstract through Conclusion), 36 references, and supporting scripts. Key methodology: Cosine similarity + dHash dual-method verification with thresholds calibrated against known-replication firm (Firm A). Includes: - 8 section markdown files (paper_a_*.md) - Ablation study script (ResNet-50 vs VGG-16 vs EfficientNet-B0) - Recalibrated classification script (84,386 PDFs, 5-tier system) - Figure generation and Word export scripts - Citation renumbering script ([1]-[36]) - Signature analysis pipeline (12 steps) - YOLO extraction scripts Three rounds of AI review completed (GPT-5.4, Claude Opus 4.6, Gemini 3 Pro). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
414 lines
16 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Generate complete PDF-level Excel report with Firm A-calibrated dual-method classification.
|
|
Output: One row per PDF with identification, CPA info, detection stats,
|
|
cosine similarity, dHash distance, and new dual-method verdicts.
|
|
"""
|
|
|
|
import sqlite3
|
|
import numpy as np
|
|
import openpyxl
|
|
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
|
|
# Source SQLite database produced by the signature-analysis pipeline.
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'

# All recalibrated outputs are written under this directory.
OUTPUT_DIR = Path('/Volumes/NV2/PDF-Processing/signature-analysis/recalibrated')

OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

OUTPUT_PATH = OUTPUT_DIR / 'pdf_level_recalibrated_report.xlsx'

# Audit firm used to calibrate the dHash thresholds (the known-replication
# "Firm A" referenced throughout the report).
FIRM_A = '勤業眾信聯合'

# Cosine-similarity cut-offs: KDE crossover point between the two similarity
# distributions, and the high-similarity threshold.
KDE_CROSSOVER = 0.837

COSINE_HIGH = 0.95

# dHash distance cut-offs calibrated on Firm A (median = 5, p95 = 15; see the
# "Thresholds Used" section of the Summary sheet).
PHASH_HIGH_CONF = 5

PHASH_MOD_CONF = 15
|
|
|
|
|
|
def load_all_data(db_path=None):
    """Load all assigned signatures from the database, grouped by source PDF.

    Args:
        db_path: Path to the signature-analysis SQLite database. Defaults to
            the module-level DB_PATH when None (backward compatible).

    Returns:
        dict mapping pdf_key -> {
            'signatures': list of per-signature dicts (similarity stats,
                accountant profile fields, YOLO detection confidence),
            'accountants': set of CPA names seen in this PDF,
            'firms': set of firm names seen in this PDF,
        }
    """
    conn = sqlite3.connect(db_path if db_path is not None else DB_PATH)
    try:
        cur = conn.cursor()

        # Single query for signature stats, accountant profile, and YOLO
        # detection confidence. (Previously detection_confidence was fetched
        # by a second full-table query and joined in Python — redundant, since
        # it lives on the same `signatures` rows.)
        cur.execute('''
            SELECT s.signature_id, s.image_filename, s.assigned_accountant,
                   s.max_similarity_to_same_accountant,
                   s.phash_distance_to_closest,
                   s.ssim_to_closest,
                   s.signature_verdict,
                   a.firm, a.risk_level, a.mean_similarity, a.ratio_gt_95,
                   a.signature_count,
                   s.detection_confidence
            FROM signatures s
            LEFT JOIN accountants a ON s.assigned_accountant = a.name
            WHERE s.assigned_accountant IS NOT NULL
        ''')
        rows = cur.fetchall()
    finally:
        # Close the connection even if the query fails.
        conn.close()

    # Group by PDF.
    pdf_data = defaultdict(lambda: {
        'signatures': [],
        'accountants': set(),
        'firms': set(),
    })

    for r in rows:
        (sig_id, filename, accountant, cosine, phash, ssim, verdict,
         firm, risk, mean_sim, ratio95, sig_count, det_conf) = r

        # Derive the PDF key from the crop filename.
        # Format: {company}_{year}_{type}_page{N}_sig{M}.png or similar —
        # strip the trailing "_sig{M}" then the trailing "_page{N}".
        parts = filename.rsplit('_sig', 1)
        pdf_key = parts[0] if len(parts) > 1 else filename.rsplit('.', 1)[0]
        page_parts = pdf_key.rsplit('_page', 1)
        pdf_key = page_parts[0] if len(page_parts) > 1 else pdf_key

        pdf_data[pdf_key]['signatures'].append({
            'sig_id': sig_id,
            'filename': filename,
            'accountant': accountant,
            'cosine': cosine,
            'phash': phash,
            'ssim': ssim,
            'old_verdict': verdict,
            'firm': firm,
            'risk_level': risk,
            'acct_mean_sim': mean_sim,
            'acct_ratio_95': ratio95,
            'acct_sig_count': sig_count,
            'detection_conf': det_conf,
        })
        if accountant:
            pdf_data[pdf_key]['accountants'].add(accountant)
        if firm:
            pdf_data[pdf_key]['firms'].add(firm)

    print(f"Loaded {sum(len(v['signatures']) for v in pdf_data.values()):,} signatures across {len(pdf_data):,} PDFs")
    return pdf_data
|
|
|
|
|
|
def classify_dual_method(max_cosine, min_phash):
    """Classify a PDF with the Firm A-calibrated cosine + dHash thresholds.

    Args:
        max_cosine: highest cosine similarity across the PDF's signatures,
            or None when no similarity is available.
        min_phash: smallest dHash distance across the PDF's signatures,
            or None when unavailable.

    Returns:
        (verdict, confidence) tuple of strings.
    """
    # No cosine evidence at all -> nothing to classify.
    if max_cosine is None:
        return 'unknown', 'none'

    # Below the KDE crossover the signature looks genuine.
    if max_cosine <= KDE_CROSSOVER:
        return 'likely_genuine', 'medium'

    # Between crossover and the high threshold we cannot commit either way.
    if max_cosine <= COSINE_HIGH:
        return 'uncertain', 'low'

    # Cosine is above the high threshold: the dHash distance decides the
    # replication tier. A missing dHash falls through to style-consistency.
    if min_phash is not None and min_phash <= PHASH_HIGH_CONF:
        return 'high_confidence_replication', 'high'
    if min_phash is not None and min_phash <= PHASH_MOD_CONF:
        return 'moderate_confidence_replication', 'medium'
    return 'high_style_consistency', 'low'
|
|
|
|
|
|
def build_report(pdf_data):
    """Build the recalibrated PDF-level Excel workbook.

    Writes a two-sheet workbook to OUTPUT_PATH:
      * "PDF-Level Report": one row per PDF (identification, CPA info,
        detection stats, cosine/dHash metrics, dual-method verdict).
      * "Summary": verdict counts overall and for Firm A, plus the
        thresholds used.

    Args:
        pdf_data: mapping pdf_key -> {'signatures': [...], 'accountants': set,
            'firms': set} as returned by load_all_data().

    Side effects: saves the workbook and prints a console summary table.
    """
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = "PDF-Level Report"

    # Define columns: (row_data key, header label). Groups A-I drive coloring.
    columns = [
        # Group A: PDF Identification (Blue)
        ('pdf_key', 'PDF Key'),
        ('n_signatures', '# Signatures'),

        # Group B: CPA Info (Green)
        ('accountant_1', 'CPA 1 Name'),
        ('accountant_2', 'CPA 2 Name'),
        ('firm_1', 'Firm 1'),
        ('firm_2', 'Firm 2'),
        ('is_firm_a', 'Is Firm A'),

        # Group C: Detection (Yellow)
        ('avg_detection_conf', 'Avg Detection Conf'),

        # Group D: Cosine Similarity - Sig 1 (Red)
        ('sig1_cosine', 'Sig1 Max Cosine'),
        ('sig1_cosine_verdict', 'Sig1 Cosine Verdict'),
        ('sig1_acct_mean', 'Sig1 CPA Mean Sim'),
        ('sig1_acct_ratio95', 'Sig1 CPA >0.95 Ratio'),
        ('sig1_acct_count', 'Sig1 CPA Sig Count'),

        # Group E: Cosine Similarity - Sig 2 (Purple)
        ('sig2_cosine', 'Sig2 Max Cosine'),
        ('sig2_cosine_verdict', 'Sig2 Cosine Verdict'),
        ('sig2_acct_mean', 'Sig2 CPA Mean Sim'),
        ('sig2_acct_ratio95', 'Sig2 CPA >0.95 Ratio'),
        ('sig2_acct_count', 'Sig2 CPA Sig Count'),

        # Group F: dHash Distance (Orange)
        ('min_phash', 'Min dHash Distance'),
        ('max_phash', 'Max dHash Distance'),
        ('avg_phash', 'Avg dHash Distance'),
        ('sig1_phash', 'Sig1 dHash Distance'),
        ('sig2_phash', 'Sig2 dHash Distance'),

        # Group G: SSIM (for reference only) (Gray)
        ('max_ssim', 'Max SSIM'),
        ('avg_ssim', 'Avg SSIM'),

        # Group H: Dual-Method Classification (Dark Blue)
        ('dual_verdict', 'Dual-Method Verdict'),
        ('dual_confidence', 'Confidence Level'),
        ('max_cosine', 'PDF Max Cosine'),
        ('pdf_min_phash', 'PDF Min dHash'),

        # Group I: CPA Risk (Teal)
        ('sig1_risk', 'Sig1 CPA Risk Level'),
        ('sig2_risk', 'Sig2 CPA Risk Level'),
    ]

    col_keys = [c[0] for c in columns]
    col_names = [c[1] for c in columns]

    # Header styles
    header_fill = PatternFill(start_color='1F4E79', end_color='1F4E79', fill_type='solid')
    header_font = Font(name='Arial', size=9, bold=True, color='FFFFFF')
    data_font = Font(name='Arial', size=9)
    thin_border = Border(
        left=Side(style='thin'),
        right=Side(style='thin'),
        top=Side(style='thin'),
        bottom=Side(style='thin'),
    )

    # Group colors
    group_colors = {
        'A': 'D6E4F0',  # Blue - PDF ID
        'B': 'D9E2D0',  # Green - CPA
        'C': 'FFF2CC',  # Yellow - Detection
        'D': 'F4CCCC',  # Red - Cosine Sig1
        'E': 'E1D5E7',  # Purple - Cosine Sig2
        'F': 'FFE0B2',  # Orange - dHash
        'G': 'E0E0E0',  # Gray - SSIM
        'H': 'B3D4FC',  # Dark Blue - Dual method
        'I': 'B2DFDB',  # Teal - Risk
    }

    # Half-open [start, end) index ranges of col_keys covered by each group.
    group_ranges = {
        'A': (0, 2), 'B': (2, 7), 'C': (7, 8),
        'D': (8, 13), 'E': (13, 18), 'F': (18, 23),
        'G': (23, 25), 'H': (25, 29), 'I': (29, 31),
    }

    # Pre-compute one PatternFill per column. (Previously a new fill object
    # was constructed and the group ranges scanned for every single cell.)
    col_fills = {}
    for group, (start, end) in group_ranges.items():
        fill = PatternFill(start_color=group_colors[group],
                           end_color=group_colors[group],
                           fill_type='solid')
        for i in range(start, end):
            col_fills[i + 1] = fill  # 1-based column index

    # Font colors for the dual-method verdict cell (hoisted out of the loop).
    verdict_font_colors = {
        'high_confidence_replication': 'FF0000',
        'moderate_confidence_replication': 'FF6600',
        'high_style_consistency': '009900',
        'uncertain': 'FF9900',
        'likely_genuine': '006600',
    }

    # Write header
    for col_idx, name in enumerate(col_names, 1):
        cell = ws.cell(row=1, column=col_idx, value=name)
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = Alignment(horizontal='center', wrap_text=True)
        cell.border = thin_border

    def cosine_verdict(cos):
        # Per-signature cosine tier using the same thresholds as the dual
        # method. (Hoisted: was re-defined on every loop iteration.)
        if cos is None:
            return None
        if cos > COSINE_HIGH:
            return 'high'
        if cos > KDE_CROSSOVER:
            return 'uncertain'
        return 'low'

    def rnd(x, nd=4):
        # Round unless missing. Uses "is not None" so a legitimate 0.0 value
        # is kept (the old truthiness checks silently turned 0.0 into None).
        return round(x, nd) if x is not None else None

    # Process PDFs
    row_idx = 2
    verdict_counts = defaultdict(int)
    firm_a_counts = defaultdict(int)

    for pdf_key, pdata in sorted(pdf_data.items()):
        sigs = pdata['signatures']
        if not sigs:
            continue

        # Sort signatures by filename so sig1/sig2 assignment is stable
        # (filenames encode position: ..._sig1.png, ..._sig2.png).
        sigs_sorted = sorted(sigs, key=lambda s: s['filename'])
        sig1 = sigs_sorted[0] if len(sigs_sorted) > 0 else None
        sig2 = sigs_sorted[1] if len(sigs_sorted) > 1 else None

        # Compute PDF-level aggregates over the non-null metrics.
        cosines = [s['cosine'] for s in sigs if s['cosine'] is not None]
        phashes = [s['phash'] for s in sigs if s['phash'] is not None]
        ssims = [s['ssim'] for s in sigs if s['ssim'] is not None]
        confs = [s['detection_conf'] for s in sigs if s['detection_conf'] is not None]

        max_cosine = max(cosines) if cosines else None
        min_phash = min(phashes) if phashes else None
        max_phash = max(phashes) if phashes else None
        avg_phash = np.mean(phashes) if phashes else None
        max_ssim = max(ssims) if ssims else None
        avg_ssim = np.mean(ssims) if ssims else None
        avg_conf = np.mean(confs) if confs else None

        is_firm_a = FIRM_A in pdata['firms']

        # Dual-method classification
        verdict, confidence = classify_dual_method(max_cosine, min_phash)
        verdict_counts[verdict] += 1
        if is_firm_a:
            firm_a_counts[verdict] += 1

        # Build row
        row_data = {
            'pdf_key': pdf_key,
            'n_signatures': len(sigs),
            'accountant_1': sig1['accountant'] if sig1 else None,
            'accountant_2': sig2['accountant'] if sig2 else None,
            'firm_1': sig1['firm'] if sig1 else None,
            'firm_2': sig2['firm'] if sig2 else None,
            'is_firm_a': 'Yes' if is_firm_a else 'No',
            'avg_detection_conf': rnd(avg_conf),
            'sig1_cosine': rnd(sig1['cosine']) if sig1 else None,
            'sig1_cosine_verdict': cosine_verdict(sig1['cosine']) if sig1 else None,
            'sig1_acct_mean': rnd(sig1['acct_mean_sim']) if sig1 else None,
            'sig1_acct_ratio95': rnd(sig1['acct_ratio_95']) if sig1 else None,
            'sig1_acct_count': sig1['acct_sig_count'] if sig1 else None,
            'sig2_cosine': rnd(sig2['cosine']) if sig2 else None,
            'sig2_cosine_verdict': cosine_verdict(sig2['cosine']) if sig2 else None,
            'sig2_acct_mean': rnd(sig2['acct_mean_sim']) if sig2 else None,
            'sig2_acct_ratio95': rnd(sig2['acct_ratio_95']) if sig2 else None,
            'sig2_acct_count': sig2['acct_sig_count'] if sig2 else None,
            'min_phash': min_phash,
            'max_phash': max_phash,
            'avg_phash': rnd(avg_phash, 2),
            'sig1_phash': sig1['phash'] if sig1 else None,
            'sig2_phash': sig2['phash'] if sig2 else None,
            'max_ssim': rnd(max_ssim),
            'avg_ssim': rnd(avg_ssim),
            'dual_verdict': verdict,
            'dual_confidence': confidence,
            'max_cosine': rnd(max_cosine),
            'pdf_min_phash': min_phash,
            'sig1_risk': sig1['risk_level'] if sig1 else None,
            'sig2_risk': sig2['risk_level'] if sig2 else None,
        }

        for col_idx, key in enumerate(col_keys, 1):
            val = row_data.get(key)
            cell = ws.cell(row=row_idx, column=col_idx, value=val)
            cell.font = data_font
            cell.border = thin_border

            # Color by group (precomputed per column above).
            fill = col_fills.get(col_idx)
            if fill is not None:
                cell.fill = fill

            # Highlight the "Is Firm A" cell (column 7) on Firm A rows.
            if is_firm_a and col_idx == 7:
                cell.font = Font(name='Arial', size=9, bold=True, color='CC0000')

            # Color-code the dual-method verdict text.
            if key == 'dual_verdict' and val in verdict_font_colors:
                cell.font = Font(name='Arial', size=9, bold=True,
                                 color=verdict_font_colors[val])

        row_idx += 1

    # Fixed column width for readability (not true auto-sizing).
    for col_idx in range(1, len(col_keys) + 1):
        ws.column_dimensions[openpyxl.utils.get_column_letter(col_idx)].width = 15

    # Freeze header and enable filtering over the written range.
    ws.freeze_panes = 'A2'
    ws.auto_filter.ref = f"A1:{openpyxl.utils.get_column_letter(len(col_keys))}{row_idx-1}"

    # === Summary Sheet ===
    ws2 = wb.create_sheet("Summary")
    ws2.cell(row=1, column=1, value="Dual-Method Classification Summary").font = Font(size=14, bold=True)
    ws2.cell(row=2, column=1, value=f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}")
    ws2.cell(row=3, column=1, value="Calibration: Firm A (dHash median=5, p95=15)")

    ws2.cell(row=5, column=1, value="Verdict").font = Font(bold=True)
    ws2.cell(row=5, column=2, value="Count").font = Font(bold=True)
    ws2.cell(row=5, column=3, value="%").font = Font(bold=True)
    ws2.cell(row=5, column=4, value="Firm A").font = Font(bold=True)
    ws2.cell(row=5, column=5, value="Firm A %").font = Font(bold=True)

    total = sum(verdict_counts.values())
    fa_total = sum(firm_a_counts.values())
    # Display order: most to least severe, then unknown.
    order = ['high_confidence_replication', 'moderate_confidence_replication',
             'high_style_consistency', 'uncertain', 'likely_genuine', 'unknown']

    for i, v in enumerate(order):
        n = verdict_counts.get(v, 0)
        fa = firm_a_counts.get(v, 0)
        ws2.cell(row=6+i, column=1, value=v)
        ws2.cell(row=6+i, column=2, value=n)
        ws2.cell(row=6+i, column=3, value=f"{100*n/total:.1f}%" if total > 0 else "0%")
        ws2.cell(row=6+i, column=4, value=fa)
        ws2.cell(row=6+i, column=5, value=f"{100*fa/fa_total:.1f}%" if fa_total > 0 else "0%")

    ws2.cell(row=6+len(order), column=1, value="Total").font = Font(bold=True)
    ws2.cell(row=6+len(order), column=2, value=total)
    ws2.cell(row=6+len(order), column=4, value=fa_total)

    # Thresholds
    ws2.cell(row=15, column=1, value="Thresholds Used").font = Font(size=12, bold=True)
    ws2.cell(row=16, column=1, value="Cosine high threshold")
    ws2.cell(row=16, column=2, value=COSINE_HIGH)
    ws2.cell(row=17, column=1, value="KDE crossover")
    ws2.cell(row=17, column=2, value=KDE_CROSSOVER)
    ws2.cell(row=18, column=1, value="dHash high-confidence (Firm A median)")
    ws2.cell(row=18, column=2, value=PHASH_HIGH_CONF)
    ws2.cell(row=19, column=1, value="dHash moderate-confidence (Firm A p95)")
    ws2.cell(row=19, column=2, value=PHASH_MOD_CONF)

    for col in range(1, 6):
        ws2.column_dimensions[openpyxl.utils.get_column_letter(col)].width = 30

    # Save
    wb.save(str(OUTPUT_PATH))
    print(f"\nSaved: {OUTPUT_PATH}")
    print(f"Total PDFs: {total:,}")
    print(f"Firm A PDFs: {fa_total:,}")

    # Print summary
    print(f"\n{'Verdict':<35} {'Count':>8} {'%':>7} | {'Firm A':>8} {'%':>7}")
    print("-" * 70)
    for v in order:
        n = verdict_counts.get(v, 0)
        fa = firm_a_counts.get(v, 0)
        if n > 0:
            # Skip the Firm A columns entirely when no Firm A PDFs exist,
            # to avoid dividing by zero.
            if fa_total > 0:
                print(f" {v:<33} {n:>8,} {100*n/total:>6.1f}% | {fa:>8,} {100*fa/fa_total:>6.1f}%")
            else:
                print(f" {v:<33} {n:>8,} {100*n/total:>6.1f}%")
    print("-" * 70)
    print(f" {'Total':<33} {total:>8,} | {fa_total:>8,}")
|
|
|
|
|
|
def main():
    """Entry point: print a banner, load the data, and write the report."""
    banner = "=" * 60
    print(banner)
    print("Generating Recalibrated PDF-Level Report")
    print(f"Calibration: Firm A ({FIRM_A})")
    print("Method: Dual (Cosine + dHash)")
    print(banner)

    build_report(load_all_data())


if __name__ == "__main__":
    main()
|