Add Paper A (IEEE TAI) complete draft with Firm A-calibrated dual-method classification
Paper draft includes all sections (Abstract through Conclusion), 36 references, and supporting scripts. Key methodology: Cosine similarity + dHash dual-method verification with thresholds calibrated against known-replication firm (Firm A). Includes: - 8 section markdown files (paper_a_*.md) - Ablation study script (ResNet-50 vs VGG-16 vs EfficientNet-B0) - Recalibrated classification script (84,386 PDFs, 5-tier system) - Figure generation and Word export scripts - Citation renumbering script ([1]-[36]) - Signature analysis pipeline (12 steps) - YOLO extraction scripts Three rounds of AI review completed (GPT-5.4, Claude Opus 4.6, Gemini 3 Pro). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,413 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Generate complete PDF-level Excel report with Firm A-calibrated dual-method classification.
|
||||
Output: One row per PDF with identification, CPA info, detection stats,
|
||||
cosine similarity, dHash distance, and new dual-method verdicts.
|
||||
"""
|
||||
|
||||
import sqlite3
|
||||
import numpy as np
|
||||
import openpyxl
|
||||
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
# Input: SQLite database produced by the upstream signature-analysis pipeline.
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
# Output: recalibrated PDF-level workbook (directory created on import).
OUTPUT_DIR = Path('/Volumes/NV2/PDF-Processing/signature-analysis/recalibrated')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_PATH = OUTPUT_DIR / 'pdf_level_recalibrated_report.xlsx'

# Known-replication reference firm ("Firm A") used to calibrate thresholds.
FIRM_A = '勤業眾信聯合'
# Cosine-similarity thresholds: KDE crossover separates genuine/uncertain,
# COSINE_HIGH flags high style consistency.
KDE_CROSSOVER = 0.837
COSINE_HIGH = 0.95
# Hash-distance thresholds (per summary sheet: Firm A median=5, p95=15).
# NOTE(review): names say "PHASH" but every report label calls this dHash —
# confirm which perceptual hash the pipeline actually computes.
PHASH_HIGH_CONF = 5
PHASH_MOD_CONF = 15
|
||||
|
||||
|
||||
def load_all_data():
    """Load per-signature stats from SQLite and group them by source PDF.

    Returns:
        dict mapping pdf_key -> {'signatures': [per-signature dicts],
        'accountants': set of CPA names, 'firms': set of firm names}.
        Only signatures with an assigned accountant are included.

    Reads DB_PATH; prints a one-line load summary to stdout.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cur = conn.cursor()
        # Single query: signature stats joined with per-accountant aggregates,
        # plus the YOLO detection confidence. (Previously a second full-table
        # scan built a filename -> confidence dict that was joined in Python;
        # selecting s.detection_confidence here yields the same values.)
        cur.execute('''
            SELECT s.signature_id, s.image_filename, s.assigned_accountant,
                   s.max_similarity_to_same_accountant,
                   s.phash_distance_to_closest,
                   s.ssim_to_closest,
                   s.signature_verdict,
                   a.firm, a.risk_level, a.mean_similarity, a.ratio_gt_95,
                   a.signature_count,
                   s.detection_confidence
            FROM signatures s
            LEFT JOIN accountants a ON s.assigned_accountant = a.name
            WHERE s.assigned_accountant IS NOT NULL
        ''')
        rows = cur.fetchall()
    finally:
        # Close even if the query fails; nothing was written, so no commit.
        conn.close()

    # Group by PDF
    pdf_data = defaultdict(lambda: {
        'signatures': [],
        'accountants': set(),
        'firms': set(),
    })

    for r in rows:
        (sig_id, filename, accountant, cosine, phash, ssim, verdict,
         firm, risk, mean_sim, ratio95, sig_count, det_conf) = r

        # Extract PDF key from filename by stripping the trailing
        # _sig{M} and _page{N} suffixes.
        # Format: {company}_{year}_{type}_page{N}_sig{M}.png or similar
        parts = filename.rsplit('_sig', 1)
        pdf_key = parts[0] if len(parts) > 1 else filename.rsplit('.', 1)[0]
        page_parts = pdf_key.rsplit('_page', 1)
        pdf_key = page_parts[0] if len(page_parts) > 1 else pdf_key

        pdf_data[pdf_key]['signatures'].append({
            'sig_id': sig_id,
            'filename': filename,
            'accountant': accountant,
            'cosine': cosine,
            'phash': phash,
            'ssim': ssim,
            'old_verdict': verdict,
            'firm': firm,
            'risk_level': risk,
            'acct_mean_sim': mean_sim,
            'acct_ratio_95': ratio95,
            'acct_sig_count': sig_count,
            'detection_conf': det_conf,
        })
        if accountant:
            pdf_data[pdf_key]['accountants'].add(accountant)
        if firm:
            pdf_data[pdf_key]['firms'].add(firm)

    print(f"Loaded {sum(len(v['signatures']) for v in pdf_data.values()):,} signatures across {len(pdf_data):,} PDFs")
    return pdf_data
|
||||
|
||||
|
||||
def classify_dual_method(max_cosine, min_phash, *,
                         cosine_high=0.95, kde_crossover=0.837,
                         phash_high=5, phash_mod=15):
    """Dual-method verdict from deep-feature cosine similarity + hash distance.

    Thresholds were hard-coded; they are now keyword-only parameters (defaults
    mirror the Firm A calibration constants COSINE_HIGH, KDE_CROSSOVER,
    PHASH_HIGH_CONF, PHASH_MOD_CONF) so sensitivity analyses can override them
    without changing callers.

    Args:
        max_cosine: PDF-level max cosine similarity, or None when unavailable.
        min_phash: PDF-level min hash (Hamming) distance, or None.
        cosine_high: cosine above which style consistency is flagged.
        kde_crossover: cosine above which the case is at least 'uncertain'.
        phash_high: hash distance at/below which replication is high-confidence.
        phash_mod: hash distance at/below which replication is moderate.

    Returns:
        (verdict, confidence) tuple of strings, e.g.
        ('high_confidence_replication', 'high') or ('unknown', 'none').
    """
    # No cosine score at all -> nothing to classify.
    if max_cosine is None:
        return 'unknown', 'none'

    if max_cosine > cosine_high:
        # Cosine alone only flags style consistency; a small hash distance
        # corroborates pixel-level replication.
        if min_phash is not None and min_phash <= phash_high:
            return 'high_confidence_replication', 'high'
        if min_phash is not None and min_phash <= phash_mod:
            return 'moderate_confidence_replication', 'medium'
        return 'high_style_consistency', 'low'
    if max_cosine > kde_crossover:
        return 'uncertain', 'low'
    return 'likely_genuine', 'medium'
|
||||
|
||||
|
||||
def build_report(pdf_data):
    """Build the two-sheet Excel report (per-PDF detail + summary).

    Args:
        pdf_data: mapping pdf_key -> {'signatures': [...], 'accountants': set,
            'firms': set} as produced by load_all_data().

    Side effects: writes the workbook to OUTPUT_PATH and prints a verdict
    summary table to stdout.
    """
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = "PDF-Level Report"

    # Define columns.
    # Each entry is (row_data key, header label); list order defines column
    # order and must stay aligned with group_ranges below.
    columns = [
        # Group A: PDF Identification (Blue)
        ('pdf_key', 'PDF Key'),
        ('n_signatures', '# Signatures'),

        # Group B: CPA Info (Green)
        ('accountant_1', 'CPA 1 Name'),
        ('accountant_2', 'CPA 2 Name'),
        ('firm_1', 'Firm 1'),
        ('firm_2', 'Firm 2'),
        ('is_firm_a', 'Is Firm A'),

        # Group C: Detection (Yellow)
        ('avg_detection_conf', 'Avg Detection Conf'),

        # Group D: Cosine Similarity - Sig 1 (Red)
        ('sig1_cosine', 'Sig1 Max Cosine'),
        ('sig1_cosine_verdict', 'Sig1 Cosine Verdict'),
        ('sig1_acct_mean', 'Sig1 CPA Mean Sim'),
        ('sig1_acct_ratio95', 'Sig1 CPA >0.95 Ratio'),
        ('sig1_acct_count', 'Sig1 CPA Sig Count'),

        # Group E: Cosine Similarity - Sig 2 (Purple)
        ('sig2_cosine', 'Sig2 Max Cosine'),
        ('sig2_cosine_verdict', 'Sig2 Cosine Verdict'),
        ('sig2_acct_mean', 'Sig2 CPA Mean Sim'),
        ('sig2_acct_ratio95', 'Sig2 CPA >0.95 Ratio'),
        ('sig2_acct_count', 'Sig2 CPA Sig Count'),

        # Group F: dHash Distance (Orange)
        ('min_phash', 'Min dHash Distance'),
        ('max_phash', 'Max dHash Distance'),
        ('avg_phash', 'Avg dHash Distance'),
        ('sig1_phash', 'Sig1 dHash Distance'),
        ('sig2_phash', 'Sig2 dHash Distance'),

        # Group G: SSIM (for reference only) (Gray)
        ('max_ssim', 'Max SSIM'),
        ('avg_ssim', 'Avg SSIM'),

        # Group H: Dual-Method Classification (Dark Blue)
        ('dual_verdict', 'Dual-Method Verdict'),
        ('dual_confidence', 'Confidence Level'),
        ('max_cosine', 'PDF Max Cosine'),
        ('pdf_min_phash', 'PDF Min dHash'),

        # Group I: CPA Risk (Teal)
        ('sig1_risk', 'Sig1 CPA Risk Level'),
        ('sig2_risk', 'Sig2 CPA Risk Level'),
    ]

    col_keys = [c[0] for c in columns]
    col_names = [c[1] for c in columns]

    # Header styles
    header_fill = PatternFill(start_color='1F4E79', end_color='1F4E79', fill_type='solid')
    header_font = Font(name='Arial', size=9, bold=True, color='FFFFFF')
    data_font = Font(name='Arial', size=9)
    thin_border = Border(
        left=Side(style='thin'),
        right=Side(style='thin'),
        top=Side(style='thin'),
        bottom=Side(style='thin'),
    )

    # Group colors
    group_colors = {
        'A': 'D6E4F0', # Blue - PDF ID
        'B': 'D9E2D0', # Green - CPA
        'C': 'FFF2CC', # Yellow - Detection
        'D': 'F4CCCC', # Red - Cosine Sig1
        'E': 'E1D5E7', # Purple - Cosine Sig2
        'F': 'FFE0B2', # Orange - dHash
        'G': 'E0E0E0', # Gray - SSIM
        'H': 'B3D4FC', # Dark Blue - Dual method
        'I': 'B2DFDB', # Teal - Risk
    }

    # Half-open [start, end) ranges of 0-based column indices per group;
    # must be updated by hand if columns are added or removed above.
    group_ranges = {
        'A': (0, 2), 'B': (2, 7), 'C': (7, 8),
        'D': (8, 13), 'E': (13, 18), 'F': (18, 23),
        'G': (23, 25), 'H': (25, 29), 'I': (29, 31),
    }

    # Write header
    for col_idx, name in enumerate(col_names, 1):
        cell = ws.cell(row=1, column=col_idx, value=name)
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = Alignment(horizontal='center', wrap_text=True)
        cell.border = thin_border

    # Process PDFs
    row_idx = 2  # data rows start below the header row
    verdict_counts = defaultdict(int)  # verdict -> PDF count (all firms)
    firm_a_counts = defaultdict(int)   # verdict -> PDF count (Firm A only)

    for pdf_key, pdata in sorted(pdf_data.items()):
        sigs = pdata['signatures']
        if not sigs:
            continue

        # Sort signatures by position (sig1, sig2): lexicographic filename
        # order puts ..._sig1 before ..._sig2 within the same PDF.
        sigs_sorted = sorted(sigs, key=lambda s: s['filename'])
        sig1 = sigs_sorted[0] if len(sigs_sorted) > 0 else None
        sig2 = sigs_sorted[1] if len(sigs_sorted) > 1 else None

        # Compute PDF-level aggregates, skipping NULL metrics.
        cosines = [s['cosine'] for s in sigs if s['cosine'] is not None]
        phashes = [s['phash'] for s in sigs if s['phash'] is not None]
        ssims = [s['ssim'] for s in sigs if s['ssim'] is not None]
        confs = [s['detection_conf'] for s in sigs if s['detection_conf'] is not None]

        max_cosine = max(cosines) if cosines else None
        min_phash = min(phashes) if phashes else None
        max_phash = max(phashes) if phashes else None
        avg_phash = np.mean(phashes) if phashes else None
        max_ssim = max(ssims) if ssims else None
        avg_ssim = np.mean(ssims) if ssims else None
        avg_conf = np.mean(confs) if confs else None

        # A PDF counts as Firm A if any signing CPA belongs to that firm.
        is_firm_a = FIRM_A in pdata['firms']

        # Dual-method classification (worst-case pair: max cosine, min dHash).
        verdict, confidence = classify_dual_method(max_cosine, min_phash)
        verdict_counts[verdict] += 1
        if is_firm_a:
            firm_a_counts[verdict] += 1

        # Cosine verdicts per signature: three-tier bucket using the same
        # calibrated thresholds as the dual-method classifier.
        def cosine_verdict(cos):
            if cos is None: return None
            if cos > COSINE_HIGH: return 'high'
            if cos > KDE_CROSSOVER: return 'uncertain'
            return 'low'

        # Build row.
        # NOTE(review): the truthiness guards (`if avg_conf`, `if sig1 and
        # sig1['cosine']`, ...) render an exact 0.0 metric as blank —
        # presumably these metrics are never exactly 0, but confirm.
        row_data = {
            'pdf_key': pdf_key,
            'n_signatures': len(sigs),
            'accountant_1': sig1['accountant'] if sig1 else None,
            'accountant_2': sig2['accountant'] if sig2 else None,
            'firm_1': sig1['firm'] if sig1 else None,
            'firm_2': sig2['firm'] if sig2 else None,
            'is_firm_a': 'Yes' if is_firm_a else 'No',
            'avg_detection_conf': round(avg_conf, 4) if avg_conf else None,
            'sig1_cosine': round(sig1['cosine'], 4) if sig1 and sig1['cosine'] else None,
            'sig1_cosine_verdict': cosine_verdict(sig1['cosine']) if sig1 else None,
            'sig1_acct_mean': round(sig1['acct_mean_sim'], 4) if sig1 and sig1['acct_mean_sim'] else None,
            'sig1_acct_ratio95': round(sig1['acct_ratio_95'], 4) if sig1 and sig1['acct_ratio_95'] else None,
            'sig1_acct_count': sig1['acct_sig_count'] if sig1 else None,
            'sig2_cosine': round(sig2['cosine'], 4) if sig2 and sig2['cosine'] else None,
            'sig2_cosine_verdict': cosine_verdict(sig2['cosine']) if sig2 else None,
            'sig2_acct_mean': round(sig2['acct_mean_sim'], 4) if sig2 and sig2['acct_mean_sim'] else None,
            'sig2_acct_ratio95': round(sig2['acct_ratio_95'], 4) if sig2 and sig2['acct_ratio_95'] else None,
            'sig2_acct_count': sig2['acct_sig_count'] if sig2 else None,
            'min_phash': min_phash,
            'max_phash': max_phash,
            'avg_phash': round(avg_phash, 2) if avg_phash is not None else None,
            'sig1_phash': sig1['phash'] if sig1 else None,
            'sig2_phash': sig2['phash'] if sig2 else None,
            'max_ssim': round(max_ssim, 4) if max_ssim is not None else None,
            'avg_ssim': round(avg_ssim, 4) if avg_ssim is not None else None,
            'dual_verdict': verdict,
            'dual_confidence': confidence,
            'max_cosine': round(max_cosine, 4) if max_cosine is not None else None,
            'pdf_min_phash': min_phash,
            'sig1_risk': sig1['risk_level'] if sig1 else None,
            'sig2_risk': sig2['risk_level'] if sig2 else None,
        }

        for col_idx, key in enumerate(col_keys, 1):
            val = row_data.get(key)
            cell = ws.cell(row=row_idx, column=col_idx, value=val)
            cell.font = data_font
            cell.border = thin_border

            # Color by group (find which group range this column falls in).
            for group, (start, end) in group_ranges.items():
                if start <= col_idx - 1 < end:
                    cell.fill = PatternFill(start_color=group_colors[group],
                                            end_color=group_colors[group],
                                            fill_type='solid')
                    break

            # Highlight Firm A rows: column 7 is the 'Is Firm A' flag cell.
            if is_firm_a and col_idx == 7:
                cell.font = Font(name='Arial', size=9, bold=True, color='CC0000')

            # Color verdicts (red = replication ... green = genuine).
            if key == 'dual_verdict':
                colors = {
                    'high_confidence_replication': 'FF0000',
                    'moderate_confidence_replication': 'FF6600',
                    'high_style_consistency': '009900',
                    'uncertain': 'FF9900',
                    'likely_genuine': '006600',
                }
                if val in colors:
                    cell.font = Font(name='Arial', size=9, bold=True, color=colors[val])

        row_idx += 1

    # Auto-width (fixed 15-char width for every data column)
    for col_idx in range(1, len(col_keys) + 1):
        ws.column_dimensions[openpyxl.utils.get_column_letter(col_idx)].width = 15

    # Freeze header and enable filtering over the written data range.
    ws.freeze_panes = 'A2'
    ws.auto_filter.ref = f"A1:{openpyxl.utils.get_column_letter(len(col_keys))}{row_idx-1}"

    # === Summary Sheet ===
    ws2 = wb.create_sheet("Summary")
    ws2.cell(row=1, column=1, value="Dual-Method Classification Summary").font = Font(size=14, bold=True)
    ws2.cell(row=2, column=1, value=f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}")
    ws2.cell(row=3, column=1, value=f"Calibration: Firm A (dHash median=5, p95=15)")

    ws2.cell(row=5, column=1, value="Verdict").font = Font(bold=True)
    ws2.cell(row=5, column=2, value="Count").font = Font(bold=True)
    ws2.cell(row=5, column=3, value="%").font = Font(bold=True)
    ws2.cell(row=5, column=4, value="Firm A").font = Font(bold=True)
    ws2.cell(row=5, column=5, value="Firm A %").font = Font(bold=True)

    total = sum(verdict_counts.values())
    fa_total = sum(firm_a_counts.values())
    # Display order: most to least severe verdict.
    order = ['high_confidence_replication', 'moderate_confidence_replication',
             'high_style_consistency', 'uncertain', 'likely_genuine', 'unknown']

    for i, v in enumerate(order):
        n = verdict_counts.get(v, 0)
        fa = firm_a_counts.get(v, 0)
        ws2.cell(row=6+i, column=1, value=v)
        ws2.cell(row=6+i, column=2, value=n)
        ws2.cell(row=6+i, column=3, value=f"{100*n/total:.1f}%" if total > 0 else "0%")
        ws2.cell(row=6+i, column=4, value=fa)
        ws2.cell(row=6+i, column=5, value=f"{100*fa/fa_total:.1f}%" if fa_total > 0 else "0%")

    ws2.cell(row=6+len(order), column=1, value="Total").font = Font(bold=True)
    ws2.cell(row=6+len(order), column=2, value=total)
    ws2.cell(row=6+len(order), column=4, value=fa_total)

    # Thresholds (echo the calibration constants used for this run)
    ws2.cell(row=15, column=1, value="Thresholds Used").font = Font(size=12, bold=True)
    ws2.cell(row=16, column=1, value="Cosine high threshold")
    ws2.cell(row=16, column=2, value=COSINE_HIGH)
    ws2.cell(row=17, column=1, value="KDE crossover")
    ws2.cell(row=17, column=2, value=KDE_CROSSOVER)
    ws2.cell(row=18, column=1, value="dHash high-confidence (Firm A median)")
    ws2.cell(row=18, column=2, value=PHASH_HIGH_CONF)
    ws2.cell(row=19, column=1, value="dHash moderate-confidence (Firm A p95)")
    ws2.cell(row=19, column=2, value=PHASH_MOD_CONF)

    for col in range(1, 6):
        ws2.column_dimensions[openpyxl.utils.get_column_letter(col)].width = 30

    # Save
    wb.save(str(OUTPUT_PATH))
    print(f"\nSaved: {OUTPUT_PATH}")
    print(f"Total PDFs: {total:,}")
    print(f"Firm A PDFs: {fa_total:,}")

    # Print summary (mirrors the Summary sheet; Firm A columns only shown
    # when at least one Firm A PDF was classified)
    print(f"\n{'Verdict':<35} {'Count':>8} {'%':>7} | {'Firm A':>8} {'%':>7}")
    print("-" * 70)
    for v in order:
        n = verdict_counts.get(v, 0)
        fa = firm_a_counts.get(v, 0)
        if n > 0:
            print(f" {v:<33} {n:>8,} {100*n/total:>6.1f}% | {fa:>8,} {100*fa/fa_total:>6.1f}%"
                  if fa_total > 0 else f" {v:<33} {n:>8,} {100*n/total:>6.1f}%")
    print("-" * 70)
    print(f" {'Total':<33} {total:>8,} | {fa_total:>8,}")
|
||||
|
||||
|
||||
def main():
    """Entry point: print a run banner, load signature data, write the report."""
    banner = "=" * 60
    header_lines = (
        banner,
        "Generating Recalibrated PDF-Level Report",
        f"Calibration: Firm A ({FIRM_A})",
        f"Method: Dual (Cosine + dHash)",
        banner,
    )
    for line in header_lines:
        print(line)

    build_report(load_all_data())
|
||||
|
||||
|
||||
# Run the full pipeline only when executed as a script (not on import).
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user