Files
pdf_signature_extraction/paper/generate_recalibrated_report.py
T
gbanyan 939a348da4 Add Paper A (IEEE TAI) complete draft with Firm A-calibrated dual-method classification
Paper draft includes all sections (Abstract through Conclusion), 36 references,
and supporting scripts. Key methodology: Cosine similarity + dHash dual-method
verification with thresholds calibrated against known-replication firm (Firm A).

Includes:
- 8 section markdown files (paper_a_*.md)
- Ablation study script (ResNet-50 vs VGG-16 vs EfficientNet-B0)
- Recalibrated classification script (84,386 PDFs, 5-tier system)
- Figure generation and Word export scripts
- Citation renumbering script ([1]-[36])
- Signature analysis pipeline (12 steps)
- YOLO extraction scripts

Three rounds of AI review completed (GPT-5.4, Claude Opus 4.6, Gemini 3 Pro).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-06 23:05:33 +08:00

414 lines
16 KiB
Python

#!/usr/bin/env python3
"""
Generate complete PDF-level Excel report with Firm A-calibrated dual-method classification.
Output: One row per PDF with identification, CPA info, detection stats,
cosine similarity, dHash distance, and new dual-method verdicts.
"""
import sqlite3
import numpy as np
import openpyxl
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
from collections import defaultdict
from pathlib import Path
from datetime import datetime
# SQLite database produced by the upstream signature-analysis pipeline.
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
# Destination directory and workbook for the recalibrated report.
# NOTE(review): mkdir runs as an import-time side effect — acceptable for a
# one-off report script, but worth knowing before importing this module.
OUTPUT_DIR = Path('/Volumes/NV2/PDF-Processing/signature-analysis/recalibrated')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_PATH = OUTPUT_DIR / 'pdf_level_recalibrated_report.xlsx'
# Known-replication reference firm used to calibrate the dHash thresholds.
FIRM_A = '勤業眾信聯合'
# Cosine-similarity decision points: values above COSINE_HIGH enter the
# replication tiers; KDE_CROSSOVER separates "uncertain" from "likely genuine".
KDE_CROSSOVER = 0.837
COSINE_HIGH = 0.95
# dHash distance cutoffs calibrated on Firm A (median=5, p95=15 — see the
# Summary sheet written by build_report).
PHASH_HIGH_CONF = 5
PHASH_MOD_CONF = 15
def _pdf_key_from_filename(filename):
    """Derive the parent-PDF key from a signature image filename.

    Filenames follow '{company}_{year}_{type}_page{N}_sig{M}.png' (or
    similar): strip the trailing '_sig{M}' then '_page{N}' parts to recover
    the PDF identifier.  Falls back to the extension-stripped filename when
    the '_sig' pattern is absent.
    """
    parts = filename.rsplit('_sig', 1)
    pdf_key = parts[0] if len(parts) > 1 else filename.rsplit('.', 1)[0]
    page_parts = pdf_key.rsplit('_page', 1)
    return page_parts[0] if len(page_parts) > 1 else pdf_key


def load_all_data(db_path=None):
    """Load all signature data grouped by PDF.

    Args:
        db_path: optional SQLite database path; defaults to the module-level
            DB_PATH (kept for backward compatibility with existing callers).

    Returns:
        dict mapping pdf_key -> {'signatures': [per-signature dicts],
        'accountants': set of CPA names, 'firms': set of firm names}.
    """
    if db_path is None:
        db_path = DB_PATH
    conn = sqlite3.connect(db_path)
    try:
        cur = conn.cursor()
        # Single query: per-signature stats, the joined accountant profile,
        # and the YOLO detection confidence.  (The original issued a second
        # full-table scan of `signatures` just to build a
        # filename -> detection_confidence dict.)
        cur.execute('''
        SELECT s.signature_id, s.image_filename, s.assigned_accountant,
        s.max_similarity_to_same_accountant,
        s.phash_distance_to_closest,
        s.ssim_to_closest,
        s.signature_verdict,
        a.firm, a.risk_level, a.mean_similarity, a.ratio_gt_95,
        a.signature_count,
        s.detection_confidence
        FROM signatures s
        LEFT JOIN accountants a ON s.assigned_accountant = a.name
        WHERE s.assigned_accountant IS NOT NULL
        ''')
        rows = cur.fetchall()
    finally:
        # Ensure the connection is released even if the query fails.
        conn.close()

    # Group per-signature rows under their parent PDF.
    pdf_data = defaultdict(lambda: {
        'signatures': [],
        'accountants': set(),
        'firms': set(),
    })
    for (sig_id, filename, accountant, cosine, phash, ssim, verdict,
         firm, risk, mean_sim, ratio95, sig_count, det_conf) in rows:
        pdf_key = _pdf_key_from_filename(filename)
        pdf_data[pdf_key]['signatures'].append({
            'sig_id': sig_id,
            'filename': filename,
            'accountant': accountant,
            'cosine': cosine,
            'phash': phash,
            'ssim': ssim,
            'old_verdict': verdict,
            'firm': firm,
            'risk_level': risk,
            'acct_mean_sim': mean_sim,
            'acct_ratio_95': ratio95,
            'acct_sig_count': sig_count,
            'detection_conf': det_conf,
        })
        if accountant:
            pdf_data[pdf_key]['accountants'].add(accountant)
        if firm:
            pdf_data[pdf_key]['firms'].add(firm)
    print(f"Loaded {sum(len(v['signatures']) for v in pdf_data.values()):,} signatures across {len(pdf_data):,} PDFs")
    return pdf_data
def classify_dual_method(max_cosine, min_phash, *,
                         cosine_high=None, kde_crossover=None,
                         phash_high=None, phash_mod=None):
    """Dual-method classification with Firm A-calibrated thresholds.

    Args:
        max_cosine: the PDF's maximum cosine similarity (or None if no
            signatures had a similarity value).
        min_phash: the PDF's minimum dHash distance (or None).
        cosine_high / kde_crossover / phash_high / phash_mod: optional
            threshold overrides; each defaults to the corresponding
            module-level Firm A-calibrated constant when left as None.

    Returns:
        (verdict, confidence) string pair, e.g.
        ('high_confidence_replication', 'high').
    """
    # Resolve overrides lazily so existing callers keep the module defaults.
    if cosine_high is None:
        cosine_high = COSINE_HIGH
    if kde_crossover is None:
        kde_crossover = KDE_CROSSOVER
    if phash_high is None:
        phash_high = PHASH_HIGH_CONF
    if phash_mod is None:
        phash_mod = PHASH_MOD_CONF
    if max_cosine is None:
        return 'unknown', 'none'
    if max_cosine > cosine_high:
        # Cosine says "same style"; dHash distance grades pixel-level
        # similarity into replication confidence tiers.
        if min_phash is not None and min_phash <= phash_high:
            return 'high_confidence_replication', 'high'
        if min_phash is not None and min_phash <= phash_mod:
            return 'moderate_confidence_replication', 'medium'
        return 'high_style_consistency', 'low'
    if max_cosine > kde_crossover:
        return 'uncertain', 'low'
    return 'likely_genuine', 'medium'
def build_report(pdf_data):
    """Build the two-sheet Excel report, save it to OUTPUT_PATH, and print
    a verdict summary to stdout.

    Sheet 1 ("PDF-Level Report"): one color-grouped row per PDF with PDF
    identification, CPA info, detection stats, cosine/dHash/SSIM aggregates
    and the dual-method verdict.  Sheet 2 ("Summary"): verdict counts
    (overall and Firm A-only) plus the calibrated thresholds.

    Args:
        pdf_data: mapping produced by load_all_data().
    """
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = "PDF-Level Report"
    # (row_data key, header label) pairs.  Order matters: group_ranges below
    # colors columns by their 0-based index into this list.
    columns = [
        # Group A: PDF Identification (Blue)
        ('pdf_key', 'PDF Key'),
        ('n_signatures', '# Signatures'),
        # Group B: CPA Info (Green)
        ('accountant_1', 'CPA 1 Name'),
        ('accountant_2', 'CPA 2 Name'),
        ('firm_1', 'Firm 1'),
        ('firm_2', 'Firm 2'),
        ('is_firm_a', 'Is Firm A'),
        # Group C: Detection (Yellow)
        ('avg_detection_conf', 'Avg Detection Conf'),
        # Group D: Cosine Similarity - Sig 1 (Red)
        ('sig1_cosine', 'Sig1 Max Cosine'),
        ('sig1_cosine_verdict', 'Sig1 Cosine Verdict'),
        ('sig1_acct_mean', 'Sig1 CPA Mean Sim'),
        ('sig1_acct_ratio95', 'Sig1 CPA >0.95 Ratio'),
        ('sig1_acct_count', 'Sig1 CPA Sig Count'),
        # Group E: Cosine Similarity - Sig 2 (Purple)
        ('sig2_cosine', 'Sig2 Max Cosine'),
        ('sig2_cosine_verdict', 'Sig2 Cosine Verdict'),
        ('sig2_acct_mean', 'Sig2 CPA Mean Sim'),
        ('sig2_acct_ratio95', 'Sig2 CPA >0.95 Ratio'),
        ('sig2_acct_count', 'Sig2 CPA Sig Count'),
        # Group F: dHash Distance (Orange)
        ('min_phash', 'Min dHash Distance'),
        ('max_phash', 'Max dHash Distance'),
        ('avg_phash', 'Avg dHash Distance'),
        ('sig1_phash', 'Sig1 dHash Distance'),
        ('sig2_phash', 'Sig2 dHash Distance'),
        # Group G: SSIM (for reference only) (Gray)
        ('max_ssim', 'Max SSIM'),
        ('avg_ssim', 'Avg SSIM'),
        # Group H: Dual-Method Classification (Dark Blue)
        ('dual_verdict', 'Dual-Method Verdict'),
        ('dual_confidence', 'Confidence Level'),
        ('max_cosine', 'PDF Max Cosine'),
        ('pdf_min_phash', 'PDF Min dHash'),
        # Group I: CPA Risk (Teal)
        ('sig1_risk', 'Sig1 CPA Risk Level'),
        ('sig2_risk', 'Sig2 CPA Risk Level'),
    ]
    col_keys = [c[0] for c in columns]
    col_names = [c[1] for c in columns]
    # Header styles
    header_fill = PatternFill(start_color='1F4E79', end_color='1F4E79', fill_type='solid')
    header_font = Font(name='Arial', size=9, bold=True, color='FFFFFF')
    data_font = Font(name='Arial', size=9)
    thin_border = Border(
        left=Side(style='thin'),
        right=Side(style='thin'),
        top=Side(style='thin'),
        bottom=Side(style='thin'),
    )
    # Background fill per column group (letters match the comments above).
    group_colors = {
        'A': 'D6E4F0',  # Blue - PDF ID
        'B': 'D9E2D0',  # Green - CPA
        'C': 'FFF2CC',  # Yellow - Detection
        'D': 'F4CCCC',  # Red - Cosine Sig1
        'E': 'E1D5E7',  # Purple - Cosine Sig2
        'F': 'FFE0B2',  # Orange - dHash
        'G': 'E0E0E0',  # Gray - SSIM
        'H': 'B3D4FC',  # Dark Blue - Dual method
        'I': 'B2DFDB',  # Teal - Risk
    }
    # Half-open [start, end) 0-based column-index ranges for each group.
    group_ranges = {
        'A': (0, 2), 'B': (2, 7), 'C': (7, 8),
        'D': (8, 13), 'E': (13, 18), 'F': (18, 23),
        'G': (23, 25), 'H': (25, 29), 'I': (29, 31),
    }
    # Write header row
    for col_idx, name in enumerate(col_names, 1):
        cell = ws.cell(row=1, column=col_idx, value=name)
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = Alignment(horizontal='center', wrap_text=True)
        cell.border = thin_border

    # Per-signature cosine verdict.  Hoisted out of the PDF loop — the
    # original re-defined this closure on every iteration.
    def cosine_verdict(cos):
        """Tier a single cosine value: 'high' / 'uncertain' / 'low'."""
        if cos is None:
            return None
        if cos > COSINE_HIGH:
            return 'high'
        if cos > KDE_CROSSOVER:
            return 'uncertain'
        return 'low'

    # Process PDFs
    row_idx = 2
    verdict_counts = defaultdict(int)
    firm_a_counts = defaultdict(int)
    for pdf_key, pdata in sorted(pdf_data.items()):
        sigs = pdata['signatures']
        if not sigs:
            continue
        # Sort signatures by filename so sig1/sig2 follow page/sig order.
        sigs_sorted = sorted(sigs, key=lambda s: s['filename'])
        sig1 = sigs_sorted[0] if len(sigs_sorted) > 0 else None
        sig2 = sigs_sorted[1] if len(sigs_sorted) > 1 else None
        # PDF-level aggregates over the non-null per-signature stats.
        cosines = [s['cosine'] for s in sigs if s['cosine'] is not None]
        phashes = [s['phash'] for s in sigs if s['phash'] is not None]
        ssims = [s['ssim'] for s in sigs if s['ssim'] is not None]
        confs = [s['detection_conf'] for s in sigs if s['detection_conf'] is not None]
        max_cosine = max(cosines) if cosines else None
        min_phash = min(phashes) if phashes else None
        max_phash = max(phashes) if phashes else None
        avg_phash = np.mean(phashes) if phashes else None
        max_ssim = max(ssims) if ssims else None
        avg_ssim = np.mean(ssims) if ssims else None
        avg_conf = np.mean(confs) if confs else None
        is_firm_a = FIRM_A in pdata['firms']
        # Dual-method classification
        verdict, confidence = classify_dual_method(max_cosine, min_phash)
        verdict_counts[verdict] += 1
        if is_firm_a:
            firm_a_counts[verdict] += 1
        # Build row.  BUG FIX: numeric fields now test `is not None` instead
        # of truthiness, so a legitimate 0.0 (cosine, mean, ratio, detection
        # confidence) is no longer silently reported as blank.
        row_data = {
            'pdf_key': pdf_key,
            'n_signatures': len(sigs),
            'accountant_1': sig1['accountant'] if sig1 else None,
            'accountant_2': sig2['accountant'] if sig2 else None,
            'firm_1': sig1['firm'] if sig1 else None,
            'firm_2': sig2['firm'] if sig2 else None,
            'is_firm_a': 'Yes' if is_firm_a else 'No',
            'avg_detection_conf': round(avg_conf, 4) if avg_conf is not None else None,
            'sig1_cosine': round(sig1['cosine'], 4) if sig1 and sig1['cosine'] is not None else None,
            'sig1_cosine_verdict': cosine_verdict(sig1['cosine']) if sig1 else None,
            'sig1_acct_mean': round(sig1['acct_mean_sim'], 4) if sig1 and sig1['acct_mean_sim'] is not None else None,
            'sig1_acct_ratio95': round(sig1['acct_ratio_95'], 4) if sig1 and sig1['acct_ratio_95'] is not None else None,
            'sig1_acct_count': sig1['acct_sig_count'] if sig1 else None,
            'sig2_cosine': round(sig2['cosine'], 4) if sig2 and sig2['cosine'] is not None else None,
            'sig2_cosine_verdict': cosine_verdict(sig2['cosine']) if sig2 else None,
            'sig2_acct_mean': round(sig2['acct_mean_sim'], 4) if sig2 and sig2['acct_mean_sim'] is not None else None,
            'sig2_acct_ratio95': round(sig2['acct_ratio_95'], 4) if sig2 and sig2['acct_ratio_95'] is not None else None,
            'sig2_acct_count': sig2['acct_sig_count'] if sig2 else None,
            'min_phash': min_phash,
            'max_phash': max_phash,
            'avg_phash': round(avg_phash, 2) if avg_phash is not None else None,
            'sig1_phash': sig1['phash'] if sig1 else None,
            'sig2_phash': sig2['phash'] if sig2 else None,
            'max_ssim': round(max_ssim, 4) if max_ssim is not None else None,
            'avg_ssim': round(avg_ssim, 4) if avg_ssim is not None else None,
            'dual_verdict': verdict,
            'dual_confidence': confidence,
            'max_cosine': round(max_cosine, 4) if max_cosine is not None else None,
            'pdf_min_phash': min_phash,
            'sig1_risk': sig1['risk_level'] if sig1 else None,
            'sig2_risk': sig2['risk_level'] if sig2 else None,
        }
        for col_idx, key in enumerate(col_keys, 1):
            val = row_data.get(key)
            cell = ws.cell(row=row_idx, column=col_idx, value=val)
            cell.font = data_font
            cell.border = thin_border
            # Color by group
            for group, (start, end) in group_ranges.items():
                if start <= col_idx - 1 < end:
                    cell.fill = PatternFill(start_color=group_colors[group],
                                            end_color=group_colors[group],
                                            fill_type='solid')
                    break
            # Highlight the 'Is Firm A' cell (column 7) on Firm A rows.
            if is_firm_a and col_idx == 7:
                cell.font = Font(name='Arial', size=9, bold=True, color='CC0000')
            # Color verdicts
            if key == 'dual_verdict':
                colors = {
                    'high_confidence_replication': 'FF0000',
                    'moderate_confidence_replication': 'FF6600',
                    'high_style_consistency': '009900',
                    'uncertain': 'FF9900',
                    'likely_genuine': '006600',
                }
                if val in colors:
                    cell.font = Font(name='Arial', size=9, bold=True, color=colors[val])
        row_idx += 1
    # Fixed column width (simple, predictable layout)
    for col_idx in range(1, len(col_keys) + 1):
        ws.column_dimensions[openpyxl.utils.get_column_letter(col_idx)].width = 15
    # Freeze header row and enable filtering over the data range.
    ws.freeze_panes = 'A2'
    ws.auto_filter.ref = f"A1:{openpyxl.utils.get_column_letter(len(col_keys))}{row_idx-1}"
    # === Summary Sheet ===
    ws2 = wb.create_sheet("Summary")
    ws2.cell(row=1, column=1, value="Dual-Method Classification Summary").font = Font(size=14, bold=True)
    ws2.cell(row=2, column=1, value=f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}")
    ws2.cell(row=3, column=1, value="Calibration: Firm A (dHash median=5, p95=15)")
    ws2.cell(row=5, column=1, value="Verdict").font = Font(bold=True)
    ws2.cell(row=5, column=2, value="Count").font = Font(bold=True)
    ws2.cell(row=5, column=3, value="%").font = Font(bold=True)
    ws2.cell(row=5, column=4, value="Firm A").font = Font(bold=True)
    ws2.cell(row=5, column=5, value="Firm A %").font = Font(bold=True)
    total = sum(verdict_counts.values())
    fa_total = sum(firm_a_counts.values())
    order = ['high_confidence_replication', 'moderate_confidence_replication',
             'high_style_consistency', 'uncertain', 'likely_genuine', 'unknown']
    for i, v in enumerate(order):
        n = verdict_counts.get(v, 0)
        fa = firm_a_counts.get(v, 0)
        ws2.cell(row=6+i, column=1, value=v)
        ws2.cell(row=6+i, column=2, value=n)
        ws2.cell(row=6+i, column=3, value=f"{100*n/total:.1f}%" if total > 0 else "0%")
        ws2.cell(row=6+i, column=4, value=fa)
        ws2.cell(row=6+i, column=5, value=f"{100*fa/fa_total:.1f}%" if fa_total > 0 else "0%")
    ws2.cell(row=6+len(order), column=1, value="Total").font = Font(bold=True)
    ws2.cell(row=6+len(order), column=2, value=total)
    ws2.cell(row=6+len(order), column=4, value=fa_total)
    # Thresholds
    ws2.cell(row=15, column=1, value="Thresholds Used").font = Font(size=12, bold=True)
    ws2.cell(row=16, column=1, value="Cosine high threshold")
    ws2.cell(row=16, column=2, value=COSINE_HIGH)
    ws2.cell(row=17, column=1, value="KDE crossover")
    ws2.cell(row=17, column=2, value=KDE_CROSSOVER)
    ws2.cell(row=18, column=1, value="dHash high-confidence (Firm A median)")
    ws2.cell(row=18, column=2, value=PHASH_HIGH_CONF)
    ws2.cell(row=19, column=1, value="dHash moderate-confidence (Firm A p95)")
    ws2.cell(row=19, column=2, value=PHASH_MOD_CONF)
    for col in range(1, 6):
        ws2.column_dimensions[openpyxl.utils.get_column_letter(col)].width = 30
    # Save
    wb.save(str(OUTPUT_PATH))
    print(f"\nSaved: {OUTPUT_PATH}")
    print(f"Total PDFs: {total:,}")
    print(f"Firm A PDFs: {fa_total:,}")
    # Console summary mirroring the Summary sheet
    print(f"\n{'Verdict':<35} {'Count':>8} {'%':>7} | {'Firm A':>8} {'%':>7}")
    print("-" * 70)
    for v in order:
        n = verdict_counts.get(v, 0)
        fa = firm_a_counts.get(v, 0)
        if n > 0:
            print(f" {v:<33} {n:>8,} {100*n/total:>6.1f}% | {fa:>8,} {100*fa/fa_total:>6.1f}%"
                  if fa_total > 0 else f" {v:<33} {n:>8,} {100*n/total:>6.1f}%")
    print("-" * 70)
    print(f" {'Total':<33} {total:>8,} | {fa_total:>8,}")
def main():
    """Entry point: announce the run configuration, then load all signature
    data and generate the recalibrated Excel report."""
    banner = "=" * 60
    print(banner)
    print("Generating Recalibrated PDF-Level Report")
    print(f"Calibration: Firm A ({FIRM_A})")
    print(f"Method: Dual (Cosine + dHash)")
    print(banner)
    build_report(load_all_data())


if __name__ == "__main__":
    main()