939a348da4
Paper draft includes all sections (Abstract through Conclusion), 36 references, and supporting scripts. Key methodology: Cosine similarity + dHash dual-method verification with thresholds calibrated against known-replication firm (Firm A). Includes: - 8 section markdown files (paper_a_*.md) - Ablation study script (ResNet-50 vs VGG-16 vs EfficientNet-B0) - Recalibrated classification script (84,386 PDFs, 5-tier system) - Figure generation and Word export scripts - Citation renumbering script ([1]-[36]) - Signature analysis pipeline (12 steps) - YOLO extraction scripts Three rounds of AI review completed (GPT-5.4, Claude Opus 4.6, Gemini 3 Pro). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
414 lines
16 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Generate complete PDF-level Excel report with Firm A-calibrated dual-method classification.
|
|
Output: One row per PDF with identification, CPA info, detection stats,
|
|
cosine similarity, dHash distance, and new dual-method verdicts.
|
|
"""
|
|
|
|
import sqlite3
|
|
import numpy as np
|
|
import openpyxl
|
|
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
|
|
# Source SQLite database produced by the signature-analysis pipeline.
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'

# All recalibrated outputs are written under this directory.
OUTPUT_DIR = Path('/Volumes/NV2/PDF-Processing/signature-analysis/recalibrated')

OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

OUTPUT_PATH = OUTPUT_DIR / 'pdf_level_recalibrated_report.xlsx'

# Audit firm used to calibrate the dHash thresholds (the known-replication
# "Firm A" referenced throughout the report).
FIRM_A = '勤業眾信聯合'

# Cosine-similarity cut-offs: KDE crossover point between the two similarity
# distributions, and the high-similarity threshold.
KDE_CROSSOVER = 0.837

COSINE_HIGH = 0.95

# dHash distance cut-offs calibrated on Firm A (median = 5, p95 = 15; see the
# "Thresholds Used" section of the Summary sheet).
PHASH_HIGH_CONF = 5

PHASH_MOD_CONF = 15
|
|
|
|
|
|
def load_all_data(db_path=None):
    """Load all assigned signatures from the database, grouped by source PDF.

    Args:
        db_path: Path to the signature-analysis SQLite database. Defaults to
            the module-level DB_PATH when None (backward compatible).

    Returns:
        dict mapping pdf_key -> {
            'signatures': list of per-signature dicts (similarity stats,
                accountant profile fields, YOLO detection confidence),
            'accountants': set of CPA names seen in this PDF,
            'firms': set of firm names seen in this PDF,
        }
    """
    conn = sqlite3.connect(db_path if db_path is not None else DB_PATH)
    try:
        cur = conn.cursor()

        # Single query for signature stats, accountant profile, and YOLO
        # detection confidence. (Previously detection_confidence was fetched
        # by a second full-table query and joined in Python — redundant, since
        # it lives on the same `signatures` rows.)
        cur.execute('''
            SELECT s.signature_id, s.image_filename, s.assigned_accountant,
                   s.max_similarity_to_same_accountant,
                   s.phash_distance_to_closest,
                   s.ssim_to_closest,
                   s.signature_verdict,
                   a.firm, a.risk_level, a.mean_similarity, a.ratio_gt_95,
                   a.signature_count,
                   s.detection_confidence
            FROM signatures s
            LEFT JOIN accountants a ON s.assigned_accountant = a.name
            WHERE s.assigned_accountant IS NOT NULL
        ''')
        rows = cur.fetchall()
    finally:
        # Close the connection even if the query fails.
        conn.close()

    # Group by PDF.
    pdf_data = defaultdict(lambda: {
        'signatures': [],
        'accountants': set(),
        'firms': set(),
    })

    for r in rows:
        (sig_id, filename, accountant, cosine, phash, ssim, verdict,
         firm, risk, mean_sim, ratio95, sig_count, det_conf) = r

        # Derive the PDF key from the crop filename.
        # Format: {company}_{year}_{type}_page{N}_sig{M}.png or similar —
        # strip the trailing "_sig{M}" then the trailing "_page{N}".
        parts = filename.rsplit('_sig', 1)
        pdf_key = parts[0] if len(parts) > 1 else filename.rsplit('.', 1)[0]
        page_parts = pdf_key.rsplit('_page', 1)
        pdf_key = page_parts[0] if len(page_parts) > 1 else pdf_key

        pdf_data[pdf_key]['signatures'].append({
            'sig_id': sig_id,
            'filename': filename,
            'accountant': accountant,
            'cosine': cosine,
            'phash': phash,
            'ssim': ssim,
            'old_verdict': verdict,
            'firm': firm,
            'risk_level': risk,
            'acct_mean_sim': mean_sim,
            'acct_ratio_95': ratio95,
            'acct_sig_count': sig_count,
            'detection_conf': det_conf,
        })
        if accountant:
            pdf_data[pdf_key]['accountants'].add(accountant)
        if firm:
            pdf_data[pdf_key]['firms'].add(firm)

    print(f"Loaded {sum(len(v['signatures']) for v in pdf_data.values()):,} signatures across {len(pdf_data):,} PDFs")
    return pdf_data
|
|
|
|
|
|
def classify_dual_method(max_cosine, min_phash):
    """Classify a PDF with the Firm A-calibrated cosine + dHash thresholds.

    Args:
        max_cosine: highest cosine similarity across the PDF's signatures,
            or None when no similarity is available.
        min_phash: smallest dHash distance across the PDF's signatures,
            or None when unavailable.

    Returns:
        (verdict, confidence) tuple of strings.
    """
    # No cosine evidence at all -> nothing to classify.
    if max_cosine is None:
        return 'unknown', 'none'

    # Below the KDE crossover the signature looks genuine.
    if max_cosine <= KDE_CROSSOVER:
        return 'likely_genuine', 'medium'

    # Between crossover and the high threshold we cannot commit either way.
    if max_cosine <= COSINE_HIGH:
        return 'uncertain', 'low'

    # Cosine is above the high threshold: the dHash distance decides the
    # replication tier. A missing dHash falls through to style-consistency.
    if min_phash is not None and min_phash <= PHASH_HIGH_CONF:
        return 'high_confidence_replication', 'high'
    if min_phash is not None and min_phash <= PHASH_MOD_CONF:
        return 'moderate_confidence_replication', 'medium'
    return 'high_style_consistency', 'low'
|
|
|
|
|
|
def build_report(pdf_data):
    """Build the recalibrated PDF-level Excel workbook.

    Writes a two-sheet workbook to OUTPUT_PATH:
      * "PDF-Level Report": one row per PDF (identification, CPA info,
        detection stats, cosine/dHash metrics, dual-method verdict).
      * "Summary": verdict counts overall and for Firm A, plus the
        thresholds used.

    Args:
        pdf_data: mapping pdf_key -> {'signatures': [...], 'accountants': set,
            'firms': set} as returned by load_all_data().

    Side effects: saves the workbook and prints a console summary table.
    """
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = "PDF-Level Report"

    # Define columns: (row_data key, header label). Groups A-I drive coloring.
    columns = [
        # Group A: PDF Identification (Blue)
        ('pdf_key', 'PDF Key'),
        ('n_signatures', '# Signatures'),

        # Group B: CPA Info (Green)
        ('accountant_1', 'CPA 1 Name'),
        ('accountant_2', 'CPA 2 Name'),
        ('firm_1', 'Firm 1'),
        ('firm_2', 'Firm 2'),
        ('is_firm_a', 'Is Firm A'),

        # Group C: Detection (Yellow)
        ('avg_detection_conf', 'Avg Detection Conf'),

        # Group D: Cosine Similarity - Sig 1 (Red)
        ('sig1_cosine', 'Sig1 Max Cosine'),
        ('sig1_cosine_verdict', 'Sig1 Cosine Verdict'),
        ('sig1_acct_mean', 'Sig1 CPA Mean Sim'),
        ('sig1_acct_ratio95', 'Sig1 CPA >0.95 Ratio'),
        ('sig1_acct_count', 'Sig1 CPA Sig Count'),

        # Group E: Cosine Similarity - Sig 2 (Purple)
        ('sig2_cosine', 'Sig2 Max Cosine'),
        ('sig2_cosine_verdict', 'Sig2 Cosine Verdict'),
        ('sig2_acct_mean', 'Sig2 CPA Mean Sim'),
        ('sig2_acct_ratio95', 'Sig2 CPA >0.95 Ratio'),
        ('sig2_acct_count', 'Sig2 CPA Sig Count'),

        # Group F: dHash Distance (Orange)
        ('min_phash', 'Min dHash Distance'),
        ('max_phash', 'Max dHash Distance'),
        ('avg_phash', 'Avg dHash Distance'),
        ('sig1_phash', 'Sig1 dHash Distance'),
        ('sig2_phash', 'Sig2 dHash Distance'),

        # Group G: SSIM (for reference only) (Gray)
        ('max_ssim', 'Max SSIM'),
        ('avg_ssim', 'Avg SSIM'),

        # Group H: Dual-Method Classification (Dark Blue)
        ('dual_verdict', 'Dual-Method Verdict'),
        ('dual_confidence', 'Confidence Level'),
        ('max_cosine', 'PDF Max Cosine'),
        ('pdf_min_phash', 'PDF Min dHash'),

        # Group I: CPA Risk (Teal)
        ('sig1_risk', 'Sig1 CPA Risk Level'),
        ('sig2_risk', 'Sig2 CPA Risk Level'),
    ]

    col_keys = [c[0] for c in columns]
    col_names = [c[1] for c in columns]

    # Header styles
    header_fill = PatternFill(start_color='1F4E79', end_color='1F4E79', fill_type='solid')
    header_font = Font(name='Arial', size=9, bold=True, color='FFFFFF')
    data_font = Font(name='Arial', size=9)
    thin_border = Border(
        left=Side(style='thin'),
        right=Side(style='thin'),
        top=Side(style='thin'),
        bottom=Side(style='thin'),
    )

    # Group colors
    group_colors = {
        'A': 'D6E4F0',  # Blue - PDF ID
        'B': 'D9E2D0',  # Green - CPA
        'C': 'FFF2CC',  # Yellow - Detection
        'D': 'F4CCCC',  # Red - Cosine Sig1
        'E': 'E1D5E7',  # Purple - Cosine Sig2
        'F': 'FFE0B2',  # Orange - dHash
        'G': 'E0E0E0',  # Gray - SSIM
        'H': 'B3D4FC',  # Dark Blue - Dual method
        'I': 'B2DFDB',  # Teal - Risk
    }

    # Half-open [start, end) index ranges of col_keys covered by each group.
    group_ranges = {
        'A': (0, 2), 'B': (2, 7), 'C': (7, 8),
        'D': (8, 13), 'E': (13, 18), 'F': (18, 23),
        'G': (23, 25), 'H': (25, 29), 'I': (29, 31),
    }

    # Pre-compute one PatternFill per column. (Previously a new fill object
    # was constructed and the group ranges scanned for every single cell.)
    col_fills = {}
    for group, (start, end) in group_ranges.items():
        fill = PatternFill(start_color=group_colors[group],
                           end_color=group_colors[group],
                           fill_type='solid')
        for i in range(start, end):
            col_fills[i + 1] = fill  # 1-based column index

    # Font colors for the dual-method verdict cell (hoisted out of the loop).
    verdict_font_colors = {
        'high_confidence_replication': 'FF0000',
        'moderate_confidence_replication': 'FF6600',
        'high_style_consistency': '009900',
        'uncertain': 'FF9900',
        'likely_genuine': '006600',
    }

    # Write header
    for col_idx, name in enumerate(col_names, 1):
        cell = ws.cell(row=1, column=col_idx, value=name)
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = Alignment(horizontal='center', wrap_text=True)
        cell.border = thin_border

    def cosine_verdict(cos):
        # Per-signature cosine tier using the same thresholds as the dual
        # method. (Hoisted: was re-defined on every loop iteration.)
        if cos is None:
            return None
        if cos > COSINE_HIGH:
            return 'high'
        if cos > KDE_CROSSOVER:
            return 'uncertain'
        return 'low'

    def rnd(x, nd=4):
        # Round unless missing. Uses "is not None" so a legitimate 0.0 value
        # is kept (the old truthiness checks silently turned 0.0 into None).
        return round(x, nd) if x is not None else None

    # Process PDFs
    row_idx = 2
    verdict_counts = defaultdict(int)
    firm_a_counts = defaultdict(int)

    for pdf_key, pdata in sorted(pdf_data.items()):
        sigs = pdata['signatures']
        if not sigs:
            continue

        # Sort signatures by filename so sig1/sig2 assignment is stable
        # (filenames encode position: ..._sig1.png, ..._sig2.png).
        sigs_sorted = sorted(sigs, key=lambda s: s['filename'])
        sig1 = sigs_sorted[0] if len(sigs_sorted) > 0 else None
        sig2 = sigs_sorted[1] if len(sigs_sorted) > 1 else None

        # Compute PDF-level aggregates over the non-null metrics.
        cosines = [s['cosine'] for s in sigs if s['cosine'] is not None]
        phashes = [s['phash'] for s in sigs if s['phash'] is not None]
        ssims = [s['ssim'] for s in sigs if s['ssim'] is not None]
        confs = [s['detection_conf'] for s in sigs if s['detection_conf'] is not None]

        max_cosine = max(cosines) if cosines else None
        min_phash = min(phashes) if phashes else None
        max_phash = max(phashes) if phashes else None
        avg_phash = np.mean(phashes) if phashes else None
        max_ssim = max(ssims) if ssims else None
        avg_ssim = np.mean(ssims) if ssims else None
        avg_conf = np.mean(confs) if confs else None

        is_firm_a = FIRM_A in pdata['firms']

        # Dual-method classification
        verdict, confidence = classify_dual_method(max_cosine, min_phash)
        verdict_counts[verdict] += 1
        if is_firm_a:
            firm_a_counts[verdict] += 1

        # Build row
        row_data = {
            'pdf_key': pdf_key,
            'n_signatures': len(sigs),
            'accountant_1': sig1['accountant'] if sig1 else None,
            'accountant_2': sig2['accountant'] if sig2 else None,
            'firm_1': sig1['firm'] if sig1 else None,
            'firm_2': sig2['firm'] if sig2 else None,
            'is_firm_a': 'Yes' if is_firm_a else 'No',
            'avg_detection_conf': rnd(avg_conf),
            'sig1_cosine': rnd(sig1['cosine']) if sig1 else None,
            'sig1_cosine_verdict': cosine_verdict(sig1['cosine']) if sig1 else None,
            'sig1_acct_mean': rnd(sig1['acct_mean_sim']) if sig1 else None,
            'sig1_acct_ratio95': rnd(sig1['acct_ratio_95']) if sig1 else None,
            'sig1_acct_count': sig1['acct_sig_count'] if sig1 else None,
            'sig2_cosine': rnd(sig2['cosine']) if sig2 else None,
            'sig2_cosine_verdict': cosine_verdict(sig2['cosine']) if sig2 else None,
            'sig2_acct_mean': rnd(sig2['acct_mean_sim']) if sig2 else None,
            'sig2_acct_ratio95': rnd(sig2['acct_ratio_95']) if sig2 else None,
            'sig2_acct_count': sig2['acct_sig_count'] if sig2 else None,
            'min_phash': min_phash,
            'max_phash': max_phash,
            'avg_phash': rnd(avg_phash, 2),
            'sig1_phash': sig1['phash'] if sig1 else None,
            'sig2_phash': sig2['phash'] if sig2 else None,
            'max_ssim': rnd(max_ssim),
            'avg_ssim': rnd(avg_ssim),
            'dual_verdict': verdict,
            'dual_confidence': confidence,
            'max_cosine': rnd(max_cosine),
            'pdf_min_phash': min_phash,
            'sig1_risk': sig1['risk_level'] if sig1 else None,
            'sig2_risk': sig2['risk_level'] if sig2 else None,
        }

        for col_idx, key in enumerate(col_keys, 1):
            val = row_data.get(key)
            cell = ws.cell(row=row_idx, column=col_idx, value=val)
            cell.font = data_font
            cell.border = thin_border

            # Color by group (precomputed per column above).
            fill = col_fills.get(col_idx)
            if fill is not None:
                cell.fill = fill

            # Highlight the "Is Firm A" cell (column 7) on Firm A rows.
            if is_firm_a and col_idx == 7:
                cell.font = Font(name='Arial', size=9, bold=True, color='CC0000')

            # Color-code the dual-method verdict text.
            if key == 'dual_verdict' and val in verdict_font_colors:
                cell.font = Font(name='Arial', size=9, bold=True,
                                 color=verdict_font_colors[val])

        row_idx += 1

    # Fixed column width for readability (not true auto-sizing).
    for col_idx in range(1, len(col_keys) + 1):
        ws.column_dimensions[openpyxl.utils.get_column_letter(col_idx)].width = 15

    # Freeze header and enable filtering over the written range.
    ws.freeze_panes = 'A2'
    ws.auto_filter.ref = f"A1:{openpyxl.utils.get_column_letter(len(col_keys))}{row_idx-1}"

    # === Summary Sheet ===
    ws2 = wb.create_sheet("Summary")
    ws2.cell(row=1, column=1, value="Dual-Method Classification Summary").font = Font(size=14, bold=True)
    ws2.cell(row=2, column=1, value=f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}")
    ws2.cell(row=3, column=1, value="Calibration: Firm A (dHash median=5, p95=15)")

    ws2.cell(row=5, column=1, value="Verdict").font = Font(bold=True)
    ws2.cell(row=5, column=2, value="Count").font = Font(bold=True)
    ws2.cell(row=5, column=3, value="%").font = Font(bold=True)
    ws2.cell(row=5, column=4, value="Firm A").font = Font(bold=True)
    ws2.cell(row=5, column=5, value="Firm A %").font = Font(bold=True)

    total = sum(verdict_counts.values())
    fa_total = sum(firm_a_counts.values())
    # Display order: most to least severe, then unknown.
    order = ['high_confidence_replication', 'moderate_confidence_replication',
             'high_style_consistency', 'uncertain', 'likely_genuine', 'unknown']

    for i, v in enumerate(order):
        n = verdict_counts.get(v, 0)
        fa = firm_a_counts.get(v, 0)
        ws2.cell(row=6+i, column=1, value=v)
        ws2.cell(row=6+i, column=2, value=n)
        ws2.cell(row=6+i, column=3, value=f"{100*n/total:.1f}%" if total > 0 else "0%")
        ws2.cell(row=6+i, column=4, value=fa)
        ws2.cell(row=6+i, column=5, value=f"{100*fa/fa_total:.1f}%" if fa_total > 0 else "0%")

    ws2.cell(row=6+len(order), column=1, value="Total").font = Font(bold=True)
    ws2.cell(row=6+len(order), column=2, value=total)
    ws2.cell(row=6+len(order), column=4, value=fa_total)

    # Thresholds
    ws2.cell(row=15, column=1, value="Thresholds Used").font = Font(size=12, bold=True)
    ws2.cell(row=16, column=1, value="Cosine high threshold")
    ws2.cell(row=16, column=2, value=COSINE_HIGH)
    ws2.cell(row=17, column=1, value="KDE crossover")
    ws2.cell(row=17, column=2, value=KDE_CROSSOVER)
    ws2.cell(row=18, column=1, value="dHash high-confidence (Firm A median)")
    ws2.cell(row=18, column=2, value=PHASH_HIGH_CONF)
    ws2.cell(row=19, column=1, value="dHash moderate-confidence (Firm A p95)")
    ws2.cell(row=19, column=2, value=PHASH_MOD_CONF)

    for col in range(1, 6):
        ws2.column_dimensions[openpyxl.utils.get_column_letter(col)].width = 30

    # Save
    wb.save(str(OUTPUT_PATH))
    print(f"\nSaved: {OUTPUT_PATH}")
    print(f"Total PDFs: {total:,}")
    print(f"Firm A PDFs: {fa_total:,}")

    # Print summary
    print(f"\n{'Verdict':<35} {'Count':>8} {'%':>7} | {'Firm A':>8} {'%':>7}")
    print("-" * 70)
    for v in order:
        n = verdict_counts.get(v, 0)
        fa = firm_a_counts.get(v, 0)
        if n > 0:
            # Skip the Firm A columns entirely when no Firm A PDFs exist,
            # to avoid dividing by zero.
            if fa_total > 0:
                print(f" {v:<33} {n:>8,} {100*n/total:>6.1f}% | {fa:>8,} {100*fa/fa_total:>6.1f}%")
            else:
                print(f" {v:<33} {n:>8,} {100*n/total:>6.1f}%")
    print("-" * 70)
    print(f" {'Total':<33} {total:>8,} | {fa_total:>8,}")
|
|
|
|
|
|
def main():
    """Entry point: print a banner, load the data, and write the report."""
    banner = "=" * 60
    print(banner)
    print("Generating Recalibrated PDF-Level Report")
    print(f"Calibration: Firm A ({FIRM_A})")
    print("Method: Dual (Cosine + dHash)")
    print(banner)

    build_report(load_all_data())


if __name__ == "__main__":
    main()
|