Add Paper A (IEEE TAI) complete draft with Firm A-calibrated dual-method classification
Paper draft includes all sections (Abstract through Conclusion), 36 references, and supporting scripts. Key methodology: Cosine similarity + dHash dual-method verification with thresholds calibrated against known-replication firm (Firm A). Includes: - 8 section markdown files (paper_a_*.md) - Ablation study script (ResNet-50 vs VGG-16 vs EfficientNet-B0) - Recalibrated classification script (84,386 PDFs, 5-tier system) - Figure generation and Word export scripts - Citation renumbering script ([1]-[36]) - Signature analysis pipeline (12 steps) - YOLO extraction scripts Three rounds of AI review completed (GPT-5.4, Claude Opus 4.6, Gemini 3 Pro). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,413 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Generate complete PDF-level Excel report with Firm A-calibrated dual-method classification.
|
||||
Output: One row per PDF with identification, CPA info, detection stats,
|
||||
cosine similarity, dHash distance, and new dual-method verdicts.
|
||||
"""
|
||||
|
||||
import sqlite3
|
||||
import numpy as np
|
||||
import openpyxl
|
||||
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
# Input: SQLite database produced by the upstream signature-analysis pipeline.
DB_PATH = '/Volumes/NV2/PDF-Processing/signature-analysis/signature_analysis.db'
# Output: recalibrated PDF-level workbook (directory created on import).
OUTPUT_DIR = Path('/Volumes/NV2/PDF-Processing/signature-analysis/recalibrated')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_PATH = OUTPUT_DIR / 'pdf_level_recalibrated_report.xlsx'

# Known-replication reference firm ("Firm A") used to calibrate thresholds.
FIRM_A = '勤業眾信聯合'
# Cosine-similarity thresholds: KDE crossover separates genuine/uncertain,
# COSINE_HIGH flags high style consistency.
KDE_CROSSOVER = 0.837
COSINE_HIGH = 0.95
# Hash-distance thresholds (per summary sheet: Firm A median=5, p95=15).
# NOTE(review): names say "PHASH" but every report label calls this dHash —
# confirm which perceptual hash the pipeline actually computes.
PHASH_HIGH_CONF = 5
PHASH_MOD_CONF = 15
|
||||
|
||||
|
||||
def load_all_data():
    """Load per-signature stats from SQLite and group them by source PDF.

    Returns:
        dict mapping pdf_key -> {'signatures': [per-signature dicts],
        'accountants': set of CPA names, 'firms': set of firm names}.
        Only signatures with an assigned accountant are included.

    Reads DB_PATH; prints a one-line load summary to stdout.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cur = conn.cursor()
        # Single query: signature stats joined with per-accountant aggregates,
        # plus the YOLO detection confidence. (Previously a second full-table
        # scan built a filename -> confidence dict that was joined in Python;
        # selecting s.detection_confidence here yields the same values.)
        cur.execute('''
            SELECT s.signature_id, s.image_filename, s.assigned_accountant,
                   s.max_similarity_to_same_accountant,
                   s.phash_distance_to_closest,
                   s.ssim_to_closest,
                   s.signature_verdict,
                   a.firm, a.risk_level, a.mean_similarity, a.ratio_gt_95,
                   a.signature_count,
                   s.detection_confidence
            FROM signatures s
            LEFT JOIN accountants a ON s.assigned_accountant = a.name
            WHERE s.assigned_accountant IS NOT NULL
        ''')
        rows = cur.fetchall()
    finally:
        # Close even if the query fails; nothing was written, so no commit.
        conn.close()

    # Group by PDF
    pdf_data = defaultdict(lambda: {
        'signatures': [],
        'accountants': set(),
        'firms': set(),
    })

    for r in rows:
        (sig_id, filename, accountant, cosine, phash, ssim, verdict,
         firm, risk, mean_sim, ratio95, sig_count, det_conf) = r

        # Extract PDF key from filename by stripping the trailing
        # _sig{M} and _page{N} suffixes.
        # Format: {company}_{year}_{type}_page{N}_sig{M}.png or similar
        parts = filename.rsplit('_sig', 1)
        pdf_key = parts[0] if len(parts) > 1 else filename.rsplit('.', 1)[0]
        page_parts = pdf_key.rsplit('_page', 1)
        pdf_key = page_parts[0] if len(page_parts) > 1 else pdf_key

        pdf_data[pdf_key]['signatures'].append({
            'sig_id': sig_id,
            'filename': filename,
            'accountant': accountant,
            'cosine': cosine,
            'phash': phash,
            'ssim': ssim,
            'old_verdict': verdict,
            'firm': firm,
            'risk_level': risk,
            'acct_mean_sim': mean_sim,
            'acct_ratio_95': ratio95,
            'acct_sig_count': sig_count,
            'detection_conf': det_conf,
        })
        if accountant:
            pdf_data[pdf_key]['accountants'].add(accountant)
        if firm:
            pdf_data[pdf_key]['firms'].add(firm)

    print(f"Loaded {sum(len(v['signatures']) for v in pdf_data.values()):,} signatures across {len(pdf_data):,} PDFs")
    return pdf_data
|
||||
|
||||
|
||||
def classify_dual_method(max_cosine, min_phash, *,
                         cosine_high=0.95, kde_crossover=0.837,
                         phash_high=5, phash_mod=15):
    """Dual-method verdict from deep-feature cosine similarity + hash distance.

    Thresholds were hard-coded; they are now keyword-only parameters (defaults
    mirror the Firm A calibration constants COSINE_HIGH, KDE_CROSSOVER,
    PHASH_HIGH_CONF, PHASH_MOD_CONF) so sensitivity analyses can override them
    without changing callers.

    Args:
        max_cosine: PDF-level max cosine similarity, or None when unavailable.
        min_phash: PDF-level min hash (Hamming) distance, or None.
        cosine_high: cosine above which style consistency is flagged.
        kde_crossover: cosine above which the case is at least 'uncertain'.
        phash_high: hash distance at/below which replication is high-confidence.
        phash_mod: hash distance at/below which replication is moderate.

    Returns:
        (verdict, confidence) tuple of strings, e.g.
        ('high_confidence_replication', 'high') or ('unknown', 'none').
    """
    # No cosine score at all -> nothing to classify.
    if max_cosine is None:
        return 'unknown', 'none'

    if max_cosine > cosine_high:
        # Cosine alone only flags style consistency; a small hash distance
        # corroborates pixel-level replication.
        if min_phash is not None and min_phash <= phash_high:
            return 'high_confidence_replication', 'high'
        if min_phash is not None and min_phash <= phash_mod:
            return 'moderate_confidence_replication', 'medium'
        return 'high_style_consistency', 'low'
    if max_cosine > kde_crossover:
        return 'uncertain', 'low'
    return 'likely_genuine', 'medium'
|
||||
|
||||
|
||||
def build_report(pdf_data):
    """Build the two-sheet Excel report (per-PDF detail + summary).

    Args:
        pdf_data: mapping pdf_key -> {'signatures': [...], 'accountants': set,
            'firms': set} as produced by load_all_data().

    Side effects: writes the workbook to OUTPUT_PATH and prints a verdict
    summary table to stdout.
    """
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = "PDF-Level Report"

    # Define columns.
    # Each entry is (row_data key, header label); list order defines column
    # order and must stay aligned with group_ranges below.
    columns = [
        # Group A: PDF Identification (Blue)
        ('pdf_key', 'PDF Key'),
        ('n_signatures', '# Signatures'),

        # Group B: CPA Info (Green)
        ('accountant_1', 'CPA 1 Name'),
        ('accountant_2', 'CPA 2 Name'),
        ('firm_1', 'Firm 1'),
        ('firm_2', 'Firm 2'),
        ('is_firm_a', 'Is Firm A'),

        # Group C: Detection (Yellow)
        ('avg_detection_conf', 'Avg Detection Conf'),

        # Group D: Cosine Similarity - Sig 1 (Red)
        ('sig1_cosine', 'Sig1 Max Cosine'),
        ('sig1_cosine_verdict', 'Sig1 Cosine Verdict'),
        ('sig1_acct_mean', 'Sig1 CPA Mean Sim'),
        ('sig1_acct_ratio95', 'Sig1 CPA >0.95 Ratio'),
        ('sig1_acct_count', 'Sig1 CPA Sig Count'),

        # Group E: Cosine Similarity - Sig 2 (Purple)
        ('sig2_cosine', 'Sig2 Max Cosine'),
        ('sig2_cosine_verdict', 'Sig2 Cosine Verdict'),
        ('sig2_acct_mean', 'Sig2 CPA Mean Sim'),
        ('sig2_acct_ratio95', 'Sig2 CPA >0.95 Ratio'),
        ('sig2_acct_count', 'Sig2 CPA Sig Count'),

        # Group F: dHash Distance (Orange)
        ('min_phash', 'Min dHash Distance'),
        ('max_phash', 'Max dHash Distance'),
        ('avg_phash', 'Avg dHash Distance'),
        ('sig1_phash', 'Sig1 dHash Distance'),
        ('sig2_phash', 'Sig2 dHash Distance'),

        # Group G: SSIM (for reference only) (Gray)
        ('max_ssim', 'Max SSIM'),
        ('avg_ssim', 'Avg SSIM'),

        # Group H: Dual-Method Classification (Dark Blue)
        ('dual_verdict', 'Dual-Method Verdict'),
        ('dual_confidence', 'Confidence Level'),
        ('max_cosine', 'PDF Max Cosine'),
        ('pdf_min_phash', 'PDF Min dHash'),

        # Group I: CPA Risk (Teal)
        ('sig1_risk', 'Sig1 CPA Risk Level'),
        ('sig2_risk', 'Sig2 CPA Risk Level'),
    ]

    col_keys = [c[0] for c in columns]
    col_names = [c[1] for c in columns]

    # Header styles
    header_fill = PatternFill(start_color='1F4E79', end_color='1F4E79', fill_type='solid')
    header_font = Font(name='Arial', size=9, bold=True, color='FFFFFF')
    data_font = Font(name='Arial', size=9)
    thin_border = Border(
        left=Side(style='thin'),
        right=Side(style='thin'),
        top=Side(style='thin'),
        bottom=Side(style='thin'),
    )

    # Group colors
    group_colors = {
        'A': 'D6E4F0', # Blue - PDF ID
        'B': 'D9E2D0', # Green - CPA
        'C': 'FFF2CC', # Yellow - Detection
        'D': 'F4CCCC', # Red - Cosine Sig1
        'E': 'E1D5E7', # Purple - Cosine Sig2
        'F': 'FFE0B2', # Orange - dHash
        'G': 'E0E0E0', # Gray - SSIM
        'H': 'B3D4FC', # Dark Blue - Dual method
        'I': 'B2DFDB', # Teal - Risk
    }

    # Half-open [start, end) ranges of 0-based column indices per group;
    # must be updated by hand if columns are added or removed above.
    group_ranges = {
        'A': (0, 2), 'B': (2, 7), 'C': (7, 8),
        'D': (8, 13), 'E': (13, 18), 'F': (18, 23),
        'G': (23, 25), 'H': (25, 29), 'I': (29, 31),
    }

    # Write header
    for col_idx, name in enumerate(col_names, 1):
        cell = ws.cell(row=1, column=col_idx, value=name)
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = Alignment(horizontal='center', wrap_text=True)
        cell.border = thin_border

    # Process PDFs
    row_idx = 2  # data rows start below the header row
    verdict_counts = defaultdict(int)  # verdict -> PDF count (all firms)
    firm_a_counts = defaultdict(int)   # verdict -> PDF count (Firm A only)

    for pdf_key, pdata in sorted(pdf_data.items()):
        sigs = pdata['signatures']
        if not sigs:
            continue

        # Sort signatures by position (sig1, sig2): lexicographic filename
        # order puts ..._sig1 before ..._sig2 within the same PDF.
        sigs_sorted = sorted(sigs, key=lambda s: s['filename'])
        sig1 = sigs_sorted[0] if len(sigs_sorted) > 0 else None
        sig2 = sigs_sorted[1] if len(sigs_sorted) > 1 else None

        # Compute PDF-level aggregates, skipping NULL metrics.
        cosines = [s['cosine'] for s in sigs if s['cosine'] is not None]
        phashes = [s['phash'] for s in sigs if s['phash'] is not None]
        ssims = [s['ssim'] for s in sigs if s['ssim'] is not None]
        confs = [s['detection_conf'] for s in sigs if s['detection_conf'] is not None]

        max_cosine = max(cosines) if cosines else None
        min_phash = min(phashes) if phashes else None
        max_phash = max(phashes) if phashes else None
        avg_phash = np.mean(phashes) if phashes else None
        max_ssim = max(ssims) if ssims else None
        avg_ssim = np.mean(ssims) if ssims else None
        avg_conf = np.mean(confs) if confs else None

        # A PDF counts as Firm A if any signing CPA belongs to that firm.
        is_firm_a = FIRM_A in pdata['firms']

        # Dual-method classification (worst-case pair: max cosine, min dHash).
        verdict, confidence = classify_dual_method(max_cosine, min_phash)
        verdict_counts[verdict] += 1
        if is_firm_a:
            firm_a_counts[verdict] += 1

        # Cosine verdicts per signature: three-tier bucket using the same
        # calibrated thresholds as the dual-method classifier.
        def cosine_verdict(cos):
            if cos is None: return None
            if cos > COSINE_HIGH: return 'high'
            if cos > KDE_CROSSOVER: return 'uncertain'
            return 'low'

        # Build row.
        # NOTE(review): the truthiness guards (`if avg_conf`, `if sig1 and
        # sig1['cosine']`, ...) render an exact 0.0 metric as blank —
        # presumably these metrics are never exactly 0, but confirm.
        row_data = {
            'pdf_key': pdf_key,
            'n_signatures': len(sigs),
            'accountant_1': sig1['accountant'] if sig1 else None,
            'accountant_2': sig2['accountant'] if sig2 else None,
            'firm_1': sig1['firm'] if sig1 else None,
            'firm_2': sig2['firm'] if sig2 else None,
            'is_firm_a': 'Yes' if is_firm_a else 'No',
            'avg_detection_conf': round(avg_conf, 4) if avg_conf else None,
            'sig1_cosine': round(sig1['cosine'], 4) if sig1 and sig1['cosine'] else None,
            'sig1_cosine_verdict': cosine_verdict(sig1['cosine']) if sig1 else None,
            'sig1_acct_mean': round(sig1['acct_mean_sim'], 4) if sig1 and sig1['acct_mean_sim'] else None,
            'sig1_acct_ratio95': round(sig1['acct_ratio_95'], 4) if sig1 and sig1['acct_ratio_95'] else None,
            'sig1_acct_count': sig1['acct_sig_count'] if sig1 else None,
            'sig2_cosine': round(sig2['cosine'], 4) if sig2 and sig2['cosine'] else None,
            'sig2_cosine_verdict': cosine_verdict(sig2['cosine']) if sig2 else None,
            'sig2_acct_mean': round(sig2['acct_mean_sim'], 4) if sig2 and sig2['acct_mean_sim'] else None,
            'sig2_acct_ratio95': round(sig2['acct_ratio_95'], 4) if sig2 and sig2['acct_ratio_95'] else None,
            'sig2_acct_count': sig2['acct_sig_count'] if sig2 else None,
            'min_phash': min_phash,
            'max_phash': max_phash,
            'avg_phash': round(avg_phash, 2) if avg_phash is not None else None,
            'sig1_phash': sig1['phash'] if sig1 else None,
            'sig2_phash': sig2['phash'] if sig2 else None,
            'max_ssim': round(max_ssim, 4) if max_ssim is not None else None,
            'avg_ssim': round(avg_ssim, 4) if avg_ssim is not None else None,
            'dual_verdict': verdict,
            'dual_confidence': confidence,
            'max_cosine': round(max_cosine, 4) if max_cosine is not None else None,
            'pdf_min_phash': min_phash,
            'sig1_risk': sig1['risk_level'] if sig1 else None,
            'sig2_risk': sig2['risk_level'] if sig2 else None,
        }

        for col_idx, key in enumerate(col_keys, 1):
            val = row_data.get(key)
            cell = ws.cell(row=row_idx, column=col_idx, value=val)
            cell.font = data_font
            cell.border = thin_border

            # Color by group (find which group range this column falls in).
            for group, (start, end) in group_ranges.items():
                if start <= col_idx - 1 < end:
                    cell.fill = PatternFill(start_color=group_colors[group],
                                            end_color=group_colors[group],
                                            fill_type='solid')
                    break

            # Highlight Firm A rows: column 7 is the 'Is Firm A' flag cell.
            if is_firm_a and col_idx == 7:
                cell.font = Font(name='Arial', size=9, bold=True, color='CC0000')

            # Color verdicts (red = replication ... green = genuine).
            if key == 'dual_verdict':
                colors = {
                    'high_confidence_replication': 'FF0000',
                    'moderate_confidence_replication': 'FF6600',
                    'high_style_consistency': '009900',
                    'uncertain': 'FF9900',
                    'likely_genuine': '006600',
                }
                if val in colors:
                    cell.font = Font(name='Arial', size=9, bold=True, color=colors[val])

        row_idx += 1

    # Auto-width (fixed 15-char width for every data column)
    for col_idx in range(1, len(col_keys) + 1):
        ws.column_dimensions[openpyxl.utils.get_column_letter(col_idx)].width = 15

    # Freeze header and enable filtering over the written data range.
    ws.freeze_panes = 'A2'
    ws.auto_filter.ref = f"A1:{openpyxl.utils.get_column_letter(len(col_keys))}{row_idx-1}"

    # === Summary Sheet ===
    ws2 = wb.create_sheet("Summary")
    ws2.cell(row=1, column=1, value="Dual-Method Classification Summary").font = Font(size=14, bold=True)
    ws2.cell(row=2, column=1, value=f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}")
    ws2.cell(row=3, column=1, value=f"Calibration: Firm A (dHash median=5, p95=15)")

    ws2.cell(row=5, column=1, value="Verdict").font = Font(bold=True)
    ws2.cell(row=5, column=2, value="Count").font = Font(bold=True)
    ws2.cell(row=5, column=3, value="%").font = Font(bold=True)
    ws2.cell(row=5, column=4, value="Firm A").font = Font(bold=True)
    ws2.cell(row=5, column=5, value="Firm A %").font = Font(bold=True)

    total = sum(verdict_counts.values())
    fa_total = sum(firm_a_counts.values())
    # Display order: most to least severe verdict.
    order = ['high_confidence_replication', 'moderate_confidence_replication',
             'high_style_consistency', 'uncertain', 'likely_genuine', 'unknown']

    for i, v in enumerate(order):
        n = verdict_counts.get(v, 0)
        fa = firm_a_counts.get(v, 0)
        ws2.cell(row=6+i, column=1, value=v)
        ws2.cell(row=6+i, column=2, value=n)
        ws2.cell(row=6+i, column=3, value=f"{100*n/total:.1f}%" if total > 0 else "0%")
        ws2.cell(row=6+i, column=4, value=fa)
        ws2.cell(row=6+i, column=5, value=f"{100*fa/fa_total:.1f}%" if fa_total > 0 else "0%")

    ws2.cell(row=6+len(order), column=1, value="Total").font = Font(bold=True)
    ws2.cell(row=6+len(order), column=2, value=total)
    ws2.cell(row=6+len(order), column=4, value=fa_total)

    # Thresholds (echo the calibration constants used for this run)
    ws2.cell(row=15, column=1, value="Thresholds Used").font = Font(size=12, bold=True)
    ws2.cell(row=16, column=1, value="Cosine high threshold")
    ws2.cell(row=16, column=2, value=COSINE_HIGH)
    ws2.cell(row=17, column=1, value="KDE crossover")
    ws2.cell(row=17, column=2, value=KDE_CROSSOVER)
    ws2.cell(row=18, column=1, value="dHash high-confidence (Firm A median)")
    ws2.cell(row=18, column=2, value=PHASH_HIGH_CONF)
    ws2.cell(row=19, column=1, value="dHash moderate-confidence (Firm A p95)")
    ws2.cell(row=19, column=2, value=PHASH_MOD_CONF)

    for col in range(1, 6):
        ws2.column_dimensions[openpyxl.utils.get_column_letter(col)].width = 30

    # Save
    wb.save(str(OUTPUT_PATH))
    print(f"\nSaved: {OUTPUT_PATH}")
    print(f"Total PDFs: {total:,}")
    print(f"Firm A PDFs: {fa_total:,}")

    # Print summary (mirrors the Summary sheet; Firm A columns only shown
    # when at least one Firm A PDF was classified)
    print(f"\n{'Verdict':<35} {'Count':>8} {'%':>7} | {'Firm A':>8} {'%':>7}")
    print("-" * 70)
    for v in order:
        n = verdict_counts.get(v, 0)
        fa = firm_a_counts.get(v, 0)
        if n > 0:
            print(f" {v:<33} {n:>8,} {100*n/total:>6.1f}% | {fa:>8,} {100*fa/fa_total:>6.1f}%"
                  if fa_total > 0 else f" {v:<33} {n:>8,} {100*n/total:>6.1f}%")
    print("-" * 70)
    print(f" {'Total':<33} {total:>8,} | {fa_total:>8,}")
|
||||
|
||||
|
||||
def main():
    """Entry point: print a run banner, load signature data, write the report."""
    banner = "=" * 60
    header_lines = (
        banner,
        "Generating Recalibrated PDF-Level Report",
        f"Calibration: Firm A ({FIRM_A})",
        f"Method: Dual (Cosine + dHash)",
        banner,
    )
    for line in header_lines:
        print(line)

    build_report(load_all_data())
|
||||
|
||||
|
||||
# Run the full pipeline only when executed as a script (not on import).
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user