939a348da4
Paper draft includes all sections (Abstract through Conclusion), 36 references, and supporting scripts. Key methodology: Cosine similarity + dHash dual-method verification with thresholds calibrated against known-replication firm (Firm A). Includes: - 8 section markdown files (paper_a_*.md) - Ablation study script (ResNet-50 vs VGG-16 vs EfficientNet-B0) - Recalibrated classification script (84,386 PDFs, 5-tier system) - Figure generation and Word export scripts - Citation renumbering script ([1]-[36]) - Signature analysis pipeline (12 steps) - YOLO extraction scripts Three rounds of AI review completed (GPT-5.4, Claude Opus 4.6, Gemini 3 Pro). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
414 lines
13 KiB
Python
414 lines
13 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
YOLO-based signature extraction from PDF documents.
|
|
Uses a trained YOLOv11n model to detect and extract handwritten signatures.
|
|
|
|
Pipeline:
|
|
PDF → Render to Image → YOLO Detection → Crop Signatures → Output
|
|
"""
|
|
|
|
import csv
|
|
import json
|
|
import os
|
|
import random
|
|
import sys
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
import cv2
|
|
import fitz # PyMuPDF
|
|
import numpy as np
|
|
from ultralytics import YOLO
|
|
|
|
|
|
# Configuration
# Manifest of documents to sample from (read by load_csv_samples).
CSV_PATH = "/Volumes/NV2/PDF-Processing/master_signatures.csv"
# Root directory that is searched for batch_* sub-directories holding the PDFs.
PDF_BASE_PATH = "/Volumes/NV2/PDF-Processing/total-pdf"
# Destination for raw signature crops, visualizations and results.json.
OUTPUT_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output/yolo"
# Destination for the crops after red-stamp removal.
OUTPUT_PATH_NO_STAMP = "/Volumes/NV2/PDF-Processing/signature-image-output/yolo_no_stamp"
# Trained YOLO weights loaded by YOLOSignatureExtractor.
MODEL_PATH = "/Volumes/NV2/pdf_recognize/models/best.pt"

# Detection parameters
# Rendering resolution; pages are scaled by DPI / 72 when rasterized.
DPI = 300
# Minimum confidence passed to the model when detecting signatures.
CONFIDENCE_THRESHOLD = 0.5
|
|
|
|
|
|
def remove_red_stamp(image: np.ndarray) -> np.ndarray:
    """
    Remove red stamp pixels from an image by replacing them with white.

    Uses HSV color space to detect red regions (stamps are typically
    red/orange). The input is never modified; a new array is returned.

    Args:
        image: RGB image as numpy array

    Returns:
        Copy of the image with red stamp pixels replaced by white. An empty
        (zero-area) image is returned as an unchanged copy.
    """
    # A zero-area crop has nothing to clean, and the color conversion below
    # rejects empty input, so short-circuit instead of raising.
    if image.size == 0:
        return image.copy()

    # Convert to HSV so red can be selected by hue
    hsv = cv2.cvtColor(image, cv2.COLOR_RGB2HSV)

    # Red wraps around the hue axis, so it needs two ranges
    # Range 1: H = 0-10 (red-orange)
    lower_red1 = np.array([0, 50, 50])
    upper_red1 = np.array([10, 255, 255])

    # Range 2: H = 160-180 (red-magenta)
    lower_red2 = np.array([160, 50, 50])
    upper_red2 = np.array([180, 255, 255])

    # One mask per hue range, merged into a single red mask
    red_mask = cv2.bitwise_or(
        cv2.inRange(hsv, lower_red1, upper_red1),
        cv2.inRange(hsv, lower_red2, upper_red2),
    )

    # Grow the mask by one 3x3 dilation pass to catch stamp edges
    kernel = np.ones((3, 3), np.uint8)
    red_mask = cv2.dilate(red_mask, kernel, iterations=1)

    # Paint the masked pixels white on a copy of the input
    result = image.copy()
    result[red_mask > 0] = [255, 255, 255]

    return result
|
|
|
|
|
|
class YOLOSignatureExtractor:
    """
    Extract signatures from PDF pages using YOLO object detection.

    A page is rasterized with PyMuPDF, passed to the YOLO model, and each
    detected box can then be cropped out or drawn onto the page for review.
    """

    def __init__(self, model_path: str = MODEL_PATH, conf_threshold: float = CONFIDENCE_THRESHOLD):
        """
        Initialize the extractor with a trained YOLO model.

        Args:
            model_path: Path to the YOLO model weights
            conf_threshold: Minimum confidence threshold for detections
        """
        print(f"Loading YOLO model from {model_path}...")
        self.model = YOLO(model_path)
        self.conf_threshold = conf_threshold
        self.dpi = DPI
        print(f"Model loaded. Confidence threshold: {conf_threshold}")

    def render_pdf_page(self, pdf_path: str, page_num: int) -> Optional[np.ndarray]:
        """
        Render a PDF page to an image array.

        Any failure (unreadable file, bad page number, rendering error) is
        reported on stdout and turned into a None result rather than raised.

        Args:
            pdf_path: Path to the PDF file
            page_num: Page number (1-indexed)

        Returns:
            RGB image as numpy array, or None if failed
        """
        doc = None
        try:
            doc = fitz.open(pdf_path)
            if page_num < 1 or page_num > len(doc):
                print(f" Invalid page number: {page_num} (PDF has {len(doc)} pages)")
                return None

            page = doc[page_num - 1]
            # PDF user space is 72 units per inch, so dpi / 72 is the zoom factor
            mat = fitz.Matrix(self.dpi / 72, self.dpi / 72)
            pix = page.get_pixmap(matrix=mat, alpha=False)
            # NOTE(review): np.frombuffer over pix.samples presumably yields a
            # read-only array; the callers in this file copy before modifying.
            image = np.frombuffer(pix.samples, dtype=np.uint8)
            return image.reshape(pix.height, pix.width, pix.n)
        except Exception as e:
            print(f" Error rendering PDF: {e}")
            return None
        finally:
            # Close on every path, including when rendering raised, so a
            # failed page does not leave the document handle open.
            if doc is not None:
                doc.close()

    def detect_signatures(self, image: np.ndarray) -> list[dict]:
        """
        Detect signature regions in an image using YOLO.

        Args:
            image: RGB image as numpy array

        Returns:
            List of detected signatures, one dict per box with keys
            'box' (x, y, w, h), 'xyxy' (x1, y1, x2, y2) and 'confidence',
            ordered top to bottom and then left to right.
        """
        results = self.model(image, conf=self.conf_threshold, verbose=False)
        signatures = []

        for r in results:
            for box in r.boxes:
                # Coordinates are truncated to whole pixels
                x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().numpy())
                conf = float(box.conf[0].cpu().numpy())
                signatures.append({
                    'box': (x1, y1, x2 - x1, y2 - y1),  # x, y, w, h format
                    'xyxy': (x1, y1, x2, y2),
                    'confidence': conf
                })

        # Sort by y-coordinate (top to bottom), then x-coordinate (left to right)
        signatures.sort(key=lambda s: (s['box'][1], s['box'][0]))

        return signatures

    def extract_signature_images(self, image: np.ndarray, signatures: list[dict]) -> list[np.ndarray]:
        """
        Crop signature regions from the image.

        The result has exactly one entry per input signature, in the same
        order; a box lying entirely outside the image yields an empty crop.

        Args:
            image: RGB image as numpy array
            signatures: List of detected signatures (each with a 'box' entry
                in x, y, w, h format)

        Returns:
            List of cropped signature images (slices of the input image)
        """
        height, width = image.shape[:2]
        cropped = []
        for sig in signatures:
            x, y, w, h = sig['box']
            # Derive the far edges from the unclamped origin: clamping x/y
            # first would shift the right/bottom edge outward for boxes that
            # start at a negative coordinate.
            right = min(width, x + w)
            bottom = min(height, y + h)
            left = max(0, x)
            top = max(0, y)
            cropped.append(image[top:bottom, left:right])
        return cropped

    def create_visualization(self, image: np.ndarray, signatures: list[dict]) -> np.ndarray:
        """
        Create a visualization with detection boxes drawn on the image.

        Args:
            image: RGB image as numpy array
            signatures: List of detected signatures

        Returns:
            Copy of the image with a box and a "sigN: confidence" label drawn
            for every detection; the input image is left untouched.
        """
        vis = image.copy()
        for i, sig in enumerate(signatures):
            x1, y1, x2, y2 = sig['xyxy']
            conf = sig['confidence']

            # Draw box
            cv2.rectangle(vis, (x1, y1), (x2, y2), (255, 0, 0), 3)

            # Draw label on a filled strip sitting directly above the box
            label = f"sig{i+1}: {conf:.2f}"
            font_scale = 0.8
            thickness = 2
            (text_w, text_h), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, font_scale, thickness)

            cv2.rectangle(vis, (x1, y1 - text_h - 10), (x1 + text_w + 5, y1), (255, 0, 0), -1)
            cv2.putText(vis, label, (x1 + 2, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX,
                        font_scale, (255, 255, 255), thickness)

        return vis
|
|
|
|
|
|
def find_pdf_file(filename: str, base_path: Optional[str] = None) -> Optional[str]:
    """
    Search for PDF file in batch directories.

    Every entry matching ``batch_*`` directly under the search root is
    checked in sorted order, and the first one containing the file wins.

    Args:
        filename: PDF filename to search for
        base_path: Directory holding the batch_* sub-directories; defaults
            to PDF_BASE_PATH when omitted

    Returns:
        Full path if found, None otherwise
    """
    root = Path(PDF_BASE_PATH if base_path is None else base_path)
    for batch_dir in sorted(root.glob("batch_*")):
        candidate = batch_dir / filename
        # is_file() rather than exists(): an empty filename resolves to the
        # batch directory itself, which must not be reported as a PDF.
        if candidate.is_file():
            return str(candidate)
    return None
|
|
|
|
|
|
def load_csv_samples(csv_path: str, sample_size: int = 50, seed: int = 42) -> list[dict]:
    """
    Load random samples from the CSV file.

    The whole file is read, then rows are drawn without replacement. The
    draw uses a private random generator, so the process-wide ``random``
    state is left untouched while the selection for a given seed stays
    the same as seeding the global generator would produce.

    Args:
        csv_path: Path to master_signatures.csv
        sample_size: Number of samples to load; capped at the number of
            rows in the file
        seed: Random seed for reproducibility

    Returns:
        List of row dictionaries (one per sampled CSV row, keyed by the
        header names)

    Raises:
        OSError: If the CSV file cannot be opened.
        ValueError: If sample_size is negative.
    """
    # newline="" is what the csv module expects so quoted fields containing
    # line breaks are parsed correctly; the encoding is pinned so the result
    # does not depend on the machine's locale.
    with open(csv_path, 'r', newline='', encoding='utf-8') as f:
        all_rows = list(csv.DictReader(f))

    rng = random.Random(seed)
    return rng.sample(all_rows, min(sample_size, len(all_rows)))
|
|
|
|
|
|
def process_samples(extractor: YOLOSignatureExtractor, samples: list[dict],
|
|
output_dir: str, output_dir_no_stamp: str = None,
|
|
save_visualization: bool = True) -> dict:
|
|
"""
|
|
Process a list of PDF samples and extract signatures.
|
|
|
|
Args:
|
|
extractor: YOLOSignatureExtractor instance
|
|
samples: List of sample dictionaries from CSV
|
|
output_dir: Output directory for signatures
|
|
output_dir_no_stamp: Output directory for stamp-removed signatures (optional)
|
|
save_visualization: Whether to save visualization images
|
|
|
|
Returns:
|
|
Results dictionary with statistics and per-file results
|
|
"""
|
|
os.makedirs(output_dir, exist_ok=True)
|
|
if save_visualization:
|
|
os.makedirs(os.path.join(output_dir, "visualization"), exist_ok=True)
|
|
|
|
# Create no-stamp output directory if specified
|
|
if output_dir_no_stamp:
|
|
os.makedirs(output_dir_no_stamp, exist_ok=True)
|
|
|
|
results = {
|
|
'timestamp': datetime.now().isoformat(),
|
|
'total_samples': len(samples),
|
|
'processed': 0,
|
|
'pdf_not_found': 0,
|
|
'render_failed': 0,
|
|
'total_signatures': 0,
|
|
'files': {}
|
|
}
|
|
|
|
for i, row in enumerate(samples):
|
|
filename = row['filename']
|
|
page_num = int(row['page'])
|
|
base_name = Path(filename).stem
|
|
|
|
print(f"[{i+1}/{len(samples)}] Processing: {filename}, page {page_num}...", end=' ', flush=True)
|
|
|
|
# Find PDF
|
|
pdf_path = find_pdf_file(filename)
|
|
if pdf_path is None:
|
|
print("PDF NOT FOUND")
|
|
results['pdf_not_found'] += 1
|
|
results['files'][filename] = {'status': 'pdf_not_found'}
|
|
continue
|
|
|
|
# Render page
|
|
image = extractor.render_pdf_page(pdf_path, page_num)
|
|
if image is None:
|
|
print("RENDER FAILED")
|
|
results['render_failed'] += 1
|
|
results['files'][filename] = {'status': 'render_failed'}
|
|
continue
|
|
|
|
# Detect signatures
|
|
signatures = extractor.detect_signatures(image)
|
|
num_sigs = len(signatures)
|
|
results['total_signatures'] += num_sigs
|
|
results['processed'] += 1
|
|
|
|
print(f"Found {num_sigs} signature(s)")
|
|
|
|
# Extract and save signature crops
|
|
crops = extractor.extract_signature_images(image, signatures)
|
|
for j, (crop, sig) in enumerate(zip(crops, signatures)):
|
|
crop_filename = f"{base_name}_page{page_num}_sig{j+1}.png"
|
|
crop_path = os.path.join(output_dir, crop_filename)
|
|
cv2.imwrite(crop_path, cv2.cvtColor(crop, cv2.COLOR_RGB2BGR))
|
|
|
|
# Save stamp-removed version if output dir specified
|
|
if output_dir_no_stamp:
|
|
crop_no_stamp = remove_red_stamp(crop)
|
|
crop_no_stamp_path = os.path.join(output_dir_no_stamp, crop_filename)
|
|
cv2.imwrite(crop_no_stamp_path, cv2.cvtColor(crop_no_stamp, cv2.COLOR_RGB2BGR))
|
|
|
|
# Save visualization
|
|
if save_visualization and signatures:
|
|
vis_image = extractor.create_visualization(image, signatures)
|
|
vis_filename = f"{base_name}_page{page_num}_annotated.png"
|
|
vis_path = os.path.join(output_dir, "visualization", vis_filename)
|
|
cv2.imwrite(vis_path, cv2.cvtColor(vis_image, cv2.COLOR_RGB2BGR))
|
|
|
|
# Store file results
|
|
results['files'][filename] = {
|
|
'status': 'success',
|
|
'page': page_num,
|
|
'signatures': [
|
|
{
|
|
'box': list(sig['box']),
|
|
'confidence': sig['confidence']
|
|
}
|
|
for sig in signatures
|
|
]
|
|
}
|
|
|
|
return results
|
|
|
|
|
|
def print_summary(results: dict):
    """
    Print processing summary.

    Writes a framed report of the counters in ``results`` to stdout. The
    average number of signatures per page is included only when at least
    one page was processed.

    Args:
        results: Results dictionary as returned by process_samples
    """
    rule = "=" * 60
    print("\n" + rule)
    print("YOLO SIGNATURE EXTRACTION SUMMARY")
    print(rule)

    # One line per counter, in report order
    counters = (
        ("Total samples", 'total_samples'),
        ("Successfully processed", 'processed'),
        ("PDFs not found", 'pdf_not_found'),
        ("Render failed", 'render_failed'),
        ("Total signatures found", 'total_signatures'),
    )
    for label, key in counters:
        print(f"{label}: {results[key]}")

    # Skip the average when nothing was processed to avoid dividing by zero
    processed = results['processed']
    if processed > 0:
        avg_sigs = results['total_signatures'] / processed
        print(f"Average signatures/page: {avg_sigs:.2f}")

    print(rule)
|
|
|
|
|
|
def main():
    """
    Main entry point for signature extraction.

    Prints the run configuration, loads the model, draws 50 samples from the
    CSV with seed 42, processes them (raw and stamp-removed crops plus
    visualizations), writes results.json into OUTPUT_PATH and prints the
    summary.
    """
    rule = "=" * 60

    # Banner with the effective configuration
    print(rule)
    print("YOLO Signature Extraction Pipeline")
    print(rule)
    print(f"Model: {MODEL_PATH}")
    print(f"CSV: {CSV_PATH}")
    print(f"Output (original): {OUTPUT_PATH}")
    print(f"Output (no stamp): {OUTPUT_PATH_NO_STAMP}")
    print(f"Confidence threshold: {CONFIDENCE_THRESHOLD}")
    print(rule + "\n")

    # Initialize extractor
    extractor = YOLOSignatureExtractor(MODEL_PATH, CONFIDENCE_THRESHOLD)

    # Load samples
    print("\nLoading samples from CSV...")
    samples = load_csv_samples(CSV_PATH, sample_size=50, seed=42)
    print(f"Loaded {len(samples)} samples\n")

    # Process samples (with stamp removal)
    results = process_samples(
        extractor,
        samples,
        OUTPUT_PATH,
        output_dir_no_stamp=OUTPUT_PATH_NO_STAMP,
        save_visualization=True,
    )

    # Save results JSON next to the extracted crops
    results_path = os.path.join(OUTPUT_PATH, "results.json")
    with open(results_path, 'w') as f:
        json.dump(results, f, indent=2)
    print(f"\nResults saved to: {results_path}")

    # Print summary
    print_summary(results)
    print(f"\nStamp-removed signatures saved to: {OUTPUT_PATH_NO_STAMP}")
|
|
|
|
|
|
# Script entry point: run the pipeline and map any failure to exit status 1.
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl-C: stop quietly without a traceback
        print("\n\nProcess interrupted by user.")
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report the error with its traceback, then fail
        import traceback

        print(f"\n\nFATAL ERROR: {e}")
        traceback.print_exc()
        sys.exit(1)
|