Files
pdf_signature_extraction/extract_signatures_yolo.py
T
gbanyan 939a348da4 Add Paper A (IEEE TAI) complete draft with Firm A-calibrated dual-method classification
Paper draft includes all sections (Abstract through Conclusion), 36 references,
and supporting scripts. Key methodology: Cosine similarity + dHash dual-method
verification with thresholds calibrated against known-replication firm (Firm A).

Includes:
- 8 section markdown files (paper_a_*.md)
- Ablation study script (ResNet-50 vs VGG-16 vs EfficientNet-B0)
- Recalibrated classification script (84,386 PDFs, 5-tier system)
- Figure generation and Word export scripts
- Citation renumbering script ([1]-[36])
- Signature analysis pipeline (12 steps)
- YOLO extraction scripts

Three rounds of AI review completed (GPT-5.4, Claude Opus 4.6, Gemini 3 Pro).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-06 23:05:33 +08:00

414 lines
13 KiB
Python

#!/usr/bin/env python3
"""
YOLO-based signature extraction from PDF documents.
Uses a trained YOLOv11n model to detect and extract handwritten signatures.
Pipeline:
PDF → Render to Image → YOLO Detection → Crop Signatures → Output
"""
import csv
import json
import os
import random
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional
import cv2
import fitz # PyMuPDF
import numpy as np
from ultralytics import YOLO
# Configuration
# NOTE(review): all paths below are absolute locations under /Volumes/NV2 and
# look machine-specific — confirm before running on another host.
# CSV read by load_csv_samples(); its rows supply the 'filename' and 'page' fields.
CSV_PATH = "/Volumes/NV2/PDF-Processing/master_signatures.csv"
# Root directory whose "batch_*" sub-directories are searched by find_pdf_file().
PDF_BASE_PATH = "/Volumes/NV2/PDF-Processing/total-pdf"
# Destination for signature crops, the "visualization" sub-folder and results.json.
OUTPUT_PATH = "/Volumes/NV2/PDF-Processing/signature-image-output/yolo"
# Destination for the copies of the crops that went through remove_red_stamp().
OUTPUT_PATH_NO_STAMP = "/Volumes/NV2/PDF-Processing/signature-image-output/yolo_no_stamp"
# Weights file handed to ultralytics.YOLO by YOLOSignatureExtractor.
MODEL_PATH = "/Volumes/NV2/pdf_recognize/models/best.pt"
# Detection parameters
# Render resolution; pages are scaled by DPI / 72 when rasterised.
DPI = 300
# Passed to the model as `conf`, the minimum confidence for a detection.
CONFIDENCE_THRESHOLD = 0.5
def remove_red_stamp(image: np.ndarray) -> np.ndarray:
    """
    Blank out red stamp ink by painting the matching pixels white.

    Red is located in HSV space. Because red sits at both ends of the hue
    axis, two hue bands are thresholded and merged into one mask.

    Args:
        image: RGB image as numpy array

    Returns:
        A copy of the image in which red pixels (and a one-step 3x3 dilation
        around them) are white; the input array is not modified
    """
    hsv_image = cv2.cvtColor(image, cv2.COLOR_RGB2HSV)

    # (lower, upper) HSV bounds: hue 0-10 is red-orange, hue 160-180 is
    # red-magenta; both require saturation and value of at least 50.
    red_bands = (
        ((0, 50, 50), (10, 255, 255)),
        ((160, 50, 50), (180, 255, 255)),
    )
    low_band_mask, high_band_mask = (
        cv2.inRange(hsv_image, np.array(lower), np.array(upper))
        for lower, upper in red_bands
    )
    stamp_mask = cv2.bitwise_or(low_band_mask, high_band_mask)

    # Grow the mask slightly so the anti-aliased rim of the stamp goes too.
    stamp_mask = cv2.dilate(stamp_mask, np.ones((3, 3), np.uint8), iterations=1)

    cleaned = image.copy()
    cleaned[stamp_mask > 0] = [255, 255, 255]
    return cleaned
class YOLOSignatureExtractor:
    """Extract signatures from PDF pages using YOLO object detection."""

    def __init__(self, model_path: str = MODEL_PATH, conf_threshold: float = CONFIDENCE_THRESHOLD):
        """
        Initialize the extractor with a trained YOLO model.

        Args:
            model_path: Path to the YOLO model weights
            conf_threshold: Minimum confidence threshold for detections
        """
        print(f"Loading YOLO model from {model_path}...")
        self.model = YOLO(model_path)
        self.conf_threshold = conf_threshold
        self.dpi = DPI
        print(f"Model loaded. Confidence threshold: {conf_threshold}")

    def render_pdf_page(self, pdf_path: str, page_num: int) -> Optional[np.ndarray]:
        """
        Render a PDF page to an image array.

        Any failure (unreadable file, bad page number, rendering error) is
        reported on stdout and signalled by returning None; nothing is raised.

        Args:
            pdf_path: Path to the PDF file
            page_num: Page number (1-indexed)

        Returns:
            Image as a (height, width, channels) uint8 numpy array, or None
            if failed
        """
        doc = None
        try:
            doc = fitz.open(pdf_path)
            page_count = len(doc)
            if page_num < 1 or page_num > page_count:
                print(f" Invalid page number: {page_num} (PDF has {page_count} pages)")
                return None
            page = doc[page_num - 1]
            # PDF user space is 72 units per inch, so dpi / 72 is the zoom factor.
            zoom = self.dpi / 72
            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
            image = np.frombuffer(pix.samples, dtype=np.uint8)
            return image.reshape(pix.height, pix.width, pix.n)
        except Exception as e:
            print(f" Error rendering PDF: {e}")
            return None
        finally:
            # Close on every path, including exceptions raised while
            # rendering, so a failing page does not leak the open document.
            if doc is not None:
                doc.close()

    def detect_signatures(self, image: np.ndarray) -> list[dict]:
        """
        Detect signature regions in an image using YOLO.

        Args:
            image: RGB image as numpy array

        Returns:
            List of dicts sorted top-to-bottom then left-to-right, each with
            'box' (x, y, w, h), 'xyxy' (x1, y1, x2, y2) and 'confidence'
        """
        # NOTE(review): Ultralytics documents numpy inputs as BGR, while this
        # pipeline passes the page in the channel order it was rendered in —
        # confirm this matches how the model was trained.
        results = self.model(image, conf=self.conf_threshold, verbose=False)
        signatures = []
        for r in results:
            for box in r.boxes:
                x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().numpy())
                conf = float(box.conf[0].cpu().numpy())
                signatures.append({
                    'box': (x1, y1, x2 - x1, y2 - y1),  # x, y, w, h format
                    'xyxy': (x1, y1, x2, y2),
                    'confidence': conf,
                })
        # Sort by y-coordinate (top to bottom), then x-coordinate (left to right)
        signatures.sort(key=lambda s: (s['box'][1], s['box'][0]))
        return signatures

    def extract_signature_images(self, image: np.ndarray, signatures: list[dict]) -> list[np.ndarray]:
        """
        Crop signature regions from the image.

        Each box is clipped to the image bounds. The result always has one
        entry per input signature, in the same order; a box lying entirely
        outside the image yields an empty (zero-area) array.

        Args:
            image: RGB image as numpy array
            signatures: List of detected signatures

        Returns:
            List of cropped signature images (views into `image`)
        """
        img_h, img_w = image.shape[:2]
        cropped = []
        for sig in signatures:
            x, y, w, h = sig['box']
            left = max(0, x)
            top = max(0, y)
            # The far edges come from the unclamped origin: clamping first
            # would shift and enlarge a box that starts off-image. The outer
            # max() keeps a fully off-image box empty instead of letting a
            # negative end index wrap around.
            right = max(left, min(img_w, x + w))
            bottom = max(top, min(img_h, y + h))
            cropped.append(image[top:bottom, left:right])
        return cropped

    def create_visualization(self, image: np.ndarray, signatures: list[dict]) -> np.ndarray:
        """
        Create a visualization with detection boxes drawn on the image.

        Args:
            image: RGB image as numpy array
            signatures: List of detected signatures

        Returns:
            Copy of the image with a box and a "sigN: confidence" label drawn
            for every detection
        """
        vis = image.copy()
        font = cv2.FONT_HERSHEY_SIMPLEX
        font_scale = 0.8
        thickness = 2
        box_color = (255, 0, 0)  # red, since the image is RGB
        for i, sig in enumerate(signatures):
            x1, y1, x2, y2 = sig['xyxy']
            conf = sig['confidence']
            # Draw box
            cv2.rectangle(vis, (x1, y1), (x2, y2), box_color, 3)
            # Draw label
            label = f"sig{i+1}: {conf:.2f}"
            (text_w, text_h), _ = cv2.getTextSize(label, font, font_scale, thickness)
            label_h = text_h + 10
            if y1 - label_h >= 0:
                # Usual placement: the label sits on top of the box.
                label_top = y1 - label_h
                baseline_y = y1 - 5
            else:
                # No room above a box touching the top edge: put the label
                # just inside the box so it stays on the canvas.
                label_top = y1
                baseline_y = y1 + text_h + 5
            cv2.rectangle(vis, (x1, label_top), (x1 + text_w + 5, label_top + label_h), box_color, -1)
            cv2.putText(vis, label, (x1 + 2, baseline_y), font,
                        font_scale, (255, 255, 255), thickness)
        return vis
def find_pdf_file(filename: str) -> Optional[str]:
    """
    Locate a PDF by name inside the "batch_*" directories.

    The batch directories directly under PDF_BASE_PATH are visited in sorted
    order and the first one containing the file wins.

    Args:
        filename: PDF filename to search for

    Returns:
        Full path if found, None otherwise
    """
    candidates = (
        batch_dir / filename
        for batch_dir in sorted(Path(PDF_BASE_PATH).glob("batch_*"))
    )
    return next((str(path) for path in candidates if path.exists()), None)
def load_csv_samples(csv_path: str, sample_size: int = 50, seed: int = 42) -> list[dict]:
    """
    Load random samples from the CSV file.

    The draw is reproducible for a given seed and uses a private random
    generator, so the process-wide `random` state is left untouched.

    Args:
        csv_path: Path to master_signatures.csv
        sample_size: Number of samples to load; capped at the number of rows
        seed: Random seed for reproducibility

    Returns:
        List of row dictionaries (one per sampled CSV row, keyed by the
        header names)

    Raises:
        OSError: If the CSV file cannot be opened
        ValueError: If sample_size is negative
    """
    # newline="" lets the csv module do its own newline handling (needed for
    # quoted fields containing line breaks); the explicit encoding keeps the
    # result independent of the platform's locale default.
    with open(csv_path, "r", newline="", encoding="utf-8") as f:
        all_rows = list(csv.DictReader(f))

    # A dedicated generator seeded with `seed` draws exactly what
    # random.seed(seed) + random.sample() would, without reseeding the
    # global generator as a side effect.
    rng = random.Random(seed)
    return rng.sample(all_rows, min(sample_size, len(all_rows)))
def _save_rgb_image(path: str, rgb_image: np.ndarray) -> bool:
    """Write an RGB image to `path` with OpenCV; warn instead of failing silently."""
    # OpenCV writes BGR, so the channels are swapped first.
    written = cv2.imwrite(path, cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR))
    if not written:
        print(f" WARNING: could not write image: {path}")
    return written


def process_samples(extractor: YOLOSignatureExtractor, samples: list[dict],
                    output_dir: str, output_dir_no_stamp: Optional[str] = None,
                    save_visualization: bool = True) -> dict:
    """
    Process a list of PDF samples and extract signatures.

    For every sample the PDF is located, the requested page rendered,
    signatures detected, and each crop written as
    "<stem>_page<N>_sig<K>.png". Progress is printed as it goes.

    Args:
        extractor: YOLOSignatureExtractor instance
        samples: List of sample dictionaries from CSV; each needs the keys
            'filename' and 'page'
        output_dir: Output directory for signatures
        output_dir_no_stamp: Output directory for stamp-removed signatures (optional)
        save_visualization: Whether to save visualization images

    Returns:
        Results dictionary with statistics and per-file results. Entries in
        'files' are keyed by filename; if the same filename occurs again
        (e.g. another page of the same PDF) the later entry is stored under
        "<filename>#page<N>" instead of overwriting the earlier one.

    Raises:
        KeyError: If a sample lacks 'filename' or 'page'
        ValueError: If a sample's 'page' is not an integer
    """
    os.makedirs(output_dir, exist_ok=True)
    if save_visualization:
        os.makedirs(os.path.join(output_dir, "visualization"), exist_ok=True)
    # Create no-stamp output directory if specified
    if output_dir_no_stamp:
        os.makedirs(output_dir_no_stamp, exist_ok=True)

    results = {
        'timestamp': datetime.now().isoformat(),
        'total_samples': len(samples),
        'processed': 0,
        'pdf_not_found': 0,
        'render_failed': 0,
        'total_signatures': 0,
        'files': {}
    }
    file_results = results['files']

    for i, row in enumerate(samples):
        filename = row['filename']
        page_num = int(row['page'])
        base_name = Path(filename).stem
        print(f"[{i+1}/{len(samples)}] Processing: {filename}, page {page_num}...", end=' ', flush=True)

        # Keep one entry per sample: a repeated filename gets a page-qualified
        # key so it cannot clobber the entry recorded earlier.
        result_key = filename if filename not in file_results else f"{filename}#page{page_num}"

        # Find PDF
        pdf_path = find_pdf_file(filename)
        if pdf_path is None:
            print("PDF NOT FOUND")
            results['pdf_not_found'] += 1
            file_results[result_key] = {'status': 'pdf_not_found'}
            continue

        # Render page
        image = extractor.render_pdf_page(pdf_path, page_num)
        if image is None:
            print("RENDER FAILED")
            results['render_failed'] += 1
            file_results[result_key] = {'status': 'render_failed'}
            continue

        # Detect signatures
        signatures = extractor.detect_signatures(image)
        num_sigs = len(signatures)
        results['total_signatures'] += num_sigs
        results['processed'] += 1
        print(f"Found {num_sigs} signature(s)")

        # Extract and save signature crops
        crops = extractor.extract_signature_images(image, signatures)
        for j, crop in enumerate(crops):
            crop_filename = f"{base_name}_page{page_num}_sig{j+1}.png"
            if crop.size == 0:
                # A degenerate box has no pixels; colour conversion would
                # raise on it and abort the whole batch, so skip the file.
                # The index j still advances, keeping sigN aligned with the
                # 'signatures' list stored below.
                print(f" WARNING: empty crop skipped: {crop_filename}")
                continue
            _save_rgb_image(os.path.join(output_dir, crop_filename), crop)
            # Save stamp-removed version if output dir specified
            if output_dir_no_stamp:
                _save_rgb_image(
                    os.path.join(output_dir_no_stamp, crop_filename),
                    remove_red_stamp(crop),
                )

        # Save visualization
        if save_visualization and signatures:
            vis_image = extractor.create_visualization(image, signatures)
            vis_filename = f"{base_name}_page{page_num}_annotated.png"
            _save_rgb_image(os.path.join(output_dir, "visualization", vis_filename), vis_image)

        # Store file results
        file_results[result_key] = {
            'status': 'success',
            'page': page_num,
            'signatures': [
                {
                    'box': list(sig['box']),
                    'confidence': sig['confidence']
                }
                for sig in signatures
            ]
        }
    return results
def print_summary(results: dict):
    """
    Print the run statistics as a framed report on stdout.

    Args:
        results: Results dictionary with the counters 'total_samples',
            'processed', 'pdf_not_found', 'render_failed' and
            'total_signatures'
    """
    rule = "=" * 60
    print("\n" + rule)
    print("YOLO SIGNATURE EXTRACTION SUMMARY")
    print(rule)

    # (label, counter key) pairs, in display order.
    counter_rows = (
        ("Total samples", 'total_samples'),
        ("Successfully processed", 'processed'),
        ("PDFs not found", 'pdf_not_found'),
        ("Render failed", 'render_failed'),
        ("Total signatures found", 'total_signatures'),
    )
    for label, key in counter_rows:
        print(f"{label}: {results[key]}")

    # The average is only meaningful once at least one page was processed.
    processed = results['processed']
    if processed > 0:
        print(f"Average signatures/page: {results['total_signatures'] / processed:.2f}")
    print(rule)
def main():
    """
    Run the extraction pipeline end to end with the module-level settings.

    Prints the configuration, loads the model, draws 50 CSV rows with seed
    42, processes them (including stamp-removed copies and visualizations),
    writes results.json into OUTPUT_PATH and prints the summary.
    """
    rule = "=" * 60
    print(rule)
    print("YOLO Signature Extraction Pipeline")
    print(rule)
    settings = (
        ("Model", MODEL_PATH),
        ("CSV", CSV_PATH),
        ("Output (original)", OUTPUT_PATH),
        ("Output (no stamp)", OUTPUT_PATH_NO_STAMP),
        ("Confidence threshold", CONFIDENCE_THRESHOLD),
    )
    for label, value in settings:
        print(f"{label}: {value}")
    print(rule + "\n")

    # Model loading happens in the extractor's constructor.
    extractor = YOLOSignatureExtractor(MODEL_PATH, CONFIDENCE_THRESHOLD)

    print("\nLoading samples from CSV...")
    samples = load_csv_samples(CSV_PATH, sample_size=50, seed=42)
    print(f"Loaded {len(samples)} samples\n")

    # Crops are written twice: as detected, and with red stamps removed.
    results = process_samples(
        extractor,
        samples,
        OUTPUT_PATH,
        output_dir_no_stamp=OUTPUT_PATH_NO_STAMP,
        save_visualization=True,
    )

    # Persist the per-file results next to the crops.
    results_path = os.path.join(OUTPUT_PATH, "results.json")
    with open(results_path, 'w') as out_file:
        json.dump(results, out_file, indent=2)
    print(f"\nResults saved to: {results_path}")

    print_summary(results)
    print(f"\nStamp-removed signatures saved to: {OUTPUT_PATH_NO_STAMP}")
if __name__ == "__main__":
    # Script entry point: run the pipeline and turn any failure into exit
    # status 1, with a traceback for unexpected errors.
    try:
        main()
    except KeyboardInterrupt:
        print("\n\nProcess interrupted by user.")
        sys.exit(1)
    except Exception as exc:
        import traceback

        print(f"\n\nFATAL ERROR: {exc}")
        traceback.print_exc()
        sys.exit(1)