939a348da4
Paper draft includes all sections (Abstract through Conclusion), 36 references, and supporting scripts. Key methodology: Cosine similarity + dHash dual-method verification with thresholds calibrated against known-replication firm (Firm A). Includes: - 8 section markdown files (paper_a_*.md) - Ablation study script (ResNet-50 vs VGG-16 vs EfficientNet-B0) - Recalibrated classification script (84,386 PDFs, 5-tier system) - Figure generation and Word export scripts - Citation renumbering script ([1]-[36]) - Signature analysis pipeline (12 steps) - YOLO extraction scripts Three rounds of AI review completed (GPT-5.4, Claude Opus 4.6, Gemini 3 Pro). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
242 lines
7.0 KiB
Python
242 lines
7.0 KiB
Python
#!/usr/bin/env python3
"""
Step 2: Extract feature vectors from signature images with ResNet-50.

Preprocessing pipeline:
1. Load image (RGB)
2. Resize to 224x224 (aspect ratio preserved, white padding)
3. Normalize (ImageNet mean/std)
4. Forward pass through ResNet-50 (final classification layer removed)
5. L2 normalization
6. Output a 2048-dimensional feature vector
"""
|
||
|
||
import torch
|
||
import torch.nn as nn
|
||
import torchvision.models as models
|
||
import torchvision.transforms as transforms
|
||
from torch.utils.data import Dataset, DataLoader
|
||
import numpy as np
|
||
import cv2
|
||
import sqlite3
|
||
from pathlib import Path
|
||
from tqdm import tqdm
|
||
import warnings
|
||
warnings.filterwarnings('ignore')
|
||
|
||
# Path configuration
IMAGES_DIR = Path("/Volumes/NV2/PDF-Processing/yolo-signatures/images")
OUTPUT_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis")
DB_PATH = OUTPUT_DIR / "signature_analysis.db"
FEATURES_PATH = OUTPUT_DIR / "features"

# Model configuration
BATCH_SIZE = 64
NUM_WORKERS = 4
# Device preference: Apple MPS first, then CUDA, falling back to CPU.
DEVICE = torch.device("mps" if torch.backends.mps.is_available() else
                      "cuda" if torch.cuda.is_available() else "cpu")
|
||
|
||
|
||
class SignatureDataset(Dataset):
    """Dataset of signature crop images.

    Each item is read from disk, letterboxed onto a white 224x224
    canvas, optionally transformed, and returned together with its
    file name (used downstream to match features back to DB rows).
    """

    def __init__(self, image_paths: list, transform=None):
        self.image_paths = image_paths
        self.transform = transform

    def __len__(self) -> int:
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]

        # Load image; on read failure fall back to a blank white image
        # so one corrupt file does not abort the whole extraction run.
        img = cv2.imread(str(img_path))
        if img is None:
            img = np.ones((224, 224, 3), dtype=np.uint8) * 255
        else:
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # Letterbox to the model input size (aspect ratio preserved,
        # white padding).
        img = self.resize_with_padding(img, 224, 224)

        if self.transform:
            img = self.transform(img)

        return img, str(img_path.name)

    @staticmethod
    def resize_with_padding(img, target_w, target_h):
        """Resize *img* to fit target_w x target_h, padding with white.

        Aspect ratio is preserved; the resized image is pasted centered
        onto a white canvas of the target size.
        """
        h, w = img.shape[:2]

        # Scale factor that fits the image inside the target box.
        scale = min(target_w / w, target_h / h)
        # Clamp to >= 1 px: an extreme aspect ratio would otherwise
        # round one dimension down to 0, which cv2.resize rejects.
        new_w = max(1, int(w * scale))
        new_h = max(1, int(h * scale))

        resized = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)

        # White canvas, resized image centered on it.
        canvas = np.ones((target_h, target_w, 3), dtype=np.uint8) * 255
        x_offset = (target_w - new_w) // 2
        y_offset = (target_h - new_h) // 2
        canvas[y_offset:y_offset + new_h, x_offset:x_offset + new_w] = resized

        return canvas
|
||
|
||
|
||
class FeatureExtractor:
    """Wraps a headless ResNet-50 for batched signature embedding."""

    def __init__(self, device):
        self.device = device

        # Pretrained backbone; dropping the final FC layer makes the
        # forward pass emit pooled 2048-d features instead of logits.
        print(f"載入 ResNet-50 模型... (device: {device})")
        backbone = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
        self.model = nn.Sequential(*list(backbone.children())[:-1]).to(device)
        self.model.eval()

        # Standard ImageNet normalization, applied after ToTensor.
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ])

    @torch.no_grad()
    def extract_batch(self, images):
        """Embed one batch; returns an L2-normalized [B, 2048] ndarray."""
        out = self.model(images.to(self.device))
        out = out.squeeze(-1).squeeze(-1)  # [B, 2048]
        # Unit-length rows so cosine similarity reduces to a dot product.
        out = nn.functional.normalize(out, p=2, dim=1)
        return out.cpu().numpy()
|
||
|
||
|
||
def get_image_list_from_db():
    """Return all signature image file names from the database.

    Rows are ordered by signature_id so later feature-matrix rows line
    up with a deterministic file order.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()
        cursor.execute('SELECT image_filename FROM signatures ORDER BY signature_id')
        return [row[0] for row in cursor.fetchall()]
    finally:
        # Always release the connection, even if the query raises.
        conn.close()
|
||
|
||
|
||
def save_features_to_db(features_dict: dict):
    """Write feature vectors back into the signatures table.

    Args:
        features_dict: maps image_filename -> numpy feature vector;
            each vector is stored as a raw BLOB via ndarray.tobytes().
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()
        for filename, feature in tqdm(features_dict.items(), desc="寫入資料庫"):
            cursor.execute('''
                UPDATE signatures
                SET feature_vector = ?
                WHERE image_filename = ?
            ''', (feature.tobytes(), filename))
        conn.commit()
    finally:
        # Close even if an UPDATE fails; uncommitted work is rolled back.
        conn.close()
|
||
|
||
|
||
def main():
    """Run the extraction pipeline: list images, embed, persist results."""
    print("=" * 60)
    print("Step 2: ResNet-50 特徵向量提取")
    print("=" * 60)
    print(f"裝置: {DEVICE}")

    # Ensure the output directory exists
    FEATURES_PATH.mkdir(parents=True, exist_ok=True)

    # Fetch the list of image file names from the database
    print("從資料庫讀取圖片列表...")
    filenames = get_image_list_from_db()
    print(f"共 {len(filenames):,} 張圖片待處理")

    # Build the full path for every image
    image_paths = [IMAGES_DIR / f for f in filenames]

    # Initialize the feature extractor (downloads/loads pretrained weights)
    extractor = FeatureExtractor(DEVICE)

    # Dataset and loader; shuffle=False keeps batch order aligned with
    # the filename list read from the database.
    dataset = SignatureDataset(image_paths, transform=extractor.transform)
    dataloader = DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=NUM_WORKERS,
        pin_memory=True
    )

    # Extract features batch by batch
    print(f"\n開始提取特徵 (batch_size={BATCH_SIZE})...")
    all_features = []
    all_filenames = []

    for images, batch_filenames in tqdm(dataloader, desc="提取特徵"):
        features = extractor.extract_batch(images)
        all_features.append(features)
        all_filenames.extend(batch_filenames)

    # Stack per-batch arrays into one [N, 2048] matrix
    all_features = np.vstack(all_features)
    print(f"\n特徵矩陣形狀: {all_features.shape}")

    # Save as a .npy file (backup copy of the DB blobs)
    npy_path = FEATURES_PATH / "signature_features.npy"
    np.save(npy_path, all_features)
    print(f"特徵向量已儲存: {npy_path} ({all_features.nbytes / 1e9:.2f} GB)")

    # Save the filename list (row index -> file name, for later lookups)
    filenames_path = FEATURES_PATH / "signature_filenames.txt"
    with open(filenames_path, 'w') as f:
        for fn in all_filenames:
            f.write(fn + '\n')
    print(f"檔名列表已儲存: {filenames_path}")

    # Write the feature vectors back into the database
    print("\n更新資料庫中的特徵向量...")
    features_dict = dict(zip(all_filenames, all_features))
    save_features_to_db(features_dict)

    # Summary statistics
    print("\n" + "=" * 60)
    print("特徵提取完成")
    print("=" * 60)
    print(f"處理圖片數: {len(all_filenames):,}")
    print(f"特徵維度: {all_features.shape[1]}")
    print(f"特徵檔案: {npy_path}")
    print(f"檔案大小: {all_features.nbytes / 1e9:.2f} GB")

    # Quick sanity check of the feature value distribution
    print("\n特徵統計:")
    print(f" 平均值: {all_features.mean():.6f}")
    print(f" 標準差: {all_features.std():.6f}")
    print(f" 最小值: {all_features.min():.6f}")
    print(f" 最大值: {all_features.max():.6f}")

    # Verify L2 norms (all should be ~1.0 after normalization)
    norms = np.linalg.norm(all_features, axis=1)
    print(f" L2 norm: {norms.mean():.6f} ± {norms.std():.6f}")


if __name__ == "__main__":
    main()
|