# pdf_signature_extraction/signature_analysis/02_extract_features.py

#!/usr/bin/env python3
"""
Step 2: 使用 ResNet-50 提取簽名圖片的特徵向量

預處理流程:
1. 載入圖片 (RGB)
2. 縮放至 224x224（保持比例，填充白色）
3. 正規化 (ImageNet mean/std)
4. 通過 ResNet-50 (去掉最後分類層)
5. L2 正規化
6. 輸出 2048 維特徵向量
"""

import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import numpy as np
import cv2
import sqlite3
from pathlib import Path
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Filesystem locations: input images, output directory, database and feature dump
IMAGES_DIR = Path("/Volumes/NV2/PDF-Processing/yolo-signatures/images")
OUTPUT_DIR = Path("/Volumes/NV2/PDF-Processing/signature-analysis")
DB_PATH = OUTPUT_DIR.joinpath("signature_analysis.db")
FEATURES_PATH = OUTPUT_DIR.joinpath("features")

# Inference settings
BATCH_SIZE = 64
NUM_WORKERS = 4
# Device preference order: Apple MPS, then CUDA, then CPU as the fallback.
DEVICE = torch.device(
    "mps"
    if torch.backends.mps.is_available()
    else ("cuda" if torch.cuda.is_available() else "cpu")
)


class SignatureDataset(Dataset):
    """Dataset of signature images letterboxed onto a 224x224 white canvas.

    Each item is an ``(image, filename)`` pair: the image is loaded with
    OpenCV, converted to RGB, resized with its aspect ratio preserved,
    centred on a white canvas and finally passed through ``transform``
    when one is supplied.
    """

    def __init__(self, image_paths: list, transform=None):
        """Store the image paths and the optional per-image transform.

        Args:
            image_paths: Sequence of image paths (``pathlib.Path`` or ``str``).
            transform: Optional callable applied to the padded RGB uint8
                array before it is returned.
        """
        self.image_paths = image_paths
        self.transform = transform

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load, letterbox and transform the image at position ``idx``.

        Returns:
            Tuple ``(image, filename)`` where ``filename`` is the final path
            component of the image path.
        """
        img_path = self.image_paths[idx]

        img = cv2.imread(str(img_path))
        if img is None:
            # Unreadable or missing file: substitute a blank white image so a
            # single bad file does not abort the whole batch. NOTE: the
            # resulting feature vector then describes a blank image.
            img = np.full((224, 224, 3), 255, dtype=np.uint8)
        else:
            # OpenCV decodes to BGR; the rest of the pipeline uses RGB.
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        img = self.resize_with_padding(img, 224, 224)

        if self.transform is not None:
            img = self.transform(img)

        # Path(...) lets callers pass plain strings as well as Path objects.
        return img, Path(img_path).name

    @staticmethod
    def resize_with_padding(img, target_w, target_h):
        """Resize ``img`` to fit ``target_w`` x ``target_h`` and pad with white.

        The aspect ratio is preserved and the resized image is centred on a
        white 3-channel uint8 canvas.

        Args:
            img: Image array of shape ``(h, w, 3)``.
            target_w: Canvas width in pixels.
            target_h: Canvas height in pixels.

        Returns:
            Array of shape ``(target_h, target_w, 3)`` and dtype ``uint8``.
        """
        h, w = img.shape[:2]

        scale = min(target_w / w, target_h / h)
        # int() truncates, so an extreme aspect ratio (e.g. 1x1000) would give
        # a zero-sized dimension, which cv2.resize rejects; keep at least 1 px.
        new_w = max(1, int(w * scale))
        new_h = max(1, int(h * scale))

        resized = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)

        canvas = np.full((target_h, target_w, 3), 255, dtype=np.uint8)

        # Centre the resized image on the canvas.
        x_offset = (target_w - new_w) // 2
        y_offset = (target_h - new_h) // 2
        canvas[y_offset:y_offset + new_h, x_offset:x_offset + new_w] = resized

        return canvas


class FeatureExtractor:
    """ResNet-50 backbone that maps image batches to L2-normalised features."""

    def __init__(self, device):
        """Load the pretrained network on ``device`` and build the transform.

        Args:
            device: Torch device the model is moved to and batches are sent to.
        """
        self.device = device

        print(f"載入 ResNet-50 模型... (device: {device})")
        resnet = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)

        # Keep every child module except the last one (the classifier layer)
        # so the network outputs features instead of class scores.
        layers = list(resnet.children())
        backbone = nn.Sequential(*layers[:-1]).to(device)
        backbone.eval()
        self.model = backbone

        # Convert to a tensor, then normalise with ImageNet mean/std.
        imagenet_norm = transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        )
        self.transform = transforms.Compose([transforms.ToTensor(), imagenet_norm])

    def extract_batch(self, images):
        """Return L2-normalised features for one batch as a NumPy array.

        Args:
            images: Batch tensor accepted by the model.

        Returns:
            NumPy array with one unit-length row per input image.
        """
        with torch.no_grad():
            on_device = images.to(self.device)
            pooled = self.model(on_device)
            # Drop the two trailing singleton spatial dims -> [B, 2048]
            flat = pooled.squeeze(-1).squeeze(-1)
            # Scale each row to unit L2 norm
            unit = nn.functional.normalize(flat, p=2, dim=1)
            return unit.cpu().numpy()


def get_image_list_from_db(db_path=None):
    """Return every image filename in the ``signatures`` table.

    Args:
        db_path: Path of the SQLite database. Defaults to the module-level
            ``DB_PATH`` when omitted.

    Returns:
        List of ``image_filename`` values ordered by ``signature_id``.
    """
    if db_path is None:
        db_path = DB_PATH

    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute(
            'SELECT image_filename FROM signatures ORDER BY signature_id'
        ).fetchall()
    finally:
        # Close the connection even when the query raises.
        conn.close()

    return [row[0] for row in rows]


def save_features_to_db(features_dict: dict, db_path=None):
    """Write feature vectors into the ``signatures`` table.

    Each vector is stored as the raw bytes produced by ``tobytes()`` in the
    ``feature_vector`` column of the row whose ``image_filename`` matches the
    dictionary key. Dtype and shape are not stored alongside the blob, so
    readers must know them.

    Args:
        features_dict: Mapping of image filename to feature array.
        db_path: Path of the SQLite database. Defaults to the module-level
            ``DB_PATH`` when omitted.
    """
    if db_path is None:
        db_path = DB_PATH

    conn = sqlite3.connect(db_path)
    try:
        # Lazily yield (blob, filename) parameter pairs while tqdm reports
        # progress, so all updates go through a single executemany call.
        rows = (
            (feature.tobytes(), filename)
            for filename, feature in tqdm(features_dict.items(), desc="寫入資料庫")
        )
        conn.executemany('''
            UPDATE signatures
            SET feature_vector = ?
            WHERE image_filename = ?
        ''', rows)
        # Commit only after every update succeeded; on error the connection
        # is closed without committing, leaving the table unchanged.
        conn.commit()
    finally:
        conn.close()


def main():
    """Extract ResNet-50 features for all signature images and persist them.

    Steps:
        1. Read the image filenames from the database.
        2. Run every image through the feature extractor in batches.
        3. Save the feature matrix as ``.npy`` and the filename order as a
           text file under ``FEATURES_PATH``.
        4. Write each feature vector back into the database.
        5. Print summary statistics, including an L2-norm sanity check.
    """
    print("=" * 60)
    print("Step 2: ResNet-50 特徵向量提取")
    print("=" * 60)
    print(f"裝置: {DEVICE}")

    # Make sure the output directory exists
    FEATURES_PATH.mkdir(parents=True, exist_ok=True)

    # Fetch the image list from the database
    print("從資料庫讀取圖片列表...")
    filenames = get_image_list_from_db()
    print(f"共 {len(filenames):,} 張圖片待處理")

    if not filenames:
        # Nothing to do; np.vstack([]) further down would raise ValueError.
        print("沒有圖片需要處理，結束。")
        return

    # Build the full image paths
    image_paths = [IMAGES_DIR / f for f in filenames]

    # Initialise the feature extractor
    extractor = FeatureExtractor(DEVICE)

    # Dataset and loader; shuffle stays off so the output order matches the
    # order of the filenames read from the database.
    dataset = SignatureDataset(image_paths, transform=extractor.transform)
    dataloader = DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=NUM_WORKERS,
        # Pinned host memory only speeds up transfers to CUDA devices.
        pin_memory=(DEVICE.type == "cuda"),
    )

    # Extract features batch by batch
    print(f"\n開始提取特徵 (batch_size={BATCH_SIZE})...")
    all_features = []
    all_filenames = []

    for images, batch_filenames in tqdm(dataloader, desc="提取特徵"):
        features = extractor.extract_batch(images)
        all_features.append(features)
        all_filenames.extend(batch_filenames)

    # Stack the per-batch arrays into one matrix
    all_features = np.vstack(all_features)
    print(f"\n特徵矩陣形狀: {all_features.shape}")

    # Save as a NumPy file (backup)
    npy_path = FEATURES_PATH / "signature_features.npy"
    np.save(npy_path, all_features)
    print(f"特徵向量已儲存: {npy_path} ({all_features.nbytes / 1e9:.2f} GB)")

    # Save the filename order so rows of the matrix can be mapped back later.
    # Explicit UTF-8 keeps non-ASCII filenames intact regardless of locale.
    filenames_path = FEATURES_PATH / "signature_filenames.txt"
    with open(filenames_path, 'w', encoding='utf-8') as f:
        f.writelines(f"{fn}\n" for fn in all_filenames)
    print(f"檔名列表已儲存: {filenames_path}")

    # Update the database
    print("\n更新資料庫中的特徵向量...")
    features_dict = dict(zip(all_filenames, all_features))
    save_features_to_db(features_dict)

    # Summary
    print("\n" + "=" * 60)
    print("特徵提取完成")
    print("=" * 60)
    print(f"處理圖片數: {len(all_filenames):,}")
    print(f"特徵維度: {all_features.shape[1]}")
    print(f"特徵檔案: {npy_path}")
    print(f"檔案大小: {all_features.nbytes / 1e9:.2f} GB")

    # Quick sanity statistics
    print("\n特徵統計:")
    print(f"  平均值: {all_features.mean():.6f}")
    print(f"  標準差: {all_features.std():.6f}")
    print(f"  最小值: {all_features.min():.6f}")
    print(f"  最大值: {all_features.max():.6f}")

    # L2 norm check (every row should be 1.0)
    norms = np.linalg.norm(all_features, axis=1)
    print(f"  L2 norm: {norms.mean():.6f} ± {norms.std():.6f}")


# Run the extraction pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()