- Add complete experiments directory with pilot study infrastructure
  - 5 experimental conditions (direct, expert-only, attribute-only, full-pipeline, random-perspective)
  - Human assessment tool with React frontend and FastAPI backend
  - AUT flexibility analysis with jump signal detection
  - Result visualization and metrics computation
- Add novelty-driven agent loop module (experiments/novelty_loop/)
  - NoveltyDrivenTaskAgent with expert perspective perturbation
  - Three termination strategies: breakthrough, exhaust, coverage
  - Interactive CLI demo with colored output
  - Embedding-based novelty scoring
- Add DDC knowledge domain classification data (en/zh)
- Add CLAUDE.md project documentation
- Update research report with experiment findings

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
"""
|
|
Embedding Service - generates embeddings and performs similarity-based deduplication
|
|
|
|
使用 Ollama 的 embedding 端點生成向量,並透過餘弦相似度進行去重分組。
|
|
"""

import logging
from typing import List, Optional

import httpx
import numpy as np

from ..config import settings
from ..models.schemas import (
    ExpertTransformationDescription,
    DeduplicationResult,
    DeduplicationMethod,
    DescriptionGroup,
)

logger = logging.getLogger(__name__)


class EmbeddingService:
    """Embedding service: generates vectors and performs similarity-based deduplication."""

    def __init__(self):
        self.base_url = settings.ollama_base_url
        self.default_model = "qwen3-embedding:4b"  # Qwen3 embedding model for better semantic understanding
        self.client = httpx.AsyncClient(timeout=120.0)

    async def get_embedding(self, text: str, model: Optional[str] = None) -> List[float]:
        """Fetch the embedding vector for a single text."""
        model = model or self.default_model
        url = f"{self.base_url}/api/embed"

        try:
            response = await self.client.post(url, json={
                "model": model,
                "input": text
            })
            response.raise_for_status()
            result = response.json()
            return result["embeddings"][0]
        except httpx.HTTPStatusError as e:
            logger.error(f"Embedding API error: {e.response.status_code} - {e.response.text}")
            raise
        except Exception as e:
            logger.error(f"Embedding error: {e}")
            raise

    async def get_embeddings_batch(
        self,
        texts: List[str],
        model: Optional[str] = None
    ) -> List[List[float]]:
        """Fetch embedding vectors for multiple texts in a single batch."""
        if not texts:
            return []

        model = model or self.default_model
        url = f"{self.base_url}/api/embed"

        try:
            # Ollama's /api/embed endpoint accepts a list of inputs
            response = await self.client.post(url, json={
                "model": model,
                "input": texts
            })
            response.raise_for_status()
            result = response.json()
            return result["embeddings"]
        except httpx.HTTPStatusError as e:
            logger.error(f"Batch embedding API error: {e.response.status_code} - {e.response.text}")
            # If the batch request fails, fall back to one request per text
            logger.info("Falling back to single embedding requests...")
            embeddings = []
            for text in texts:
                emb = await self.get_embedding(text, model)
                embeddings.append(emb)
            return embeddings
        except Exception as e:
            logger.error(f"Batch embedding error: {e}")
            raise

    def cosine_similarity(self, a: List[float], b: List[float]) -> float:
        """Compute the cosine similarity of two vectors."""
        a_np = np.array(a)
        b_np = np.array(b)
        norm_a = np.linalg.norm(a_np)
        norm_b = np.linalg.norm(b_np)
        if norm_a == 0 or norm_b == 0:
            return 0.0
        return float(np.dot(a_np, b_np) / (norm_a * norm_b))
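
    # Worked example (illustrative, not part of the service API):
    #   cosine_similarity([1, 0], [1, 1]) = (1*1 + 0*1) / (1 * sqrt(2)) ≈ 0.7071,
    # i.e. the cosine of the 45° angle between the two vectors. Identical
    # directions give 1.0; orthogonal vectors give 0.0.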

    def build_similarity_matrix(
        self,
        embeddings: List[List[float]]
    ) -> np.ndarray:
        """Build the pairwise cosine-similarity matrix."""
        n = len(embeddings)
        matrix = np.zeros((n, n))

        for i in range(n):
            matrix[i][i] = 1.0  # an item's similarity with itself is 1
            for j in range(i + 1, n):
                sim = self.cosine_similarity(embeddings[i], embeddings[j])
                matrix[i][j] = sim
                matrix[j][i] = sim

        return matrix

    def cluster_by_similarity(
        self,
        similarity_matrix: np.ndarray,
        threshold: float
    ) -> List[List[int]]:
        """
        Greedy clustering: group items whose similarity >= threshold.

        Algorithm:
        1. Start from the first unassigned item
        2. Find every item whose similarity to it is >= threshold
        3. Put them in the same group
        4. Repeat until every item has been assigned

        Returns:
            List[List[int]]: each sub-list holds the indices of one group
        """
        n = len(similarity_matrix)
        assigned = [False] * n
        groups = []

        for i in range(n):
            if assigned[i]:
                continue

            # Start a new group with item i as its representative
            group = [i]
            assigned[i] = True

            # Collect every unassigned item similar enough to i
            for j in range(i + 1, n):
                if not assigned[j] and similarity_matrix[i][j] >= threshold:
                    group.append(j)
                    assigned[j] = True

            groups.append(group)

        return groups
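
    # Illustrative sketch (assumes a hypothetical instance
    # service = EmbeddingService()): with three items where 0 and 1 are
    # near-duplicates, a 0.85 threshold yields two groups:
    #
    #   sim = np.array([[1.0, 0.90, 0.20],
    #                   [0.90, 1.0, 0.30],
    #                   [0.20, 0.30, 1.0]])
    #   service.cluster_by_similarity(sim, threshold=0.85)  # -> [[0, 1], [2]]
    #
    # Note the greedy pass compares candidates only against each group's
    # representative, so group membership can depend on input order.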

    async def deduplicate(
        self,
        descriptions: List[ExpertTransformationDescription],
        threshold: float = 0.85,
        model: Optional[str] = None
    ) -> DeduplicationResult:
        """
        Main deduplication entry point.

        Args:
            descriptions: the descriptions to deduplicate
            threshold: similarity threshold (0.0-1.0), default 0.85
            model: embedding model name

        Returns:
            DeduplicationResult: the deduplication result, including group info
        """
        model = model or self.default_model

        # Handle empty input
        if not descriptions:
            return DeduplicationResult(
                total_input=0,
                total_groups=0,
                total_duplicates=0,
                groups=[],
                threshold_used=threshold,
                method_used=DeduplicationMethod.EMBEDDING,
                model_used=model
            )

        # Extract the description texts
        texts = [d.description for d in descriptions]
        logger.info(f"Generating embeddings for {len(texts)} descriptions using model '{model}'...")

        # Fetch embeddings in a single batch
        try:
            embeddings = await self.get_embeddings_batch(texts, model)
        except Exception as e:
            logger.error(f"Failed to generate embeddings: {e}")
            raise ValueError(f"Embedding generation failed: {e}. Make sure the model '{model}' is installed (run: ollama pull {model})")

        # Build the similarity matrix
        logger.info("Building similarity matrix...")
        sim_matrix = self.build_similarity_matrix(embeddings)

        # Cluster
        logger.info(f"Clustering with threshold {threshold}...")
        clusters = self.cluster_by_similarity(sim_matrix, threshold)

        # Assemble the result groups
        result_groups = []
        total_duplicates = 0

        for group_idx, indices in enumerate(clusters):
            if len(indices) == 1:
                # Standalone item - no duplicates
                result_groups.append(DescriptionGroup(
                    group_id=f"group-{group_idx}",
                    representative=descriptions[indices[0]],
                    duplicates=[],
                    similarity_scores=[]
                ))
            else:
                # Group with duplicates - the first item serves as representative
                rep_idx = indices[0]
                dup_indices = indices[1:]
                dup_scores = [
                    float(sim_matrix[rep_idx][idx])
                    for idx in dup_indices
                ]

                result_groups.append(DescriptionGroup(
                    group_id=f"group-{group_idx}",
                    representative=descriptions[rep_idx],
                    duplicates=[descriptions[idx] for idx in dup_indices],
                    similarity_scores=dup_scores
                ))
                total_duplicates += len(dup_indices)

        logger.info(f"Deduplication complete: {len(descriptions)} -> {len(result_groups)} groups, {total_duplicates} duplicates found")

        return DeduplicationResult(
            total_input=len(descriptions),
            total_groups=len(result_groups),
            total_duplicates=total_duplicates,
            groups=result_groups,
            threshold_used=threshold,
            method_used=DeduplicationMethod.EMBEDDING,
            model_used=model
        )

    async def close(self):
        """Close the underlying HTTP client."""
        await self.client.aclose()


# Global instance
embedding_service = EmbeddingService()
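

# Minimal usage sketch (illustrative; assumes a local Ollama server reachable
# at settings.ollama_base_url with the default model pulled via
# `ollama pull qwen3-embedding:4b`). It drives the lower-level pipeline with
# plain strings, so no schema objects are needed:
#
#   import asyncio
#
#   async def demo():
#       texts = ["a red apple", "an apple that is red", "a blue car"]
#       embs = await embedding_service.get_embeddings_batch(texts)
#       matrix = embedding_service.build_similarity_matrix(embs)
#       print(embedding_service.cluster_by_similarity(matrix, threshold=0.85))
#       await embedding_service.close()
#
#   asyncio.run(demo())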