feat: Add experiments framework and novelty-driven agent loop
- Add complete experiments directory with pilot study infrastructure
  - 5 experimental conditions (direct, expert-only, attribute-only, full-pipeline, random-perspective)
  - Human assessment tool with React frontend and FastAPI backend
  - AUT flexibility analysis with jump signal detection
  - Result visualization and metrics computation
- Add novelty-driven agent loop module (experiments/novelty_loop/)
  - NoveltyDrivenTaskAgent with expert perspective perturbation
  - Three termination strategies: breakthrough, exhaust, coverage
  - Interactive CLI demo with colored output
  - Embedding-based novelty scoring
- Add DDC knowledge domain classification data (en/zh)
- Add CLAUDE.md project documentation
- Update research report with experiment findings

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
experiments/compute_metrics.py (666 lines, new file)
@@ -0,0 +1,666 @@
"""
Compute metrics for experiment results.

Computes metrics BOTH before and after deduplication:
- Pre-dedup: Measures raw generation capability
- Post-dedup: Measures quality of unique ideas

Also normalizes idea counts for fair cross-condition comparison.

Usage:
    python -m experiments.compute_metrics --input results/experiment_xxx_deduped.json
"""

import sys
import json
import argparse
import asyncio
import logging
import random
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass, asdict

import numpy as np

# Add backend to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / "backend"))

from app.services.embedding_service import embedding_service
from app.services.llm_service import ollama_provider, extract_json_from_response
from experiments.config import RESULTS_DIR, MODEL, RANDOM_SEED

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


@dataclass
class DiversityMetrics:
    """Semantic diversity metrics for a set of ideas."""
    mean_pairwise_distance: float
    std_pairwise_distance: float
    min_pairwise_distance: float
    max_pairwise_distance: float
    idea_count: int


@dataclass
class ClusterMetrics:
    """Cluster analysis metrics."""
    optimal_clusters: int
    silhouette_score: float
    cluster_sizes: List[int]


@dataclass
class QueryDistanceMetrics:
    """Metrics for distance from the original query."""
    mean_distance: float
    std_distance: float
    min_distance: float
    max_distance: float
    distances: List[float]


@dataclass
class RelevanceMetrics:
    """LLM-as-judge relevance metrics (for hallucination detection)."""
    relevance_rate: float  # Score >= 2
    nonsense_rate: float  # Score == 1
    mean_score: float
    score_distribution: Dict[int, int]  # {1: count, 2: count, 3: count}


@dataclass
class ConditionMetrics:
    """All metrics for a single condition."""
    condition: str
    query: str

    # Idea counts
    raw_count: int
    unique_count: int
    survival_rate: float

    # Pre-dedup metrics (on raw ideas)
    pre_dedup_diversity: Optional[DiversityMetrics]

    # Post-dedup metrics (on unique ideas)
    post_dedup_diversity: Optional[DiversityMetrics]
    post_dedup_clusters: Optional[ClusterMetrics]
    post_dedup_query_distance: Optional[QueryDistanceMetrics]

    # Normalized metrics (on equal-sized samples)
    normalized_diversity: Optional[DiversityMetrics]
    normalized_sample_size: int

    # Relevance/hallucination (post-dedup only)
    relevance: Optional[RelevanceMetrics]


# ============================================================
# Embedding-based metrics
# ============================================================

async def get_embeddings(texts: List[str]) -> List[List[float]]:
    """Get embeddings for a list of texts."""
    if not texts:
        return []
    return await embedding_service.get_embeddings_batch(texts)


def compute_pairwise_distances(embeddings: List[List[float]]) -> List[float]:
    """Compute all pairwise cosine distances."""
    n = len(embeddings)
    if n < 2:
        return []

    distances = []
    for i in range(n):
        for j in range(i + 1, n):
            sim = embedding_service.cosine_similarity(embeddings[i], embeddings[j])
            dist = 1 - sim  # Convert similarity to distance
            distances.append(dist)

    return distances
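
# Interpretation note: each distance is 1 - cosine_similarity, so n ideas yield
# n * (n - 1) / 2 values (e.g. 10 unique ideas give 45 pairwise distances);
# their mean/std/min/max populate DiversityMetrics below.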


async def compute_diversity_metrics(ideas: List[str]) -> Optional[DiversityMetrics]:
    """Compute semantic diversity metrics for a set of ideas."""
    if len(ideas) < 2:
        return None

    embeddings = await get_embeddings(ideas)
    distances = compute_pairwise_distances(embeddings)

    if not distances:
        return None

    return DiversityMetrics(
        mean_pairwise_distance=float(np.mean(distances)),
        std_pairwise_distance=float(np.std(distances)),
        min_pairwise_distance=float(np.min(distances)),
        max_pairwise_distance=float(np.max(distances)),
        idea_count=len(ideas)
    )


async def compute_query_distance_metrics(
    query: str,
    ideas: List[str]
) -> Optional[QueryDistanceMetrics]:
    """Compute distance of ideas from the original query."""
    if not ideas:
        return None

    # Get query embedding
    query_emb = await embedding_service.get_embedding(query)
    idea_embs = await get_embeddings(ideas)

    distances = []
    for emb in idea_embs:
        sim = embedding_service.cosine_similarity(query_emb, emb)
        dist = 1 - sim
        distances.append(dist)

    return QueryDistanceMetrics(
        mean_distance=float(np.mean(distances)),
        std_distance=float(np.std(distances)),
        min_distance=float(np.min(distances)),
        max_distance=float(np.max(distances)),
        distances=distances
    )


async def compute_cluster_metrics(ideas: List[str]) -> Optional[ClusterMetrics]:
    """Compute cluster analysis metrics."""
    if len(ideas) < 3:
        return None

    try:
        from sklearn.cluster import KMeans
        from sklearn.metrics import silhouette_score
    except ImportError:
        logger.warning("sklearn not installed, skipping cluster metrics")
        return None

    embeddings = await get_embeddings(ideas)
    embeddings_np = np.array(embeddings)

    # Find optimal k using silhouette score
    max_k = min(len(ideas) - 1, 10)
    if max_k < 2:
        return None

    best_k = 2
    best_score = -1

    for k in range(2, max_k + 1):
        try:
            kmeans = KMeans(n_clusters=k, random_state=RANDOM_SEED, n_init=10)
            labels = kmeans.fit_predict(embeddings_np)
            score = silhouette_score(embeddings_np, labels)
            if score > best_score:
                best_score = score
                best_k = k
        except Exception as e:
            logger.warning(f"Clustering failed for k={k}: {e}")
            continue

    # Get cluster sizes for optimal k
    kmeans = KMeans(n_clusters=best_k, random_state=RANDOM_SEED, n_init=10)
    labels = kmeans.fit_predict(embeddings_np)
    cluster_sizes = [int(np.sum(labels == i)) for i in range(best_k)]

    return ClusterMetrics(
        optimal_clusters=best_k,
        silhouette_score=float(best_score),
        cluster_sizes=sorted(cluster_sizes, reverse=True)
    )
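
# Note: silhouette_score lies in [-1, 1] (higher = tighter, better-separated
# clusters), so initializing best_score to -1 accepts the first k that clusters
# successfully; k is capped at min(len(ideas) - 1, 10) in the loop above.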


# ============================================================
# LLM-as-Judge relevance metrics
# ============================================================

async def judge_relevance(query: str, idea: str, model: str = None) -> Dict[str, Any]:
    """Use LLM to judge if an idea is relevant to the query."""
    model = model or MODEL

    prompt = f"""/no_think
You are evaluating whether a generated idea is relevant and applicable to an original query.

Original query: {query}
Generated idea: {idea}

Rate the relevance on a scale of 1-3:
1 = Nonsense/completely irrelevant (no logical connection to the query)
2 = Weak but valid connection (requires a stretch but has some relevance)
3 = Clearly relevant and applicable (directly relates to the query)

Return JSON only:
{{"score": N, "reason": "brief explanation (10-20 words)"}}
"""

    try:
        response = await ollama_provider.generate(
            prompt=prompt,
            model=model,
            temperature=0.3  # Lower temperature for more consistent judgments
        )
        result = extract_json_from_response(response)
        return {
            "score": result.get("score", 2),
            "reason": result.get("reason", "")
        }
    except Exception as e:
        logger.warning(f"Relevance judgment failed: {e}")
        return {"score": 2, "reason": "judgment failed"}


async def compute_relevance_metrics(
    query: str,
    ideas: List[str],
    model: str = None,
    sample_size: int = None
) -> Optional[RelevanceMetrics]:
    """Compute LLM-as-judge relevance metrics for ideas."""
    if not ideas:
        return None

    # Optionally sample to reduce API calls
    if sample_size and len(ideas) > sample_size:
        rng = random.Random(RANDOM_SEED)
        ideas_to_judge = rng.sample(ideas, sample_size)
    else:
        ideas_to_judge = ideas

    scores = []
    for idea in ideas_to_judge:
        result = await judge_relevance(query, idea, model)
        scores.append(result["score"])

    # Compute distribution
    distribution = {1: 0, 2: 0, 3: 0}
    for s in scores:
        if s in distribution:
            distribution[s] += 1

    nonsense_count = distribution[1]
    relevant_count = distribution[2] + distribution[3]

    return RelevanceMetrics(
        relevance_rate=relevant_count / len(scores) if scores else 0,
        nonsense_rate=nonsense_count / len(scores) if scores else 0,
        mean_score=float(np.mean(scores)) if scores else 0,
        score_distribution=distribution
    )
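
# Caveat: judge_relevance falls back to score 2 when the LLM call or JSON parsing
# fails, so failed judgments count toward relevance_rate rather than nonsense_rate;
# relevance_rate + nonsense_rate equals 1 only when every score is in {1, 2, 3}.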


# ============================================================
# Main metrics computation
# ============================================================

async def compute_condition_metrics(
    query: str,
    condition: str,
    raw_ideas: List[str],
    unique_ideas: List[str],
    normalized_sample_size: int,
    compute_relevance: bool = False
) -> ConditionMetrics:
    """Compute all metrics for a single condition."""

    raw_count = len(raw_ideas)
    unique_count = len(unique_ideas)
    survival_rate = unique_count / raw_count if raw_count > 0 else 1.0

    logger.info(f" Computing metrics for {condition}...")
    logger.info(f" Raw: {raw_count}, Unique: {unique_count}, Survival: {survival_rate:.1%}")

    # Pre-dedup diversity (on raw ideas)
    logger.info(" Computing pre-dedup diversity...")
    pre_dedup_diversity = await compute_diversity_metrics(raw_ideas)

    # Post-dedup diversity (on unique ideas)
    logger.info(" Computing post-dedup diversity...")
    post_dedup_diversity = await compute_diversity_metrics(unique_ideas)

    # Cluster analysis (post-dedup)
    logger.info(" Computing cluster metrics...")
    post_dedup_clusters = await compute_cluster_metrics(unique_ideas)

    # Query distance (post-dedup)
    logger.info(" Computing query distance...")
    post_dedup_query_distance = await compute_query_distance_metrics(query, unique_ideas)

    # Normalized diversity (equal-sized sample for fair comparison)
    normalized_diversity = None
    if len(unique_ideas) >= normalized_sample_size and normalized_sample_size > 1:
        logger.info(f" Computing normalized diversity (n={normalized_sample_size})...")
        rng = random.Random(RANDOM_SEED)
        sampled_ideas = rng.sample(unique_ideas, normalized_sample_size)
        normalized_diversity = await compute_diversity_metrics(sampled_ideas)

    # Relevance metrics (optional, expensive)
    relevance = None
    if compute_relevance and unique_ideas:
        logger.info(" Computing relevance metrics (LLM-as-judge)...")
        # Sample up to 10 ideas to reduce cost
        relevance = await compute_relevance_metrics(
            query, unique_ideas, sample_size=min(10, len(unique_ideas))
        )

    return ConditionMetrics(
        condition=condition,
        query=query,
        raw_count=raw_count,
        unique_count=unique_count,
        survival_rate=survival_rate,
        pre_dedup_diversity=pre_dedup_diversity,
        post_dedup_diversity=post_dedup_diversity,
        post_dedup_clusters=post_dedup_clusters,
        post_dedup_query_distance=post_dedup_query_distance,
        normalized_diversity=normalized_diversity,
        normalized_sample_size=normalized_sample_size,
        relevance=relevance
    )


async def process_experiment_results(
    input_file: Path,
    output_file: Optional[Path] = None,
    compute_relevance: bool = False
) -> Dict[str, Any]:
    """
    Process experiment results and compute all metrics.

    Args:
        input_file: Path to deduped experiment results JSON
        output_file: Path for output (default: input with _metrics suffix)
        compute_relevance: Whether to compute LLM-as-judge relevance

    Returns:
        Results with computed metrics
    """
    # Load experiment results
    with open(input_file, "r", encoding="utf-8") as f:
        experiment = json.load(f)

    logger.info(f"Processing experiment: {experiment.get('experiment_id', 'unknown')}")

    # Determine normalized sample size (minimum unique count across all conditions)
    min_unique_count = float('inf')
    for query_result in experiment["results"]:
        for condition, cond_result in query_result["conditions"].items():
            if cond_result.get("success", False):
                dedup = cond_result.get("dedup", {})
                unique_count = len(dedup.get("unique_ideas", cond_result.get("ideas", [])))
                if unique_count > 0:
                    min_unique_count = min(min_unique_count, unique_count)

    normalized_sample_size = min(int(min_unique_count), 10) if min_unique_count != float('inf') else 5
    logger.info(f"Normalized sample size: {normalized_sample_size}")
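
    # Worked example: if the smallest successful condition has 8 unique ideas,
    # normalized_sample_size = min(8, 10) = 8; if no successful condition yields
    # any unique ideas, the fallback value of 5 is used.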

    # Process each query
    all_metrics = []

    for query_result in experiment["results"]:
        query = query_result["query"]
        query_id = query_result["query_id"]

        logger.info(f"\nProcessing query: {query} ({query_id})")

        query_metrics = {
            "query_id": query_id,
            "query": query,
            "conditions": {}
        }

        for condition, cond_result in query_result["conditions"].items():
            if not cond_result.get("success", False):
                logger.warning(f" Skipping failed condition: {condition}")
                continue

            # Get raw and unique ideas
            raw_ideas = cond_result.get("ideas", [])
            dedup = cond_result.get("dedup", {})
            unique_ideas = dedup.get("unique_ideas", raw_ideas)

            # Compute metrics
            metrics = await compute_condition_metrics(
                query=query,
                condition=condition,
                raw_ideas=raw_ideas,
                unique_ideas=unique_ideas,
                normalized_sample_size=normalized_sample_size,
                compute_relevance=compute_relevance
            )

            # Convert to dict for JSON serialization
            query_metrics["conditions"][condition] = asdict(metrics)

        all_metrics.append(query_metrics)

    # Calculate aggregate statistics
    aggregate = calculate_aggregate_metrics(all_metrics)

    # Build output
    output = {
        "experiment_id": experiment.get("experiment_id"),
        "config": experiment.get("config"),
        "normalized_sample_size": normalized_sample_size,
        "metrics_by_query": all_metrics,
        "aggregate": aggregate
    }

    # Save results
    if output_file is None:
        stem = input_file.stem.replace("_deduped", "").replace("_complete", "")
        output_file = input_file.parent / f"{stem}_metrics.json"

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

    logger.info(f"\nMetrics saved to: {output_file}")

    return output


def calculate_aggregate_metrics(all_metrics: List[Dict]) -> Dict[str, Any]:
    """Calculate aggregate statistics across all queries."""
    aggregate = {}

    # Collect metrics by condition
    by_condition = {}

    for query_metrics in all_metrics:
        for condition, metrics in query_metrics["conditions"].items():
            if condition not in by_condition:
                by_condition[condition] = {
                    "raw_counts": [],
                    "unique_counts": [],
                    "survival_rates": [],
                    "pre_dedup_diversity": [],
                    "post_dedup_diversity": [],
                    "normalized_diversity": [],
                    "query_distances": [],
                    "cluster_counts": [],
                    "silhouette_scores": [],
                    "relevance_rates": [],
                    "nonsense_rates": []
                }

            bc = by_condition[condition]
            bc["raw_counts"].append(metrics["raw_count"])
            bc["unique_counts"].append(metrics["unique_count"])
            bc["survival_rates"].append(metrics["survival_rate"])

            if metrics.get("pre_dedup_diversity"):
                bc["pre_dedup_diversity"].append(
                    metrics["pre_dedup_diversity"]["mean_pairwise_distance"]
                )

            if metrics.get("post_dedup_diversity"):
                bc["post_dedup_diversity"].append(
                    metrics["post_dedup_diversity"]["mean_pairwise_distance"]
                )

            if metrics.get("normalized_diversity"):
                bc["normalized_diversity"].append(
                    metrics["normalized_diversity"]["mean_pairwise_distance"]
                )

            if metrics.get("post_dedup_query_distance"):
                bc["query_distances"].append(
                    metrics["post_dedup_query_distance"]["mean_distance"]
                )

            if metrics.get("post_dedup_clusters"):
                bc["cluster_counts"].append(
                    metrics["post_dedup_clusters"]["optimal_clusters"]
                )
                bc["silhouette_scores"].append(
                    metrics["post_dedup_clusters"]["silhouette_score"]
                )

            if metrics.get("relevance"):
                bc["relevance_rates"].append(metrics["relevance"]["relevance_rate"])
                bc["nonsense_rates"].append(metrics["relevance"]["nonsense_rate"])

    # Calculate means and stds
    for condition, data in by_condition.items():
        aggregate[condition] = {}

        for metric_name, values in data.items():
            if values:
                aggregate[condition][metric_name] = {
                    "mean": float(np.mean(values)),
                    "std": float(np.std(values)),
                    "min": float(np.min(values)),
                    "max": float(np.max(values)),
                    "n": len(values)
                }

    return aggregate
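
# Illustrative shape of the aggregate dict returned above (condition names come
# from the experiment config; "direct" and the numbers are placeholders):
#
# {
#   "direct": {
#     "unique_counts": {"mean": 12.0, "std": 2.1, "min": 9.0, "max": 15.0, "n": 4},
#     "post_dedup_diversity": {"mean": 0.42, "std": 0.03, "min": 0.38, "max": 0.46, "n": 4},
#     ...
#   }
# }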


def print_metrics_summary(metrics: Dict[str, Any]):
    """Print a formatted summary of computed metrics."""
    print("\n" + "=" * 80)
    print("METRICS SUMMARY")
    print("=" * 80)

    print(f"\nNormalized sample size: {metrics.get('normalized_sample_size', 'N/A')}")

    aggregate = metrics.get("aggregate", {})

    # Idea counts
    print("\n--- Idea Counts ---")
    print(f"{'Condition':<25} {'Raw':<10} {'Unique':<10} {'Survival':<10}")
    print("-" * 55)
    for cond, data in aggregate.items():
        raw = data.get("raw_counts", {}).get("mean", 0)
        unique = data.get("unique_counts", {}).get("mean", 0)
        survival = data.get("survival_rates", {}).get("mean", 0)
        print(f"{cond:<25} {raw:<10.1f} {unique:<10.1f} {survival:<10.1%}")

    # Diversity metrics
    print("\n--- Semantic Diversity (Mean Pairwise Distance) ---")
    print(f"{'Condition':<25} {'Pre-Dedup':<12} {'Post-Dedup':<12} {'Normalized':<12}")
    print("-" * 61)
    for cond, data in aggregate.items():
        pre = data.get("pre_dedup_diversity", {}).get("mean", 0)
        post = data.get("post_dedup_diversity", {}).get("mean", 0)
        norm = data.get("normalized_diversity", {}).get("mean", 0)
        print(f"{cond:<25} {pre:<12.4f} {post:<12.4f} {norm:<12.4f}")

    # Query distance
    print("\n--- Query Distance (Novelty) ---")
    print(f"{'Condition':<25} {'Mean Distance':<15} {'Std':<10}")
    print("-" * 50)
    for cond, data in aggregate.items():
        dist = data.get("query_distances", {})
        mean = dist.get("mean", 0)
        std = dist.get("std", 0)
        print(f"{cond:<25} {mean:<15.4f} {std:<10.4f}")

    # Cluster metrics
    print("\n--- Cluster Analysis ---")
    print(f"{'Condition':<25} {'Clusters':<12} {'Silhouette':<12}")
    print("-" * 49)
    for cond, data in aggregate.items():
        clusters = data.get("cluster_counts", {}).get("mean", 0)
        silhouette = data.get("silhouette_scores", {}).get("mean", 0)
        print(f"{cond:<25} {clusters:<12.1f} {silhouette:<12.4f}")

    # Relevance (if computed)
    has_relevance = any(
        "relevance_rates" in data and data["relevance_rates"].get("n", 0) > 0
        for data in aggregate.values()
    )
    if has_relevance:
        print("\n--- Relevance (LLM-as-Judge) ---")
        print(f"{'Condition':<25} {'Relevance':<12} {'Nonsense':<12}")
        print("-" * 49)
        for cond, data in aggregate.items():
            rel = data.get("relevance_rates", {}).get("mean", 0)
            non = data.get("nonsense_rates", {}).get("mean", 0)
            print(f"{cond:<25} {rel:<12.1%} {non:<12.1%}")

    print("\n" + "=" * 80)
    print("Interpretation:")
    print("- Higher pairwise distance = more diverse ideas")
    print("- Higher query distance = more novel (farther from original)")
    print("- More clusters = more distinct themes")
    print("- Higher silhouette = cleaner cluster separation")
    print("=" * 80)
async def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Compute metrics for experiment results"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--input",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Input deduped experiment results JSON file"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
type=str,
|
||||
help="Output file path (default: input_metrics.json)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--relevance",
|
||||
action="store_true",
|
||||
help="Compute LLM-as-judge relevance metrics (expensive)"
|
||||
)
|
||||
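
    # Example invocations (file names are placeholders); a bare file name is also
    # resolved against RESULTS_DIR below:
    #   python -m experiments.compute_metrics --input results/experiment_xxx_deduped.json
    #   python -m experiments.compute_metrics --input experiment_xxx_deduped.json --relevance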

    args = parser.parse_args()

    input_path = Path(args.input)
    if not input_path.exists():
        input_path = RESULTS_DIR / args.input
        if not input_path.exists():
            print(f"Error: Input file not found: {args.input}")
            sys.exit(1)

    output_path = Path(args.output) if args.output else None

    metrics = await process_experiment_results(
        input_file=input_path,
        output_file=output_path,
        compute_relevance=args.relevance
    )

    print_metrics_summary(metrics)


if __name__ == "__main__":
    asyncio.run(main())