feat: Add experiments framework and novelty-driven agent loop
- Add complete experiments directory with pilot study infrastructure
  - 5 experimental conditions (direct, expert-only, attribute-only, full-pipeline, random-perspective)
  - Human assessment tool with React frontend and FastAPI backend
  - AUT flexibility analysis with jump signal detection
  - Result visualization and metrics computation
- Add novelty-driven agent loop module (experiments/novelty_loop/)
  - NoveltyDrivenTaskAgent with expert perspective perturbation
  - Three termination strategies: breakthrough, exhaust, coverage
  - Interactive CLI demo with colored output
  - Embedding-based novelty scoring
- Add DDC knowledge domain classification data (en/zh)
- Add CLAUDE.md project documentation
- Update research report with experiment findings

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
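
Note: the novelty_loop code itself is not part of this diff view. As a rough, illustrative sketch only (all names below are hypothetical, not the module's actual interface), embedding-based novelty scoring of the kind listed above usually reduces to a distance-to-history measure over idea embeddings:

    import numpy as np

    def novelty_score(candidate: np.ndarray, history: list[np.ndarray]) -> float:
        # Novelty as 1 minus the highest cosine similarity to any earlier idea;
        # an empty history is maximally novel.
        if not history:
            return 1.0
        cand = candidate / np.linalg.norm(candidate)
        sims = [float(cand @ (h / np.linalg.norm(h))) for h in history]
        return 1.0 - max(sims)

A "breakthrough"-style termination would then, presumably, stop once a score clears a threshold, and "exhaust" after several consecutive low-novelty ideas; the actual strategies live in experiments/novelty_loop/.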
experiments/assessment/analyze_ratings.py (new executable file, 356 lines added)
@@ -0,0 +1,356 @@
#!/usr/bin/env python3
"""
Analyze assessment ratings for inter-rater reliability and condition comparisons.

This script:
1. Loads ratings from the SQLite database
2. Joins with hidden metadata (condition, expert)
3. Calculates inter-rater reliability metrics
4. Computes mean ratings per dimension per condition
5. Performs statistical comparisons between conditions
"""

import json
import sqlite3
from collections import defaultdict
from datetime import datetime
from pathlib import Path
from typing import Any

import numpy as np
from scipy import stats


# Paths
RESULTS_DIR = Path(__file__).parent / 'results'
DATA_DIR = Path(__file__).parent / 'data'
DB_PATH = RESULTS_DIR / 'ratings.db'
ASSESSMENT_DATA_PATH = DATA_DIR / 'assessment_items.json'


def load_assessment_data() -> dict[str, Any]:
    """Load the assessment items data with hidden metadata."""
    with open(ASSESSMENT_DATA_PATH, 'r', encoding='utf-8') as f:
        return json.load(f)


def load_ratings_from_db() -> list[dict[str, Any]]:
    """Load all ratings from the SQLite database."""
    if not DB_PATH.exists():
        print(f"Database not found at {DB_PATH}")
        return []

    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row
    cursor = conn.cursor()

    cursor.execute('''
        SELECT r.*, rat.name as rater_name
        FROM ratings r
        LEFT JOIN raters rat ON r.rater_id = rat.rater_id
        WHERE r.skipped = 0
    ''')

    ratings = [dict(row) for row in cursor.fetchall()]
    conn.close()

    return ratings


def build_idea_lookup(assessment_data: dict[str, Any]) -> dict[str, dict[str, Any]]:
    """Build a lookup table from idea_id to metadata."""
    lookup = {}
    for query in assessment_data['queries']:
        for idea in query['ideas']:
            lookup[idea['idea_id']] = {
                'text': idea['text'],
                'query_id': query['query_id'],
                'query_text': query['query_text'],
                **idea['_hidden']
            }
    return lookup
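
# For reference, the shape build_idea_lookup() assumes for assessment_items.json,
# inferred from the keys accessed above and in analyze_ratings() (illustrative,
# not a full schema):
#
#     {
#       "experiment_id": "...",
#       "queries": [
#         {
#           "query_id": "...",
#           "query_text": "...",
#           "ideas": [
#             {"idea_id": "...", "text": "...",
#              "_hidden": {"condition": "...", "expert_name": "...", "keyword": "..."}}
#           ]
#         }
#       ]
#     }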


def calculate_krippendorff_alpha(ratings_matrix: np.ndarray) -> float:
    """
    Calculate Krippendorff's alpha for ordinal data.

    Args:
        ratings_matrix: 2D array where rows are items and columns are raters.
            NaN values indicate missing ratings.

    Returns:
        Krippendorff's alpha coefficient
    """
    # Drop items with no ratings at all; items with only one rating are skipped below
    valid_items = ~np.all(np.isnan(ratings_matrix), axis=1)
    ratings_matrix = ratings_matrix[valid_items]

    if ratings_matrix.shape[0] < 2:
        return np.nan

    n_items, n_raters = ratings_matrix.shape

    # Observed disagreement
    observed_disagreement = 0
    n_pairs = 0

    for i in range(n_items):
        values = ratings_matrix[i, ~np.isnan(ratings_matrix[i])]
        if len(values) < 2:
            continue
        # Distance between ratings: squared difference (an interval-style metric)
        for j in range(len(values)):
            for k in range(j + 1, len(values)):
                observed_disagreement += (values[j] - values[k]) ** 2
                n_pairs += 1

    if n_pairs == 0:
        return np.nan

    observed_disagreement /= n_pairs

    # Expected disagreement (based on marginal distribution)
    all_values = ratings_matrix[~np.isnan(ratings_matrix)]
    if len(all_values) < 2:
        return np.nan

    expected_disagreement = 0
    n_total_pairs = 0
    for i in range(len(all_values)):
        for j in range(i + 1, len(all_values)):
            expected_disagreement += (all_values[i] - all_values[j]) ** 2
            n_total_pairs += 1

    if n_total_pairs == 0:
        return np.nan

    expected_disagreement /= n_total_pairs

    if expected_disagreement == 0:
        return 1.0

    alpha = 1 - (observed_disagreement / expected_disagreement)
    return alpha
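
# Illustrative sanity check (not executed by the pipeline): two raters who agree
# perfectly on two items that differ from each other give zero observed
# disagreement and positive expected disagreement, so alpha comes out as 1.0:
#
#     >>> calculate_krippendorff_alpha(np.array([[1.0, 1.0], [5.0, 5.0]]))
#     1.0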


def calculate_icc(ratings_matrix: np.ndarray) -> tuple[float, float, float]:
    """
    Calculate Intraclass Correlation Coefficient (ICC(2,1)).

    Args:
        ratings_matrix: 2D array where rows are items and columns are raters.

    Returns:
        Tuple of (ICC, lower_bound, upper_bound)
    """
    # Remove rows with any NaN
    valid_rows = ~np.any(np.isnan(ratings_matrix), axis=1)
    ratings_matrix = ratings_matrix[valid_rows]

    if ratings_matrix.shape[0] < 2 or ratings_matrix.shape[1] < 2:
        return np.nan, np.nan, np.nan

    n, k = ratings_matrix.shape

    # Grand mean
    grand_mean = np.mean(ratings_matrix)

    # Row means (item means)
    row_means = np.mean(ratings_matrix, axis=1)

    # Column means (rater means)
    col_means = np.mean(ratings_matrix, axis=0)

    # Sum of squares
    ss_total = np.sum((ratings_matrix - grand_mean) ** 2)
    ss_rows = k * np.sum((row_means - grand_mean) ** 2)
    ss_cols = n * np.sum((col_means - grand_mean) ** 2)
    ss_error = ss_total - ss_rows - ss_cols

    # Mean squares
    ms_rows = ss_rows / (n - 1) if n > 1 else 0
    ms_cols = ss_cols / (k - 1) if k > 1 else 0
    ms_error = ss_error / ((n - 1) * (k - 1)) if (n > 1 and k > 1) else 0

    # ICC(2,1) - two-way random, absolute agreement, single rater
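    # (Shrout & Fleiss, 1979: ICC(2,1) = (MS_rows - MS_err) /
    #  (MS_rows + (k - 1) * MS_err + k * (MS_cols - MS_err) / n))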
    if ms_error + (ms_cols - ms_error) / n == 0:
        return np.nan, np.nan, np.nan

    icc = (ms_rows - ms_error) / (ms_rows + (k - 1) * ms_error + k * (ms_cols - ms_error) / n)

    # Confidence interval (approximate)
    # Using F distribution
    df1 = n - 1
    df2 = (n - 1) * (k - 1)

    if ms_error == 0:
        return icc, np.nan, np.nan

    f_value = ms_rows / ms_error
    f_lower = f_value / stats.f.ppf(0.975, df1, df2)
    f_upper = f_value / stats.f.ppf(0.025, df1, df2)

    icc_lower = (f_lower - 1) / (f_lower + k - 1)
    icc_upper = (f_upper - 1) / (f_upper + k - 1)

    return icc, icc_lower, icc_upper


def analyze_ratings():
    """Main analysis function."""
    print("=" * 60)
    print("CREATIVE IDEA ASSESSMENT ANALYSIS")
    print("=" * 60)
    print()

    # Load data
    assessment_data = load_assessment_data()
    ratings = load_ratings_from_db()
    idea_lookup = build_idea_lookup(assessment_data)

    if not ratings:
        print("No ratings found in database.")
        return

    print(f"Loaded {len(ratings)} ratings from database")
    print(f"Experiment ID: {assessment_data['experiment_id']}")
    print()

    # Get unique raters
    raters = list(set(r['rater_id'] for r in ratings))
    print(f"Raters: {raters}")
    print()

    # Join ratings with metadata
    enriched_ratings = []
    for r in ratings:
        idea_meta = idea_lookup.get(r['idea_id'], {})
        enriched_ratings.append({
            **r,
            'condition': idea_meta.get('condition', 'unknown'),
            'expert_name': idea_meta.get('expert_name', ''),
            'keyword': idea_meta.get('keyword', ''),
            'query_text': idea_meta.get('query_text', ''),
            'idea_text': idea_meta.get('text', '')
        })

    # Dimensions
    dimensions = ['originality', 'elaboration', 'coherence', 'usefulness']

    # ================================
    # Inter-rater reliability
    # ================================
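    # Rough interpretation guides (heuristics from the literature, not hard rules):
    # Krippendorff suggests alpha >= 0.8 for reliable data and >= 0.667 for
    # tentative conclusions; for ICC, values below 0.5 are commonly read as poor,
    # 0.5-0.75 moderate, 0.75-0.9 good, and above 0.9 excellent (Koo & Li, 2016).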
    print("-" * 60)
    print("INTER-RATER RELIABILITY")
    print("-" * 60)
    print()

    if len(raters) >= 2:
        # Build ratings matrix per dimension
        idea_ids = list(set(r['idea_id'] for r in enriched_ratings))

        for dim in dimensions:
            # Create matrix: rows = ideas, cols = raters
            matrix = np.full((len(idea_ids), len(raters)), np.nan)
            idea_to_idx = {idea: idx for idx, idea in enumerate(idea_ids)}
            rater_to_idx = {rater: idx for idx, rater in enumerate(raters)}

            for r in enriched_ratings:
                if r[dim] is not None:
                    i = idea_to_idx[r['idea_id']]
                    j = rater_to_idx[r['rater_id']]
                    matrix[i, j] = r[dim]

            # Calculate metrics
            alpha = calculate_krippendorff_alpha(matrix)
            icc, icc_low, icc_high = calculate_icc(matrix)

            print(f"{dim.upper()}:")
            print(f"  Krippendorff's alpha: {alpha:.3f}")
            print(f"  ICC(2,1): {icc:.3f} (95% CI: {icc_low:.3f} - {icc_high:.3f})")
            print()
    else:
        print("Need at least 2 raters for inter-rater reliability analysis.")
        print()

    # ================================
    # Condition comparisons
    # ================================
    print("-" * 60)
    print("MEAN RATINGS BY CONDITION")
    print("-" * 60)
    print()

    # Group ratings by condition
    condition_ratings: dict[str, dict[str, list[int]]] = defaultdict(lambda: defaultdict(list))

    for r in enriched_ratings:
        condition = r['condition']
        for dim in dimensions:
            if r[dim] is not None:
                condition_ratings[condition][dim].append(r[dim])

    # Calculate means and print
    condition_stats = {}
    for condition in sorted(condition_ratings.keys()):
        print(f"\n{condition}:")
        condition_stats[condition] = {}
        for dim in dimensions:
            values = condition_ratings[condition][dim]
            if values:
                mean = np.mean(values)
                std = np.std(values)
                n = len(values)
                condition_stats[condition][dim] = {'mean': mean, 'std': std, 'n': n}
                print(f"  {dim}: {mean:.2f} (SD={std:.2f}, n={n})")
            else:
                print(f"  {dim}: no data")

    # ================================
    # Statistical comparisons
    # ================================
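    # Kruskal-Wallis is an omnibus test across all conditions: a significant H only
    # says that at least one condition differs. Pairwise follow-ups (e.g. Dunn's test
    # or Mann-Whitney U with a multiple-comparison correction) would be needed to
    # localize the difference; they are not computed here.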
    print()
    print("-" * 60)
    print("STATISTICAL COMPARISONS (Kruskal-Wallis)")
    print("-" * 60)
    print()

    conditions = sorted(condition_ratings.keys())
    if len(conditions) >= 2:
        for dim in dimensions:
            groups = [condition_ratings[c][dim] for c in conditions if condition_ratings[c][dim]]
            if len(groups) >= 2:
                h_stat, p_value = stats.kruskal(*groups)
                sig = "*" if p_value < 0.05 else ""
                print(f"{dim}: H={h_stat:.2f}, p={p_value:.4f} {sig}")
            else:
                print(f"{dim}: insufficient data for comparison")
    else:
        print("Need at least 2 conditions with data for statistical comparison.")

    # ================================
    # Export results
    # ================================
    output = {
        'analysis_timestamp': datetime.utcnow().isoformat(),
        'experiment_id': assessment_data['experiment_id'],
        'total_ratings': len(ratings),
        'raters': raters,
        'rater_count': len(raters),
        'condition_stats': condition_stats,
        'enriched_ratings': enriched_ratings
    }

    output_path = RESULTS_DIR / 'analysis_results.json'
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2, default=str)

    print()
    print("-" * 60)
    print(f"Results exported to: {output_path}")
    print("=" * 60)


if __name__ == '__main__':
    analyze_ratings()