feat: Add experiments framework and novelty-driven agent loop

- Add complete experiments directory with pilot study infrastructure
  - 5 experimental conditions (direct, expert-only, attribute-only, full-pipeline, random-perspective)
  - Human assessment tool with React frontend and FastAPI backend
  - AUT flexibility analysis with jump signal detection
  - Result visualization and metrics computation

- Add novelty-driven agent loop module (experiments/novelty_loop/)
  - NoveltyDrivenTaskAgent with expert perspective perturbation
  - Three termination strategies: breakthrough, exhaust, coverage
  - Interactive CLI demo with colored output
  - Embedding-based novelty scoring (see the illustrative sketch after this list)

- Add DDC knowledge domain classification data (en/zh)
- Add CLAUDE.md project documentation
- Update research report with experiment findings
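
For reference, a minimal sketch of what embedding-based novelty scoring can look like
(illustrative only; the function name and the unit-norm embedding assumption are ours,
not the module's actual API):

    import numpy as np

    def novelty_score(candidate_vec: np.ndarray, history_vecs: list[np.ndarray]) -> float:
        """Novelty = 1 - max cosine similarity to any previously generated idea."""
        if not history_vecs:
            return 1.0  # nothing generated yet: maximally novel
        # Embeddings are assumed unit-norm, so a dot product is the cosine similarity.
        sims = [float(candidate_vec @ h) for h in history_vecs]
        return 1.0 - max(sims)

Read against the termination strategies above: a breakthrough rule might stop once a
candidate's score clears a threshold, while an exhaust rule might stop after a run of
consecutive low-novelty candidates; these readings are inferred from the strategy names,
not from the module's code.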

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Commit: 43c025e060 (parent 26a56a2a07)
Date:   2026-01-20 10:16:21 +08:00
81 changed files with 18766 additions and 2 deletions


@@ -0,0 +1,356 @@
#!/usr/bin/env python3
"""
Analyze assessment ratings for inter-rater reliability and condition comparisons.
This script:
1. Loads ratings from the SQLite database
2. Joins with hidden metadata (condition, expert)
3. Calculates inter-rater reliability metrics
4. Computes mean ratings per dimension per condition
5. Performs statistical comparisons between conditions
"""
import json
import sqlite3
from collections import defaultdict
from datetime import datetime
from pathlib import Path
from typing import Any

import numpy as np
from scipy import stats

# Paths
RESULTS_DIR = Path(__file__).parent / 'results'
DATA_DIR = Path(__file__).parent / 'data'
DB_PATH = RESULTS_DIR / 'ratings.db'
ASSESSMENT_DATA_PATH = DATA_DIR / 'assessment_items.json'

def load_assessment_data() -> dict[str, Any]:
    """Load the assessment items data with hidden metadata."""
    with open(ASSESSMENT_DATA_PATH, 'r', encoding='utf-8') as f:
        return json.load(f)

def load_ratings_from_db() -> list[dict[str, Any]]:
    """Load all ratings from the SQLite database."""
    if not DB_PATH.exists():
        print(f"Database not found at {DB_PATH}")
        return []
    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row
    cursor = conn.cursor()
    cursor.execute('''
        SELECT r.*, rat.name as rater_name
        FROM ratings r
        LEFT JOIN raters rat ON r.rater_id = rat.rater_id
        WHERE r.skipped = 0
    ''')
    ratings = [dict(row) for row in cursor.fetchall()]
    conn.close()
    return ratings
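
# Note: the query above assumes a `ratings` table with one row per (rater, idea) pair,
# per-dimension rating columns (originality, elaboration, coherence, usefulness — see
# `dimensions` below), a `skipped` flag, and a `raters` table keyed by rater_id. This is
# inferred from how the rows are used later in this script, not from a schema file.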

def build_idea_lookup(assessment_data: dict[str, Any]) -> dict[str, dict[str, Any]]:
    """Build a lookup table from idea_id to metadata."""
    lookup = {}
    for query in assessment_data['queries']:
        for idea in query['ideas']:
            lookup[idea['idea_id']] = {
                'text': idea['text'],
                'query_id': query['query_id'],
                'query_text': query['query_text'],
                **idea['_hidden']
            }
    return lookup
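
# For orientation, the shape this lookup expects from assessment_items.json (field values
# below are made up for illustration; only the keys are taken from the code above):
#
# {
#   "experiment_id": "...",
#   "queries": [
#     {
#       "query_id": "q01",
#       "query_text": "...",
#       "ideas": [
#         {"idea_id": "q01-a", "text": "...",
#          "_hidden": {"condition": "full-pipeline", "expert_name": "...", "keyword": "..."}}
#       ]
#     }
#   ]
# }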

def calculate_krippendorff_alpha(ratings_matrix: np.ndarray) -> float:
    """
    Calculate Krippendorff's alpha for ordinal data.

    Args:
        ratings_matrix: 2D array where rows are items and columns are raters.
            NaN values indicate missing ratings.

    Returns:
        Krippendorff's alpha coefficient
    """
    # Remove items with no ratings (items rated only once are skipped in the pairwise loop below)
    valid_items = ~np.all(np.isnan(ratings_matrix), axis=1)
    ratings_matrix = ratings_matrix[valid_items]
    if ratings_matrix.shape[0] < 2:
        return np.nan
    n_items, n_raters = ratings_matrix.shape

    # Observed disagreement
    observed_disagreement = 0
    n_pairs = 0
    for i in range(n_items):
        values = ratings_matrix[i, ~np.isnan(ratings_matrix[i])]
        if len(values) < 2:
            continue
        # Squared-difference distance (treats the ordinal scale as interval)
        for j in range(len(values)):
            for k in range(j + 1, len(values)):
                observed_disagreement += (values[j] - values[k]) ** 2
                n_pairs += 1
    if n_pairs == 0:
        return np.nan
    observed_disagreement /= n_pairs

    # Expected disagreement (based on marginal distribution)
    all_values = ratings_matrix[~np.isnan(ratings_matrix)]
    if len(all_values) < 2:
        return np.nan
    expected_disagreement = 0
    n_total_pairs = 0
    for i in range(len(all_values)):
        for j in range(i + 1, len(all_values)):
            expected_disagreement += (all_values[i] - all_values[j]) ** 2
            n_total_pairs += 1
    if n_total_pairs == 0:
        return np.nan
    expected_disagreement /= n_total_pairs
    if expected_disagreement == 0:
        return 1.0
    alpha = 1 - (observed_disagreement / expected_disagreement)
    return alpha
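
# Toy sanity check (not used by the analysis): two raters in perfect agreement on three
# items give alpha = 1.0, since observed disagreement is zero.
#   calculate_krippendorff_alpha(np.array([[1.0, 1.0], [3.0, 3.0], [5.0, 5.0]]))  # -> 1.0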

def calculate_icc(ratings_matrix: np.ndarray) -> tuple[float, float, float]:
    """
    Calculate Intraclass Correlation Coefficient (ICC(2,1)).

    Args:
        ratings_matrix: 2D array where rows are items and columns are raters.

    Returns:
        Tuple of (ICC, lower_bound, upper_bound)
    """
    # Remove rows with any NaN
    valid_rows = ~np.any(np.isnan(ratings_matrix), axis=1)
    ratings_matrix = ratings_matrix[valid_rows]
    if ratings_matrix.shape[0] < 2 or ratings_matrix.shape[1] < 2:
        return np.nan, np.nan, np.nan
    n, k = ratings_matrix.shape

    # Grand mean
    grand_mean = np.mean(ratings_matrix)
    # Row means (item means)
    row_means = np.mean(ratings_matrix, axis=1)
    # Column means (rater means)
    col_means = np.mean(ratings_matrix, axis=0)

    # Sum of squares
    ss_total = np.sum((ratings_matrix - grand_mean) ** 2)
    ss_rows = k * np.sum((row_means - grand_mean) ** 2)
    ss_cols = n * np.sum((col_means - grand_mean) ** 2)
    ss_error = ss_total - ss_rows - ss_cols

    # Mean squares
    ms_rows = ss_rows / (n - 1) if n > 1 else 0
    ms_cols = ss_cols / (k - 1) if k > 1 else 0
    ms_error = ss_error / ((n - 1) * (k - 1)) if (n > 1 and k > 1) else 0

    # ICC(2,1) - two-way random, absolute agreement, single rater
    denominator = ms_rows + (k - 1) * ms_error + k * (ms_cols - ms_error) / n
    if denominator == 0:
        return np.nan, np.nan, np.nan
    icc = (ms_rows - ms_error) / denominator

    # Confidence interval (approximate), using the F distribution
    df1 = n - 1
    df2 = (n - 1) * (k - 1)
    if ms_error == 0:
        return icc, np.nan, np.nan
    f_value = ms_rows / ms_error
    f_lower = f_value / stats.f.ppf(0.975, df1, df2)
    f_upper = f_value / stats.f.ppf(0.025, df1, df2)
    icc_lower = (f_lower - 1) / (f_lower + k - 1)
    icc_upper = (f_upper - 1) / (f_upper + k - 1)
    return icc, icc_lower, icc_upper
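
# Toy sanity check (not used by the analysis): with two raters in perfect agreement,
# ms_error is 0, so the function returns ICC = 1.0 with a NaN confidence interval
# rather than attempting the F-based bounds.
#   calculate_icc(np.array([[1.0, 1.0], [3.0, 3.0], [5.0, 5.0]]))  # -> (1.0, nan, nan)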

def analyze_ratings():
    """Main analysis function."""
    print("=" * 60)
    print("CREATIVE IDEA ASSESSMENT ANALYSIS")
    print("=" * 60)
    print()

    # Load data
    assessment_data = load_assessment_data()
    ratings = load_ratings_from_db()
    idea_lookup = build_idea_lookup(assessment_data)
    if not ratings:
        print("No ratings found in database.")
        return
    print(f"Loaded {len(ratings)} ratings from database")
    print(f"Experiment ID: {assessment_data['experiment_id']}")
    print()

    # Get unique raters
    raters = list(set(r['rater_id'] for r in ratings))
    print(f"Raters: {raters}")
    print()

    # Join ratings with metadata
    enriched_ratings = []
    for r in ratings:
        idea_meta = idea_lookup.get(r['idea_id'], {})
        enriched_ratings.append({
            **r,
            'condition': idea_meta.get('condition', 'unknown'),
            'expert_name': idea_meta.get('expert_name', ''),
            'keyword': idea_meta.get('keyword', ''),
            'query_text': idea_meta.get('query_text', ''),
            'idea_text': idea_meta.get('text', '')
        })

    # Dimensions
    dimensions = ['originality', 'elaboration', 'coherence', 'usefulness']

    # ================================
    # Inter-rater reliability
    # ================================
    print("-" * 60)
    print("INTER-RATER RELIABILITY")
    print("-" * 60)
    print()
    if len(raters) >= 2:
        # Build ratings matrix per dimension
        idea_ids = list(set(r['idea_id'] for r in enriched_ratings))
        for dim in dimensions:
            # Create matrix: rows = ideas, cols = raters
            matrix = np.full((len(idea_ids), len(raters)), np.nan)
            idea_to_idx = {idea: idx for idx, idea in enumerate(idea_ids)}
            rater_to_idx = {rater: idx for idx, rater in enumerate(raters)}
            for r in enriched_ratings:
                if r[dim] is not None:
                    i = idea_to_idx[r['idea_id']]
                    j = rater_to_idx[r['rater_id']]
                    matrix[i, j] = r[dim]
            # Calculate metrics
            alpha = calculate_krippendorff_alpha(matrix)
            icc, icc_low, icc_high = calculate_icc(matrix)
            print(f"{dim.upper()}:")
            print(f" Krippendorff's alpha: {alpha:.3f}")
            print(f" ICC(2,1): {icc:.3f} (95% CI: {icc_low:.3f} - {icc_high:.3f})")
            print()
    else:
        print("Need at least 2 raters for inter-rater reliability analysis.")
        print()

    # ================================
    # Condition comparisons
    # ================================
    print("-" * 60)
    print("MEAN RATINGS BY CONDITION")
    print("-" * 60)
    print()
    # Group ratings by condition
    condition_ratings: dict[str, dict[str, list[int]]] = defaultdict(lambda: defaultdict(list))
    for r in enriched_ratings:
        condition = r['condition']
        for dim in dimensions:
            if r[dim] is not None:
                condition_ratings[condition][dim].append(r[dim])

    # Calculate means and print
    condition_stats = {}
    for condition in sorted(condition_ratings.keys()):
        print(f"\n{condition}:")
        condition_stats[condition] = {}
        for dim in dimensions:
            values = condition_ratings[condition][dim]
            if values:
                mean = np.mean(values)
                std = np.std(values)
                n = len(values)
                condition_stats[condition][dim] = {'mean': mean, 'std': std, 'n': n}
                print(f" {dim}: {mean:.2f} (SD={std:.2f}, n={n})")
            else:
                print(f" {dim}: no data")

    # ================================
    # Statistical comparisons
    # ================================
    print()
    print("-" * 60)
    print("STATISTICAL COMPARISONS (Kruskal-Wallis)")
    print("-" * 60)
    print()
    conditions = sorted(condition_ratings.keys())
    if len(conditions) >= 2:
        for dim in dimensions:
            groups = [condition_ratings[c][dim] for c in conditions if condition_ratings[c][dim]]
            if len(groups) >= 2:
                h_stat, p_value = stats.kruskal(*groups)
                sig = "*" if p_value < 0.05 else ""
                print(f"{dim}: H={h_stat:.2f}, p={p_value:.4f} {sig}")
            else:
                print(f"{dim}: insufficient data for comparison")
    else:
        print("Need at least 2 conditions with data for statistical comparison.")

    # ================================
    # Export results
    # ================================
    output = {
        'analysis_timestamp': datetime.utcnow().isoformat(),
        'experiment_id': assessment_data['experiment_id'],
        'total_ratings': len(ratings),
        'raters': raters,
        'rater_count': len(raters),
        'condition_stats': condition_stats,
        'enriched_ratings': enriched_ratings
    }
    output_path = RESULTS_DIR / 'analysis_results.json'
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2, default=str)
    print()
    print("-" * 60)
    print(f"Results exported to: {output_path}")
    print("=" * 60)

if __name__ == '__main__':
    analyze_ratings()