feat: Add experiments framework and novelty-driven agent loop

- Add complete experiments directory with pilot study infrastructure
  - 5 experimental conditions (direct, expert-only, attribute-only, full-pipeline, random-perspective)
  - Human assessment tool with React frontend and FastAPI backend
  - AUT flexibility analysis with jump signal detection
  - Result visualization and metrics computation

- Add novelty-driven agent loop module (experiments/novelty_loop/)
  - NoveltyDrivenTaskAgent with expert perspective perturbation
  - Three termination strategies: breakthrough, exhaust, coverage
  - Interactive CLI demo with colored output
  - Embedding-based novelty scoring (see the illustrative sketch after this list)

- Add DDC knowledge domain classification data (en/zh)
- Add CLAUDE.md project documentation
- Update research report with experiment findings
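
For reference, a minimal sketch of what embedding-based novelty scoring can look like
(illustrative only; the function name and the unit-norm embedding assumption are ours,
not the module's actual API):

    import numpy as np

    def novelty_score(candidate_vec: np.ndarray, history_vecs: list[np.ndarray]) -> float:
        """Novelty = 1 - max cosine similarity to any previously generated idea."""
        if not history_vecs:
            return 1.0  # nothing generated yet: maximally novel
        # Embeddings are assumed unit-norm, so a dot product is the cosine similarity.
        sims = [float(candidate_vec @ h) for h in history_vecs]
        return 1.0 - max(sims)

Read against the termination strategies above: a breakthrough rule might stop once a
candidate's score clears a threshold, while an exhaust rule might stop after a run of
consecutive low-novelty candidates; these readings are inferred from the strategy names,
not from the module's code.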

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Commit: 43c025e060 (parent 26a56a2a07)
Date:   2026-01-20 10:16:21 +08:00
81 changed files with 18766 additions and 2 deletions


@@ -0,0 +1,356 @@
#!/usr/bin/env python3
"""
Analyze assessment ratings for inter-rater reliability and condition comparisons.
This script:
1. Loads ratings from the SQLite database
2. Joins with hidden metadata (condition, expert)
3. Calculates inter-rater reliability metrics
4. Computes mean ratings per dimension per condition
5. Performs statistical comparisons between conditions
"""
import json
import sqlite3
from collections import defaultdict
from datetime import datetime
from pathlib import Path
from typing import Any

import numpy as np
from scipy import stats

# Paths
RESULTS_DIR = Path(__file__).parent / 'results'
DATA_DIR = Path(__file__).parent / 'data'
DB_PATH = RESULTS_DIR / 'ratings.db'
ASSESSMENT_DATA_PATH = DATA_DIR / 'assessment_items.json'

def load_assessment_data() -> dict[str, Any]:
    """Load the assessment items data with hidden metadata."""
    with open(ASSESSMENT_DATA_PATH, 'r', encoding='utf-8') as f:
        return json.load(f)

def load_ratings_from_db() -> list[dict[str, Any]]:
    """Load all ratings from the SQLite database."""
    if not DB_PATH.exists():
        print(f"Database not found at {DB_PATH}")
        return []
    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row
    cursor = conn.cursor()
    cursor.execute('''
        SELECT r.*, rat.name as rater_name
        FROM ratings r
        LEFT JOIN raters rat ON r.rater_id = rat.rater_id
        WHERE r.skipped = 0
    ''')
    ratings = [dict(row) for row in cursor.fetchall()]
    conn.close()
    return ratings
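
# Note: the query above assumes a `ratings` table with one row per (rater, idea) pair,
# per-dimension rating columns (originality, elaboration, coherence, usefulness — see
# `dimensions` below), a `skipped` flag, and a `raters` table keyed by rater_id. This is
# inferred from how the rows are used later in this script, not from a schema file.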

def build_idea_lookup(assessment_data: dict[str, Any]) -> dict[str, dict[str, Any]]:
    """Build a lookup table from idea_id to metadata."""
    lookup = {}
    for query in assessment_data['queries']:
        for idea in query['ideas']:
            lookup[idea['idea_id']] = {
                'text': idea['text'],
                'query_id': query['query_id'],
                'query_text': query['query_text'],
                **idea['_hidden']
            }
    return lookup
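
# For orientation, the shape this lookup expects from assessment_items.json (field values
# below are made up for illustration; only the keys are taken from the code above):
#
# {
#   "experiment_id": "...",
#   "queries": [
#     {
#       "query_id": "q01",
#       "query_text": "...",
#       "ideas": [
#         {"idea_id": "q01-a", "text": "...",
#          "_hidden": {"condition": "full-pipeline", "expert_name": "...", "keyword": "..."}}
#       ]
#     }
#   ]
# }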

def calculate_krippendorff_alpha(ratings_matrix: np.ndarray) -> float:
    """
    Calculate Krippendorff's alpha for ordinal data.

    Args:
        ratings_matrix: 2D array where rows are items and columns are raters.
            NaN values indicate missing ratings.

    Returns:
        Krippendorff's alpha coefficient
    """
    # Remove items with no ratings (items rated only once are skipped in the pairwise loop below)
    valid_items = ~np.all(np.isnan(ratings_matrix), axis=1)
    ratings_matrix = ratings_matrix[valid_items]
    if ratings_matrix.shape[0] < 2:
        return np.nan
    n_items, n_raters = ratings_matrix.shape

    # Observed disagreement
    observed_disagreement = 0
    n_pairs = 0
    for i in range(n_items):
        values = ratings_matrix[i, ~np.isnan(ratings_matrix[i])]
        if len(values) < 2:
            continue
        # Squared-difference distance (treats the ordinal scale as interval)
        for j in range(len(values)):
            for k in range(j + 1, len(values)):
                observed_disagreement += (values[j] - values[k]) ** 2
                n_pairs += 1
    if n_pairs == 0:
        return np.nan
    observed_disagreement /= n_pairs

    # Expected disagreement (based on marginal distribution)
    all_values = ratings_matrix[~np.isnan(ratings_matrix)]
    if len(all_values) < 2:
        return np.nan
    expected_disagreement = 0
    n_total_pairs = 0
    for i in range(len(all_values)):
        for j in range(i + 1, len(all_values)):
            expected_disagreement += (all_values[i] - all_values[j]) ** 2
            n_total_pairs += 1
    if n_total_pairs == 0:
        return np.nan
    expected_disagreement /= n_total_pairs
    if expected_disagreement == 0:
        return 1.0
    alpha = 1 - (observed_disagreement / expected_disagreement)
    return alpha
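
# Toy sanity check (not used by the analysis): two raters in perfect agreement on three
# items give alpha = 1.0, since observed disagreement is zero.
#   calculate_krippendorff_alpha(np.array([[1.0, 1.0], [3.0, 3.0], [5.0, 5.0]]))  # -> 1.0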

def calculate_icc(ratings_matrix: np.ndarray) -> tuple[float, float, float]:
    """
    Calculate Intraclass Correlation Coefficient (ICC(2,1)).

    Args:
        ratings_matrix: 2D array where rows are items and columns are raters.

    Returns:
        Tuple of (ICC, lower_bound, upper_bound)
    """
    # Remove rows with any NaN
    valid_rows = ~np.any(np.isnan(ratings_matrix), axis=1)
    ratings_matrix = ratings_matrix[valid_rows]
    if ratings_matrix.shape[0] < 2 or ratings_matrix.shape[1] < 2:
        return np.nan, np.nan, np.nan
    n, k = ratings_matrix.shape

    # Grand mean
    grand_mean = np.mean(ratings_matrix)
    # Row means (item means)
    row_means = np.mean(ratings_matrix, axis=1)
    # Column means (rater means)
    col_means = np.mean(ratings_matrix, axis=0)

    # Sum of squares
    ss_total = np.sum((ratings_matrix - grand_mean) ** 2)
    ss_rows = k * np.sum((row_means - grand_mean) ** 2)
    ss_cols = n * np.sum((col_means - grand_mean) ** 2)
    ss_error = ss_total - ss_rows - ss_cols

    # Mean squares
    ms_rows = ss_rows / (n - 1) if n > 1 else 0
    ms_cols = ss_cols / (k - 1) if k > 1 else 0
    ms_error = ss_error / ((n - 1) * (k - 1)) if (n > 1 and k > 1) else 0

    # ICC(2,1) - two-way random, absolute agreement, single rater
    denominator = ms_rows + (k - 1) * ms_error + k * (ms_cols - ms_error) / n
    if denominator == 0:
        return np.nan, np.nan, np.nan
    icc = (ms_rows - ms_error) / denominator

    # Confidence interval (approximate), using the F distribution
    df1 = n - 1
    df2 = (n - 1) * (k - 1)
    if ms_error == 0:
        return icc, np.nan, np.nan
    f_value = ms_rows / ms_error
    f_lower = f_value / stats.f.ppf(0.975, df1, df2)
    f_upper = f_value / stats.f.ppf(0.025, df1, df2)
    icc_lower = (f_lower - 1) / (f_lower + k - 1)
    icc_upper = (f_upper - 1) / (f_upper + k - 1)
    return icc, icc_lower, icc_upper
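
# Toy sanity check (not used by the analysis): with two raters in perfect agreement,
# ms_error is 0, so the function returns ICC = 1.0 with a NaN confidence interval
# rather than attempting the F-based bounds.
#   calculate_icc(np.array([[1.0, 1.0], [3.0, 3.0], [5.0, 5.0]]))  # -> (1.0, nan, nan)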

def analyze_ratings():
    """Main analysis function."""
    print("=" * 60)
    print("CREATIVE IDEA ASSESSMENT ANALYSIS")
    print("=" * 60)
    print()

    # Load data
    assessment_data = load_assessment_data()
    ratings = load_ratings_from_db()
    idea_lookup = build_idea_lookup(assessment_data)
    if not ratings:
        print("No ratings found in database.")
        return
    print(f"Loaded {len(ratings)} ratings from database")
    print(f"Experiment ID: {assessment_data['experiment_id']}")
    print()

    # Get unique raters
    raters = list(set(r['rater_id'] for r in ratings))
    print(f"Raters: {raters}")
    print()

    # Join ratings with metadata
    enriched_ratings = []
    for r in ratings:
        idea_meta = idea_lookup.get(r['idea_id'], {})
        enriched_ratings.append({
            **r,
            'condition': idea_meta.get('condition', 'unknown'),
            'expert_name': idea_meta.get('expert_name', ''),
            'keyword': idea_meta.get('keyword', ''),
            'query_text': idea_meta.get('query_text', ''),
            'idea_text': idea_meta.get('text', '')
        })

    # Dimensions
    dimensions = ['originality', 'elaboration', 'coherence', 'usefulness']

    # ================================
    # Inter-rater reliability
    # ================================
    print("-" * 60)
    print("INTER-RATER RELIABILITY")
    print("-" * 60)
    print()
    if len(raters) >= 2:
        # Build ratings matrix per dimension
        idea_ids = list(set(r['idea_id'] for r in enriched_ratings))
        for dim in dimensions:
            # Create matrix: rows = ideas, cols = raters
            matrix = np.full((len(idea_ids), len(raters)), np.nan)
            idea_to_idx = {idea: idx for idx, idea in enumerate(idea_ids)}
            rater_to_idx = {rater: idx for idx, rater in enumerate(raters)}
            for r in enriched_ratings:
                if r[dim] is not None:
                    i = idea_to_idx[r['idea_id']]
                    j = rater_to_idx[r['rater_id']]
                    matrix[i, j] = r[dim]
            # Calculate metrics
            alpha = calculate_krippendorff_alpha(matrix)
            icc, icc_low, icc_high = calculate_icc(matrix)
            print(f"{dim.upper()}:")
            print(f" Krippendorff's alpha: {alpha:.3f}")
            print(f" ICC(2,1): {icc:.3f} (95% CI: {icc_low:.3f} - {icc_high:.3f})")
            print()
    else:
        print("Need at least 2 raters for inter-rater reliability analysis.")
        print()

    # ================================
    # Condition comparisons
    # ================================
    print("-" * 60)
    print("MEAN RATINGS BY CONDITION")
    print("-" * 60)
    print()
    # Group ratings by condition
    condition_ratings: dict[str, dict[str, list[int]]] = defaultdict(lambda: defaultdict(list))
    for r in enriched_ratings:
        condition = r['condition']
        for dim in dimensions:
            if r[dim] is not None:
                condition_ratings[condition][dim].append(r[dim])

    # Calculate means and print
    condition_stats = {}
    for condition in sorted(condition_ratings.keys()):
        print(f"\n{condition}:")
        condition_stats[condition] = {}
        for dim in dimensions:
            values = condition_ratings[condition][dim]
            if values:
                mean = np.mean(values)
                std = np.std(values)
                n = len(values)
                condition_stats[condition][dim] = {'mean': mean, 'std': std, 'n': n}
                print(f" {dim}: {mean:.2f} (SD={std:.2f}, n={n})")
            else:
                print(f" {dim}: no data")

    # ================================
    # Statistical comparisons
    # ================================
    print()
    print("-" * 60)
    print("STATISTICAL COMPARISONS (Kruskal-Wallis)")
    print("-" * 60)
    print()
    conditions = sorted(condition_ratings.keys())
    if len(conditions) >= 2:
        for dim in dimensions:
            groups = [condition_ratings[c][dim] for c in conditions if condition_ratings[c][dim]]
            if len(groups) >= 2:
                h_stat, p_value = stats.kruskal(*groups)
                sig = "*" if p_value < 0.05 else ""
                print(f"{dim}: H={h_stat:.2f}, p={p_value:.4f} {sig}")
            else:
                print(f"{dim}: insufficient data for comparison")
    else:
        print("Need at least 2 conditions with data for statistical comparison.")

    # ================================
    # Export results
    # ================================
    output = {
        'analysis_timestamp': datetime.utcnow().isoformat(),
        'experiment_id': assessment_data['experiment_id'],
        'total_ratings': len(ratings),
        'raters': raters,
        'rater_count': len(raters),
        'condition_stats': condition_stats,
        'enriched_ratings': enriched_ratings
    }
    output_path = RESULTS_DIR / 'analysis_results.json'
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2, default=str)
    print()
    print("-" * 60)
    print(f"Results exported to: {output_path}")
    print("=" * 60)

if __name__ == '__main__':
    analyze_ratings()