feat: Add experiments framework and novelty-driven agent loop
- Add complete experiments directory with pilot study infrastructure
  - 5 experimental conditions (direct, expert-only, attribute-only, full-pipeline, random-perspective)
  - Human assessment tool with React frontend and FastAPI backend
  - AUT flexibility analysis with jump signal detection
  - Result visualization and metrics computation
- Add novelty-driven agent loop module (experiments/novelty_loop/)
  - NoveltyDrivenTaskAgent with expert perspective perturbation
  - Three termination strategies: breakthrough, exhaust, coverage
  - Interactive CLI demo with colored output
  - Embedding-based novelty scoring
- Add DDC knowledge domain classification data (en/zh)
- Add CLAUDE.md project documentation
- Update research report with experiment findings

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
experiments/analyze_results.py (new file, 546 lines)
@@ -0,0 +1,546 @@
"""
Statistical analysis for experiment results.

Performs:
- 2×2 ANOVA for main effects (attributes, experts) and interaction
- Pairwise post-hoc comparisons (independent-samples t-tests)
- Effect sizes (Cohen's d)
- Control comparison (C2 vs C5)

Usage:
    python -m experiments.analyze_results --input results/experiment_xxx_metrics.json
"""

import sys
import json
import argparse
from pathlib import Path
from typing import List, Dict, Any, Optional
from dataclasses import dataclass

import numpy as np


class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that handles numpy types."""
    def default(self, obj):
        if isinstance(obj, (np.integer, np.int64, np.int32)):
            return int(obj)
        if isinstance(obj, (np.floating, np.float64, np.float32)):
            return float(obj)
        if isinstance(obj, (np.bool_, bool)):
            return bool(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)
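
# Usage note: pass this encoder via json.dump(..., cls=NumpyEncoder) so that the
# numpy scalars/arrays produced by the analysis serialize cleanly (see main()).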


# Add experiments to path
sys.path.insert(0, str(Path(__file__).parent.parent))

from experiments.config import RESULTS_DIR

# Try to import statistical libraries
try:
    from scipy import stats
    SCIPY_AVAILABLE = True
except ImportError:
    SCIPY_AVAILABLE = False
    print("Warning: scipy not installed. Some statistical tests will be unavailable.")

try:
    import pandas as pd
    PANDAS_AVAILABLE = True
except ImportError:
    PANDAS_AVAILABLE = False


@dataclass
class EffectSize:
    """Cohen's d effect size with interpretation."""
    d: float
    interpretation: str  # negligible, small, medium, or large

    @staticmethod
    def from_groups(group1: List[float], group2: List[float]) -> 'EffectSize':
        """Calculate Cohen's d from two groups."""
        n1, n2 = len(group1), len(group2)
        if n1 < 2 or n2 < 2:
            return EffectSize(d=0.0, interpretation="insufficient data")

        mean1, mean2 = np.mean(group1), np.mean(group2)
        var1, var2 = np.var(group1, ddof=1), np.var(group2, ddof=1)

        # Pooled standard deviation
        pooled_std = np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))

        if pooled_std == 0:
            return EffectSize(d=0.0, interpretation="no variance")

        d = (mean1 - mean2) / pooled_std

        # Interpretation (Cohen's conventions)
        abs_d = abs(d)
        if abs_d < 0.2:
            interpretation = "negligible"
        elif abs_d < 0.5:
            interpretation = "small"
        elif abs_d < 0.8:
            interpretation = "medium"
        else:
            interpretation = "large"

        return EffectSize(d=round(d, 4), interpretation=interpretation)
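
# Worked example (hypothetical numbers): EffectSize.from_groups([1.0, 2.0, 3.0],
# [2.0, 3.0, 4.0]) gives pooled SD = 1.0 and d = (2.0 - 3.0) / 1.0 = -1.0,
# which Cohen's conventions label a "large" effect.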


@dataclass
class TTestResult:
    """Independent samples t-test result."""
    t_statistic: float
    p_value: float
    effect_size: EffectSize
    significant: bool  # p < 0.05
    group1_mean: float
    group2_mean: float
    group1_std: float
    group2_std: float
    group1_n: int
    group2_n: int


@dataclass
class ANOVAResult:
    """2×2 ANOVA result."""
    main_effect_attributes: Dict[str, float]  # F, p
    main_effect_experts: Dict[str, float]  # F, p
    interaction: Dict[str, Any]  # F, p, interaction_strength, super_additive
    significant_effects: List[str]


def extract_metric_values(
    metrics: Dict[str, Any],
    metric_path: str
) -> Dict[str, List[float]]:
    """
    Extract values for a specific metric across all queries.

    Args:
        metrics: Full metrics dict from compute_metrics.py
        metric_path: Dot-separated path like "post_dedup_diversity.mean_pairwise_distance"

    Returns:
        Dict mapping condition name to list of values
    """
    by_condition: Dict[str, List[float]] = {}

    for query_metrics in metrics.get("metrics_by_query", []):
        for condition, cond_metrics in query_metrics.get("conditions", {}).items():
            if condition not in by_condition:
                by_condition[condition] = []

            # Navigate the metric path
            value = cond_metrics
            for key in metric_path.split("."):
                if value is None:
                    break
                if isinstance(value, dict):
                    value = value.get(key)
                else:
                    value = None

            if value is not None and isinstance(value, (int, float)):
                by_condition[condition].append(float(value))

    return by_condition
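
# Illustration: with metric_path = "post_dedup_diversity.mean_pairwise_distance",
# the loop above reads cond_metrics["post_dedup_diversity"]["mean_pairwise_distance"]
# for every (query, condition) pair and groups the values by condition name.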


def perform_ttest(
    group1: List[float],
    group2: List[float],
    group1_name: str = "Group 1",
    group2_name: str = "Group 2"
) -> Optional[TTestResult]:
    """Perform independent samples t-test (returns None if scipy or data is missing)."""
    if not SCIPY_AVAILABLE:
        return None

    if len(group1) < 2 or len(group2) < 2:
        return None

    t_stat, p_value = stats.ttest_ind(group1, group2)
    effect = EffectSize.from_groups(group1, group2)

    return TTestResult(
        t_statistic=round(t_stat, 4),
        p_value=round(p_value, 4),
        effect_size=effect,
        significant=p_value < 0.05,
        group1_mean=round(np.mean(group1), 4),
        group2_mean=round(np.mean(group2), 4),
        group1_std=round(np.std(group1, ddof=1), 4),
        group2_std=round(np.std(group2, ddof=1), 4),
        group1_n=len(group1),
        group2_n=len(group2)
    )
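
# Note: stats.ttest_ind runs Student's t-test (equal variances assumed) by
# default; if condition variances differ noticeably, equal_var=False gives
# Welch's t-test, which is often the more robust choice.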


def perform_2x2_anova(
    c1_direct: List[float],     # No attributes, No experts
    c2_expert: List[float],     # No attributes, With experts
    c3_attribute: List[float],  # With attributes, No experts
    c4_full: List[float]        # With attributes, With experts
) -> Optional[ANOVAResult]:
    """
    Perform 2×2 factorial ANOVA.

    Factors:
    - Attributes: Without (C1, C2) vs With (C3, C4)
    - Experts: Without (C1, C3) vs With (C2, C4)
    """
    if not SCIPY_AVAILABLE:
        return None

    # Check minimum data
    min_n = min(len(c1_direct), len(c2_expert), len(c3_attribute), len(c4_full))
    if min_n < 2:
        return None

    # A proper 2×2 ANOVA would use statsmodels or similar (see the sketch below);
    # here we approximate the main effects and interaction manually.

    # Main effect of Attributes: (C3 + C4) vs (C1 + C2)
    no_attr = c1_direct + c2_expert
    with_attr = c3_attribute + c4_full
    f_attr, p_attr = stats.f_oneway(no_attr, with_attr)

    # Main effect of Experts: (C2 + C4) vs (C1 + C3)
    no_expert = c1_direct + c3_attribute
    with_expert = c2_expert + c4_full
    f_expert, p_expert = stats.f_oneway(no_expert, with_expert)

    # Interaction: the difference of differences, (C4 - C3) - (C2 - C1).
    # Simplified approach: compare all 4 groups with a one-way F-test
    f_all, p_all = stats.f_oneway(c1_direct, c2_expert, c3_attribute, c4_full)

    # Estimate interaction by checking if the combination is super-additive
    mean1, mean2, mean3, mean4 = np.mean(c1_direct), np.mean(c2_expert), np.mean(c3_attribute), np.mean(c4_full)
    # Additive prediction: baseline plus each main effect, i.e. mean2 + mean3 - mean1
    expected_additive = mean1 + (mean2 - mean1) + (mean3 - mean1)
    actual_combination = mean4
    interaction_strength = actual_combination - expected_additive

    significant_effects = []
    if p_attr < 0.05:
        significant_effects.append("Attributes")
    if p_expert < 0.05:
        significant_effects.append("Experts")
    if p_all < 0.05 and abs(interaction_strength) > 0.01:
        significant_effects.append("Interaction")

    return ANOVAResult(
        main_effect_attributes={"F": round(f_attr, 4), "p": round(p_attr, 4)},
        main_effect_experts={"F": round(f_expert, 4), "p": round(p_expert, 4)},
        interaction={
            "F_all_groups": round(f_all, 4),
            "p_all_groups": round(p_all, 4),
            "interaction_strength": round(interaction_strength, 4),
            "super_additive": interaction_strength > 0
        },
        significant_effects=significant_effects
    )
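
# For reference, a hedged sketch of the "proper" factorial ANOVA mentioned above,
# using statsmodels (an optional dependency this module does not otherwise
# require). The helper name and 0/1 factor coding are illustrative assumptions,
# not project API:
#
#   import pandas as pd
#   import statsmodels.api as sm
#   from statsmodels.formula.api import ols
#
#   def anova_2x2_statsmodels(c1, c2, c3, c4):
#       df = pd.DataFrame({
#           "value": c1 + c2 + c3 + c4,
#           "attrs": [0] * len(c1) + [0] * len(c2) + [1] * len(c3) + [1] * len(c4),
#           "experts": [0] * len(c1) + [1] * len(c2) + [0] * len(c3) + [1] * len(c4),
#       })
#       model = ols("value ~ C(attrs) * C(experts)", data=df).fit()
#       # Type-II ANOVA table: F and p for both main effects and the interaction
#       return sm.stats.anova_lm(model, typ=2)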


def _comparison_dict(result: TTestResult) -> Dict[str, Any]:
    """Flatten a TTestResult into the JSON-friendly form used below."""
    return {
        "t": result.t_statistic,
        "p": result.p_value,
        "d": result.effect_size.d,
        "interpretation": result.effect_size.interpretation,
        "significant": result.significant
    }


def analyze_experiment(metrics: Dict[str, Any]) -> Dict[str, Any]:
    """
    Perform full statistical analysis on experiment metrics.

    Returns analysis results for multiple metrics.
    """
    results = {
        "analysis_metrics": [],
        "research_questions": {}
    }

    # Define metrics to analyze
    metrics_to_analyze = [
        ("Survival Rate", "survival_rate"),
        ("Post-Dedup Diversity", "post_dedup_diversity.mean_pairwise_distance"),
        ("Normalized Diversity", "normalized_diversity.mean_pairwise_distance"),
        ("Query Distance", "post_dedup_query_distance.mean_distance"),
        ("Cluster Count", "post_dedup_clusters.optimal_clusters"),
    ]

    for metric_name, metric_path in metrics_to_analyze:
        print(f"\n{'=' * 60}")
        print(f"Analyzing: {metric_name}")
        print("=" * 60)

        # Extract values by condition
        by_condition = extract_metric_values(metrics, metric_path)

        if not by_condition:
            print(f"  No data available for {metric_name}")
            continue

        metric_results = {
            "metric_name": metric_name,
            "metric_path": metric_path,
            "descriptive": {},
            "comparisons": {},
            "anova": None
        }

        # Descriptive statistics
        print("\nDescriptive Statistics:")
        print(f"{'Condition':<25} {'Mean':<10} {'Std':<10} {'N':<5}")
        print("-" * 50)

        for cond, values in sorted(by_condition.items()):
            if values:
                mean = np.mean(values)
                std = np.std(values, ddof=1) if len(values) > 1 else 0
                metric_results["descriptive"][cond] = {
                    "mean": round(mean, 4),
                    "std": round(std, 4),
                    "n": len(values)
                }
                print(f"{cond:<25} {mean:<10.4f} {std:<10.4f} {len(values):<5}")

        # Key comparisons
        comparisons = []

        # 1. C1 (Direct) vs C4 (Full Pipeline) - Main comparison
        if "c1_direct" in by_condition and "c4_full_pipeline" in by_condition:
            result = perform_ttest(
                by_condition["c4_full_pipeline"],
                by_condition["c1_direct"],
                "Full Pipeline", "Direct"
            )
            if result:
                comparisons.append(("C4 vs C1 (Full vs Direct)", result))
                metric_results["comparisons"]["c4_vs_c1"] = _comparison_dict(result)

        # 2. C2 (Expert) vs C5 (Random) - Control comparison
        if "c2_expert_only" in by_condition and "c5_random_perspective" in by_condition:
            result = perform_ttest(
                by_condition["c2_expert_only"],
                by_condition["c5_random_perspective"],
                "Expert", "Random"
            )
            if result:
                comparisons.append(("C2 vs C5 (Expert vs Random)", result))
                metric_results["comparisons"]["c2_vs_c5"] = _comparison_dict(result)

        # 3. C2 (Expert-Only) vs C1 (Direct) - Effect of experts alone
        if "c2_expert_only" in by_condition and "c1_direct" in by_condition:
            result = perform_ttest(
                by_condition["c2_expert_only"],
                by_condition["c1_direct"],
                "Expert-Only", "Direct"
            )
            if result:
                comparisons.append(("C2 vs C1 (Expert effect)", result))
                metric_results["comparisons"]["c2_vs_c1"] = _comparison_dict(result)

        # 4. C3 (Attribute-Only) vs C1 (Direct) - Effect of attributes alone
        if "c3_attribute_only" in by_condition and "c1_direct" in by_condition:
            result = perform_ttest(
                by_condition["c3_attribute_only"],
                by_condition["c1_direct"],
                "Attribute-Only", "Direct"
            )
            if result:
                comparisons.append(("C3 vs C1 (Attribute effect)", result))
                metric_results["comparisons"]["c3_vs_c1"] = _comparison_dict(result)

        # Print comparisons
        if comparisons:
            print("\nPairwise Comparisons:")
            print(f"{'Comparison':<30} {'t':<10} {'p':<10} {'d':<10} {'Sig?':<8}")
            print("-" * 68)
            for name, result in comparisons:
                sig = "Yes*" if result.significant else "No"
                print(f"{name:<30} {result.t_statistic:<10.3f} {result.p_value:<10.4f} "
                      f"{result.effect_size.d:<10.3f} {sig:<8}")

        # 2×2 ANOVA (if all conditions available)
        if all(c in by_condition for c in ["c1_direct", "c2_expert_only", "c3_attribute_only", "c4_full_pipeline"]):
            anova = perform_2x2_anova(
                by_condition["c1_direct"],
                by_condition["c2_expert_only"],
                by_condition["c3_attribute_only"],
                by_condition["c4_full_pipeline"]
            )
            if anova:
                metric_results["anova"] = {
                    "main_effect_attributes": anova.main_effect_attributes,
                    "main_effect_experts": anova.main_effect_experts,
                    "interaction": anova.interaction,
                    "significant_effects": anova.significant_effects
                }

                print("\n2×2 ANOVA Results:")
                print(f"  Main Effect (Attributes): F={anova.main_effect_attributes['F']:.3f}, "
                      f"p={anova.main_effect_attributes['p']:.4f}")
                print(f"  Main Effect (Experts): F={anova.main_effect_experts['F']:.3f}, "
                      f"p={anova.main_effect_experts['p']:.4f}")
                print(f"  Interaction Strength: {anova.interaction['interaction_strength']:.4f} "
                      f"({'super-additive' if anova.interaction['super_additive'] else 'sub-additive'})")
                print(f"  Significant Effects: {', '.join(anova.significant_effects) or 'None'}")

        results["analysis_metrics"].append(metric_results)

    # Summarize research questions
    results["research_questions"] = summarize_research_questions(results["analysis_metrics"])

    return results
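
# Caveat: each metric spawns several uncorrected pairwise tests; across five
# metrics this inflates the family-wise error rate, so a multiple-comparison
# correction (e.g. Holm or Bonferroni) may be warranted in the full experiment.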


def summarize_research_questions(analysis_metrics: List[Dict]) -> Dict[str, str]:
    """Summarize findings for each research question."""
    rq = {}

    # Find the diversity metric results (prefer the normalized variant)
    diversity_results = None
    for m in analysis_metrics:
        if "Diversity" in m["metric_name"] and "Normalized" in m["metric_name"]:
            diversity_results = m
            break
    if diversity_results is None:
        for m in analysis_metrics:
            if "Diversity" in m["metric_name"]:
                diversity_results = m
                break

    if diversity_results:
        anova = diversity_results.get("anova", {})
        comparisons = diversity_results.get("comparisons", {})

        # RQ1: Does attribute decomposition improve diversity?
        if anova and "main_effect_attributes" in anova:
            p = anova["main_effect_attributes"]["p"]
            rq["RQ1_attributes"] = f"Main effect p={p:.4f}. " + \
                ("Significant effect of attributes." if p < 0.05 else "No significant effect.")

        # RQ2: Do expert perspectives improve diversity?
        if anova and "main_effect_experts" in anova:
            p = anova["main_effect_experts"]["p"]
            rq["RQ2_experts"] = f"Main effect p={p:.4f}. " + \
                ("Significant effect of experts." if p < 0.05 else "No significant effect.")

        # RQ3: Interaction effect?
        if anova and "interaction" in anova:
            strength = anova["interaction"]["interaction_strength"]
            super_add = anova["interaction"]["super_additive"]
            rq["RQ3_interaction"] = f"Interaction strength={strength:.4f}. " + \
                ("Super-additive (combination exceeds the additive prediction)." if super_add else "Sub-additive or additive.")

        # RQ5: Expert vs Random (C2 vs C5)
        if "c2_vs_c5" in comparisons:
            comp = comparisons["c2_vs_c5"]
            rq["RQ5_expert_vs_random"] = f"d={comp['d']:.3f} ({comp['interpretation']}), p={comp['p']:.4f}. " + \
                ("Expert knowledge matters." if comp["significant"] and comp["d"] > 0 else "No significant difference from random perspectives.")

    return rq


def print_research_summary(results: Dict[str, Any]):
    """Print summary of research question findings."""
    print("\n" + "=" * 70)
    print("RESEARCH QUESTIONS SUMMARY")
    print("=" * 70)

    rq = results.get("research_questions", {})

    print("\nRQ1: Does attribute decomposition improve semantic diversity?")
    print(f"  → {rq.get('RQ1_attributes', 'Insufficient data')}")

    print("\nRQ2: Do expert perspectives improve semantic diversity?")
    print(f"  → {rq.get('RQ2_experts', 'Insufficient data')}")

    print("\nRQ3: Is there an interaction effect (Full Pipeline > sum of parts)?")
    print(f"  → {rq.get('RQ3_interaction', 'Insufficient data')}")

    print("\nRQ5: Do experts beat random perspectives? (C2 vs C5)")
    print(f"  → {rq.get('RQ5_expert_vs_random', 'Insufficient data')}")

    print("\n" + "=" * 70)
    print("Note: With pilot data (n=1 query), statistical power is limited.")
    print("Full experiment (n=10+ queries) needed for reliable conclusions.")
    print("=" * 70)


def main():
    parser = argparse.ArgumentParser(
        description="Statistical analysis for experiment results"
    )
    parser.add_argument(
        "--input",
        type=str,
        required=True,
        help="Input metrics JSON file"
    )
    parser.add_argument(
        "--output",
        type=str,
        help="Output file path (default: <input stem>_analysis.json next to the input)"
    )

    args = parser.parse_args()

    input_path = Path(args.input)
    if not input_path.exists():
        input_path = RESULTS_DIR / args.input
        if not input_path.exists():
            print(f"Error: Input file not found: {args.input}")
            sys.exit(1)

    # Load metrics
    with open(input_path, "r", encoding="utf-8") as f:
        metrics = json.load(f)

    # Run analysis
    results = analyze_experiment(metrics)

    # Print research summary
    print_research_summary(results)

    # Save results
    if args.output:
        output_path = Path(args.output)
    else:
        stem = input_path.stem.replace("_metrics", "")
        output_path = input_path.parent / f"{stem}_analysis.json"

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False, cls=NumpyEncoder)

    print(f"\nAnalysis saved to: {output_path}")


if __name__ == "__main__":
    main()