novelty-seeking/experiments/analyze_results.py
gbanyan 43c025e060 feat: Add experiments framework and novelty-driven agent loop
- Add complete experiments directory with pilot study infrastructure
  - 5 experimental conditions (direct, expert-only, attribute-only, full-pipeline, random-perspective)
  - Human assessment tool with React frontend and FastAPI backend
  - AUT flexibility analysis with jump signal detection
  - Result visualization and metrics computation

- Add novelty-driven agent loop module (experiments/novelty_loop/)
  - NoveltyDrivenTaskAgent with expert perspective perturbation
  - Three termination strategies: breakthrough, exhaust, coverage
  - Interactive CLI demo with colored output
  - Embedding-based novelty scoring

- Add DDC knowledge domain classification data (en/zh)
- Add CLAUDE.md project documentation
- Update research report with experiment findings

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 10:16:21 +08:00

"""
Statistical analysis for experiment results.
Performs:
- 2×2 ANOVA for main effects (attributes, experts) and interaction
- Post-hoc tests (Tukey HSD)
- Effect sizes (Cohen's d)
- Control comparison (C2 vs C5)
Usage:
python -m experiments.analyze_results --input results/experiment_xxx_metrics.json
"""
import sys
import json
import argparse
from pathlib import Path
from typing import List, Dict, Any, Tuple
from dataclasses import dataclass
import numpy as np
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that handles numpy types."""

    def default(self, obj):
        if isinstance(obj, (np.integer, np.int64, np.int32)):
            return int(obj)
        if isinstance(obj, (np.floating, np.float64, np.float32)):
            return float(obj)
        if isinstance(obj, (np.bool_, bool)):
            return bool(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)
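
# Usage sketch: NumpyEncoder is passed to json.dump at the bottom of this module;
# e.g. json.dumps({"d": np.float64(0.42)}, cls=NumpyEncoder) serializes to '{"d": 0.42}'
# instead of raising TypeError.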
# Add experiments to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from experiments.config import RESULTS_DIR
# Try to import statistical libraries
try:
    from scipy import stats
    SCIPY_AVAILABLE = True
except ImportError:
    SCIPY_AVAILABLE = False
    print("Warning: scipy not installed. Some statistical tests will be unavailable.")

try:
    import pandas as pd
    PANDAS_AVAILABLE = True
except ImportError:
    PANDAS_AVAILABLE = False
@dataclass
class EffectSize:
    """Cohen's d effect size with interpretation."""
    d: float
    interpretation: str  # negligible, small, medium, or large

    @staticmethod
    def from_groups(group1: List[float], group2: List[float]) -> 'EffectSize':
        """Calculate Cohen's d from two groups."""
        n1, n2 = len(group1), len(group2)
        if n1 < 2 or n2 < 2:
            return EffectSize(d=0, interpretation="insufficient data")
        mean1, mean2 = np.mean(group1), np.mean(group2)
        var1, var2 = np.var(group1, ddof=1), np.var(group2, ddof=1)
        # Pooled standard deviation
        pooled_std = np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))
        if pooled_std == 0:
            return EffectSize(d=0, interpretation="no variance")
        d = (mean1 - mean2) / pooled_std
        # Interpretation (Cohen's conventions)
        abs_d = abs(d)
        if abs_d < 0.2:
            interpretation = "negligible"
        elif abs_d < 0.5:
            interpretation = "small"
        elif abs_d < 0.8:
            interpretation = "medium"
        else:
            interpretation = "large"
        return EffectSize(d=round(d, 4), interpretation=interpretation)
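
# Usage sketch (illustrative numbers only, not experiment data):
#   EffectSize.from_groups([0.61, 0.64, 0.66], [0.52, 0.55, 0.57])
#   -> EffectSize(d≈3.58, interpretation="large"); the sign follows group1 - group2.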
@dataclass
class TTestResult:
    """Independent samples t-test result."""
    t_statistic: float
    p_value: float
    effect_size: EffectSize
    significant: bool  # p < 0.05
    group1_mean: float
    group2_mean: float
    group1_std: float
    group2_std: float
    group1_n: int
    group2_n: int
@dataclass
class ANOVAResult:
    """2×2 ANOVA result."""
    main_effect_attributes: Dict[str, float]  # F, p
    main_effect_experts: Dict[str, float]  # F, p
    interaction: Dict[str, float]  # F, p
    significant_effects: List[str]
def extract_metric_values(
    metrics: Dict[str, Any],
    metric_path: str
) -> Dict[str, List[float]]:
    """
    Extract values for a specific metric across all queries.

    Args:
        metrics: Full metrics dict from compute_metrics.py
        metric_path: Dot-separated path like "post_dedup_diversity.mean_pairwise_distance"

    Returns:
        Dict mapping condition name to list of values
    """
    by_condition = {}
    for query_metrics in metrics.get("metrics_by_query", []):
        for condition, cond_metrics in query_metrics.get("conditions", {}).items():
            if condition not in by_condition:
                by_condition[condition] = []
            # Navigate the metric path
            value = cond_metrics
            for key in metric_path.split("."):
                if value is None:
                    break
                if isinstance(value, dict):
                    value = value.get(key)
                else:
                    value = None
            if value is not None and isinstance(value, (int, float)):
                by_condition[condition].append(float(value))
    return by_condition
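
# Example (hypothetical metrics structure, mirroring what compute_metrics.py emits):
#   m = {"metrics_by_query": [{"conditions": {"c1_direct": {"survival_rate": 0.8},
#                                             "c4_full_pipeline": {"survival_rate": 0.9}}}]}
#   extract_metric_values(m, "survival_rate")
#   -> {"c1_direct": [0.8], "c4_full_pipeline": [0.9]}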
def perform_ttest(
    group1: List[float],
    group2: List[float],
    group1_name: str = "Group 1",
    group2_name: str = "Group 2"
) -> TTestResult:
    """Perform independent samples t-test."""
    if not SCIPY_AVAILABLE:
        return None
    if len(group1) < 2 or len(group2) < 2:
        return None
    t_stat, p_value = stats.ttest_ind(group1, group2)
    effect = EffectSize.from_groups(group1, group2)
    return TTestResult(
        t_statistic=round(t_stat, 4),
        p_value=round(p_value, 4),
        effect_size=effect,
        significant=p_value < 0.05,
        group1_mean=round(np.mean(group1), 4),
        group2_mean=round(np.mean(group2), 4),
        group1_std=round(np.std(group1, ddof=1), 4),
        group2_std=round(np.std(group2, ddof=1), 4),
        group1_n=len(group1),
        group2_n=len(group2)
    )
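
# Usage sketch (toy numbers): perform_ttest([0.90, 0.85, 0.88], [0.70, 0.72, 0.69])
# returns a TTestResult with t/p from scipy.stats.ttest_ind and Cohen's d attached.
# Note: scipy.stats.ttest_ind assumes equal variances by default (Student's t, not Welch's).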
def perform_2x2_anova(
    c1_direct: List[float],     # No attributes, No experts
    c2_expert: List[float],     # No attributes, With experts
    c3_attribute: List[float],  # With attributes, No experts
    c4_full: List[float]        # With attributes, With experts
) -> ANOVAResult:
    """
    Perform 2×2 factorial ANOVA.

    Factors:
    - Attributes: Without (C1, C2) vs With (C3, C4)
    - Experts: Without (C1, C3) vs With (C2, C4)
    """
    if not SCIPY_AVAILABLE:
        return None
    # Check minimum data
    min_n = min(len(c1_direct), len(c2_expert), len(c3_attribute), len(c4_full))
    if min_n < 2:
        return None
    # For a proper 2×2 ANOVA, we'd use statsmodels or similar.
    # Here we compute main effects and interaction manually.

    # Main effect of Attributes: (C3 + C4) vs (C1 + C2)
    no_attr = c1_direct + c2_expert
    with_attr = c3_attribute + c4_full
    f_attr, p_attr = stats.f_oneway(no_attr, with_attr)
    # Main effect of Experts: (C2 + C4) vs (C1 + C3)
    no_expert = c1_direct + c3_attribute
    with_expert = c2_expert + c4_full
    f_expert, p_expert = stats.f_oneway(no_expert, with_expert)
    # Interaction: compare the difference of differences
    # (C4 - C3) - (C2 - C1) = interaction term
    # Simplified approach: compare all 4 groups
    f_all, p_all = stats.f_oneway(c1_direct, c2_expert, c3_attribute, c4_full)
    # Estimate interaction by checking whether the combination is super-additive
    mean1, mean2, mean3, mean4 = np.mean(c1_direct), np.mean(c2_expert), np.mean(c3_attribute), np.mean(c4_full)
    expected_additive = mean1 + (mean2 - mean1) + (mean3 - mean1)  # Additive prediction
    actual_combination = mean4
    interaction_strength = actual_combination - expected_additive
    significant_effects = []
    if p_attr < 0.05:
        significant_effects.append("Attributes")
    if p_expert < 0.05:
        significant_effects.append("Experts")
    if p_all < 0.05 and abs(interaction_strength) > 0.01:
        significant_effects.append("Interaction")
    return ANOVAResult(
        main_effect_attributes={"F": round(f_attr, 4), "p": round(p_attr, 4)},
        main_effect_experts={"F": round(f_expert, 4), "p": round(p_expert, 4)},
        interaction={
            "F_all_groups": round(f_all, 4),
            "p_all_groups": round(p_all, 4),
            "interaction_strength": round(interaction_strength, 4),
            "super_additive": interaction_strength > 0
        },
        significant_effects=significant_effects
    )
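
# Sketch of a full factorial ANOVA (assumes statsmodels, which is NOT a dependency
# of this script; shown only to document what the manual main-effect/interaction
# estimate above approximates):
#
#   import pandas as pd
#   import statsmodels.api as sm
#   from statsmodels.formula.api import ols
#
#   df = pd.DataFrame({
#       "value": c1_direct + c2_expert + c3_attribute + c4_full,
#       "attributes": (["no"] * (len(c1_direct) + len(c2_expert))
#                      + ["yes"] * (len(c3_attribute) + len(c4_full))),
#       "experts": (["no"] * len(c1_direct) + ["yes"] * len(c2_expert)
#                   + ["no"] * len(c3_attribute) + ["yes"] * len(c4_full)),
#   })
#   model = ols("value ~ C(attributes) * C(experts)", data=df).fit()
#   table = sm.stats.anova_lm(model, typ=2)  # F and p for both main effects and the interaction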
def analyze_experiment(metrics: Dict[str, Any]) -> Dict[str, Any]:
    """
    Perform full statistical analysis on experiment metrics.

    Returns analysis results for multiple metrics.
    """
    results = {
        "analysis_metrics": [],
        "research_questions": {}
    }
    # Define metrics to analyze
    metrics_to_analyze = [
        ("Survival Rate", "survival_rate"),
        ("Post-Dedup Diversity", "post_dedup_diversity.mean_pairwise_distance"),
        ("Normalized Diversity", "normalized_diversity.mean_pairwise_distance"),
        ("Query Distance", "post_dedup_query_distance.mean_distance"),
        ("Cluster Count", "post_dedup_clusters.optimal_clusters"),
    ]
    for metric_name, metric_path in metrics_to_analyze:
        print(f"\n{'='*60}")
        print(f"Analyzing: {metric_name}")
        print(f"{'='*60}")
        # Extract values by condition
        by_condition = extract_metric_values(metrics, metric_path)
        if not by_condition:
            print(f"  No data available for {metric_name}")
            continue
        metric_results = {
            "metric_name": metric_name,
            "metric_path": metric_path,
            "descriptive": {},
            "comparisons": {},
            "anova": None
        }
        # Descriptive statistics
        print(f"\nDescriptive Statistics:")
        print(f"{'Condition':<25} {'Mean':<10} {'Std':<10} {'N':<5}")
        print("-" * 50)
        for cond, values in sorted(by_condition.items()):
            if values:
                mean = np.mean(values)
                std = np.std(values, ddof=1) if len(values) > 1 else 0
                metric_results["descriptive"][cond] = {
                    "mean": round(mean, 4),
                    "std": round(std, 4),
                    "n": len(values)
                }
                print(f"{cond:<25} {mean:<10.4f} {std:<10.4f} {len(values):<5}")
        # Key comparisons
        comparisons = []
        # 1. C1 (Direct) vs C4 (Full Pipeline) - Main comparison
        if "c1_direct" in by_condition and "c4_full_pipeline" in by_condition:
            result = perform_ttest(
                by_condition["c4_full_pipeline"],
                by_condition["c1_direct"],
                "Full Pipeline", "Direct"
            )
            if result:
                comparisons.append(("C4 vs C1 (Full vs Direct)", result))
                metric_results["comparisons"]["c4_vs_c1"] = {
                    "t": result.t_statistic,
                    "p": result.p_value,
                    "d": result.effect_size.d,
                    "interpretation": result.effect_size.interpretation,
                    "significant": result.significant
                }
        # 2. C2 (Expert) vs C5 (Random) - Control comparison
        if "c2_expert_only" in by_condition and "c5_random_perspective" in by_condition:
            result = perform_ttest(
                by_condition["c2_expert_only"],
                by_condition["c5_random_perspective"],
                "Expert", "Random"
            )
            if result:
                comparisons.append(("C2 vs C5 (Expert vs Random)", result))
                metric_results["comparisons"]["c2_vs_c5"] = {
                    "t": result.t_statistic,
                    "p": result.p_value,
                    "d": result.effect_size.d,
                    "interpretation": result.effect_size.interpretation,
                    "significant": result.significant
                }
        # 3. C2 (Expert-Only) vs C1 (Direct) - Effect of experts alone
        if "c2_expert_only" in by_condition and "c1_direct" in by_condition:
            result = perform_ttest(
                by_condition["c2_expert_only"],
                by_condition["c1_direct"],
                "Expert-Only", "Direct"
            )
            if result:
                comparisons.append(("C2 vs C1 (Expert effect)", result))
                metric_results["comparisons"]["c2_vs_c1"] = {
                    "t": result.t_statistic,
                    "p": result.p_value,
                    "d": result.effect_size.d,
                    "interpretation": result.effect_size.interpretation,
                    "significant": result.significant
                }
        # 4. C3 (Attribute-Only) vs C1 (Direct) - Effect of attributes alone
        if "c3_attribute_only" in by_condition and "c1_direct" in by_condition:
            result = perform_ttest(
                by_condition["c3_attribute_only"],
                by_condition["c1_direct"],
                "Attribute-Only", "Direct"
            )
            if result:
                comparisons.append(("C3 vs C1 (Attribute effect)", result))
                metric_results["comparisons"]["c3_vs_c1"] = {
                    "t": result.t_statistic,
                    "p": result.p_value,
                    "d": result.effect_size.d,
                    "interpretation": result.effect_size.interpretation,
                    "significant": result.significant
                }
        # Print comparisons
        if comparisons:
            print(f"\nPairwise Comparisons:")
            print(f"{'Comparison':<30} {'t':<10} {'p':<10} {'d':<10} {'Sig?':<8}")
            print("-" * 68)
            for name, result in comparisons:
                sig = "Yes*" if result.significant else "No"
                print(f"{name:<30} {result.t_statistic:<10.3f} {result.p_value:<10.4f} "
                      f"{result.effect_size.d:<10.3f} {sig:<8}")
        # 2×2 ANOVA (if all conditions available)
        if all(c in by_condition for c in ["c1_direct", "c2_expert_only", "c3_attribute_only", "c4_full_pipeline"]):
            anova = perform_2x2_anova(
                by_condition["c1_direct"],
                by_condition["c2_expert_only"],
                by_condition["c3_attribute_only"],
                by_condition["c4_full_pipeline"]
            )
            if anova:
                metric_results["anova"] = {
                    "main_effect_attributes": anova.main_effect_attributes,
                    "main_effect_experts": anova.main_effect_experts,
                    "interaction": anova.interaction,
                    "significant_effects": anova.significant_effects
                }
                print(f"\n2×2 ANOVA Results:")
                print(f"  Main Effect (Attributes): F={anova.main_effect_attributes['F']:.3f}, "
                      f"p={anova.main_effect_attributes['p']:.4f}")
                print(f"  Main Effect (Experts): F={anova.main_effect_experts['F']:.3f}, "
                      f"p={anova.main_effect_experts['p']:.4f}")
                print(f"  Interaction Strength: {anova.interaction['interaction_strength']:.4f} "
                      f"({'super-additive' if anova.interaction['super_additive'] else 'sub-additive'})")
                print(f"  Significant Effects: {', '.join(anova.significant_effects) or 'None'}")
        results["analysis_metrics"].append(metric_results)
    # Summarize research questions
    results["research_questions"] = summarize_research_questions(results["analysis_metrics"])
    return results
def summarize_research_questions(analysis_metrics: List[Dict]) -> Dict[str, str]:
    """Summarize findings for each research question."""
    rq = {}
    # Find the diversity metric results
    diversity_results = None
    for m in analysis_metrics:
        if "Diversity" in m["metric_name"] and "Normalized" in m["metric_name"]:
            diversity_results = m
            break
    if diversity_results is None:
        for m in analysis_metrics:
            if "Diversity" in m["metric_name"]:
                diversity_results = m
                break
    if diversity_results:
        anova = diversity_results.get("anova", {})
        comparisons = diversity_results.get("comparisons", {})
        # RQ1: Does attribute decomposition improve diversity?
        if anova and "main_effect_attributes" in anova:
            p = anova["main_effect_attributes"]["p"]
            rq["RQ1_attributes"] = f"Main effect p={p:.4f}. " + \
                ("Significant effect of attributes." if p < 0.05 else "No significant effect.")
        # RQ2: Do expert perspectives improve diversity?
        if anova and "main_effect_experts" in anova:
            p = anova["main_effect_experts"]["p"]
            rq["RQ2_experts"] = f"Main effect p={p:.4f}. " + \
                ("Significant effect of experts." if p < 0.05 else "No significant effect.")
        # RQ3: Interaction effect?
        if anova and "interaction" in anova:
            strength = anova["interaction"]["interaction_strength"]
            super_add = anova["interaction"]["super_additive"]
            rq["RQ3_interaction"] = f"Interaction strength={strength:.4f}. " + \
                ("Super-additive (combination better than sum)." if super_add else "Sub-additive or additive.")
        # RQ5: Expert vs Random (C2 vs C5)
        if "c2_vs_c5" in comparisons:
            comp = comparisons["c2_vs_c5"]
            rq["RQ5_expert_vs_random"] = f"d={comp['d']:.3f} ({comp['interpretation']}), p={comp['p']:.4f}. " + \
                ("Expert knowledge matters." if comp["significant"] and comp["d"] > 0 else "No significant difference from random perspectives.")
    return rq
def print_research_summary(results: Dict[str, Any]):
    """Print summary of research question findings."""
    print("\n" + "=" * 70)
    print("RESEARCH QUESTIONS SUMMARY")
    print("=" * 70)
    rq = results.get("research_questions", {})
    print("\nRQ1: Does attribute decomposition improve semantic diversity?")
    print(f"{rq.get('RQ1_attributes', 'Insufficient data')}")
    print("\nRQ2: Do expert perspectives improve semantic diversity?")
    print(f"{rq.get('RQ2_experts', 'Insufficient data')}")
    print("\nRQ3: Is there an interaction effect (Full Pipeline > sum of parts)?")
    print(f"{rq.get('RQ3_interaction', 'Insufficient data')}")
    print("\nRQ5: Do experts beat random perspectives? (C2 vs C5)")
    print(f"{rq.get('RQ5_expert_vs_random', 'Insufficient data')}")
    print("\n" + "=" * 70)
    print("Note: With pilot data (n=1 query), statistical power is limited.")
    print("Full experiment (n=10+ queries) needed for reliable conclusions.")
    print("=" * 70)
def main():
    parser = argparse.ArgumentParser(
        description="Statistical analysis for experiment results"
    )
    parser.add_argument(
        "--input",
        type=str,
        required=True,
        help="Input metrics JSON file"
    )
    parser.add_argument(
        "--output",
        type=str,
        help="Output file path (default: input_analysis.json)"
    )
    args = parser.parse_args()
    input_path = Path(args.input)
    if not input_path.exists():
        input_path = RESULTS_DIR / args.input
        if not input_path.exists():
            print(f"Error: Input file not found: {args.input}")
            sys.exit(1)
    # Load metrics
    with open(input_path, "r", encoding="utf-8") as f:
        metrics = json.load(f)
    # Run analysis
    results = analyze_experiment(metrics)
    # Print research summary
    print_research_summary(results)
    # Save results
    if args.output:
        output_path = Path(args.output)
    else:
        stem = input_path.stem.replace("_metrics", "")
        output_path = input_path.parent / f"{stem}_analysis.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False, cls=NumpyEncoder)
    print(f"\nAnalysis saved to: {output_path}")


if __name__ == "__main__":
    main()