""" Statistical analysis for experiment results. Performs: - 2×2 ANOVA for main effects (attributes, experts) and interaction - Post-hoc tests (Tukey HSD) - Effect sizes (Cohen's d) - Control comparison (C2 vs C5) Usage: python -m experiments.analyze_results --input results/experiment_xxx_metrics.json """ import sys import json import argparse from pathlib import Path from typing import List, Dict, Any, Tuple from dataclasses import dataclass import numpy as np class NumpyEncoder(json.JSONEncoder): """JSON encoder that handles numpy types.""" def default(self, obj): if isinstance(obj, (np.integer, np.int64, np.int32)): return int(obj) if isinstance(obj, (np.floating, np.float64, np.float32)): return float(obj) if isinstance(obj, (np.bool_, bool)): return bool(obj) if isinstance(obj, np.ndarray): return obj.tolist() return super().default(obj) # Add experiments to path sys.path.insert(0, str(Path(__file__).parent.parent)) from experiments.config import RESULTS_DIR # Try to import statistical libraries try: from scipy import stats SCIPY_AVAILABLE = True except ImportError: SCIPY_AVAILABLE = False print("Warning: scipy not installed. Some statistical tests will be unavailable.") try: import pandas as pd PANDAS_AVAILABLE = True except ImportError: PANDAS_AVAILABLE = False @dataclass class EffectSize: """Cohen's d effect size with interpretation.""" d: float interpretation: str # small, medium, large @staticmethod def from_groups(group1: List[float], group2: List[float]) -> 'EffectSize': """Calculate Cohen's d from two groups.""" n1, n2 = len(group1), len(group2) if n1 < 2 or n2 < 2: return EffectSize(d=0, interpretation="insufficient data") mean1, mean2 = np.mean(group1), np.mean(group2) var1, var2 = np.var(group1, ddof=1), np.var(group2, ddof=1) # Pooled standard deviation pooled_std = np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2)) if pooled_std == 0: return EffectSize(d=0, interpretation="no variance") d = (mean1 - mean2) / pooled_std # Interpretation (Cohen's conventions) abs_d = abs(d) if abs_d < 0.2: interpretation = "negligible" elif abs_d < 0.5: interpretation = "small" elif abs_d < 0.8: interpretation = "medium" else: interpretation = "large" return EffectSize(d=round(d, 4), interpretation=interpretation) @dataclass class TTestResult: """Independent samples t-test result.""" t_statistic: float p_value: float effect_size: EffectSize significant: bool # p < 0.05 group1_mean: float group2_mean: float group1_std: float group2_std: float group1_n: int group2_n: int @dataclass class ANOVAResult: """2×2 ANOVA result.""" main_effect_attributes: Dict[str, float] # F, p main_effect_experts: Dict[str, float] # F, p interaction: Dict[str, float] # F, p significant_effects: List[str] def extract_metric_values( metrics: Dict[str, Any], metric_path: str ) -> Dict[str, List[float]]: """ Extract values for a specific metric across all queries. Args: metrics: Full metrics dict from compute_metrics.py metric_path: Dot-separated path like "post_dedup_diversity.mean_pairwise_distance" Returns: Dict mapping condition name to list of values """ by_condition = {} for query_metrics in metrics.get("metrics_by_query", []): for condition, cond_metrics in query_metrics.get("conditions", {}).items(): if condition not in by_condition: by_condition[condition] = [] # Navigate the metric path value = cond_metrics for key in metric_path.split("."): if value is None: break if isinstance(value, dict): value = value.get(key) else: value = None if value is not None and isinstance(value, (int, float)): by_condition[condition].append(float(value)) return by_condition def perform_ttest( group1: List[float], group2: List[float], group1_name: str = "Group 1", group2_name: str = "Group 2" ) -> TTestResult: """Perform independent samples t-test.""" if not SCIPY_AVAILABLE: return None if len(group1) < 2 or len(group2) < 2: return None t_stat, p_value = stats.ttest_ind(group1, group2) effect = EffectSize.from_groups(group1, group2) return TTestResult( t_statistic=round(t_stat, 4), p_value=round(p_value, 4), effect_size=effect, significant=p_value < 0.05, group1_mean=round(np.mean(group1), 4), group2_mean=round(np.mean(group2), 4), group1_std=round(np.std(group1, ddof=1), 4), group2_std=round(np.std(group2, ddof=1), 4), group1_n=len(group1), group2_n=len(group2) ) def perform_2x2_anova( c1_direct: List[float], # No attributes, No experts c2_expert: List[float], # No attributes, With experts c3_attribute: List[float], # With attributes, No experts c4_full: List[float] # With attributes, With experts ) -> ANOVAResult: """ Perform 2×2 factorial ANOVA. Factors: - Attributes: Without (C1, C2) vs With (C3, C4) - Experts: Without (C1, C3) vs With (C2, C4) """ if not SCIPY_AVAILABLE: return None # Check minimum data min_n = min(len(c1_direct), len(c2_expert), len(c3_attribute), len(c4_full)) if min_n < 2: return None # For a proper 2×2 ANOVA, we'd use statsmodels or similar # Here we'll compute main effects and interaction manually # Main effect of Attributes: (C3 + C4) vs (C1 + C2) no_attr = c1_direct + c2_expert with_attr = c3_attribute + c4_full f_attr, p_attr = stats.f_oneway(no_attr, with_attr) # Main effect of Experts: (C2 + C4) vs (C1 + C3) no_expert = c1_direct + c3_attribute with_expert = c2_expert + c4_full f_expert, p_expert = stats.f_oneway(no_expert, with_expert) # Interaction: Compare the difference of differences # (C4 - C3) - (C2 - C1) = interaction term # Simplified approach: compare all 4 groups f_all, p_all = stats.f_oneway(c1_direct, c2_expert, c3_attribute, c4_full) # Estimate interaction by checking if combination is super-additive mean1, mean2, mean3, mean4 = np.mean(c1_direct), np.mean(c2_expert), np.mean(c3_attribute), np.mean(c4_full) expected_additive = mean1 + (mean2 - mean1) + (mean3 - mean1) # Additive prediction actual_combination = mean4 interaction_strength = actual_combination - expected_additive significant_effects = [] if p_attr < 0.05: significant_effects.append("Attributes") if p_expert < 0.05: significant_effects.append("Experts") if p_all < 0.05 and abs(interaction_strength) > 0.01: significant_effects.append("Interaction") return ANOVAResult( main_effect_attributes={"F": round(f_attr, 4), "p": round(p_attr, 4)}, main_effect_experts={"F": round(f_expert, 4), "p": round(p_expert, 4)}, interaction={ "F_all_groups": round(f_all, 4), "p_all_groups": round(p_all, 4), "interaction_strength": round(interaction_strength, 4), "super_additive": interaction_strength > 0 }, significant_effects=significant_effects ) def analyze_experiment(metrics: Dict[str, Any]) -> Dict[str, Any]: """ Perform full statistical analysis on experiment metrics. Returns analysis results for multiple metrics. """ results = { "analysis_metrics": [], "research_questions": {} } # Define metrics to analyze metrics_to_analyze = [ ("Survival Rate", "survival_rate"), ("Post-Dedup Diversity", "post_dedup_diversity.mean_pairwise_distance"), ("Normalized Diversity", "normalized_diversity.mean_pairwise_distance"), ("Query Distance", "post_dedup_query_distance.mean_distance"), ("Cluster Count", "post_dedup_clusters.optimal_clusters"), ] for metric_name, metric_path in metrics_to_analyze: print(f"\n{'='*60}") print(f"Analyzing: {metric_name}") print(f"{'='*60}") # Extract values by condition by_condition = extract_metric_values(metrics, metric_path) if not by_condition: print(f" No data available for {metric_name}") continue metric_results = { "metric_name": metric_name, "metric_path": metric_path, "descriptive": {}, "comparisons": {}, "anova": None } # Descriptive statistics print(f"\nDescriptive Statistics:") print(f"{'Condition':<25} {'Mean':<10} {'Std':<10} {'N':<5}") print("-" * 50) for cond, values in sorted(by_condition.items()): if values: mean = np.mean(values) std = np.std(values, ddof=1) if len(values) > 1 else 0 metric_results["descriptive"][cond] = { "mean": round(mean, 4), "std": round(std, 4), "n": len(values) } print(f"{cond:<25} {mean:<10.4f} {std:<10.4f} {len(values):<5}") # Key comparisons comparisons = [] # 1. C1 (Direct) vs C4 (Full Pipeline) - Main comparison if "c1_direct" in by_condition and "c4_full_pipeline" in by_condition: result = perform_ttest( by_condition["c4_full_pipeline"], by_condition["c1_direct"], "Full Pipeline", "Direct" ) if result: comparisons.append(("C4 vs C1 (Full vs Direct)", result)) metric_results["comparisons"]["c4_vs_c1"] = { "t": result.t_statistic, "p": result.p_value, "d": result.effect_size.d, "interpretation": result.effect_size.interpretation, "significant": result.significant } # 2. C2 (Expert) vs C5 (Random) - Control comparison if "c2_expert_only" in by_condition and "c5_random_perspective" in by_condition: result = perform_ttest( by_condition["c2_expert_only"], by_condition["c5_random_perspective"], "Expert", "Random" ) if result: comparisons.append(("C2 vs C5 (Expert vs Random)", result)) metric_results["comparisons"]["c2_vs_c5"] = { "t": result.t_statistic, "p": result.p_value, "d": result.effect_size.d, "interpretation": result.effect_size.interpretation, "significant": result.significant } # 3. C2 (Expert-Only) vs C1 (Direct) - Effect of experts alone if "c2_expert_only" in by_condition and "c1_direct" in by_condition: result = perform_ttest( by_condition["c2_expert_only"], by_condition["c1_direct"], "Expert-Only", "Direct" ) if result: comparisons.append(("C2 vs C1 (Expert effect)", result)) metric_results["comparisons"]["c2_vs_c1"] = { "t": result.t_statistic, "p": result.p_value, "d": result.effect_size.d, "interpretation": result.effect_size.interpretation, "significant": result.significant } # 4. C3 (Attribute-Only) vs C1 (Direct) - Effect of attributes alone if "c3_attribute_only" in by_condition and "c1_direct" in by_condition: result = perform_ttest( by_condition["c3_attribute_only"], by_condition["c1_direct"], "Attribute-Only", "Direct" ) if result: comparisons.append(("C3 vs C1 (Attribute effect)", result)) metric_results["comparisons"]["c3_vs_c1"] = { "t": result.t_statistic, "p": result.p_value, "d": result.effect_size.d, "interpretation": result.effect_size.interpretation, "significant": result.significant } # Print comparisons if comparisons: print(f"\nPairwise Comparisons:") print(f"{'Comparison':<30} {'t':<10} {'p':<10} {'d':<10} {'Sig?':<8}") print("-" * 68) for name, result in comparisons: sig = "Yes*" if result.significant else "No" print(f"{name:<30} {result.t_statistic:<10.3f} {result.p_value:<10.4f} " f"{result.effect_size.d:<10.3f} {sig:<8}") # 2×2 ANOVA (if all conditions available) if all(c in by_condition for c in ["c1_direct", "c2_expert_only", "c3_attribute_only", "c4_full_pipeline"]): anova = perform_2x2_anova( by_condition["c1_direct"], by_condition["c2_expert_only"], by_condition["c3_attribute_only"], by_condition["c4_full_pipeline"] ) if anova: metric_results["anova"] = { "main_effect_attributes": anova.main_effect_attributes, "main_effect_experts": anova.main_effect_experts, "interaction": anova.interaction, "significant_effects": anova.significant_effects } print(f"\n2×2 ANOVA Results:") print(f" Main Effect (Attributes): F={anova.main_effect_attributes['F']:.3f}, " f"p={anova.main_effect_attributes['p']:.4f}") print(f" Main Effect (Experts): F={anova.main_effect_experts['F']:.3f}, " f"p={anova.main_effect_experts['p']:.4f}") print(f" Interaction Strength: {anova.interaction['interaction_strength']:.4f} " f"({'super-additive' if anova.interaction['super_additive'] else 'sub-additive'})") print(f" Significant Effects: {', '.join(anova.significant_effects) or 'None'}") results["analysis_metrics"].append(metric_results) # Summarize research questions results["research_questions"] = summarize_research_questions(results["analysis_metrics"]) return results def summarize_research_questions(analysis_metrics: List[Dict]) -> Dict[str, str]: """Summarize findings for each research question.""" rq = {} # Find the diversity metric results diversity_results = None for m in analysis_metrics: if "Diversity" in m["metric_name"] and "Normalized" in m["metric_name"]: diversity_results = m break if diversity_results is None: for m in analysis_metrics: if "Diversity" in m["metric_name"]: diversity_results = m break if diversity_results: anova = diversity_results.get("anova", {}) comparisons = diversity_results.get("comparisons", {}) # RQ1: Does attribute decomposition improve diversity? if anova and "main_effect_attributes" in anova: p = anova["main_effect_attributes"]["p"] rq["RQ1_attributes"] = f"Main effect p={p:.4f}. " + \ ("Significant effect of attributes." if p < 0.05 else "No significant effect.") # RQ2: Do expert perspectives improve diversity? if anova and "main_effect_experts" in anova: p = anova["main_effect_experts"]["p"] rq["RQ2_experts"] = f"Main effect p={p:.4f}. " + \ ("Significant effect of experts." if p < 0.05 else "No significant effect.") # RQ3: Interaction effect? if anova and "interaction" in anova: strength = anova["interaction"]["interaction_strength"] super_add = anova["interaction"]["super_additive"] rq["RQ3_interaction"] = f"Interaction strength={strength:.4f}. " + \ ("Super-additive (combination better than sum)." if super_add else "Sub-additive or additive.") # RQ5: Expert vs Random (C2 vs C5) if "c2_vs_c5" in comparisons: comp = comparisons["c2_vs_c5"] rq["RQ5_expert_vs_random"] = f"d={comp['d']:.3f} ({comp['interpretation']}), p={comp['p']:.4f}. " + \ ("Expert knowledge matters." if comp["significant"] and comp["d"] > 0 else "No significant difference from random perspectives.") return rq def print_research_summary(results: Dict[str, Any]): """Print summary of research question findings.""" print("\n" + "=" * 70) print("RESEARCH QUESTIONS SUMMARY") print("=" * 70) rq = results.get("research_questions", {}) print("\nRQ1: Does attribute decomposition improve semantic diversity?") print(f" → {rq.get('RQ1_attributes', 'Insufficient data')}") print("\nRQ2: Do expert perspectives improve semantic diversity?") print(f" → {rq.get('RQ2_experts', 'Insufficient data')}") print("\nRQ3: Is there an interaction effect (Full Pipeline > sum of parts)?") print(f" → {rq.get('RQ3_interaction', 'Insufficient data')}") print("\nRQ5: Do experts beat random perspectives? (C2 vs C5)") print(f" → {rq.get('RQ5_expert_vs_random', 'Insufficient data')}") print("\n" + "=" * 70) print("Note: With pilot data (n=1 query), statistical power is limited.") print("Full experiment (n=10+ queries) needed for reliable conclusions.") print("=" * 70) def main(): parser = argparse.ArgumentParser( description="Statistical analysis for experiment results" ) parser.add_argument( "--input", type=str, required=True, help="Input metrics JSON file" ) parser.add_argument( "--output", type=str, help="Output file path (default: input_analysis.json)" ) args = parser.parse_args() input_path = Path(args.input) if not input_path.exists(): input_path = RESULTS_DIR / args.input if not input_path.exists(): print(f"Error: Input file not found: {args.input}") sys.exit(1) # Load metrics with open(input_path, "r", encoding="utf-8") as f: metrics = json.load(f) # Run analysis results = analyze_experiment(metrics) # Print research summary print_research_summary(results) # Save results if args.output: output_path = Path(args.output) else: stem = input_path.stem.replace("_metrics", "") output_path = input_path.parent / f"{stem}_analysis.json" with open(output_path, "w", encoding="utf-8") as f: json.dump(results, f, indent=2, ensure_ascii=False, cls=NumpyEncoder) print(f"\nAnalysis saved to: {output_path}") if __name__ == "__main__": main()