feat: Add experiments framework and novelty-driven agent loop
- Add complete experiments directory with pilot study infrastructure
  - 5 experimental conditions (direct, expert-only, attribute-only, full-pipeline, random-perspective)
  - Human assessment tool with React frontend and FastAPI backend
  - AUT flexibility analysis with jump signal detection
  - Result visualization and metrics computation
- Add novelty-driven agent loop module (experiments/novelty_loop/)
  - NoveltyDrivenTaskAgent with expert perspective perturbation
  - Three termination strategies: breakthrough, exhaust, coverage
  - Interactive CLI demo with colored output
  - Embedding-based novelty scoring
- Add DDC knowledge domain classification data (en/zh)
- Add CLAUDE.md project documentation
- Update research report with experiment findings

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
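As context for the novelty loop described above, here is a minimal sketch of embedding-based novelty scoring, assuming cosine similarity over idea embeddings. The name novelty_score, the numpy implementation, and the strategy glosses in the comments are illustrative assumptions, not the actual API of experiments/novelty_loop/:

import numpy as np

def novelty_score(candidate: np.ndarray, accepted: list[np.ndarray]) -> float:
    """Score an idea embedding by its distance from previously accepted ideas."""
    if not accepted:
        return 1.0  # first idea: maximally novel by convention
    c = candidate / np.linalg.norm(candidate)
    # Cosine similarity to the nearest previously accepted idea.
    nearest = max(float(c @ (a / np.linalg.norm(a))) for a in accepted)
    return 1.0 - nearest

# The commit's three termination strategies could consume this score, e.g.:
#   breakthrough - stop once a score exceeds a high threshold
#   exhaust      - stop after K consecutive low-novelty candidates
#   coverage     - stop once accepted ideas span enough embedding regions
# (These glosses are assumptions based on the strategy names alone.)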
experiments/assessment/prepare_data.py (executable file, 375 additions)
@@ -0,0 +1,375 @@
#!/usr/bin/env python3
"""
Prepare assessment data from experiment results.

Extracts unique ideas from deduped experiment results, assigns stable IDs,
and randomizes the order within each query for unbiased human assessment.

Usage:
    python prepare_data.py                    # Use latest, all ideas
    python prepare_data.py --sample 100       # Sample 100 ideas total
    python prepare_data.py --per-query 10     # 10 ideas per query
    python prepare_data.py --per-condition 5  # 5 ideas per condition per query
    python prepare_data.py --list             # List available files
"""

import argparse
import json
import random
from pathlib import Path
from typing import Any


def load_experiment_data(filepath: Path) -> dict[str, Any]:
    """Load experiment data from JSON file."""
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)


def sample_ideas_stratified(
    ideas: list[dict[str, Any]],
    per_condition: int | None = None,
    total_limit: int | None = None,
    rng: random.Random | None = None
) -> list[dict[str, Any]]:
    """
    Sample ideas with stratification by condition.

    Args:
        ideas: List of ideas with _hidden.condition metadata
        per_condition: Max ideas per condition (stratified sampling)
        total_limit: Max total ideas (after stratified sampling)
        rng: Random number generator for reproducibility

    Returns:
        Sampled list of ideas
    """
    if rng is None:
        rng = random.Random()

    if per_condition is None and total_limit is None:
        return ideas

    # Group by condition
    by_condition: dict[str, list[dict[str, Any]]] = {}
    for idea in ideas:
        condition = idea['_hidden']['condition']
        if condition not in by_condition:
            by_condition[condition] = []
        by_condition[condition].append(idea)

    # Sample per condition
    sampled = []
    for condition, cond_ideas in by_condition.items():
        rng.shuffle(cond_ideas)
        if per_condition is not None:
            cond_ideas = cond_ideas[:per_condition]
        sampled.extend(cond_ideas)

    # Apply total limit if specified
    if total_limit is not None and len(sampled) > total_limit:
        rng.shuffle(sampled)
        sampled = sampled[:total_limit]

    return sampled


def extract_ideas_from_condition(
    query_id: str,
    condition_name: str,
    condition_data: dict[str, Any],
    idea_counter: dict[str, int]
) -> list[dict[str, Any]]:
    """Extract ideas from a single condition with hidden metadata."""
    ideas = []

    dedup_data = condition_data.get('dedup', {})
    unique_ideas_with_source = dedup_data.get('unique_ideas_with_source', [])

    for item in unique_ideas_with_source:
        idea_text = item.get('idea', '')
        if not idea_text:
            continue

        # Generate stable idea ID
        current_count = idea_counter.get(query_id, 0)
        idea_id = f"{query_id}_I{current_count:03d}"
        idea_counter[query_id] = current_count + 1

        ideas.append({
            'idea_id': idea_id,
            'text': idea_text,
            '_hidden': {
                'condition': condition_name,
                'expert_name': item.get('expert_name', ''),
                'keyword': item.get('keyword', '')
            }
        })

    return ideas


def prepare_assessment_data(
    experiment_filepath: Path,
    output_filepath: Path,
    seed: int = 42,
    sample_total: int | None = None,
    per_query: int | None = None,
    per_condition: int | None = None
) -> dict[str, Any]:
    """
    Prepare assessment data from experiment results.

    Args:
        experiment_filepath: Path to deduped experiment JSON
        output_filepath: Path to write assessment items JSON
        seed: Random seed for reproducible shuffling
        sample_total: Total number of ideas to sample (across all queries)
        per_query: Maximum ideas per query
        per_condition: Maximum ideas per condition per query (stratified)

    Returns:
        Assessment data structure
    """
    rng = random.Random(seed)

    # Load experiment data
    data = load_experiment_data(experiment_filepath)
    experiment_id = data.get('experiment_id', 'unknown')
    conditions = data.get('conditions', [])
    results = data.get('results', [])

    print(f"Loading experiment: {experiment_id}")
    print(f"Conditions: {conditions}")
    print(f"Number of queries: {len(results)}")

    # Show sampling config
    if sample_total or per_query or per_condition:
        print(f"Sampling config: total={sample_total}, per_query={per_query}, per_condition={per_condition}")

    assessment_queries = []
    total_ideas = 0
    idea_counter: dict[str, int] = {}

    for result in results:
        query_id = result.get('query_id', '')
        query_text = result.get('query', '')
        category = result.get('category', '')

        query_ideas = []

        # Extract ideas from all conditions
        conditions_data = result.get('conditions', {})
        for condition_name, condition_data in conditions_data.items():
            ideas = extract_ideas_from_condition(
                query_id, condition_name, condition_data, idea_counter
            )
            query_ideas.extend(ideas)

        # Apply stratified sampling if per_condition is specified
        if per_condition is not None:
            query_ideas = sample_ideas_stratified(
                query_ideas,
                per_condition=per_condition,
                rng=rng
            )

        # Apply per-query limit
        if per_query is not None and len(query_ideas) > per_query:
            rng.shuffle(query_ideas)
            query_ideas = query_ideas[:per_query]

        # Shuffle ideas within this query
        rng.shuffle(query_ideas)

        assessment_queries.append({
            'query_id': query_id,
            'query_text': query_text,
            'category': category,
            'ideas': query_ideas,
            'idea_count': len(query_ideas)
        })

        total_ideas += len(query_ideas)
        print(f"  Query '{query_text}' ({query_id}): {len(query_ideas)} ideas")

    # Apply total sample limit across all queries (proportionally)
    if sample_total is not None and total_ideas > sample_total:
        print(f"\nApplying total sample limit: {sample_total} (from {total_ideas})")
        # Calculate proportion to keep
        keep_ratio = sample_total / total_ideas
        new_total = 0

        for query in assessment_queries:
            n_keep = max(1, int(len(query['ideas']) * keep_ratio))
            rng.shuffle(query['ideas'])
            query['ideas'] = query['ideas'][:n_keep]
            query['idea_count'] = len(query['ideas'])
            new_total += len(query['ideas'])

        total_ideas = new_total

    # Build output structure
    assessment_data = {
        'experiment_id': experiment_id,
        'queries': assessment_queries,
        'total_ideas': total_ideas,
        'query_count': len(assessment_queries),
        'conditions': conditions,
        'randomization_seed': seed,
        'sampling': {
            'sample_total': sample_total,
            'per_query': per_query,
            'per_condition': per_condition
        },
        'metadata': {
            'source_file': str(experiment_filepath.name),
            'prepared_for': 'human_assessment'
        }
    }

    # Write output
    output_filepath.parent.mkdir(parents=True, exist_ok=True)
    with open(output_filepath, 'w', encoding='utf-8') as f:
        json.dump(assessment_data, f, ensure_ascii=False, indent=2)

    print(f"\nTotal ideas for assessment: {total_ideas}")
    print(f"Output written to: {output_filepath}")

    return assessment_data


def list_experiment_files(results_dir: Path) -> list[Path]:
    """List available deduped experiment files."""
    return sorted(results_dir.glob('*_deduped.json'), key=lambda p: p.stat().st_mtime, reverse=True)


def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(
        description='Prepare assessment data from experiment results.',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python prepare_data.py                                    # Use latest, all ideas
  python prepare_data.py --sample 100                       # Sample 100 ideas total
  python prepare_data.py --per-query 20                     # Max 20 ideas per query
  python prepare_data.py --per-condition 4                  # 4 ideas per condition per query
  python prepare_data.py --per-condition 4 --per-query 15   # Combined limits
  python prepare_data.py --list                             # List available files

Recommended for human assessment:
  # 5 conditions × 4 ideas × 10 queries = 200 ideas (balanced)
  python prepare_data.py --per-condition 4

  # Or limit total to ~150 ideas
  python prepare_data.py --sample 150
"""
    )
    parser.add_argument(
        'experiment_file',
        nargs='?',
        default=None,
        help='Experiment file name (e.g., experiment_20260119_165650_deduped.json)'
    )
    parser.add_argument(
        '--list', '-l',
        action='store_true',
        help='List available experiment files'
    )
    parser.add_argument(
        '--sample',
        type=int,
        default=None,
        metavar='N',
        help='Total number of ideas to sample (proportionally across queries)'
    )
    parser.add_argument(
        '--per-query',
        type=int,
        default=None,
        metavar='N',
        help='Maximum ideas per query'
    )
    parser.add_argument(
        '--per-condition',
        type=int,
        default=None,
        metavar='N',
        help='Maximum ideas per condition per query (stratified sampling)'
    )
    parser.add_argument(
        '--seed', '-s',
        type=int,
        default=42,
        help='Random seed for shuffling (default: 42)'
    )
    args = parser.parse_args()

    # Paths
    base_dir = Path(__file__).parent.parent
    results_dir = base_dir / 'results'
    output_file = Path(__file__).parent / 'data' / 'assessment_items.json'

    # List available files
    available_files = list_experiment_files(results_dir)

    if args.list:
        print("Available experiment files (most recent first):")
        for f in available_files:
            size_kb = f.stat().st_size / 1024
            print(f"  {f.name} ({size_kb:.1f} KB)")
        return

    # Determine which file to use
    if args.experiment_file:
        experiment_file = results_dir / args.experiment_file
        if not experiment_file.exists():
            # The name may have been given without the .json extension; try appending it
            experiment_file = results_dir / f"{args.experiment_file}.json"
    else:
        # Use the latest deduped file
        if not available_files:
            print("Error: No deduped experiment files found in results directory.")
            return
        experiment_file = available_files[0]
        print(f"Using latest experiment file: {experiment_file.name}")

    if not experiment_file.exists():
        print(f"Error: Experiment file not found: {experiment_file}")
        print("\nAvailable files:")
        for f in available_files:
            print(f"  {f.name}")
        return

    prepare_assessment_data(
        experiment_file,
        output_file,
        seed=args.seed,
        sample_total=args.sample,
        per_query=args.per_query,
        per_condition=args.per_condition
    )

    # Verify output
    with open(output_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print("\n--- Verification ---")
    print(f"Queries: {data['query_count']}")
    print(f"Total ideas: {data['total_ideas']}")

    # Show distribution by condition (from hidden metadata)
    condition_counts: dict[str, int] = {}
    for query in data['queries']:
        for idea in query['ideas']:
            condition = idea['_hidden']['condition']
            condition_counts[condition] = condition_counts.get(condition, 0) + 1

    print("\nIdeas per condition:")
    for condition, count in sorted(condition_counts.items()):
        print(f"  {condition}: {count}")


if __name__ == '__main__':
    main()
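A hypothetical downstream check of the file this script writes. Field names mirror the assessment_data structure built above; the relative path and the indexing are assumptions about how an assessment tool might read it:

import json
from pathlib import Path

# Load the prepared items (path as configured in main() above).
data = json.loads(Path('data/assessment_items.json').read_text(encoding='utf-8'))

first_query = data['queries'][0]
first_idea = first_query['ideas'][0]
print(first_query['query_text'], first_query['idea_count'])
print(first_idea['idea_id'], first_idea['text'])

# first_idea['_hidden'] carries condition/expert_name/keyword; it exists so the
# assessment UI can keep condition labels hidden from human raters.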