textevolve/run_script.py at main · nickcdryan/textevolve · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
#!/usr/bin/env python
"""
run_script.py - Main entry point for the Agentic Learning System with custom dataset loaders

python run_script.py --iterations 5 --dataset ARC_2024_Training/ --loader arc --no-shuffle

python run_script.py --iterations 5 --dataset ARC_2024_Training/ --loader arc
"""

import os
import sys
import json
import argparse
from pathlib import Path
from typing import Dict, List, Any

from agent_system import AgentSystem
from dataset_loader import create_dataset_loader

# Fixed random seed for reproducibility (if shuffling is enabled)
RANDOM_SEED = 42

def run_agent(iterations: int, loader_config: Dict, use_sandbox: bool = False) -> None:
    """
    Run the agent system for the specified number of iterations.

    Args:
        iterations: Number of iterations to run
        loader_config: Configuration for dataset loader
        use_sandbox: Whether to use Docker sandbox for code execution
    """
    # Create the appropriate dataset loader
    try:
        loader_type = loader_config.pop("loader_type")
        dataset_loader = create_dataset_loader(loader_type, **loader_config)
        print(f"Created {loader_type} dataset loader with {dataset_loader.get_total_count()} examples")

        # Initialize the agent system with the dataset loader
        agent = AgentSystem(dataset_loader=dataset_loader, use_sandbox=use_sandbox)
    except Exception as e:
        print(f"Error initializing system: {e}")
        sys.exit(1)

    print("=" * 80)
    print("Agentic Learning System")
    print("=" * 80)
    print(f"Dataset: {loader_config.get('dataset_path')}")
    print(f"Loader type: {loader_type}")
    print(f"Shuffle data: {loader_config.get('shuffle', True)}")
    print(f"Sandbox enabled: {agent.use_sandbox}")
    print(f"Starting with explore/exploit/refine balance: {agent.explore_rate}/{agent.exploit_rate}/{agent.refine_rate}")
    print(f"Starting batch size: {agent.current_batch_size}")
    print("-" * 80)

    # Run iterations
    for i in range(iterations):
        try:
            result = agent.run_iteration()
            if not result.get("success", True):
                print(f"Iteration {i} failed: {result.get('error', 'Unknown error')}")
                break
        except KeyboardInterrupt:
            print("\nProcess interrupted by user. Saving current state...")
            break
        except Exception as e:
            print(f"\nError in iteration {i}: {e}")
            import traceback
            traceback.print_exc()
            continue

    # Print final summary
    print("\n" + "=" * 80)
    print("Final Results Summary")
    print("=" * 80)

    summaries = agent.get_summaries()

    if summaries:
        # Sort by iteration number
        summaries.sort(key=lambda x: x.get("iteration", 0))

        all_iteration_data = agent.get_all_iterations()

        # Build summary table lines
        summary_lines = []
        summary_lines.append("Performance Trend:")

        header_line = f"{'Iteration':<8} {'Strategy':<12} {'Batch Acc.':<12} {'Prog. Acc.':<16} {'Combined':<12} {'Batch Size':<10} {'Prog. Size':<10} {'Expl/Expt':<10} {'Primary Issue'}"
        summary_lines.append(header_line)

        separator_line = "-" * 120
        summary_lines.append(separator_line)

        # Print performance trend
        print("\nPerformance Trend:")
        print(header_line)
        print(separator_line)

        for summary in summaries:
            iteration = summary.get("iteration", "?")
            strategy = summary.get("strategy", "Unknown")
            batch_accuracy = summary.get("performance", {}).get("accuracy", 0) * 100
            batch_size = summary.get("batch_size", 5)
            explore = summary.get("explore_rate", 0)
            exploit = summary.get("exploit_rate", 0)
            refine = summary.get("refine_rate", 0)
            prog_accuracy = summary.get("progressive_accuracy", None)

            # Get progressive testing sample count
            prog_samples = 0
            for it in all_iteration_data:
                if it and it.get("iteration") == summary.get("iteration"):
                    if "progressive_testing" in it and it["progressive_testing"]:
                        prog_samples = it["progressive_testing"].get("total_examples", 0)
                        break

            issue = summary.get("primary_issue", "None identified")

            # Truncate issue if too long
            if len(issue) > 30:
                issue = issue[:27] + "..."

            # Format progressive accuracy with sample count
            if prog_accuracy is not None:
                prog_acc_str = f"{prog_accuracy*100:.2f}% ({prog_samples})"
            else:
                prog_acc_str = "N/A"

            # Calculate combined accuracy - weighted by sample counts
            combined_acc_str = "N/A"
            if prog_accuracy is not None and batch_accuracy > 0:
                # Correct weighted average calculation
                total_correct = (batch_accuracy/100 * batch_size) + (prog_accuracy * prog_samples)
                total_samples = batch_size + prog_samples
                combined_acc = (total_correct / total_samples) * 100
                combined_acc_str = f"{combined_acc:.2f}%"

            data_line = f"{iteration:<8} {strategy:<12} {batch_accuracy:<12.2f}% {prog_acc_str:<16} {combined_acc_str:<12} {batch_size:<10} {prog_samples:<10} {explore}/{exploit}/{refine:<10} {issue}"

            # Print and save the line
            print(data_line)
            summary_lines.append(data_line)

        # Write summary to file
        try:
            Path("scripts").mkdir(exist_ok=True)
            with open("scripts/summary.txt", "w") as f:
                f.write("\n".join(summary_lines))
            print(f"\nSummary saved to scripts/summary.txt")
        except Exception as e:
            print(f"Error saving summary to file: {e}")

    # Get best script info - with error handling
    try:
        best_script_info = agent.get_best_script_info()
        if best_script_info:
            print("\n=== Current Best Script ===")
            print(f"Iteration: {best_script_info.get('iteration')}")

            # Report batch testing results
            batch_acc = best_script_info.get('accuracy', 0)
            batch_size = best_script_info.get('batch_size', 0)
            print(f"Batch Accuracy: {batch_acc:.2f} (tested on {batch_size} examples)")

            # Report progressive testing results if available
            prog_acc = best_script_info.get('progressive_accuracy')
            if prog_acc is not None:
                prog_samples = best_script_info.get('progressive_samples', 0)
                print(f"Progressive Accuracy: {prog_acc:.2f} (tested on {prog_samples} examples)")

            # Report combined accuracy if available
            combined_acc = best_script_info.get('combined_accuracy')
            if combined_acc is not None:
                total_samples = batch_size
                if prog_acc is not None:
                    total_samples += best_script_info.get('progressive_samples', 0)
                print(f"Combined Accuracy: {combined_acc:.2f} (across all {total_samples} examples)")

            print(f"Path: {best_script_info.get('path')}")
            print(f"Approach: {best_script_info.get('approach')}")
            print(f"Rationale: {best_script_info.get('rationale')}")
            print("\nTo validate this script on a specific range of examples, run:")
            print(f"python validate_script.py --script {best_script_info.get('path')}")
    except Exception as e:
        print(f"Error getting best script info: {e}")
        print("Could not determine best script due to an error.")

    print(f"Final batch size: {agent.current_batch_size}")
    print(f"Total examples seen: {len(agent.seen_examples)}")
    print("=" * 80)


def parse_arguments():
    """Parse command-line arguments."""
    parser = argparse.ArgumentParser(
        description="Run the Agentic Learning System with custom dataset loaders")

    parser.add_argument("--iterations",
                        "-i",
                        type=int,
                        default=5,
                        help="Number of iterations to run (default: 5)")

    # Dataset configuration
    parser.add_argument(
        "--dataset",
        "-d",
        type=str,
        default="dataset.json",
        help="Path to the dataset file or directory (default: dataset.json)")


    parser.add_argument(
            "--loader",
            "-l",
            type=str,
            choices=["arc", "json", "jsonl", "simpleqa", "custom", "natural_plan", "hotpotqa", "math", "gpqa"],
            default="arc",
            help="Type of dataset loader to use (default: arc)")

    # JSON loader options
    parser.add_argument(
        "--input-field",
        "-if",
        type=str,
        default="input",
        help="Field name for input data in JSON/JSONL loader (default: input)")

    parser.add_argument(
        "--output-field",
        "-of",
        type=str,
        default="output",
        help="Field name for output data in JSON/JSONL loader (default: output)")

    parser.add_argument(
        "--example-prefix",
        "-p",
        type=str,
        default="",
        help="Prefix for example keys in JSON loader (default: none)")

    # JSONL loader options
    parser.add_argument(
        "--passage-field",
        type=str,
        default="passage",
        help="Field name for passage text in JSONL loader (default: passage)")

    parser.add_argument(
        "--answer-extraction",
        type=str,
        default="spans",
        help="Field to extract from nested answer data in JSONL loader (default: spans)")

    # General options
    parser.add_argument(
        "--no-shuffle",
        action="store_true",
        help="Disable dataset shuffling (default: False)")

    parser.add_argument(
        "--seed",
        "-s",
        type=int,
        default=RANDOM_SEED,
        help=f"Random seed for dataset shuffling (default: {RANDOM_SEED})")

    # Sandbox options
    parser.add_argument(
        "--sandbox",
        action="store_true",
        help="Enable Docker sandbox for code execution (default: False)")

    return parser.parse_args()

if __name__ == "__main__":
    # Parse command-line arguments
    args = parse_arguments()

    # Check environment variables
    if not os.environ.get("GEMINI_API_KEY"):
        print("Error: GEMINI_API_KEY environment variable is not set.")
        print(
            "Please set this variable to your Gemini API key before running the script."
        )
        print("Example: export GEMINI_API_KEY=your_api_key_here")
        sys.exit(1)

    # Create loader configuration
    loader_config = {
        "loader_type": args.loader,
        "dataset_path": args.dataset,
        "shuffle": not args.no_shuffle,
        "random_seed": args.seed
    }

    # Add loader-specific parameters
    if args.loader == "json":
        loader_config.update({
            "input_field": args.input_field,
            "output_field": args.output_field
        })

        if args.example_prefix:
            loader_config["example_prefix"] = args.example_prefix

    # Add JSONL loader specific parameters
    elif args.loader == "jsonl":
        loader_config.update({
            "input_field": args.input_field,
            "output_field": args.output_field,
            "passage_field": args.passage_field,
            "answer_extraction": args.answer_extraction
        })

    # Run the agent
    run_agent(args.iterations, loader_config, use_sandbox=args.sandbox)