Integrate Terminal Bench Evaluation #1154
Open

XinyuJiangCMU wants to merge 6 commits into THUDM:main from XinyuJiangCMU:feat/tb-eval-integration
Changes from all commits (6 commits):

- 610e766 Add TerminalBench eval scaffold
- e98c713 feat(eval): add Terminal Bench eval delegate
- d99048b successfully integrate tb in slime delegate eval with train
- 75540ce write quick-start for slime + tb delegate eval
- 98b5ce4 modify code and quick-start based on review comments
- 1fd519d add README-cn.md
Files changed:
File renamed without changes.
New file (@@ -0,0 +1,29 @@):

```yaml
eval:
  defaults:
    n_samples_per_eval_prompt: 1
    temperature: 0.6
    top_p: 0.95
    top_k: -1
    max_response_len: 24576
  datasets: # these eval tasks go through slime dataset config and default rollout function (slime.rollout.sglang_rollout.generate_rollout)
    - name: gpqa # huggingface-cli download --repo-type dataset zyzshishui0627/gpqa_diamond --local-dir /root/gpqa
      path: /root/gpqa/gpqa_eval.jsonl
      rm_type: gpqa
      n_samples_per_eval_prompt: 2
    - name: ifbench # huggingface-cli download --repo-type dataset zyzshishui0627/IFBench --local-dir /root/ifbench
      path: /root/ifbench/IFBench_eval.jsonl
      rm_type: ifbench
      n_samples_per_eval_prompt: 1
  delegate:
    - name: terminal_bench
      url: http://172.17.0.1:9051 # Port must match the TB server running on the host machine
      timeout_secs: 86400 # 24 hours
      max_retries: 1 # HTTP request retries from Slime to the TB server
      model_name: qwen3-8b
      api_base: http://127.0.0.1:30005/v1 # Port must match the sglang router port set in run-eval-tb-qwen.sh
      dataset_path: /mnt/data/xinyu/program/slime-tb/terminal-bench/tasks # Dataset path on the host machine
      # task_ids:
      #   - hello-world
      # n_tasks: 10
      n_attempts: 1 # TB task-level retries (per task within tb run)
      n_concurrent: 8
```
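Before a run, it is worth checking that the two endpoints referenced under `delegate` are actually reachable. A minimal sketch, assuming the TB server is already running on the host and that the sglang router (once the training job has started it) serves the standard OpenAI-compatible `/v1/models` route:

```bash
# From inside the Slime container: is the TB delegate server reachable on the host?
# Any HTTP status code (even 404) means the port is open; 000 means the connection failed.
curl -sS -o /dev/null -w 'TB server     -> HTTP %{http_code}\n' http://172.17.0.1:9051

# Is the sglang router serving the OpenAI-compatible API?
# (/v1/models is the standard listing route; adjust if your deployment differs.)
curl -sS -o /dev/null -w 'sglang router -> HTTP %{http_code}\n' http://127.0.0.1:30005/v1/models
```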
New file (@@ -0,0 +1,159 @@):

```bash
#!/bin/bash

# Example launcher that reuses the Qwen3-8B recipe but delegates evaluation to an
# external Terminal Bench server via the eval_delegate_rollout wrapper.

# Clean up any stale processes from a previous run.
pkill -9 sglang
sleep 3
ray stop --force
pkill -9 ray
pkill -9 python
sleep 3
pkill -9 ray
pkill -9 python

set -ex

export PYTHONBUFFERED=16
export SLIME_HOST_IP=${SLIME_HOST_IP:-"127.0.0.1"}

MODEL_DIR="${MODEL_DIR:-/root/.cache}"
export MODEL_DIR

NVLINK_COUNT=$(nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l)
if [ "$NVLINK_COUNT" -gt 0 ]; then
    HAS_NVLINK=1
else
    HAS_NVLINK=0
fi
echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)"

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../.." &>/dev/null && pwd)"
source "${REPO_ROOT}/scripts/models/qwen3-8B.sh"

# Store eval/delegate settings in a YAML config similar to examples/eval_multi_task.
EVAL_CONFIG_PATH=${TB_EVAL_CONFIG_PATH:-"${REPO_ROOT}/examples/eval/scripts/eval_tb_example.yaml"}

CKPT_ARGS=(
   --hf-checkpoint ${MODEL_DIR}/OpenThinker-Agent-v1 # huggingface-cli download open-thoughts/OpenThinker-Agent-v1
   --ref-load ${MODEL_DIR}/OpenThinker-Agent-v1_torch_dist
   # --load ${MODEL_DIR}/OpenThinker-Agent-v1_slime/
   --save ${MODEL_DIR}/OpenThinker-Agent-v1_slime/
   --save-interval 20
)

ROLLOUT_ARGS=(
   --prompt-data /root/dapo-math-17k/dapo-math-17k.jsonl
   --input-key prompt
   --label-key label
   --apply-chat-template
   --rollout-shuffle
   --rm-type deepscaler
   --num-rollout 3000
   --rollout-batch-size 32
   --n-samples-per-prompt 8
   --rollout-max-response-len 8192
   --rollout-temperature 0.8
   --global-batch-size 256
   --balance-data
)

EVAL_ARGS=(
   --eval-interval 5
   --eval-config "${EVAL_CONFIG_PATH}"
   --eval-function-path examples.eval.eval_delegate_rollout.generate_rollout
)

PERF_ARGS=(
   --tensor-model-parallel-size 1
   --pipeline-model-parallel-size 1
   --context-parallel-size 1
   --expert-model-parallel-size 1
   --expert-tensor-parallel-size 1

   --recompute-granularity full
   --recompute-method uniform
   --recompute-num-layers 1

   --use-dynamic-batch-size
   --max-tokens-per-gpu 9216
)

GRPO_ARGS=(
   --advantage-estimator grpo
   --use-kl-loss
   --kl-loss-coef 0.00
   --kl-loss-type low_var_kl
   --entropy-coef 0.00
   --eps-clip 0.2
   --eps-clip-high 0.28
)

OPTIMIZER_ARGS=(
   --optimizer adam
   --lr 1e-6
   --lr-decay-style constant
   --weight-decay 0.1
   --adam-beta1 0.9
   --adam-beta2 0.98
)

WANDB_ARGS=(
   --use-wandb
   --wandb-project slime-eval
   --wandb-group qwen3-8b-eval
   --wandb-key ${WANDB_KEY} # export WANDB_KEY="your_key"
)

SGLANG_ARGS=(
   --rollout-num-gpus-per-engine 1
   --sglang-mem-fraction-static 0.7
   --sglang-router-port 30005
)

MISC_ARGS=(
   --attention-dropout 0.0
   --hidden-dropout 0.0
   --accumulate-allreduce-grads-in-fp32
   --attention-softmax-in-fp32
   --attention-backend flash
)

export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
export CUDA_VISIBLE_DEVICES=0,1

ray start --head --node-ip-address ${MASTER_ADDR} --port 6380 --num-gpus 2 \
  --disable-usage-stats \
  --dashboard-host=0.0.0.0 \
  --dashboard-port=8266 \
  --dashboard-agent-listen-port 52366 \
  --dashboard-agent-grpc-port 52367 \
  --runtime-env-agent-port 52368

RUNTIME_ENV_JSON="{
  \"env_vars\": {
    \"PYTHONPATH\": \"/root/Megatron-LM/\",
    \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\"
  }
}"

ray job submit --address="http://${MASTER_ADDR}:8266" \
   --working-dir "${REPO_ROOT}" \
   --runtime-env-json="${RUNTIME_ENV_JSON}" \
   -- python3 train.py \
   --actor-num-nodes 1 \
   --actor-num-gpus-per-node 2 \
   --colocate \
   ${MODEL_ARGS[@]} \
   ${CKPT_ARGS[@]} \
   ${ROLLOUT_ARGS[@]} \
   ${OPTIMIZER_ARGS[@]} \
   ${GRPO_ARGS[@]} \
   ${WANDB_ARGS[@]} \
   ${PERF_ARGS[@]} \
   ${EVAL_ARGS[@]} \
   ${SGLANG_ARGS[@]} \
   ${MISC_ARGS[@]}
```
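The launcher reads a few environment variables before falling back to its defaults, so a run can be customized without editing the file. A short usage sketch; the values below are placeholders, and `WANDB_KEY` is required by the `--wandb-key` flag above:

```bash
# Optional overrides consumed by run-eval-tb-qwen.sh (defaults are shown in the script above)
export MODEL_DIR=/root/.cache                              # where the OpenThinker-Agent-v1* checkpoints live
export TB_EVAL_CONFIG_PATH=/path/to/eval_tb_example.yaml   # custom eval/delegate YAML (placeholder path)
export WANDB_KEY="your_key"                                # consumed by --wandb-key
export MASTER_ADDR=127.0.0.1                               # Ray head address

# Run from the Slime repo root.
bash examples/eval/scripts/run-eval-tb-qwen.sh 2>&1 | tee run.log
```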
New file (@@ -0,0 +1,122 @@):

# Terminal Bench Evaluation Integration

This directory wraps Terminal Bench (TB) as an eval delegate for Slime. The evaluation itself runs on the host machine through the `tb` CLI; Slime reads back and aggregates the metrics, including `accuracy`, `n_resolved`, `n_unresolved`, `pass_at_k/*`, and token statistics such as `total_input_tokens_mean/median` and `total_output_tokens_mean/median`.

## Architecture

* **Inside Slime**: runs the training/evaluation main loop and calls the TB delegate client.
* **Host machine**: runs the TB delegate server (`tb_server.py`), which executes `tb run ...`.
* **Server logic**: reads the latest TB JSON results and returns the metrics to Slime (a manual equivalent is sketched below).
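The "read the latest TB JSON results" step can be reproduced by hand on the host once a run has finished. A rough sketch, assuming the server was started with `--output-root tb_eval_output` (as in step 5 below) and that `tb` writes a `results.json` per run; check the actual layout under the output root, and note that `jq` is only used here for pretty-printing:

```bash
# Locate the newest results.json under the server's output root and print the headline metrics.
latest=$(find tb_eval_output -name 'results.json' -printf '%T@ %p\n' 2>/dev/null | sort -rn | head -n 1 | cut -d' ' -f2-)
jq '{accuracy, n_resolved, n_unresolved, pass_at_k}' "$latest"
```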
## 1) Get the code (host machine)

```bash
mkdir slime-tb
cd slime-tb
git clone https://github.com/THUDM/slime.git
git clone https://github.com/laude-institute/terminal-bench
```

## 2) Start the Slime container

```bash
docker run \
  -itd \
  --gpus all \
  --shm-size 32g \
  --network host \
  --ipc=host \
  --privileged \
  --ulimit memlock=-1 \
  --ulimit stack=67108864 \
  --ulimit nofile=65536:65536 \
  -v /mnt/data/.cache:/root/.cache \
  -v $(pwd):/shared/slime-tb \
  --name <slime_container_name> \
  slimerl/slime:latest \
  /bin/bash
```

## 3) Enter the Slime container

```bash
docker exec -it <slime_container_name> /bin/bash
```

## 4) Set up the Terminal Bench environment (host machine)

On the host machine that will run `tb_server.py`:

```bash
# Run in a host terminal (not inside Docker)
uv venv --python 3.13 .venv
source .venv/bin/activate
uv pip install terminal-bench/.
uv pip install -r slime/examples/eval/terminal_bench/requirements.txt
```

*If the repositories are not located at `./slime` and `./terminal-bench`, adjust the paths to match your layout.*
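To confirm the host environment is ready before starting the server, a quick sanity check (assuming the virtualenv created above is still active):

```bash
# The tb CLI must resolve from inside the virtualenv created in step 4.
source .venv/bin/activate
which tb
tb --help | head -n 5
```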
## 5) Start the Terminal Bench server

Start it on the host machine (i.e., in the environment where the `tb` command is available):

```bash
python slime/examples/eval/terminal_bench/tb_server.py \
    --host 0.0.0.0 --port 9051 \
    --output-root tb_eval_output
```

**What this script does:**

* Sets `OPENAI_API_KEY=EMPTY` by default.
* Executes `tb run -a terminus-2 -m openai/<model> ... --n-concurrent 8` (a rough manual equivalent is sketched below).
* Waits for the run to finish, then returns `accuracy`, `pass_at_k`, and token-usage statistics.
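For reference, a hedged manual approximation of the command the server issues. Only the `tb run -a terminus-2 -m openai/<model> ... --n-concurrent 8` portion is quoted from this PR; the `OPENAI_BASE_URL` variable and the `--dataset-path` / `--n-attempts` flag names are assumptions, so check `tb run --help` and `tb_server.py` for the exact invocation:

```bash
# Sketch only: flag names other than -a / -m / --n-concurrent are assumptions.
export OPENAI_API_KEY=EMPTY
export OPENAI_BASE_URL=http://127.0.0.1:30005/v1   # assumed way to point tb's OpenAI client at the sglang router

tb run \
  -a terminus-2 \
  -m openai/qwen3-8b \
  --dataset-path /mnt/data/xinyu/program/slime-tb/terminal-bench/tasks \
  --n-attempts 1 \
  --n-concurrent 8
```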
## 6) Run the evaluation script (example)

If you use the provided Qwen evaluation launcher (`run-eval-tb-qwen.sh`), follow these steps:

**Update the path**: change `dataset_path` in `eval_tb_example.yaml` to the **absolute path** of `terminal-bench/tasks` on the host machine (not the path inside Docker).

**Download the model**: download the HuggingFace weights inside the Slime container:
```bash
huggingface-cli download open-thoughts/OpenThinker-Agent-v1 \
  --local-dir /root/.cache/OpenThinker-Agent-v1
```

**Convert the format**: convert the HuggingFace weights to Slime's torch distributed format. From the Slime repo root:
```bash
cd /shared/slime-tb/slime
source scripts/models/qwen3-8B.sh

export PYTHONPATH=/root/Megatron-LM:/shared/slime-tb/slime

python tools/convert_hf_to_torch_dist.py \
    ${MODEL_ARGS[@]} \
    --hf-checkpoint /root/.cache/OpenThinker-Agent-v1 \
    --save /root/.cache/OpenThinker-Agent-v1_torch_dist
```

**Start the evaluation**: inside the Slime container, run:
```bash
bash slime/examples/eval/scripts/run-eval-tb-qwen.sh 2>&1 | tee run.log
```

*For a quick test, restrict the run in `eval_tb_example.yaml` by listing specific tasks under `task_ids` or by limiting the number of tasks with `n_tasks`.*
## 7) FAQ

When Slime runs in a Docker container with `--network host`, Ray may hit port conflicts because it shares the network namespace with the host.

This can cause Ray startup failures or Redis/session-related errors. It can usually be fixed by explicitly choosing unused ports when starting the Ray head, for example a non-default `--port` and `--dashboard-port`.

Sometimes `ray job submit` also fails with a message that no available agent can accept the job. This usually means the dashboard agent or runtime-env agent ports are in conflict as well. In that case, set those ports explicitly when starting Ray (e.g. `--dashboard-agent-listen-port`, `--dashboard-agent-grpc-port`, `--runtime-env-agent-port`), as shown below.
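For example, the launcher script above starts the Ray head with all of these ports pinned to non-default values; the specific numbers are arbitrary, any free ports work:

```bash
# Restart the Ray head with explicit, non-default ports (taken from run-eval-tb-qwen.sh).
ray stop --force
ray start --head --node-ip-address ${MASTER_ADDR:-127.0.0.1} --port 6380 --num-gpus 2 \
  --disable-usage-stats \
  --dashboard-host=0.0.0.0 \
  --dashboard-port=8266 \
  --dashboard-agent-listen-port 52366 \
  --dashboard-agent-grpc-port 52367 \
  --runtime-env-agent-port 52368
```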
If the TB server cannot reach Slime through the sglang router (`InternalServerError`), check which address the router port (e.g. 30005) is actually listening on and update `api_base` in `eval_tb_example.yaml` accordingly:

```bash
ss -lntp | grep 30005
```

Once the TB server starts accepting requests, you may see messages such as `Parser warnings`, `Context length exceeded`, `Command 1 should end with newline`, or `Harness execution failed` in its output. These are Terminal Bench warnings and can be ignored as long as the run proceeds normally.
Review discussion:

- Reviewer: Add comment here. About port conflict
- Author: Added this in the quick-start README.