From 3338a57f5c7ce61fe82ba5dd2a138603caef65e1 Mon Sep 17 00:00:00 2001 From: Sebastian Date: Sun, 25 Jan 2026 12:35:32 +0100 Subject: [PATCH 1/8] Refactor model call path and align sub-LLM tests --- tests/test_rlm_env.py | 1 + verifiers/envs/environment.py | 52 +++++++++++++++++++------- verifiers/envs/experimental/rlm_env.py | 41 +++++--------------- 3 files changed, 50 insertions(+), 44 deletions(-) diff --git a/tests/test_rlm_env.py b/tests/test_rlm_env.py index 9c4a85274..82f72362b 100644 --- a/tests/test_rlm_env.py +++ b/tests/test_rlm_env.py @@ -1036,6 +1036,7 @@ class TestSubLLMRequestPaths: async def test_interleaved_uses_tokens_endpoint(self, rlm_env): mock_client = MagicMock() mock_response = MagicMock() + mock_response.choices = [MagicMock()] mock_client.post = AsyncMock(return_value=mock_response) mock_client.chat.completions.create = AsyncMock() diff --git a/verifiers/envs/environment.py b/verifiers/envs/environment.py index e8842dba4..8e324f5ab 100644 --- a/verifiers/envs/environment.py +++ b/verifiers/envs/environment.py @@ -456,6 +456,40 @@ def normalize_sampling_args(sampling_args: SamplingArgs) -> SamplingArgs: sampling_args.pop("max_completion_tokens") return {k: v for k, v in sampling_args.items() if v is not None} + client, model, oai_tools, sampling_args, message_type = resolve_optional_args( + client, model, oai_tools, sampling_args, message_type + ) + sampling_args = normalize_sampling_args(sampling_args) + if self.interleaved_rollouts: + sampling_args = prepare_sampling_args_for_token_prompts(sampling_args) + + prompt_ids: list[int] | None = None + if self.interleaved_rollouts and len(state["trajectory"]) > 0: + prompt_ids = await get_prompt_ids(state, prompt, client) + + return await self._call_model_api( + client=client, + model=model, + prompt=prompt, + oai_tools=oai_tools, + sampling_args=sampling_args, + message_type=message_type, + prompt_ids=prompt_ids, + ) + + async def _call_model_api( + self, + *, + client: AsyncOpenAI, + model: str, + prompt: Messages, + oai_tools: list[ChatCompletionToolParam] | None, + sampling_args: SamplingArgs, + message_type: MessageType, + prompt_ids: list[int] | None = None, + ) -> ModelResponse: + """Shared low-level model call used by main and sub-LLM paths.""" + def handle_overlong_prompt(func): """Decorator to handle overlong prompt errors from the model API.""" @@ -487,7 +521,7 @@ async def wrapper(*args, **kwargs): return wrapper @handle_overlong_prompt - async def get_model_response_with_messages( + async def call_with_messages( client: AsyncOpenAI, model: str, prompt: Messages, @@ -547,7 +581,7 @@ async def get_model_response_with_messages( return response @handle_overlong_prompt - async def get_model_response_with_tokens( + async def call_with_tokens( client: AsyncOpenAI, model: str, prompt: Messages, @@ -581,16 +615,8 @@ async def get_model_response_with_tokens( cast_to=ChatCompletion, ) - client, model, oai_tools, sampling_args, message_type = resolve_optional_args( - client, model, oai_tools, sampling_args, message_type - ) - sampling_args = normalize_sampling_args(sampling_args) - if self.interleaved_rollouts: - sampling_args = prepare_sampling_args_for_token_prompts(sampling_args) - - if self.interleaved_rollouts and len(state["trajectory"]) > 0: - prompt_ids = await get_prompt_ids(state, prompt, client) - response = await get_model_response_with_tokens( + if prompt_ids is not None: + response = await call_with_tokens( client=client, model=model, prompt=prompt, @@ -600,7 +626,7 @@ async def get_model_response_with_tokens( message_type=message_type, ) else: - response = await get_model_response_with_messages( + response = await call_with_messages( client=client, model=model, prompt=prompt, diff --git a/verifiers/envs/experimental/rlm_env.py b/verifiers/envs/experimental/rlm_env.py index ec516865d..07bd1a8f6 100644 --- a/verifiers/envs/experimental/rlm_env.py +++ b/verifiers/envs/experimental/rlm_env.py @@ -51,7 +51,7 @@ from typing import TypedDict from aiohttp import web -from openai.types.chat import ChatCompletion, ChatCompletionFunctionToolParam +from openai.types.chat import ChatCompletionFunctionToolParam from prime_tunnel import Tunnel import verifiers as vf from verifiers.types import ( @@ -3084,46 +3084,25 @@ async def _call_sub_llm_api( sampling_args = self._prepare_sub_llm_sampling_args( state, interleaved=self.interleaved_rollouts ) - payload: dict[str, Any] = { - "model": model, - "messages": normalized_messages, - "tools": tools, - } try: + prompt_ids: list[int] | None = None if self.interleaved_rollouts: - extra_body = sampling_args.pop("extra_body", {}) prompt_ids = await tokenize_vllm( client=client, messages=normalized_messages, tools=tools, model=model, ) - payload = { - "model": model, - "messages": normalized_messages, - "tools": tools, - "tokens": prompt_ids, - **sampling_args, - **extra_body, - } - return await asyncio.wait_for( - client.post( - "/chat/completions/tokens", - body=payload, - cast_to=ChatCompletion, - ), - timeout=self.sub_llm_api_timeout, - ) - payload = { - "model": model, - "messages": normalized_messages, - "tools": tools, - **sampling_args, - } return await asyncio.wait_for( - client.chat.completions.create( - **payload, + self._call_model_api( + client=client, + model=model, + prompt=normalized_messages, + oai_tools=tools, + sampling_args=sampling_args, + message_type="chat", + prompt_ids=prompt_ids, ), timeout=self.sub_llm_api_timeout, ) From 8fc1aacd92eeaa7e0612d9e31d1545772122cbff Mon Sep 17 00:00:00 2001 From: Sebastian Date: Sun, 25 Jan 2026 13:02:40 +0100 Subject: [PATCH 2/8] Unify sub-LLM interleaving and expand tests --- tests/test_rlm_env.py | 97 ++++++++++++++++++++++++++ verifiers/envs/experimental/rlm_env.py | 80 ++++++++++++++++++--- 2 files changed, 167 insertions(+), 10 deletions(-) diff --git a/tests/test_rlm_env.py b/tests/test_rlm_env.py index 82f72362b..38897afcf 100644 --- a/tests/test_rlm_env.py +++ b/tests/test_rlm_env.py @@ -1067,6 +1067,103 @@ async def test_interleaved_uses_tokens_endpoint(self, rlm_env): assert "max_tokens" not in body mock_client.chat.completions.create.assert_not_called() + @pytest.mark.asyncio + async def test_sub_llm_normalizes_messages(self, rlm_env): + mock_client = MagicMock() + mock_message = MagicMock() + mock_message.tool_calls = None + mock_message.content = "ok" + mock_response = MagicMock() + mock_response.choices = [MagicMock(message=mock_message)] + mock_client.chat.completions.create = AsyncMock(return_value=mock_response) + + rlm_env.interleaved_rollouts = False + messages = [ + {"role": "user", "content": {"type": "text", "text": "hello"}}, + {"role": "user", "content": {"role": "user", "content": "inner"}}, + ] + state = {} + + await rlm_env._call_sub_llm_api(state, mock_client, "gpt-4", messages) + + args, kwargs = mock_client.chat.completions.create.call_args + assert args == () + sent_messages = kwargs["messages"] + assert sent_messages[0]["content"] == [{"type": "text", "text": "hello"}] + assert sent_messages[1]["content"] == "inner" + + @pytest.mark.asyncio + async def test_interleaved_sub_llm_uses_incremental_prompt_ids( + self, rlm_env_with_sub_tools + ): + mock_client = MagicMock() + mock_client.chat.completions.create = AsyncMock() + + mock_tool_call = MagicMock() + mock_tool_call.id = "call_1" + mock_tool_call.function.name = "sample_tool" + mock_tool_call.function.arguments = '{"x": 2, "y": 3}' + + mock_message1 = MagicMock() + mock_message1.tool_calls = [mock_tool_call] + mock_message1.content = None + + mock_message2 = MagicMock() + mock_message2.tool_calls = None + mock_message2.content = "done" + + response1 = MagicMock() + response1.choices = [MagicMock(message=mock_message1)] + response2 = MagicMock() + response2.choices = [MagicMock(message=mock_message2)] + + mock_client.post = AsyncMock(side_effect=[response1, response2]) + + rlm_env_with_sub_tools.interleaved_rollouts = True + messages = [{"role": "user", "content": "Add 2 and 3"}] + state = {"sampling_args": {"max_tokens": 7}} + + token_payload = { + "prompt_ids": [1], + "prompt_mask": [0], + "completion_ids": [2], + "completion_mask": [1], + "completion_logprobs": [0.0], + "overlong_prompt": False, + "is_truncated": False, + } + + with ( + patch( + "verifiers.envs.experimental.rlm_env.tokenize_vllm", + new=AsyncMock(return_value=[1, 2, 3]), + ) as mock_tokenize, + patch( + "verifiers.envs.experimental.rlm_env.get_prompt_ids", + new=AsyncMock(return_value=[4, 5, 6]), + ) as mock_get_prompt_ids, + patch( + "verifiers.envs.experimental.rlm_env.parse_response_tokens", + new=AsyncMock(return_value=token_payload), + ), + patch( + "verifiers.envs.experimental.rlm_env.parse_response_messages", + new=AsyncMock(return_value=[{"role": "assistant", "content": "ok"}]), + ), + patch( + "verifiers.envs.experimental.rlm_env.parse_is_truncated", + new=AsyncMock(return_value=False), + ), + ): + await rlm_env_with_sub_tools._run_sub_llm( + state, mock_client, "gpt-4", messages + ) + + assert mock_client.post.await_count == 2 + mock_tokenize.assert_awaited_once() + mock_get_prompt_ids.assert_awaited_once() + mock_client.chat.completions.create.assert_not_called() + # ============================================================================= # 8. Root Tool Serialization (pickle) diff --git a/verifiers/envs/experimental/rlm_env.py b/verifiers/envs/experimental/rlm_env.py index 07bd1a8f6..347747ca5 100644 --- a/verifiers/envs/experimental/rlm_env.py +++ b/verifiers/envs/experimental/rlm_env.py @@ -73,6 +73,7 @@ from verifiers.utils.tool_utils import convert_func_to_oai_tool from verifiers.utils.token_utils import ( prepare_sampling_args_for_token_prompts, + get_prompt_ids, tokenize_vllm, ) from verifiers.utils.sandbox_exec_utils import SandboxExecutorMixin @@ -3078,6 +3079,8 @@ async def _call_sub_llm_api( model: str, messages: ChatMessages, tools: list | None = None, + *, + sub_state: State | None = None, ) -> Any | None: """Make a single sub-LLM API call matching main-model request mode.""" normalized_messages = self._normalize_message_content(messages) @@ -3088,12 +3091,17 @@ async def _call_sub_llm_api( try: prompt_ids: list[int] | None = None if self.interleaved_rollouts: - prompt_ids = await tokenize_vllm( - client=client, - messages=normalized_messages, - tools=tools, - model=model, - ) + if sub_state is not None and sub_state.get("trajectory"): + prompt_ids = await get_prompt_ids( + sub_state, normalized_messages, client + ) + else: + prompt_ids = await tokenize_vllm( + client=client, + messages=normalized_messages, + tools=tools, + model=model, + ) return await asyncio.wait_for( self._call_model_api( client=client, @@ -3137,6 +3145,19 @@ async def _run_sub_llm( self, state: State, client: Any, model: str, messages: ChatMessages ) -> SubLLMResult: """Run a sub-LLM call, with optional tool-calling loop.""" + sub_state: State | None = None + if self.interleaved_rollouts: + # Track a minimal sub-LLM trajectory so get_prompt_ids() can compute + # incremental prompt_ids (same interleaving strategy as the main LLM). + # This sub_state is only for tokenization continuity and is not added + # to the main trajectory or used for scoring. + sub_state = State() + sub_state["trajectory"] = [] + sub_state["client"] = client + sub_state["model"] = model + sub_state["oai_tools"] = self.sub_oai_tools or [] + sub_state["sampling_args"] = state.get("sampling_args") + # Fast path: no tools configured - single LLM call if not self.sub_tools: response = await self._call_sub_llm_api(state, client, model, messages) @@ -3171,10 +3192,16 @@ async def _run_sub_llm( for _ in range(self.sub_tool_max_turns): num_turns += 1 - prompt_snapshot = [cast(ChatMessage, dict(m)) for m in current_messages] + normalized_messages = self._normalize_message_content(current_messages) + prompt_snapshot = [cast(ChatMessage, dict(m)) for m in normalized_messages] response = await self._call_sub_llm_api( - state, client, model, current_messages, tools + state, + client, + model, + normalized_messages, + tools, + sub_state=sub_state, ) if response is None: return self._make_timeout_result( @@ -3185,6 +3212,32 @@ async def _run_sub_llm( num_turns, ) + if sub_state is not None: + tokens = await parse_response_tokens(response, "chat", self.max_seq_len) + if tokens is None: + sub_state = None + else: + completion_messages = await parse_response_messages( + response, "chat" + ) + response_is_truncated = await parse_is_truncated(response, "chat") + is_truncated = response_is_truncated or bool( + tokens.get("is_truncated") + ) + sub_state["trajectory"].append( + TrajectoryStep( + prompt=cast(Messages, prompt_snapshot), + completion=completion_messages, + response=response, + tokens=tokens, + reward=None, + advantage=None, + is_truncated=is_truncated, + trajectory_id="sub_llm_local", + extras={"is_sub_llm_call": True, "sub_state_only": True}, + ) + ) + prompt_tokens, completion_tokens = _extract_tokens_from_response(response) total_prompt_tokens += prompt_tokens total_completion_tokens += completion_tokens @@ -3239,8 +3292,15 @@ async def _run_sub_llm( ) ) - prompt_snapshot = [cast(ChatMessage, dict(m)) for m in current_messages] - response = await self._call_sub_llm_api(state, client, model, current_messages) + normalized_messages = self._normalize_message_content(current_messages) + prompt_snapshot = [cast(ChatMessage, dict(m)) for m in normalized_messages] + response = await self._call_sub_llm_api( + state, + client, + model, + normalized_messages, + sub_state=sub_state, + ) if response is None: return self._make_timeout_result( turns, From 21ae9490197168f7b04be3955869a3e25eb48876 Mon Sep 17 00:00:00 2001 From: Sebastian Date: Sun, 25 Jan 2026 13:43:29 +0100 Subject: [PATCH 3/8] Add task and example_id to rlm_secrets dataset --- environments/rlm_secrets/rlm_secrets.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/environments/rlm_secrets/rlm_secrets.py b/environments/rlm_secrets/rlm_secrets.py index daa038f5c..6f680501a 100644 --- a/environments/rlm_secrets/rlm_secrets.py +++ b/environments/rlm_secrets/rlm_secrets.py @@ -318,6 +318,7 @@ def build_dataset( Dataset with prompt, answer, and info columns """ rows = [] + task_name = "rlm-secrets" for i in range(num_examples): puzzle = generate_puzzle(num_files=num_files) @@ -359,9 +360,11 @@ def build_dataset( rows.append( { + "example_id": i, "prompt": prompt, "answer": str(puzzle["correct_position"]), "info": {"puzzle": puzzle}, + "task": task_name, } ) From 0671c217716a5965b7220b4ed44893f191c96420 Mon Sep 17 00:00:00 2001 From: Sebastian Date: Sun, 25 Jan 2026 14:46:25 +0100 Subject: [PATCH 4/8] Remove eval dataset params from rlm_secrets --- environments/rlm_secrets/README.md | 4 +++- environments/rlm_secrets/rlm_secrets.py | 8 -------- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/environments/rlm_secrets/README.md b/environments/rlm_secrets/README.md index 5762587b7..5048cc318 100644 --- a/environments/rlm_secrets/README.md +++ b/environments/rlm_secrets/README.md @@ -72,7 +72,6 @@ Both reward functions have equal weight (0.5 each): | Parameter | Default | Description | |-----------|---------|-------------| | `num_train_examples` | 100 | Training puzzles | -| `num_eval_examples` | 20 | Evaluation puzzles | | `num_files` | 4 | Files per puzzle | | `max_turns` | 50 | Max REPL iterations | | `sub_tool_max_turns` | 3 | Max tool turns for sub-LLMs | @@ -80,6 +79,9 @@ Both reward functions have equal weight (0.5 each): | `code_execution_timeout` | 120 | Bash execution timeout (seconds) | | `**kwargs` | - | Passed on `RLMEnv.__init__` | +Note: The eval dataset is not built separately. For evaluation, re-instantiate the +environment with a different `seed` to generate a new synthetic split. + ## Why This Environment? This environment is specifically designed to test RLM capabilities: diff --git a/environments/rlm_secrets/rlm_secrets.py b/environments/rlm_secrets/rlm_secrets.py index 6f680501a..7d4d0a55f 100644 --- a/environments/rlm_secrets/rlm_secrets.py +++ b/environments/rlm_secrets/rlm_secrets.py @@ -446,7 +446,6 @@ async def correct_filesystem_state(state: State) -> float: def load_environment( num_train_examples: int = 100, - num_eval_examples: int = 20, num_files: int = 4, max_turns: int = 50, seed: int | None = None, @@ -461,7 +460,6 @@ def load_environment( Args: num_train_examples: Number of training puzzle instances - num_eval_examples: Number of evaluation puzzle instances num_files: Number of files per puzzle (default: 4) max_turns: Maximum REPL iterations (default: 50) seed: Random seed for dataset generation @@ -480,11 +478,6 @@ def load_environment( num_files=num_files, ) - eval_dataset = build_dataset( - num_examples=num_eval_examples, - num_files=num_files, - ) - rubric = vf.Rubric( funcs=[correct_answer, correct_filesystem_state], weights=[0.5, 0.5], @@ -492,7 +485,6 @@ def load_environment( return RLMSecretsEnv( dataset=train_dataset, - eval_dataset=eval_dataset, num_files=num_files, repl_language=repl_language, rubric=rubric, From 5bd429a6f832c77b8deb1085986add8a5cb3bda1 Mon Sep 17 00:00:00 2001 From: Sebastian Date: Sun, 25 Jan 2026 17:14:31 +0100 Subject: [PATCH 5/8] Fix interleaved prompt IDs for main RLM --- verifiers/envs/experimental/rlm_env.py | 118 +++++++++++++++++++++++++ 1 file changed, 118 insertions(+) diff --git a/verifiers/envs/experimental/rlm_env.py b/verifiers/envs/experimental/rlm_env.py index 347747ca5..d67f4ab8d 100644 --- a/verifiers/envs/experimental/rlm_env.py +++ b/verifiers/envs/experimental/rlm_env.py @@ -51,14 +51,18 @@ from typing import TypedDict from aiohttp import web +from openai import AsyncOpenAI from openai.types.chat import ChatCompletionFunctionToolParam from prime_tunnel import Tunnel import verifiers as vf from verifiers.types import ( + ChatCompletionToolParam, ChatMessage, ChatMessages, Messages, + MessageType, ModelResponse, + SamplingArgs, State, TrajectoryStep, ) @@ -4201,6 +4205,120 @@ async def add_trajectory_step(self, state: State, trajectory_step: TrajectorySte # MultiTurnEnv Interface # ========================================================================= + async def get_model_response( + self, + state: State, + prompt: Messages, + client: AsyncOpenAI | None = None, + model: str | None = None, + oai_tools: list[ChatCompletionToolParam] | None = None, + sampling_args: SamplingArgs | None = None, + message_type: MessageType | None = None, + ) -> ModelResponse: + """ + Override to keep interleaved prompt_id computation scoped to main-LLM steps. + + RLMEnv injects sub-LLM turns into state["trajectory"] for training. If we + feed that mixed trajectory into get_prompt_ids, the "previous turn" + becomes a sub-LLM step and the main prompt looks unrelated. That produces + an empty env_response and breaks /tokenize. We avoid that by filtering + trajectory steps to the main trajectory_id only. + """ + + def resolve_optional_args( + client: AsyncOpenAI | None, + model: str | None, + oai_tools: list[ChatCompletionToolParam] | None, + sampling_args: SamplingArgs | None, + message_type: MessageType | None, + ) -> tuple[ + AsyncOpenAI, + str, + list[ChatCompletionToolParam] | None, + SamplingArgs, + MessageType, + ]: + client = client or state["client"] + model = model or state["model"] + assert client is not None and model is not None + oai_tools = oai_tools or state["oai_tools"] + sampling_args = cast( + SamplingArgs, sampling_args or state["sampling_args"] or {} + ) + message_type = message_type or self.message_type + return client, model, oai_tools, sampling_args, message_type + + def normalize_sampling_args(sampling_args: SamplingArgs) -> SamplingArgs: + if "max_tokens" in sampling_args: + if sampling_args["max_tokens"] is None: + sampling_args.pop("max_tokens") + elif message_type == "chat": + sampling_args["max_completion_tokens"] = sampling_args.pop( + "max_tokens" + ) + if ( + "max_completion_tokens" in sampling_args + and sampling_args["max_completion_tokens"] is None + ): + sampling_args.pop("max_completion_tokens") + return {k: v for k, v in sampling_args.items() if v is not None} + + client, model, oai_tools, sampling_args, message_type = resolve_optional_args( + client, model, oai_tools, sampling_args, message_type + ) + sampling_args = normalize_sampling_args(sampling_args) + if self.interleaved_rollouts: + sampling_args = prepare_sampling_args_for_token_prompts(sampling_args) + + prompt_ids: list[int] | None = None + if ( + self.interleaved_rollouts + and message_type == "chat" + and len(state["trajectory"]) > 0 + ): + main_trajectory_id = state.get("trajectory_id") + main_steps = [ + step + for step in state["trajectory"] + if step.get("trajectory_id") == main_trajectory_id + ] + if main_steps: + # Do not mutate the original state; build a minimal view for + # prompt_id computation that excludes sub-LLM turns. + prompt_state = State() + prompt_state["trajectory"] = main_steps + prompt_state["client"] = client + prompt_state["model"] = model + prompt_state["oai_tools"] = oai_tools or [] + # Reuse cached suffix ids if available to avoid extra tokenization. + if "_cached_suffix_ids" in state: + prompt_state["_cached_suffix_ids"] = state["_cached_suffix_ids"] + # Keep sub-LLM debug context for empty-env-response logging. + if "_last_sub_llm_root_call" in state: + prompt_state["_last_sub_llm_root_call"] = state[ + "_last_sub_llm_root_call" + ] + prompt_ids = await get_prompt_ids(prompt_state, prompt, client) + else: + # If no main steps are present (should be rare), fall back to + # full-tokenize so we still use the tokens endpoint. + prompt_ids = await tokenize_vllm( + client=client, + messages=prompt, + tools=oai_tools, + model=model, + ) + + return await self._call_model_api( + client=client, + model=model, + prompt=prompt, + oai_tools=oai_tools, + sampling_args=sampling_args, + message_type=message_type, + prompt_ids=prompt_ids, + ) + async def get_prompt_messages(self, state: State) -> Messages: """Build prompt messages, adding system prompt with tool docs on first turn.""" if len(state["trajectory"]) == 0: From a1052f84bf876bd7c062908ea060623dcafb343b Mon Sep 17 00:00:00 2001 From: Sebastian Date: Sun, 25 Jan 2026 18:40:35 +0100 Subject: [PATCH 6/8] Drain bash REPL output between commands --- verifiers/envs/experimental/rlm_env.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/verifiers/envs/experimental/rlm_env.py b/verifiers/envs/experimental/rlm_env.py index d67f4ab8d..f2fee9c75 100644 --- a/verifiers/envs/experimental/rlm_env.py +++ b/verifiers/envs/experimental/rlm_env.py @@ -1140,11 +1140,26 @@ def _read_until_marker(marker: bytes) -> bytes: break return buffer + def _drain_fd(): + # Drain any immediately-available output (e.g., prompt text) so it + # doesn't leak into the next command's captured output. + while True: + ready, _, _ = select.select([master_fd], [], [], 0) + if master_fd not in ready: + break + try: + chunk = os.read(master_fd, 4096) + except Exception: + break + if not chunk: + break + def _parse_bool(value: str) -> bool: return value.strip().lower() in {"1", "true", "yes", "y", "on"} try: _read_until_marker(init_marker.encode("utf-8")) + _drain_fd() except Exception: pass @@ -1188,6 +1203,7 @@ def _parse_bool(value: str) -> bool: continue raw = _read_until_marker(env_marker.encode("utf-8")) + _drain_fd() text = raw.decode("utf-8", errors="replace") output = text From fcca9fdb6add4f22434333627f07d030fc29500c Mon Sep 17 00:00:00 2001 From: Sebastian Date: Sun, 25 Jan 2026 20:06:05 +0100 Subject: [PATCH 7/8] Revert "Drain bash REPL output between commands" This reverts commit a1052f84bf876bd7c062908ea060623dcafb343b. --- verifiers/envs/experimental/rlm_env.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/verifiers/envs/experimental/rlm_env.py b/verifiers/envs/experimental/rlm_env.py index f2fee9c75..d67f4ab8d 100644 --- a/verifiers/envs/experimental/rlm_env.py +++ b/verifiers/envs/experimental/rlm_env.py @@ -1140,26 +1140,11 @@ def _read_until_marker(marker: bytes) -> bytes: break return buffer - def _drain_fd(): - # Drain any immediately-available output (e.g., prompt text) so it - # doesn't leak into the next command's captured output. - while True: - ready, _, _ = select.select([master_fd], [], [], 0) - if master_fd not in ready: - break - try: - chunk = os.read(master_fd, 4096) - except Exception: - break - if not chunk: - break - def _parse_bool(value: str) -> bool: return value.strip().lower() in {"1", "true", "yes", "y", "on"} try: _read_until_marker(init_marker.encode("utf-8")) - _drain_fd() except Exception: pass @@ -1203,7 +1188,6 @@ def _parse_bool(value: str) -> bool: continue raw = _read_until_marker(env_marker.encode("utf-8")) - _drain_fd() text = raw.decode("utf-8", errors="replace") output = text From fd52bee524e02baf44bbb9aa544dce15f5a2e79b Mon Sep 17 00:00:00 2001 From: Sebastian Date: Sun, 25 Jan 2026 20:54:44 +0100 Subject: [PATCH 8/8] Persist cached suffix ids from prompt_state --- verifiers/envs/experimental/rlm_env.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/verifiers/envs/experimental/rlm_env.py b/verifiers/envs/experimental/rlm_env.py index d67f4ab8d..b066dd919 100644 --- a/verifiers/envs/experimental/rlm_env.py +++ b/verifiers/envs/experimental/rlm_env.py @@ -4299,6 +4299,10 @@ def normalize_sampling_args(sampling_args: SamplingArgs) -> SamplingArgs: "_last_sub_llm_root_call" ] prompt_ids = await get_prompt_ids(prompt_state, prompt, client) + # If suffix ids were computed on the temporary state, persist them + # on the main state to avoid re-tokenizing every turn. + if "_cached_suffix_ids" in prompt_state: + state["_cached_suffix_ids"] = prompt_state["_cached_suffix_ids"] else: # If no main steps are present (should be rare), fall back to # full-tokenize so we still use the tokens endpoint.