4 changes: 3 additions & 1 deletion environments/rlm_secrets/README.md
@@ -72,14 +72,16 @@ Both reward functions have equal weight (0.5 each):
| Parameter | Default | Description |
|-----------|---------|-------------|
| `num_train_examples` | 100 | Training puzzles |
| `num_eval_examples` | 20 | Evaluation puzzles |
| `num_files` | 4 | Files per puzzle |
| `max_turns` | 50 | Max REPL iterations |
| `sub_tool_max_turns` | 3 | Max tool turns for sub-LLMs |
| `max_sub_llm_parallelism` | 5 | Concurrent sub-LLM calls |
| `code_execution_timeout` | 120 | Bash execution timeout (seconds) |
| `**kwargs` | - | Passed to `RLMEnv.__init__` |

Note: The eval dataset is not built separately. For evaluation, re-instantiate the
environment with a different `seed` to generate a new synthetic split.
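
A minimal usage sketch of that pattern (the import path and the concrete seed and size values below are illustrative, not part of this repo's documented API):

```python
# Sketch only: assumes `load_environment` can be imported from the rlm_secrets
# module shown in this PR; seeds and example counts are arbitrary.
from rlm_secrets import load_environment

# Training split built from one synthetic seed...
train_env = load_environment(num_train_examples=100, num_files=4, seed=0)

# ...and a second instantiation with a different seed serves as the eval split.
eval_env = load_environment(num_train_examples=20, num_files=4, seed=1)
```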

## Why This Environment?

This environment is specifically designed to test RLM capabilities:
11 changes: 3 additions & 8 deletions environments/rlm_secrets/rlm_secrets.py
@@ -318,6 +318,7 @@ def build_dataset(
Dataset with prompt, answer, and info columns
"""
rows = []
task_name = "rlm-secrets"

for i in range(num_examples):
puzzle = generate_puzzle(num_files=num_files)
@@ -359,9 +360,11 @@

rows.append(
{
"example_id": i,
"prompt": prompt,
"answer": str(puzzle["correct_position"]),
"info": {"puzzle": puzzle},
"task": task_name,
}
)

@@ -443,7 +446,6 @@ async def correct_filesystem_state(state: State) -> float:

def load_environment(
num_train_examples: int = 100,
num_eval_examples: int = 20,
num_files: int = 4,
max_turns: int = 50,
seed: int | None = None,
@@ -458,7 +460,6 @@ def load_environment(

Args:
num_train_examples: Number of training puzzle instances
num_eval_examples: Number of evaluation puzzle instances
num_files: Number of files per puzzle (default: 4)
max_turns: Maximum REPL iterations (default: 50)
seed: Random seed for dataset generation
@@ -477,19 +478,13 @@
num_files=num_files,
)

eval_dataset = build_dataset(
num_examples=num_eval_examples,
num_files=num_files,
)

rubric = vf.Rubric(
funcs=[correct_answer, correct_filesystem_state],
weights=[0.5, 0.5],
)

return RLMSecretsEnv(
dataset=train_dataset,
eval_dataset=eval_dataset,
num_files=num_files,
repl_language=repl_language,
rubric=rubric,
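For reference, after this change each row emitted by `build_dataset` carries an explicit `example_id` and `task` field alongside the existing columns; an illustrative row might look roughly like this (values are placeholders, not taken from a real puzzle):

```python
# Illustrative only: shape of one dataset row after this change.
row = {
    "example_id": 0,
    "prompt": ...,                  # prompt built from the generated puzzle
    "answer": "2",                  # str(puzzle["correct_position"])
    "info": {"puzzle": ...},        # full puzzle spec, consumed by the rubric
    "task": "rlm-secrets",          # constant task name added in this PR
}
```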
98 changes: 98 additions & 0 deletions tests/test_rlm_env.py
@@ -1036,6 +1036,7 @@ class TestSubLLMRequestPaths:
async def test_interleaved_uses_tokens_endpoint(self, rlm_env):
mock_client = MagicMock()
mock_response = MagicMock()
mock_response.choices = [MagicMock()]
mock_client.post = AsyncMock(return_value=mock_response)
mock_client.chat.completions.create = AsyncMock()

@@ -1066,6 +1067,103 @@ async def test_interleaved_uses_tokens_endpoint(self, rlm_env):
assert "max_tokens" not in body
mock_client.chat.completions.create.assert_not_called()

@pytest.mark.asyncio
async def test_sub_llm_normalizes_messages(self, rlm_env):
mock_client = MagicMock()
mock_message = MagicMock()
mock_message.tool_calls = None
mock_message.content = "ok"
mock_response = MagicMock()
mock_response.choices = [MagicMock(message=mock_message)]
mock_client.chat.completions.create = AsyncMock(return_value=mock_response)

rlm_env.interleaved_rollouts = False
messages = [
{"role": "user", "content": {"type": "text", "text": "hello"}},
{"role": "user", "content": {"role": "user", "content": "inner"}},
]
state = {}

await rlm_env._call_sub_llm_api(state, mock_client, "gpt-4", messages)

args, kwargs = mock_client.chat.completions.create.call_args
assert args == ()
sent_messages = kwargs["messages"]
assert sent_messages[0]["content"] == [{"type": "text", "text": "hello"}]
assert sent_messages[1]["content"] == "inner"

@pytest.mark.asyncio
async def test_interleaved_sub_llm_uses_incremental_prompt_ids(
self, rlm_env_with_sub_tools
):
mock_client = MagicMock()
mock_client.chat.completions.create = AsyncMock()

mock_tool_call = MagicMock()
mock_tool_call.id = "call_1"
mock_tool_call.function.name = "sample_tool"
mock_tool_call.function.arguments = '{"x": 2, "y": 3}'

mock_message1 = MagicMock()
mock_message1.tool_calls = [mock_tool_call]
mock_message1.content = None

mock_message2 = MagicMock()
mock_message2.tool_calls = None
mock_message2.content = "done"

response1 = MagicMock()
response1.choices = [MagicMock(message=mock_message1)]
response2 = MagicMock()
response2.choices = [MagicMock(message=mock_message2)]

mock_client.post = AsyncMock(side_effect=[response1, response2])

rlm_env_with_sub_tools.interleaved_rollouts = True
messages = [{"role": "user", "content": "Add 2 and 3"}]
state = {"sampling_args": {"max_tokens": 7}}

token_payload = {
"prompt_ids": [1],
"prompt_mask": [0],
"completion_ids": [2],
"completion_mask": [1],
"completion_logprobs": [0.0],
"overlong_prompt": False,
"is_truncated": False,
}

with (
patch(
"verifiers.envs.experimental.rlm_env.tokenize_vllm",
new=AsyncMock(return_value=[1, 2, 3]),
) as mock_tokenize,
patch(
"verifiers.envs.experimental.rlm_env.get_prompt_ids",
new=AsyncMock(return_value=[4, 5, 6]),
) as mock_get_prompt_ids,
patch(
"verifiers.envs.experimental.rlm_env.parse_response_tokens",
new=AsyncMock(return_value=token_payload),
),
patch(
"verifiers.envs.experimental.rlm_env.parse_response_messages",
new=AsyncMock(return_value=[{"role": "assistant", "content": "ok"}]),
),
patch(
"verifiers.envs.experimental.rlm_env.parse_is_truncated",
new=AsyncMock(return_value=False),
),
):
await rlm_env_with_sub_tools._run_sub_llm(
state, mock_client, "gpt-4", messages
)

assert mock_client.post.await_count == 2
mock_tokenize.assert_awaited_once()
mock_get_prompt_ids.assert_awaited_once()
mock_client.chat.completions.create.assert_not_called()


# =============================================================================
# 8. Root Tool Serialization (pickle)
52 changes: 39 additions & 13 deletions verifiers/envs/environment.py
@@ -456,6 +456,40 @@ def normalize_sampling_args(sampling_args: SamplingArgs) -> SamplingArgs:
sampling_args.pop("max_completion_tokens")
return {k: v for k, v in sampling_args.items() if v is not None}

client, model, oai_tools, sampling_args, message_type = resolve_optional_args(
client, model, oai_tools, sampling_args, message_type
)
sampling_args = normalize_sampling_args(sampling_args)
if self.interleaved_rollouts:
sampling_args = prepare_sampling_args_for_token_prompts(sampling_args)

prompt_ids: list[int] | None = None
if self.interleaved_rollouts and len(state["trajectory"]) > 0:
prompt_ids = await get_prompt_ids(state, prompt, client)

return await self._call_model_api(
client=client,
model=model,
prompt=prompt,
oai_tools=oai_tools,
sampling_args=sampling_args,
message_type=message_type,
prompt_ids=prompt_ids,
)

async def _call_model_api(
self,
*,
client: AsyncOpenAI,
model: str,
prompt: Messages,
oai_tools: list[ChatCompletionToolParam] | None,
sampling_args: SamplingArgs,
message_type: MessageType,
prompt_ids: list[int] | None = None,
) -> ModelResponse:
"""Shared low-level model call used by main and sub-LLM paths."""

def handle_overlong_prompt(func):
"""Decorator to handle overlong prompt errors from the model API."""

@@ -487,7 +521,7 @@ async def wrapper(*args, **kwargs):
return wrapper

@handle_overlong_prompt
async def get_model_response_with_messages(
async def call_with_messages(
client: AsyncOpenAI,
model: str,
prompt: Messages,
@@ -547,7 +581,7 @@ async def get_model_response_with_messages(
return response

@handle_overlong_prompt
async def get_model_response_with_tokens(
async def call_with_tokens(
client: AsyncOpenAI,
model: str,
prompt: Messages,
@@ -581,16 +615,8 @@ async def get_model_response_with_tokens(
cast_to=ChatCompletion,
)

client, model, oai_tools, sampling_args, message_type = resolve_optional_args(
client, model, oai_tools, sampling_args, message_type
)
sampling_args = normalize_sampling_args(sampling_args)
if self.interleaved_rollouts:
sampling_args = prepare_sampling_args_for_token_prompts(sampling_args)

if self.interleaved_rollouts and len(state["trajectory"]) > 0:
prompt_ids = await get_prompt_ids(state, prompt, client)
response = await get_model_response_with_tokens(
if prompt_ids is not None:
response = await call_with_tokens(
client=client,
model=model,
prompt=prompt,
@@ -600,7 +626,7 @@
message_type=message_type,
)
else:
response = await get_model_response_with_messages(
response = await call_with_messages(
client=client,
model=model,
prompt=prompt,