4 changes: 3 additions & 1 deletion environments/rlm_secrets/README.md
@@ -72,14 +72,16 @@ Both reward functions have equal weight (0.5 each):
| Parameter | Default | Description |
|-----------|---------|-------------|
| `num_train_examples` | 100 | Training puzzles |
| `num_eval_examples` | 20 | Evaluation puzzles |
| `num_files` | 4 | Files per puzzle |
| `max_turns` | 50 | Max REPL iterations |
| `sub_tool_max_turns` | 3 | Max tool turns for sub-LLMs |
| `max_sub_llm_parallelism` | 5 | Concurrent sub-LLM calls |
| `code_execution_timeout` | 120 | Bash execution timeout (seconds) |
| `**kwargs` | - | Passed to `RLMEnv.__init__` |

Note: The eval dataset is not built separately. For evaluation, re-instantiate the
environment with a different `seed` to generate a new synthetic split.
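
A minimal usage sketch of that pattern (the import path and the concrete seed and size values below are illustrative, not part of this repo's documented API):

```python
# Sketch only: assumes `load_environment` can be imported from the rlm_secrets
# module shown in this PR; seeds and example counts are arbitrary.
from rlm_secrets import load_environment

# Training split built from one synthetic seed...
train_env = load_environment(num_train_examples=100, num_files=4, seed=0)

# ...and a second instantiation with a different seed serves as the eval split.
eval_env = load_environment(num_train_examples=20, num_files=4, seed=1)
```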

## Why This Environment?

This environment is specifically designed to test RLM capabilities:
11 changes: 3 additions & 8 deletions environments/rlm_secrets/rlm_secrets.py
@@ -318,6 +318,7 @@ def build_dataset(
Dataset with prompt, answer, and info columns
"""
rows = []
task_name = "rlm-secrets"

for i in range(num_examples):
puzzle = generate_puzzle(num_files=num_files)
@@ -359,9 +360,11 @@

rows.append(
{
"example_id": i,
"prompt": prompt,
"answer": str(puzzle["correct_position"]),
"info": {"puzzle": puzzle},
"task": task_name,
}
)

@@ -443,7 +446,6 @@ async def correct_filesystem_state(state: State) -> float:

def load_environment(
num_train_examples: int = 100,
num_eval_examples: int = 20,
num_files: int = 4,
max_turns: int = 50,
seed: int | None = None,
@@ -458,7 +460,6 @@ def load_environment(

Args:
num_train_examples: Number of training puzzle instances
num_eval_examples: Number of evaluation puzzle instances
num_files: Number of files per puzzle (default: 4)
max_turns: Maximum REPL iterations (default: 50)
seed: Random seed for dataset generation
@@ -477,19 +478,13 @@
num_files=num_files,
)

eval_dataset = build_dataset(
num_examples=num_eval_examples,
num_files=num_files,
)

rubric = vf.Rubric(
funcs=[correct_answer, correct_filesystem_state],
weights=[0.5, 0.5],
)

return RLMSecretsEnv(
dataset=train_dataset,
eval_dataset=eval_dataset,
num_files=num_files,
repl_language=repl_language,
rubric=rubric,
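For reference, after this change each row emitted by `build_dataset` carries an explicit `example_id` and `task` field alongside the existing columns; an illustrative row might look roughly like this (values are placeholders, not taken from a real puzzle):

```python
# Illustrative only: shape of one dataset row after this change.
row = {
    "example_id": 0,
    "prompt": ...,                  # prompt built from the generated puzzle
    "answer": "2",                  # str(puzzle["correct_position"])
    "info": {"puzzle": ...},        # full puzzle spec, consumed by the rubric
    "task": "rlm-secrets",          # constant task name added in this PR
}
```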
98 changes: 98 additions & 0 deletions tests/test_rlm_env.py
@@ -1036,6 +1036,7 @@ class TestSubLLMRequestPaths:
async def test_interleaved_uses_tokens_endpoint(self, rlm_env):
mock_client = MagicMock()
mock_response = MagicMock()
mock_response.choices = [MagicMock()]
mock_client.post = AsyncMock(return_value=mock_response)
mock_client.chat.completions.create = AsyncMock()

@@ -1066,6 +1067,103 @@ async def test_interleaved_uses_tokens_endpoint(self, rlm_env):
assert "max_tokens" not in body
mock_client.chat.completions.create.assert_not_called()

@pytest.mark.asyncio
async def test_sub_llm_normalizes_messages(self, rlm_env):
mock_client = MagicMock()
mock_message = MagicMock()
mock_message.tool_calls = None
mock_message.content = "ok"
mock_response = MagicMock()
mock_response.choices = [MagicMock(message=mock_message)]
mock_client.chat.completions.create = AsyncMock(return_value=mock_response)

rlm_env.interleaved_rollouts = False
messages = [
{"role": "user", "content": {"type": "text", "text": "hello"}},
{"role": "user", "content": {"role": "user", "content": "inner"}},
]
state = {}

await rlm_env._call_sub_llm_api(state, mock_client, "gpt-4", messages)

args, kwargs = mock_client.chat.completions.create.call_args
assert args == ()
sent_messages = kwargs["messages"]
assert sent_messages[0]["content"] == [{"type": "text", "text": "hello"}]
assert sent_messages[1]["content"] == "inner"

@pytest.mark.asyncio
async def test_interleaved_sub_llm_uses_incremental_prompt_ids(
self, rlm_env_with_sub_tools
):
mock_client = MagicMock()
mock_client.chat.completions.create = AsyncMock()

mock_tool_call = MagicMock()
mock_tool_call.id = "call_1"
mock_tool_call.function.name = "sample_tool"
mock_tool_call.function.arguments = '{"x": 2, "y": 3}'

mock_message1 = MagicMock()
mock_message1.tool_calls = [mock_tool_call]
mock_message1.content = None

mock_message2 = MagicMock()
mock_message2.tool_calls = None
mock_message2.content = "done"

response1 = MagicMock()
response1.choices = [MagicMock(message=mock_message1)]
response2 = MagicMock()
response2.choices = [MagicMock(message=mock_message2)]

mock_client.post = AsyncMock(side_effect=[response1, response2])

rlm_env_with_sub_tools.interleaved_rollouts = True
messages = [{"role": "user", "content": "Add 2 and 3"}]
state = {"sampling_args": {"max_tokens": 7}}

token_payload = {
"prompt_ids": [1],
"prompt_mask": [0],
"completion_ids": [2],
"completion_mask": [1],
"completion_logprobs": [0.0],
"overlong_prompt": False,
"is_truncated": False,
}

with (
patch(
"verifiers.envs.experimental.rlm_env.tokenize_vllm",
new=AsyncMock(return_value=[1, 2, 3]),
) as mock_tokenize,
patch(
"verifiers.envs.experimental.rlm_env.get_prompt_ids",
new=AsyncMock(return_value=[4, 5, 6]),
) as mock_get_prompt_ids,
patch(
"verifiers.envs.experimental.rlm_env.parse_response_tokens",
new=AsyncMock(return_value=token_payload),
),
patch(
"verifiers.envs.experimental.rlm_env.parse_response_messages",
new=AsyncMock(return_value=[{"role": "assistant", "content": "ok"}]),
),
patch(
"verifiers.envs.experimental.rlm_env.parse_is_truncated",
new=AsyncMock(return_value=False),
),
):
await rlm_env_with_sub_tools._run_sub_llm(
state, mock_client, "gpt-4", messages
)

assert mock_client.post.await_count == 2
mock_tokenize.assert_awaited_once()
mock_get_prompt_ids.assert_awaited_once()
mock_client.chat.completions.create.assert_not_called()


# =============================================================================
# 8. Root Tool Serialization (pickle)
52 changes: 39 additions & 13 deletions verifiers/envs/environment.py
@@ -456,6 +456,40 @@ def normalize_sampling_args(sampling_args: SamplingArgs) -> SamplingArgs:
sampling_args.pop("max_completion_tokens")
return {k: v for k, v in sampling_args.items() if v is not None}

client, model, oai_tools, sampling_args, message_type = resolve_optional_args(
client, model, oai_tools, sampling_args, message_type
)
sampling_args = normalize_sampling_args(sampling_args)
if self.interleaved_rollouts:
sampling_args = prepare_sampling_args_for_token_prompts(sampling_args)

prompt_ids: list[int] | None = None
if self.interleaved_rollouts and len(state["trajectory"]) > 0:
prompt_ids = await get_prompt_ids(state, prompt, client)

return await self._call_model_api(
client=client,
model=model,
prompt=prompt,
oai_tools=oai_tools,
sampling_args=sampling_args,
message_type=message_type,
prompt_ids=prompt_ids,
)

async def _call_model_api(
self,
*,
client: AsyncOpenAI,
model: str,
prompt: Messages,
oai_tools: list[ChatCompletionToolParam] | None,
sampling_args: SamplingArgs,
message_type: MessageType,
prompt_ids: list[int] | None = None,
) -> ModelResponse:
"""Shared low-level model call used by main and sub-LLM paths."""

def handle_overlong_prompt(func):
"""Decorator to handle overlong prompt errors from the model API."""

@@ -487,7 +521,7 @@ async def wrapper(*args, **kwargs):
return wrapper

@handle_overlong_prompt
async def get_model_response_with_messages(
async def call_with_messages(
client: AsyncOpenAI,
model: str,
prompt: Messages,
@@ -547,7 +581,7 @@ async def get_model_response_with_messages(
return response

@handle_overlong_prompt
async def get_model_response_with_tokens(
async def call_with_tokens(
client: AsyncOpenAI,
model: str,
prompt: Messages,
@@ -581,16 +615,8 @@ async def get_model_response_with_tokens(
cast_to=ChatCompletion,
)

client, model, oai_tools, sampling_args, message_type = resolve_optional_args(
client, model, oai_tools, sampling_args, message_type
)
sampling_args = normalize_sampling_args(sampling_args)
if self.interleaved_rollouts:
sampling_args = prepare_sampling_args_for_token_prompts(sampling_args)

if self.interleaved_rollouts and len(state["trajectory"]) > 0:
prompt_ids = await get_prompt_ids(state, prompt, client)
response = await get_model_response_with_tokens(
if prompt_ids is not None:
response = await call_with_tokens(
client=client,
model=model,
prompt=prompt,
@@ -600,7 +626,7 @@
message_type=message_type,
)
else:
response = await get_model_response_with_messages(
response = await call_with_messages(
client=client,
model=model,
prompt=prompt,