diff --git a/pyproject.toml b/pyproject.toml
index 3c53086..94d4b71 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -85,6 +85,7 @@ swe-agent = [
"unidiff>=0.7",
]
gcp = [
+ "cooperbench",
"google-cloud-batch>=0.17",
"google-cloud-compute>=1.0",
"google-cloud-storage>=2.0",
@@ -186,7 +187,7 @@ exclude_lines = [
]
[tool.uv.sources]
-cooperbench = { workspace = true }
openhands-sdk = { path = "src/cooperbench/agents/openhands_agent_sdk/openhands-sdk", editable = true }
openhands-tools = { path = "src/cooperbench/agents/openhands_agent_sdk/openhands-tools", editable = true }
openhands-workspace = { path = "src/cooperbench/agents/openhands_agent_sdk/openhands-workspace", editable = true }
+cooperbench = { workspace = true }
diff --git a/src/cooperbench/agents/__init__.py b/src/cooperbench/agents/__init__.py
index 6ad8c62..631f169 100644
--- a/src/cooperbench/agents/__init__.py
+++ b/src/cooperbench/agents/__init__.py
@@ -104,6 +104,7 @@ def run(
# Add your agent's shorthand here when registering a new adapter
AGENT_SHORTHANDS = {
"mini_swe_agent": "msa",
+ "mini_swe_agent_v2": "msa_v2",
"swe_agent": "sw",
"openhands_sdk": "oh",
}
diff --git a/src/cooperbench/agents/mini_swe_agent/environments/modal.py b/src/cooperbench/agents/mini_swe_agent/environments/modal.py
index 714b2c2..42e54b7 100644
--- a/src/cooperbench/agents/mini_swe_agent/environments/modal.py
+++ b/src/cooperbench/agents/mini_swe_agent/environments/modal.py
@@ -69,8 +69,7 @@ def _get_or_build_image(image_name: str) -> modal.Image:
if image_name in _image_cache:
return _image_cache[image_name]
- # Build and cache (CACHE_BUST env forces new image hash) TODO: @arpan remove this after testing
- image = modal.Image.from_registry(image_name).entrypoint([]).env({"COOPERBENCH_CACHE": "v6"})
+ image = modal.Image.from_registry(image_name).entrypoint([])
_image_cache[image_name] = image
return image
diff --git a/src/cooperbench/agents/mini_swe_agent_v2/__init__.py b/src/cooperbench/agents/mini_swe_agent_v2/__init__.py
new file mode 100644
index 0000000..207db03
--- /dev/null
+++ b/src/cooperbench/agents/mini_swe_agent_v2/__init__.py
@@ -0,0 +1,102 @@
+"""
+mini-swe-agent v2 - A Minimal AI Agent for Software Engineering (Tool-calling version)
+
+Source: https://github.com/SWE-agent/mini-swe-agent
+Version: 2.1.0 (commit 56613dd)
+License: MIT
+Copyright (c) 2025 Kilian A. Lieret and Carlos E. Jimenez
+
+This code is copied directly from mini-swe-agent with modifications:
+- Import paths changed from 'minisweagent' to 'cooperbench.agents.mini_swe_agent_v2'
+- Removed text-based (regex) action parsing (v1 compat), keeping only tool calling
+- Removed interactive agent, non-Docker environments, non-litellm models
+- Added inter-agent communication (messaging + git connectors) for CooperBench
+
+Citation:
+ @misc{minisweagent2025,
+ title={mini-swe-agent: A Minimal AI Agent for Software Engineering},
+ author={Lieret, Kilian A. and Jimenez, Carlos E.},
+ year={2025},
+ url={https://github.com/SWE-agent/mini-swe-agent}
+ }
+
+This file provides:
+- Path settings for global config file & relative directories
+- Version numbering
+- Protocols for the core components of mini-swe-agent.
+"""
+
+__version__ = "2.1.0"
+
+import os
+from pathlib import Path
+from typing import Any, Protocol
+
+import dotenv
+from platformdirs import user_config_dir
+
+from cooperbench.agents.mini_swe_agent_v2.utils.log import logger
+
+package_dir = Path(__file__).resolve().parent
+
+global_config_dir = Path(os.getenv("MSWEA_GLOBAL_CONFIG_DIR") or user_config_dir("mini-swe-agent"))
+global_config_dir.mkdir(parents=True, exist_ok=True)
+global_config_file = Path(global_config_dir) / ".env"
+
+dotenv.load_dotenv(dotenv_path=global_config_file)
+
+
+# === Protocols ===
+# You can ignore them unless you want static type checking.
+
+
+class Model(Protocol):
+ """Protocol for language models."""
+
+ config: Any
+
+ def query(self, messages: list[dict[str, str]], **kwargs) -> dict: ...
+
+ def format_message(self, **kwargs) -> dict: ...
+
+ def format_observation_messages(
+ self, message: dict, outputs: list[dict], template_vars: dict | None = None
+ ) -> list[dict]: ...
+
+ def get_template_vars(self, **kwargs) -> dict[str, Any]: ...
+
+ def serialize(self) -> dict: ...
+
+
+class Environment(Protocol):
+ """Protocol for execution environments."""
+
+ config: Any
+
+ def execute(self, action: dict, cwd: str = "") -> dict[str, Any]: ...
+
+ def get_template_vars(self, **kwargs) -> dict[str, Any]: ...
+
+ def serialize(self) -> dict: ...
+
+
+class Agent(Protocol):
+ """Protocol for agents."""
+
+ config: Any
+
+ def run(self, task: str, **kwargs) -> dict: ...
+
+ def save(self, path: Path | None, *extra_dicts) -> dict: ...
+
+
+__all__ = [
+ "Agent",
+ "Model",
+ "Environment",
+ "package_dir",
+ "__version__",
+ "global_config_file",
+ "global_config_dir",
+ "logger",
+]
diff --git a/src/cooperbench/agents/mini_swe_agent_v2/adapter.py b/src/cooperbench/agents/mini_swe_agent_v2/adapter.py
new file mode 100644
index 0000000..c9bcbf7
--- /dev/null
+++ b/src/cooperbench/agents/mini_swe_agent_v2/adapter.py
@@ -0,0 +1,145 @@
+"""Mini-SWE-Agent v2 adapter for CooperBench.
+
+This adapter wraps the mini-swe-agent v2 framework (tool-calling version)
+to conform to the AgentRunner interface used by CooperBench.
+"""
+
+import yaml
+
+from cooperbench.agents import AgentResult
+from cooperbench.agents.mini_swe_agent_v2.agents.default import DefaultAgent
+from cooperbench.agents.mini_swe_agent_v2.config import get_config_path
+from cooperbench.agents.mini_swe_agent_v2.connectors import GitConnector
+from cooperbench.agents.mini_swe_agent_v2.connectors.messaging import MessagingConnector
+from cooperbench.agents.mini_swe_agent_v2.models.litellm_model import LitellmModel
+from cooperbench.agents.mini_swe_agent_v2.models.utils.actions_toolcall import SEND_MESSAGE_TOOL
+from cooperbench.agents.registry import register
+
+
+@register("mini_swe_agent_v2")
+class MiniSweAgentV2Runner:
+ """Adapter for mini-swe-agent v2 framework (tool-calling)."""
+
+ def run(
+ self,
+ task: str,
+ image: str,
+ *,
+ agent_id: str = "agent",
+ model_name: str = "gpt-4o",
+ agents: list[str] | None = None,
+ comm_url: str | None = None,
+ git_server_url: str | None = None,
+ git_enabled: bool = False,
+ messaging_enabled: bool = True,
+ config: dict | None = None,
+ agent_config: str | None = None,
+ log_dir: str | None = None,
+ ) -> AgentResult:
+ """Run mini-swe-agent v2 on a task."""
+ # Always load default config, then merge with any overrides
+ config_path = get_config_path("mini")
+ with open(config_path) as f:
+ default_config = yaml.safe_load(f)
+
+ # Merge passed config overrides into default config
+ if config is not None:
+ default_config.update(config)
+
+ agent_cfg = default_config.get("agent", {})
+ model_cfg = default_config.get("model", {})
+ env_cfg = default_config.get("environment", {})
+ backend = default_config.get("backend", "modal")
+
+ # Create environment based on backend
+ env_kwargs = {
+ "image": image,
+ "cwd": "/workspace/repo",
+ "timeout": 3600,
+ }
+ if env_cfg.get("env"):
+ env_kwargs["env"] = env_cfg["env"]
+
+ if backend == "docker":
+ from cooperbench.agents.mini_swe_agent_v2.environments.docker import DockerEnvironment
+
+ if config and config.get("git_network"):
+ env_kwargs["network"] = config["git_network"]
+ env = DockerEnvironment(**env_kwargs)
+ else:
+ from cooperbench.agents.mini_swe_agent_v2.environments.modal import ModalEnvironment
+
+ env = ModalEnvironment(**env_kwargs)
+
+ # Capture base commit for patch generation
+ base_commit_result = env.execute({"command": "git rev-parse HEAD"})
+ base_commit = base_commit_result.get("output", "").strip()
+
+ # Setup messaging connector if enabled
+ comm = None
+ use_messaging = messaging_enabled and comm_url and agents and len(agents) > 1
+ if use_messaging:
+ comm = MessagingConnector(agent_id=agent_id, agents=agents, url=comm_url)
+
+ # Create LLM model with send_message tool if messaging is enabled
+ extra_tools = [SEND_MESSAGE_TOOL] if use_messaging else None
+ model = LitellmModel(model_name=model_name, extra_tools=extra_tools, **model_cfg)
+
+ # Setup git connector if enabled
+ if git_enabled and git_server_url and agents:
+ git_connector = GitConnector(
+ agent_id=agent_id,
+ agents=agents,
+ server_url=git_server_url,
+ )
+ git_connector.setup(env)
+
+ # Create agent with template variables for collaboration
+ extra_vars = {
+ "agent_id": agent_id if (agents and len(agents) > 1) else None,
+ "agents": agents if agents else [],
+ "git_enabled": git_enabled,
+ "messaging_enabled": messaging_enabled,
+ }
+
+ agent = DefaultAgent(
+ model=model,
+ env=env,
+ comm=comm,
+ agent_id=agent_id,
+ **agent_cfg,
+ )
+ agent.extra_template_vars.update(extra_vars)
+
+ # Run agent
+ error_msg = None
+ try:
+ result = agent.run(task=task)
+ status = result.get("exit_status", "Submitted")
+ except Exception as e:
+ status = "Error"
+ error_msg = str(e)
+
+ # Extract patch (committed + uncommitted changes)
+ patch = self._get_patch(env, base_commit)
+
+ # Cleanup
+ env.cleanup()
+
+ return AgentResult(
+ status=status,
+ patch=patch,
+ cost=agent.cost,
+ steps=agent.n_calls,
+ messages=agent.messages,
+ sent_messages=agent.sent_messages,
+ error=error_msg,
+ )
+
+ def _get_patch(self, env, base_commit: str) -> str:
+ """Extract git diff from base commit to current working tree state."""
+ try:
+ result = env.execute({"command": f"git diff {base_commit}"})
+ return result.get("output", "").strip()
+ except Exception:
+ return ""
diff --git a/src/cooperbench/agents/mini_swe_agent_v2/agents/__init__.py b/src/cooperbench/agents/mini_swe_agent_v2/agents/__init__.py
new file mode 100644
index 0000000..d2df172
--- /dev/null
+++ b/src/cooperbench/agents/mini_swe_agent_v2/agents/__init__.py
@@ -0,0 +1 @@
+"""Agent implementations for mini-SWE-agent v2."""
diff --git a/src/cooperbench/agents/mini_swe_agent_v2/agents/default.py b/src/cooperbench/agents/mini_swe_agent_v2/agents/default.py
new file mode 100644
index 0000000..75e2f47
--- /dev/null
+++ b/src/cooperbench/agents/mini_swe_agent_v2/agents/default.py
@@ -0,0 +1,205 @@
+"""Basic agent class. See https://mini-swe-agent.com/latest/advanced/control_flow/ for visual explanation
+or https://minimal-agent.com for a tutorial on the basic building principles.
+"""
+
+import json
+import logging
+import traceback
+from pathlib import Path
+
+from jinja2 import StrictUndefined, Template
+from pydantic import BaseModel
+
+from cooperbench.agents.mini_swe_agent_v2 import Environment, Model, __version__
+from cooperbench.agents.mini_swe_agent_v2.connectors.messaging import MessagingConnector
+from cooperbench.agents.mini_swe_agent_v2.exceptions import InterruptAgentFlow, LimitsExceeded
+from cooperbench.agents.mini_swe_agent_v2.utils.serialize import recursive_merge
+
+
+class AgentConfig(BaseModel):
+ """Check the config files in config/ for example settings."""
+
+ system_template: str
+ """Template for the system message (the first message)."""
+ instance_template: str
+ """Template for the first user message specifying the task (the second message overall)."""
+ step_limit: int = 0
+ """Maximum number of steps the agent can take."""
+ cost_limit: float = 3.0
+ """Stop agent after exceeding (!) this cost."""
+ output_path: Path | None = None
+ """Save the trajectory to this path."""
+
+
+class DefaultAgent:
+ def __init__(
+ self,
+ model: Model,
+ env: Environment,
+ *,
+ comm: MessagingConnector | None = None,
+ agent_id: str = "agent",
+ config_class: type = AgentConfig,
+ **kwargs,
+ ):
+ """See the `AgentConfig` class for permitted keyword arguments."""
+ self.config = config_class(**kwargs)
+ self.messages: list[dict] = []
+ self.model = model
+ self.env = env
+ self.comm = comm
+ self.agent_id = agent_id
+ self.extra_template_vars = {}
+ self.logger = logging.getLogger("agent")
+ self.cost = 0.0
+ self.n_calls = 0
+ self.sent_messages: list[dict] = []
+
+ def log(self, msg: str):
+ """Log message with agent prefix."""
+ self.logger.debug(f"[{self.agent_id}] {msg}")
+
+ def get_template_vars(self, **kwargs) -> dict:
+ return recursive_merge(
+ self.config.model_dump(),
+ self.env.get_template_vars(),
+ self.model.get_template_vars(),
+ {"n_model_calls": self.n_calls, "model_cost": self.cost},
+ self.extra_template_vars,
+ kwargs,
+ )
+
+ def _render_template(self, template: str) -> str:
+ return Template(template, undefined=StrictUndefined).render(**self.get_template_vars())
+
+ def add_messages(self, *messages: dict) -> list[dict]:
+ self.logger.debug(messages) # set log level to debug to see
+ self.messages.extend(messages)
+ return list(messages)
+
+ def handle_uncaught_exception(self, e: Exception) -> list[dict]:
+ return self.add_messages(
+ self.model.format_message(
+ role="exit",
+ content=str(e),
+ extra={
+ "exit_status": type(e).__name__,
+ "submission": "",
+ "exception_str": str(e),
+ "traceback": traceback.format_exc(),
+ },
+ )
+ )
+
+ def run(self, task: str = "", **kwargs) -> dict:
+ """Run step() until agent is finished. Returns dictionary with exit_status, submission keys."""
+ self.extra_template_vars |= {"task": task, **kwargs}
+ self.messages = []
+ self.add_messages(
+ self.model.format_message(role="system", content=self._render_template(self.config.system_template)),
+ self.model.format_message(role="user", content=self._render_template(self.config.instance_template)),
+ )
+ while True:
+ try:
+ self.step()
+ except InterruptAgentFlow as e:
+ self.add_messages(*e.messages)
+ except Exception as e:
+ self.handle_uncaught_exception(e)
+ raise
+ finally:
+ self.save(self.config.output_path)
+ if self.messages[-1].get("role") == "exit":
+ break
+ return self.messages[-1].get("extra", {})
+
+ def step(self) -> list[dict]:
+ """Query the LM, execute actions. Polls for inter-agent messages before querying."""
+ # Check for inter-agent messages before querying LLM
+ if self.comm:
+ messages = self.comm.receive()
+ for msg in messages:
+ ts = msg.get("timestamp", "")[:19].replace("T", " ")
+ self.log(f"INBOX: [{msg['from']} @ {ts}] {msg['content']}")
+ self.add_messages(
+ self.model.format_message(
+ role="user",
+ content=f"[Message from {msg['from']}]: {msg['content']}",
+ )
+ )
+ return self.execute_actions(self.query())
+
+ def query(self) -> dict:
+ """Query the model and return model messages. Override to add hooks."""
+ if 0 < self.config.step_limit <= self.n_calls or 0 < self.config.cost_limit <= self.cost:
+ raise LimitsExceeded(
+ {
+ "role": "exit",
+ "content": "LimitsExceeded",
+ "extra": {"exit_status": "LimitsExceeded", "submission": ""},
+ }
+ )
+ self.n_calls += 1
+ message = self.model.query(self.messages)
+ self.cost += message.get("extra", {}).get("cost", 0.0)
+ self.add_messages(message)
+ return message
+
+ def execute_actions(self, message: dict) -> list[dict]:
+ """Execute actions in message, add observation messages, return them.
+
+ Handles both bash and send_message tool calls.
+ """
+ actions = message.get("extra", {}).get("actions", [])
+ outputs = []
+ for action in actions:
+ tool_name = action.get("tool_name", "bash")
+ if tool_name == "send_message" and self.comm:
+ output = self._handle_send_message(action)
+ else:
+ outputs.append(self.env.execute(action))
+ continue
+ outputs.append(output)
+ return self.add_messages(*self.model.format_observation_messages(message, outputs, self.get_template_vars()))
+
+ def _handle_send_message(self, action: dict) -> dict:
+ """Handle a send_message tool call via the messaging connector."""
+ recipient = action.get("recipient", "")
+ content = action.get("content", "")
+ self.comm.send(recipient, content)
+ self.log(f"SENT to {recipient}: {content[:80]}...")
+ self.sent_messages.append({"to": recipient, "content": content})
+ return {"output": f"Message sent to {recipient}", "returncode": 0}
+
+ def serialize(self, *extra_dicts) -> dict:
+ """Serialize agent state to a json-compatible nested dictionary for saving."""
+ last_message = self.messages[-1] if self.messages else {}
+ last_extra = last_message.get("extra", {})
+ agent_data = {
+ "info": {
+ "model_stats": {
+ "instance_cost": self.cost,
+ "api_calls": self.n_calls,
+ },
+ "config": {
+ "agent": self.config.model_dump(mode="json"),
+ "agent_type": f"{self.__class__.__module__}.{self.__class__.__name__}",
+ },
+ "mini_version": __version__,
+ "exit_status": last_extra.get("exit_status", ""),
+ "submission": last_extra.get("submission", ""),
+ },
+ "messages": self.messages,
+ "trajectory_format": "mini-swe-agent-1.1",
+ }
+ return recursive_merge(agent_data, self.model.serialize(), self.env.serialize(), *extra_dicts)
+
+ def save(self, path: Path | None, *extra_dicts) -> dict:
+ """Save the trajectory of the agent to a file if path is given. Returns full serialized data.
+ You can pass additional dictionaries with extra data to be (recursively) merged into the output data.
+ """
+ data = self.serialize(*extra_dicts)
+ if path:
+ path.parent.mkdir(parents=True, exist_ok=True)
+ path.write_text(json.dumps(data, indent=2))
+ return data
diff --git a/src/cooperbench/agents/mini_swe_agent_v2/config/__init__.py b/src/cooperbench/agents/mini_swe_agent_v2/config/__init__.py
new file mode 100644
index 0000000..54ef179
--- /dev/null
+++ b/src/cooperbench/agents/mini_swe_agent_v2/config/__init__.py
@@ -0,0 +1,62 @@
+"""Configuration files and utilities for mini-SWE-agent."""
+
+import json
+import os
+from pathlib import Path
+
+import yaml
+
+builtin_config_dir = Path(__file__).parent
+
+
+def get_config_path(config_spec: str | Path) -> Path:
+ """Get the path to a config file."""
+ config_spec = Path(config_spec)
+ if config_spec.suffix != ".yaml":
+ config_spec = config_spec.with_suffix(".yaml")
+ candidates = [
+ Path(config_spec),
+ Path(os.getenv("MSWEA_CONFIG_DIR", ".")) / config_spec,
+ builtin_config_dir / config_spec,
+ builtin_config_dir / "extra" / config_spec,
+ builtin_config_dir / "benchmarks" / config_spec,
+ ]
+ for candidate in candidates:
+ if candidate.exists():
+ return candidate
+
+ raise FileNotFoundError(f"Could not find config file for {config_spec} (tried: {candidates})")
+
+
+def _key_value_spec_to_nested_dict(config_spec: str) -> dict:
+ """Interpret key-value specs from the command line.
+
+ Example:
+
+ "model.model_name=anthropic/claude-sonnet-4-5-20250929" ->
+ {"model": {"model_name": "anthropic/claude-sonnet-4-5-20250929"}}
+ """
+ key, value = config_spec.split("=", 1)
+ try:
+ value = json.loads(value)
+ except json.JSONDecodeError:
+ pass
+ keys = key.split(".")
+ result = {}
+ current = result
+ for k in keys[:-1]:
+ current[k] = {}
+ current = current[k]
+ current[keys[-1]] = value
+ return result
+
+
+def get_config_from_spec(config_spec: str | Path) -> dict:
+ """Get a config from a config spec."""
+ if isinstance(config_spec, str) and "=" in config_spec:
+ return _key_value_spec_to_nested_dict(config_spec)
+ path = get_config_path(config_spec)
+ return yaml.safe_load(path.read_text())
+
+
+__all__ = ["builtin_config_dir", "get_config_path", "get_config_from_spec", "_key_value_spec_to_nested_dict"]
diff --git a/src/cooperbench/agents/mini_swe_agent_v2/config/mini.yaml b/src/cooperbench/agents/mini_swe_agent_v2/config/mini.yaml
new file mode 100644
index 0000000..b52ad8b
--- /dev/null
+++ b/src/cooperbench/agents/mini_swe_agent_v2/config/mini.yaml
@@ -0,0 +1,147 @@
+agent:
+ system_template: |
+ You are a helpful assistant that can interact with a computer.
+ instance_template: |
+ Please solve this issue: {{task}}
+
+ You can execute bash commands and edit files to implement the necessary changes.
+
+ ## Recommended Workflow
+
+ This workflow should be done step-by-step so that you can iterate on your changes and any possible problems.
+
+ 1. Analyze the codebase by finding and reading relevant files
+ 2. Create a script to reproduce the issue
+ 3. Edit the source code to resolve the issue
+ 4. Verify your fix works by running your script again
+ 5. Test edge cases to ensure your fix is robust
+ 6. Submit your changes and finish your work by issuing the following command: `echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT`.
+ Do not combine it with any other command. After this command, you cannot continue working on this task.
+
+ ## Command Execution Rules
+
+ You are operating in an environment where
+
+ 1. You issue at least one command
+ 2. The system executes the command(s) in a subshell
+ 3. You see the result(s)
+ 4. You write your next command(s)
+
+ Each response should include:
+
+ 1. **Reasoning text** where you explain your analysis and plan
+ 2. At least one tool call with your command
+
+ **CRITICAL REQUIREMENTS:**
+
+ - Your response SHOULD include reasoning text explaining what you're doing
+ - Your response MUST include AT LEAST ONE bash tool call
+ - Directory or environment variable changes are not persistent. Every action is executed in a new subshell.
+ - However, you can prefix any action with `MY_ENV_VAR=MY_VALUE cd /path/to/working/dir && ...` or write/load environment variables from files
+ - Submit your changes and finish your work by issuing the following command: `echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT`.
+ Do not combine it with any other command. After this command, you cannot continue working on this task.
+
+ Example of a CORRECT response:
+
+ I need to understand the structure of the repository first. Let me check what files are in the current directory to get a better understanding of the codebase.
+
+ [Makes bash tool call with {"command": "ls -la"} as arguments]
+
+
+
+ {{system}} {{release}} {{version}} {{machine}}
+
+
+ ## Useful command examples
+
+ ### Create a new file:
+
+ ```bash
+ cat <<'EOF' > newfile.py
+ import numpy as np
+ hello = "world"
+ print(hello)
+ EOF
+ ```
+
+ ### Edit files with sed:
+
+ {%- if system == "Darwin" -%}
+
+ You are on MacOS. For all the below examples, you need to use `sed -i ''` instead of `sed -i`.
+
+ {%- endif -%}
+
+ ```bash
+ # Replace all occurrences
+ sed -i 's/old_string/new_string/g' filename.py
+
+ # Replace only first occurrence
+ sed -i 's/old_string/new_string/' filename.py
+
+ # Replace first occurrence on line 1
+ sed -i '1s/old_string/new_string/' filename.py
+
+ # Replace all occurrences in lines 1-10
+ sed -i '1,10s/old_string/new_string/g' filename.py
+ ```
+
+ ### View file content:
+
+ ```bash
+ # View specific lines with numbers
+ nl -ba filename.py | sed -n '10,20p'
+ ```
+
+ ### Any other command you want to run
+
+ ```bash
+ anything
+ ```
+ step_limit: 0
+ cost_limit: 3.
+ mode: confirm
+environment:
+ env:
+ PAGER: cat
+ MANPAGER: cat
+ LESS: -R
+ PIP_PROGRESS_BAR: 'off'
+ TQDM_DISABLE: '1'
+model:
+ observation_template: |
+ {%- if output.output | length < 10000 -%}
+ {
+ "returncode": {{ output.returncode }},
+ "output": {{ output.output | tojson }}
+ {%- if output.exception_info %}, "exception_info": {{ output.exception_info | tojson }}{% endif %}
+ }
+ {%- else -%}
+ {
+ "returncode": {{ output.returncode }},
+ "output_head": {{ output.output[:5000] | tojson }},
+ "output_tail": {{ output.output[-5000:] | tojson }},
+ "elided_chars": {{ output.output | length - 10000 }},
+ "warning": "Output too long."
+ {%- if output.exception_info %}, "exception_info": {{ output.exception_info | tojson }}{% endif %}
+ }
+ {%- endif -%}
+ format_error_template: |
+ Tool call error:
+
+
+ {{error}}
+
+
+ Here is general guidance on how to submit correct toolcalls:
+
+ Every response needs to use the 'bash' tool at least once to execute commands.
+
+ Call the bash tool with your command as the argument:
+ - Tool: bash
+ - Arguments: {"command": "your_command_here"}
+
+ If you want to end the task, please issue the following command: `echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT`
+ without any other command.
+ model_kwargs:
+ drop_params: true
diff --git a/src/cooperbench/agents/mini_swe_agent_v2/connectors/__init__.py b/src/cooperbench/agents/mini_swe_agent_v2/connectors/__init__.py
new file mode 100644
index 0000000..adc30bb
--- /dev/null
+++ b/src/cooperbench/agents/mini_swe_agent_v2/connectors/__init__.py
@@ -0,0 +1,21 @@
+"""Connectors for inter-agent communication."""
+
+from cooperbench.agents.mini_swe_agent_v2.connectors.git import GitConnector
+from cooperbench.agents.mini_swe_agent_v2.connectors.git_servers import (
+ DockerGitServer,
+ GCPGitServer,
+ GitServer,
+ ModalGitServer,
+ create_git_server,
+)
+from cooperbench.agents.mini_swe_agent_v2.connectors.messaging import MessagingConnector
+
+__all__ = [
+ "DockerGitServer",
+ "GCPGitServer",
+ "GitConnector",
+ "GitServer",
+ "MessagingConnector",
+ "ModalGitServer",
+ "create_git_server",
+]
diff --git a/src/cooperbench/agents/mini_swe_agent_v2/connectors/git.py b/src/cooperbench/agents/mini_swe_agent_v2/connectors/git.py
new file mode 100644
index 0000000..b0781be
--- /dev/null
+++ b/src/cooperbench/agents/mini_swe_agent_v2/connectors/git.py
@@ -0,0 +1,132 @@
+"""Git-based code sharing between agents.
+
+Enables agents in separate containers to share code via git push/pull.
+Uses a shared git server sandbox that agents connect to as a remote.
+
+Architecture:
+ +---------------------------------------------------------+
+ | Git Server Sandbox |
+ | git daemon --enable=receive-pack (bare repo) |
+ +---------------------------------------------------------+
+ ^
+ +----------------+----------------+
+ | | |
+ git push git fetch git push
+ git pull git pull
+ | | |
+ +---------v----+ +---------v----+
+ | Agent A | | Agent B |
+ | sandbox | | sandbox |
+ +--------------+ +--------------+
+
+Example:
+ # Create shared git server (once per task)
+ from cooperbench.agents.mini_swe_agent_v2.connectors.git_servers import get_git_server
+
+ GitServerClass = get_git_server("docker") # or "modal"
+ git_server = GitServerClass.create(run_id="abc123")
+
+ # Create connector for each agent
+ git = GitConnector(
+ agent_id="agent1",
+ agents=["agent1", "agent2"],
+ server_url=git_server.url
+ )
+
+ # Configure agent's sandbox
+ git.setup(env)
+
+ # Agent can now use git normally:
+ # git push team agent1
+ # git fetch team
+ # git merge team/agent2
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+ from cooperbench.agents.mini_swe_agent_v2.environments.docker import DockerEnvironment
+
+
+class GitConnector:
+ """Configures an agent's sandbox for git collaboration.
+
+ After setup(), the agent can use standard git commands:
+ - git push team - share changes
+ - git fetch team - get other agents' branches
+ - git merge team/ - merge another agent's work
+ - git branch -r - list remote branches
+ """
+
+ # Remote name used in agent's git config
+ REMOTE_NAME = "team"
+
+ def __init__(
+ self,
+ agent_id: str,
+ agents: list[str],
+ server_url: str,
+ ):
+ """Initialize git connector.
+
+ Args:
+ agent_id: This agent's unique identifier (e.g., "agent1")
+ agents: List of all agent IDs in the collaboration
+ server_url: Git server URL from GitServer.url
+ """
+ self.agent_id = agent_id
+ self.agents = agents
+ self.server_url = server_url
+ self._logger = logging.getLogger("cooperbench.agents.mini_swe_agent_v2.git_connector")
+ self._initialized = False
+
+ def _exec(self, env: DockerEnvironment, command: str) -> dict:
+ """Execute a command in the environment (v2 uses dict-based actions)."""
+ return env.execute({"command": command})
+
+ def setup(self, env: DockerEnvironment) -> None:
+ """Configure git remote in the agent's sandbox.
+
+ This sets up the 'team' remote pointing to the shared git server,
+ creates an agent-specific branch, and pushes the initial state.
+
+ Args:
+ env: The agent's Docker environment
+
+ Raises:
+ RuntimeError: If git configuration fails
+ """
+ self._logger.debug(f"Setting up git for {self.agent_id}")
+
+ # Configure git user (needed for commits)
+ self._exec(env, 'git config user.email "agent@cooperbench.local"')
+ self._exec(env, f'git config user.name "{self.agent_id}"')
+
+ # Add shared remote
+ result = self._exec(env, f"git remote add {self.REMOTE_NAME} {self.server_url}")
+ if result.get("returncode", 0) != 0:
+ # Remote might already exist
+ self._exec(env, f"git remote set-url {self.REMOTE_NAME} {self.server_url}")
+
+ # Create agent's branch
+ self._exec(env, f"git checkout -b {self.agent_id}")
+
+ # Push initial state (first agent initializes the server)
+ # Use --force in case branch exists from a previous run
+ result = self._exec(env, f"git push -u {self.REMOTE_NAME} {self.agent_id} --force")
+ if result.get("returncode", 0) != 0:
+ self._logger.warning(f"Initial push failed: {result.get('output', '')}")
+
+ # Also push main/master as base reference
+ self._exec(env, f"git push {self.REMOTE_NAME} HEAD:refs/heads/main --force 2>/dev/null || true")
+
+ self._initialized = True
+ self._logger.debug(f"Git setup complete for {self.agent_id}")
+
+ @property
+ def is_initialized(self) -> bool:
+ """Whether setup() has been called."""
+ return self._initialized
diff --git a/src/cooperbench/agents/mini_swe_agent_v2/connectors/git_servers/__init__.py b/src/cooperbench/agents/mini_swe_agent_v2/connectors/git_servers/__init__.py
new file mode 100644
index 0000000..6f4b765
--- /dev/null
+++ b/src/cooperbench/agents/mini_swe_agent_v2/connectors/git_servers/__init__.py
@@ -0,0 +1,89 @@
+"""Git servers for inter-agent code collaboration.
+
+Provides pluggable backends for hosting shared git repositories:
+- Modal: Cloud-based sandboxes (default)
+- Docker: Local containers
+- GCP: Google Cloud Platform VMs
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from cooperbench.agents.mini_swe_agent_v2.connectors.git_servers.base import GitServer
+from cooperbench.agents.mini_swe_agent_v2.connectors.git_servers.docker import DockerGitServer
+from cooperbench.agents.mini_swe_agent_v2.connectors.git_servers.gcp import GCPGitServer
+from cooperbench.agents.mini_swe_agent_v2.connectors.git_servers.modal import ModalGitServer
+
+if TYPE_CHECKING:
+ import modal
+
+__all__ = ["GitServer", "ModalGitServer", "DockerGitServer", "GCPGitServer", "create_git_server"]
+
+
+def create_git_server(
+ backend: str,
+ run_id: str,
+ *,
+ app: modal.App | None = None,
+ timeout: int = 3600,
+ # GCP-specific options
+ project_id: str | None = None,
+ zone: str = "us-central1-a",
+ machine_type: str = "e2-micro",
+ network: str | None = None,
+) -> ModalGitServer | DockerGitServer | GCPGitServer:
+ """Create a git server for the specified backend.
+
+ Args:
+ backend: Backend name ("modal", "docker", or "gcp")
+ run_id: Unique run identifier
+ app: Modal app (required for modal backend)
+ timeout: Server timeout in seconds
+ project_id: GCP project ID (gcp backend only)
+ zone: GCP zone (gcp backend only, default: us-central1-a)
+ machine_type: GCP machine type (gcp backend only, default: e2-micro)
+ network: VPC network name (gcp backend only, for agent connectivity)
+
+ Returns:
+ Git server instance ready to accept connections
+
+ Example:
+ # Docker backend
+ server = create_git_server("docker", run_id="my-run")
+
+ # Modal backend
+ app = modal.App.lookup("cooperbench", create_if_missing=True)
+ server = create_git_server("modal", run_id="my-run", app=app)
+
+ # GCP backend
+ server = create_git_server(
+ "gcp",
+ run_id="my-run",
+ project_id="my-project",
+ network="cooperbench-vpc",
+ )
+
+ # Use the server
+ print(server.url)
+ # ... agents push/pull ...
+ server.cleanup()
+ """
+ if backend == "docker":
+ return DockerGitServer.create(run_id=run_id, timeout=timeout)
+ elif backend == "modal":
+ if app is None:
+ raise ValueError("Modal backend requires 'app' parameter")
+ return ModalGitServer.create(app=app, run_id=run_id, timeout=timeout)
+ elif backend == "gcp":
+ return GCPGitServer.create(
+ run_id=run_id,
+ project_id=project_id,
+ zone=zone,
+ machine_type=machine_type,
+ network=network,
+ timeout=timeout,
+ )
+ else:
+ available = "docker, modal, gcp"
+ raise ValueError(f"Unknown git server backend: '{backend}'. Available: {available}")
diff --git a/src/cooperbench/agents/mini_swe_agent_v2/connectors/git_servers/base.py b/src/cooperbench/agents/mini_swe_agent_v2/connectors/git_servers/base.py
new file mode 100644
index 0000000..2dc4e9a
--- /dev/null
+++ b/src/cooperbench/agents/mini_swe_agent_v2/connectors/git_servers/base.py
@@ -0,0 +1,24 @@
+"""Base protocol for git servers."""
+
+from typing import Protocol
+
+
class GitServer(Protocol):
    """Structural interface for a shared git server.

    Implementations (Docker container, Modal sandbox, GCP VM, ...) expose a
    single repository that collaborating agents use as a common remote for
    pushing and pulling. Any object with these members satisfies the protocol.
    """

    @property
    def url(self) -> str:
        """Remote URL of the shared repository.

        Returns:
            Git URL of the repository, e.g. ``git://hostname:port/repo.git``.
        """
        ...

    def cleanup(self) -> None:
        """Release every resource backing the server (container/sandbox/VM)."""
        ...
diff --git a/src/cooperbench/agents/mini_swe_agent_v2/connectors/git_servers/docker.py b/src/cooperbench/agents/mini_swe_agent_v2/connectors/git_servers/docker.py
new file mode 100644
index 0000000..8e32b2e
--- /dev/null
+++ b/src/cooperbench/agents/mini_swe_agent_v2/connectors/git_servers/docker.py
@@ -0,0 +1,175 @@
+"""Docker-based git server for code collaboration."""
+
+from __future__ import annotations
+
+import logging
+import time
+
+import docker
+
+
class DockerGitServer:
    """Shared git server container for code collaboration using Docker.

    Creates a Docker container running git-daemon that agents can push/pull to.
    A dedicated bridge network is created per run so agent containers can reach
    the daemon by IP or container name; cleanup() removes both container and
    network (and create() now removes them on any setup failure too).
    """

    def __init__(self, container, hostname: str, port: int, network_name: str):
        """Initialize with an existing container.

        Use DockerGitServer.create() to create a new server.

        Args:
            container: Running docker-py container object hosting git-daemon
            hostname: Hostname/IP agents use to reach the daemon
            port: TCP port of the git daemon (9418)
            network_name: Docker bridge network shared with agent containers
        """
        self._container = container
        self._hostname = hostname
        self._port = port
        self._network_name = network_name
        self._logger = logging.getLogger("cooperbench.agents.mini_swe_agent_v2.git_server.docker")

    @classmethod
    def create(
        cls,
        run_id: str,
        timeout: int = 3600,
    ) -> DockerGitServer:
        """Create and start a git server container.

        Args:
            run_id: Unique run identifier (for container naming)
            timeout: Container timeout in seconds (not enforced, for compatibility)

        Returns:
            DockerGitServer instance ready to accept connections

        Raises:
            RuntimeError: If the container fails to start or the git daemon
                port mapping cannot be determined. Partially created resources
                (container AND per-run network) are removed before raising.
        """
        logger = logging.getLogger("cooperbench.agents.mini_swe_agent_v2.git_server.docker")
        logger.debug(f"Creating docker git server for run {run_id}")

        client = docker.from_env()

        # Use a simple Debian-based image with git
        image = "debian:bookworm-slim"

        # Pull image if not present
        try:
            client.images.get(image)
        except docker.errors.ImageNotFound:
            logger.debug(f"Pulling image {image}")
            client.images.pull(image)

        # Create or get shared network for git server and agents
        network_name = f"cooperbench-git-{run_id}"
        try:
            client.networks.get(network_name)
        except docker.errors.NotFound:
            client.networks.create(network_name, driver="bridge")

        # Container name based on run_id
        container_name = f"cooperbench-git-{run_id}"

        # Remove existing container if it exists
        try:
            old_container = client.containers.get(container_name)
            old_container.remove(force=True)
        except docker.errors.NotFound:
            pass

        # Create and start container with initialization script
        # The script initializes the repo, then starts git daemon in foreground to keep container alive
        init_script = """#!/bin/bash
set -e
apt-get update -qq
apt-get install -y -qq git > /dev/null 2>&1
mkdir -p /git/repo.git
cd /git/repo.git
git init --bare
git config receive.denyCurrentBranch ignore
touch git-daemon-export-ok
exec git daemon --reuseaddr --export-all --enable=receive-pack --base-path=/git --listen=0.0.0.0 /git
"""

        container = client.containers.run(
            image=image,
            command=["bash", "-c", init_script],
            name=container_name,
            detach=True,
            network=network_name,
            ports={"9418/tcp": None},  # Auto-assign port for host access
            remove=False,
        )

        try:
            # Wait for container to start and git daemon to initialize
            time.sleep(3)

            # A single reload refreshes status, port bindings, and network info.
            container.reload()
            if container.status != "running":
                logs = container.logs().decode("utf-8", errors="replace")
                raise RuntimeError(f"Git server container failed to start. Logs: {logs}")

            # Verify the daemon port was published for host access
            port_bindings = container.attrs.get("NetworkSettings", {}).get("Ports", {})
            if "9418/tcp" not in port_bindings or not port_bindings["9418/tcp"]:
                raise RuntimeError("Failed to get port mapping for git daemon")

            # Prefer the container's IP on the shared network for
            # inter-container communication; fall back to the container name
            # (Docker DNS resolves it on the same network).
            networks = container.attrs.get("NetworkSettings", {}).get("Networks", {})
            container_ip = networks.get(network_name, {}).get("IPAddress")
            hostname = container_ip or container_name
        except Exception:
            # BUGFIX: previously only the container was removed on failure,
            # leaking the per-run bridge network. Remove both, best-effort,
            # then re-raise the original error.
            try:
                container.remove(force=True)
            except Exception:
                pass
            try:
                client.networks.get(network_name).remove()
            except Exception:
                pass
            raise

        logger.debug(f"Git server ready at git://{hostname}:9418 (network: {network_name})")

        return cls(container=container, hostname=hostname, port=9418, network_name=network_name)

    @property
    def url(self) -> str:
        """Git URL for agents to use as remote.

        Returns:
            Git URL for the repository (git://hostname:port/repo.git)
        """
        return f"git://{self._hostname}:{self._port}/repo.git"

    @property
    def network_name(self) -> str:
        """Docker network name for agent containers to join."""
        return self._network_name

    def cleanup(self) -> None:
        """Stop and remove the git server container and network (best-effort)."""
        if self._container:
            try:
                self._container.stop(timeout=5)
            except Exception:
                pass
            try:
                self._container.remove(force=True)
            except Exception:
                pass
            self._container = None

        # Clean up network
        if hasattr(self, "_network_name") and self._network_name:
            try:
                client = docker.from_env()
                try:
                    network = client.networks.get(self._network_name)
                    network.remove()
                except docker.errors.NotFound:
                    pass
            except Exception:
                pass
diff --git a/src/cooperbench/agents/mini_swe_agent_v2/connectors/git_servers/gcp.py b/src/cooperbench/agents/mini_swe_agent_v2/connectors/git_servers/gcp.py
new file mode 100644
index 0000000..a3e41f5
--- /dev/null
+++ b/src/cooperbench/agents/mini_swe_agent_v2/connectors/git_servers/gcp.py
@@ -0,0 +1,544 @@
+"""GCP VM-based git server for code collaboration."""
+
+from __future__ import annotations
+
+import logging
+import subprocess
+import time
+import uuid
+
+
class GCPGitServer:
    """Shared git server on GCP VM for code collaboration.

    Creates a GCP VM running git-daemon that agents can push/pull to.
    Supports VPC networking for agents on the same network.

    Example:
        server = GCPGitServer.create(
            run_id="my-run",
            project_id="my-project",
            network="cooperbench-vpc",
        )
        print(server.url)  # git://10.128.0.5:9418/repo.git
        print(server.network_name)  # cooperbench-vpc
        # ... agents push/pull ...
        server.cleanup()
    """

    def __init__(
        self,
        *,
        vm_name: str,
        project_id: str,
        zone: str,
        network: str | None,
        git_url: str,
    ):
        """Initialize with existing VM info.

        Use GCPGitServer.create() to create a new server.

        Args:
            vm_name: Name of the Compute Engine instance
            project_id: GCP project the instance lives in
            zone: GCP zone of the instance
            network: VPC network name, or None when using the external IP
            git_url: git:// URL of the repo (empty until create() fills it in)
        """
        self._vm_name = vm_name
        self._project_id = project_id
        self._zone = zone
        self._network = network
        self._git_url = git_url
        # Track which cloud resources actually exist so cleanup() only deletes
        # what was created; set as soon as the corresponding insert() succeeds.
        self._vm_created = False
        self._firewall_created = False
        # Compute client is created lazily in _get_compute_client().
        self._compute_client = None
        self._logger = logging.getLogger("cooperbench.agents.mini_swe_agent_v2.git_server.gcp")

    @classmethod
    def create(
        cls,
        run_id: str,
        *,
        project_id: str | None = None,
        zone: str = "us-central1-a",
        machine_type: str = "e2-micro",
        network: str | None = None,
        timeout: int = 3600,
    ) -> GCPGitServer:
        """Create and start a git server VM.

        Args:
            run_id: Unique run identifier
            project_id: GCP project ID (defaults to env/gcloud config)
            zone: GCP zone for the VM
            machine_type: VM machine type (e2-micro is smallest/cheapest)
            network: VPC network name for agent connectivity (None for external IP)
            timeout: VM timeout in seconds (not enforced, for reference)

        Returns:
            GCPGitServer instance ready to accept connections
        """
        logger = logging.getLogger("cooperbench.agents.mini_swe_agent_v2.git_server.gcp")
        logger.debug(f"Creating GCP git server for run {run_id}")

        # Resolve project ID
        resolved_project_id = project_id or cls._get_default_project()
        if not resolved_project_id:
            raise ValueError(
                "project_id required. Set via parameter, GOOGLE_CLOUD_PROJECT env var, "
                "or gcloud config set project "
            )

        # Generate unique VM name (sanitize run_id for GCP naming rules)
        sanitized_run_id = run_id.replace("_", "-").lower()[:20]
        vm_name = f"cooperbench-git-{sanitized_run_id}-{uuid.uuid4().hex[:8]}"

        instance = cls(
            vm_name=vm_name,
            project_id=resolved_project_id,
            zone=zone,
            network=network,
            git_url="",  # Will be set after VM is created
        )

        # Create VM and get IP (ordering matters: insert, then SSH/daemon
        # readiness, then firewall, then IP lookup for the final URL)
        instance._create_git_server_vm(machine_type)
        instance._wait_for_ssh()

        # Create firewall rule to allow git protocol traffic
        instance._create_firewall_rule()

        # Get the git URL (internal IP if VPC, external IP otherwise)
        ip_address = instance._get_vm_ip(use_internal=network is not None)
        instance._git_url = f"git://{ip_address}:9418/repo.git"

        logger.debug(f"Git server ready at {instance._git_url}")
        return instance

    @staticmethod
    def _get_default_project() -> str | None:
        """Get default GCP project: env vars, then cooperbench config, then gcloud CLI."""
        import os

        # Try environment variable first
        project = os.environ.get("GOOGLE_CLOUD_PROJECT") or os.environ.get("GCLOUD_PROJECT")
        if project:
            return project

        # Try cooperbench config
        try:
            from cooperbench.config import ConfigManager

            config = ConfigManager()
            project = config.get("gcp_project_id")
            if project:
                return project
        except Exception:
            pass

        # Try gcloud config
        try:
            result = subprocess.run(
                ["gcloud", "config", "get-value", "project"],
                capture_output=True,
                text=True,
                timeout=10,
            )
            if result.returncode == 0 and result.stdout.strip():
                return result.stdout.strip()
        except Exception:
            pass

        return None

    def _get_compute_client(self):
        """Get or create Compute Engine client (lazy, cached per instance)."""
        if self._compute_client is None:
            from google.cloud import compute_v1

            self._compute_client = compute_v1.InstancesClient()
        return self._compute_client

    def _create_git_server_vm(self, machine_type: str):
        """Create the GCP VM with git-daemon."""
        from google.cloud import compute_v1

        self._logger.debug(f"Creating GCP VM {self._vm_name} in {self._zone}")

        client = self._get_compute_client()

        # Startup script to install git and start git-daemon
        startup_script = """#!/bin/bash
set -e

# Install git
apt-get update && apt-get install -y git

# Create bare repo
mkdir -p /git/repo.git
cd /git/repo.git
git init --bare
git config receive.denyCurrentBranch ignore
touch git-daemon-export-ok

# Start git daemon
# --enable=receive-pack allows pushing
# --export-all exports all repos
# --base-path=/git means URL /repo.git maps to /git/repo.git
# --reuseaddr allows quick restarts
git daemon \
    --reuseaddr \
    --export-all \
    --enable=receive-pack \
    --base-path=/git \
    --listen=0.0.0.0 \
    /git &

echo "Git daemon started"
"""

        # Build instance configuration
        instance = compute_v1.Instance()
        instance.name = self._vm_name
        instance.machine_type = f"zones/{self._zone}/machineTypes/{machine_type}"

        # Boot disk with Debian
        disk = compute_v1.AttachedDisk()
        disk.auto_delete = True
        disk.boot = True
        disk.type_ = "PERSISTENT"

        disk_init = compute_v1.AttachedDiskInitializeParams()
        disk_init.disk_size_gb = 10  # Small disk for git server
        disk_init.source_image = "projects/debian-cloud/global/images/family/debian-11"
        disk.initialize_params = disk_init
        instance.disks = [disk]

        # Network interface
        # NOTE(review): no subnetwork is specified — this works for auto-mode
        # VPCs; custom-mode networks may require an explicit subnetwork. Confirm.
        network_interface = compute_v1.NetworkInterface()
        if self._network:
            network_interface.network = f"projects/{self._project_id}/global/networks/{self._network}"
        else:
            network_interface.network = f"projects/{self._project_id}/global/networks/default"

        # External IP for SSH access (and git access if no VPC)
        access_config = compute_v1.AccessConfig()
        access_config.name = "External NAT"
        access_config.type_ = "ONE_TO_ONE_NAT"
        network_interface.access_configs = [access_config]
        instance.network_interfaces = [network_interface]

        # Service account with minimal scopes
        service_account = compute_v1.ServiceAccount()
        service_account.email = "default"
        service_account.scopes = [
            "https://www.googleapis.com/auth/logging.write",
        ]
        instance.service_accounts = [service_account]

        # Metadata with startup script
        metadata = compute_v1.Metadata()
        metadata.items = [
            compute_v1.Items(key="startup-script", value=startup_script),
        ]
        instance.metadata = metadata

        # Tags for firewall rules — must match target_tags in
        # _create_firewall_rule(). (The "cooperbench-git-" prefix is duplicated
        # since vm_name already carries it; harmless, but keep both in sync.)
        tags = compute_v1.Tags()
        tags.items = [f"cooperbench-git-{self._vm_name}"]
        instance.tags = tags

        # Create the instance
        request = compute_v1.InsertInstanceRequest()
        request.project = self._project_id
        request.zone = self._zone
        request.instance_resource = instance

        operation = client.insert(request=request)

        # Mark VM as created immediately after insert succeeds
        # This ensures cleanup() will delete the VM even if _wait_for_operation times out
        self._vm_created = True

        # Wait for operation to complete
        self._wait_for_operation(operation.name)
        self._logger.debug(f"VM {self._vm_name} created successfully")

    def _wait_for_operation(self, operation_name: str, timeout: int = 300):
        """Wait for a zone operation to complete (polls every 2s)."""
        from google.cloud import compute_v1

        client = compute_v1.ZoneOperationsClient()
        start_time = time.time()

        while time.time() - start_time < timeout:
            operation = client.get(
                project=self._project_id,
                zone=self._zone,
                operation=operation_name,
            )

            if operation.status == compute_v1.Operation.Status.DONE:
                if operation.error:
                    errors = [e.message for e in operation.error.errors]
                    raise RuntimeError(f"VM operation failed: {errors}")
                return

            time.sleep(2)

        raise TimeoutError(f"Operation {operation_name} timed out after {timeout}s")

    def _wait_for_global_operation(self, operation_name: str, timeout: int = 300):
        """Wait for a global operation to complete (polls every 2s)."""
        from google.cloud import compute_v1

        client = compute_v1.GlobalOperationsClient()
        start_time = time.time()

        while time.time() - start_time < timeout:
            operation = client.get(
                project=self._project_id,
                operation=operation_name,
            )

            if operation.status == compute_v1.Operation.Status.DONE:
                if operation.error:
                    errors = [e.message for e in operation.error.errors]
                    raise RuntimeError(f"Global operation failed: {errors}")
                return

            time.sleep(2)

        raise TimeoutError(f"Operation {operation_name} timed out after {timeout}s")

    def _wait_for_ssh(self, timeout: int = 180):
        """Wait for SSH to become available on the VM.

        Also waits for the startup script's git-daemon via _wait_for_git_daemon
        before returning.
        """
        self._logger.debug(f"Waiting for SSH on {self._vm_name}...")
        start_time = time.time()

        while time.time() - start_time < timeout:
            try:
                # Try a simple SSH command
                result = subprocess.run(
                    [
                        "gcloud",
                        "compute",
                        "ssh",
                        self._vm_name,
                        f"--zone={self._zone}",
                        f"--project={self._project_id}",
                        "--command=echo ready",
                        "--quiet",
                        "--strict-host-key-checking=no",
                    ],
                    capture_output=True,
                    text=True,
                    timeout=30,
                )
                if result.returncode == 0 and "ready" in result.stdout:
                    self._logger.debug("SSH is ready")
                    # Wait a bit more for startup script to complete
                    self._wait_for_git_daemon()
                    return
            except subprocess.TimeoutExpired:
                pass
            except Exception as e:
                self._logger.debug(f"SSH not ready yet: {e}")

            time.sleep(5)

        raise TimeoutError(f"SSH not available on {self._vm_name} after {timeout}s")

    def _wait_for_git_daemon(self, timeout: int = 120):
        """Wait for git-daemon to start on the VM (checked via pgrep over SSH)."""
        self._logger.debug("Waiting for git-daemon to start...")
        start_time = time.time()

        while time.time() - start_time < timeout:
            try:
                result = subprocess.run(
                    [
                        "gcloud",
                        "compute",
                        "ssh",
                        self._vm_name,
                        f"--zone={self._zone}",
                        f"--project={self._project_id}",
                        "--command=pgrep -f git-daemon",
                        "--quiet",
                        "--strict-host-key-checking=no",
                    ],
                    capture_output=True,
                    text=True,
                    timeout=30,
                )
                if result.returncode == 0:
                    self._logger.debug("git-daemon is running")
                    return
            except Exception as e:
                self._logger.debug(f"git-daemon not ready yet: {e}")

            time.sleep(5)

        raise TimeoutError(f"git-daemon not started on {self._vm_name} after {timeout}s")

    def _create_firewall_rule(self):
        """Create firewall rule to allow git protocol traffic."""
        from google.cloud import compute_v1

        firewall_name = f"cooperbench-git-{self._vm_name}"
        self._logger.debug(f"Creating firewall rule {firewall_name}")

        client = compute_v1.FirewallsClient()

        firewall = compute_v1.Firewall()
        firewall.name = firewall_name

        # Use specified VPC or default network
        network_name = self._network or "default"
        firewall.network = f"projects/{self._project_id}/global/networks/{network_name}"

        # Allow TCP port 9418 (git daemon)
        # ("I_p_protocol" is the generated proto attribute name for IPProtocol)
        allowed = compute_v1.Allowed()
        allowed.I_p_protocol = "tcp"
        allowed.ports = ["9418"]
        firewall.allowed = [allowed]

        # Source ranges: internal only for VPC, anywhere for external mode
        if self._network:
            firewall.source_ranges = ["10.0.0.0/8"]
        else:
            firewall.source_ranges = ["0.0.0.0/0"]

        # Target only this VM (tag set in _create_git_server_vm)
        firewall.target_tags = [f"cooperbench-git-{self._vm_name}"]

        request = compute_v1.InsertFirewallRequest()
        request.project = self._project_id
        request.firewall_resource = firewall

        operation = client.insert(request=request)

        # Mark firewall as created immediately
        self._firewall_created = True

        # Wait for operation
        self._wait_for_global_operation(operation.name)
        self._logger.debug(f"Firewall rule {firewall_name} created")

    def _get_vm_ip(self, use_internal: bool = False) -> str:
        """Get the IP address of the VM.

        Args:
            use_internal: If True, return internal IP (for VPC). Otherwise external.

        Returns:
            IP address string
        """
        from google.cloud import compute_v1

        client = self._get_compute_client()

        request = compute_v1.GetInstanceRequest()
        request.project = self._project_id
        request.zone = self._zone
        request.instance = self._vm_name

        instance = client.get(request=request)

        if not instance.network_interfaces:
            raise RuntimeError(f"VM {self._vm_name} has no network interfaces")

        network_interface = instance.network_interfaces[0]

        if use_internal:
            # Return internal IP ("network_i_p" is the generated proto field)
            return network_interface.network_i_p
        else:
            # Return external IP ("nat_i_p" is the generated proto field)
            if not network_interface.access_configs:
                raise RuntimeError(f"VM {self._vm_name} has no external IP")
            return network_interface.access_configs[0].nat_i_p

    @property
    def url(self) -> str:
        """Git URL for agents to use as remote.

        Returns:
            Git URL for the repository (git://IP:9418/repo.git)
        """
        return self._git_url

    @property
    def network_name(self) -> str | None:
        """VPC network name for agents to join.

        Returns:
            Network name or None if using external IP
        """
        return self._network

    def cleanup(self) -> None:
        """Delete the VM and firewall rule."""
        # Delete firewall rule first
        if self._firewall_created:
            self._delete_firewall_rule()

        # Delete VM
        if self._vm_created:
            self._delete_vm()

    def _delete_firewall_rule(self):
        """Delete the firewall rule (best-effort; failures are logged)."""
        from google.cloud import compute_v1

        firewall_name = f"cooperbench-git-{self._vm_name}"
        self._logger.debug(f"Deleting firewall rule {firewall_name}")

        try:
            client = compute_v1.FirewallsClient()

            request = compute_v1.DeleteFirewallRequest()
            request.project = self._project_id
            request.firewall = firewall_name

            operation = client.delete(request=request)

            try:
                self._wait_for_global_operation(operation.name, timeout=120)
            except TimeoutError:
                self._logger.warning(f"Firewall deletion timed out: {firewall_name}")

            self._firewall_created = False
            self._logger.debug(f"Firewall rule {firewall_name} deleted")
        except Exception as e:
            self._logger.warning(f"Failed to delete firewall rule {firewall_name}: {e}")

    def _delete_vm(self):
        """Delete the VM (best-effort; failures are logged)."""
        self._logger.debug(f"Deleting VM {self._vm_name}")

        try:
            from google.cloud import compute_v1

            client = self._get_compute_client()

            request = compute_v1.DeleteInstanceRequest()
            request.project = self._project_id
            request.zone = self._zone
            request.instance = self._vm_name

            operation = client.delete(request=request)

            try:
                self._wait_for_operation(operation.name, timeout=120)
            except TimeoutError:
                # VM deletion initiated, will complete asynchronously
                pass

            self._vm_created = False
            self._logger.debug(f"VM {self._vm_name} deleted")
        except Exception as e:
            self._logger.warning(f"Failed to delete VM {self._vm_name}: {e}")

    def __del__(self):
        """Ensure cleanup on garbage collection.

        Best-effort: exceptions must never propagate out of GC.
        """
        try:
            self.cleanup()
        except Exception:
            pass
diff --git a/src/cooperbench/agents/mini_swe_agent_v2/connectors/git_servers/modal.py b/src/cooperbench/agents/mini_swe_agent_v2/connectors/git_servers/modal.py
new file mode 100644
index 0000000..fc8fc74
--- /dev/null
+++ b/src/cooperbench/agents/mini_swe_agent_v2/connectors/git_servers/modal.py
@@ -0,0 +1,139 @@
+"""Modal-based git server for code collaboration."""
+
+from __future__ import annotations
+
+import logging
+import time
+
+import modal
+
+
class ModalGitServer:
    """Shared git server sandbox for code collaboration using Modal.

    Creates a Modal sandbox running git-daemon that agents can push/pull to,
    reachable through the sandbox's unencrypted TCP tunnel on port 9418.
    """

    def __init__(self, sandbox: modal.Sandbox, hostname: str):
        """Initialize with an existing sandbox.

        Use ModalGitServer.create() to create a new server.

        Args:
            sandbox: Running Modal sandbox hosting git-daemon
            hostname: "host:port" of the unencrypted tunnel endpoint
        """
        self._sandbox = sandbox
        self._hostname = hostname
        self._logger = logging.getLogger("cooperbench.agents.mini_swe_agent_v2.git_server.modal")

    @classmethod
    def create(
        cls,
        app: modal.App,
        run_id: str,
        timeout: int = 3600,
    ) -> ModalGitServer:
        """Create and start a git server sandbox.

        Args:
            app: Modal app to create sandbox in
            run_id: Unique run identifier (for logging)
            timeout: Sandbox timeout in seconds

        Returns:
            ModalGitServer instance ready to accept connections

        Raises:
            RuntimeError: If repo init fails or no tunnel is available for
                port 9418. The sandbox is terminated before raising so it does
                not keep running (and billing) until its timeout.
        """
        logger = logging.getLogger("cooperbench.agents.mini_swe_agent_v2.git_server.modal")
        logger.debug(f"Creating git server for run {run_id}")

        # Image with git
        image = modal.Image.debian_slim().run_commands(
            "apt-get update && apt-get install -y git",
        )

        # Create sandbox with port 9418 exposed for git daemon (unencrypted TCP)
        sandbox = modal.Sandbox.create(
            image=image,
            app=app,
            timeout=timeout,
            unencrypted_ports=[9418],  # Expose git daemon port via TCP tunnel
        )

        # BUGFIX: everything after sandbox creation is wrapped so that a setup
        # failure terminates the sandbox instead of leaking it until timeout.
        try:
            # Initialize bare repo in /git/repo.git
            proc = sandbox.exec(
                "bash",
                "-c",
                """
                set -e
                mkdir -p /git/repo.git
                cd /git/repo.git
                git init --bare
                git config receive.denyCurrentBranch ignore
                touch git-daemon-export-ok
                """,
            )
            proc.wait()
            if proc.returncode != 0:
                raise RuntimeError(f"Failed to init git repo: {proc.stderr.read()}")

            # Start git daemon in background
            # --enable=receive-pack allows pushing
            # --export-all exports all repos
            # --base-path=/git means URL /repo.git maps to /git/repo.git
            # --listen=0.0.0.0 to accept connections from tunnel
            proc = sandbox.exec(
                "bash",
                "-c",
                """
                git daemon \
                    --reuseaddr \
                    --export-all \
                    --enable=receive-pack \
                    --base-path=/git \
                    --listen=0.0.0.0 \
                    /git &

                # Wait for daemon to start
                sleep 1
                echo "Git daemon started"
                """,
            )
            proc.stdout.read()
            proc.wait()

            # Give daemon time to fully initialize
            time.sleep(1)

            # Get the tunnel URL for port 9418
            tunnels = sandbox.tunnels()

            if tunnels and 9418 in tunnels:
                tunnel = tunnels[9418]
                # Use the unencrypted endpoint for git protocol
                # Tunnel has: host, port (encrypted), unencrypted_host, unencrypted_port
                hostname = f"{tunnel.unencrypted_host}:{tunnel.unencrypted_port}"
                logger.debug(f"Using unencrypted tunnel: {hostname}")
            else:
                raise RuntimeError(f"Failed to get tunnel for port 9418. Available tunnels: {tunnels}")
        except BaseException:
            # Best-effort teardown; the original failure is re-raised.
            try:
                sandbox.terminate()
            except Exception:
                pass
            raise

        logger.debug(f"Git server ready at git://{hostname}")

        return cls(sandbox=sandbox, hostname=hostname)

    @property
    def url(self) -> str:
        """Git URL for agents to use as remote.

        Returns:
            Git URL for the repository (git://hostname/repo.git)
        """
        return f"git://{self._hostname}/repo.git"

    def cleanup(self) -> None:
        """Terminate the git server sandbox (best-effort)."""
        if self._sandbox:
            try:
                self._sandbox.terminate()
            except Exception:
                pass
+
+
# Backwards-compatibility alias so existing imports of `GitServer` from this
# module keep working; new code should use the explicit `ModalGitServer` name
# (the backend-agnostic protocol of the same name lives in .base).
GitServer = ModalGitServer
diff --git a/src/cooperbench/agents/mini_swe_agent_v2/connectors/messaging.py b/src/cooperbench/agents/mini_swe_agent_v2/connectors/messaging.py
new file mode 100644
index 0000000..59dd1c3
--- /dev/null
+++ b/src/cooperbench/agents/mini_swe_agent_v2/connectors/messaging.py
@@ -0,0 +1,113 @@
+"""Redis-based mailbox messaging between agents.
+
+Provides simple send/receive messaging via Redis lists. Each agent has an inbox
+that other agents can push messages to.
+
+Example:
+ connector = MessagingConnector(
+ agent_id="agent1",
+ agents=["agent1", "agent2"],
+ url="redis://localhost:6379#run:abc123"
+ )
+
+ # Send to specific agent
+ connector.send("agent2", "I found a bug in auth.py")
+
+ # Receive pending messages
+ messages = connector.receive()
+
+ # Broadcast to all
+ connector.broadcast("I'm starting on the API changes")
+"""
+
+import json
+from datetime import datetime
+from typing import Any
+
+import redis
+
+
class MessagingConnector:
    """Redis-backed mailbox messaging between collaborating agents.

    Every agent owns one Redis list (its inbox); peers push JSON-encoded
    messages onto it and the owner drains it with receive().
    """

    def __init__(self, agent_id: str, agents: list[str], url: str = "redis://localhost:6379"):
        """Set up the connector and clear this agent's inbox.

        Args:
            agent_id: This agent's unique identifier (e.g., "agent1")
            agents: List of all agent IDs in the collaboration
            url: Redis URL. Supports namespacing via #prefix (e.g., "redis://host:6379#run:abc")
        """
        self.agent_id = agent_id
        self.agents = agents

        # A "#namespace" fragment on the URL prefixes all keys for this run.
        base, sep, namespace = url.partition("#")
        self._prefix = f"{namespace}:" if sep else ""

        self._client = redis.from_url(base)
        self._inbox_key = f"{self._prefix}{agent_id}:inbox"

        # Drop any stale messages left over from a previous run.
        self._client.delete(self._inbox_key)

    def setup(self, env: Any) -> None:
        """No-op: messaging is pure Redis and needs no sandbox configuration.

        Kept for interface parity with connectors that do configure the
        agent's environment.

        Args:
            env: The agent's environment (unused)
        """
        return None

    def send(self, recipient: str, content: str) -> None:
        """Push a message onto another agent's inbox.

        Args:
            recipient: Target agent's ID
            content: Message content
        """
        payload = json.dumps(
            {
                "from": self.agent_id,
                "to": recipient,
                "content": content,
                "timestamp": datetime.now().isoformat(),
            }
        )
        self._client.rpush(f"{self._prefix}{recipient}:inbox", payload)

    def receive(self) -> list[dict]:
        """Drain and return every pending message from this agent's inbox.

        Returns:
            List of message dicts with from, to, content, timestamp
        """
        drained: list[dict] = []
        while (raw := self._client.lpop(self._inbox_key)) is not None:
            drained.append(json.loads(raw))
        return drained

    def broadcast(self, content: str) -> None:
        """Send a message to every other agent in the collaboration.

        Args:
            content: Message content
        """
        for peer in self.agents:
            if peer == self.agent_id:
                continue
            self.send(peer, content)

    def peek(self) -> int:
        """Count pending messages without removing them.

        Returns:
            Number of pending messages
        """
        return self._client.llen(self._inbox_key)
diff --git a/src/cooperbench/agents/mini_swe_agent_v2/environments/__init__.py b/src/cooperbench/agents/mini_swe_agent_v2/environments/__init__.py
new file mode 100644
index 0000000..2e22b4e
--- /dev/null
+++ b/src/cooperbench/agents/mini_swe_agent_v2/environments/__init__.py
@@ -0,0 +1 @@
+"""Environment implementations for mini-SWE-agent v2."""
diff --git a/src/cooperbench/agents/mini_swe_agent_v2/environments/docker.py b/src/cooperbench/agents/mini_swe_agent_v2/environments/docker.py
new file mode 100644
index 0000000..f651783
--- /dev/null
+++ b/src/cooperbench/agents/mini_swe_agent_v2/environments/docker.py
@@ -0,0 +1,161 @@
+import logging
+import os
+import platform
+import shlex
+import subprocess
+import uuid
+from typing import Any
+
+from pydantic import BaseModel
+
+from cooperbench.agents.mini_swe_agent_v2.exceptions import Submitted
+from cooperbench.agents.mini_swe_agent_v2.utils.serialize import recursive_merge
+
+
class DockerEnvironmentConfig(BaseModel):
    """Configuration for DockerEnvironment (validated by pydantic).

    Mutable defaults below are safe: pydantic deep-copies field defaults per
    model instance, unlike plain Python default arguments.
    """

    # Docker image to run (required — no default).
    image: str
    cwd: str = "/"
    """Working directory in which to execute commands."""
    env: dict[str, str] = {}
    """Environment variables to set in the container."""
    forward_env: list[str] = []
    """Environment variables to forward to the container.
    Variables are only forwarded if they are set in the host environment.
    In case of conflict with `env`, the `env` variables take precedence.
    """
    timeout: int = 30
    """Timeout for executing commands in the container."""
    executable: str = os.getenv("MSWEA_DOCKER_EXECUTABLE", "docker")
    """Path to the docker/container executable."""
    run_args: list[str] = ["--rm"]
    """Additional arguments to pass to the docker/container executable.
    Default is ["--rm"], which removes the container after it exits.
    """
    container_timeout: str = "2h"
    """Max duration to keep container running. Uses the same format as the sleep command."""
    pull_timeout: int = 120
    """Timeout in seconds for pulling images."""
    interpreter: list[str] = ["bash", "-lc"]
    """Interpreter to use to execute commands. Default is ["bash", "-lc"].
    The actual command will be appended as argument to this. Override this to e.g., modify shell flags
    (e.g., to remove the `-l` flag to disable login shell) or to use python instead of bash to interpret commands.
    """
+
+
class DockerEnvironment:
    def __init__(
        self,
        *,
        config_class: type = DockerEnvironmentConfig,
        logger: logging.Logger | None = None,
        **kwargs,
    ):
        """This class executes bash commands in a Docker container using direct docker commands.
        See `DockerEnvironmentConfig` for keyword arguments.
        """
        self.logger = logger or logging.getLogger("mini_swe_agent_v2.environment")
        # Assigned before config parsing so cleanup()/__del__ remain safe even
        # if config validation or container startup fails part-way.
        self.container_id: str | None = None
        self.config = config_class(**kwargs)
        self._start_container()

    def get_template_vars(self, **kwargs) -> dict[str, Any]:
        """Template variables: config fields, host platform info, and caller kwargs.

        Merged via `recursive_merge` — presumably later sources win on
        conflict; confirm in utils.serialize.
        """
        return recursive_merge(self.config.model_dump(), platform.uname()._asdict(), kwargs)

    def serialize(self) -> dict:
        """Return a JSON-serializable description of this environment's config."""
        return {
            "info": {
                "config": {
                    "environment": self.config.model_dump(mode="json"),
                    "environment_type": f"{self.__class__.__module__}.{self.__class__.__name__}",
                }
            }
        }

    def _start_container(self):
        """Start a detached container and record its ID in self.container_id.

        The container just sleeps for `container_timeout`; commands are later
        injected with `docker exec` (see execute()).
        """
        container_name = f"minisweagent-{uuid.uuid4().hex[:8]}"
        cmd = [
            self.config.executable,
            "run",
            "-d",
            "--name",
            container_name,
            "-w",
            self.config.cwd,
            *self.config.run_args,
            self.config.image,
            "sleep",
            self.config.container_timeout,
        ]
        self.logger.debug(f"Starting container with command: {shlex.join(cmd)}")
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=self.config.pull_timeout,  # docker pull might take a while
            check=True,
        )
        self.logger.info(f"Started container {container_name} with ID {result.stdout.strip()}")
        self.container_id = result.stdout.strip()

    def execute(self, action: dict, cwd: str = "", *, timeout: int | None = None) -> dict[str, Any]:
        """Execute a command in the Docker container and return the result as a dict.

        Args:
            action: Dict with a "command" key holding the command string.
            cwd: Working directory for the command (defaults to config.cwd).
            timeout: Per-command timeout in seconds (defaults to config.timeout).

        Returns:
            Dict with "output" (stdout and stderr interleaved), "returncode"
            (-1 if an exception such as a timeout occurred) and
            "exception_info" (empty string on success).

        Raises:
            Submitted: If the output starts with the submission marker
                (see _check_finished).
        """
        command = action.get("command", "")
        cwd = cwd or self.config.cwd
        assert self.container_id, "Container not started"

        cmd = [self.config.executable, "exec", "-w", cwd]
        # forward_env first, then env — so `env` entries appear later on the
        # command line (env takes precedence, per the config docstring).
        for key in self.config.forward_env:
            if (value := os.getenv(key)) is not None:
                cmd.extend(["-e", f"{key}={value}"])
        for key, value in self.config.env.items():
            cmd.extend(["-e", f"{key}={value}"])
        cmd.extend([self.container_id, *self.config.interpreter, command])

        try:
            result = subprocess.run(
                cmd,
                text=True,
                timeout=timeout or self.config.timeout,
                encoding="utf-8",
                errors="replace",
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
            )
            output = {"output": result.stdout, "returncode": result.returncode, "exception_info": ""}
        except Exception as e:
            # Timeouts and other failures are reported in-band (returncode -1)
            # rather than raised, preserving whatever partial output exists.
            raw_output = getattr(e, "output", None)
            raw_output = (
                raw_output.decode("utf-8", errors="replace") if isinstance(raw_output, bytes) else (raw_output or "")
            )
            output = {
                "output": raw_output,
                "returncode": -1,
                "exception_info": f"An error occurred while executing the command: {e}",
                "extra": {"exception_type": type(e).__name__, "exception": str(e)},
            }
        self._check_finished(output)
        return output

    def _check_finished(self, output: dict):
        """Raises Submitted if the output indicates task completion.

        Convention: a successful command whose first output line is the marker
        COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT submits all remaining lines.
        """
        lines = output.get("output", "").lstrip().splitlines(keepends=True)
        if lines and lines[0].strip() == "COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT" and output["returncode"] == 0:
            submission = "".join(lines[1:])
            raise Submitted(
                {
                    "role": "exit",
                    "content": submission,
                    "extra": {"exit_status": "Submitted", "submission": submission},
                }
            )

    def cleanup(self):
        """Stop and remove the Docker container (asynchronous, best-effort)."""
        if getattr(self, "container_id", None) is not None:  # if init fails early, container_id might not be set
            # Fire-and-forget background shell so interpreter shutdown is not
            # blocked waiting for docker.
            cmd = f"(timeout 60 {self.config.executable} stop {self.container_id} || {self.config.executable} rm -f {self.container_id}) >/dev/null 2>&1 &"
            subprocess.Popen(cmd, shell=True)

    def __del__(self):
        """Cleanup container when object is destroyed."""
        self.cleanup()
diff --git a/src/cooperbench/agents/mini_swe_agent_v2/environments/modal.py b/src/cooperbench/agents/mini_swe_agent_v2/environments/modal.py
new file mode 100644
index 0000000..ab43d44
--- /dev/null
+++ b/src/cooperbench/agents/mini_swe_agent_v2/environments/modal.py
@@ -0,0 +1,264 @@
+"""Modal Sandbox environment for cloud execution (v2 interface)."""
+
+import logging
+import platform
+import threading
+import time
+from typing import Any
+
+import modal
+from pydantic import BaseModel
+
+# Retryable error patterns
+_RETRYABLE_PATTERNS = [
+ "Image build",
+ "UNAVAILABLE",
+ "DEADLINE_EXCEEDED",
+ "INTERNAL",
+ "temporarily unavailable",
+ "rate limit",
+ "ClientClosed",
+ "NOT_FOUND",
+ "Sandbox not found",
+ "already shut down",
+ "Container ID",
+ "finished",
+]
+
+# Global thread-safe image cache to prevent duplicate builds
+_image_cache: dict[str, modal.Image] = {}
+_image_locks: dict[str, threading.Lock] = {}
+_cache_lock = threading.Lock()
+
+# Global Modal app (shared across all environments)
+_global_app: modal.App | None = None
+_app_lock = threading.Lock()
+
+
+def _get_global_app() -> modal.App:
+ """Get or create the global Modal app (thread-safe)."""
+ global _global_app
+ with _app_lock:
+ if _global_app is None:
+ _global_app = modal.App.lookup("cooperbench", create_if_missing=True)
+ return _global_app
+
+
+def _reset_global_app() -> None:
+ """Reset the global app (e.g., after ClientClosed errors)."""
+ global _global_app
+ with _app_lock:
+ _global_app = None
+
+
+def _get_or_build_image(image_name: str) -> modal.Image:
+ """Get cached image or build it (thread-safe, only one build per image)."""
+ if image_name in _image_cache:
+ return _image_cache[image_name]
+
+ with _cache_lock:
+ if image_name not in _image_locks:
+ _image_locks[image_name] = threading.Lock()
+ lock = _image_locks[image_name]
+
+ with lock:
+ if image_name in _image_cache:
+ return _image_cache[image_name]
+
+ image = modal.Image.from_registry(image_name).entrypoint([])
+ _image_cache[image_name] = image
+ return image
+
+
+def _invalidate_image(image_name: str) -> None:
+ """Remove an image from cache (e.g., after build failure)."""
+ with _cache_lock:
+ _image_cache.pop(image_name, None)
+
+
+class ModalEnvironmentConfig(BaseModel):
+ image: str
+ cwd: str = "/"
+ timeout: int = 3600
+ env: dict[str, str] = {}
+ max_retries: int = 5
+ retry_delay: float = 5.0
+
+
+class ModalEnvironment:
+ sb: modal.Sandbox | None
+
+ def __init__(
+ self,
+ *,
+ config_class: type = ModalEnvironmentConfig,
+ logger: logging.Logger | None = None,
+ **kwargs,
+ ):
+ self.logger = logger or logging.getLogger("cooperbench.agents.mini_swe_agent_v2.modal")
+ self.config = config_class(**kwargs)
+ self.sb = None
+ self._start_sandbox_with_retry()
+
+ def _reset_client(self):
+ """Reset Modal client state for fresh connection."""
+ _reset_global_app()
+ _invalidate_image(self.config.image)
+
+ def _build_image(self):
+ """Build the Modal image (globally cached, thread-safe)."""
+ return _get_or_build_image(self.config.image)
+
+ def _is_retryable(self, error: Exception) -> bool:
+ """Check if error is retryable."""
+ error_str = str(error) + str(type(error).__name__)
+ return any(p in error_str for p in _RETRYABLE_PATTERNS)
+
+ def _start_sandbox(self):
+ """Create and start the Modal Sandbox (single attempt)."""
+ self.logger.debug(f"Creating Modal Sandbox with image: {self.config.image}")
+ image = self._build_image()
+ self.sb = modal.Sandbox.create(
+ image=image,
+ timeout=self.config.timeout,
+ workdir=self.config.cwd,
+ app=_get_global_app(),
+ )
+ self.logger.debug(f"Sandbox created: {self.sb.object_id}")
+
+ def _start_sandbox_with_retry(self):
+ """Create sandbox with retry logic for transient failures."""
+ last_error = None
+ for attempt in range(self.config.max_retries):
+ try:
+ self._start_sandbox()
+ return
+ except Exception as e:
+ last_error = e
+ error_str = str(e) + str(type(e).__name__)
+
+ if attempt < self.config.max_retries - 1 and self._is_retryable(e):
+ delay = self.config.retry_delay * (2**attempt)
+ self.logger.warning(
+ f"Sandbox creation failed (attempt {attempt + 1}/{self.config.max_retries}): {e}. "
+ f"Retrying in {delay:.1f}s..."
+ )
+ time.sleep(delay)
+ if "ClientClosed" in error_str or "Image build" in error_str:
+ self._reset_client()
+ else:
+ break
+
+ raise last_error
+
+ def _is_sandbox_dead(self, error: Exception) -> bool:
+ """Check if the error indicates the sandbox has terminated."""
+ error_str = str(error) + str(type(error).__name__)
+ return any(
+ x in error_str
+ for x in [
+ "NOT_FOUND",
+ "Sandbox not found",
+ "already shut down",
+ "Container ID",
+ "finished",
+ "ClientClosed",
+ ]
+ )
+
+ def _reconnect_sandbox(self):
+ """Attempt to create a new sandbox after the old one died."""
+ self.logger.warning("Sandbox terminated unexpectedly, creating new sandbox...")
+ old_sb = self.sb
+ self.sb = None
+ if old_sb:
+ try:
+ old_sb.terminate()
+ except Exception:
+ pass
+ self._reset_client()
+ self._start_sandbox_with_retry()
+
+ def get_template_vars(self, **kwargs) -> dict[str, Any]:
+ return self.config.model_dump() | {
+ "system": "Linux",
+ "release": "modal",
+ "version": "",
+ "machine": platform.machine(),
+ }
+
+ def serialize(self) -> dict:
+ return {
+ "info": {
+ "config": {
+ "environment": self.config.model_dump(mode="json"),
+ "environment_type": f"{self.__class__.__module__}.{self.__class__.__name__}",
+ }
+ }
+ }
+
+ def execute(self, action: dict, cwd: str = "", *, timeout: int | None = None) -> dict[str, Any]:
+ """Execute a command in the Modal Sandbox with retry on sandbox death.
+
+ v2 interface: action is a dict with {"command": "..."}.
+ """
+ command = action.get("command", "")
+ cwd = cwd or self.config.cwd
+ last_error: Exception | None = None
+
+ for attempt in range(self.config.max_retries):
+ try:
+ if self.sb is None:
+ raise RuntimeError("Sandbox not initialized")
+                proc = self.sb.exec("bash", "-lc", f'cd "{cwd}" && {command}')
+ stdout = proc.stdout.read()
+ stderr = proc.stderr.read()
+ proc.wait()
+ output = stdout + stderr if stderr else stdout
+ result = {"output": output, "returncode": proc.returncode, "exception_info": ""}
+ self._check_finished(result)
+ return result
+ except Exception as e:
+ # Re-raise Submitted exceptions (task completion)
+ from cooperbench.agents.mini_swe_agent_v2.exceptions import Submitted
+
+ if isinstance(e, Submitted):
+ raise
+ last_error = e
+ if self._is_sandbox_dead(e) and attempt < self.config.max_retries - 1:
+ self.logger.warning(
+ f"Sandbox died during execution (attempt {attempt + 1}/{self.config.max_retries}): {e}"
+ )
+ self._reconnect_sandbox()
+ else:
+ raise
+
+ if last_error is not None:
+ raise last_error
+ raise RuntimeError("No retries attempted")
+
+ def _check_finished(self, output: dict):
+ """Raises Submitted if the output indicates task completion."""
+ from cooperbench.agents.mini_swe_agent_v2.exceptions import Submitted
+
+ lines = output.get("output", "").lstrip().splitlines(keepends=True)
+ if lines and lines[0].strip() == "COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT" and output["returncode"] == 0:
+ submission = "".join(lines[1:])
+ raise Submitted(
+ {
+ "role": "exit",
+ "content": submission,
+ "extra": {"exit_status": "Submitted", "submission": submission},
+ }
+ )
+
+ def cleanup(self):
+ """Terminate the Modal Sandbox."""
+ if hasattr(self, "sb") and self.sb:
+ try:
+ self.sb.terminate()
+ except Exception:
+ pass
+
+ def __del__(self):
+ self.cleanup()
diff --git a/src/cooperbench/agents/mini_swe_agent_v2/exceptions.py b/src/cooperbench/agents/mini_swe_agent_v2/exceptions.py
new file mode 100644
index 0000000..0295d7e
--- /dev/null
+++ b/src/cooperbench/agents/mini_swe_agent_v2/exceptions.py
@@ -0,0 +1,22 @@
+class InterruptAgentFlow(Exception):
+ """Raised to interrupt the agent flow and add messages."""
+
+ def __init__(self, *messages: dict):
+ self.messages = messages
+ super().__init__()
+
+
+class Submitted(InterruptAgentFlow):
+ """Raised when the agent has completed its task."""
+
+
+class LimitsExceeded(InterruptAgentFlow):
+ """Raised when the agent has exceeded its cost or step limit."""
+
+
+class UserInterruption(InterruptAgentFlow):
+ """Raised when the user interrupts the agent."""
+
+
+class FormatError(InterruptAgentFlow):
+ """Raised when the LM's output is not in the expected format."""
diff --git a/src/cooperbench/agents/mini_swe_agent_v2/models/__init__.py b/src/cooperbench/agents/mini_swe_agent_v2/models/__init__.py
new file mode 100644
index 0000000..70b1047
--- /dev/null
+++ b/src/cooperbench/agents/mini_swe_agent_v2/models/__init__.py
@@ -0,0 +1,34 @@
+"""Model implementations for mini-SWE-agent v2."""
+
+import os
+import threading
+
+
+class GlobalModelStats:
+ """Global model statistics tracker with optional limits."""
+
+ def __init__(self):
+ self._cost = 0.0
+ self._n_calls = 0
+ self._lock = threading.Lock()
+ self.cost_limit = float(os.getenv("MSWEA_GLOBAL_COST_LIMIT", "0"))
+ self.call_limit = int(os.getenv("MSWEA_GLOBAL_CALL_LIMIT", "0"))
+
+ def add(self, cost: float) -> None:
+ """Add a model call with its cost, checking limits."""
+ with self._lock:
+ self._cost += cost
+ self._n_calls += 1
+ if 0 < self.cost_limit < self._cost or 0 < self.call_limit < self._n_calls + 1:
+ raise RuntimeError(f"Global cost/call limit exceeded: ${self._cost:.4f} / {self._n_calls}")
+
+ @property
+ def cost(self) -> float:
+ return self._cost
+
+ @property
+ def n_calls(self) -> int:
+ return self._n_calls
+
+
+GLOBAL_MODEL_STATS = GlobalModelStats()
diff --git a/src/cooperbench/agents/mini_swe_agent_v2/models/litellm_model.py b/src/cooperbench/agents/mini_swe_agent_v2/models/litellm_model.py
new file mode 100644
index 0000000..fa9dad1
--- /dev/null
+++ b/src/cooperbench/agents/mini_swe_agent_v2/models/litellm_model.py
@@ -0,0 +1,149 @@
+import json
+import logging
+import os
+import time
+from collections.abc import Callable
+from pathlib import Path
+from typing import Any, Literal
+
+import litellm
+from pydantic import BaseModel
+
+from cooperbench.agents.mini_swe_agent_v2.models import GLOBAL_MODEL_STATS
+from cooperbench.agents.mini_swe_agent_v2.models.utils.actions_toolcall import (
+ BASH_TOOL,
+ SEND_MESSAGE_TOOL,
+ format_toolcall_observation_messages,
+ parse_toolcall_actions,
+)
+from cooperbench.agents.mini_swe_agent_v2.models.utils.anthropic_utils import _reorder_anthropic_thinking_blocks
+from cooperbench.agents.mini_swe_agent_v2.models.utils.cache_control import set_cache_control
+from cooperbench.agents.mini_swe_agent_v2.models.utils.openai_multimodal import expand_multimodal_content
+from cooperbench.agents.mini_swe_agent_v2.models.utils.retry import retry
+
+logger = logging.getLogger("litellm_model")
+
+
+class LitellmModelConfig(BaseModel):
+ model_name: str
+ """Model name. Highly recommended to include the provider in the model name, e.g., `anthropic/claude-sonnet-4-5-20250929`."""
+ model_kwargs: dict[str, Any] = {}
+ """Additional arguments passed to the API."""
+ litellm_model_registry: Path | str | None = os.getenv("LITELLM_MODEL_REGISTRY_PATH")
+ """Model registry for cost tracking and model metadata. See the local model guide (https://mini-swe-agent.com/latest/models/local_models/) for more details."""
+ set_cache_control: Literal["default_end"] | None = None
+ """Set explicit cache control markers, for example for Anthropic models"""
+ cost_tracking: Literal["default", "ignore_errors"] = os.getenv("MSWEA_COST_TRACKING", "default")
+ """Cost tracking mode for this model. Can be "default" or "ignore_errors" (ignore errors/missing cost info)"""
+ format_error_template: str = "{{ error }}"
+ """Template used when the LM's output is not in the expected format."""
+ observation_template: str = (
+ "{% if output.exception_info %}{{output.exception_info}}\n{% endif %}"
+ "{{output.returncode}}\n"
+ )
+ """Template used to render the observation after executing an action."""
+ multimodal_regex: str = ""
+ """Regex to extract multimodal content. Empty string disables multimodal processing."""
+
+
+class LitellmModel:
+ abort_exceptions: list[type[Exception]] = [
+ litellm.exceptions.UnsupportedParamsError,
+ litellm.exceptions.NotFoundError,
+ litellm.exceptions.PermissionDeniedError,
+ litellm.exceptions.ContextWindowExceededError,
+ litellm.exceptions.AuthenticationError,
+ KeyboardInterrupt,
+ ]
+
+ def __init__(self, *, config_class: Callable = LitellmModelConfig, extra_tools: list[dict] | None = None, **kwargs):
+ self.config = config_class(**kwargs)
+ self._tools = [BASH_TOOL] + (extra_tools or [])
+ if self.config.litellm_model_registry and Path(self.config.litellm_model_registry).is_file():
+ litellm.utils.register_model(json.loads(Path(self.config.litellm_model_registry).read_text()))
+
+ def _query(self, messages: list[dict[str, str]], **kwargs):
+ try:
+ return litellm.completion(
+ model=self.config.model_name,
+ messages=messages,
+ tools=self._tools,
+ **(self.config.model_kwargs | kwargs),
+ )
+ except litellm.exceptions.AuthenticationError as e:
+ e.message += " You can permanently set your API key with `mini-extra config set KEY VALUE`."
+ raise e
+
+ def _prepare_messages_for_api(self, messages: list[dict]) -> list[dict]:
+ prepared = [{k: v for k, v in msg.items() if k != "extra"} for msg in messages]
+ prepared = _reorder_anthropic_thinking_blocks(prepared)
+ return set_cache_control(prepared, mode=self.config.set_cache_control)
+
+ def query(self, messages: list[dict[str, str]], **kwargs) -> dict:
+ for attempt in retry(logger=logger, abort_exceptions=self.abort_exceptions):
+ with attempt:
+ response = self._query(self._prepare_messages_for_api(messages), **kwargs)
+ cost_output = self._calculate_cost(response)
+ GLOBAL_MODEL_STATS.add(cost_output["cost"])
+ message = response.choices[0].message.model_dump()
+ message["extra"] = {
+ "actions": self._parse_actions(response),
+ "response": response.model_dump(),
+ **cost_output,
+ "timestamp": time.time(),
+ }
+ return message
+
+ def _calculate_cost(self, response) -> dict[str, float]:
+ try:
+ cost = litellm.cost_calculator.completion_cost(response, model=self.config.model_name)
+ if cost <= 0.0:
+ raise ValueError(f"Cost must be > 0.0, got {cost}")
+ except Exception as e:
+ cost = 0.0
+ if self.config.cost_tracking != "ignore_errors":
+ msg = (
+ f"Error calculating cost for model {self.config.model_name}: {e}, perhaps it's not registered? "
+ "You can ignore this issue from your config file with cost_tracking: 'ignore_errors' or "
+ "globally with export MSWEA_COST_TRACKING='ignore_errors'. "
+ "Alternatively check the 'Cost tracking' section in the documentation at "
+ "https://klieret.short.gy/mini-local-models. "
+ " Still stuck? Please open a github issue at https://github.com/SWE-agent/mini-swe-agent/issues/new/choose!"
+ )
+ logger.critical(msg)
+ raise RuntimeError(msg) from e
+ return {"cost": cost}
+
+ def _parse_actions(self, response) -> list[dict]:
+ """Parse tool calls from the response. Raises FormatError if unknown tool."""
+ tool_calls = response.choices[0].message.tool_calls or []
+ return parse_toolcall_actions(tool_calls, format_error_template=self.config.format_error_template)
+
+ def format_message(self, **kwargs) -> dict:
+ return expand_multimodal_content(kwargs, pattern=self.config.multimodal_regex)
+
+ def format_observation_messages(
+ self, message: dict, outputs: list[dict], template_vars: dict | None = None
+ ) -> list[dict]:
+ """Format execution outputs into tool result messages."""
+ actions = message.get("extra", {}).get("actions", [])
+ return format_toolcall_observation_messages(
+ actions=actions,
+ outputs=outputs,
+ observation_template=self.config.observation_template,
+ template_vars=template_vars,
+ multimodal_regex=self.config.multimodal_regex,
+ )
+
+ def get_template_vars(self, **kwargs) -> dict[str, Any]:
+ return self.config.model_dump()
+
+ def serialize(self) -> dict:
+ return {
+ "info": {
+ "config": {
+ "model": self.config.model_dump(mode="json"),
+ "model_type": f"{self.__class__.__module__}.{self.__class__.__name__}",
+ },
+ }
+ }
diff --git a/src/cooperbench/agents/mini_swe_agent_v2/models/utils/__init__.py b/src/cooperbench/agents/mini_swe_agent_v2/models/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/cooperbench/agents/mini_swe_agent_v2/models/utils/actions_toolcall.py b/src/cooperbench/agents/mini_swe_agent_v2/models/utils/actions_toolcall.py
new file mode 100644
index 0000000..47a8a34
--- /dev/null
+++ b/src/cooperbench/agents/mini_swe_agent_v2/models/utils/actions_toolcall.py
@@ -0,0 +1,137 @@
+"""Parse actions & format observations with toolcalls"""
+
+import json
+import time
+
+from jinja2 import StrictUndefined, Template
+
+from cooperbench.agents.mini_swe_agent_v2.exceptions import FormatError
+from cooperbench.agents.mini_swe_agent_v2.models.utils.openai_multimodal import expand_multimodal_content
+
+BASH_TOOL = {
+ "type": "function",
+ "function": {
+ "name": "bash",
+ "description": "Execute a bash command",
+ "parameters": {
+ "type": "object",
+ "properties": {
+ "command": {
+ "type": "string",
+ "description": "The bash command to execute",
+ }
+ },
+ "required": ["command"],
+ },
+ },
+}
+
+SEND_MESSAGE_TOOL = {
+ "type": "function",
+ "function": {
+ "name": "send_message",
+ "description": "Send a message to another agent for inter-agent communication",
+ "parameters": {
+ "type": "object",
+ "properties": {
+ "recipient": {
+ "type": "string",
+ "description": "The agent ID to send the message to",
+ },
+ "content": {
+ "type": "string",
+ "description": "The message content to send",
+ },
+ },
+ "required": ["recipient", "content"],
+ },
+ },
+}
+
+KNOWN_TOOLS = {"bash", "send_message"}
+
+
+def parse_toolcall_actions(tool_calls: list, *, format_error_template: str) -> list[dict]:
+ """Parse tool calls from the response. Raises FormatError if unknown tool or invalid args."""
+ if not tool_calls:
+ raise FormatError(
+ {
+ "role": "user",
+ "content": Template(format_error_template, undefined=StrictUndefined).render(
+ error="No tool calls found in the response. Every response MUST include at least one tool call."
+ ),
+ "extra": {"interrupt_type": "FormatError"},
+ }
+ )
+ actions = []
+ for tool_call in tool_calls:
+ error_msg = ""
+ args = {}
+ try:
+ args = json.loads(tool_call.function.arguments)
+ except Exception as e:
+ error_msg = f"Error parsing tool call arguments: {e}. "
+
+ tool_name = tool_call.function.name
+
+ if tool_name not in KNOWN_TOOLS:
+ error_msg += f"Unknown tool '{tool_name}'."
+ elif tool_name == "bash" and "command" not in args:
+ error_msg += "Missing 'command' argument in bash tool call."
+ elif tool_name == "send_message":
+ if "recipient" not in args:
+ error_msg += "Missing 'recipient' argument in send_message tool call."
+ if "content" not in args:
+ error_msg += "Missing 'content' argument in send_message tool call."
+
+ if error_msg:
+ raise FormatError(
+ {
+ "role": "user",
+ "content": Template(format_error_template, undefined=StrictUndefined).render(
+ error=error_msg.strip()
+ ),
+ "extra": {"interrupt_type": "FormatError"},
+ }
+ )
+
+ action = {"tool_name": tool_name, "tool_call_id": tool_call.id, **args}
+ actions.append(action)
+ return actions
+
+
+def format_toolcall_observation_messages(
+ *,
+ actions: list[dict],
+ outputs: list[dict],
+ observation_template: str,
+ template_vars: dict | None = None,
+ multimodal_regex: str = "",
+) -> list[dict]:
+ """Format execution outputs into tool result messages."""
+ not_executed = {"output": "", "returncode": -1, "exception_info": "action was not executed"}
+ padded_outputs = outputs + [not_executed] * (len(actions) - len(outputs))
+ results = []
+ for action, output in zip(actions, padded_outputs):
+ content = Template(observation_template, undefined=StrictUndefined).render(
+ output=output, **(template_vars or {})
+ )
+ msg = {
+ "content": content,
+ "extra": {
+ "raw_output": output.get("output", ""),
+ "returncode": output.get("returncode"),
+ "timestamp": time.time(),
+ "exception_info": output.get("exception_info"),
+ **output.get("extra", {}),
+ },
+ }
+ if "tool_call_id" in action:
+ msg["tool_call_id"] = action["tool_call_id"]
+ msg["role"] = "tool"
+ else:
+ msg["role"] = "user" # human issued commands
+ if multimodal_regex:
+ msg = expand_multimodal_content(msg, pattern=multimodal_regex)
+ results.append(msg)
+ return results
diff --git a/src/cooperbench/agents/mini_swe_agent_v2/models/utils/anthropic_utils.py b/src/cooperbench/agents/mini_swe_agent_v2/models/utils/anthropic_utils.py
new file mode 100644
index 0000000..6636df2
--- /dev/null
+++ b/src/cooperbench/agents/mini_swe_agent_v2/models/utils/anthropic_utils.py
@@ -0,0 +1,28 @@
+"""Utilities for Anthropic API compatibility."""
+
+
+def _is_anthropic_thinking_block(block) -> bool:
+ """Check if a content block is a thinking-type block."""
+ if not isinstance(block, dict):
+ return False
+ return block.get("type") in ("thinking", "redacted_thinking")
+
+
+def _reorder_anthropic_thinking_blocks(messages: list[dict]) -> list[dict]:
+    """Move thinking blocks to the front of assistant message content.
+
+    Anthropic requires thinking blocks to precede other content blocks; if a message contains only thinking blocks, an empty text block is appended so a thinking block is never last.
+    """
+ result = []
+ for msg in messages:
+ if msg.get("role") == "assistant" and isinstance(msg.get("content"), list):
+ content = msg["content"]
+ thinking_blocks = [b for b in content if _is_anthropic_thinking_block(b)]
+ if thinking_blocks:
+ other_blocks = [b for b in content if not _is_anthropic_thinking_block(b)]
+ if other_blocks:
+ msg = {**msg, "content": thinking_blocks + other_blocks}
+ else:
+ msg = {**msg, "content": thinking_blocks + [{"type": "text", "text": ""}]}
+ result.append(msg)
+ return result
diff --git a/src/cooperbench/agents/mini_swe_agent_v2/models/utils/cache_control.py b/src/cooperbench/agents/mini_swe_agent_v2/models/utils/cache_control.py
new file mode 100644
index 0000000..9c50ee0
--- /dev/null
+++ b/src/cooperbench/agents/mini_swe_agent_v2/models/utils/cache_control.py
@@ -0,0 +1,67 @@
+"""Cache control utilities are mostly for Anthropic models.
+They are used to explicitly set cache control points.
+"""
+
+import copy
+import warnings
+from typing import Literal
+
+
+def _get_content_text(entry: dict) -> str | None:
+ if entry["content"] is None:
+ return None
+ if isinstance(entry["content"], str):
+ return entry["content"]
+ assert len(entry["content"]) == 1, "Expected single message in content"
+ return entry["content"][0]["text"]
+
+
+def _clear_cache_control(entry: dict) -> None:
+ if isinstance(entry["content"], list):
+ assert len(entry["content"]) == 1, "Expected single message in content"
+ entry["content"][0].pop("cache_control", None)
+ # Note: entry["content"] can be None for assistant messages with only tool_use
+ entry.pop("cache_control", None)
+
+
+def _set_cache_control(entry: dict) -> None:
+ # Handle None content (e.g., assistant messages with only tool_use)
+ if entry["content"] is None:
+ entry["cache_control"] = {"type": "ephemeral"}
+ return
+
+ if not isinstance(entry["content"], list):
+ entry["content"] = [ # type: ignore
+ {
+ "type": "text",
+ "text": _get_content_text(entry),
+ "cache_control": {"type": "ephemeral"},
+ }
+ ]
+ else:
+ entry["content"][0]["cache_control"] = {"type": "ephemeral"}
+ if entry["role"] == "tool":
+            # Workaround: per-block cache_control on tool-result content is rejected by some providers, so drop it from the block and mark the whole message instead
+ entry["content"][0].pop("cache_control", None)
+ entry["cache_control"] = {"type": "ephemeral"}
+
+
+def set_cache_control(
+ messages: list[dict], *, mode: Literal["default_end"] | None = "default_end", last_n_messages_offset: int = 0
+) -> list[dict]:
+ """This messages processor adds manual cache control marks to the messages."""
+ if mode is None:
+ return messages
+ if mode != "default_end":
+ raise ValueError(f"Invalid mode: {mode}")
+ if last_n_messages_offset:
+ warnings.warn("last_n_messages_offset is deprecated and will be removed in the future. It has no effect.")
+
+ messages = copy.deepcopy(messages)
+ new_messages = []
+ for i_entry, entry in enumerate(reversed(messages)):
+ _clear_cache_control(entry)
+ if i_entry == 0:
+ _set_cache_control(entry)
+ new_messages.append(entry)
+ return list(reversed(new_messages))
diff --git a/src/cooperbench/agents/mini_swe_agent_v2/models/utils/openai_multimodal.py b/src/cooperbench/agents/mini_swe_agent_v2/models/utils/openai_multimodal.py
new file mode 100644
index 0000000..4f76fa5
--- /dev/null
+++ b/src/cooperbench/agents/mini_swe_agent_v2/models/utils/openai_multimodal.py
@@ -0,0 +1,50 @@
+"""Utilities for handling multimodal content in OpenAI-style messages."""
+
+import copy
+import re
+from typing import Any
+
+DEFAULT_MULTIMODAL_REGEX = (  # NOTE(review): the tag delimiters around the two groups (type, content) appear to have been stripped in transit;
+    r"(?s)(.+?)(.+?)"  # as written this pattern is ambiguous/pathological — restore the original tag markers before enabling multimodal parsing.
+)
+
+
+def _expand_content_string(*, content: str, pattern: str) -> list[dict]:
+ """Expand a content string, replacing multimodal tags with structured content."""
+ matches = list(re.finditer(pattern, content))
+ if not matches:
+ return [{"type": "text", "text": content}]
+ result = []
+ last_end = 0
+ for match in matches:
+ text_before = content[last_end : match.start()]
+ if text_before:
+ result.append({"type": "text", "text": text_before})
+ content_type = match.group(1).strip()
+ extracted = match.group(2).strip()
+ if content_type == "image_url":
+ result.append({"type": "image_url", "image_url": {"url": extracted}})
+ last_end = match.end()
+ text_after = content[last_end:]
+ if text_after:
+ result.append({"type": "text", "text": text_after})
+ return result
+
+
+def expand_multimodal_content(content: Any, *, pattern: str) -> Any:
+ """Recursively expand multimodal content in messages.
+ Note: Returns copy of content, original content is not modified.
+ """
+ if not pattern:
+ return content
+ content = copy.deepcopy(content)
+ if isinstance(content, str):
+ return _expand_content_string(content=content, pattern=pattern)
+ if isinstance(content, list):
+ return [expand_multimodal_content(item, pattern=pattern) for item in content]
+ if isinstance(content, dict):
+ if "content" not in content:
+ return content
+ content["content"] = expand_multimodal_content(content["content"], pattern=pattern)
+ return content
+ return str(content)
diff --git a/src/cooperbench/agents/mini_swe_agent_v2/models/utils/retry.py b/src/cooperbench/agents/mini_swe_agent_v2/models/utils/retry.py
new file mode 100644
index 0000000..055d4b6
--- /dev/null
+++ b/src/cooperbench/agents/mini_swe_agent_v2/models/utils/retry.py
@@ -0,0 +1,25 @@
+"""Retry utility for model queries."""
+
+import logging
+import os
+
+from tenacity import Retrying, before_sleep_log, retry_if_not_exception_type, stop_after_attempt, wait_exponential
+
+
+def retry(*, logger: logging.Logger, abort_exceptions: list[type[Exception]]) -> Retrying:
+ """Thin wrapper around tenacity.Retrying to make use of global config etc.
+
+ Args:
+ logger: Logger to use for reporting retries
+ abort_exceptions: Exceptions to abort on.
+
+ Returns:
+ A tenacity.Retrying object.
+ """
+ return Retrying(
+ reraise=True,
+ stop=stop_after_attempt(int(os.getenv("MSWEA_MODEL_RETRY_STOP_AFTER_ATTEMPT", "10"))),
+ wait=wait_exponential(multiplier=1, min=4, max=60),
+ before_sleep=before_sleep_log(logger, logging.WARNING),
+ retry=retry_if_not_exception_type(tuple(abort_exceptions)),
+ )
diff --git a/src/cooperbench/agents/mini_swe_agent_v2/utils/__init__.py b/src/cooperbench/agents/mini_swe_agent_v2/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/cooperbench/agents/mini_swe_agent_v2/utils/log.py b/src/cooperbench/agents/mini_swe_agent_v2/utils/log.py
new file mode 100644
index 0000000..8069455
--- /dev/null
+++ b/src/cooperbench/agents/mini_swe_agent_v2/utils/log.py
@@ -0,0 +1,36 @@
+import logging
+from pathlib import Path
+
+from rich.logging import RichHandler
+
+
+def _setup_root_logger() -> None:
+ logger = logging.getLogger("mini_swe_agent_v2")
+ logger.setLevel(logging.DEBUG)
+ _handler = RichHandler(
+ show_path=False,
+ show_time=False,
+ show_level=False,
+ markup=True,
+ )
+ _formatter = logging.Formatter("%(name)s: %(levelname)s: %(message)s")
+ _handler.setFormatter(_formatter)
+ logger.addHandler(_handler)
+
+
+def add_file_handler(path: Path | str, level: int = logging.DEBUG, *, print_path: bool = True) -> None:
+ logger = logging.getLogger("mini_swe_agent_v2")
+ handler = logging.FileHandler(path)
+ handler.setLevel(level)
+ formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+ handler.setFormatter(formatter)
+ logger.addHandler(handler)
+ if print_path:
+ print(f"Logging to '{path}'")
+
+
+_setup_root_logger()
+logger = logging.getLogger("mini_swe_agent_v2")
+
+
+__all__ = ["logger"]
diff --git a/src/cooperbench/agents/mini_swe_agent_v2/utils/serialize.py b/src/cooperbench/agents/mini_swe_agent_v2/utils/serialize.py
new file mode 100644
index 0000000..ccd5947
--- /dev/null
+++ b/src/cooperbench/agents/mini_swe_agent_v2/utils/serialize.py
@@ -0,0 +1,29 @@
+from typing import Any
+
+UNSET = object()
+
+
+def recursive_merge(*dictionaries: dict | None) -> dict:
+ """Merge multiple dictionaries recursively.
+
+ Later dictionaries take precedence over earlier ones.
+ Nested dictionaries are merged recursively.
+ UNSET values are skipped.
+ """
+ if not dictionaries:
+ return {}
+ result: dict[str, Any] = {}
+ for d in dictionaries:
+ if d is None:
+ continue
+ for key, value in d.items():
+ if value is UNSET:
+ continue
+ if key in result and isinstance(result[key], dict) and isinstance(value, dict):
+ result[key] = recursive_merge(result[key], value)
+ elif isinstance(value, dict):
+ # Recursively merge dict values to filter out nested UNSET values
+ result[key] = recursive_merge(value)
+ else:
+ result[key] = value
+ return result
diff --git a/src/cooperbench/agents/registry.py b/src/cooperbench/agents/registry.py
index cc14d36..22f71ef 100644
--- a/src/cooperbench/agents/registry.py
+++ b/src/cooperbench/agents/registry.py
@@ -81,6 +81,10 @@ def _auto_register():
import cooperbench.agents.openhands_agent_sdk.adapter # noqa: F401
except ImportError:
pass
+ try:
+ import cooperbench.agents.mini_swe_agent_v2.adapter # noqa: F401
+ except ImportError:
+ pass
# External agents via environment variable
import os