From 758fbc4300dabd4ef009f1b1c7c6ffac1b9c1b03 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 4 Mar 2026 20:43:46 -0800 Subject: [PATCH 01/52] add docker example --- examples/docker_hello_world/Dockerfile | 15 +++ examples/docker_hello_world/hello_docker.py | 134 ++++++++++++++++++++ 2 files changed, 149 insertions(+) create mode 100644 examples/docker_hello_world/Dockerfile create mode 100644 examples/docker_hello_world/hello_docker.py diff --git a/examples/docker_hello_world/Dockerfile b/examples/docker_hello_world/Dockerfile new file mode 100644 index 0000000000..3ceb24b3b4 --- /dev/null +++ b/examples/docker_hello_world/Dockerfile @@ -0,0 +1,15 @@ +FROM python:3.12-slim + +RUN apt-get update && apt-get install -y \ + iproute2 \ + libx11-6 libgl1 libglib2.0-0 \ + libidn2-0 libgfortran5 libgomp1 \ + cowsay \ + && rm -rf /var/lib/apt/lists/* + + +# Copy example module so it's importable inside the container +COPY examples/docker_hello_world/hello_docker.py /dimos/source/examples/docker_hello_world/hello_docker.py +RUN touch /dimos/source/examples/__init__.py /dimos/source/examples/docker_hello_world/__init__.py + +WORKDIR /app diff --git a/examples/docker_hello_world/hello_docker.py b/examples/docker_hello_world/hello_docker.py new file mode 100644 index 0000000000..c6a5f0bb3e --- /dev/null +++ b/examples/docker_hello_world/hello_docker.py @@ -0,0 +1,134 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Hello World Docker Module +========================== + +Minimal example showing a DimOS module running inside Docker. + +The module receives a string on its ``prompt`` input stream, runs it through +cowsay inside the container, and publishes the ASCII art on its ``greeting`` +output stream. + +NOTE: Requires Linux. Docker Desktop on macOS does not support host networking, +which is needed for LCM multicast between host and container. + +Usage: + python examples/docker_hello_world/hello_docker.py +""" + +from __future__ import annotations + +from pathlib import Path +import subprocess +import time + +from dimos.core.blueprints import autoconnect +from dimos.core.core import rpc +from dimos.core.docker_runner import DockerModuleConfig +from dimos.core.module import Module +from dimos.core.stream import In, Out + +# --------------------------------------------------------------------------- +# Docker module (runs inside container) +# --------------------------------------------------------------------------- + + +class HelloDockerConfig(DockerModuleConfig): + docker_image: str = "dimos-hello-docker:latest" + docker_file: Path | None = Path(__file__).parent / "Dockerfile" + docker_build_context: Path | None = Path(__file__).parents[2] # repo root + docker_gpus: str | None = None # no GPU needed + docker_rm: bool = True + docker_restart_policy: str = "no" + docker_env: dict[str, str] = {"CI": "1"} # skip interactive system configurator + + +class HelloDockerModule(Module["HelloDockerConfig"]): + """A trivial module that runs inside Docker and echoes greetings.""" + + default_config = HelloDockerConfig + + prompt: In[str] + greeting: Out[str] + + @rpc + def start(self) -> None: + super().start() + self.prompt.subscribe(self._on_prompt) + + def _cowsay(self, text: str) -> str: + """Run cowsay inside the container and return the ASCII art.""" + result = subprocess.run( + ["/usr/games/cowsay", text], + capture_output=True, + text=True, + ) + return result.stdout + 
+ def _on_prompt(self, text: str) -> None: + art = self._cowsay(text) + print(f"[HelloDockerModule]\n{art}") + self.greeting.publish(art) + + @rpc + def greet(self, name: str) -> str: + """RPC method that can be called directly.""" + return self._cowsay(f"Hello, {name}!") + + +# --------------------------------------------------------------------------- +# Host-side module (sends prompts and prints greetings) +# --------------------------------------------------------------------------- + + +class PromptModule(Module): + """Publishes prompts and listens to greetings.""" + + prompt: Out[str] + greeting: In[str] + + @rpc + def start(self) -> None: + super().start() + self.greeting.subscribe(self._on_greeting) + + def _on_greeting(self, text: str) -> None: + print(f"[PromptModule] Received: {text}") + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +if __name__ == "__main__": + coordinator = autoconnect( + PromptModule.blueprint(), + HelloDockerModule.blueprint(), + ).build() + + # Get module proxies + prompt_mod = coordinator.get_instance(PromptModule) + docker_mod = coordinator.get_instance(HelloDockerModule) + + # Test RPC + print(docker_mod.greet("World")) + + # Test stream + prompt_mod.prompt.publish("stream test") + time.sleep(2) + + coordinator.close_all() + print("Done!") From 4c9c27d2c813838164cbe1fc2fca4afc5cde778e Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 4 Mar 2026 21:10:12 -0800 Subject: [PATCH 02/52] add docker module system --- dimos/core/docker_worker_manager.py | 57 ++++++++++++++++++++++++++++ dimos/core/module_coordinator.py | 58 +++++++++++++++++++++++++---- 2 files changed, 108 insertions(+), 7 deletions(-) create mode 100644 dimos/core/docker_worker_manager.py diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py new file mode 100644 index 0000000000..42843577ba --- /dev/null +++ 
b/dimos/core/docker_worker_manager.py @@ -0,0 +1,57 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from dimos.core.docker_runner import DockerModule +from dimos.utils.logging_config import setup_logger + +if TYPE_CHECKING: + from dimos.core.module import Module + +logger = setup_logger() + + +class DockerWorkerManager: + """Manages DockerModule instances, mirroring WorkerManager's interface for docker-based modules.""" + + def __init__(self) -> None: + self._docker_modules: list[DockerModule] = [] + self._closed = False + + def deploy(self, module_class: type[Module], *args: Any, **kwargs: Any) -> DockerModule: + if self._closed: + raise RuntimeError("DockerWorkerManager is closed") + + logger.info("Deploying module in Docker.", module=module_class.__name__) + dm = DockerModule(module_class, *args, **kwargs) + self._docker_modules.append(dm) + return dm + + def close_all(self) -> None: + if self._closed: + return + self._closed = True + + logger.info("Stopping all Docker modules...") + for dm in reversed(self._docker_modules): + try: + dm.stop() + except Exception: + logger.error("Error stopping Docker module", exc_info=True) + + self._docker_modules.clear() + logger.info("All Docker modules stopped.") diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 86afb9ebc4..9d33255d4c 100644 --- 
a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -18,6 +18,8 @@ import threading from typing import TYPE_CHECKING, Any +from dimos.core.docker_runner import is_docker_module +from dimos.core.docker_worker_manager import DockerWorkerManager from dimos.core.global_config import GlobalConfig, global_config from dimos.core.resource import Resource from dimos.core.worker_manager import WorkerManager @@ -33,6 +35,7 @@ class ModuleCoordinator(Resource): # type: ignore[misc] _client: WorkerManager | None = None + _docker_client: DockerWorkerManager | None = None _global_config: GlobalConfig _n: int | None = None _memory_limit: str = "auto" @@ -53,6 +56,7 @@ def start(self) -> None: n = self._n if self._n is not None else 2 self._client = WorkerManager(n_workers=n) self._client.start() + self._docker_client = DockerWorkerManager() if self._global_config.dtop: from dimos.core.resource_monitor.monitor import StatsMonitor @@ -73,15 +77,23 @@ def stop(self) -> None: logger.error("Error stopping module", module=module_class.__name__, exc_info=True) logger.info("Module stopped.", module=module_class.__name__) + if self._docker_client is not None: + self._docker_client.close_all() self._client.close_all() # type: ignore[union-attr] def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: # type: ignore[no-untyped-def] if not self._client: raise ValueError("Trying to dimos.deploy before the client has started") - module: ModuleProxy = self._client.deploy(module_class, *args, **kwargs) # type: ignore[union-attr, attr-defined, assignment] - self._deployed_modules[module_class] = module - return module + if is_docker_module(module_class): + if not self._docker_client: + self._docker_client = DockerWorkerManager() + module = self._docker_client.deploy(module_class, *args, **kwargs) # type: ignore[assignment] + else: + module = self._client.deploy(module_class, *args, **kwargs) # type: ignore[union-attr, attr-defined, assignment] + + 
self._deployed_modules[module_class] = module # type: ignore[assignment] + return module # type: ignore[return-value] def deploy_parallel( self, module_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] @@ -89,10 +101,42 @@ def deploy_parallel( if not self._client: raise ValueError("Not started") - modules = self._client.deploy_parallel(module_specs) - for (module_class, _, _), module in zip(module_specs, modules, strict=True): - self._deployed_modules[module_class] = module # type: ignore[assignment] - return modules # type: ignore[return-value] + # Separate docker modules from regular modules + docker_specs = [] + worker_specs = [] + spec_indices: list[tuple[str, int]] = [] # ("docker"|"worker", index_in_sublist) + + for spec in module_specs: + module_class = spec[0] + if is_docker_module(module_class): + spec_indices.append(("docker", len(docker_specs))) + docker_specs.append(spec) + else: + spec_indices.append(("worker", len(worker_specs))) + worker_specs.append(spec) + + # Deploy worker modules in parallel via WorkerManager + worker_results = self._client.deploy_parallel(worker_specs) if worker_specs else [] + + # Deploy docker modules (each gets its own DockerModule) + docker_results: list[Any] = [] + for module_class, args, kwargs in docker_specs: + if not self._docker_client: + self._docker_client = DockerWorkerManager() + dm = self._docker_client.deploy(module_class, *args, **kwargs) + docker_results.append(dm) + + # Reassemble results in original order + results: list[Any] = [] + for kind, idx in spec_indices: + if kind == "docker": + results.append(docker_results[idx]) + else: + results.append(worker_results[idx]) + + for (module_class, _, _), module in zip(module_specs, results, strict=True): + self._deployed_modules[module_class] = module + return results # type: ignore[return-value] def start_all_modules(self) -> None: modules = list(self._deployed_modules.values()) From a0e719d867239c892e68662a09415fdc1baf4a22 Mon Sep 17 00:00:00 2001 
From: Jeff Hykin Date: Wed, 4 Mar 2026 22:15:46 -0800 Subject: [PATCH 03/52] fixup --- .gitignore | 1 + dimos/core/docker_runner.py | 41 +++- dimos/core/docker_worker_manager.py | 1 + dimos/core/module.py | 3 +- dimos/core/module_coordinator.py | 15 +- dimos/core/tests/test_docker_deployment.py | 223 ++++++++++++++++++++ examples/docker_hello_world/hello_docker.py | 9 +- pyproject.toml | 2 + uv.lock | 4 + 9 files changed, 285 insertions(+), 14 deletions(-) create mode 100644 dimos/core/tests/test_docker_deployment.py diff --git a/.gitignore b/.gitignore index 4045db012e..12b2f19ca3 100644 --- a/.gitignore +++ b/.gitignore @@ -42,6 +42,7 @@ package-lock.json # Ignore build artifacts dist/ build/ +.Dockerfile.dimos # Ignore data directory but keep .lfs subdirectory data/* diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index ee56163ca6..566e28a70e 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -25,17 +25,20 @@ import time from typing import TYPE_CHECKING, Any -from dimos.core.docker_build import build_image, image_exists -from dimos.core.module import Module, ModuleConfig +from dimos.core.module import ModuleConfig from dimos.core.rpc_client import RpcCall -from dimos.protocol.rpc import LCMRPC from dimos.utils.logging_config import setup_logger -from dimos.visualization.rerun.bridge import RERUN_GRPC_PORT, RERUN_WEB_PORT + +# Inlined from dimos.visualization.rerun.bridge to avoid heavy import chain in containers +RERUN_GRPC_PORT = 9876 +RERUN_WEB_PORT = 9090 if TYPE_CHECKING: from collections.abc import Callable from pathlib import Path + from dimos.core.module import Module + logger = setup_logger() DOCKER_RUN_TIMEOUT = 120 # Timeout for `docker run` command execution @@ -186,7 +189,9 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non or f"dimos_{module_class.__name__.lower()}_{os.getpid()}_{int(time.time())}" ) - # RPC setup + # RPC setup (lazy import to keep container-side 
imports light) + from dimos.protocol.rpc import LCMRPC + self.rpc = LCMRPC() self.rpcs = set(module_class.rpcs.keys()) # type: ignore[attr-defined] self.rpc_calls: list[str] = getattr(module_class, "rpc_calls", []) @@ -194,6 +199,8 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self._bound_rpc_calls: dict[str, RpcCall] = {} # Build image if needed (but don't start - caller must call start() explicitly) + from dimos.core.docker_build import build_image, image_exists + if not image_exists(config): logger.info(f"Building {config.docker_image}") build_image(config) @@ -400,7 +407,29 @@ def _build_container_command(self, cfg: DockerModuleConfig) -> list[str]: if cfg.docker_command: return list(cfg.docker_command) - module_path = f"{self._module_class.__module__}.{self._module_class.__name__}" + module_name = self._module_class.__module__ + if module_name == "__main__": + # When run as `python script.py`, __module__ is "__main__". + # Resolve to the actual dotted module path so the container can import it. + import __main__ + + spec = getattr(__main__, "__spec__", None) + if spec and spec.name: + module_name = spec.name + else: + # Fallback: derive from file path relative to cwd + main_file = getattr(__main__, "__file__", None) + if main_file: + import pathlib + + rel = pathlib.Path(main_file).resolve().relative_to(pathlib.Path.cwd()) + module_name = str(rel.with_suffix("")).replace("/", ".") + else: + raise RuntimeError( + "Cannot determine module path for __main__. " + "Run with `python -m` or set docker_command explicitly." + ) + module_path = f"{module_name}.{self._module_class.__name__}" # Filter out docker-specific kwargs (paths, etc.) 
- only pass module config kwargs = {"config": _extract_module_config(cfg)} payload = {"module_path": module_path, "args": list(self._args), "kwargs": kwargs} diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py index 42843577ba..97f27a6d7a 100644 --- a/dimos/core/docker_worker_manager.py +++ b/dimos/core/docker_worker_manager.py @@ -38,6 +38,7 @@ def deploy(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Docke logger.info("Deploying module in Docker.", module=module_class.__name__) dm = DockerModule(module_class, *args, **kwargs) + dm.start() # Docker modules must be running before streams/RPC can be wired self._docker_modules.append(dm) return dm diff --git a/dimos/core/module.py b/dimos/core/module.py index 48a99a79a3..127be545fe 100644 --- a/dimos/core/module.py +++ b/dimos/core/module.py @@ -218,11 +218,12 @@ def inputs(self) -> dict[str, In]: # type: ignore[type-arg] @classproperty def rpcs(self) -> dict[str, Callable[..., Any]]: + _skip = {"rpcs", "blueprint", "module_info", "io"} return { name: getattr(self, name) for name in dir(self) if not name.startswith("_") - and name != "rpcs" # Exclude the rpcs property itself to prevent recursion + and name not in _skip and callable(getattr(self, name, None)) and hasattr(getattr(self, name), "__rpc__") } diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 9d33255d4c..dae1760b9e 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -18,14 +18,13 @@ import threading from typing import TYPE_CHECKING, Any -from dimos.core.docker_runner import is_docker_module -from dimos.core.docker_worker_manager import DockerWorkerManager from dimos.core.global_config import GlobalConfig, global_config from dimos.core.resource import Resource from dimos.core.worker_manager import WorkerManager from dimos.utils.logging_config import setup_logger if TYPE_CHECKING: + from dimos.core.docker_worker_manager import 
DockerWorkerManager from dimos.core.module import Module, ModuleT from dimos.core.resource_monitor.monitor import StatsMonitor from dimos.core.rpc_client import ModuleProxy @@ -53,6 +52,8 @@ def __init__( self._deployed_modules = {} def start(self) -> None: + from dimos.core.docker_worker_manager import DockerWorkerManager + n = self._n if self._n is not None else 2 self._client = WorkerManager(n_workers=n) self._client.start() @@ -85,6 +86,9 @@ def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: if not self._client: raise ValueError("Trying to dimos.deploy before the client has started") + from dimos.core.docker_runner import is_docker_module + from dimos.core.docker_worker_manager import DockerWorkerManager + if is_docker_module(module_class): if not self._docker_client: self._docker_client = DockerWorkerManager() @@ -101,9 +105,12 @@ def deploy_parallel( if not self._client: raise ValueError("Not started") + from dimos.core.docker_runner import is_docker_module + from dimos.core.docker_worker_manager import DockerWorkerManager + # Separate docker modules from regular modules - docker_specs = [] - worker_specs = [] + docker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] + worker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] spec_indices: list[tuple[str, int]] = [] # ("docker"|"worker", index_in_sublist) for spec in module_specs: diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py new file mode 100644 index 0000000000..85f2b0508a --- /dev/null +++ b/dimos/core/tests/test_docker_deployment.py @@ -0,0 +1,223 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Smoke tests for Docker module deployment routing. + +These tests verify that the ModuleCoordinator correctly detects and routes +docker modules to the DockerWorkerManager WITHOUT actually running Docker. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING +from unittest.mock import MagicMock, patch + +import pytest + +from dimos.core.docker_runner import DockerModuleConfig, is_docker_module +from dimos.core.docker_worker_manager import DockerWorkerManager +from dimos.core.module import Module +from dimos.core.module_coordinator import ModuleCoordinator +from dimos.core.stream import Out + +if TYPE_CHECKING: + from pathlib import Path + +# -- Fixtures: fake module classes ------------------------------------------- + + +@dataclass +class FakeDockerConfig(DockerModuleConfig): + docker_image: str = "fake:latest" + docker_file: Path | None = None + docker_gpus: str | None = None + docker_rm: bool = True + docker_restart_policy: str = "no" + + +class FakeDockerModule(Module["FakeDockerConfig"]): + default_config = FakeDockerConfig + output: Out[str] + + +class FakeRegularModule(Module): + output: Out[str] + + +# -- Tests ------------------------------------------------------------------- + + +class TestIsDockerModule: + def test_docker_module_detected(self): + assert is_docker_module(FakeDockerModule) is True + + def test_regular_module_not_detected(self): + assert is_docker_module(FakeRegularModule) is False + + def test_plain_class_not_detected(self): + assert is_docker_module(str) is False 
+ + def test_no_default_config(self): + class Bare(Module): + pass + + # Module has default_config = ModuleConfig, which is not DockerModuleConfig + assert is_docker_module(Bare) is False + + +class TestDockerWorkerManager: + @patch("dimos.core.docker_worker_manager.DockerModule") + def test_deploy_creates_docker_module(self, mock_docker_module_cls): + mock_instance = MagicMock() + mock_docker_module_cls.return_value = mock_instance + + mgr = DockerWorkerManager() + result = mgr.deploy(FakeDockerModule, some_kwarg="value") + + mock_docker_module_cls.assert_called_once_with(FakeDockerModule, some_kwarg="value") + assert result is mock_instance + assert len(mgr._docker_modules) == 1 + + @patch("dimos.core.docker_worker_manager.DockerModule") + def test_close_all_stops_in_reverse_order(self, mock_docker_module_cls): + dm1 = MagicMock() + dm2 = MagicMock() + mock_docker_module_cls.side_effect = [dm1, dm2] + + mgr = DockerWorkerManager() + mgr.deploy(FakeDockerModule) + mgr.deploy(FakeDockerModule) + mgr.close_all() + + # Stopped in reverse order + assert dm2.stop.call_count == 1 + assert dm1.stop.call_count == 1 + assert dm2.stop.called + assert dm1.stop.called + assert len(mgr._docker_modules) == 0 + + @patch("dimos.core.docker_worker_manager.DockerModule") + def test_close_all_idempotent(self, mock_docker_module_cls): + mock_docker_module_cls.return_value = MagicMock() + mgr = DockerWorkerManager() + mgr.deploy(FakeDockerModule) + mgr.close_all() + mgr.close_all() # second call should be no-op + + @patch("dimos.core.docker_worker_manager.DockerModule") + def test_deploy_after_close_raises(self, mock_docker_module_cls): + mgr = DockerWorkerManager() + mgr.close_all() + with pytest.raises(RuntimeError, match="closed"): + mgr.deploy(FakeDockerModule) + + +class TestModuleCoordinatorDockerRouting: + @patch("dimos.core.docker_worker_manager.DockerModule") + @patch("dimos.core.module_coordinator.WorkerManager") + def test_deploy_routes_docker_module_to_docker_manager( + 
self, mock_worker_manager_cls, mock_docker_module_cls + ): + mock_worker_mgr = MagicMock() + mock_worker_manager_cls.return_value = mock_worker_mgr + + mock_dm = MagicMock() + mock_docker_module_cls.return_value = mock_dm + + coordinator = ModuleCoordinator() + coordinator.start() + + result = coordinator.deploy(FakeDockerModule) + + # Should NOT go through worker manager + mock_worker_mgr.deploy.assert_not_called() + # Should create a DockerModule + mock_docker_module_cls.assert_called_once_with(FakeDockerModule) + assert result is mock_dm + # Should be tracked + assert coordinator.get_instance(FakeDockerModule) is mock_dm + + coordinator.stop() + + @patch("dimos.core.module_coordinator.WorkerManager") + def test_deploy_routes_regular_module_to_worker_manager(self, mock_worker_manager_cls): + mock_worker_mgr = MagicMock() + mock_worker_manager_cls.return_value = mock_worker_mgr + mock_proxy = MagicMock() + mock_worker_mgr.deploy.return_value = mock_proxy + + coordinator = ModuleCoordinator() + coordinator.start() + + result = coordinator.deploy(FakeRegularModule) + + mock_worker_mgr.deploy.assert_called_once_with(FakeRegularModule) + assert result is mock_proxy + + coordinator.stop() + + @patch("dimos.core.docker_worker_manager.DockerModule") + @patch("dimos.core.module_coordinator.WorkerManager") + def test_deploy_parallel_separates_docker_and_regular( + self, mock_worker_manager_cls, mock_docker_module_cls + ): + mock_worker_mgr = MagicMock() + mock_worker_manager_cls.return_value = mock_worker_mgr + + regular_proxy = MagicMock() + mock_worker_mgr.deploy_parallel.return_value = [regular_proxy] + + mock_dm = MagicMock() + mock_docker_module_cls.return_value = mock_dm + + coordinator = ModuleCoordinator() + coordinator.start() + + specs = [ + (FakeRegularModule, (), {}), + (FakeDockerModule, (), {}), + ] + results = coordinator.deploy_parallel(specs) + + # Regular module goes through worker manager + 
mock_worker_mgr.deploy_parallel.assert_called_once_with([(FakeRegularModule, (), {})]) + # Docker module gets its own DockerModule + mock_docker_module_cls.assert_called_once_with(FakeDockerModule) + + # Results are in original order + assert results[0] is regular_proxy + assert results[1] is mock_dm + + coordinator.stop() + + @patch("dimos.core.docker_worker_manager.DockerModule") + @patch("dimos.core.module_coordinator.WorkerManager") + def test_stop_cleans_up_docker_modules(self, mock_worker_manager_cls, mock_docker_module_cls): + mock_worker_mgr = MagicMock() + mock_worker_manager_cls.return_value = mock_worker_mgr + + mock_dm = MagicMock() + mock_docker_module_cls.return_value = mock_dm + + coordinator = ModuleCoordinator() + coordinator.start() + coordinator.deploy(FakeDockerModule) + coordinator.stop() + + # The deployed module's stop() is called during coordinator.stop() loop + mock_dm.stop.assert_called() + # Worker manager also closed + mock_worker_mgr.close_all.assert_called_once() diff --git a/examples/docker_hello_world/hello_docker.py b/examples/docker_hello_world/hello_docker.py index c6a5f0bb3e..871be6f5d2 100644 --- a/examples/docker_hello_world/hello_docker.py +++ b/examples/docker_hello_world/hello_docker.py @@ -31,11 +31,11 @@ from __future__ import annotations +from dataclasses import dataclass, field from pathlib import Path import subprocess import time -from dimos.core.blueprints import autoconnect from dimos.core.core import rpc from dimos.core.docker_runner import DockerModuleConfig from dimos.core.module import Module @@ -46,6 +46,7 @@ # --------------------------------------------------------------------------- +@dataclass(kw_only=True) class HelloDockerConfig(DockerModuleConfig): docker_image: str = "dimos-hello-docker:latest" docker_file: Path | None = Path(__file__).parent / "Dockerfile" @@ -53,7 +54,7 @@ class HelloDockerConfig(DockerModuleConfig): docker_gpus: str | None = None # no GPU needed docker_rm: bool = True 
docker_restart_policy: str = "no" - docker_env: dict[str, str] = {"CI": "1"} # skip interactive system configurator + docker_env: dict[str, str] = field(default_factory=lambda: {"CI": "1"}) class HelloDockerModule(Module["HelloDockerConfig"]): @@ -114,6 +115,8 @@ def _on_greeting(self, text: str) -> None: # --------------------------------------------------------------------------- if __name__ == "__main__": + from dimos.core.blueprints import autoconnect + coordinator = autoconnect( PromptModule.blueprint(), HelloDockerModule.blueprint(), @@ -130,5 +133,5 @@ def _on_greeting(self, text: str) -> None: prompt_mod.prompt.publish("stream test") time.sleep(2) - coordinator.close_all() + coordinator.stop() print("Done!") diff --git a/pyproject.toml b/pyproject.toml index cb4607ced5..55eb570836 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -294,6 +294,8 @@ docker = [ "sortedcontainers", "PyTurboJPEG", "rerun-sdk", + "langchain-core", + "typing_extensions", "open3d-unofficial-arm; platform_system == 'Linux' and platform_machine == 'aarch64'", "open3d>=0.18.0; platform_system != 'Linux' or platform_machine != 'aarch64'", ] diff --git a/uv.lock b/uv.lock index 2f53ef0e6f..a7e9070a7d 100644 --- a/uv.lock +++ b/uv.lock @@ -1848,6 +1848,7 @@ dev = [ ] docker = [ { name = "dimos-lcm" }, + { name = "langchain-core" }, { name = "lcm" }, { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, @@ -1865,6 +1866,7 @@ docker = [ { name = "sortedcontainers" }, { name = "structlog" }, { name = "typer" }, + { name = "typing-extensions" }, ] drone = [ { name = "pymavlink" }, @@ -2003,6 +2005,7 @@ requires-dist = [ { name = "langchain", marker = "extra == 'agents'", specifier = "==1.2.3" }, { name = "langchain-chroma", marker = "extra == 'agents'", specifier = ">=1,<2" }, { name = 
"langchain-core", marker = "extra == 'agents'", specifier = "==1.2.3" }, + { name = "langchain-core", marker = "extra == 'docker'" }, { name = "langchain-huggingface", marker = "extra == 'agents'", specifier = ">=1,<2" }, { name = "langchain-ollama", marker = "extra == 'agents'", specifier = ">=1,<2" }, { name = "langchain-openai", marker = "extra == 'agents'", specifier = ">=1,<2" }, @@ -2118,6 +2121,7 @@ requires-dist = [ { name = "types-tabulate", marker = "extra == 'dev'", specifier = ">=0.9.0.20241207,<1" }, { name = "types-tensorflow", marker = "extra == 'dev'", specifier = ">=2.18.0.20251008,<3" }, { name = "types-tqdm", marker = "extra == 'dev'", specifier = ">=4.67.0.20250809,<5" }, + { name = "typing-extensions", marker = "extra == 'docker'" }, { name = "ultralytics", marker = "extra == 'perception'", specifier = ">=8.3.70" }, { name = "unitree-webrtc-connect-leshy", marker = "extra == 'unitree'", specifier = ">=2.0.7" }, { name = "uvicorn", marker = "extra == 'web'", specifier = ">=0.34.0" }, From f559ff860ae89fd9e7e68cdc499fb2f784c7c284 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 4 Mar 2026 22:33:41 -0800 Subject: [PATCH 04/52] fix rerun imports --- dimos/core/docker_runner.py | 5 +---- dimos/visualization/rerun/bridge.py | 3 --- dimos/visualization/rerun/constants.py | 17 +++++++++++++++++ 3 files changed, 18 insertions(+), 7 deletions(-) create mode 100644 dimos/visualization/rerun/constants.py diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 566e28a70e..2735b0cefe 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -28,10 +28,7 @@ from dimos.core.module import ModuleConfig from dimos.core.rpc_client import RpcCall from dimos.utils.logging_config import setup_logger - -# Inlined from dimos.visualization.rerun.bridge to avoid heavy import chain in containers -RERUN_GRPC_PORT = 9876 -RERUN_WEB_PORT = 9090 +from dimos.visualization.rerun.constants import RERUN_GRPC_PORT, RERUN_WEB_PORT if 
TYPE_CHECKING: from collections.abc import Callable diff --git a/dimos/visualization/rerun/bridge.py b/dimos/visualization/rerun/bridge.py index 47bce27dcf..420ffd1769 100644 --- a/dimos/visualization/rerun/bridge.py +++ b/dimos/visualization/rerun/bridge.py @@ -39,9 +39,6 @@ from dimos.protocol.pubsub.patterns import Glob, pattern_matches from dimos.utils.logging_config import setup_logger -RERUN_GRPC_PORT = 9876 -RERUN_WEB_PORT = 9090 - # TODO OUT visual annotations # # In the future it would be nice if modules can annotate their individual OUTs with (general or rerun specific) diff --git a/dimos/visualization/rerun/constants.py b/dimos/visualization/rerun/constants.py new file mode 100644 index 0000000000..e1c98176ad --- /dev/null +++ b/dimos/visualization/rerun/constants.py @@ -0,0 +1,17 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# isolated so that they can be imported into lightweight modules without importing all of rerun +RERUN_GRPC_PORT = 9876 +RERUN_WEB_PORT = 9090 From 13acbf5fe8f76c140080d2c04177d3b53a2f9ed2 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 4 Mar 2026 22:37:43 -0800 Subject: [PATCH 05/52] fixup imports --- dimos/core/module_coordinator.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index dae1760b9e..155ffb28db 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -18,13 +18,14 @@ import threading from typing import TYPE_CHECKING, Any +from dimos.core.docker_runner import is_docker_module +from dimos.core.docker_worker_manager import DockerWorkerManager from dimos.core.global_config import GlobalConfig, global_config from dimos.core.resource import Resource from dimos.core.worker_manager import WorkerManager from dimos.utils.logging_config import setup_logger if TYPE_CHECKING: - from dimos.core.docker_worker_manager import DockerWorkerManager from dimos.core.module import Module, ModuleT from dimos.core.resource_monitor.monitor import StatsMonitor from dimos.core.rpc_client import ModuleProxy @@ -52,8 +53,6 @@ def __init__( self._deployed_modules = {} def start(self) -> None: - from dimos.core.docker_worker_manager import DockerWorkerManager - n = self._n if self._n is not None else 2 self._client = WorkerManager(n_workers=n) self._client.start() @@ -86,9 +85,6 @@ def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: if not self._client: raise ValueError("Trying to dimos.deploy before the client has started") - from dimos.core.docker_runner import is_docker_module - from dimos.core.docker_worker_manager import DockerWorkerManager - if is_docker_module(module_class): if not self._docker_client: self._docker_client = DockerWorkerManager() @@ -105,9 +101,6 @@ def deploy_parallel( if not self._client: raise 
ValueError("Not started") - from dimos.core.docker_runner import is_docker_module - from dimos.core.docker_worker_manager import DockerWorkerManager - # Separate docker modules from regular modules docker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] worker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] From 5ab56d5a67d277c30c29a5502ca775182196ade2 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 4 Mar 2026 22:48:14 -0800 Subject: [PATCH 06/52] fixup --- dimos/core/docker_runner.py | 9 ++++++++- dimos/core/docker_worker_manager.py | 8 +++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 2735b0cefe..f6bbd98325 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -419,7 +419,14 @@ def _build_container_command(self, cfg: DockerModuleConfig) -> list[str]: if main_file: import pathlib - rel = pathlib.Path(main_file).resolve().relative_to(pathlib.Path.cwd()) + try: + rel = pathlib.Path(main_file).resolve().relative_to(pathlib.Path.cwd()) + except ValueError: + raise RuntimeError( + f"Cannot derive module path: '{main_file}' is not under cwd " + f"'{pathlib.Path.cwd()}'. " + "Run with `python -m` or set docker_command explicitly." 
+ ) from None module_name = str(rel.with_suffix("")).replace("/", ".") else: raise RuntimeError( diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py index 97f27a6d7a..bd432f18e2 100644 --- a/dimos/core/docker_worker_manager.py +++ b/dimos/core/docker_worker_manager.py @@ -14,6 +14,7 @@ from __future__ import annotations +from contextlib import suppress from typing import TYPE_CHECKING, Any from dimos.core.docker_runner import DockerModule @@ -38,7 +39,12 @@ def deploy(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Docke logger.info("Deploying module in Docker.", module=module_class.__name__) dm = DockerModule(module_class, *args, **kwargs) - dm.start() # Docker modules must be running before streams/RPC can be wired + try: + dm.start() # Docker modules must be running before streams/RPC can be wired + except Exception: + with suppress(Exception): + dm.stop() + raise self._docker_modules.append(dm) return dm From 4fcd2bc8680c47b48c4f66f113bdbe9f1d7ce93a Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 4 Mar 2026 22:52:39 -0800 Subject: [PATCH 07/52] simplify stop logic --- dimos/core/docker_worker_manager.py | 21 --------------------- dimos/core/module_coordinator.py | 2 -- 2 files changed, 23 deletions(-) diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py index bd432f18e2..8e368d15a8 100644 --- a/dimos/core/docker_worker_manager.py +++ b/dimos/core/docker_worker_manager.py @@ -31,13 +31,8 @@ class DockerWorkerManager: def __init__(self) -> None: self._docker_modules: list[DockerModule] = [] - self._closed = False def deploy(self, module_class: type[Module], *args: Any, **kwargs: Any) -> DockerModule: - if self._closed: - raise RuntimeError("DockerWorkerManager is closed") - - logger.info("Deploying module in Docker.", module=module_class.__name__) dm = DockerModule(module_class, *args, **kwargs) try: dm.start() # Docker modules must be running before streams/RPC can be wired 
@@ -45,20 +40,4 @@ def deploy(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Docke with suppress(Exception): dm.stop() raise - self._docker_modules.append(dm) return dm - - def close_all(self) -> None: - if self._closed: - return - self._closed = True - - logger.info("Stopping all Docker modules...") - for dm in reversed(self._docker_modules): - try: - dm.stop() - except Exception: - logger.error("Error stopping Docker module", exc_info=True) - - self._docker_modules.clear() - logger.info("All Docker modules stopped.") diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 155ffb28db..97541640dc 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -77,8 +77,6 @@ def stop(self) -> None: logger.error("Error stopping module", module=module_class.__name__, exc_info=True) logger.info("Module stopped.", module=module_class.__name__) - if self._docker_client is not None: - self._docker_client.close_all() self._client.close_all() # type: ignore[union-attr] def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: # type: ignore[no-untyped-def] From 057a3732e057552aef909ad67876d2339cf60011 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 4 Mar 2026 23:10:06 -0800 Subject: [PATCH 08/52] simplify and explain --- dimos/core/docker_worker_manager.py | 43 ---------- dimos/core/module_coordinator.py | 36 ++++++--- dimos/core/tests/test_docker_deployment.py | 91 ++++++++-------------- 3 files changed, 57 insertions(+), 113 deletions(-) delete mode 100644 dimos/core/docker_worker_manager.py diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py deleted file mode 100644 index 8e368d15a8..0000000000 --- a/dimos/core/docker_worker_manager.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright 2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import annotations - -from contextlib import suppress -from typing import TYPE_CHECKING, Any - -from dimos.core.docker_runner import DockerModule -from dimos.utils.logging_config import setup_logger - -if TYPE_CHECKING: - from dimos.core.module import Module - -logger = setup_logger() - - -class DockerWorkerManager: - """Manages DockerModule instances, mirroring WorkerManager's interface for docker-based modules.""" - - def __init__(self) -> None: - self._docker_modules: list[DockerModule] = [] - - def deploy(self, module_class: type[Module], *args: Any, **kwargs: Any) -> DockerModule: - dm = DockerModule(module_class, *args, **kwargs) - try: - dm.start() # Docker modules must be running before streams/RPC can be wired - except Exception: - with suppress(Exception): - dm.stop() - raise - return dm diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 97541640dc..25f8fdbc22 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -18,8 +18,7 @@ import threading from typing import TYPE_CHECKING, Any -from dimos.core.docker_runner import is_docker_module -from dimos.core.docker_worker_manager import DockerWorkerManager +from dimos.core.docker_runner import DockerModule, is_docker_module from dimos.core.global_config import GlobalConfig, global_config from dimos.core.resource import Resource from dimos.core.worker_manager import WorkerManager @@ -35,7 +34,6 @@ class ModuleCoordinator(Resource): # type: ignore[misc] _client: WorkerManager | None = None - _docker_client: 
DockerWorkerManager | None = None _global_config: GlobalConfig _n: int | None = None _memory_limit: str = "auto" @@ -56,7 +54,6 @@ def start(self) -> None: n = self._n if self._n is not None else 2 self._client = WorkerManager(n_workers=n) self._client.start() - self._docker_client = DockerWorkerManager() if self._global_config.dtop: from dimos.core.resource_monitor.monitor import StatsMonitor @@ -79,14 +76,30 @@ def stop(self) -> None: self._client.close_all() # type: ignore[union-attr] + def _deploy_docker(self, module_class: type[Module], *args: Any, **kwargs: Any) -> DockerModule: + from contextlib import suppress + + logger.info("Deploying module in Docker.", module=module_class.__name__) + dm = DockerModule(module_class, *args, **kwargs) + try: + # why are docker modules started here? shouldn't they be started in start_all_modules? + # this is a bigger design problem we have with how blueprints, ModuleCoordinator, and WorkerManager are leaky abstractions with imperfect boundaries + # the Stream/RPC wiring (in blueprints) happens after deploy but before start. For docker modules, wiring needs the container's LCM transport to be reachable — which requires the container to be running. 
+ # self.rpc.call_sync() send an RPC call to the container during wiring, the container must be running to handle that + # if we defer start() to start_all_modules, the container won't be up yet when _connect_streams and _connect_rpc_methods try to wire things + dm.start() + except Exception: + with suppress(Exception): + dm.stop() + raise + return dm + def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: # type: ignore[no-untyped-def] if not self._client: raise ValueError("Trying to dimos.deploy before the client has started") if is_docker_module(module_class): - if not self._docker_client: - self._docker_client = DockerWorkerManager() - module = self._docker_client.deploy(module_class, *args, **kwargs) # type: ignore[assignment] + module = self._deploy_docker(module_class, *args, **kwargs) # type: ignore[assignment] else: module = self._client.deploy(module_class, *args, **kwargs) # type: ignore[union-attr, attr-defined, assignment] @@ -119,9 +132,7 @@ def deploy_parallel( # Deploy docker modules (each gets its own DockerModule) docker_results: list[Any] = [] for module_class, args, kwargs in docker_specs: - if not self._docker_client: - self._docker_client = DockerWorkerManager() - dm = self._docker_client.deploy(module_class, *args, **kwargs) + dm = self._deploy_docker(module_class, *args, **kwargs) docker_results.append(dm) # Reassemble results in original order @@ -137,9 +148,10 @@ def deploy_parallel( return results # type: ignore[return-value] def start_all_modules(self) -> None: - modules = list(self._deployed_modules.values()) + # Docker modules are already started during deploy, (see their deploy as to why this is) + modules = [m for cls, m in self._deployed_modules.items() if not is_docker_module(cls)] if isinstance(self._client, WorkerManager): - with ThreadPoolExecutor(max_workers=len(modules)) as executor: + with ThreadPoolExecutor(max_workers=max(len(modules), 1)) as executor: list(executor.map(lambda m: m.start(), 
modules)) else: for module in modules: diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index 85f2b0508a..99c1debbb6 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -16,7 +16,7 @@ Smoke tests for Docker module deployment routing. These tests verify that the ModuleCoordinator correctly detects and routes -docker modules to the DockerWorkerManager WITHOUT actually running Docker. +docker modules to DockerModule WITHOUT actually running Docker. """ from __future__ import annotations @@ -28,7 +28,6 @@ import pytest from dimos.core.docker_runner import DockerModuleConfig, is_docker_module -from dimos.core.docker_worker_manager import DockerWorkerManager from dimos.core.module import Module from dimos.core.module_coordinator import ModuleCoordinator from dimos.core.stream import Out @@ -78,59 +77,10 @@ class Bare(Module): assert is_docker_module(Bare) is False -class TestDockerWorkerManager: - @patch("dimos.core.docker_worker_manager.DockerModule") - def test_deploy_creates_docker_module(self, mock_docker_module_cls): - mock_instance = MagicMock() - mock_docker_module_cls.return_value = mock_instance - - mgr = DockerWorkerManager() - result = mgr.deploy(FakeDockerModule, some_kwarg="value") - - mock_docker_module_cls.assert_called_once_with(FakeDockerModule, some_kwarg="value") - assert result is mock_instance - assert len(mgr._docker_modules) == 1 - - @patch("dimos.core.docker_worker_manager.DockerModule") - def test_close_all_stops_in_reverse_order(self, mock_docker_module_cls): - dm1 = MagicMock() - dm2 = MagicMock() - mock_docker_module_cls.side_effect = [dm1, dm2] - - mgr = DockerWorkerManager() - mgr.deploy(FakeDockerModule) - mgr.deploy(FakeDockerModule) - mgr.close_all() - - # Stopped in reverse order - assert dm2.stop.call_count == 1 - assert dm1.stop.call_count == 1 - assert dm2.stop.called - assert dm1.stop.called - assert len(mgr._docker_modules) == 
0 - - @patch("dimos.core.docker_worker_manager.DockerModule") - def test_close_all_idempotent(self, mock_docker_module_cls): - mock_docker_module_cls.return_value = MagicMock() - mgr = DockerWorkerManager() - mgr.deploy(FakeDockerModule) - mgr.close_all() - mgr.close_all() # second call should be no-op - - @patch("dimos.core.docker_worker_manager.DockerModule") - def test_deploy_after_close_raises(self, mock_docker_module_cls): - mgr = DockerWorkerManager() - mgr.close_all() - with pytest.raises(RuntimeError, match="closed"): - mgr.deploy(FakeDockerModule) - - class TestModuleCoordinatorDockerRouting: - @patch("dimos.core.docker_worker_manager.DockerModule") + @patch("dimos.core.module_coordinator.DockerModule") @patch("dimos.core.module_coordinator.WorkerManager") - def test_deploy_routes_docker_module_to_docker_manager( - self, mock_worker_manager_cls, mock_docker_module_cls - ): + def test_deploy_routes_docker_module(self, mock_worker_manager_cls, mock_docker_module_cls): mock_worker_mgr = MagicMock() mock_worker_manager_cls.return_value = mock_worker_mgr @@ -144,14 +94,38 @@ def test_deploy_routes_docker_module_to_docker_manager( # Should NOT go through worker manager mock_worker_mgr.deploy.assert_not_called() - # Should create a DockerModule + # Should create a DockerModule and start it mock_docker_module_cls.assert_called_once_with(FakeDockerModule) + mock_dm.start.assert_called_once() assert result is mock_dm # Should be tracked assert coordinator.get_instance(FakeDockerModule) is mock_dm coordinator.stop() + @patch("dimos.core.module_coordinator.DockerModule") + @patch("dimos.core.module_coordinator.WorkerManager") + def test_deploy_docker_cleans_up_on_start_failure( + self, mock_worker_manager_cls, mock_docker_module_cls + ): + mock_worker_mgr = MagicMock() + mock_worker_manager_cls.return_value = mock_worker_mgr + + mock_dm = MagicMock() + mock_dm.start.side_effect = RuntimeError("start failed") + mock_docker_module_cls.return_value = mock_dm + + 
coordinator = ModuleCoordinator() + coordinator.start() + + with pytest.raises(RuntimeError, match="start failed"): + coordinator.deploy(FakeDockerModule) + + # stop() called to clean up the failed container + mock_dm.stop.assert_called_once() + + coordinator.stop() + @patch("dimos.core.module_coordinator.WorkerManager") def test_deploy_routes_regular_module_to_worker_manager(self, mock_worker_manager_cls): mock_worker_mgr = MagicMock() @@ -169,7 +143,7 @@ def test_deploy_routes_regular_module_to_worker_manager(self, mock_worker_manage coordinator.stop() - @patch("dimos.core.docker_worker_manager.DockerModule") + @patch("dimos.core.module_coordinator.DockerModule") @patch("dimos.core.module_coordinator.WorkerManager") def test_deploy_parallel_separates_docker_and_regular( self, mock_worker_manager_cls, mock_docker_module_cls @@ -196,6 +170,7 @@ def test_deploy_parallel_separates_docker_and_regular( mock_worker_mgr.deploy_parallel.assert_called_once_with([(FakeRegularModule, (), {})]) # Docker module gets its own DockerModule mock_docker_module_cls.assert_called_once_with(FakeDockerModule) + mock_dm.start.assert_called_once() # Results are in original order assert results[0] is regular_proxy @@ -203,7 +178,7 @@ def test_deploy_parallel_separates_docker_and_regular( coordinator.stop() - @patch("dimos.core.docker_worker_manager.DockerModule") + @patch("dimos.core.module_coordinator.DockerModule") @patch("dimos.core.module_coordinator.WorkerManager") def test_stop_cleans_up_docker_modules(self, mock_worker_manager_cls, mock_docker_module_cls): mock_worker_mgr = MagicMock() @@ -217,7 +192,7 @@ def test_stop_cleans_up_docker_modules(self, mock_worker_manager_cls, mock_docke coordinator.deploy(FakeDockerModule) coordinator.stop() - # The deployed module's stop() is called during coordinator.stop() loop - mock_dm.stop.assert_called() + # stop() called exactly once (no double cleanup) + assert mock_dm.stop.call_count == 1 # Worker manager also closed 
mock_worker_mgr.close_all.assert_called_once() From 002725811a6ad73e2e42634c1f20d8d7c98772e7 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 4 Mar 2026 23:26:39 -0800 Subject: [PATCH 09/52] parallel start of docker modules --- dimos/core/module_coordinator.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 25f8fdbc22..b16812a4dd 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -129,11 +129,16 @@ def deploy_parallel( # Deploy worker modules in parallel via WorkerManager worker_results = self._client.deploy_parallel(worker_specs) if worker_specs else [] - # Deploy docker modules (each gets its own DockerModule) - docker_results: list[Any] = [] - for module_class, args, kwargs in docker_specs: - dm = self._deploy_docker(module_class, *args, **kwargs) - docker_results.append(dm) + # Deploy docker modules in parallel (each starts its own container) + if docker_specs: + with ThreadPoolExecutor(max_workers=len(docker_specs)) as executor: + futures = [ + executor.submit(self._deploy_docker, module_class, *args, **kwargs) + for module_class, args, kwargs in docker_specs + ] + docker_results: list[Any] = [f.result() for f in futures] + else: + docker_results: list[Any] = [] # Reassemble results in original order results: list[Any] = [] From f685fc0804f7ea5fb706a9a88848547982a1feae Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 4 Mar 2026 23:33:09 -0800 Subject: [PATCH 10/52] fix container name to be stable --- dimos/core/docker_runner.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index f6bbd98325..1fc281c035 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -18,7 +18,6 @@ from dataclasses import dataclass, field import importlib import json -import os import signal import subprocess import 
threading @@ -181,9 +180,8 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self._kwargs = kwargs self._running = False self.remote_name = module_class.__name__ - self._container_name = ( - config.docker_container_name - or f"dimos_{module_class.__name__.lower()}_{os.getpid()}_{int(time.time())}" + self._container_name = config.docker_container_name or self._default_container_name( + module_class, config ) # RPC setup (lazy import to keep container-side imports light) @@ -202,6 +200,16 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non logger.info(f"Building {config.docker_image}") build_image(config) + @staticmethod + def _default_container_name(module_class: type[Module], config: DockerModuleConfig) -> str: + import hashlib + + name = module_class.__name__.lower() + path_hash = hashlib.sha256( + str(config.docker_file.resolve()).encode() # type: ignore[union-attr] + ).hexdigest()[:12] + return f"dimos_{name}_{path_hash}" + def set_rpc_method(self, method: str, callable: RpcCall) -> None: callable.set_rpc(self.rpc) self._bound_rpc_calls[method] = callable From c8276e1ca42122f59f2b3bd52f692b14c21bd81b Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 4 Mar 2026 23:51:12 -0800 Subject: [PATCH 11/52] lazy import --- dimos/core/o3dpickle.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/dimos/core/o3dpickle.py b/dimos/core/o3dpickle.py index 1912ab7739..1c1464fece 100644 --- a/dimos/core/o3dpickle.py +++ b/dimos/core/o3dpickle.py @@ -14,25 +14,34 @@ import copyreg -import numpy as np -import open3d as o3d # type: ignore[import-untyped] - +# open3d is imported lazily (inside functions) rather than at module level. +# dimos.core.core imports this module just to register pickle handlers, and core is +# imported by almost everything — including lightweight docker modules that don't use +# open3d. 
A module-level import would drag in open3d's sklearn/scipy chain everywhere, +# which crashes in environments where those packages aren't installed or version-matched. +# (i.e. minimal docker envs) def reduce_external(obj): # type: ignore[no-untyped-def] + import numpy as np + # Convert Vector3dVector to numpy array for pickling points_array = np.asarray(obj.points) return (reconstruct_pointcloud, (points_array,)) def reconstruct_pointcloud(points_array): # type: ignore[no-untyped-def] - # Create new PointCloud and assign the points + import open3d as o3d # type: ignore[import-untyped] + pc = o3d.geometry.PointCloud() pc.points = o3d.utility.Vector3dVector(points_array) return pc def register_picklers() -> None: - # Register for the actual PointCloud class that gets instantiated - # We need to create a dummy PointCloud to get its actual class + try: + import open3d as o3d # type: ignore[import-untyped] + except ImportError: + return # open3d not installed in this environment; skip registration + _dummy_pc = o3d.geometry.PointCloud() copyreg.pickle(_dummy_pc.__class__, reduce_external) From 971d2f75e3131239ff97208ba322fda25bffe27a Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 10:08:21 -0800 Subject: [PATCH 12/52] clean up --- dimos/core/docker_runner.py | 139 ++++++++++++-------- dimos/core/module.py | 25 +++- dimos/core/module_coordinator.py | 89 ++++--------- dimos/core/o3dpickle.py | 21 +-- dimos/core/tests/test_docker_deployment.py | 21 ++- examples/docker_hello_world/hello_docker.py | 7 +- 6 files changed, 155 insertions(+), 147 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 1fc281c035..c6a196b7a7 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -26,6 +26,7 @@ from dimos.core.module import ModuleConfig from dimos.core.rpc_client import RpcCall +from dimos.protocol.rpc import LCMRPC from dimos.utils.logging_config import setup_logger from dimos.visualization.rerun.constants 
import RERUN_GRPC_PORT, RERUN_WEB_PORT @@ -139,6 +140,32 @@ def _tail_logs(cfg: DockerModuleConfig, name: str, n: int = LOG_TAIL_LINES) -> s return out + ("\n" + err if err else "") +def _prompt_restart(container_name: str) -> bool: + """Ask the user whether to restart a running container. + + Returns True to restart, False to reuse. + Falls back to restart when stdin is not a TTY (e.g. CI). + """ + import sys + + if not sys.stdin.isatty(): + logger.warning( + f"Container '{container_name}' already running — restarting (non-interactive)." + ) + return True + + print(f"\nContainer '{container_name}' is already running.") + print(" [r] Restart — stop the existing container and start a fresh one") + print(" [u] Use — attach to the existing container as-is") + while True: + choice = input("Choice [r/u]: ").strip().lower() + if choice in ("r", "restart"): + return True + if choice in ("u", "use"): + return False + print("Please enter 'r' or 'u'.") + + def _extract_module_config(cfg: DockerModuleConfig) -> dict[str, Any]: """Extract JSON-serializable config fields for the container (excludes docker_* fields).""" out: dict[str, Any] = {} @@ -161,21 +188,22 @@ class DockerModule: Host-side handle for a module running inside Docker. Lifecycle: - - start(): launches container, waits for module ready via RPC - - stop(): stops container - - __getattr__: exposes RpcCall for @rpc methods on remote module + - start(): builds the image if needed, launches the container, waits for readiness, calls the remote module's start() RPC (after streams are wired) + - stop(): stops the container and cleans up Communication: All RPC happens via LCM multicast (requires --network=host). 
""" + config : DockerModuleConfig def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> None: # Config config_class = getattr(module_class, "default_config", DockerModuleConfig) + assert issubclass(config_class, DockerModuleConfig) config = config_class(**kwargs) - + # Module info self._module_class = module_class - self._config = config + self.config = config self._args = args self._kwargs = kwargs self._running = False @@ -184,21 +212,13 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non module_class, config ) - # RPC setup (lazy import to keep container-side imports light) - from dimos.protocol.rpc import LCMRPC self.rpc = LCMRPC() self.rpcs = set(module_class.rpcs.keys()) # type: ignore[attr-defined] self.rpc_calls: list[str] = getattr(module_class, "rpc_calls", []) self._unsub_fns: list[Callable[[], None]] = [] self._bound_rpc_calls: dict[str, RpcCall] = {} - - # Build image if needed (but don't start - caller must call start() explicitly) - from dimos.core.docker_build import build_image, image_exists - - if not image_exists(config): - logger.info(f"Building {config.docker_image}") - build_image(config) + self._deferred_transports: dict[str, str] = {} # stream_name -> topic @staticmethod def _default_container_name(module_class: type[Module], config: DockerModuleConfig) -> str: @@ -210,44 +230,56 @@ def _default_container_name(module_class: type[Module], config: DockerModuleConf ).hexdigest()[:12] return f"dimos_{name}_{path_hash}" + def get_rpc_method_names(self) -> list[str]: + return self.rpc_calls + def set_rpc_method(self, method: str, callable: RpcCall) -> None: callable.set_rpc(self.rpc) self._bound_rpc_calls[method] = callable def get_rpc_calls(self, *methods: str) -> RpcCall | tuple[RpcCall, ...]: - # Check all requested methods exist missing = set(methods) - self._bound_rpc_calls.keys() if missing: raise ValueError(f"RPC methods not found: {missing}") - # Return single RpcCall or tuple calls = 
tuple(self._bound_rpc_calls[m] for m in methods) return calls[0] if len(calls) == 1 else calls def start(self) -> None: - if self._running: - return + """Invoke the remote module's start() RPC. - cfg = self._config + Called after stream transports are wired so the module can subscribe + to its streams with valid transports. + """ + from dimos.core.docker_build import build_image, image_exists - # Prevent accidental kill of running container with same name - if _is_container_running(cfg, self._container_name): - raise RuntimeError( - f"Container '{self._container_name}' already running. " - "Choose a different container_name or stop the existing container." - ) - _remove_container(cfg, self._container_name) - - cmd = self._build_docker_run_command() - logger.info(f"Starting docker container: {self._container_name}") - r = _run(cmd, timeout=DOCKER_RUN_TIMEOUT) - if r.returncode != 0: - raise RuntimeError( - f"Failed to start container.\nSTDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" - ) + if not image_exists(self.config): + logger.info(f"Building {self.config.docker_image}") + build_image(self.config) + try: - self.rpc.start() - self._running = True - self._wait_for_ready() + cfg = self.config + if _is_container_running(cfg, self._container_name): + restart = _prompt_restart(self._container_name) + if restart: + _run([_docker_bin(self.config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) + _remove_container(cfg, self._container_name) + + cmd = self._build_docker_run_command() + logger.info(f"Starting docker container: {self._container_name}") + r = _run(cmd, timeout=DOCKER_RUN_TIMEOUT) + if r.returncode != 0: + raise RuntimeError( + f"Failed to start container.\nSTDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" + ) + + self.rpc.start() + self._running = True + self._configure_streams(self._deferred_transports) + self.rpc.call_sync(f"{self.remote_name}/start", ([], {})) + except Exception: + with suppress(Exception): + self.stop() + raise def stop(self) -> None: 
"""Gracefully stop the Docker container and clean up resources.""" @@ -263,13 +295,13 @@ def stop(self) -> None: self._unsub_fns.clear() # Stop and remove container - _run([_docker_bin(self._config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) - _remove_container(self._config, self._container_name) + _run([_docker_bin(self.config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) + _remove_container(self.config, self._container_name) self._running = False logger.info(f"Stopped container: {self._container_name}") def status(self) -> dict[str, Any]: - cfg = self._config + cfg = self.config return { "module": self.remote_name, "container_name": self._container_name, @@ -278,19 +310,17 @@ def status(self) -> dict[str, Any]: } def tail_logs(self, n: int = 200) -> str: - return _tail_logs(self._config, self._container_name, n=n) + return _tail_logs(self.config, self._container_name, n=n) def set_transport(self, stream_name: str, transport: Any) -> bool: - """Configure stream transport in container. 
Mirrors Module.set_transport() for autoconnect().""" + """Defer stream transport config until start() when the container is running.""" topic = getattr(transport, "topic", None) if topic is None: return False if hasattr(topic, "topic"): topic = topic.topic - result, _ = self.rpc.call_sync( - f"{self.remote_name}/configure_stream", ([stream_name, str(topic)], {}) - ) - return bool(result) + self._deferred_transports[stream_name] = str(topic) + return True def __getattr__(self, name: str) -> Any: if name in self.rpcs: @@ -302,7 +332,7 @@ def __getattr__(self, name: str) -> Any: def _build_docker_run_command(self) -> list[str]: """Build the complete `docker run` command.""" - cfg = self._config + cfg = self.config self._validate_config(cfg) cmd = [_docker_bin(cfg), "run", "-d"] @@ -448,9 +478,13 @@ def _build_container_command(self, cfg: DockerModuleConfig) -> list[str]: # DimOS base image entrypoint already runs "dimos.core.docker_runner run" return ["--payload", json.dumps(payload, separators=(",", ":"))] - def _wait_for_ready(self) -> None: - """Poll the module's RPC endpoint until ready, crashed, or timeout.""" - cfg = self._config + def _configure_streams(self, streams: dict[str, str]) -> None: + """Poll configure_streams RPC until the container's RPC server is up, then wire streams. + + Also serves as the liveness gate — the first successful call proves the + container is ready to accept RPCs. 
+ """ + cfg = self.config start_time = time.time() logger.info(f"Waiting for {self.remote_name} to be ready...") @@ -462,13 +496,14 @@ def _wait_for_ready(self) -> None: try: self.rpc.call_sync( - f"{self.remote_name}/start", ([], {}), rpc_timeout=RPC_READY_TIMEOUT + f"{self.remote_name}/configure_streams", + ([streams], {}), + rpc_timeout=RPC_READY_TIMEOUT, ) elapsed = time.time() - start_time logger.info(f"{self.remote_name} ready ({elapsed:.1f}s)") return except (TimeoutError, ConnectionError, OSError): - # Module not ready yet - retry after poll interval time.sleep(cfg.docker_poll_interval) logs = _tail_logs(cfg, self._container_name) diff --git a/dimos/core/module.py b/dimos/core/module.py index 127be545fe..72df61d4c7 100644 --- a/dimos/core/module.py +++ b/dimos/core/module.py @@ -446,15 +446,26 @@ def set_transport(self, stream_name: str, transport: Transport) -> bool: # type return True @rpc - def configure_stream(self, stream_name: str, topic: str) -> bool: - """Configure a stream's transport by topic. Called by DockerModule for stream wiring.""" + def configure_streams(self, streams: dict[str, str]) -> dict[str, bool]: + """Configure stream transports in bulk by topic. Called by DockerModule for stream wiring. 
+ + Args: + streams: mapping of stream_name -> topic + + Returns: + mapping of stream_name -> success + """ from dimos.core.transport import pLCMTransport - stream = getattr(self, stream_name, None) - if not isinstance(stream, (Out, In)): - return False - stream._transport = pLCMTransport(topic) - return True + results: dict[str, bool] = {} + for stream_name, topic in streams.items(): + stream = getattr(self, stream_name, None) + if not isinstance(stream, (Out, In)): + results[stream_name] = False + else: + stream._transport = pLCMTransport(topic) + results[stream_name] = True + return results # called from remote def connect_stream(self, input_name: str, remote_stream: RemoteOut[T]): # type: ignore[no-untyped-def] diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index b16812a4dd..3d71e8776b 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -76,33 +76,14 @@ def stop(self) -> None: self._client.close_all() # type: ignore[union-attr] - def _deploy_docker(self, module_class: type[Module], *args: Any, **kwargs: Any) -> DockerModule: - from contextlib import suppress - - logger.info("Deploying module in Docker.", module=module_class.__name__) - dm = DockerModule(module_class, *args, **kwargs) - try: - # why are docker modules started here? shouldn't they be started in start_all_modules? - # this is a bigger design problem we have with how blueprints, ModuleCoordinator, and WorkerManager are leaky abstractions with imperfect boundaries - # the Stream/RPC wiring (in blueprints) happens after deploy but before start. For docker modules, wiring needs the container's LCM transport to be reachable — which requires the container to be running. 
- # self.rpc.call_sync() send an RPC call to the container during wiring, the container must be running to handle that - # if we defer start() to start_all_modules, the container won't be up yet when _connect_streams and _connect_rpc_methods try to wire things - dm.start() - except Exception: - with suppress(Exception): - dm.stop() - raise - return dm - def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: # type: ignore[no-untyped-def] if not self._client: raise ValueError("Trying to dimos.deploy before the client has started") - - if is_docker_module(module_class): - module = self._deploy_docker(module_class, *args, **kwargs) # type: ignore[assignment] - else: - module = self._client.deploy(module_class, *args, **kwargs) # type: ignore[union-attr, attr-defined, assignment] - + module = ( + DockerModule(module_class, *args, **kwargs) # type: ignore[assignment] + if is_docker_module(module_class) + else self._client.deploy(module_class, *args, **kwargs) # type: ignore[union-attr, attr-defined, assignment] + ) self._deployed_modules[module_class] = module # type: ignore[assignment] return module # type: ignore[return-value] @@ -112,49 +93,38 @@ def deploy_parallel( if not self._client: raise ValueError("Not started") - # Separate docker modules from regular modules - docker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] - worker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] - spec_indices: list[tuple[str, int]] = [] # ("docker"|"worker", index_in_sublist) - - for spec in module_specs: - module_class = spec[0] - if is_docker_module(module_class): - spec_indices.append(("docker", len(docker_specs))) - docker_specs.append(spec) - else: - spec_indices.append(("worker", len(worker_specs))) - worker_specs.append(spec) - - # Deploy worker modules in parallel via WorkerManager + docker_specs = [ + (module_class, args, kwargs) for module_class, args, kwargs in module_specs if 
is_docker_module(module_class) + ] + worker_specs = [ + (module_class, args, kwargs) for module_class, args, kwargs in module_specs if not is_docker_module(module_class) + ] + worker_results = self._client.deploy_parallel(worker_specs) if worker_specs else [] - # Deploy docker modules in parallel (each starts its own container) + docker_results: list[Any] = [] if docker_specs: with ThreadPoolExecutor(max_workers=len(docker_specs)) as executor: - futures = [ - executor.submit(self._deploy_docker, module_class, *args, **kwargs) - for module_class, args, kwargs in docker_specs - ] - docker_results: list[Any] = [f.result() for f in futures] - else: - docker_results: list[Any] = [] - - # Reassemble results in original order - results: list[Any] = [] - for kind, idx in spec_indices: - if kind == "docker": - results.append(docker_results[idx]) - else: - results.append(worker_results[idx]) + docker_results = list( + executor.map( + lambda spec: DockerModule(spec[0], *spec[1], **spec[2]), docker_specs + ) + ) + + # Reassemble in original order + worker_iter = iter(worker_results) + docker_iter = iter(docker_results) + results: list[Any] = [ + next(docker_iter) if is_docker_module(module_class) else next(worker_iter) + for module_class, _, _ in module_specs + ] for (module_class, _, _), module in zip(module_specs, results, strict=True): - self._deployed_modules[module_class] = module + self._deployed_modules[module_class] = module # type: ignore[assignment] return results # type: ignore[return-value] def start_all_modules(self) -> None: - # Docker modules are already started during deploy, (see their deploy as to why this is) - modules = [m for cls, m in self._deployed_modules.items() if not is_docker_module(cls)] + modules = list(self._deployed_modules.values()) if isinstance(self._client, WorkerManager): with ThreadPoolExecutor(max_workers=max(len(modules), 1)) as executor: list(executor.map(lambda m: m.start(), modules)) @@ -162,10 +132,9 @@ def start_all_modules(self) -> 
None: for module in modules: module.start() - module_list = list(self._deployed_modules.values()) for module in modules: if hasattr(module, "on_system_modules"): - module.on_system_modules(module_list) + module.on_system_modules(modules) def get_instance(self, module: type[ModuleT]) -> ModuleProxy: return self._deployed_modules.get(module) # type: ignore[return-value, no-any-return] diff --git a/dimos/core/o3dpickle.py b/dimos/core/o3dpickle.py index 1c1464fece..1912ab7739 100644 --- a/dimos/core/o3dpickle.py +++ b/dimos/core/o3dpickle.py @@ -14,34 +14,25 @@ import copyreg -# open3d is imported lazily (inside functions) rather than at module level. -# dimos.core.core imports this module just to register pickle handlers, and core is -# imported by almost everything — including lightweight docker modules that don't use -# open3d. A module-level import would drag in open3d's sklearn/scipy chain everywhere, -# which crashes in environments where those packages aren't installed or version-matched. -# (i.e. 
minimal docker envs) +import numpy as np +import open3d as o3d # type: ignore[import-untyped] -def reduce_external(obj): # type: ignore[no-untyped-def] - import numpy as np +def reduce_external(obj): # type: ignore[no-untyped-def] # Convert Vector3dVector to numpy array for pickling points_array = np.asarray(obj.points) return (reconstruct_pointcloud, (points_array,)) def reconstruct_pointcloud(points_array): # type: ignore[no-untyped-def] - import open3d as o3d # type: ignore[import-untyped] - + # Create new PointCloud and assign the points pc = o3d.geometry.PointCloud() pc.points = o3d.utility.Vector3dVector(points_array) return pc def register_picklers() -> None: - try: - import open3d as o3d # type: ignore[import-untyped] - except ImportError: - return # open3d not installed in this environment; skip registration - + # Register for the actual PointCloud class that gets instantiated + # We need to create a dummy PointCloud to get its actual class _dummy_pc = o3d.geometry.PointCloud() copyreg.pickle(_dummy_pc.__class__, reduce_external) diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index 99c1debbb6..7a02682fda 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -94,36 +94,32 @@ def test_deploy_routes_docker_module(self, mock_worker_manager_cls, mock_docker_ # Should NOT go through worker manager mock_worker_mgr.deploy.assert_not_called() - # Should create a DockerModule and start it + # Should construct a DockerModule (container launch happens inside __init__) mock_docker_module_cls.assert_called_once_with(FakeDockerModule) - mock_dm.start.assert_called_once() + # start() is NOT called during deploy — it's called in start_all_modules + mock_dm.start.assert_not_called() assert result is mock_dm - # Should be tracked assert coordinator.get_instance(FakeDockerModule) is mock_dm coordinator.stop() @patch("dimos.core.module_coordinator.DockerModule") 
@patch("dimos.core.module_coordinator.WorkerManager") - def test_deploy_docker_cleans_up_on_start_failure( + def test_deploy_docker_propagates_constructor_failure( self, mock_worker_manager_cls, mock_docker_module_cls ): mock_worker_mgr = MagicMock() mock_worker_manager_cls.return_value = mock_worker_mgr - mock_dm = MagicMock() - mock_dm.start.side_effect = RuntimeError("start failed") - mock_docker_module_cls.return_value = mock_dm + # Container launch fails inside __init__; DockerModule handles its own cleanup + mock_docker_module_cls.side_effect = RuntimeError("launch failed") coordinator = ModuleCoordinator() coordinator.start() - with pytest.raises(RuntimeError, match="start failed"): + with pytest.raises(RuntimeError, match="launch failed"): coordinator.deploy(FakeDockerModule) - # stop() called to clean up the failed container - mock_dm.stop.assert_called_once() - coordinator.stop() @patch("dimos.core.module_coordinator.WorkerManager") @@ -170,7 +166,8 @@ def test_deploy_parallel_separates_docker_and_regular( mock_worker_mgr.deploy_parallel.assert_called_once_with([(FakeRegularModule, (), {})]) # Docker module gets its own DockerModule mock_docker_module_cls.assert_called_once_with(FakeDockerModule) - mock_dm.start.assert_called_once() + # start() is NOT called during deploy — it's called in start_all_modules + mock_dm.start.assert_not_called() # Results are in original order assert results[0] is regular_proxy diff --git a/examples/docker_hello_world/hello_docker.py b/examples/docker_hello_world/hello_docker.py index 871be6f5d2..187384854e 100644 --- a/examples/docker_hello_world/hello_docker.py +++ b/examples/docker_hello_world/hello_docker.py @@ -106,6 +106,11 @@ def start(self) -> None: super().start() self.greeting.subscribe(self._on_greeting) + @rpc + def send(self, text: str) -> None: + """Publish a prompt message onto the stream.""" + self.prompt.publish(text) + def _on_greeting(self, text: str) -> None: print(f"[PromptModule] Received: {text}") @@ 
-130,7 +135,7 @@ def _on_greeting(self, text: str) -> None: print(docker_mod.greet("World")) # Test stream - prompt_mod.prompt.publish("stream test") + prompt_mod.send("stream test") time.sleep(2) coordinator.stop() From 868e3560d07d58f210028e80221372b5b8177e65 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 10:19:20 -0800 Subject: [PATCH 13/52] revert --- dimos/core/docker_runner.py | 2 +- dimos/visualization/rerun/bridge.py | 3 +++ dimos/visualization/rerun/constants.py | 17 ----------------- 3 files changed, 4 insertions(+), 18 deletions(-) delete mode 100644 dimos/visualization/rerun/constants.py diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index c6a196b7a7..e1a583b285 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -28,7 +28,7 @@ from dimos.core.rpc_client import RpcCall from dimos.protocol.rpc import LCMRPC from dimos.utils.logging_config import setup_logger -from dimos.visualization.rerun.constants import RERUN_GRPC_PORT, RERUN_WEB_PORT +from dimos.visualization.rerun.bridge import RERUN_GRPC_PORT, RERUN_WEB_PORT if TYPE_CHECKING: from collections.abc import Callable diff --git a/dimos/visualization/rerun/bridge.py b/dimos/visualization/rerun/bridge.py index 9cadbc617f..cc4b13ecb9 100644 --- a/dimos/visualization/rerun/bridge.py +++ b/dimos/visualization/rerun/bridge.py @@ -39,6 +39,9 @@ from dimos.protocol.pubsub.patterns import Glob, pattern_matches from dimos.utils.logging_config import setup_logger +RERUN_GRPC_PORT = 9876 +RERUN_WEB_PORT = 9090 + # TODO OUT visual annotations # # In the future it would be nice if modules can annotate their individual OUTs with (general or rerun specific) diff --git a/dimos/visualization/rerun/constants.py b/dimos/visualization/rerun/constants.py deleted file mode 100644 index e1c98176ad..0000000000 --- a/dimos/visualization/rerun/constants.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright 2026 Dimensional Inc. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# isolated so that they can be imported into lightweight modules without importing all of rerun -RERUN_GRPC_PORT = 9876 -RERUN_WEB_PORT = 9090 From bea6f7a1d721f01ab54f29cc1ab3ea8376a79276 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 10:29:35 -0800 Subject: [PATCH 14/52] cleanup --- dimos/core/docker_runner.py | 4 ++-- dimos/core/module.py | 3 +-- dimos/core/module_coordinator.py | 11 ++++++----- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index e1a583b285..3f1b3031c7 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -25,7 +25,7 @@ from typing import TYPE_CHECKING, Any from dimos.core.module import ModuleConfig -from dimos.core.rpc_client import RpcCall +from dimos.core.rpc_client import RpcCall, ModuleProxy from dimos.protocol.rpc import LCMRPC from dimos.utils.logging_config import setup_logger from dimos.visualization.rerun.bridge import RERUN_GRPC_PORT, RERUN_WEB_PORT @@ -183,7 +183,7 @@ def _extract_module_config(cfg: DockerModuleConfig) -> dict[str, Any]: # Host-side Docker-backed Module handle -class DockerModule: +class DockerModule(ModuleProxy): """ Host-side handle for a module running inside Docker. 
diff --git a/dimos/core/module.py b/dimos/core/module.py index 72df61d4c7..24be321ee2 100644 --- a/dimos/core/module.py +++ b/dimos/core/module.py @@ -218,12 +218,11 @@ def inputs(self) -> dict[str, In]: # type: ignore[type-arg] @classproperty def rpcs(self) -> dict[str, Callable[..., Any]]: - _skip = {"rpcs", "blueprint", "module_info", "io"} return { name: getattr(self, name) for name in dir(self) if not name.startswith("_") - and name not in _skip + and name != "rpcs" # Exclude the rpcs property itself to prevent recursion and callable(getattr(self, name, None)) and hasattr(getattr(self, name), "__rpc__") } diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 3d71e8776b..c2483bdd74 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -79,11 +79,12 @@ def stop(self) -> None: def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: # type: ignore[no-untyped-def] if not self._client: raise ValueError("Trying to dimos.deploy before the client has started") - module = ( - DockerModule(module_class, *args, **kwargs) # type: ignore[assignment] - if is_docker_module(module_class) - else self._client.deploy(module_class, *args, **kwargs) # type: ignore[union-attr, attr-defined, assignment] - ) + + deployed_module : ModuleProxy + if is_docker_module(module_class): + deployed_module = DockerModule(module_class, *args, **kwargs) + else: + deployed_module = self._client.deploy(module_class, *args, **kwargs) self._deployed_modules[module_class] = module # type: ignore[assignment] return module # type: ignore[return-value] From 16b2007d84a6321887e3828b8c8f7246825515bf Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 11:01:11 -0800 Subject: [PATCH 15/52] fixup deploy_parallel --- dimos/core/module.py | 2 +- dimos/core/module_coordinator.py | 33 ++++++++++++++------------------ 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/dimos/core/module.py 
b/dimos/core/module.py
index 24be321ee2..14aeea6da5 100644
--- a/dimos/core/module.py
+++ b/dimos/core/module.py
@@ -446,7 +446,7 @@ def set_transport(self, stream_name: str, transport: Transport) -> bool: # type
 
     @rpc
     def configure_streams(self, streams: dict[str, str]) -> dict[str, bool]:
-        """Configure stream transports in bulk by topic. Called by DockerModule for stream wiring.
+        """Configure stream transports in bulk by topic. NOTE: called before start, used by DockerModule for stream wiring.
 
         Args:
             streams: mapping of stream_name -> topic
diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py
index c2483bdd74..8698af55cf 100644
--- a/dimos/core/module_coordinator.py
+++ b/dimos/core/module_coordinator.py
@@ -94,16 +94,19 @@ def deploy_parallel(
         if not self._client:
             raise ValueError("Not started")
 
-        docker_specs = [
-            (module_class, args, kwargs) for module_class, args, kwargs in module_specs if is_docker_module(module_class)
-        ]
-        worker_specs = [
-            (module_class, args, kwargs) for module_class, args, kwargs in module_specs if not is_docker_module(module_class)
-        ]
+        # Separate docker modules from regular modules
+        docker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = []
+        worker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = []
+        spec_indices: list[tuple[str, int]] = []  # ("docker"|"worker", index_in_sublist)
 
-        worker_results = self._client.deploy_parallel(worker_specs) if worker_specs else []
+        for module_class, args, kwargs in module_specs:
+            if is_docker_module(module_class):
+                docker_specs.append((module_class, args, kwargs))
+            else:
+                worker_specs.append((module_class, args, kwargs))
 
-        docker_results: list[Any] = []
+        worker_results = self._client.deploy_parallel(worker_specs) if worker_specs else []
+        docker_results = []
         if docker_specs:
             with ThreadPoolExecutor(max_workers=len(docker_specs)) as executor:
                 docker_results = list(
@@ -111,17 +114,9 @@ def deploy_parallel(
                         lambda spec: DockerModule(spec[0], *spec[1], **spec[2]),
docker_specs ) ) - - # Reassemble in original order - worker_iter = iter(worker_results) - docker_iter = iter(docker_results) - results: list[Any] = [ - next(docker_iter) if is_docker_module(module_class) else next(worker_iter) - for module_class, _, _ in module_specs - ] - - for (module_class, _, _), module in zip(module_specs, results, strict=True): - self._deployed_modules[module_class] = module # type: ignore[assignment] + + for (module_class, _, _), module in zip(worker_specs+docker_specs, worker_results+docker_results, strict=True): + self._deployed_modules[module_class] = module return results # type: ignore[return-value] def start_all_modules(self) -> None: From aa42ced2f233103f3fe83a7c04af5f548b9d47d9 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 11:08:13 -0800 Subject: [PATCH 16/52] clean up reconnect logic --- dimos/core/docker_runner.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 3f1b3031c7..c7e40f0997 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -140,7 +140,7 @@ def _tail_logs(cfg: DockerModuleConfig, name: str, n: int = LOG_TAIL_LINES) -> s return out + ("\n" + err if err else "") -def _prompt_restart(container_name: str) -> bool: +def _prompt_reconnect(container_name: str) -> bool: """Ask the user whether to restart a running container. Returns True to restart, False to reuse. @@ -152,7 +152,7 @@ def _prompt_restart(container_name: str) -> bool: logger.warning( f"Container '{container_name}' already running — restarting (non-interactive)." 
) - return True + return False print(f"\nContainer '{container_name}' is already running.") print(" [r] Restart — stop the existing container and start a fresh one") @@ -160,9 +160,9 @@ def _prompt_restart(container_name: str) -> bool: while True: choice = input("Choice [r/u]: ").strip().lower() if choice in ("r", "restart"): - return True - if choice in ("u", "use"): return False + if choice in ("u", "use"): + return True print("Please enter 'r' or 'u'.") @@ -258,12 +258,14 @@ def start(self) -> None: try: cfg = self.config + reconnect = False if _is_container_running(cfg, self._container_name): - restart = _prompt_restart(self._container_name) - if restart: + reconnect = _prompt_reconnect(self._container_name) + if not reconnect: _run([_docker_bin(self.config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) - _remove_container(cfg, self._container_name) - + if not reconnect: + _remove_container(cfg, self._container_name) + cmd = self._build_docker_run_command() logger.info(f"Starting docker container: {self._container_name}") r = _run(cmd, timeout=DOCKER_RUN_TIMEOUT) From 6d07778905cc1e14528ca7d79b18530faea70446 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 11:20:53 -0800 Subject: [PATCH 17/52] fixup --- dimos/core/module_coordinator.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 8698af55cf..9689a6119b 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -114,19 +114,17 @@ def deploy_parallel( lambda spec: DockerModule(spec[0], *spec[1], **spec[2]), docker_specs ) ) + specs = worker_specs+docker_specs + results = worker_results+docker_results - for (module_class, _, _), module in zip(worker_specs+docker_specs, worker_results+docker_results, strict=True): + for (module_class, _, _), module in zip(specs, results, strict=True): self._deployed_modules[module_class] = module return results # 
type: ignore[return-value] def start_all_modules(self) -> None: modules = list(self._deployed_modules.values()) - if isinstance(self._client, WorkerManager): - with ThreadPoolExecutor(max_workers=max(len(modules), 1)) as executor: - list(executor.map(lambda m: m.start(), modules)) - else: - for module in modules: - module.start() + with ThreadPoolExecutor(max_workers=len(modules)) as executor: + list(executor.map(lambda m: m.start(), modules)) for module in modules: if hasattr(module, "on_system_modules"): From d74173fe1a752405e5efae3f17cddbbc807ab0da Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 11:22:15 -0800 Subject: [PATCH 18/52] - --- dimos/core/module_coordinator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 9689a6119b..2d15734b30 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -85,8 +85,8 @@ def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: deployed_module = DockerModule(module_class, *args, **kwargs) else: deployed_module = self._client.deploy(module_class, *args, **kwargs) - self._deployed_modules[module_class] = module # type: ignore[assignment] - return module # type: ignore[return-value] + self._deployed_modules[module_class] = deployed_module + return deployed_module def deploy_parallel( self, module_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] From d2aafeef13500e275c081211c70e0c2b99c54c5d Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 11:53:32 -0800 Subject: [PATCH 19/52] fix deployment/coordinator timeline --- dimos/core/docker_runner.py | 110 ++++++++++++++++++------------------ dimos/core/module.py | 22 -------- 2 files changed, 54 insertions(+), 78 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index c7e40f0997..fb3fc28af7 100644 --- a/dimos/core/docker_runner.py +++ 
b/dimos/core/docker_runner.py @@ -196,12 +196,12 @@ class DockerModule(ModuleProxy): config : DockerModuleConfig def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> None: - # Config + from dimos.core.docker_build import build_image, image_exists + config_class = getattr(module_class, "default_config", DockerModuleConfig) assert issubclass(config_class, DockerModuleConfig) config = config_class(**kwargs) - - # Module info + self._module_class = module_class self.config = config self._args = args @@ -212,13 +212,43 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non module_class, config ) - self.rpc = LCMRPC() self.rpcs = set(module_class.rpcs.keys()) # type: ignore[attr-defined] self.rpc_calls: list[str] = getattr(module_class, "rpc_calls", []) self._unsub_fns: list[Callable[[], None]] = [] self._bound_rpc_calls: dict[str, RpcCall] = {} - self._deferred_transports: dict[str, str] = {} # stream_name -> topic + + # Build image, launch container, wait for RPC server — mirrors worker Module.__init__ + try: + if not image_exists(config): + logger.info(f"Building {config.docker_image}") + build_image(config) + + reconnect = False + if _is_container_running(config, self._container_name): + reconnect = _prompt_reconnect(self._container_name) + if not reconnect: + _run([_docker_bin(config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) + if not reconnect: + _remove_container(config, self._container_name) + + cmd = self._build_docker_run_command() + logger.info(f"Starting docker container: {self._container_name}") + r = _run(cmd, timeout=DOCKER_RUN_TIMEOUT) + if r.returncode != 0: + raise RuntimeError( + f"Failed to start container.\nSTDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" + ) + + self.rpc.start() + self._running = True + # docker run -d returns before Module.__init__ finishes in the container, + # so we poll until the RPC server is reachable before returning. 
+ self._wait_for_rpc() + except Exception: + with suppress(Exception): + self.stop() + raise @staticmethod def _default_container_name(module_class: type[Module], config: DockerModuleConfig) -> str: @@ -236,6 +266,11 @@ def get_rpc_method_names(self) -> list[str]: def set_rpc_method(self, method: str, callable: RpcCall) -> None: callable.set_rpc(self.rpc) self._bound_rpc_calls[method] = callable + # Forward to container — Module.set_rpc_method unpickles the RpcCall + # and wires it with the container's own LCMRPC + self.rpc.call_sync( + f"{self.remote_name}/set_rpc_method", ([method, callable], {}) + ) def get_rpc_calls(self, *methods: str) -> RpcCall | tuple[RpcCall, ...]: missing = set(methods) - self._bound_rpc_calls.keys() @@ -245,38 +280,8 @@ def get_rpc_calls(self, *methods: str) -> RpcCall | tuple[RpcCall, ...]: return calls[0] if len(calls) == 1 else calls def start(self) -> None: - """Invoke the remote module's start() RPC. - - Called after stream transports are wired so the module can subscribe - to its streams with valid transports. 
- """ - from dimos.core.docker_build import build_image, image_exists - - if not image_exists(self.config): - logger.info(f"Building {self.config.docker_image}") - build_image(self.config) + """Invoke the remote module's start() RPC.""" try: - - cfg = self.config - reconnect = False - if _is_container_running(cfg, self._container_name): - reconnect = _prompt_reconnect(self._container_name) - if not reconnect: - _run([_docker_bin(self.config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) - if not reconnect: - _remove_container(cfg, self._container_name) - - cmd = self._build_docker_run_command() - logger.info(f"Starting docker container: {self._container_name}") - r = _run(cmd, timeout=DOCKER_RUN_TIMEOUT) - if r.returncode != 0: - raise RuntimeError( - f"Failed to start container.\nSTDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" - ) - - self.rpc.start() - self._running = True - self._configure_streams(self._deferred_transports) self.rpc.call_sync(f"{self.remote_name}/start", ([], {})) except Exception: with suppress(Exception): @@ -285,10 +290,11 @@ def start(self) -> None: def stop(self) -> None: """Gracefully stop the Docker container and clean up resources.""" - # Signal remote module, stop RPC, unsubscribe handlers (ignore failures) + if not self._running: + return + with suppress(Exception): - if self._running: - self.rpc.call_nowait(f"{self.remote_name}/stop", ([], {})) + self.rpc.call_nowait(f"{self.remote_name}/stop", ([], {})) with suppress(Exception): self.rpc.stop() for unsub in self._unsub_fns: @@ -296,7 +302,6 @@ def stop(self) -> None: unsub() self._unsub_fns.clear() - # Stop and remove container _run([_docker_bin(self.config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) _remove_container(self.config, self._container_name) self._running = False @@ -315,14 +320,11 @@ def tail_logs(self, n: int = 200) -> str: return _tail_logs(self.config, self._container_name, n=n) def set_transport(self, stream_name: str, transport: Any) -> 
bool: - """Defer stream transport config until start() when the container is running.""" - topic = getattr(transport, "topic", None) - if topic is None: - return False - if hasattr(topic, "topic"): - topic = topic.topic - self._deferred_transports[stream_name] = str(topic) - return True + """Forward to the container's Module.set_transport RPC.""" + result, _ = self.rpc.call_sync( + f"{self.remote_name}/set_transport", ([stream_name, transport], {}) + ) + return bool(result) def __getattr__(self, name: str) -> Any: if name in self.rpcs: @@ -480,12 +482,8 @@ def _build_container_command(self, cfg: DockerModuleConfig) -> list[str]: # DimOS base image entrypoint already runs "dimos.core.docker_runner run" return ["--payload", json.dumps(payload, separators=(",", ":"))] - def _configure_streams(self, streams: dict[str, str]) -> None: - """Poll configure_streams RPC until the container's RPC server is up, then wire streams. - - Also serves as the liveness gate — the first successful call proves the - container is ready to accept RPCs. - """ + def _wait_for_rpc(self) -> None: + """Poll until the container's RPC server is reachable.""" cfg = self.config start_time = time.time() @@ -498,8 +496,8 @@ def _configure_streams(self, streams: dict[str, str]) -> None: try: self.rpc.call_sync( - f"{self.remote_name}/configure_streams", - ([streams], {}), + f"{self.remote_name}/get_rpc_method_names", + ([], {}), rpc_timeout=RPC_READY_TIMEOUT, ) elapsed = time.time() - start_time diff --git a/dimos/core/module.py b/dimos/core/module.py index 14aeea6da5..af642b71bd 100644 --- a/dimos/core/module.py +++ b/dimos/core/module.py @@ -444,28 +444,6 @@ def set_transport(self, stream_name: str, transport: Transport) -> bool: # type stream._transport = transport return True - @rpc - def configure_streams(self, streams: dict[str, str]) -> dict[str, bool]: - """Configure stream transports in bulk by topic. NOTE: called before start, used by DockerModule for stream wiring. 
- - Args: - streams: mapping of stream_name -> topic - - Returns: - mapping of stream_name -> success - """ - from dimos.core.transport import pLCMTransport - - results: dict[str, bool] = {} - for stream_name, topic in streams.items(): - stream = getattr(self, stream_name, None) - if not isinstance(stream, (Out, In)): - results[stream_name] = False - else: - stream._transport = pLCMTransport(topic) - results[stream_name] = True - return results - # called from remote def connect_stream(self, input_name: str, remote_stream: RemoteOut[T]): # type: ignore[no-untyped-def] input_stream = getattr(self, input_name, None) From f6b4c57e8999fa4ed473e8068a36a2e6360eb5df Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 12:32:36 -0800 Subject: [PATCH 20/52] fir enforcement of either dockerfile or image pull --- dimos/core/docker_runner.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index fb3fc28af7..c7b2528969 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -221,8 +221,17 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non # Build image, launch container, wait for RPC server — mirrors worker Module.__init__ try: if not image_exists(config): - logger.info(f"Building {config.docker_image}") - build_image(config) + if config.docker_file is not None: + logger.info(f"Building {config.docker_image}") + build_image(config) + else: + logger.info(f"Pulling {config.docker_image}") + r = _run([_docker_bin(config), "pull", config.docker_image], timeout=DOCKER_RUN_TIMEOUT) + if r.returncode != 0: + raise RuntimeError( + f"Failed to pull image '{config.docker_image}'.\n" + f"STDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" + ) reconnect = False if _is_container_running(config, self._container_name): From 2c03652add369226573681046bb31f9b3f267696 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 12:33:24 -0800 
Subject: [PATCH 21/52] fix reconnect system --- dimos/core/docker_runner.py | 49 ++++++++++--------------------------- 1 file changed, 13 insertions(+), 36 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index c7b2528969..8cca64ca16 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -140,31 +140,6 @@ def _tail_logs(cfg: DockerModuleConfig, name: str, n: int = LOG_TAIL_LINES) -> s return out + ("\n" + err if err else "") -def _prompt_reconnect(container_name: str) -> bool: - """Ask the user whether to restart a running container. - - Returns True to restart, False to reuse. - Falls back to restart when stdin is not a TTY (e.g. CI). - """ - import sys - - if not sys.stdin.isatty(): - logger.warning( - f"Container '{container_name}' already running — restarting (non-interactive)." - ) - return False - - print(f"\nContainer '{container_name}' is already running.") - print(" [r] Restart — stop the existing container and start a fresh one") - print(" [u] Use — attach to the existing container as-is") - while True: - choice = input("Choice [r/u]: ").strip().lower() - if choice in ("r", "restart"): - return False - if choice in ("u", "use"): - return True - print("Please enter 'r' or 'u'.") - def _extract_module_config(cfg: DockerModuleConfig) -> dict[str, Any]: """Extract JSON-serializable config fields for the container (excludes docker_* fields).""" @@ -235,20 +210,22 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non reconnect = False if _is_container_running(config, self._container_name): - reconnect = _prompt_reconnect(self._container_name) - if not reconnect: + if config.docker_reconnect_container: + logger.info(f"Reconnecting to running container: {self._container_name}") + reconnect = True + else: + logger.info(f"Stopping existing container: {self._container_name}") _run([_docker_bin(config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) + if not 
reconnect: _remove_container(config, self._container_name) - - cmd = self._build_docker_run_command() - logger.info(f"Starting docker container: {self._container_name}") - r = _run(cmd, timeout=DOCKER_RUN_TIMEOUT) - if r.returncode != 0: - raise RuntimeError( - f"Failed to start container.\nSTDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" - ) - + cmd = self._build_docker_run_command() + logger.info(f"Starting docker container: {self._container_name}") + r = _run(cmd, timeout=DOCKER_RUN_TIMEOUT) + if r.returncode != 0: + raise RuntimeError( + f"Failed to start container.\nSTDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" + ) self.rpc.start() self._running = True # docker run -d returns before Module.__init__ finishes in the container, From c225a9aadc758b2098734d27f5b8bdc155a8cc1c Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 12:33:42 -0800 Subject: [PATCH 22/52] - --- dimos/core/docker_runner.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 8cca64ca16..15677a0e03 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -98,6 +98,9 @@ class DockerModuleConfig(ModuleConfig): docker_startup_timeout: float = 120.0 docker_poll_interval: float = 1.0 + # Reconnect to a running container instead of restarting it + docker_reconnect_container: bool = False + # Advanced docker_bin: str = "docker" From 4fd09b7787144ff3ef61cb2883fab652e04986a2 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 12:34:14 -0800 Subject: [PATCH 23/52] fix deploy_parallel --- dimos/core/module_coordinator.py | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 2d15734b30..59e1e5a657 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -94,19 +94,11 @@ def deploy_parallel( if not self._client: raise ValueError("Not started") - # Separate docker 
modules from regular modules - docker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] - worker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] - spec_indices: list[tuple[str, int]] = [] # ("docker"|"worker", index_in_sublist) - - for module_class, args, kwargs in module_specs: - if is_docker_module(module_class): - docker_specs.append(spec) - else: - worker_specs.append(spec) + docker_specs = [spec for spec in module_specs if is_docker_module(spec[0])] + worker_specs = [spec for spec in module_specs if not is_docker_module(spec[0])] worker_results = self._client.deploy_parallel(worker_specs) if worker_specs else [] - docker_results = [] + docker_results: list[Any] = [] if docker_specs: with ThreadPoolExecutor(max_workers=len(docker_specs)) as executor: docker_results = list( @@ -114,12 +106,13 @@ def deploy_parallel( lambda spec: DockerModule(spec[0], *spec[1], **spec[2]), docker_specs ) ) - specs = worker_specs+docker_specs - results = worker_results+docker_results - - for (module_class, _, _), module in zip(specs, results, strict=True): + + results = worker_results + docker_results + for (module_class, _, _), module in zip( + worker_specs + docker_specs, results, strict=True + ): self._deployed_modules[module_class] = module - return results # type: ignore[return-value] + return results def start_all_modules(self) -> None: modules = list(self._deployed_modules.values()) From b514747ab4b873aaf6e68ebca7b697c5777fe415 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 12:34:38 -0800 Subject: [PATCH 24/52] better error --- dimos/core/module_coordinator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 59e1e5a657..3dda7c38b0 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -116,6 +116,8 @@ def deploy_parallel( def start_all_modules(self) -> None: modules = 
list(self._deployed_modules.values()) + if not modules: + raise ValueError("No modules deployed. Call deploy() before start_all_modules().") with ThreadPoolExecutor(max_workers=len(modules)) as executor: list(executor.map(lambda m: m.start(), modules)) From cb18fd220de6e16f25d4e3403a87a83f1a8d1871 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 12:36:59 -0800 Subject: [PATCH 25/52] clean container name generation --- dimos/core/docker_runner.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 15677a0e03..d11a68e2a1 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -186,9 +186,9 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self._kwargs = kwargs self._running = False self.remote_name = module_class.__name__ - self._container_name = config.docker_container_name or self._default_container_name( - module_class, config - ) + # Derive container name from image name: "my-registry/foo:v2" → "dimos_foo" + image_base = config.docker_image.rsplit(":", 1)[0].rsplit("/", 1)[-1] + self._container_name = config.docker_container_name or f"dimos_{image_base}" self.rpc = LCMRPC() self.rpcs = set(module_class.rpcs.keys()) # type: ignore[attr-defined] @@ -239,16 +239,6 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self.stop() raise - @staticmethod - def _default_container_name(module_class: type[Module], config: DockerModuleConfig) -> str: - import hashlib - - name = module_class.__name__.lower() - path_hash = hashlib.sha256( - str(config.docker_file.resolve()).encode() # type: ignore[union-attr] - ).hexdigest()[:12] - return f"dimos_{name}_{path_hash}" - def get_rpc_method_names(self) -> list[str]: return self.rpc_calls From ff482c2c964db67484b8f71b4c00d3c182428f9d Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 12:43:44 -0800 Subject: [PATCH 26/52] 
fixup typing for ModuleProxy --- dimos/core/docker_runner.py | 24 +++++++++++++----------- dimos/core/module_coordinator.py | 10 +++++----- dimos/core/rpc_client.py | 15 +++++++++++++-- 3 files changed, 31 insertions(+), 18 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index d11a68e2a1..74e7c840c8 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -25,7 +25,7 @@ from typing import TYPE_CHECKING, Any from dimos.core.module import ModuleConfig -from dimos.core.rpc_client import RpcCall, ModuleProxy +from dimos.core.rpc_client import ModuleProxyProtocol, RpcCall from dimos.protocol.rpc import LCMRPC from dimos.utils.logging_config import setup_logger from dimos.visualization.rerun.bridge import RERUN_GRPC_PORT, RERUN_WEB_PORT @@ -161,7 +161,7 @@ def _extract_module_config(cfg: DockerModuleConfig) -> dict[str, Any]: # Host-side Docker-backed Module handle -class DockerModule(ModuleProxy): +class DockerModule(ModuleProxyProtocol): """ Host-side handle for a module running inside Docker. @@ -171,13 +171,17 @@ class DockerModule(ModuleProxy): Communication: All RPC happens via LCM multicast (requires --network=host). 
""" - config : DockerModuleConfig + config: DockerModuleConfig def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> None: from dimos.core.docker_build import build_image, image_exists config_class = getattr(module_class, "default_config", DockerModuleConfig) - assert issubclass(config_class, DockerModuleConfig) + if not issubclass(config_class, DockerModuleConfig): + raise TypeError( + f"{module_class.__name__}.default_config must be a DockerModuleConfig subclass, " + f"got {config_class.__name__}" + ) config = config_class(**kwargs) self._module_class = module_class @@ -196,7 +200,7 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self._unsub_fns: list[Callable[[], None]] = [] self._bound_rpc_calls: dict[str, RpcCall] = {} - # Build image, launch container, wait for RPC server — mirrors worker Module.__init__ + # Build or pull image, launch container, wait for RPC server try: if not image_exists(config): if config.docker_file is not None: @@ -269,9 +273,6 @@ def start(self) -> None: def stop(self) -> None: """Gracefully stop the Docker container and clean up resources.""" - if not self._running: - return - with suppress(Exception): self.rpc.call_nowait(f"{self.remote_name}/stop", ([], {})) with suppress(Exception): @@ -280,9 +281,10 @@ def stop(self) -> None: with suppress(Exception): unsub() self._unsub_fns.clear() - - _run([_docker_bin(self.config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) - _remove_container(self.config, self._container_name) + with suppress(Exception): + _run([_docker_bin(self.config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) + with suppress(Exception): + _remove_container(self.config, self._container_name) self._running = False logger.info(f"Stopped container: {self._container_name}") diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 3dda7c38b0..5534d9f9a7 100644 --- a/dimos/core/module_coordinator.py +++ 
b/dimos/core/module_coordinator.py @@ -27,7 +27,7 @@ if TYPE_CHECKING: from dimos.core.module import Module, ModuleT from dimos.core.resource_monitor.monitor import StatsMonitor - from dimos.core.rpc_client import ModuleProxy + from dimos.core.rpc_client import ModuleProxy, ModuleProxyProtocol logger = setup_logger() @@ -37,7 +37,7 @@ class ModuleCoordinator(Resource): # type: ignore[misc] _global_config: GlobalConfig _n: int | None = None _memory_limit: str = "auto" - _deployed_modules: dict[type[Module], ModuleProxy] + _deployed_modules: dict[type[Module], ModuleProxyProtocol] _stats_monitor: StatsMonitor | None = None def __init__( @@ -79,14 +79,14 @@ def stop(self) -> None: def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: # type: ignore[no-untyped-def] if not self._client: raise ValueError("Trying to dimos.deploy before the client has started") - - deployed_module : ModuleProxy + + deployed_module: ModuleProxyProtocol if is_docker_module(module_class): deployed_module = DockerModule(module_class, *args, **kwargs) else: deployed_module = self._client.deploy(module_class, *args, **kwargs) self._deployed_modules[module_class] = deployed_module - return deployed_module + return deployed_module # type: ignore[return-value] def deploy_parallel( self, module_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] diff --git a/dimos/core/rpc_client.py b/dimos/core/rpc_client.py index e46124469c..a89c54caf0 100644 --- a/dimos/core/rpc_client.py +++ b/dimos/core/rpc_client.py @@ -13,7 +13,7 @@ # limitations under the License. 
from collections.abc import Callable -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Protocol from dimos.core.stream import RemoteStream from dimos.core.worker import MethodCallProxy @@ -80,7 +80,18 @@ def __setstate__(self, state) -> None: # type: ignore[no-untyped-def] self._stop_rpc_client = None -class RPCClient: +class ModuleProxyProtocol(Protocol): + """Protocol for host-side handles to remote modules (worker or Docker).""" + + def start(self) -> None: ... + def stop(self) -> None: ... + def set_transport(self, stream_name: str, transport: Any) -> bool: ... + def get_rpc_method_names(self) -> list[str]: ... + def set_rpc_method(self, method: str, callable: RpcCall) -> None: ... + def get_rpc_calls(self, *methods: str) -> RpcCall | tuple[RpcCall, ...]: ... + + +class RPCClient(ModuleProxyProtocol): def __init__(self, actor_instance, actor_class) -> None: # type: ignore[no-untyped-def] self.rpc = LCMRPC() self.actor_class = actor_class From 1d22e60e825adfaf8430533d5957faefe91a8d8a Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 12:53:53 -0800 Subject: [PATCH 27/52] misc --- dimos/core/docker_runner.py | 6 ++--- dimos/core/module_coordinator.py | 28 ++++++++++++---------- dimos/core/tests/test_docker_deployment.py | 2 +- pyproject.toml | 4 +++- 4 files changed, 23 insertions(+), 17 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 74e7c840c8..1a0fc718ae 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -190,9 +190,9 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self._kwargs = kwargs self._running = False self.remote_name = module_class.__name__ - # Derive container name from image name: "my-registry/foo:v2" → "dimos_foo" - image_base = config.docker_image.rsplit(":", 1)[0].rsplit("/", 1)[-1] - self._container_name = config.docker_container_name or f"dimos_{image_base}" + # Derive container name from image name: 
"my-registry/foo:v2" → "dimos_foo_v2" + image_ref = config.docker_image.rsplit("/", 1)[-1] + self._container_name = config.docker_container_name or f"dimos_{image_ref.replace(':', '_')}" self.rpc = LCMRPC() self.rpcs = set(module_class.rpcs.keys()) # type: ignore[attr-defined] diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 5534d9f9a7..90538cfc0a 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -97,21 +97,25 @@ def deploy_parallel( docker_specs = [spec for spec in module_specs if is_docker_module(spec[0])] worker_specs = [spec for spec in module_specs if not is_docker_module(spec[0])] - worker_results = self._client.deploy_parallel(worker_specs) if worker_specs else [] + worker_results: list[Any] = [] docker_results: list[Any] = [] - if docker_specs: - with ThreadPoolExecutor(max_workers=len(docker_specs)) as executor: - docker_results = list( - executor.map( - lambda spec: DockerModule(spec[0], *spec[1], **spec[2]), docker_specs + try: + worker_results = self._client.deploy_parallel(worker_specs) if worker_specs else [] + if docker_specs: + with ThreadPoolExecutor(max_workers=len(docker_specs)) as executor: + docker_results = list( + executor.map( + lambda spec: DockerModule(spec[0], *spec[1], **spec[2]), docker_specs + ) ) - ) + finally: + results = worker_results + docker_results + # Register whatever succeeded so stop() can clean them up + for (module_class, _, _), module in zip( + worker_specs + docker_specs, results, strict=False + ): + self._deployed_modules[module_class] = module - results = worker_results + docker_results - for (module_class, _, _), module in zip( - worker_specs + docker_specs, results, strict=True - ): - self._deployed_modules[module_class] = module return results def start_all_modules(self) -> None: diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index 7a02682fda..e6ddbc4a73 100644 --- 
a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -169,7 +169,7 @@ def test_deploy_parallel_separates_docker_and_regular( # start() is NOT called during deploy — it's called in start_all_modules mock_dm.start.assert_not_called() - # Results are in original order + # Results are worker-first, then docker assert results[0] is regular_proxy assert results[1] is mock_dm diff --git a/pyproject.toml b/pyproject.toml index dcd2a5d987..31d3322453 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -321,10 +321,12 @@ docker = [ "sortedcontainers", "PyTurboJPEG", "rerun-sdk", - "langchain-core", "typing_extensions", "open3d-unofficial-arm; platform_system == 'Linux' and platform_machine == 'aarch64'", "open3d>=0.18.0; platform_system != 'Linux' or platform_machine != 'aarch64'", + # these below should be removed later, right now they are needed even for running `dimos --help` (seperate non-docker issue) + "langchain-core", + "matplotlib", ] base = [ From 8ecb905cfbf19c0548b8bccb1bc00128e1d27d52 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 13:48:44 -0800 Subject: [PATCH 28/52] testing fixup --- dimos/core/docker_runner.py | 25 +++++++++++++++------- dimos/core/module_coordinator.py | 5 ++++- dimos/core/rpc_client.py | 2 +- dimos/core/test_core.py | 2 +- dimos/core/tests/test_docker_deployment.py | 8 +++---- uv.lock | 2 ++ 6 files changed, 29 insertions(+), 15 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 1a0fc718ae..7ce89c40e6 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -143,7 +143,6 @@ def _tail_logs(cfg: DockerModuleConfig, name: str, n: int = LOG_TAIL_LINES) -> s return out + ("\n" + err if err else "") - def _extract_module_config(cfg: DockerModuleConfig) -> dict[str, Any]: """Extract JSON-serializable config fields for the container (excludes docker_* fields).""" out: dict[str, Any] = {} @@ -171,6 +170,7 @@ class 
DockerModule(ModuleProxyProtocol): Communication: All RPC happens via LCM multicast (requires --network=host). """ + config: DockerModuleConfig def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> None: @@ -192,7 +192,9 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self.remote_name = module_class.__name__ # Derive container name from image name: "my-registry/foo:v2" → "dimos_foo_v2" image_ref = config.docker_image.rsplit("/", 1)[-1] - self._container_name = config.docker_container_name or f"dimos_{image_ref.replace(':', '_')}" + self._container_name = ( + config.docker_container_name or f"dimos_{image_ref.replace(':', '_')}" + ) self.rpc = LCMRPC() self.rpcs = set(module_class.rpcs.keys()) # type: ignore[attr-defined] @@ -208,7 +210,10 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non build_image(config) else: logger.info(f"Pulling {config.docker_image}") - r = _run([_docker_bin(config), "pull", config.docker_image], timeout=DOCKER_RUN_TIMEOUT) + r = _run( + [_docker_bin(config), "pull", config.docker_image], + timeout=DOCKER_RUN_TIMEOUT, + ) if r.returncode != 0: raise RuntimeError( f"Failed to pull image '{config.docker_image}'.\n" @@ -222,7 +227,10 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non reconnect = True else: logger.info(f"Stopping existing container: {self._container_name}") - _run([_docker_bin(config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) + _run( + [_docker_bin(config), "stop", self._container_name], + timeout=DOCKER_STOP_TIMEOUT, + ) if not reconnect: _remove_container(config, self._container_name) @@ -251,9 +259,7 @@ def set_rpc_method(self, method: str, callable: RpcCall) -> None: self._bound_rpc_calls[method] = callable # Forward to container — Module.set_rpc_method unpickles the RpcCall # and wires it with the container's own LCMRPC - self.rpc.call_sync( - f"{self.remote_name}/set_rpc_method", 
([method, callable], {}) - ) + self.rpc.call_sync(f"{self.remote_name}/set_rpc_method", ([method, callable], {})) def get_rpc_calls(self, *methods: str) -> RpcCall | tuple[RpcCall, ...]: missing = set(methods) - self._bound_rpc_calls.keys() @@ -282,7 +288,10 @@ def stop(self) -> None: unsub() self._unsub_fns.clear() with suppress(Exception): - _run([_docker_bin(self.config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) + _run( + [_docker_bin(self.config), "stop", self._container_name], + timeout=DOCKER_STOP_TIMEOUT, + ) with suppress(Exception): _remove_container(self.config, self._container_name) self._running = False diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 90538cfc0a..3e8ff31018 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -18,7 +18,6 @@ import threading from typing import TYPE_CHECKING, Any -from dimos.core.docker_runner import DockerModule, is_docker_module from dimos.core.global_config import GlobalConfig, global_config from dimos.core.resource import Resource from dimos.core.worker_manager import WorkerManager @@ -77,6 +76,8 @@ def stop(self) -> None: self._client.close_all() # type: ignore[union-attr] def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: # type: ignore[no-untyped-def] + from dimos.core.docker_runner import DockerModule, is_docker_module + if not self._client: raise ValueError("Trying to dimos.deploy before the client has started") @@ -91,6 +92,8 @@ def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: def deploy_parallel( self, module_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] ) -> list[ModuleProxy]: + from dimos.core.docker_runner import DockerModule, is_docker_module + if not self._client: raise ValueError("Not started") diff --git a/dimos/core/rpc_client.py b/dimos/core/rpc_client.py index a89c54caf0..c9e73ac54e 100644 --- a/dimos/core/rpc_client.py +++ 
b/dimos/core/rpc_client.py @@ -91,7 +91,7 @@ def set_rpc_method(self, method: str, callable: RpcCall) -> None: ... def get_rpc_calls(self, *methods: str) -> RpcCall | tuple[RpcCall, ...]: ... -class RPCClient(ModuleProxyProtocol): +class RPCClient: def __init__(self, actor_instance, actor_class) -> None: # type: ignore[no-untyped-def] self.rpc = LCMRPC() self.actor_class = actor_class diff --git a/dimos/core/test_core.py b/dimos/core/test_core.py index 197539ef67..30f14c93b4 100644 --- a/dimos/core/test_core.py +++ b/dimos/core/test_core.py @@ -80,7 +80,7 @@ def test_classmethods() -> None: # Check that we have the expected RPC methods assert "navigate_to" in class_rpcs, "navigate_to should be in rpcs" assert "start" in class_rpcs, "start should be in rpcs" - assert len(class_rpcs) == 9 + assert len(class_rpcs) == 8 # Check that the values are callable assert callable(class_rpcs["navigate_to"]), "navigate_to should be callable" diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index e6ddbc4a73..f60f37a21a 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -78,7 +78,7 @@ class Bare(Module): class TestModuleCoordinatorDockerRouting: - @patch("dimos.core.module_coordinator.DockerModule") + @patch("dimos.core.docker_runner.DockerModule") @patch("dimos.core.module_coordinator.WorkerManager") def test_deploy_routes_docker_module(self, mock_worker_manager_cls, mock_docker_module_cls): mock_worker_mgr = MagicMock() @@ -103,7 +103,7 @@ def test_deploy_routes_docker_module(self, mock_worker_manager_cls, mock_docker_ coordinator.stop() - @patch("dimos.core.module_coordinator.DockerModule") + @patch("dimos.core.docker_runner.DockerModule") @patch("dimos.core.module_coordinator.WorkerManager") def test_deploy_docker_propagates_constructor_failure( self, mock_worker_manager_cls, mock_docker_module_cls @@ -139,7 +139,7 @@ def 
test_deploy_routes_regular_module_to_worker_manager(self, mock_worker_manage coordinator.stop() - @patch("dimos.core.module_coordinator.DockerModule") + @patch("dimos.core.docker_runner.DockerModule") @patch("dimos.core.module_coordinator.WorkerManager") def test_deploy_parallel_separates_docker_and_regular( self, mock_worker_manager_cls, mock_docker_module_cls @@ -175,7 +175,7 @@ def test_deploy_parallel_separates_docker_and_regular( coordinator.stop() - @patch("dimos.core.module_coordinator.DockerModule") + @patch("dimos.core.docker_runner.DockerModule") @patch("dimos.core.module_coordinator.WorkerManager") def test_stop_cleans_up_docker_modules(self, mock_worker_manager_cls, mock_docker_module_cls): mock_worker_mgr = MagicMock() diff --git a/uv.lock b/uv.lock index 084e157ee5..820bb92f2d 100644 --- a/uv.lock +++ b/uv.lock @@ -1852,6 +1852,7 @@ docker = [ { name = "dimos-lcm" }, { name = "langchain-core" }, { name = "lcm" }, + { name = "matplotlib" }, { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "open3d", marker = "platform_machine != 'aarch64' or sys_platform != 'linux'" }, @@ -2022,6 +2023,7 @@ requires-dist = [ { name = "lcm", marker = "extra == 'docker'" }, { name = "llvmlite", specifier = ">=0.42.0" }, { name = "lxml-stubs", marker = "extra == 'dev'", specifier = ">=0.5.1,<1" }, + { name = "matplotlib", marker = "extra == 'docker'" }, { name = "matplotlib", marker = "extra == 'manipulation'", specifier = ">=3.7.1" }, { name = "md-babel-py", marker = "extra == 'dev'", specifier = "==1.1.1" }, { name = "moondream", marker = "extra == 'perception'" }, From bf18c25a303f8d1ac0eec77aa958ec161a714b54 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 15:54:12 -0800 Subject: [PATCH 29/52] maintain order --- 
dimos/core/docker_runner.py | 50 ++++++++++++++++++++++++++++---- dimos/core/module_coordinator.py | 29 +++++++++++++----- 2 files changed, 66 insertions(+), 13 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 7ce89c40e6..776cef516d 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -39,6 +39,7 @@ logger = setup_logger() DOCKER_RUN_TIMEOUT = 120 # Timeout for `docker run` command execution +DOCKER_PULL_TIMEOUT = 600 # Timeout for `docker pull` (large images over slow connections) DOCKER_CMD_TIMEOUT = 20 # Timeout for quick Docker commands (inspect, rm, logs) DOCKER_STATUS_TIMEOUT = 10 # Timeout for container status checks DOCKER_STOP_TIMEOUT = 30 # Timeout for `docker stop` command (graceful shutdown) @@ -136,6 +137,31 @@ def _is_container_running(cfg: DockerModuleConfig, name: str) -> bool: return r.returncode == 0 and r.stdout.strip() == "true" +def _container_started_at(cfg: DockerModuleConfig, name: str) -> float | None: + """Return the container's start time as a Unix timestamp, or None on failure.""" + r = _run( + [_docker_bin(cfg), "inspect", "-f", "{{.State.StartedAt}}", name], + timeout=DOCKER_STATUS_TIMEOUT, + ) + if r.returncode != 0: + return None + from datetime import datetime + + try: + # Docker returns RFC 3339 with nanoseconds, e.g. "2024-01-02T03:04:05.123456789Z" + raw = r.stdout.strip() + # Truncate nanoseconds to microseconds for fromisoformat compatibility + if "." 
in raw: + base, frac = raw.split(".", 1) + frac = frac.rstrip("Z")[:6] + raw = f"{base}.{frac}+00:00" + else: + raw = raw.rstrip("Z") + "+00:00" + return datetime.fromisoformat(raw).timestamp() + except (ValueError, OSError): + return None + + def _tail_logs(cfg: DockerModuleConfig, name: str, n: int = LOG_TAIL_LINES) -> str: r = _run([_docker_bin(cfg), "logs", "--tail", str(n), name], timeout=DOCKER_CMD_TIMEOUT) out = (r.stdout or "").rstrip() @@ -190,10 +216,11 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self._kwargs = kwargs self._running = False self.remote_name = module_class.__name__ - # Derive container name from image name: "my-registry/foo:v2" → "dimos_foo_v2" + # Derive container name from image + class name: "my-registry/foo:v2" → "dimos_myclass_foo_v2" image_ref = config.docker_image.rsplit("/", 1)[-1] self._container_name = ( - config.docker_container_name or f"dimos_{image_ref.replace(':', '_')}" + config.docker_container_name + or f"dimos_{module_class.__name__.lower()}_{image_ref.replace(':', '_')}" ) self.rpc = LCMRPC() @@ -212,7 +239,7 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non logger.info(f"Pulling {config.docker_image}") r = _run( [_docker_bin(config), "pull", config.docker_image], - timeout=DOCKER_RUN_TIMEOUT, + timeout=DOCKER_PULL_TIMEOUT, ) if r.returncode != 0: raise RuntimeError( @@ -223,9 +250,18 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non reconnect = False if _is_container_running(config, self._container_name): if config.docker_reconnect_container: - logger.info(f"Reconnecting to running container: {self._container_name}") - reconnect = True - else: + # Verify the container hasn't restarted since we last ran + container_start = _container_started_at(config, self._container_name) + process_start = time.time() # conservative: current time as upper bound + if container_start is not None and container_start > process_start 
- 5: + logger.warning( + f"Container {self._container_name} appears to have restarted recently " + f"(started at {container_start:.0f}). Treating as fresh start." + ) + else: + logger.info(f"Reconnecting to running container: {self._container_name}") + reconnect = True + if not reconnect: logger.info(f"Stopping existing container: {self._container_name}") _run( [_docker_bin(config), "stop", self._container_name], @@ -279,6 +315,8 @@ def start(self) -> None: def stop(self) -> None: """Gracefully stop the Docker container and clean up resources.""" + if not self._running: + return with suppress(Exception): self.rpc.call_nowait(f"{self.remote_name}/stop", ([], {})) with suppress(Exception): diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 3e8ff31018..01f657dd1a 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -97,8 +97,19 @@ def deploy_parallel( if not self._client: raise ValueError("Not started") - docker_specs = [spec for spec in module_specs if is_docker_module(spec[0])] - worker_specs = [spec for spec in module_specs if not is_docker_module(spec[0])] + # Split by type, tracking original indices for reassembly + docker_indices: list[int] = [] + worker_indices: list[int] = [] + docker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] + worker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] + # the i is needed for maintaining order on the returned output + for i, spec in enumerate(module_specs): + if is_docker_module(spec[0]): + docker_indices.append(i) + docker_specs.append(spec) + else: + worker_indices.append(i) + worker_specs.append(spec) worker_results: list[Any] = [] docker_results: list[Any] = [] @@ -112,12 +123,16 @@ def deploy_parallel( ) ) finally: - results = worker_results + docker_results + # Reassemble results in original input order + results: list[Any] = [None] * len(module_specs) + for idx, mod in zip(worker_indices, 
worker_results, strict=False): + results[idx] = mod + for idx, mod in zip(docker_indices, docker_results, strict=False): + results[idx] = mod # Register whatever succeeded so stop() can clean them up - for (module_class, _, _), module in zip( - worker_specs + docker_specs, results, strict=False - ): - self._deployed_modules[module_class] = module + for spec, module in zip(module_specs, results, strict=False): + if module is not None: + self._deployed_modules[spec[0]] = module return results From 1bd1c952922e352d476fa2f47a67f714e23cbc83 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 16:12:44 -0800 Subject: [PATCH 30/52] refine --- dimos/core/docker_runner.py | 46 ++++----------------- dimos/core/tests/test_docker_deployment.py | 2 +- examples/docker_hello_world/hello_docker.py | 1 + 3 files changed, 10 insertions(+), 39 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 776cef516d..aacdbe7c19 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -137,31 +137,6 @@ def _is_container_running(cfg: DockerModuleConfig, name: str) -> bool: return r.returncode == 0 and r.stdout.strip() == "true" -def _container_started_at(cfg: DockerModuleConfig, name: str) -> float | None: - """Return the container's start time as a Unix timestamp, or None on failure.""" - r = _run( - [_docker_bin(cfg), "inspect", "-f", "{{.State.StartedAt}}", name], - timeout=DOCKER_STATUS_TIMEOUT, - ) - if r.returncode != 0: - return None - from datetime import datetime - - try: - # Docker returns RFC 3339 with nanoseconds, e.g. "2024-01-02T03:04:05.123456789Z" - raw = r.stdout.strip() - # Truncate nanoseconds to microseconds for fromisoformat compatibility - if "." 
in raw: - base, frac = raw.split(".", 1) - frac = frac.rstrip("Z")[:6] - raw = f"{base}.{frac}+00:00" - else: - raw = raw.rstrip("Z") + "+00:00" - return datetime.fromisoformat(raw).timestamp() - except (ValueError, OSError): - return None - - def _tail_logs(cfg: DockerModuleConfig, name: str, n: int = LOG_TAIL_LINES) -> str: r = _run([_docker_bin(cfg), "logs", "--tail", str(n), name], timeout=DOCKER_CMD_TIMEOUT) out = (r.stdout or "").rstrip() @@ -250,18 +225,9 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non reconnect = False if _is_container_running(config, self._container_name): if config.docker_reconnect_container: - # Verify the container hasn't restarted since we last ran - container_start = _container_started_at(config, self._container_name) - process_start = time.time() # conservative: current time as upper bound - if container_start is not None and container_start > process_start - 5: - logger.warning( - f"Container {self._container_name} appears to have restarted recently " - f"(started at {container_start:.0f}). Treating as fresh start." - ) - else: - logger.info(f"Reconnecting to running container: {self._container_name}") - reconnect = True - if not reconnect: + logger.info(f"Reconnecting to running container: {self._container_name}") + reconnect = True + else: logger.info(f"Stopping existing container: {self._container_name}") _run( [_docker_bin(config), "stop", self._container_name], @@ -284,7 +250,7 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self._wait_for_rpc() except Exception: with suppress(Exception): - self.stop() + self._cleanup() raise def get_rpc_method_names(self) -> list[str]: @@ -319,6 +285,10 @@ def stop(self) -> None: return with suppress(Exception): self.rpc.call_nowait(f"{self.remote_name}/stop", ([], {})) + self._cleanup() + + def _cleanup(self) -> None: + """Release all resources. 
Safe to call multiple times or from partial init.""" with suppress(Exception): self.rpc.stop() for unsub in self._unsub_fns: diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index f60f37a21a..95db171e1c 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -169,7 +169,7 @@ def test_deploy_parallel_separates_docker_and_regular( # start() is NOT called during deploy — it's called in start_all_modules mock_dm.start.assert_not_called() - # Results are worker-first, then docker + # Results preserve input order assert results[0] is regular_proxy assert results[1] is mock_dm diff --git a/examples/docker_hello_world/hello_docker.py b/examples/docker_hello_world/hello_docker.py index 187384854e..eb4765a629 100644 --- a/examples/docker_hello_world/hello_docker.py +++ b/examples/docker_hello_world/hello_docker.py @@ -76,6 +76,7 @@ def _cowsay(self, text: str) -> str: ["/usr/games/cowsay", text], capture_output=True, text=True, + check=True, ) return result.stdout From 90feac1bdb63b159236cfbad33e40b0f88bb5357 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 16:37:59 -0800 Subject: [PATCH 31/52] make pull out configurable --- dimos/core/docker_runner.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index aacdbe7c19..89fa9d9af3 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -39,7 +39,7 @@ logger = setup_logger() DOCKER_RUN_TIMEOUT = 120 # Timeout for `docker run` command execution -DOCKER_PULL_TIMEOUT = 600 # Timeout for `docker pull` (large images over slow connections) +DOCKER_PULL_TIMEOUT_DEFAULT = 600 # Default timeout for `docker pull` DOCKER_CMD_TIMEOUT = 20 # Timeout for quick Docker commands (inspect, rm, logs) DOCKER_STATUS_TIMEOUT = 10 # Timeout for container status checks DOCKER_STOP_TIMEOUT = 30 # Timeout for `docker stop` command 
(graceful shutdown) @@ -95,7 +95,8 @@ class DockerModuleConfig(ModuleConfig): docker_command: list[str] | None = None docker_extra_args: list[str] = field(default_factory=list) - # Startup readiness + # Timeouts + docker_pull_timeout: float = DOCKER_PULL_TIMEOUT_DEFAULT docker_startup_timeout: float = 120.0 docker_poll_interval: float = 1.0 @@ -214,7 +215,7 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non logger.info(f"Pulling {config.docker_image}") r = _run( [_docker_bin(config), "pull", config.docker_image], - timeout=DOCKER_PULL_TIMEOUT, + timeout=config.docker_pull_timeout, ) if r.returncode != 0: raise RuntimeError( From e95fe972154951ba1df61141317df199554cff62 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 16:38:25 -0800 Subject: [PATCH 32/52] have example show using normal config --- examples/docker_hello_world/hello_docker.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/examples/docker_hello_world/hello_docker.py b/examples/docker_hello_world/hello_docker.py index eb4765a629..66e95df316 100644 --- a/examples/docker_hello_world/hello_docker.py +++ b/examples/docker_hello_world/hello_docker.py @@ -56,6 +56,9 @@ class HelloDockerConfig(DockerModuleConfig): docker_restart_policy: str = "no" docker_env: dict[str, str] = field(default_factory=lambda: {"CI": "1"}) + # Custom (non-docker) config field — passed to the container via JSON + greeting_prefix: str = "Hello" + class HelloDockerModule(Module["HelloDockerConfig"]): """A trivial module that runs inside Docker and echoes greetings.""" @@ -88,7 +91,13 @@ def _on_prompt(self, text: str) -> None: @rpc def greet(self, name: str) -> str: """RPC method that can be called directly.""" - return self._cowsay(f"Hello, {name}!") + prefix = self.config.greeting_prefix + return self._cowsay(f"{prefix}, {name}!") + + @rpc + def get_greeting_prefix(self) -> str: + """Return the config value to verify it was passed to the 
container.""" + return self.config.greeting_prefix # --------------------------------------------------------------------------- @@ -125,14 +134,19 @@ def _on_greeting(self, text: str) -> None: coordinator = autoconnect( PromptModule.blueprint(), - HelloDockerModule.blueprint(), + HelloDockerModule.blueprint(greeting_prefix="Howdy"), ).build() # Get module proxies prompt_mod = coordinator.get_instance(PromptModule) docker_mod = coordinator.get_instance(HelloDockerModule) - # Test RPC + # Test that custom config was passed to the container + prefix = docker_mod.get_greeting_prefix() + assert prefix == "Howdy", f"Expected 'Howdy', got {prefix!r}" + print(f"Config passed to container: greeting_prefix={prefix!r}") + + # Test RPC (should use the custom prefix) print(docker_mod.greet("World")) # Test stream From 4f10e8259c3f2a39a85d90abbe7214cebb27eee6 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 17:55:27 -0800 Subject: [PATCH 33/52] Add DockerWorkerManager --- dimos/core/docker_runner.py | 7 ++- dimos/core/docker_worker_manager.py | 59 ++++++++++++++++++++++ dimos/core/module_coordinator.py | 13 ++--- dimos/core/tests/test_docker_deployment.py | 10 ++-- 4 files changed, 75 insertions(+), 14 deletions(-) create mode 100644 dimos/core/docker_worker_manager.py diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 89fa9d9af3..26d822ce73 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -110,7 +110,11 @@ class DockerModuleConfig(ModuleConfig): def is_docker_module(module_class: type) -> bool: """Check if a module class should run in Docker based on its default_config.""" default_config = getattr(module_class, "default_config", None) - return default_config is not None and issubclass(default_config, DockerModuleConfig) + return ( + default_config is not None + and isinstance(default_config, type) + and issubclass(default_config, DockerModuleConfig) + ) # Docker helpers @@ -284,6 +288,7 @@ def stop(self) -> 
None: """Gracefully stop the Docker container and clean up resources.""" if not self._running: return + self._running = False # claim shutdown before any side-effects with suppress(Exception): self.rpc.call_nowait(f"{self.remote_name}/stop", ([], {})) self._cleanup() diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py new file mode 100644 index 0000000000..52317d984b --- /dev/null +++ b/dimos/core/docker_worker_manager.py @@ -0,0 +1,59 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +from concurrent.futures import Future, ThreadPoolExecutor, as_completed +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from dimos.core.docker_runner import DockerModule + from dimos.core.module import Module + + +class DockerWorkerManager: + """Parallel deployment of Docker-backed modules.""" + + @staticmethod + def deploy_parallel( + specs: list[tuple[type[Module], tuple[Any, ...], dict[str, Any]]], + ) -> list[DockerModule]: + """Deploy multiple DockerModules in parallel, collecting partial results on failure. + + Returns all successfully-created DockerModules. If any deployment fails, + the successful ones are still returned (so the caller can register them + for cleanup), and the first exception is re-raised. 
+ """ + from dimos.core.docker_runner import DockerModule + + results: dict[int, DockerModule] = {} + first_exc: Exception | None = None + + with ThreadPoolExecutor(max_workers=len(specs)) as executor: + futures: dict[Future[DockerModule], int] = { + executor.submit(lambda s=spec: DockerModule(s[0], *s[1], **s[2])): i + for i, spec in enumerate(specs) + } + for fut in as_completed(futures): + idx = futures[fut] + try: + results[idx] = fut.result() + except Exception as e: + if first_exc is None: + first_exc = e + + # Return in input order (missing indices = failed deployments) + ordered = [results[i] for i in sorted(results)] + if first_exc is not None: + raise first_exc + return ordered diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 01f657dd1a..4ede195571 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -18,6 +18,7 @@ import threading from typing import TYPE_CHECKING, Any +from dimos.core.docker_worker_manager import DockerWorkerManager from dimos.core.global_config import GlobalConfig, global_config from dimos.core.resource import Resource from dimos.core.worker_manager import WorkerManager @@ -76,6 +77,7 @@ def stop(self) -> None: self._client.close_all() # type: ignore[union-attr] def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: # type: ignore[no-untyped-def] + # Inline to avoid circular import: module_coordinator → docker_runner → module → blueprints → module_coordinator from dimos.core.docker_runner import DockerModule, is_docker_module if not self._client: @@ -92,7 +94,8 @@ def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: def deploy_parallel( self, module_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] ) -> list[ModuleProxy]: - from dimos.core.docker_runner import DockerModule, is_docker_module + # Inline to avoid circular import: module_coordinator → docker_runner → module → blueprints → 
module_coordinator + from dimos.core.docker_runner import is_docker_module if not self._client: raise ValueError("Not started") @@ -102,7 +105,6 @@ def deploy_parallel( worker_indices: list[int] = [] docker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] worker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] - # the i is needed for maintaining order on the returned output for i, spec in enumerate(module_specs): if is_docker_module(spec[0]): docker_indices.append(i) @@ -116,12 +118,7 @@ def deploy_parallel( try: worker_results = self._client.deploy_parallel(worker_specs) if worker_specs else [] if docker_specs: - with ThreadPoolExecutor(max_workers=len(docker_specs)) as executor: - docker_results = list( - executor.map( - lambda spec: DockerModule(spec[0], *spec[1], **spec[2]), docker_specs - ) - ) + docker_results = DockerWorkerManager.deploy_parallel(docker_specs) finally: # Reassemble results in original input order results: list[Any] = [None] * len(module_specs) diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index 95db171e1c..17d1290916 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -139,10 +139,10 @@ def test_deploy_routes_regular_module_to_worker_manager(self, mock_worker_manage coordinator.stop() - @patch("dimos.core.docker_runner.DockerModule") + @patch("dimos.core.docker_worker_manager.DockerWorkerManager.deploy_parallel") @patch("dimos.core.module_coordinator.WorkerManager") def test_deploy_parallel_separates_docker_and_regular( - self, mock_worker_manager_cls, mock_docker_module_cls + self, mock_worker_manager_cls, mock_docker_deploy ): mock_worker_mgr = MagicMock() mock_worker_manager_cls.return_value = mock_worker_mgr @@ -151,7 +151,7 @@ def test_deploy_parallel_separates_docker_and_regular( mock_worker_mgr.deploy_parallel.return_value = [regular_proxy] mock_dm = MagicMock() - 
mock_docker_module_cls.return_value = mock_dm + mock_docker_deploy.return_value = [mock_dm] coordinator = ModuleCoordinator() coordinator.start() @@ -164,8 +164,8 @@ def test_deploy_parallel_separates_docker_and_regular( # Regular module goes through worker manager mock_worker_mgr.deploy_parallel.assert_called_once_with([(FakeRegularModule, (), {})]) - # Docker module gets its own DockerModule - mock_docker_module_cls.assert_called_once_with(FakeDockerModule) + # Docker specs go through DockerWorkerManager + mock_docker_deploy.assert_called_once_with([(FakeDockerModule, (), {})]) # start() is NOT called during deploy — it's called in start_all_modules mock_dm.start.assert_not_called() From 8d6ef32d8cc0fe618f9e3768c4ed454ab7a1c97d Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 21:44:22 -0800 Subject: [PATCH 34/52] add proper cleanup handling if a module fails to deploy correctly --- dimos/core/docker_worker_manager.py | 43 ++-- dimos/core/module_coordinator.py | 11 +- .../tests/test_parallel_deploy_cleanup.py | 219 ++++++++++++++++++ dimos/core/worker_manager.py | 30 ++- dimos/utils/safe_thread_map.py | 92 ++++++++ 5 files changed, 350 insertions(+), 45 deletions(-) create mode 100644 dimos/core/tests/test_parallel_deploy_cleanup.py create mode 100644 dimos/utils/safe_thread_map.py diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py index 52317d984b..b70ff3ba52 100644 --- a/dimos/core/docker_worker_manager.py +++ b/dimos/core/docker_worker_manager.py @@ -13,9 +13,11 @@ # limitations under the License. 
from __future__ import annotations -from concurrent.futures import Future, ThreadPoolExecutor, as_completed +from contextlib import suppress from typing import TYPE_CHECKING, Any +from dimos.utils.safe_thread_map import safe_thread_map + if TYPE_CHECKING: from dimos.core.docker_runner import DockerModule from dimos.core.module import Module @@ -28,32 +30,21 @@ class DockerWorkerManager: def deploy_parallel( specs: list[tuple[type[Module], tuple[Any, ...], dict[str, Any]]], ) -> list[DockerModule]: - """Deploy multiple DockerModules in parallel, collecting partial results on failure. + """Deploy multiple DockerModules in parallel. - Returns all successfully-created DockerModules. If any deployment fails, - the successful ones are still returned (so the caller can register them - for cleanup), and the first exception is re-raised. + If any deployment fails, all successfully-started containers are + stopped before an ExceptionGroup is raised. """ from dimos.core.docker_runner import DockerModule - results: dict[int, DockerModule] = {} - first_exc: Exception | None = None - - with ThreadPoolExecutor(max_workers=len(specs)) as executor: - futures: dict[Future[DockerModule], int] = { - executor.submit(lambda s=spec: DockerModule(s[0], *s[1], **s[2])): i - for i, spec in enumerate(specs) - } - for fut in as_completed(futures): - idx = futures[fut] - try: - results[idx] = fut.result() - except Exception as e: - if first_exc is None: - first_exc = e - - # Return in input order (missing indices = failed deployments) - ordered = [results[i] for i in sorted(results)] - if first_exc is not None: - raise first_exc - return ordered + def _on_errors( + _outcomes: list, successes: list[DockerModule], errors: list[Exception] + ) -> None: + for mod in successes: + with suppress(Exception): + mod.stop() + raise ExceptionGroup("docker deploy_parallel failed", errors) + + return safe_thread_map( + specs, lambda spec: DockerModule(spec[0], *spec[1], **spec[2]), _on_errors + ) diff --git 
a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 4ede195571..48546c5568 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -113,12 +113,9 @@ def deploy_parallel( worker_indices.append(i) worker_specs.append(spec) - worker_results: list[Any] = [] - docker_results: list[Any] = [] try: - worker_results = self._client.deploy_parallel(worker_specs) if worker_specs else [] - if docker_specs: - docker_results = DockerWorkerManager.deploy_parallel(docker_specs) + worker_results = self._client.deploy_parallel(worker_specs) + docker_results = DockerWorkerManager.deploy_parallel(docker_specs) finally: # Reassemble results in original input order results: list[Any] = [None] * len(module_specs) @@ -127,9 +124,9 @@ def deploy_parallel( for idx, mod in zip(docker_indices, docker_results, strict=False): results[idx] = mod # Register whatever succeeded so stop() can clean them up - for spec, module in zip(module_specs, results, strict=False): + for (module_class, _, _), module in zip(module_specs, results, strict=False): if module is not None: - self._deployed_modules[spec[0]] = module + self._deployed_modules[module_class] = module return results diff --git a/dimos/core/tests/test_parallel_deploy_cleanup.py b/dimos/core/tests/test_parallel_deploy_cleanup.py new file mode 100644 index 0000000000..1987fa4be7 --- /dev/null +++ b/dimos/core/tests/test_parallel_deploy_cleanup.py @@ -0,0 +1,219 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Tests that deploy_parallel cleans up successfully-started modules when a +sibling deployment fails ("middle module throws" scenario). +""" + +from __future__ import annotations + +import threading +from unittest.mock import MagicMock, patch + +import pytest + + +class TestDockerWorkerManagerPartialFailure: + """DockerWorkerManager.deploy_parallel must stop successful containers when one fails.""" + + @patch("dimos.core.docker_runner.DockerModule") + def test_middle_module_fails_stops_siblings(self, mock_docker_module_cls): + """Deploy 3 modules where the middle one fails. The other two must be stopped.""" + from dimos.core.docker_worker_manager import DockerWorkerManager + + mod_a = MagicMock(name="ModuleA") + mod_c = MagicMock(name="ModuleC") + + barrier = threading.Barrier(3, timeout=5) + + def fake_constructor(cls, *args, **kwargs): + label = cls.__name__ + barrier.wait() + if label == "B": + raise RuntimeError("B failed to start") + return mod_a if label == "A" else mod_c + + mock_docker_module_cls.side_effect = fake_constructor + + FakeA = type("A", (), {}) + FakeB = type("B", (), {}) + FakeC = type("C", (), {}) + + with pytest.raises(ExceptionGroup, match="docker deploy_parallel failed") as exc_info: + DockerWorkerManager.deploy_parallel( + [ + (FakeA, (), {}), + (FakeB, (), {}), + (FakeC, (), {}), + ] + ) + + assert len(exc_info.value.exceptions) == 1 + assert "B failed to start" in str(exc_info.value.exceptions[0]) + + # Both successful modules must have been stopped exactly once + mod_a.stop.assert_called_once() + mod_c.stop.assert_called_once() + + @patch("dimos.core.docker_runner.DockerModule") + def test_multiple_failures_raises_exception_group(self, mock_docker_module_cls): + """Deploy 3 modules where two fail. 
Should raise ExceptionGroup with both errors.""" + from dimos.core.docker_worker_manager import DockerWorkerManager + + mod_a = MagicMock(name="ModuleA") + + barrier = threading.Barrier(3, timeout=5) + + def fake_constructor(cls, *args, **kwargs): + label = cls.__name__ + barrier.wait() + if label == "B": + raise RuntimeError("B failed") + if label == "C": + raise ValueError("C failed") + return mod_a + + mock_docker_module_cls.side_effect = fake_constructor + + FakeA = type("A", (), {}) + FakeB = type("B", (), {}) + FakeC = type("C", (), {}) + + with pytest.raises(ExceptionGroup, match="docker deploy_parallel failed") as exc_info: + DockerWorkerManager.deploy_parallel( + [ + (FakeA, (), {}), + (FakeB, (), {}), + (FakeC, (), {}), + ] + ) + + assert len(exc_info.value.exceptions) == 2 + messages = {str(e) for e in exc_info.value.exceptions} + assert "B failed" in messages + assert "C failed" in messages + + # The one successful module must have been stopped + mod_a.stop.assert_called_once() + + @patch("dimos.core.docker_runner.DockerModule") + def test_all_succeed_no_stops(self, mock_docker_module_cls): + """When all deployments succeed, no modules should be stopped.""" + from dimos.core.docker_worker_manager import DockerWorkerManager + + mocks = [MagicMock(name=f"Mod{i}") for i in range(3)] + + def fake_constructor(cls, *args, **kwargs): + return mocks[["A", "B", "C"].index(cls.__name__)] + + mock_docker_module_cls.side_effect = fake_constructor + + FakeA = type("A", (), {}) + FakeB = type("B", (), {}) + FakeC = type("C", (), {}) + + results = DockerWorkerManager.deploy_parallel( + [ + (FakeA, (), {}), + (FakeB, (), {}), + (FakeC, (), {}), + ] + ) + + assert len(results) == 3 + for m in mocks: + m.stop.assert_not_called() + + @patch("dimos.core.docker_runner.DockerModule") + def test_stop_failure_does_not_mask_deploy_error(self, mock_docker_module_cls): + """If stop() itself raises during cleanup, the original deploy error still propagates.""" + from 
dimos.core.docker_worker_manager import DockerWorkerManager + + mod_a = MagicMock(name="ModuleA") + mod_a.stop.side_effect = OSError("stop failed") + + barrier = threading.Barrier(2, timeout=5) + + def fake_constructor(cls, *args, **kwargs): + barrier.wait() + if cls.__name__ == "B": + raise RuntimeError("B exploded") + return mod_a + + mock_docker_module_cls.side_effect = fake_constructor + + FakeA = type("A", (), {}) + FakeB = type("B", (), {}) + + with pytest.raises(ExceptionGroup, match="docker deploy_parallel failed"): + DockerWorkerManager.deploy_parallel([(FakeA, (), {}), (FakeB, (), {})]) + + # stop was attempted despite it raising + mod_a.stop.assert_called_once() + + +class TestWorkerManagerPartialFailure: + """WorkerManager.deploy_parallel must clean up successful RPCClients when one fails.""" + + def test_middle_module_fails_cleans_up_siblings(self): + from dimos.core.worker_manager import WorkerManager + + manager = WorkerManager(n_workers=2) + + mock_workers = [MagicMock(name=f"Worker{i}") for i in range(2)] + for w in mock_workers: + w.module_count = 0 + w.reserve_slot = MagicMock( + side_effect=lambda w=w: setattr(w, "module_count", w.module_count + 1) + ) + + manager._workers = mock_workers + manager._started = True + + def fake_deploy_module(module_class, args=(), kwargs=None): + if module_class.__name__ == "B": + raise RuntimeError("B failed to deploy") + return MagicMock(name=f"actor_{module_class.__name__}") + + for w in mock_workers: + w.deploy_module = fake_deploy_module + + FakeA = type("A", (), {}) + FakeB = type("B", (), {}) + FakeC = type("C", (), {}) + + rpc_clients_created: list[MagicMock] = [] + + with patch("dimos.core.worker_manager.RPCClient") as mock_rpc_cls: + + def make_rpc(actor, cls): + client = MagicMock(name=f"rpc_{cls.__name__}") + rpc_clients_created.append(client) + return client + + mock_rpc_cls.side_effect = make_rpc + + with pytest.raises(ExceptionGroup, match="worker deploy_parallel failed"): + manager.deploy_parallel( 
+ [ + (FakeA, (), {}), + (FakeB, (), {}), + (FakeC, (), {}), + ] + ) + + # Every successfully-created RPC client must have been cleaned up exactly once + for client in rpc_clients_created: + client.stop_rpc_client.assert_called_once() diff --git a/dimos/core/worker_manager.py b/dimos/core/worker_manager.py index 4dbb51eb54..25a052590c 100644 --- a/dimos/core/worker_manager.py +++ b/dimos/core/worker_manager.py @@ -14,12 +14,13 @@ from __future__ import annotations -from concurrent.futures import ThreadPoolExecutor +from contextlib import suppress from typing import TYPE_CHECKING, Any from dimos.core.rpc_client import RPCClient from dimos.core.worker import Worker from dimos.utils.logging_config import setup_logger +from dimos.utils.safe_thread_map import safe_thread_map if TYPE_CHECKING: from dimos.core.module import ModuleT @@ -65,6 +66,9 @@ def deploy_parallel( if self._closed: raise RuntimeError("WorkerManager is closed") + if len(module_specs) == 0: + return [] + # Auto-start for backward compatibility if not self._started: self.start() @@ -78,17 +82,19 @@ def deploy_parallel( worker.reserve_slot() assignments.append((worker, module_class, args, kwargs)) - def _deploy( - item: tuple[Worker, type[ModuleT], tuple[Any, ...], dict[Any, Any]], - ) -> RPCClient: - worker, module_class, args, kwargs = item - actor = worker.deploy_module(module_class, args=args, kwargs=kwargs) - return RPCClient(actor, module_class) - - with ThreadPoolExecutor(max_workers=len(assignments)) as pool: - results = list(pool.map(_deploy, assignments)) - - return results + def _on_errors( + _outcomes: list, successes: list[RPCClient], errors: list[Exception] + ) -> None: + for rpc_client in successes: + with suppress(Exception): + rpc_client.stop_rpc_client() + raise ExceptionGroup("worker deploy_parallel failed", errors) + + return safe_thread_map( + assignments, + lambda item: RPCClient(item[0].deploy_module(item[1], item[2], item[3]), item[1]), + _on_errors, + ) @property def 
workers(self) -> list[Worker]: diff --git a/dimos/utils/safe_thread_map.py b/dimos/utils/safe_thread_map.py new file mode 100644 index 0000000000..f051b0d950 --- /dev/null +++ b/dimos/utils/safe_thread_map.py @@ -0,0 +1,92 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +from concurrent.futures import Future, ThreadPoolExecutor, as_completed +from typing import TYPE_CHECKING, Any, TypeVar + +if TYPE_CHECKING: + from collections.abc import Callable, Sequence + +T = TypeVar("T") +R = TypeVar("R") + + +def safe_thread_map( + items: Sequence[T], + fn: Callable[[T], R], + on_errors: Callable[[list[tuple[T, R | Exception]], list[R], list[Exception]], Any] + | None = None, +) -> list[R]: + """Thread-pool map that waits for all items to finish before raising and a cleanup handler + + - Empty *items* → returns ``[]`` immediately. + - All succeed → returns results in input order. + - Any fail → calls ``on_errors(outcomes, successes, errors)`` where + *outcomes* is a list of ``(input, result_or_exception)`` pairs in input + order, *successes* is the list of successful results, and *errors* is + the list of exceptions. If *on_errors* raises, that exception propagates. + If *on_errors* returns normally, its return value is returned from + ``safe_thread_map``. If *on_errors* is ``None``, raises an + ``ExceptionGroup``. 
+ + Example:: + + def start_service(name: str) -> Connection: + return connect(name) + + def cleanup( + outcomes: list[tuple[str, Connection | Exception]], + successes: list[Connection], + errors: list[Exception], + ) -> None: + for conn in successes: + conn.close() + raise ExceptionGroup("failed to start services", errors) + + connections = safe_thread_map( + ["db", "cache", "queue"], + start_service, + cleanup, # called only if any start_service() raises + ) + """ + if not items: + return [] + + outcomes: dict[int, R | Exception] = {} + + with ThreadPoolExecutor(max_workers=len(items)) as pool: + futures: dict[Future[R], int] = {pool.submit(fn, item): i for i, item in enumerate(items)} + for fut in as_completed(futures): + idx = futures[fut] + try: + outcomes[idx] = fut.result() + except Exception as e: + outcomes[idx] = e + + successes: list[R] = [] + errors: list[Exception] = [] + for v in outcomes.values(): + if isinstance(v, Exception): + errors.append(v) + else: + successes.append(v) + + if errors: + if on_errors is not None: + zipped = [(items[i], outcomes[i]) for i in range(len(items))] + return on_errors(zipped, successes, errors) # type: ignore[return-value] + raise ExceptionGroup("safe_thread_map failed", errors) + + return [outcomes[i] for i in range(len(items))] # type: ignore[misc] From 951b1aa4d35b20ccf162617ef433c3ce7cb9dd62 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 21:48:13 -0800 Subject: [PATCH 35/52] mypy fixup --- dimos/core/docker_worker_manager.py | 2 +- dimos/core/module_coordinator.py | 4 ++-- dimos/core/worker_manager.py | 2 +- dimos/utils/safe_thread_map.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py index b70ff3ba52..34183fda9f 100644 --- a/dimos/core/docker_worker_manager.py +++ b/dimos/core/docker_worker_manager.py @@ -38,7 +38,7 @@ def deploy_parallel( from dimos.core.docker_runner import DockerModule def 
_on_errors( - _outcomes: list, successes: list[DockerModule], errors: list[Exception] + _outcomes: list[Any], successes: list[DockerModule], errors: list[Exception] ) -> None: for mod in successes: with suppress(Exception): diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 48546c5568..8269a47bf9 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -115,13 +115,13 @@ def deploy_parallel( try: worker_results = self._client.deploy_parallel(worker_specs) - docker_results = DockerWorkerManager.deploy_parallel(docker_specs) + docker_results = DockerWorkerManager.deploy_parallel(docker_specs) # type: ignore[arg-type] finally: # Reassemble results in original input order results: list[Any] = [None] * len(module_specs) for idx, mod in zip(worker_indices, worker_results, strict=False): results[idx] = mod - for idx, mod in zip(docker_indices, docker_results, strict=False): + for idx, mod in zip(docker_indices, docker_results, strict=False): # type: ignore[assignment] results[idx] = mod # Register whatever succeeded so stop() can clean them up for (module_class, _, _), module in zip(module_specs, results, strict=False): diff --git a/dimos/core/worker_manager.py b/dimos/core/worker_manager.py index 25a052590c..b9c25c8445 100644 --- a/dimos/core/worker_manager.py +++ b/dimos/core/worker_manager.py @@ -83,7 +83,7 @@ def deploy_parallel( assignments.append((worker, module_class, args, kwargs)) def _on_errors( - _outcomes: list, successes: list[RPCClient], errors: list[Exception] + _outcomes: list[Any], successes: list[RPCClient], errors: list[Exception] ) -> None: for rpc_client in successes: with suppress(Exception): diff --git a/dimos/utils/safe_thread_map.py b/dimos/utils/safe_thread_map.py index f051b0d950..240f5e7099 100644 --- a/dimos/utils/safe_thread_map.py +++ b/dimos/utils/safe_thread_map.py @@ -86,7 +86,7 @@ def cleanup( if errors: if on_errors is not None: zipped = [(items[i], outcomes[i]) for i in 
range(len(items))] - return on_errors(zipped, successes, errors) # type: ignore[return-value] + return on_errors(zipped, successes, errors) # type: ignore[return-value, no-any-return] raise ExceptionGroup("safe_thread_map failed", errors) return [outcomes[i] for i in range(len(items))] # type: ignore[misc] From 55cc94cec29890b25e109eba83db02a55ee466b8 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 22:18:08 -0800 Subject: [PATCH 36/52] - --- dimos/core/module_coordinator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 8269a47bf9..cbcdb179e9 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -113,6 +113,8 @@ def deploy_parallel( worker_indices.append(i) worker_specs.append(spec) + worker_results: list[Any] = [] + docker_results: list[Any] = [] try: worker_results = self._client.deploy_parallel(worker_specs) docker_results = DockerWorkerManager.deploy_parallel(docker_specs) # type: ignore[arg-type] From d2d761aea183c61b8b10031bba152f45c9573b1b Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Fri, 6 Mar 2026 21:35:53 -0800 Subject: [PATCH 37/52] add docker_build_ssh and image rebuild check --- dimos/core/docker_build.py | 41 +++++++++++++++++++++++++++++++++++++ dimos/core/docker_runner.py | 36 ++++++++++++++++++++------------ 2 files changed, 64 insertions(+), 13 deletions(-) diff --git a/dimos/core/docker_build.py b/dimos/core/docker_build.py index 7ee90fc5c3..2679450269 100644 --- a/dimos/core/docker_build.py +++ b/dimos/core/docker_build.py @@ -20,6 +20,7 @@ from __future__ import annotations +import hashlib import subprocess from typing import TYPE_CHECKING @@ -90,14 +91,52 @@ def _convert_dockerfile(dockerfile: Path) -> Path: return converted +_BUILD_HASH_LABEL = "dimos.build.hash" + + +def _compute_build_hash(cfg: DockerModuleConfig) -> str: + """Hash Dockerfile contents, build args, and build context path.""" + assert 
cfg.docker_file is not None + digest = hashlib.sha256() + digest.update(cfg.docker_file.read_bytes()) + for key, val in sorted(cfg.docker_build_args.items()): + digest.update(f"{key}={val}".encode()) + return digest.hexdigest() + + +def _get_image_build_hash(docker_bin: str, image_name: str) -> str | None: + """Read the build hash label from an existing Docker image.""" + r = _run( + [ + docker_bin, + "image", + "inspect", + "-f", + '{{index .Config.Labels "' + _BUILD_HASH_LABEL + '"}}', + image_name, + ], + timeout=DOCKER_CMD_TIMEOUT, + ) + if r.returncode != 0: + return None + value = r.stdout.strip() + # docker prints "" when the label is missing + return value if value and value != "" else None + + def build_image(cfg: DockerModuleConfig) -> None: """Build Docker image using footer mode conversion.""" if cfg.docker_file is None: raise ValueError("docker_file is required for building Docker images") + + build_hash = _compute_build_hash(cfg) dockerfile = _convert_dockerfile(cfg.docker_file) context = cfg.docker_build_context or cfg.docker_file.parent cmd = [_docker_bin(cfg), "build", "-t", cfg.docker_image, "-f", str(dockerfile)] + cmd.extend(["--label", f"{_BUILD_HASH_LABEL}={build_hash}"]) + if cfg.docker_build_ssh: + cmd.extend(["--ssh", "default"]) for k, v in cfg.docker_build_args.items(): cmd.extend(["--build-arg", f"{k}={v}"]) cmd.append(str(context)) @@ -115,6 +154,8 @@ def image_exists(cfg: DockerModuleConfig) -> bool: __all__ = [ "DIMOS_FOOTER", + "_compute_build_hash", + "_get_image_build_hash", "build_image", "image_exists", ] diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 26d822ce73..4a19746c5e 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -54,6 +54,8 @@ class DockerModuleConfig(ModuleConfig): For advanced Docker options not listed here, use docker_extra_args. 
Example: docker_extra_args=["--cap-add=SYS_ADMIN", "--read-only"] + + NOTE: a DockerModule will rebuild automatically if the Dockerfile or build args change """ # Build / image @@ -61,6 +63,7 @@ class DockerModuleConfig(ModuleConfig): docker_file: Path | None = None # Required on host for building, not needed in container docker_build_context: Path | None = None docker_build_args: dict[str, str] = field(default_factory=dict) + docker_build_ssh: bool = False # Pass --ssh default to docker build (for private repo clones) # Identity docker_container_name: str | None = None @@ -180,7 +183,12 @@ class DockerModule(ModuleProxyProtocol): config: DockerModuleConfig def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> None: - from dimos.core.docker_build import build_image, image_exists + from dimos.core.docker_build import ( + _compute_build_hash, + _get_image_build_hash, + build_image, + image_exists, + ) config_class = getattr(module_class, "default_config", DockerModuleConfig) if not issubclass(config_class, DockerModuleConfig): @@ -211,21 +219,23 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non # Build or pull image, launch container, wait for RPC server try: - if not image_exists(config): - if config.docker_file is not None: + if config.docker_file is not None: + current_hash = _compute_build_hash(config) + stored_hash = _get_image_build_hash(_docker_bin(config), config.docker_image) + if current_hash != stored_hash: logger.info(f"Building {config.docker_image}") build_image(config) - else: - logger.info(f"Pulling {config.docker_image}") - r = _run( - [_docker_bin(config), "pull", config.docker_image], - timeout=config.docker_pull_timeout, + elif not image_exists(config): + logger.info(f"Pulling {config.docker_image}") + r = _run( + [_docker_bin(config), "pull", config.docker_image], + timeout=config.docker_pull_timeout, + ) + if r.returncode != 0: + raise RuntimeError( + f"Failed to pull image 
'{config.docker_image}'.\n" + f"STDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" ) - if r.returncode != 0: - raise RuntimeError( - f"Failed to pull image '{config.docker_image}'.\n" - f"STDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" - ) reconnect = False if _is_container_running(config, self._container_name): From a41b9f165802ecebfeb2c7ce829eaa8080751c45 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Fri, 6 Mar 2026 21:42:48 -0800 Subject: [PATCH 38/52] simplify --- dimos/core/docker_build.py | 52 ++++++++++++++----------------------- dimos/core/docker_runner.py | 21 ++++++--------- 2 files changed, 27 insertions(+), 46 deletions(-) diff --git a/dimos/core/docker_build.py b/dimos/core/docker_build.py index 2679450269..d3fbcec685 100644 --- a/dimos/core/docker_build.py +++ b/dimos/core/docker_build.py @@ -33,10 +33,11 @@ logger = setup_logger() -# Timeout for quick Docker commands +_BUILD_HASH_LABEL = "dimos.build.hash" + DOCKER_CMD_TIMEOUT = 20 -# Sentinel value to detect already-converted Dockerfiles (UUID ensures uniqueness) +# the way of detecting already-converted Dockerfiles (UUID ensures uniqueness) DIMOS_SENTINEL = "DIMOS-MODULE-CONVERSION-427593ae-c6e8-4cf1-9b2d-ee81a420a5dc" # Footer appended to Dockerfiles for DimOS module conversion @@ -54,28 +55,6 @@ """ -def _run(cmd: list[str], *, timeout: float | None = None) -> subprocess.CompletedProcess[str]: - """Run a command and return the result.""" - return subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, check=False) - - -def _run_streaming(cmd: list[str]) -> int: - """Run command and stream output to terminal. 
Returns exit code.""" - result = subprocess.run(cmd, text=True) - return result.returncode - - -def _docker_bin(cfg: DockerModuleConfig) -> str: - """Get docker binary path.""" - return cfg.docker_bin or "docker" - - -def _image_exists(docker_bin: str, image_name: str) -> bool: - """Check if a Docker image exists locally.""" - r = _run([docker_bin, "image", "inspect", image_name], timeout=DOCKER_CMD_TIMEOUT) - return r.returncode == 0 - - def _convert_dockerfile(dockerfile: Path) -> Path: """Append DimOS footer to Dockerfile. Returns path to converted file.""" content = dockerfile.read_text() @@ -91,9 +70,6 @@ def _convert_dockerfile(dockerfile: Path) -> Path: return converted -_BUILD_HASH_LABEL = "dimos.build.hash" - - def _compute_build_hash(cfg: DockerModuleConfig) -> str: """Hash Dockerfile contents, build args, and build context path.""" assert cfg.docker_file is not None @@ -106,7 +82,7 @@ def _compute_build_hash(cfg: DockerModuleConfig) -> str: def _get_image_build_hash(docker_bin: str, image_name: str) -> str | None: """Read the build hash label from an existing Docker image.""" - r = _run( + r = subprocess.run( [ docker_bin, "image", @@ -115,7 +91,10 @@ def _get_image_build_hash(docker_bin: str, image_name: str) -> str | None: '{{index .Config.Labels "' + _BUILD_HASH_LABEL + '"}}', image_name, ], + capture_output=True, + text=True, timeout=DOCKER_CMD_TIMEOUT, + check=False, ) if r.returncode != 0: return None @@ -133,7 +112,7 @@ def build_image(cfg: DockerModuleConfig) -> None: dockerfile = _convert_dockerfile(cfg.docker_file) context = cfg.docker_build_context or cfg.docker_file.parent - cmd = [_docker_bin(cfg), "build", "-t", cfg.docker_image, "-f", str(dockerfile)] + cmd = [cfg.docker_bin, "build", "-t", cfg.docker_image, "-f", str(dockerfile)] cmd.extend(["--label", f"{_BUILD_HASH_LABEL}={build_hash}"]) if cfg.docker_build_ssh: cmd.extend(["--ssh", "default"]) @@ -142,14 +121,21 @@ def build_image(cfg: DockerModuleConfig) -> None: 
cmd.append(str(context)) logger.info(f"Building Docker image: {cfg.docker_image}") - exit_code = _run_streaming(cmd) - if exit_code != 0: - raise RuntimeError(f"Docker build failed with exit code {exit_code}") + result = subprocess.run(cmd, text=True) + if result.returncode != 0: + raise RuntimeError(f"Docker build failed with exit code {result.returncode}") def image_exists(cfg: DockerModuleConfig) -> bool: """Check if the configured Docker image exists locally.""" - return _image_exists(_docker_bin(cfg), cfg.docker_image) + r = subprocess.run( + [cfg.docker_bin, "image", "inspect", cfg.docker_image], + capture_output=True, + text=True, + timeout=DOCKER_CMD_TIMEOUT, + check=False, + ) + return r.returncode == 0 __all__ = [ diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 4a19746c5e..c81d4367bc 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -128,25 +128,20 @@ def _run(cmd: list[str], *, timeout: float | None = None) -> subprocess.Complete return subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, check=False) -def _docker_bin(cfg: DockerModuleConfig) -> str: - """Get docker binary path, defaulting to 'docker' if empty/None.""" - return cfg.docker_bin or "docker" - - def _remove_container(cfg: DockerModuleConfig, name: str) -> None: - _run([_docker_bin(cfg), "rm", "-f", name], timeout=DOCKER_CMD_TIMEOUT) + _run([cfg.docker_bin, "rm", "-f", name], timeout=DOCKER_CMD_TIMEOUT) def _is_container_running(cfg: DockerModuleConfig, name: str) -> bool: r = _run( - [_docker_bin(cfg), "inspect", "-f", "{{.State.Running}}", name], + [cfg.docker_bin, "inspect", "-f", "{{.State.Running}}", name], timeout=DOCKER_STATUS_TIMEOUT, ) return r.returncode == 0 and r.stdout.strip() == "true" def _tail_logs(cfg: DockerModuleConfig, name: str, n: int = LOG_TAIL_LINES) -> str: - r = _run([_docker_bin(cfg), "logs", "--tail", str(n), name], timeout=DOCKER_CMD_TIMEOUT) + r = _run([cfg.docker_bin, "logs", "--tail", 
str(n), name], timeout=DOCKER_CMD_TIMEOUT) out = (r.stdout or "").rstrip() err = (r.stderr or "").rstrip() return out + ("\n" + err if err else "") @@ -221,14 +216,14 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non try: if config.docker_file is not None: current_hash = _compute_build_hash(config) - stored_hash = _get_image_build_hash(_docker_bin(config), config.docker_image) + stored_hash = _get_image_build_hash(config.docker_bin, config.docker_image) if current_hash != stored_hash: logger.info(f"Building {config.docker_image}") build_image(config) elif not image_exists(config): logger.info(f"Pulling {config.docker_image}") r = _run( - [_docker_bin(config), "pull", config.docker_image], + [config.docker_bin, "pull", config.docker_image], timeout=config.docker_pull_timeout, ) if r.returncode != 0: @@ -245,7 +240,7 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non else: logger.info(f"Stopping existing container: {self._container_name}") _run( - [_docker_bin(config), "stop", self._container_name], + [config.docker_bin, "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT, ) @@ -313,7 +308,7 @@ def _cleanup(self) -> None: self._unsub_fns.clear() with suppress(Exception): _run( - [_docker_bin(self.config), "stop", self._container_name], + [self.config.docker_bin, "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT, ) with suppress(Exception): @@ -353,7 +348,7 @@ def _build_docker_run_command(self) -> list[str]: cfg = self.config self._validate_config(cfg) - cmd = [_docker_bin(cfg), "run", "-d"] + cmd = [cfg.docker_bin, "run", "-d"] self._add_lifecycle_args(cmd, cfg) self._add_network_args(cmd, cfg) self._add_port_args(cmd, cfg) From fadabd9f85035e384cf7afe5af10a4b3528b0be2 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Fri, 6 Mar 2026 23:07:30 -0800 Subject: [PATCH 39/52] misc --- dimos/core/docker_build.py | 19 +++++++++++-------- dimos/core/docker_runner.py | 2 +- 
dimos/core/module_coordinator.py | 9 +++++++-- dimos/utils/safe_thread_map.py | 2 ++ 4 files changed, 21 insertions(+), 11 deletions(-) diff --git a/dimos/core/docker_build.py b/dimos/core/docker_build.py index d3fbcec685..036c4cfd6c 100644 --- a/dimos/core/docker_build.py +++ b/dimos/core/docker_build.py @@ -71,25 +71,26 @@ def _convert_dockerfile(dockerfile: Path) -> Path: def _compute_build_hash(cfg: DockerModuleConfig) -> str: - """Hash Dockerfile contents, build args, and build context path.""" + """Hash Dockerfile contents, build args, and SSH flag.""" assert cfg.docker_file is not None digest = hashlib.sha256() digest.update(cfg.docker_file.read_bytes()) for key, val in sorted(cfg.docker_build_args.items()): digest.update(f"{key}={val}".encode()) + digest.update(f"ssh={cfg.docker_build_ssh}".encode()) return digest.hexdigest() -def _get_image_build_hash(docker_bin: str, image_name: str) -> str | None: +def _get_image_build_hash(cfg: DockerModuleConfig) -> str | None: """Read the build hash label from an existing Docker image.""" r = subprocess.run( [ - docker_bin, + cfg.docker_bin, "image", "inspect", "-f", '{{index .Config.Labels "' + _BUILD_HASH_LABEL + '"}}', - image_name, + cfg.docker_image, ], capture_output=True, text=True, @@ -121,9 +122,13 @@ def build_image(cfg: DockerModuleConfig) -> None: cmd.append(str(context)) logger.info(f"Building Docker image: {cfg.docker_image}") - result = subprocess.run(cmd, text=True) + # Stream stdout to terminal so the user sees build progress, but capture + # stderr separately so we can include it in the error message on failure. 
+ result = subprocess.run(cmd, text=True, stderr=subprocess.PIPE) if result.returncode != 0: - raise RuntimeError(f"Docker build failed with exit code {result.returncode}") + raise RuntimeError( + f"Docker build failed with exit code {result.returncode}\nSTDERR:\n{result.stderr}" + ) def image_exists(cfg: DockerModuleConfig) -> bool: @@ -140,8 +145,6 @@ def image_exists(cfg: DockerModuleConfig) -> bool: __all__ = [ "DIMOS_FOOTER", - "_compute_build_hash", - "_get_image_build_hash", "build_image", "image_exists", ] diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index c81d4367bc..97dbe5e209 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -216,7 +216,7 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non try: if config.docker_file is not None: current_hash = _compute_build_hash(config) - stored_hash = _get_image_build_hash(config.docker_bin, config.docker_image) + stored_hash = _get_image_build_hash(config) if current_hash != stored_hash: logger.info(f"Building {config.docker_image}") build_image(config) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index cbcdb179e9..7e42f566fa 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -113,19 +113,24 @@ def deploy_parallel( worker_indices.append(i) worker_specs.append(spec) + # Intentionally sequential: worker deploys first, then docker. + # Both internally parallelize across their own items. Running them + # concurrently would add complexity for minimal gain since they use + # different resource pools (processes vs containers). 
worker_results: list[Any] = [] docker_results: list[Any] = [] try: worker_results = self._client.deploy_parallel(worker_specs) docker_results = DockerWorkerManager.deploy_parallel(docker_specs) # type: ignore[arg-type] finally: - # Reassemble results in original input order + # Reassemble whatever succeeded into original input order so + # stop() can clean them up even if a later deploy raised. + # zip(strict=False) safely handles partial results (empty lists). results: list[Any] = [None] * len(module_specs) for idx, mod in zip(worker_indices, worker_results, strict=False): results[idx] = mod for idx, mod in zip(docker_indices, docker_results, strict=False): # type: ignore[assignment] results[idx] = mod - # Register whatever succeeded so stop() can clean them up for (module_class, _, _), module in zip(module_specs, results, strict=False): if module is not None: self._deployed_modules[module_class] = module diff --git a/dimos/utils/safe_thread_map.py b/dimos/utils/safe_thread_map.py index 240f5e7099..6729c989f3 100644 --- a/dimos/utils/safe_thread_map.py +++ b/dimos/utils/safe_thread_map.py @@ -75,6 +75,8 @@ def cleanup( except Exception as e: outcomes[idx] = e + # Note: successes/errors are in completion order, not input order. + # This is fine — on_errors only needs them for cleanup, not ordering. 
successes: list[R] = [] errors: list[Exception] = [] for v in outcomes.values(): From d6ec65805c41a82eb100c0f8e65e01f3b672ed45 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sat, 7 Mar 2026 00:11:47 -0800 Subject: [PATCH 40/52] add docker_build_extra_args --- dimos/core/docker_build.py | 6 +++--- dimos/core/docker_runner.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dimos/core/docker_build.py b/dimos/core/docker_build.py index 036c4cfd6c..5b54ecbf22 100644 --- a/dimos/core/docker_build.py +++ b/dimos/core/docker_build.py @@ -77,7 +77,8 @@ def _compute_build_hash(cfg: DockerModuleConfig) -> str: digest.update(cfg.docker_file.read_bytes()) for key, val in sorted(cfg.docker_build_args.items()): digest.update(f"{key}={val}".encode()) - digest.update(f"ssh={cfg.docker_build_ssh}".encode()) + for arg in cfg.docker_build_extra_args: + digest.update(arg.encode()) return digest.hexdigest() @@ -115,10 +116,9 @@ def build_image(cfg: DockerModuleConfig) -> None: context = cfg.docker_build_context or cfg.docker_file.parent cmd = [cfg.docker_bin, "build", "-t", cfg.docker_image, "-f", str(dockerfile)] cmd.extend(["--label", f"{_BUILD_HASH_LABEL}={build_hash}"]) - if cfg.docker_build_ssh: - cmd.extend(["--ssh", "default"]) for k, v in cfg.docker_build_args.items(): cmd.extend(["--build-arg", f"{k}={v}"]) + cmd.extend(cfg.docker_build_extra_args) cmd.append(str(context)) logger.info(f"Building Docker image: {cfg.docker_image}") diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 97dbe5e209..a72718b564 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -63,7 +63,7 @@ class DockerModuleConfig(ModuleConfig): docker_file: Path | None = None # Required on host for building, not needed in container docker_build_context: Path | None = None docker_build_args: dict[str, str] = field(default_factory=dict) - docker_build_ssh: bool = False # Pass --ssh default to docker build (for private repo clones) + 
docker_build_extra_args: list[str] = field(default_factory=list) # Extra args for docker build # Identity docker_container_name: str | None = None From 87cdcc0a2638e24a8d073482c474b6cb551183ca Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sat, 7 Mar 2026 02:49:39 -0800 Subject: [PATCH 41/52] PR review fixes: better error messages, consistent API, restore install.sh - Include docker_build_ssh in build hash so toggling SSH triggers rebuild - Capture stderr on build failure for actionable error messages - Change _get_image_build_hash to take cfg instead of raw docker_bin str - Remove private names from __all__ in docker_build.py - Add helpful TypeError when DockerModule payload isn't JSON-serializable - Replace ThreadPoolExecutor.map in start_all_modules with safe_thread_map to surface all failures via ExceptionGroup instead of losing all but first - Restore scripts/install.sh and README.md (accidentally removed) - Add intent comments on deploy_parallel and safe_thread_map design choices --- dimos/core/docker_runner.py | 10 +++++++++- dimos/core/module_coordinator.py | 12 +++++++++--- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index a72718b564..6d12705521 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -489,7 +489,15 @@ def _build_container_command(self, cfg: DockerModuleConfig) -> list[str]: kwargs = {"config": _extract_module_config(cfg)} payload = {"module_path": module_path, "args": list(self._args), "kwargs": kwargs} # DimOS base image entrypoint already runs "dimos.core.docker_runner run" - return ["--payload", json.dumps(payload, separators=(",", ":"))] + try: + payload_json = json.dumps(payload, separators=(",", ":")) + except TypeError as e: + raise TypeError( + f"Cannot serialize DockerModule payload to JSON: {e}\n" + f"Ensure all constructor args/kwargs for {self._module_class.__name__} are " + f"JSON-serializable, or use docker_command to bypass 
automatic payload generation." + ) from e + return ["--payload", payload_json] def _wait_for_rpc(self) -> None: """Poll until the container's RPC server is reachable.""" diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index ac693c1795..6c639117bc 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -14,7 +14,6 @@ from __future__ import annotations -from concurrent.futures import ThreadPoolExecutor import threading from typing import TYPE_CHECKING, Any @@ -173,11 +172,18 @@ def deploy_parallel( return results def start_all_modules(self) -> None: + from dimos.utils.safe_thread_map import safe_thread_map + modules = list(self._deployed_modules.values()) if not modules: raise ValueError("No modules deployed. Call deploy() before start_all_modules().") - with ThreadPoolExecutor(max_workers=len(modules)) as executor: - list(executor.map(lambda m: m.start(), modules)) + + def _on_start_errors( + _outcomes: list[Any], _successes: list[Any], errors: list[Exception] + ) -> None: + raise ExceptionGroup("start_all_modules failed", errors) + + safe_thread_map(modules, lambda m: m.start(), _on_start_errors) for module in modules: if hasattr(module, "on_system_modules"): From d7ef2db92af2a9f16892d43839853ee6721e78dc Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sat, 7 Mar 2026 02:56:50 -0800 Subject: [PATCH 42/52] fix pull problem --- dimos/core/docker_runner.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 6d12705521..987e834eae 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -222,14 +222,15 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non build_image(config) elif not image_exists(config): logger.info(f"Pulling {config.docker_image}") - r = _run( + r = subprocess.run( [config.docker_bin, "pull", config.docker_image], + text=True, + stderr=subprocess.PIPE, 
timeout=config.docker_pull_timeout, ) if r.returncode != 0: raise RuntimeError( - f"Failed to pull image '{config.docker_image}'.\n" - f"STDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" + f"Failed to pull image '{config.docker_image}'.\nSTDERR:\n{r.stderr}" ) reconnect = False From 7639f3d1c1ef7d8b06fba377981a548c447164e4 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sat, 7 Mar 2026 14:31:00 -0800 Subject: [PATCH 43/52] fix reconnect edgecase and __getattr__ loop edgecase --- dimos/core/docker_runner.py | 22 ++--- dimos/core/tests/test_docker_deployment.py | 97 ++++++++++++++++++++++ 2 files changed, 109 insertions(+), 10 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 987e834eae..db5f804659 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -307,15 +307,16 @@ def _cleanup(self) -> None: with suppress(Exception): unsub() self._unsub_fns.clear() - with suppress(Exception): - _run( - [self.config.docker_bin, "stop", self._container_name], - timeout=DOCKER_STOP_TIMEOUT, - ) - with suppress(Exception): - _remove_container(self.config, self._container_name) + if not self.config.docker_reconnect_container: + with suppress(Exception): + _run( + [self.config.docker_bin, "stop", self._container_name], + timeout=DOCKER_STOP_TIMEOUT, + ) + with suppress(Exception): + _remove_container(self.config, self._container_name) self._running = False - logger.info(f"Stopped container: {self._container_name}") + logger.info(f"Cleaned up container handle: {self._container_name}") def status(self) -> dict[str, Any]: cfg = self.config @@ -337,10 +338,11 @@ def set_transport(self, stream_name: str, transport: Any) -> bool: return bool(result) def __getattr__(self, name: str) -> Any: - if name in self.rpcs: + rpcs = self.__dict__.get("rpcs") + if rpcs is not None and name in rpcs: original_method = getattr(self._module_class, name, None) return RpcCall(original_method, self.rpc, name, self.remote_name, self._unsub_fns, None) - 
raise AttributeError(f"{name} not found on {self._module_class.__name__}") + raise AttributeError(f"{name} not found on {type(self).__name__}") # Docker command building (split into focused helpers for readability) diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index 17d1290916..e89b88e327 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -193,3 +193,100 @@ def test_stop_cleans_up_docker_modules(self, mock_worker_manager_cls, mock_docke assert mock_dm.stop.call_count == 1 # Worker manager also closed mock_worker_mgr.close_all.assert_called_once() + + +class TestDockerModuleGetattr: + """Tests for DockerModule.__getattr__ avoiding infinite recursion.""" + + def test_getattr_no_recursion_when_rpcs_not_set(self): + """If __init__ fails before self.rpcs is assigned, __getattr__ must not recurse.""" + from dimos.core.docker_runner import DockerModule + + dm = DockerModule.__new__(DockerModule) + # Don't set rpcs, _module_class, or any instance attrs — simulates early __init__ failure + with pytest.raises(AttributeError): + _ = dm.some_method + + def test_getattr_no_recursion_on_cleanup_attrs(self): + """Accessing cleanup-related attrs before they exist must raise, not recurse.""" + from dimos.core.docker_runner import DockerModule + + dm = DockerModule.__new__(DockerModule) + # These are accessed during _cleanup() — if rpcs isn't set, they must not recurse + for attr in ("rpc", "config", "_container_name", "_unsub_fns"): + with pytest.raises(AttributeError): + getattr(dm, attr) + + def test_getattr_delegates_to_rpc_when_rpcs_set(self): + from dimos.core.docker_runner import DockerModule + from dimos.core.rpc_client import RpcCall + + dm = DockerModule.__new__(DockerModule) + dm.rpcs = {"do_thing"} + + # _module_class needs a real method with __name__ for RpcCall + class FakeMod: + def do_thing(self) -> None: ... 
+ + dm._module_class = FakeMod + dm.rpc = MagicMock() + dm.remote_name = "FakeMod" + dm._unsub_fns = [] + + result = dm.do_thing + assert isinstance(result, RpcCall) + + def test_getattr_raises_for_unknown_method(self): + from dimos.core.docker_runner import DockerModule + + dm = DockerModule.__new__(DockerModule) + dm.rpcs = {"do_thing"} + + with pytest.raises(AttributeError, match="not found"): + _ = dm.nonexistent + + +class TestDockerModuleCleanupReconnect: + """Tests for DockerModule._cleanup with docker_reconnect_container.""" + + def test_cleanup_skips_stop_when_reconnect(self): + from dimos.core.docker_runner import DockerModule + + with patch.object(DockerModule, "__init__", lambda self: None): + dm = DockerModule.__new__(DockerModule) + dm._running = True + dm._container_name = "test_container" + dm._unsub_fns = [] + dm.rpc = MagicMock() + dm.remote_name = "TestModule" + + # reconnect mode: should NOT stop/rm the container + dm.config = FakeDockerConfig(docker_reconnect_container=True) + with ( + patch("dimos.core.docker_runner._run") as mock_run, + patch("dimos.core.docker_runner._remove_container") as mock_rm, + ): + dm._cleanup() + mock_run.assert_not_called() + mock_rm.assert_not_called() + + def test_cleanup_stops_container_when_not_reconnect(self): + from dimos.core.docker_runner import DockerModule + + with patch.object(DockerModule, "__init__", lambda self: None): + dm = DockerModule.__new__(DockerModule) + dm._running = True + dm._container_name = "test_container" + dm._unsub_fns = [] + dm.rpc = MagicMock() + dm.remote_name = "TestModule" + + # normal mode: should stop and rm the container + dm.config = FakeDockerConfig(docker_reconnect_container=False) + with ( + patch("dimos.core.docker_runner._run") as mock_run, + patch("dimos.core.docker_runner._remove_container") as mock_rm, + ): + dm._cleanup() + mock_run.assert_called_once() # docker stop + mock_rm.assert_called_once() # docker rm -f From 14e3d1e6a0ffccd986a9ba43a2e1b46aee4ce24a Mon Sep 17 
00:00:00 2001 From: Jeff Hykin Date: Sat, 7 Mar 2026 14:36:00 -0800 Subject: [PATCH 44/52] change the ignore postfix --- .gitignore | 1 - dimos/core/docker_build.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 12b2f19ca3..4045db012e 100644 --- a/.gitignore +++ b/.gitignore @@ -42,7 +42,6 @@ package-lock.json # Ignore build artifacts dist/ build/ -.Dockerfile.dimos # Ignore data directory but keep .lfs subdirectory data/* diff --git a/dimos/core/docker_build.py b/dimos/core/docker_build.py index 5b54ecbf22..1e357d987b 100644 --- a/dimos/core/docker_build.py +++ b/dimos/core/docker_build.py @@ -65,7 +65,7 @@ def _convert_dockerfile(dockerfile: Path) -> Path: logger.info(f"Converting {dockerfile.name} to DimOS format") - converted = dockerfile.parent / f".{dockerfile.name}.dimos" + converted = dockerfile.parent / f".{dockerfile.name}.ignore" converted.write_text(content.rstrip() + "\n" + DIMOS_FOOTER.lstrip("\n")) return converted From 7dc73b88874ea78645bbc607b8276d79988a7cb4 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sat, 7 Mar 2026 17:31:53 -0800 Subject: [PATCH 45/52] fix docker defaults, make deploy better --- dimos/core/docker_build.py | 2 +- dimos/core/docker_runner.py | 15 +++++----- dimos/core/module_coordinator.py | 49 +++++++++++++++++++------------- dimos/core/worker.py | 38 +++++++++++++------------ 4 files changed, 58 insertions(+), 46 deletions(-) diff --git a/dimos/core/docker_build.py b/dimos/core/docker_build.py index 1e357d987b..24fd2b3e44 100644 --- a/dimos/core/docker_build.py +++ b/dimos/core/docker_build.py @@ -71,7 +71,7 @@ def _convert_dockerfile(dockerfile: Path) -> Path: def _compute_build_hash(cfg: DockerModuleConfig) -> str: - """Hash Dockerfile contents, build args, and SSH flag.""" + """Hash Dockerfile contents and build args.""" assert cfg.docker_file is not None digest = hashlib.sha256() digest.update(cfg.docker_file.read_bytes()) diff --git a/dimos/core/docker_runner.py 
b/dimos/core/docker_runner.py index db5f804659..6f0b2e777c 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -77,9 +77,9 @@ class DockerModuleConfig(ModuleConfig): ) # (host, container, proto) # Runtime resources - docker_gpus: str | None = "all" - docker_shm_size: str = "2g" - docker_restart_policy: str = "on-failure:3" + docker_gpus: str | None = None + docker_shm_size: str = "4g" + docker_restart_policy: str = "no" # Env + volumes + devices docker_env_files: list[str] = field(default_factory=list) @@ -300,14 +300,15 @@ def stop(self) -> None: self._cleanup() def _cleanup(self) -> None: - """Release all resources. Safe to call multiple times or from partial init.""" + """Release all resources. Idempotent — safe to call from partial init or after stop().""" with suppress(Exception): self.rpc.stop() - for unsub in self._unsub_fns: + for unsub in getattr(self, "_unsub_fns", []): with suppress(Exception): unsub() - self._unsub_fns.clear() - if not self.config.docker_reconnect_container: + with suppress(Exception): + self._unsub_fns.clear() + if not getattr(getattr(self, "config", None), "docker_reconnect_container", False): with suppress(Exception): _run( [self.config.docker_bin, "stop", self._container_name], diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 6c639117bc..59e1013175 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -22,6 +22,7 @@ from dimos.core.resource import Resource from dimos.core.worker_manager import WorkerManager from dimos.utils.logging_config import setup_logger +from dimos.utils.safe_thread_map import safe_thread_map if TYPE_CHECKING: from dimos.core.module import Module, ModuleT @@ -147,33 +148,41 @@ def deploy_parallel( worker_indices.append(i) worker_specs.append(spec) - # Intentionally sequential: worker deploys first, then docker. - # Both internally parallelize across their own items. 
Running them - # concurrently would add complexity for minimal gain since they use - # different resource pools (processes vs containers). - worker_results: list[Any] = [] - docker_results: list[Any] = [] - try: - worker_results = self._client.deploy_parallel(worker_specs) - docker_results = DockerWorkerManager.deploy_parallel(docker_specs) # type: ignore[arg-type] - finally: - # Reassemble whatever succeeded into original input order so - # stop() can clean them up even if a later deploy raised. - # zip(strict=False) safely handles partial results (empty lists). - results: list[Any] = [None] * len(module_specs) - for idx, mod in zip(worker_indices, worker_results, strict=False): - results[idx] = mod - for idx, mod in zip(docker_indices, docker_results, strict=False): # type: ignore[assignment] - results[idx] = mod + # Deploy worker and docker modules in parallel. + results: list[Any] = [None] * len(module_specs) + + def _deploy_workers() -> None: + if not worker_specs: + return + for (index, _), module in zip( + worker_indices, self._client.deploy_parallel(worker_specs), strict=False + ): # type: ignore[union-attr] + results[index] = module + + def _deploy_docker() -> None: + if not docker_specs: + return + for (index, _), module in zip( + docker_indices, DockerWorkerManager.deploy_parallel(docker_specs), strict=False + ): # type: ignore[arg-type] + results[index] = module + + def _register() -> None: for (module_class, _, _), module in zip(module_specs, results, strict=False): if module is not None: self._deployed_modules[module_class] = module + def _on_errors( + _outcomes: list[Any], _successes: list[Any], errors: list[Exception] + ) -> None: + _register() + raise ExceptionGroup("deploy_parallel failed", errors) + + safe_thread_map([_deploy_workers, _deploy_docker], lambda fn: fn(), _on_errors) + _register() return results def start_all_modules(self) -> None: - from dimos.utils.safe_thread_map import safe_thread_map - modules = 
list(self._deployed_modules.values()) if not modules: raise ValueError("No modules deployed. Call deploy() before start_all_modules().") diff --git a/dimos/core/worker.py b/dimos/core/worker.py index b0dd802841..cce79796f5 100644 --- a/dimos/core/worker.py +++ b/dimos/core/worker.py @@ -206,25 +206,27 @@ def deploy_module( "args": args, "kwargs": kwargs, } - with self._lock: - self._conn.send(request) - response = self._conn.recv() + try: + with self._lock: + self._conn.send(request) + response = self._conn.recv() - if response.get("error"): - raise RuntimeError(f"Failed to deploy module: {response['error']}") - - actor = Actor(self._conn, module_class, self._worker_id, module_id, self._lock) - actor.set_ref(actor).result() - - self._modules[module_id] = actor - self._reserved = max(0, self._reserved - 1) - logger.info( - "Deployed module.", - module=module_class.__name__, - worker_id=self._worker_id, - module_id=module_id, - ) - return actor + if response.get("error"): + raise RuntimeError(f"Failed to deploy module: {response['error']}") + + actor = Actor(self._conn, module_class, self._worker_id, module_id, self._lock) + actor.set_ref(actor).result() + + self._modules[module_id] = actor + logger.info( + "Deployed module.", + module=module_class.__name__, + worker_id=self._worker_id, + module_id=module_id, + ) + return actor + finally: + self._reserved = max(0, self._reserved - 1) def shutdown(self) -> None: if self._conn is not None: From 0c29524e78588415ac408bc3ad8022489dcddadc Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sun, 8 Mar 2026 14:06:34 -0700 Subject: [PATCH 46/52] misc --- dimos/core/docker_runner.py | 4 ++-- dimos/core/module_coordinator.py | 7 ++++--- dimos/core/run_registry.py | 4 +--- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 6f0b2e777c..10438298b1 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -39,7 +39,7 @@ logger = 
setup_logger() DOCKER_RUN_TIMEOUT = 120 # Timeout for `docker run` command execution -DOCKER_PULL_TIMEOUT_DEFAULT = 600 # Default timeout for `docker pull` +DOCKER_PULL_TIMEOUT_DEFAULT = None # No timeout for `docker pull` (images can be large) DOCKER_CMD_TIMEOUT = 20 # Timeout for quick Docker commands (inspect, rm, logs) DOCKER_STATUS_TIMEOUT = 10 # Timeout for container status checks DOCKER_STOP_TIMEOUT = 30 # Timeout for `docker stop` command (graceful shutdown) @@ -99,7 +99,7 @@ class DockerModuleConfig(ModuleConfig): docker_extra_args: list[str] = field(default_factory=list) # Timeouts - docker_pull_timeout: float = DOCKER_PULL_TIMEOUT_DEFAULT + docker_pull_timeout: float | None = DOCKER_PULL_TIMEOUT_DEFAULT docker_startup_timeout: float = 120.0 docker_poll_interval: float = 1.0 diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 59e1013175..7d2478dcb1 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -109,7 +109,8 @@ def stop(self) -> None: logger.error("Error stopping module", module=module_class.__name__, exc_info=True) logger.info("Module stopped.", module=module_class.__name__) - self._client.close_all() # type: ignore[union-attr] + if self._client is not None: + self._client.close_all() def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: # type: ignore[no-untyped-def] # Inline to avoid circular import: module_coordinator → docker_runner → module → blueprints → module_coordinator @@ -154,7 +155,7 @@ def deploy_parallel( def _deploy_workers() -> None: if not worker_specs: return - for (index, _), module in zip( + for index, module in zip( worker_indices, self._client.deploy_parallel(worker_specs), strict=False ): # type: ignore[union-attr] results[index] = module @@ -162,7 +163,7 @@ def _deploy_workers() -> None: def _deploy_docker() -> None: if not docker_specs: return - for (index, _), module in zip( + for index, module in zip( docker_indices, 
DockerWorkerManager.deploy_parallel(docker_specs), strict=False ): # type: ignore[arg-type] results[index] = module diff --git a/dimos/core/run_registry.py b/dimos/core/run_registry.py index 9f8e7f3358..848eafde4e 100644 --- a/dimos/core/run_registry.py +++ b/dimos/core/run_registry.py @@ -21,6 +21,7 @@ import os from pathlib import Path import re +import signal import time from dimos.utils.logging_config import setup_logger @@ -142,9 +143,6 @@ def get_most_recent(alive_only: bool = True) -> RunEntry | None: return runs[-1] if runs else None -import signal - - def stop_entry(entry: RunEntry, force: bool = False) -> tuple[str, bool]: """Stop a DimOS instance by registry entry. From eb3d30324ff143dd50e45baa686faf269d5b0b84 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sun, 8 Mar 2026 17:53:29 -0700 Subject: [PATCH 47/52] fix mypy --- dimos/core/module_coordinator.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 7d2478dcb1..ee417f93cb 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -139,7 +139,7 @@ def deploy_parallel( # Split by type, tracking original indices for reassembly docker_indices: list[int] = [] worker_indices: list[int] = [] - docker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] + docker_specs: list[tuple[type[Module], tuple[Any, ...], dict[str, Any]]] = [] worker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] for i, spec in enumerate(module_specs): if is_docker_module(spec[0]): @@ -155,9 +155,10 @@ def deploy_parallel( def _deploy_workers() -> None: if not worker_specs: return + assert self._client is not None for index, module in zip( worker_indices, self._client.deploy_parallel(worker_specs), strict=False - ): # type: ignore[union-attr] + ): results[index] = module def _deploy_docker() -> None: @@ -165,7 +166,7 @@ def _deploy_docker() -> None: return for index, 
module in zip( docker_indices, DockerWorkerManager.deploy_parallel(docker_specs), strict=False - ): # type: ignore[arg-type] + ): results[index] = module def _register() -> None: From 614dde87a0db57c281e33bd7f1ef8e15d1e68107 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Mon, 9 Mar 2026 13:05:17 -0700 Subject: [PATCH 48/52] fix ExceptionGroup edgecase --- dimos/core/docker_worker_manager.py | 2 +- dimos/core/module_coordinator.py | 2 +- dimos/core/resource_monitor/stats.py | 2 +- dimos/core/worker_manager.py | 2 +- dimos/utils/safe_thread_map.py | 16 ++++++++++++++++ 5 files changed, 20 insertions(+), 4 deletions(-) diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py index 34183fda9f..29c7c2a29d 100644 --- a/dimos/core/docker_worker_manager.py +++ b/dimos/core/docker_worker_manager.py @@ -16,7 +16,7 @@ from contextlib import suppress from typing import TYPE_CHECKING, Any -from dimos.utils.safe_thread_map import safe_thread_map +from dimos.utils.safe_thread_map import ExceptionGroup, safe_thread_map if TYPE_CHECKING: from dimos.core.docker_runner import DockerModule diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index ee417f93cb..deb867453e 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -22,7 +22,7 @@ from dimos.core.resource import Resource from dimos.core.worker_manager import WorkerManager from dimos.utils.logging_config import setup_logger -from dimos.utils.safe_thread_map import safe_thread_map +from dimos.utils.safe_thread_map import ExceptionGroup, safe_thread_map if TYPE_CHECKING: from dimos.core.module import Module, ModuleT diff --git a/dimos/core/resource_monitor/stats.py b/dimos/core/resource_monitor/stats.py index c020c853e0..f401358890 100644 --- a/dimos/core/resource_monitor/stats.py +++ b/dimos/core/resource_monitor/stats.py @@ -90,7 +90,7 @@ class IoStats(TypedDict): def _collect_io(proc: psutil.Process) -> IoStats: """Collect IO counters 
in bytes. Call inside oneshot().""" try: - io = proc.io_counters() + io = proc.io_counters() # type: ignore[attr-defined] # Linux-only return IoStats(io_read_bytes=io.read_bytes, io_write_bytes=io.write_bytes) except (psutil.AccessDenied, AttributeError): return IoStats(io_read_bytes=0, io_write_bytes=0) diff --git a/dimos/core/worker_manager.py b/dimos/core/worker_manager.py index b9c25c8445..fa448cb15d 100644 --- a/dimos/core/worker_manager.py +++ b/dimos/core/worker_manager.py @@ -20,7 +20,7 @@ from dimos.core.rpc_client import RPCClient from dimos.core.worker import Worker from dimos.utils.logging_config import setup_logger -from dimos.utils.safe_thread_map import safe_thread_map +from dimos.utils.safe_thread_map import ExceptionGroup, safe_thread_map if TYPE_CHECKING: from dimos.core.module import ModuleT diff --git a/dimos/utils/safe_thread_map.py b/dimos/utils/safe_thread_map.py index 6729c989f3..f480f2c97d 100644 --- a/dimos/utils/safe_thread_map.py +++ b/dimos/utils/safe_thread_map.py @@ -14,8 +14,24 @@ from __future__ import annotations from concurrent.futures import Future, ThreadPoolExecutor, as_completed +import sys from typing import TYPE_CHECKING, Any, TypeVar +if sys.version_info < (3, 11): + + class ExceptionGroup(Exception): # type: ignore[no-redef] # noqa: N818 + """Minimal ExceptionGroup polyfill for Python 3.10.""" + + exceptions: tuple[BaseException, ...] 
+ + def __init__(self, message: str, exceptions: Sequence[BaseException]) -> None: + super().__init__(message) + self.exceptions = tuple(exceptions) +else: + import builtins + + ExceptionGroup = builtins.ExceptionGroup # type: ignore[misc] + if TYPE_CHECKING: from collections.abc import Callable, Sequence From bf8b4296d5fd9720052d774dba5bbba7e95c4287 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Fri, 13 Mar 2026 15:09:33 -0700 Subject: [PATCH 49/52] fix: update Docker deployment to use ModuleSpec format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - docker_worker_manager: accept ModuleSpec format, pass global_config - module_coordinator: add type: ignore for ModuleBase→Module cast - worker_manager: convert Iterable to list for len() check - test_docker_deployment: fix Path import, update test assertions for new global_config signature Co-Authored-By: Claude Opus 4.6 --- dimos/core/docker_worker_manager.py | 8 +++++--- dimos/core/module_coordinator.py | 2 +- dimos/core/tests/test_docker_deployment.py | 14 ++++++-------- dimos/core/worker_manager.py | 1 + 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py index 29c7c2a29d..520468182f 100644 --- a/dimos/core/docker_worker_manager.py +++ b/dimos/core/docker_worker_manager.py @@ -16,11 +16,11 @@ from contextlib import suppress from typing import TYPE_CHECKING, Any +from dimos.core.module import ModuleSpec from dimos.utils.safe_thread_map import ExceptionGroup, safe_thread_map if TYPE_CHECKING: from dimos.core.docker_runner import DockerModule - from dimos.core.module import Module class DockerWorkerManager: @@ -28,7 +28,7 @@ class DockerWorkerManager: @staticmethod def deploy_parallel( - specs: list[tuple[type[Module], tuple[Any, ...], dict[str, Any]]], + specs: list[ModuleSpec], ) -> list[DockerModule]: """Deploy multiple DockerModules in parallel. 
@@ -46,5 +46,7 @@ def _on_errors( raise ExceptionGroup("docker deploy_parallel failed", errors) return safe_thread_map( - specs, lambda spec: DockerModule(spec[0], *spec[1], **spec[2]), _on_errors + specs, + lambda spec: DockerModule(spec[0], global_config=spec[1], **spec[2]), # type: ignore[arg-type] + _on_errors, ) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index d9931b7876..43e3e44f0a 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -131,7 +131,7 @@ def deploy( deployed_module: ModuleProxyProtocol if is_docker_module(module_class): - deployed_module = DockerModule(module_class, global_config=global_config, **kwargs) + deployed_module = DockerModule(module_class, global_config=global_config, **kwargs) # type: ignore[arg-type] else: deployed_module = self._client.deploy(module_class, global_config, kwargs) self._deployed_modules[module_class] = deployed_module # type: ignore[assignment] diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index e89b88e327..a3bb0b716d 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -21,24 +21,20 @@ from __future__ import annotations -from dataclasses import dataclass -from typing import TYPE_CHECKING +from pathlib import Path from unittest.mock import MagicMock, patch import pytest from dimos.core.docker_runner import DockerModuleConfig, is_docker_module +from dimos.core.global_config import global_config from dimos.core.module import Module from dimos.core.module_coordinator import ModuleCoordinator from dimos.core.stream import Out -if TYPE_CHECKING: - from pathlib import Path - # -- Fixtures: fake module classes ------------------------------------------- -@dataclass class FakeDockerConfig(DockerModuleConfig): docker_image: str = "fake:latest" docker_file: Path | None = None @@ -95,7 +91,9 @@ def test_deploy_routes_docker_module(self, 
mock_worker_manager_cls, mock_docker_ # Should NOT go through worker manager mock_worker_mgr.deploy.assert_not_called() # Should construct a DockerModule (container launch happens inside __init__) - mock_docker_module_cls.assert_called_once_with(FakeDockerModule) + mock_docker_module_cls.assert_called_once_with( + FakeDockerModule, global_config=global_config + ) # start() is NOT called during deploy — it's called in start_all_modules mock_dm.start.assert_not_called() assert result is mock_dm @@ -134,7 +132,7 @@ def test_deploy_routes_regular_module_to_worker_manager(self, mock_worker_manage result = coordinator.deploy(FakeRegularModule) - mock_worker_mgr.deploy.assert_called_once_with(FakeRegularModule) + mock_worker_mgr.deploy.assert_called_once_with(FakeRegularModule, global_config, {}) assert result is mock_proxy coordinator.stop() diff --git a/dimos/core/worker_manager.py b/dimos/core/worker_manager.py index 52313ca5d4..2b778c433e 100644 --- a/dimos/core/worker_manager.py +++ b/dimos/core/worker_manager.py @@ -66,6 +66,7 @@ def deploy_parallel(self, module_specs: Iterable[ModuleSpec]) -> list[RPCClient] if self._closed: raise RuntimeError("WorkerManager is closed") + module_specs = list(module_specs) if len(module_specs) == 0: return [] From 77e5aae4aa0bf6855347a0e0a544ffe616e890c6 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Fri, 13 Mar 2026 23:43:35 -0700 Subject: [PATCH 50/52] fix(mypy): cover import-not-found for onnxruntime type: ignore Pre-existing mypy errors: onnxruntime is excluded from install (--no-extra cuda) so import-not-found needs to be ignored alongside import-untyped. 
Co-Authored-By: Claude Opus 4.6 --- dimos/agents_deprecated/memory/image_embedding.py | 2 +- dimos/simulation/mujoco/policy.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dimos/agents_deprecated/memory/image_embedding.py b/dimos/agents_deprecated/memory/image_embedding.py index 27e16f1aa8..d6b0967642 100644 --- a/dimos/agents_deprecated/memory/image_embedding.py +++ b/dimos/agents_deprecated/memory/image_embedding.py @@ -63,7 +63,7 @@ def __init__(self, model_name: str = "clip", dimensions: int = 512) -> None: def _initialize_model(self): # type: ignore[no-untyped-def] """Initialize the specified embedding model.""" try: - import onnxruntime as ort # type: ignore[import-untyped] + import onnxruntime as ort # type: ignore[import-untyped,import-not-found] import torch # noqa: F401 from transformers import ( # type: ignore[import-untyped] AutoFeatureExtractor, diff --git a/dimos/simulation/mujoco/policy.py b/dimos/simulation/mujoco/policy.py index 212c7ac60a..1d0598ce46 100644 --- a/dimos/simulation/mujoco/policy.py +++ b/dimos/simulation/mujoco/policy.py @@ -20,7 +20,7 @@ import mujoco import numpy as np -import onnxruntime as ort # type: ignore[import-untyped] +import onnxruntime as ort # type: ignore[import-untyped,import-not-found] from dimos.simulation.mujoco.input_controller import InputController from dimos.utils.logging_config import setup_logger From e1f91be4cd9a0000f6c7d6fcbe23ce64fbedd432 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Fri, 13 Mar 2026 23:46:28 -0700 Subject: [PATCH 51/52] fix: remove section markers from hello_docker.py and untrack .venv - Remove comment section markers (dashed lines) that violate the no-section-markers test policy - Remove .venv symlink from git tracking (already in .gitignore) Co-Authored-By: Claude Opus 4.6 --- .venv | 1 - examples/docker_hello_world/hello_docker.py | 12 +----------- 2 files changed, 1 insertion(+), 12 deletions(-) delete mode 120000 .venv diff --git a/.venv b/.venv deleted file 
mode 120000 index 3c94680097..0000000000 --- a/.venv +++ /dev/null @@ -1 +0,0 @@ -/home/dimos/auto/dimos/.venv \ No newline at end of file diff --git a/examples/docker_hello_world/hello_docker.py b/examples/docker_hello_world/hello_docker.py index 66e95df316..af3bfc19d3 100644 --- a/examples/docker_hello_world/hello_docker.py +++ b/examples/docker_hello_world/hello_docker.py @@ -41,10 +41,6 @@ from dimos.core.module import Module from dimos.core.stream import In, Out -# --------------------------------------------------------------------------- -# Docker module (runs inside container) -# --------------------------------------------------------------------------- - @dataclass(kw_only=True) class HelloDockerConfig(DockerModuleConfig): @@ -100,10 +96,6 @@ def get_greeting_prefix(self) -> str: return self.config.greeting_prefix -# --------------------------------------------------------------------------- -# Host-side module (sends prompts and prints greetings) -# --------------------------------------------------------------------------- - class PromptModule(Module): """Publishes prompts and listens to greetings.""" @@ -125,9 +117,7 @@ def _on_greeting(self, text: str) -> None: print(f"[PromptModule] Received: {text}") -# --------------------------------------------------------------------------- -# Main -# --------------------------------------------------------------------------- + if __name__ == "__main__": from dimos.core.blueprints import autoconnect From f83ed5137bdf0697501286dd65192f7faa99a0b3 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Fri, 13 Mar 2026 23:46:41 -0700 Subject: [PATCH 52/52] style: fix formatting in hello_docker.py Co-Authored-By: Claude Opus 4.6 --- examples/docker_hello_world/hello_docker.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/docker_hello_world/hello_docker.py b/examples/docker_hello_world/hello_docker.py index af3bfc19d3..3b8e96e49b 100644 --- a/examples/docker_hello_world/hello_docker.py +++ 
b/examples/docker_hello_world/hello_docker.py @@ -96,7 +96,6 @@ def get_greeting_prefix(self) -> str: return self.config.greeting_prefix - class PromptModule(Module): """Publishes prompts and listens to greetings.""" @@ -117,8 +116,6 @@ def _on_greeting(self, text: str) -> None: print(f"[PromptModule] Received: {text}") - - if __name__ == "__main__": from dimos.core.blueprints import autoconnect