diff --git a/.github/workflows/devcontainer.yml b/.github/workflows/devcontainer.yml
index 1ea79d6757..14f2a05232 100644
--- a/.github/workflows/devcontainer.yml
+++ b/.github/workflows/devcontainer.yml
@@ -208,13 +208,6 @@ jobs:
           # Now actually run the tests.
           docker exec --user vscode --env USER=vscode mlos-devcontainer make CONDA_INFO_LEVEL=-v test

-      - name: Upload coverage report as build artifact
-        if: always()    # ensures it runs even if tests fail
-        uses: actions/upload-artifact@v4
-        with:
-          name: coverage-report
-          path: coverage.xml
-
       - name: Generate and test binary distribution files
         timeout-minutes: 10
         run: |
@@ -298,6 +291,21 @@ jobs:
           docker tag mlos-devcontainer:latest ${{ secrets.ACR_LOGINURL }}/mlos-devcontainer:$image_tag
           docker push ${{ secrets.ACR_LOGINURL }}/mlos-devcontainer:$image_tag

+      # ensure these run even if tests fail, but only after everything else is done or skipped
+      - name: Upload coverage report as build artifact
+        if: always()
+        uses: actions/upload-artifact@v5
+        with:
+          name: coverage-report
+          path: coverage.xml
+
+      - name: Upload pytest logs
+        if: always()
+        uses: actions/upload-artifact@v5
+        with:
+          name: devcontainer-pytest-logs
+          path: logs/pytest*.log
+          retention-days: 7

   PublishDocs:
     name: Publish Documentation
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 5518b15d97..2eef298e60 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -181,3 +181,11 @@ jobs:
       - name: Generate and test binary distribution files
         run: make CONDA_ENV_NAME=$CONDA_ENV_NAME CONDA_INFO_LEVEL=-v dist dist-test
+
+      - name: Upload pytest logs
+        if: always()
+        uses: actions/upload-artifact@v5
+        with:
+          name: linux-${{ matrix.python_version }}-pytest-logs
+          path: logs/pytest*.log
+          retention-days: 7
diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml
index 37bd8e39e4..3fd4d7b064 100644
--- a/.github/workflows/macos.yml
+++ b/.github/workflows/macos.yml
@@ -163,6 +163,13 @@ jobs:
       - name: Generate and test binary distribution files
         run: make CONDA_ENV_NAME=$CONDA_ENV_NAME CONDA_INFO_LEVEL=-v dist dist-test

+      - name: Upload pytest logs
+        if: always()
+        uses: actions/upload-artifact@v5
+        with:
+          name: macos-pytest-logs
+          path: logs/pytest*.log
+          retention-days: 7

   MacOSDevContainerBuildTest:
     name: MacOS DevContainer Build/Test
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index fde7611678..cd5051b0c2 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -142,6 +142,13 @@ jobs:
         run: |
           .github/workflows/build-dist-test.ps1

+      - name: Upload pytest logs
+        if: always()
+        uses: actions/upload-artifact@v5
+        with:
+          name: windows-pytest-logs
+          path: logs/pytest*.log
+          retention-days: 7

   WindowsDevContainerBuildTest:
     name: Windows DevContainer Build/Test
diff --git a/.gitignore b/.gitignore
index 471d653344..1bb01c59b7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -171,3 +171,6 @@ build/*.build-stamp
 *.duckdb
 *.db.wal
 *.duckdb.wal
+
+# pytest logs
+logs/pytest*.log
diff --git a/conftest.py b/conftest.py
index 84a04727c4..de9cc2646c 100644
--- a/conftest.py
+++ b/conftest.py
@@ -7,6 +7,7 @@
 # Note: This file is named conftest.py so that pytest picks it up automatically
 # without the need to adjust PYTHONPATH or sys.path as much.

+import logging
 import os
 import shutil
 from tempfile import mkdtemp
@@ -15,6 +16,14 @@
 import pytest

 from xdist.workermanage import WorkerController

+# See Also: setup.cfg and launcher.py
+LOG_FMT = (
+    "%(asctime)s.%(msecs)03d [%(process)d][%(threadName)s] "
+    "[%(filename)s:%(lineno)d %(funcName)s] "
+    "%(levelname)s: %(message)s"
+)
+DATE_FMT = "%Y-%m-%d %H:%M:%S"
+

 def is_master(config: pytest.Config) -> bool:
     """True if the code running the given pytest.config object is running in a xdist
@@ -56,6 +65,20 @@ def pytest_configure(config: pytest.Config) -> None:
     # Add it to the config so that it can passed to the worker nodes.
     setattr(config, "shared_temp_dir", mkdtemp())

+    # Configure per-worker log file.
+    worker_id = getattr(config, "workerinput", {}).get("workerid", "master")
+    os.makedirs("logs", exist_ok=True)
+    log_file_path = os.path.join("logs", f"pytest-{worker_id}.log")
+    file_handler = logging.FileHandler(log_file_path, mode="w")
+    if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
+        file_handler.setLevel(logging.DEBUG)
+    else:
+        file_handler.setLevel(logging.INFO)
+    # logging.basicConfig(level=file_handler.level, format=LOG_FMT, datefmt=DATE_FMT)
+    log_formatter = logging.Formatter(fmt=LOG_FMT, datefmt=DATE_FMT)
+    file_handler.setFormatter(log_formatter)
+    logging.getLogger().addHandler(file_handler)
+

 def pytest_configure_node(node: WorkerController) -> None:
    """Xdist hook used to inform workers of the location of the shared temp dir."""
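Aside (not part of the patch): the conftest.py change above boils down to "attach one `FileHandler` per xdist worker to the root logger, named after the worker id". A minimal standalone sketch of that pattern is below; the helper name `attach_worker_log_handler` is hypothetical, and unlike the real conftest (which relies on pytest's `log_cli_level` to set the logger level), this sketch sets the root logger level itself so it runs on its own.

```python
import logging
import os


def attach_worker_log_handler(worker_id: str, log_dir: str = "logs") -> logging.Handler:
    """Attach a per-worker log file handler to the root logger and return it."""
    os.makedirs(log_dir, exist_ok=True)
    handler = logging.FileHandler(os.path.join(log_dir, f"pytest-{worker_id}.log"), mode="w")
    handler.setLevel(logging.INFO)
    handler.setFormatter(
        logging.Formatter(
            fmt=(
                "%(asctime)s.%(msecs)03d [%(process)d][%(threadName)s] "
                "[%(filename)s:%(lineno)d %(funcName)s] %(levelname)s: %(message)s"
            ),
            datefmt="%Y-%m-%d %H:%M:%S",
        )
    )
    root = logging.getLogger()
    root.setLevel(logging.INFO)  # the real conftest leaves this to pytest's log_cli_level
    root.addHandler(handler)
    return handler


if __name__ == "__main__":
    # The controller process falls back to "master"; xdist workers report "gw0", "gw1", ...
    attach_worker_log_handler("gw0")
    logging.getLogger(__name__).info("hello from a (simulated) worker")
```

Because each worker process gets its own file (`logs/pytest-gw0.log`, `logs/pytest-gw1.log`, ...), the uploaded artifacts stay readable even when many workers interleave output.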
diff --git a/mlos_bench/mlos_bench/launcher.py b/mlos_bench/mlos_bench/launcher.py
index c728ed7fb2..cd76f456cf 100644
--- a/mlos_bench/mlos_bench/launcher.py
+++ b/mlos_bench/mlos_bench/launcher.py
@@ -37,6 +37,14 @@
 _LOG_FORMAT = "%(asctime)s %(filename)s:%(lineno)d %(funcName)s %(levelname)s %(message)s"
 logging.basicConfig(level=_LOG_LEVEL, format=_LOG_FORMAT)

+# TODO: Future PR: See Also: /conftest.py, setup.cfg
+# _LOG_FORMAT = (
+#     "%(asctime)s.%(msecs)03d [%(process)d][%(threadName)s] "
+#     "[%(filename)s:%(lineno)d %(funcName)s] "
+#     "%(levelname)s %(message)s"
+# )
+# logging.basicConfig(level=_LOG_LEVEL, format=_LOG_FORMAT, datefmt="%Y-%m-%d %H:%M:%S")
+
 _LOG = logging.getLogger(__name__)
diff --git a/mlos_bench/mlos_bench/services/remote/ssh/ssh_service.py b/mlos_bench/mlos_bench/services/remote/ssh/ssh_service.py
index 5955f6dfd1..199d640294 100644
--- a/mlos_bench/mlos_bench/services/remote/ssh/ssh_service.py
+++ b/mlos_bench/mlos_bench/services/remote/ssh/ssh_service.py
@@ -9,6 +9,7 @@
 from abc import ABCMeta
 from asyncio import Event as CoroEvent
 from asyncio import Lock as CoroLock
+from asyncio import sleep as async_sleep
 from collections.abc import Callable, Coroutine
 from threading import current_thread
 from types import TracebackType
@@ -172,34 +173,60 @@ async def get_client_connection(
             A tuple of (SSHClientConnection, SshClient).
         """
         _LOG.debug("%s: get_client_connection: %s", current_thread().name, connect_params)
-        async with self._cache_lock:
-            connection_id = SshClient.id_from_params(connect_params)
-            client: None | SshClient | asyncssh.SSHClient
-            _, client = self._cache.get(connection_id, (None, None))
-            if client:
-                _LOG.debug("%s: Checking cached client %s", current_thread().name, connection_id)
-                connection = await client.connection()
-                if not connection:
-                    _LOG.debug(
-                        "%s: Removing stale client connection %s from cache.",
-                        current_thread().name,
-                        connection_id,
-                    )
-                    self._cache.pop(connection_id)
-                    # Try to reconnect next.
-                else:
-                    _LOG.debug("%s: Using cached client %s", current_thread().name, connection_id)
-            if connection_id not in self._cache:
-                _LOG.debug(
-                    "%s: Establishing client connection to %s",
+        connection_id = SshClient.id_from_params(connect_params)
+        for i in range(3):  # TODO: make the retry count configurable
+            try:
+                async with self._cache_lock:
+                    client: None | SshClient | asyncssh.SSHClient
+                    _, client = self._cache.get(connection_id, (None, None))
+                    if client:
+                        _LOG.debug(
+                            "%s: Checking cached client %s", current_thread().name, connection_id
+                        )
+                        connection = await client.connection()
+                        if not connection:
+                            _LOG.debug(
+                                "%s: Removing stale client connection %s from cache.",
+                                current_thread().name,
+                                connection_id,
+                            )
+                            self._cache.pop(connection_id)
+                            # Try to reconnect next.
+                        else:
+                            _LOG.debug(
+                                "%s: Using cached client %s", current_thread().name, connection_id
+                            )
+                    if connection_id not in self._cache:
+                        _LOG.debug(
+                            "%s: Establishing client connection to %s",
+                            current_thread().name,
+                            connection_id,
+                        )
+                        connection, client = await asyncssh.create_connection(
+                            SshClient, **connect_params
+                        )
+                        assert isinstance(client, SshClient)
+                        self._cache[connection_id] = (connection, client)
+                        _LOG.debug(
+                            "%s: Created connection to %s.", current_thread().name, connection_id
+                        )
+                    return self._cache[connection_id]
+            except ConnectionRefusedError as ex:  # TODO: Add other error handling here too
+                _LOG.warning(
+                    "%s: Attempt %d: Failed to connect to %s: %s",
                     current_thread().name,
+                    i + 1,
                     connection_id,
+                    ex,
                 )
-                connection, client = await asyncssh.create_connection(SshClient, **connect_params)
-                assert isinstance(client, SshClient)
-                self._cache[connection_id] = (connection, client)
-                _LOG.debug("%s: Created connection to %s.", current_thread().name, connection_id)
-            return self._cache[connection_id]
+                if i < 2:  # TODO: adjust to match max range
+                    await async_sleep(1.0)  # TODO: Make this configurable
+                if i == 2:  # TODO: adjust to match max range
+                    _LOG.error(
+                        "%s: Giving up connecting to %s", current_thread().name, connection_id
+                    )
+                    raise
+        raise RuntimeError("Unreachable code in get_client_connection")

     def cleanup(self) -> None:
         """Closes all cached connections."""
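Aside (not part of the patch): the `get_client_connection` change above hard-codes three attempts with a 1 second pause, and the TODOs note that both should become configurable. A minimal standalone sketch of that retry-on-`ConnectionRefusedError` pattern, with the attempt count and delay as parameters, might look like the following; `connect_with_retries` and `flaky_connect` are hypothetical names, not helpers from this repo.

```python
import asyncio
import logging

_LOG = logging.getLogger(__name__)


async def connect_with_retries(connect, max_attempts: int = 3, delay: float = 1.0):
    """Call an async connect() factory, retrying on ConnectionRefusedError."""
    for attempt in range(1, max_attempts + 1):
        try:
            return await connect()
        except ConnectionRefusedError as ex:
            _LOG.warning("Attempt %d/%d failed: %s", attempt, max_attempts, ex)
            if attempt == max_attempts:
                raise
            await asyncio.sleep(delay)
    raise RuntimeError("unreachable")


async def _demo() -> None:
    async def flaky_connect():
        # Stand-in for an asyncssh-style connection factory that isn't up yet.
        raise ConnectionRefusedError("server not up yet")

    try:
        await connect_with_retries(flaky_connect, max_attempts=2, delay=0.1)
    except ConnectionRefusedError:
        _LOG.error("gave up")


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    asyncio.run(_demo())
```

Keeping the retry loop outside the cache lock's critical section, as the patch does with the `return` inside the `try`, means a refused connection never holds the lock while sleeping.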
"UTC", @@ -171,11 +174,24 @@ def wait_docker_service_healthy( def wait_docker_service_socket(docker_services: DockerServices, hostname: str, port: int) -> None: """Wait until a docker service is ready.""" - docker_services.wait_until_responsive( - check=lambda: check_socket(hostname, port), - timeout=60.0, - pause=0.5, - ) + _LOG.info("Waiting for %s:%d to become responsive", hostname, port) + + def check_with_logging() -> bool: + result = check_socket(hostname, port) + if not result: + _LOG.debug("Socket check failed for %s:%d", hostname, port) + return result + + try: + docker_services.wait_until_responsive( + check=check_with_logging, + timeout=60.0, + pause=0.5, + ) + _LOG.info("Socket %s:%d is now responsive", hostname, port) + except Exception as e: + _LOG.error("Failed waiting for %s:%d: %s", hostname, port, e) + raise def check_socket(host: str, port: int, timeout: float = 1.0) -> bool: @@ -192,10 +208,27 @@ def check_socket(host: str, port: int, timeout: float = 1.0) -> bool: ------- bool """ - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: - sock.settimeout(timeout) # seconds - result = sock.connect_ex((host, port)) - return result == 0 + try: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.settimeout(timeout) # seconds + result = sock.connect_ex((host, port)) + success = result == 0 + if not success: + _LOG.debug( + "Socket connection to %s:%d failed with code %d", + host, + port, + result, + ) + return success + except (OSError, TimeoutError) as e: + _LOG.debug( + "Socket check exception for %s:%d: %s", + host, + port, + e, + ) + return False def resolve_host_name(host: str) -> str | None: diff --git a/mlos_bench/mlos_bench/tests/docker_fixtures_util.py b/mlos_bench/mlos_bench/tests/docker_fixtures_util.py index 2309123bc8..69c23f3aba 100644 --- a/mlos_bench/mlos_bench/tests/docker_fixtures_util.py +++ b/mlos_bench/mlos_bench/tests/docker_fixtures_util.py @@ -16,9 +16,14 @@ """ # pylint: disable=redefined-outer-name +import json +import logging import os import sys from collections.abc import Generator +from json import JSONDecodeError +from subprocess import CalledProcessError, run +from time import sleep from typing import Any import pytest @@ -26,6 +31,72 @@ from pytest_docker.plugin import Services as DockerServices from pytest_docker.plugin import get_docker_services +_LOG = logging.getLogger(__name__) + + +def wait_docker_service_healthy( + compose_project_name: str, + service_name: str, + timeout_seconds: int = 30, + check_interval_seconds: int = 1, +) -> None: + """ + Waits for the specified docker service to become healthy. + + Parameters + ---------- + compose_project_name : str + The docker-compose project name. + service_name : str + The name of the service to wait for. + timeout_seconds : int + The maximum time to wait for the service to become healthy. + check_interval_seconds : int + The interval between health checks. 
+ """ + for _ in range(0, timeout_seconds, check_interval_seconds): + try: + inspect_cmd = run( + ( + f"docker compose -p {compose_project_name} " + f"ps --format '{{{{.ID}}}}' {service_name}" + ), + shell=True, + check=True, + capture_output=True, + ) + container_id = inspect_cmd.stdout.decode().strip() + if not container_id: + raise RuntimeError(f"Container ID for {service_name} not found") + inspect_state_cmd = run( + f"docker inspect {container_id}", + shell=True, + check=True, + capture_output=True, + ) + inspect_data = json.loads(inspect_state_cmd.stdout.decode()) + state = inspect_data[0].get("State", {}) + health = state.get("Health") + if state.get("Status") == "running" and ( + not health or health.get("Status") == "healthy" + ): + _LOG.debug("Container %s is running and healthy", service_name) + return + else: + _LOG.info( + ( + "Waiting for %s to become running (State.Status: %s) " + "and healthy (Health.Status=%s)" + ), + service_name, + state.get("Status") if state else "UNKNOWN", + health.get("Status") if health else "N/A", + ) + except (RuntimeError, CalledProcessError, JSONDecodeError) as e: + _LOG.error("Error checking health of %s: %s", service_name, e) + sleep(check_interval_seconds) + raise RuntimeError(f"Container {service_name} did not become healthy in time.") + # Fixtures to configure the pytest-docker plugin. @pytest.fixture(scope="session") @@ -44,16 +115,16 @@ def docker_setup() -> list[str] | str: @pytest.fixture(scope="session") def docker_compose_file(pytestconfig: pytest.Config) -> list[str]: """ - Fixture for the path to the docker-compose file. + Base fixture for the path to the docker-compose file(s). Parameters ---------- pytestconfig : pytest.Config - Returns - ------- - list[str] - List of paths to the docker-compose file(s). + Notes + ----- + This fixture should be overridden in the local conftest.py to point to the actual + docker-compose.yml files needed for the tests. """ _ = pytestconfig # unused # Add additional configs as necessary here. diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/Dockerfile b/mlos_bench/mlos_bench/tests/services/remote/ssh/Dockerfile index d157473dbf..87fee7faff 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/Dockerfile +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/Dockerfile @@ -6,6 +6,7 @@ RUN apt-get update \ openssh-server openssh-client \ sudo ARG PORT=2254 +ENV PORT=${PORT} EXPOSE ${PORT} RUN echo "Port ${PORT}" > /etc/ssh/sshd_config.d/local.conf \ && echo "PermitRootLogin prohibit-password" >> /etc/ssh/sshd_config.d/local.conf \ @@ -15,3 +16,6 @@ RUN echo "Port ${PORT}" > /etc/ssh/sshd_config.d/local.conf \ ENV TIMEOUT=180 # NOTE: The "-n" is important in the call to "wait" so that an exit failure can be detected. 
CMD ["/bin/bash", "-eux", "-c", "trap : TERM INT; service ssh start && sleep ${TIMEOUT:-180} & wait -n"] + +HEALTHCHECK --interval=5s --timeout=3s --retries=5 --start-period=1s \ + CMD ssh -o StrictHostKeyChecking=accept-new -o BatchMode=yes -o ConnectTimeout=2 -p ${PORT} -i /root/.ssh/id_rsa root@localhost 'echo ok' diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py index de822fe2eb..81f30941ed 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py @@ -4,9 +4,13 @@ # """Common data classes for the SSH service tests.""" +import logging from dataclasses import dataclass from subprocess import run +from mlos_bench.tests import check_socket +from mlos_bench.tests.docker_fixtures_util import wait_docker_service_healthy + # The SSH test server port and name. # See Also: docker-compose.yml SSH_TEST_SERVER_PORT = 2254 @@ -14,6 +18,8 @@ ALT_TEST_SERVER_NAME = "alt-server" REBOOT_TEST_SERVER_NAME = "reboot-server" +_LOG = logging.getLogger(__name__) + @dataclass class SshTestServerInfo: @@ -32,24 +38,62 @@ class SshTestServerInfo: id_rsa_path: str _port: int | None = None - def get_port(self, uncached: bool = False) -> int: + def get_port(self, uncached: bool = False, check_port: bool = True) -> int: """ Gets the port that the SSH test server is listening on. Note: this value can change when the service restarts so we can't rely on the DockerServices. """ - if self._port is None or uncached: - port_cmd = run( - ( - f"docker compose -p {self.compose_project_name} " - f"port {self.service_name} {SSH_TEST_SERVER_PORT}" - ), - shell=True, - check=True, - capture_output=True, - ) - self._port = int(port_cmd.stdout.decode().strip().split(":")[1]) + if not uncached and self._port is not None: + # NOTE: this cache may become stale in another worker if the + # container restarts in one and the other worker doesn't notice the new port. + # Optionally check the status of the cached port before returning it. + if not check_port or self.validate_connection(): + _LOG.debug( + "Using cached port %s for %s %s validation.", + self._port, + self.service_name, + "with" if check_port else "without", + ) + return self._port + + # Check container state before proceeding to avoid a race in docker + # container startup. + _LOG.info( + ( + "Discovering port for %s (uncached=%s, cached_port=%s) " + "after waiting for container readiness." + ), + self.service_name, + uncached, + self._port, + ) + wait_docker_service_healthy( + self.compose_project_name, + self.service_name, + ) + _LOG.debug("Container %s is healthy, getting port...", self.service_name) + + port_cmd = run( + ( + f"docker compose -p {self.compose_project_name} " + f"port {self.service_name} {SSH_TEST_SERVER_PORT}" + ), + shell=True, + check=True, + capture_output=True, + ) + new_port = int(port_cmd.stdout.decode().strip().split(":")[1]) + old_port = self._port + self._port = new_port + _LOG.info( + "Port for %s: %s -> %s (uncached=%s)", + self.service_name, + old_port, + new_port, + uncached, + ) return self._port def to_ssh_service_config(self, uncached: bool = False) -> dict: @@ -72,3 +116,27 @@ def to_connect_params(self, uncached: bool = False) -> dict: "port": self.get_port(uncached), "username": self.username, } + + def validate_connection(self, timeout: float = 2.0) -> bool: + """ + Validate that the current cached port is still connectable. 
diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py
index de822fe2eb..81f30941ed 100644
--- a/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py
+++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py
@@ -4,9 +4,13 @@
 #
 """Common data classes for the SSH service tests."""

+import logging
 from dataclasses import dataclass
 from subprocess import run

+from mlos_bench.tests import check_socket
+from mlos_bench.tests.docker_fixtures_util import wait_docker_service_healthy
+
 # The SSH test server port and name.
 # See Also: docker-compose.yml
 SSH_TEST_SERVER_PORT = 2254
@@ -14,6 +18,8 @@
 ALT_TEST_SERVER_NAME = "alt-server"
 REBOOT_TEST_SERVER_NAME = "reboot-server"

+_LOG = logging.getLogger(__name__)
+

 @dataclass
 class SshTestServerInfo:
@@ -32,24 +38,62 @@ class SshTestServerInfo:
     id_rsa_path: str
     _port: int | None = None

-    def get_port(self, uncached: bool = False) -> int:
+    def get_port(self, uncached: bool = False, check_port: bool = True) -> int:
         """
         Gets the port that the SSH test server is listening on.

         Note: this value can change when the service restarts so we can't rely on
         the DockerServices.
         """
-        if self._port is None or uncached:
-            port_cmd = run(
-                (
-                    f"docker compose -p {self.compose_project_name} "
-                    f"port {self.service_name} {SSH_TEST_SERVER_PORT}"
-                ),
-                shell=True,
-                check=True,
-                capture_output=True,
-            )
-            self._port = int(port_cmd.stdout.decode().strip().split(":")[1])
+        if not uncached and self._port is not None:
+            # NOTE: this cache may become stale in another worker if the
+            # container restarts in one and the other worker doesn't notice the new port.
+            # Optionally check the status of the cached port before returning it.
+            if not check_port or self.validate_connection():
+                _LOG.debug(
+                    "Using cached port %s for %s %s validation.",
+                    self._port,
+                    self.service_name,
+                    "with" if check_port else "without",
+                )
+                return self._port
+
+        # Check container state before proceeding to avoid a race in docker
+        # container startup.
+        _LOG.info(
+            (
+                "Discovering port for %s (uncached=%s, cached_port=%s) "
+                "after waiting for container readiness."
+            ),
+            self.service_name,
+            uncached,
+            self._port,
+        )
+        wait_docker_service_healthy(
+            self.compose_project_name,
+            self.service_name,
+        )
+        _LOG.debug("Container %s is healthy, getting port...", self.service_name)
+
+        port_cmd = run(
+            (
+                f"docker compose -p {self.compose_project_name} "
+                f"port {self.service_name} {SSH_TEST_SERVER_PORT}"
+            ),
+            shell=True,
+            check=True,
+            capture_output=True,
+        )
+        new_port = int(port_cmd.stdout.decode().strip().split(":")[1])
+        old_port = self._port
+        self._port = new_port
+        _LOG.info(
+            "Port for %s: %s -> %s (uncached=%s)",
+            self.service_name,
+            old_port,
+            new_port,
+            uncached,
+        )
         return self._port

     def to_ssh_service_config(self, uncached: bool = False) -> dict:
@@ -72,3 +116,27 @@ def to_connect_params(self, uncached: bool = False) -> dict:
             "port": self.get_port(uncached),
             "username": self.username,
         }
+
+    def validate_connection(self, timeout: float = 2.0) -> bool:
+        """
+        Validate that the current cached port is still connectable.
+
+        Returns False if the connection fails, indicating the port may be stale.
+        """
+        if self._port is None:
+            return False
+
+        is_connectable = check_socket(self.hostname, self._port, timeout)
+        if not is_connectable:
+            _LOG.warning(
+                "Connection validation FAILED for %s:%d - port may be stale!",
+                self.service_name,
+                self._port,
+            )
+        else:
+            _LOG.debug(
+                "Connection validation OK for %s:%d",
+                self.service_name,
+                self._port,
+            )
+        return is_connectable
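Aside (not part of the patch): the heart of `get_port` above is the `docker compose port` lookup, which maps the fixed in-container port (2254) to whatever ephemeral host port docker assigned after the most recent (re)start. A standalone sketch of that lookup is below; the function name is hypothetical, and unlike the patch (which uses `shell=True` and `split(":")[1]`) it uses an argv list and `rsplit` purely for illustration.

```python
from subprocess import run


def lookup_host_port(project: str, service: str, container_port: int) -> int:
    """Ask docker compose which ephemeral host port maps to the container port."""
    out = run(
        ["docker", "compose", "-p", project, "port", service, str(container_port)],
        check=True,
        capture_output=True,
        text=True,
    ).stdout.strip()
    # Output looks like "0.0.0.0:49154"; the host port is after the last colon.
    return int(out.rsplit(":", 1)[1])
```

This is also why the cached `_port` can go stale: the mapping only survives until the container restarts, so `validate_connection()` re-probing the cached value before trusting it is the cheap way to detect that.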
+ + _LOG.info("Setting up reboot_test_server fixture") + reboot_test_server_info = SshTestServerInfo( compose_project_name=ssh_test_server.compose_project_name, service_name=REBOOT_TEST_SERVER_NAME, @@ -163,11 +186,17 @@ def reboot_test_server( username=ssh_test_server.username, id_rsa_path=ssh_test_server.id_rsa_path, ) + + port = reboot_test_server_info.get_port() + _LOG.info("Reboot SSH server discovered on port %d", port) + wait_docker_service_socket( locked_docker_services, reboot_test_server_info.hostname, - reboot_test_server_info.get_port(), + port, ) + + _LOG.info("Reboot SSH server validated and ready") return reboot_test_server_info diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py index 1dce67a13d..d205c0edea 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py @@ -35,7 +35,17 @@ def test_ssh_service_remote_exec( This checks state of the service across multiple invocations and states to check for internal cache handling logic as well. """ - # pylint: disable=protected-access + # pylint: disable=protected-access,too-many-locals + _LOG.info("test_ssh_service_remote_exec starting - validating server connections") + + # Validate connections before starting test + for server_info, name in ((ssh_test_server, "main"), (alt_test_server, "alt")): + if not server_info.validate_connection(): + _LOG.warning( + "%s server connection validation failed, may have stale port", + name, + ) + with ssh_host_service: config = ssh_test_server.to_ssh_service_config() @@ -149,12 +159,26 @@ def check_ssh_service_reboot( graceful: bool, ) -> None: """Check the SshHostService reboot operation.""" + # pylint: disable=too-many-locals + # Note: rebooting changes the port number unfortunately, but makes it # easier to check for success. # Also, it may cause issues with other parallel unit tests, so we run it as # a part of the same unit test for now. + + _LOG.warning( + "*** STARTING REBOOT TEST (graceful=%s) - This will change container ports! ***", + graceful, + ) + with ssh_host_service: reboot_test_srv_ssh_svc_conf = reboot_test_server.to_ssh_service_config(uncached=True) + original_port = reboot_test_srv_ssh_svc_conf["ssh_port"] + _LOG.warning( + "Reboot test using original port %d for %s", + original_port, + reboot_test_server.service_name, + ) (status, results_info) = ssh_host_service.remote_exec( script=['echo "sleeping..."', "sleep 30", 'echo "should not reach this point"'], config=reboot_test_srv_ssh_svc_conf, @@ -183,7 +207,7 @@ def check_ssh_service_reboot( assert "should not reach this point" not in stdout reboot_test_srv_ssh_svc_conf_new: dict = {} - for _ in range(0, 3): + for attempt in range(0, 3): # Give docker some time to restart the service after the "reboot". # Note: this relies on having a `restart_policy` in the docker-compose.yml file. 
diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py
index 1dce67a13d..d205c0edea 100644
--- a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py
+++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py
@@ -35,7 +35,17 @@ def test_ssh_service_remote_exec(
     This checks state of the service across multiple invocations and states to
     check for internal cache handling logic as well.
     """
-    # pylint: disable=protected-access
+    # pylint: disable=protected-access,too-many-locals
+    _LOG.info("test_ssh_service_remote_exec starting - validating server connections")
+
+    # Validate connections before starting test
+    for server_info, name in ((ssh_test_server, "main"), (alt_test_server, "alt")):
+        if not server_info.validate_connection():
+            _LOG.warning(
+                "%s server connection validation failed, may have stale port",
+                name,
+            )
+
     with ssh_host_service:
         config = ssh_test_server.to_ssh_service_config()
@@ -149,12 +159,26 @@ def check_ssh_service_reboot(
     graceful: bool,
 ) -> None:
     """Check the SshHostService reboot operation."""
+    # pylint: disable=too-many-locals
+
     # Note: rebooting changes the port number unfortunately, but makes it
     # easier to check for success.
     # Also, it may cause issues with other parallel unit tests, so we run it as
     # a part of the same unit test for now.
+
+    _LOG.warning(
+        "*** STARTING REBOOT TEST (graceful=%s) - This will change container ports! ***",
+        graceful,
+    )
+
     with ssh_host_service:
         reboot_test_srv_ssh_svc_conf = reboot_test_server.to_ssh_service_config(uncached=True)
+        original_port = reboot_test_srv_ssh_svc_conf["ssh_port"]
+        _LOG.warning(
+            "Reboot test using original port %d for %s",
+            original_port,
+            reboot_test_server.service_name,
+        )
         (status, results_info) = ssh_host_service.remote_exec(
             script=['echo "sleeping..."', "sleep 30", 'echo "should not reach this point"'],
             config=reboot_test_srv_ssh_svc_conf,
@@ -183,7 +207,7 @@
         assert "should not reach this point" not in stdout

         reboot_test_srv_ssh_svc_conf_new: dict = {}
-        for _ in range(0, 3):
+        for attempt in range(0, 3):
             # Give docker some time to restart the service after the "reboot".
             # Note: this relies on having a `restart_policy` in the docker-compose.yml file.
             time.sleep(1)
@@ -195,21 +219,41 @@
                 run_res = run(
                     f"docker compose -p {reboot_test_server.compose_project_name} ps",
                     shell=True,
                     capture_output=True,
                     check=False,
                 )
-                print(run_res.stdout.decode())
-                print(run_res.stderr.decode())
+                _LOG.info(
+                    "Docker ps output (attempt %d):\nSTDOUT:\n%s\nSTDERR:\n%s",
+                    attempt + 1,
+                    run_res.stdout.decode(),
+                    run_res.stderr.decode(),
+                )
                 reboot_test_srv_ssh_svc_conf_new = reboot_test_server.to_ssh_service_config(
                     uncached=True
                 )
-                if (
-                    reboot_test_srv_ssh_svc_conf_new["ssh_port"]
-                    != reboot_test_srv_ssh_svc_conf["ssh_port"]
-                ):
+                new_port = reboot_test_srv_ssh_svc_conf_new["ssh_port"]
+                _LOG.warning(
+                    "Port check attempt %d: %d -> %d",
+                    attempt + 1,
+                    original_port,
+                    new_port,
+                )
+                if new_port != original_port:
+                    _LOG.warning(
+                        "*** PORT CHANGED: %d -> %d (this affects ALL workers!) ***",
+                        original_port,
+                        new_port,
+                    )
                     break
             except CalledProcessError as ex:
-                _LOG.info("Failed to check port for reboot test server: %s", ex)
+                _LOG.info(
+                    "Failed to check port for reboot test server (attempt %d): %s",
+                    attempt + 1,
+                    ex,
+                )
         assert (
             reboot_test_srv_ssh_svc_conf_new["ssh_port"]
             != reboot_test_srv_ssh_svc_conf["ssh_port"]
+        ), (
+            f"""Port should have changed from {original_port} """
+            f"""but is still {reboot_test_srv_ssh_svc_conf_new["ssh_port"]}"""
         )

         wait_docker_service_socket(
@@ -227,6 +271,10 @@
         assert status.is_succeeded()
         assert results["stdout"].strip() == REBOOT_TEST_SERVER_NAME

+    _LOG.warning(
+        "*** REBOOT TEST COMPLETED - Other workers may now have stale cached ports! ***",
+    )
+

 @requires_docker
 def test_ssh_service_reboot(
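Aside (not part of the patch): because the reboot test above deliberately changes the reboot-server's published host port, any other xdist worker holding a cached `SshTestServerInfo._port` can suddenly be pointing at a dead port. A hedged sketch of the guard other tests could apply, using only the `validate_connection()` and `to_ssh_service_config()` methods this PR adds, is below; the helper name is hypothetical.

```python
def fresh_ssh_config(server_info) -> dict:
    """Return an ssh service config, refreshing the port if the cached one went stale."""
    if not server_info.validate_connection():
        # Cached port no longer answers; force a new `docker compose port` lookup.
        return server_info.to_ssh_service_config(uncached=True)
    return server_info.to_ssh_service_config()
```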
diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py
index d06516b3fe..7fec2fd813 100644
--- a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py
+++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py
@@ -5,6 +5,7 @@
 """Tests for mlos_bench.services.remote.ssh.SshService base class."""

 import asyncio
+import logging
 import time
 from importlib.metadata import PackageNotFoundError, version
 from subprocess import run
@@ -40,6 +41,8 @@
     # OK: pytest-lazy-fixture not installed
     pass

+_LOG = logging.getLogger(__name__)
+

 @requires_docker
 @requires_ssh
@@ -52,12 +55,22 @@
 )
 def test_ssh_service_test_infra(ssh_test_server_info: SshTestServerInfo, server_name: str) -> None:
     """Check for the pytest-docker ssh test infra."""
+    _LOG.info("test_ssh_service_test_infra starting with %s", server_name)
+
     assert ssh_test_server_info.service_name == server_name

     ip_addr = resolve_host_name(ssh_test_server_info.hostname)
     assert ip_addr is not None

+    # Use validation method to detect stale ports
+    if not ssh_test_server_info.validate_connection():
+        _LOG.warning(
+            "Cached port validation failed, getting fresh port for %s",
+            server_name,
+        )
+
     local_port = ssh_test_server_info.get_port()
+    _LOG.info("Using port %d for %s", local_port, server_name)
     assert check_socket(ip_addr, local_port)

     ssh_cmd = (
         "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new "
@@ -67,6 +80,8 @@
     cmd = run(ssh_cmd.split(), capture_output=True, text=True, check=True)
     assert cmd.stdout.strip() == server_name

+    # assert False, "Force failure for debugging."
+

 @pytest.mark.filterwarnings(
     "ignore:.*(coroutine 'sleep' was never awaited).*:RuntimeWarning:.*event_loop_context_test.*:0"
diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/up.sh b/mlos_bench/mlos_bench/tests/services/remote/ssh/up.sh
index f0f152975d..24278d0b17 100755
--- a/mlos_bench/mlos_bench/tests/services/remote/ssh/up.sh
+++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/up.sh
@@ -24,6 +24,13 @@
 docker compose -p "$PROJECT_NAME" cp ssh-server:/root/.ssh/id_rsa ./id_rsa
 chmod 0600 ./id_rsa
 set +x

+for server in ssh-server alt-server reboot-server; do
+    until [ "$(docker inspect -f '{{.State.Health.Status}}' $(docker compose -p ${PROJECT_NAME} ps -q ${server}))" = "healthy" ]; do
+        echo "Waiting for ${server} container to become healthy..."
+        sleep 1
+    done
+done
+
 echo "OK: private key available at '$scriptdir/id_rsa'. Connect to the ssh-server container at the following port:"
 docker compose -p "$PROJECT_NAME" port ssh-server ${PORT:-2254} | cut -d: -f2
 echo "INFO: And this port for the alt-server container:"
diff --git a/setup.cfg b/setup.cfg
index 1c1cfe1416..2a6e3ecc7f 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -31,6 +31,15 @@
 extend-ignore = E203

 [tool:pytest]
 minversion = 7.1
+# Enable the CLI log so that when xdist runners fail a particular test, the
+# fixture setup code is also captured.
+# Note: this can be a little noisy when running without xdist (e.g., -n0), but
+# we typically only do that for single tests while debugging.
+# See Also: conftest.py where we also configure file based logging.
+log_cli = true
+log_cli_level = INFO
+log_cli_format = %(asctime)s.%(msecs)03d [%(process)d][%(threadName)s] [%(filename)s:%(lineno)d %(funcName)s] %(levelname)s: %(message)s
+log_cli_date_format = %Y-%m-%d %H:%M:%S
 #pythonpath = .
 # Run all tests to completion on multiple processes.
 # Run failed and new tests first by default.
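Aside (not part of the patch): once CI uploads the per-worker files matched by `logs/pytest*.log`, it can help to read them as one interleaved stream. The sketch below is a hypothetical post-mortem helper, not code from this PR; it assumes the `LOG_FMT` above (timestamp-first, one record per line), so lexical sorting approximates chronological order and multi-line records such as tracebacks will not interleave perfectly.

```python
import glob


def merged_log_lines(pattern: str = "logs/pytest-*.log") -> list[str]:
    """Merge per-worker log lines by their leading timestamp for post-mortem reading."""
    lines: list[str] = []
    for path in sorted(glob.glob(pattern)):
        with open(path, encoding="utf-8") as fh:
            lines.extend(f"{line.rstrip()}  [{path}]" for line in fh if line.strip())
    # LOG_FMT puts an ISO-like timestamp first, so a lexical sort is roughly chronological.
    return sorted(lines)


if __name__ == "__main__":
    for line in merged_log_lines():
        print(line)
```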