diff --git a/.github/workflows/devcontainer.yml b/.github/workflows/devcontainer.yml
index 1ea79d6757..14f2a05232 100644
--- a/.github/workflows/devcontainer.yml
+++ b/.github/workflows/devcontainer.yml
@@ -208,13 +208,6 @@ jobs:
           # Now actually run the tests.
           docker exec --user vscode --env USER=vscode mlos-devcontainer make CONDA_INFO_LEVEL=-v test

-      - name: Upload coverage report as build artifact
-        if: always()    # ensures it runs even if tests fail
-        uses: actions/upload-artifact@v4
-        with:
-          name: coverage-report
-          path: coverage.xml
-
       - name: Generate and test binary distribution files
         timeout-minutes: 10
         run: |
@@ -298,6 +291,21 @@ jobs:
           docker tag mlos-devcontainer:latest ${{ secrets.ACR_LOGINURL }}/mlos-devcontainer:$image_tag
           docker push ${{ secrets.ACR_LOGINURL }}/mlos-devcontainer:$image_tag

+      # ensure these run even if tests fail, but only after everything else is done or skipped
+      - name: Upload coverage report as build artifact
+        if: always()
+        uses: actions/upload-artifact@v5
+        with:
+          name: coverage-report
+          path: coverage.xml
+
+      - name: Upload pytest logs
+        if: always()
+        uses: actions/upload-artifact@v5
+        with:
+          name: devcontainer-pytest-logs
+          path: logs/pytest*.log
+          retention-days: 7

   PublishDocs:
     name: Publish Documentation
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 5518b15d97..2eef298e60 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -181,3 +181,11 @@ jobs:
       - name: Generate and test binary distribution files
         run: make CONDA_ENV_NAME=$CONDA_ENV_NAME CONDA_INFO_LEVEL=-v dist dist-test
+
+      - name: Upload pytest logs
+        if: always()
+        uses: actions/upload-artifact@v5
+        with:
+          name: linux-${{ matrix.python_version }}-pytest-logs
+          path: logs/pytest*.log
+          retention-days: 7
diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml
index 37bd8e39e4..3fd4d7b064 100644
--- a/.github/workflows/macos.yml
+++ b/.github/workflows/macos.yml
@@ -163,6 +163,13 @@ jobs:
       - name: Generate and test binary distribution files
         run: make CONDA_ENV_NAME=$CONDA_ENV_NAME CONDA_INFO_LEVEL=-v dist dist-test

+      - name: Upload pytest logs
+        if: always()
+        uses: actions/upload-artifact@v5
+        with:
+          name: macos-pytest-logs
+          path: logs/pytest*.log
+          retention-days: 7

   MacOSDevContainerBuildTest:
     name: MacOS DevContainer Build/Test
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index fde7611678..cd5051b0c2 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -142,6 +142,13 @@ jobs:
         run: |
           .github/workflows/build-dist-test.ps1

+      - name: Upload pytest logs
+        if: always()
+        uses: actions/upload-artifact@v5
+        with:
+          name: windows-pytest-logs
+          path: logs/pytest*.log
+          retention-days: 7

   WindowsDevContainerBuildTest:
     name: Windows DevContainer Build/Test
diff --git a/.gitignore b/.gitignore
index 471d653344..1bb01c59b7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -171,3 +171,6 @@ build/*.build-stamp
 *.duckdb
 *.db.wal
 *.duckdb.wal
+
+# pytest logs
+logs/pytest*.log
diff --git a/conftest.py b/conftest.py
index 84a04727c4..de9cc2646c 100644
--- a/conftest.py
+++ b/conftest.py
@@ -7,6 +7,7 @@
 # Note: This file is named conftest.py so that pytest picks it up automatically
 # without the need to adjust PYTHONPATH or sys.path as much.

+import logging
 import os
 import shutil
 from tempfile import mkdtemp
@@ -15,6 +16,14 @@
 import pytest

 from xdist.workermanage import WorkerController

+# See Also: setup.cfg and launcher.py
+LOG_FMT = (
+    "%(asctime)s.%(msecs)03d [%(process)d][%(threadName)s] "
+    "[%(filename)s:%(lineno)d %(funcName)s] "
+    "%(levelname)s: %(message)s"
+)
+DATE_FMT = "%Y-%m-%d %H:%M:%S"
+

 def is_master(config: pytest.Config) -> bool:
     """True if the code running the given pytest.config object is running in a xdist
@@ -56,6 +65,20 @@ def pytest_configure(config: pytest.Config) -> None:
     # Add it to the config so that it can passed to the worker nodes.
     setattr(config, "shared_temp_dir", mkdtemp())

+    # Configure per-worker log file.
+    worker_id = getattr(config, "workerinput", {}).get("workerid", "master")
+    os.makedirs("logs", exist_ok=True)
+    log_file_path = os.path.join("logs", f"pytest-{worker_id}.log")
+    file_handler = logging.FileHandler(log_file_path, mode="w")
+    if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
+        file_handler.setLevel(logging.DEBUG)
+    else:
+        file_handler.setLevel(logging.INFO)
+    # logging.basicConfig(level=file_handler.level, format=LOG_FMT, datefmt=DATE_FMT)
+    log_formatter = logging.Formatter(fmt=LOG_FMT, datefmt=DATE_FMT)
+    file_handler.setFormatter(log_formatter)
+    logging.getLogger().addHandler(file_handler)
+

 def pytest_configure_node(node: WorkerController) -> None:
    """Xdist hook used to inform workers of the location of the shared temp dir."""
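Aside (not part of the patch): the conftest.py change above boils down to "attach one `FileHandler` per xdist worker to the root logger, named after the worker id". A minimal standalone sketch of that pattern is below; the helper name `attach_worker_log_handler` is hypothetical, and unlike the real conftest (which relies on pytest's `log_cli_level` to set the logger level), this sketch sets the root logger level itself so it runs on its own.

```python
import logging
import os


def attach_worker_log_handler(worker_id: str, log_dir: str = "logs") -> logging.Handler:
    """Attach a per-worker log file handler to the root logger and return it."""
    os.makedirs(log_dir, exist_ok=True)
    handler = logging.FileHandler(os.path.join(log_dir, f"pytest-{worker_id}.log"), mode="w")
    handler.setLevel(logging.INFO)
    handler.setFormatter(
        logging.Formatter(
            fmt=(
                "%(asctime)s.%(msecs)03d [%(process)d][%(threadName)s] "
                "[%(filename)s:%(lineno)d %(funcName)s] %(levelname)s: %(message)s"
            ),
            datefmt="%Y-%m-%d %H:%M:%S",
        )
    )
    root = logging.getLogger()
    root.setLevel(logging.INFO)  # the real conftest leaves this to pytest's log_cli_level
    root.addHandler(handler)
    return handler


if __name__ == "__main__":
    # The controller process falls back to "master"; xdist workers report "gw0", "gw1", ...
    attach_worker_log_handler("gw0")
    logging.getLogger(__name__).info("hello from a (simulated) worker")
```

Because each worker process gets its own file (`logs/pytest-gw0.log`, `logs/pytest-gw1.log`, ...), the uploaded artifacts stay readable even when many workers interleave output.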
diff --git a/mlos_bench/mlos_bench/launcher.py b/mlos_bench/mlos_bench/launcher.py
index c728ed7fb2..cd76f456cf 100644
--- a/mlos_bench/mlos_bench/launcher.py
+++ b/mlos_bench/mlos_bench/launcher.py
@@ -37,6 +37,14 @@
 _LOG_FORMAT = "%(asctime)s %(filename)s:%(lineno)d %(funcName)s %(levelname)s %(message)s"
 logging.basicConfig(level=_LOG_LEVEL, format=_LOG_FORMAT)

+# TODO: Future PR: See Also: /conftest.py, setup.cfg
+# _LOG_FORMAT = (
+#     "%(asctime)s.%(msecs)03d [%(process)d][%(threadName)s] "
+#     "[%(filename)s:%(lineno)d %(funcName)s] "
+#     "%(levelname)s %(message)s"
+# )
+# logging.basicConfig(level=_LOG_LEVEL, format=_LOG_FORMAT, datefmt="%Y-%m-%d %H:%M:%S")
+
 _LOG = logging.getLogger(__name__)
diff --git a/mlos_bench/mlos_bench/services/remote/ssh/ssh_service.py b/mlos_bench/mlos_bench/services/remote/ssh/ssh_service.py
index 5955f6dfd1..199d640294 100644
--- a/mlos_bench/mlos_bench/services/remote/ssh/ssh_service.py
+++ b/mlos_bench/mlos_bench/services/remote/ssh/ssh_service.py
@@ -9,6 +9,7 @@
 from abc import ABCMeta
 from asyncio import Event as CoroEvent
 from asyncio import Lock as CoroLock
+from asyncio import sleep as async_sleep
 from collections.abc import Callable, Coroutine
 from threading import current_thread
 from types import TracebackType
@@ -172,34 +173,60 @@ async def get_client_connection(
             A tuple of (SSHClientConnection, SshClient).
         """
         _LOG.debug("%s: get_client_connection: %s", current_thread().name, connect_params)
-        async with self._cache_lock:
-            connection_id = SshClient.id_from_params(connect_params)
-            client: None | SshClient | asyncssh.SSHClient
-            _, client = self._cache.get(connection_id, (None, None))
-            if client:
-                _LOG.debug("%s: Checking cached client %s", current_thread().name, connection_id)
-                connection = await client.connection()
-                if not connection:
-                    _LOG.debug(
-                        "%s: Removing stale client connection %s from cache.",
-                        current_thread().name,
-                        connection_id,
-                    )
-                    self._cache.pop(connection_id)
-                    # Try to reconnect next.
-                else:
-                    _LOG.debug("%s: Using cached client %s", current_thread().name, connection_id)
-            if connection_id not in self._cache:
-                _LOG.debug(
-                    "%s: Establishing client connection to %s",
+        connection_id = SshClient.id_from_params(connect_params)
+        for i in range(3):  # TODO: make the retry count configurable
+            try:
+                async with self._cache_lock:
+                    client: None | SshClient | asyncssh.SSHClient
+                    _, client = self._cache.get(connection_id, (None, None))
+                    if client:
+                        _LOG.debug(
+                            "%s: Checking cached client %s", current_thread().name, connection_id
+                        )
+                        connection = await client.connection()
+                        if not connection:
+                            _LOG.debug(
+                                "%s: Removing stale client connection %s from cache.",
+                                current_thread().name,
+                                connection_id,
+                            )
+                            self._cache.pop(connection_id)
+                            # Try to reconnect next.
+                        else:
+                            _LOG.debug(
+                                "%s: Using cached client %s", current_thread().name, connection_id
+                            )
+                    if connection_id not in self._cache:
+                        _LOG.debug(
+                            "%s: Establishing client connection to %s",
+                            current_thread().name,
+                            connection_id,
+                        )
+                        connection, client = await asyncssh.create_connection(
+                            SshClient, **connect_params
+                        )
+                        assert isinstance(client, SshClient)
+                        self._cache[connection_id] = (connection, client)
+                        _LOG.debug(
+                            "%s: Created connection to %s.", current_thread().name, connection_id
+                        )
+                    return self._cache[connection_id]
+            except ConnectionRefusedError as ex:  # TODO: Add other error handling here too
+                _LOG.warning(
+                    "%s: Attempt %d: Failed to connect to %s: %s",
                     current_thread().name,
+                    i + 1,
                     connection_id,
+                    ex,
                 )
-                connection, client = await asyncssh.create_connection(SshClient, **connect_params)
-                assert isinstance(client, SshClient)
-                self._cache[connection_id] = (connection, client)
-                _LOG.debug("%s: Created connection to %s.", current_thread().name, connection_id)
-            return self._cache[connection_id]
+                if i < 2:  # TODO: adjust to match max range
+                    await async_sleep(1.0)  # TODO: Make this configurable
+                if i == 2:  # TODO: adjust to match max range
+                    _LOG.error(
+                        "%s: Giving up connecting to %s", current_thread().name, connection_id
+                    )
+                    raise
+        raise RuntimeError("Unreachable code in get_client_connection")

     def cleanup(self) -> None:
         """Closes all cached connections."""
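Aside (not part of the patch): the `get_client_connection` change above hard-codes three attempts with a 1 second pause, and the TODOs note that both should become configurable. A minimal standalone sketch of that retry-on-`ConnectionRefusedError` pattern, with the attempt count and delay as parameters, might look like the following; `connect_with_retries` and `flaky_connect` are hypothetical names, not helpers from this repo.

```python
import asyncio
import logging

_LOG = logging.getLogger(__name__)


async def connect_with_retries(connect, max_attempts: int = 3, delay: float = 1.0):
    """Call an async connect() factory, retrying on ConnectionRefusedError."""
    for attempt in range(1, max_attempts + 1):
        try:
            return await connect()
        except ConnectionRefusedError as ex:
            _LOG.warning("Attempt %d/%d failed: %s", attempt, max_attempts, ex)
            if attempt == max_attempts:
                raise
            await asyncio.sleep(delay)
    raise RuntimeError("unreachable")


async def _demo() -> None:
    async def flaky_connect():
        # Stand-in for an asyncssh-style connection factory that isn't up yet.
        raise ConnectionRefusedError("server not up yet")

    try:
        await connect_with_retries(flaky_connect, max_attempts=2, delay=0.1)
    except ConnectionRefusedError:
        _LOG.error("gave up")


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    asyncio.run(_demo())
```

Keeping the retry loop outside the cache lock's critical section, as the patch does with the `return` inside the `try`, means a refused connection never holds the lock while sleeping.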
"UTC", @@ -171,11 +174,24 @@ def wait_docker_service_healthy( def wait_docker_service_socket(docker_services: DockerServices, hostname: str, port: int) -> None: """Wait until a docker service is ready.""" - docker_services.wait_until_responsive( - check=lambda: check_socket(hostname, port), - timeout=60.0, - pause=0.5, - ) + _LOG.info("Waiting for %s:%d to become responsive", hostname, port) + + def check_with_logging() -> bool: + result = check_socket(hostname, port) + if not result: + _LOG.debug("Socket check failed for %s:%d", hostname, port) + return result + + try: + docker_services.wait_until_responsive( + check=check_with_logging, + timeout=60.0, + pause=0.5, + ) + _LOG.info("Socket %s:%d is now responsive", hostname, port) + except Exception as e: + _LOG.error("Failed waiting for %s:%d: %s", hostname, port, e) + raise def check_socket(host: str, port: int, timeout: float = 1.0) -> bool: @@ -192,10 +208,27 @@ def check_socket(host: str, port: int, timeout: float = 1.0) -> bool: ------- bool """ - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: - sock.settimeout(timeout) # seconds - result = sock.connect_ex((host, port)) - return result == 0 + try: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.settimeout(timeout) # seconds + result = sock.connect_ex((host, port)) + success = result == 0 + if not success: + _LOG.debug( + "Socket connection to %s:%d failed with code %d", + host, + port, + result, + ) + return success + except (OSError, TimeoutError) as e: + _LOG.debug( + "Socket check exception for %s:%d: %s", + host, + port, + e, + ) + return False def resolve_host_name(host: str) -> str | None: diff --git a/mlos_bench/mlos_bench/tests/docker_fixtures_util.py b/mlos_bench/mlos_bench/tests/docker_fixtures_util.py index 2309123bc8..69c23f3aba 100644 --- a/mlos_bench/mlos_bench/tests/docker_fixtures_util.py +++ b/mlos_bench/mlos_bench/tests/docker_fixtures_util.py @@ -16,9 +16,14 @@ """ # pylint: disable=redefined-outer-name +import json +import logging import os import sys from collections.abc import Generator +from json import JSONDecodeError +from subprocess import CalledProcessError, run +from time import sleep from typing import Any import pytest @@ -26,6 +31,72 @@ from pytest_docker.plugin import Services as DockerServices from pytest_docker.plugin import get_docker_services +_LOG = logging.getLogger(__name__) + + +def wait_docker_service_healthy( + compose_project_name: str, + service_name: str, + timeout_seconds: int = 30, + check_interval_seconds: int = 1, +) -> None: + """ + Waits for the specified docker service to become healthy. + + Parameters + ---------- + compose_project_name : str + The docker-compose project name. + service_name : str + The name of the service to wait for. + timeout_seconds : int + The maximum time to wait for the service to become healthy. + check_interval_seconds : int + The interval between health checks. 
+ """ + for _ in range(0, timeout_seconds, check_interval_seconds): + try: + inspect_cmd = run( + ( + f"docker compose -p {compose_project_name} " + f"ps --format '{{{{.ID}}}}' {service_name}" + ), + shell=True, + check=True, + capture_output=True, + ) + container_id = inspect_cmd.stdout.decode().strip() + if not container_id: + raise RuntimeError(f"Container ID for {service_name} not found") + inspect_state_cmd = run( + f"docker inspect {container_id}", + shell=True, + check=True, + capture_output=True, + ) + inspect_data = json.loads(inspect_state_cmd.stdout.decode()) + state = inspect_data[0].get("State", {}) + health = state.get("Health") + if state.get("Status") == "running" and ( + not health or health.get("Status") == "healthy" + ): + _LOG.debug("Container %s is running and healthy", service_name) + return + else: + _LOG.info( + ( + "Waiting for %s to become running (State.Status: %s) " + "and healthy (Health.Status=%s)" + ), + service_name, + state.get("Status") if state else "UNKNOWN", + health.get("Status") if health else "N/A", + ) + except (RuntimeError, CalledProcessError, JSONDecodeError) as e: + _LOG.error("Error checking health of %s: %s", service_name, e) + sleep(check_interval_seconds) + raise RuntimeError(f"Container {service_name} did not become healthy in time.") + # Fixtures to configure the pytest-docker plugin. @pytest.fixture(scope="session") @@ -44,16 +115,16 @@ def docker_setup() -> list[str] | str: @pytest.fixture(scope="session") def docker_compose_file(pytestconfig: pytest.Config) -> list[str]: """ - Fixture for the path to the docker-compose file. + Base fixture for the path to the docker-compose file(s). Parameters ---------- pytestconfig : pytest.Config - Returns - ------- - list[str] - List of paths to the docker-compose file(s). + Notes + ----- + This fixture should be overridden in the local conftest.py to point to the actual + docker-compose.yml files needed for the tests. """ _ = pytestconfig # unused # Add additional configs as necessary here. diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/Dockerfile b/mlos_bench/mlos_bench/tests/services/remote/ssh/Dockerfile index d157473dbf..87fee7faff 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/Dockerfile +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/Dockerfile @@ -6,6 +6,7 @@ RUN apt-get update \ openssh-server openssh-client \ sudo ARG PORT=2254 +ENV PORT=${PORT} EXPOSE ${PORT} RUN echo "Port ${PORT}" > /etc/ssh/sshd_config.d/local.conf \ && echo "PermitRootLogin prohibit-password" >> /etc/ssh/sshd_config.d/local.conf \ @@ -15,3 +16,6 @@ RUN echo "Port ${PORT}" > /etc/ssh/sshd_config.d/local.conf \ ENV TIMEOUT=180 # NOTE: The "-n" is important in the call to "wait" so that an exit failure can be detected. 
CMD ["/bin/bash", "-eux", "-c", "trap : TERM INT; service ssh start && sleep ${TIMEOUT:-180} & wait -n"] + +HEALTHCHECK --interval=5s --timeout=3s --retries=5 --start-period=1s \ + CMD ssh -o StrictHostKeyChecking=accept-new -o BatchMode=yes -o ConnectTimeout=2 -p ${PORT} -i /root/.ssh/id_rsa root@localhost 'echo ok' diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py index de822fe2eb..81f30941ed 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py @@ -4,9 +4,13 @@ # """Common data classes for the SSH service tests.""" +import logging from dataclasses import dataclass from subprocess import run +from mlos_bench.tests import check_socket +from mlos_bench.tests.docker_fixtures_util import wait_docker_service_healthy + # The SSH test server port and name. # See Also: docker-compose.yml SSH_TEST_SERVER_PORT = 2254 @@ -14,6 +18,8 @@ ALT_TEST_SERVER_NAME = "alt-server" REBOOT_TEST_SERVER_NAME = "reboot-server" +_LOG = logging.getLogger(__name__) + @dataclass class SshTestServerInfo: @@ -32,24 +38,62 @@ class SshTestServerInfo: id_rsa_path: str _port: int | None = None - def get_port(self, uncached: bool = False) -> int: + def get_port(self, uncached: bool = False, check_port: bool = True) -> int: """ Gets the port that the SSH test server is listening on. Note: this value can change when the service restarts so we can't rely on the DockerServices. """ - if self._port is None or uncached: - port_cmd = run( - ( - f"docker compose -p {self.compose_project_name} " - f"port {self.service_name} {SSH_TEST_SERVER_PORT}" - ), - shell=True, - check=True, - capture_output=True, - ) - self._port = int(port_cmd.stdout.decode().strip().split(":")[1]) + if not uncached and self._port is not None: + # NOTE: this cache may become stale in another worker if the + # container restarts in one and the other worker doesn't notice the new port. + # Optionally check the status of the cached port before returning it. + if not check_port or self.validate_connection(): + _LOG.debug( + "Using cached port %s for %s %s validation.", + self._port, + self.service_name, + "with" if check_port else "without", + ) + return self._port + + # Check container state before proceeding to avoid a race in docker + # container startup. + _LOG.info( + ( + "Discovering port for %s (uncached=%s, cached_port=%s) " + "after waiting for container readiness." + ), + self.service_name, + uncached, + self._port, + ) + wait_docker_service_healthy( + self.compose_project_name, + self.service_name, + ) + _LOG.debug("Container %s is healthy, getting port...", self.service_name) + + port_cmd = run( + ( + f"docker compose -p {self.compose_project_name} " + f"port {self.service_name} {SSH_TEST_SERVER_PORT}" + ), + shell=True, + check=True, + capture_output=True, + ) + new_port = int(port_cmd.stdout.decode().strip().split(":")[1]) + old_port = self._port + self._port = new_port + _LOG.info( + "Port for %s: %s -> %s (uncached=%s)", + self.service_name, + old_port, + new_port, + uncached, + ) return self._port def to_ssh_service_config(self, uncached: bool = False) -> dict: @@ -72,3 +116,27 @@ def to_connect_params(self, uncached: bool = False) -> dict: "port": self.get_port(uncached), "username": self.username, } + + def validate_connection(self, timeout: float = 2.0) -> bool: + """ + Validate that the current cached port is still connectable. 
diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py
index de822fe2eb..81f30941ed 100644
--- a/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py
+++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py
@@ -4,9 +4,13 @@
 #
 """Common data classes for the SSH service tests."""

+import logging
 from dataclasses import dataclass
 from subprocess import run

+from mlos_bench.tests import check_socket
+from mlos_bench.tests.docker_fixtures_util import wait_docker_service_healthy
+
 # The SSH test server port and name.
 # See Also: docker-compose.yml
 SSH_TEST_SERVER_PORT = 2254
@@ -14,6 +18,8 @@
 ALT_TEST_SERVER_NAME = "alt-server"
 REBOOT_TEST_SERVER_NAME = "reboot-server"

+_LOG = logging.getLogger(__name__)
+

 @dataclass
 class SshTestServerInfo:
@@ -32,24 +38,62 @@ class SshTestServerInfo:
     id_rsa_path: str
     _port: int | None = None

-    def get_port(self, uncached: bool = False) -> int:
+    def get_port(self, uncached: bool = False, check_port: bool = True) -> int:
         """
         Gets the port that the SSH test server is listening on.

         Note: this value can change when the service restarts so we can't rely on
         the DockerServices.
         """
-        if self._port is None or uncached:
-            port_cmd = run(
-                (
-                    f"docker compose -p {self.compose_project_name} "
-                    f"port {self.service_name} {SSH_TEST_SERVER_PORT}"
-                ),
-                shell=True,
-                check=True,
-                capture_output=True,
-            )
-            self._port = int(port_cmd.stdout.decode().strip().split(":")[1])
+        if not uncached and self._port is not None:
+            # NOTE: this cache may become stale in another worker if the
+            # container restarts in one and the other worker doesn't notice the new port.
+            # Optionally check the status of the cached port before returning it.
+            if not check_port or self.validate_connection():
+                _LOG.debug(
+                    "Using cached port %s for %s %s validation.",
+                    self._port,
+                    self.service_name,
+                    "with" if check_port else "without",
+                )
+                return self._port
+
+        # Check container state before proceeding to avoid a race in docker
+        # container startup.
+        _LOG.info(
+            (
+                "Discovering port for %s (uncached=%s, cached_port=%s) "
+                "after waiting for container readiness."
+            ),
+            self.service_name,
+            uncached,
+            self._port,
+        )
+        wait_docker_service_healthy(
+            self.compose_project_name,
+            self.service_name,
+        )
+        _LOG.debug("Container %s is healthy, getting port...", self.service_name)
+
+        port_cmd = run(
+            (
+                f"docker compose -p {self.compose_project_name} "
+                f"port {self.service_name} {SSH_TEST_SERVER_PORT}"
+            ),
+            shell=True,
+            check=True,
+            capture_output=True,
+        )
+        new_port = int(port_cmd.stdout.decode().strip().split(":")[1])
+        old_port = self._port
+        self._port = new_port
+        _LOG.info(
+            "Port for %s: %s -> %s (uncached=%s)",
+            self.service_name,
+            old_port,
+            new_port,
+            uncached,
+        )
         return self._port

     def to_ssh_service_config(self, uncached: bool = False) -> dict:
@@ -72,3 +116,27 @@ def to_connect_params(self, uncached: bool = False) -> dict:
             "port": self.get_port(uncached),
             "username": self.username,
         }
+
+    def validate_connection(self, timeout: float = 2.0) -> bool:
+        """
+        Validate that the current cached port is still connectable.
+
+        Returns False if the connection fails, indicating the port may be stale.
+        """
+        if self._port is None:
+            return False
+
+        is_connectable = check_socket(self.hostname, self._port, timeout)
+        if not is_connectable:
+            _LOG.warning(
+                "Connection validation FAILED for %s:%d - port may be stale!",
+                self.service_name,
+                self._port,
+            )
+        else:
+            _LOG.debug(
+                "Connection validation OK for %s:%d",
+                self.service_name,
+                self._port,
+            )
+        return is_connectable
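Aside (not part of the patch): the heart of `get_port` above is the `docker compose port` lookup, which maps the fixed in-container port (2254) to whatever ephemeral host port docker assigned after the most recent (re)start. A standalone sketch of that lookup is below; the function name is hypothetical, and unlike the patch (which uses `shell=True` and `split(":")[1]`) it uses an argv list and `rsplit` purely for illustration.

```python
from subprocess import run


def lookup_host_port(project: str, service: str, container_port: int) -> int:
    """Ask docker compose which ephemeral host port maps to the container port."""
    out = run(
        ["docker", "compose", "-p", project, "port", service, str(container_port)],
        check=True,
        capture_output=True,
        text=True,
    ).stdout.strip()
    # Output looks like "0.0.0.0:49154"; the host port is after the last colon.
    return int(out.rsplit(":", 1)[1])
```

This is also why the cached `_port` can go stale: the mapping only survives until the container restarts, so `validate_connection()` re-probing the cached value before trusting it is the cheap way to detect that.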
+ + _LOG.info("Setting up reboot_test_server fixture") + reboot_test_server_info = SshTestServerInfo( compose_project_name=ssh_test_server.compose_project_name, service_name=REBOOT_TEST_SERVER_NAME, @@ -163,11 +186,17 @@ def reboot_test_server( username=ssh_test_server.username, id_rsa_path=ssh_test_server.id_rsa_path, ) + + port = reboot_test_server_info.get_port() + _LOG.info("Reboot SSH server discovered on port %d", port) + wait_docker_service_socket( locked_docker_services, reboot_test_server_info.hostname, - reboot_test_server_info.get_port(), + port, ) + + _LOG.info("Reboot SSH server validated and ready") return reboot_test_server_info diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py index 1dce67a13d..d205c0edea 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py @@ -35,7 +35,17 @@ def test_ssh_service_remote_exec( This checks state of the service across multiple invocations and states to check for internal cache handling logic as well. """ - # pylint: disable=protected-access + # pylint: disable=protected-access,too-many-locals + _LOG.info("test_ssh_service_remote_exec starting - validating server connections") + + # Validate connections before starting test + for server_info, name in ((ssh_test_server, "main"), (alt_test_server, "alt")): + if not server_info.validate_connection(): + _LOG.warning( + "%s server connection validation failed, may have stale port", + name, + ) + with ssh_host_service: config = ssh_test_server.to_ssh_service_config() @@ -149,12 +159,26 @@ def check_ssh_service_reboot( graceful: bool, ) -> None: """Check the SshHostService reboot operation.""" + # pylint: disable=too-many-locals + # Note: rebooting changes the port number unfortunately, but makes it # easier to check for success. # Also, it may cause issues with other parallel unit tests, so we run it as # a part of the same unit test for now. + + _LOG.warning( + "*** STARTING REBOOT TEST (graceful=%s) - This will change container ports! ***", + graceful, + ) + with ssh_host_service: reboot_test_srv_ssh_svc_conf = reboot_test_server.to_ssh_service_config(uncached=True) + original_port = reboot_test_srv_ssh_svc_conf["ssh_port"] + _LOG.warning( + "Reboot test using original port %d for %s", + original_port, + reboot_test_server.service_name, + ) (status, results_info) = ssh_host_service.remote_exec( script=['echo "sleeping..."', "sleep 30", 'echo "should not reach this point"'], config=reboot_test_srv_ssh_svc_conf, @@ -183,7 +207,7 @@ def check_ssh_service_reboot( assert "should not reach this point" not in stdout reboot_test_srv_ssh_svc_conf_new: dict = {} - for _ in range(0, 3): + for attempt in range(0, 3): # Give docker some time to restart the service after the "reboot". # Note: this relies on having a `restart_policy` in the docker-compose.yml file. 
diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py
index 1dce67a13d..d205c0edea 100644
--- a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py
+++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py
@@ -35,7 +35,17 @@ def test_ssh_service_remote_exec(
     This checks state of the service across multiple invocations and states to
     check for internal cache handling logic as well.
     """
-    # pylint: disable=protected-access
+    # pylint: disable=protected-access,too-many-locals
+    _LOG.info("test_ssh_service_remote_exec starting - validating server connections")
+
+    # Validate connections before starting test
+    for server_info, name in ((ssh_test_server, "main"), (alt_test_server, "alt")):
+        if not server_info.validate_connection():
+            _LOG.warning(
+                "%s server connection validation failed, may have stale port",
+                name,
+            )
+
     with ssh_host_service:
         config = ssh_test_server.to_ssh_service_config()
@@ -149,12 +159,26 @@ def check_ssh_service_reboot(
     graceful: bool,
 ) -> None:
     """Check the SshHostService reboot operation."""
+    # pylint: disable=too-many-locals
+
     # Note: rebooting changes the port number unfortunately, but makes it
     # easier to check for success.
     # Also, it may cause issues with other parallel unit tests, so we run it as
     # a part of the same unit test for now.
+
+    _LOG.warning(
+        "*** STARTING REBOOT TEST (graceful=%s) - This will change container ports! ***",
+        graceful,
+    )
+
     with ssh_host_service:
         reboot_test_srv_ssh_svc_conf = reboot_test_server.to_ssh_service_config(uncached=True)
+        original_port = reboot_test_srv_ssh_svc_conf["ssh_port"]
+        _LOG.warning(
+            "Reboot test using original port %d for %s",
+            original_port,
+            reboot_test_server.service_name,
+        )
         (status, results_info) = ssh_host_service.remote_exec(
             script=['echo "sleeping..."', "sleep 30", 'echo "should not reach this point"'],
             config=reboot_test_srv_ssh_svc_conf,
@@ -183,7 +207,7 @@
         assert "should not reach this point" not in stdout

         reboot_test_srv_ssh_svc_conf_new: dict = {}
-        for _ in range(0, 3):
+        for attempt in range(0, 3):
             # Give docker some time to restart the service after the "reboot".
             # Note: this relies on having a `restart_policy` in the docker-compose.yml file.
             time.sleep(1)
@@ -195,21 +219,41 @@
                 run_res = run(
                     f"docker compose -p {reboot_test_server.compose_project_name} ps",
                     shell=True,
                     capture_output=True,
                     check=False,
                 )
-                print(run_res.stdout.decode())
-                print(run_res.stderr.decode())
+                _LOG.info(
+                    "Docker ps output (attempt %d):\nSTDOUT:\n%s\nSTDERR:\n%s",
+                    attempt + 1,
+                    run_res.stdout.decode(),
+                    run_res.stderr.decode(),
+                )
                 reboot_test_srv_ssh_svc_conf_new = reboot_test_server.to_ssh_service_config(
                     uncached=True
                 )
-                if (
-                    reboot_test_srv_ssh_svc_conf_new["ssh_port"]
-                    != reboot_test_srv_ssh_svc_conf["ssh_port"]
-                ):
+                new_port = reboot_test_srv_ssh_svc_conf_new["ssh_port"]
+                _LOG.warning(
+                    "Port check attempt %d: %d -> %d",
+                    attempt + 1,
+                    original_port,
+                    new_port,
+                )
+                if new_port != original_port:
+                    _LOG.warning(
+                        "*** PORT CHANGED: %d -> %d (this affects ALL workers!) ***",
+                        original_port,
+                        new_port,
+                    )
                     break
             except CalledProcessError as ex:
-                _LOG.info("Failed to check port for reboot test server: %s", ex)
+                _LOG.info(
+                    "Failed to check port for reboot test server (attempt %d): %s",
+                    attempt + 1,
+                    ex,
+                )
         assert (
             reboot_test_srv_ssh_svc_conf_new["ssh_port"]
             != reboot_test_srv_ssh_svc_conf["ssh_port"]
+        ), (
+            f"""Port should have changed from {original_port} """
+            f"""but is still {reboot_test_srv_ssh_svc_conf_new["ssh_port"]}"""
         )

         wait_docker_service_socket(
@@ -227,6 +271,10 @@
         assert status.is_succeeded()
         assert results["stdout"].strip() == REBOOT_TEST_SERVER_NAME

+    _LOG.warning(
+        "*** REBOOT TEST COMPLETED - Other workers may now have stale cached ports! ***",
+    )
+

 @requires_docker
 def test_ssh_service_reboot(
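Aside (not part of the patch): because the reboot test above deliberately changes the reboot-server's published host port, any other xdist worker holding a cached `SshTestServerInfo._port` can suddenly be pointing at a dead port. A hedged sketch of the guard other tests could apply, using only the `validate_connection()` and `to_ssh_service_config()` methods this PR adds, is below; the helper name is hypothetical.

```python
def fresh_ssh_config(server_info) -> dict:
    """Return an ssh service config, refreshing the port if the cached one went stale."""
    if not server_info.validate_connection():
        # Cached port no longer answers; force a new `docker compose port` lookup.
        return server_info.to_ssh_service_config(uncached=True)
    return server_info.to_ssh_service_config()
```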
diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py
index d06516b3fe..7fec2fd813 100644
--- a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py
+++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py
@@ -5,6 +5,7 @@
 """Tests for mlos_bench.services.remote.ssh.SshService base class."""

 import asyncio
+import logging
 import time
 from importlib.metadata import PackageNotFoundError, version
 from subprocess import run
@@ -40,6 +41,8 @@
     # OK: pytest-lazy-fixture not installed
     pass

+_LOG = logging.getLogger(__name__)
+

 @requires_docker
 @requires_ssh
@@ -52,12 +55,22 @@
 )
 def test_ssh_service_test_infra(ssh_test_server_info: SshTestServerInfo, server_name: str) -> None:
     """Check for the pytest-docker ssh test infra."""
+    _LOG.info("test_ssh_service_test_infra starting with %s", server_name)
+
     assert ssh_test_server_info.service_name == server_name

     ip_addr = resolve_host_name(ssh_test_server_info.hostname)
     assert ip_addr is not None

+    # Use validation method to detect stale ports
+    if not ssh_test_server_info.validate_connection():
+        _LOG.warning(
+            "Cached port validation failed, getting fresh port for %s",
+            server_name,
+        )
+
     local_port = ssh_test_server_info.get_port()
+    _LOG.info("Using port %d for %s", local_port, server_name)
     assert check_socket(ip_addr, local_port)

     ssh_cmd = (
         "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new "
@@ -67,6 +80,8 @@
     cmd = run(ssh_cmd.split(), capture_output=True, text=True, check=True)
     assert cmd.stdout.strip() == server_name

+    # assert False, "Force failure for debugging."
+

 @pytest.mark.filterwarnings(
     "ignore:.*(coroutine 'sleep' was never awaited).*:RuntimeWarning:.*event_loop_context_test.*:0"
diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/up.sh b/mlos_bench/mlos_bench/tests/services/remote/ssh/up.sh
index f0f152975d..24278d0b17 100755
--- a/mlos_bench/mlos_bench/tests/services/remote/ssh/up.sh
+++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/up.sh
@@ -24,6 +24,13 @@
 docker compose -p "$PROJECT_NAME" cp ssh-server:/root/.ssh/id_rsa ./id_rsa
 chmod 0600 ./id_rsa
 set +x

+for server in ssh-server alt-server reboot-server; do
+    until [ "$(docker inspect -f '{{.State.Health.Status}}' $(docker compose -p ${PROJECT_NAME} ps -q ${server}))" = "healthy" ]; do
+        echo "Waiting for ${server} container to become healthy..."
+        sleep 1
+    done
+done
+
 echo "OK: private key available at '$scriptdir/id_rsa'. Connect to the ssh-server container at the following port:"
 docker compose -p "$PROJECT_NAME" port ssh-server ${PORT:-2254} | cut -d: -f2
 echo "INFO: And this port for the alt-server container:"
diff --git a/setup.cfg b/setup.cfg
index 1c1cfe1416..2a6e3ecc7f 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -31,6 +31,15 @@
 extend-ignore = E203

 [tool:pytest]
 minversion = 7.1
+# Enable the CLI log so that when xdist runners fail a particular test, the
+# fixture setup code is also captured.
+# Note: this can be a little noisy when running without xdist (e.g., -n0), but
+# we typically only do that for single tests while debugging.
+# See Also: conftest.py where we also configure file based logging.
+log_cli = true
+log_cli_level = INFO
+log_cli_format = %(asctime)s.%(msecs)03d [%(process)d][%(threadName)s] [%(filename)s:%(lineno)d %(funcName)s] %(levelname)s: %(message)s
+log_cli_date_format = %Y-%m-%d %H:%M:%S
 #pythonpath = .
 # Run all tests to completion on multiple processes.
 # Run failed and new tests first by default.
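Aside (not part of the patch): once CI uploads the per-worker files matched by `logs/pytest*.log`, it can help to read them as one interleaved stream. The sketch below is a hypothetical post-mortem helper, not code from this PR; it assumes the `LOG_FMT` above (timestamp-first, one record per line), so lexical sorting approximates chronological order and multi-line records such as tracebacks will not interleave perfectly.

```python
import glob


def merged_log_lines(pattern: str = "logs/pytest-*.log") -> list[str]:
    """Merge per-worker log lines by their leading timestamp for post-mortem reading."""
    lines: list[str] = []
    for path in sorted(glob.glob(pattern)):
        with open(path, encoding="utf-8") as fh:
            lines.extend(f"{line.rstrip()}  [{path}]" for line in fh if line.strip())
    # LOG_FMT puts an ISO-like timestamp first, so a lexical sort is roughly chronological.
    return sorted(lines)


if __name__ == "__main__":
    for line in merged_log_lines():
        print(line)
```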