From 0678f9648e62005d19d3b5be5be01a394029c2ce Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Thu, 17 Jul 2025 20:11:48 +0000 Subject: [PATCH 01/76] Accept the terms of service for conda repos --- .devcontainer/Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 4a778ed13a..cb63cd34e4 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -80,6 +80,9 @@ RUN echo "Setup miniconda" \ && /opt/conda/bin/conda init \ && /opt/conda/bin/conda config --set channel_priority strict \ && /opt/conda/bin/conda info \ + && /opt/conda/bin/conda config --show \ + && /opt/conda/bin/conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main \ + && /opt/conda/bin/conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r \ && /opt/conda/bin/conda update -v -y -n base -c conda-forge -c defaults --all \ && /opt/conda/bin/conda list -n base \ && /opt/conda/bin/conda install -v -y -n base -c conda-forge -c defaults conda-libmamba-solver \ From a3a82f775909defe44bfaeff45b381d90e81116f Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Thu, 17 Jul 2025 20:12:00 +0000 Subject: [PATCH 02/76] Make conda output easier for debugging --- .devcontainer/Dockerfile | 2 ++ .github/workflows/linux.yml | 2 ++ .github/workflows/macos.yml | 2 ++ Makefile | 3 +++ 4 files changed, 9 insertions(+) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index cb63cd34e4..f8a8298c35 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -64,6 +64,8 @@ COPY --from=deps-prep --chown=vscode:conda /tmp/conda-tmp/.pre-commit-config.yam # Combine the installation of miniconda and the mlos dependencies into a single step in order to save space. # This allows the mlos env to reference the base env's packages without duplication across layers. RUN echo "Setup miniconda" \ + && set -x \ + && export CONDA_NO_PROGRESS_BARS=1 \ && curl -Ss --url https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-$(uname -m).sh -o /tmp/miniconda3.sh \ && sudo sh /tmp/miniconda3.sh -b -u -p /opt/conda \ && rm -rf /tmp/miniconda3.sh \ diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 5518b15d97..50be59ae51 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -47,6 +47,8 @@ jobs: CONDA_ENV_NAME: unset # See notes about $CONDA below. CONDA_DIR: unset + # Use simple output for conda. + CONDA_NO_PROGRESS_BARS: 1 # When parallel jobs are used, group the output to make debugging easier. MAKEFLAGS: -Oline diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml index 37bd8e39e4..7a42105637 100644 --- a/.github/workflows/macos.yml +++ b/.github/workflows/macos.yml @@ -52,6 +52,8 @@ jobs: CONDA_ENV_NAME: unset # See notes about $CONDA below. CONDA_DIR: unset + # Use simple output for conda. + CONDA_NO_PROGRESS_BARS: 1 # When parallel jobs are used, group the output to make debugging easier. MAKEFLAGS: -Oline diff --git a/Makefile b/Makefile index 6172491fad..9a8427fa28 100644 --- a/Makefile +++ b/Makefile @@ -5,6 +5,9 @@ CONDA_ENV_NAME ?= mlos PYTHON_VERSION := $(shell echo "${CONDA_ENV_NAME}" | sed -r -e 's/^mlos[-]?//') ENV_YML := conda-envs/${CONDA_ENV_NAME}.yml +CONDA_NO_PROGRESS_BARS ?= 1 +export CONDA_NO_PROGRESS_BARS + # Find the non-build python files we should consider as rule dependencies. # Do a single find and multiple filters for better performance. REPO_FILES := $(shell find . 
-type f 2>/dev/null | egrep -v -e '^./(mlos_(core|bench|viz)/)?build/' -e '^./doc/source/' -e '^./doc/build/') From fcbfc49d03c31c9d200505689441c418cc019863 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Thu, 17 Jul 2025 20:23:54 +0000 Subject: [PATCH 03/76] more output adjustments --- .devcontainer/Dockerfile | 2 ++ .github/workflows/linux.yml | 2 ++ .github/workflows/macos.yml | 2 ++ Makefile | 11 ++++++++--- 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index f8a8298c35..6d77b9fcf9 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -65,6 +65,8 @@ COPY --from=deps-prep --chown=vscode:conda /tmp/conda-tmp/.pre-commit-config.yam # This allows the mlos env to reference the base env's packages without duplication across layers. RUN echo "Setup miniconda" \ && set -x \ + && export PYTHONUNBUFFERED=1 \ + && export TERM=dumb \ && export CONDA_NO_PROGRESS_BARS=1 \ && curl -Ss --url https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-$(uname -m).sh -o /tmp/miniconda3.sh \ && sudo sh /tmp/miniconda3.sh -b -u -p /opt/conda \ diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 50be59ae51..9ee351638e 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -48,6 +48,8 @@ jobs: # See notes about $CONDA below. CONDA_DIR: unset # Use simple output for conda. + PYTHONUNBUFFERED: 1 + TERM: dumb CONDA_NO_PROGRESS_BARS: 1 # When parallel jobs are used, group the output to make debugging easier. MAKEFLAGS: -Oline diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml index 7a42105637..1d5f14fe05 100644 --- a/.github/workflows/macos.yml +++ b/.github/workflows/macos.yml @@ -53,6 +53,8 @@ jobs: # See notes about $CONDA below. CONDA_DIR: unset # Use simple output for conda. + PYTHONUNBUFFERED: 1 + TERM: dumb CONDA_NO_PROGRESS_BARS: 1 # When parallel jobs are used, group the output to make debugging easier. MAKEFLAGS: -Oline diff --git a/Makefile b/Makefile index 9a8427fa28..f29d5dee8f 100644 --- a/Makefile +++ b/Makefile @@ -5,9 +5,6 @@ CONDA_ENV_NAME ?= mlos PYTHON_VERSION := $(shell echo "${CONDA_ENV_NAME}" | sed -r -e 's/^mlos[-]?//') ENV_YML := conda-envs/${CONDA_ENV_NAME}.yml -CONDA_NO_PROGRESS_BARS ?= 1 -export CONDA_NO_PROGRESS_BARS - # Find the non-build python files we should consider as rule dependencies. # Do a single find and multiple filters for better performance. REPO_FILES := $(shell find . -type f 2>/dev/null | egrep -v -e '^./(mlos_(core|bench|viz)/)?build/' -e '^./doc/source/' -e '^./doc/build/') @@ -29,6 +26,14 @@ MKDIR_BUILD := $(shell test -d build || mkdir build) # Run make in parallel by default. MAKEFLAGS += -j$(shell nproc 2>/dev/null || sysctl -n hw.ncpu) + +# Adjust to use plain output for easier debugging. +PYTHONUNBUFFERED ?= 1 +export PYTHONUNBUFFERED +TERM ?= dumb +export TERM +CONDA_NO_PROGRESS_BARS ?= 1 +export CONDA_NO_PROGRESS_BARS #MAKEFLAGS += -Oline .PHONY: all From c6669152309a4c918f59c37103e9a5fe31c73e59 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Thu, 17 Jul 2025 20:29:53 +0000 Subject: [PATCH 04/76] Revert "more output adjustments" This reverts commit fcbfc49d03c31c9d200505689441c418cc019863. 
--- .devcontainer/Dockerfile | 2 -- .github/workflows/linux.yml | 2 -- .github/workflows/macos.yml | 2 -- Makefile | 11 +++-------- 4 files changed, 3 insertions(+), 14 deletions(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 6d77b9fcf9..f8a8298c35 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -65,8 +65,6 @@ COPY --from=deps-prep --chown=vscode:conda /tmp/conda-tmp/.pre-commit-config.yam # This allows the mlos env to reference the base env's packages without duplication across layers. RUN echo "Setup miniconda" \ && set -x \ - && export PYTHONUNBUFFERED=1 \ - && export TERM=dumb \ && export CONDA_NO_PROGRESS_BARS=1 \ && curl -Ss --url https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-$(uname -m).sh -o /tmp/miniconda3.sh \ && sudo sh /tmp/miniconda3.sh -b -u -p /opt/conda \ diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 9ee351638e..50be59ae51 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -48,8 +48,6 @@ jobs: # See notes about $CONDA below. CONDA_DIR: unset # Use simple output for conda. - PYTHONUNBUFFERED: 1 - TERM: dumb CONDA_NO_PROGRESS_BARS: 1 # When parallel jobs are used, group the output to make debugging easier. MAKEFLAGS: -Oline diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml index 1d5f14fe05..7a42105637 100644 --- a/.github/workflows/macos.yml +++ b/.github/workflows/macos.yml @@ -53,8 +53,6 @@ jobs: # See notes about $CONDA below. CONDA_DIR: unset # Use simple output for conda. - PYTHONUNBUFFERED: 1 - TERM: dumb CONDA_NO_PROGRESS_BARS: 1 # When parallel jobs are used, group the output to make debugging easier. MAKEFLAGS: -Oline diff --git a/Makefile b/Makefile index f29d5dee8f..9a8427fa28 100644 --- a/Makefile +++ b/Makefile @@ -5,6 +5,9 @@ CONDA_ENV_NAME ?= mlos PYTHON_VERSION := $(shell echo "${CONDA_ENV_NAME}" | sed -r -e 's/^mlos[-]?//') ENV_YML := conda-envs/${CONDA_ENV_NAME}.yml +CONDA_NO_PROGRESS_BARS ?= 1 +export CONDA_NO_PROGRESS_BARS + # Find the non-build python files we should consider as rule dependencies. # Do a single find and multiple filters for better performance. REPO_FILES := $(shell find . -type f 2>/dev/null | egrep -v -e '^./(mlos_(core|bench|viz)/)?build/' -e '^./doc/source/' -e '^./doc/build/') @@ -26,14 +29,6 @@ MKDIR_BUILD := $(shell test -d build || mkdir build) # Run make in parallel by default. MAKEFLAGS += -j$(shell nproc 2>/dev/null || sysctl -n hw.ncpu) - -# Adjust to use plain output for easier debugging. -PYTHONUNBUFFERED ?= 1 -export PYTHONUNBUFFERED -TERM ?= dumb -export TERM -CONDA_NO_PROGRESS_BARS ?= 1 -export CONDA_NO_PROGRESS_BARS #MAKEFLAGS += -Oline .PHONY: all From 2dcfa426d24170d470288e5629b51fd67cba6be5 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Thu, 17 Jul 2025 20:29:55 +0000 Subject: [PATCH 05/76] Revert "Make conda output easier for debugging" This reverts commit a3a82f775909defe44bfaeff45b381d90e81116f. --- .devcontainer/Dockerfile | 2 -- .github/workflows/linux.yml | 2 -- .github/workflows/macos.yml | 2 -- Makefile | 3 --- 4 files changed, 9 deletions(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index f8a8298c35..cb63cd34e4 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -64,8 +64,6 @@ COPY --from=deps-prep --chown=vscode:conda /tmp/conda-tmp/.pre-commit-config.yam # Combine the installation of miniconda and the mlos dependencies into a single step in order to save space. 
# This allows the mlos env to reference the base env's packages without duplication across layers. RUN echo "Setup miniconda" \ - && set -x \ - && export CONDA_NO_PROGRESS_BARS=1 \ && curl -Ss --url https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-$(uname -m).sh -o /tmp/miniconda3.sh \ && sudo sh /tmp/miniconda3.sh -b -u -p /opt/conda \ && rm -rf /tmp/miniconda3.sh \ diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 50be59ae51..5518b15d97 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -47,8 +47,6 @@ jobs: CONDA_ENV_NAME: unset # See notes about $CONDA below. CONDA_DIR: unset - # Use simple output for conda. - CONDA_NO_PROGRESS_BARS: 1 # When parallel jobs are used, group the output to make debugging easier. MAKEFLAGS: -Oline diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml index 7a42105637..37bd8e39e4 100644 --- a/.github/workflows/macos.yml +++ b/.github/workflows/macos.yml @@ -52,8 +52,6 @@ jobs: CONDA_ENV_NAME: unset # See notes about $CONDA below. CONDA_DIR: unset - # Use simple output for conda. - CONDA_NO_PROGRESS_BARS: 1 # When parallel jobs are used, group the output to make debugging easier. MAKEFLAGS: -Oline diff --git a/Makefile b/Makefile index 9a8427fa28..6172491fad 100644 --- a/Makefile +++ b/Makefile @@ -5,9 +5,6 @@ CONDA_ENV_NAME ?= mlos PYTHON_VERSION := $(shell echo "${CONDA_ENV_NAME}" | sed -r -e 's/^mlos[-]?//') ENV_YML := conda-envs/${CONDA_ENV_NAME}.yml -CONDA_NO_PROGRESS_BARS ?= 1 -export CONDA_NO_PROGRESS_BARS - # Find the non-build python files we should consider as rule dependencies. # Do a single find and multiple filters for better performance. REPO_FILES := $(shell find . -type f 2>/dev/null | egrep -v -e '^./(mlos_(core|bench|viz)/)?build/' -e '^./doc/source/' -e '^./doc/build/') From 9965602cc44abd3cbbe58681cbb00b9d554307cc Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Thu, 17 Jul 2025 20:33:44 +0000 Subject: [PATCH 06/76] try a suggestion --- .devcontainer/Dockerfile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index cb63cd34e4..4a5e27ae47 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -81,8 +81,7 @@ RUN echo "Setup miniconda" \ && /opt/conda/bin/conda config --set channel_priority strict \ && /opt/conda/bin/conda info \ && /opt/conda/bin/conda config --show \ - && /opt/conda/bin/conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main \ - && /opt/conda/bin/conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r \ + && /opt/conda/bin/conda tos accept --override-channels --channel default \ && /opt/conda/bin/conda update -v -y -n base -c conda-forge -c defaults --all \ && /opt/conda/bin/conda list -n base \ && /opt/conda/bin/conda install -v -y -n base -c conda-forge -c defaults conda-libmamba-solver \ From 57b673a247bbc791ccadeafd8b322aa06ddc5e06 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Thu, 17 Jul 2025 20:38:05 +0000 Subject: [PATCH 07/76] fixups --- .devcontainer/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 4a5e27ae47..e127c4f1a0 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -81,7 +81,7 @@ RUN echo "Setup miniconda" \ && /opt/conda/bin/conda config --set channel_priority strict \ && /opt/conda/bin/conda info \ && /opt/conda/bin/conda config --show \ - && /opt/conda/bin/conda tos accept 
--override-channels --channel default \ + && /opt/conda/bin/conda tos accept --override-channels --channel defaults \ && /opt/conda/bin/conda update -v -y -n base -c conda-forge -c defaults --all \ && /opt/conda/bin/conda list -n base \ && /opt/conda/bin/conda install -v -y -n base -c conda-forge -c defaults conda-libmamba-solver \ From ecebe33aaad5848d8df7e346cb122f84f6ce470a Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Thu, 18 Sep 2025 14:59:03 -0500 Subject: [PATCH 08/76] type fixups --- mlos_bench/mlos_bench/optimizers/mlos_core_optimizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlos_bench/mlos_bench/optimizers/mlos_core_optimizer.py b/mlos_bench/mlos_bench/optimizers/mlos_core_optimizer.py index 0ef9751228..ba3cfb33c8 100644 --- a/mlos_bench/mlos_bench/optimizers/mlos_core_optimizer.py +++ b/mlos_bench/mlos_bench/optimizers/mlos_core_optimizer.py @@ -285,7 +285,7 @@ def bulk_register( if status is not None: # Select only the completed trials, set scores for failed trials to +inf. - df_status = pd.Series(status) + df_status = pd.Series(list(status), dtype=object) # TODO: Be more flexible with values used for failed trials (not just +inf). # Issue: https://github.com/microsoft/MLOS/issues/523 df_scores[df_status != Status.SUCCEEDED] = float("inf") From e7f524d8eee6715bb5212669db6b12e72232fc0d Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Thu, 18 Sep 2025 20:04:04 +0000 Subject: [PATCH 09/76] type checking fixups --- .../b61aa446e724_support_fractional_seconds_with_mysql.py | 4 ++-- mlos_bench/mlos_bench/storage/sql/schema.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlos_bench/mlos_bench/storage/sql/alembic/versions/b61aa446e724_support_fractional_seconds_with_mysql.py b/mlos_bench/mlos_bench/storage/sql/alembic/versions/b61aa446e724_support_fractional_seconds_with_mysql.py index 7e41b93f09..d65a6344c6 100644 --- a/mlos_bench/mlos_bench/storage/sql/alembic/versions/b61aa446e724_support_fractional_seconds_with_mysql.py +++ b/mlos_bench/mlos_bench/storage/sql/alembic/versions/b61aa446e724_support_fractional_seconds_with_mysql.py @@ -34,8 +34,8 @@ def _mysql_datetime(*, with_fsp: bool = False) -> mysql.DATETIME: See for details. """ if with_fsp: - return mysql.DATETIME(fsp=6) # type: ignore[no-untyped-call] - return mysql.DATETIME() # type: ignore[no-untyped-call] + return mysql.DATETIME(fsp=6) + return mysql.DATETIME() def upgrade() -> None: diff --git a/mlos_bench/mlos_bench/storage/sql/schema.py b/mlos_bench/mlos_bench/storage/sql/schema.py index 475a772312..23f5676f93 100644 --- a/mlos_bench/mlos_bench/storage/sql/schema.py +++ b/mlos_bench/mlos_bench/storage/sql/schema.py @@ -56,7 +56,7 @@ def _mysql_datetime_with_fsp() -> mysql.DATETIME: Split out to allow single mypy ignore. See for details. 
""" - return mysql.DATETIME(fsp=6) # type: ignore[no-untyped-call] + return mysql.DATETIME(fsp=6) class _DDL: From 1b490aee06aec6e5929d241a8f6af5dfc0922605 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Thu, 18 Sep 2025 20:04:14 +0000 Subject: [PATCH 10/76] use conda by default --- .vscode/settings.json | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index a01d1d6418..35b93cdda0 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -176,5 +176,7 @@ "githubPullRequests.experimental.chat": true, "github.copilot.chat.codesearch.enabled": true, "github.copilot.chat.copilotDebugCommand.enabled": true, - "github.copilot.chat.reviewSelection.enabled": true + "github.copilot.chat.reviewSelection.enabled": true, + "python-envs.defaultEnvManager": "ms-python.python:conda", + "python-envs.defaultPackageManager": "ms-python.python:conda" } From 9ad779473756392a1a0f0044ea3e25bfb4ed2890 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Mon, 22 Sep 2025 19:38:47 +0000 Subject: [PATCH 11/76] small pyright fixups --- conda-envs/mlos.yml | 2 +- mlos_viz/mlos_viz/base.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/conda-envs/mlos.yml b/conda-envs/mlos.yml index 25045bcdfc..65505b9efd 100644 --- a/conda-envs/mlos.yml +++ b/conda-envs/mlos.yml @@ -28,7 +28,7 @@ dependencies: - pylint==3.3.7 - tomlkit - mypy==1.15.0 - - pyright==1.1.400 + - pyright==1.1.405 - pandas-stubs - types-beautifulsoup4 - types-colorama diff --git a/mlos_viz/mlos_viz/base.py b/mlos_viz/mlos_viz/base.py index 2350e99470..8eca6e653b 100644 --- a/mlos_viz/mlos_viz/base.py +++ b/mlos_viz/mlos_viz/base.py @@ -30,6 +30,8 @@ def _get_kwarg_defaults(target: Callable, **kwargs: Any) -> dict[str, Any]: Note: this only works with non-positional kwargs (e.g., those after a * arg). """ target_kwargs = {} + if target.__kwdefaults__ is None: + return target_kwargs for kword in target.__kwdefaults__: # or {} # intentionally omitted for now if kword in kwargs: target_kwargs[kword] = kwargs[kword] From 3c0aa23f2e382248b28ba634041cac6b4f397a96 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Mon, 22 Sep 2025 20:19:25 +0000 Subject: [PATCH 12/76] Install pre-built pyrfr from conda to workaround build error. See Also: https://github.com/microsoft/MLOS/issues/1001 --- conda-envs/mlos-3.10.yml | 3 +++ conda-envs/mlos-3.11.yml | 3 +++ conda-envs/mlos-3.12.yml | 3 +++ conda-envs/mlos-3.13.yml | 3 +++ conda-envs/mlos.yml | 3 +++ 5 files changed, 15 insertions(+) diff --git a/conda-envs/mlos-3.10.yml b/conda-envs/mlos-3.10.yml index 75f7a40022..2fe9aef78c 100644 --- a/conda-envs/mlos-3.10.yml +++ b/conda-envs/mlos-3.10.yml @@ -20,6 +20,9 @@ dependencies: - python=3.10 # See comments in mlos.yml. #- gcc_linux-64 + # Install an SMAC requirement pre-compiled from conda-forge. + # See Also: https://github.com/microsoft/MLOS/issues/1001 + - pyrfr>=0.9.0 - pip: - bump2version - check-jsonschema diff --git a/conda-envs/mlos-3.11.yml b/conda-envs/mlos-3.11.yml index 4a52bfae0a..1cdecc5119 100644 --- a/conda-envs/mlos-3.11.yml +++ b/conda-envs/mlos-3.11.yml @@ -20,6 +20,9 @@ dependencies: - python=3.11 # See comments in mlos.yml. #- gcc_linux-64 + # Install an SMAC requirement pre-compiled from conda-forge. 
+ # See Also: https://github.com/microsoft/MLOS/issues/1001 + - pyrfr>=0.9.0 - pip: - bump2version - check-jsonschema diff --git a/conda-envs/mlos-3.12.yml b/conda-envs/mlos-3.12.yml index 2a1a33f079..ac01f11712 100644 --- a/conda-envs/mlos-3.12.yml +++ b/conda-envs/mlos-3.12.yml @@ -22,6 +22,9 @@ dependencies: - python=3.12 # See comments in mlos.yml. #- gcc_linux-64 + # Install an SMAC requirement pre-compiled from conda-forge. + # See Also: https://github.com/microsoft/MLOS/issues/1001 + - pyrfr>=0.9.0 - pip: - bump2version - check-jsonschema diff --git a/conda-envs/mlos-3.13.yml b/conda-envs/mlos-3.13.yml index 5887328dd7..9e668db065 100644 --- a/conda-envs/mlos-3.13.yml +++ b/conda-envs/mlos-3.13.yml @@ -22,6 +22,9 @@ dependencies: - python=3.13 # See comments in mlos.yml. #- gcc_linux-64 + # Install an SMAC requirement pre-compiled from conda-forge. + # See Also: https://github.com/microsoft/MLOS/issues/1001 + - pyrfr>=0.9.0 - pip: - bump2version - check-jsonschema diff --git a/conda-envs/mlos.yml b/conda-envs/mlos.yml index 65505b9efd..ee05dfe1d3 100644 --- a/conda-envs/mlos.yml +++ b/conda-envs/mlos.yml @@ -18,6 +18,9 @@ dependencies: # FIXME: Temporarily avoid broken libpq that's missing client headers. - libpq<17.0 - python + # Install an SMAC requirement pre-compiled from conda-forge. + # See Also: https://github.com/microsoft/MLOS/issues/1001 + - pyrfr>=0.9.0 - pip: - bump2version - check-jsonschema From a8d9ccd38679fee35dec97722cd0c0bdcaf76943 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Mon, 22 Sep 2025 15:36:35 -0500 Subject: [PATCH 13/76] ignore a deprecation warning in matplotlib --- mlos_viz/mlos_viz/dabl.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mlos_viz/mlos_viz/dabl.py b/mlos_viz/mlos_viz/dabl.py index e0b033b7a8..7ad636e8c5 100644 --- a/mlos_viz/mlos_viz/dabl.py +++ b/mlos_viz/mlos_viz/dabl.py @@ -101,3 +101,10 @@ def ignore_plotter_warnings() -> None: module="dabl", message="The legendHandles attribute was deprecated in Matplotlib 3.7 and will be removed", ) + + warnings.filterwarnings( + "ignore", + module="matplotlib", + category=DeprecationWarning, + message="'mode' parameter is deprecated and will be removed in Pillow 13", + ) From 186203706730771d778f015ee50751f68575f407 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Mon, 22 Sep 2025 15:40:59 -0500 Subject: [PATCH 14/76] fixups --- mlos_viz/mlos_viz/base.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mlos_viz/mlos_viz/base.py b/mlos_viz/mlos_viz/base.py index 8eca6e653b..7358554717 100644 --- a/mlos_viz/mlos_viz/base.py +++ b/mlos_viz/mlos_viz/base.py @@ -30,9 +30,7 @@ def _get_kwarg_defaults(target: Callable, **kwargs: Any) -> dict[str, Any]: Note: this only works with non-positional kwargs (e.g., those after a * arg). 
""" target_kwargs = {} - if target.__kwdefaults__ is None: - return target_kwargs - for kword in target.__kwdefaults__: # or {} # intentionally omitted for now + for kword in target.__kwdefaults__ or {}: if kword in kwargs: target_kwargs[kword] = kwargs[kword] return target_kwargs From b2530a3877b4cdf05386d89462488f82f687c0d9 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Mon, 22 Sep 2025 15:58:20 -0500 Subject: [PATCH 15/76] ignore more warnings --- mlos_viz/mlos_viz/base.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mlos_viz/mlos_viz/base.py b/mlos_viz/mlos_viz/base.py index 7358554717..626e6c6979 100644 --- a/mlos_viz/mlos_viz/base.py +++ b/mlos_viz/mlos_viz/base.py @@ -59,6 +59,13 @@ def ignore_plotter_warnings() -> None: ), ) + warnings.filterwarnings( + "ignore", + module="matplotlib", + category=DeprecationWarning, + message="'mode' parameter is deprecated and will be removed in Pillow 13", + ) + def _add_groupby_desc_column( results_df: pandas.DataFrame, From d914bcc4f540856bbee671a58ec75c9c25be5206 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Mon, 22 Sep 2025 21:47:18 +0000 Subject: [PATCH 16/76] Avoid Debian trixie for now since there's no azure-cli package yet. Closes #1002. --- .devcontainer/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index e127c4f1a0..d6e3a159a1 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -1,7 +1,8 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -FROM mcr.microsoft.com/vscode/devcontainers/base AS base +# TODO: FIXME: https://github.com/microsoft/MLOS/issues/1002 +FROM mcr.microsoft.com/vscode/devcontainers/base:bookworm AS base # Add some additional packages for the devcontainer terminal environment. 
USER root From b6b5561bdf05f3d5a42f98674645ef347536d8ad Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Mon, 22 Sep 2025 21:47:36 +0000 Subject: [PATCH 17/76] Moby has also been removed from Debian Trixie --- .devcontainer/devcontainer.json | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index f9cbbc4a38..ee56ba926f 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -31,7 +31,9 @@ "features": { "ghcr.io/devcontainers/features/azure-cli:1": {}, //"ghcr.io/devcontainers/features/conda:1": {}, - "ghcr.io/devcontainers/features/docker-outside-of-docker:1": {}, + "ghcr.io/devcontainers/features/docker-outside-of-docker:1": { + "moby": false + }, "ghcr.io/devcontainers/features/github-cli:1": {}, "ghcr.io/stuartleeks/dev-container-features/azure-cli-persistence:0": {}, "ghcr.io/stuartleeks/dev-container-features/shell-history:0": {} From b059ac384b0aeea3cd5d8be50496711510e8d99f Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 23 Sep 2025 16:49:31 +0000 Subject: [PATCH 18/76] add more debug logging for missing docker support --- .github/workflows/devcontainer.yml | 2 +- mlos_bench/mlos_bench/tests/__init__.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/devcontainer.yml b/.github/workflows/devcontainer.yml index 44ac8ee424..495645ede0 100644 --- a/.github/workflows/devcontainer.yml +++ b/.github/workflows/devcontainer.yml @@ -78,7 +78,7 @@ jobs: echo "NO_CACHE=true" >> $GITHUB_ENV - name: Run pytest with debug logging enabled for nightly runs - if: github.event_name == 'schedule' + #if: github.event_name == 'schedule' run: | echo "PYTEST_EXTRA_OPTIONS=--log-level=DEBUG" >> $GITHUB_ENV diff --git a/mlos_bench/mlos_bench/tests/__init__.py b/mlos_bench/mlos_bench/tests/__init__.py index 778d89836d..0f271b169d 100644 --- a/mlos_bench/mlos_bench/tests/__init__.py +++ b/mlos_bench/mlos_bench/tests/__init__.py @@ -52,8 +52,10 @@ if cmd.returncode != 0 or not any( line for line in stdout.splitlines() if "Platform" in line and "linux" in line ): - debug("Docker is available but missing support for targeting linux platform.") + debug("Docker is available but missing buildx support for targeting linux platform.") DOCKER = None +else: + debug("Docker is not available on this system.") requires_docker = pytest.mark.skipif( not DOCKER, reason="Docker with Linux support is not available on this system.", From 021f45162f730d38209a6d6677c78d5d17b234bc Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 23 Sep 2025 16:56:42 +0000 Subject: [PATCH 19/76] more debugging --- mlos_bench/mlos_bench/tests/__init__.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mlos_bench/mlos_bench/tests/__init__.py b/mlos_bench/mlos_bench/tests/__init__.py index 0f271b169d..24fc393348 100644 --- a/mlos_bench/mlos_bench/tests/__init__.py +++ b/mlos_bench/mlos_bench/tests/__init__.py @@ -55,7 +55,7 @@ debug("Docker is available but missing buildx support for targeting linux platform.") DOCKER = None else: - debug("Docker is not available on this system.") + debug("docker is not available on this system.") requires_docker = pytest.mark.skipif( not DOCKER, reason="Docker with Linux support is not available on this system.", @@ -64,6 +64,8 @@ # A decorator for tests that require ssh. # Use with @requires_ssh above a test_...() function. 
SSH = shutil.which("ssh") +if not SSH: + debug("ssh is not available on this system.") requires_ssh = pytest.mark.skipif(not SSH, reason="ssh is not available on this system.") # A common seed to use to avoid tracking down race conditions and intermingling @@ -114,7 +116,7 @@ def wait_docker_service_healthy( docker_services: DockerServices, project_name: str, service_name: str, - timeout: float = 30.0, + timeout: float = 60.0, ) -> None: """Wait until a docker service is healthy.""" docker_services.wait_until_responsive( @@ -128,7 +130,7 @@ def wait_docker_service_socket(docker_services: DockerServices, hostname: str, p """Wait until a docker service is ready.""" docker_services.wait_until_responsive( check=lambda: check_socket(hostname, port), - timeout=30.0, + timeout=60.0, pause=0.5, ) From ac57951e9840e17626250819899708dbf9302c8f Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 23 Sep 2025 16:57:08 +0000 Subject: [PATCH 20/76] comments --- mlos_bench/mlos_bench/tests/conftest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mlos_bench/mlos_bench/tests/conftest.py b/mlos_bench/mlos_bench/tests/conftest.py index 59dfbff9d2..59ed4b97bd 100644 --- a/mlos_bench/mlos_bench/tests/conftest.py +++ b/mlos_bench/mlos_bench/tests/conftest.py @@ -104,6 +104,8 @@ def docker_compose_file(pytestconfig: pytest.Config) -> list[str]: Path to the docker-compose file. """ _ = pytestconfig # unused + # TODO: move this closer to the necessary submodules so that different + # docker tests can run independently. return [ os.path.join(os.path.dirname(__file__), "services", "remote", "ssh", "docker-compose.yml"), os.path.join(os.path.dirname(__file__), "storage", "sql", "docker-compose.yml"), From 4462cdff4272cfdb9694a4571986a7bf81367efb Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 23 Sep 2025 17:44:18 +0000 Subject: [PATCH 21/76] fixup for local testing --- .devcontainer/scripts/run-devcontainer.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.devcontainer/scripts/run-devcontainer.sh b/.devcontainer/scripts/run-devcontainer.sh index dfd7d3f65d..5cefe57b69 100755 --- a/.devcontainer/scripts/run-devcontainer.sh +++ b/.devcontainer/scripts/run-devcontainer.sh @@ -50,5 +50,6 @@ docker run -it --rm \ --env http_proxy="${http_proxy:-}" \ --env https_proxy="${https_proxy:-}" \ --env no_proxy="${no_proxy:-}" \ + --add-host host.docker.internal:host-gateway \ mlos-devcontainer \ $* From 74d94cf9a971a3af4f67e5ec1591eb821be9c889 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 23 Sep 2025 17:48:35 +0000 Subject: [PATCH 22/76] log docker missing warnings --- mlos_bench/mlos_bench/tests/__init__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mlos_bench/mlos_bench/tests/__init__.py b/mlos_bench/mlos_bench/tests/__init__.py index 24fc393348..3c5b084b88 100644 --- a/mlos_bench/mlos_bench/tests/__init__.py +++ b/mlos_bench/mlos_bench/tests/__init__.py @@ -13,7 +13,7 @@ import shutil import socket from datetime import tzinfo -from logging import debug, warning +from logging import warning from subprocess import run import pytest @@ -52,10 +52,10 @@ if cmd.returncode != 0 or not any( line for line in stdout.splitlines() if "Platform" in line and "linux" in line ): - debug("Docker is available but missing buildx support for targeting linux platform.") + warning("Docker is available but missing buildx support for targeting linux platform.") DOCKER = None -else: - debug("docker is not available on this system.") +if not DOCKER: + warning("Docker is not available on this 
system. Some tests will be skipped.") requires_docker = pytest.mark.skipif( not DOCKER, reason="Docker with Linux support is not available on this system.", @@ -65,7 +65,7 @@ # Use with @requires_ssh above a test_...() function. SSH = shutil.which("ssh") if not SSH: - debug("ssh is not available on this system.") + warning("ssh is not available on this system. Some tests will be skipped.") requires_ssh = pytest.mark.skipif(not SSH, reason="ssh is not available on this system.") # A common seed to use to avoid tracking down race conditions and intermingling From 06a982697a32bad380ce26014fd53785f343541b Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 23 Sep 2025 17:49:40 +0000 Subject: [PATCH 23/76] Split docker test fixtures out so they can come up and down in parallel. --- mlos_bench/mlos_bench/tests/conftest.py | 144 --------------- .../mlos_bench/tests/docker_fixtures_util.py | 172 ++++++++++++++++++ .../tests/services/remote/ssh/conftest.py | 9 + .../tests/services/remote/ssh/fixtures.py | 35 ++++ .../mlos_bench/tests/storage/conftest.py | 9 + .../mlos_bench/tests/storage/sql/fixtures.py | 33 ++++ 6 files changed, 258 insertions(+), 144 deletions(-) create mode 100644 mlos_bench/mlos_bench/tests/docker_fixtures_util.py diff --git a/mlos_bench/mlos_bench/tests/conftest.py b/mlos_bench/mlos_bench/tests/conftest.py index 59ed4b97bd..a4d328ebcc 100644 --- a/mlos_bench/mlos_bench/tests/conftest.py +++ b/mlos_bench/mlos_bench/tests/conftest.py @@ -4,15 +4,9 @@ # """Common fixtures for mock TunableGroups and Environment objects.""" -import os import sys -from collections.abc import Generator -from typing import Any import pytest -from fasteners import InterProcessLock, InterProcessReaderWriterLock -from pytest_docker.plugin import Services as DockerServices -from pytest_docker.plugin import get_docker_services from mlos_bench.environments.mock_env import MockEnv from mlos_bench.tests import SEED, resolve_host_name, tunable_groups_fixtures @@ -73,141 +67,3 @@ def mock_env_no_noise(tunable_groups: TunableGroups) -> MockEnv: }, tunables=tunable_groups, ) - - -# Fixtures to configure the pytest-docker plugin. -@pytest.fixture(scope="session") -def docker_setup() -> list[str] | str: - """Setup for docker services.""" - if sys.platform == "darwin" or os.environ.get("HOST_OSTYPE", "").lower().startswith("darwin"): - # Workaround an oddity on macOS where the "docker-compose up" - # command always recreates the containers. - # That leads to races when multiple workers are trying to - # start and use the same services. - return ["up --build -d --no-recreate"] - else: - return ["up --build -d"] - - -@pytest.fixture(scope="session") -def docker_compose_file(pytestconfig: pytest.Config) -> list[str]: - """ - Returns the path to the docker-compose file. - - Parameters - ---------- - pytestconfig : pytest.Config - - Returns - ------- - str - Path to the docker-compose file. - """ - _ = pytestconfig # unused - # TODO: move this closer to the necessary submodules so that different - # docker tests can run independently. - return [ - os.path.join(os.path.dirname(__file__), "services", "remote", "ssh", "docker-compose.yml"), - os.path.join(os.path.dirname(__file__), "storage", "sql", "docker-compose.yml"), - # Add additional configs as necessary here. - ] - - -@pytest.fixture(scope="session") -def docker_compose_project_name(short_testrun_uid: str) -> str: - """ - Returns the name of the docker-compose project. - - Returns - ------- - str - Name of the docker-compose project. 
- """ - # Use the xdist testrun UID to ensure that the docker-compose project name - # is unique across sessions, but shared amongst workers. - return f"mlos_bench-test-{short_testrun_uid}" - - -@pytest.fixture(scope="session") -def docker_services_lock( - shared_temp_dir: str, - short_testrun_uid: str, -) -> InterProcessReaderWriterLock: - """ - Gets a pytest session lock for xdist workers to mark when they're using the docker - services. - - Yields - ------ - A lock to ensure that setup/teardown operations don't happen while a - worker is using the docker services. - """ - return InterProcessReaderWriterLock( - f"{shared_temp_dir}/pytest_docker_services-{short_testrun_uid}.lock" - ) - - -@pytest.fixture(scope="session") -def docker_setup_teardown_lock(shared_temp_dir: str, short_testrun_uid: str) -> InterProcessLock: - """ - Gets a pytest session lock between xdist workers for the docker setup/teardown - operations. - - Yields - ------ - A lock to ensure that only one worker is doing setup/teardown at a time. - """ - return InterProcessLock( - f"{shared_temp_dir}/pytest_docker_services-setup-teardown-{short_testrun_uid}.lock" - ) - - -@pytest.fixture(scope="session") -def locked_docker_services( - docker_compose_command: Any, - docker_compose_file: Any, - docker_compose_project_name: Any, - docker_setup: Any, - docker_cleanup: Any, - docker_setup_teardown_lock: InterProcessLock, - docker_services_lock: InterProcessReaderWriterLock, -) -> Generator[DockerServices, Any, None]: - """A locked version of the docker_services fixture to implement xdist single - instance locking. - """ - # pylint: disable=too-many-arguments,too-many-positional-arguments - # Mark the services as in use with the reader lock. - docker_services_lock.acquire_read_lock() - # Acquire the setup lock to prevent multiple setup operations at once. - docker_setup_teardown_lock.acquire() - # This "with get_docker_services(...)"" pattern is in the default fixture. - # We call it instead of docker_services() to avoid pytest complaints about - # calling fixtures directly. - with get_docker_services( - docker_compose_command, - docker_compose_file, - docker_compose_project_name, - docker_setup, - docker_cleanup, - ) as docker_services: - # Release the setup/tear down lock in order to let the setup operation - # continue for other workers (should be a no-op at this point). - docker_setup_teardown_lock.release() - # Yield the services so that tests within this worker can use them. - yield docker_services - # Now tests that use those services get to run on this worker... - # Once the tests are done, release the read lock that marks the services as in use. - docker_services_lock.release_read_lock() - # Now as we prepare to execute the cleanup code on context exit we need - # to acquire the setup/teardown lock again. - # First we attempt to get the write lock so that we wait for other - # readers to finish and guard against a lock inversion possibility. - docker_services_lock.acquire_write_lock() - # Next, acquire the setup/teardown lock - # First one here is the one to do actual work, everyone else is basically a no-op. - # Upon context exit, we should execute the docker_cleanup code. - # And try to get the setup/tear down lock again. - docker_setup_teardown_lock.acquire() - # Finally, after the docker_cleanup code has finished, remove both locks. 
- docker_setup_teardown_lock.release() - docker_services_lock.release_write_lock() diff --git a/mlos_bench/mlos_bench/tests/docker_fixtures_util.py b/mlos_bench/mlos_bench/tests/docker_fixtures_util.py new file mode 100644 index 0000000000..834c21f185 --- /dev/null +++ b/mlos_bench/mlos_bench/tests/docker_fixtures_util.py @@ -0,0 +1,172 @@ +# +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# +"""Helper functions for various docker test fixtures. + +Test functions needing to use these should import them and then add them to +their namespace in a conftest.py file. + +The intent of keeping these separate from conftest.py is to allow individual +test to setup their own docker-compose configurations that are independent. + +As such, each conftest.py should set their own docker_compose_file fixture +pointing to the appropriate docker-compose.yml file(s) and set a unique +docker_compose_project_name. +""" +# pylint: disable=redefined-outer-name + +import os +import sys + +from collections.abc import Generator +from typing import Any +import pytest +from fasteners import InterProcessLock, InterProcessReaderWriterLock +from pytest_docker.plugin import Services as DockerServices +from pytest_docker.plugin import get_docker_services + +# Fixtures to configure the pytest-docker plugin. +@pytest.fixture(scope="session") +def docker_setup() -> list[str] | str: + """Setup for docker services.""" + if sys.platform == "darwin" or os.environ.get("HOST_OSTYPE", "").lower().startswith("darwin"): + # Workaround an oddity on macOS where the "docker-compose up" + # command always recreates the containers. + # That leads to races when multiple workers are trying to + # start and use the same services. + return ["up --build -d --no-recreate"] + else: + return ["up --build -d"] + + +@pytest.fixture(scope="session") +def docker_compose_file(pytestconfig: pytest.Config) -> list[str]: + """ + Fixture for the path to the docker-compose file. + + Parameters + ---------- + pytestconfig : pytest.Config + + Returns + ------- + str + Path to the docker-compose file. + """ + _ = pytestconfig # unused + # Add additional configs as necessary here. + # return [] + raise NotImplementedError("Please implement docker_compose_file in your conftest.py") + + +@pytest.fixture(scope="session") +def docker_compose_project_name(short_testrun_uid: str) -> str: + """ + Fixture for the name of the docker-compose project. + + Returns + ------- + str + Name of the docker-compose project. + """ + # Use the xdist testrun UID to ensure that the docker-compose project name + # is unique across sessions, but shared amongst workers. + # return f"mlos_bench-test-{short_testrun_uid}-{__name__.replace('.', '-')}" + raise NotImplementedError("Please implement docker_compose_project_name in your conftest.py") + + +@pytest.fixture(scope="session") +def docker_services_lock( + shared_temp_dir: str, + short_testrun_uid: str, +) -> InterProcessReaderWriterLock: + """ + Gets a pytest session lock for xdist workers to mark when they're using the docker + services. + + Yields + ------ + A lock to ensure that setup/teardown operations don't happen while a + worker is using the docker services. 
+ """ + return InterProcessReaderWriterLock( + f"{shared_temp_dir}/pytest_docker_services-{short_testrun_uid}.lock" + ) + + +@pytest.fixture(scope="session") +def docker_setup_teardown_lock(shared_temp_dir: str, short_testrun_uid: str) -> InterProcessLock: + """ + Gets a pytest session lock between xdist workers for the docker setup/teardown + operations. + + Yields + ------ + A lock to ensure that only one worker is doing setup/teardown at a time. + """ + return InterProcessLock( + f"{shared_temp_dir}/pytest_docker_services-setup-teardown-{short_testrun_uid}.lock" + ) + + +@pytest.fixture(scope="session") +def locked_docker_services( + docker_compose_command: Any, + docker_compose_file: Any, + docker_compose_project_name: Any, + docker_setup: Any, + docker_cleanup: Any, + docker_setup_teardown_lock: InterProcessLock, + docker_services_lock: InterProcessReaderWriterLock, +) -> Generator[DockerServices, Any, None]: + """A locked version of the docker_services fixture to implement xdist single + instance locking. + """ + # pylint: disable=too-many-arguments,too-many-positional-arguments + # Mark the services as in use with the reader lock. + docker_services_lock.acquire_read_lock() + # Acquire the setup lock to prevent multiple setup operations at once. + docker_setup_teardown_lock.acquire() + # This "with get_docker_services(...)"" pattern is in the default fixture. + # We call it instead of docker_services() to avoid pytest complaints about + # calling fixtures directly. + with get_docker_services( + docker_compose_command, + docker_compose_file, + docker_compose_project_name, + docker_setup, + docker_cleanup, + ) as docker_services: + # Release the setup/tear down lock in order to let the setup operation + # continue for other workers (should be a no-op at this point). + docker_setup_teardown_lock.release() + # Yield the services so that tests within this worker can use them. + yield docker_services + # Now tests that use those services get to run on this worker... + # Once the tests are done, release the read lock that marks the services as in use. + docker_services_lock.release_read_lock() + # Now as we prepare to execute the cleanup code on context exit we need + # to acquire the setup/teardown lock again. + # First we attempt to get the write lock so that we wait for other + # readers to finish and guard against a lock inversion possibility. + docker_services_lock.acquire_write_lock() + # Next, acquire the setup/teardown lock + # First one here is the one to do actual work, everyone else is basically a no-op. + # Upon context exit, we should execute the docker_cleanup code. + # And try to get the setup/tear down lock again. + docker_setup_teardown_lock.acquire() + # Finally, after the docker_cleanup code has finished, remove both locks. 
+ docker_setup_teardown_lock.release() + docker_services_lock.release_write_lock() + + +__all__ = [ + # These two should be implemented in the conftest.py of the local test suite + #"docker_compose_file", + #"docker_compose_project_name", + "docker_setup", + "docker_services_lock", + "docker_setup_teardown_lock", + "locked_docker_services", +] diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/conftest.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/conftest.py index 83e98c44a2..7efb937808 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/conftest.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/conftest.py @@ -4,9 +4,18 @@ # """Fixtures for the SSH service tests.""" +import mlos_bench.tests.docker_fixtures_util as docker_fixtures_util import mlos_bench.tests.services.remote.ssh.fixtures as ssh_fixtures # Expose some of those as local names so they can be picked up as fixtures by pytest. +docker_compose_file = ssh_fixtures.docker_compose_file +docker_compose_project_name = ssh_fixtures.docker_compose_project_name + +docker_setup = docker_fixtures_util.docker_setup +docker_services_lock = docker_fixtures_util.docker_services_lock +docker_setup_teardown_lock = docker_fixtures_util.docker_setup_teardown_lock +locked_docker_services = docker_fixtures_util.locked_docker_services + ssh_test_server = ssh_fixtures.ssh_test_server alt_test_server = ssh_fixtures.alt_test_server reboot_test_server = ssh_fixtures.reboot_test_server diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py index fda84e9f79..f16cf58087 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py @@ -29,6 +29,41 @@ # pylint: disable=redefined-outer-name +@pytest.fixture(scope="session") +def docker_compose_file(pytestconfig: pytest.Config) -> list[str]: + """ + Fixture for the path to the docker-compose file. + + Parameters + ---------- + pytestconfig : pytest.Config + + Returns + ------- + str + Path to the docker-compose file. + """ + _ = pytestconfig # unused + return [ + os.path.join(os.path.dirname(__file__), "docker-compose.yml"), + ] + + +@pytest.fixture(scope="session") +def docker_compose_project_name(short_testrun_uid: str) -> str: + """ + Returns the name of the docker-compose project. + + Returns + ------- + str + Name of the docker-compose project. + """ + # Use the xdist testrun UID to ensure that the docker-compose project name + # is unique across sessions, but shared amongst workers. + return f"mlos_bench-test-{short_testrun_uid}-{__name__.replace('.', '-')}" + + @pytest.fixture(scope="session") def ssh_test_server( docker_hostname: str, diff --git a/mlos_bench/mlos_bench/tests/storage/conftest.py b/mlos_bench/mlos_bench/tests/storage/conftest.py index acdbe5024a..5067af1744 100644 --- a/mlos_bench/mlos_bench/tests/storage/conftest.py +++ b/mlos_bench/mlos_bench/tests/storage/conftest.py @@ -4,6 +4,7 @@ # """Export test fixtures for mlos_bench storage.""" +import mlos_bench.tests.docker_fixtures_util as docker_fixtures_util import mlos_bench.tests.storage.sql.fixtures as sql_storage_fixtures # NOTE: For future storage implementation additions, we can refactor this to use @@ -11,6 +12,14 @@ # same. # Expose some of those as local names so they can be picked up as fixtures by pytest. 
+docker_compose_file = sql_storage_fixtures.docker_compose_file +docker_compose_project_name = sql_storage_fixtures.docker_compose_project_name + +docker_setup = docker_fixtures_util.docker_setup +docker_services_lock = docker_fixtures_util.docker_services_lock +docker_setup_teardown_lock = docker_fixtures_util.docker_setup_teardown_lock +locked_docker_services = docker_fixtures_util.locked_docker_services + mysql_storage_info = sql_storage_fixtures.mysql_storage_info mysql_storage = sql_storage_fixtures.mysql_storage postgres_storage_info = sql_storage_fixtures.postgres_storage_info diff --git a/mlos_bench/mlos_bench/tests/storage/sql/fixtures.py b/mlos_bench/mlos_bench/tests/storage/sql/fixtures.py index 0b25f963a9..793c8ed38a 100644 --- a/mlos_bench/mlos_bench/tests/storage/sql/fixtures.py +++ b/mlos_bench/mlos_bench/tests/storage/sql/fixtures.py @@ -53,6 +53,39 @@ PERSISTENT_SQL_STORAGE_FIXTURES.extend(DOCKER_DBMS_FIXTURES) +@pytest.fixture(scope="session") +def docker_compose_file(pytestconfig: pytest.Config) -> list[str]: + """ + Fixture for the path to the docker-compose file. + + Parameters + ---------- + pytestconfig : pytest.Config + + Returns + ------- + str + Path to the docker-compose file. + """ + _ = pytestconfig # unused + return [ + os.path.join(os.path.dirname(__file__), "docker-compose.yml"), + ] + + +@pytest.fixture(scope="session") +def docker_compose_project_name(short_testrun_uid: str) -> str: + """ + Fixture for the name of the docker-compose project. + + Returns + ------- + str + Name of the docker-compose project. + """ + return f"mlos_bench-test-{short_testrun_uid}-{__name__.replace('.', '-')}" + + @pytest.fixture(scope="session") def mysql_storage_info( docker_hostname: str, From cba92cd924d343b4910a964863602179ceccc819 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 23 Sep 2025 17:52:51 +0000 Subject: [PATCH 24/76] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../mlos_bench/tests/docker_fixtures_util.py | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/mlos_bench/mlos_bench/tests/docker_fixtures_util.py b/mlos_bench/mlos_bench/tests/docker_fixtures_util.py index 834c21f185..88cfa354e8 100644 --- a/mlos_bench/mlos_bench/tests/docker_fixtures_util.py +++ b/mlos_bench/mlos_bench/tests/docker_fixtures_util.py @@ -2,30 +2,31 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. # -"""Helper functions for various docker test fixtures. +""" +Helper functions for various docker test fixtures. -Test functions needing to use these should import them and then add them to -their namespace in a conftest.py file. +Test functions needing to use these should import them and then add them to their +namespace in a conftest.py file. -The intent of keeping these separate from conftest.py is to allow individual -test to setup their own docker-compose configurations that are independent. +The intent of keeping these separate from conftest.py is to allow individual test to +setup their own docker-compose configurations that are independent. -As such, each conftest.py should set their own docker_compose_file fixture -pointing to the appropriate docker-compose.yml file(s) and set a unique -docker_compose_project_name. +As such, each conftest.py should set their own docker_compose_file fixture pointing to +the appropriate docker-compose.yml file(s) and set a unique docker_compose_project_name. 
""" # pylint: disable=redefined-outer-name import os import sys - from collections.abc import Generator from typing import Any + import pytest from fasteners import InterProcessLock, InterProcessReaderWriterLock from pytest_docker.plugin import Services as DockerServices from pytest_docker.plugin import get_docker_services + # Fixtures to configure the pytest-docker plugin. @pytest.fixture(scope="session") def docker_setup() -> list[str] | str: @@ -163,8 +164,8 @@ def locked_docker_services( __all__ = [ # These two should be implemented in the conftest.py of the local test suite - #"docker_compose_file", - #"docker_compose_project_name", + # "docker_compose_file", + # "docker_compose_project_name", "docker_setup", "docker_services_lock", "docker_setup_teardown_lock", From 797ac40b93f7dc0f632a9de121729c85d6ae1388 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 23 Sep 2025 17:59:57 +0000 Subject: [PATCH 25/76] comments and revert to moby --- .devcontainer/Dockerfile | 5 ++++- .devcontainer/devcontainer.json | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index d6e3a159a1..d619fec293 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -1,7 +1,10 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -# TODO: FIXME: https://github.com/microsoft/MLOS/issues/1002 +# TODO: FIXME: Temporarily pin to bookworm until latest (trixie) base image is fully supported. +# See Also: +# - +# - .devcontainer/devcontainer.json FROM mcr.microsoft.com/vscode/devcontainers/base:bookworm AS base # Add some additional packages for the devcontainer terminal environment. diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index ee56ba926f..13e424ba34 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -32,7 +32,11 @@ "ghcr.io/devcontainers/features/azure-cli:1": {}, //"ghcr.io/devcontainers/features/conda:1": {}, "ghcr.io/devcontainers/features/docker-outside-of-docker:1": { - "moby": false + // Note: this may be required when moving to Debian Trixie base image. + // See Also: + // - + // - .devcontainer/Dockerfile + //"moby": false }, "ghcr.io/devcontainers/features/github-cli:1": {}, "ghcr.io/stuartleeks/dev-container-features/azure-cli-persistence:0": {}, From 07924d67aecc4f860c18838a3514cd8176e450fb Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 23 Sep 2025 17:59:57 +0000 Subject: [PATCH 26/76] comments and revert to moby --- .devcontainer/Dockerfile | 5 ++++- .devcontainer/devcontainer.json | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index d6e3a159a1..d619fec293 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -1,7 +1,10 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -# TODO: FIXME: https://github.com/microsoft/MLOS/issues/1002 +# TODO: FIXME: Temporarily pin to bookworm until latest (trixie) base image is fully supported. +# See Also: +# - +# - .devcontainer/devcontainer.json FROM mcr.microsoft.com/vscode/devcontainers/base:bookworm AS base # Add some additional packages for the devcontainer terminal environment. 
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index ee56ba926f..13e424ba34 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -32,7 +32,11 @@ "ghcr.io/devcontainers/features/azure-cli:1": {}, //"ghcr.io/devcontainers/features/conda:1": {}, "ghcr.io/devcontainers/features/docker-outside-of-docker:1": { - "moby": false + // Note: this may be required when moving to Debian Trixie base image. + // See Also: + // - + // - .devcontainer/Dockerfile + //"moby": false }, "ghcr.io/devcontainers/features/github-cli:1": {}, "ghcr.io/stuartleeks/dev-container-features/azure-cli-persistence:0": {}, From 9870d35f779d98927bd4a45724b24b242dab6f13 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 23 Sep 2025 18:05:51 +0000 Subject: [PATCH 27/76] make some more fixtures available --- .../mlos_bench/tests/environments/remote/conftest.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mlos_bench/mlos_bench/tests/environments/remote/conftest.py b/mlos_bench/mlos_bench/tests/environments/remote/conftest.py index b8ea5a2a6b..685b178a70 100644 --- a/mlos_bench/mlos_bench/tests/environments/remote/conftest.py +++ b/mlos_bench/mlos_bench/tests/environments/remote/conftest.py @@ -4,7 +4,16 @@ # """Fixtures for the RemoteEnv tests using SSH Services.""" +import mlos_bench.tests.docker_fixtures_util as docker_fixtures_util import mlos_bench.tests.services.remote.ssh.fixtures as ssh_fixtures # Expose some of those as local names so they can be picked up as fixtures by pytest. +docker_compose_file = ssh_fixtures.docker_compose_file +docker_compose_project_name = ssh_fixtures.docker_compose_project_name + +docker_setup = docker_fixtures_util.docker_setup +docker_services_lock = docker_fixtures_util.docker_services_lock +docker_setup_teardown_lock = docker_fixtures_util.docker_setup_teardown_lock +locked_docker_services = docker_fixtures_util.locked_docker_services + ssh_test_server = ssh_fixtures.ssh_test_server From c775d8914dd2e1d22ea469c1b82a83c81b908bba Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 23 Sep 2025 18:10:23 +0000 Subject: [PATCH 28/76] quotes fixup --- mlos_bench/mlos_bench/tests/docker_fixtures_util.py | 2 +- mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py | 2 +- mlos_bench/mlos_bench/tests/storage/sql/fixtures.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlos_bench/mlos_bench/tests/docker_fixtures_util.py b/mlos_bench/mlos_bench/tests/docker_fixtures_util.py index 88cfa354e8..fe4e753ed5 100644 --- a/mlos_bench/mlos_bench/tests/docker_fixtures_util.py +++ b/mlos_bench/mlos_bench/tests/docker_fixtures_util.py @@ -73,7 +73,7 @@ def docker_compose_project_name(short_testrun_uid: str) -> str: """ # Use the xdist testrun UID to ensure that the docker-compose project name # is unique across sessions, but shared amongst workers. 
- # return f"mlos_bench-test-{short_testrun_uid}-{__name__.replace('.', '-')}" + # return f"""mlos_bench-test-{short_testrun_uid}-{__name__.replace(".", "-")}""" raise NotImplementedError("Please implement docker_compose_project_name in your conftest.py") diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py index f16cf58087..8080f182cc 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py @@ -61,7 +61,7 @@ def docker_compose_project_name(short_testrun_uid: str) -> str: """ # Use the xdist testrun UID to ensure that the docker-compose project name # is unique across sessions, but shared amongst workers. - return f"mlos_bench-test-{short_testrun_uid}-{__name__.replace('.', '-')}" + return f"""mlos_bench-test-{short_testrun_uid}-{__name__.replace(".", "-")}""" @pytest.fixture(scope="session") diff --git a/mlos_bench/mlos_bench/tests/storage/sql/fixtures.py b/mlos_bench/mlos_bench/tests/storage/sql/fixtures.py index 793c8ed38a..7049e6f007 100644 --- a/mlos_bench/mlos_bench/tests/storage/sql/fixtures.py +++ b/mlos_bench/mlos_bench/tests/storage/sql/fixtures.py @@ -83,7 +83,7 @@ def docker_compose_project_name(short_testrun_uid: str) -> str: str Name of the docker-compose project. """ - return f"mlos_bench-test-{short_testrun_uid}-{__name__.replace('.', '-')}" + return f"""mlos_bench-test-{short_testrun_uid}-{__name__.replace(".", "-")}""" @pytest.fixture(scope="session") From 72aec7b9d69525420ae7467bf436d856ad4a1a3b Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 23 Sep 2025 21:12:59 +0000 Subject: [PATCH 29/76] upload the coverage.xml file regardless --- .github/workflows/devcontainer.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/devcontainer.yml b/.github/workflows/devcontainer.yml index 495645ede0..2867816317 100644 --- a/.github/workflows/devcontainer.yml +++ b/.github/workflows/devcontainer.yml @@ -175,6 +175,13 @@ jobs: # Now actually run the tests. docker exec --user vscode --env USER=vscode mlos-devcontainer make CONDA_INFO_LEVEL=-v test + - name: Upload coverage report as build artifact + if: always() # ensures it runs even if tests fail + uses: actions/upload-artifact@v4 + with: + name: coverage-report + path: coverage.xml + - name: Generate and test binary distribution files timeout-minutes: 10 run: | From 9fd68437234dd7d5ac32da2ec32ff34b162c97d7 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 23 Sep 2025 21:31:39 +0000 Subject: [PATCH 30/76] comments and port forwarding for doc viewing --- .devcontainer/devcontainer.json | 5 +++++ .vscode/settings.json | 12 +++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 13e424ba34..5aa3d91102 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -42,6 +42,11 @@ "ghcr.io/stuartleeks/dev-container-features/azure-cli-persistence:0": {}, "ghcr.io/stuartleeks/dev-container-features/shell-history:0": {} }, + "forwardPorts": [ + // Make the nginx instance started as a part of `make doc` available to view + // the coverage results. + 8080 + ], // Use 'postCreateCommand' to run commands after the container is created. // "postCreateCommand": "python --version", // Configure tool-specific properties. 
diff --git a/.vscode/settings.json b/.vscode/settings.json index 35b93cdda0..441bee0e5a 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -178,5 +178,15 @@ "github.copilot.chat.copilotDebugCommand.enabled": true, "github.copilot.chat.reviewSelection.enabled": true, "python-envs.defaultEnvManager": "ms-python.python:conda", - "python-envs.defaultPackageManager": "ms-python.python:conda" + "python-envs.defaultPackageManager": "ms-python.python:conda", + // Make the nginx instance started as a part of `make doc` available to view + // the coverage results. + "remote.localPortHost": "localhost", + "remote.SSH.defaultForwardedPorts": [ + { + "name": "nginx for doc viewing", + "localPort": 8080, + "remotePort": 8080 + } + ] } From 6fbd25b1096359593799bc6f7eab78322d113d72 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 23 Sep 2025 22:45:55 +0000 Subject: [PATCH 31/76] revert --- .github/workflows/devcontainer.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/devcontainer.yml b/.github/workflows/devcontainer.yml index 2867816317..d16f067112 100644 --- a/.github/workflows/devcontainer.yml +++ b/.github/workflows/devcontainer.yml @@ -78,7 +78,7 @@ jobs: echo "NO_CACHE=true" >> $GITHUB_ENV - name: Run pytest with debug logging enabled for nightly runs - #if: github.event_name == 'schedule' + if: github.event_name == 'schedule' run: | echo "PYTEST_EXTRA_OPTIONS=--log-level=DEBUG" >> $GITHUB_ENV From 382ab7896f38ca77529deb47a6fe86d30f860684 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 23 Sep 2025 22:51:46 +0000 Subject: [PATCH 32/76] revertme: temporarily make docker required to see what the issue is in CI --- mlos_bench/mlos_bench/tests/__init__.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/mlos_bench/mlos_bench/tests/__init__.py b/mlos_bench/mlos_bench/tests/__init__.py index 3c5b084b88..322c5005b4 100644 --- a/mlos_bench/mlos_bench/tests/__init__.py +++ b/mlos_bench/mlos_bench/tests/__init__.py @@ -52,10 +52,17 @@ if cmd.returncode != 0 or not any( line for line in stdout.splitlines() if "Platform" in line and "linux" in line ): - warning("Docker is available but missing buildx support for targeting linux platform.") DOCKER = None + warning("Docker is available but missing buildx support for targeting linux platform.") + raise RuntimeError( + "DEBUGGING: " + "Docker is available but missing buildx support for targeting linux platform." + ) if not DOCKER: warning("Docker is not available on this system. Some tests will be skipped.") + raise RuntimeError( + "DEBUGGING: Docker is not available on this system. Some tests will be skipped." + ) requires_docker = pytest.mark.skipif( not DOCKER, reason="Docker with Linux support is not available on this system.", @@ -66,6 +73,9 @@ SSH = shutil.which("ssh") if not SSH: warning("ssh is not available on this system. Some tests will be skipped.") + raise RuntimeError( + "DEBUGGING: ssh is not available on this system. Some tests will be skipped." 
+ ) requires_ssh = pytest.mark.skipif(not SSH, reason="ssh is not available on this system.") # A common seed to use to avoid tracking down race conditions and intermingling From 496fba8fab65ae5277b5e8e7ac7840433fff8abc Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Wed, 24 Sep 2025 18:36:37 -0500 Subject: [PATCH 33/76] more debugging --- .github/workflows/devcontainer.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/devcontainer.yml b/.github/workflows/devcontainer.yml index d16f067112..ae87229f47 100644 --- a/.github/workflows/devcontainer.yml +++ b/.github/workflows/devcontainer.yml @@ -170,7 +170,7 @@ jobs: test_count=$(docker exec --user vscode --env USER=vscode mlos-devcontainer \ conda run -n mlos python -m pytest -svxl -n auto --collect-only --rootdir /workspaces/MLOS -s --cache-clear \ | grep -c '&2; exit 1; fi + # FIXME: UNDO: if [ "${test_count:-0}" -lt 725 ]; then echo "Expected at least 725 tests, got '$test_count'" >&2; exit 1; fi # Now actually run the tests. docker exec --user vscode --env USER=vscode mlos-devcontainer make CONDA_INFO_LEVEL=-v test From 6298fa42321a575de9afa088a56fefd22f051f7e Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 21 Oct 2025 12:09:24 -0500 Subject: [PATCH 34/76] more debug info --- mlos_bench/mlos_bench/tests/__init__.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mlos_bench/mlos_bench/tests/__init__.py b/mlos_bench/mlos_bench/tests/__init__.py index 322c5005b4..d49a91abc3 100644 --- a/mlos_bench/mlos_bench/tests/__init__.py +++ b/mlos_bench/mlos_bench/tests/__init__.py @@ -53,10 +53,14 @@ line for line in stdout.splitlines() if "Platform" in line and "linux" in line ): DOCKER = None - warning("Docker is available but missing buildx support for targeting linux platform.") + warning( + "Docker is available but missing buildx support for targeting linux platform: " + + stdout + ) raise RuntimeError( "DEBUGGING: " - "Docker is available but missing buildx support for targeting linux platform." + + "Docker is available but missing buildx support for targeting linux platform: " + + stdout ) if not DOCKER: warning("Docker is not available on this system. Some tests will be skipped.") From 2f7c0104eb7d6ce1fb609ed787b001ed469d19dc Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 21 Oct 2025 13:14:54 -0500 Subject: [PATCH 35/76] trying to print some extra info about the docker env while in the github CI env --- mlos_bench/mlos_bench/tests/__init__.py | 36 +++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/mlos_bench/mlos_bench/tests/__init__.py b/mlos_bench/mlos_bench/tests/__init__.py index d49a91abc3..b165dc3322 100644 --- a/mlos_bench/mlos_bench/tests/__init__.py +++ b/mlos_bench/mlos_bench/tests/__init__.py @@ -21,6 +21,9 @@ from pytest_docker.plugin import Services as DockerServices from mlos_bench.util import get_class_from_name, nullable +import stat +import pwd +import grp ZONE_NAMES = [ # Explicit time zones. @@ -38,10 +41,43 @@ "trial_runner_id": None, } + +# Gathering info about Github CI docker.sock permissions for debugging purposes. 
+def _print_docker_sock_info(): + sock_path = "/var/run/docker.sock" + try: + st = os.stat(sock_path) + mode = stat.filemode(st.st_mode) + uid = st.st_uid + gid = st.st_gid + user = pwd.getpwuid(uid).pw_name + group = grp.getgrgid(gid).gr_name + warning(f"{sock_path}:") + warning(f" Mode: {mode}") + warning(f" Owner: {user} (UID: {uid})") + warning(f" Group: {group} (GID: {gid})") + except Exception as e: # pylint: disable=broad-except + warning(f"Could not stat {sock_path}: {e}") + + try: + current_uid = os.getuid() + current_gid = os.getgid() + current_user = pwd.getpwuid(current_uid).pw_name + current_group = grp.getgrgid(current_gid).gr_name + groups = [grp.getgrgid(g).gr_name for g in os.getgroups()] + warning("Current user info:") + warning(f" User: {current_user} (UID: {current_uid})") + warning(f" Group: {current_group} (GID: {current_gid})") + warning(f" Groups: {groups}") + except Exception as e: # pylint: disable=broad-except + warning(f"Could not get current user info: {e}") + + # A decorator for tests that require docker. # Use with @requires_docker above a test_...() function. DOCKER = shutil.which("docker") if DOCKER: + _print_docker_sock_info() cmd = run( "docker builder inspect default || docker buildx inspect default", shell=True, From 06d4d8dd9a669dc911921ea3776a20e8daed925d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 21 Oct 2025 18:15:21 +0000 Subject: [PATCH 36/76] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- mlos_bench/mlos_bench/tests/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mlos_bench/mlos_bench/tests/__init__.py b/mlos_bench/mlos_bench/tests/__init__.py index b165dc3322..367585fed2 100644 --- a/mlos_bench/mlos_bench/tests/__init__.py +++ b/mlos_bench/mlos_bench/tests/__init__.py @@ -8,10 +8,13 @@ Used to make mypy happy about multiple conftest.py modules. """ import filecmp +import grp import json import os +import pwd import shutil import socket +import stat from datetime import tzinfo from logging import warning from subprocess import run @@ -21,9 +24,6 @@ from pytest_docker.plugin import Services as DockerServices from mlos_bench.util import get_class_from_name, nullable -import stat -import pwd -import grp ZONE_NAMES = [ # Explicit time zones. 
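The question all of this socket and permission logging tries to answer is whether
the test process can actually talk to the Docker daemon through its socket. A
standalone sketch of that check (the socket path is the conventional default and
an assumption here; rootless or remote Docker setups differ):

    import os
    import stat

    DOCKER_SOCK = "/var/run/docker.sock"  # conventional default; an assumption

    st = os.stat(DOCKER_SOCK)
    print("socket:", stat.filemode(st.st_mode), "uid:", st.st_uid, "gid:", st.st_gid)
    print("caller:", os.getuid(), os.getgid(), "supplementary gids:", os.getgroups())
    # os.access() answers the practical question directly: can this process write to it?
    print("writable:", os.access(DOCKER_SOCK, os.W_OK))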
From 36215063f5e527690a08ad08473cdccdbdcd0df8 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 21 Oct 2025 13:23:23 -0500 Subject: [PATCH 37/76] more debugging --- .github/workflows/devcontainer.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/devcontainer.yml b/.github/workflows/devcontainer.yml index ae87229f47..21c07c20e7 100644 --- a/.github/workflows/devcontainer.yml +++ b/.github/workflows/devcontainer.yml @@ -108,6 +108,11 @@ jobs: timeout-minutes: 3 run: | set -x + ls -l /var/run/docker.sock + stat /var/run/docker.sock + id + id -u + id -g docker run -d --rm --user root \ --volume /var/run/docker.sock:/var/run/docker.sock \ --env DOCKER_BUILDKIT=$DOCKER_BUILDKIT \ From 751028e34ca251b3b3d61d1331530be49e428a6a Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 21 Oct 2025 13:24:53 -0500 Subject: [PATCH 38/76] more debugging --- mlos_bench/mlos_bench/tests/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlos_bench/mlos_bench/tests/__init__.py b/mlos_bench/mlos_bench/tests/__init__.py index 367585fed2..1c9630e283 100644 --- a/mlos_bench/mlos_bench/tests/__init__.py +++ b/mlos_bench/mlos_bench/tests/__init__.py @@ -64,7 +64,7 @@ def _print_docker_sock_info(): current_gid = os.getgid() current_user = pwd.getpwuid(current_uid).pw_name current_group = grp.getgrgid(current_gid).gr_name - groups = [grp.getgrgid(g).gr_name for g in os.getgroups()] + groups = {g: grp.getgrgid(g).gr_name for g in os.getgroups()} warning("Current user info:") warning(f" User: {current_user} (UID: {current_uid})") warning(f" Group: {current_group} (GID: {current_gid})") From d4fc3bab7c6bd6b28abc3524065b89fc3f9b6b98 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 21 Oct 2025 13:35:38 -0500 Subject: [PATCH 39/76] mypy --- mlos_bench/mlos_bench/tests/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlos_bench/mlos_bench/tests/__init__.py b/mlos_bench/mlos_bench/tests/__init__.py index 1c9630e283..989706df3f 100644 --- a/mlos_bench/mlos_bench/tests/__init__.py +++ b/mlos_bench/mlos_bench/tests/__init__.py @@ -43,7 +43,7 @@ # Gathering info about Github CI docker.sock permissions for debugging purposes. -def _print_docker_sock_info(): +def _print_docker_sock_info() -> None: sock_path = "/var/run/docker.sock" try: st = os.stat(sock_path) From 5ef813280f32bd95a65b77d9edb4ffa194e51f4a Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 21 Oct 2025 13:59:49 -0500 Subject: [PATCH 40/76] fix debugging output --- mlos_bench/mlos_bench/tests/__init__.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/mlos_bench/mlos_bench/tests/__init__.py b/mlos_bench/mlos_bench/tests/__init__.py index 989706df3f..4afab66782 100644 --- a/mlos_bench/mlos_bench/tests/__init__.py +++ b/mlos_bench/mlos_bench/tests/__init__.py @@ -42,8 +42,9 @@ } -# Gathering info about Github CI docker.sock permissions for debugging purposes. -def _print_docker_sock_info() -> None: +DOCKER = shutil.which("docker") +if DOCKER: + # Gathering info about Github CI docker.sock permissions for debugging purposes. sock_path = "/var/run/docker.sock" try: st = os.stat(sock_path) @@ -72,12 +73,6 @@ def _print_docker_sock_info() -> None: except Exception as e: # pylint: disable=broad-except warning(f"Could not get current user info: {e}") - -# A decorator for tests that require docker. -# Use with @requires_docker above a test_...() function. 
-DOCKER = shutil.which("docker") -if DOCKER: - _print_docker_sock_info() cmd = run( "docker builder inspect default || docker buildx inspect default", shell=True, @@ -98,11 +93,15 @@ def _print_docker_sock_info() -> None: + "Docker is available but missing buildx support for targeting linux platform: " + stdout ) + if not DOCKER: warning("Docker is not available on this system. Some tests will be skipped.") raise RuntimeError( "DEBUGGING: Docker is not available on this system. Some tests will be skipped." ) + +# A decorator for tests that require docker. +# Use with @requires_docker above a test_...() function. requires_docker = pytest.mark.skipif( not DOCKER, reason="Docker with Linux support is not available on this system.", From 74c30f83f8e5644f954149b8b68e507faade279c Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 21 Oct 2025 14:19:55 -0500 Subject: [PATCH 41/76] add some more debug info --- .github/workflows/devcontainer.yml | 33 +++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/.github/workflows/devcontainer.yml b/.github/workflows/devcontainer.yml index 21c07c20e7..1c7b555057 100644 --- a/.github/workflows/devcontainer.yml +++ b/.github/workflows/devcontainer.yml @@ -108,12 +108,18 @@ jobs: timeout-minutes: 3 run: | set -x - ls -l /var/run/docker.sock - stat /var/run/docker.sock - id - id -u - id -g + gid=`id -g` + docker_sock_gid=`stat -c '%g' /var/run/docker.sock` + docker_gid=`getent group docker | cut -d: -f3 || echo 999` + if [ -n "$docker_sock_gid" ]; then + docker_gid=$docker_sock_gid + echo "Using docker gid: $docker_gid" + fi + if [ "$docker_sock_gid" != "$docker_gid" ] && [ -n "$docker_sock_gid" != "$gid" ] && [ "$docker_sock_gid" != 0 ]; then + echo "WARNING: docker.sock gid ($docker_sock_gid) does not match host docker group gid ($docker_gid) or user gid ($gid)." >&2 + fi docker run -d --rm --user root \ + --group-add $docker_sock_gid \ --volume /var/run/docker.sock:/var/run/docker.sock \ --env DOCKER_BUILDKIT=$DOCKER_BUILDKIT \ --volume $(pwd):/workspaces/MLOS \ @@ -134,6 +140,23 @@ jobs: - name: Print some debug info from inside the container run: | docker exec --user vscode --env USER=vscode mlos-devcontainer printenv + docker exec --user vscode --env USER=vscode mlos-devcontainer id + docker exec --user vscode --env USER=vscode mlos-devcontainer ls -l /var/run/docker.sock + docker exec --user vscode --env USER=vscode mlos-devcontainer stat /var/run/docker.sock + + - name: Check that docker is usable inside the devcontainer + timeout-minutes: 2 + run: | + set -x + docker_sock_gid=`stat -c '%g' /var/run/docker.sock` + if ! 
docker exec --user vscode --env USER=vscode mlos-devcontainer id -G | sed 's/ /\n/' | grep -q -w $docker_sock_gid; then + echo "ERROR: vscode user in container does not have access to docker.sock group (gid: $docker_sock_gid)" >&2 + exit 1 + fi + docker exec --user vscode --env USER=vscode mlos-devcontainer docker version + docker exec --user vscode --env USER=vscode mlos-devcontainer docker info + docker exec --user vscode --env USER=vscode mlos-devcontainer docker buildx version + docker exec --user vscode --env USER=vscode mlos-devcontainer docker buildx ls | grep linux/ - name: Check that github.com is in the ssh known_hosts file run: | From 4b090c603ee61ed1961851d64fb2d3cfcb94b5a7 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 21 Oct 2025 14:40:50 -0500 Subject: [PATCH 42/76] improved debug checks --- .github/workflows/devcontainer.yml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/devcontainer.yml b/.github/workflows/devcontainer.yml index 1c7b555057..aab7634150 100644 --- a/.github/workflows/devcontainer.yml +++ b/.github/workflows/devcontainer.yml @@ -149,14 +149,19 @@ jobs: run: | set -x docker_sock_gid=`stat -c '%g' /var/run/docker.sock` - if ! docker exec --user vscode --env USER=vscode mlos-devcontainer id -G | sed 's/ /\n/' | grep -q -w $docker_sock_gid; then + if ! [ -w /var/run/docker.sock ] \ + || ! docker exec --user vscode --env USER=vscode mlos-devcontainer id -G | sed 's/ /\n/' | grep -q -w $docker_sock_gid + then echo "ERROR: vscode user in container does not have access to docker.sock group (gid: $docker_sock_gid)" >&2 exit 1 fi docker exec --user vscode --env USER=vscode mlos-devcontainer docker version docker exec --user vscode --env USER=vscode mlos-devcontainer docker info docker exec --user vscode --env USER=vscode mlos-devcontainer docker buildx version - docker exec --user vscode --env USER=vscode mlos-devcontainer docker buildx ls | grep linux/ + if ! 
docker exec --user vscode --env USER=vscode mlos-devcontainer docker buildx ls | grep linux/; then + echo "ERROR: docker buildx does not have linux builder available" >&2 + exit 1 + fi - name: Check that github.com is in the ssh known_hosts file run: | From fd61c79230c16b2884ea920fa0ee2943cf4b0506 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 21 Oct 2025 14:46:17 -0500 Subject: [PATCH 43/76] cleanup --- mlos_bench/mlos_bench/tests/__init__.py | 37 +++++++++---------------- 1 file changed, 13 insertions(+), 24 deletions(-) diff --git a/mlos_bench/mlos_bench/tests/__init__.py b/mlos_bench/mlos_bench/tests/__init__.py index 4afab66782..d332504d93 100644 --- a/mlos_bench/mlos_bench/tests/__init__.py +++ b/mlos_bench/mlos_bench/tests/__init__.py @@ -51,26 +51,19 @@ mode = stat.filemode(st.st_mode) uid = st.st_uid gid = st.st_gid - user = pwd.getpwuid(uid).pw_name - group = grp.getgrgid(gid).gr_name - warning(f"{sock_path}:") - warning(f" Mode: {mode}") - warning(f" Owner: {user} (UID: {uid})") - warning(f" Group: {group} (GID: {gid})") except Exception as e: # pylint: disable=broad-except + mode = None + uid = None + gid = None warning(f"Could not stat {sock_path}: {e}") - try: current_uid = os.getuid() current_gid = os.getgid() - current_user = pwd.getpwuid(current_uid).pw_name - current_group = grp.getgrgid(current_gid).gr_name - groups = {g: grp.getgrgid(g).gr_name for g in os.getgroups()} - warning("Current user info:") - warning(f" User: {current_user} (UID: {current_uid})") - warning(f" Group: {current_group} (GID: {current_gid})") - warning(f" Groups: {groups}") + gids = os.getgroups() except Exception as e: # pylint: disable=broad-except + current_uid = None + current_gid = None + gids = None warning(f"Could not get current user info: {e}") cmd = run( @@ -80,25 +73,21 @@ capture_output=True, ) stdout = cmd.stdout.decode() + stderr = cmd.stderr.decode() if cmd.returncode != 0 or not any( line for line in stdout.splitlines() if "Platform" in line and "linux" in line ): DOCKER = None warning( - "Docker is available but missing buildx support for targeting linux platform: " - + stdout - ) - raise RuntimeError( - "DEBUGGING: " - + "Docker is available but missing buildx support for targeting linux platform: " - + stdout + "Docker is available but missing buildx support for targeting linux platform:\n" + + f"stdout:\n{stdout}\n" + + f"stderr:\n{stderr}\n" + + f"sock_path: {sock_path} sock mode: {mode} sock uid: {uid} gid: {gid}\n" + + f"current_uid: {current_uid} groups: {gids}\n" ) if not DOCKER: warning("Docker is not available on this system. Some tests will be skipped.") - raise RuntimeError( - "DEBUGGING: Docker is not available on this system. Some tests will be skipped." - ) # A decorator for tests that require docker. # Use with @requires_docker above a test_...() function. 
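As the trailing comment above notes, the resulting marker is applied per test. An
illustrative usage (the test body itself is made up):

    from mlos_bench.tests import requires_docker

    @requires_docker
    def test_needs_docker() -> None:
        # Skipped automatically when Docker with linux buildx support is unavailable.
        ...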
From 5b28210b45c174a86ebd53e92f52ec1b3f85583a Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 21 Oct 2025 14:47:29 -0500 Subject: [PATCH 44/76] cleanup --- .github/workflows/devcontainer.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/devcontainer.yml b/.github/workflows/devcontainer.yml index aab7634150..f5dbaf1b10 100644 --- a/.github/workflows/devcontainer.yml +++ b/.github/workflows/devcontainer.yml @@ -203,7 +203,7 @@ jobs: test_count=$(docker exec --user vscode --env USER=vscode mlos-devcontainer \ conda run -n mlos python -m pytest -svxl -n auto --collect-only --rootdir /workspaces/MLOS -s --cache-clear \ | grep -c '&2; exit 1; fi + # if [ "${test_count:-0}" -lt 800 ]; then echo "Expected at least 800 tests, got '$test_count'" >&2; exit 1; fi # Now actually run the tests. docker exec --user vscode --env USER=vscode mlos-devcontainer make CONDA_INFO_LEVEL=-v test @@ -222,9 +222,7 @@ jobs: docker exec --user vscode --env USER=vscode mlos-devcontainer make CONDA_INFO_LEVEL=-v dist dist-test - name: Test rebuilding the devcontainer in the devcontainer - # FIXME: - # timeout-minutes: 3 - timeout-minutes: 10 + timeout-minutes: 5 run: | set -x git --no-pager diff --exit-code From 2c8668fbeff22df72e011b159215e039da3fa5c8 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 21 Oct 2025 14:48:55 -0500 Subject: [PATCH 45/76] cleanup --- mlos_bench/mlos_bench/tests/__init__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/mlos_bench/mlos_bench/tests/__init__.py b/mlos_bench/mlos_bench/tests/__init__.py index d332504d93..73e564a063 100644 --- a/mlos_bench/mlos_bench/tests/__init__.py +++ b/mlos_bench/mlos_bench/tests/__init__.py @@ -101,9 +101,6 @@ SSH = shutil.which("ssh") if not SSH: warning("ssh is not available on this system. Some tests will be skipped.") - raise RuntimeError( - "DEBUGGING: ssh is not available on this system. Some tests will be skipped." 
- ) requires_ssh = pytest.mark.skipif(not SSH, reason="ssh is not available on this system.") # A common seed to use to avoid tracking down race conditions and intermingling From 5088cae03299a1f161a259a77bf9938a39549ed2 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 21 Oct 2025 16:07:00 -0500 Subject: [PATCH 46/76] comments and sync --- conda-envs/mlos-3.10.yml | 2 +- conda-envs/mlos-3.11.yml | 2 +- conda-envs/mlos-3.12.yml | 2 +- conda-envs/mlos-3.13.yml | 2 +- conda-envs/mlos-windows.yml | 2 +- conda-envs/mlos.yml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/conda-envs/mlos-3.10.yml b/conda-envs/mlos-3.10.yml index 2fe9aef78c..fed6ed8bca 100644 --- a/conda-envs/mlos-3.10.yml +++ b/conda-envs/mlos-3.10.yml @@ -33,7 +33,7 @@ dependencies: - pylint==3.3.7 - tomlkit - mypy==1.15.0 - - pyright==1.1.400 + - pyright==1.1.406 - pandas-stubs - types-beautifulsoup4 - types-colorama diff --git a/conda-envs/mlos-3.11.yml b/conda-envs/mlos-3.11.yml index 1cdecc5119..6545c7a15a 100644 --- a/conda-envs/mlos-3.11.yml +++ b/conda-envs/mlos-3.11.yml @@ -33,7 +33,7 @@ dependencies: - pylint==3.3.7 - tomlkit - mypy==1.15.0 - - pyright==1.1.400 + - pyright==1.1.406 - pandas-stubs - types-beautifulsoup4 - types-colorama diff --git a/conda-envs/mlos-3.12.yml b/conda-envs/mlos-3.12.yml index ac01f11712..64bc115e04 100644 --- a/conda-envs/mlos-3.12.yml +++ b/conda-envs/mlos-3.12.yml @@ -35,7 +35,7 @@ dependencies: - pylint==3.3.7 - tomlkit - mypy==1.15.0 - - pyright==1.1.400 + - pyright==1.1.406 - pandas-stubs - types-beautifulsoup4 - types-colorama diff --git a/conda-envs/mlos-3.13.yml b/conda-envs/mlos-3.13.yml index 9e668db065..48e110073f 100644 --- a/conda-envs/mlos-3.13.yml +++ b/conda-envs/mlos-3.13.yml @@ -35,7 +35,7 @@ dependencies: - pylint==3.3.7 - tomlkit - mypy==1.15.0 - - pyright==1.1.400 + - pyright==1.1.406 - pandas-stubs - types-beautifulsoup4 - types-colorama diff --git a/conda-envs/mlos-windows.yml b/conda-envs/mlos-windows.yml index b37cbab1e3..3850ec5b11 100644 --- a/conda-envs/mlos-windows.yml +++ b/conda-envs/mlos-windows.yml @@ -33,7 +33,7 @@ dependencies: - pylint==3.3.7 - tomlkit - mypy==1.15.0 - - pyright==1.1.400 + - pyright==1.1.406 - pandas-stubs - types-beautifulsoup4 - types-colorama diff --git a/conda-envs/mlos.yml b/conda-envs/mlos.yml index ee05dfe1d3..aeda1b4e47 100644 --- a/conda-envs/mlos.yml +++ b/conda-envs/mlos.yml @@ -31,7 +31,7 @@ dependencies: - pylint==3.3.7 - tomlkit - mypy==1.15.0 - - pyright==1.1.405 + - pyright==1.1.406 - pandas-stubs - types-beautifulsoup4 - types-colorama From bee6f9920f4f4480f7df2359c9bf6a0d588e2513 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 21 Oct 2025 16:17:27 -0500 Subject: [PATCH 47/76] lint fixups --- mlos_bench/mlos_bench/tests/__init__.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mlos_bench/mlos_bench/tests/__init__.py b/mlos_bench/mlos_bench/tests/__init__.py index 73e564a063..fda025372f 100644 --- a/mlos_bench/mlos_bench/tests/__init__.py +++ b/mlos_bench/mlos_bench/tests/__init__.py @@ -45,25 +45,25 @@ DOCKER = shutil.which("docker") if DOCKER: # Gathering info about Github CI docker.sock permissions for debugging purposes. 
- sock_path = "/var/run/docker.sock" + DOCKER_SOCK_PATH = "/var/run/docker.sock" + mode: str | None = None + uid: int | None = None + gid: int | None = None + current_uid: int | None = None + current_gid: int | None = None + gids: list[int] | None = None try: - st = os.stat(sock_path) + st = os.stat(DOCKER_SOCK_PATH) mode = stat.filemode(st.st_mode) uid = st.st_uid gid = st.st_gid except Exception as e: # pylint: disable=broad-except - mode = None - uid = None - gid = None - warning(f"Could not stat {sock_path}: {e}") + warning(f"Could not stat {DOCKER_SOCK_PATH}: {e}") try: current_uid = os.getuid() current_gid = os.getgid() gids = os.getgroups() except Exception as e: # pylint: disable=broad-except - current_uid = None - current_gid = None - gids = None warning(f"Could not get current user info: {e}") cmd = run( @@ -82,7 +82,7 @@ "Docker is available but missing buildx support for targeting linux platform:\n" + f"stdout:\n{stdout}\n" + f"stderr:\n{stderr}\n" - + f"sock_path: {sock_path} sock mode: {mode} sock uid: {uid} gid: {gid}\n" + + f"sock_path: {DOCKER_SOCK_PATH} sock mode: {mode} sock uid: {uid} gid: {gid}\n" + f"current_uid: {current_uid} groups: {gids}\n" ) From ba46ff6e61887a13c6b3884c4315df8109562779 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Wed, 22 Oct 2025 16:29:02 -0500 Subject: [PATCH 48/76] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py index 8080f182cc..9e3d25ad5d 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py @@ -40,8 +40,8 @@ def docker_compose_file(pytestconfig: pytest.Config) -> list[str]: Returns ------- - str - Path to the docker-compose file. + list[str] + List of paths to docker-compose files. """ _ = pytestconfig # unused return [ From 30f87f1c806cd60a63bb626c20f21957df209da6 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Wed, 22 Oct 2025 16:29:12 -0500 Subject: [PATCH 49/76] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- mlos_bench/mlos_bench/tests/storage/sql/fixtures.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlos_bench/mlos_bench/tests/storage/sql/fixtures.py b/mlos_bench/mlos_bench/tests/storage/sql/fixtures.py index 7049e6f007..35451528c2 100644 --- a/mlos_bench/mlos_bench/tests/storage/sql/fixtures.py +++ b/mlos_bench/mlos_bench/tests/storage/sql/fixtures.py @@ -64,8 +64,8 @@ def docker_compose_file(pytestconfig: pytest.Config) -> list[str]: Returns ------- - str - Path to the docker-compose file. + list[str] + List of paths to the docker-compose file(s). 
""" _ = pytestconfig # unused return [ From 30b7d6a3ea331553da40a2a1fb1ddf70889ccade Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Wed, 22 Oct 2025 16:29:26 -0500 Subject: [PATCH 50/76] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- mlos_bench/mlos_bench/tests/docker_fixtures_util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlos_bench/mlos_bench/tests/docker_fixtures_util.py b/mlos_bench/mlos_bench/tests/docker_fixtures_util.py index fe4e753ed5..2309123bc8 100644 --- a/mlos_bench/mlos_bench/tests/docker_fixtures_util.py +++ b/mlos_bench/mlos_bench/tests/docker_fixtures_util.py @@ -52,8 +52,8 @@ def docker_compose_file(pytestconfig: pytest.Config) -> list[str]: Returns ------- - str - Path to the docker-compose file. + list[str] + List of paths to the docker-compose file(s). """ _ = pytestconfig # unused # Add additional configs as necessary here. From a4be9fb1c7d08176be592491a340e3cd28f2a2e4 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Wed, 22 Oct 2025 16:44:39 -0500 Subject: [PATCH 51/76] remove unused imports --- mlos_bench/mlos_bench/tests/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mlos_bench/mlos_bench/tests/__init__.py b/mlos_bench/mlos_bench/tests/__init__.py index fda025372f..601d066138 100644 --- a/mlos_bench/mlos_bench/tests/__init__.py +++ b/mlos_bench/mlos_bench/tests/__init__.py @@ -8,10 +8,8 @@ Used to make mypy happy about multiple conftest.py modules. """ import filecmp -import grp import json import os -import pwd import shutil import socket import stat From 8718d65a84d62493cde87db49a6fdd4b981c6914 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Thu, 23 Oct 2025 22:24:54 +0000 Subject: [PATCH 52/76] add retry logic for connecting --- .../services/remote/ssh/ssh_service.py | 76 +++++++++++++------ 1 file changed, 51 insertions(+), 25 deletions(-) diff --git a/mlos_bench/mlos_bench/services/remote/ssh/ssh_service.py b/mlos_bench/mlos_bench/services/remote/ssh/ssh_service.py index 5955f6dfd1..cef5baec55 100644 --- a/mlos_bench/mlos_bench/services/remote/ssh/ssh_service.py +++ b/mlos_bench/mlos_bench/services/remote/ssh/ssh_service.py @@ -7,6 +7,7 @@ import logging import os from abc import ABCMeta +from asyncio import sleep as async_sleep from asyncio import Event as CoroEvent from asyncio import Lock as CoroLock from collections.abc import Callable, Coroutine @@ -172,34 +173,59 @@ async def get_client_connection( A tuple of (SSHClientConnection, SshClient). """ _LOG.debug("%s: get_client_connection: %s", current_thread().name, connect_params) - async with self._cache_lock: - connection_id = SshClient.id_from_params(connect_params) - client: None | SshClient | asyncssh.SSHClient - _, client = self._cache.get(connection_id, (None, None)) - if client: - _LOG.debug("%s: Checking cached client %s", current_thread().name, connection_id) - connection = await client.connection() - if not connection: - _LOG.debug( - "%s: Removing stale client connection %s from cache.", - current_thread().name, - connection_id, - ) - self._cache.pop(connection_id) - # Try to reconnect next. 
- else: - _LOG.debug("%s: Using cached client %s", current_thread().name, connection_id) - if connection_id not in self._cache: - _LOG.debug( - "%s: Establishing client connection to %s", + connection_id = SshClient.id_from_params(connect_params) + for i in range(3): # TODO: make the retry count configurable + try: + async with self._cache_lock: + client: None | SshClient | asyncssh.SSHClient + _, client = self._cache.get(connection_id, (None, None)) + if client: + _LOG.debug( + "%s: Checking cached client %s", current_thread().name, connection_id + ) + connection = await client.connection() + if not connection: + _LOG.debug( + "%s: Removing stale client connection %s from cache.", + current_thread().name, + connection_id, + ) + self._cache.pop(connection_id) + # Try to reconnect next. + else: + _LOG.debug( + "%s: Using cached client %s", current_thread().name, connection_id + ) + if connection_id not in self._cache: + _LOG.debug( + "%s: Establishing client connection to %s", + current_thread().name, + connection_id, + ) + connection, client = await asyncssh.create_connection( + SshClient, **connect_params + ) + assert isinstance(client, SshClient) + self._cache[connection_id] = (connection, client) + _LOG.debug( + "%s: Created connection to %s.", current_thread().name, connection_id + ) + return self._cache[connection_id] + except (ConnectionRefusedError, asyncssh.Error) as ex: + _LOG.warning( + "%s: Attempt %d: Failed to connect to %s: %s", current_thread().name, + i + 1, connection_id, + ex, ) - connection, client = await asyncssh.create_connection(SshClient, **connect_params) - assert isinstance(client, SshClient) - self._cache[connection_id] = (connection, client) - _LOG.debug("%s: Created connection to %s.", current_thread().name, connection_id) - return self._cache[connection_id] + if i < 2: # TODO: adjust to match max range + await async_sleep(1.0) # TODO: Make this configurable + if i == 2: # TODO: adjust to match max range + _LOG.error( + "%s: Giving up connecting to %s", current_thread().name, connection_id + ) + raise def cleanup(self) -> None: """Closes all cached connections.""" From f44a4ee8e547c82dc791fdf7a766c8175a5e7556 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Thu, 23 Oct 2025 22:26:11 +0000 Subject: [PATCH 53/76] reduce errors handled for now --- mlos_bench/mlos_bench/services/remote/ssh/ssh_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlos_bench/mlos_bench/services/remote/ssh/ssh_service.py b/mlos_bench/mlos_bench/services/remote/ssh/ssh_service.py index cef5baec55..9694df6416 100644 --- a/mlos_bench/mlos_bench/services/remote/ssh/ssh_service.py +++ b/mlos_bench/mlos_bench/services/remote/ssh/ssh_service.py @@ -211,7 +211,7 @@ async def get_client_connection( "%s: Created connection to %s.", current_thread().name, connection_id ) return self._cache[connection_id] - except (ConnectionRefusedError, asyncssh.Error) as ex: + except ConnectionRefusedError as ex: # TODO: Add other error handling here too _LOG.warning( "%s: Attempt %d: Failed to connect to %s: %s", current_thread().name, From d405bfb551c557b7ec2b3c8cb57e1f4d2849be82 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 23 Oct 2025 22:28:23 +0000 Subject: [PATCH 54/76] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- mlos_bench/mlos_bench/services/remote/ssh/ssh_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/mlos_bench/mlos_bench/services/remote/ssh/ssh_service.py b/mlos_bench/mlos_bench/services/remote/ssh/ssh_service.py index 9694df6416..7216d3b6aa 100644 --- a/mlos_bench/mlos_bench/services/remote/ssh/ssh_service.py +++ b/mlos_bench/mlos_bench/services/remote/ssh/ssh_service.py @@ -7,9 +7,9 @@ import logging import os from abc import ABCMeta -from asyncio import sleep as async_sleep from asyncio import Event as CoroEvent from asyncio import Lock as CoroLock +from asyncio import sleep as async_sleep from collections.abc import Callable, Coroutine from threading import current_thread from types import TracebackType From 8334ede4333d5531f0391489c19c5def02a47af1 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Mon, 27 Oct 2025 16:00:45 -0700 Subject: [PATCH 55/76] avoid mypy error --- mlos_bench/mlos_bench/services/remote/ssh/ssh_service.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mlos_bench/mlos_bench/services/remote/ssh/ssh_service.py b/mlos_bench/mlos_bench/services/remote/ssh/ssh_service.py index 7216d3b6aa..199d640294 100644 --- a/mlos_bench/mlos_bench/services/remote/ssh/ssh_service.py +++ b/mlos_bench/mlos_bench/services/remote/ssh/ssh_service.py @@ -226,6 +226,7 @@ async def get_client_connection( "%s: Giving up connecting to %s", current_thread().name, connection_id ) raise + raise RuntimeError("Unreachable code in get_client_connection") def cleanup(self) -> None: """Closes all cached connections.""" From 4458b57b1c361a14fb404df8b11116e459324b81 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Mon, 27 Oct 2025 23:17:05 -0700 Subject: [PATCH 56/76] copilot debugging --- .github/workflows/linux.yml | 4 + debug_ssh_tests.py | 114 ++++++++++++++++++ mlos_bench/mlos_bench/tests/__init__.py | 64 ++++++++-- .../tests/services/remote/ssh/__init__.py | 108 +++++++++++++++-- .../tests/services/remote/ssh/fixtures.py | 41 ++++++- .../remote/ssh/test_ssh_host_service.py | 79 ++++++++++-- .../services/remote/ssh/test_ssh_service.py | 23 +++- 7 files changed, 402 insertions(+), 31 deletions(-) create mode 100644 debug_ssh_tests.py diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 5518b15d97..80f7991e92 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -175,6 +175,10 @@ jobs: timeout-minutes: 5 run: make CONDA_ENV_NAME=$CONDA_ENV_NAME check + - name: Run tests + continue-on-error: true + run: conda -n $CONDA_ENV_NAME run python3 debug_ssh_tests.py + # Only run the coverage checks on the devcontainer job. - name: Run tests run: make CONDA_ENV_NAME=$CONDA_ENV_NAME SKIP_COVERAGE=true test diff --git a/debug_ssh_tests.py b/debug_ssh_tests.py new file mode 100644 index 0000000000..e4a4483dac --- /dev/null +++ b/debug_ssh_tests.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python3 +""" +Debug script for SSH test race conditions. + +Run this to collect detailed logs about SSH test fixture behavior and port management. 
+""" + +import logging +import subprocess +import sys +from pathlib import Path + +def setup_logging(): + """Configure detailed logging for debugging.""" + # Configure root logger + logging.basicConfig( + level=logging.DEBUG, + format='%(asctime)s.%(msecs)03d [%(levelname)8s] %(name)s: %(message)s', + datefmt='%H:%M:%S', + handlers=[ + logging.StreamHandler(sys.stdout), + logging.FileHandler('ssh_test_debug.log', mode='w') + ] + ) + + # Set specific loggers to appropriate levels + logging.getLogger('mlos_bench.tests.services.remote.ssh').setLevel(logging.DEBUG) + logging.getLogger('mlos_bench.tests').setLevel(logging.INFO) + logging.getLogger('pytest_docker').setLevel(logging.INFO) + + print("Logging configured. Debug output will be saved to ssh_test_debug.log") + +def run_tests(): + """Run SSH tests with debugging enabled.""" + setup_logging() + + test_dir = Path(__file__).parent / "mlos_bench" / "mlos_bench" / "tests" / "services" / "remote" / "ssh" + + cmd = [ + sys.executable, "-m", "pytest", + "-v", "-s", # Verbose and don't capture output + "--tb=short", # Short traceback format + "--log-level=DEBUG", # Enable pytest debug logging + "-x", # Stop on first failure + str(test_dir / "test_ssh_service.py"), + str(test_dir / "test_ssh_host_service.py"), + ] + + print(f"Running command: {' '.join(cmd)}") + print("=" * 80) + + try: + result = subprocess.run(cmd, cwd=Path(__file__).parent, check=False) + return result.returncode + except KeyboardInterrupt: + print("\nTest run interrupted by user") + return 130 + +def run_parallel_tests(): + """Run tests in parallel to reproduce race conditions.""" + setup_logging() + + test_dir = Path(__file__).parent / "mlos_bench" / "mlos_bench" / "tests" / "services" / "remote" / "ssh" + + cmd = [ + sys.executable, "-m", "pytest", + "-v", "-s", # Verbose and don't capture output + "--tb=short", # Short traceback format + "--log-level=INFO", # Less verbose for parallel runs + "-n", "2", # Run with 2 workers to increase chance of race + "--dist", "worksteal", # Dynamic work distribution + str(test_dir / "test_ssh_service.py"), + str(test_dir / "test_ssh_host_service.py"), + ] + + print(f"Running parallel tests: {' '.join(cmd)}") + print("=" * 80) + + try: + result = subprocess.run(cmd, cwd=Path(__file__).parent, check=False) + return result.returncode + except KeyboardInterrupt: + print("\nParallel test run interrupted by user") + return 130 + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Debug SSH test race conditions") + parser.add_argument( + "--parallel", + action="store_true", + help="Run tests in parallel to reproduce race conditions" + ) + + args = parser.parse_args() + + if args.parallel: + exit_code = run_parallel_tests() + else: + exit_code = run_tests() + + print("\n" + "=" * 80) + print(f"Tests completed with exit code: {exit_code}") + + if exit_code != 0: + print("Check ssh_test_debug.log for detailed debugging information") + print("\nKey things to look for:") + print("1. Port changes after reboot tests") + print("2. Connection validation failures") + print("3. Socket connection refused errors") + print("4. 
Thread IDs to track which worker has issues") + + sys.exit(exit_code) diff --git a/mlos_bench/mlos_bench/tests/__init__.py b/mlos_bench/mlos_bench/tests/__init__.py index d723cfa957..cde179f572 100644 --- a/mlos_bench/mlos_bench/tests/__init__.py +++ b/mlos_bench/mlos_bench/tests/__init__.py @@ -171,11 +171,30 @@ def wait_docker_service_healthy( def wait_docker_service_socket(docker_services: DockerServices, hostname: str, port: int) -> None: """Wait until a docker service is ready.""" - docker_services.wait_until_responsive( - check=lambda: check_socket(hostname, port), - timeout=60.0, - pause=0.5, - ) + import logging + import threading + + _LOG = logging.getLogger(__name__) + thread_id = threading.get_ident() + + _LOG.info("[Thread %s] Waiting for %s:%d to become responsive", thread_id, hostname, port) + + def check_with_logging() -> bool: + result = check_socket(hostname, port) + if not result: + _LOG.debug("[Thread %s] Socket check failed for %s:%d", thread_id, hostname, port) + return result + + try: + docker_services.wait_until_responsive( + check=check_with_logging, + timeout=60.0, + pause=0.5, + ) + _LOG.info("[Thread %s] Socket %s:%d is now responsive", thread_id, hostname, port) + except Exception as e: + _LOG.error("[Thread %s] Failed waiting for %s:%d: %s", thread_id, hostname, port, e) + raise def check_socket(host: str, port: int, timeout: float = 1.0) -> bool: @@ -192,10 +211,37 @@ def check_socket(host: str, port: int, timeout: float = 1.0) -> bool: ------- bool """ - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: - sock.settimeout(timeout) # seconds - result = sock.connect_ex((host, port)) - return result == 0 + try: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.settimeout(timeout) # seconds + result = sock.connect_ex((host, port)) + success = result == 0 + if not success: + import logging + import threading + + _LOG = logging.getLogger(__name__) + _LOG.debug( + "[Thread %s] Socket connection to %s:%d failed with code %d", + threading.get_ident(), + host, + port, + result, + ) + return success + except Exception as e: + import logging + import threading + + _LOG = logging.getLogger(__name__) + _LOG.debug( + "[Thread %s] Socket check exception for %s:%d: %s", + threading.get_ident(), + host, + port, + e, + ) + return False def resolve_host_name(host: str) -> str | None: diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py index de822fe2eb..54f3d4c025 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py @@ -4,6 +4,8 @@ # """Common data classes for the SSH service tests.""" +import logging +import threading from dataclasses import dataclass from subprocess import run @@ -39,17 +41,47 @@ def get_port(self, uncached: bool = False) -> int: Note: this value can change when the service restarts so we can't rely on the DockerServices. 
""" + _LOG = logging.getLogger(__name__) + thread_id = threading.get_ident() + if self._port is None or uncached: - port_cmd = run( - ( - f"docker compose -p {self.compose_project_name} " - f"port {self.service_name} {SSH_TEST_SERVER_PORT}" - ), - shell=True, - check=True, - capture_output=True, + _LOG.info( + "[Thread %s] Discovering port for %s (uncached=%s, cached_port=%s)", + thread_id, + self.service_name, + uncached, + self._port, + ) + try: + port_cmd = run( + ( + f"docker compose -p {self.compose_project_name} " + f"port {self.service_name} {SSH_TEST_SERVER_PORT}" + ), + shell=True, + check=True, + capture_output=True, + ) + new_port = int(port_cmd.stdout.decode().strip().split(":")[1]) + old_port = self._port + self._port = new_port + _LOG.info( + "[Thread %s] Port for %s: %s -> %s (uncached=%s)", + thread_id, + self.service_name, + old_port, + new_port, + uncached, + ) + except Exception as e: + _LOG.error( + "[Thread %s] Failed to get port for %s: %s", thread_id, self.service_name, e + ) + raise + else: + _LOG.debug( + "[Thread %s] Using cached port %s for %s", thread_id, self._port, self.service_name ) - self._port = int(port_cmd.stdout.decode().strip().split(":")[1]) return self._port def to_ssh_service_config(self, uncached: bool = False) -> dict: @@ -72,3 +104,61 @@ def to_connect_params(self, uncached: bool = False) -> dict: "port": self.get_port(uncached), "username": self.username, } + + def validate_connection(self, timeout: float = 2.0) -> bool: + """ + Validate that the current cached port is still connectable. + + Returns False if the connection fails, indicating the port may be stale. + """ + if self._port is None: + return False + + _LOG = logging.getLogger(__name__) + thread_id = threading.get_ident() + + # Import here to avoid circular imports + from mlos_bench.tests import check_socket + + is_connectable = check_socket(self.hostname, self._port, timeout) + if not is_connectable: + _LOG.warning( + "[Thread %s] Connection validation FAILED for %s:%d - port may be stale!", + thread_id, + self.service_name, + self._port, + ) + else: + _LOG.debug( + "[Thread %s] Connection validation OK for %s:%d", + thread_id, + self.service_name, + self._port, + ) + return is_connectable + + def get_port_with_validation(self, timeout: float = 2.0) -> int: + """ + Get port with automatic validation and refresh if connection fails. + + This helps detect stale cached ports caused by container restarts. 
+ """ + _LOG = logging.getLogger(__name__) + thread_id = threading.get_ident() + + # First try cached port + if self._port is not None: + if self.validate_connection(timeout): + return self._port + else: + _LOG.warning( + "[Thread %s] Cached port %d for %s failed validation, refreshing...", + thread_id, + self._port, + self.service_name, + ) + # Force refresh + return self.get_port(uncached=True) + else: + # No cached port, get fresh one + return self.get_port(uncached=False) diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py index 9e3d25ad5d..e25a524c6e 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py @@ -8,8 +8,10 @@ Note: these are not in the conftest.py file because they are also used by remote_ssh_env_test.py """ +import logging import os import tempfile +import threading from collections.abc import Generator from subprocess import run @@ -81,6 +83,11 @@ def ssh_test_server( """ # Get a copy of the ssh id_rsa key from the test ssh server. with tempfile.NamedTemporaryFile() as id_rsa_file: + _LOG = logging.getLogger(__name__) + thread_id = threading.get_ident() + + _LOG.info("[Thread %s] Setting up ssh_test_server fixture", thread_id) + ssh_test_server_info = SshTestServerInfo( compose_project_name=docker_compose_project_name, service_name=SSH_TEST_SERVER_NAME, @@ -88,11 +95,17 @@ def ssh_test_server( username="root", id_rsa_path=id_rsa_file.name, ) + + port = ssh_test_server_info.get_port() + _LOG.info("[Thread %s] Main SSH server discovered on port %d", thread_id, port) + wait_docker_service_socket( locked_docker_services, ssh_test_server_info.hostname, - ssh_test_server_info.get_port(), + port, ) + + _LOG.info("[Thread %s] Main SSH server validated and ready", thread_id) id_rsa_src = f"/{ssh_test_server_info.username}/.ssh/id_rsa" docker_cp_cmd = ( f"docker compose -p {docker_compose_project_name} " @@ -128,6 +141,11 @@ def alt_test_server( # Note: The alt-server uses the same image as the ssh-server container, so # the id_rsa key and username should all match. # Only the host port it is allocate is different. + _LOG = logging.getLogger(__name__) + thread_id = threading.get_ident() + + _LOG.info("[Thread %s] Setting up alt_test_server fixture", thread_id) + alt_test_server_info = SshTestServerInfo( compose_project_name=ssh_test_server.compose_project_name, service_name=ALT_TEST_SERVER_NAME, @@ -135,11 +153,17 @@ def alt_test_server( username=ssh_test_server.username, id_rsa_path=ssh_test_server.id_rsa_path, ) + + port = alt_test_server_info.get_port() + _LOG.info("[Thread %s] Alt SSH server discovered on port %d", thread_id, port) + wait_docker_service_socket( locked_docker_services, alt_test_server_info.hostname, - alt_test_server_info.get_port(), + port, ) + + _LOG.info("[Thread %s] Alt SSH server validated and ready", thread_id) return alt_test_server_info @@ -156,6 +180,11 @@ def reboot_test_server( # Note: The reboot-server uses the same image as the ssh-server container, so # the id_rsa key and username should all match. # Only the host port it is allocate is different. 
+ _LOG = logging.getLogger(__name__) + thread_id = threading.get_ident() + + _LOG.info("[Thread %s] Setting up reboot_test_server fixture", thread_id) + reboot_test_server_info = SshTestServerInfo( compose_project_name=ssh_test_server.compose_project_name, service_name=REBOOT_TEST_SERVER_NAME, @@ -163,11 +192,17 @@ def reboot_test_server( username=ssh_test_server.username, id_rsa_path=ssh_test_server.id_rsa_path, ) + + port = reboot_test_server_info.get_port() + _LOG.info("[Thread %s] Reboot SSH server discovered on port %d", thread_id, port) + wait_docker_service_socket( locked_docker_services, reboot_test_server_info.hostname, - reboot_test_server_info.get_port(), + port, ) + + _LOG.info("[Thread %s] Reboot SSH server validated and ready", thread_id) return reboot_test_server_info diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py index 1dce67a13d..5275dfe6cf 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py @@ -36,6 +36,24 @@ def test_ssh_service_remote_exec( internal cache handling logic as well. """ # pylint: disable=protected-access + import threading + + thread_id = threading.get_ident() + + _LOG.info( + "[Thread %s] test_ssh_service_remote_exec starting - validating server connections", + thread_id, + ) + + # Validate connections before starting test + for server_info, name in [(ssh_test_server, "main"), (alt_test_server, "alt")]: + if not server_info.validate_connection(): + _LOG.warning( + "[Thread %s] %s server connection validation failed, may have stale port", + thread_id, + name, + ) + with ssh_host_service: config = ssh_test_server.to_ssh_service_config() @@ -153,8 +171,25 @@ def check_ssh_service_reboot( # easier to check for success. # Also, it may cause issues with other parallel unit tests, so we run it as # a part of the same unit test for now. + import threading + + thread_id = threading.get_ident() + + _LOG.warning( + "[Thread %s] *** STARTING REBOOT TEST (graceful=%s) - This will change container ports! ***", + thread_id, + graceful, + ) + with ssh_host_service: reboot_test_srv_ssh_svc_conf = reboot_test_server.to_ssh_service_config(uncached=True) + original_port = reboot_test_srv_ssh_svc_conf["ssh_port"] + _LOG.warning( + "[Thread %s] Reboot test using original port %d for %s", + thread_id, + original_port, + reboot_test_server.service_name, + ) (status, results_info) = ssh_host_service.remote_exec( script=['echo "sleeping..."', "sleep 30", 'echo "should not reach this point"'], config=reboot_test_srv_ssh_svc_conf, @@ -183,7 +218,7 @@ def check_ssh_service_reboot( assert "should not reach this point" not in stdout reboot_test_srv_ssh_svc_conf_new: dict = {} - for _ in range(0, 3): + for attempt in range(0, 3): # Give docker some time to restart the service after the "reboot". # Note: this relies on having a `restart_policy` in the docker-compose.yml file. 
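#                 (On the compose side that typically means a service-level
#                 `restart: always` or `restart: unless-stopped` entry; the exact
#                 policy used by this test's docker-compose.yml is not shown here.)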
time.sleep(1) @@ -195,22 +230,43 @@ def check_ssh_service_reboot( capture_output=True, check=False, ) - print(run_res.stdout.decode()) - print(run_res.stderr.decode()) + _LOG.info( + "[Thread %s] Docker ps output (attempt %d):\nSTDOUT:\n%s\nSTDERR:\n%s", + thread_id, + attempt + 1, + run_res.stdout.decode(), + run_res.stderr.decode(), + ) reboot_test_srv_ssh_svc_conf_new = reboot_test_server.to_ssh_service_config( uncached=True ) - if ( - reboot_test_srv_ssh_svc_conf_new["ssh_port"] - != reboot_test_srv_ssh_svc_conf["ssh_port"] - ): + new_port = reboot_test_srv_ssh_svc_conf_new["ssh_port"] + _LOG.warning( + "[Thread %s] Port check attempt %d: %d -> %d", + thread_id, + attempt + 1, + original_port, + new_port, + ) + if new_port != original_port: + _LOG.warning( + "[Thread %s] *** PORT CHANGED: %d -> %d (this affects ALL workers!) ***", + thread_id, + original_port, + new_port, + ) break except CalledProcessError as ex: - _LOG.info("Failed to check port for reboot test server: %s", ex) + _LOG.error( + "[Thread %s] Failed to check port for reboot test server (attempt %d): %s", + thread_id, + attempt + 1, + ex, + ) assert ( reboot_test_srv_ssh_svc_conf_new["ssh_port"] != reboot_test_srv_ssh_svc_conf["ssh_port"] - ) + ), f"Port should have changed from {original_port} but is still {reboot_test_srv_ssh_svc_conf_new['ssh_port']}" wait_docker_service_socket( docker_services, @@ -227,6 +283,11 @@ def check_ssh_service_reboot( assert status.is_succeeded() assert results["stdout"].strip() == REBOOT_TEST_SERVER_NAME + _LOG.warning( + "[Thread %s] *** REBOOT TEST COMPLETED - Other workers may now have stale cached ports! ***", + thread_id, + ) + @requires_docker def test_ssh_service_reboot( diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py index d06516b3fe..1e7a7a9eaa 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py @@ -52,12 +52,33 @@ ) def test_ssh_service_test_infra(ssh_test_server_info: SshTestServerInfo, server_name: str) -> None: """Check for the pytest-docker ssh test infra.""" + import logging + import threading + _LOG = logging.getLogger(__name__) + thread_id = threading.get_ident() + + _LOG.info( + "[Thread %s] test_ssh_service_test_infra starting with %s", + thread_id, server_name + ) + assert ssh_test_server_info.service_name == server_name ip_addr = resolve_host_name(ssh_test_server_info.hostname) assert ip_addr is not None - local_port = ssh_test_server_info.get_port() + # Use validation method to detect stale ports + if not ssh_test_server_info.validate_connection(): + _LOG.warning( + "[Thread %s] Cached port validation failed, getting fresh port for %s", + thread_id, server_name + ) + + local_port = ssh_test_server_info.get_port_with_validation() + _LOG.info( + "[Thread %s] Using port %d for %s", + thread_id, local_port, server_name + ) assert check_socket(ip_addr, local_port) ssh_cmd = ( "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new " From 5a7929259d4095cd5fc66b6491ae51935e09563b Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Mon, 27 Oct 2025 23:18:14 -0700 Subject: [PATCH 57/76] capture all output --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index 1c1cfe1416..6e596fb04e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -36,6 +36,7 @@ minversion = 7.1 # Run failed and new tests first by default. 
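# (`-s` added below is pytest's shorthand for --capture=no, so test stdout/stderr
#  and the debug logging stream straight to the console instead of being captured.)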
addopts = -vv + -s -l --ff --nf -n auto From 59893d7746dbe83e29e3da00b76be1b94a292423 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 28 Oct 2025 06:19:15 +0000 Subject: [PATCH 58/76] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- debug_ssh_tests.py | 64 ++++++++++++++----- .../services/remote/ssh/test_ssh_service.py | 20 +++--- 2 files changed, 56 insertions(+), 28 deletions(-) diff --git a/debug_ssh_tests.py b/debug_ssh_tests.py index e4a4483dac..ccbdc1c4b7 100644 --- a/debug_ssh_tests.py +++ b/debug_ssh_tests.py @@ -1,4 +1,8 @@ #!/usr/bin/env python3 +# +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# """ Debug script for SSH test race conditions. @@ -10,35 +14,48 @@ import sys from pathlib import Path + def setup_logging(): """Configure detailed logging for debugging.""" # Configure root logger logging.basicConfig( level=logging.DEBUG, - format='%(asctime)s.%(msecs)03d [%(levelname)8s] %(name)s: %(message)s', - datefmt='%H:%M:%S', + format="%(asctime)s.%(msecs)03d [%(levelname)8s] %(name)s: %(message)s", + datefmt="%H:%M:%S", handlers=[ logging.StreamHandler(sys.stdout), - logging.FileHandler('ssh_test_debug.log', mode='w') - ] + logging.FileHandler("ssh_test_debug.log", mode="w"), + ], ) # Set specific loggers to appropriate levels - logging.getLogger('mlos_bench.tests.services.remote.ssh').setLevel(logging.DEBUG) - logging.getLogger('mlos_bench.tests').setLevel(logging.INFO) - logging.getLogger('pytest_docker').setLevel(logging.INFO) + logging.getLogger("mlos_bench.tests.services.remote.ssh").setLevel(logging.DEBUG) + logging.getLogger("mlos_bench.tests").setLevel(logging.INFO) + logging.getLogger("pytest_docker").setLevel(logging.INFO) print("Logging configured. 
Debug output will be saved to ssh_test_debug.log") + def run_tests(): """Run SSH tests with debugging enabled.""" setup_logging() - test_dir = Path(__file__).parent / "mlos_bench" / "mlos_bench" / "tests" / "services" / "remote" / "ssh" + test_dir = ( + Path(__file__).parent + / "mlos_bench" + / "mlos_bench" + / "tests" + / "services" + / "remote" + / "ssh" + ) cmd = [ - sys.executable, "-m", "pytest", - "-v", "-s", # Verbose and don't capture output + sys.executable, + "-m", + "pytest", + "-v", + "-s", # Verbose and don't capture output "--tb=short", # Short traceback format "--log-level=DEBUG", # Enable pytest debug logging "-x", # Stop on first failure @@ -56,19 +73,33 @@ def run_tests(): print("\nTest run interrupted by user") return 130 + def run_parallel_tests(): """Run tests in parallel to reproduce race conditions.""" setup_logging() - test_dir = Path(__file__).parent / "mlos_bench" / "mlos_bench" / "tests" / "services" / "remote" / "ssh" + test_dir = ( + Path(__file__).parent + / "mlos_bench" + / "mlos_bench" + / "tests" + / "services" + / "remote" + / "ssh" + ) cmd = [ - sys.executable, "-m", "pytest", - "-v", "-s", # Verbose and don't capture output + sys.executable, + "-m", + "pytest", + "-v", + "-s", # Verbose and don't capture output "--tb=short", # Short traceback format "--log-level=INFO", # Less verbose for parallel runs - "-n", "2", # Run with 2 workers to increase chance of race - "--dist", "worksteal", # Dynamic work distribution + "-n", + "2", # Run with 2 workers to increase chance of race + "--dist", + "worksteal", # Dynamic work distribution str(test_dir / "test_ssh_service.py"), str(test_dir / "test_ssh_host_service.py"), ] @@ -83,6 +114,7 @@ def run_parallel_tests(): print("\nParallel test run interrupted by user") return 130 + if __name__ == "__main__": import argparse @@ -90,7 +122,7 @@ def run_parallel_tests(): parser.add_argument( "--parallel", action="store_true", - help="Run tests in parallel to reproduce race conditions" + help="Run tests in parallel to reproduce race conditions", ) args = parser.parse_args() diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py index 1e7a7a9eaa..eac3320e3d 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py @@ -54,14 +54,12 @@ def test_ssh_service_test_infra(ssh_test_server_info: SshTestServerInfo, server_ """Check for the pytest-docker ssh test infra.""" import logging import threading + _LOG = logging.getLogger(__name__) thread_id = threading.get_ident() - - _LOG.info( - "[Thread %s] test_ssh_service_test_infra starting with %s", - thread_id, server_name - ) - + + _LOG.info("[Thread %s] test_ssh_service_test_infra starting with %s", thread_id, server_name) + assert ssh_test_server_info.service_name == server_name ip_addr = resolve_host_name(ssh_test_server_info.hostname) @@ -71,14 +69,12 @@ def test_ssh_service_test_infra(ssh_test_server_info: SshTestServerInfo, server_ if not ssh_test_server_info.validate_connection(): _LOG.warning( "[Thread %s] Cached port validation failed, getting fresh port for %s", - thread_id, server_name + thread_id, + server_name, ) - + local_port = ssh_test_server_info.get_port_with_validation() - _LOG.info( - "[Thread %s] Using port %d for %s", - thread_id, local_port, server_name - ) + _LOG.info("[Thread %s] Using port %d for %s", thread_id, local_port, server_name) assert check_socket(ip_addr, 
local_port) ssh_cmd = ( "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new " From 991d5042c449019c5cc03ab0fc71f37e76a765b5 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 28 Oct 2025 11:31:49 -0700 Subject: [PATCH 59/76] more debugging hacks --- .github/workflows/linux.yml | 6 +- mlos_bench/mlos_bench/tests/__init__.py | 56 ++++++---- .../tests/services/remote/ssh/__init__.py | 87 +++++++++------ .../tests/services/remote/ssh/fixtures.py | 22 ++-- .../remote/ssh/test_ssh_host_service.py | 104 +++++++++++------- .../services/remote/ssh/test_ssh_service.py | 18 +-- setup.cfg | 1 - 7 files changed, 175 insertions(+), 119 deletions(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 80f7991e92..bb044b5ae6 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -175,10 +175,14 @@ jobs: timeout-minutes: 5 run: make CONDA_ENV_NAME=$CONDA_ENV_NAME check - - name: Run tests + - name: Run debug_ssh_tests.py continue-on-error: true run: conda -n $CONDA_ENV_NAME run python3 debug_ssh_tests.py + - name: Run ssh tests + continue-on-error: true + run: conda -n $CONDA_ENV_NAME run python3 -m pytest -vv -rP mlos_bench/mlos_bench/tests/services/remote/ssh + # Only run the coverage checks on the devcontainer job. - name: Run tests run: make CONDA_ENV_NAME=$CONDA_ENV_NAME SKIP_COVERAGE=true test diff --git a/mlos_bench/mlos_bench/tests/__init__.py b/mlos_bench/mlos_bench/tests/__init__.py index cde179f572..b6bee1e0b3 100644 --- a/mlos_bench/mlos_bench/tests/__init__.py +++ b/mlos_bench/mlos_bench/tests/__init__.py @@ -171,18 +171,22 @@ def wait_docker_service_healthy( def wait_docker_service_socket(docker_services: DockerServices, hostname: str, port: int) -> None: """Wait until a docker service is ready.""" - import logging import threading - _LOG = logging.getLogger(__name__) thread_id = threading.get_ident() - _LOG.info("[Thread %s] Waiting for %s:%d to become responsive", thread_id, hostname, port) + warn( + "[Thread %s] Waiting for %s:%d to become responsive" % (thread_id, hostname, port), + UserWarning, + ) def check_with_logging() -> bool: result = check_socket(hostname, port) if not result: - _LOG.debug("[Thread %s] Socket check failed for %s:%d", thread_id, hostname, port) + warn( + "[Thread %s] Socket check failed for %s:%d" % (thread_id, hostname, port), + UserWarning, + ) return result try: @@ -191,9 +195,15 @@ def check_with_logging() -> bool: timeout=60.0, pause=0.5, ) - _LOG.info("[Thread %s] Socket %s:%d is now responsive", thread_id, hostname, port) + warn( + "[Thread %s] Socket %s:%d is now responsive" % (thread_id, hostname, port), + UserWarning, + ) except Exception as e: - _LOG.error("[Thread %s] Failed waiting for %s:%d: %s", thread_id, hostname, port, e) + warn( + "[Thread %s] Failed waiting for %s:%d: %s" % (thread_id, hostname, port, e), + UserWarning, + ) raise @@ -217,29 +227,31 @@ def check_socket(host: str, port: int, timeout: float = 1.0) -> bool: result = sock.connect_ex((host, port)) success = result == 0 if not success: - import logging import threading - _LOG = logging.getLogger(__name__) - _LOG.debug( - "[Thread %s] Socket connection to %s:%d failed with code %d", - threading.get_ident(), - host, - port, - result, + warn( + "[Thread %s] Socket connection to %s:%d failed with code %d" + % ( + threading.get_ident(), + host, + port, + result, + ), + UserWarning, ) return success except Exception as e: - import logging import threading - _LOG = logging.getLogger(__name__) - _LOG.debug( - "[Thread %s] Socket check 
exception for %s:%d: %s", - threading.get_ident(), - host, - port, - e, + warn( + "[Thread %s] Socket check exception for %s:%d: %s" + % ( + threading.get_ident(), + host, + port, + e, + ), + UserWarning, ) return False diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py index 54f3d4c025..7a54c1e61f 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py @@ -6,6 +6,7 @@ import logging import threading +from warnings import warn from dataclasses import dataclass from subprocess import run @@ -41,18 +42,16 @@ def get_port(self, uncached: bool = False) -> int: Note: this value can change when the service restarts so we can't rely on the DockerServices. """ - _LOG = logging.getLogger(__name__) thread_id = threading.get_ident() if self._port is None or uncached: - _LOG.info( - "[Thread %s] Discovering port for %s (uncached=%s, cached_port=%s)", - thread_id, - self.service_name, - uncached, - self._port, + warn( + "[Thread %s] Discovering port for %s (uncached=%s, cached_port=%s)" + % (thread_id, self.service_name, uncached, self._port), + UserWarning, ) try: + # NOTE: this cache may become stale in another worker if the container restarts in one and the other worker doesn't notice the new port. port_cmd = run( ( f"docker compose -p {self.compose_project_name} " @@ -65,22 +64,33 @@ def get_port(self, uncached: bool = False) -> int: new_port = int(port_cmd.stdout.decode().strip().split(":")[1]) old_port = self._port self._port = new_port - _LOG.info( - "[Thread %s] Port for %s: %s -> %s (uncached=%s)", - thread_id, - self.service_name, - old_port, - new_port, - uncached, + warn( + "[Thread %s] Port for %s: %s -> %s (uncached=%s)" + % ( + thread_id, + self.service_name, + old_port, + new_port, + uncached, + ), + UserWarning, ) except Exception as e: - _LOG.error( - "[Thread %s] Failed to get port for %s: %s", thread_id, self.service_name, e + warn( + "[Thread %s] Failed to get port for %s: %s" + % ( + thread_id, + self.service_name, + e, + ), + UserWarning, ) raise else: - _LOG.debug( - "[Thread %s] Using cached port %s for %s", thread_id, self._port, self.service_name + warn( + "[Thread %s] Using cached port %s for %s" + % (thread_id, self._port, self.service_name), + UserWarning, ) return self._port @@ -114,7 +124,6 @@ def validate_connection(self, timeout: float = 2.0) -> bool: if self._port is None: return False - _LOG = logging.getLogger(__name__) thread_id = threading.get_ident() # Import here to avoid circular imports @@ -122,18 +131,24 @@ def validate_connection(self, timeout: float = 2.0) -> bool: is_connectable = check_socket(self.hostname, self._port, timeout) if not is_connectable: - _LOG.warning( - "[Thread %s] Connection validation FAILED for %s:%d - port may be stale!", - thread_id, - self.service_name, - self._port, + warn( + "[Thread %s] Connection validation FAILED for %s:%d - port may be stale!" + % ( + thread_id, + self.service_name, + self._port, + ), + UserWarning, ) else: - _LOG.debug( - "[Thread %s] Connection validation OK for %s:%d", - thread_id, - self.service_name, - self._port, + warn( + "[Thread %s] Connection validation OK for %s:%d" + % ( + thread_id, + self.service_name, + self._port, + ), + UserWarning, ) return is_connectable @@ -143,7 +158,6 @@ def get_port_with_validation(self, timeout: float = 2.0) -> int: This helps detect stale cached ports caused by container restarts. 
""" - _LOG = logging.getLogger(__name__) thread_id = threading.get_ident() # First try cached port @@ -151,11 +165,14 @@ def get_port_with_validation(self, timeout: float = 2.0) -> int: if self.validate_connection(timeout): return self._port else: - _LOG.warning( - "[Thread %s] Cached port %d for %s failed validation, refreshing...", - thread_id, - self._port, - self.service_name, + warn( + "[Thread %s] Cached port %d for %s failed validation, refreshing..." + % ( + thread_id, + self._port, + self.service_name, + ), + UserWarning, ) # Force refresh return self.get_port(uncached=True) diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py index e25a524c6e..3e12f67887 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py @@ -14,6 +14,7 @@ import threading from collections.abc import Generator from subprocess import run +from warnings import warn import pytest from pytest_docker.plugin import Services as DockerServices @@ -83,10 +84,9 @@ def ssh_test_server( """ # Get a copy of the ssh id_rsa key from the test ssh server. with tempfile.NamedTemporaryFile() as id_rsa_file: - _LOG = logging.getLogger(__name__) thread_id = threading.get_ident() - _LOG.info("[Thread %s] Setting up ssh_test_server fixture", thread_id) + warn("[Thread %s] Setting up ssh_test_server fixture" % (thread_id,), UserWarning) ssh_test_server_info = SshTestServerInfo( compose_project_name=docker_compose_project_name, @@ -97,7 +97,7 @@ def ssh_test_server( ) port = ssh_test_server_info.get_port() - _LOG.info("[Thread %s] Main SSH server discovered on port %d", thread_id, port) + warn("[Thread %s] Main SSH server discovered on port %d" % (thread_id, port), UserWarning) wait_docker_service_socket( locked_docker_services, @@ -105,7 +105,7 @@ def ssh_test_server( port, ) - _LOG.info("[Thread %s] Main SSH server validated and ready", thread_id) + warn("[Thread %s] Main SSH server validated and ready" % (thread_id,), UserWarning) id_rsa_src = f"/{ssh_test_server_info.username}/.ssh/id_rsa" docker_cp_cmd = ( f"docker compose -p {docker_compose_project_name} " @@ -141,10 +141,9 @@ def alt_test_server( # Note: The alt-server uses the same image as the ssh-server container, so # the id_rsa key and username should all match. # Only the host port it is allocate is different. - _LOG = logging.getLogger(__name__) thread_id = threading.get_ident() - _LOG.info("[Thread %s] Setting up alt_test_server fixture", thread_id) + warn("[Thread %s] Setting up alt_test_server fixture" % (thread_id,), UserWarning) alt_test_server_info = SshTestServerInfo( compose_project_name=ssh_test_server.compose_project_name, @@ -155,7 +154,7 @@ def alt_test_server( ) port = alt_test_server_info.get_port() - _LOG.info("[Thread %s] Alt SSH server discovered on port %d", thread_id, port) + warn("[Thread %s] Alt SSH server discovered on port %d" % (thread_id, port), UserWarning) wait_docker_service_socket( locked_docker_services, @@ -163,7 +162,7 @@ def alt_test_server( port, ) - _LOG.info("[Thread %s] Alt SSH server validated and ready", thread_id) + warn("[Thread %s] Alt SSH server validated and ready" % (thread_id,), UserWarning) return alt_test_server_info @@ -180,10 +179,9 @@ def reboot_test_server( # Note: The reboot-server uses the same image as the ssh-server container, so # the id_rsa key and username should all match. # Only the host port it is allocate is different. 
- _LOG = logging.getLogger(__name__) thread_id = threading.get_ident() - _LOG.info("[Thread %s] Setting up reboot_test_server fixture", thread_id) + warn("[Thread %s] Setting up reboot_test_server fixture" % (thread_id,), UserWarning) reboot_test_server_info = SshTestServerInfo( compose_project_name=ssh_test_server.compose_project_name, @@ -194,7 +192,7 @@ def reboot_test_server( ) port = reboot_test_server_info.get_port() - _LOG.info("[Thread %s] Reboot SSH server discovered on port %d", thread_id, port) + warn("[Thread %s] Reboot SSH server discovered on port %d" % (thread_id, port), UserWarning) wait_docker_service_socket( locked_docker_services, @@ -202,7 +200,7 @@ def reboot_test_server( port, ) - _LOG.info("[Thread %s] Reboot SSH server validated and ready", thread_id) + warn("[Thread %s] Reboot SSH server validated and ready" % (thread_id,), UserWarning) return reboot_test_server_info diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py index 5275dfe6cf..8b623a5ffe 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py @@ -7,6 +7,7 @@ import logging import time from subprocess import CalledProcessError, run +from warnings import warn from pytest_docker.plugin import Services as DockerServices @@ -40,18 +41,22 @@ def test_ssh_service_remote_exec( thread_id = threading.get_ident() - _LOG.info( - "[Thread %s] test_ssh_service_remote_exec starting - validating server connections", - thread_id, + warn( + "[Thread %s] test_ssh_service_remote_exec starting - validating server connections" + % (thread_id,), + UserWarning, ) # Validate connections before starting test for server_info, name in [(ssh_test_server, "main"), (alt_test_server, "alt")]: if not server_info.validate_connection(): - _LOG.warning( - "[Thread %s] %s server connection validation failed, may have stale port", - thread_id, - name, + warn( + "[Thread %s] %s server connection validation failed, may have stale port" + % ( + thread_id, + name, + ), + UserWarning, ) with ssh_host_service: @@ -175,20 +180,26 @@ def check_ssh_service_reboot( thread_id = threading.get_ident() - _LOG.warning( - "[Thread %s] *** STARTING REBOOT TEST (graceful=%s) - This will change container ports! ***", - thread_id, - graceful, + warn( + "[Thread %s] *** STARTING REBOOT TEST (graceful=%s) - This will change container ports! 
***" + % ( + thread_id, + graceful, + ), + UserWarning, ) with ssh_host_service: reboot_test_srv_ssh_svc_conf = reboot_test_server.to_ssh_service_config(uncached=True) original_port = reboot_test_srv_ssh_svc_conf["ssh_port"] - _LOG.warning( - "[Thread %s] Reboot test using original port %d for %s", - thread_id, - original_port, - reboot_test_server.service_name, + warn( + "[Thread %s] Reboot test using original port %d for %s" + % ( + thread_id, + original_port, + reboot_test_server.service_name, + ), + UserWarning, ) (status, results_info) = ssh_host_service.remote_exec( script=['echo "sleeping..."', "sleep 30", 'echo "should not reach this point"'], @@ -230,38 +241,50 @@ def check_ssh_service_reboot( capture_output=True, check=False, ) - _LOG.info( - "[Thread %s] Docker ps output (attempt %d):\nSTDOUT:\n%s\nSTDERR:\n%s", - thread_id, - attempt + 1, - run_res.stdout.decode(), - run_res.stderr.decode(), + warn( + "[Thread %s] Docker ps output (attempt %d):\nSTDOUT:\n%s\nSTDERR:\n%s" + % ( + thread_id, + attempt + 1, + run_res.stdout.decode(), + run_res.stderr.decode(), + ), + UserWarning, ) reboot_test_srv_ssh_svc_conf_new = reboot_test_server.to_ssh_service_config( uncached=True ) new_port = reboot_test_srv_ssh_svc_conf_new["ssh_port"] - _LOG.warning( - "[Thread %s] Port check attempt %d: %d -> %d", - thread_id, - attempt + 1, - original_port, - new_port, - ) - if new_port != original_port: - _LOG.warning( - "[Thread %s] *** PORT CHANGED: %d -> %d (this affects ALL workers!) ***", + warn( + "[Thread %s] Port check attempt %d: %d -> %d" + % ( thread_id, + attempt + 1, original_port, new_port, + ), + UserWarning, + ) + if new_port != original_port: + warn( + "[Thread %s] *** PORT CHANGED: %d -> %d (this affects ALL workers!) ***" + % ( + thread_id, + original_port, + new_port, + ), + UserWarning, ) break except CalledProcessError as ex: - _LOG.error( - "[Thread %s] Failed to check port for reboot test server (attempt %d): %s", - thread_id, - attempt + 1, - ex, + warn( + "[Thread %s] Failed to check port for reboot test server (attempt %d): %s" + % ( + thread_id, + attempt + 1, + ex, + ), + UserWarning, ) assert ( reboot_test_srv_ssh_svc_conf_new["ssh_port"] @@ -283,9 +306,10 @@ def check_ssh_service_reboot( assert status.is_succeeded() assert results["stdout"].strip() == REBOOT_TEST_SERVER_NAME - _LOG.warning( - "[Thread %s] *** REBOOT TEST COMPLETED - Other workers may now have stale cached ports! ***", - thread_id, + warn( + "[Thread %s] *** REBOOT TEST COMPLETED - Other workers may now have stale cached ports! 
***" + % (thread_id,), + UserWarning, ) diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py index eac3320e3d..f8523cfb50 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py @@ -6,6 +6,7 @@ import asyncio import time +from warnings import warn from importlib.metadata import PackageNotFoundError, version from subprocess import run from threading import Thread @@ -52,13 +53,14 @@ ) def test_ssh_service_test_infra(ssh_test_server_info: SshTestServerInfo, server_name: str) -> None: """Check for the pytest-docker ssh test infra.""" - import logging import threading - _LOG = logging.getLogger(__name__) thread_id = threading.get_ident() - _LOG.info("[Thread %s] test_ssh_service_test_infra starting with %s", thread_id, server_name) + warn( + "[Thread %s] test_ssh_service_test_infra starting with %s" % (thread_id, server_name), + UserWarning, + ) assert ssh_test_server_info.service_name == server_name @@ -67,14 +69,14 @@ def test_ssh_service_test_infra(ssh_test_server_info: SshTestServerInfo, server_ # Use validation method to detect stale ports if not ssh_test_server_info.validate_connection(): - _LOG.warning( - "[Thread %s] Cached port validation failed, getting fresh port for %s", - thread_id, - server_name, + warn( + "[Thread %s] Cached port validation failed, getting fresh port for %s" + % (thread_id, server_name), + UserWarning, ) local_port = ssh_test_server_info.get_port_with_validation() - _LOG.info("[Thread %s] Using port %d for %s", thread_id, local_port, server_name) + warn("[Thread %s] Using port %d for %s" % (thread_id, local_port, server_name), UserWarning) assert check_socket(ip_addr, local_port) ssh_cmd = ( "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new " diff --git a/setup.cfg b/setup.cfg index 6e596fb04e..1c1cfe1416 100644 --- a/setup.cfg +++ b/setup.cfg @@ -36,7 +36,6 @@ minversion = 7.1 # Run failed and new tests first by default. addopts = -vv - -s -l --ff --nf -n auto From ca5943243a74dd1a381a1898dba5e74095d5335c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 28 Oct 2025 18:32:58 +0000 Subject: [PATCH 60/76] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../mlos_bench/tests/services/remote/ssh/__init__.py | 2 +- .../mlos_bench/tests/services/remote/ssh/fixtures.py | 12 ++++++------ .../tests/services/remote/ssh/test_ssh_service.py | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py index 7a54c1e61f..51545d1207 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py @@ -6,9 +6,9 @@ import logging import threading -from warnings import warn from dataclasses import dataclass from subprocess import run +from warnings import warn # The SSH test server port and name. 
# See Also: docker-compose.yml diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py index 3e12f67887..4b795caa29 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py @@ -86,7 +86,7 @@ def ssh_test_server( with tempfile.NamedTemporaryFile() as id_rsa_file: thread_id = threading.get_ident() - warn("[Thread %s] Setting up ssh_test_server fixture" % (thread_id,), UserWarning) + warn("[Thread {}] Setting up ssh_test_server fixture".format(thread_id), UserWarning) ssh_test_server_info = SshTestServerInfo( compose_project_name=docker_compose_project_name, @@ -105,7 +105,7 @@ def ssh_test_server( port, ) - warn("[Thread %s] Main SSH server validated and ready" % (thread_id,), UserWarning) + warn("[Thread {}] Main SSH server validated and ready".format(thread_id), UserWarning) id_rsa_src = f"/{ssh_test_server_info.username}/.ssh/id_rsa" docker_cp_cmd = ( f"docker compose -p {docker_compose_project_name} " @@ -143,7 +143,7 @@ def alt_test_server( # Only the host port it is allocate is different. thread_id = threading.get_ident() - warn("[Thread %s] Setting up alt_test_server fixture" % (thread_id,), UserWarning) + warn("[Thread {}] Setting up alt_test_server fixture".format(thread_id), UserWarning) alt_test_server_info = SshTestServerInfo( compose_project_name=ssh_test_server.compose_project_name, @@ -162,7 +162,7 @@ def alt_test_server( port, ) - warn("[Thread %s] Alt SSH server validated and ready" % (thread_id,), UserWarning) + warn("[Thread {}] Alt SSH server validated and ready".format(thread_id), UserWarning) return alt_test_server_info @@ -181,7 +181,7 @@ def reboot_test_server( # Only the host port it is allocate is different. 
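# A minimal sketch (not from the patch) of the readiness pattern these fixtures
# share: discover the published port, then block until the socket answers before
# handing the server info to tests. Stripped down, it is pytest-docker's
# wait_until_responsive() driving the repo's check_socket() helper; the
# localhost/2254 defaults below are placeholders for this test infra.

from pytest_docker.plugin import Services as DockerServices

from mlos_bench.tests import check_socket


def wait_for_ssh_socket(docker_services: DockerServices,
                        host: str = "localhost", port: int = 2254) -> None:
    """Block until the SSH socket accepts connections or the timeout expires."""
    docker_services.wait_until_responsive(
        check=lambda: check_socket(host, port),
        timeout=60.0,
        pause=0.5,
    )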
thread_id = threading.get_ident() - warn("[Thread %s] Setting up reboot_test_server fixture" % (thread_id,), UserWarning) + warn("[Thread {}] Setting up reboot_test_server fixture".format(thread_id), UserWarning) reboot_test_server_info = SshTestServerInfo( compose_project_name=ssh_test_server.compose_project_name, @@ -200,7 +200,7 @@ def reboot_test_server( port, ) - warn("[Thread %s] Reboot SSH server validated and ready" % (thread_id,), UserWarning) + warn("[Thread {}] Reboot SSH server validated and ready".format(thread_id), UserWarning) return reboot_test_server_info diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py index f8523cfb50..b522f4e289 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py @@ -6,10 +6,10 @@ import asyncio import time -from warnings import warn from importlib.metadata import PackageNotFoundError, version from subprocess import run from threading import Thread +from warnings import warn import pytest from pytest_lazy_fixtures.lazy_fixture import lf as lazy_fixture @@ -58,7 +58,7 @@ def test_ssh_service_test_infra(ssh_test_server_info: SshTestServerInfo, server_ thread_id = threading.get_ident() warn( - "[Thread %s] test_ssh_service_test_infra starting with %s" % (thread_id, server_name), + "[Thread {}] test_ssh_service_test_infra starting with {}".format(thread_id, server_name), UserWarning, ) From 590481714fa17e76be16eaabace57c2069f57639 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 28 Oct 2025 11:34:52 -0700 Subject: [PATCH 61/76] more logs --- .../mlos_bench/tests/services/remote/ssh/__init__.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py index 51545d1207..5df977a458 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py @@ -44,12 +44,13 @@ def get_port(self, uncached: bool = False) -> int: """ thread_id = threading.get_ident() + warn( + "[Thread %s] Discovering port for %s (uncached=%s, cached_port=%s)" + % (thread_id, self.service_name, uncached, self._port), + UserWarning, + ) + if self._port is None or uncached: - warn( - "[Thread %s] Discovering port for %s (uncached=%s, cached_port=%s)" - % (thread_id, self.service_name, uncached, self._port), - UserWarning, - ) try: # NOTE: this cache may become stale in another worker if the container restarts in one and the other worker doesn't notice the new port. 
port_cmd = run( From 74fa084a70f967a63b502437f93d32114ca6c1c3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 28 Oct 2025 18:35:24 +0000 Subject: [PATCH 62/76] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../mlos_bench/tests/services/remote/ssh/fixtures.py | 12 ++++++------ .../tests/services/remote/ssh/test_ssh_service.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py index 4b795caa29..a8896563f5 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py @@ -86,7 +86,7 @@ def ssh_test_server( with tempfile.NamedTemporaryFile() as id_rsa_file: thread_id = threading.get_ident() - warn("[Thread {}] Setting up ssh_test_server fixture".format(thread_id), UserWarning) + warn(f"[Thread {thread_id}] Setting up ssh_test_server fixture", UserWarning) ssh_test_server_info = SshTestServerInfo( compose_project_name=docker_compose_project_name, @@ -105,7 +105,7 @@ def ssh_test_server( port, ) - warn("[Thread {}] Main SSH server validated and ready".format(thread_id), UserWarning) + warn(f"[Thread {thread_id}] Main SSH server validated and ready", UserWarning) id_rsa_src = f"/{ssh_test_server_info.username}/.ssh/id_rsa" docker_cp_cmd = ( f"docker compose -p {docker_compose_project_name} " @@ -143,7 +143,7 @@ def alt_test_server( # Only the host port it is allocate is different. thread_id = threading.get_ident() - warn("[Thread {}] Setting up alt_test_server fixture".format(thread_id), UserWarning) + warn(f"[Thread {thread_id}] Setting up alt_test_server fixture", UserWarning) alt_test_server_info = SshTestServerInfo( compose_project_name=ssh_test_server.compose_project_name, @@ -162,7 +162,7 @@ def alt_test_server( port, ) - warn("[Thread {}] Alt SSH server validated and ready".format(thread_id), UserWarning) + warn(f"[Thread {thread_id}] Alt SSH server validated and ready", UserWarning) return alt_test_server_info @@ -181,7 +181,7 @@ def reboot_test_server( # Only the host port it is allocate is different. 
thread_id = threading.get_ident() - warn("[Thread {}] Setting up reboot_test_server fixture".format(thread_id), UserWarning) + warn(f"[Thread {thread_id}] Setting up reboot_test_server fixture", UserWarning) reboot_test_server_info = SshTestServerInfo( compose_project_name=ssh_test_server.compose_project_name, @@ -200,7 +200,7 @@ def reboot_test_server( port, ) - warn("[Thread {}] Reboot SSH server validated and ready".format(thread_id), UserWarning) + warn(f"[Thread {thread_id}] Reboot SSH server validated and ready", UserWarning) return reboot_test_server_info diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py index b522f4e289..c731e27a87 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py @@ -58,7 +58,7 @@ def test_ssh_service_test_infra(ssh_test_server_info: SshTestServerInfo, server_ thread_id = threading.get_ident() warn( - "[Thread {}] test_ssh_service_test_infra starting with {}".format(thread_id, server_name), + f"[Thread {thread_id}] test_ssh_service_test_infra starting with {server_name}", UserWarning, ) From 421e2c5744fc55e0aa7253deab7e83f2ef425cd9 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 28 Oct 2025 15:34:40 -0700 Subject: [PATCH 63/76] test logging --- mlos_bench/mlos_bench/tests/__init__.py | 5 +++++ .../mlos_bench/tests/services/remote/ssh/fixtures.py | 5 +++++ .../tests/services/remote/ssh/test_ssh_service.py | 7 +++++++ setup.cfg | 4 ++++ 4 files changed, 21 insertions(+) diff --git a/mlos_bench/mlos_bench/tests/__init__.py b/mlos_bench/mlos_bench/tests/__init__.py index b6bee1e0b3..b6828e07b7 100644 --- a/mlos_bench/mlos_bench/tests/__init__.py +++ b/mlos_bench/mlos_bench/tests/__init__.py @@ -171,6 +171,11 @@ def wait_docker_service_healthy( def wait_docker_service_socket(docker_services: DockerServices, hostname: str, port: int) -> None: """Wait until a docker service is ready.""" + import logging + + _LOG = logging.getLogger(__name__) + _LOG.info("log in wait_docker_service_socket: %s", __name__) + import threading thread_id = threading.get_ident() diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py index a8896563f5..c81a5cf636 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py @@ -99,6 +99,11 @@ def ssh_test_server( port = ssh_test_server_info.get_port() warn("[Thread %s] Main SSH server discovered on port %d" % (thread_id, port), UserWarning) + import logging + + _LOG = logging.getLogger(__name__) + _LOG.info("log in ssh_test_server fixture: %s", __name__) + wait_docker_service_socket( locked_docker_services, ssh_test_server_info.hostname, diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py index c731e27a87..e34b20b82c 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py @@ -9,6 +9,7 @@ from importlib.metadata import PackageNotFoundError, version from subprocess import run from threading import Thread +from venv import logger from warnings import warn import pytest @@ -86,6 +87,12 @@ def test_ssh_service_test_infra(ssh_test_server_info: SshTestServerInfo, server_ cmd = run(ssh_cmd.split(), 
capture_output=True, text=True, check=True) assert cmd.stdout.strip() == server_name + import logging + + _LOG = logging.getLogger(__name__) + _LOG.info("log in test: %s", __name__) + assert False, "Force failure for debugging" + @pytest.mark.filterwarnings( "ignore:.*(coroutine 'sleep' was never awaited).*:RuntimeWarning:.*event_loop_context_test.*:0" diff --git a/setup.cfg b/setup.cfg index 1c1cfe1416..7746bfe080 100644 --- a/setup.cfg +++ b/setup.cfg @@ -31,6 +31,10 @@ extend-ignore = E203 [tool:pytest] minversion = 7.1 +log_cli = true +log_cli_level = INFO +log_cli_format = %(asctime)s [%(levelname)s] %(name)s: %(message)s +log_cli_date_format = %Y-%m-%d %H:%M:%S.%N #pythonpath = . # Run all tests to completion on multiple processes. # Run failed and new tests first by default. From 0f52e5f19e88106617e881241f0aa0ca3a7a60e0 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 28 Oct 2025 15:45:16 -0700 Subject: [PATCH 64/76] partial revert to use logger instead of warn() --- mlos_bench/mlos_bench/tests/__init__.py | 59 ++++------ .../tests/services/remote/ssh/__init__.py | 89 ++++++--------- .../tests/services/remote/ssh/fixtures.py | 27 ++--- .../remote/ssh/test_ssh_host_service.py | 103 +++++++----------- .../services/remote/ssh/test_ssh_service.py | 25 ++--- 5 files changed, 117 insertions(+), 186 deletions(-) diff --git a/mlos_bench/mlos_bench/tests/__init__.py b/mlos_bench/mlos_bench/tests/__init__.py index b6828e07b7..cde179f572 100644 --- a/mlos_bench/mlos_bench/tests/__init__.py +++ b/mlos_bench/mlos_bench/tests/__init__.py @@ -172,26 +172,17 @@ def wait_docker_service_healthy( def wait_docker_service_socket(docker_services: DockerServices, hostname: str, port: int) -> None: """Wait until a docker service is ready.""" import logging - - _LOG = logging.getLogger(__name__) - _LOG.info("log in wait_docker_service_socket: %s", __name__) - import threading + _LOG = logging.getLogger(__name__) thread_id = threading.get_ident() - warn( - "[Thread %s] Waiting for %s:%d to become responsive" % (thread_id, hostname, port), - UserWarning, - ) + _LOG.info("[Thread %s] Waiting for %s:%d to become responsive", thread_id, hostname, port) def check_with_logging() -> bool: result = check_socket(hostname, port) if not result: - warn( - "[Thread %s] Socket check failed for %s:%d" % (thread_id, hostname, port), - UserWarning, - ) + _LOG.debug("[Thread %s] Socket check failed for %s:%d", thread_id, hostname, port) return result try: @@ -200,15 +191,9 @@ def check_with_logging() -> bool: timeout=60.0, pause=0.5, ) - warn( - "[Thread %s] Socket %s:%d is now responsive" % (thread_id, hostname, port), - UserWarning, - ) + _LOG.info("[Thread %s] Socket %s:%d is now responsive", thread_id, hostname, port) except Exception as e: - warn( - "[Thread %s] Failed waiting for %s:%d: %s" % (thread_id, hostname, port, e), - UserWarning, - ) + _LOG.error("[Thread %s] Failed waiting for %s:%d: %s", thread_id, hostname, port, e) raise @@ -232,31 +217,29 @@ def check_socket(host: str, port: int, timeout: float = 1.0) -> bool: result = sock.connect_ex((host, port)) success = result == 0 if not success: + import logging import threading - warn( - "[Thread %s] Socket connection to %s:%d failed with code %d" - % ( - threading.get_ident(), - host, - port, - result, - ), - UserWarning, + _LOG = logging.getLogger(__name__) + _LOG.debug( + "[Thread %s] Socket connection to %s:%d failed with code %d", + threading.get_ident(), + host, + port, + result, ) return success except Exception as e: + import logging import threading - 
warn( - "[Thread %s] Socket check exception for %s:%d: %s" - % ( - threading.get_ident(), - host, - port, - e, - ), - UserWarning, + _LOG = logging.getLogger(__name__) + _LOG.debug( + "[Thread %s] Socket check exception for %s:%d: %s", + threading.get_ident(), + host, + port, + e, ) return False diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py index 5df977a458..7c70d95221 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py @@ -8,7 +8,6 @@ import threading from dataclasses import dataclass from subprocess import run -from warnings import warn # The SSH test server port and name. # See Also: docker-compose.yml @@ -42,15 +41,17 @@ def get_port(self, uncached: bool = False) -> int: Note: this value can change when the service restarts so we can't rely on the DockerServices. """ + _LOG = logging.getLogger(__name__) thread_id = threading.get_ident() - warn( - "[Thread %s] Discovering port for %s (uncached=%s, cached_port=%s)" - % (thread_id, self.service_name, uncached, self._port), - UserWarning, - ) - if self._port is None or uncached: + _LOG.info( + "[Thread %s] Discovering port for %s (uncached=%s, cached_port=%s)", + thread_id, + self.service_name, + uncached, + self._port, + ) try: # NOTE: this cache may become stale in another worker if the container restarts in one and the other worker doesn't notice the new port. port_cmd = run( @@ -65,33 +66,22 @@ def get_port(self, uncached: bool = False) -> int: new_port = int(port_cmd.stdout.decode().strip().split(":")[1]) old_port = self._port self._port = new_port - warn( - "[Thread %s] Port for %s: %s -> %s (uncached=%s)" - % ( - thread_id, - self.service_name, - old_port, - new_port, - uncached, - ), - UserWarning, + _LOG.info( + "[Thread %s] Port for %s: %s -> %s (uncached=%s)", + thread_id, + self.service_name, + old_port, + new_port, + uncached, ) except Exception as e: - warn( - "[Thread %s] Failed to get port for %s: %s" - % ( - thread_id, - self.service_name, - e, - ), - UserWarning, + _LOG.error( + "[Thread %s] Failed to get port for %s: %s", thread_id, self.service_name, e ) raise else: - warn( - "[Thread %s] Using cached port %s for %s" - % (thread_id, self._port, self.service_name), - UserWarning, + _LOG.debug( + "[Thread %s] Using cached port %s for %s", thread_id, self._port, self.service_name ) return self._port @@ -125,6 +115,7 @@ def validate_connection(self, timeout: float = 2.0) -> bool: if self._port is None: return False + _LOG = logging.getLogger(__name__) thread_id = threading.get_ident() # Import here to avoid circular imports @@ -132,24 +123,18 @@ def validate_connection(self, timeout: float = 2.0) -> bool: is_connectable = check_socket(self.hostname, self._port, timeout) if not is_connectable: - warn( - "[Thread %s] Connection validation FAILED for %s:%d - port may be stale!" 
- % ( - thread_id, - self.service_name, - self._port, - ), - UserWarning, + _LOG.warning( + "[Thread %s] Connection validation FAILED for %s:%d - port may be stale!", + thread_id, + self.service_name, + self._port, ) else: - warn( - "[Thread %s] Connection validation OK for %s:%d" - % ( - thread_id, - self.service_name, - self._port, - ), - UserWarning, + _LOG.debug( + "[Thread %s] Connection validation OK for %s:%d", + thread_id, + self.service_name, + self._port, ) return is_connectable @@ -159,6 +144,7 @@ def get_port_with_validation(self, timeout: float = 2.0) -> int: This helps detect stale cached ports caused by container restarts. """ + _LOG = logging.getLogger(__name__) thread_id = threading.get_ident() # First try cached port @@ -166,14 +152,11 @@ def get_port_with_validation(self, timeout: float = 2.0) -> int: if self.validate_connection(timeout): return self._port else: - warn( - "[Thread %s] Cached port %d for %s failed validation, refreshing..." - % ( - thread_id, - self._port, - self.service_name, - ), - UserWarning, + _LOG.warning( + "[Thread %s] Cached port %d for %s failed validation, refreshing...", + thread_id, + self._port, + self.service_name, ) # Force refresh return self.get_port(uncached=True) diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py index c81a5cf636..e25a524c6e 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py @@ -14,7 +14,6 @@ import threading from collections.abc import Generator from subprocess import run -from warnings import warn import pytest from pytest_docker.plugin import Services as DockerServices @@ -84,9 +83,10 @@ def ssh_test_server( """ # Get a copy of the ssh id_rsa key from the test ssh server. with tempfile.NamedTemporaryFile() as id_rsa_file: + _LOG = logging.getLogger(__name__) thread_id = threading.get_ident() - warn(f"[Thread {thread_id}] Setting up ssh_test_server fixture", UserWarning) + _LOG.info("[Thread %s] Setting up ssh_test_server fixture", thread_id) ssh_test_server_info = SshTestServerInfo( compose_project_name=docker_compose_project_name, @@ -97,12 +97,7 @@ def ssh_test_server( ) port = ssh_test_server_info.get_port() - warn("[Thread %s] Main SSH server discovered on port %d" % (thread_id, port), UserWarning) - - import logging - - _LOG = logging.getLogger(__name__) - _LOG.info("log in ssh_test_server fixture: %s", __name__) + _LOG.info("[Thread %s] Main SSH server discovered on port %d", thread_id, port) wait_docker_service_socket( locked_docker_services, @@ -110,7 +105,7 @@ def ssh_test_server( port, ) - warn(f"[Thread {thread_id}] Main SSH server validated and ready", UserWarning) + _LOG.info("[Thread %s] Main SSH server validated and ready", thread_id) id_rsa_src = f"/{ssh_test_server_info.username}/.ssh/id_rsa" docker_cp_cmd = ( f"docker compose -p {docker_compose_project_name} " @@ -146,9 +141,10 @@ def alt_test_server( # Note: The alt-server uses the same image as the ssh-server container, so # the id_rsa key and username should all match. # Only the host port it is allocate is different. 
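# A minimal sketch (not from the patch) of how a caller can survive the reboot
# test's port change by leaning on the validation helpers instead of the raw
# cache. The constructor values below (project name, hostname, username, key
# path) are placeholders; the methods are the ones defined in this package's
# __init__.py.

from mlos_bench.tests.services.remote.ssh import SshTestServerInfo

info = SshTestServerInfo(
    compose_project_name="pytest-docker-mlos",  # placeholder project name
    service_name="ssh-server",
    hostname="localhost",                       # placeholder hostname
    username="root",                            # placeholder user
    id_rsa_path="/tmp/id_rsa",                  # placeholder key path
)
port = info.get_port()                   # may return a stale cached value after a reboot
if not info.validate_connection():       # quick socket probe against the cached port
    port = info.get_port(uncached=True)  # re-run `docker compose port` and refresh the cache
# ...or, equivalently, in one call:
port = info.get_port_with_validation()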
+ _LOG = logging.getLogger(__name__) thread_id = threading.get_ident() - warn(f"[Thread {thread_id}] Setting up alt_test_server fixture", UserWarning) + _LOG.info("[Thread %s] Setting up alt_test_server fixture", thread_id) alt_test_server_info = SshTestServerInfo( compose_project_name=ssh_test_server.compose_project_name, @@ -159,7 +155,7 @@ def alt_test_server( ) port = alt_test_server_info.get_port() - warn("[Thread %s] Alt SSH server discovered on port %d" % (thread_id, port), UserWarning) + _LOG.info("[Thread %s] Alt SSH server discovered on port %d", thread_id, port) wait_docker_service_socket( locked_docker_services, @@ -167,7 +163,7 @@ def alt_test_server( port, ) - warn(f"[Thread {thread_id}] Alt SSH server validated and ready", UserWarning) + _LOG.info("[Thread %s] Alt SSH server validated and ready", thread_id) return alt_test_server_info @@ -184,9 +180,10 @@ def reboot_test_server( # Note: The reboot-server uses the same image as the ssh-server container, so # the id_rsa key and username should all match. # Only the host port it is allocate is different. + _LOG = logging.getLogger(__name__) thread_id = threading.get_ident() - warn(f"[Thread {thread_id}] Setting up reboot_test_server fixture", UserWarning) + _LOG.info("[Thread %s] Setting up reboot_test_server fixture", thread_id) reboot_test_server_info = SshTestServerInfo( compose_project_name=ssh_test_server.compose_project_name, @@ -197,7 +194,7 @@ def reboot_test_server( ) port = reboot_test_server_info.get_port() - warn("[Thread %s] Reboot SSH server discovered on port %d" % (thread_id, port), UserWarning) + _LOG.info("[Thread %s] Reboot SSH server discovered on port %d", thread_id, port) wait_docker_service_socket( locked_docker_services, @@ -205,7 +202,7 @@ def reboot_test_server( port, ) - warn(f"[Thread {thread_id}] Reboot SSH server validated and ready", UserWarning) + _LOG.info("[Thread %s] Reboot SSH server validated and ready", thread_id) return reboot_test_server_info diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py index 8b623a5ffe..fb129f636b 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py @@ -7,7 +7,6 @@ import logging import time from subprocess import CalledProcessError, run -from warnings import warn from pytest_docker.plugin import Services as DockerServices @@ -41,22 +40,18 @@ def test_ssh_service_remote_exec( thread_id = threading.get_ident() - warn( - "[Thread %s] test_ssh_service_remote_exec starting - validating server connections" - % (thread_id,), - UserWarning, + _LOG.info( + "[Thread %s] test_ssh_service_remote_exec starting - validating server connections", + thread_id, ) # Validate connections before starting test for server_info, name in [(ssh_test_server, "main"), (alt_test_server, "alt")]: if not server_info.validate_connection(): - warn( - "[Thread %s] %s server connection validation failed, may have stale port" - % ( - thread_id, - name, - ), - UserWarning, + _LOG.warning( + "[Thread %s] %s server connection validation failed, may have stale port", + thread_id, + name, ) with ssh_host_service: @@ -180,26 +175,20 @@ def check_ssh_service_reboot( thread_id = threading.get_ident() - warn( - "[Thread %s] *** STARTING REBOOT TEST (graceful=%s) - This will change container ports! 
***" - % ( - thread_id, - graceful, - ), - UserWarning, + _LOG.warning( + "[Thread %s] *** STARTING REBOOT TEST (graceful=%s) - This will change container ports! ***", + thread_id, + graceful, ) with ssh_host_service: reboot_test_srv_ssh_svc_conf = reboot_test_server.to_ssh_service_config(uncached=True) original_port = reboot_test_srv_ssh_svc_conf["ssh_port"] - warn( - "[Thread %s] Reboot test using original port %d for %s" - % ( - thread_id, - original_port, - reboot_test_server.service_name, - ), - UserWarning, + _LOG.warning( + "[Thread %s] Reboot test using original port %d for %s", + thread_id, + original_port, + reboot_test_server.service_name, ) (status, results_info) = ssh_host_service.remote_exec( script=['echo "sleeping..."', "sleep 30", 'echo "should not reach this point"'], @@ -241,50 +230,39 @@ def check_ssh_service_reboot( capture_output=True, check=False, ) - warn( - "[Thread %s] Docker ps output (attempt %d):\nSTDOUT:\n%s\nSTDERR:\n%s" - % ( - thread_id, - attempt + 1, - run_res.stdout.decode(), - run_res.stderr.decode(), - ), - UserWarning, + _LOG.info( + "[Thread %s] Docker ps output (attempt %d):\nSTDOUT:\n%s\nSTDERR:\n%s", + thread_id, + attempt + 1, + run_res.stdout.decode(), + run_res.stderr.decode(), ) reboot_test_srv_ssh_svc_conf_new = reboot_test_server.to_ssh_service_config( uncached=True ) new_port = reboot_test_srv_ssh_svc_conf_new["ssh_port"] - warn( - "[Thread %s] Port check attempt %d: %d -> %d" - % ( + _LOG.warning( + "[Thread %s] Port check attempt %d: %d -> %d", + thread_id, + attempt + 1, + original_port, + new_port, + ) + if new_port != original_port: + _LOG.warning( + "[Thread %s] *** PORT CHANGED: %d -> %d (this affects ALL workers!) ***", thread_id, attempt + 1, original_port, new_port, - ), - UserWarning, - ) - if new_port != original_port: - warn( - "[Thread %s] *** PORT CHANGED: %d -> %d (this affects ALL workers!) ***" - % ( - thread_id, - original_port, - new_port, - ), - UserWarning, ) break except CalledProcessError as ex: - warn( - "[Thread %s] Failed to check port for reboot test server (attempt %d): %s" - % ( - thread_id, - attempt + 1, - ex, - ), - UserWarning, + _LOG.error( + "[Thread %s] Failed to check port for reboot test server (attempt %d): %s", + thread_id, + attempt + 1, + ex, ) assert ( reboot_test_srv_ssh_svc_conf_new["ssh_port"] @@ -306,10 +284,9 @@ def check_ssh_service_reboot( assert status.is_succeeded() assert results["stdout"].strip() == REBOOT_TEST_SERVER_NAME - warn( - "[Thread %s] *** REBOOT TEST COMPLETED - Other workers may now have stale cached ports! ***" - % (thread_id,), - UserWarning, + _LOG.warning( + "[Thread %s] *** REBOOT TEST COMPLETED - Other workers may now have stale cached ports! 
***", + thread_id, ) diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py index e34b20b82c..eac3320e3d 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py @@ -9,8 +9,6 @@ from importlib.metadata import PackageNotFoundError, version from subprocess import run from threading import Thread -from venv import logger -from warnings import warn import pytest from pytest_lazy_fixtures.lazy_fixture import lf as lazy_fixture @@ -54,14 +52,13 @@ ) def test_ssh_service_test_infra(ssh_test_server_info: SshTestServerInfo, server_name: str) -> None: """Check for the pytest-docker ssh test infra.""" + import logging import threading + _LOG = logging.getLogger(__name__) thread_id = threading.get_ident() - warn( - f"[Thread {thread_id}] test_ssh_service_test_infra starting with {server_name}", - UserWarning, - ) + _LOG.info("[Thread %s] test_ssh_service_test_infra starting with %s", thread_id, server_name) assert ssh_test_server_info.service_name == server_name @@ -70,14 +67,14 @@ def test_ssh_service_test_infra(ssh_test_server_info: SshTestServerInfo, server_ # Use validation method to detect stale ports if not ssh_test_server_info.validate_connection(): - warn( - "[Thread %s] Cached port validation failed, getting fresh port for %s" - % (thread_id, server_name), - UserWarning, + _LOG.warning( + "[Thread %s] Cached port validation failed, getting fresh port for %s", + thread_id, + server_name, ) local_port = ssh_test_server_info.get_port_with_validation() - warn("[Thread %s] Using port %d for %s" % (thread_id, local_port, server_name), UserWarning) + _LOG.info("[Thread %s] Using port %d for %s", thread_id, local_port, server_name) assert check_socket(ip_addr, local_port) ssh_cmd = ( "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new " @@ -87,12 +84,6 @@ def test_ssh_service_test_infra(ssh_test_server_info: SshTestServerInfo, server_ cmd = run(ssh_cmd.split(), capture_output=True, text=True, check=True) assert cmd.stdout.strip() == server_name - import logging - - _LOG = logging.getLogger(__name__) - _LOG.info("log in test: %s", __name__) - assert False, "Force failure for debugging" - @pytest.mark.filterwarnings( "ignore:.*(coroutine 'sleep' was never awaited).*:RuntimeWarning:.*event_loop_context_test.*:0" From 5ddff6459259aa3e393c281ecd65a18b08319834 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 28 Oct 2025 15:43:26 -0700 Subject: [PATCH 65/76] comment --- .../mlos_bench/tests/services/remote/ssh/test_ssh_service.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py index eac3320e3d..c35db27abb 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py @@ -84,6 +84,8 @@ def test_ssh_service_test_infra(ssh_test_server_info: SshTestServerInfo, server_ cmd = run(ssh_cmd.split(), capture_output=True, text=True, check=True) assert cmd.stdout.strip() == server_name + # assert False, "Force failure for debugging." 
+ @pytest.mark.filterwarnings( "ignore:.*(coroutine 'sleep' was never awaited).*:RuntimeWarning:.*event_loop_context_test.*:0" From a59c2a8c06a5a7d8984d44d3ad87ae0fb8ee8f4b Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 28 Oct 2025 17:02:40 -0700 Subject: [PATCH 66/76] logging tweaks --- conftest.py | 20 ++++++++++++++++++++ mlos_bench/mlos_bench/launcher.py | 3 ++- setup.cfg | 9 +++++++-- 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/conftest.py b/conftest.py index 84a04727c4..98c19374e8 100644 --- a/conftest.py +++ b/conftest.py @@ -7,6 +7,7 @@ # Note: This file is named conftest.py so that pytest picks it up automatically # without the need to adjust PYTHONPATH or sys.path as much. +import logging import os import shutil from tempfile import mkdtemp @@ -15,6 +16,12 @@ import pytest from xdist.workermanage import WorkerController +LOG_FMT = ( + "%(asctime)s.%(msecs)03d [%(process)d][%(threadName)s] " + "[%(filename)s:%(lineno)d %(funcName)s] " + "%(levelname)s: %(message)s" +) +DATE_FMT = "%Y-%m-%d %H:%M:%S" def is_master(config: pytest.Config) -> bool: """True if the code running the given pytest.config object is running in a xdist @@ -56,6 +63,19 @@ def pytest_configure(config: pytest.Config) -> None: # Add it to the config so that it can passed to the worker nodes. setattr(config, "shared_temp_dir", mkdtemp()) + # Configure per-worker log file. + worker_id = getattr(config, "workerinput", {}).get("workerid", "master") + os.makedirs("logs", exist_ok=True) + log_file_path = os.path.join("logs", f"pytest-{worker_id}.log") + file_handler = logging.FileHandler(log_file_path, mode="w") + if logging.getLogger().getEffectiveLevel() == logging.DEBUG: + file_handler.setLevel(logging.DEBUG) + else: + file_handler.setLevel(logging.INFO) + # logging.basicConfig(level=file_handler.level, format=LOG_FMT, datefmt=DATE_FMT) + log_formatter = logging.Formatter(fmt=LOG_FMT, datefmt=DATE_FMT) + file_handler.setFormatter(log_formatter) + logging.getLogger().addHandler(file_handler) def pytest_configure_node(node: WorkerController) -> None: """Xdist hook used to inform workers of the location of the shared temp dir.""" diff --git a/mlos_bench/mlos_bench/launcher.py b/mlos_bench/mlos_bench/launcher.py index c728ed7fb2..4f49d2c7d4 100644 --- a/mlos_bench/mlos_bench/launcher.py +++ b/mlos_bench/mlos_bench/launcher.py @@ -34,8 +34,9 @@ from mlos_bench.util import try_parse_val _LOG_LEVEL = logging.INFO -_LOG_FORMAT = "%(asctime)s %(filename)s:%(lineno)d %(funcName)s %(levelname)s %(message)s" +_LOG_FORMAT = "%(asctime)s [%(processName)s] [%(threadName)s] %(filename)s:%(lineno)d %(funcName)s %(levelname)s %(message)s" logging.basicConfig(level=_LOG_LEVEL, format=_LOG_FORMAT) +logging.basicConfig(datefmt="%Y-%m-%d %H:%M:%S.%f") _LOG = logging.getLogger(__name__) diff --git a/setup.cfg b/setup.cfg index 7746bfe080..2a6e3ecc7f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -31,10 +31,15 @@ extend-ignore = E203 [tool:pytest] minversion = 7.1 +# Enable the CLI log so that when xdist runners fail a particular test, the +# fixture setup code is also captured. +# Note: this can be a little noisy when running without xdist (e.g., -n0), but +# we typically only do that for single tests while debugging. +# See Also: conftest.py where we also configure file based logging. 
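# A minimal sketch (not from the patch): with log_cli enabled here and the
# per-worker FileHandler installed in conftest.py above, test code only needs a
# plain module-level logger - records show up live in the console and land in
# logs/pytest-<workerid>.log for the worker that ran the test. A hypothetical
# test module would just do:
#
#   import logging
#
#   _LOG = logging.getLogger(__name__)
#
#   def test_something() -> None:
#       _LOG.info("captured by the CLI handler and the per-worker log file")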
log_cli = true log_cli_level = INFO -log_cli_format = %(asctime)s [%(levelname)s] %(name)s: %(message)s -log_cli_date_format = %Y-%m-%d %H:%M:%S.%N +log_cli_format = %(asctime)s.%(msecs)03d [%(process)d][%(threadName)s] [%(filename)s:%(lineno)d %(funcName)s] %(levelname)s: %(message)s +log_cli_date_format = %Y-%m-%d %H:%M:%S #pythonpath = . # Run all tests to completion on multiple processes. # Run failed and new tests first by default. From 804bb918005975727729fb2d035936b4c1816cde Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 28 Oct 2025 21:49:23 -0700 Subject: [PATCH 67/76] adding pytest log capture and upload - even on failure --- .github/workflows/devcontainer.yml | 22 +++++++++++++++------- .github/workflows/linux.yml | 8 ++++++++ .github/workflows/macos.yml | 7 +++++++ .github/workflows/windows.yml | 7 +++++++ .gitignore | 3 +++ 5 files changed, 40 insertions(+), 7 deletions(-) diff --git a/.github/workflows/devcontainer.yml b/.github/workflows/devcontainer.yml index 1ea79d6757..0ab2f2ebe1 100644 --- a/.github/workflows/devcontainer.yml +++ b/.github/workflows/devcontainer.yml @@ -208,13 +208,6 @@ jobs: # Now actually run the tests. docker exec --user vscode --env USER=vscode mlos-devcontainer make CONDA_INFO_LEVEL=-v test - - name: Upload coverage report as build artifact - if: always() # ensures it runs even if tests fail - uses: actions/upload-artifact@v4 - with: - name: coverage-report - path: coverage.xml - - name: Generate and test binary distribution files timeout-minutes: 10 run: | @@ -298,6 +291,21 @@ jobs: docker tag mlos-devcontainer:latest ${{ secrets.ACR_LOGINURL }}/mlos-devcontainer:$image_tag docker push ${{ secrets.ACR_LOGINURL }}/mlos-devcontainer:$image_tag + # ensure these run even if tests fail, but only after everything else is done or skipped + - name: Upload coverage report as build artifact + if: always() + uses: actions/upload-artifact@v5 + with: + name: coverage-report + path: coverage.xml + + - name: Upload pytest logs + if: always() + uses: actions/upload-artifact@v5 + with: + name: pytest-logs + path: logs/pytest*.log + retention-days: 7 PublishDocs: name: Publish Documentation diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index bb044b5ae6..7e9e25d65c 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -189,3 +189,11 @@ jobs: - name: Generate and test binary distribution files run: make CONDA_ENV_NAME=$CONDA_ENV_NAME CONDA_INFO_LEVEL=-v dist dist-test + + - name: Upload pytest logs + if: always() + uses: actions/upload-artifact@v5 + with: + name: pytest-logs + path: logs/pytest*.log + retention-days: 7 diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml index 37bd8e39e4..f61a22e6f2 100644 --- a/.github/workflows/macos.yml +++ b/.github/workflows/macos.yml @@ -163,6 +163,13 @@ jobs: - name: Generate and test binary distribution files run: make CONDA_ENV_NAME=$CONDA_ENV_NAME CONDA_INFO_LEVEL=-v dist dist-test + - name: Upload pytest logs + if: always() + uses: actions/upload-artifact@v5 + with: + name: pytest-logs + path: logs/pytest*.log + retention-days: 7 MacOSDevContainerBuildTest: name: MacOS DevContainer Build/Test diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index fde7611678..f4b1325032 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -142,6 +142,13 @@ jobs: run: | .github/workflows/build-dist-test.ps1 + - name: Upload pytest logs + if: always() + uses: actions/upload-artifact@v5 + with: + name: pytest-logs + 
path: logs/pytest*.log + retention-days: 7 WindowsDevContainerBuildTest: name: Windows DevContainer Build/Test diff --git a/.gitignore b/.gitignore index 471d653344..1bb01c59b7 100644 --- a/.gitignore +++ b/.gitignore @@ -171,3 +171,6 @@ build/*.build-stamp *.duckdb *.db.wal *.duckdb.wal + +# pytest logs +logs/pytest*.log From ff31b4e90c2dc01f5976d63aa76f1e8b41e8f27d Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 28 Oct 2025 23:01:09 -0700 Subject: [PATCH 68/76] adding a basic healthcheck to look for readiness of the container --- mlos_bench/mlos_bench/tests/services/remote/ssh/Dockerfile | 4 ++++ mlos_bench/mlos_bench/tests/services/remote/ssh/up.sh | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/Dockerfile b/mlos_bench/mlos_bench/tests/services/remote/ssh/Dockerfile index d157473dbf..87fee7faff 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/Dockerfile +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/Dockerfile @@ -6,6 +6,7 @@ RUN apt-get update \ openssh-server openssh-client \ sudo ARG PORT=2254 +ENV PORT=${PORT} EXPOSE ${PORT} RUN echo "Port ${PORT}" > /etc/ssh/sshd_config.d/local.conf \ && echo "PermitRootLogin prohibit-password" >> /etc/ssh/sshd_config.d/local.conf \ @@ -15,3 +16,6 @@ RUN echo "Port ${PORT}" > /etc/ssh/sshd_config.d/local.conf \ ENV TIMEOUT=180 # NOTE: The "-n" is important in the call to "wait" so that an exit failure can be detected. CMD ["/bin/bash", "-eux", "-c", "trap : TERM INT; service ssh start && sleep ${TIMEOUT:-180} & wait -n"] + +HEALTHCHECK --interval=5s --timeout=3s --retries=5 --start-period=1s \ + CMD ssh -o StrictHostKeyChecking=accept-new -o BatchMode=yes -o ConnectTimeout=2 -p ${PORT} -i /root/.ssh/id_rsa root@localhost 'echo ok' diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/up.sh b/mlos_bench/mlos_bench/tests/services/remote/ssh/up.sh index f0f152975d..24278d0b17 100755 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/up.sh +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/up.sh @@ -24,6 +24,13 @@ docker compose -p "$PROJECT_NAME" cp ssh-server:/root/.ssh/id_rsa ./id_rsa chmod 0600 ./id_rsa set +x +for server in ssh-server alt-server reboot-server; do + until [ "$(docker inspect -f '{{.State.Health.Status}}' $(docker compose -p ${PROJECT_NAME} ps -q ${server}))" = "healthy" ]; do + echo "Waiting for ${server} container to become healthy..." + sleep 1 + done +done + echo "OK: private key available at '$scriptdir/id_rsa'. 
Connect to the ssh-server container at the following port:" docker compose -p "$PROJECT_NAME" port ssh-server ${PORT:-2254} | cut -d: -f2 echo "INFO: And this port for the alt-server container:" From dae49cd0fc1556b1aaa7ce7fcd15b95eb0de5625 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 28 Oct 2025 23:58:51 -0700 Subject: [PATCH 69/76] pylint --- .../mlos_bench/tests/docker_fixtures_util.py | 81 ++++++++++- .../tests/services/remote/ssh/__init__.py | 133 ++++++++---------- .../tests/services/remote/ssh/fixtures.py | 28 ++-- .../remote/ssh/test_ssh_host_service.py | 48 +++---- .../services/remote/ssh/test_ssh_service.py | 20 ++- 5 files changed, 167 insertions(+), 143 deletions(-) diff --git a/mlos_bench/mlos_bench/tests/docker_fixtures_util.py b/mlos_bench/mlos_bench/tests/docker_fixtures_util.py index 2309123bc8..69c23f3aba 100644 --- a/mlos_bench/mlos_bench/tests/docker_fixtures_util.py +++ b/mlos_bench/mlos_bench/tests/docker_fixtures_util.py @@ -16,9 +16,14 @@ """ # pylint: disable=redefined-outer-name +import json +import logging import os import sys from collections.abc import Generator +from json import JSONDecodeError +from subprocess import CalledProcessError, run +from time import sleep from typing import Any import pytest @@ -26,6 +31,72 @@ from pytest_docker.plugin import Services as DockerServices from pytest_docker.plugin import get_docker_services +_LOG = logging.getLogger(__name__) + + +def wait_docker_service_healthy( + compose_project_name: str, + service_name: str, + timeout_seconds: int = 30, + check_interval_seconds: int = 1, +) -> None: + """ + Waits for the specified docker service to become healthy. + + Parameters + ---------- + compose_project_name : str + The docker-compose project name. + service_name : str + The name of the service to wait for. + timeout_seconds : int + The maximum time to wait for the service to become healthy. + check_interval_seconds : int + The interval between health checks. + """ + for _ in range(0, timeout_seconds, check_interval_seconds): + try: + inspect_cmd = run( + ( + f"docker compose -p {compose_project_name} " + f"ps --format '{{{{.ID}}}}' {service_name}" + ), + shell=True, + check=True, + capture_output=True, + ) + container_id = inspect_cmd.stdout.decode().strip() + if not container_id: + raise RuntimeError(f"Container ID for {service_name} not found") + inspect_state_cmd = run( + f"docker inspect {container_id}", + shell=True, + check=True, + capture_output=True, + ) + inspect_data = json.loads(inspect_state_cmd.stdout.decode()) + state = inspect_data[0].get("State", {}) + health = state.get("Health") + if state.get("Status") == "running" and ( + not health or health.get("Status") == "healthy" + ): + _LOG.debug("Container %s is running and healthy", service_name) + return + else: + _LOG.info( + ( + "Waiting for %s to become running (State.Status: %s) " + "and healthy (Health.Status=%s)" + ), + service_name, + state.get("Status") if state else "UNKNOWN", + health.get("Status") if health else "N/A", + ) + except (RuntimeError, CalledProcessError, JSONDecodeError) as e: + _LOG.error("Error checking health of %s: %s", service_name, e) + sleep(check_interval_seconds) + raise RuntimeError(f"Container {service_name} did not become healthy in time.") + # Fixtures to configure the pytest-docker plugin. 
@pytest.fixture(scope="session") @@ -44,16 +115,16 @@ def docker_setup() -> list[str] | str: @pytest.fixture(scope="session") def docker_compose_file(pytestconfig: pytest.Config) -> list[str]: """ - Fixture for the path to the docker-compose file. + Base fixture for the path to the docker-compose file(s). Parameters ---------- pytestconfig : pytest.Config - Returns - ------- - list[str] - List of paths to the docker-compose file(s). + Notes + ----- + This fixture should be overridden in the local conftest.py to point to the actual + docker-compose.yml files needed for the tests. """ _ = pytestconfig # unused # Add additional configs as necessary here. diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py index 7c70d95221..81f30941ed 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/__init__.py @@ -5,10 +5,12 @@ """Common data classes for the SSH service tests.""" import logging -import threading from dataclasses import dataclass from subprocess import run +from mlos_bench.tests import check_socket +from mlos_bench.tests.docker_fixtures_util import wait_docker_service_healthy + # The SSH test server port and name. # See Also: docker-compose.yml SSH_TEST_SERVER_PORT = 2254 @@ -16,6 +18,8 @@ ALT_TEST_SERVER_NAME = "alt-server" REBOOT_TEST_SERVER_NAME = "reboot-server" +_LOG = logging.getLogger(__name__) + @dataclass class SshTestServerInfo: @@ -34,55 +38,62 @@ class SshTestServerInfo: id_rsa_path: str _port: int | None = None - def get_port(self, uncached: bool = False) -> int: + def get_port(self, uncached: bool = False, check_port: bool = True) -> int: """ Gets the port that the SSH test server is listening on. Note: this value can change when the service restarts so we can't rely on the DockerServices. """ - _LOG = logging.getLogger(__name__) - thread_id = threading.get_ident() - - if self._port is None or uncached: - _LOG.info( - "[Thread %s] Discovering port for %s (uncached=%s, cached_port=%s)", - thread_id, - self.service_name, - uncached, - self._port, - ) - try: - # NOTE: this cache may become stale in another worker if the container restarts in one and the other worker doesn't notice the new port. - port_cmd = run( - ( - f"docker compose -p {self.compose_project_name} " - f"port {self.service_name} {SSH_TEST_SERVER_PORT}" - ), - shell=True, - check=True, - capture_output=True, - ) - new_port = int(port_cmd.stdout.decode().strip().split(":")[1]) - old_port = self._port - self._port = new_port - _LOG.info( - "[Thread %s] Port for %s: %s -> %s (uncached=%s)", - thread_id, + if not uncached and self._port is not None: + # NOTE: this cache may become stale in another worker if the + # container restarts in one and the other worker doesn't notice the new port. + # Optionally check the status of the cached port before returning it. + if not check_port or self.validate_connection(): + _LOG.debug( + "Using cached port %s for %s %s validation.", + self._port, self.service_name, - old_port, - new_port, - uncached, - ) - except Exception as e: - _LOG.error( - "[Thread %s] Failed to get port for %s: %s", thread_id, self.service_name, e + "with" if check_port else "without", ) - raise - else: - _LOG.debug( - "[Thread %s] Using cached port %s for %s", thread_id, self._port, self.service_name - ) + return self._port + + # Check container state before proceeding to avoid a race in docker + # container startup. 
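+        # ("docker compose port" answers as soon as the container is running,
+        # so wait for the Dockerfile HEALTHCHECK to pass first to ensure sshd
+        # is actually accepting connections on the mapped port.)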
+ _LOG.info( + ( + "Discovering port for %s (uncached=%s, cached_port=%s) " + "after waiting for container readiness." + ), + self.service_name, + uncached, + self._port, + ) + wait_docker_service_healthy( + self.compose_project_name, + self.service_name, + ) + _LOG.debug("Container %s is healthy, getting port...", self.service_name) + + port_cmd = run( + ( + f"docker compose -p {self.compose_project_name} " + f"port {self.service_name} {SSH_TEST_SERVER_PORT}" + ), + shell=True, + check=True, + capture_output=True, + ) + new_port = int(port_cmd.stdout.decode().strip().split(":")[1]) + old_port = self._port + self._port = new_port + _LOG.info( + "Port for %s: %s -> %s (uncached=%s)", + self.service_name, + old_port, + new_port, + uncached, + ) return self._port def to_ssh_service_config(self, uncached: bool = False) -> dict: @@ -115,51 +126,17 @@ def validate_connection(self, timeout: float = 2.0) -> bool: if self._port is None: return False - _LOG = logging.getLogger(__name__) - thread_id = threading.get_ident() - - # Import here to avoid circular imports - from mlos_bench.tests import check_socket - is_connectable = check_socket(self.hostname, self._port, timeout) if not is_connectable: _LOG.warning( - "[Thread %s] Connection validation FAILED for %s:%d - port may be stale!", - thread_id, + "Connection validation FAILED for %s:%d - port may be stale!", self.service_name, self._port, ) else: _LOG.debug( - "[Thread %s] Connection validation OK for %s:%d", - thread_id, + "Connection validation OK for %s:%d", self.service_name, self._port, ) return is_connectable - - def get_port_with_validation(self, timeout: float = 2.0) -> int: - """ - Get port with automatic validation and refresh if connection fails. - - This helps detect stale cached ports caused by container restarts. - """ - _LOG = logging.getLogger(__name__) - thread_id = threading.get_ident() - - # First try cached port - if self._port is not None: - if self.validate_connection(timeout): - return self._port - else: - _LOG.warning( - "[Thread %s] Cached port %d for %s failed validation, refreshing...", - thread_id, - self._port, - self.service_name, - ) - # Force refresh - return self.get_port(uncached=True) - else: - # No cached port, get fresh one - return self.get_port(uncached=False) diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py index e25a524c6e..543dd05ba5 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/fixtures.py @@ -11,7 +11,6 @@ import logging import os import tempfile -import threading from collections.abc import Generator from subprocess import run @@ -30,6 +29,8 @@ # pylint: disable=redefined-outer-name +_LOG = logging.getLogger(__name__) + @pytest.fixture(scope="session") def docker_compose_file(pytestconfig: pytest.Config) -> list[str]: @@ -83,10 +84,7 @@ def ssh_test_server( """ # Get a copy of the ssh id_rsa key from the test ssh server. 
with tempfile.NamedTemporaryFile() as id_rsa_file: - _LOG = logging.getLogger(__name__) - thread_id = threading.get_ident() - - _LOG.info("[Thread %s] Setting up ssh_test_server fixture", thread_id) + _LOG.info("Setting up ssh_test_server fixture") ssh_test_server_info = SshTestServerInfo( compose_project_name=docker_compose_project_name, @@ -97,7 +95,7 @@ def ssh_test_server( ) port = ssh_test_server_info.get_port() - _LOG.info("[Thread %s] Main SSH server discovered on port %d", thread_id, port) + _LOG.info("Main SSH server discovered on port %d", port) wait_docker_service_socket( locked_docker_services, @@ -105,7 +103,7 @@ def ssh_test_server( port, ) - _LOG.info("[Thread %s] Main SSH server validated and ready", thread_id) + _LOG.info("Main SSH server validated and ready") id_rsa_src = f"/{ssh_test_server_info.username}/.ssh/id_rsa" docker_cp_cmd = ( f"docker compose -p {docker_compose_project_name} " @@ -141,10 +139,8 @@ def alt_test_server( # Note: The alt-server uses the same image as the ssh-server container, so # the id_rsa key and username should all match. # Only the host port it is allocate is different. - _LOG = logging.getLogger(__name__) - thread_id = threading.get_ident() - _LOG.info("[Thread %s] Setting up alt_test_server fixture", thread_id) + _LOG.info("Setting up alt_test_server fixture") alt_test_server_info = SshTestServerInfo( compose_project_name=ssh_test_server.compose_project_name, @@ -155,7 +151,7 @@ def alt_test_server( ) port = alt_test_server_info.get_port() - _LOG.info("[Thread %s] Alt SSH server discovered on port %d", thread_id, port) + _LOG.info("Alt SSH server discovered on port %d", port) wait_docker_service_socket( locked_docker_services, @@ -163,7 +159,7 @@ def alt_test_server( port, ) - _LOG.info("[Thread %s] Alt SSH server validated and ready", thread_id) + _LOG.info("Alt SSH server validated and ready") return alt_test_server_info @@ -180,10 +176,8 @@ def reboot_test_server( # Note: The reboot-server uses the same image as the ssh-server container, so # the id_rsa key and username should all match. # Only the host port it is allocate is different. - _LOG = logging.getLogger(__name__) - thread_id = threading.get_ident() - _LOG.info("[Thread %s] Setting up reboot_test_server fixture", thread_id) + _LOG.info("Setting up reboot_test_server fixture") reboot_test_server_info = SshTestServerInfo( compose_project_name=ssh_test_server.compose_project_name, @@ -194,7 +188,7 @@ def reboot_test_server( ) port = reboot_test_server_info.get_port() - _LOG.info("[Thread %s] Reboot SSH server discovered on port %d", thread_id, port) + _LOG.info("Reboot SSH server discovered on port %d", port) wait_docker_service_socket( locked_docker_services, @@ -202,7 +196,7 @@ def reboot_test_server( port, ) - _LOG.info("[Thread %s] Reboot SSH server validated and ready", thread_id) + _LOG.info("Reboot SSH server validated and ready") return reboot_test_server_info diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py index fb129f636b..28d9c92a38 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py @@ -35,22 +35,14 @@ def test_ssh_service_remote_exec( This checks state of the service across multiple invocations and states to check for internal cache handling logic as well. 
""" - # pylint: disable=protected-access - import threading - - thread_id = threading.get_ident() - - _LOG.info( - "[Thread %s] test_ssh_service_remote_exec starting - validating server connections", - thread_id, - ) + # pylint: disable=protected-access,too-many-locals + _LOG.info("test_ssh_service_remote_exec starting - validating server connections") # Validate connections before starting test - for server_info, name in [(ssh_test_server, "main"), (alt_test_server, "alt")]: + for server_info, name in ((ssh_test_server, "main"), (alt_test_server, "alt")): if not server_info.validate_connection(): _LOG.warning( - "[Thread %s] %s server connection validation failed, may have stale port", - thread_id, + "%s server connection validation failed, may have stale port", name, ) @@ -167,17 +159,15 @@ def check_ssh_service_reboot( graceful: bool, ) -> None: """Check the SshHostService reboot operation.""" + # pylint: disable=too-many-locals + # Note: rebooting changes the port number unfortunately, but makes it # easier to check for success. # Also, it may cause issues with other parallel unit tests, so we run it as # a part of the same unit test for now. - import threading - - thread_id = threading.get_ident() _LOG.warning( - "[Thread %s] *** STARTING REBOOT TEST (graceful=%s) - This will change container ports! ***", - thread_id, + "*** STARTING REBOOT TEST (graceful=%s) - This will change container ports! ***", graceful, ) @@ -185,8 +175,7 @@ def check_ssh_service_reboot( reboot_test_srv_ssh_svc_conf = reboot_test_server.to_ssh_service_config(uncached=True) original_port = reboot_test_srv_ssh_svc_conf["ssh_port"] _LOG.warning( - "[Thread %s] Reboot test using original port %d for %s", - thread_id, + "Reboot test using original port %d for %s", original_port, reboot_test_server.service_name, ) @@ -231,8 +220,7 @@ def check_ssh_service_reboot( check=False, ) _LOG.info( - "[Thread %s] Docker ps output (attempt %d):\nSTDOUT:\n%s\nSTDERR:\n%s", - thread_id, + "Docker ps output (attempt %d):\nSTDOUT:\n%s\nSTDERR:\n%s", attempt + 1, run_res.stdout.decode(), run_res.stderr.decode(), @@ -242,32 +230,31 @@ def check_ssh_service_reboot( ) new_port = reboot_test_srv_ssh_svc_conf_new["ssh_port"] _LOG.warning( - "[Thread %s] Port check attempt %d: %d -> %d", - thread_id, + "Port check attempt %d: %d -> %d", attempt + 1, original_port, new_port, ) if new_port != original_port: _LOG.warning( - "[Thread %s] *** PORT CHANGED: %d -> %d (this affects ALL workers!) ***", - thread_id, - attempt + 1, + "*** PORT CHANGED: %d -> %d (this affects ALL workers!) ***", original_port, new_port, ) break except CalledProcessError as ex: _LOG.error( - "[Thread %s] Failed to check port for reboot test server (attempt %d): %s", - thread_id, + "Failed to check port for reboot test server (attempt %d): %s", attempt + 1, ex, ) assert ( reboot_test_srv_ssh_svc_conf_new["ssh_port"] != reboot_test_srv_ssh_svc_conf["ssh_port"] - ), f"Port should have changed from {original_port} but is still {reboot_test_srv_ssh_svc_conf_new['ssh_port']}" + ), ( + f"""Port should have changed from {original_port}""" + f"""but is still {reboot_test_srv_ssh_svc_conf_new["ssh_port"]}""" + ) wait_docker_service_socket( docker_services, @@ -285,8 +272,7 @@ def check_ssh_service_reboot( assert results["stdout"].strip() == REBOOT_TEST_SERVER_NAME _LOG.warning( - "[Thread %s] *** REBOOT TEST COMPLETED - Other workers may now have stale cached ports! ***", - thread_id, + "*** REBOOT TEST COMPLETED - Other workers may now have stale cached ports! 
***", ) diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py index c35db27abb..3972e1e56a 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py @@ -5,6 +5,7 @@ """Tests for mlos_bench.services.remote.ssh.SshService base class.""" import asyncio +import logging import time from importlib.metadata import PackageNotFoundError, version from subprocess import run @@ -40,6 +41,8 @@ # OK: pytest-lazy-fixture not installed pass +_LOG = logging.getLogger(__name__) + @requires_docker @requires_ssh @@ -52,13 +55,7 @@ ) def test_ssh_service_test_infra(ssh_test_server_info: SshTestServerInfo, server_name: str) -> None: """Check for the pytest-docker ssh test infra.""" - import logging - import threading - - _LOG = logging.getLogger(__name__) - thread_id = threading.get_ident() - - _LOG.info("[Thread %s] test_ssh_service_test_infra starting with %s", thread_id, server_name) + _LOG.info("test_ssh_service_test_infra starting with %s", server_name) assert ssh_test_server_info.service_name == server_name @@ -68,13 +65,12 @@ def test_ssh_service_test_infra(ssh_test_server_info: SshTestServerInfo, server_ # Use validation method to detect stale ports if not ssh_test_server_info.validate_connection(): _LOG.warning( - "[Thread %s] Cached port validation failed, getting fresh port for %s", - thread_id, + "Cached port validation failed, getting fresh port for %s", server_name, ) - local_port = ssh_test_server_info.get_port_with_validation() - _LOG.info("[Thread %s] Using port %d for %s", thread_id, local_port, server_name) + local_port = ssh_test_server_info.get_port() + _LOG.info("Using port %d for %s", local_port, server_name) assert check_socket(ip_addr, local_port) ssh_cmd = ( "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new " @@ -84,7 +80,7 @@ def test_ssh_service_test_infra(ssh_test_server_info: SshTestServerInfo, server_ cmd = run(ssh_cmd.split(), capture_output=True, text=True, check=True) assert cmd.stdout.strip() == server_name - # assert False, "Force failure for debugging." + assert False, "Force failure for debugging." @pytest.mark.filterwarnings( From 4b981f826f6154ffbdbbd93a980ed30b54525a5c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 29 Oct 2025 06:59:34 +0000 Subject: [PATCH 70/76] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- conftest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/conftest.py b/conftest.py index 98c19374e8..1fddfc36fa 100644 --- a/conftest.py +++ b/conftest.py @@ -23,6 +23,7 @@ ) DATE_FMT = "%Y-%m-%d %H:%M:%S" + def is_master(config: pytest.Config) -> bool: """True if the code running the given pytest.config object is running in a xdist master node or not running xdist at all. 
@@ -77,6 +78,7 @@ def pytest_configure(config: pytest.Config) -> None: file_handler.setFormatter(log_formatter) logging.getLogger().addHandler(file_handler) + def pytest_configure_node(node: WorkerController) -> None: """Xdist hook used to inform workers of the location of the shared temp dir.""" workerinput: dict = getattr(node, "workerinput") From fcd226659a2badfce0919870f3a7ee60ccb7259d Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Wed, 29 Oct 2025 00:03:12 -0700 Subject: [PATCH 71/76] remove --- .../mlos_bench/tests/services/remote/ssh/test_ssh_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py index 3972e1e56a..7fec2fd813 100644 --- a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_service.py @@ -80,7 +80,7 @@ def test_ssh_service_test_infra(ssh_test_server_info: SshTestServerInfo, server_ cmd = run(ssh_cmd.split(), capture_output=True, text=True, check=True) assert cmd.stdout.strip() == server_name - assert False, "Force failure for debugging." + # assert False, "Force failure for debugging." @pytest.mark.filterwarnings( From 3d5da84f9ea9dfbc6e47cf216831f201b0800605 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Wed, 29 Oct 2025 00:05:32 -0700 Subject: [PATCH 72/76] remove special testing checks --- .github/workflows/linux.yml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 7e9e25d65c..250a248791 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -175,14 +175,6 @@ jobs: timeout-minutes: 5 run: make CONDA_ENV_NAME=$CONDA_ENV_NAME check - - name: Run debug_ssh_tests.py - continue-on-error: true - run: conda -n $CONDA_ENV_NAME run python3 debug_ssh_tests.py - - - name: Run ssh tests - continue-on-error: true - run: conda -n $CONDA_ENV_NAME run python3 -m pytest -vv -rP mlos_bench/mlos_bench/tests/services/remote/ssh - # Only run the coverage checks on the devcontainer job. - name: Run tests run: make CONDA_ENV_NAME=$CONDA_ENV_NAME SKIP_COVERAGE=true test From 95f7bc506fb96e6929d60191c2edb3550c2f6e0e Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Wed, 29 Oct 2025 00:23:38 -0700 Subject: [PATCH 73/76] cleanup and consolidation --- conftest.py | 1 + debug_ssh_tests.py | 146 ------------------ mlos_bench/mlos_bench/launcher.py | 10 +- mlos_bench/mlos_bench/tests/__init__.py | 33 ++-- .../remote/ssh/test_ssh_host_service.py | 2 +- 5 files changed, 19 insertions(+), 173 deletions(-) delete mode 100644 debug_ssh_tests.py diff --git a/conftest.py b/conftest.py index 1fddfc36fa..de9cc2646c 100644 --- a/conftest.py +++ b/conftest.py @@ -16,6 +16,7 @@ import pytest from xdist.workermanage import WorkerController +# See Also: setup.cfg and launcher.py LOG_FMT = ( "%(asctime)s.%(msecs)03d [%(process)d][%(threadName)s] " "[%(filename)s:%(lineno)d %(funcName)s] " diff --git a/debug_ssh_tests.py b/debug_ssh_tests.py deleted file mode 100644 index ccbdc1c4b7..0000000000 --- a/debug_ssh_tests.py +++ /dev/null @@ -1,146 +0,0 @@ -#!/usr/bin/env python3 -# -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. -# -""" -Debug script for SSH test race conditions. - -Run this to collect detailed logs about SSH test fixture behavior and port management. 
-""" - -import logging -import subprocess -import sys -from pathlib import Path - - -def setup_logging(): - """Configure detailed logging for debugging.""" - # Configure root logger - logging.basicConfig( - level=logging.DEBUG, - format="%(asctime)s.%(msecs)03d [%(levelname)8s] %(name)s: %(message)s", - datefmt="%H:%M:%S", - handlers=[ - logging.StreamHandler(sys.stdout), - logging.FileHandler("ssh_test_debug.log", mode="w"), - ], - ) - - # Set specific loggers to appropriate levels - logging.getLogger("mlos_bench.tests.services.remote.ssh").setLevel(logging.DEBUG) - logging.getLogger("mlos_bench.tests").setLevel(logging.INFO) - logging.getLogger("pytest_docker").setLevel(logging.INFO) - - print("Logging configured. Debug output will be saved to ssh_test_debug.log") - - -def run_tests(): - """Run SSH tests with debugging enabled.""" - setup_logging() - - test_dir = ( - Path(__file__).parent - / "mlos_bench" - / "mlos_bench" - / "tests" - / "services" - / "remote" - / "ssh" - ) - - cmd = [ - sys.executable, - "-m", - "pytest", - "-v", - "-s", # Verbose and don't capture output - "--tb=short", # Short traceback format - "--log-level=DEBUG", # Enable pytest debug logging - "-x", # Stop on first failure - str(test_dir / "test_ssh_service.py"), - str(test_dir / "test_ssh_host_service.py"), - ] - - print(f"Running command: {' '.join(cmd)}") - print("=" * 80) - - try: - result = subprocess.run(cmd, cwd=Path(__file__).parent, check=False) - return result.returncode - except KeyboardInterrupt: - print("\nTest run interrupted by user") - return 130 - - -def run_parallel_tests(): - """Run tests in parallel to reproduce race conditions.""" - setup_logging() - - test_dir = ( - Path(__file__).parent - / "mlos_bench" - / "mlos_bench" - / "tests" - / "services" - / "remote" - / "ssh" - ) - - cmd = [ - sys.executable, - "-m", - "pytest", - "-v", - "-s", # Verbose and don't capture output - "--tb=short", # Short traceback format - "--log-level=INFO", # Less verbose for parallel runs - "-n", - "2", # Run with 2 workers to increase chance of race - "--dist", - "worksteal", # Dynamic work distribution - str(test_dir / "test_ssh_service.py"), - str(test_dir / "test_ssh_host_service.py"), - ] - - print(f"Running parallel tests: {' '.join(cmd)}") - print("=" * 80) - - try: - result = subprocess.run(cmd, cwd=Path(__file__).parent, check=False) - return result.returncode - except KeyboardInterrupt: - print("\nParallel test run interrupted by user") - return 130 - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser(description="Debug SSH test race conditions") - parser.add_argument( - "--parallel", - action="store_true", - help="Run tests in parallel to reproduce race conditions", - ) - - args = parser.parse_args() - - if args.parallel: - exit_code = run_parallel_tests() - else: - exit_code = run_tests() - - print("\n" + "=" * 80) - print(f"Tests completed with exit code: {exit_code}") - - if exit_code != 0: - print("Check ssh_test_debug.log for detailed debugging information") - print("\nKey things to look for:") - print("1. Port changes after reboot tests") - print("2. Connection validation failures") - print("3. Socket connection refused errors") - print("4. 
Thread IDs to track which worker has issues") - - sys.exit(exit_code) diff --git a/mlos_bench/mlos_bench/launcher.py b/mlos_bench/mlos_bench/launcher.py index 4f49d2c7d4..18734054f1 100644 --- a/mlos_bench/mlos_bench/launcher.py +++ b/mlos_bench/mlos_bench/launcher.py @@ -34,9 +34,13 @@ from mlos_bench.util import try_parse_val _LOG_LEVEL = logging.INFO -_LOG_FORMAT = "%(asctime)s [%(processName)s] [%(threadName)s] %(filename)s:%(lineno)d %(funcName)s %(levelname)s %(message)s" -logging.basicConfig(level=_LOG_LEVEL, format=_LOG_FORMAT) -logging.basicConfig(datefmt="%Y-%m-%d %H:%M:%S.%f") +# See Also: /conftest.py, setup.cfg +_LOG_FORMAT = ( + "%(asctime)s.%(msecs)03d [%(process)d][%(threadName)s] " + "[%(filename)s:%(lineno)d %(funcName)s] " + "%(levelname)s %(message)s" +) +logging.basicConfig(level=_LOG_LEVEL, format=_LOG_FORMAT, datefmt="%Y-%m-%d %H:%M:%S") _LOG = logging.getLogger(__name__) diff --git a/mlos_bench/mlos_bench/tests/__init__.py b/mlos_bench/mlos_bench/tests/__init__.py index cde179f572..684b8c15c5 100644 --- a/mlos_bench/mlos_bench/tests/__init__.py +++ b/mlos_bench/mlos_bench/tests/__init__.py @@ -8,6 +8,7 @@ Used to make mypy happy about multiple conftest.py modules. """ import filecmp +import logging import json import os import shutil @@ -24,6 +25,8 @@ from mlos_bench.util import get_class_from_name, nullable +_LOG = logging.getLogger(__name__) + ZONE_NAMES = [ # Explicit time zones. "UTC", @@ -171,18 +174,12 @@ def wait_docker_service_healthy( def wait_docker_service_socket(docker_services: DockerServices, hostname: str, port: int) -> None: """Wait until a docker service is ready.""" - import logging - import threading - - _LOG = logging.getLogger(__name__) - thread_id = threading.get_ident() - - _LOG.info("[Thread %s] Waiting for %s:%d to become responsive", thread_id, hostname, port) + _LOG.info("Waiting for %s:%d to become responsive", hostname, port) def check_with_logging() -> bool: result = check_socket(hostname, port) if not result: - _LOG.debug("[Thread %s] Socket check failed for %s:%d", thread_id, hostname, port) + _LOG.debug("Socket check failed for %s:%d", hostname, port) return result try: @@ -191,9 +188,9 @@ def check_with_logging() -> bool: timeout=60.0, pause=0.5, ) - _LOG.info("[Thread %s] Socket %s:%d is now responsive", thread_id, hostname, port) + _LOG.info("Socket %s:%d is now responsive", hostname, port) except Exception as e: - _LOG.error("[Thread %s] Failed waiting for %s:%d: %s", thread_id, hostname, port, e) + _LOG.error("Failed waiting for %s:%d: %s", hostname, port, e) raise @@ -217,26 +214,16 @@ def check_socket(host: str, port: int, timeout: float = 1.0) -> bool: result = sock.connect_ex((host, port)) success = result == 0 if not success: - import logging - import threading - - _LOG = logging.getLogger(__name__) _LOG.debug( - "[Thread %s] Socket connection to %s:%d failed with code %d", - threading.get_ident(), + "Socket connection to %s:%d failed with code %d", host, port, result, ) return success - except Exception as e: - import logging - import threading - - _LOG = logging.getLogger(__name__) + except (IOError, TimeoutError) as e: _LOG.debug( - "[Thread %s] Socket check exception for %s:%d: %s", - threading.get_ident(), + "Socket check exception for %s:%d: %s", host, port, e, diff --git a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py index 28d9c92a38..d205c0edea 100644 --- 
a/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py +++ b/mlos_bench/mlos_bench/tests/services/remote/ssh/test_ssh_host_service.py @@ -243,7 +243,7 @@ def check_ssh_service_reboot( ) break except CalledProcessError as ex: - _LOG.error( + _LOG.info( "Failed to check port for reboot test server (attempt %d): %s", attempt + 1, ex, From 586472fa33e0a9fef6e7dbd694c6011f37697bad Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Wed, 29 Oct 2025 00:32:14 -0700 Subject: [PATCH 74/76] add unique artifact names --- .github/workflows/devcontainer.yml | 2 +- .github/workflows/linux.yml | 2 +- .github/workflows/macos.yml | 2 +- .github/workflows/windows.yml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/devcontainer.yml b/.github/workflows/devcontainer.yml index 0ab2f2ebe1..14f2a05232 100644 --- a/.github/workflows/devcontainer.yml +++ b/.github/workflows/devcontainer.yml @@ -303,7 +303,7 @@ jobs: if: always() uses: actions/upload-artifact@v5 with: - name: pytest-logs + name: devcontainer-pytest-logs path: logs/pytest*.log retention-days: 7 diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 250a248791..2eef298e60 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -186,6 +186,6 @@ jobs: if: always() uses: actions/upload-artifact@v5 with: - name: pytest-logs + name: linux-${{ matrix.python_version }}-pytest-logs path: logs/pytest*.log retention-days: 7 diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml index f61a22e6f2..3fd4d7b064 100644 --- a/.github/workflows/macos.yml +++ b/.github/workflows/macos.yml @@ -167,7 +167,7 @@ jobs: if: always() uses: actions/upload-artifact@v5 with: - name: pytest-logs + name: macos-pytest-logs path: logs/pytest*.log retention-days: 7 diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index f4b1325032..cd5051b0c2 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -146,7 +146,7 @@ jobs: if: always() uses: actions/upload-artifact@v5 with: - name: pytest-logs + name: windows-pytest-logs path: logs/pytest*.log retention-days: 7 From 8e2af7259da2b05b3d73256086ce9196762c68db Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 29 Oct 2025 07:32:49 +0000 Subject: [PATCH 75/76] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- mlos_bench/mlos_bench/tests/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlos_bench/mlos_bench/tests/__init__.py b/mlos_bench/mlos_bench/tests/__init__.py index 684b8c15c5..6c89bf6de5 100644 --- a/mlos_bench/mlos_bench/tests/__init__.py +++ b/mlos_bench/mlos_bench/tests/__init__.py @@ -8,8 +8,8 @@ Used to make mypy happy about multiple conftest.py modules. 
""" import filecmp -import logging import json +import logging import os import shutil import socket @@ -221,7 +221,7 @@ def check_socket(host: str, port: int, timeout: float = 1.0) -> bool: result, ) return success - except (IOError, TimeoutError) as e: + except (OSError, TimeoutError) as e: _LOG.debug( "Socket check exception for %s:%d: %s", host, From 938d53f8d6a72b54c0c0e600b1e12d8eac1aba30 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Wed, 29 Oct 2025 08:28:18 -0700 Subject: [PATCH 76/76] partial revert of launcher.py log format changes --- mlos_bench/mlos_bench/launcher.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/mlos_bench/mlos_bench/launcher.py b/mlos_bench/mlos_bench/launcher.py index 18734054f1..cd76f456cf 100644 --- a/mlos_bench/mlos_bench/launcher.py +++ b/mlos_bench/mlos_bench/launcher.py @@ -34,13 +34,16 @@ from mlos_bench.util import try_parse_val _LOG_LEVEL = logging.INFO -# See Also: /conftest.py, setup.cfg -_LOG_FORMAT = ( - "%(asctime)s.%(msecs)03d [%(process)d][%(threadName)s] " - "[%(filename)s:%(lineno)d %(funcName)s] " - "%(levelname)s %(message)s" -) -logging.basicConfig(level=_LOG_LEVEL, format=_LOG_FORMAT, datefmt="%Y-%m-%d %H:%M:%S") +_LOG_FORMAT = "%(asctime)s %(filename)s:%(lineno)d %(funcName)s %(levelname)s %(message)s" +logging.basicConfig(level=_LOG_LEVEL, format=_LOG_FORMAT) + +# TODO: Future PR: See Also: /conftest.py, setup.cfg +# _LOG_FORMAT = ( +# "%(asctime)s.%(msecs)03d [%(process)d][%(threadName)s] " +# "[%(filename)s:%(lineno)d %(funcName)s] " +# "%(levelname)s %(message)s" +# ) +# logging.basicConfig(level=_LOG_LEVEL, format=_LOG_FORMAT, datefmt="%Y-%m-%d %H:%M:%S") _LOG = logging.getLogger(__name__)