From a2f97978b498b6eb3b25e2ca56041fdccdef4f48 Mon Sep 17 00:00:00 2001 From: kithumma Date: Wed, 21 Jan 2026 04:11:28 +0000 Subject: [PATCH 1/5] workflow enhancement --- .github/workflows/enroot-tests.yml | 62 +++++++++++++++++++++++++++--- 1 file changed, 57 insertions(+), 5 deletions(-) diff --git a/.github/workflows/enroot-tests.yml b/.github/workflows/enroot-tests.yml index 4ee2c96..2b1ae9e 100644 --- a/.github/workflows/enroot-tests.yml +++ b/.github/workflows/enroot-tests.yml @@ -1,6 +1,9 @@ name: Enroot Tests on: + push: + branches: + - main workflow_dispatch: inputs: test_name: @@ -10,6 +13,7 @@ on: options: - test_single_node_pytorch - test_multi_node_distributed_pytorch + - test_multi_node_rccl no_install: description: 'Skip installation (--no-install)' required: false @@ -26,41 +30,89 @@ on: type: string default: '' testbed_file: - description: 'Path to testbed file (e.g. tests/enroot/testbeds/mi325.yaml)' + description: 'Path to testbed file (e.g. tests/enroot/testbeds/mi325.yaml) - defaults to secrets.TESTBED_FILE' required: false type: string - default: 'testbed/enroot_tb.yml' + default: '' jobs: run-enroot-tests: runs-on: enroot-runners timeout-minutes: 120 + strategy: + matrix: + test_name: + - test_single_node_pytorch + - test_multi_node_distributed_pytorch + - test_multi_node_rccl + max-parallel: 1 # Run tests sequentially steps: - name: Checkout repository uses: actions/checkout@v4 + if: ${{ (github.event_name == 'push' && matrix.test_name != 'test_multi_node_rccl') || (github.event_name == 'workflow_dispatch' && inputs.test_name == matrix.test_name) }} - name: Set up Python uses: actions/setup-python@v5 + if: ${{ (github.event_name == 'push' && matrix.test_name != 'test_multi_node_rccl') || (github.event_name == 'workflow_dispatch' && inputs.test_name == matrix.test_name) }} with: python-version: '3.8' - name: Install dependencies + if: ${{ (github.event_name == 'push' && matrix.test_name != 'test_multi_node_rccl') || (github.event_name == 'workflow_dispatch' && inputs.test_name == matrix.test_name) }} run: | python3 -m pip install --upgrade pip pip install -r tests/enroot/requirements.txt - name: Run enroot tests + if: ${{ (github.event_name == 'push' && matrix.test_name != 'test_multi_node_rccl') || (github.event_name == 'workflow_dispatch' && inputs.test_name == matrix.test_name) }} working-directory: tests/enroot run: | - python3 run_test.py "${{ inputs.test_name }}" "${{ inputs.docker_image }}" "${{ inputs.no_install }}" "${{ inputs.no_uninstall }}" "${{ inputs.testbed_file }}" + # Use matrix test_name for the test to run + TEST_NAME="${{ matrix.test_name }}" + + # Determine testbed file based on test type + if [ "${{ github.event_name }}" = "push" ]; then + # For push events: use test-type-specific secrets + if [ "$TEST_NAME" = "test_single_node_pytorch" ]; then + DEFAULT_TESTBED="${{ secrets.SINGLE_NODE_TESTBED_FILE }}" + else + DEFAULT_TESTBED="${{ secrets.MULTI_NODE_TESTBED_FILE }}" + fi + DOCKER_IMAGE="" + NO_INSTALL="false" + NO_UNINSTALL="false" + TESTBED_FILE="$DEFAULT_TESTBED" + else + # For workflow_dispatch: allow input override, otherwise use test-type-specific secrets + if [ -n "${{ inputs.testbed_file }}" ]; then + TESTBED_FILE="${{ inputs.testbed_file }}" + else + if [ "$TEST_NAME" = "test_single_node_pytorch" ]; then + TESTBED_FILE="${{ secrets.SINGLE_NODE_TESTBED_FILE }}" + else + # Both multi-node tests use MULTI_NODE_TESTBED_FILE + TESTBED_FILE="${{ secrets.MULTI_NODE_TESTBED_FILE }}" + fi + fi + DOCKER_IMAGE="${{ inputs.docker_image }}" + NO_INSTALL="${{ inputs.no_install }}" + NO_UNINSTALL="${{ inputs.no_uninstall }}" + fi + + # Run RCCL test differently (pytest directly with hardcoded flags) + if [ "$TEST_NAME" = "test_multi_node_rccl" ]; then + python3 -m pytest test_enroot.py --testbed "$TESTBED_FILE" -k test_multi_node_rccl --no-install --no-uninstall + else + python3 run_test.py "$TEST_NAME" "$DOCKER_IMAGE" "$NO_INSTALL" "$NO_UNINSTALL" "$TESTBED_FILE" + fi - name: Upload test results - if: always() + if: ${{ always() && ((github.event_name == 'push' && matrix.test_name != 'test_multi_node_rccl') || (github.event_name == 'workflow_dispatch' && inputs.test_name == matrix.test_name)) }} uses: actions/upload-artifact@v4 with: - name: test-results-${{ inputs.test_name }}-${{ github.run_number }} + name: test-results-${{ matrix.test_name }}-${{ github.run_number }} path: tests/enroot/results/ if-no-files-found: warn retention-days: 30 From a951bf35ac05b6fd9a98f6a7a82cf7f0d1c7a627 Mon Sep 17 00:00:00 2001 From: kithumma Date: Thu, 22 Jan 2026 02:12:44 +0000 Subject: [PATCH 2/5] update PYTHONPATH --- .github/workflows/enroot-tests.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/enroot-tests.yml b/.github/workflows/enroot-tests.yml index 2b1ae9e..52b0e93 100644 --- a/.github/workflows/enroot-tests.yml +++ b/.github/workflows/enroot-tests.yml @@ -103,6 +103,9 @@ jobs: # Run RCCL test differently (pytest directly with hardcoded flags) if [ "$TEST_NAME" = "test_multi_node_rccl" ]; then + # Set PYTHONPATH and cd to testsuites directory for pytest + export PYTHONPATH=$(pwd):$PYTHONPATH + cd testsuites python3 -m pytest test_enroot.py --testbed "$TESTBED_FILE" -k test_multi_node_rccl --no-install --no-uninstall else python3 run_test.py "$TEST_NAME" "$DOCKER_IMAGE" "$NO_INSTALL" "$NO_UNINSTALL" "$TESTBED_FILE" From bba9f004e632577ad6e70818c01fcde50d47dced Mon Sep 17 00:00:00 2001 From: kithumma Date: Thu, 22 Jan 2026 03:30:25 +0000 Subject: [PATCH 3/5] update all three tests and BASE_IMAGE options --- .github/workflows/enroot-tests.yml | 130 +++++++++++++++++++++++------ 1 file changed, 103 insertions(+), 27 deletions(-) diff --git a/.github/workflows/enroot-tests.yml b/.github/workflows/enroot-tests.yml index 52b0e93..f620c33 100644 --- a/.github/workflows/enroot-tests.yml +++ b/.github/workflows/enroot-tests.yml @@ -6,14 +6,36 @@ on: - main workflow_dispatch: inputs: - test_name: - description: 'Select test to run' - required: true - type: choice - options: - - test_single_node_pytorch - - test_multi_node_distributed_pytorch - - test_multi_node_rccl + run_single_node_test: + description: 'Run single-node PyTorch test' + required: false + type: boolean + default: true + run_multi_node_test: + description: 'Run multi-node distributed PyTorch test' + required: false + type: boolean + default: true + run_rccl_test: + description: 'Run multi-node RCCL test' + required: false + type: boolean + default: true + base_image_single_node: + description: 'Docker image for single-node test (default: rocm/pytorch:latest from batch script)' + required: false + type: string + default: '' + base_image_multi_node: + description: 'Docker image for multi-node test (default: docker://rocm/pytorch:rocm7.0.2_ubuntu22.04_py3.10_pytorch_release_2.7.1 from batch script)' + required: false + type: string + default: '' + base_image_rccl: + description: 'Docker image for RCCL test (default: docker://rocm/roce-workload:ubuntu24_rocm-7.0.2_rccl-7.0.2_anp-v1.2.0_ainic-1.117.5-a-56 from batch script)' + required: false + type: string + default: '' no_install: description: 'Skip installation (--no-install)' required: false @@ -24,11 +46,6 @@ on: required: false type: boolean default: false - docker_image: - description: 'Docker image to use (default: rocm/pytorch:latest for single-node, docker://rocm/pytorch:rocm7.0.2_ubuntu22.04_py3.10_pytorch_release_2.7.1 for multi-node)' - required: false - type: string - default: '' testbed_file: description: 'Path to testbed file (e.g. tests/enroot/testbeds/mi325.yaml) - defaults to secrets.TESTBED_FILE' required: false @@ -51,68 +68,127 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v4 - if: ${{ (github.event_name == 'push' && matrix.test_name != 'test_multi_node_rccl') || (github.event_name == 'workflow_dispatch' && inputs.test_name == matrix.test_name) }} + if: | + ${{ + github.event_name == 'push' || + (github.event_name == 'workflow_dispatch' && ( + (matrix.test_name == 'test_single_node_pytorch' && inputs.run_single_node_test) || + (matrix.test_name == 'test_multi_node_distributed_pytorch' && inputs.run_multi_node_test) || + (matrix.test_name == 'test_multi_node_rccl' && inputs.run_rccl_test) + )) + }} - name: Set up Python uses: actions/setup-python@v5 - if: ${{ (github.event_name == 'push' && matrix.test_name != 'test_multi_node_rccl') || (github.event_name == 'workflow_dispatch' && inputs.test_name == matrix.test_name) }} + if: | + ${{ + github.event_name == 'push' || + (github.event_name == 'workflow_dispatch' && ( + (matrix.test_name == 'test_single_node_pytorch' && inputs.run_single_node_test) || + (matrix.test_name == 'test_multi_node_distributed_pytorch' && inputs.run_multi_node_test) || + (matrix.test_name == 'test_multi_node_rccl' && inputs.run_rccl_test) + )) + }} with: python-version: '3.8' - name: Install dependencies - if: ${{ (github.event_name == 'push' && matrix.test_name != 'test_multi_node_rccl') || (github.event_name == 'workflow_dispatch' && inputs.test_name == matrix.test_name) }} + if: | + ${{ + github.event_name == 'push' || + (github.event_name == 'workflow_dispatch' && ( + (matrix.test_name == 'test_single_node_pytorch' && inputs.run_single_node_test) || + (matrix.test_name == 'test_multi_node_distributed_pytorch' && inputs.run_multi_node_test) || + (matrix.test_name == 'test_multi_node_rccl' && inputs.run_rccl_test) + )) + }} run: | python3 -m pip install --upgrade pip pip install -r tests/enroot/requirements.txt - name: Run enroot tests - if: ${{ (github.event_name == 'push' && matrix.test_name != 'test_multi_node_rccl') || (github.event_name == 'workflow_dispatch' && inputs.test_name == matrix.test_name) }} + if: | + ${{ + github.event_name == 'push' || + (github.event_name == 'workflow_dispatch' && ( + (matrix.test_name == 'test_single_node_pytorch' && inputs.run_single_node_test) || + (matrix.test_name == 'test_multi_node_distributed_pytorch' && inputs.run_multi_node_test) || + (matrix.test_name == 'test_multi_node_rccl' && inputs.run_rccl_test) + )) + }} working-directory: tests/enroot run: | # Use matrix test_name for the test to run TEST_NAME="${{ matrix.test_name }}" - # Determine testbed file based on test type + # Determine testbed file and docker image based on test type and event if [ "${{ github.event_name }}" = "push" ]; then - # For push events: use test-type-specific secrets + # For push events: use test-type-specific secrets and default images from batch scripts if [ "$TEST_NAME" = "test_single_node_pytorch" ]; then - DEFAULT_TESTBED="${{ secrets.SINGLE_NODE_TESTBED_FILE }}" + TESTBED_FILE="${{ secrets.SINGLE_NODE_TESTBED_FILE }}" else - DEFAULT_TESTBED="${{ secrets.MULTI_NODE_TESTBED_FILE }}" + TESTBED_FILE="${{ secrets.MULTI_NODE_TESTBED_FILE }}" fi DOCKER_IMAGE="" NO_INSTALL="false" NO_UNINSTALL="false" - TESTBED_FILE="$DEFAULT_TESTBED" else - # For workflow_dispatch: allow input override, otherwise use test-type-specific secrets + # For workflow_dispatch: use inputs if [ -n "${{ inputs.testbed_file }}" ]; then TESTBED_FILE="${{ inputs.testbed_file }}" else if [ "$TEST_NAME" = "test_single_node_pytorch" ]; then TESTBED_FILE="${{ secrets.SINGLE_NODE_TESTBED_FILE }}" else - # Both multi-node tests use MULTI_NODE_TESTBED_FILE TESTBED_FILE="${{ secrets.MULTI_NODE_TESTBED_FILE }}" fi fi - DOCKER_IMAGE="${{ inputs.docker_image }}" NO_INSTALL="${{ inputs.no_install }}" NO_UNINSTALL="${{ inputs.no_uninstall }}" + + # Set DOCKER_IMAGE based on test type + if [ "$TEST_NAME" = "test_single_node_pytorch" ]; then + DOCKER_IMAGE="${{ inputs.base_image_single_node }}" + elif [ "$TEST_NAME" = "test_multi_node_distributed_pytorch" ]; then + DOCKER_IMAGE="${{ inputs.base_image_multi_node }}" + elif [ "$TEST_NAME" = "test_multi_node_rccl" ]; then + DOCKER_IMAGE="${{ inputs.base_image_rccl }}" + fi fi - # Run RCCL test differently (pytest directly with hardcoded flags) + # Run RCCL test differently (pytest directly) if [ "$TEST_NAME" = "test_multi_node_rccl" ]; then + # For RCCL test: extract version tag from docker image if provided + if [ -n "$DOCKER_IMAGE" ]; then + # Extract version tag from full docker image path + # Example: docker://rocm/roce-workload:ubuntu24_rocm-7.0.2_rccl-7.0.2_anp-v1.2.0_ainic-1.117.5-a-56 + # Result: ubuntu24_rocm-7.0.2_rccl-7.0.2_anp-v1.2.0_ainic-1.117.5-a-56 + DOCKER_IMAGE_VERSION=$(echo "$DOCKER_IMAGE" | sed 's/.*://') + export DOCKER_IMAGE_VERSION + echo "Using RCCL Docker image version: $DOCKER_IMAGE_VERSION" + fi + # Set PYTHONPATH and cd to testsuites directory for pytest export PYTHONPATH=$(pwd):$PYTHONPATH cd testsuites python3 -m pytest test_enroot.py --testbed "$TESTBED_FILE" -k test_multi_node_rccl --no-install --no-uninstall else + # For other tests: use run_test.py python3 run_test.py "$TEST_NAME" "$DOCKER_IMAGE" "$NO_INSTALL" "$NO_UNINSTALL" "$TESTBED_FILE" fi - name: Upload test results - if: ${{ always() && ((github.event_name == 'push' && matrix.test_name != 'test_multi_node_rccl') || (github.event_name == 'workflow_dispatch' && inputs.test_name == matrix.test_name)) }} + if: | + ${{ + always() && ( + github.event_name == 'push' || + (github.event_name == 'workflow_dispatch' && ( + (matrix.test_name == 'test_single_node_pytorch' && inputs.run_single_node_test) || + (matrix.test_name == 'test_multi_node_distributed_pytorch' && inputs.run_multi_node_test) || + (matrix.test_name == 'test_multi_node_rccl' && inputs.run_rccl_test) + )) + ) + }} uses: actions/upload-artifact@v4 with: name: test-results-${{ matrix.test_name }}-${{ github.run_number }} From 7c30e6fa9042541e6647bf6151e824e6db5554b0 Mon Sep 17 00:00:00 2001 From: kithumma Date: Thu, 22 Jan 2026 03:57:43 +0000 Subject: [PATCH 4/5] udpate test files --- .github/workflows/enroot-tests-local.yml | 240 +++++++++++++++++++++++ .github/workflows/enroot-tests.yml | 30 +-- 2 files changed, 255 insertions(+), 15 deletions(-) create mode 100644 .github/workflows/enroot-tests-local.yml diff --git a/.github/workflows/enroot-tests-local.yml b/.github/workflows/enroot-tests-local.yml new file mode 100644 index 0000000..08e2975 --- /dev/null +++ b/.github/workflows/enroot-tests-local.yml @@ -0,0 +1,240 @@ +# Local workflow for testing with `act` +# Usage: See act commands at the bottom of this file +name: Enroot Tests (Local) + +on: + workflow_dispatch: + inputs: + run_single_node_test: + description: 'Run single-node PyTorch test' + required: false + type: boolean + default: true + run_multi_node_test: + description: 'Run multi-node distributed PyTorch test' + required: false + type: boolean + default: true + run_rccl_test: + description: 'Run multi-node RCCL test' + required: false + type: boolean + default: true + base_image_single_node: + description: 'Docker image for single-node test' + required: false + type: string + default: '' + base_image_multi_node: + description: 'Docker image for multi-node test' + required: false + type: string + default: '' + base_image_rccl: + description: 'Docker image for RCCL test' + required: false + type: string + default: '' + no_install: + description: 'Skip installation (--no-install)' + required: false + type: boolean + default: false + no_uninstall: + description: 'Skip uninstallation (--no-uninstall)' + required: false + type: boolean + default: false + testbed_file: + description: 'Path to testbed file' + required: false + type: string + default: '' + +jobs: + run-enroot-tests: + # Use ubuntu-latest for act compatibility (or map enroot-runners with -P flag) + runs-on: ubuntu-latest + timeout-minutes: 120 + strategy: + matrix: + test_name: + - test_single_node_pytorch + - test_multi_node_distributed_pytorch + - test_multi_node_rccl + max-parallel: 1 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + if: | + ${{ + (matrix.test_name == 'test_single_node_pytorch' && inputs.run_single_node_test == true) || + (matrix.test_name == 'test_multi_node_distributed_pytorch' && inputs.run_multi_node_test == true) || + (matrix.test_name == 'test_multi_node_rccl' && inputs.run_rccl_test == true) + }} + + - name: Set up Python + uses: actions/setup-python@v5 + if: | + ${{ + (matrix.test_name == 'test_single_node_pytorch' && inputs.run_single_node_test == true) || + (matrix.test_name == 'test_multi_node_distributed_pytorch' && inputs.run_multi_node_test == true) || + (matrix.test_name == 'test_multi_node_rccl' && inputs.run_rccl_test == true) + }} + with: + python-version: '3.8' + + - name: Install dependencies + if: | + ${{ + (matrix.test_name == 'test_single_node_pytorch' && inputs.run_single_node_test == true) || + (matrix.test_name == 'test_multi_node_distributed_pytorch' && inputs.run_multi_node_test == true) || + (matrix.test_name == 'test_multi_node_rccl' && inputs.run_rccl_test == true) + }} + run: | + python3 -m pip install --upgrade pip + pip install -r tests/enroot/requirements.txt + + - name: Run enroot tests + if: | + ${{ + (matrix.test_name == 'test_single_node_pytorch' && inputs.run_single_node_test == true) || + (matrix.test_name == 'test_multi_node_distributed_pytorch' && inputs.run_multi_node_test == true) || + (matrix.test_name == 'test_multi_node_rccl' && inputs.run_rccl_test == true) + }} + working-directory: tests/enroot + run: | + TEST_NAME="${{ matrix.test_name }}" + + # Use input testbed_file or fall back to secrets + if [ -n "${{ inputs.testbed_file }}" ]; then + TESTBED_FILE="${{ inputs.testbed_file }}" + else + if [ "$TEST_NAME" = "test_single_node_pytorch" ]; then + TESTBED_FILE="${{ secrets.SINGLE_NODE_TESTBED_FILE }}" + else + TESTBED_FILE="${{ secrets.MULTI_NODE_TESTBED_FILE }}" + fi + fi + + NO_INSTALL="${{ inputs.no_install }}" + NO_UNINSTALL="${{ inputs.no_uninstall }}" + + # Set DOCKER_IMAGE based on test type + if [ "$TEST_NAME" = "test_single_node_pytorch" ]; then + DOCKER_IMAGE="${{ inputs.base_image_single_node }}" + elif [ "$TEST_NAME" = "test_multi_node_distributed_pytorch" ]; then + DOCKER_IMAGE="${{ inputs.base_image_multi_node }}" + elif [ "$TEST_NAME" = "test_multi_node_rccl" ]; then + DOCKER_IMAGE="${{ inputs.base_image_rccl }}" + fi + + echo "=== Test Configuration ===" + echo "TEST_NAME: $TEST_NAME" + echo "TESTBED_FILE: $TESTBED_FILE" + echo "DOCKER_IMAGE: $DOCKER_IMAGE" + echo "NO_INSTALL: $NO_INSTALL" + echo "NO_UNINSTALL: $NO_UNINSTALL" + echo "==========================" + + # Run RCCL test differently (pytest directly) + if [ "$TEST_NAME" = "test_multi_node_rccl" ]; then + if [ -n "$DOCKER_IMAGE" ]; then + DOCKER_IMAGE_VERSION=$(echo "$DOCKER_IMAGE" | sed 's/.*://') + export DOCKER_IMAGE_VERSION + echo "Using RCCL Docker image version: $DOCKER_IMAGE_VERSION" + fi + + export PYTHONPATH=$(pwd):$PYTHONPATH + cd testsuites + python3 -m pytest test_enroot.py --testbed "$TESTBED_FILE" -k test_multi_node_rccl --no-install --no-uninstall + else + python3 run_test.py "$TEST_NAME" "$DOCKER_IMAGE" "$NO_INSTALL" "$NO_UNINSTALL" "$TESTBED_FILE" + fi + + - name: Upload test results + if: | + ${{ + always() && ( + (matrix.test_name == 'test_single_node_pytorch' && inputs.run_single_node_test == true) || + (matrix.test_name == 'test_multi_node_distributed_pytorch' && inputs.run_multi_node_test == true) || + (matrix.test_name == 'test_multi_node_rccl' && inputs.run_rccl_test == true) + ) + }} + uses: actions/upload-artifact@v4 + with: + name: test-results-${{ matrix.test_name }}-${{ github.run_number }} + path: tests/enroot/results/ + if-no-files-found: warn + retention-days: 30 + +# ============================================================================= +# ACT COMMANDS TO RUN THIS WORKFLOW LOCALLY +# ============================================================================= +# +# Prerequisites: +# 1. Install act: https://github.com/nektos/act +# - macOS: brew install act +# - Linux: curl -s https://raw.githubusercontent.com/nektos/act/master/install.sh | sudo bash +# +# 2. Create a secrets file at .secrets (in repo root): +# SINGLE_NODE_TESTBED_FILE=/path/to/your/single_node_testbed.yml +# MULTI_NODE_TESTBED_FILE=/path/to/your/multi_node_testbed.yml +# +# Run all three tests with defaults: +# act workflow_dispatch \ +# -W .github/workflows/enroot-tests-local.yml \ +# --secret-file .secrets \ +# --input run_single_node_test=true \ +# --input run_multi_node_test=true \ +# --input run_rccl_test=true \ +# --input no_install=false \ +# --input no_uninstall=false +# +# Run only single-node test: +# act workflow_dispatch \ +# -W .github/workflows/enroot-tests-local.yml \ +# --secret-file .secrets \ +# --input run_single_node_test=true \ +# --input run_multi_node_test=false \ +# --input run_rccl_test=false +# +# Run with custom testbed file: +# act workflow_dispatch \ +# -W .github/workflows/enroot-tests-local.yml \ +# --secret-file .secrets \ +# --input run_single_node_test=true \ +# --input run_multi_node_test=true \ +# --input run_rccl_test=true \ +# --input testbed_file=/path/to/testbed.yml +# +# Run with custom Docker images: +# act workflow_dispatch \ +# -W .github/workflows/enroot-tests-local.yml \ +# --secret-file .secrets \ +# --input run_single_node_test=true \ +# --input run_multi_node_test=true \ +# --input run_rccl_test=true \ +# --input base_image_single_node=rocm/pytorch:latest \ +# --input base_image_multi_node=docker://rocm/pytorch:rocm7.0.2_ubuntu22.04_py3.10_pytorch_release_2.7.1 \ +# --input base_image_rccl=docker://rocm/roce-workload:ubuntu24_rocm-7.0.2_rccl-7.0.2_anp-v1.2.0_ainic-1.117.5-a-56 +# +# Run with --no-install and --no-uninstall flags: +# act workflow_dispatch \ +# -W .github/workflows/enroot-tests-local.yml \ +# --secret-file .secrets \ +# --input run_single_node_test=true \ +# --input run_multi_node_test=true \ +# --input run_rccl_test=true \ +# --input no_install=true \ +# --input no_uninstall=true +# +# Additional act options: +# -v # Verbose output +# --container-architecture linux/amd64 # Specify architecture +# -P ubuntu-latest=catthehacker/ubuntu:act-latest # Use different runner image +# --bind # Bind working directory instead of copy +# -n # Dry run (don't actually run) +# +# ============================================================================= diff --git a/.github/workflows/enroot-tests.yml b/.github/workflows/enroot-tests.yml index f620c33..57c0630 100644 --- a/.github/workflows/enroot-tests.yml +++ b/.github/workflows/enroot-tests.yml @@ -72,9 +72,9 @@ jobs: ${{ github.event_name == 'push' || (github.event_name == 'workflow_dispatch' && ( - (matrix.test_name == 'test_single_node_pytorch' && inputs.run_single_node_test) || - (matrix.test_name == 'test_multi_node_distributed_pytorch' && inputs.run_multi_node_test) || - (matrix.test_name == 'test_multi_node_rccl' && inputs.run_rccl_test) + (matrix.test_name == 'test_single_node_pytorch' && inputs.run_single_node_test == true) || + (matrix.test_name == 'test_multi_node_distributed_pytorch' && inputs.run_multi_node_test == true) || + (matrix.test_name == 'test_multi_node_rccl' && inputs.run_rccl_test == true) )) }} @@ -84,9 +84,9 @@ jobs: ${{ github.event_name == 'push' || (github.event_name == 'workflow_dispatch' && ( - (matrix.test_name == 'test_single_node_pytorch' && inputs.run_single_node_test) || - (matrix.test_name == 'test_multi_node_distributed_pytorch' && inputs.run_multi_node_test) || - (matrix.test_name == 'test_multi_node_rccl' && inputs.run_rccl_test) + (matrix.test_name == 'test_single_node_pytorch' && inputs.run_single_node_test == true) || + (matrix.test_name == 'test_multi_node_distributed_pytorch' && inputs.run_multi_node_test == true) || + (matrix.test_name == 'test_multi_node_rccl' && inputs.run_rccl_test == true) )) }} with: @@ -97,9 +97,9 @@ jobs: ${{ github.event_name == 'push' || (github.event_name == 'workflow_dispatch' && ( - (matrix.test_name == 'test_single_node_pytorch' && inputs.run_single_node_test) || - (matrix.test_name == 'test_multi_node_distributed_pytorch' && inputs.run_multi_node_test) || - (matrix.test_name == 'test_multi_node_rccl' && inputs.run_rccl_test) + (matrix.test_name == 'test_single_node_pytorch' && inputs.run_single_node_test == true) || + (matrix.test_name == 'test_multi_node_distributed_pytorch' && inputs.run_multi_node_test == true) || + (matrix.test_name == 'test_multi_node_rccl' && inputs.run_rccl_test == true) )) }} run: | @@ -111,9 +111,9 @@ jobs: ${{ github.event_name == 'push' || (github.event_name == 'workflow_dispatch' && ( - (matrix.test_name == 'test_single_node_pytorch' && inputs.run_single_node_test) || - (matrix.test_name == 'test_multi_node_distributed_pytorch' && inputs.run_multi_node_test) || - (matrix.test_name == 'test_multi_node_rccl' && inputs.run_rccl_test) + (matrix.test_name == 'test_single_node_pytorch' && inputs.run_single_node_test == true) || + (matrix.test_name == 'test_multi_node_distributed_pytorch' && inputs.run_multi_node_test == true) || + (matrix.test_name == 'test_multi_node_rccl' && inputs.run_rccl_test == true) )) }} working-directory: tests/enroot @@ -183,9 +183,9 @@ jobs: always() && ( github.event_name == 'push' || (github.event_name == 'workflow_dispatch' && ( - (matrix.test_name == 'test_single_node_pytorch' && inputs.run_single_node_test) || - (matrix.test_name == 'test_multi_node_distributed_pytorch' && inputs.run_multi_node_test) || - (matrix.test_name == 'test_multi_node_rccl' && inputs.run_rccl_test) + (matrix.test_name == 'test_single_node_pytorch' && inputs.run_single_node_test == true) || + (matrix.test_name == 'test_multi_node_distributed_pytorch' && inputs.run_multi_node_test == true) || + (matrix.test_name == 'test_multi_node_rccl' && inputs.run_rccl_test == true) )) ) }} From ea2103b2895c408bbe980d3db7fc96451f215be2 Mon Sep 17 00:00:00 2001 From: kithumma Date: Thu, 22 Jan 2026 05:14:24 +0000 Subject: [PATCH 5/5] update workflow --- .github/workflows/enroot-tests.yml | 66 ++++++++++++++++++++++++++---- 1 file changed, 59 insertions(+), 7 deletions(-) diff --git a/.github/workflows/enroot-tests.yml b/.github/workflows/enroot-tests.yml index 57c0630..aac5746 100644 --- a/.github/workflows/enroot-tests.yml +++ b/.github/workflows/enroot-tests.yml @@ -47,7 +47,7 @@ on: type: boolean default: false testbed_file: - description: 'Path to testbed file (e.g. tests/enroot/testbeds/mi325.yaml) - defaults to secrets.TESTBED_FILE' + description: 'Path to testbed file (overrides secret-based testbed). If not provided, uses SINGLE_NODE_TESTBED_FILE or MULTI_NODE_TESTBED_FILE secrets (which should contain YAML content).' required: false type: string default: '' @@ -106,6 +106,44 @@ jobs: python3 -m pip install --upgrade pip pip install -r tests/enroot/requirements.txt + - name: Create testbed file from secret + if: | + ${{ + github.event_name == 'push' || + (github.event_name == 'workflow_dispatch' && ( + (matrix.test_name == 'test_single_node_pytorch' && inputs.run_single_node_test == true) || + (matrix.test_name == 'test_multi_node_distributed_pytorch' && inputs.run_multi_node_test == true) || + (matrix.test_name == 'test_multi_node_rccl' && inputs.run_rccl_test == true) + )) + }} + working-directory: tests/enroot + env: + SINGLE_NODE_TESTBED: ${{ secrets.SINGLE_NODE_TESTBED_FILE }} + MULTI_NODE_TESTBED: ${{ secrets.MULTI_NODE_TESTBED_FILE }} + run: | + # Create testbed files from secrets (secrets contain YAML content) + mkdir -p testbed + + # Write single-node testbed if secret exists + if [ -n "$SINGLE_NODE_TESTBED" ]; then + printf '%s\n' "$SINGLE_NODE_TESTBED" > testbed/single_node_tb.yml + echo "Created testbed/single_node_tb.yml from secret" + else + echo "[WARNING] SINGLE_NODE_TESTBED_FILE secret is not set" + fi + + # Write multi-node testbed if secret exists + if [ -n "$MULTI_NODE_TESTBED" ]; then + printf '%s\n' "$MULTI_NODE_TESTBED" > testbed/multi_node_tb.yml + echo "Created testbed/multi_node_tb.yml from secret" + else + echo "[WARNING] MULTI_NODE_TESTBED_FILE secret is not set" + fi + + # List created testbed files for debugging + echo "Testbed files created:" + ls -la testbed/ || echo "No testbed directory" + - name: Run enroot tests if: | ${{ @@ -123,11 +161,11 @@ jobs: # Determine testbed file and docker image based on test type and event if [ "${{ github.event_name }}" = "push" ]; then - # For push events: use test-type-specific secrets and default images from batch scripts + # For push events: use test-type-specific testbed files and default images from batch scripts if [ "$TEST_NAME" = "test_single_node_pytorch" ]; then - TESTBED_FILE="${{ secrets.SINGLE_NODE_TESTBED_FILE }}" + TESTBED_FILE="testbed/single_node_tb.yml" else - TESTBED_FILE="${{ secrets.MULTI_NODE_TESTBED_FILE }}" + TESTBED_FILE="testbed/multi_node_tb.yml" fi DOCKER_IMAGE="" NO_INSTALL="false" @@ -138,9 +176,9 @@ jobs: TESTBED_FILE="${{ inputs.testbed_file }}" else if [ "$TEST_NAME" = "test_single_node_pytorch" ]; then - TESTBED_FILE="${{ secrets.SINGLE_NODE_TESTBED_FILE }}" + TESTBED_FILE="testbed/single_node_tb.yml" else - TESTBED_FILE="${{ secrets.MULTI_NODE_TESTBED_FILE }}" + TESTBED_FILE="testbed/multi_node_tb.yml" fi fi NO_INSTALL="${{ inputs.no_install }}" @@ -156,6 +194,17 @@ jobs: fi fi + # Validate testbed file exists + if [ ! -f "$TESTBED_FILE" ]; then + echo "[ERROR] Testbed file not found: $TESTBED_FILE" + echo "Please ensure the appropriate secret is set:" + echo " - SINGLE_NODE_TESTBED_FILE for single-node tests" + echo " - MULTI_NODE_TESTBED_FILE for multi-node tests" + echo "Or provide a custom testbed_file input via workflow_dispatch." + exit 1 + fi + echo "Using testbed file: $TESTBED_FILE" + # Run RCCL test differently (pytest directly) if [ "$TEST_NAME" = "test_multi_node_rccl" ]; then # For RCCL test: extract version tag from docker image if provided @@ -168,10 +217,13 @@ jobs: echo "Using RCCL Docker image version: $DOCKER_IMAGE_VERSION" fi + # Convert testbed file to absolute path before changing directory + TESTBED_FILE_ABS="$(pwd)/$TESTBED_FILE" + # Set PYTHONPATH and cd to testsuites directory for pytest export PYTHONPATH=$(pwd):$PYTHONPATH cd testsuites - python3 -m pytest test_enroot.py --testbed "$TESTBED_FILE" -k test_multi_node_rccl --no-install --no-uninstall + python3 -m pytest test_enroot.py --testbed "$TESTBED_FILE_ABS" -k test_multi_node_rccl --no-install --no-uninstall else # For other tests: use run_test.py python3 run_test.py "$TEST_NAME" "$DOCKER_IMAGE" "$NO_INSTALL" "$NO_UNINSTALL" "$TESTBED_FILE"