From b0adbdea39437df6de8875dae291a891f85612ee Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Sun, 25 Jan 2026 16:14:18 -0500 Subject: [PATCH] Update CI workflows to use tgrogers-raid for job execution and replace direct calls with srun for GPU jobs --- .github/workflows/main.yml | 16 +++------------- .github/workflows/weekly.yml | 10 +++++----- 2 files changed, 8 insertions(+), 18 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index f7de0a34c..c86ac6901 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -240,7 +240,7 @@ jobs: ./util/job_launching/monitor_func_test.py -v --sleep_time 300 -s stats-per-app-ptx.csv -N short-ptx-${{ github.run_number }}_${{ github.run_attempt}} Tracer-Tool: needs: [check-format] - runs-on: tgrogers-gpu01 + runs-on: tgrogers-raid timeout-minutes: 720 defaults: run: @@ -295,16 +295,6 @@ jobs: source ./gpu-simulator/setup_environment.sh make clean -C gpu-simulator make -j20 -C gpu-simulator - - name: test-prebuilt-traces - run: | - source ./env-setup/12.8_env_setup.sh - source ./gpu-simulator/setup_environment.sh - ./get-accel-sim-traces.py -a tesla-v100/rodinia_2.0-ft - cd hw_run; tar -xzvf rodinia_2.0-ft.tgz; cd - - ./util/job_launching/run_simulations.py -B rodinia_2.0-ft -C QV100-SASS -T ./hw_run/ -N rodinia_2.0-ft-online-$$ - ./util/job_launching/monitor_func_test.py --sleep_time 300 -N rodinia_2.0-ft-online-$$ -v - rm -rf hw_run - rm -rf sim_run_11.0 - name: Build Tracer run: | source ./env-setup/12.8_env_setup.sh @@ -324,12 +314,12 @@ jobs: source ./env-setup/12.8_env_setup.sh source ./gpu-app-collection/src/setup_environment rm -rf ./hw_run/ - ./util/tracer_nvbit/run_hw_trace.py -B rodinia_2.0-ft -D 7 + srun --job-name=gpu-lock --dependency=singleton --partition=tgrogers-dgx -- ./util/tracer_nvbit/run_hw_trace.py -B rodinia_2.0-ft -D 7 - name: generate-rodinia_2.0-ft-hw_stats run: | source ./env-setup/12.8_env_setup.sh source ./gpu-app-collection/src/setup_environment - ./util/hw_stats/run_hw.py -B rodinia_2.0-ft -D 7 + srun --job-name=gpu-lock --dependency=singleton --partition=tgrogers-dgx -- ./util/hw_stats/run_hw.py -B rodinia_2.0-ft -D 7 - name: test-new-traces run: | source ./env-setup/12.8_env_setup.sh diff --git a/.github/workflows/weekly.yml b/.github/workflows/weekly.yml index eee5bee98..83d24a9bf 100644 --- a/.github/workflows/weekly.yml +++ b/.github/workflows/weekly.yml @@ -11,7 +11,7 @@ jobs: Tracer-Weekly: timeout-minutes: 720 if: ${{ github.repository == 'accel-sim/accel-sim-framework' || github.event_name == 'workflow_dispatch' }} - runs-on: tgrogers-gpu01 + runs-on: tgrogers-raid defaults: run: shell: bash @@ -52,8 +52,8 @@ jobs: source ./env-setup/12.8_env_setup.sh source ./gpu-app-collection/src/setup_environment rm -rf ./hw_run/ - ./util/tracer_nvbit/run_hw_trace.py -B rodinia_2.0-ft,rodinia-3.1,GPU_Microbenchmark -D 7 - ./util/hw_stats/run_hw.py -B rodinia_2.0-ft,rodinia-3.1,GPU_Microbenchmark -D 7 + srun --job-name=gpu-lock --dependency=singleton --partition=tgrogers-dgx -- ./util/tracer_nvbit/run_hw_trace.py -B rodinia_2.0-ft,rodinia-3.1,GPU_Microbenchmark -D 7 + srun --job-name=gpu-lock --dependency=singleton --partition=tgrogers-dgx -- ./util/hw_stats/run_hw.py -B rodinia_2.0-ft,rodinia-3.1,GPU_Microbenchmark -D 7 rm -rf /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces mkdir -p /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces mv ./hw_run /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces/hw_run @@ -63,9 +63,9 @@ jobs: source ./env-setup/12.8_env_setup.sh source ./gpu-app-collection/src/setup_environment rm -rf ./hw_run/ - ./util/tracer_nvbit/run_hw_trace.py -B Spinlock -D 7 --spinlock_handling fast_forward + srun --job-name=gpu-lock --dependency=singleton --partition=tgrogers-dgx -- ./util/tracer_nvbit/run_hw_trace.py -B Spinlock -D 7 --spinlock_handling fast_forward mv ./hw_run /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces/hw_run_fast_forward - ./util/tracer_nvbit/run_hw_trace.py -B Spinlock -D 7 --spinlock_handling none + srun --job-name=gpu-lock --dependency=singleton --partition=tgrogers-dgx -- ./util/tracer_nvbit/run_hw_trace.py -B Spinlock -D 7 --spinlock_handling none mv ./hw_run /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces/hw_run_none SASS-Weekly: timeout-minutes: 720