From c7c3048d6ca9e6da8bcec5cd84d56fc466335116 Mon Sep 17 00:00:00 2001 From: Priyanka Kini J Date: Tue, 20 Jan 2026 05:25:01 -0800 Subject: [PATCH 1/3] Adding python test case to run rccl test --- tests/enroot/testsuites/test_enroot.py | 77 +++++++++++++++++++++++++- 1 file changed, 76 insertions(+), 1 deletion(-) diff --git a/tests/enroot/testsuites/test_enroot.py b/tests/enroot/testsuites/test_enroot.py index 5f74dbb..391785a 100644 --- a/tests/enroot/testsuites/test_enroot.py +++ b/tests/enroot/testsuites/test_enroot.py @@ -418,7 +418,82 @@ def test_multi_node_distributed_pytorch(): log.info(f"Output : ") log.info(output['stdout'].encode().decode('unicode_escape')) - # Copy back results and deleted the directory and files + # Copy back results and delete the directory and files + log.info(f"Copying all the results to {str(pytest.testdata.results_dir)}...") + + for file in copy_file_list: + local_file = pytest.testdata.results_dir / Path(file).name + exit_code = amd_host.copy_from_host(file,local_file) + assert not exit_code, f" Error copying the file {file} !" + exit_code, output = amd_host.execute_command(f"sudo rm -rf {file}") + assert not exit_code , f" Error deleting the file {file} !, {output['stderr']}" + + # Remove the parent directory + exit_code, output = amd_host.execute_command(f"sudo rm -rf {parent_dir}") + assert not exit_code, f" Error deleting the folder {parent_dir} !, {output['stderr']}" + + # Delete the batch script on the remote host + exit_code, output = amd_host.execute_command(f"sudo rm -rf {remote_script}") + assert not exit_code , f" Error deleting the script {remote_script}!, {output['stderr']}" + +def test_multi_node_rccl(): + """ + Use sbatch to run rccl test on multiple nodes + + TestID: TCID-ENROOT-MULTI-NODE-RCCL + + Setup: + 1.Copy batch file to the home directory + 2.Launch the sbatch script + + Validation: + 1. Verify if sbatch test is completed + 2. 
Verify if the output file - logs/rccl_test_%j.out is created and print that output + 3. Verify and print the results + Raises: + AssertionError: Above validation points are failed + """ + + amd_host = pytest.testdata.amd_host[0] + # Create batch script + local_script = batch_scripts_folder / "rccl_tests_sbatch.sh" + remote_script = str(local_script.name) + log.info(f"Creating {local_script.name} on {amd_host.host_ip}...") + exit_code = create_batch_script(amd_host,local_script) + if exit_code: + assert False, f"{local_script.name} on {amd_host.host_ip} couldnt be created!!" + log.info(f"Creating {local_script.name} on {amd_host.host_ip} - Successfull !!") + + # Run the batch script -> get jobid + exit_code, output = amd_host.execute_command(f"sbatch --parsable --gres=gpu:{amd_host.gpu_num} {remote_script} ") + assert not exit_code, f"sbatch command couldnt be launched !! : {output['stderr']}" + job_id = output['stdout'].strip() + log.info(f"sbatch job - {job_id} submitted !!") + + # Wait for job completion + job_state, sacct_output = wait_for_job_completion(amd_host,job_id) + log.info(f"Job state of {job_id} : {job_state}") + log.info(f"sacct output : {sacct_output}") + err_file = f"logs/rccl_test_{job_id}.err" + output_file = f"logs/rccl_test_{job_id}.out" + copy_file_list.append(output_file) + copy_file_list.append(err_file) + + if "COMPLETED" not in job_state: + exit_code, output = amd_host.execute_command(f"cat {err_file}") + assert not exit_code, f"{amd_host.host_ip}:Couldnt print the batch error file {err_file} : {output['stderr']}" + log.info(f"ERROR file : {output['stdout']}") + assert False, "RCCL test case failed.. !! 
" + + # Check for output file and print the results + parent_dir="logs" + log.info(f"Checking {parent_dir}/ ...") + exit_code, output = amd_host.execute_command(f"cat {output_file} ") + assert not exit_code, f" Error retrieving the file {output_file}!, {output['stderr']}" + log.info(f"Output : ") + log.info(output['stdout'].encode().decode('unicode_escape')) + + # Copy back results and delete the directory and files log.info(f"Copying all the results to {str(pytest.testdata.results_dir)}...") for file in copy_file_list: From 5fde3e18db0da8a15d77e43b93118931a3837e11 Mon Sep 17 00:00:00 2001 From: Priyanka Kini J Date: Tue, 20 Jan 2026 08:48:29 -0800 Subject: [PATCH 2/3] Updated the READme --- tests/enroot/README.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/enroot/README.md b/tests/enroot/README.md index d0375dd..c3f4e25 100644 --- a/tests/enroot/README.md +++ b/tests/enroot/README.md @@ -118,7 +118,11 @@ Test flow : * Copy batch file and helper script required * Launch sbatch to run the test * Once the test is complete, copy back all the results and logs to "results" folder -4. Testbed teardown: +4. Run the *test_multi_node_rccl* test: + * Copy the sbatch file to the host + * Launch sbatch to run the test + * Once the test is complete, copy back all the results and logs to "results" folder +5. 
Testbed teardown: * Uninstall slurm, enroot and pyxis(skip this if *--no-uninstall* flag is given in the command line) ```bash @@ -148,5 +152,10 @@ Run only the multinode distributed pytorch test and skip installation, if slurm, ```bash python3 -m pytest test_enroot.py --testbed ../testbed/enroot_tb.yml -k test_multi_node_distributed_pytorch --no-install --no-uninstall ``` +Run only the multinode rccl test and skip installation, if slurm, enroot and pyxis are already installed + +```bash +python3 -m pytest test_enroot.py --testbed ../testbed/enroot_tb.yml -k test_multi_node_rccl --no-install --no-uninstall +``` --- From c7d24464b131fc2e10a3b0f66310b542d45ff0a5 Mon Sep 17 00:00:00 2001 From: Priyanka Kini J Date: Wed, 21 Jan 2026 06:14:37 -0800 Subject: [PATCH 3/3] Fix variable error --- tests/enroot/testsuites/test_enroot.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/enroot/testsuites/test_enroot.py b/tests/enroot/testsuites/test_enroot.py index 391785a..7f83f12 100644 --- a/tests/enroot/testsuites/test_enroot.py +++ b/tests/enroot/testsuites/test_enroot.py @@ -455,6 +455,7 @@ def test_multi_node_rccl(): """ amd_host = pytest.testdata.amd_host[0] + copy_file_list =[] # Create batch script local_script = batch_scripts_folder / "rccl_tests_sbatch.sh" remote_script = str(local_script.name)