diff --git a/tests/pytorch/nightly/common.libsonnet b/tests/pytorch/nightly/common.libsonnet index 76a6c191c..116f4b67d 100644 --- a/tests/pytorch/nightly/common.libsonnet +++ b/tests/pytorch/nightly/common.libsonnet @@ -120,7 +120,7 @@ local volumes = import 'templates/volumes.libsonnet'; }, GpuMixin:: { local config = self, - imageTag+: '_cuda_11.8', + imageTag: 'nightly_3.10_cuda_12.1', podTemplate+:: { spec+: { diff --git a/tests/pytorch/nightly/resnet50-mp.libsonnet b/tests/pytorch/nightly/resnet50-mp.libsonnet index b24e5bc53..3be6c0141 100644 --- a/tests/pytorch/nightly/resnet50-mp.libsonnet +++ b/tests/pytorch/nightly/resnet50-mp.libsonnet @@ -145,12 +145,84 @@ local tpus = import 'templates/tpus.libsonnet'; memory: '40Gi', // Disable XLA metrics report on GPU - command+: [ - '--nometrics_debug', + command: [ + 'bash', + '-c', + ||| + export PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin + export LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64 + + nvidia-smi + nvcc -V + + pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121 + pip install --user https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.1.0-cp310-cp310-manylinux_2_28_x86_64.whl + + git clone --depth=1 https://github.com/pytorch/pytorch.git + cd pytorch + git clone https://github.com/pytorch/xla.git + + while true + do + ip=$(getent hosts ptxla-hello-world-0.headless-svc-$(JOB_NAME) | awk {'print $1'}) + if [ $? -eq 0 ] && [ \"${ip}\" != \"\" ] + then + break + else + sleep 10 + fi + done + echo $ip + + PJRT_DEVICE=CUDA torchrun --nnodes=4 --node_rank=$JOB_COMPLETION_INDEX --nproc_per_node=4 --rdzv_endpoint=$ip:12355 xla/test/test_train_mp_imagenet.py --fake_data --pjrt_distributed --batch_size=128 --num_epochs=1" + |||, ], flags+: { modelDir: null, }, + + jobTemplate+:: { + spec+: { + completionMode: 'Indexed', + completions: 4, + parallelism: 4, + }, + }, + + podTemplate+:: { + spec+: { + initContainerMap+:: { + 'tpu-version': { + command: [ + "echo JOB_NAME=$(JOB_NAME)", + "echo POD_NAME=$(POD_NAME)", + "kubectl patch job $(JOB_NAME) -p \'{\"spec\":{\"subdomain\": \"headless-svc-$(JOB_NAME)\"}}\'", + "kubectl expose headless-svc-$(JOB_NAME) --type='None' --selector='job-name: $(JOB_NAME)'", + ], + "image": "google/cloud-sdk", + }, + }, + containerMap+:: { + train+: { + ports: [ + { + containerPort: 1234, + }, + ], + }, + }, + // subdomain: 'headless-svc-$(JOB_NAME)', doesn't work. + // subdomain: "headless-svc-metadata.labels['job-name']", doesn't work. + tolerations: [ + { + key: "nvidia.com/gpu", + operator: "Exists", + effect: "NoSchedule", + }, + ], + + }, + }, }, local v100x4 = self.v100x4, v100x4:: gpu { @@ -194,6 +266,7 @@ local tpus = import 'templates/tpus.libsonnet'; }, configs: [ + resnet50 + functional + v100x4 + timeouts.Hours(2), // PJRT resnet50 + fake_data + v2_8 + timeouts.Hours(3) + pjrt, resnet50 + fake_data + v3_8 + timeouts.Hours(2) + pjrt,