From b9c588b4855b94a662c66019922e4ea744e14b08 Mon Sep 17 00:00:00 2001 From: Xiongfei Wei Date: Wed, 8 Nov 2023 19:26:06 +0000 Subject: [PATCH 1/7] need to add initContainer next --- tests/pytorch/nightly/common.libsonnet | 2 +- tests/pytorch/nightly/resnet50-mp.libsonnet | 66 ++++++++++++++++++++- 2 files changed, 65 insertions(+), 3 deletions(-) diff --git a/tests/pytorch/nightly/common.libsonnet b/tests/pytorch/nightly/common.libsonnet index 76a6c191c..2d99c31d7 100644 --- a/tests/pytorch/nightly/common.libsonnet +++ b/tests/pytorch/nightly/common.libsonnet @@ -120,7 +120,7 @@ local volumes = import 'templates/volumes.libsonnet'; }, GpuMixin:: { local config = self, - imageTag+: '_cuda_11.8', + imageTag+: 'nightly_3.10_cuda_12.1', podTemplate+:: { spec+: { diff --git a/tests/pytorch/nightly/resnet50-mp.libsonnet b/tests/pytorch/nightly/resnet50-mp.libsonnet index b24e5bc53..7471f6c82 100644 --- a/tests/pytorch/nightly/resnet50-mp.libsonnet +++ b/tests/pytorch/nightly/resnet50-mp.libsonnet @@ -145,12 +145,73 @@ local tpus = import 'templates/tpus.libsonnet'; memory: '40Gi', // Disable XLA metrics report on GPU - command+: [ - '--nometrics_debug', + command: [ + 'bash', + '-c', + ||| + export PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin + export LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64 + + pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121 + pip install --user https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.1.0-cp310-cp310-manylinux_2_28_x86_64.whl + + git clone --depth=1 https://github.com/pytorch/pytorch.git + cd pytorch + git clone https://github.com/pytorch/xla.git + + while true + do + ip=$(getent hosts ptxla-hello-world-0.headless-svc | awk {'print $1'}) + if [ $? -eq 0 ] && [ \"${ip}\" != \"\" ] + then + break + else + sleep 10 + fi + done + echo $ip + + PJRT_DEVICE=CUDA torchrun --nnodes=4 --node_rank=$JOB_COMPLETION_INDEX --nproc_per_node=4 --rdzv_endpoint=$ip:12355 xla/test/test_train_mp_imagenet.py --fake_data --pjrt_distributed --batch_size=128 --num_epochs=1" + |||, ], flags+: { modelDir: null, }, + + jobTemplate+:: { + spec+: { + completionMode: 'Indexed', + completions: 4, + parallelism: 4, + }, + }, + + podTemplate+:: { + spec+: { + backoffLimit: 1, + initContainerMap+:: { + }, + containerMap+:: { + train+: { + envMap+: { + }, + }, + }, + subdomain: '$(JOB_NAME)', # xw32: need to verify. + tolerations: [ + { + key: "nvidia.com/gpu", + operator: "Exists", + effect: "NoSchedule", + }, + ], + ports: [ + { + containerPort: 1234, + }, + ], + }, + }, }, local v100x4 = self.v100x4, v100x4:: gpu { @@ -194,6 +255,7 @@ local tpus = import 'templates/tpus.libsonnet'; }, configs: [ + resnet50 + functional + v100x4 + timeouts.Hours(2), // PJRT resnet50 + fake_data + v2_8 + timeouts.Hours(3) + pjrt, resnet50 + fake_data + v3_8 + timeouts.Hours(2) + pjrt, From ab974d2e8c8cbbc2c9e9f8057a808d5dbd612a67 Mon Sep 17 00:00:00 2001 From: Xiongfei Wei Date: Wed, 8 Nov 2023 22:47:45 +0000 Subject: [PATCH 2/7] added init container --- tests/pytorch/nightly/resnet50-mp.libsonnet | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/pytorch/nightly/resnet50-mp.libsonnet b/tests/pytorch/nightly/resnet50-mp.libsonnet index 7471f6c82..3108e2fdd 100644 --- a/tests/pytorch/nightly/resnet50-mp.libsonnet +++ b/tests/pytorch/nightly/resnet50-mp.libsonnet @@ -161,7 +161,7 @@ local tpus = import 'templates/tpus.libsonnet'; while true do - ip=$(getent hosts ptxla-hello-world-0.headless-svc | awk {'print $1'}) + ip=$(getent hosts ptxla-hello-world-0.headless-svc-$(JOB_NAME) | awk {'print $1'}) if [ $? -eq 0 ] && [ \"${ip}\" != \"\" ] then break @@ -190,6 +190,11 @@ local tpus = import 'templates/tpus.libsonnet'; spec+: { backoffLimit: 1, initContainerMap+:: { + 'tpu-version': { + command: [ + "kubectl expose headless-svc-$(JOB_NAME) --type='None' --selector='job-name: $(JOB_NAME)'", + ], + }, }, containerMap+:: { train+: { @@ -197,7 +202,7 @@ local tpus = import 'templates/tpus.libsonnet'; }, }, }, - subdomain: '$(JOB_NAME)', # xw32: need to verify. + subdomain: 'headless-svc-$(JOB_NAME)', # xw32: need to verify. tolerations: [ { key: "nvidia.com/gpu", From c41955ad9a82aadda3a028e8f67da46bf63937d2 Mon Sep 17 00:00:00 2001 From: Xiongfei Wei Date: Wed, 8 Nov 2023 23:01:45 +0000 Subject: [PATCH 3/7] print version --- tests/pytorch/nightly/resnet50-mp.libsonnet | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/pytorch/nightly/resnet50-mp.libsonnet b/tests/pytorch/nightly/resnet50-mp.libsonnet index 3108e2fdd..62e8a92f6 100644 --- a/tests/pytorch/nightly/resnet50-mp.libsonnet +++ b/tests/pytorch/nightly/resnet50-mp.libsonnet @@ -152,6 +152,9 @@ local tpus = import 'templates/tpus.libsonnet'; export PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin export LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64 + nvidia-smi + nvcc -V + pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121 pip install --user https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.1.0-cp310-cp310-manylinux_2_28_x86_64.whl From 4283cfd5c18d214451f9083e51c0a13d1533aaf3 Mon Sep 17 00:00:00 2001 From: Xiongfei Wei Date: Wed, 8 Nov 2023 23:25:40 +0000 Subject: [PATCH 4/7] added the containerPort --- tests/pytorch/nightly/resnet50-mp.libsonnet | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/tests/pytorch/nightly/resnet50-mp.libsonnet b/tests/pytorch/nightly/resnet50-mp.libsonnet index 62e8a92f6..2f40407ea 100644 --- a/tests/pytorch/nightly/resnet50-mp.libsonnet +++ b/tests/pytorch/nightly/resnet50-mp.libsonnet @@ -191,7 +191,6 @@ local tpus = import 'templates/tpus.libsonnet'; podTemplate+:: { spec+: { - backoffLimit: 1, initContainerMap+:: { 'tpu-version': { command: [ @@ -201,8 +200,11 @@ local tpus = import 'templates/tpus.libsonnet'; }, containerMap+:: { train+: { - envMap+: { - }, + ports: [ + { + containerPort: 1234, + }, + ], }, }, subdomain: 'headless-svc-$(JOB_NAME)', # xw32: need to verify. @@ -213,11 +215,7 @@ local tpus = import 'templates/tpus.libsonnet'; effect: "NoSchedule", }, ], - ports: [ - { - containerPort: 1234, - }, - ], + }, }, }, From 243d7fc016c6da74e22d755edf64fdd89b55a658 Mon Sep 17 00:00:00 2001 From: Xiongfei Wei Date: Wed, 8 Nov 2023 23:28:53 +0000 Subject: [PATCH 5/7] fix another error --- tests/pytorch/nightly/resnet50-mp.libsonnet | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/pytorch/nightly/resnet50-mp.libsonnet b/tests/pytorch/nightly/resnet50-mp.libsonnet index 2f40407ea..722630672 100644 --- a/tests/pytorch/nightly/resnet50-mp.libsonnet +++ b/tests/pytorch/nightly/resnet50-mp.libsonnet @@ -196,6 +196,7 @@ local tpus = import 'templates/tpus.libsonnet'; command: [ "kubectl expose headless-svc-$(JOB_NAME) --type='None' --selector='job-name: $(JOB_NAME)'", ], + "image": "google/cloud-sdk", }, }, containerMap+:: { From 10ad060c8449968215cdb0cc7bdfa7f70aea4f6f Mon Sep 17 00:00:00 2001 From: Xiongfei Wei Date: Tue, 21 Nov 2023 22:48:07 +0000 Subject: [PATCH 6/7] did some experiment --- tests/pytorch/nightly/resnet50-mp.libsonnet | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/pytorch/nightly/resnet50-mp.libsonnet b/tests/pytorch/nightly/resnet50-mp.libsonnet index 722630672..938f1f884 100644 --- a/tests/pytorch/nightly/resnet50-mp.libsonnet +++ b/tests/pytorch/nightly/resnet50-mp.libsonnet @@ -208,7 +208,8 @@ local tpus = import 'templates/tpus.libsonnet'; ], }, }, - subdomain: 'headless-svc-$(JOB_NAME)', # xw32: need to verify. + // subdomain: 'headless-svc-$(JOB_NAME)', doesn't work. + // subdomain: "headless-svc-metadata.labels['job-name']", doesn't work. tolerations: [ { key: "nvidia.com/gpu", From a940186fc700164035b028f1b0329379fb4712c4 Mon Sep 17 00:00:00 2001 From: Xiongfei Wei Date: Wed, 22 Nov 2023 00:00:42 +0000 Subject: [PATCH 7/7] Try kubectl patch --- tests/pytorch/nightly/common.libsonnet | 2 +- tests/pytorch/nightly/resnet50-mp.libsonnet | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/pytorch/nightly/common.libsonnet b/tests/pytorch/nightly/common.libsonnet index 2d99c31d7..116f4b67d 100644 --- a/tests/pytorch/nightly/common.libsonnet +++ b/tests/pytorch/nightly/common.libsonnet @@ -120,7 +120,7 @@ local volumes = import 'templates/volumes.libsonnet'; }, GpuMixin:: { local config = self, - imageTag+: 'nightly_3.10_cuda_12.1', + imageTag: 'nightly_3.10_cuda_12.1', podTemplate+:: { spec+: { diff --git a/tests/pytorch/nightly/resnet50-mp.libsonnet b/tests/pytorch/nightly/resnet50-mp.libsonnet index 938f1f884..3be6c0141 100644 --- a/tests/pytorch/nightly/resnet50-mp.libsonnet +++ b/tests/pytorch/nightly/resnet50-mp.libsonnet @@ -194,6 +194,9 @@ local tpus = import 'templates/tpus.libsonnet'; initContainerMap+:: { 'tpu-version': { command: [ + "echo JOB_NAME=$(JOB_NAME)", + "echo POD_NAME=$(POD_NAME)", + "kubectl patch job $(JOB_NAME) -p \'{\"spec\":{\"subdomain\": \"headless-svc-$(JOB_NAME)\"}}\'", "kubectl expose headless-svc-$(JOB_NAME) --type='None' --selector='job-name: $(JOB_NAME)'", ], "image": "google/cloud-sdk",