From 82e87649de7448d84721223291af370012bca8ce Mon Sep 17 00:00:00 2001 From: Chandra Devarakonda Date: Sat, 2 Sep 2023 00:32:15 +0000 Subject: [PATCH 1/3] Fix pjrt tests --- tests/tensorflow/experimental.libsonnet | 1 + tests/tensorflow/nightly/common.libsonnet | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/tests/tensorflow/experimental.libsonnet b/tests/tensorflow/experimental.libsonnet index 45316bff7..4cb9b71e8 100644 --- a/tests/tensorflow/experimental.libsonnet +++ b/tests/tensorflow/experimental.libsonnet @@ -21,6 +21,7 @@ local mixins = import 'templates/mixins.libsonnet'; tpuSettings+: { tpuVmEnvVars+: { PYTHONPATH: '${PWD}', + WRAPT_DISABLE_EXTENSIONS: 'true', } + if config.accelerator.replicas > 1 then { TPU_LOAD_LIBRARY: '0', } else {}, diff --git a/tests/tensorflow/nightly/common.libsonnet b/tests/tensorflow/nightly/common.libsonnet index 70e1b88a6..557567175 100644 --- a/tests/tensorflow/nightly/common.libsonnet +++ b/tests/tensorflow/nightly/common.libsonnet @@ -115,6 +115,10 @@ local volumes = import 'templates/volumes.libsonnet'; tpuVm:: experimental.TensorFlowTpuVmMixin { local config = self, tpuSettings+: { + tpuVmEnvVars+: { + TF_PLUGGABLE_DEVICE_LIBRARY_PATH: '/lib/libtpu.so', + NEXT_PLUGGABLE_DEVICE_USE_C_API: 'true', + }, softwareVersion: if config.accelerator.replicas == 1 then 'v2-nightly' else @@ -178,6 +182,9 @@ local volumes = import 'templates/volumes.libsonnet'; softwareVersion=%(softwareVersion)s gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=all --command "echo 'WRAPT_DISABLE_EXTENSIONS=true' | sudo tee -a /etc/environment" + gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=all --command 'sudo sed -i "/HEALTH_AGENT_DOCKER_URL/c\HEALTH_AGENT_DOCKER_URL=\"gcr.io/cloud-tpu-v2-images/tpu_agents:cl_560157697\"" /home/tpu-runtime/tpu-env' + + gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=all --command "sudo systemctl daemon-reload && sudo systemctl restart healthagent.service" if [[ ${softwareVersion: -3} == "pod" ]]; then gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=all --command "sudo sed -i 's/TF_DOCKER_URL=.*/TF_DOCKER_URL=gcr.io\/cloud-tpu-v2-images-dev\/grpc_tpu_worker:nightly\"/' /etc/systemd/system/tpu-runtime.service" gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=all --command "sudo systemctl daemon-reload && sudo systemctl restart tpu-runtime" From b17f271acfe7740a3839b2e980f8c3cfa1466341 Mon Sep 17 00:00:00 2001 From: Chandra Devarakonda Date: Fri, 8 Sep 2023 18:45:23 +0000 Subject: [PATCH 2/3] Update tpu health agent --- tests/tensorflow/nightly/common.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tensorflow/nightly/common.libsonnet b/tests/tensorflow/nightly/common.libsonnet index 557567175..b49b6bc41 100644 --- a/tests/tensorflow/nightly/common.libsonnet +++ b/tests/tensorflow/nightly/common.libsonnet @@ -182,7 +182,7 @@ local volumes = import 'templates/volumes.libsonnet'; softwareVersion=%(softwareVersion)s gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=all --command "echo 'WRAPT_DISABLE_EXTENSIONS=true' | sudo tee -a /etc/environment" - gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=all --command 'sudo sed -i "/HEALTH_AGENT_DOCKER_URL/c\HEALTH_AGENT_DOCKER_URL=\"gcr.io/cloud-tpu-v2-images/tpu_agents:cl_560157697\"" /home/tpu-runtime/tpu-env' + gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=all --command 'sudo sed -i "/HEALTH_AGENT_DOCKER_URL/c\HEALTH_AGENT_DOCKER_URL=\"gcr.io/cloud-tpu-v2-images/tpu_agents:cl_562025307\"" /home/tpu-runtime/tpu-env' gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=all --command "sudo systemctl daemon-reload && sudo systemctl restart healthagent.service" if [[ ${softwareVersion: -3} == "pod" ]]; then From 58ede5827e8936d48620de71610e27dc7a74c8a2 Mon Sep 17 00:00:00 2001 From: Chandra Devarakonda Date: Fri, 8 Sep 2023 19:06:05 +0000 Subject: [PATCH 3/3] Since PjRt is functional, opening up all the tests --- tests/tensorflow/nightly/common.libsonnet | 5 +---- tests/tensorflow/nightly/targets.jsonnet | 14 +++++++------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/tests/tensorflow/nightly/common.libsonnet b/tests/tensorflow/nightly/common.libsonnet index b49b6bc41..b07db55f0 100644 --- a/tests/tensorflow/nightly/common.libsonnet +++ b/tests/tensorflow/nightly/common.libsonnet @@ -302,10 +302,7 @@ local volumes = import 'templates/volumes.libsonnet'; }, local functional_schedule = '0 9 * * *', Functional:: mixins.Functional { - schedule: if !(self.accelerator.type == 'tpu') || self.accelerator.name == 'v3-8' || self.accelerator.name == 'v4-8' then - functional_schedule - else - null, + schedule: functional_schedule, metricConfig+: { sourceMap+:: { tensorboard+: { diff --git a/tests/tensorflow/nightly/targets.jsonnet b/tests/tensorflow/nightly/targets.jsonnet index 93348e722..f61e1fc9b 100644 --- a/tests/tensorflow/nightly/targets.jsonnet +++ b/tests/tensorflow/nightly/targets.jsonnet @@ -24,13 +24,13 @@ local wmt = import 'tf-wmt-wmt14_translate.libsonnet'; // Add new models here std.flattenArrays([ - // dlrm.configs, + dlrm.configs, keras_api.configs, - // bert.configs, - // wmt.configs, - // maskrcnn.configs, - // retinanet.configs, + bert.configs, + wmt.configs, + maskrcnn.configs, + retinanet.configs, resnet.configs, - // resnetrs.configs, - // gpt2.configs, + resnetrs.configs, + gpt2.configs, ])