From 2a713601b1fadf474b3ee18aa4d57ddac88b8784 Mon Sep 17 00:00:00 2001 From: Mark Bzomowski Date: Tue, 24 Oct 2023 18:52:01 +0000 Subject: [PATCH 1/4] Add TensorFlow 2.15 tests for TPU VM --- tests/tensorflow/r2.15/common.libsonnet | 359 ++++++++++++++++++ tests/tensorflow/r2.15/targets.jsonnet | 36 ++ .../r2.15/tf-bert-glue_mnli.libsonnet | 81 ++++ .../tensorflow/r2.15/tf-dlrm-criteo.libsonnet | 125 ++++++ .../r2.15/tf-gpt2-wikitext.libsonnet | 57 +++ tests/tensorflow/r2.15/tf-keras-api.libsonnet | 138 +++++++ .../r2.15/tf-maskrcnn-coco.libsonnet | 65 ++++ .../r2.15/tf-resnet-imagenet.libsonnet | 67 ++++ .../r2.15/tf-resnetrs-imagenet.libsonnet | 55 +++ .../r2.15/tf-retinanet-coco.libsonnet | 69 ++++ .../r2.15/tf-wmt-wmt14_translate.libsonnet | 116 ++++++ 11 files changed, 1168 insertions(+) create mode 100644 tests/tensorflow/r2.15/common.libsonnet create mode 100644 tests/tensorflow/r2.15/targets.jsonnet create mode 100644 tests/tensorflow/r2.15/tf-bert-glue_mnli.libsonnet create mode 100644 tests/tensorflow/r2.15/tf-dlrm-criteo.libsonnet create mode 100644 tests/tensorflow/r2.15/tf-gpt2-wikitext.libsonnet create mode 100644 tests/tensorflow/r2.15/tf-keras-api.libsonnet create mode 100644 tests/tensorflow/r2.15/tf-maskrcnn-coco.libsonnet create mode 100644 tests/tensorflow/r2.15/tf-resnet-imagenet.libsonnet create mode 100644 tests/tensorflow/r2.15/tf-resnetrs-imagenet.libsonnet create mode 100644 tests/tensorflow/r2.15/tf-retinanet-coco.libsonnet create mode 100644 tests/tensorflow/r2.15/tf-wmt-wmt14_translate.libsonnet diff --git a/tests/tensorflow/r2.15/common.libsonnet b/tests/tensorflow/r2.15/common.libsonnet new file mode 100644 index 000000000..3d60c7961 --- /dev/null +++ b/tests/tensorflow/r2.15/common.libsonnet @@ -0,0 +1,359 @@ +// Copyright 2020 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +local common = import '../common.libsonnet'; +local experimental = import '../experimental.libsonnet'; +local metrics = import 'templates/metrics.libsonnet'; +local mixins = import 'templates/mixins.libsonnet'; +local utils = import 'templates/utils.libsonnet'; +local volumes = import 'templates/volumes.libsonnet'; + +{ + HuggingFaceTransformer:: common.ModelGardenTest { + local config = self, + + frameworkPrefix: 'tf-r2.15.0', + tpuSettings+: { + softwareVersion: '2.15.0', + }, + imageTag: 'r2.15.0', + script: { + initialSetup: + ||| + cd /tmp + git clone https://github.com/huggingface/transformers.git + cd transformers + pip install . 
+ pip install -r examples/tensorflow/_tests_requirements.txt + |||, + }, + }, + ModelGardenTest:: common.ModelGardenTest { + local config = self, + + frameworkPrefix: 'tf-r2.15.0', + tpuSettings+: { + softwareVersion: '2.15.0', + }, + imageTag: 'r2.15.0', + podTemplate+:: if config.accelerator.type == 'tpu' then + { + spec+: { + initContainerMap+:: { + 'tpu-version': { + image: config.podTemplate.spec.containerMap.train.image, + env+: [ + { + name: 'TPU_NAME', + valueFrom: { + fieldRef: { + fieldPath: "metadata.annotations['name.cloud-tpus.google.com/train']", + }, + }, + }, + { + name: 'POD_UID', + valueFrom: { + fieldRef: { + fieldPath: 'metadata.uid', + }, + }, + }, + ], + local tpuCreateSettings = { + acceleratorName: std.escapeStringBash(config.accelerator.name), + softwareVersion: std.escapeStringBash(config.tpuSettings.softwareVersion), + startupScript: std.escapeStringBash(config.tpuSettings.tpuVmStartupScript), + sleepTime: config.tpuSettings.tpuVmCreateSleepSeconds, + testName: std.strReplace(config.testName, '.', '-'), + }, + command: [ + 'python3', + '-c', + ||| + import os + import tensorflow as tf + import urllib.request + import json + import cloud_tpu_client + import sys + print('python version: ' + str(sys.version)) + print('tf_version: ' + str(tf.__version__)) + #TODO(chandrasekhard): + # Add extra condition to fail if it picks stale image + print(str(tf.__file__)) + ctc = cloud_tpu_client.Client(tpu=os.path.basename('$(TPU_NAME)'), zone=os.path.dirname('$(TPU_NAME)')) + ctc.wait_for_healthy() + ctc.configure_tpu_version('nightly', restart_type='always') + ctc.wait_for_healthy() + _VERSION_SWITCHER_ENDPOINT = 'http://{}:8475/requestversion' + url = _VERSION_SWITCHER_ENDPOINT.format(ctc.network_endpoints()[0]['ipAddress']) + req = urllib.request.Request(url) + resp = urllib.request.urlopen(req) + version_details = json.loads(resp.read()) + print(version_details) + |||, + ], + }, + }, + }, + } + else + {}, + }, + tpuVm:: experimental.TensorFlowTpuVmMixin { 
+ local config = self, + tpuSettings+: { + softwareVersion: if config.accelerator.replicas == 1 then + 'tpu-vm-tf-2.15.0' + else + 'tpu-vm-tf-2.15.0-pod', + tpuVmEnvVars+: (if std.parseInt(std.split(config.accelerator.name, '-')[1]) <= 8 then { + TF_PLUGGABLE_DEVICE_LIBRARY_PATH: '/lib/libtpu.so', + NEXT_PLUGGABLE_DEVICE_USE_C_API: 'true', + } else {}), + }, + podTemplate+:: { + spec+: { + initContainerMap+:: { + 'create-tpu'+: { + local tpuCreateSettings = { + acceleratorName: std.escapeStringBash(config.accelerator.name), + softwareVersion: std.escapeStringBash(config.tpuSettings.softwareVersion), + startupScript: std.escapeStringBash(config.tpuSettings.tpuVmStartupScript), + sleepTime: config.tpuSettings.tpuVmCreateSleepSeconds, + testName: std.strReplace(config.testName, '.', '-'), + }, + command: utils.scriptCommand(||| + project=$(curl -sS "http://metadata.google.internal/computeMetadata/v1/project/project-id" -H "Metadata-Flavor: Google") + zone=$(curl -sS "http://metadata.google.internal/computeMetadata/v1/instance/zone" -H "Metadata-Flavor: Google" | awk -F'/' '{print $4}') + tpu_name=tpu-${POD_UID} + ssh-keygen -t rsa -f /scripts/id_rsa -q -N "" + + echo " + gcloud alpha compute tpus tpu-vm delete -q --async ${tpu_name} --zone=${zone} + sleep 60 + " > /scripts/cleanup.sh + + echo "xl-ml-test:$(cat /scripts/id_rsa.pub)" > ssh-keys.txt + echo %(startupScript)s > startup-script.txt + + # Retry every 30 seconds for up to 10 minutes + start_time="$(date -u +%%s)" + for i in {1..20}; do + set +e + gcloud alpha compute tpus tpu-vm create ${tpu_name} \ + --accelerator-type=%(acceleratorName)s \ + --version=%(softwareVersion)s \ + --metadata-from-file='ssh-keys=ssh-keys.txt,startup-script=startup-script.txt' \ + --labels='test-name=%(testName)s' \ + --zone=${zone} + + exit_code=$? 
+ set -e + + current_time="$(date -u +%%s)" + elapsed_seconds=$(($current_time-$start_time)) + # Break if command passed or 10-minute limit reached + test $exit_code = 0 && break + test $elapsed_seconds -gt 600 && break + sleep 30 + done + + if [ $exit_code -ne 0 ]; then + exit $exit_code + fi + + + echo ${zone} > /scripts/zone + echo ${tpu_name} > /scripts/tpu_name + gcloud compute tpus describe ${tpu_name} --project=${project} --zone=${zone} --format="value(networkEndpoints[0].ipAddress)" > /scripts/tpu_ip + gcloud compute tpus describe ${tpu_name} --project=${project} --zone=${zone} --flatten="networkEndpoints[]" --format="csv[no-heading](networkEndpoints.ipAddress)" > /scripts/all_tpu_ips + sleep %(sleepTime)d + + softwareVersion=%(softwareVersion)s + gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=0 --command "pip install tensorflow-text-nightly" + gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=0 --command "gsutil -m cp gs://cloud-tpu-v2-images-dev-artifacts/tensorflow/tf-nightly/latest/*.whl /tmp/ && pip install /tmp/tf*.whl --force" + + gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=0 --command "sudo gsutil -m cp gs://cloud-tpu-v2-images-dev-artifacts/libtpu/latest/libtpu.so /lib/" + gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=0 --command "sudo mkdir -p /usr/share/tpu && cd /usr/share/tpu && git clone https://github.com/tensorflow/models.git" + + accelerator_type=%(acceleratorName)s + if (( ${accelerator_type: -2} > 8 )); then + gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=all --command "sudo sed -i 
's/TF_DOCKER_URL=.*/TF_DOCKER_URL=gcr.io\/cloud-tpu-v2-images-dev\/grpc_tpu_worker:nightly\"/' /etc/systemd/system/tpu-runtime.service" + gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=all --command "sudo systemctl daemon-reload && sudo systemctl restart tpu-runtime" + fi + ||| % tpuCreateSettings), + }, + 'tpu-version': { + image: 'google/cloud-sdk', + command: null, + }, + }, + }, + }, + }, + TfVisionTest:: self.ModelGardenTest + common.TfNlpVisionMixin { + scriptConfig+: { + runnerPath: 'official/vision/train.py', + }, + }, + TfNlpTest:: self.ModelGardenTest + common.TfNlpVisionMixin { + scriptConfig+: { + runnerPath: 'official/nlp/train.py', + }, + }, + TfRankingTest:: self.ModelGardenTest { + paramsOverride:: { + runtime: { + distribution_strategy: error 'Must set `runtime.distribution_strategy`', + }, + task: { + train_data: { + input_path: '$(CRITEO_DATA_DIR)/train/*', + global_batch_size: 16384, + }, + validation_data: { + input_path: '$(CRITEO_DATA_DIR)/eval/*', + global_batch_size: 16384, + }, + model: { + num_dense_features: 13, + bottom_mlp: [512, 256, 64], + embedding_dim: 64, + top_mlp: [1024, 1024, 512, 256, 1], + vocab_sizes: [ + 39884406, + 39043, + 17289, + 7420, + 20263, + 3, + 7120, + 1543, + 63, + 38532951, + 2953546, + 403346, + 10, + 2208, + 11938, + 155, + 4, + 976, + 14, + 39979771, + 25641295, + 39664984, + 585935, + 12972, + 108, + 36, + ], + }, + }, + trainer: { + use_orbit: true, + validation_interval: 90000, + checkpoint_interval: 270000, + validation_steps: 5440, + train_steps: 256054, + optimizer_config: { + embedding_optimizer: 'SGD', + lr_config: { + decay_exp: 1.6, + decay_start_steps: 150000, + decay_steps: 136054, + learning_rate: 30, + warmup_steps: 8000, + }, + }, + }, + }, + command: [ + 'python3', + 'official/recommendation/ranking/train.py', + '--params_override=%s' % (std.manifestYamlDoc(self.paramsOverride) + '\n'), + 
'--model_dir=$(MODEL_DIR)', + ], + }, + imagenet:: { + scriptConfig+: { + trainFilePattern: '$(IMAGENET_DIR)/train*', + evalFilePattern: '$(IMAGENET_DIR)/valid*', + }, + }, + coco:: { + scriptConfig+: { + trainFilePattern: '$(COCO_DIR)/train*', + evalFilePattern: '$(COCO_DIR)/val*', + paramsOverride+: { + task+: { + annotation_file: '$(COCO_DIR)/instances_val2017.json', + }, + }, + }, + }, + local functional_schedule = '0 9 * * *', + Functional:: mixins.Functional { + schedule: if !(self.accelerator.type == 'tpu') || self.accelerator.name == 'v3-8' || self.accelerator.name == 'v4-8' then + functional_schedule + else + functional_schedule, + metricConfig+: { + sourceMap+:: { + tensorboard+: { + aggregateAssertionsMap+:: { + examples_per_second: { + AVERAGE: { + inclusive_bounds: true, + std_devs_from_mean: { + comparison: 'GREATER', + std_devs: 4.0, + }, + wait_for_n_data_points: 0, + }, + }, + }, + }, + }, + }, + }, + // Override default schedule for Functional. + RunNightly:: { + schedule: functional_schedule, + }, + Convergence:: mixins.Convergence { + schedule: '0 5 * * 0,2,4', + metricConfig+: { + sourceMap+:: { + tensorboard+: { + aggregateAssertionsMap+:: { + examples_per_second: { + AVERAGE: { + inclusive_bounds: true, + std_devs_from_mean: { + comparison: 'GREATER', + // TODO(wcromar): Tighten this restriction + std_devs: 2.0, + }, + wait_for_n_data_points: 0, + }, + }, + }, + }, + }, + }, + }, +} diff --git a/tests/tensorflow/r2.15/targets.jsonnet b/tests/tensorflow/r2.15/targets.jsonnet new file mode 100644 index 000000000..f61e1fc9b --- /dev/null +++ b/tests/tensorflow/r2.15/targets.jsonnet @@ -0,0 +1,36 @@ +// Copyright 2020 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +local bert = import 'tf-bert-glue_mnli.libsonnet'; +local dlrm = import 'tf-dlrm-criteo.libsonnet'; +local gpt2 = import 'tf-gpt2-wikitext.libsonnet'; +local keras_api = import 'tf-keras-api.libsonnet'; +local maskrcnn = import 'tf-maskrcnn-coco.libsonnet'; +local resnet = import 'tf-resnet-imagenet.libsonnet'; +local resnetrs = import 'tf-resnetrs-imagenet.libsonnet'; +local retinanet = import 'tf-retinanet-coco.libsonnet'; +local wmt = import 'tf-wmt-wmt14_translate.libsonnet'; + +// Add new models here +std.flattenArrays([ + dlrm.configs, + keras_api.configs, + bert.configs, + wmt.configs, + maskrcnn.configs, + retinanet.configs, + resnet.configs, + resnetrs.configs, + gpt2.configs, +]) diff --git a/tests/tensorflow/r2.15/tf-bert-glue_mnli.libsonnet b/tests/tensorflow/r2.15/tf-bert-glue_mnli.libsonnet new file mode 100644 index 000000000..9280d8a2b --- /dev/null +++ b/tests/tensorflow/r2.15/tf-bert-glue_mnli.libsonnet @@ -0,0 +1,81 @@ +// Copyright 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +local experimental = import '../experimental.libsonnet'; +local common = import 'common.libsonnet'; +local mixins = import 'templates/mixins.libsonnet'; +local timeouts = import 'templates/timeouts.libsonnet'; +local tpus = import 'templates/tpus.libsonnet'; +local utils = import 'templates/utils.libsonnet'; + +{ + local bert = common.TfNlpTest { + modelName: 'bert-glue.mnli', + scriptConfig+: { + experiment: 'bert/sentence_prediction_text', + configFiles: [ + 'official/nlp/configs/experiments/glue_mnli_text.yaml', + ], + paramsOverride+: { + task+: { + init_checkpoint: '$(TF_NLP_BERT_DIR)/uncased_L-12_H-768_A-12/bert_model.ckpt', + train_data+: { + vocab_file: '$(TF_NLP_BERT_DIR)/uncased_L-12_H-768_A-12/vocab.txt', + }, + validation_data+: { + vocab_file: '$(TF_NLP_BERT_DIR)/uncased_L-12_H-768_A-12/vocab.txt', + }, + }, + }, + }, + }, + local functional = self.functional, + functional:: common.Functional { + scriptConfig+: { + paramsOverride+: { + trainer+: { + train_steps: 2000, + validation_interval: 1000, + }, + }, + }, + }, + local convergence = self.convergence, + convergence:: common.Convergence, + local v2_8 = self.v2_8, + v2_8:: { + accelerator: tpus.v2_8, + }, + local v3_8 = self.v3_8, + v3_8:: { + accelerator: tpus.v3_8, + }, + local v2_32 = self.v2_32, + v2_32:: { + accelerator: tpus.v2_32, + }, + local v3_32 = self.v3_32, + v3_32:: { + accelerator: tpus.v3_32, + }, + local tpuVm = common.tpuVm, + + configs: [ + bert + accelerator + functional + tpuVm + for accelerator in [v2_8, v3_8] + ] + [ + bert + v2_32 + convergence + tpuVm, + bert + v3_32 + convergence + tpuVm, + ], +} diff --git a/tests/tensorflow/r2.15/tf-dlrm-criteo.libsonnet b/tests/tensorflow/r2.15/tf-dlrm-criteo.libsonnet new file mode 100644 index 000000000..e5a374284 --- /dev/null +++ b/tests/tensorflow/r2.15/tf-dlrm-criteo.libsonnet @@ -0,0 +1,125 @@ +// Copyright 2020 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file 
except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +local experimental = import '../experimental.libsonnet'; +local common = import 'common.libsonnet'; +local mixins = import 'templates/mixins.libsonnet'; +local timeouts = import 'templates/timeouts.libsonnet'; +local tpus = import 'templates/tpus.libsonnet'; + +{ + local dlrm = common.TfRankingTest { + modelName: 'dlrm-criteo', + paramsOverride+:: { + task+: { + model+: { + interaction: 'dot', + }, + }, + }, + }, + local functional = self.functional, + functional:: common.Functional { + command+: [ + '--mode=train', + ], + paramsOverride+: { + trainer+: { + train_steps: 10000, + }, + }, + }, + local convergence = self.convergence, + convergence:: common.Convergence { + local config = self, + + command+: [ + '--mode=train_and_eval', + ], + paramsOverride+: { + trainer+: { + train_steps: 256054, + }, + }, + }, + + local tpu_common = self.tpu_common, + tpu_common:: { + paramsOverride+:: { + runtime+: { + distribution_strategy: 'tpu', + tpu: '$(KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS)', + }, + }, + }, + + local v2_8 = self.v2_8, + v2_8:: tpu_common { + accelerator: tpus.v2_8, + paramsOverride+:: { + task+: { + model+: { + bottom_mlp: [512, 256, 16], + embedding_dim: 16, + }, + }, + }, + }, + + local v2_32 = self.v2_32, + v2_32:: tpu_common { + accelerator: tpus.v2_32, + paramsOverride+:: { + task+: { + model+: { + bottom_mlp: [512, 256, 64], + embedding_dim: 64, + }, + }, + }, + }, + + local v4_8 = self.v4_8, + v4_8:: tpu_common { + accelerator: tpus.v4_8, + paramsOverride+:: { + task+: { + model+: { + bottom_mlp: 
[512, 256, 64], + embedding_dim: 64, + }, + }, + }, + }, + local v4_32 = self.v4_32, + v4_32:: tpu_common { + accelerator: tpus.v4_32, + paramsOverride+:: { + task+: { + model+: { + bottom_mlp: [512, 256, 128], + embedding_dim: 128, + }, + }, + }, + }, + local tpuVm = self.tpuVm, + tpuVm:: common.tpuVm, + + configs: [ + dlrm + functional + v2_8 + tpuVm, + dlrm + convergence + v2_32 + tpuVm, + dlrm + functional + v4_8 + tpuVm, + ], +} diff --git a/tests/tensorflow/r2.15/tf-gpt2-wikitext.libsonnet b/tests/tensorflow/r2.15/tf-gpt2-wikitext.libsonnet new file mode 100644 index 000000000..9ebc25e54 --- /dev/null +++ b/tests/tensorflow/r2.15/tf-gpt2-wikitext.libsonnet @@ -0,0 +1,57 @@ +// Copyright 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +local common = import 'common.libsonnet'; +local tpus = import 'templates/tpus.libsonnet'; +local utils = import 'templates/utils.libsonnet'; + +{ + local gpt2 = common.HuggingFaceTransformer { + modelName: 'gpt2-wikitext', + command: utils.scriptCommand( + ||| + %(initialSetup)s + cd /tmp/transformers/examples/tensorflow/language-modeling + pip install -r requirements.txt + mkdir /tmp/gpt2-wikitext + python3 run_clm.py \ + --model_name_or_path distilgpt2 \ + --max_train_samples 1000 \ + --max_eval_samples 100 \ + --num_train_epochs 1 \ + --output_dir /tmp/gpt2-wikitext \ + --dataset_name wikitext \ + --dataset_config_name wikitext-103-raw-v1 + ||| % self.script, + ), + }, + + local v2_8 = self.v2_8, + v2_8:: { + accelerator: tpus.v2_8, + }, + local v3_8 = self.v3_8, + v3_8:: { + accelerator: tpus.v3_8, + }, + local v4_8 = self.v4_8, + v4_8:: { + accelerator: tpus.v4_8, + }, + + configs: [ + gpt2 + accelerator + common.Functional + common.tpuVm + for accelerator in [v2_8, v3_8, v4_8] + ], +} diff --git a/tests/tensorflow/r2.15/tf-keras-api.libsonnet b/tests/tensorflow/r2.15/tf-keras-api.libsonnet new file mode 100644 index 000000000..6b5f03439 --- /dev/null +++ b/tests/tensorflow/r2.15/tf-keras-api.libsonnet @@ -0,0 +1,138 @@ +// Copyright 2020 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +local experimental = import '../experimental.libsonnet'; +local common = import 'common.libsonnet'; +local mixins = import 'templates/mixins.libsonnet'; +local timeouts = import 'templates/timeouts.libsonnet'; +local tpus = import 'templates/tpus.libsonnet'; +local utils = import 'templates/utils.libsonnet'; + +{ + local keras_test = self.keras_test, + keras_test:: common.ModelGardenTest { + testFeature:: error 'Must override `testFeature`', + modelName: 'keras-api', + isTPUPod:: error 'Must set `isTPUPod`', + command: utils.scriptCommand( + ||| + cd ~ + export PATH=$PATH:/root/google-cloud-sdk/bin + export PATH=$PATH:/home/xl-ml-test/.local/bin + export TPU_NAME=$(KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS) + gcloud source repos clone tf2-api-tests --project=xl-ml-test + cd tf2-api-tests + pip3 install behave + behave -e ipynb_checkpoints --tags=-fails %s -i %s + ||| % [if self.isTPUPod then '--tags=-failspod' else '', self.testFeature] + ), + }, + + local API = self.API, + API:: common.RunNightly { + mode: 'api', + timeout: timeouts.one_hour, + tpuSettings+: { + preemptible: true, + }, + }, + + local connection = self.connection, + connection:: API { + mode: 'connection', + testFeature:: 'aaa_connection', + }, + + local custom_layers = self.custom_layers, + custom_layers:: API { + mode: 'custom-layers', + testFeature:: 'custom_layers_model', + }, + + local custom_training_loop = self.custom_training_loop, + custom_training_loop:: API { + mode: 'ctl', + testFeature:: 'custom_training_loop', + }, + + local feature_column = self.feature_column, + feature_column:: API { + mode: 'feature-column', + testFeature:: 'feature_column', + }, + + local rnn = self.rnn, + rnn:: API { + mode: 'rnn', + testFeature:: 'rnn', + }, + + local upsample = self.upsample, + upsample:: API { + mode: 'upsample', + testFeature:: 'upsample', + }, + + local save_load_io_device_local = self.save_load_io_device_local, + save_load_io_device_local:: API { + mode: 'save-load-localhost', + testFeature:: 
'save_and_load_io_device_local_drive', + }, + + local save_and_load = self.save_and_load, + save_and_load:: API { + mode: 'save-and-load', + testFeature:: 'save_and_load.feature', + }, + + local train_and_evaluate = self.train_and_evaluate, + train_and_evaluate:: API { + mode: 'train-and-evaluate', + testFeature:: 'train_and_evaluate', + }, + + local transfer_learning = self.transfer_learning, + transfer_learning:: API { + mode: 'transfer-learning', + testFeature:: 'transfer_learning', + }, + + local v2_8 = self.v2_8, + v2_8:: { + accelerator: tpus.v2_8, + isTPUPod: false, + }, + + local v2_32 = self.v2_32, + v2_32:: { + accelerator: tpus.v2_32, + isTPUPod: true, + }, + + local tpuVm = self.tpuVm, + tpuVm:: common.tpuVm, + + configs: [ + keras_test + v2_8 + connection + tpuVm, + keras_test + v2_8 + custom_layers + tpuVm, + keras_test + v2_8 + custom_training_loop + tpuVm, + keras_test + v2_8 + feature_column + timeouts.Hours(2) + tpuVm, + keras_test + v2_8 + upsample + tpuVm, + keras_test + v2_8 + rnn + tpuVm, + keras_test + v2_8 + save_and_load + timeouts.Hours(2) + tpuVm, + keras_test + v2_8 + save_load_io_device_local + timeouts.Hours(2) + tpuVm, + keras_test + v2_8 + train_and_evaluate + timeouts.Hours(3) + tpuVm, + keras_test + v2_8 + transfer_learning + tpuVm, + ], +} diff --git a/tests/tensorflow/r2.15/tf-maskrcnn-coco.libsonnet b/tests/tensorflow/r2.15/tf-maskrcnn-coco.libsonnet new file mode 100644 index 000000000..035a7aa90 --- /dev/null +++ b/tests/tensorflow/r2.15/tf-maskrcnn-coco.libsonnet @@ -0,0 +1,65 @@ +// Copyright 2022 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +local experimental = import '../experimental.libsonnet'; +local common = import 'common.libsonnet'; +local mixins = import 'templates/mixins.libsonnet'; +local timeouts = import 'templates/timeouts.libsonnet'; +local tpus = import 'templates/tpus.libsonnet'; +local utils = import 'templates/utils.libsonnet'; + +{ + local tpu_common = { + local config = self, + scriptConfig+: { + paramsOverride+: { + task+: { + validation_data+: { + global_batch_size: 8 * config.accelerator.replicas, + }, + }, + }, + }, + }, + local maskrcnn = common.TfVisionTest + common.coco { + modelName: 'maskrcnn-coco', + scriptConfig+: { + experiment: 'maskrcnn_resnetfpn_coco', + }, + }, + local functional = common.Functional { + scriptConfig+: { + paramsOverride+: { + trainer+: { + train_steps: 400, + validation_interval: 200, + validation_steps: 100, + }, + }, + }, + }, + local convergence = common.Convergence, + local v4_8 = tpu_common { + accelerator: tpus.v4_8, + }, + local v4_32 = tpu_common { + accelerator: tpus.v4_32, + }, + local tpuVm = common.tpuVm, + + configs: [ + maskrcnn + v4_8 + functional + tpuVm, + maskrcnn + v4_32 + convergence + tpuVm, + ], +} diff --git a/tests/tensorflow/r2.15/tf-resnet-imagenet.libsonnet b/tests/tensorflow/r2.15/tf-resnet-imagenet.libsonnet new file mode 100644 index 000000000..428e6ac87 --- /dev/null +++ b/tests/tensorflow/r2.15/tf-resnet-imagenet.libsonnet @@ -0,0 +1,67 @@ +// Copyright 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the 
License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +local experimental = import '../experimental.libsonnet'; +local common = import 'common.libsonnet'; +local mixins = import 'templates/mixins.libsonnet'; +local tpus = import 'templates/tpus.libsonnet'; +local utils = import 'templates/utils.libsonnet'; + +{ + local resnet = common.TfVisionTest + common.imagenet { + modelName: 'resnet-imagenet', + scriptConfig+: { + experiment: 'resnet_imagenet', + }, + }, + local functional = common.Functional { + scriptConfig+: { + paramsOverride+: { + trainer: { + train_steps: 320, + validation_interval: 320, + }, + }, + }, + }, + local convergence = self.convergence, + convergence:: common.Convergence, + local v2_32 = self.v2_32, + v2_32:: { + accelerator: tpus.v2_32, + }, + local v3_32 = self.v3_32, + v3_32:: { + accelerator: tpus.v3_32, + }, + local v4_8 = self.v4_8, + v4_8:: { + accelerator: tpus.v4_8, + }, + local v4_32 = self.v4_32, + v4_32:: { + accelerator: tpus.v4_32, + }, + local tpuVm = self.tpuVm, + tpuVm:: common.tpuVm, + + local convergenceTests = [ + resnet + v2_32 + convergence + tpuVm, + resnet + v3_32 + convergence + tpuVm, + ], + configs: convergenceTests + [ + resnet + v4_8 + functional + tpuVm, + resnet + v4_32 + convergence + tpuVm, + ], +} diff --git a/tests/tensorflow/r2.15/tf-resnetrs-imagenet.libsonnet b/tests/tensorflow/r2.15/tf-resnetrs-imagenet.libsonnet new file mode 100644 index 000000000..b021688a3 --- /dev/null +++ b/tests/tensorflow/r2.15/tf-resnetrs-imagenet.libsonnet @@ -0,0 +1,55 @@ +// Copyright 2021 Google LLC +// +// Licensed under the Apache License, Version 
2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +local experimental = import '../experimental.libsonnet'; +local common = import 'common.libsonnet'; +local mixins = import 'templates/mixins.libsonnet'; +local timeouts = import 'templates/timeouts.libsonnet'; +local tpus = import 'templates/tpus.libsonnet'; +local utils = import 'templates/utils.libsonnet'; + +{ + local resnet_rs = common.TfVisionTest + common.imagenet { + modelName: 'resnetrs-imagenet', + scriptConfig+: { + experiment: 'resnet_rs_imagenet', + configFiles: ['official/vision/configs/experiments/image_classification/imagenet_resnetrs50_i160.yaml'], + }, + }, + local functional = common.Functional { + scriptConfig+: { + paramsOverride+: { + trainer: { + train_steps: 320, + validation_interval: 320, + }, + }, + }, + }, + local convergence = common.Convergence, + local v2_32 = { + accelerator: tpus.v2_32, + }, + local v3_32 = { + accelerator: tpus.v3_32, + }, + local tpuVm = self.tpuVm, + tpuVm:: common.tpuVm, + + local convergenceTests = [ + resnet_rs + v2_32 + convergence + tpuVm + timeouts.Hours(15), + resnet_rs + v3_32 + convergence + tpuVm + timeouts.Hours(15), + ], + configs: convergenceTests, +} diff --git a/tests/tensorflow/r2.15/tf-retinanet-coco.libsonnet b/tests/tensorflow/r2.15/tf-retinanet-coco.libsonnet new file mode 100644 index 000000000..3148b6a19 --- /dev/null +++ b/tests/tensorflow/r2.15/tf-retinanet-coco.libsonnet @@ -0,0 +1,69 @@ +// Copyright 2022 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the 
"License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +local experimental = import '../experimental.libsonnet'; +local common = import 'common.libsonnet'; +local mixins = import 'templates/mixins.libsonnet'; +local timeouts = import 'templates/timeouts.libsonnet'; +local tpus = import 'templates/tpus.libsonnet'; +local utils = import 'templates/utils.libsonnet'; + +{ + local tpu_common = { + local config = self, + scriptConfig+: { + paramsOverride+: { + task+: { + validation_data+: { + global_batch_size: 8 * config.accelerator.replicas, + }, + }, + }, + }, + }, + local retinanet = common.TfVisionTest + common.coco { + modelName: 'retinanet-coco', + scriptConfig+: { + experiment: 'retinanet_resnetfpn_coco', + }, + }, + local functional = common.Functional { + scriptConfig+: { + paramsOverride+: { + trainer+: { + train_steps: 400, + validation_interval: 200, + validation_steps: 100, + }, + }, + }, + }, + local convergence = self.convergence, + convergence:: common.Convergence, + local v4_8 = self.v4_8, + v4_8:: tpu_common { + accelerator: tpus.v4_8, + }, + local v4_32 = self.v4_32, + v4_32:: tpu_common { + accelerator: tpus.v4_32, + }, + local tpuVm = self.tpuVm, + tpuVm:: common.tpuVm, + + configs: [ + retinanet + v4_8 + functional + tpuVm, + retinanet + v4_32 + convergence + tpuVm, + ], +} diff --git a/tests/tensorflow/r2.15/tf-wmt-wmt14_translate.libsonnet b/tests/tensorflow/r2.15/tf-wmt-wmt14_translate.libsonnet new file mode 100644 index 000000000..eaaf74b9a --- /dev/null +++ 
b/tests/tensorflow/r2.15/tf-wmt-wmt14_translate.libsonnet
@@ -0,0 +1,116 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+local common = import 'common.libsonnet';
+local timeouts = import 'templates/timeouts.libsonnet';
+local tpus = import 'templates/tpus.libsonnet';
+
+{  // Test definitions for the 'wmt-wmt14.translate' model; only `configs` is a visible field.
+  local transformer = common.TfNlpTest {  // base test shared by every entry in `configs`
+    modelName: 'wmt-wmt14.translate',
+    scriptConfig+: {
+      experiment: 'wmt_transformer/large',
+      paramsOverride+: {
+        task+: {
+          sentencepiece_model_path: '$(TRANSFORMER_DIR)/ende_bpe_32k.model',
+        },
+      },
+    },
+  },
+  local functional = self.functional,  // read through `self`, so an override of the hidden field is picked up
+  functional:: common.Functional {  // hidden field (`::`): not part of the manifested output
+    scriptConfig+: {
+      paramsOverride+: {
+        trainer+: {
+          train_steps: 10000,
+          validation_interval: 10000,
+        },
+      },
+    },
+  },
+  local convergence = self.convergence,
+  convergence:: common.Convergence {
+    local config = self,  // late-bound: `config.accelerator` is supplied by the accelerator mixin merged alongside
+    scriptConfig+: {
+      paramsOverride+: {
+        trainer+: {
+          train_steps: 200000 / config.accelerator.replicas,  // NOTE(review): `/` is not integer division; presumably `replicas` divides 200000 for the pods used below - confirm
+        },
+      },
+    },
+  },
+
+  local v2_8 = self.v2_8,
+  v2_8:: {  // accelerator mixin: tpus.v2_8 with a train batch size of 6144
+    accelerator: tpus.v2_8,
+    scriptConfig+: {
+      paramsOverride+: {
+        task+: {
+          train_data+: {
+            global_batch_size: 6144,
+          },
+        },
+      },
+    },
+  },
+
+  local v3_8 = self.v3_8,
+  v3_8:: {  // same batch size as v2_8
+    accelerator: tpus.v3_8,
+    scriptConfig+: {
+      paramsOverride+: {
+        task+: {
+          train_data+: {
+            global_batch_size: 6144,
+          },
+        },
+      },
+    },
+  },
+  local v2_32 = self.v2_32,
+  v2_32:: {  // accelerator mixin: tpus.v2_32 with a train batch size of 24576
+    accelerator: tpus.v2_32,
+    scriptConfig+: {
+      paramsOverride+: {
+        task+: {
+          train_data+: {
+            global_batch_size: 24576,
+          },
+        },
+      },
+    },
+  },
+  local v3_32 = self.v3_32,
+  v3_32:: {  // same batch size as v2_32
+    accelerator: tpus.v3_32,
+    scriptConfig+: {
+      paramsOverride+: {
+        task+: {
+          train_data+: {
+            global_batch_size: 24576,
+          },
+        },
+      },
+    },
+  },
+  local tpuVm = common.tpuVm,  // plain local: unlike the entries above, there is no hidden field to override
+
+  configs: [
+    transformer + accelerator + functional + tpuVm  // one functional test per accelerator in the list below
+    for accelerator in [v2_8, v3_8]
+  ] + [
+    transformer + v2_32 + convergence + tpuVm,  // convergence tests on the two 32-sized accelerators
+    transformer + v3_32 + convergence + tpuVm,
+  ],
+}
From 5d3de5a6e73be4999d992fef9661ee9df4e1e04d Mon Sep 17 00:00:00 2001 From: Mark Bzomowski Date: Wed, 25 Oct 2023 20:46:56 +0000 Subject: [PATCH 2/4] Add 2.15.0 tests for TensorFlow --- tests/tensorflow/r2.15/common.libsonnet | 2 +- tests/tensorflow/targets.jsonnet | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/tensorflow/r2.15/common.libsonnet b/tests/tensorflow/r2.15/common.libsonnet index 3d60c7961..67e433098 100644 --- a/tests/tensorflow/r2.15/common.libsonnet +++ b/tests/tensorflow/r2.15/common.libsonnet @@ -118,7 +118,7 @@ local volumes = import 'templates/volumes.libsonnet'; softwareVersion: if config.accelerator.replicas == 1 then 'tpu-vm-tf-2.15.0' else - 'tpu-vm-tf-2.15.0-pod' + 'tpu-vm-tf-2.15.0-pod', tpuVmEnvVars+: (if std.parseInt(std.split(config.accelerator.name, '-')[1]) <= 8 then { TF_PLUGGABLE_DEVICE_LIBRARY_PATH: '/lib/libtpu.so', NEXT_PLUGGABLE_DEVICE_USE_C_API: 'true', diff --git a/tests/tensorflow/targets.jsonnet b/tests/tensorflow/targets.jsonnet index 51e411f59..1b485ee85 100644 --- a/tests/tensorflow/targets.jsonnet +++ b/tests/tensorflow/targets.jsonnet @@ -16,6 +16,7 @@ local experimental = import 'experimental/targets.jsonnet'; local nightly_se = import 'nightly-se/targets.jsonnet'; local nightly = import 'nightly/targets.jsonnet'; local r2_14 = import 'r2.14/targets.jsonnet'; +local r2_15 = import 'r2.15/targets.jsonnet'; // Add new versions here std.flattenArrays([ @@ -23,4 +24,5 @@ std.flattenArrays([ nightly, nightly_se,
r2_14, + r2_15, ]) From bae21b3557cb821c73aac8a6e9bf8985c4d523ef Mon Sep 17 00:00:00 2001 From: Mark Bzomowski Date: Fri, 27 Oct 2023 17:35:40 +0000 Subject: [PATCH 3/4] Added to common --- tests/tensorflow/r2.15/common.libsonnet | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/tensorflow/r2.15/common.libsonnet b/tests/tensorflow/r2.15/common.libsonnet index 67e433098..2dca6bad6 100644 --- a/tests/tensorflow/r2.15/common.libsonnet +++ b/tests/tensorflow/r2.15/common.libsonnet @@ -116,10 +116,11 @@ local volumes = import 'templates/volumes.libsonnet'; local config = self, tpuSettings+: { softwareVersion: if config.accelerator.replicas == 1 then - 'tpu-vm-tf-2.15.0' + 'v2-alpha-tpuv5' else - 'tpu-vm-tf-2.15.0-pod', + 'v2-alpha-tpuv5', tpuVmEnvVars+: (if std.parseInt(std.split(config.accelerator.name, '-')[1]) <= 8 then { + WRAPT_DISABLE_EXTENSIONS: 'true', TF_PLUGGABLE_DEVICE_LIBRARY_PATH: '/lib/libtpu.so', NEXT_PLUGGABLE_DEVICE_USE_C_API: 'true', } else {}), @@ -183,15 +184,15 @@ local volumes = import 'templates/volumes.libsonnet'; sleep %(sleepTime)d softwareVersion=%(softwareVersion)s - gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=0 --command "pip install tensorflow-text-nightly" - gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=0 --command "gsutil -m cp gs://cloud-tpu-v2-images-dev-artifacts/tensorflow/tf-nightly/latest/*.whl /tmp/ && pip install /tmp/tf*.whl --force" + gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=0 --command "pip install tensorflow-text==2.15.0rc0" # tensorflow-text-nightly" + gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=0 --command "gsutil 
-m cp gs://cloud-tpu-v2-images-dev-artifacts/tensorflow/tf-2-15-0/latest/tensorflow-2.15.0rc0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl /tmp/ && pip install /tmp/tensorflow*.whl --force" - gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=0 --command "sudo gsutil -m cp gs://cloud-tpu-v2-images-dev-artifacts/libtpu/latest/libtpu.so /lib/" - gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=0 --command "sudo mkdir -p /usr/share/tpu && cd /usr/share/tpu && git clone https://github.com/tensorflow/models.git" + gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=0 --command "sudo gsutil -m cp gs://cloud-tpu-v2-images-dev-artifacts/libtpu/1.9.0/latest/libtpu.so /lib" + gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=0 --command "sudo mkdir -p /usr/share/tpu && cd /usr/share/tpu && git clone https://github.com/tensorflow/models.git && cd models && git checkout r2.15.0" accelerator_type=%(acceleratorName)s if (( ${accelerator_type: -2} > 8 )); then - gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=all --command "sudo sed -i 's/TF_DOCKER_URL=.*/TF_DOCKER_URL=gcr.io\/cloud-tpu-v2-images-dev\/grpc_tpu_worker:nightly\"/' /etc/systemd/system/tpu-runtime.service" + gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=all --command "sudo sed -i 's/TF_DOCKER_URL=.*/TF_DOCKER_URL=gcr.io\/cloud-tpu-v2-images-dev\/grpc_tpu_worker:tf-2.15.0-pjrt\"/' /etc/systemd/system/tpu-runtime.service" gcloud alpha compute tpus tpu-vm ssh ${tpu_name} 
--zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=all --command "sudo systemctl daemon-reload && sudo systemctl restart tpu-runtime" fi ||| % tpuCreateSettings), From a6175bc819f6d9feb02bca128476e902bc24526f Mon Sep 17 00:00:00 2001 From: Mark Bzomowski Date: Fri, 27 Oct 2023 21:58:28 +0000 Subject: [PATCH 4/4] Change common setup steps --- tests/tensorflow/r2.15/common.libsonnet | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/tests/tensorflow/r2.15/common.libsonnet b/tests/tensorflow/r2.15/common.libsonnet index 2dca6bad6..688e1274c 100644 --- a/tests/tensorflow/r2.15/common.libsonnet +++ b/tests/tensorflow/r2.15/common.libsonnet @@ -115,10 +115,7 @@ local volumes = import 'templates/volumes.libsonnet'; tpuVm:: experimental.TensorFlowTpuVmMixin { local config = self, tpuSettings+: { - softwareVersion: if config.accelerator.replicas == 1 then - 'v2-alpha-tpuv5' - else - 'v2-alpha-tpuv5', + softwareVersion: 'v2-alpha-tpuv5', tpuVmEnvVars+: (if std.parseInt(std.split(config.accelerator.name, '-')[1]) <= 8 then { WRAPT_DISABLE_EXTENSIONS: 'true', TF_PLUGGABLE_DEVICE_LIBRARY_PATH: '/lib/libtpu.so', @@ -184,16 +181,17 @@ local volumes = import 'templates/volumes.libsonnet'; sleep %(sleepTime)d softwareVersion=%(softwareVersion)s - gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=0 --command "pip install tensorflow-text==2.15.0rc0" # tensorflow-text-nightly" - gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=0 --command "gsutil -m cp gs://cloud-tpu-v2-images-dev-artifacts/tensorflow/tf-2-15-0/latest/tensorflow-2.15.0rc0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl /tmp/ && pip install /tmp/tensorflow*.whl --force" + gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} 
--internal-ip --ssh-key-file=/scripts/id_rsa --worker=0 --command "pip install tensorflow-text==2.15.0rc0" + gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=0 --command "gsutil -m cp gs://ptxla-debug/tf/215/*.whl /tmp/ && pip install /tmp/tensorflow*.whl --force" - gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=0 --command "sudo gsutil -m cp gs://cloud-tpu-v2-images-dev-artifacts/libtpu/1.9.0/latest/libtpu.so /lib" + gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=0 --command "sudo gsutil -m cp gs://ptxla-debug/tf/215/libtpu.so /lib/" gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=0 --command "sudo mkdir -p /usr/share/tpu && cd /usr/share/tpu && git clone https://github.com/tensorflow/models.git && cd models && git checkout r2.15.0" accelerator_type=%(acceleratorName)s if (( ${accelerator_type: -2} > 8 )); then - gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=all --command "sudo sed -i 's/TF_DOCKER_URL=.*/TF_DOCKER_URL=gcr.io\/cloud-tpu-v2-images-dev\/grpc_tpu_worker:tf-2.15.0-pjrt\"/' /etc/systemd/system/tpu-runtime.service" - gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=all --command "sudo systemctl daemon-reload && sudo systemctl restart tpu-runtime" + gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=all --command "sudo sed -i 
's/HEALTH_AGENT_DOCKER_URL=.*/HEALTH_AGENT_DOCKER_URL=gcr.io\/cloud-tpu-v2-images\/tpu_agents:cl_562025307\"/' /home/tpu-runtime/tpu-env" + gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=all --command "sudo systemctl daemon-reload && sudo systemctl restart healthagent.service" + gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=all --command "sudo sed -i 's/TF_DOCKER_URL=.*/TF_DOCKER_URL=gcr.io\/cloud-tpu-v2-images-dev\/grpc_tpu_worker:tf-2.15.0-pjrt\"/' /etc/systemd/system/tpu-runtime.service" fi ||| % tpuCreateSettings), },