-
Notifications
You must be signed in to change notification settings - Fork 59
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
r2.15.0 #1009
base: master
Are you sure you want to change the base?
r2.15.0 #1009
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,359 @@ | ||
// Copyright 2020 Google LLC | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// https://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
local common = import '../common.libsonnet'; | ||
local experimental = import '../experimental.libsonnet'; | ||
local metrics = import 'templates/metrics.libsonnet'; | ||
local mixins = import 'templates/mixins.libsonnet'; | ||
local utils = import 'templates/utils.libsonnet'; | ||
local volumes = import 'templates/volumes.libsonnet'; | ||
|
||
{ | ||
// Test template for HuggingFace Transformers examples on the TF r2.15.0 images. | ||
// Extends common.ModelGardenTest, pinning the framework prefix, the TPU software | ||
// version and the image tag to the 2.15.0 release. | ||
HuggingFaceTransformer:: common.ModelGardenTest { | ||
  frameworkPrefix: 'tf-r2.15.0', | ||
  tpuSettings+: { | ||
    softwareVersion: '2.15.0', | ||
  }, | ||
  imageTag: 'r2.15.0', | ||
  script: { | ||
    // Shell snippet that clones the default branch of transformers (no pinned | ||
    // revision), installs it, then installs the TensorFlow example test requirements. | ||
    initialSetup: | ||
      ||| | ||
        cd /tmp | ||
        git clone https://github.com/huggingface/transformers.git | ||
        cd transformers | ||
        pip install . | ||
        pip install -r examples/tensorflow/_tests_requirements.txt | ||
      |||, | ||
  }, | ||
}, | ||
// Base template for Model Garden tests on the TF r2.15.0 images. | ||
// Extends common.ModelGardenTest with the 2.15.0 framework prefix, TPU software | ||
// version and image tag. When the accelerator type is 'tpu' it also defines a | ||
// 'tpu-version' init container; for any other accelerator the pod template is | ||
// left unchanged. | ||
ModelGardenTest:: common.ModelGardenTest { | ||
  local config = self, | ||
  frameworkPrefix: 'tf-r2.15.0', | ||
  tpuSettings+: { | ||
    softwareVersion: '2.15.0', | ||
  }, | ||
  imageTag: 'r2.15.0', | ||
  podTemplate+:: if config.accelerator.type == 'tpu' then | ||
    { | ||
      spec+: { | ||
        initContainerMap+:: { | ||
          'tpu-version': { | ||
            // Reuse the train container's image so the init step sees the same Python/TF install. | ||
            image: config.podTemplate.spec.containerMap.train.image, | ||
            env+: [ | ||
              { | ||
                // TPU name is read from the pod annotation; the script below splits it | ||
                // into dirname (zone) and basename (TPU). | ||
                name: 'TPU_NAME', | ||
                valueFrom: { | ||
                  fieldRef: { | ||
                    fieldPath: "metadata.annotations['name.cloud-tpus.google.com/train']", | ||
                  }, | ||
                }, | ||
              }, | ||
              { | ||
                name: 'POD_UID', | ||
                valueFrom: { | ||
                  fieldRef: { | ||
                    fieldPath: 'metadata.uid', | ||
                  }, | ||
                }, | ||
              }, | ||
            ], | ||
            // Inline Python: prints the Python/TF versions, waits for the TPU to be healthy, | ||
            // calls configure_tpu_version, waits again, then prints the response from the | ||
            // TPU's /requestversion endpoint. | ||
            // `import urllib.request` is required: a bare `import urllib` does not load the | ||
            // `request` submodule, so urllib.request.* would only work if some other import | ||
            // happened to load it first. | ||
            // NOTE(review): the script requests the 'nightly' runtime although this template | ||
            // targets r2.15.0 - confirm that is intended. | ||
            command: [ | ||
              'python3', | ||
              '-c', | ||
              ||| | ||
                import os | ||
                import tensorflow as tf | ||
                import urllib.request | ||
                import json | ||
                import cloud_tpu_client | ||
                import sys | ||
                print('python version: ' + str(sys.version)) | ||
                print('tf_version: ' + str(tf.__version__)) | ||
                #TODO(chandrasekhard): | ||
                # Add extra condition to fail if it picks stale image | ||
                print(str(tf.__file__)) | ||
                ctc = cloud_tpu_client.Client(tpu=os.path.basename('$(TPU_NAME)'), zone=os.path.dirname('$(TPU_NAME)')) | ||
                ctc.wait_for_healthy() | ||
                ctc.configure_tpu_version('nightly', restart_type='always') | ||
                ctc.wait_for_healthy() | ||
                _VERSION_SWITCHER_ENDPOINT = 'http://{}:8475/requestversion' | ||
                url = _VERSION_SWITCHER_ENDPOINT.format(ctc.network_endpoints()[0]['ipAddress']) | ||
                req = urllib.request.Request(url) | ||
                resp = urllib.request.urlopen(req) | ||
                version_details = json.loads(resp.read()) | ||
                print(version_details) | ||
              |||, | ||
            ], | ||
          }, | ||
        }, | ||
      }, | ||
    } | ||
  else | ||
    {}, | ||
}, | ||
// Mixin for running tests on TPU VMs; extends experimental.TensorFlowTpuVmMixin. | ||
// It selects the TPU VM software version, sets extra environment variables for | ||
// small accelerators, replaces the 'create-tpu' init container command with the | ||
// provisioning script below, and overrides the 'tpu-version' init container. | ||
tpuVm:: experimental.TensorFlowTpuVmMixin { | ||
local config = self, | ||
tpuSettings+: { | ||
// A single replica uses the plain image; any other replica count uses the '-pod' variant. | ||
softwareVersion: if config.accelerator.replicas == 1 then | ||
'tpu-vm-tf-2.15.0' | ||
else | ||
'tpu-vm-tf-2.15.0-pod', | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. v2-alpha-tpuv5 |
||
// The number after the first '-' in the accelerator name (e.g. the 8 in 'v3-8') is parsed; | ||
// when it is <= 8 the two pluggable-device variables below are added, otherwise nothing is. | ||
tpuVmEnvVars+: (if std.parseInt(std.split(config.accelerator.name, '-')[1]) <= 8 then { | ||
TF_PLUGGABLE_DEVICE_LIBRARY_PATH: '/lib/libtpu.so', | ||
NEXT_PLUGGABLE_DEVICE_USE_C_API: 'true', | ||
} else {}), | ||
}, | ||
podTemplate+:: { | ||
spec+: { | ||
initContainerMap+:: { | ||
'create-tpu'+: { | ||
// Values substituted into the script's %(...)s / %(...)d placeholders. String values are | ||
// bash-escaped; dots in the test name are replaced with '-' before use as a label value. | ||
local tpuCreateSettings = { | ||
acceleratorName: std.escapeStringBash(config.accelerator.name), | ||
softwareVersion: std.escapeStringBash(config.tpuSettings.softwareVersion), | ||
startupScript: std.escapeStringBash(config.tpuSettings.tpuVmStartupScript), | ||
sleepTime: config.tpuSettings.tpuVmCreateSleepSeconds, | ||
testName: std.strReplace(config.testName, '.', '-'), | ||
}, | ||
// Provisioning script, in order: | ||
//   1. read project and zone from the metadata server, name the TPU tpu-${POD_UID}, | ||
//      and generate an SSH key under /scripts; | ||
//   2. write /scripts/cleanup.sh, which deletes the TPU VM asynchronously; | ||
//   3. retry `tpu-vm create` up to 20 times, 30 s apart, stopping on success or once | ||
//      more than 600 s have elapsed, and exit with the last exit code on failure; | ||
//   4. record zone, TPU name and endpoint IPs under /scripts, then sleep for sleepTime; | ||
//   5. over SSH on worker 0: install tensorflow-text-nightly, a tf-nightly wheel, | ||
//      libtpu.so, and clone tensorflow/models into /usr/share/tpu; | ||
//   6. when the last two characters of the accelerator name evaluate to > 8, point | ||
//      TF_DOCKER_URL at grpc_tpu_worker:nightly on all workers and restart tpu-runtime. | ||
// '%%s' is an escaped '%s' so that `date +%s` survives the Jsonnet format operator. | ||
// NOTE(review): steps 5 and 6 install nightly artifacts although this file targets | ||
// r2.15.0; the inline review comments propose release-pinned replacements - confirm | ||
// before merging. | ||
command: utils.scriptCommand(||| | ||
project=$(curl -sS "http://metadata.google.internal/computeMetadata/v1/project/project-id" -H "Metadata-Flavor: Google") | ||
zone=$(curl -sS "http://metadata.google.internal/computeMetadata/v1/instance/zone" -H "Metadata-Flavor: Google" | awk -F'/' '{print $4}') | ||
tpu_name=tpu-${POD_UID} | ||
ssh-keygen -t rsa -f /scripts/id_rsa -q -N "" | ||
|
||
echo " | ||
gcloud alpha compute tpus tpu-vm delete -q --async ${tpu_name} --zone=${zone} | ||
sleep 60 | ||
" > /scripts/cleanup.sh | ||
|
||
echo "xl-ml-test:$(cat /scripts/id_rsa.pub)" > ssh-keys.txt | ||
echo %(startupScript)s > startup-script.txt | ||
|
||
# Retry every 30 seconds for up to 10 minutes | ||
start_time="$(date -u +%%s)" | ||
for i in {1..20}; do | ||
set +e | ||
gcloud alpha compute tpus tpu-vm create ${tpu_name} \ | ||
--accelerator-type=%(acceleratorName)s \ | ||
--version=%(softwareVersion)s \ | ||
--metadata-from-file='ssh-keys=ssh-keys.txt,startup-script=startup-script.txt' \ | ||
--labels='test-name=%(testName)s' \ | ||
--zone=${zone} | ||
|
||
exit_code=$? | ||
set -e | ||
|
||
current_time="$(date -u +%%s)" | ||
elapsed_seconds=$(($current_time-$start_time)) | ||
# Break if command passed or 10-minute limit reached | ||
test $exit_code = 0 && break | ||
test $elapsed_seconds -gt 600 && break | ||
sleep 30 | ||
done | ||
|
||
if [ $exit_code -ne 0 ]; then | ||
exit $exit_code | ||
fi | ||
|
||
|
||
echo ${zone} > /scripts/zone | ||
echo ${tpu_name} > /scripts/tpu_name | ||
gcloud compute tpus describe ${tpu_name} --project=${project} --zone=${zone} --format="value(networkEndpoints[0].ipAddress)" > /scripts/tpu_ip | ||
gcloud compute tpus describe ${tpu_name} --project=${project} --zone=${zone} --flatten="networkEndpoints[]" --format="csv[no-heading](networkEndpoints.ipAddress)" > /scripts/all_tpu_ips | ||
sleep %(sleepTime)d | ||
|
||
softwareVersion=%(softwareVersion)s | ||
gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=0 --command "pip install tensorflow-text-nightly" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. pip install tensorflow-text==2.15.0rc0. But we're still waiting for it to release. So you can comment it for now. |
||
gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=0 --command "gsutil -m cp gs://cloud-tpu-v2-images-dev-artifacts/tensorflow/tf-nightly/latest/*.whl /tmp/ && pip install /tmp/tf*.whl --force" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. gsutil -m cp gs://cloud-tpu-v2-images-dev-artifacts/tensorflow/tf-2-15-0/latest/tensorflow-2.15.0rc0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl /tmp/ && pip install /tmp/tensorflow*.whl --force |
||
|
||
gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=0 --command "sudo gsutil -m cp gs://cloud-tpu-v2-images-dev-artifacts/libtpu/latest/libtpu.so /lib/" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. gsutil -m cp gs://cloud-tpu-v2-images-dev-artifacts/libtpu/1.9.0/latest/libtpu.so . |
||
gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=0 --command "sudo mkdir -p /usr/share/tpu && cd /usr/share/tpu && git clone https://github.com/tensorflow/models.git" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. sudo mkdir -p /usr/share/tpu && cd /usr/share/tpu && git clone https://github.com/tensorflow/models.git && git checkout r2.15.0 |
||
|
||
accelerator_type=%(acceleratorName)s | ||
if (( ${accelerator_type: -2} > 8 )); then | ||
gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=all --command "sudo sed -i 's/TF_DOCKER_URL=.*/TF_DOCKER_URL=gcr.io\/cloud-tpu-v2-images-dev\/grpc_tpu_worker:nightly\"/' /etc/systemd/system/tpu-runtime.service" | ||
gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=all --command "sudo systemctl daemon-reload && sudo systemctl restart tpu-runtime" | ||
fi | ||
||| % tpuCreateSettings), | ||
}, | ||
// Defines 'tpu-version' here with the google/cloud-sdk image and a null command. | ||
// NOTE(review): presumably this neutralises the version-switching container defined in | ||
// ModelGardenTest when both are mixed together - confirm the composition order. | ||
'tpu-version': { | ||
image: 'google/cloud-sdk', | ||
command: null, | ||
}, | ||
}, | ||
}, | ||
}, | ||
}, | ||
// Vision test template: ModelGardenTest combined with common.TfNlpVisionMixin, | ||
// with scriptConfig.runnerPath set to the Model Garden vision trainer. | ||
TfVisionTest:: self.ModelGardenTest + common.TfNlpVisionMixin + { | ||
  scriptConfig+: { runnerPath: 'official/vision/train.py' }, | ||
}, | ||
// NLP test template: ModelGardenTest combined with common.TfNlpVisionMixin, | ||
// with scriptConfig.runnerPath set to the Model Garden NLP trainer. | ||
TfNlpTest:: self.ModelGardenTest + common.TfNlpVisionMixin + { | ||
  scriptConfig+: { runnerPath: 'official/nlp/train.py' }, | ||
}, | ||
// Ranking test template built on ModelGardenTest. | ||
// `paramsOverride` is a hidden field holding the trainer parameters; it is rendered | ||
// to YAML and passed to official/recommendation/ranking/train.py via --params_override. | ||
// Users of this template must supply runtime.distribution_strategy, otherwise | ||
// evaluating the parameters raises the error below. | ||
TfRankingTest:: self.ModelGardenTest { | ||
  paramsOverride:: { | ||
    runtime: { | ||
      distribution_strategy: error 'Must set `runtime.distribution_strategy`', | ||
    }, | ||
    task: { | ||
      // Train and eval inputs are read from $(CRITEO_DATA_DIR) with the same global batch size. | ||
      train_data: { input_path: '$(CRITEO_DATA_DIR)/train/*', global_batch_size: 16384 }, | ||
      validation_data: { input_path: '$(CRITEO_DATA_DIR)/eval/*', global_batch_size: 16384 }, | ||
      model: { | ||
        num_dense_features: 13, | ||
        bottom_mlp: [512, 256, 64], | ||
        embedding_dim: 64, | ||
        top_mlp: [1024, 1024, 512, 256, 1], | ||
        // 26 vocabulary sizes, in their original order. | ||
        vocab_sizes: [ | ||
          39884406, 39043, 17289, 7420, 20263, 3, 7120, 1543, 63, | ||
          38532951, 2953546, 403346, 10, 2208, 11938, 155, 4, 976, 14, | ||
          39979771, 25641295, 39664984, 585935, 12972, 108, 36, | ||
        ], | ||
      }, | ||
    }, | ||
    trainer: { | ||
      use_orbit: true, | ||
      validation_interval: 90000, | ||
      checkpoint_interval: 270000, | ||
      validation_steps: 5440, | ||
      train_steps: 256054, | ||
      optimizer_config: { | ||
        embedding_optimizer: 'SGD', | ||
        lr_config: { | ||
          decay_exp: 1.6, | ||
          decay_start_steps: 150000, | ||
          decay_steps: 136054, | ||
          learning_rate: 30, | ||
          warmup_steps: 8000, | ||
        }, | ||
      }, | ||
    }, | ||
  }, | ||
  command: [ | ||
    'python3', | ||
    'official/recommendation/ranking/train.py', | ||
    // The YAML document is followed by a newline, exactly as before. | ||
    '--params_override=' + std.manifestYamlDoc(self.paramsOverride) + '\n', | ||
    '--model_dir=$(MODEL_DIR)', | ||
  ], | ||
}, | ||
// Dataset mixin: sets the train/eval file patterns under $(IMAGENET_DIR). | ||
imagenet:: { | ||
  scriptConfig+: { | ||
    evalFilePattern: '$(IMAGENET_DIR)/valid*', | ||
    trainFilePattern: '$(IMAGENET_DIR)/train*', | ||
  }, | ||
}, | ||
// Dataset mixin: sets the train/eval file patterns under $(COCO_DIR) and adds the | ||
// val2017 instances annotation file to paramsOverride.task. | ||
coco:: { | ||
  scriptConfig+: { | ||
    paramsOverride+: { | ||
      task+: { annotation_file: '$(COCO_DIR)/instances_val2017.json' }, | ||
    }, | ||
    evalFilePattern: '$(COCO_DIR)/val*', | ||
    trainFilePattern: '$(COCO_DIR)/train*', | ||
  }, | ||
}, | ||
// Schedule string shared by Functional and RunNightly (presumably a cron expression). | ||
local functional_schedule = '0 9 * * *', | ||
// Functional test mode: extends mixins.Functional with a schedule and an aggregate | ||
// assertion on the tensorboard `examples_per_second` AVERAGE. | ||
Functional:: mixins.Functional { | ||
  local accel = self.accelerator, | ||
  // Non-TPU, v3-8 and v4-8 accelerators take the first branch; everything else takes | ||
  // the second. Both branches currently yield the same schedule. | ||
  schedule: | ||
    if accel.type != 'tpu' || accel.name == 'v3-8' || accel.name == 'v4-8' then | ||
      functional_schedule | ||
    else | ||
      functional_schedule, | ||
  metricConfig+: { | ||
    sourceMap+:: { | ||
      tensorboard+: { | ||
        aggregateAssertionsMap+:: { | ||
          examples_per_second: { | ||
            AVERAGE: { | ||
              inclusive_bounds: true, | ||
              std_devs_from_mean: { comparison: 'GREATER', std_devs: 4.0 }, | ||
              wait_for_n_data_points: 0, | ||
            }, | ||
          }, | ||
        }, | ||
      }, | ||
    }, | ||
  }, | ||
}, | ||
// Override default schedule for Functional. | ||
// Mixin whose only field sets `schedule` to the shared functional_schedule value. | ||
RunNightly:: { | ||
schedule: functional_schedule, | ||
}, | ||
// Convergence test mode: extends mixins.Convergence with its own schedule string and an | ||
// aggregate assertion on the tensorboard `examples_per_second` AVERAGE. | ||
Convergence:: mixins.Convergence { | ||
  schedule: '0 5 * * 0,2,4', | ||
  metricConfig+: { | ||
    sourceMap+:: { | ||
      tensorboard+: { | ||
        aggregateAssertionsMap+:: { | ||
          examples_per_second: { | ||
            AVERAGE: { | ||
              wait_for_n_data_points: 0, | ||
              inclusive_bounds: true, | ||
              std_devs_from_mean: { | ||
                // TODO(wcromar): Tighten this restriction | ||
                std_devs: 2.0, | ||
                comparison: 'GREATER', | ||
              }, | ||
            }, | ||
          }, | ||
        }, | ||
      }, | ||
    }, | ||
  }, | ||
}, | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
// Copyright 2020 Google LLC | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// https://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
local bert = import 'tf-bert-glue_mnli.libsonnet'; | ||
local dlrm = import 'tf-dlrm-criteo.libsonnet'; | ||
local gpt2 = import 'tf-gpt2-wikitext.libsonnet'; | ||
local keras_api = import 'tf-keras-api.libsonnet'; | ||
local maskrcnn = import 'tf-maskrcnn-coco.libsonnet'; | ||
local resnet = import 'tf-resnet-imagenet.libsonnet'; | ||
local resnetrs = import 'tf-resnetrs-imagenet.libsonnet'; | ||
local retinanet = import 'tf-retinanet-coco.libsonnet'; | ||
local wmt = import 'tf-wmt-wmt14_translate.libsonnet'; | ||
|
||
// Per-model config arrays, in the order they appear in the flattened output. | ||
// Add new models here | ||
local allModelConfigs = [ | ||
  dlrm.configs, | ||
  keras_api.configs, | ||
  bert.configs, | ||
  wmt.configs, | ||
  maskrcnn.configs, | ||
  retinanet.configs, | ||
  resnet.configs, | ||
  resnetrs.configs, | ||
  gpt2.configs, | ||
]; | ||
// The file evaluates to one flat array containing every model's configs. | ||
std.flattenArrays(allModelConfigs) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
v2-alpha-tpuv5