Skip to content

Commit

Permalink
Fix se tests (#88)
Browse files Browse the repository at this point in the history
* Fix se tests

* fix formatting

* revert format-codes.sh file changes

* Fix runtime versions

* Reformat
  • Loading branch information
chandrasekhard2 authored Feb 6, 2024
1 parent 218a49f commit 5e0d91f
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 23 deletions.
13 changes: 11 additions & 2 deletions dags/solutions_team/configs/tensorflow/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,17 @@
}


def set_up_pjrt_nightly() -> Tuple[str]:
"""Common set up for PJRT nightly tests."""
def set_up_se_nightly() -> Tuple[str, ...]:
  """Return shell commands that adjust grpc_tpu_worker for SE tests.

  The commands, in order:
    1. Rewrite the `TF_DOCKER_URL=...` setting in
       /etc/systemd/system/tpu-runtime.service so it references the
       `gcr.io/cloud-tpu-v2-images-dev/grpc_tpu_worker:nightly` image.
    2. Reload systemd and restart the `tpu-runtime` service.
    3. Print the service file so the result is visible in the logs.

  Returns:
    A tuple of three shell command strings.
  """
  # The backslashes are doubled so Python emits a literal `\/` for sed;
  # a bare `\/` is an invalid escape sequence in a non-raw string literal.
  # The trailing `"` re-closes the quoted value that `.*` consumed.
  # NOTE(review): assumes the unit file wraps TF_DOCKER_URL in double
  # quotes -- confirm against the service file.
  rewrite_docker_url = (
      "sudo sed -i 's/TF_DOCKER_URL=.*/"
      "TF_DOCKER_URL=gcr.io\\/cloud-tpu-v2-images-dev\\/grpc_tpu_worker:nightly\"/'"
      " /etc/systemd/system/tpu-runtime.service"
  )
  return (
      rewrite_docker_url,
      "sudo systemctl daemon-reload && sudo systemctl restart tpu-runtime",
      "cat /etc/systemd/system/tpu-runtime.service",
  )


def install_tf_nightly() -> Tuple[str]:
"""Install tf nightly + libtpu."""
return (
"pip install tensorflow-text-nightly",
"sudo gsutil -m cp gs://cloud-tpu-v2-images-dev-artifacts/tensorflow/tf-nightly/latest/*.whl /tmp/ && pip install /tmp/tf*.whl --force",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,9 @@ def get_tf_keras_config(
dataset_name=metric_config.DatasetOption.XLML_DATASET,
)

set_up_cmds = common.set_up_pjrt_nightly() + common.set_up_tensorflow_keras()
set_up_cmds = common.install_tf_nightly() + common.set_up_tensorflow_keras()
if not is_pjrt and is_pod:
set_up_cmds += common.set_up_se_nightly()
keras_test_name = f"tf_keras_api_{test_name}"
benchmark_id = f"{keras_test_name}-v{tpu_version.value}-{tpu_cores}"
# Add default_var to pass DAG check
Expand Down Expand Up @@ -108,7 +110,10 @@ def get_tf_resnet_config(
dataset_name=metric_config.DatasetOption.XLML_DATASET,
)

set_up_cmds = common.set_up_pjrt_nightly() + common.set_up_google_tensorflow_models()
set_up_cmds = common.install_tf_nightly() + common.set_up_google_tensorflow_models()
if not is_pjrt and is_pod:
set_up_cmds += common.set_up_se_nightly()

params_override = {
"runtime": {"distribution_strategy": "tpu"},
"task": {
Expand Down Expand Up @@ -142,7 +147,7 @@ def get_tf_resnet_config(
" PYTHONPATH='.' TF_USE_LEGACY_KERAS=1"
" python3 official/vision/train.py"
f" --tpu={tpu_name} --experiment=resnet_imagenet"
" --mode=train_and_eval --model_dir=/tmp/output"
" --mode=train_and_eval --model_dir=/tmp/"
" --params_override='%s'" % str(params_override)
),
)
Expand Down Expand Up @@ -194,7 +199,10 @@ def get_tf_dlrm_config(
dataset_name=metric_config.DatasetOption.XLML_DATASET,
)

set_up_cmds = common.set_up_google_tensorflow_models()
set_up_cmds = common.install_tf_nightly() + common.set_up_google_tensorflow_models()
if not is_pjrt and is_pod:
set_up_cmds += common.set_up_se_nightly()

params_override = {
"runtime": {"distribution_strategy": "tpu"},
"task": {
Expand Down
6 changes: 3 additions & 3 deletions dags/solutions_team/solutionsteam_tf_nightly_supported.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@
tpu_zone=Zone.US_CENTRAL2_B.value,
time_out_in_min=60,
is_pod=True,
runtime_version=RuntimeVersion.TPU_VM_TF_2150_POD_PJRT.value,
runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY_POD.value,
).run()

tf_resnet_v5e_4 = tf_config.get_tf_resnet_config(
Expand All @@ -133,7 +133,7 @@
network=V5_NETWORKS,
subnetwork=V5E_SUBNETWORKS,
is_pod=True,
runtime_version=RuntimeVersion.TPU_VM_TF_2150_POD_PJRT.value,
runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY_POD.value,
).run()

tf_resnet_v5p_8 = tf_config.get_tf_resnet_config(
Expand All @@ -155,7 +155,7 @@
network=V5_NETWORKS,
subnetwork=V5P_SUBNETWORKS,
is_pod=True,
runtime_version=RuntimeVersion.TPU_VM_TF_2150_POD_PJRT.value,
runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY_POD.value,
).run()

# Test dependencies
Expand Down
25 changes: 11 additions & 14 deletions dags/solutions_team/solutionsteam_tf_se_nightly_supported.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
# Run once a day at 4 pm UTC (8 am PST)
SCHEDULED_TIME = "0 16 * * *" if composer_env.is_prod_env() else None


with models.DAG(
dag_id="tf_se_nightly_supported",
schedule=SCHEDULED_TIME,
Expand All @@ -45,7 +44,7 @@
test_feature=feature,
test_name=name,
is_pjrt=False,
runtime_version=RuntimeVersion.TPU_VM_TF_2150_SE.value,
runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY.value,
).run()
tf_keras_v2_8[-1] >> test
tf_keras_v2_8.append(test)
Expand All @@ -58,7 +57,7 @@
time_out_in_min=60,
global_batch_size=1024,
is_pjrt=False,
runtime_version=RuntimeVersion.TPU_VM_TF_2150_SE.value,
runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY.value,
).run()

tf_resnet_v2_32 = tf_config.get_tf_resnet_config(
Expand All @@ -69,7 +68,7 @@
global_batch_size=1024,
is_pod=True,
is_pjrt=False,
runtime_version=RuntimeVersion.TPU_VM_TF_2150_POD_SE.value,
runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY_POD.value,
).run()

tf_resnet_v3_8 = tf_config.get_tf_resnet_config(
Expand All @@ -78,7 +77,7 @@
tpu_zone=Zone.US_EAST1_D.value,
time_out_in_min=60,
is_pjrt=False,
runtime_version=RuntimeVersion.TPU_VM_TF_2150_SE.value,
runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY.value,
).run()

tf_resnet_v3_32 = tf_config.get_tf_resnet_config(
Expand All @@ -88,7 +87,7 @@
time_out_in_min=60,
is_pod=True,
is_pjrt=False,
runtime_version=RuntimeVersion.TPU_VM_TF_2150_POD_SE.value,
runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY_POD.value,
).run()

tf_resnet_v4_8 = tf_config.get_tf_resnet_config(
Expand All @@ -97,19 +96,17 @@
tpu_zone=Zone.US_CENTRAL2_B.value,
time_out_in_min=60,
is_pjrt=False,
runtime_version=RuntimeVersion.TPU_VM_TF_2150_SE.value,
runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY.value,
).run()

tf_resnet_v4_32 = tf_config.get_tf_resnet_config(
tpu_version=TpuVersion.V4,
tpu_cores=32,
tpu_zone=Zone.US_CENTRAL2_B.value,
time_out_in_min=60,
is_pod=True,
is_pjrt=False,
runtime_version=RuntimeVersion.TPU_VM_TF_2150_POD_SE.value,
runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY_POD.value,
).run()

# DLRM
tf_dlrm_v2_8 = tf_config.get_tf_dlrm_config(
tpu_version=TpuVersion.V2,
Expand All @@ -121,7 +118,7 @@
train_steps=10000,
extraFlags="--mode=train",
is_pjrt=False,
runtime_version=RuntimeVersion.TPU_VM_TF_2150_SE.value,
runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY.value,
).run()

tf_dlrm_v2_32 = tf_config.get_tf_dlrm_config(
Expand All @@ -135,7 +132,7 @@
extraFlags="--mode=train_and_eval",
is_pod=True,
is_pjrt=False,
runtime_version=RuntimeVersion.TPU_VM_TF_2150_POD_SE.value,
runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY_POD.value,
).run()

tf_dlrm_v4_8 = tf_config.get_tf_dlrm_config(
Expand All @@ -148,7 +145,7 @@
train_steps=10000,
extraFlags="--mode=train",
is_pjrt=False,
runtime_version=RuntimeVersion.TPU_VM_TF_2150_SE.value,
runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY.value,
).run()

tf_dlrm_v4_32 = tf_config.get_tf_dlrm_config(
Expand All @@ -162,7 +159,7 @@
extraFlags="--mode=train_and_eval",
is_pod=True,
is_pjrt=False,
runtime_version=RuntimeVersion.TPU_VM_TF_2150_POD_SE.value,
runtime_version=RuntimeVersion.TPU_VM_TF_NIGHTLY_POD.value,
).run()

# Test dependencies
Expand Down
Empty file modified scripts/format-codes.sh
100644 → 100755
Empty file.

0 comments on commit 5e0d91f

Please sign in to comment.