Multi-host GPU tests on GKE (#111)
* Multi-GPU tests on GKE

Change-Id: I8b6ef0f096d965d6538e1de3b9699bb271e15232

* Fit new test type into task API

Change-Id: I0507c9d9275cf3b920adfa75a065492d66dc0658

* split up `deploy_job`

Change-Id: I7683d82b3f1ebce32b047f967c66b18b042373ca

* limit epochs

Change-Id: Ibfabea0833dfaf3e9f3922438c0d5dca97463592

* Make cluster name a parameter

Change-Id: I5efbd5b7b7e61a471f2756f81bb0a8d23c8df355

* remove terraform from this PR

Change-Id: Ia14f9601cf5f254af19b339416e525962e6e1e33

* use new cluster

Change-Id: Ieca0e21a6e093f1635b60d1e29bbb663183d76e9

* comment

Change-Id: Ib1ae393fec7b9e9fe8b88e87812cbd0a7a9e9ecb

* set timeout

Change-Id: Ib54bd01bd54c5c785beb2dab1cddcaede1720ca9

* move to main dag

Change-Id: I0f28c7e5a49ac6c8e2655f59d5b9b9db4a5f09e9

* format

Change-Id: Ie32ce56bf74fb31b420d693251c52539d8d3f8c2

* factor out get_authenticated_client

Change-Id: I2a7213cc67d11f4089409cbc52e29b27a0d5fffb

* naming

Change-Id: Id0c6b916fb97ba4567caff6e7560cde21178e40c

* formatting

Change-Id: Ie97fc0545a399a8d088907afbe3e570818e5dab0

* format with correct version

Change-Id: I76ed5ca9ca0cce013ede30a451039c39690f4f17

* add commented dep

Change-Id: I79cb359c070b5bcd3b4d16188217c5ef9417b0c1

* docstrings

Change-Id: I433000f39c12dfb2d402ab445f9ba938048ebdb2

* update some comments

Change-Id: I5f83e564b15c8a97cb9805702e8162e8e29ef326

* implement timeout

Change-Id: Iffc7b7d551e317e130f12c384213df8a5a56c3c9

* typo

Change-Id: I0b0716b19bfbade576be13df7cec92ef286e0543

* formatting

Change-Id: I4fd2f13f8faa6705c7217f321562a42a9bce5345

* format with correct pyink version

Change-Id: Ief7e35acc1dc5a54b2a8f1b2f6a442230d973731

* remove extra dag

Change-Id: I29eadfea4bb89fb2a77a6c8752b5f7da6635fdef

* fix data type

Change-Id: I4dfcedfbd71bb17b7144254c9e6594468539aecc

* formatting

Change-Id: Icfd3ec316e068d2d71d120685ac7857d37f43b17

* return -> raise

Change-Id: Ic607c47aafb439729268a645e8210a47a7ca7ad2

* add myself to a TODO

Change-Id: If3bc4d60890fb0545d269433c516db9dfd538306

* TODO for big class

Change-Id: Iad49f733c866f5309b968097cc724571de2d9e8d

* `run_model`

Change-Id: I04bb3b098d789328bbf7749f70b08e5a056aad01
will-cromar authored Feb 22, 2024
1 parent f511cd2 commit d56f394
Showing 10 changed files with 406 additions and 50 deletions.
3 changes: 2 additions & 1 deletion .github/requirements.txt
@@ -7,4 +7,5 @@ google-cloud-container
google-cloud-tpu>=1.16.0
jsonlines
tensorflow-cpu
apache-airflow-providers-cncf-kubernetes
apache-airflow-providers-cncf-kubernetes
kubernetes
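
The new `kubernetes` dependency is what lets the Airflow tasks drive the GKE cluster directly. The PR's `get_authenticated_client` helper is not part of this diff, so the snippet below is only a sketch of the usual pattern for authenticating the `kubernetes` client against a GKE cluster with Application Default Credentials (project, region, and cluster name are placeholders):

```python
# Sketch only: the repository's actual get_authenticated_client helper is not
# shown in this diff. This illustrates the common GKE authentication pattern.
import base64
import tempfile

import google.auth
import google.auth.transport.requests
import kubernetes
from google.cloud import container_v1


def get_authenticated_client(project_id: str, region: str, cluster_name: str):
  """Returns a kubernetes ApiClient authenticated against a GKE cluster."""
  # Look up the cluster endpoint and CA certificate from the GKE API.
  manager = container_v1.ClusterManagerClient()
  cluster = manager.get_cluster(
      name=f"projects/{project_id}/locations/{region}/clusters/{cluster_name}"
  )

  # Use the environment's default credentials for the bearer token.
  creds, _ = google.auth.default()
  creds.refresh(google.auth.transport.requests.Request())

  # The kubernetes client expects the CA certificate as a file path.
  ca_cert = tempfile.NamedTemporaryFile(delete=False, suffix=".crt")
  ca_cert.write(base64.b64decode(cluster.master_auth.cluster_ca_certificate))
  ca_cert.close()

  configuration = kubernetes.client.Configuration()
  configuration.host = f"https://{cluster.endpoint}"
  configuration.ssl_ca_cert = ca_cert.name
  configuration.api_key["authorization"] = f"Bearer {creds.token}"
  return kubernetes.client.ApiClient(configuration)
```
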
11 changes: 6 additions & 5 deletions dags/legacy_test/templates/gpus.libsonnet
@@ -18,11 +18,14 @@ local base = import 'base.libsonnet';
GPUSpec:: base.BaseAccelerator {
local gpu = self,

name: '%(version)s-x%(count)d' % gpu,
name: '%(version)s-x%(count)dx%(num_hosts)d' % gpu,
type: 'gpu',
version: error 'Must specify GPUSpec `version`',
count: 1,
replicas: gpu.count,
num_hosts: 1,
// Label used in GCE API
accelerator_type: error 'Must specify GPUSpec `accelerator_type`',

// Ignore TPU settings.
PodTemplate(_):: {
@@ -43,8 +46,6 @@
},
},

teslaK80: self.GPUSpec { version: 'k80' },
teslaV100: self.GPUSpec { version: 'v100' },
teslaA100: self.GPUSpec { version: 'a100' },
teslaT4: self.GPUSpec { version: 't4' },
teslaV100: self.GPUSpec { version: 'v100', accelerator_type: 'nvidia-tesla-v100' },
teslaA100: self.GPUSpec { version: 'a100', accelerator_type: 'nvidia-tesla-a100' },
}
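
The accelerator name template now also encodes the host count, so single-host and multi-host variants of the same GPU get distinct benchmark names. A quick illustration of the expansion (Python's `%`-dict formatting behaves the same way as the Jsonnet template here):

```python
# Illustration of the new name template '%(version)s-x%(count)dx%(num_hosts)d'.
gpu = {"version": "v100", "count": 2, "num_hosts": 1}
print("%(version)s-x%(count)dx%(num_hosts)d" % gpu)  # -> v100-x2x1

gpu["num_hosts"] = 2
print("%(version)s-x%(count)dx%(num_hosts)d" % gpu)  # -> v100-x2x2
```
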
33 changes: 31 additions & 2 deletions dags/legacy_test/tests/pytorch/nightly/common.libsonnet
@@ -24,7 +24,7 @@ local volumes = import 'templates/volumes.libsonnet';
tpuSettings+: {
softwareVersion: 'pytorch-nightly',
},
imageTag: 'nightly_3.7',
imageTag: 'nightly_3.8',
},
PyTorchTest:: common.PyTorchTest + Nightly {
local config = self,
@@ -120,7 +120,36 @@
},
GpuMixin:: {
local config = self,
imageTag+: '_cuda_11.8',
imageTag+: '_cuda_12.1',

entrypoint: [
'bash',
'-cxue',
|||
export PATH=/usr/local/nvidia/bin${PATH:+:${PATH}}
export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/nvidia/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
nvidia-smi
pip3 uninstall -y torch torchvision
pip3 install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu
mkdir pytorch
wget https://github.com/pytorch/xla/archive/refs/heads/master.tar.gz -O - | tar xzf -
mv xla-master pytorch/xla
export PJRT_DEVICE=CUDA
# Run whatever is in `command` here
"${@:0}"
|||,
],
command: [
'torchrun',
'--nnodes=%d' % config.accelerator.num_hosts,
'--node_rank=$(JOB_COMPLETION_INDEX)',
'--nproc_per_node=%d' % config.accelerator.count,
'--rdzv_endpoint=$(JOB_NAME)-0.headless-svc:12355',
] + super.command[1:],

podTemplate+:: {
spec+: {
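
The mixin wraps the original test command in a `torchrun` launcher so that every host joins the same rendezvous. For the 2-host, 2-GPU-per-host config added below, the command array should expand roughly as follows; this is a sketch of the template expansion, with `JOB_NAME` and `JOB_COMPLETION_INDEX` substituted into each pod by the indexed Job:

```python
# Rough expansion of the torchrun launcher for count=2, num_hosts=2.
# Illustrative only -- the real values are rendered by Jsonnet and the
# $(...) variables are expanded by Kubernetes inside each pod.
num_hosts, gpus_per_host = 2, 2
command = [
    "torchrun",
    f"--nnodes={num_hosts}",
    "--node_rank=$(JOB_COMPLETION_INDEX)",  # this pod's index within the Job
    f"--nproc_per_node={gpus_per_host}",
    # Rank 0's pod, addressed through the cluster's headless service.
    "--rdzv_endpoint=$(JOB_NAME)-0.headless-svc:12355",
    # ...followed by the original test script and flags (super.command[1:]).
]
print(" ".join(command))
```
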
20 changes: 16 additions & 4 deletions dags/legacy_test/tests/pytorch/nightly/resnet50-mp.libsonnet
@@ -141,20 +141,31 @@ local tpus = import 'templates/tpus.libsonnet';

local gpu = self.gpu,
gpu:: common.GpuMixin {
local config = self,

cpu: '7.0',
memory: '40Gi',

// Disable XLA metrics report on GPU
command+: [
'--pjrt_distributed',
'--nometrics_debug',
'--num_epochs=2',
],
flags+: {
modelDir: null,
},

jobTemplate+:: {
spec+: {
completionMode: 'Indexed',
completions: config.accelerator.num_hosts,
parallelism: config.accelerator.num_hosts,
}
},
},
local v100x4 = self.v100x4,
v100x4:: gpu {
accelerator: gpus.teslaV100 { count: 4 },
local v100x2x2 = self.v100x2x2,
v100x2x2:: gpu {
accelerator: gpus.teslaV100 { count: 2, num_hosts: 2 },
},

local pjrt_ddp = self.pjrt_ddp,
@@ -194,6 +205,7 @@
},

configs: [
resnet50 + fake_data + v100x2x2 + timeouts.Hours(3),
// PJRT
resnet50 + fake_data + v2_8 + timeouts.Hours(3) + pjrt,
resnet50 + fake_data + v3_8 + timeouts.Hours(2) + pjrt,
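
The `jobTemplate` override turns the test into an indexed Job sized by the host count, which is what gives each pod the stable completion index that `torchrun` uses as its node rank. For the `v100x2x2` config this is meant to render to fields along these lines (a sketch, assuming `num_hosts: 2`):

```python
# Intended Job spec fields for accelerator = teslaV100 { count: 2, num_hosts: 2 }.
num_hosts = 2
job_spec = {
    "completionMode": "Indexed",  # each pod gets a stable JOB_COMPLETION_INDEX
    "completions": num_hosts,  # one pod per host
    "parallelism": num_hosts,  # run all hosts at the same time
}
```
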
13 changes: 13 additions & 0 deletions dags/pytorch_xla/pytorchxla_torchvision.py
@@ -32,6 +32,13 @@
metric_config.DatasetOption.XLML_DATASET,
)

US_CENTRAL1 = gcp_config.GCPConfig(
Project.CLOUD_ML_AUTO_SOLUTIONS.value,
# HACK: use region in place of zone, since clusters are regional
zone="us-central1",
dataset_name=...,
)


with models.DAG(
dag_id="pytorchxla-torchvision",
@@ -59,3 +66,9 @@

mnist_v2_8 >> resnet_v2_8
mnist_v2_8 >> resnet_v4_8

resnet_v100_2x2 = task.GpuGkeTask(
test_config.JSonnetGpuTest.from_pytorch("pt-nightly-resnet50-mp-fake-v100-x2x2"),
US_CENTRAL1,
"gpu-uc1",
).run()
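
`GpuGkeTask.run()` returns a task group just like the other task types, so the new GPU test can be wired into the existing dependency graph with the usual operator. A hypothetical ordering, not part of this PR:

```python
# Hypothetical: gate the GPU run on the MNIST smoke test, mirroring the TPU runs.
mnist_v2_8 >> resnet_v100_2x2
```
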
1 change: 1 addition & 0 deletions deployment/cloud_composer_template.tf
@@ -141,6 +141,7 @@ resource "google_composer_environment" "example_environment" {
# google-cloud-container = ""
# tensorflow-cpu = ""
# apache-airflow-providers-cncf-kubernetes = ""
# kubernetes = ""
}
}

124 changes: 122 additions & 2 deletions xlml/apis/task.py
@@ -17,12 +17,13 @@
import abc
import dataclasses
import datetime
from typing import Optional, Tuple
import shlex
from typing import Any, Dict, Optional, Tuple
import airflow
from airflow.models.taskmixin import DAGNode
from airflow.utils.task_group import TaskGroup
from xlml.apis import gcp_config, metric_config, test_config
from xlml.utils import gpu, metric, name_format, ssh, tpu, xpk, startup_script
from xlml.utils import gpu, metric, name_format, ssh, tpu, xpk, gke, startup_script


class BaseTask(abc.ABC):
@@ -503,3 +504,122 @@ def clean_up(self, resource: airflow.XComArg, project_id: str, zone: str) -> DAG
AirflowTaskTimeout: An error occurs when execution_timeout is breached.
"""
return gpu.delete_resource.override(group_id="clean_up")(resource, project_id, zone)


# TODO(ranran): This class is big. Let's move it to a new file.
@dataclasses.dataclass
class GpuGkeTask(BaseTask):
  """This is a class to set up tasks for GPU on a GKE cluster.

  Attributes:
    task_test_config: JSonnet test config for a GPU workload.
    task_gcp_config: GCP project and location for the run.
    cluster_name: Name of the GKE cluster to run the test on.
    job_create_timeout: Amount of time to wait for all pods to become active.
  """

  task_test_config: test_config.JSonnetGpuTest
  task_gcp_config: gcp_config.GCPConfig
  cluster_name: str
  job_create_timeout: datetime.timedelta = datetime.timedelta(minutes=10)
  # TODO(wcromar): job history metrics
  # task_metric_config: Optional[metric_config.MetricConfig] = None

  def run(self) -> DAGNode:
    """Run a test job.

    Returns:
      A task group that runs the given test config on a GKE cluster.
    """
    with TaskGroup(
        group_id=self.task_test_config.benchmark_id, prefix_group_id=True
    ) as group:
      job_body = self._get_job_manifest()
      gke.run_job.override(group_id="run_model")(
          job_body, self.task_gcp_config, self.cluster_name, self.job_create_timeout
      )

    return group

  def _get_job_manifest(self):
    return {
        "apiVersion": "batch/v1",
        "kind": "Job",
        "metadata": {
            "generateName": f"{self.task_test_config.benchmark_id}-",
            "labels": {
                "accelerator": self.task_test_config.accelerator.name,
                "benchmarkId": self.task_test_config.benchmark_id,
            },
        },
        "spec": {
            "activeDeadlineSeconds": int(
                datetime.timedelta(
                    minutes=self.task_test_config.time_out_in_min or 60
                ).total_seconds()
            ),
            "backoffLimit": 0,
            "completionMode": "Indexed",
            "completions": self.task_test_config.num_hosts,
            "parallelism": self.task_test_config.num_hosts,
            "template": {
                "metadata": {
                    # Matches `headless-svc` in GKE cluster. See deployments directory.
                    "labels": {"headless-svc": "true"},
                },
                "spec": {
                    "subdomain": "headless-svc",
                    "nodeSelector": {
                        "cloud.google.com/gke-accelerator": self.task_test_config.accelerator.accelerator_type,
                    },
                    "restartPolicy": "Never",
                    "containers": [
                        {
                            "name": "main",
                            "image": self.task_test_config.docker_image,
                            "imagePullPolicy": "Always",
                            "command": shlex.split(self.task_test_config.setup_script),
                            "args": shlex.split(self.task_test_config.test_script),
                            "resources": {
                                "limits": {
                                    "nvidia.com/gpu": self.task_test_config.accelerator.count,
                                }
                            },
                            "env": [
                                {
                                    "name": "POD_NAME",
                                    "valueFrom": {
                                        "fieldRef": {"fieldPath": "metadata.name"}
                                    },
                                },
                                {
                                    "name": "POD_NAMESPACE",
                                    "valueFrom": {
                                        "fieldRef": {"fieldPath": "metadata.namespace"}
                                    },
                                },
                                {
                                    "name": "JOB_NAME",
                                    "valueFrom": {
                                        "fieldRef": {
                                            "fieldPath": "metadata.labels['job-name']"
                                        }
                                    },
                                },
                            ],
                            "volumeMounts": [
                                {
                                    "mountPath": "/dev/shm",
                                    "name": "dshm",
                                    "readOnly": False,
                                },
                            ],
                        },
                    ],
                    "volumes": [
                        {"emptyDir": {"medium": "Memory"}, "name": "dshm"},
                    ],
                },
            },
        },
    }
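
Because `_get_job_manifest` is pure data, the manifest can be rendered and inspected without touching a cluster. A minimal sketch, assuming the `US_CENTRAL1` config defined in `pytorchxla_torchvision.py` above and PyYAML on the path:

```python
# Sketch: render the Job manifest for the new v100 2x2 test and sanity-check
# the indexed-completion fields locally, without submitting anything to GKE.
import yaml

from xlml.apis import task, test_config

cfg = test_config.JSonnetGpuTest.from_pytorch(
    "pt-nightly-resnet50-mp-fake-v100-x2x2"
)
gke_task = task.GpuGkeTask(cfg, US_CENTRAL1, "gpu-uc1")  # US_CENTRAL1 as in the DAG
manifest = gke_task._get_job_manifest()

assert manifest["spec"]["completionMode"] == "Indexed"
assert manifest["spec"]["completions"] == cfg.num_hosts
print(yaml.safe_dump(manifest, sort_keys=False))
```
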