Multi-host GPU tests on GKE #111

Merged · 30 commits · Feb 22, 2024

Commits (changes shown are from all commits):
72c4afc
Multi-GPU tests on GKE
will-cromar Feb 12, 2024
6c4dcfd
Fit new test type into task API
will-cromar Feb 13, 2024
dbb523f
split up `deploy_job`
will-cromar Feb 15, 2024
5eff685
limit epochs
will-cromar Feb 15, 2024
4842cdf
Make cluster name a parameter
will-cromar Feb 16, 2024
d3cd9c6
remove terraform from this PR
will-cromar Feb 16, 2024
57dc55d
use new cluster
will-cromar Feb 16, 2024
074c4e2
comment
will-cromar Feb 16, 2024
9d74f72
set timeout
will-cromar Feb 16, 2024
24d030c
move to main dag
will-cromar Feb 16, 2024
c2e0d3d
format
will-cromar Feb 16, 2024
3a75b0a
factor out get_authenticated_client
will-cromar Feb 16, 2024
61a28dd
naming
will-cromar Feb 16, 2024
bad8b38
formatting
will-cromar Feb 16, 2024
b08623e
format with correct version
will-cromar Feb 16, 2024
c59a53f
add commented dep
will-cromar Feb 20, 2024
eebc648
docstrings
will-cromar Feb 20, 2024
33a7de4
update some comments
will-cromar Feb 20, 2024
191f582
implement timeout
will-cromar Feb 20, 2024
9907376
typo
will-cromar Feb 20, 2024
bb27148
formatting
will-cromar Feb 20, 2024
4fa2653
format with correct pyink version
will-cromar Feb 20, 2024
3c55699
remove extra dag
will-cromar Feb 20, 2024
bd4099b
fix data type
will-cromar Feb 20, 2024
e3de889
formatting
will-cromar Feb 20, 2024
163a809
return -> raise
will-cromar Feb 20, 2024
c11f120
add myself to a TODO
will-cromar Feb 22, 2024
ce82e82
TODO for big class
will-cromar Feb 22, 2024
397d7cb
`run_model`
will-cromar Feb 22, 2024
b0ecd54
Merge branch 'master' into wcromar/multi-gpu-test
will-cromar Feb 22, 2024
3 changes: 2 additions & 1 deletion .github/requirements.txt
@@ -7,4 +7,5 @@ google-cloud-container
google-cloud-tpu>=1.16.0
jsonlines
tensorflow-cpu
apache-airflow-providers-cncf-kubernetes
kubernetes
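
The new `kubernetes` dependency backs the GKE job-submission path (`gke.run_job` and `get_authenticated_client` in `xlml/utils`, which are not shown in this diff view). As a rough, hedged sketch of how such a client is typically built from GKE cluster metadata — the helper name and exact flow here are assumptions to motivate the dependency, not the code in this PR:

```python
import base64
import tempfile

import google.auth
import google.auth.transport.requests
import kubernetes
from google.cloud import container_v1


def get_authenticated_client(project_id: str, region: str, cluster_name: str) -> kubernetes.client.BatchV1Api:
  """Builds a Kubernetes Batch API client for a GKE cluster (illustrative sketch only)."""
  gke = container_v1.ClusterManagerClient()
  cluster = gke.get_cluster(
      name=f"projects/{project_id}/locations/{region}/clusters/{cluster_name}"
  )

  # Use Application Default Credentials for the bearer token.
  creds, _ = google.auth.default()
  creds.refresh(google.auth.transport.requests.Request())

  # Write the cluster CA certificate to a temp file for TLS verification.
  ca_cert = tempfile.NamedTemporaryFile(delete=False, suffix=".crt")
  ca_cert.write(base64.b64decode(cluster.master_auth.cluster_ca_certificate))
  ca_cert.close()

  config = kubernetes.client.Configuration()
  config.host = f"https://{cluster.endpoint}"
  config.ssl_ca_cert = ca_cert.name
  config.api_key["authorization"] = f"Bearer {creds.token}"
  return kubernetes.client.BatchV1Api(kubernetes.client.ApiClient(config))
```

The actual implementation in `xlml/utils/gke.py` may differ; the sketch only shows why both `google-cloud-container` and `kubernetes` are needed together.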
11 changes: 6 additions & 5 deletions dags/legacy_test/templates/gpus.libsonnet
@@ -18,11 +18,14 @@ local base = import 'base.libsonnet';
GPUSpec:: base.BaseAccelerator {
local gpu = self,

name: '%(version)s-x%(count)d' % gpu,
name: '%(version)s-x%(count)dx%(num_hosts)d' % gpu,
type: 'gpu',
version: error 'Must specify GPUSpec `version`',
count: 1,
replicas: gpu.count,
num_hosts: 1,
// Label used in GCE API
accelerator_type: error 'Must specify GPUSpec `accelerator_type`',

// Ignore TPU settings.
PodTemplate(_):: {
@@ -43,8 +46,6 @@ local base = import 'base.libsonnet';
},
},

teslaK80: self.GPUSpec { version: 'k80' },
teslaV100: self.GPUSpec { version: 'v100' },
teslaA100: self.GPUSpec { version: 'a100' },
teslaT4: self.GPUSpec { version: 't4' },
teslaV100: self.GPUSpec { version: 'v100', accelerator_type: 'nvidia-tesla-v100' },
teslaA100: self.GPUSpec { version: 'a100', accelerator_type: 'nvidia-tesla-a100' },
}
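
The new `num_hosts` field is folded into the accelerator name through the `%(version)s-x%(count)dx%(num_hosts)d` format string above. Since Jsonnet's `%` formatting mirrors Python's, a quick illustrative check (the dict literal is only a stand-in for the GPUSpec object):

```python
# Stand-in for a GPUSpec with version='v100', count=2, num_hosts=2.
gpu = {"version": "v100", "count": 2, "num_hosts": 2}

name = "%(version)s-x%(count)dx%(num_hosts)d" % gpu
print(name)  # v100-x2x2 -- matches the "pt-nightly-resnet50-mp-fake-v100-x2x2" test ID used later
```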
33 changes: 31 additions & 2 deletions dags/legacy_test/tests/pytorch/nightly/common.libsonnet
@@ -24,7 +24,7 @@ local volumes = import 'templates/volumes.libsonnet';
tpuSettings+: {
softwareVersion: 'pytorch-nightly',
},
imageTag: 'nightly_3.7',
imageTag: 'nightly_3.8',
},
PyTorchTest:: common.PyTorchTest + Nightly {
local config = self,
@@ -120,7 +120,36 @@
},
GpuMixin:: {
local config = self,
imageTag+: '_cuda_11.8',
imageTag+: '_cuda_12.1',

entrypoint: [
'bash',
'-cxue',
|||
export PATH=/usr/local/nvidia/bin${PATH:+:${PATH}}
export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/nvidia/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}

nvidia-smi
pip3 uninstall -y torch torchvision
pip3 install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu

mkdir pytorch
wget https://github.com/pytorch/xla/archive/refs/heads/master.tar.gz -O - | tar xzf -
mv xla-master pytorch/xla

export PJRT_DEVICE=CUDA

# Run whatever is in `command` here
"${@:0}"
|||,
],
command: [
'torchrun',
'--nnodes=%d' % config.accelerator.num_hosts,
'--node_rank=$(JOB_COMPLETION_INDEX)',
'--nproc_per_node=%d' % config.accelerator.count,
'--rdzv_endpoint=$(JOB_NAME)-0.headless-svc:12355',
] + super.command[1:],

podTemplate+:: {
spec+: {
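
Net effect of the new `entrypoint`/`command` pair: every pod in the indexed Job runs the same `torchrun` launcher, taking its node rank from the Job completion index and pointing the rendezvous at pod 0 of the headless service. A minimal Python sketch of how those arguments resolve at runtime for the 2-host, 2-GPU config used below (illustrative only; the environment variables are supplied by the Indexed Job and by the pod spec in `xlml/apis/task.py`):

```python
import os

num_hosts = 2      # config.accelerator.num_hosts
gpus_per_host = 2  # config.accelerator.count

# JOB_COMPLETION_INDEX is set by Kubernetes for Indexed Jobs; JOB_NAME comes
# from the downward-API env var defined in the Job manifest further down.
torchrun_argv = [
    "torchrun",
    f"--nnodes={num_hosts}",
    f"--node_rank={os.environ.get('JOB_COMPLETION_INDEX', '0')}",
    f"--nproc_per_node={gpus_per_host}",
    f"--rdzv_endpoint={os.environ.get('JOB_NAME', 'job')}-0.headless-svc:12355",
]
# Pod 0 hosts the rendezvous; all pods dial <job-name>-0.headless-svc:12355.
```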
20 changes: 16 additions & 4 deletions dags/legacy_test/tests/pytorch/nightly/resnet50-mp.libsonnet
@@ -141,20 +141,31 @@ local tpus = import 'templates/tpus.libsonnet';

local gpu = self.gpu,
gpu:: common.GpuMixin {
local config = self,

cpu: '7.0',
memory: '40Gi',

// Disable XLA metrics report on GPU
command+: [
'--pjrt_distributed',
'--nometrics_debug',
'--num_epochs=2',
],
flags+: {
modelDir: null,
},

jobTemplate+:: {
spec+: {
completionMode: 'Indexed',
completions: config.accelerator.num_hosts,
parallelism: config.accelerator.num_hosts,
}
},
},
local v100x4 = self.v100x4,
v100x4:: gpu {
accelerator: gpus.teslaV100 { count: 4 },
local v100x2x2 = self.v100x2x2,
v100x2x2:: gpu {
accelerator: gpus.teslaV100 { count: 2, num_hosts: 2 },
},

local pjrt_ddp = self.pjrt_ddp,
@@ -194,6 +205,7 @@
},

configs: [
resnet50 + fake_data + v100x2x2 + timeouts.Hours(3),
// PJRT
resnet50 + fake_data + v2_8 + timeouts.Hours(3) + pjrt,
resnet50 + fake_data + v3_8 + timeouts.Hours(2) + pjrt,
13 changes: 13 additions & 0 deletions dags/pytorch_xla/pytorchxla_torchvision.py
@@ -32,6 +32,13 @@
metric_config.DatasetOption.XLML_DATASET,
)

US_CENTRAL1 = gcp_config.GCPConfig(
Project.CLOUD_ML_AUTO_SOLUTIONS.value,
# HACK: use region in place of zone, since clusters are regional
zone="us-central1",
dataset_name=...,
)


with models.DAG(
dag_id="pytorchxla-torchvision",
@@ -59,3 +66,9 @@

mnist_v2_8 >> resnet_v2_8
mnist_v2_8 >> resnet_v4_8

resnet_v100_2x2 = task.GpuGkeTask(
test_config.JSonnetGpuTest.from_pytorch("pt-nightly-resnet50-mp-fake-v100-x2x2"),
US_CENTRAL1,
"gpu-uc1",
).run()
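
For local debugging, the Job body that `GpuGkeTask` (added in `xlml/apis/task.py` below) submits can be rendered outside Airflow and dumped as YAML. A hedged sketch, reusing `US_CENTRAL1` and the test ID from the DAG above and assuming PyYAML is available; `_get_job_manifest` is a private helper, so treat this as a debugging aid rather than a supported API:

```python
import yaml  # PyYAML, assumed available in the dev environment

from xlml.apis import task, test_config

job = task.GpuGkeTask(
    test_config.JSonnetGpuTest.from_pytorch("pt-nightly-resnet50-mp-fake-v100-x2x2"),
    US_CENTRAL1,  # GCPConfig defined earlier in this DAG file
    "gpu-uc1",    # regional GKE cluster name used above
)
print(yaml.safe_dump(job._get_job_manifest(), sort_keys=False))
```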
1 change: 1 addition & 0 deletions deployment/cloud_composer_template.tf
@@ -141,6 +141,7 @@ resource "google_composer_environment" "example_environment" {
# google-cloud-container = ""
# tensorflow-cpu = ""
# apache-airflow-providers-cncf-kubernetes = ""
# kubernetes = ""
}
}

124 changes: 122 additions & 2 deletions xlml/apis/task.py
@@ -17,12 +17,13 @@
import abc
import dataclasses
import datetime
from typing import Optional, Tuple
import shlex
from typing import Any, Dict, Optional, Tuple
import airflow
from airflow.models.taskmixin import DAGNode
from airflow.utils.task_group import TaskGroup
from xlml.apis import gcp_config, metric_config, test_config
from xlml.utils import gpu, metric, name_format, ssh, tpu, xpk, startup_script
from xlml.utils import gpu, metric, name_format, ssh, tpu, xpk, gke, startup_script


class BaseTask(abc.ABC):
@@ -503,3 +504,122 @@ def clean_up(self, resource: airflow.XComArg, project_id: str, zone: str) -> DAG
AirflowTaskTimeout: An error occurs when execution_timeout is breached.
"""
return gpu.delete_resource.override(group_id="clean_up")(resource, project_id, zone)


# TODO(ranran): This class is big. Let's move it to a new file.
@dataclasses.dataclass
class GpuGkeTask(BaseTask):
"""This is a class to set up tasks for GPU on a GKE cluster.

Attributes:
task_test_config: JSonnet test config describing the GPU test to run.
task_gcp_config: GCP project and region configuration for the target cluster.
cluster_name: name of the GKE cluster to run the job on.
job_create_timeout: Amount of time to wait for all pods to become active.
"""

task_test_config: test_config.JSonnetGpuTest
task_gcp_config: gcp_config.GCPConfig
cluster_name: str
job_create_timeout: datetime.timedelta = datetime.timedelta(minutes=10)
# TODO(wcromar): job history metrics
# task_metric_config: Optional[metric_config.MetricConfig] = None

def run(self) -> DAGNode:
"""Run a test job.

Returns:
A task group that runs the given test config on a GKE cluster.
"""
with TaskGroup(
group_id=self.task_test_config.benchmark_id, prefix_group_id=True
) as group:
job_body = self._get_job_manifest()
gke.run_job.override(group_id="run_model")(
job_body, self.task_gcp_config, self.cluster_name, self.job_create_timeout
)

return group

def _get_job_manifest(self):
return {
"apiVersion": "batch/v1",
"kind": "Job",
"metadata": {
"generateName": f"{self.task_test_config.benchmark_id}-",
"labels": {
"accelerator": self.task_test_config.accelerator.name,
"benchmarkId": self.task_test_config.benchmark_id,
},
},
"spec": {
"activeDeadlineSeconds": int(
datetime.timedelta(
minutes=self.task_test_config.time_out_in_min or 60
).total_seconds()
),
"backoffLimit": 0,
"completionMode": "Indexed",
"completions": self.task_test_config.num_hosts,
"parallelism": self.task_test_config.num_hosts,
"template": {
"metadata": {
# Matches `headless-svc` in GKE cluster. See deployments directory.
"labels": {"headless-svc": "true"},
},
"spec": {
"subdomain": "headless-svc",
"nodeSelector": {
"cloud.google.com/gke-accelerator": self.task_test_config.accelerator.accelerator_type,
},
"restartPolicy": "Never",
"containers": [
{
"name": "main",
"image": self.task_test_config.docker_image,
"imagePullPolicy": "Always",
"command": shlex.split(self.task_test_config.setup_script),
"args": shlex.split(self.task_test_config.test_script),
"resources": {
"limits": {
"nvidia.com/gpu": self.task_test_config.accelerator.count,
}
},
"env": [
{
"name": "POD_NAME",
"valueFrom": {
"fieldRef": {"fieldPath": "metadata.name"}
},
},
{
"name": "POD_NAMESPACE",
"valueFrom": {
"fieldRef": {"fieldPath": "metadata.namespace"}
},
},
{
"name": "JOB_NAME",
"valueFrom": {
"fieldRef": {
"fieldPath": "metadata.labels['job-name']"
}
},
},
],
"volumeMounts": [
{
"mountPath": "/dev/shm",
"name": "dshm",
"readOnly": False,
},
],
},
],
"volumes": [
{"emptyDir": {"medium": "Memory"}, "name": "dshm"},
],
},
},
},
}
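
One detail in `_get_job_manifest` worth calling out: the container `command` and `args` come from `shlex.split` on the test config's setup and test scripts, so shell-style quoting is preserved when the strings are tokenized. A small illustration with made-up script strings (not values from the repo):

```python
import shlex

# Hypothetical stand-ins for test_config.setup_script / test_config.test_script.
setup_script = "bash -cxue 'nvidia-smi'"
test_script = "python3 test/test_train_mp_imagenet.py --fake_data --num_epochs=2"

command = shlex.split(setup_script)  # ['bash', '-cxue', 'nvidia-smi']
args = shlex.split(test_script)      # ['python3', 'test/test_train_mp_imagenet.py', '--fake_data', '--num_epochs=2']
```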