Multi-host GPU tests on GKE (#111)
* Multi-GPU tests on GKE

Change-Id: I8b6ef0f096d965d6538e1de3b9699bb271e15232

* Fit new test type into task API

Change-Id: I0507c9d9275cf3b920adfa75a065492d66dc0658

* split up `deploy_job`

Change-Id: I7683d82b3f1ebce32b047f967c66b18b042373ca

* limit epochs

Change-Id: Ibfabea0833dfaf3e9f3922438c0d5dca97463592

* Make cluster name a parameter

Change-Id: I5efbd5b7b7e61a471f2756f81bb0a8d23c8df355

* remove terraform from this PR

Change-Id: Ia14f9601cf5f254af19b339416e525962e6e1e33

* use new cluster

Change-Id: Ieca0e21a6e093f1635b60d1e29bbb663183d76e9

* comment

Change-Id: Ib1ae393fec7b9e9fe8b88e87812cbd0a7a9e9ecb

* set timeout

Change-Id: Ib54bd01bd54c5c785beb2dab1cddcaede1720ca9

* move to main dag

Change-Id: I0f28c7e5a49ac6c8e2655f59d5b9b9db4a5f09e9

* format

Change-Id: Ie32ce56bf74fb31b420d693251c52539d8d3f8c2

* factor out get_authenticated_client

Change-Id: I2a7213cc67d11f4089409cbc52e29b27a0d5fffb

* naming

Change-Id: Id0c6b916fb97ba4567caff6e7560cde21178e40c

* formatting

Change-Id: Ie97fc0545a399a8d088907afbe3e570818e5dab0

* format with correct version

Change-Id: I76ed5ca9ca0cce013ede30a451039c39690f4f17

* add commented dep

Change-Id: I79cb359c070b5bcd3b4d16188217c5ef9417b0c1

* docstrings

Change-Id: I433000f39c12dfb2d402ab445f9ba938048ebdb2

* update some comments

Change-Id: I5f83e564b15c8a97cb9805702e8162e8e29ef326

* implement timeout

Change-Id: Iffc7b7d551e317e130f12c384213df8a5a56c3c9

* typo

Change-Id: I0b0716b19bfbade576be13df7cec92ef286e0543

* formatting

Change-Id: I4fd2f13f8faa6705c7217f321562a42a9bce5345

* format with correct pyink version

Change-Id: Ief7e35acc1dc5a54b2a8f1b2f6a442230d973731

* remove extra dag

Change-Id: I29eadfea4bb89fb2a77a6c8752b5f7da6635fdef

* fix data type

Change-Id: I4dfcedfbd71bb17b7144254c9e6594468539aecc

* formatting

Change-Id: Icfd3ec316e068d2d71d120685ac7857d37f43b17

* return -> raise

Change-Id: Ic607c47aafb439729268a645e8210a47a7ca7ad2

* add myself to a TODO

Change-Id: If3bc4d60890fb0545d269433c516db9dfd538306

* TODO for big class

Change-Id: Iad49f733c866f5309b968097cc724571de2d9e8d

* `run_model`

Change-Id: I04bb3b098d789328bbf7749f70b08e5a056aad01
will-cromar authored Feb 22, 2024
1 parent f511cd2 commit d56f394
Showing 10 changed files with 406 additions and 50 deletions.
3 changes: 2 additions & 1 deletion .github/requirements.txt
@@ -7,4 +7,5 @@ google-cloud-container
google-cloud-tpu>=1.16.0
jsonlines
tensorflow-cpu
apache-airflow-providers-cncf-kubernetes
apache-airflow-providers-cncf-kubernetes
kubernetes
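
The new `kubernetes` dependency is what lets the Airflow tasks drive the GKE cluster directly. The PR's `get_authenticated_client` helper is not part of this diff, so the snippet below is only a sketch of the usual pattern for authenticating the `kubernetes` client against a GKE cluster with Application Default Credentials (project, region, and cluster name are placeholders):

```python
# Sketch only: the repository's actual get_authenticated_client helper is not
# shown in this diff. This illustrates the common GKE authentication pattern.
import base64
import tempfile

import google.auth
import google.auth.transport.requests
import kubernetes
from google.cloud import container_v1


def get_authenticated_client(project_id: str, region: str, cluster_name: str):
  """Returns a kubernetes ApiClient authenticated against a GKE cluster."""
  # Look up the cluster endpoint and CA certificate from the GKE API.
  manager = container_v1.ClusterManagerClient()
  cluster = manager.get_cluster(
      name=f"projects/{project_id}/locations/{region}/clusters/{cluster_name}"
  )

  # Use the environment's default credentials for the bearer token.
  creds, _ = google.auth.default()
  creds.refresh(google.auth.transport.requests.Request())

  # The kubernetes client expects the CA certificate as a file path.
  ca_cert = tempfile.NamedTemporaryFile(delete=False, suffix=".crt")
  ca_cert.write(base64.b64decode(cluster.master_auth.cluster_ca_certificate))
  ca_cert.close()

  configuration = kubernetes.client.Configuration()
  configuration.host = f"https://{cluster.endpoint}"
  configuration.ssl_ca_cert = ca_cert.name
  configuration.api_key["authorization"] = f"Bearer {creds.token}"
  return kubernetes.client.ApiClient(configuration)
```
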
11 changes: 6 additions & 5 deletions dags/legacy_test/templates/gpus.libsonnet
@@ -18,11 +18,14 @@ local base = import 'base.libsonnet';
GPUSpec:: base.BaseAccelerator {
local gpu = self,

name: '%(version)s-x%(count)d' % gpu,
name: '%(version)s-x%(count)dx%(num_hosts)d' % gpu,
type: 'gpu',
version: error 'Must specify GPUSpec `version`',
count: 1,
replicas: gpu.count,
num_hosts: 1,
// Label used in GCE API
accelerator_type: error 'Must specify GPUSpec `accelerator_type`',

// Ignore TPU settings.
PodTemplate(_):: {
@@ -43,8 +46,6 @@
},
},

teslaK80: self.GPUSpec { version: 'k80' },
teslaV100: self.GPUSpec { version: 'v100' },
teslaA100: self.GPUSpec { version: 'a100' },
teslaT4: self.GPUSpec { version: 't4' },
teslaV100: self.GPUSpec { version: 'v100', accelerator_type: 'nvidia-tesla-v100' },
teslaA100: self.GPUSpec { version: 'a100', accelerator_type: 'nvidia-tesla-a100' },
}
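
The accelerator name template now also encodes the host count, so single-host and multi-host variants of the same GPU get distinct benchmark names. A quick illustration of the expansion (Python's `%`-dict formatting behaves the same way as the Jsonnet template here):

```python
# Illustration of the new name template '%(version)s-x%(count)dx%(num_hosts)d'.
gpu = {"version": "v100", "count": 2, "num_hosts": 1}
print("%(version)s-x%(count)dx%(num_hosts)d" % gpu)  # -> v100-x2x1

gpu["num_hosts"] = 2
print("%(version)s-x%(count)dx%(num_hosts)d" % gpu)  # -> v100-x2x2
```
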
33 changes: 31 additions & 2 deletions dags/legacy_test/tests/pytorch/nightly/common.libsonnet
@@ -24,7 +24,7 @@ local volumes = import 'templates/volumes.libsonnet';
tpuSettings+: {
softwareVersion: 'pytorch-nightly',
},
imageTag: 'nightly_3.7',
imageTag: 'nightly_3.8',
},
PyTorchTest:: common.PyTorchTest + Nightly {
local config = self,
@@ -120,7 +120,36 @@
},
GpuMixin:: {
local config = self,
imageTag+: '_cuda_11.8',
imageTag+: '_cuda_12.1',

entrypoint: [
'bash',
'-cxue',
|||
export PATH=/usr/local/nvidia/bin${PATH:+:${PATH}}
export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/nvidia/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
nvidia-smi
pip3 uninstall -y torch torchvision
pip3 install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu
mkdir pytorch
wget https://github.com/pytorch/xla/archive/refs/heads/master.tar.gz -O - | tar xzf -
mv xla-master pytorch/xla
export PJRT_DEVICE=CUDA
# Run whatever is in `command` here
"${@:0}"
|||,
],
command: [
'torchrun',
'--nnodes=%d' % config.accelerator.num_hosts,
'--node_rank=$(JOB_COMPLETION_INDEX)',
'--nproc_per_node=%d' % config.accelerator.count,
'--rdzv_endpoint=$(JOB_NAME)-0.headless-svc:12355',
] + super.command[1:],

podTemplate+:: {
spec+: {
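
The mixin wraps the original test command in a `torchrun` launcher so that every host joins the same rendezvous. For the 2-host, 2-GPU-per-host config added below, the command array should expand roughly as follows; this is a sketch of the template expansion, with `JOB_NAME` and `JOB_COMPLETION_INDEX` substituted into each pod by the indexed Job:

```python
# Rough expansion of the torchrun launcher for count=2, num_hosts=2.
# Illustrative only -- the real values are rendered by Jsonnet and the
# $(...) variables are expanded by Kubernetes inside each pod.
num_hosts, gpus_per_host = 2, 2
command = [
    "torchrun",
    f"--nnodes={num_hosts}",
    "--node_rank=$(JOB_COMPLETION_INDEX)",  # this pod's index within the Job
    f"--nproc_per_node={gpus_per_host}",
    # Rank 0's pod, addressed through the cluster's headless service.
    "--rdzv_endpoint=$(JOB_NAME)-0.headless-svc:12355",
    # ...followed by the original test script and flags (super.command[1:]).
]
print(" ".join(command))
```
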
20 changes: 16 additions & 4 deletions dags/legacy_test/tests/pytorch/nightly/resnet50-mp.libsonnet
@@ -141,20 +141,31 @@ local tpus = import 'templates/tpus.libsonnet';

local gpu = self.gpu,
gpu:: common.GpuMixin {
local config = self,

cpu: '7.0',
memory: '40Gi',

// Disable XLA metrics report on GPU
command+: [
'--pjrt_distributed',
'--nometrics_debug',
'--num_epochs=2',
],
flags+: {
modelDir: null,
},

jobTemplate+:: {
spec+: {
completionMode: 'Indexed',
completions: config.accelerator.num_hosts,
parallelism: config.accelerator.num_hosts,
}
},
},
local v100x4 = self.v100x4,
v100x4:: gpu {
accelerator: gpus.teslaV100 { count: 4 },
local v100x2x2 = self.v100x2x2,
v100x2x2:: gpu {
accelerator: gpus.teslaV100 { count: 2, num_hosts: 2 },
},

local pjrt_ddp = self.pjrt_ddp,
@@ -194,6 +205,7 @@
},

configs: [
resnet50 + fake_data + v100x2x2 + timeouts.Hours(3),
// PJRT
resnet50 + fake_data + v2_8 + timeouts.Hours(3) + pjrt,
resnet50 + fake_data + v3_8 + timeouts.Hours(2) + pjrt,
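
The `jobTemplate` override turns the test into an indexed Job sized by the host count, which is what gives each pod the stable completion index that `torchrun` uses as its node rank. For the `v100x2x2` config this is meant to render to fields along these lines (a sketch, assuming `num_hosts: 2`):

```python
# Intended Job spec fields for accelerator = teslaV100 { count: 2, num_hosts: 2 }.
num_hosts = 2
job_spec = {
    "completionMode": "Indexed",  # each pod gets a stable JOB_COMPLETION_INDEX
    "completions": num_hosts,  # one pod per host
    "parallelism": num_hosts,  # run all hosts at the same time
}
```
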
13 changes: 13 additions & 0 deletions dags/pytorch_xla/pytorchxla_torchvision.py
@@ -32,6 +32,13 @@
metric_config.DatasetOption.XLML_DATASET,
)

US_CENTRAL1 = gcp_config.GCPConfig(
Project.CLOUD_ML_AUTO_SOLUTIONS.value,
# HACK: use region in place of zone, since clusters are regional
zone="us-central1",
dataset_name=...,
)


with models.DAG(
dag_id="pytorchxla-torchvision",
@@ -59,3 +66,9 @@

mnist_v2_8 >> resnet_v2_8
mnist_v2_8 >> resnet_v4_8

resnet_v100_2x2 = task.GpuGkeTask(
test_config.JSonnetGpuTest.from_pytorch("pt-nightly-resnet50-mp-fake-v100-x2x2"),
US_CENTRAL1,
"gpu-uc1",
).run()
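
`GpuGkeTask.run()` returns a task group just like the other task types, so the new GPU test can be wired into the existing dependency graph with the usual operator. A hypothetical ordering, not part of this PR:

```python
# Hypothetical: gate the GPU run on the MNIST smoke test, mirroring the TPU runs.
mnist_v2_8 >> resnet_v100_2x2
```
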
1 change: 1 addition & 0 deletions deployment/cloud_composer_template.tf
@@ -141,6 +141,7 @@ resource "google_composer_environment" "example_environment" {
# google-cloud-container = ""
# tensorflow-cpu = ""
# apache-airflow-providers-cncf-kubernetes = ""
# kubernetes = ""
}
}

124 changes: 122 additions & 2 deletions xlml/apis/task.py
@@ -17,12 +17,13 @@
import abc
import dataclasses
import datetime
from typing import Optional, Tuple
import shlex
from typing import Any, Dict, Optional, Tuple
import airflow
from airflow.models.taskmixin import DAGNode
from airflow.utils.task_group import TaskGroup
from xlml.apis import gcp_config, metric_config, test_config
from xlml.utils import gpu, metric, name_format, ssh, tpu, xpk, startup_script
from xlml.utils import gpu, metric, name_format, ssh, tpu, xpk, gke, startup_script


class BaseTask(abc.ABC):
@@ -503,3 +504,122 @@ def clean_up(self, resource: airflow.XComArg, project_id: str, zone: str) -> DAG
AirflowTaskTimeout: An error occurs when execution_timeout is breached.
"""
return gpu.delete_resource.override(group_id="clean_up")(resource, project_id, zone)


# TODO(ranran): This class is big. Let's move it to a new file.
@dataclasses.dataclass
class GpuGkeTask(BaseTask):
  """This is a class to set up tasks for GPU on a GKE cluster.

  Attributes:
    task_test_config: JSonnet test config for a GPU workload.
    task_gcp_config: GCP project and location for the run.
    cluster_name: Name of the GKE cluster to run the test on.
    job_create_timeout: Amount of time to wait for all pods to become active.
  """

  task_test_config: test_config.JSonnetGpuTest
  task_gcp_config: gcp_config.GCPConfig
  cluster_name: str
  job_create_timeout: datetime.timedelta = datetime.timedelta(minutes=10)
  # TODO(wcromar): job history metrics
  # task_metric_config: Optional[metric_config.MetricConfig] = None

  def run(self) -> DAGNode:
    """Run a test job.

    Returns:
      A task group that runs the given test config on a GKE cluster.
    """
    with TaskGroup(
        group_id=self.task_test_config.benchmark_id, prefix_group_id=True
    ) as group:
      job_body = self._get_job_manifest()
      gke.run_job.override(group_id="run_model")(
          job_body, self.task_gcp_config, self.cluster_name, self.job_create_timeout
      )

    return group

  def _get_job_manifest(self):
    return {
        "apiVersion": "batch/v1",
        "kind": "Job",
        "metadata": {
            "generateName": f"{self.task_test_config.benchmark_id}-",
            "labels": {
                "accelerator": self.task_test_config.accelerator.name,
                "benchmarkId": self.task_test_config.benchmark_id,
            },
        },
        "spec": {
            "activeDeadlineSeconds": int(
                datetime.timedelta(
                    minutes=self.task_test_config.time_out_in_min or 60
                ).total_seconds()
            ),
            "backoffLimit": 0,
            "completionMode": "Indexed",
            "completions": self.task_test_config.num_hosts,
            "parallelism": self.task_test_config.num_hosts,
            "template": {
                "metadata": {
                    # Matches `headless-svc` in GKE cluster. See deployments directory.
                    "labels": {"headless-svc": "true"},
                },
                "spec": {
                    "subdomain": "headless-svc",
                    "nodeSelector": {
                        "cloud.google.com/gke-accelerator": self.task_test_config.accelerator.accelerator_type,
                    },
                    "restartPolicy": "Never",
                    "containers": [
                        {
                            "name": "main",
                            "image": self.task_test_config.docker_image,
                            "imagePullPolicy": "Always",
                            "command": shlex.split(self.task_test_config.setup_script),
                            "args": shlex.split(self.task_test_config.test_script),
                            "resources": {
                                "limits": {
                                    "nvidia.com/gpu": self.task_test_config.accelerator.count,
                                }
                            },
                            "env": [
                                {
                                    "name": "POD_NAME",
                                    "valueFrom": {
                                        "fieldRef": {"fieldPath": "metadata.name"}
                                    },
                                },
                                {
                                    "name": "POD_NAMESPACE",
                                    "valueFrom": {
                                        "fieldRef": {"fieldPath": "metadata.namespace"}
                                    },
                                },
                                {
                                    "name": "JOB_NAME",
                                    "valueFrom": {
                                        "fieldRef": {
                                            "fieldPath": "metadata.labels['job-name']"
                                        }
                                    },
                                },
                            ],
                            "volumeMounts": [
                                {
                                    "mountPath": "/dev/shm",
                                    "name": "dshm",
                                    "readOnly": False,
                                },
                            ],
                        },
                    ],
                    "volumes": [
                        {"emptyDir": {"medium": "Memory"}, "name": "dshm"},
                    ],
                },
            },
        },
    }
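
Because `_get_job_manifest` is pure data, the manifest can be rendered and inspected without touching a cluster. A minimal sketch, assuming the `US_CENTRAL1` config defined in `pytorchxla_torchvision.py` above and PyYAML on the path:

```python
# Sketch: render the Job manifest for the new v100 2x2 test and sanity-check
# the indexed-completion fields locally, without submitting anything to GKE.
import yaml

from xlml.apis import task, test_config

cfg = test_config.JSonnetGpuTest.from_pytorch(
    "pt-nightly-resnet50-mp-fake-v100-x2x2"
)
gke_task = task.GpuGkeTask(cfg, US_CENTRAL1, "gpu-uc1")  # US_CENTRAL1 as in the DAG
manifest = gke_task._get_job_manifest()

assert manifest["spec"]["completionMode"] == "Indexed"
assert manifest["spec"]["completions"] == cfg.num_hosts
print(yaml.safe_dump(manifest, sort_keys=False))
```
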