Multi-host GPU tests on GKE #111

Merged
merged 30 commits into master from wcromar/multi-gpu-test on Feb 22, 2024

Commits (30)
72c4afc
Multi-GPU tests on GKE
will-cromar Feb 12, 2024
6c4dcfd
Fit new test type into task API
will-cromar Feb 13, 2024
dbb523f
split up `deploy_job`
will-cromar Feb 15, 2024
5eff685
limit epochs
will-cromar Feb 15, 2024
4842cdf
Make cluster name a parameter
will-cromar Feb 16, 2024
d3cd9c6
remove terraform from this PR
will-cromar Feb 16, 2024
57dc55d
use new cluster
will-cromar Feb 16, 2024
074c4e2
comment
will-cromar Feb 16, 2024
9d74f72
set timeout
will-cromar Feb 16, 2024
24d030c
move to main dag
will-cromar Feb 16, 2024
c2e0d3d
format
will-cromar Feb 16, 2024
3a75b0a
factor out get_authenticated_client
will-cromar Feb 16, 2024
61a28dd
naming
will-cromar Feb 16, 2024
bad8b38
formatting
will-cromar Feb 16, 2024
b08623e
format with correct version
will-cromar Feb 16, 2024
c59a53f
add commented dep
will-cromar Feb 20, 2024
eebc648
docstrings
will-cromar Feb 20, 2024
33a7de4
update some comments
will-cromar Feb 20, 2024
191f582
implement timeout
will-cromar Feb 20, 2024
9907376
typo
will-cromar Feb 20, 2024
bb27148
formatting
will-cromar Feb 20, 2024
4fa2653
format with correct pyink version
will-cromar Feb 20, 2024
3c55699
remove extra dag
will-cromar Feb 20, 2024
bd4099b
fix data type
will-cromar Feb 20, 2024
e3de889
formatting
will-cromar Feb 20, 2024
163a809
return -> raise
will-cromar Feb 20, 2024
c11f120
add myself to a TODO
will-cromar Feb 22, 2024
ce82e82
TODO for big class
will-cromar Feb 22, 2024
397d7cb
`run_model`
will-cromar Feb 22, 2024
b0ecd54
Merge branch 'master' into wcromar/multi-gpu-test
will-cromar Feb 22, 2024
3 changes: 2 additions & 1 deletion .github/requirements.txt
@@ -7,4 +7,5 @@ google-cloud-container
google-cloud-tpu>=1.16.0
jsonlines
tensorflow-cpu
apache-airflow-providers-cncf-kubernetes
apache-airflow-providers-cncf-kubernetes
kubernetes
11 changes: 6 additions & 5 deletions dags/legacy_test/templates/gpus.libsonnet
@@ -18,11 +18,14 @@ local base = import 'base.libsonnet';
GPUSpec:: base.BaseAccelerator {
local gpu = self,

name: '%(version)s-x%(count)d' % gpu,
name: '%(version)s-x%(count)dx%(num_hosts)d' % gpu,
type: 'gpu',
version: error 'Must specify GPUSpec `version`',
count: 1,
replicas: gpu.count,
num_hosts: 1,
// Label used in GCE API
accelerator_type: error 'Must specify GPUSpec `accelerator_type`',

// Ignore TPU settings.
PodTemplate(_):: {
@@ -43,8 +46,6 @@
},
},

teslaK80: self.GPUSpec { version: 'k80' },
teslaV100: self.GPUSpec { version: 'v100' },
teslaA100: self.GPUSpec { version: 'a100' },
teslaT4: self.GPUSpec { version: 't4' },
teslaV100: self.GPUSpec { version: 'v100', accelerator_type: 'nvidia-tesla-v100' },
teslaA100: self.GPUSpec { version: 'a100', accelerator_type: 'nvidia-tesla-a100' },
}
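
Note (illustration, not part of the diff): the updated `name` template now encodes the host count alongside the per-host GPU count. A minimal Python sketch of the same %-style formatting, assuming a 2-host, 2-GPU V100 config, shows how the `v100-x2x2` identifier referenced later in pytorchxla_torchvision.py comes about:

    # Sketch only: mirrors the Jsonnet template '%(version)s-x%(count)dx%(num_hosts)d' % gpu.
    gpu = {"version": "v100", "count": 2, "num_hosts": 2}
    print("%(version)s-x%(count)dx%(num_hosts)d" % gpu)  # -> v100-x2x2
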
33 changes: 31 additions & 2 deletions dags/legacy_test/tests/pytorch/nightly/common.libsonnet
@@ -24,7 +24,7 @@ local volumes = import 'templates/volumes.libsonnet';
tpuSettings+: {
softwareVersion: 'pytorch-nightly',
},
imageTag: 'nightly_3.7',
imageTag: 'nightly_3.8',
},
PyTorchTest:: common.PyTorchTest + Nightly {
local config = self,
@@ -120,7 +120,36 @@
},
GpuMixin:: {
local config = self,
imageTag+: '_cuda_11.8',
imageTag+: '_cuda_12.1',

entrypoint: [
'bash',
'-cxue',
|||
export PATH=/usr/local/nvidia/bin${PATH:+:${PATH}}
export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/nvidia/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}

nvidia-smi
pip3 uninstall -y torch torchvision
pip3 install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu

mkdir pytorch
wget https://github.com/pytorch/xla/archive/refs/heads/master.tar.gz -O - | tar xzf -
mv xla-master pytorch/xla

export PJRT_DEVICE=CUDA

# Run whatever is in `command` here
"${@:0}"
|||,
],
command: [
'torchrun',
'--nnodes=%d' % config.accelerator.num_hosts,
'--node_rank=$(JOB_COMPLETION_INDEX)',
'--nproc_per_node=%d' % config.accelerator.count,
'--rdzv_endpoint=$(JOB_NAME)-0.headless-svc:12355',
] + super.command[1:],

podTemplate+:: {
spec+: {
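
Note (illustration, not part of the diff): with the GpuMixin above, each pod's entrypoint installs a CPU-only nightly torch/torchvision, fetches pytorch/xla from master, exports PJRT_DEVICE=CUDA, and then hands off to the `command` list via "${@:0}". A rough Python sketch of how that command list renders for an assumed 2-host, 2-GPU config; the $(...) placeholders are resolved per pod at runtime:

    # Sketch only: assumed accelerator values for a v100 2x2 test.
    config = {"num_hosts": 2, "count": 2}
    command = [
        "torchrun",
        "--nnodes=%d" % config["num_hosts"],                 # two hosts in the job
        "--node_rank=$(JOB_COMPLETION_INDEX)",               # per-pod index from the Indexed Job
        "--nproc_per_node=%d" % config["count"],             # two GPUs per host
        "--rdzv_endpoint=$(JOB_NAME)-0.headless-svc:12355",  # rendezvous on the index-0 pod
    ]  # + the original training script arguments (super.command[1:])
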
20 changes: 16 additions & 4 deletions dags/legacy_test/tests/pytorch/nightly/resnet50-mp.libsonnet
@@ -141,20 +141,31 @@ local tpus = import 'templates/tpus.libsonnet';

local gpu = self.gpu,
gpu:: common.GpuMixin {
local config = self,

cpu: '7.0',
memory: '40Gi',

// Disable XLA metrics report on GPU
command+: [
'--pjrt_distributed',
'--nometrics_debug',
'--num_epochs=2',
],
flags+: {
modelDir: null,
},

jobTemplate+:: {
spec+: {
completionMode: 'Indexed',
completions: config.accelerator.num_hosts,
parallelism: config.accelerator.num_hosts,
}
},
},
local v100x4 = self.v100x4,
v100x4:: gpu {
accelerator: gpus.teslaV100 { count: 4 },
local v100x2x2 = self.v100x2x2,
v100x2x2:: gpu {
accelerator: gpus.teslaV100 { count: 2, num_hosts: 2 },
},

local pjrt_ddp = self.pjrt_ddp,
@@ -194,6 +205,7 @@
},

configs: [
resnet50 + fake_data + v100x2x2 + timeouts.Hours(3),
// PJRT
resnet50 + fake_data + v2_8 + timeouts.Hours(3) + pjrt,
resnet50 + fake_data + v3_8 + timeouts.Hours(2) + pjrt,
13 changes: 13 additions & 0 deletions dags/pytorch_xla/pytorchxla_torchvision.py
@@ -32,6 +32,13 @@
metric_config.DatasetOption.XLML_DATASET,
)

US_CENTRAL1 = gcp_config.GCPConfig(
Project.CLOUD_ML_AUTO_SOLUTIONS.value,
# HACK: use region in place of zone, since clusters are regional
zone="us-central1",
dataset_name=...,
)


with models.DAG(
dag_id="pytorchxla-torchvision",
@@ -59,3 +66,9 @@

mnist_v2_8 >> resnet_v2_8
mnist_v2_8 >> resnet_v4_8

resnet_v100_2x2 = task.GpuGkeTask(
test_config.JSonnetGpuTest.from_pytorch("pt-nightly-resnet50-mp-fake-v100-x2x2"),
US_CENTRAL1,
"gpu-uc1",
).run()
1 change: 1 addition & 0 deletions deployment/cloud_composer_template.tf
@@ -141,6 +141,7 @@ resource "google_composer_environment" "example_environment" {
# google-cloud-container = ""
# tensorflow-cpu = ""
# apache-airflow-providers-cncf-kubernetes = ""
# kubernetes = ""
}
}

123 changes: 121 additions & 2 deletions xlml/apis/task.py
@@ -17,12 +17,13 @@
import abc
import dataclasses
import datetime
from typing import Optional, Tuple
import shlex
from typing import Any, Dict, Optional, Tuple
import airflow
from airflow.models.taskmixin import DAGNode
from airflow.utils.task_group import TaskGroup
from xlml.apis import gcp_config, metric_config, test_config
from xlml.utils import gpu, metric, name_format, ssh, tpu, xpk, startup_script
from xlml.utils import gpu, metric, name_format, ssh, tpu, xpk, gke, startup_script


class BaseTask(abc.ABC):
@@ -503,3 +504,121 @@ def clean_up(self, resource: airflow.XComArg, project_id: str, zone: str) -> DAG
AirflowTaskTimeout: An error occurs when execution_timeout is breached.
"""
return gpu.delete_resource.override(group_id="clean_up")(resource, project_id, zone)


@dataclasses.dataclass
class GpuGkeTask(BaseTask):
"""This is a class to set up tasks for GPU on a GKE cluster.

Attributes:
task_test_config: Test config for the GPU test to run (parsed from JSonnet).
task_gcp_config: GCP project and region configuration for the test.
cluster_name: Name of the GKE cluster to run the test on.
job_create_timeout: Amount of time to wait for all pods to become active.
"""

task_test_config: test_config.JSonnetGpuTest
task_gcp_config: gcp_config.GCPConfig
cluster_name: str
job_create_timeout: datetime.timedelta = datetime.timedelta(minutes=10)
# TODO: metrics
# task_metric_config: Optional[metric_config.MetricConfig] = None

def run(self) -> DAGNode:
"""Run a test job.

Returns:
A task group that runs the given test config on a GKE cluster.
"""
with TaskGroup(
group_id=self.task_test_config.benchmark_id, prefix_group_id=True
) as group:
job_body = self._get_job_manifest()
gke.run_job(
job_body, self.task_gcp_config, self.cluster_name, self.job_create_timeout
)

return group

def _get_job_manifest(self):
return {
"apiVersion": "batch/v1",
"kind": "Job",
"metadata": {
"generateName": f"{self.task_test_config.benchmark_id}-",
"labels": {
"accelerator": self.task_test_config.accelerator.name,
"benchmarkId": self.task_test_config.benchmark_id,
},
},
"spec": {
"activeDeadlineSeconds": int(
datetime.timedelta(
minutes=self.task_test_config.time_out_in_min or 60
).total_seconds()
),
"backoffLimit": 0,
"completionMode": "Indexed",
"completions": self.task_test_config.num_hosts,
"parallelism": self.task_test_config.num_hosts,
"template": {
"metadata": {
# Matches `headless-svc` in GKE cluster. See deployments directory.
"labels": {"headless-svc": "true"},
},
"spec": {
"subdomain": "headless-svc",
"nodeSelector": {
"cloud.google.com/gke-accelerator": self.task_test_config.accelerator.accelerator_type,
},
"restartPolicy": "Never",
"containers": [
{
"name": "main",
"image": self.task_test_config.docker_image,
"imagePullPolicy": "Always",
"command": shlex.split(self.task_test_config.setup_script),
"args": shlex.split(self.task_test_config.test_script),
"resources": {
"limits": {
"nvidia.com/gpu": self.task_test_config.accelerator.count,
}
},
"env": [
{
"name": "POD_NAME",
"valueFrom": {
"fieldRef": {"fieldPath": "metadata.name"}
},
},
{
"name": "POD_NAMESPACE",
"valueFrom": {
"fieldRef": {"fieldPath": "metadata.namespace"}
},
},
{
"name": "JOB_NAME",
"valueFrom": {
"fieldRef": {
"fieldPath": "metadata.labels['job-name']"
}
},
},
],
"volumeMounts": [
{
"mountPath": "/dev/shm",
"name": "dshm",
"readOnly": False,
},
],
},
],
"volumes": [
{"emptyDir": {"medium": "Memory"}, "name": "dshm"},
],
},
},
},
}
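
Note (illustration, not part of the diff): the manifest above creates an Indexed Job whose pods share the `headless-svc` subdomain, so each pod should be reachable as <job-name>-<index>.headless-svc inside the cluster; that is the address the torchrun --rdzv_endpoint in common.libsonnet points at. A small Python sketch, assuming a job name produced by generateName:

    # Sketch only: hypothetical generated job name (generateName appends a random suffix).
    job_name = "pt-nightly-resnet50-mp-fake-v100-x2x2-abc12"
    rank0_host = f"{job_name}-0.headless-svc"  # index-0 pod, resolved via the headless service
    rdzv_endpoint = f"{rank0_host}:12355"      # matches --rdzv_endpoint in common.libsonnet
    print(rdzv_endpoint)
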