Disable XRT in default CI jobs (#5135)

* PJRT_ONLY -> DISABLE_XRT
* Add XRT CPU test workflow
* Disable XRT in coverage
* Disable XRT in TPU CI
* Use `latest-xrt` in image tag
* Fix syntax
* Only run python tests for XRT
* Remove validation for `disable_xrt`
* Move autocast test with other python tests
* Fix path typo
pytorch · Jun 8, 2023 · 997b2e7 · 997b2e7
1 parent 0ba21fa
commit 997b2e7
Show file tree

Hide file tree

Showing 13 changed files with 117 additions and 19 deletions.
diff --git a/.bazelrc b/.bazelrc
@@ -76,7 +76,7 @@ test:tpu --local_test_jobs=1
 test:cuda --local_test_jobs=1
 
 # Exclude XRT from the build
-build:pjrt_only --define=pjrt_only=true
+build:disable_xrt --define=disable_xrt=true
 
 #########################################################################
 # RBE config options below.

diff --git a/.circleci/common.sh b/.circleci/common.sh
@@ -131,8 +131,6 @@ function run_torch_xla_tests() {
       chmod -R 755 ~/htmlcov
     else
       ./test/run_tests.sh
-      # only run test_autocast for cpu and gpu on circleCI.
-      python test/test_autocast.py
 
       # GPU tests
       if [ -x "$(command -v nvidia-smi)" ]; then

diff --git a/.circleci/test_xrt.sh b/.circleci/test_xrt.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+set -ex
+
+source ./xla_env
+source .circleci/common.sh
+
+PYTORCH_DIR=/tmp/pytorch
+XLA_DIR=$PYTORCH_DIR/xla
+USE_COVERAGE="${USE_COVERAGE:-0}"
+
+# Needs to be kept in sync with .jenkins/pytorch/common_utils.sh in pytorch/pytorch.
+TORCHVISION_COMMIT="$(cat $PYTORCH_DIR/.github/ci_commit_pins/vision.txt)"
+
+function pip_install() {
+  # retry 3 times
+  # old versions of pip don't have the "--progress-bar" flag
+  pip install --progress-bar off "$@" || pip install --progress-bar off "$@" || pip install --progress-bar off "$@" ||\
+  pip install "$@" || pip install "$@" || pip install "$@"
+}
+
+function install_torchvision() {
+  pip_install --user --no-use-pep517 "git+https://github.com/pytorch/vision.git@$TORCHVISION_COMMIT"
+}
+
+install_torchvision
+
+./test/run_tests.sh
diff --git a/.github/workflows/_build.yml b/.github/workflows/_build.yml
@@ -15,6 +15,16 @@ on:
         type: string
         description: Runner type for the test
         default: linux.12xlarge
+      cuda:
+        required: false
+        type: string
+        description: Whether to build XLA with CUDA
+        default: 1
+      disable_xrt:
+        required: false
+        type: string
+        description: Whether to disable XRT in the build
+        default: 0
 
     secrets:
       gcloud-service-key:
@@ -37,6 +47,8 @@ jobs:
       WORKDIR: /var/lib/jenkins/workspace
       SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
       GCLOUD_SERVICE_KEY: ${{ secrets.gcloud-service-key }}
+      XLA_CUDA: ${{ inputs.cuda }}
+      DISABLE_XRT: ${{ inputs.disable_xrt }}
     steps:
       - name: Setup Linux
         uses: pytorch/test-infra/.github/actions/setup-linux@main
@@ -76,8 +88,8 @@ jobs:
         run: |
           echo "declare -x SCCACHE_BUCKET=${SCCACHE_BUCKET}" | docker exec -i "${pid}" sh -c "cat >> env"
           echo "declare -x CC=clang-8 CXX=clang++-8" | docker exec -i "${pid}" sh -c "cat >> xla_env"
-          echo "declare -x XLA_USE_XRT=1" | docker exec -i "${pid}" sh -c "cat >> xla_env"
-          echo "declare -x XLA_CUDA=1" | docker exec -i "${pid}" sh -c "cat >> xla_env"
+          echo "declare -x DISABLE_XRT=${DISABLE_XRT}" | docker exec -i "${pid}" sh -c "cat >> xla_env"
+          echo "declare -x XLA_CUDA=${XLA_CUDA}" | docker exec -i "${pid}" sh -c "cat >> xla_env"
           echo "declare -x BAZEL_REMOTE_CACHE=1" | docker exec -i "${pid}" sh -c "cat >> xla_env"
           echo "${GCLOUD_SERVICE_KEY}" | docker exec -i "${pid}" sh -c "cat >> default_credentials.json"
 
@@ -95,7 +107,13 @@ jobs:
         id: upload-docker-image
         shell: bash
         run: |
-          export COMMIT_DOCKER_IMAGE="${ECR_DOCKER_IMAGE_BASE}:latest-${GITHUB_SHA}"
+          if [[ ${DISABLE_XRT} == 1 ]]; then
+            image_tag_base=latest
+          else
+            image_tag_base=latest-xrt
+          fi
+
+          export COMMIT_DOCKER_IMAGE="${ECR_DOCKER_IMAGE_BASE}:${image_tag_base}-${GITHUB_SHA}"
           time docker commit "${pid}" "${COMMIT_DOCKER_IMAGE}"
           time docker push "${COMMIT_DOCKER_IMAGE}"
           echo "docker-image=${COMMIT_DOCKER_IMAGE}" >> "${GITHUB_OUTPUT}"

diff --git a/.github/workflows/_test.yml b/.github/workflows/_test.yml
@@ -22,6 +22,21 @@ on:
         default: 270
         description: |
           Set the maximum (in minutes) how long the workflow should take to finish
+      disable-pjrt:
+        required: false
+        type: string
+        default: 0
+        description: Whether to disable PJRT tests
+      disable-xrt:
+        required: false
+        type: string
+        default: 0
+        description: Whether to disable XRT tests
+      test-script:
+        required: false
+        type: string
+        default: test.sh
+        description: Which test script to run
 
     secrets:
       gcloud-service-key:
@@ -36,6 +51,9 @@ jobs:
       WORKDIR: /var/lib/jenkins/workspace
       GCLOUD_SERVICE_KEY: ${{ secrets.gcloud-service-key }}
       USE_COVERAGE: ${{ inputs.collect-coverage && '1' || '0' }}
+      XLA_SKIP_XRT_TESTS: ${{ inputs.disable-xrt }}
+      XLA_SKIP_TORCH_OP_TESTS: ${{ inputs.disable-pjrt }}
+      XLA_SKIP_MP_OP_TESTS: ${{ inputs.disable-pjrt }}
     steps:
       - name: Setup Linux
         uses: pytorch/test-infra/.github/actions/setup-linux@main
@@ -70,12 +88,12 @@ jobs:
         run: |
           echo "DOCKER_IMAGE: ${DOCKER_IMAGE}"
           docker pull "${DOCKER_IMAGE}"
-          pid=$(docker run ${GPU_FLAG:-} -e USE_COVERAGE -t -d -w "$WORKDIR" "${DOCKER_IMAGE}")
+          pid=$(docker run ${GPU_FLAG:-} -e USE_COVERAGE -e XLA_SKIP_XRT_TESTS -e XLA_SKIP_TORCH_OP_TESTS -e XLA_SKIP_MP_OP_TESTS -t -d -w "$WORKDIR" "${DOCKER_IMAGE}")
           echo "${GCLOUD_SERVICE_KEY}" | docker exec -i "${pid}" sh -c "cat >> /tmp/pytorch/xla/default_credentials.json"
           echo "pid=${pid}" >> "${GITHUB_ENV}"
       - name: Test
         shell: bash
-        run: docker exec -u jenkins "${pid}" bash -c '. ~/.bashrc && .circleci/test.sh'
+        run: docker exec -u jenkins "${pid}" bash -c '. ~/.bashrc && .circleci/${{ inputs.test-script }}'
       - name: Upload coverage results
         if: ${{ inputs.collect-coverage }}
         shell: bash

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
@@ -19,6 +19,8 @@ jobs:
     with:
       ecr-docker-image-base: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base
       gcr-docker-image: gcr.io/tpu-pytorch/xla_base:latest
+      disable_xrt: 1
+      cuda: 1
     secrets:
       gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
 
@@ -29,6 +31,7 @@ jobs:
     with:
       docker-image: ${{ needs.build.outputs.docker-image }}
       timeout-minutes: 90
+      disable-xrt: 1
     secrets:
       gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
 
@@ -40,6 +43,31 @@ jobs:
       docker-image: ${{ needs.build.outputs.docker-image }}
       runner: linux.8xlarge.nvidia.gpu
       timeout-minutes: 180
+      disable-xrt: 1
+    secrets:
+      gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
+
+  build-xrt:
+    name: "Build XLA with XRT"
+    uses: ./.github/workflows/_build.yml
+    with:
+      ecr-docker-image-base: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base
+      gcr-docker-image: gcr.io/tpu-pytorch/xla_base:latest
+      disable_xrt: 0
+      cuda: 0
+    secrets:
+      gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
+
+  test-xrt-cpu:
+    name: "Test XRT with CPU"
+    uses: ./.github/workflows/_test.yml
+    needs: build-xrt
+    with:
+      docker-image: ${{ needs.build-xrt.outputs.docker-image }}
+      timeout-minutes: 90
+      disable-xrt: 0
+      disable-pjrt: 1
+      test-script: 'test_xrt.sh'
     secrets:
       gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
 
@@ -52,6 +80,7 @@ jobs:
       docker-image: ${{ needs.build.outputs.docker-image }}
       collect-coverage: true
       timeout-minutes: 120
+      disable-xrt: 1
     secrets:
       gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
 
@@ -65,6 +94,7 @@ jobs:
       runner: linux.8xlarge.nvidia.gpu
       timeout-minutes: 210
       collect-coverage: true
+      disable-xrt: 1
     secrets:
       gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
 

diff --git a/infra/ansible/config/env.yaml b/infra/ansible/config/env.yaml
@@ -32,6 +32,7 @@ build_env:
     XLA_SANDBOX_BUILD: 1
     BAZEL_REMOTE_CACHE: 1
     SILO_NAME: "cache-silo-{{ arch }}-{{ accelerator }}"
+    DISABLE_XRT: "{{ disable_xrt }}"
 
   amd64:
     ARCH: amd64

diff --git a/infra/ansible/config/vars.yaml b/infra/ansible/config/vars.yaml
@@ -7,4 +7,6 @@ clang_version: 10
 # PyTorch and PyTorch/XLA wheel versions.
 package_version: 2.0
 # If set to true, wheels will be renamed to $WHEEL_NAME-nightly-cp38-cp38-linux_x86_64.whl.
-nightly_release: false
+nightly_release: false
+# Whether to disable XRT during build
+disable_xrt: 0
diff --git a/infra/tpu-pytorch/test_triggers.tf b/infra/tpu-pytorch/test_triggers.tf
@@ -27,6 +27,7 @@ module "tpu_e2e_tests" {
   ansible_vars = {
     arch            = "amd64"
     accelerator     = "tpu"
+    disable_xrt     = "1"
     pytorch_git_rev = "main"
     # The commit ID associated with the triggered build. Substituted when
     # Cloud Build is triggered.

diff --git a/setup.py b/setup.py
@@ -40,7 +40,7 @@
 #   TPUVM_MODE=0
 #     whether to build for TPU
 #
-#   PJRT_ONLY=0
+#   DISABLE_XRT=0
 #     whether to exclude XRT from the build
 #
 #   SILO_NAME=""
@@ -253,8 +253,8 @@ def bazel_build(self, ext):
     if _check_env_flag('TPUVM_MODE'):
       bazel_argv.append('--config=tpu')
 
-    if _check_env_flag('PJRT_ONLY'):
-      bazel_argv.append('--config=pjrt_only')
+    if _check_env_flag('DISABLE_XRT'):
+      bazel_argv.append('--config=disable_xrt')
 
     # Remote cache authentication.
     if _check_env_flag('BAZEL_REMOTE_CACHE'):

diff --git a/test/run_tests.sh b/test/run_tests.sh
@@ -138,6 +138,7 @@ function run_torchrun {
 function run_xrt_tests {
   # For features not supported in PJRT
   echo "Running XRT tests"
+  run_xrt "$CDIR/test_operations.py" "$@" --verbosity=$VERBOSITY
   run_opbyop  "$CDIR/test_operations.py" "$@" --verbosity=$VERBOSITY
   run_async_scalar  "$CDIR/test_operations.py" "$@" --verbosity=$VERBOSITY
   run_torchrun  "$CDIR/test_allreduce_torchrun.py"
@@ -194,6 +195,7 @@ function run_xla_op_tests {
   run_test "$CDIR/test_operations_hlo.py" "$@" --verbosity=$VERBOSITY
   run_test "$CDIR/test_input_output_aliases.py"
   run_test "$CDIR/test_torch_distributed_xla_backend.py"
+  run_test "$CDIR/test_autocast.py"
 }
 
 function run_op_tests {

diff --git a/third_party/xla_client/BUILD b/third_party/xla_client/BUILD
@@ -25,8 +25,8 @@ exports_files([
 ])
 
 config_setting(
-  name = "pjrt_only",
-  define_values = {"pjrt_only": "true"},
+  name = "disable_xrt",
+  define_values = {"disable_xrt": "true"},
 )
 
 tf_proto_library_cc(
@@ -69,7 +69,7 @@ cc_library(
     "runtime.h",
   ],
   local_defines = select({
-    ":pjrt_only": ["PJRT_ONLY"],
+    ":disable_xrt": ["DISABLE_XRT"],
     "//conditions:default": [],
   }),
   deps = [
@@ -78,7 +78,7 @@ cc_library(
     ":pjrt_computation_client",
     "@org_tensorflow//tensorflow/tsl/platform:stacktrace",
   ] + select({
-    ":pjrt_only": [],
+    ":disable_xrt": [],
     "//conditions:default": [
       ":xrt_computation_client",
       ":xrt_local_service",

diff --git a/third_party/xla_client/runtime.cc b/third_party/xla_client/runtime.cc
@@ -3,7 +3,7 @@
 #include "third_party/xla_client/env_vars.h"
 #include "third_party/xla_client/pjrt_computation_client.h"
 
-#ifndef PJRT_ONLY
+#ifndef DISABLE_XRT
 #include "third_party/xla_client/xrt_computation_client.h"
 #include "third_party/xla_client/xrt_local_service.h"
 #endif
@@ -24,7 +24,7 @@ ComputationClient* CreateClient() {
   if (sys_util::GetEnvString(env::kEnvPjRtDevice, "") != "") {
     client = new PjRtComputationClient();
   } else {
-#ifndef PJRT_ONLY
+#ifndef DISABLE_XRT
     client = new XrtComputationClient();
 #else
     XLA_ERROR() << "$PJRT_DEVICE is not set." << std::endl;
@@ -49,7 +49,7 @@ ComputationClient* GetComputationClientIfInitialized() {
 }
 
 void RunLocalService(uint64_t service_port) {
-#ifndef PJRT_ONLY
+#ifndef DISABLE_XRT
   try {
     XrtLocalService* service = new XrtLocalService(
         "localservice|localhost:" + std::to_string(service_port),