Skip to content

Commit

Permalink
Disable XRT in default CI jobs (#5135)
Browse files Browse the repository at this point in the history
* PJRT_ONLY -> DISABLE_XRT

* Add XRT CPU test workflow

* Disable XRT in coverage

* Disable XRT in TPU CI

* Use `latest-xrt` in image tag

* Fix syntax

* Only run python tests for XRT

* Remove validation for `disable_xrt`

* Move autocast test with other python tests

* Fix path typo
  • Loading branch information
will-cromar authored Jun 8, 2023
1 parent 0ba21fa commit 997b2e7
Show file tree
Hide file tree
Showing 13 changed files with 117 additions and 19 deletions.
2 changes: 1 addition & 1 deletion .bazelrc
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ test:tpu --local_test_jobs=1
test:cuda --local_test_jobs=1

# Exclude XRT from the build
build:pjrt_only --define=pjrt_only=true
build:disable_xrt --define=disable_xrt=true

#########################################################################
# RBE config options below.
Expand Down
2 changes: 0 additions & 2 deletions .circleci/common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -131,8 +131,6 @@ function run_torch_xla_tests() {
chmod -R 755 ~/htmlcov
else
./test/run_tests.sh
# only run test_autocast for cpu and gpu on circleCI.
python test/test_autocast.py

# GPU tests
if [ -x "$(command -v nvidia-smi)" ]; then
Expand Down
28 changes: 28 additions & 0 deletions .circleci/test_xrt.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/bin/bash
# CI entry point for this script: installs the pinned torchvision commit and
# then runs ./test/run_tests.sh.

# -e: abort on the first failing command; -x: echo each command as it runs.
set -ex

# NOTE(review): ./xla_env presumably holds build-time settings exported when
# the CI image was built — confirm against the build workflow.
source ./xla_env
# Shared CI helper functions.
source .circleci/common.sh

PYTORCH_DIR=/tmp/pytorch
XLA_DIR=$PYTORCH_DIR/xla
# Defaults to 0 when the caller leaves USE_COVERAGE unset or empty.
USE_COVERAGE="${USE_COVERAGE:-0}"

# Needs to be kept in sync with .jenkins/pytorch/common_utils.sh in pytorch/pytorch.
# The torchvision commit to install is read from the pin file in the PyTorch checkout.
TORCHVISION_COMMIT="$(cat $PYTORCH_DIR/.github/ci_commit_pins/vision.txt)"

function pip_install() {
  # Run `pip install` with the given arguments, retrying on failure.
  #
  # Up to 3 attempts are made with the progress bar disabled. Old versions of
  # pip don't have the "--progress-bar" flag, so up to 3 more attempts follow
  # without it. The function's status is that of the first successful attempt,
  # or of the very last attempt if all of them fail.
  local attempt

  for attempt in 1 2 3; do
    if pip install --progress-bar off "$@"; then
      return 0
    fi
  done

  for attempt in 1 2; do
    if pip install "$@"; then
      return 0
    fi
  done

  # Last attempt is deliberately unguarded so its exit status propagates to
  # the caller (and trips `set -e` exactly like the tail of an `||` chain).
  pip install "$@"
}

function install_torchvision() {
  # Install torchvision from source at the commit held in TORCHVISION_COMMIT,
  # into the user site-packages and without PEP 517 build isolation.
  local vision_requirement="git+https://github.com/pytorch/vision.git@$TORCHVISION_COMMIT"

  pip_install --user --no-use-pep517 "$vision_requirement"
}

# Install torchvision first, then hand off to the test runner.
install_torchvision

# With `set -e` in effect, a non-zero exit from the test runner fails the script.
./test/run_tests.sh
24 changes: 21 additions & 3 deletions .github/workflows/_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,16 @@ on:
type: string
description: Runner type for the test
default: linux.12xlarge
cuda:
required: false
type: string
description: Whether to build XLA with CUDA
default: 1
disable_xrt:
required: false
type: string
description: Whether to disable XRT in the build
default: 0

secrets:
gcloud-service-key:
Expand All @@ -37,6 +47,8 @@ jobs:
WORKDIR: /var/lib/jenkins/workspace
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
GCLOUD_SERVICE_KEY: ${{ secrets.gcloud-service-key }}
XLA_CUDA: ${{ inputs.cuda }}
DISABLE_XRT: ${{ inputs.disable_xrt }}
steps:
- name: Setup Linux
uses: pytorch/test-infra/.github/actions/setup-linux@main
Expand Down Expand Up @@ -76,8 +88,8 @@ jobs:
run: |
echo "declare -x SCCACHE_BUCKET=${SCCACHE_BUCKET}" | docker exec -i "${pid}" sh -c "cat >> env"
echo "declare -x CC=clang-8 CXX=clang++-8" | docker exec -i "${pid}" sh -c "cat >> xla_env"
echo "declare -x XLA_USE_XRT=1" | docker exec -i "${pid}" sh -c "cat >> xla_env"
echo "declare -x XLA_CUDA=1" | docker exec -i "${pid}" sh -c "cat >> xla_env"
echo "declare -x DISABLE_XRT=${DISABLE_XRT}" | docker exec -i "${pid}" sh -c "cat >> xla_env"
echo "declare -x XLA_CUDA=${XLA_CUDA}" | docker exec -i "${pid}" sh -c "cat >> xla_env"
echo "declare -x BAZEL_REMOTE_CACHE=1" | docker exec -i "${pid}" sh -c "cat >> xla_env"
echo "${GCLOUD_SERVICE_KEY}" | docker exec -i "${pid}" sh -c "cat >> default_credentials.json"
Expand All @@ -95,7 +107,13 @@ jobs:
id: upload-docker-image
shell: bash
run: |
export COMMIT_DOCKER_IMAGE="${ECR_DOCKER_IMAGE_BASE}:latest-${GITHUB_SHA}"
if [[ ${DISABLE_XRT} == 1 ]]; then
image_tag_base=latest
else
image_tag_base=latest-xrt
fi
export COMMIT_DOCKER_IMAGE="${ECR_DOCKER_IMAGE_BASE}:${image_tag_base}-${GITHUB_SHA}"
time docker commit "${pid}" "${COMMIT_DOCKER_IMAGE}"
time docker push "${COMMIT_DOCKER_IMAGE}"
echo "docker-image=${COMMIT_DOCKER_IMAGE}" >> "${GITHUB_OUTPUT}"
Expand Down
22 changes: 20 additions & 2 deletions .github/workflows/_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,21 @@ on:
default: 270
description: |
Set the maximum (in minutes) how long the workflow should take to finish
disable-pjrt:
required: false
type: string
default: 0
description: Whether to disable PJRT tests
disable-xrt:
required: false
type: string
default: 0
description: Whether to disable XRT tests
test-script:
required: false
type: string
default: test.sh
description: Which test script to run

secrets:
gcloud-service-key:
Expand All @@ -36,6 +51,9 @@ jobs:
WORKDIR: /var/lib/jenkins/workspace
GCLOUD_SERVICE_KEY: ${{ secrets.gcloud-service-key }}
USE_COVERAGE: ${{ inputs.collect-coverage && '1' || '0' }}
XLA_SKIP_XRT_TESTS: ${{ inputs.disable-xrt }}
XLA_SKIP_TORCH_OP_TESTS: ${{ inputs.disable-pjrt }}
XLA_SKIP_MP_OP_TESTS: ${{ inputs.disable-pjrt }}
steps:
- name: Setup Linux
uses: pytorch/test-infra/.github/actions/setup-linux@main
Expand Down Expand Up @@ -70,12 +88,12 @@ jobs:
run: |
echo "DOCKER_IMAGE: ${DOCKER_IMAGE}"
docker pull "${DOCKER_IMAGE}"
pid=$(docker run ${GPU_FLAG:-} -e USE_COVERAGE -t -d -w "$WORKDIR" "${DOCKER_IMAGE}")
pid=$(docker run ${GPU_FLAG:-} -e USE_COVERAGE -e XLA_SKIP_XRT_TESTS -e XLA_SKIP_TORCH_OP_TESTS -e XLA_SKIP_MP_OP_TESTS -t -d -w "$WORKDIR" "${DOCKER_IMAGE}")
echo "${GCLOUD_SERVICE_KEY}" | docker exec -i "${pid}" sh -c "cat >> /tmp/pytorch/xla/default_credentials.json"
echo "pid=${pid}" >> "${GITHUB_ENV}"
- name: Test
shell: bash
run: docker exec -u jenkins "${pid}" bash -c '. ~/.bashrc && .circleci/test.sh'
run: docker exec -u jenkins "${pid}" bash -c '. ~/.bashrc && .circleci/${{ inputs.test-script }}'
- name: Upload coverage results
if: ${{ inputs.collect-coverage }}
shell: bash
Expand Down
30 changes: 30 additions & 0 deletions .github/workflows/build_and_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ jobs:
with:
ecr-docker-image-base: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base
gcr-docker-image: gcr.io/tpu-pytorch/xla_base:latest
disable_xrt: 1
cuda: 1
secrets:
gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}

Expand All @@ -29,6 +31,7 @@ jobs:
with:
docker-image: ${{ needs.build.outputs.docker-image }}
timeout-minutes: 90
disable-xrt: 1
secrets:
gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}

Expand All @@ -40,6 +43,31 @@ jobs:
docker-image: ${{ needs.build.outputs.docker-image }}
runner: linux.8xlarge.nvidia.gpu
timeout-minutes: 180
disable-xrt: 1
secrets:
gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}

build-xrt:
name: "Build XLA with XRT"
uses: ./.github/workflows/_build.yml
with:
ecr-docker-image-base: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base
gcr-docker-image: gcr.io/tpu-pytorch/xla_base:latest
disable_xrt: 0
cuda: 0
secrets:
gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}

test-xrt-cpu:
name: "Test XRT with CPU"
uses: ./.github/workflows/_test.yml
needs: build-xrt
with:
docker-image: ${{ needs.build-xrt.outputs.docker-image }}
timeout-minutes: 90
disable-xrt: 0
disable-pjrt: 1
test-script: 'test_xrt.sh'
secrets:
gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}

Expand All @@ -52,6 +80,7 @@ jobs:
docker-image: ${{ needs.build.outputs.docker-image }}
collect-coverage: true
timeout-minutes: 120
disable-xrt: 1
secrets:
gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}

Expand All @@ -65,6 +94,7 @@ jobs:
runner: linux.8xlarge.nvidia.gpu
timeout-minutes: 210
collect-coverage: true
disable-xrt: 1
secrets:
gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}

Expand Down
1 change: 1 addition & 0 deletions infra/ansible/config/env.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ build_env:
XLA_SANDBOX_BUILD: 1
BAZEL_REMOTE_CACHE: 1
SILO_NAME: "cache-silo-{{ arch }}-{{ accelerator }}"
DISABLE_XRT: "{{ disable_xrt }}"

amd64:
ARCH: amd64
Expand Down
4 changes: 3 additions & 1 deletion infra/ansible/config/vars.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,6 @@ clang_version: 10
# PyTorch and PyTorch/XLA wheel versions.
package_version: 2.0
# If set to true, wheels will be renamed to $WHEEL_NAME-nightly-cp38-cp38-linux_x86_64.whl.
nightly_release: false
nightly_release: false
# Whether to disable XRT during build
disable_xrt: 0
1 change: 1 addition & 0 deletions infra/tpu-pytorch/test_triggers.tf
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ module "tpu_e2e_tests" {
ansible_vars = {
arch = "amd64"
accelerator = "tpu"
disable_xrt = "1"
pytorch_git_rev = "main"
# The commit ID associated with the triggered build. Substituted when
# Cloud Build is triggered.
Expand Down
6 changes: 3 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
# TPUVM_MODE=0
# whether to build for TPU
#
# PJRT_ONLY=0
# DISABLE_XRT=0
# whether to exclude XRT from the build
#
# SILO_NAME=""
Expand Down Expand Up @@ -253,8 +253,8 @@ def bazel_build(self, ext):
if _check_env_flag('TPUVM_MODE'):
bazel_argv.append('--config=tpu')

if _check_env_flag('PJRT_ONLY'):
bazel_argv.append('--config=pjrt_only')
if _check_env_flag('DISABLE_XRT'):
bazel_argv.append('--config=disable_xrt')

# Remote cache authentication.
if _check_env_flag('BAZEL_REMOTE_CACHE'):
Expand Down
2 changes: 2 additions & 0 deletions test/run_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ function run_torchrun {
function run_xrt_tests {
# For features not supported in PJRT
echo "Running XRT tests"
run_xrt "$CDIR/test_operations.py" "$@" --verbosity=$VERBOSITY
run_opbyop "$CDIR/test_operations.py" "$@" --verbosity=$VERBOSITY
run_async_scalar "$CDIR/test_operations.py" "$@" --verbosity=$VERBOSITY
run_torchrun "$CDIR/test_allreduce_torchrun.py"
Expand Down Expand Up @@ -194,6 +195,7 @@ function run_xla_op_tests {
run_test "$CDIR/test_operations_hlo.py" "$@" --verbosity=$VERBOSITY
run_test "$CDIR/test_input_output_aliases.py"
run_test "$CDIR/test_torch_distributed_xla_backend.py"
run_test "$CDIR/test_autocast.py"
}

function run_op_tests {
Expand Down
8 changes: 4 additions & 4 deletions third_party/xla_client/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ exports_files([
])

config_setting(
name = "pjrt_only",
define_values = {"pjrt_only": "true"},
name = "disable_xrt",
define_values = {"disable_xrt": "true"},
)

tf_proto_library_cc(
Expand Down Expand Up @@ -69,7 +69,7 @@ cc_library(
"runtime.h",
],
local_defines = select({
":pjrt_only": ["PJRT_ONLY"],
":disable_xrt": ["DISABLE_XRT"],
"//conditions:default": [],
}),
deps = [
Expand All @@ -78,7 +78,7 @@ cc_library(
":pjrt_computation_client",
"@org_tensorflow//tensorflow/tsl/platform:stacktrace",
] + select({
":pjrt_only": [],
":disable_xrt": [],
"//conditions:default": [
":xrt_computation_client",
":xrt_local_service",
Expand Down
6 changes: 3 additions & 3 deletions third_party/xla_client/runtime.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#include "third_party/xla_client/env_vars.h"
#include "third_party/xla_client/pjrt_computation_client.h"

#ifndef PJRT_ONLY
#ifndef DISABLE_XRT
#include "third_party/xla_client/xrt_computation_client.h"
#include "third_party/xla_client/xrt_local_service.h"
#endif
Expand All @@ -24,7 +24,7 @@ ComputationClient* CreateClient() {
if (sys_util::GetEnvString(env::kEnvPjRtDevice, "") != "") {
client = new PjRtComputationClient();
} else {
#ifndef PJRT_ONLY
#ifndef DISABLE_XRT
client = new XrtComputationClient();
#else
XLA_ERROR() << "$PJRT_DEVICE is not set." << std::endl;
Expand All @@ -49,7 +49,7 @@ ComputationClient* GetComputationClientIfInitialized() {
}

void RunLocalService(uint64_t service_port) {
#ifndef PJRT_ONLY
#ifndef DISABLE_XRT
try {
XrtLocalService* service = new XrtLocalService(
"localservice|localhost:" + std::to_string(service_port),
Expand Down

0 comments on commit 997b2e7

Please sign in to comment.