From 3989e0c196d6368a73dd6cb67025ec733d177b28 Mon Sep 17 00:00:00 2001
From: iefgnoix
Date: Thu, 16 May 2024 22:40:16 +0000
Subject: [PATCH] Add workflow for test that requires PyTorch CUDA.

---
 .../_build_torch_with_cuda_and_xla.yml         | 59 +++++++++++++++++++
 .github/workflows/build_and_test.yml           | 21 +++++++
 infra/ansible/config/env.yaml                  |  1 +
 .../ansible/roles/build_srcs/tasks/main.yaml   |  3 +-
 4 files changed, 82 insertions(+), 2 deletions(-)
 create mode 100644 .github/workflows/_build_torch_with_cuda_and_xla.yml

diff --git a/.github/workflows/_build_torch_with_cuda_and_xla.yml b/.github/workflows/_build_torch_with_cuda_and_xla.yml
new file mode 100644
index 00000000000..e55c928b0ee
--- /dev/null
+++ b/.github/workflows/_build_torch_with_cuda_and_xla.yml
@@ -0,0 +1,59 @@
+name: build-torch-with-cuda-and-xla
+on:
+  workflow_call:
+    inputs:
+      dev-image:
+        required: true
+        type: string
+        description: Base image for builds
+      runner:
+        required: false
+        type: string
+        description: Runner type for the build
+        default: linux.12xlarge
+
+    secrets:
+      gcloud-service-key:
+        required: true
+        description: Secret to access Bazel build cache
+jobs:
+  build:
+    runs-on: ${{ inputs.runner }}
+    container:
+      image: ${{ inputs.dev-image }}
+    env:
+      GCLOUD_SERVICE_KEY: ${{ secrets.gcloud-service-key }}
+      GOOGLE_APPLICATION_CREDENTIALS: /tmp/default_credentials.json
+      BAZEL_JOBS: 16
+      BAZEL_REMOTE_CACHE: 1
+      BUILD_CPP_TESTS: 1
+    steps:
+      - name: Setup gcloud
+        shell: bash
+        run: |
+          echo "${GCLOUD_SERVICE_KEY}" > $GOOGLE_APPLICATION_CREDENTIALS
+      - name: Checkout PyTorch Repo
+        uses: actions/checkout@v4
+        with:
+          repository: pytorch/pytorch
+          path: pytorch
+          submodules: recursive
+      - name: Checkout PyTorch/XLA Repo
+        uses: actions/checkout@v4
+        with:
+          path: pytorch/xla
+      - name: Build
+        shell: bash
+        run: |
+          cd pytorch/xla/infra/ansible
+          ansible-playbook playbook.yaml -vvv -e "stage=build arch=amd64 accelerator=cuda cuda_compute_capabilities=5.2,7.5 src_root=${GITHUB_WORKSPACE} build_cpp_tests=1 git_versioned_xla_build=1 cache_suffix=-ci build_pytorch_with_cuda=1" --skip-tags=fetch_srcs,install_deps
+      - name: Upload wheel
+        uses: actions/upload-artifact@v4
+        with:
+          name: torch-xla-wheels
+          path: /dist/*.whl
+      - name: Upload CPP test binaries
+        uses: actions/upload-artifact@v4
+        with:
+          name: cpp-test-bin
+          path: /tmp/test/bin
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 60a2eda44cd..741cf531e15 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -28,6 +28,14 @@ jobs:
     secrets:
       gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
 
+  build-torch-with-cuda-xla-with-cuda:
+    name: "Build PyTorch with CUDA and PyTorch/XLA"
+    uses: ./.github/workflows/_build_torch_with_cuda_and_xla.yml
+    with:
+      dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
+    secrets:
+      gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
+
   build-cuda-plugin:
     name: "Build XLA CUDA plugin"
     uses: ./.github/workflows/_build_plugin.yml
@@ -50,6 +58,19 @@
   test-cuda:
     name: "GPU tests"
     uses: ./.github/workflows/_test.yml
+    needs: [build-torch-with-cuda-xla-with-cuda]
+    with:
+      dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
+      runner: linux.8xlarge.nvidia.gpu
+      timeout-minutes: 300
+      collect-coverage: false
+      install-cuda-plugin: true
+    secrets:
+      gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
+
+  test-cuda-with-pytorch-cuda-enabled:
+    name: "GPU tests with PyTorch CUDA enabled"
+    uses: ./.github/workflows/_test.yml
     needs: [build-torch-xla, build-cuda-plugin]
     with:
       dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
diff --git a/infra/ansible/config/env.yaml b/infra/ansible/config/env.yaml
index 9e2fe7270cc..6c34fae2f4e 100644
--- a/infra/ansible/config/env.yaml
+++ b/infra/ansible/config/env.yaml
@@ -43,6 +43,7 @@ build_env:
   cuda:
     TF_CUDA_COMPUTE_CAPABILITIES: "{{ cuda_compute_capabilities }}"
     XLA_CUDA: 1
+    USE_CUDA: "{{ build_pytorch_with_cuda }}"
 
   tpu:
     ACCELERATOR: tpu
diff --git a/infra/ansible/roles/build_srcs/tasks/main.yaml b/infra/ansible/roles/build_srcs/tasks/main.yaml
index da09a695453..0f46a431225 100644
--- a/infra/ansible/roles/build_srcs/tasks/main.yaml
+++ b/infra/ansible/roles/build_srcs/tasks/main.yaml
@@ -27,8 +27,7 @@
     cmd: python setup.py bdist_wheel
     chdir: "{{ (src_root, 'pytorch') | path_join }}"
     creates: "{{ (src_root, 'pytorch/dist/*.whl') | path_join }}"
-  # Set `USE_CUDA=0` as PyTorch cannot be used with GPU in eager and XLA mode.
-  environment: "{{ env_vars | combine({'USE_CUDA': 0}) }}"
+  environment: "{{ env_vars }}"
 
 - name: Find PyTorch *.whl files in pytorch/dist
   ansible.builtin.find:
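
The Build step above can also be reproduced outside CI with the same Ansible invocation. A minimal local sketch, assuming SRC_ROOT points at a directory containing the pytorch and pytorch/xla checkouts and a CUDA 12.1 toolchain is available (the SRC_ROOT variable and directory layout are assumptions standing in for ${GITHUB_WORKSPACE}, not part of this patch):

    # Sketch: run the same build stage the workflow runs, with PyTorch CUDA enabled.
    export SRC_ROOT=$PWD            # directory that contains pytorch/ and pytorch/xla/
    cd "$SRC_ROOT/pytorch/xla/infra/ansible"
    ansible-playbook playbook.yaml -vvv \
      -e "stage=build arch=amd64 accelerator=cuda cuda_compute_capabilities=5.2,7.5 src_root=$SRC_ROOT build_cpp_tests=1 git_versioned_xla_build=1 cache_suffix=-ci build_pytorch_with_cuda=1" \
      --skip-tags=fetch_srcs,install_deps

Setting build_pytorch_with_cuda=1 feeds USE_CUDA through build_env (see the env.yaml change above), so the pytorch wheel is built with CUDA support instead of the previous hard-coded USE_CUDA=0.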