Add workflow for a test that requires PyTorch CUDA.
vanbasten23 committed May 16, 2024
1 parent aeed89e commit 3989e0c
Showing 4 changed files with 82 additions and 2 deletions.
59 changes: 59 additions & 0 deletions .github/workflows/_build_torch_with_cuda_and_xla.yml
@@ -0,0 +1,59 @@
name: build-cuda-plugin
on:
  workflow_call:
    inputs:
      dev-image:
        required: true
        type: string
        description: Base image for builds
      runner:
        required: false
        type: string
        description: Runner type for the test
        default: linux.12xlarge

    secrets:
      gcloud-service-key:
        required: true
        description: Secret to access Bazel build cache
jobs:
  build:
    runs-on: ${{ inputs.runner }}
    container:
      image: ${{ inputs.dev-image }}
    env:
      GCLOUD_SERVICE_KEY: ${{ secrets.gcloud-service-key }}
      GOOGLE_APPLICATION_CREDENTIALS: /tmp/default_credentials.json
      BAZEL_JOBS: 16
      BAZEL_REMOTE_CACHE: 1
      BUILD_CPP_TESTS: 1
    steps:
      - name: Setup gcloud
        shell: bash
        run: |
          echo "${GCLOUD_SERVICE_KEY}" > $GOOGLE_APPLICATION_CREDENTIALS
      - name: Checkout PyTorch Repo
        uses: actions/checkout@v4
        with:
          repository: pytorch/pytorch
          path: pytorch
          submodules: recursive
      - name: Checkout PyTorch/XLA Repo
        uses: actions/checkout@v4
        with:
          path: pytorch/xla
      - name: Build
        shell: bash
        run: |
          cd pytorch/xla/infra/ansible
          ansible-playbook playbook.yaml -vvv -e "stage=build arch=amd64 accelerator=cuda cuda_compute_capabilities=5.2,7.5 src_root=${GITHUB_WORKSPACE} build_cpp_tests=1 git_versioned_xla_build=1 cache_suffix=-ci build_pytorch_with_cuda=1" --skip-tags=fetch_srcs,install_deps
      - name: Upload wheel
        uses: actions/upload-artifact@v4
        with:
          name: torch-xla-wheels
          path: /dist/*.whl
      - name: Upload CPP test binaries
        uses: actions/upload-artifact@v4
        with:
          name: cpp-test-bin
          path: /tmp/test/bin
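
For context, the wheels and C++ test binaries uploaded above are meant to be consumed by a later test job. A minimal sketch of such a consumer step, assuming the same artifact names and actions/download-artifact@v4; the job layout and download path here are illustrative, not part of this commit:

      - name: Fetch torch/xla wheels
        uses: actions/download-artifact@v4
        with:
          name: torch-xla-wheels
          path: /tmp/wheels/        # hypothetical download location
      - name: Install wheels
        shell: bash
        run: |
          pip install /tmp/wheels/*.whl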
21 changes: 21 additions & 0 deletions .github/workflows/build_and_test.yml
@@ -28,6 +28,14 @@ jobs:
    secrets:
      gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}

  build-torch-with-cuda-xla-with-cuda:
    name: "Build PyTorch with CUDA and PyTorch/XLA"
    uses: ./.github/workflows/_build_torch_with_cuda_and_xla.yml
    with:
      dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
    secrets:
      gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}

  build-cuda-plugin:
    name: "Build XLA CUDA plugin"
    uses: ./.github/workflows/_build_plugin.yml
@@ -50,6 +58,19 @@ jobs:
  test-cuda:
    name: "GPU tests"
    uses: ./.github/workflows/_test.yml
    needs: [build-torch-with-cuda-xla-with-cuda]
    with:
      dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
      runner: linux.8xlarge.nvidia.gpu
      timeout-minutes: 300
      collect-coverage: false
      install-cuda-plugin: true
    secrets:
      gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}

  test-cuda-with-pytorch-cuda-enabled:
    name: "GPU tests with PyTorch CUDA enabled"
    uses: ./.github/workflows/_test.yml
    needs: [build-torch-xla, build-cuda-plugin]
    with:
      dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
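
The parameters passed to _test.yml above imply a matching workflow_call interface on the callee side. A rough sketch of what those input declarations would look like, assuming _test.yml follows the same workflow_call pattern as the build workflow added in this commit; the names are taken from the call sites, while types and defaults are assumptions:

on:
  workflow_call:
    inputs:
      dev-image:
        required: true
        type: string
      runner:
        required: false
        type: string
      timeout-minutes:
        required: false
        type: number
      collect-coverage:
        required: false
        type: boolean
      install-cuda-plugin:
        required: false
        type: boolean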
1 change: 1 addition & 0 deletions infra/ansible/config/env.yaml
@@ -43,6 +43,7 @@ build_env:
  cuda:
    TF_CUDA_COMPUTE_CAPABILITIES: "{{ cuda_compute_capabilities }}"
    XLA_CUDA: 1
+   USE_CUDA: "{{ build_pytorch_with_cuda }}"

  tpu:
    ACCELERATOR: tpu
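
To trace the new variable end to end: the Build step's ansible-playbook invocation passes build_pytorch_with_cuda=1 (and cuda_compute_capabilities=5.2,7.5) as extra vars, and this env.yaml entry templates that value into USE_CUDA for the PyTorch build. A sketch of the resolved cuda build environment under that invocation; the rendered values are illustrative of the templating, not part of the commit itself:

# Effective cuda build env after templating with build_pytorch_with_cuda=1
cuda:
  TF_CUDA_COMPUTE_CAPABILITIES: "5.2,7.5"
  XLA_CUDA: 1
  USE_CUDA: "1"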
3 changes: 1 addition & 2 deletions infra/ansible/roles/build_srcs/tasks/main.yaml
@@ -27,8 +27,7 @@
    cmd: python setup.py bdist_wheel
    chdir: "{{ (src_root, 'pytorch') | path_join }}"
    creates: "{{ (src_root, 'pytorch/dist/*.whl') | path_join }}"
-  # Set `USE_CUDA=0` as PyTorch cannot be used with GPU in eager and XLA mode.
-  environment: "{{ env_vars | combine({'USE_CUDA': 0}) }}"
+  environment: "{{ env_vars }}"

- name: Find PyTorch *.whl files in pytorch/dist
  ansible.builtin.find:
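
Read together with the env.yaml change, the wheel-build task now takes USE_CUDA from env_vars instead of forcing it to 0. A sketch of how the task would read after this commit; the task name and module wrapper are assumed from the file's structure and are not shown in this hunk:

# Hypothetical full task after the change (task name assumed for illustration)
- name: Build PyTorch wheel
  ansible.builtin.command:
    cmd: python setup.py bdist_wheel
    chdir: "{{ (src_root, 'pytorch') | path_join }}"
    creates: "{{ (src_root, 'pytorch/dist/*.whl') | path_join }}"
  # USE_CUDA now flows from env_vars, i.e. "{{ build_pytorch_with_cuda }}" on CUDA builds.
  environment: "{{ env_vars }}"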
