diff --git a/.github/workflows/_build_torch_with_cuda.yml b/.github/workflows/_build_torch_with_cuda.yml
new file mode 100644
index 00000000000..e9defd40eb5
--- /dev/null
+++ b/.github/workflows/_build_torch_with_cuda.yml
@@ -0,0 +1,55 @@
+name: build-torch-with-cuda
+on:
+  workflow_call:
+    inputs:
+      dev-image:
+        required: true
+        type: string
+        description: Base image for builds
+      torch-commit:
+        required: true
+        type: string
+        description: PyTorch commit to check out and build
+      runner:
+        required: false
+        type: string
+        description: Runner type for the build
+        default: linux.12xlarge
+jobs:
+  build:
+    runs-on: ${{ inputs.runner }}
+    container:
+      image: ${{ inputs.dev-image }}
+      options: "--gpus all --shm-size 16g"
+    env:
+      _GLIBCXX_USE_CXX11_ABI: 0
+    steps:
+      # See https://github.com/actions/checkout/issues/1014#issuecomment-1906802802
+      - name: Clean up workspace
+        run: |
+          ls -la
+          rm -rvf ${GITHUB_WORKSPACE}/*
+      - name: Setup CUDA environment
+        shell: bash
+        run: |
+          echo "PATH=$PATH:/usr/local/cuda-12.1/bin" >> $GITHUB_ENV
+          echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.1/lib64" >> $GITHUB_ENV
+      - name: Check GPU
+        run: nvidia-smi
+      - name: Checkout PyTorch Repo
+        uses: actions/checkout@v4
+        with:
+          repository: pytorch/pytorch
+          path: pytorch
+          ref: ${{ inputs.torch-commit }}
+          submodules: recursive
+      - name: Build
+        shell: bash
+        run: |
+          cd pytorch
+          USE_CUDA=1 python setup.py bdist_wheel
+      - name: Upload wheel
+        uses: actions/upload-artifact@v4
+        with:
+          name: torch-with-cuda
+          path: pytorch/dist/*.whl
diff --git a/.github/workflows/_build_torch_xla.yml b/.github/workflows/_build_torch_xla.yml
index 9113c7fe4da..56e6b5408c3 100644
--- a/.github/workflows/_build_torch_xla.yml
+++ b/.github/workflows/_build_torch_xla.yml
@@ -6,6 +6,10 @@ on:
         required: true
         type: string
         description: Base image for builds
+      torch-commit:
+        required: true
+        type: string
+        description: PyTorch commit to check out
       runner:
         required: false
         type: string
@@ -42,6 +46,7 @@ jobs:
         with:
          repository: pytorch/pytorch
          path: pytorch
+         ref: ${{ inputs.torch-commit }}
          submodules: recursive
       - name: Checkout PyTorch/XLA Repo
         uses: actions/checkout@v4
diff --git a/.github/workflows/_get_torch_commit.yml b/.github/workflows/_get_torch_commit.yml
new file mode 100644
index 00000000000..debaecd8194
--- /dev/null
+++ b/.github/workflows/_get_torch_commit.yml
@@ -0,0 +1,32 @@
+name: get-torch-commit
+on:
+  workflow_call:
+    outputs:
+      torch_commit:
+        description: "torch commit to be used"
+        value: ${{ jobs.get-commit.outputs.torch_commit }}
+
+jobs:
+  get-commit:
+    runs-on: ubuntu-20.04
+    outputs:
+      torch_commit: ${{ steps.get_torch_commit.outputs.torch_commit }}
+    steps:
+      # See https://github.com/actions/checkout/issues/1014#issuecomment-1906802802
+      - name: Clean up workspace
+        run: |
+          ls -la
+          rm -rvf ${GITHUB_WORKSPACE}/*
+      - name: Checkout PyTorch Repo
+        uses: actions/checkout@v4
+        with:
+          repository: pytorch/pytorch
+          path: pytorch
+          submodules: recursive
+      - id: get_torch_commit
+        name: Get torch commit
+        run: |
+          cd pytorch
+          torch_commit=$(git rev-parse HEAD)
+          echo "torch_commit=$torch_commit" >> "$GITHUB_OUTPUT"
+
diff --git a/.github/workflows/_test_requiring_torch_cuda.yml b/.github/workflows/_test_requiring_torch_cuda.yml
new file mode 100644
index 00000000000..a3e265e557f
--- /dev/null
+++ b/.github/workflows/_test_requiring_torch_cuda.yml
@@ -0,0 +1,109 @@
+name: xla-test-requiring-torch-cuda
+on:
+  workflow_call:
+    inputs:
+      dev-image:
+        required: true
+        type: string
+        description: Base image for builds
+      runner:
+        required: false
+        type: string
+        description: Runner type for the test
+        default: linux.12xlarge
+      collect-coverage:
+        required: false
+        type: boolean
+        description: Set to true to collect coverage information
+        default: false
+      timeout-minutes:
+        required: false
+        type: number
+        default: 30
+        description: |
+          Maximum time, in minutes, that the workflow is allowed to run
+
+jobs:
+  test:
+    runs-on: ${{ inputs.runner }}
+    container:
+      image: ${{ inputs.dev-image }}
+      options: "--gpus all --shm-size 16g"
+    timeout-minutes: ${{ inputs.timeout-minutes }}
+    env:
+      USE_COVERAGE: ${{ inputs.collect-coverage && '1' || '0' }}
+      BAZEL_JOBS: 16
+      BAZEL_REMOTE_CACHE: 1
+    steps:
+      # See https://github.com/actions/checkout/issues/1014#issuecomment-1906802802
+      # TODO: need to find a way to reuse these steps.
+      - name: Clean up workspace
+        run: |
+          ls -la
+          rm -rvf ${GITHUB_WORKSPACE}/*
+      - name: Fetch torch/torch_xla/torchvision wheels
+        uses: actions/download-artifact@v4
+        with:
+          name: torch-xla-wheels
+          path: /tmp/wheels/
+      - name: Remove torch wheel built with CUDA disabled
+        shell: bash
+        run: |
+          rm -rf /tmp/wheels/torch-*
+      - name: Fetch the torch wheel built with CUDA enabled
+        uses: actions/download-artifact@v4
+        with:
+          name: torch-with-cuda
+          path: /tmp/wheels/
+      - name: Fetch CUDA plugin
+        uses: actions/download-artifact@v4
+        with:
+          name: cuda-plugin
+          path: /tmp/wheels/
+      - name: Setup CUDA environment
+        shell: bash
+        run: |
+          echo "XLA_REGISTER_INSTALLED_PLUGINS=1" >> $GITHUB_ENV
+
+          echo "PATH=$PATH:/usr/local/cuda-12.1/bin" >> $GITHUB_ENV
+          echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.1/lib64" >> $GITHUB_ENV
+      - name: Check GPU
+        run: nvidia-smi
+      - name: Install wheels
+        shell: bash
+        run: |
+          pip install /tmp/wheels/*.whl
+          # TODO: Add these in setup.py
+          pip install fsspec
+          pip install rich
+
+          echo "Import check..."
+          python -c "import torch, torch_xla, torchvision"
+          echo "Import check done."
+          echo "Check if CUDA is available for PyTorch..."
+          python -c "import torch; assert torch.cuda.is_available()"
+          echo "CUDA is available for PyTorch."
+      - name: Record PyTorch commit
+        run: |
+          # Don't just pipe output in shell because imports may do extra logging
+          python -c "
+          import torch_xla.version
+          with open('$GITHUB_ENV', 'a') as f:
+            f.write(f'PYTORCH_COMMIT={torch_xla.version.__torch_gitrev__}\n')
+          "
+      - name: Checkout PyTorch Repo
+        uses: actions/checkout@v4
+        with:
+          repository: pytorch/pytorch
+          path: pytorch
+          ref: ${{ env.PYTORCH_COMMIT }}
+      - name: Checkout PyTorch/XLA Repo
+        uses: actions/checkout@v4
+        with:
+          path: pytorch/xla
+      - name: Test
+        shell: bash
+        run: |
+          set -xue
+          PJRT_DEVICE=CUDA python pytorch/xla/test/test_operations.py -v
+          PJRT_DEVICE=CUDA python pytorch/xla/test/dynamo/test_dynamo.py -v
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 60a2eda44cd..1a924f65036 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -20,14 +20,30 @@ concurrency:
 
 jobs:
+  get-torch-commit:
+    name: "Get torch commit"
+    uses: ./.github/workflows/_get_torch_commit.yml
+
   build-torch-xla:
     name: "Build PyTorch/XLA"
     uses: ./.github/workflows/_build_torch_xla.yml
+    needs: get-torch-commit
     with:
       dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_tpuvm
+      torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
     secrets:
       gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
+  build-torch-with-cuda:
+    name: "Build PyTorch with CUDA"
+    uses: ./.github/workflows/_build_torch_with_cuda.yml
+    needs: get-torch-commit
+    with:
+      # Note: building a torch wheel with CUDA enabled does not require a GPU runner.
+      dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
+      torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
+      runner: linux.8xlarge.nvidia.gpu
+
   build-cuda-plugin:
     name: "Build XLA CUDA plugin"
     uses: ./.github/workflows/_build_plugin.yml
@@ -60,6 +76,16 @@ jobs:
     secrets:
       gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
+
+  test-cuda-with-pytorch-cuda-enabled:
+    name: "GPU tests requiring torch CUDA"
+    uses: ./.github/workflows/_test_requiring_torch_cuda.yml
+    needs: [build-torch-with-cuda, build-torch-xla, build-cuda-plugin]
+    with:
+      dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
+      runner: linux.8xlarge.nvidia.gpu
+      timeout-minutes: 300
+      collect-coverage: false
+
   test-tpu:
     name: "TPU tests"
     uses: ./.github/workflows/_tpu_ci.yml
diff --git a/test/test_operations.py b/test/test_operations.py
index e1dad566536..b4a1838a5a5 100644
--- a/test/test_operations.py
+++ b/test/test_operations.py
@@ -2684,13 +2684,13 @@ def test_dlpack_non_default_layout(self):
     t1 = cuda_t.t()
     xla_t1 = xdlpack.from_dlpack(t1.__dlpack__())
     self.assertEqual(xla_t1.device.type, 'xla')
-    self.assertEqual(xla_t1.device.index, 0)
+    self.assertEqual(xla_t1.device.index, t1.device.index)
     self.assertTrue(torch.allclose(t1.cpu(), xla_t1.cpu()))
 
     t2 = cuda_t[0]
     xla_t2 = xdlpack.from_dlpack(t2.__dlpack__())
     self.assertEqual(xla_t2.device.type, 'xla')
-    self.assertEqual(xla_t2.device.index, 0)
+    self.assertEqual(xla_t2.device.index, t2.device.index)
     self.assertTrue(torch.allclose(t2.cpu(), xla_t2.cpu()))
 
     t3 = cuda_t[:, 0]
@@ -2702,13 +2702,13 @@ def test_dlpack_non_default_layout(self):
     t4 = cuda_t[1, :]
     xla_t4 = xdlpack.from_dlpack(t4.__dlpack__())
     self.assertEqual(xla_t4.device.type, 'xla')
-    self.assertEqual(xla_t4.device.index, 0)
+    self.assertEqual(xla_t4.device.index, t4.device.index)
     self.assertTrue(torch.allclose(t4.cpu(), xla_t4.cpu()))
 
     t5 = cuda_t[1]
     xla_t5 = xdlpack.from_dlpack(t5.__dlpack__())
     self.assertEqual(xla_t5.device.type, 'xla')
-    self.assertEqual(xla_t5.device.index, 0)
+    self.assertEqual(xla_t5.device.index, t5.device.index)
     self.assertTrue(torch.allclose(t5.cpu(), xla_t5.cpu()))
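
The test changes above encode the expectation that importing a CUDA tensor via DLPack places the result on the XLA device whose index matches the source CUDA ordinal, rather than always on device 0. Below is a minimal sketch of that round trip, assuming a host with at least two CUDA devices and the `torch_xla.utils.dlpack` helpers imported as `xdlpack` in `test_operations.py`:

```python
# Minimal sketch, assuming >= 2 CUDA devices and the torch_xla.utils.dlpack
# helpers exercised by the tests above (import path is an assumption here).
import torch
import torch_xla.utils.dlpack as xdlpack

# Place the source tensor on a non-default CUDA device.
cuda_t = torch.arange(12, device='cuda:1', dtype=torch.float32).reshape(3, 4)

# A transposed view has a non-default layout; __dlpack__() still exports it.
t1 = cuda_t.t()
xla_t1 = xdlpack.from_dlpack(t1.__dlpack__())

# The imported tensor should land on the XLA device mirroring the *source*
# CUDA ordinal (here 1) -- which is why the assertions now compare against
# t1.device.index instead of a hard-coded 0.
assert xla_t1.device.type == 'xla'
assert xla_t1.device.index == t1.device.index
assert torch.allclose(t1.cpu(), xla_t1.cpu())
```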