Skip to content

Commit

Permalink
Add a CI workflow for tests that require PyTorch CUDA. (#7140)
Browse files Browse the repository at this point in the history
  • Loading branch information
vanbasten23 authored May 31, 2024
1 parent 8471826 commit 6fadbf5
Show file tree
Hide file tree
Showing 6 changed files with 232 additions and 4 deletions.
55 changes: 55 additions & 0 deletions .github/workflows/_build_torch_with_cuda.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
---
# Reusable workflow: build a PyTorch wheel with CUDA enabled.
# Called via workflow_call (e.g. from build_and_test.yml); the wheel is
# uploaded as the "torch-with-cuda" artifact for downstream GPU test jobs.
name: build-torch-with-cuda
on:
  workflow_call:
    inputs:
      dev-image:
        required: true
        type: string
        description: Base image for builds
      torch-commit:
        required: true
        type: string
        description: PyTorch commit (SHA or ref) to check out and build
      runner:
        required: false
        type: string
        description: Runner type for the test
        default: linux.12xlarge
jobs:
  build:
    runs-on: ${{ inputs.runner }}
    container:
      image: ${{ inputs.dev-image }}
      options: "--gpus all --shm-size 16g"
    env:
      # Quoted so the value stays the literal string "0" rather than an int.
      # NOTE(review): presumably set so the wheel's C++ ABI matches the
      # PyTorch/XLA build — confirm against the XLA build configuration.
      _GLIBCXX_USE_CXX11_ABI: "0"
    steps:
      # See https://github.com/actions/checkout/issues/1014#issuecomment-1906802802
      - name: Clean up workspace
        run: |
          ls -la
          rm -rvf ${GITHUB_WORKSPACE}/*
      # Put CUDA 12.1 binaries/libraries on PATH/LD_LIBRARY_PATH for later steps.
      - name: Setup CUDA environment
        shell: bash
        run: |
          echo "PATH=$PATH:/usr/local/cuda-12.1/bin" >> $GITHUB_ENV
          echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.1/lib64" >> $GITHUB_ENV
      - name: Check GPU
        run: nvidia-smi
      - name: Checkout PyTorch Repo
        uses: actions/checkout@v4
        with:
          repository: pytorch/pytorch
          path: pytorch
          ref: ${{ inputs.torch-commit }}
          submodules: recursive
      - name: Build
        shell: bash
        run: |
          cd pytorch
          USE_CUDA=1 python setup.py bdist_wheel
      - name: Upload wheel
        uses: actions/upload-artifact@v4
        with:
          name: torch-with-cuda
          path: pytorch/dist/*.whl
5 changes: 5 additions & 0 deletions .github/workflows/_build_torch_xla.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ on:
required: true
type: string
description: Base image for builds
torch-commit:
required: true
type: string
description: torch-commit
runner:
required: false
type: string
Expand Down Expand Up @@ -42,6 +46,7 @@ jobs:
with:
repository: pytorch/pytorch
path: pytorch
ref: ${{ inputs.torch-commit }}
submodules: recursive
- name: Checkout PyTorch/XLA Repo
uses: actions/checkout@v4
Expand Down
32 changes: 32 additions & 0 deletions .github/workflows/_get_torch_commit.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
---
# Reusable workflow: resolve the current HEAD commit of pytorch/pytorch and
# expose it as the `torch_commit` output, so sibling build jobs all pin the
# exact same PyTorch revision.
name: get-torch-commit
on:
  workflow_call:
    outputs:
      torch_commit:
        description: "torch commit to be used"
        value: ${{ jobs.get-commit.outputs.torch_commit }}

jobs:
  get-commit:
    runs-on: ubuntu-20.04
    outputs:
      torch_commit: ${{ steps.get_torch_commit.outputs.torch_commit }}
    steps:
      # See https://github.com/actions/checkout/issues/1014#issuecomment-1906802802
      - name: Clean up workspace
        run: |
          ls -la
          rm -rvf ${GITHUB_WORKSPACE}/*
      # NOTE(review): only `git rev-parse HEAD` is needed below; the recursive
      # submodule checkout looks unnecessary for that — confirm before trimming.
      - name: Checkout PyTorch Repo
        uses: actions/checkout@v4
        with:
          repository: pytorch/pytorch
          path: pytorch
          submodules: recursive
      - id: get_torch_commit
        name: Get torch commit
        run: |
          cd pytorch
          torch_commit=$(git rev-parse HEAD)
          echo "torch_commit=$torch_commit" >> "$GITHUB_OUTPUT"
110 changes: 110 additions & 0 deletions .github/workflows/_test_requiring_torch_cuda.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
---
# Reusable workflow: run PyTorch/XLA GPU tests that require a torch wheel
# built with CUDA enabled. Consumes the "torch-xla-wheels", "torch-with-cuda"
# and "cuda-plugin" artifacts produced by earlier jobs.
name: xla-test-requiring-torch-cuda
on:
  workflow_call:
    inputs:
      dev-image:
        required: true
        type: string
        description: Base image for builds
      runner:
        required: false
        type: string
        description: Runner type for the test
        default: linux.12xlarge
      collect-coverage:
        required: false
        type: boolean
        description: Set to true to collect coverage information
        default: false
      # Fix: a stray duplicate bare `timeout-minutes:` key followed this
      # input in the original; it has been removed (duplicate keys are
      # invalid YAML and silently last-wins on most parsers).
      timeout-minutes:
        required: false
        type: number
        default: 30
        description: >-
          Maximum time, in minutes, that the workflow is allowed to run
          before it is cancelled
jobs:
  test:
    runs-on: ${{ inputs.runner }}
    container:
      image: ${{ inputs.dev-image }}
      options: "--gpus all --shm-size 16g"
    timeout-minutes: ${{ inputs.timeout-minutes }}
    env:
      USE_COVERAGE: ${{ inputs.collect-coverage && '1' || '0' }}
      # Quoted so the values stay literal strings rather than ints.
      BAZEL_JOBS: "16"
      BAZEL_REMOTE_CACHE: "1"
    steps:
      # See https://github.com/actions/checkout/issues/1014#issuecomment-1906802802
      # TODO: need to find a way to reuse these steps.
      - name: Clean up workspace
        run: |
          ls -la
          rm -rvf ${GITHUB_WORKSPACE}/*
      - name: Fetch torch/torch_xla/torchvision wheels
        uses: actions/download-artifact@v4
        with:
          name: torch-xla-wheels
          path: /tmp/wheels/
      # The torch-xla-wheels artifact includes a CPU-only torch wheel; drop
      # it so the CUDA-enabled torch wheel fetched next is the one installed.
      - name: Remove torch wheel built with CUDA disabled
        shell: bash
        run: |
          rm -rf /tmp/wheels/torch-*
      - name: Fetch the torch wheel built with CUDA enabled
        uses: actions/download-artifact@v4
        with:
          name: torch-with-cuda
          path: /tmp/wheels/
      - name: Fetch CUDA plugin
        uses: actions/download-artifact@v4
        with:
          name: cuda-plugin
          path: /tmp/wheels/
      - name: Setup CUDA environment
        shell: bash
        run: |
          echo "XLA_REGISTER_INSTALLED_PLUGINS=1" >> $GITHUB_ENV
          echo "PATH=$PATH:/usr/local/cuda-12.1/bin" >> $GITHUB_ENV
          echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.1/lib64" >> $GITHUB_ENV
      - name: Check GPU
        run: nvidia-smi
      - name: Install wheels
        shell: bash
        run: |
          pip install /tmp/wheels/*.whl
          # TODO: Add these in setup.py
          pip install fsspec
          pip install rich
          echo "Import check..."
          python -c "import torch, torch_xla, torchvision"
          echo "Import check done."
          echo "Check if CUDA is available for PyTorch..."
          python -c "import torch; assert torch.cuda.is_available()"
          echo "CUDA is available for PyTorch."
      - name: Record PyTorch commit
        run: |
          # Don't just pipe output in shell because imports may do extra logging
          python -c "
          import torch_xla.version
          with open('$GITHUB_ENV', 'a') as f:
            f.write(f'PYTORCH_COMMIT={torch_xla.version.__torch_gitrev__}\n')
          "
      - name: Checkout PyTorch Repo
        uses: actions/checkout@v4
        with:
          repository: pytorch/pytorch
          path: pytorch
          ref: ${{ env.PYTORCH_COMMIT }}
      - name: Checkout PyTorch/XLA Repo
        uses: actions/checkout@v4
        with:
          path: pytorch/xla
      - name: Test
        shell: bash
        run: |
          set -xue
          PJRT_DEVICE=CUDA python pytorch/xla/test/test_operations.py -v
          PJRT_DEVICE=CUDA python pytorch/xla/test/dynamo/test_dynamo.py -v
26 changes: 26 additions & 0 deletions .github/workflows/build_and_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,30 @@ concurrency:

jobs:

get-torch-commit:
name: "Get torch commit"
uses: ./.github/workflows/_get_torch_commit.yml

build-torch-xla:
name: "Build PyTorch/XLA"
uses: ./.github/workflows/_build_torch_xla.yml
needs: get-torch-commit
with:
dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_tpuvm
torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
secrets:
gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}

build-torch-with-cuda:
name: "Build PyTorch with CUDA"
uses: ./.github/workflows/_build_torch_with_cuda.yml
needs: get-torch-commit
with:
# note that to build a torch wheel with CUDA enabled, we do not need a GPU runner.
dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
runner: linux.8xlarge.nvidia.gpu

build-cuda-plugin:
name: "Build XLA CUDA plugin"
uses: ./.github/workflows/_build_plugin.yml
Expand Down Expand Up @@ -60,6 +76,16 @@ jobs:
secrets:
gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}

test-cuda-with-pytorch-cuda-enabled:
name: "GPU tests requiring torch CUDA"
uses: ./.github/workflows/_test_requiring_torch_cuda.yml
needs: [build-torch-with-cuda, build-torch-xla, build-cuda-plugin]
with:
dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
runner: linux.8xlarge.nvidia.gpu
timeout-minutes: 300
collect-coverage: false

test-tpu:
name: "TPU tests"
uses: ./.github/workflows/_tpu_ci.yml
Expand Down
8 changes: 4 additions & 4 deletions test/test_operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -2684,13 +2684,13 @@ def test_dlpack_non_default_layout(self):
t1 = cuda_t.t()
xla_t1 = xdlpack.from_dlpack(t1.__dlpack__())
self.assertEqual(xla_t1.device.type, 'xla')
self.assertEqual(xla_t1.device.index, 0)
self.assertEqual(xla_t1.device.index, t1.device.index)
self.assertTrue(torch.allclose(t1.cpu(), xla_t1.cpu()))

t2 = cuda_t[0]
xla_t2 = xdlpack.from_dlpack(t2.__dlpack__())
self.assertEqual(xla_t2.device.type, 'xla')
self.assertEqual(xla_t2.device.index, 0)
self.assertEqual(xla_t2.device.index, t2.device.index)
self.assertTrue(torch.allclose(t2.cpu(), xla_t2.cpu()))

t3 = cuda_t[:, 0]
Expand All @@ -2702,13 +2702,13 @@ def test_dlpack_non_default_layout(self):
t4 = cuda_t[1, :]
xla_t4 = xdlpack.from_dlpack(t4.__dlpack__())
self.assertEqual(xla_t4.device.type, 'xla')
self.assertEqual(xla_t4.device.index, 0)
self.assertEqual(xla_t4.device.index, t4.device.index)
self.assertTrue(torch.allclose(t4.cpu(), xla_t4.cpu()))

t5 = cuda_t[1]
xla_t5 = xdlpack.from_dlpack(t5.__dlpack__())
self.assertEqual(xla_t5.device.type, 'xla')
self.assertEqual(xla_t5.device.index, 0)
self.assertEqual(xla_t5.device.index, t5.device.index)
self.assertTrue(torch.allclose(t5.cpu(), xla_t5.cpu()))


Expand Down

0 comments on commit 6fadbf5

Please sign in to comment.