diff --git a/.github/workflows/_build_torch_with_cuda.yml b/.github/workflows/_build_torch_with_cuda.yml
new file mode 100644
index 00000000000..e9defd40eb5
--- /dev/null
+++ b/.github/workflows/_build_torch_with_cuda.yml
@@ -0,0 +1,55 @@
+name: build-torch-with-cuda
+on:
+  workflow_call:
+    inputs:
+      dev-image:
+        required: true
+        type: string
+        description: Base image for builds
+      torch-commit:
+        required: true
+        type: string
+        description: PyTorch commit to check out and build
+      runner:
+        required: false
+        type: string
+        description: Runner type for the build
+        default: linux.12xlarge
+jobs:
+  build:
+    runs-on: ${{ inputs.runner }}
+    container:
+      image: ${{ inputs.dev-image }}
+      options: "--gpus all --shm-size 16g"
+    env:
+      _GLIBCXX_USE_CXX11_ABI: 0
+    steps:
+      # See https://github.com/actions/checkout/issues/1014#issuecomment-1906802802
+      - name: Clean up workspace
+        run: |
+          ls -la
+          rm -rvf ${GITHUB_WORKSPACE}/*
+      - name: Setup CUDA environment
+        shell: bash
+        run: |
+          echo "PATH=$PATH:/usr/local/cuda-12.1/bin" >> $GITHUB_ENV
+          echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.1/lib64" >> $GITHUB_ENV
+      - name: Check GPU
+        run: nvidia-smi
+      - name: Checkout PyTorch Repo
+        uses: actions/checkout@v4
+        with:
+          repository: pytorch/pytorch
+          path: pytorch
+          ref: ${{ inputs.torch-commit }}
+          submodules: recursive
+      - name: Build
+        shell: bash
+        run: |
+          cd pytorch
+          USE_CUDA=1 python setup.py bdist_wheel
+      - name: Upload wheel
+        uses: actions/upload-artifact@v4
+        with:
+          name: torch-with-cuda
+          path: pytorch/dist/*.whl
diff --git a/.github/workflows/_build_torch_xla.yml b/.github/workflows/_build_torch_xla.yml
index 9113c7fe4da..56e6b5408c3 100644
--- a/.github/workflows/_build_torch_xla.yml
+++ b/.github/workflows/_build_torch_xla.yml
@@ -6,6 +6,10 @@ on:
         required: true
         type: string
         description: Base image for builds
+      torch-commit:
+        required: true
+        type: string
+        description: PyTorch commit to check out
       runner:
         required: false
         type: string
@@ -42,6 +46,7 @@ jobs:
         with:
          repository: pytorch/pytorch
          path: pytorch
+         ref: ${{ inputs.torch-commit }}
          submodules: recursive
       - name: Checkout PyTorch/XLA Repo
         uses: actions/checkout@v4
diff --git a/.github/workflows/_get_torch_commit.yml b/.github/workflows/_get_torch_commit.yml
new file mode 100644
index 00000000000..debaecd8194
--- /dev/null
+++ b/.github/workflows/_get_torch_commit.yml
@@ -0,0 +1,32 @@
+name: get-torch-commit
+on:
+  workflow_call:
+    outputs:
+      torch_commit:
+        description: "torch commit to be used"
+        value: ${{ jobs.get-commit.outputs.torch_commit }}
+
+jobs:
+  get-commit:
+    runs-on: ubuntu-20.04
+    outputs:
+      torch_commit: ${{ steps.get_torch_commit.outputs.torch_commit }}
+    steps:
+      # See https://github.com/actions/checkout/issues/1014#issuecomment-1906802802
+      - name: Clean up workspace
+        run: |
+          ls -la
+          rm -rvf ${GITHUB_WORKSPACE}/*
+      - name: Checkout PyTorch Repo
+        uses: actions/checkout@v4
+        with:
+          repository: pytorch/pytorch
+          path: pytorch
+          submodules: recursive
+      - id: get_torch_commit
+        name: Get torch commit
+        run: |
+          cd pytorch
+          torch_commit=$(git rev-parse HEAD)
+          echo "torch_commit=$torch_commit" >> "$GITHUB_OUTPUT"
+
diff --git a/.github/workflows/_test_requiring_torch_cuda.yml b/.github/workflows/_test_requiring_torch_cuda.yml
new file mode 100644
index 00000000000..a3e265e557f
--- /dev/null
+++ b/.github/workflows/_test_requiring_torch_cuda.yml
@@ -0,0 +1,109 @@
+name: xla-test-requiring-torch-cuda
+on:
+  workflow_call:
+    inputs:
+      dev-image:
+        required: true
+        type: string
+        description: Base image for builds
+      runner:
+        required: false
+        type: string
+        description: Runner type for the test
+        default: linux.12xlarge
+      collect-coverage:
+        required: false
+        type: boolean
+        description: Set to true to collect coverage information
+        default: false
+      timeout-minutes:
+        required: false
+        type: number
+        default: 30
+        description: |
+          Maximum time, in minutes, that the workflow is allowed to run
+
+jobs:
+  test:
+    runs-on: ${{ inputs.runner }}
+    container:
+      image: ${{ inputs.dev-image }}
+      options: "--gpus all --shm-size 16g"
+    timeout-minutes: ${{ inputs.timeout-minutes }}
+    env:
+      USE_COVERAGE: ${{ inputs.collect-coverage && '1' || '0' }}
+      BAZEL_JOBS: 16
+      BAZEL_REMOTE_CACHE: 1
+    steps:
+      # See https://github.com/actions/checkout/issues/1014#issuecomment-1906802802
+      # TODO: need to find a way to reuse these steps.
+      - name: Clean up workspace
+        run: |
+          ls -la
+          rm -rvf ${GITHUB_WORKSPACE}/*
+      - name: Fetch torch/torch_xla/torchvision wheels
+        uses: actions/download-artifact@v4
+        with:
+          name: torch-xla-wheels
+          path: /tmp/wheels/
+      - name: Remove torch wheel built with CUDA disabled
+        shell: bash
+        run: |
+          rm -rf /tmp/wheels/torch-*
+      - name: Fetch the torch wheel built with CUDA enabled
+        uses: actions/download-artifact@v4
+        with:
+          name: torch-with-cuda
+          path: /tmp/wheels/
+      - name: Fetch CUDA plugin
+        uses: actions/download-artifact@v4
+        with:
+          name: cuda-plugin
+          path: /tmp/wheels/
+      - name: Setup CUDA environment
+        shell: bash
+        run: |
+          echo "XLA_REGISTER_INSTALLED_PLUGINS=1" >> $GITHUB_ENV
+
+          echo "PATH=$PATH:/usr/local/cuda-12.1/bin" >> $GITHUB_ENV
+          echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.1/lib64" >> $GITHUB_ENV
+      - name: Check GPU
+        run: nvidia-smi
+      - name: Install wheels
+        shell: bash
+        run: |
+          pip install /tmp/wheels/*.whl
+          # TODO: Add these in setup.py
+          pip install fsspec
+          pip install rich
+
+          echo "Import check..."
+          python -c "import torch, torch_xla, torchvision"
+          echo "Import check done."
+          echo "Check if CUDA is available for PyTorch..."
+          python -c "import torch; assert torch.cuda.is_available()"
+          echo "CUDA is available for PyTorch."
+      - name: Record PyTorch commit
+        run: |
+          # Don't just pipe output in shell because imports may do extra logging
+          python -c "
+          import torch_xla.version
+          with open('$GITHUB_ENV', 'a') as f:
+            f.write(f'PYTORCH_COMMIT={torch_xla.version.__torch_gitrev__}\n')
+          "
+      - name: Checkout PyTorch Repo
+        uses: actions/checkout@v4
+        with:
+          repository: pytorch/pytorch
+          path: pytorch
+          ref: ${{ env.PYTORCH_COMMIT }}
+      - name: Checkout PyTorch/XLA Repo
+        uses: actions/checkout@v4
+        with:
+          path: pytorch/xla
+      - name: Test
+        shell: bash
+        run: |
+          set -xue
+          PJRT_DEVICE=CUDA python pytorch/xla/test/test_operations.py -v
+          PJRT_DEVICE=CUDA python pytorch/xla/test/dynamo/test_dynamo.py -v
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 60a2eda44cd..1a924f65036 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -20,14 +20,30 @@ concurrency:
 
 jobs:
+  get-torch-commit:
+    name: "Get torch commit"
+    uses: ./.github/workflows/_get_torch_commit.yml
+
   build-torch-xla:
     name: "Build PyTorch/XLA"
     uses: ./.github/workflows/_build_torch_xla.yml
+    needs: get-torch-commit
     with:
       dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_tpuvm
+      torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
     secrets:
       gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
+  build-torch-with-cuda:
+    name: "Build PyTorch with CUDA"
+    uses: ./.github/workflows/_build_torch_with_cuda.yml
+    needs: get-torch-commit
+    with:
+      # Note: building a torch wheel with CUDA enabled does not require a GPU runner.
+      dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
+      torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
+      runner: linux.8xlarge.nvidia.gpu
+
   build-cuda-plugin:
     name: "Build XLA CUDA plugin"
     uses: ./.github/workflows/_build_plugin.yml
@@ -60,6 +76,16 @@ jobs:
     secrets:
       gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
+
+  test-cuda-with-pytorch-cuda-enabled:
+    name: "GPU tests requiring torch CUDA"
+    uses: ./.github/workflows/_test_requiring_torch_cuda.yml
+    needs: [build-torch-with-cuda, build-torch-xla, build-cuda-plugin]
+    with:
+      dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
+      runner: linux.8xlarge.nvidia.gpu
+      timeout-minutes: 300
+      collect-coverage: false
+
   test-tpu:
     name: "TPU tests"
     uses: ./.github/workflows/_tpu_ci.yml
diff --git a/test/test_operations.py b/test/test_operations.py
index e1dad566536..b4a1838a5a5 100644
--- a/test/test_operations.py
+++ b/test/test_operations.py
@@ -2684,13 +2684,13 @@ def test_dlpack_non_default_layout(self):
     t1 = cuda_t.t()
     xla_t1 = xdlpack.from_dlpack(t1.__dlpack__())
     self.assertEqual(xla_t1.device.type, 'xla')
-    self.assertEqual(xla_t1.device.index, 0)
+    self.assertEqual(xla_t1.device.index, t1.device.index)
     self.assertTrue(torch.allclose(t1.cpu(), xla_t1.cpu()))
 
     t2 = cuda_t[0]
     xla_t2 = xdlpack.from_dlpack(t2.__dlpack__())
     self.assertEqual(xla_t2.device.type, 'xla')
-    self.assertEqual(xla_t2.device.index, 0)
+    self.assertEqual(xla_t2.device.index, t2.device.index)
     self.assertTrue(torch.allclose(t2.cpu(), xla_t2.cpu()))
 
     t3 = cuda_t[:, 0]
@@ -2702,13 +2702,13 @@ def test_dlpack_non_default_layout(self):
     t4 = cuda_t[1, :]
     xla_t4 = xdlpack.from_dlpack(t4.__dlpack__())
     self.assertEqual(xla_t4.device.type, 'xla')
-    self.assertEqual(xla_t4.device.index, 0)
+    self.assertEqual(xla_t4.device.index, t4.device.index)
     self.assertTrue(torch.allclose(t4.cpu(), xla_t4.cpu()))
 
     t5 = cuda_t[1]
     xla_t5 = xdlpack.from_dlpack(t5.__dlpack__())
     self.assertEqual(xla_t5.device.type, 'xla')
-    self.assertEqual(xla_t5.device.index, 0)
+    self.assertEqual(xla_t5.device.index, t5.device.index)
     self.assertTrue(torch.allclose(t5.cpu(), xla_t5.cpu()))
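
The test changes above encode the expectation that importing a CUDA tensor via DLPack places the result on the XLA device whose index matches the source CUDA ordinal, rather than always on device 0. Below is a minimal sketch of that round trip, assuming a host with at least two CUDA devices and the `torch_xla.utils.dlpack` helpers imported as `xdlpack` in `test_operations.py`:

```python
# Minimal sketch, assuming >= 2 CUDA devices and the torch_xla.utils.dlpack
# helpers exercised by the tests above (import path is an assumption here).
import torch
import torch_xla.utils.dlpack as xdlpack

# Place the source tensor on a non-default CUDA device.
cuda_t = torch.arange(12, device='cuda:1', dtype=torch.float32).reshape(3, 4)

# A transposed view has a non-default layout; __dlpack__() still exports it.
t1 = cuda_t.t()
xla_t1 = xdlpack.from_dlpack(t1.__dlpack__())

# The imported tensor should land on the XLA device mirroring the *source*
# CUDA ordinal (here 1) -- which is why the assertions now compare against
# t1.device.index instead of a hard-coded 0.
assert xla_t1.device.type == 'xla'
assert xla_t1.device.index == t1.device.index
assert torch.allclose(t1.cpu(), xla_t1.cpu())
```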