Merge branch 'master' into wcromar/tx2-simplify-setup

pytorch · Apr 24, 2024 · f731f15 · f731f15
2 parents e7ca0a2 + 6fd448d
commit f731f15
Show file tree

Hide file tree

Showing 95 changed files with 2,776 additions and 299 deletions.
diff --git a/.devcontainer/gpu-internal/devcontainer.json b/.devcontainer/gpu-internal/devcontainer.json
@@ -0,0 +1,30 @@
+{
+  "name": "gpu-internal",
+  "image": "us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.8_cuda_12.1",
+  "runArgs": [
+    "--gpus=all",
+    "--net=host",
+    "--shm-size=16G"
+  ],
+  "containerEnv": {
+    "BAZEL_REMOTE_CACHE": "1",
+    "SILO_NAME": "cache-silo-${localEnv:USER}-gpuvm"
+  },
+  "initializeCommand": "docker pull us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.8_cuda_12.1",
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "ms-vscode.cpptools-themes",
+        "BazelBuild.vscode-bazel",
+        "DevonDCarew.bazel-code",
+        "StackBuild.bazel-stack-vscode",
+        "StackBuild.bazel-stack-vscode-cc",
+        "xaver.clang-format",
+        "ryanluker.vscode-coverage-gutters",
+        "ms-azuretools.vscode-docker",
+        "ms-python.python"
+      ]
+    }
+  }
+}
diff --git a/.github/workflows/_build_plugin.yml b/.github/workflows/_build_plugin.yml
@@ -0,0 +1,47 @@
+name: build-cuda-plugin
+on:
+  workflow_call:
+    inputs:
+      dev-image:
+        required: true
+        type: string
+        description: Base image for builds
+      runner:
+        required: false
+        type: string
+        description: Runner type for the test
+        default: linux.12xlarge
+
+    secrets:
+      gcloud-service-key:
+        required: true
+        description: Secret to access Bazel build cache
+jobs:
+  build:
+    runs-on: ${{ inputs.runner }}
+    container:
+      image: ${{ inputs.dev-image }}
+    env:
+      GCLOUD_SERVICE_KEY: ${{ secrets.gcloud-service-key }}
+      GOOGLE_APPLICATION_CREDENTIALS: /tmp/default_credentials.json
+      BAZEL_JOBS: 16
+      BAZEL_REMOTE_CACHE: 1
+    steps:
+      - name: Setup gcloud
+        shell: bash
+        run: |
+          echo "${GCLOUD_SERVICE_KEY}" > $GOOGLE_APPLICATION_CREDENTIALS
+      - name: Checkout repo
+        uses: actions/checkout@v4
+        with:
+          path: pytorch/xla
+      - name: Build
+        shell: bash
+        run: |
+          cd pytorch/xla/infra/ansible
+          ansible-playbook playbook.yaml -vvv -e "stage=build_plugin arch=amd64 accelerator=cuda cuda_compute_capabilities=5.2,7.5 src_root=${GITHUB_WORKSPACE} cache_suffix=-ci" --skip-tags=fetch_srcs,install_deps
+      - name: Upload wheel
+        uses: actions/upload-artifact@v4
+        with:
+          name: cuda-plugin
+          path: /dist/*.whl
diff --git a/.github/workflows/_build_torch_xla.yml b/.github/workflows/_build_torch_xla.yml
@@ -0,0 +1,55 @@
+name: build-cuda-plugin
+on:
+  workflow_call:
+    inputs:
+      dev-image:
+        required: true
+        type: string
+        description: Base image for builds
+      runner:
+        required: false
+        type: string
+        description: Runner type for the test
+        default: linux.12xlarge
+
+    secrets:
+      gcloud-service-key:
+        required: true
+        description: Secret to access Bazel build cache
+jobs:
+  build:
+    runs-on: ${{ inputs.runner }}
+    container:
+      image: ${{ inputs.dev-image }}
+    env:
+      GCLOUD_SERVICE_KEY: ${{ secrets.gcloud-service-key }}
+      GOOGLE_APPLICATION_CREDENTIALS: /tmp/default_credentials.json
+      BAZEL_JOBS: 16
+      BAZEL_REMOTE_CACHE: 1
+      # BUILD_CPP_TESTS: 1
+    steps:
+      - name: Setup gcloud
+        shell: bash
+        run: |
+          echo "${GCLOUD_SERVICE_KEY}" > $GOOGLE_APPLICATION_CREDENTIALS
+      - name: Checkout PyTorch Repo
+        uses: actions/checkout@v4
+        with:
+          repository: pytorch/pytorch
+          path: pytorch
+          submodules: recursive
+          # TODO: correct pin
+      - name: Checkout PyTorch/XLA Repo
+        uses: actions/checkout@v4
+        with:
+          path: pytorch/xla
+      - name: Build
+        shell: bash
+        run: |
+          cd pytorch/xla/infra/ansible
+          ansible-playbook playbook.yaml -vvv -e "stage=build arch=amd64 accelerator=tpu src_root=${GITHUB_WORKSPACE} bundle_libtpu=0 cache_suffix=-ci" --skip-tags=fetch_srcs,install_deps
+      - name: Upload wheel
+        uses: actions/upload-artifact@v4
+        with:
+          name: torch-xla-wheels
+          path: /dist/*.whl
diff --git a/.github/workflows/_test.yml → .github/workflows/_test_cpp.yml b/.github/workflows/_test.yml → .github/workflows/_test_cpp.yml
@@ -45,17 +45,8 @@ jobs:
       matrix:
         include:
           # Use readable strings as they define the workflow titles.
-          - run_benchmark_tests: 'benchmark_tests'
           - run_cpp_tests1: 'cpp_tests1'
           - run_cpp_tests2: 'cpp_tests2'
-          - run_python_tests: 'python_tests'
-            run_xla_op_tests1: 'xla_op1'
-          - run_python_tests: 'python_tests'
-            run_xla_op_tests2: 'xla_op2'
-          - run_python_tests: 'python_tests'
-            run_xla_op_tests3: 'xla_op3'
-          - run_python_tests: 'python_tests'
-            run_torch_mp_op_tests: 'torch_mp_op'
     timeout-minutes: ${{ inputs.timeout-minutes }}
     env:
       DOCKER_IMAGE: ${{ inputs.docker-image }}
@@ -64,14 +55,8 @@ jobs:
       USE_COVERAGE: ${{ inputs.collect-coverage && '1' || '0' }}
       XLA_SKIP_TORCH_OP_TESTS: ${{ inputs.disable-pjrt }}
       XLA_SKIP_MP_OP_TESTS: ${{ inputs.disable-pjrt }}
-      RUN_BENCHMARK_TESTS: ${{ matrix.run_benchmark_tests }}
       RUN_CPP_TESTS1: ${{ matrix.run_cpp_tests1 }}
       RUN_CPP_TESTS2: ${{ matrix.run_cpp_tests2 }}
-      RUN_PYTHON_TESTS: ${{ matrix.run_python_tests }}
-      RUN_XLA_OP_TESTS1: ${{ matrix.run_xla_op_tests1 }}
-      RUN_XLA_OP_TESTS2: ${{ matrix.run_xla_op_tests2 }}
-      RUN_XLA_OP_TESTS3: ${{ matrix.run_xla_op_tests3 }}
-      RUN_TORCH_MP_OP_TESTS: ${{ matrix.run_torch_mp_op_tests }}
     steps:
       - name: Setup Linux
         uses: pytorch/test-infra/.github/actions/setup-linux@main

diff --git a/.github/workflows/_test_python.yml b/.github/workflows/_test_python.yml
@@ -0,0 +1,176 @@
+name: xla-test
+on:
+  workflow_call:
+    inputs:
+      dev-image:
+        required: true
+        type: string
+        description: Base image for builds
+      runner:
+        required: false
+        type: string
+        description: Runner type for the test
+        default: linux.12xlarge
+      collect-coverage:
+        required: false
+        type: boolean
+        description: Set to true to collect coverage information
+        default: false
+      timeout-minutes:
+        required: false
+        type: number
+        default: 270
+        description: |
+          Set the maximum (in minutes) how long the workflow should take to finish
+            timeout-minutes:
+      install-cuda-plugin:
+        required: false
+        type: boolean
+        default: false
+        description: Whether to install CUDA plugin package
+
+    secrets:
+      gcloud-service-key:
+        required: true
+        description: Secret to access Bazel build cache
+jobs:
+  test:
+    runs-on: ${{ inputs.runner }}
+    container:
+      image: ${{ inputs.dev-image }}
+      options: "${{ inputs.install-cuda-plugin && '--gpus all' || '' }} --shm-size 16g"
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          # Use readable strings as they define the workflow titles.
+          - run_benchmark_tests: 'benchmark_tests'
+          - run_python_tests: 'python_tests'
+            run_xla_op_tests1: 'xla_op1'
+          - run_python_tests: 'python_tests'
+            run_xla_op_tests2: 'xla_op2'
+          - run_python_tests: 'python_tests'
+            run_xla_op_tests3: 'xla_op3'
+          - run_python_tests: 'python_tests'
+            run_torch_mp_op_tests: 'torch_mp_op'
+    timeout-minutes: ${{ inputs.timeout-minutes }}
+    env:
+      GCLOUD_SERVICE_KEY: ${{ secrets.gcloud-service-key }}
+      GOOGLE_APPLICATION_CREDENTIALS: /tmp/default_credentials.json
+      USE_COVERAGE: ${{ inputs.collect-coverage && '1' || '0' }}
+      RUN_BENCHMARK_TESTS: ${{ matrix.run_benchmark_tests }}
+      RUN_PYTHON_TESTS: ${{ matrix.run_python_tests }}
+      RUN_XLA_OP_TESTS1: ${{ matrix.run_xla_op_tests1 }}
+      RUN_XLA_OP_TESTS2: ${{ matrix.run_xla_op_tests2 }}
+      RUN_XLA_OP_TESTS3: ${{ matrix.run_xla_op_tests3 }}
+      RUN_TORCH_MP_OP_TESTS: ${{ matrix.run_torch_mp_op_tests }}
+      BAZEL_JOBS: 16
+      BAZEL_REMOTE_CACHE: 1
+    steps:
+      - name: Setup gcloud
+        shell: bash
+        run: |
+          echo "${GCLOUD_SERVICE_KEY}" > $GOOGLE_APPLICATION_CREDENTIALS
+      - name: Fetch wheels
+        uses: actions/download-artifact@v4
+        with:
+          name: torch-xla-wheels
+          path: /tmp/wheels/
+      - name: Fetch CUDA plugin
+        uses: actions/download-artifact@v4
+        with:
+          name: cuda-plugin
+          path: /tmp/wheels/
+        if: ${{ inputs.install-cuda-plugin }}
+      - name: Setup CUDA environment
+        shell: bash
+        run: |
+          # TODO: Make PJRT_DEVICE=CPU work with XLA_REGISTER_INSTALLED_PLUGINS=1
+          echo "XLA_REGISTER_INSTALLED_PLUGINS=1" >> $GITHUB_ENV
+
+          echo "PATH=$PATH:/usr/local/cuda-12.1/bin" >> $GITHUB_ENV
+          echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.1/lib64" >> $GITHUB_ENV
+        if: ${{ inputs.install-cuda-plugin }}
+      - name: Check GPU
+        run: nvidia-smi
+        if: ${{ inputs.install-cuda-plugin }}
+      - name: Install wheels
+        shell: bash
+        run: |
+          pip install /tmp/wheels/*.whl
+          # TODO: Add these in setup.py
+          pip install fsspec
+          pip install rich
+      - name: Record PyTorch commit
+        run: echo "PYTORCH_COMMIT=$(python -c 'import torch_xla.version; print(torch_xla.version.__torch_gitrev__)')" >> $GITHUB_ENV
+      - name: Checkout PyTorch Repo
+        uses: actions/checkout@v4
+        with:
+          repository: pytorch/pytorch
+          path: pytorch
+          ref: ${{ env.PYTORCH_COMMIT }}
+      - name: Checkout PyTorch/XLA Repo
+        uses: actions/checkout@v4
+        with:
+          path: pytorch/xla
+      - name: Extra CI deps
+        shell: bash
+        run: |
+          set -x
+
+          pip install expecttest unittest-xml-reporting
+
+          if [[ ! -z "$RUN_BENCHMARK_TESTS" ]]; then
+            pip install -r pytorch/xla/benchmarks/requirements.txt
+          fi
+      - name: Test
+        shell: bash
+        run: |
+          source pytorch/xla/.circleci/common.sh
+
+          run_torch_xla_tests pytorch/ pytorch/xla/ $USE_COVERAGE
+      - name: Upload coverage results
+        if: ${{ inputs.collect-coverage }}
+        shell: bash
+        env:
+          CIRCLE_WORKFLOW_ID: ${{ github.run_id }}
+          CIRCLE_BUILD_NUM: ${{ github.run_number }}
+          BENCHMARK_TEST_NAME: ${{ env.RUN_BENCHMARK_TESTS }}
+          PYTHON_TEST_NAME: ${{ env.RUN_PYTHON_TESTS }}${{ env.RUN_XLA_OP_TESTS1 }}${{ env.RUN_XLA_OP_TESTS2 }}${{ env.RUN_XLA_OP_TESTS3 }}${{ env.RUN_TORCH_MP_OP_TESTS }}
+          CPP_TEST_NAME: ${{ env.RUN_CPP_TESTS1 }}${{ env.RUN_CPP_TESTS2 }}
+        run: |
+            # TODO(yeounoh) collect coverage report as needed.
+            if [ -n "${BENCHMARK_TEST_NAME}" ]; then
+                exit 0
+            fi
+            docker cp "${pid}":/home/jenkins/htmlcov "${GITHUB_WORKSPACE}"
+            if [ -n "${GPU_FLAG:-}" ]; then
+              if [ -n "${PYTHON_TEST_NAME}" ]; then
+                gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_python_coverage_${PYTHON_TEST_NAME}.out
+                gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_python_coverage_${PYTHON_TEST_NAME}.out
+              fi
+              if [ -n "${CPP_TEST_NAME}" ]; then
+                gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_cpp_coverage_${CPP_TEST_NAME}.out
+                gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_cpp_coverage_${CPP_TEST_NAME}.out
+              fi
+            else
+              if [ -n "${PYTHON_TEST_NAME}" ]; then
+                gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_python_coverage_${PYTHON_TEST_NAME}.out
+                gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_python_coverage_${PYTHON_TEST_NAME}.out
+              fi
+
+              if [ -n "${CPP_TEST_NAME}" ]; then
+                gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_cpp_coverage_${CPP_TEST_NAME}.out
+                gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_cpp_coverage_${CPP_TEST_NAME}.out
+              fi
+
+              if [ "${CPP_TEST_NAME}" == "cpp_tests1" ]; then
+                ABS_METADATA='{"host": "github", "project": "pytorchxla", "trace_type": "LCOV", "commit_id": '\"${GITHUB_SHA}\"', "ref": "HEAD", "source": "https://github.com/pytorch/xla", "owner": "cloud-tpu-pt-dev", "bug_component": "587012"}'
+                echo $ABS_METADATA > abs_metadata.json
+                gsutil cp abs_metadata.json gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/metadata.json
+
+                INC_METADATA='{"host": "github", "project": "pytorchxla", "trace_type": "LCOV", "patchset_num": 1, "change_id": '${CIRCLE_BUILD_NUM}', "owner": "cloud-tpu-pt-dev", "bug_component": "587012"}'
+                echo $INC_METADATA > inc_metadata.json
+                gsutil cp inc_metadata.json gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/metadata.json
+              fi
+            fi
diff --git a/.github/workflows/_tpu_ci.yml b/.github/workflows/_tpu_ci.yml
@@ -0,0 +1,35 @@
+name: TPU Integration Test
+on:
+  workflow_call:
+jobs:
+  tpu-test:
+    runs-on: v4-runner-set
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v4
+        with:
+          path: pytorch/xla
+      - name: Fetch wheels
+        uses: actions/download-artifact@v4
+        with:
+          name: torch-xla-wheels
+          path: /tmp/wheels/
+      - name: Install wheels
+        shell: bash
+        run: |
+          pip install /tmp/wheels/*.whl
+      - name: Install test dependencies
+        shell: bash
+        run: |
+          # TODO: Add these in setup.py
+          pip install fsspec
+          pip install rich
+          # Jax nightly is needed for pallas tests.
+          pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
+          pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
+      - name: Run Tests
+        env:
+          PJRT_DEVICE: TPU
+        run: |
+          cd pytorch/xla
+          test/tpu/run_tests.sh