Skip to content

Commit

Permalink
Merge branch 'master' into wcromar/tx2-simplify-setup
Browse files Browse the repository at this point in the history
  • Loading branch information
qihqi authored Apr 24, 2024
2 parents e7ca0a2 + 6fd448d commit f731f15
Show file tree
Hide file tree
Showing 95 changed files with 2,776 additions and 299 deletions.
30 changes: 30 additions & 0 deletions .devcontainer/gpu-internal/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{
"name": "gpu-internal",
"image": "us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.8_cuda_12.1",
"runArgs": [
"--gpus=all",
"--net=host",
"--shm-size=16G"
],
"containerEnv": {
"BAZEL_REMOTE_CACHE": "1",
"SILO_NAME": "cache-silo-${localEnv:USER}-gpuvm"
},
"initializeCommand": "docker pull us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.8_cuda_12.1",
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"ms-vscode.cpptools-themes",
"BazelBuild.vscode-bazel",
"DevonDCarew.bazel-code",
"StackBuild.bazel-stack-vscode",
"StackBuild.bazel-stack-vscode-cc",
"xaver.clang-format",
"ryanluker.vscode-coverage-gutters",
"ms-azuretools.vscode-docker",
"ms-python.python"
]
}
}
}
47 changes: 47 additions & 0 deletions .github/workflows/_build_plugin.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
name: build-cuda-plugin
on:
workflow_call:
inputs:
dev-image:
required: true
type: string
description: Base image for builds
runner:
required: false
type: string
description: Runner type for the test
default: linux.12xlarge

secrets:
gcloud-service-key:
required: true
description: Secret to access Bazel build cache
jobs:
build:
runs-on: ${{ inputs.runner }}
container:
image: ${{ inputs.dev-image }}
env:
GCLOUD_SERVICE_KEY: ${{ secrets.gcloud-service-key }}
GOOGLE_APPLICATION_CREDENTIALS: /tmp/default_credentials.json
BAZEL_JOBS: 16
BAZEL_REMOTE_CACHE: 1
steps:
- name: Setup gcloud
shell: bash
run: |
echo "${GCLOUD_SERVICE_KEY}" > $GOOGLE_APPLICATION_CREDENTIALS
- name: Checkout repo
uses: actions/checkout@v4
with:
path: pytorch/xla
- name: Build
shell: bash
run: |
cd pytorch/xla/infra/ansible
ansible-playbook playbook.yaml -vvv -e "stage=build_plugin arch=amd64 accelerator=cuda cuda_compute_capabilities=5.2,7.5 src_root=${GITHUB_WORKSPACE} cache_suffix=-ci" --skip-tags=fetch_srcs,install_deps
- name: Upload wheel
uses: actions/upload-artifact@v4
with:
name: cuda-plugin
path: /dist/*.whl
55 changes: 55 additions & 0 deletions .github/workflows/_build_torch_xla.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
name: build-cuda-plugin
on:
workflow_call:
inputs:
dev-image:
required: true
type: string
description: Base image for builds
runner:
required: false
type: string
description: Runner type for the test
default: linux.12xlarge

secrets:
gcloud-service-key:
required: true
description: Secret to access Bazel build cache
jobs:
build:
runs-on: ${{ inputs.runner }}
container:
image: ${{ inputs.dev-image }}
env:
GCLOUD_SERVICE_KEY: ${{ secrets.gcloud-service-key }}
GOOGLE_APPLICATION_CREDENTIALS: /tmp/default_credentials.json
BAZEL_JOBS: 16
BAZEL_REMOTE_CACHE: 1
# BUILD_CPP_TESTS: 1
steps:
- name: Setup gcloud
shell: bash
run: |
echo "${GCLOUD_SERVICE_KEY}" > $GOOGLE_APPLICATION_CREDENTIALS
- name: Checkout PyTorch Repo
uses: actions/checkout@v4
with:
repository: pytorch/pytorch
path: pytorch
submodules: recursive
# TODO: correct pin
- name: Checkout PyTorch/XLA Repo
uses: actions/checkout@v4
with:
path: pytorch/xla
- name: Build
shell: bash
run: |
cd pytorch/xla/infra/ansible
ansible-playbook playbook.yaml -vvv -e "stage=build arch=amd64 accelerator=tpu src_root=${GITHUB_WORKSPACE} bundle_libtpu=0 cache_suffix=-ci" --skip-tags=fetch_srcs,install_deps
- name: Upload wheel
uses: actions/upload-artifact@v4
with:
name: torch-xla-wheels
path: /dist/*.whl
15 changes: 0 additions & 15 deletions .github/workflows/_test.yml → .github/workflows/_test_cpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,17 +45,8 @@ jobs:
matrix:
include:
# Use readable strings as they define the workflow titles.
- run_benchmark_tests: 'benchmark_tests'
- run_cpp_tests1: 'cpp_tests1'
- run_cpp_tests2: 'cpp_tests2'
- run_python_tests: 'python_tests'
run_xla_op_tests1: 'xla_op1'
- run_python_tests: 'python_tests'
run_xla_op_tests2: 'xla_op2'
- run_python_tests: 'python_tests'
run_xla_op_tests3: 'xla_op3'
- run_python_tests: 'python_tests'
run_torch_mp_op_tests: 'torch_mp_op'
timeout-minutes: ${{ inputs.timeout-minutes }}
env:
DOCKER_IMAGE: ${{ inputs.docker-image }}
Expand All @@ -64,14 +55,8 @@ jobs:
USE_COVERAGE: ${{ inputs.collect-coverage && '1' || '0' }}
XLA_SKIP_TORCH_OP_TESTS: ${{ inputs.disable-pjrt }}
XLA_SKIP_MP_OP_TESTS: ${{ inputs.disable-pjrt }}
RUN_BENCHMARK_TESTS: ${{ matrix.run_benchmark_tests }}
RUN_CPP_TESTS1: ${{ matrix.run_cpp_tests1 }}
RUN_CPP_TESTS2: ${{ matrix.run_cpp_tests2 }}
RUN_PYTHON_TESTS: ${{ matrix.run_python_tests }}
RUN_XLA_OP_TESTS1: ${{ matrix.run_xla_op_tests1 }}
RUN_XLA_OP_TESTS2: ${{ matrix.run_xla_op_tests2 }}
RUN_XLA_OP_TESTS3: ${{ matrix.run_xla_op_tests3 }}
RUN_TORCH_MP_OP_TESTS: ${{ matrix.run_torch_mp_op_tests }}
steps:
- name: Setup Linux
uses: pytorch/test-infra/.github/actions/setup-linux@main
Expand Down
176 changes: 176 additions & 0 deletions .github/workflows/_test_python.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
name: xla-test
on:
workflow_call:
inputs:
dev-image:
required: true
type: string
description: Base image for builds
runner:
required: false
type: string
description: Runner type for the test
default: linux.12xlarge
collect-coverage:
required: false
type: boolean
description: Set to true to collect coverage information
default: false
timeout-minutes:
required: false
type: number
default: 270
description: |
Set the maximum (in minutes) how long the workflow should take to finish
timeout-minutes:
install-cuda-plugin:
required: false
type: boolean
default: false
description: Whether to install CUDA plugin package

secrets:
gcloud-service-key:
required: true
description: Secret to access Bazel build cache
jobs:
test:
runs-on: ${{ inputs.runner }}
container:
image: ${{ inputs.dev-image }}
options: "${{ inputs.install-cuda-plugin && '--gpus all' || '' }} --shm-size 16g"
strategy:
fail-fast: false
matrix:
include:
# Use readable strings as they define the workflow titles.
- run_benchmark_tests: 'benchmark_tests'
- run_python_tests: 'python_tests'
run_xla_op_tests1: 'xla_op1'
- run_python_tests: 'python_tests'
run_xla_op_tests2: 'xla_op2'
- run_python_tests: 'python_tests'
run_xla_op_tests3: 'xla_op3'
- run_python_tests: 'python_tests'
run_torch_mp_op_tests: 'torch_mp_op'
timeout-minutes: ${{ inputs.timeout-minutes }}
env:
GCLOUD_SERVICE_KEY: ${{ secrets.gcloud-service-key }}
GOOGLE_APPLICATION_CREDENTIALS: /tmp/default_credentials.json
USE_COVERAGE: ${{ inputs.collect-coverage && '1' || '0' }}
RUN_BENCHMARK_TESTS: ${{ matrix.run_benchmark_tests }}
RUN_PYTHON_TESTS: ${{ matrix.run_python_tests }}
RUN_XLA_OP_TESTS1: ${{ matrix.run_xla_op_tests1 }}
RUN_XLA_OP_TESTS2: ${{ matrix.run_xla_op_tests2 }}
RUN_XLA_OP_TESTS3: ${{ matrix.run_xla_op_tests3 }}
RUN_TORCH_MP_OP_TESTS: ${{ matrix.run_torch_mp_op_tests }}
BAZEL_JOBS: 16
BAZEL_REMOTE_CACHE: 1
steps:
- name: Setup gcloud
shell: bash
run: |
echo "${GCLOUD_SERVICE_KEY}" > $GOOGLE_APPLICATION_CREDENTIALS
- name: Fetch wheels
uses: actions/download-artifact@v4
with:
name: torch-xla-wheels
path: /tmp/wheels/
- name: Fetch CUDA plugin
uses: actions/download-artifact@v4
with:
name: cuda-plugin
path: /tmp/wheels/
if: ${{ inputs.install-cuda-plugin }}
- name: Setup CUDA environment
shell: bash
run: |
# TODO: Make PJRT_DEVICE=CPU work with XLA_REGISTER_INSTALLED_PLUGINS=1
echo "XLA_REGISTER_INSTALLED_PLUGINS=1" >> $GITHUB_ENV
echo "PATH=$PATH:/usr/local/cuda-12.1/bin" >> $GITHUB_ENV
echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.1/lib64" >> $GITHUB_ENV
if: ${{ inputs.install-cuda-plugin }}
- name: Check GPU
run: nvidia-smi
if: ${{ inputs.install-cuda-plugin }}
- name: Install wheels
shell: bash
run: |
pip install /tmp/wheels/*.whl
# TODO: Add these in setup.py
pip install fsspec
pip install rich
- name: Record PyTorch commit
run: echo "PYTORCH_COMMIT=$(python -c 'import torch_xla.version; print(torch_xla.version.__torch_gitrev__)')" >> $GITHUB_ENV
- name: Checkout PyTorch Repo
uses: actions/checkout@v4
with:
repository: pytorch/pytorch
path: pytorch
ref: ${{ env.PYTORCH_COMMIT }}
- name: Checkout PyTorch/XLA Repo
uses: actions/checkout@v4
with:
path: pytorch/xla
- name: Extra CI deps
shell: bash
run: |
set -x
pip install expecttest unittest-xml-reporting
if [[ ! -z "$RUN_BENCHMARK_TESTS" ]]; then
pip install -r pytorch/xla/benchmarks/requirements.txt
fi
- name: Test
shell: bash
run: |
source pytorch/xla/.circleci/common.sh
run_torch_xla_tests pytorch/ pytorch/xla/ $USE_COVERAGE
- name: Upload coverage results
if: ${{ inputs.collect-coverage }}
shell: bash
env:
CIRCLE_WORKFLOW_ID: ${{ github.run_id }}
CIRCLE_BUILD_NUM: ${{ github.run_number }}
BENCHMARK_TEST_NAME: ${{ env.RUN_BENCHMARK_TESTS }}
PYTHON_TEST_NAME: ${{ env.RUN_PYTHON_TESTS }}${{ env.RUN_XLA_OP_TESTS1 }}${{ env.RUN_XLA_OP_TESTS2 }}${{ env.RUN_XLA_OP_TESTS3 }}${{ env.RUN_TORCH_MP_OP_TESTS }}
CPP_TEST_NAME: ${{ env.RUN_CPP_TESTS1 }}${{ env.RUN_CPP_TESTS2 }}
run: |
# TODO(yeounoh) collect coverage report as needed.
if [ -n "${BENCHMARK_TEST_NAME}" ]; then
exit 0
fi
docker cp "${pid}":/home/jenkins/htmlcov "${GITHUB_WORKSPACE}"
if [ -n "${GPU_FLAG:-}" ]; then
if [ -n "${PYTHON_TEST_NAME}" ]; then
gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_python_coverage_${PYTHON_TEST_NAME}.out
gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_python_coverage_${PYTHON_TEST_NAME}.out
fi
if [ -n "${CPP_TEST_NAME}" ]; then
gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_cpp_coverage_${CPP_TEST_NAME}.out
gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_cpp_coverage_${CPP_TEST_NAME}.out
fi
else
if [ -n "${PYTHON_TEST_NAME}" ]; then
gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_python_coverage_${PYTHON_TEST_NAME}.out
gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_python_coverage_${PYTHON_TEST_NAME}.out
fi
if [ -n "${CPP_TEST_NAME}" ]; then
gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_cpp_coverage_${CPP_TEST_NAME}.out
gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_cpp_coverage_${CPP_TEST_NAME}.out
fi
if [ "${CPP_TEST_NAME}" == "cpp_tests1" ]; then
ABS_METADATA='{"host": "github", "project": "pytorchxla", "trace_type": "LCOV", "commit_id": '\"${GITHUB_SHA}\"', "ref": "HEAD", "source": "https://github.com/pytorch/xla", "owner": "cloud-tpu-pt-dev", "bug_component": "587012"}'
echo $ABS_METADATA > abs_metadata.json
gsutil cp abs_metadata.json gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/metadata.json
INC_METADATA='{"host": "github", "project": "pytorchxla", "trace_type": "LCOV", "patchset_num": 1, "change_id": '${CIRCLE_BUILD_NUM}', "owner": "cloud-tpu-pt-dev", "bug_component": "587012"}'
echo $INC_METADATA > inc_metadata.json
gsutil cp inc_metadata.json gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/metadata.json
fi
fi
35 changes: 35 additions & 0 deletions .github/workflows/_tpu_ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
name: TPU Integration Test
on:
workflow_call:
jobs:
tpu-test:
runs-on: v4-runner-set
steps:
- name: Checkout repo
uses: actions/checkout@v4
with:
path: pytorch/xla
- name: Fetch wheels
uses: actions/download-artifact@v4
with:
name: torch-xla-wheels
path: /tmp/wheels/
- name: Install wheels
shell: bash
run: |
pip install /tmp/wheels/*.whl
- name: Install test dependencies
shell: bash
run: |
# TODO: Add these in setup.py
pip install fsspec
pip install rich
# Jax nightly is needed for pallas tests.
pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
- name: Run Tests
env:
PJRT_DEVICE: TPU
run: |
cd pytorch/xla
test/tpu/run_tests.sh
Loading

0 comments on commit f731f15

Please sign in to comment.