-
Notifications
You must be signed in to change notification settings - Fork 488
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'master' into wcromar/tx2-simplify-setup
- Loading branch information
Showing
95 changed files
with
2,776 additions
and
299 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
{
  "name": "gpu-internal",
  "image": "us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.8_cuda_12.1",
  "initializeCommand": "docker pull us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.8_cuda_12.1",
  "runArgs": ["--gpus=all", "--net=host", "--shm-size=16G"],
  "containerEnv": {
    "BAZEL_REMOTE_CACHE": "1",
    "SILO_NAME": "cache-silo-${localEnv:USER}-gpuvm"
  },
  "customizations": {
    "vscode": {
      "extensions": [
        "llvm-vs-code-extensions.vscode-clangd",
        "ms-vscode.cpptools-themes",
        "BazelBuild.vscode-bazel",
        "DevonDCarew.bazel-code",
        "StackBuild.bazel-stack-vscode",
        "StackBuild.bazel-stack-vscode-cc",
        "xaver.clang-format",
        "ryanluker.vscode-coverage-gutters",
        "ms-azuretools.vscode-docker",
        "ms-python.python"
      ]
    }
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
# Reusable workflow: runs the ansible `build_plugin` stage for the CUDA
# accelerator inside the dev image and uploads the resulting wheel(s) as the
# `cuda-plugin` artifact.
name: build-cuda-plugin

on:
  workflow_call:
    inputs:
      dev-image:
        description: Base image for builds
        type: string
        required: true
      runner:
        description: Runner type for the test
        type: string
        required: false
        default: linux.12xlarge

    secrets:
      gcloud-service-key:
        description: Secret to access Bazel build cache
        required: true

jobs:
  build:
    runs-on: ${{ inputs.runner }}
    container:
      image: ${{ inputs.dev-image }}
    env:
      BAZEL_JOBS: 16
      BAZEL_REMOTE_CACHE: 1
      GCLOUD_SERVICE_KEY: ${{ secrets.gcloud-service-key }}
      GOOGLE_APPLICATION_CREDENTIALS: /tmp/default_credentials.json
    steps:
      # Write the service-account key to the path named by
      # GOOGLE_APPLICATION_CREDENTIALS.
      - name: Setup gcloud
        shell: bash
        run: |
          echo "${GCLOUD_SERVICE_KEY}" > $GOOGLE_APPLICATION_CREDENTIALS

      - name: Checkout repo
        uses: actions/checkout@v4
        with:
          path: pytorch/xla

      # The playbook is run with the fetch_srcs and install_deps tags skipped.
      - name: Build
        shell: bash
        run: |
          cd pytorch/xla/infra/ansible
          ansible-playbook playbook.yaml -vvv -e "stage=build_plugin arch=amd64 accelerator=cuda cuda_compute_capabilities=5.2,7.5 src_root=${GITHUB_WORKSPACE} cache_suffix=-ci" --skip-tags=fetch_srcs,install_deps

      - name: Upload wheel
        uses: actions/upload-artifact@v4
        with:
          name: cuda-plugin
          path: /dist/*.whl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
# Reusable workflow: checks out PyTorch and PyTorch/XLA, runs the ansible
# `build` stage, and uploads the resulting wheel(s) as the `torch-xla-wheels`
# artifact.
#
# The workflow name previously duplicated `build-cuda-plugin`; this workflow
# does not build the CUDA plugin (stage=build, artifact torch-xla-wheels), so
# it is named for what it actually produces.
name: build-torch-xla
on:
  workflow_call:
    inputs:
      dev-image:
        required: true
        type: string
        description: Base image for builds
      runner:
        required: false
        type: string
        description: Runner type for the build
        default: linux.12xlarge

    secrets:
      gcloud-service-key:
        required: true
        description: Secret to access Bazel build cache
jobs:
  build:
    runs-on: ${{ inputs.runner }}
    container:
      image: ${{ inputs.dev-image }}
    env:
      GCLOUD_SERVICE_KEY: ${{ secrets.gcloud-service-key }}
      GOOGLE_APPLICATION_CREDENTIALS: /tmp/default_credentials.json
      BAZEL_JOBS: 16
      BAZEL_REMOTE_CACHE: 1
      # BUILD_CPP_TESTS: 1
    steps:
      # Write the service-account key to the path named by
      # GOOGLE_APPLICATION_CREDENTIALS.
      - name: Setup gcloud
        shell: bash
        run: |
          echo "${GCLOUD_SERVICE_KEY}" > $GOOGLE_APPLICATION_CREDENTIALS
      # PyTorch is checked out at the repository's default ref (no `ref:` given).
      - name: Checkout PyTorch Repo
        uses: actions/checkout@v4
        with:
          repository: pytorch/pytorch
          path: pytorch
          submodules: recursive
          # TODO: correct pin
      # PyTorch/XLA is checked out inside the PyTorch tree at pytorch/xla.
      - name: Checkout PyTorch/XLA Repo
        uses: actions/checkout@v4
        with:
          path: pytorch/xla
      # The playbook is run with the fetch_srcs and install_deps tags skipped.
      - name: Build
        shell: bash
        run: |
          cd pytorch/xla/infra/ansible
          ansible-playbook playbook.yaml -vvv -e "stage=build arch=amd64 accelerator=tpu src_root=${GITHUB_WORKSPACE} bundle_libtpu=0 cache_suffix=-ci" --skip-tags=fetch_srcs,install_deps
      - name: Upload wheel
        uses: actions/upload-artifact@v4
        with:
          name: torch-xla-wheels
          path: /dist/*.whl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,176 @@ | ||
# Reusable workflow: downloads the `torch-xla-wheels` artifact (and optionally
# the `cuda-plugin` artifact), installs them in the dev image, and runs the
# test suite once per matrix entry. Coverage upload is opt-in via the
# `collect-coverage` input.
name: xla-test
on:
  workflow_call:
    inputs:
      dev-image:
        required: true
        type: string
        description: Base image for builds
      runner:
        required: false
        type: string
        description: Runner type for the test
        default: linux.12xlarge
      collect-coverage:
        required: false
        type: boolean
        description: Set to true to collect coverage information
        default: false
      timeout-minutes:
        required: false
        type: number
        default: 270
        description: |
          Set the maximum (in minutes) how long the workflow should take to finish
      install-cuda-plugin:
        required: false
        type: boolean
        default: false
        description: Whether to install CUDA plugin package

    secrets:
      gcloud-service-key:
        required: true
        description: Secret to access Bazel build cache
jobs:
  test:
    runs-on: ${{ inputs.runner }}
    container:
      image: ${{ inputs.dev-image }}
      # GPUs are only passed through when the CUDA plugin is being installed.
      options: "${{ inputs.install-cuda-plugin && '--gpus all' || '' }} --shm-size 16g"
    strategy:
      fail-fast: false
      matrix:
        include:
          # Use readable strings as they define the workflow titles.
          - run_benchmark_tests: 'benchmark_tests'
          - run_python_tests: 'python_tests'
            run_xla_op_tests1: 'xla_op1'
          - run_python_tests: 'python_tests'
            run_xla_op_tests2: 'xla_op2'
          - run_python_tests: 'python_tests'
            run_xla_op_tests3: 'xla_op3'
          - run_python_tests: 'python_tests'
            run_torch_mp_op_tests: 'torch_mp_op'
    timeout-minutes: ${{ inputs.timeout-minutes }}
    env:
      GCLOUD_SERVICE_KEY: ${{ secrets.gcloud-service-key }}
      GOOGLE_APPLICATION_CREDENTIALS: /tmp/default_credentials.json
      USE_COVERAGE: ${{ inputs.collect-coverage && '1' || '0' }}
      # Each RUN_* variable is empty unless the current matrix entry defines
      # the corresponding key.
      RUN_BENCHMARK_TESTS: ${{ matrix.run_benchmark_tests }}
      RUN_PYTHON_TESTS: ${{ matrix.run_python_tests }}
      RUN_XLA_OP_TESTS1: ${{ matrix.run_xla_op_tests1 }}
      RUN_XLA_OP_TESTS2: ${{ matrix.run_xla_op_tests2 }}
      RUN_XLA_OP_TESTS3: ${{ matrix.run_xla_op_tests3 }}
      RUN_TORCH_MP_OP_TESTS: ${{ matrix.run_torch_mp_op_tests }}
      BAZEL_JOBS: 16
      BAZEL_REMOTE_CACHE: 1
    steps:
      # Write the service-account key to the path named by
      # GOOGLE_APPLICATION_CREDENTIALS.
      - name: Setup gcloud
        shell: bash
        run: |
          echo "${GCLOUD_SERVICE_KEY}" > "$GOOGLE_APPLICATION_CREDENTIALS"
      - name: Fetch wheels
        uses: actions/download-artifact@v4
        with:
          name: torch-xla-wheels
          path: /tmp/wheels/
      - name: Fetch CUDA plugin
        uses: actions/download-artifact@v4
        with:
          name: cuda-plugin
          path: /tmp/wheels/
        if: ${{ inputs.install-cuda-plugin }}
      - name: Setup CUDA environment
        shell: bash
        run: |
          # TODO: Make PJRT_DEVICE=CPU work with XLA_REGISTER_INSTALLED_PLUGINS=1
          echo "XLA_REGISTER_INSTALLED_PLUGINS=1" >> $GITHUB_ENV
          echo "PATH=$PATH:/usr/local/cuda-12.1/bin" >> $GITHUB_ENV
          echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.1/lib64" >> $GITHUB_ENV
        if: ${{ inputs.install-cuda-plugin }}
      - name: Check GPU
        run: nvidia-smi
        if: ${{ inputs.install-cuda-plugin }}
      - name: Install wheels
        shell: bash
        run: |
          pip install /tmp/wheels/*.whl
          # TODO: Add these in setup.py
          pip install fsspec
          pip install rich
      # The installed torch_xla reports the PyTorch revision it carries in
      # torch_xla.version.__torch_gitrev__; the next checkout uses it as `ref`.
      - name: Record PyTorch commit
        run: echo "PYTORCH_COMMIT=$(python -c 'import torch_xla.version; print(torch_xla.version.__torch_gitrev__)')" >> $GITHUB_ENV
      - name: Checkout PyTorch Repo
        uses: actions/checkout@v4
        with:
          repository: pytorch/pytorch
          path: pytorch
          ref: ${{ env.PYTORCH_COMMIT }}
      - name: Checkout PyTorch/XLA Repo
        uses: actions/checkout@v4
        with:
          path: pytorch/xla
      - name: Extra CI deps
        shell: bash
        run: |
          set -x
          pip install expecttest unittest-xml-reporting
          if [[ -n "$RUN_BENCHMARK_TESTS" ]]; then
            pip install -r pytorch/xla/benchmarks/requirements.txt
          fi
      - name: Test
        shell: bash
        run: |
          source pytorch/xla/.circleci/common.sh
          run_torch_xla_tests pytorch/ pytorch/xla/ $USE_COVERAGE
      - name: Upload coverage results
        if: ${{ inputs.collect-coverage }}
        shell: bash
        env:
          CIRCLE_WORKFLOW_ID: ${{ github.run_id }}
          CIRCLE_BUILD_NUM: ${{ github.run_number }}
          BENCHMARK_TEST_NAME: ${{ env.RUN_BENCHMARK_TESTS }}
          PYTHON_TEST_NAME: ${{ env.RUN_PYTHON_TESTS }}${{ env.RUN_XLA_OP_TESTS1 }}${{ env.RUN_XLA_OP_TESTS2 }}${{ env.RUN_XLA_OP_TESTS3 }}${{ env.RUN_TORCH_MP_OP_TESTS }}
          # NOTE(review): RUN_CPP_TESTS1 / RUN_CPP_TESTS2 are not set anywhere
          # in this workflow, so CPP_TEST_NAME appears to always be empty here
          # and the cpp branches below would never run. Confirm and remove.
          CPP_TEST_NAME: ${{ env.RUN_CPP_TESTS1 }}${{ env.RUN_CPP_TESTS2 }}
        run: |
          # TODO(yeounoh) collect coverage report as needed.
          if [ -n "${BENCHMARK_TEST_NAME}" ]; then
            exit 0
          fi
          # NOTE(review): `pid` is not defined anywhere in this workflow, and
          # the job already runs inside the container; this `docker cp` looks
          # like a leftover and will likely fail when coverage is enabled.
          docker cp "${pid}":/home/jenkins/htmlcov "${GITHUB_WORKSPACE}"
          if [ -n "${GPU_FLAG:-}" ]; then
            if [ -n "${PYTHON_TEST_NAME}" ]; then
              gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_python_coverage_${PYTHON_TEST_NAME}.out
              gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_python_coverage_${PYTHON_TEST_NAME}.out
            fi
            if [ -n "${CPP_TEST_NAME}" ]; then
              gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_cpp_coverage_${CPP_TEST_NAME}.out
              gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_cpp_coverage_${CPP_TEST_NAME}.out
            fi
          else
            if [ -n "${PYTHON_TEST_NAME}" ]; then
              gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_python_coverage_${PYTHON_TEST_NAME}.out
              gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_python_coverage_${PYTHON_TEST_NAME}.out
            fi
            if [ -n "${CPP_TEST_NAME}" ]; then
              gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_cpp_coverage_${CPP_TEST_NAME}.out
              gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_cpp_coverage_${CPP_TEST_NAME}.out
            fi
            if [ "${CPP_TEST_NAME}" == "cpp_tests1" ]; then
              ABS_METADATA='{"host": "github", "project": "pytorchxla", "trace_type": "LCOV", "commit_id": '\"${GITHUB_SHA}\"', "ref": "HEAD", "source": "https://github.com/pytorch/xla", "owner": "cloud-tpu-pt-dev", "bug_component": "587012"}'
              echo $ABS_METADATA > abs_metadata.json
              gsutil cp abs_metadata.json gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/metadata.json
              INC_METADATA='{"host": "github", "project": "pytorchxla", "trace_type": "LCOV", "patchset_num": 1, "change_id": '${CIRCLE_BUILD_NUM}', "owner": "cloud-tpu-pt-dev", "bug_component": "587012"}'
              echo $INC_METADATA > inc_metadata.json
              gsutil cp inc_metadata.json gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/metadata.json
            fi
          fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
# Reusable workflow: installs the `torch-xla-wheels` artifact on the
# `v4-runner-set` runner and executes test/tpu/run_tests.sh with
# PJRT_DEVICE=TPU.
name: TPU Integration Test

on:
  workflow_call:

jobs:
  tpu-test:
    runs-on: v4-runner-set
    steps:
      - name: Checkout repo
        uses: actions/checkout@v4
        with:
          path: pytorch/xla

      - name: Fetch wheels
        uses: actions/download-artifact@v4
        with:
          path: /tmp/wheels/
          name: torch-xla-wheels

      - name: Install wheels
        shell: bash
        run: |
          pip install /tmp/wheels/*.whl

      - name: Install test dependencies
        shell: bash
        run: |
          # TODO: Add these in setup.py
          pip install fsspec
          pip install rich
          # Jax nightly is needed for pallas tests.
          pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
          pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html

      # The test script is invoked from the root of the PyTorch/XLA checkout.
      - name: Run Tests
        env:
          PJRT_DEVICE: TPU
        run: |
          cd pytorch/xla
          test/tpu/run_tests.sh
Oops, something went wrong.