Skip to content

Commit

Permalink
Backport infra/ansible to r2.0 (#5104)
Browse files Browse the repository at this point in the history
  • Loading branch information
wonjoolee95 authored May 31, 2023
1 parent 7c97401 commit 500e1c2
Show file tree
Hide file tree
Showing 22 changed files with 674 additions and 0 deletions.
6 changes: 6 additions & 0 deletions infra/ansible/.ansible-lint
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
# .ansible-lint

profile: moderate
skip_list:
- schema[tasks]
32 changes: 32 additions & 0 deletions infra/ansible/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Two-stage image build driven by the Ansible playbook in this directory.
#   * `build`   - runs the playbook with stage=build; the wheels copied out of
#                 it below live in /src/pytorch/dist and /src/pytorch/xla/dist.
#   * `release` - installs only the release dependencies (tag `install_deps`)
#                 and the wheels taken from the `build` stage.
ARG python_version=3.8
ARG debian_version=buster

FROM python:${python_version}-${debian_version} AS build

WORKDIR /ansible
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir ansible
COPY . /ansible

# Extra playbook variables, forwarded verbatim to `ansible-playbook -e`.
ARG ansible_vars
RUN ansible-playbook -vvv playbook.yaml -e "stage=build" -e "${ansible_vars}"

FROM python:${python_version}-${debian_version} AS release

WORKDIR /ansible
RUN pip install --no-cache-dir ansible
COPY . /ansible

# ARG values are scoped to a single stage, so it is declared again here.
ARG ansible_vars
RUN ansible-playbook -vvv playbook.yaml -e "stage=release" -e "${ansible_vars}" --tags "install_deps"

# Stage the wheels produced by the `build` stage and install them.
WORKDIR /tmp/wheels
COPY --from=build /src/pytorch/dist/*.whl ./
COPY --from=build /src/pytorch/xla/dist/*.whl ./

RUN echo "Installing the following wheels" && ls *.whl
RUN pip install --no-cache-dir *.whl

WORKDIR /

# Removes the playbook sources and staged wheels from the final filesystem.
# NOTE: the earlier COPY layers still contain these files, so this does not
# reduce the total image size.
RUN rm -rf /ansible /tmp/wheels
# Keep the wheels available in the release image under /dist.
# NOTE(review): /dist is presumably populated by the playbook in the build
# stage - confirm against the playbook roles.
COPY --from=build /dist/*.whl /dist/
58 changes: 58 additions & 0 deletions infra/ansible/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Ansible playbook

This ansible playbook will perform the following actions on the localhost:
* install required pip and apt packages, depending on the specified stage,
architecture and accelerator (see [apt.yaml](config/apt.yaml) and
[pip.yaml](config/pip.yaml)).
* fetch bazel (version configured in [vars.yaml](config/vars.yaml)),
* fetch PyTorch and XLA sources at master (or specific revisions,
see role `fetch_srcs` in [playbook.yaml](playbook.yaml)).
* set required environment variables (see [env.yaml](config/env.yaml)),
* build and install PyTorch and XLA wheels,
* apply infrastructure tests (see `*/tests.yaml` files in [roles](roles)).

## Prerequisites

* Python 3.8+
* Ansible. Install with `pip install ansible`.

## Running

The playbook requires 3 variables to be passed explicitly. They configure the
playbook's behavior (which pip/apt packages are installed and which environment variables are set):
* `stage`: build or release. Different packages are installed depending on
the chosen stage.
* `arch`: aarch64 or amd64. Architecture of the built image and wheels.
* `accelerator`: tpu or cuda. Available accelerator.

The variables can be passed through `-e` flag: `-e "<var>=<value>"`.

Example: `ansible-playbook playbook.yaml -e "stage=build arch=amd64 accelerator=tpu"`

## Config structure

The playbook configuration is split into 4 files, one per logical system.
The configuration is simply loaded as playbook variables which are then passed
to specific roles and tasks.
Only variables in [config/env.yaml](config/env.yaml) are passed as env variables.

* [apt.yaml](config/apt.yaml) - specifies apt packages for each stage and
architecture or accelerator.
Packages shared between all architectures and accelerators in a given stage
are specified in `*_common`. They are appended to any architecture specific list.

This config also contains a list of required apt repos and signing keys.
These variables are mainly consumed by the [install_deps](roles/install_deps/tasks/main.yaml) role.

* [pip.yaml](config/pip.yaml) - similarly to apt.yaml, lists pip packages per stage and arch / accelerator.
In both pip and apt config files, stage and arch / accelerator are
concatenated together and specified under one key (e.g. build_amd64, release_tpu).

* [env.yaml](config/env.yaml) - contains Ansible variables that are passed as env variables when
building PyTorch and XLA (`build_env`). Variables in `release_env` are saved in `/etc/environment` (executed for the `release` stage).

* [vars.yaml](config/vars.yaml) - Ansible variables used in other config files and throughout the playbook.
Not associated with any particular system.

Variables from these config files are dynamically loaded (during playbook execution),
see [playbook.yaml](playbook.yaml).
16 changes: 16 additions & 0 deletions infra/ansible/ansible.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# See https://docs.ansible.com/ansible/latest/reference_appendices/config.html
# for various configuration options.

[defaults]
# Displays tasks execution duration.
callbacks_enabled = profile_tasks
# The playbook is only run on the implicit localhost.
# Silence warning about empty hosts inventory.
localhost_warning = False
# Make output human-readable.
stdout_callback = yaml

[inventory]
# Silence warning about no inventory.
# This option is available since Ansible 2.14 (available only with Python 3.9+).
inventory_unparsed_warning = False
60 changes: 60 additions & 0 deletions infra/ansible/config/apt.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Contains lists of apt packages for each stage (build|release) and arch or accelerator.
apt:
pkgs:
build_common:
- ccache
- curl
- git
- gnupg
- libopenblas-dev
- ninja-build
- procps
- python3-pip
- rename
- vim
- wget
- clang-format-7
- lcov
- less

build_cuda:
- "cuda-libraries-{{ cuda_version | replace('.', '-') }}"
- "cuda-toolkit-{{ cuda_version | replace('.', '-') }}"
- "cuda-minimal-build-{{ cuda_version | replace('.', '-') }}"
- "{{ cuda_deps['libcudnn'][cuda_version] }}"
- "{{ cuda_deps['libcudnn-dev'][cuda_version] }}"

build_amd64:
- "clang-{{ clang_version }}"

build_aarch64:
- scons
- gcc-10
- g++-10

release_common:
- curl
- git
- gnupg
- libgomp1
- libopenblas-base
- patch

release_cuda:
- "cuda-libraries-{{ cuda_version | replace('.', '-') }}"
- "cuda-minimal-build-{{ cuda_version | replace('.', '-') }}"
- "{{ cuda_deps['libcudnn'][cuda_version] }}"

# Specify objects with string fields `url` and `keyring`.
# The keyring path should start with /usr/share/keyrings/ for debian and ubuntu.
signing_keys:
- url: https://apt.llvm.org/llvm-snapshot.gpg.key
keyring: /usr/share/keyrings/llvm.pgp
- url: "https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_repo }}/x86_64/3bf863cc.pub"
keyring: /usr/share/keyrings/cuda.pgp

repos:
# signed-by path should match the corresponding keyring path above.
- "deb [signed-by=/usr/share/keyrings/llvm.pgp] http://apt.llvm.org/{{ llvm_debian_repo }}/ llvm-toolchain-{{ llvm_debian_repo }}-{{ clang_version }} main"
- "deb-src [signed-by=/usr/share/keyrings/llvm.pgp] http://apt.llvm.org/{{ llvm_debian_repo }}/ llvm-toolchain-{{ llvm_debian_repo }}-{{ clang_version }} main"
- "deb [signed-by=/usr/share/keyrings/cuda.pgp] https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_repo }}/x86_64/ /"
12 changes: 12 additions & 0 deletions infra/ansible/config/cuda_deps.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Versions of cuda dependencies for given cuda versions.
# Note: wrap version in quotes to ensure they're treated as strings.
cuda_deps:
# List all libcudnn8 versions with `apt list -a libcudnn8`
libcudnn:
"11.8": libcudnn8=8.8.0.121-1+cuda11.8
"11.7": libcudnn8=8.5.0.96-1+cuda11.7
"11.2": libcudnn8=8.1.1.33-1+cuda11.2
libcudnn-dev:
"11.8": libcudnn8-dev=8.8.0.121-1+cuda11.8
"11.7": libcudnn8-dev=8.5.0.96-1+cuda11.7
"11.2": libcudnn8-dev=8.1.1.33-1+cuda11.2
49 changes: 49 additions & 0 deletions infra/ansible/config/env.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Variables that will be stored in ~/.bashrc and ~/.zshrc files for the release stage.
# They'll be accessible for all processes on the host, also in the development image.
release_env:
common:
# Force GCC because clang/bazel has issues.
CC: gcc
CXX: g++
# CC: "clang-{{ clang_version }}"
# CXX: "clang++-{{ clang_version }}"
LD_LIBRARY_PATH: "$LD_LIBRARY_PATH:/usr/local/lib"

tpu:
ACCELERATOR: tpu
TPUVM_MODE: 1

cuda:
TF_CUDA_COMPUTE_CAPABILITIES: 7.0,7.5,8.0
XLA_CUDA: 1

# Variables that will be passed to shell environment only for building PyTorch and XLA libs.
build_env:
common:
LD_LIBRARY_PATH: "$LD_LIBRARY_PATH:/usr/local/lib"
# Set explicitly to 0 as setup.py defaults this flag to true if unset.
BUILD_CPP_TESTS: 0
# Force GCC because clang/bazel has issues.
CC: gcc
CXX: g++
PYTORCH_BUILD_NUMBER: 1
TORCH_XLA_VERSION: "{{ package_version }}"
PYTORCH_BUILD_VERSION: "{{ package_version }}"
XLA_SANDBOX_BUILD: 1
BAZEL_REMOTE_CACHE: 1
SILO_NAME: "cache-silo-{{ arch }}-{{ accelerator }}"

amd64:
ARCH: amd64

aarch64:

cuda:
TF_CUDA_COMPUTE_CAPABILITIES: 7.0,7.5,8.0
XLA_CUDA: 1

tpu:
ACCELERATOR: tpu
TPUVM_MODE: 1
BUNDLE_LIBTPU: 1

53 changes: 53 additions & 0 deletions infra/ansible/config/pip.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Contains lists of pip packages for each stage (build|release) and arch or accelerator.
pip:
pkgs:
# Shared between all architectures and accelerators for the build stage.
build_common:
- astunparse
- cffi
- cloud-tpu-client
- cmake
- coverage
- dataclasses
- expecttest==0.1.3
- future
- git-archive-all
- google-api-python-client
- google-cloud-storage
- hypothesis
- lark-parser
- ninja
- numpy
- oauth2client
- pyyaml
- requests
- setuptools
- six
- tensorboard
- tensorboardX
- tqdm
- typing
- typing_extensions
- sympy
- yapf==0.30.0

build_amd64:
- mkl
- mkl-include

build_aarch64:

# Shared between all architectures and accelerators for the release stage.
release_common:
- numpy
- pyyaml
- mkl
- mkl-include

release_tpu:

# Packages that will be installed with the `--no-deps` flag.
pkgs_nodeps:
release_common:
- torchvision
- pillow
10 changes: 10 additions & 0 deletions infra/ansible/config/vars.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Used for fetching cuda from the right repo, see apt.yaml.
cuda_repo: ubuntu1804
cuda_version: "11.8"
# Used for fetching clang from the right repo, see apt.yaml.
llvm_debian_repo: buster
clang_version: 10
# PyTorch and PyTorch/XLA wheel versions.
# Quoted so YAML keeps the value a string, like cuda_version above
# (an unquoted 2.10 would be parsed as the float 2.1).
package_version: "2.0"
# If set to true, wheels will be renamed to $WHEEL_NAME-nightly-cp38-cp38-linux_x86_64.whl.
nightly_release: false
19 changes: 19 additions & 0 deletions infra/ansible/development.Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Dockerfile for building a development image.
# The built image contains all required pip and apt packages for building and
# running PyTorch and PyTorch/XLA. The image doesn't contain any source code.
ARG python_version=3.8
ARG debian_version=buster

FROM python:${python_version}-${debian_version}

# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir ansible

COPY . /ansible
WORKDIR /ansible

# List Ansible tasks to apply for the dev image.
# Declared with ENV, so TAGS is also visible in containers started from the image.
ENV TAGS="bazel,configure_env,install_deps"

# Extra playbook variables, forwarded verbatim to `ansible-playbook -e`.
ARG ansible_vars
# The tagged tasks are applied once per stage so the image carries both the
# build-stage and the release-stage configuration.
RUN ansible-playbook playbook.yaml -e "stage=build" -e "${ansible_vars}" --tags "${TAGS}"
RUN ansible-playbook playbook.yaml -e "stage=release" -e "${ansible_vars}" --tags "${TAGS}"
38 changes: 38 additions & 0 deletions infra/ansible/e2e_tests.Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Image for end-to-end tests: wheels are built in the `build` stage, then
# installed together with the XLA test sources into a fresh runtime stage.
ARG python_version=3.8
ARG debian_version=buster

FROM python:${python_version}-${debian_version} AS build

WORKDIR /ansible
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir ansible
COPY . /ansible

# Build PyTorch and PyTorch/XLA wheels.
ARG ansible_vars
RUN ansible-playbook -vvv playbook.yaml -e "stage=build" -e "${ansible_vars}"

FROM python:${python_version}-${debian_version}
WORKDIR /ansible
RUN pip install --no-cache-dir ansible
COPY . /ansible

# Install runtime pip and apt dependencies.
# ARG values are scoped to a single stage, so it is declared again here.
ARG ansible_vars
RUN ansible-playbook -vvv playbook.yaml -e "stage=release" -e "${ansible_vars}" --tags "install_deps"

# Copy test sources.
# COPY creates the missing parent directories of the destination itself,
# so no separate `mkdir -p` layer is needed.
COPY --from=build /src/pytorch/xla/test /src/pytorch/xla/test

# Copy and install wheels.
WORKDIR /tmp/wheels
COPY --from=build /src/pytorch/dist/*.whl ./
COPY --from=build /src/pytorch/xla/dist/*.whl ./

RUN echo "Installing the following wheels" && ls *.whl
RUN pip install --no-cache-dir *.whl

WORKDIR /

# Clean-up unused directories.
# NOTE: the earlier COPY layers still contain these files, so this only
# removes them from the final filesystem, not from the image size.
RUN rm -rf /ansible /tmp/wheels
Loading

0 comments on commit 500e1c2

Please sign in to comment.