Skip to content

Commit

Permalink
Update CI image with dev container image (pytorch#5290)
Browse files Browse the repository at this point in the history
This PR updates the PyTorch/XLA CI image to use the dev-container-based image. The same image is used in upstream CI in #109757.
---------

Co-authored-by: Siyuan Liu <[email protected]>
  • Loading branch information
2 people authored and mbzomowski committed Nov 16, 2023
1 parent 642e026 commit 6ca267a
Show file tree
Hide file tree
Showing 13 changed files with 48 additions and 75 deletions.
5 changes: 4 additions & 1 deletion .circleci/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,17 @@ apply_patches

python -c "import fcntl; fcntl.fcntl(1, fcntl.F_SETFL, 0)"

# We always build PyTorch without CUDA support.
export USE_CUDA=0
python setup.py install

sccache --show-stats

source $XLA_DIR/xla_env
export GCLOUD_SERVICE_KEY_FILE="$XLA_DIR/default_credentials.json"
export SILO_NAME='cache-silo-ci-gcc-11' # cache bucket for CI
export SILO_NAME='cache-silo-ci-dev-3.8_cuda_12.1' # cache bucket for CI
export BUILD_CPP_TESTS='1'
export TF_CUDA_COMPUTE_CAPABILITIES="sm_50,sm_70,sm_75,compute_80,$TF_CUDA_COMPUTE_CAPABILITIES"
build_torch_xla $XLA_DIR

popd
21 changes: 0 additions & 21 deletions .circleci/common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -92,27 +92,6 @@ function install_deps_pytorch_xla() {

sudo ln -s "$(command -v bazelisk)" /usr/bin/bazel

# Install gcc-11
sudo apt-get update
# Update ppa for GCC
sudo apt-get install -y software-properties-common
sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
sudo apt update -y
sudo apt install -y gcc-11
sudo apt install -y g++-11
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 100
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 100

export NVCC_PREPEND_FLAGS='-ccbin /usr/bin/g++-11'

# Hack similar to https://github.com/pytorch/pytorch/pull/105227/files#diff-9e59213240d3b55d2ddc53c8c096db9eece0665d64f46473454f9dc0c10fd804
sudo rm /opt/conda/lib/libstdc++.so*

# Update gcov for test coverage
sudo update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-11 100
sudo update-alternatives --install /usr/bin/gcov-dump gcov-dump /usr/bin/gcov-dump-11 100
sudo update-alternatives --install /usr/bin/gcov-tool gcov-tool /usr/bin/gcov-tool-11 100

# Symlink the missing CUDA headers if they exist
CUBLAS_PATTERN="/usr/include/cublas*"
if ls $CUBLAS_PATTERN 1> /dev/null 2>&1; then
Expand Down
59 changes: 28 additions & 31 deletions .circleci/docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
# This requires cuda & cudnn packages pre-installed in the base image.
# Other available cuda images are listed at https://hub.docker.com/r/nvidia/cuda
ARG base_image="nvidia/cuda:11.7.0-cudnn8-devel-ubuntu18.04"
ARG base_image="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.8_cuda_12.1"
FROM "${base_image}"

ARG python_version="3.8"
ARG cuda="1"
ARG cuda_compute="5.2,7.5"
ARG cc="clang-8"
ARG cxx="clang++-8"
ARG cc="clang"
ARG cxx="clang++"
ARG cxx_abi="1"
ARG tpuvm=""

Expand Down Expand Up @@ -37,38 +37,15 @@ ENV CXX "${cxx}"
# Whether to build for TPUVM mode
ENV TPUVM_MODE "${tpuvm}"

# Rotate nvidia repo public key (last updated: 04/27/2022)
# Unfortunately, nvidia/cuda image is shipped with invalid public key
RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub

# Install base system packages
RUN apt-get clean && apt-get update
RUN apt-get upgrade -y
RUN apt-get install --fix-missing -y python-pip python3-pip git curl libopenblas-dev vim jq \
apt-transport-https ca-certificates procps openssl sudo wget libssl-dev libc6-dbg

# Install clang & llvm
ADD ./install_llvm_clang.sh install_llvm_clang.sh
RUN bash ./install_llvm_clang.sh

# Install clang as upstream CI forces clang
RUN apt-get install -y clang
# Install valgrind
ADD ./install_valgrind.sh install_valgrind.sh
COPY ./install_valgrind.sh install_valgrind.sh
RUN bash ./install_valgrind.sh

# Sets up jenkins user.
RUN useradd jenkins && \
mkdir /home/jenkins && \
chown jenkins /home/jenkins
RUN echo 'jenkins ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers

RUN mkdir -p /opt/conda /opt/cargo /opt/rustup /workspace /var/lib/jenkins && \
chown jenkins /opt/conda /opt/cargo /opt/rustup /workspace /var/lib/jenkins
USER jenkins
WORKDIR /workspace

# Install openmpi for CUDA
run sudo apt-get install -y ssh
run sudo apt-get install -y --allow-downgrades --allow-change-held-packages openmpi-bin libopenmpi-dev
run apt-get install -y ssh
run apt-get install -y --allow-downgrades --allow-change-held-packages openmpi-bin libopenmpi-dev

# Builds and configures sccache
ENV OPENSSL_INCLUDE_DIR /usr/include/openssl
Expand All @@ -87,6 +64,25 @@ RUN . $CARGO_HOME/env && \

ENV PATH $CARGO_HOME/bin:$PATH

# Upstream CI requires jq
RUN apt-get install -y jq

# TODO: Add exec permission for all users in the base image.
RUN chmod a+x /usr/local/bin/bazel
# TODO: Move sudo installation into the base image.
RUN apt-get install -y sudo

RUN useradd jenkins && \
mkdir /home/jenkins && \
chown jenkins /home/jenkins
RUN echo 'jenkins ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers

RUN mkdir -p /opt/conda /opt/cargo /opt/rustup /workspace /var/lib/jenkins && \
chown jenkins /opt/conda /opt/cargo /opt/rustup /workspace /var/lib/jenkins
ENV PATH /home/jenkins/.local/bin:$PATH
USER jenkins
WORKDIR /workspace

# Installs and configures Conda.
ADD ./install_conda.sh install_conda.sh
RUN sudo chown jenkins ./install_conda.sh
Expand All @@ -95,6 +91,7 @@ RUN bash ./install_conda.sh "${python_version}" /opt/conda
RUN echo "conda activate base" >> ~/.bashrc
RUN echo "export TF_CPP_LOG_THREAD_ID=1" >> ~/.bashrc
ENV PATH /opt/conda/bin:$PATH
ENV LD_LIBRARY_PATH /lib/x86_64-linux-gnu/:/usr/lib/x86_64-linux-gnu/:/opt/conda/lib/:$LD_LIBRARY_PATH

RUN bash -c "source ~/.bashrc"
CMD ["bash"]
7 changes: 2 additions & 5 deletions .circleci/docker/install_conda.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ set -ex

PYTHON_VERSION=$1
CONDA_PREFIX=$2
DEFAULT_PYTHON_VERSION=3.7
DEFAULT_PYTHON_VERSION=3.8


function install_and_setup_conda() {
Expand All @@ -30,7 +30,7 @@ function install_and_setup_conda() {
conda update -y -n base conda
conda install -y python=$PYTHON_VERSION

conda install -y nomkl numpy=1.18.5 pyyaml setuptools cmake \
conda install -y nomkl numpy=1.18.5 pyyaml setuptools \
cffi typing tqdm coverage hypothesis dataclasses cython

/usr/bin/yes | pip install mkl==2022.2.1
Expand All @@ -41,9 +41,6 @@ function install_and_setup_conda() {
/usr/bin/yes | pip install --upgrade numba
/usr/bin/yes | pip install cloud-tpu-client
/usr/bin/yes | pip install expecttest==0.1.3
/usr/bin/yes | pip install ninja # Install ninja to speedup the build
# Using Ninja requires CMake>=3.13, PyTorch requires CMake>=3.18
/usr/bin/yes | pip install "cmake>=3.18" --upgrade
/usr/bin/yes | pip install absl-py
# Additional PyTorch requirements
/usr/bin/yes | pip install scikit-image scipy==1.6.3
Expand Down
2 changes: 1 addition & 1 deletion .circleci/docker/install_valgrind.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ tar -xjf valgrind-${VALGRIND_VERSION}.tar.bz2
cd valgrind-${VALGRIND_VERSION}
./configure --prefix=/usr/local
make -j6
sudo make install
make install
cd ../../
rm -rf valgrind_build
alias valgrind="/usr/local/bin/valgrind"
2 changes: 1 addition & 1 deletion .circleci/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,5 +26,5 @@ function install_torchvision() {
install_torchvision

export GCLOUD_SERVICE_KEY_FILE="$XLA_DIR/default_credentials.json"
export SILO_NAME='cache-silo-ci-gcc-11' # cache bucket for CI
export SILO_NAME='cache-silo-ci-dev-3.8_cuda_12.1' # cache bucket for CI
run_torch_xla_tests $PYTORCH_DIR $XLA_DIR $USE_COVERAGE
10 changes: 4 additions & 6 deletions .github/workflows/_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -73,12 +73,12 @@ jobs:
# if image layers are not present in the repo.
# Note: disable the following 2 lines while testing a new image, so we do not
# push to the upstream.
docker tag "${GCR_DOCKER_IMAGE}" "${ECR_DOCKER_IMAGE_BASE}:v1.0" >/dev/null
docker push "${ECR_DOCKER_IMAGE_BASE}:v1.0" >/dev/null
docker tag "${GCR_DOCKER_IMAGE}" "${ECR_DOCKER_IMAGE_BASE}:v1.1-lite" >/dev/null
docker push "${ECR_DOCKER_IMAGE_BASE}:v1.1-lite" >/dev/null
- name: Start the container
shell: bash
run: |
pid=$(docker run -t -d -w "$WORKDIR" "${GCR_DOCKER_IMAGE}")
pid=$(docker run --privileged -t -d -w "$WORKDIR" "${GCR_DOCKER_IMAGE}")
docker exec -u jenkins "${pid}" sudo chown -R jenkins "${WORKDIR}"
docker cp "${GITHUB_WORKSPACE}/." "$pid:$WORKDIR"
echo "pid=${pid}" >> "${GITHUB_ENV}"
Expand All @@ -87,7 +87,6 @@ jobs:
shell: bash
run: |
echo "declare -x SCCACHE_BUCKET=${SCCACHE_BUCKET}" | docker exec -i "${pid}" sh -c "cat >> env"
echo "declare -x CC=clang-8 CXX=clang++-8" | docker exec -i "${pid}" sh -c "cat >> xla_env"
echo "declare -x DISABLE_XRT=${DISABLE_XRT}" | docker exec -i "${pid}" sh -c "cat >> xla_env"
echo "declare -x XLA_CUDA=${XLA_CUDA}" | docker exec -i "${pid}" sh -c "cat >> xla_env"
echo "declare -x BAZEL_REMOTE_CACHE=1" | docker exec -i "${pid}" sh -c "cat >> xla_env"
Expand All @@ -96,8 +95,7 @@ jobs:
- name: Build
shell: bash
run: |
docker exec -u jenkins "${pid}" bash -c ". ~/.bashrc && .circleci/build.sh"
docker exec --privileged -u jenkins "${pid}" bash -c ".circleci/build.sh"
- name: Cleanup build env
shell: bash
run: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/_coverage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ jobs:
- name: Test
shell: bash
run: |
docker exec -u jenkins "${pid}" bash -c '. ~/.bashrc && .circleci/${{ inputs.test-script }}'
docker exec -u jenkins "${pid}" bash -c '.circleci/${{ inputs.test-script }}'
- name: Upload coverage results
if: ${{ inputs.collect-coverage }}
shell: bash
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/_docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ jobs:
echo "pid=${pid}" >> "${GITHUB_ENV}"
- name: Build & publish docs
shell: bash
run: docker exec -u jenkins "${pid}" bash -c '. ~/.bashrc && .circleci/doc_push.sh'
run: docker exec -u jenkins "${pid}" bash -c '.circleci/doc_push.sh'
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@main
if: always()
2 changes: 1 addition & 1 deletion .github/workflows/_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ jobs:
- name: Test
shell: bash
run: |
docker exec -u jenkins "${pid}" bash -c '. ~/.bashrc && .circleci/${{ inputs.test-script }}'
docker exec --privileged -u jenkins "${pid}" bash -c '.circleci/${{ inputs.test-script }}'
- name: Upload coverage results
if: ${{ inputs.collect-coverage }}
shell: bash
Expand Down
5 changes: 2 additions & 3 deletions .github/workflows/build_and_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,7 @@ jobs:
uses: ./.github/workflows/_build.yml
with:
ecr-docker-image-base: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base
gcr-docker-image: gcr.io/tpu-pytorch/xla_base:latest
disable_xrt: 1
gcr-docker-image: gcr.io/tpu-pytorch/xla_base:dev-3.8_cuda_12.1
cuda: 1
secrets:
gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
Expand All @@ -43,7 +42,7 @@ jobs:
with:
docker-image: ${{ needs.build.outputs.docker-image }}
runner: linux.8xlarge.nvidia.gpu
timeout-minutes: 300
timeout-minutes: 180
disable-xrt: 1
secrets:
gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/build_and_test_xrt.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
uses: ./.github/workflows/_build.yml
with:
ecr-docker-image-base: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base
gcr-docker-image: gcr.io/tpu-pytorch/xla_base:latest
gcr-docker-image: gcr.io/tpu-pytorch/xla_base:dev-3.8_cuda_12.1
disable_xrt: 0
cuda: 1
secrets:
Expand All @@ -42,7 +42,7 @@ jobs:
with:
docker-image: ${{ needs.build.outputs.docker-image }}
runner: linux.8xlarge.nvidia.gpu
timeout-minutes: 300
timeout-minutes: 180
disable-xrt: 0
secrets:
gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
2 changes: 1 addition & 1 deletion .kokoro/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ ARG SCCACHE="$(which sccache)"

WORKDIR /pytorch/xla
ARG GCLOUD_SERVICE_KEY_FILE="/pytorch/xla/default_credentials.json"
ARG SILO_NAME='cache-silo-ci-gcc-11' # cache bucket for CI
ARG SILO_NAME='cache-silo-ci-dev-3.8_cuda_12.1' # cache bucket for CI
RUN time pip install -e .

# Run tests
Expand Down

0 comments on commit 6ca267a

Please sign in to comment.