From e793e8642c3c0400eeac89f3d80ad9f9d2906b6b Mon Sep 17 00:00:00 2001 From: Siyuan Liu Date: Thu, 12 Oct 2023 14:07:29 -0700 Subject: [PATCH] Update CI image with dev container image (#5290) This PR updates PyTorch/XLA CI image to use dev container based image. The same image is used in upstream CI in #109757 --------- Co-authored-by: Siyuan Liu --- .circleci/build.sh | 5 +- .circleci/common.sh | 21 --------- .circleci/docker/Dockerfile | 59 +++++++++++------------- .circleci/docker/install_conda.sh | 7 +-- .circleci/docker/install_valgrind.sh | 2 +- .circleci/test.sh | 2 +- .github/workflows/_build.yml | 10 ++-- .github/workflows/_coverage.yml | 2 +- .github/workflows/_docs.yml | 2 +- .github/workflows/_test.yml | 2 +- .github/workflows/build_and_test.yml | 5 +- .github/workflows/build_and_test_xrt.yml | 4 +- .kokoro/Dockerfile | 2 +- 13 files changed, 48 insertions(+), 75 deletions(-) mode change 100644 => 100755 .circleci/docker/install_valgrind.sh diff --git a/.circleci/build.sh b/.circleci/build.sh index 25cc78f3c6b..68e81a5436c 100755 --- a/.circleci/build.sh +++ b/.circleci/build.sh @@ -41,14 +41,17 @@ apply_patches python -c "import fcntl; fcntl.fcntl(1, fcntl.F_SETFL, 0)" +# We always build PyTorch without CUDA support. 
+export USE_CUDA=0 python setup.py install sccache --show-stats source $XLA_DIR/xla_env export GCLOUD_SERVICE_KEY_FILE="$XLA_DIR/default_credentials.json" -export SILO_NAME='cache-silo-ci-gcc-11' # cache bucket for CI +export SILO_NAME='cache-silo-ci-dev-3.8_cuda_12.1' # cache bucket for CI export BUILD_CPP_TESTS='1' +export TF_CUDA_COMPUTE_CAPABILITIES="sm_50,sm_70,sm_75,compute_80,$TF_CUDA_COMPUTE_CAPABILITIES" build_torch_xla $XLA_DIR popd diff --git a/.circleci/common.sh b/.circleci/common.sh index d773eef13ec..317b9832c4e 100755 --- a/.circleci/common.sh +++ b/.circleci/common.sh @@ -92,27 +92,6 @@ function install_deps_pytorch_xla() { sudo ln -s "$(command -v bazelisk)" /usr/bin/bazel - # Install gcc-11 - sudo apt-get update - # Update ppa for GCC - sudo apt-get install -y software-properties-common - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test - sudo apt update -y - sudo apt install -y gcc-11 - sudo apt install -y g++-11 - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 100 - sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 100 - - export NVCC_PREPEND_FLAGS='-ccbin /usr/bin/g++-11' - - # Hack similar to https://github.com/pytorch/pytorch/pull/105227/files#diff-9e59213240d3b55d2ddc53c8c096db9eece0665d64f46473454f9dc0c10fd804 - sudo rm /opt/conda/lib/libstdc++.so* - - # Update gcov for test coverage - sudo update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-11 100 - sudo update-alternatives --install /usr/bin/gcov-dump gcov-dump /usr/bin/gcov-dump-11 100 - sudo update-alternatives --install /usr/bin/gcov-tool gcov-tool /usr/bin/gcov-tool-11 100 - # Symnlink the missing cuda headers if exists CUBLAS_PATTERN="/usr/include/cublas*" if ls $CUBLAS_PATTERN 1> /dev/null 2>&1; then diff --git a/.circleci/docker/Dockerfile b/.circleci/docker/Dockerfile index 89f6dbc08db..f0cd196511c 100644 --- a/.circleci/docker/Dockerfile +++ b/.circleci/docker/Dockerfile @@ -1,13 +1,13 @@ # This requires cuda & cudnn 
packages pre-installed in the base image. # Other available cuda images are listed at https://hub.docker.com/r/nvidia/cuda -ARG base_image="nvidia/cuda:11.7.0-cudnn8-devel-ubuntu18.04" +ARG base_image="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.8_cuda_12.1" FROM "${base_image}" ARG python_version="3.8" ARG cuda="1" ARG cuda_compute="5.2,7.5" -ARG cc="clang-8" -ARG cxx="clang++-8" +ARG cc="clang" +ARG cxx="clang++" ARG cxx_abi="1" ARG tpuvm="" @@ -37,38 +37,15 @@ ENV CXX "${cxx}" # Whether to build for TPUVM mode ENV TPUVM_MODE "${tpuvm}" -# Rotate nvidia repo public key (last updated: 04/27/2022) -# Unfortunately, nvidia/cuda image is shipped with invalid public key -RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub - -# Install base system packages -RUN apt-get clean && apt-get update -RUN apt-get upgrade -y -RUN apt-get install --fix-missing -y python-pip python3-pip git curl libopenblas-dev vim jq \ - apt-transport-https ca-certificates procps openssl sudo wget libssl-dev libc6-dbg - -# Install clang & llvm -ADD ./install_llvm_clang.sh install_llvm_clang.sh -RUN bash ./install_llvm_clang.sh - +# Install clang as upstream CI forces clang +RUN apt-get install -y clang # Install valgrind -ADD ./install_valgrind.sh install_valgrind.sh +COPY ./install_valgrind.sh install_valgrind.sh RUN bash ./install_valgrind.sh -# Sets up jenkins user. 
-RUN useradd jenkins && \ - mkdir /home/jenkins && \ - chown jenkins /home/jenkins -RUN echo 'jenkins ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers - -RUN mkdir -p /opt/conda /opt/cargo /opt/rustup /workspace /var/lib/jenkins && \ - chown jenkins /opt/conda /opt/cargo /opt/rustup /workspace /var/lib/jenkins -USER jenkins -WORKDIR /workspace - # Install openmpi for CUDA -run sudo apt-get install -y ssh -run sudo apt-get install -y --allow-downgrades --allow-change-held-packages openmpi-bin libopenmpi-dev +run apt-get install -y ssh +run apt-get install -y --allow-downgrades --allow-change-held-packages openmpi-bin libopenmpi-dev # Builds and configure sccache ENV OPENSSL_INCLUDE_DIR /usr/include/openssl @@ -87,6 +64,25 @@ RUN . $CARGO_HOME/env && \ ENV PATH $CARGO_HOME/bin:$PATH +# Upstream CI requires jq +RUN apt-get install -y jq + +# TODO: Add exec permission for all users in base image. +RUN chmod a+x /usr/local/bin/bazel +# TODO: move sudo installation in base image. +RUN apt-get install -y sudo + +RUN useradd jenkins && \ + mkdir /home/jenkins && \ + chown jenkins /home/jenkins +RUN echo 'jenkins ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers + +RUN mkdir -p /opt/conda /opt/cargo /opt/rustup /workspace /var/lib/jenkins && \ + chown jenkins /opt/conda /opt/cargo /opt/rustup /workspace /var/lib/jenkins +ENV PATH /home/jenkins/.local/bin:$PATH +USER jenkins +WORKDIR /workspace + # Installs and configures Conda. 
ADD ./install_conda.sh install_conda.sh RUN sudo chown jenkins ./install_conda.sh @@ -95,6 +91,7 @@ RUN bash ./install_conda.sh "${python_version}" /opt/conda RUN echo "conda activate base" >> ~/.bashrc RUN echo "export TF_CPP_LOG_THREAD_ID=1" >> ~/.bashrc ENV PATH /opt/conda/bin:$PATH +ENV LD_LIBRARY_PATH /lib/x86_64-linux-gnu/:/usr/lib/x86_64-linux-gnu/:/opt/conda/lib/:$LD_LIBRARY_PATH RUN bash -c "source ~/.bashrc" CMD ["bash"] diff --git a/.circleci/docker/install_conda.sh b/.circleci/docker/install_conda.sh index b0fc17c73ec..15e2c541b25 100644 --- a/.circleci/docker/install_conda.sh +++ b/.circleci/docker/install_conda.sh @@ -4,7 +4,7 @@ set -ex PYTHON_VERSION=$1 CONDA_PREFIX=$2 -DEFAULT_PYTHON_VERSION=3.7 +DEFAULT_PYTHON_VERSION=3.8 function install_and_setup_conda() { @@ -30,7 +30,7 @@ function install_and_setup_conda() { conda update -y -n base conda conda install -y python=$PYTHON_VERSION - conda install -y nomkl numpy=1.18.5 pyyaml setuptools cmake \ + conda install -y nomkl numpy=1.18.5 pyyaml setuptools \ cffi typing tqdm coverage hypothesis dataclasses cython /usr/bin/yes | pip install mkl==2022.2.1 @@ -41,9 +41,6 @@ function install_and_setup_conda() { /usr/bin/yes | pip install --upgrade numba /usr/bin/yes | pip install cloud-tpu-client /usr/bin/yes | pip install expecttest==0.1.3 - /usr/bin/yes | pip install ninja # Install ninja to speedup the build - # Using Ninja requires CMake>=3.13, PyTorch requires CMake>=3.18 - /usr/bin/yes | pip install "cmake>=3.18" --upgrade /usr/bin/yes | pip install absl-py # Additional PyTorch requirements /usr/bin/yes | pip install scikit-image scipy==1.6.3 diff --git a/.circleci/docker/install_valgrind.sh b/.circleci/docker/install_valgrind.sh old mode 100644 new mode 100755 index e235d36609b..08e45fd0e28 --- a/.circleci/docker/install_valgrind.sh +++ b/.circleci/docker/install_valgrind.sh @@ -9,7 +9,7 @@ tar -xjf valgrind-${VALGRIND_VERSION}.tar.bz2 cd valgrind-${VALGRIND_VERSION} ./configure --prefix=/usr/local 
make -j6 -sudo make install +make install cd ../../ rm -rf valgrind_build alias valgrind="/usr/local/bin/valgrind" diff --git a/.circleci/test.sh b/.circleci/test.sh index 914d56d206f..127c7f497a1 100755 --- a/.circleci/test.sh +++ b/.circleci/test.sh @@ -26,5 +26,5 @@ function install_torchvision() { install_torchvision export GCLOUD_SERVICE_KEY_FILE="$XLA_DIR/default_credentials.json" -export SILO_NAME='cache-silo-ci-gcc-11' # cache bucket for CI +export SILO_NAME='cache-silo-ci-dev-3.8_cuda_12.1' # cache bucket for CI run_torch_xla_tests $PYTORCH_DIR $XLA_DIR $USE_COVERAGE diff --git a/.github/workflows/_build.yml b/.github/workflows/_build.yml index 74b1b00397a..879594476ef 100644 --- a/.github/workflows/_build.yml +++ b/.github/workflows/_build.yml @@ -73,12 +73,12 @@ jobs: # if image layers are not present in the repo. # Note: disable the following 2 lines while testing a new image, so we do not # push to the upstream. - docker tag "${GCR_DOCKER_IMAGE}" "${ECR_DOCKER_IMAGE_BASE}:v1.0" >/dev/null - docker push "${ECR_DOCKER_IMAGE_BASE}:v1.0" >/dev/null + docker tag "${GCR_DOCKER_IMAGE}" "${ECR_DOCKER_IMAGE_BASE}:v1.1-lite" >/dev/null + docker push "${ECR_DOCKER_IMAGE_BASE}:v1.1-lite" >/dev/null - name: Start the container shell: bash run: | - pid=$(docker run -t -d -w "$WORKDIR" "${GCR_DOCKER_IMAGE}") + pid=$(docker run --privileged -t -d -w "$WORKDIR" "${GCR_DOCKER_IMAGE}") docker exec -u jenkins "${pid}" sudo chown -R jenkins "${WORKDIR}" docker cp "${GITHUB_WORKSPACE}/." 
"$pid:$WORKDIR" echo "pid=${pid}" >> "${GITHUB_ENV}" @@ -87,7 +87,6 @@ jobs: shell: bash run: | echo "declare -x SCCACHE_BUCKET=${SCCACHE_BUCKET}" | docker exec -i "${pid}" sh -c "cat >> env" - echo "declare -x CC=clang-8 CXX=clang++-8" | docker exec -i "${pid}" sh -c "cat >> xla_env" echo "declare -x DISABLE_XRT=${DISABLE_XRT}" | docker exec -i "${pid}" sh -c "cat >> xla_env" echo "declare -x XLA_CUDA=${XLA_CUDA}" | docker exec -i "${pid}" sh -c "cat >> xla_env" echo "declare -x BAZEL_REMOTE_CACHE=1" | docker exec -i "${pid}" sh -c "cat >> xla_env" @@ -96,8 +95,7 @@ jobs: - name: Build shell: bash run: | - docker exec -u jenkins "${pid}" bash -c ". ~/.bashrc && .circleci/build.sh" - + docker exec --privileged -u jenkins "${pid}" bash -c ".circleci/build.sh" - name: Cleanup build env shell: bash run: | diff --git a/.github/workflows/_coverage.yml b/.github/workflows/_coverage.yml index 4643e225314..e114074bb7e 100644 --- a/.github/workflows/_coverage.yml +++ b/.github/workflows/_coverage.yml @@ -94,7 +94,7 @@ jobs: - name: Test shell: bash run: | - docker exec -u jenkins "${pid}" bash -c '. ~/.bashrc && .circleci/${{ inputs.test-script }}' + docker exec -u jenkins "${pid}" bash -c '.circleci/${{ inputs.test-script }}' - name: Upload coverage results if: ${{ inputs.collect-coverage }} shell: bash diff --git a/.github/workflows/_docs.yml b/.github/workflows/_docs.yml index d1bba0962e9..ed9a4ab0ea9 100644 --- a/.github/workflows/_docs.yml +++ b/.github/workflows/_docs.yml @@ -43,7 +43,7 @@ jobs: echo "pid=${pid}" >> "${GITHUB_ENV}" - name: Build & publish docs shell: bash - run: docker exec -u jenkins "${pid}" bash -c '. 
~/.bashrc && .circleci/doc_push.sh' + run: docker exec -u jenkins "${pid}" bash -c '.circleci/doc_push.sh' - name: Teardown Linux uses: pytorch/test-infra/.github/actions/teardown-linux@main if: always() diff --git a/.github/workflows/_test.yml b/.github/workflows/_test.yml index 7c7215a573a..3f0aa8acd47 100644 --- a/.github/workflows/_test.yml +++ b/.github/workflows/_test.yml @@ -116,7 +116,7 @@ jobs: - name: Test shell: bash run: | - docker exec -u jenkins "${pid}" bash -c '. ~/.bashrc && .circleci/${{ inputs.test-script }}' + docker exec --privileged -u jenkins "${pid}" bash -c '.circleci/${{ inputs.test-script }}' - name: Upload coverage results if: ${{ inputs.collect-coverage }} shell: bash diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 83277c8c96a..31b415c503e 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -19,8 +19,7 @@ jobs: uses: ./.github/workflows/_build.yml with: ecr-docker-image-base: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base - gcr-docker-image: gcr.io/tpu-pytorch/xla_base:latest - disable_xrt: 1 + gcr-docker-image: gcr.io/tpu-pytorch/xla_base:dev-3.8_cuda_12.1 cuda: 1 secrets: gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }} @@ -43,7 +42,7 @@ jobs: with: docker-image: ${{ needs.build.outputs.docker-image }} runner: linux.8xlarge.nvidia.gpu - timeout-minutes: 300 + timeout-minutes: 180 disable-xrt: 1 secrets: gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }} diff --git a/.github/workflows/build_and_test_xrt.yml b/.github/workflows/build_and_test_xrt.yml index dd3f95b7100..79f96e0c19c 100644 --- a/.github/workflows/build_and_test_xrt.yml +++ b/.github/workflows/build_and_test_xrt.yml @@ -18,7 +18,7 @@ jobs: uses: ./.github/workflows/_build.yml with: ecr-docker-image-base: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base - gcr-docker-image: gcr.io/tpu-pytorch/xla_base:latest + gcr-docker-image: 
gcr.io/tpu-pytorch/xla_base:dev-3.8_cuda_12.1 disable_xrt: 0 cuda: 1 secrets: @@ -42,7 +42,7 @@ jobs: with: docker-image: ${{ needs.build.outputs.docker-image }} runner: linux.8xlarge.nvidia.gpu - timeout-minutes: 300 + timeout-minutes: 180 disable-xrt: 0 secrets: gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }} diff --git a/.kokoro/Dockerfile b/.kokoro/Dockerfile index 32cc499477d..40210aba1f3 100644 --- a/.kokoro/Dockerfile +++ b/.kokoro/Dockerfile @@ -47,7 +47,7 @@ ARG SCCACHE="$(which sccache)" WORKDIR /pytorch/xla ARG GCLOUD_SERVICE_KEY_FILE="/pytorch/xla/default_credentials.json" -ARG SILO_NAME='cache-silo-ci-gcc-11' # cache bucket for CI +ARG SILO_NAME='cache-silo-ci-dev-3.8_cuda_12.1' # cache bucket for CI RUN time pip install -e . # Run tests