# Review notes (free text ahead of the first "diff --git" header; patch tools
# skip it).
#
# * The reviewed copy of this patch had its line breaks and whitespace runs
#   collapsed. Indentation, blank context lines and the two-space separator in
#   the "sha256sum --check" input are reconstructed below; confirm them
#   against the tree before applying (context/deleted lines must match the
#   files byte for byte).
# * Dockerfile.pax (new file) differs from the reviewed version as follows:
#     - the duplicated "apt-get update" is dropped;
#     - tensorflow-text is cloned over https instead of http;
#     - "cd ${INSTALL_DIR}" falls back to /opt when the variable is unset,
#       which is the default install-lingvo.sh applies to the same variable;
#     - plain local files are brought in with COPY instead of ADD;
#     - header comment wording is tidied.
# * install-lingvo.sh: this patch removes the script's "cd ${INSTALL_DIR}",
#   so the relative "[[ -d lingvo ]]" guard no longer inspects the clone
#   target. The guard now tests ${LINGVO_INSTALLED_DIR} itself.
# * The post-image blob ids on the "index" lines predate these edits.
#
diff --git a/.github/container/Dockerfile.pax b/.github/container/Dockerfile.pax
new file mode 100644
index 000000000..f86c00643
--- /dev/null
+++ b/.github/container/Dockerfile.pax
@@ -0,0 +1,94 @@
+# syntax=docker/dockerfile:1-labs
+###############################################################################
+## Pax for amd64 and aarch64 (Grace Hopper).
+## We want both containers to be equivalent.
+## Grace Hopper needs special treatment as not all pip wheels support it,
+## so this is more complex than what x86 needs.
+## Over time the Grace Hopper installation should become simpler.
+###############################################################################
+
+ARG BASE_IMAGE=ghcr.io/nvidia/jax:latest
+FROM ${BASE_IMAGE}
+
+# We need to build some packages from source, bring some dependencies.
+RUN apt-get update && \
+    apt-get install -y \
+        bat \
+        curl \
+        git \
+        gnupg \
+        rsync \
+        liblzma-dev \
+        && \
+    apt-get autoremove -y && apt-get clean && rm -rf /var/lib/apt/lists
+
+
+# Fetch the bazelisk binary for whichever architecture dpkg reports.
+RUN wget https://github.com/bazelbuild/bazelisk/releases/download/v1.17.0/bazelisk-linux-$(dpkg --print-architecture) -O /usr/bin/bazel && \
+    chmod a+x /usr/bin/bazel
+
+# force a recent tensorflow_datasets version to have latest protobuf dep
+RUN pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.14.0
+
+## Install tensorflow-text
+## The checkout version must match the TF version.
+## INSTALL_DIR falls back to /opt, the same default install-lingvo.sh uses.
+RUN cd ${INSTALL_DIR:-/opt} && \
+    git clone https://github.com/tensorflow/text.git && \
+    cd text && \
+    git checkout v2.14.0 && \
+    ./oss_scripts/run_build.sh && \
+    find * | grep '.whl$' && \
+    pip install ./tensorflow_text-*.whl && \
+    cd .. && \
+    rm -Rf text
+
+# Lingvo: arm64 runs install-lingvo.sh, other architectures pip-install lingvo.
+COPY install-lingvo.sh /usr/local/bin
+COPY lingvo.patch /opt/
+RUN ARCH=`dpkg --print-architecture`; if [ "$ARCH" = "arm64" ] ; then install-lingvo.sh; else pip install lingvo; fi;
+
+COPY install-pax.sh /usr/local/bin
+ENV NVTE_FRAMEWORK=jax
+COPY install-te.sh /usr/local/bin
+COPY install-flax.sh /usr/local/bin
+
+ARG REPO_PAXML=https://github.com/google/paxml.git
+ARG REPO_PRAXIS=https://github.com/google/praxis.git
+ARG REF_PAXML=main
+ARG REF_PRAXIS=main
+
+# Don't defer install pax as on ARM we have this error:
+# pip._vendor.resolvelib.resolvers.ResolutionTooDeep
+RUN install-pax.sh --from_paxml ${REPO_PAXML} --from_praxis ${REPO_PRAXIS} --ref_paxml ${REF_PAXML} --ref_praxis ${REF_PRAXIS}
+
+RUN <<"EOF" bash -ex
+install-flax.sh --defer
+install-te.sh --defer
+
+if [[ -f /opt/requirements-defer.txt ]]; then
+  # SKIP_HEAD_INSTALLS avoids having to install jax from Github source so that
+  # we do not overwrite the jax that was already installed.
+  SKIP_HEAD_INSTALLS=true pip install -r /opt/requirements-defer.txt
+fi
+if [[ -f /opt/cleanup.sh ]]; then
+  bash -ex /opt/cleanup.sh
+fi
+EOF
+
+# Install T5 now, Pip will build the wheel from source, it needs Rust.
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > /tmp/rustup.sh && \
+    echo "be3535b3033ff5e0ecc4d589a35d3656f681332f860c5fd6684859970165ddcc  /tmp/rustup.sh" | sha256sum --check && \
+    bash /tmp/rustup.sh -y && \
+    export PATH=$PATH:/root/.cargo/bin && \
+    pip install t5 && \
+    rm -Rf /root/.cargo /root/.rustup && \
+    mv /root/.profile /root/.profile.save && \
+    grep -v cargo /root/.profile.save > /root/.profile && \
+    rm /root/.profile.save && \
+    mv /root/.bashrc /root/.bashrc.save && \
+    grep -v cargo /root/.bashrc.save > /root/.bashrc && \
+    rm /root/.bashrc.save && \
+    rm -Rf /root/.cache /tmp/*
+
+COPY test-pax.sh /usr/local/bin
diff --git a/.github/container/Dockerfile.pax.amd64 b/.github/container/Dockerfile.pax.amd64
deleted file mode 100644
index e8ae18291..000000000
--- a/.github/container/Dockerfile.pax.amd64
+++ /dev/null
@@ -1,33 +0,0 @@
-# syntax=docker/dockerfile:1-labs
-###############################################################################
-## Pax
-###############################################################################
-
-ARG BASE_IMAGE=ghcr.io/nvidia/jax:latest
-FROM ${BASE_IMAGE}
-
-ADD install-pax.sh /usr/local/bin
-ADD install-flax.sh /usr/local/bin
-ADD install-te.sh /usr/local/bin
-
-ENV NVTE_FRAMEWORK=jax
-ARG REPO_PAXML=https://github.com/google/paxml.git
-ARG REPO_PRAXIS=https://github.com/google/praxis.git
-ARG REF_PAXML=main
-ARG REF_PRAXIS=main
-RUN <<"EOF" bash -ex
-install-pax.sh --defer --from_paxml ${REPO_PAXML} --from_praxis ${REPO_PRAXIS} --ref_paxml ${REF_PAXML} --ref_praxis ${REF_PRAXIS}
-install-flax.sh --defer
-install-te.sh --defer
-
-if [[ -f /opt/requirements-defer.txt ]]; then
-  # SKIP_HEAD_INSTALLS avoids having to install jax from Github source so that
-  # we do not overwrite the jax that was already installed.
-  SKIP_HEAD_INSTALLS=true pip install -r /opt/requirements-defer.txt
-fi
-if [[ -f /opt/cleanup.sh ]]; then
-  bash -ex /opt/cleanup.sh
-fi
-EOF
-
-ADD test-pax.sh /usr/local/bin
diff --git a/.github/container/Dockerfile.pax.arm64 b/.github/container/Dockerfile.pax.arm64
deleted file mode 100644
index 2aeb9d76f..000000000
--- a/.github/container/Dockerfile.pax.arm64
+++ /dev/null
@@ -1,61 +0,0 @@
-# syntax=docker/dockerfile:1-labs
-###############################################################################
-## Pax for AArch64
-###############################################################################
-
-ARG BASE_IMAGE=ghcr.io/nvidia/jax:latest
-FROM ${BASE_IMAGE}
-
-# We need to build some packages from source, bring some dependencies.
-RUN apt-get update && \
-    apt-get update && \
-    apt-get install -y \
-        bat \
-        curl \
-        git \
-        gnupg \
-        rsync \
-        liblzma-dev \
-        && \
-    apt-get autoremove -y && apt-get clean && rm -rf /var/lib/apt/lists
-
-
-RUN wget https://github.com/bazelbuild/bazelisk/releases/download/v1.17.0/bazelisk-linux-arm64 -O /usr/bin/bazel && \
-    chmod a+x /usr/bin/bazel
-
-
-# Lingvo
-ADD install_lingvo_aarch64.sh /opt/
-ADD lingvo.patch /opt/
-RUN /opt/install_lingvo_aarch64.sh
-
-ADD install-pax.sh /usr/local/bin
-RUN install-pax.sh
-
-ENV NVTE_FRAMEWORK=jax
-ADD install-te.sh /usr/local/bin
-RUN install-te.sh
-
-# Install T5 now, Pip will build the wheel from source, it needs Rust.
-RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > /tmp/rustup.sh && \
-    echo "be3535b3033ff5e0ecc4d589a35d3656f681332f860c5fd6684859970165ddcc  /tmp/rustup.sh" | sha256sum --check && \
-    bash /tmp/rustup.sh -y && \
-    export PATH=$PATH:/root/.cargo/bin && \
-    pip install t5 && \
-    rm -Rf /root/.cargo /root/.rustup && \
-    mv /root/.profile /root/.profile.save && \
-    grep -v cargo /root/.profile.save > /root/.profile && \
-    rm /root/.profile.save && \
-    mv /root/.bashrc /root/.bashrc.save && \
-    grep -v cargo /root/.bashrc.save > /root/.bashrc && \
-    rm /root/.bashrc.save && \
-    rm -Rf /root/.cache /tmp/*
-
-ADD test-pax.sh /usr/local/bin
-
-# TODO: Utilize these build-args and use them when installing pax
-# ARG REPO_PAXML=https://github.com/google/paxml.git
-# ARG REPO_PRAXIS=https://github.com/google/praxis.git
-# ARG REF_PAXML=main
-# ARG REF_PRAXIS=main
-# install-pax.sh --defer --from_paxml ${REPO_PAXML} --from_praxis ${REPO_PRAXIS} --ref_paxml ${REF_PAXML} --ref_praxis ${REF_PRAXIS}
\ No newline at end of file
diff --git a/.github/container/install_lingvo_aarch64.sh b/.github/container/install-lingvo.sh
similarity index 75%
rename from .github/container/install_lingvo_aarch64.sh
rename to .github/container/install-lingvo.sh
index 1c1499684..3b9a9b8c8 100755
--- a/.github/container/install_lingvo_aarch64.sh
+++ b/.github/container/install-lingvo.sh
@@ -4,21 +4,7 @@ INSTALL_DIR="${INSTALL_DIR:-/opt}"
 LINGVO_REF="${LINGVO_REF:-HEAD}"
 LINGVO_REPO="${LINGVO_REPO:-https://github.com/tensorflow/lingvo.git}"
 
-## Install tensorflow-text
-cd ${INSTALL_DIR}
-pip install tensorflow_datasets==4.9.2 # force a recent version to have latest protobuf dep
-pip install auditwheel
-pip install tensorflow==2.13.0
-git clone http://github.com/tensorflow/text.git
-pushd text
-git checkout v2.13.0
-./oss_scripts/run_build.sh
-find * | grep '.whl$'
-pip install ./tensorflow_text-*.whl
-popd
-rm -Rf text
-
-## Install lingvo
+## Download lingvo early to fail fast
 LINGVO_INSTALLED_DIR=${INSTALL_DIR}/lingvo
-[[ -d lingvo ]] || git clone ${LINGVO_REPO} ${LINGVO_INSTALLED_DIR}
+[[ -d "${LINGVO_INSTALLED_DIR}" ]] || git clone ${LINGVO_REPO} ${LINGVO_INSTALLED_DIR}
 
@@ -30,12 +16,14 @@ pushd ${LINGVO_INSTALLED_DIR}
 git fetch origin pull/329/head:pr329
 git config user.name "JAX Toolbox"
 git config user.email "jax@nvidia.com"
-# git cherry-pick pr326 pr328 pr329 ## pr326, pr328 merged
+# git cherry-pick --allow-empty pr326 pr328 pr329 ## pr326 pr328 merged
 git cherry-pick --allow-empty pr329
 
 # Disable 2 flaky tests here
 patch -p1 < /opt/lingvo.patch
 
+
+## Install lingvo
 sed -i 's/tensorflow=/#tensorflow=/' docker/dev.requirements.txt
 sed -i 's/tensorflow-text=/#tensorflow-text=/' docker/dev.requirements.txt
 sed -i 's/dataclasses=/#dataclasses=/' docker/dev.requirements.txt
diff --git a/.github/workflows/_build_pax.yaml b/.github/workflows/_build_pax.yaml
index 84012afa8..389deccf5 100644
--- a/.github/workflows/_build_pax.yaml
+++ b/.github/workflows/_build_pax.yaml
@@ -92,7 +92,7 @@ jobs:
         with:
           context: .github/container
           push: true
-          file: .github/container/Dockerfile.pax.${{ matrix.PLATFORM }}
+          file: .github/container/Dockerfile.pax
           platforms: linux/${{ matrix.PLATFORM }}
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}