From e2b5e6ab84469b7e3314a0ef6e9f5c45f6255eba Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Tue, 3 Oct 2023 13:09:42 -0700 Subject: [PATCH 1/9] Make the pax ARM dockerfile works on AMD64. Also move some stuff to the dockerfile vs install script. --- .github/container/Dockerfile.pax.arm64 | 24 +++++++++++++++++++-- .github/container/install_lingvo_aarch64.sh | 22 ++++++------------- 2 files changed, 28 insertions(+), 18 deletions(-) diff --git a/.github/container/Dockerfile.pax.arm64 b/.github/container/Dockerfile.pax.arm64 index 2aeb9d76f..6d757968e 100644 --- a/.github/container/Dockerfile.pax.arm64 +++ b/.github/container/Dockerfile.pax.arm64 @@ -1,6 +1,10 @@ # syntax=docker/dockerfile:1-labs ############################################################################### -## Pax for AArch64 +## Pax for AArch64 and Amd64 for GraceHopper. +## We want both containers to be equivalent. +## GH need special treatments as not all pip wheel support it. +## So this is more complex than what x86 needs. +## Overtime the GH installation should be simpler. ############################################################################### ARG BASE_IMAGE=ghcr.io/nvidia/jax:latest @@ -20,9 +24,22 @@ RUN apt-get update && \ apt-get autoremove -y && apt-get clean && rm -rf /var/lib/apt/lists -RUN wget https://github.com/bazelbuild/bazelisk/releases/download/v1.17.0/bazelisk-linux-arm64 -O /usr/bin/bazel && \ +RUN wget https://github.com/bazelbuild/bazelisk/releases/download/v1.17.0/bazelisk-linux-$(dpkg --print-architecture) -O /usr/bin/bazel && \ chmod a+x /usr/bin/bazel +# force a recent tensorflow_datasets version to have latest protobuf dep +RUN pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.13.0 + +## Install tensorflow-text +RUN cd ${INSTALL_DIR} && \ + git clone http://github.com/tensorflow/text.git && \ + cd text && \ + git checkout v2.13.0 && \ + ./oss_scripts/run_build.sh && \ + find * | grep '.whl$' && \ + pip install ./tensorflow_text-*.whl && \ + cd .. && \ + rm -Rf text # Lingvo ADD install_lingvo_aarch64.sh /opt/ @@ -36,6 +53,9 @@ ENV NVTE_FRAMEWORK=jax ADD install-te.sh /usr/local/bin RUN install-te.sh +ADD install-flax.sh /usr/local/bin +RUN install-flax.sh + # Install T5 now, Pip will build the wheel from source, it needs Rust. RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > /tmp/rustup.sh && \ echo "be3535b3033ff5e0ecc4d589a35d3656f681332f860c5fd6684859970165ddcc /tmp/rustup.sh" | sha256sum --check && \ diff --git a/.github/container/install_lingvo_aarch64.sh b/.github/container/install_lingvo_aarch64.sh index 1c1499684..2c6341e99 100755 --- a/.github/container/install_lingvo_aarch64.sh +++ b/.github/container/install_lingvo_aarch64.sh @@ -4,21 +4,7 @@ INSTALL_DIR="${INSTALL_DIR:-/opt}" LINGVO_REF="${LINGVO_REF:-HEAD}" LINGVO_REPO="${LINGVO_REPO:-https://github.com/tensorflow/lingvo.git}" -## Install tensorflow-text -cd ${INSTALL_DIR} -pip install tensorflow_datasets==4.9.2 # force a recent version to have latest protobuf dep -pip install auditwheel -pip install tensorflow==2.13.0 -git clone http://github.com/tensorflow/text.git -pushd text -git checkout v2.13.0 -./oss_scripts/run_build.sh -find * | grep '.whl$' -pip install ./tensorflow_text-*.whl -popd -rm -Rf text - -## Install lingvo +## Download lingvo early to fail fast LINGVO_INSTALLED_DIR=${INSTALL_DIR}/lingvo [[ -d lingvo ]] || git clone ${LINGVO_REPO} ${LINGVO_INSTALLED_DIR} @@ -30,12 +16,16 @@ pushd ${LINGVO_INSTALLED_DIR} git fetch origin pull/329/head:pr329 git config user.name "JAX Toolbox" git config user.email "jax@nvidia.com" -# git cherry-pick pr326 pr328 pr329 ## pr326, pr328 merged +# git cherry-pick --allow-empty pr326 pr328 pr329 ## pr326 pr328 merged git cherry-pick --allow-empty pr329 # Disable 2 flaky tests here patch -p1 < /opt/lingvo.patch +popd + +## Install lingvo +pushd ${LINGVO_INSTALLED_DIR} sed -i 's/tensorflow=/#tensorflow=/' docker/dev.requirements.txt sed -i 's/tensorflow-text=/#tensorflow-text=/' docker/dev.requirements.txt sed -i 's/dataclasses=/#dataclasses=/' docker/dev.requirements.txt From 76d9c9dd2611292b91fbcc651fc9421a0888fa18 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Tue, 3 Oct 2023 13:25:56 -0700 Subject: [PATCH 2/9] Update the CI --- .../{Dockerfile.pax.arm64 => Dockerfile.pax} | 0 .github/container/Dockerfile.pax.amd64 | 33 ------------------- .github/workflows/_build_pax.yaml | 2 +- 3 files changed, 1 insertion(+), 34 deletions(-) rename .github/container/{Dockerfile.pax.arm64 => Dockerfile.pax} (100%) delete mode 100644 .github/container/Dockerfile.pax.amd64 diff --git a/.github/container/Dockerfile.pax.arm64 b/.github/container/Dockerfile.pax similarity index 100% rename from .github/container/Dockerfile.pax.arm64 rename to .github/container/Dockerfile.pax diff --git a/.github/container/Dockerfile.pax.amd64 b/.github/container/Dockerfile.pax.amd64 deleted file mode 100644 index e8ae18291..000000000 --- a/.github/container/Dockerfile.pax.amd64 +++ /dev/null @@ -1,33 +0,0 @@ -# syntax=docker/dockerfile:1-labs -############################################################################### -## Pax -############################################################################### - -ARG BASE_IMAGE=ghcr.io/nvidia/jax:latest -FROM ${BASE_IMAGE} - -ADD install-pax.sh /usr/local/bin -ADD install-flax.sh /usr/local/bin -ADD install-te.sh /usr/local/bin - -ENV NVTE_FRAMEWORK=jax -ARG REPO_PAXML=https://github.com/google/paxml.git -ARG REPO_PRAXIS=https://github.com/google/praxis.git -ARG REF_PAXML=main -ARG REF_PRAXIS=main -RUN <<"EOF" bash -ex -install-pax.sh --defer --from_paxml ${REPO_PAXML} --from_praxis ${REPO_PRAXIS} --ref_paxml ${REF_PAXML} --ref_praxis ${REF_PRAXIS} -install-flax.sh --defer -install-te.sh --defer - -if [[ -f /opt/requirements-defer.txt ]]; then - # SKIP_HEAD_INSTALLS avoids having to install jax from Github source so that - # we do not overwrite the jax that was already installed. - SKIP_HEAD_INSTALLS=true pip install -r /opt/requirements-defer.txt -fi -if [[ -f /opt/cleanup.sh ]]; then - bash -ex /opt/cleanup.sh -fi -EOF - -ADD test-pax.sh /usr/local/bin diff --git a/.github/workflows/_build_pax.yaml b/.github/workflows/_build_pax.yaml index 84012afa8..389deccf5 100644 --- a/.github/workflows/_build_pax.yaml +++ b/.github/workflows/_build_pax.yaml @@ -92,7 +92,7 @@ jobs: with: context: .github/container push: true - file: .github/container/Dockerfile.pax.${{ matrix.PLATFORM }} + file: .github/container/Dockerfile.pax platforms: linux/${{ matrix.PLATFORM }} tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} From 5682293abc1da4a6a9feeac394793fd22773a836 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Tue, 3 Oct 2023 13:27:51 -0700 Subject: [PATCH 3/9] Rename a file --- .github/container/Dockerfile.pax | 4 ++-- .../{install_lingvo_aarch64.sh => install-lingvo.sh} | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename .github/container/{install_lingvo_aarch64.sh => install-lingvo.sh} (100%) diff --git a/.github/container/Dockerfile.pax b/.github/container/Dockerfile.pax index 6d757968e..649e938cd 100644 --- a/.github/container/Dockerfile.pax +++ b/.github/container/Dockerfile.pax @@ -42,9 +42,9 @@ RUN cd ${INSTALL_DIR} && \ rm -Rf text # Lingvo -ADD install_lingvo_aarch64.sh /opt/ +ADD install-lingvo.sh /usr/local/bin ADD lingvo.patch /opt/ -RUN /opt/install_lingvo_aarch64.sh +RUN install-lingvo.sh ADD install-pax.sh /usr/local/bin RUN install-pax.sh diff --git a/.github/container/install_lingvo_aarch64.sh b/.github/container/install-lingvo.sh similarity index 100% rename from .github/container/install_lingvo_aarch64.sh rename to .github/container/install-lingvo.sh From cc01fa943342ed4b21d72f13b43adfa3d9fc80d3 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Mon, 9 Oct 2023 16:25:46 -0700 Subject: [PATCH 4/9] Fix x86 pax build --- .github/container/Dockerfile.pax | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/container/Dockerfile.pax b/.github/container/Dockerfile.pax index 649e938cd..16a48eabc 100644 --- a/.github/container/Dockerfile.pax +++ b/.github/container/Dockerfile.pax @@ -44,7 +44,7 @@ RUN cd ${INSTALL_DIR} && \ # Lingvo ADD install-lingvo.sh /usr/local/bin ADD lingvo.patch /opt/ -RUN install-lingvo.sh +RUN ARCH=`dpkg --print-architecture`; if [ "$ARCH" = "arm64" ] ; then install-lingvo.sh; else pip install lingvo; fi; ADD install-pax.sh /usr/local/bin RUN install-pax.sh From ed31a08059f570a13b9394aa9206c3e26f581c91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Bastien?= Date: Tue, 10 Oct 2023 09:38:24 -0400 Subject: [PATCH 5/9] Update .github/container/Dockerfile.pax Co-authored-by: Mehdi Amini --- .github/container/Dockerfile.pax | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/container/Dockerfile.pax b/.github/container/Dockerfile.pax index 16a48eabc..19b4c8547 100644 --- a/.github/container/Dockerfile.pax +++ b/.github/container/Dockerfile.pax @@ -1,6 +1,6 @@ # syntax=docker/dockerfile:1-labs ############################################################################### -## Pax for AArch64 and Amd64 for GraceHopper. +## Pax for Amd64 and Aarch64 for GraceHopper. ## We want both containers to be equivalent. ## GH need special treatments as not all pip wheel support it. ## So this is more complex than what x86 needs. From 84201783850ffdf6394898977bc9f45593353003 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Tue, 10 Oct 2023 13:56:42 +0000 Subject: [PATCH 6/9] Remove useless lines. --- .github/container/install-lingvo.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/container/install-lingvo.sh b/.github/container/install-lingvo.sh index 2c6341e99..3b9a9b8c8 100755 --- a/.github/container/install-lingvo.sh +++ b/.github/container/install-lingvo.sh @@ -21,11 +21,9 @@ git cherry-pick --allow-empty pr329 # Disable 2 flaky tests here patch -p1 < /opt/lingvo.patch -popd ## Install lingvo -pushd ${LINGVO_INSTALLED_DIR} sed -i 's/tensorflow=/#tensorflow=/' docker/dev.requirements.txt sed -i 's/tensorflow-text=/#tensorflow-text=/' docker/dev.requirements.txt sed -i 's/dataclasses=/#dataclasses=/' docker/dev.requirements.txt From 017f88f7ad046fc4321ead772ba0d2c4a274c12a Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Tue, 10 Oct 2023 08:23:41 -0700 Subject: [PATCH 7/9] Defer to packages installation --- .github/container/Dockerfile.pax | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/.github/container/Dockerfile.pax b/.github/container/Dockerfile.pax index 19b4c8547..9eccf2882 100644 --- a/.github/container/Dockerfile.pax +++ b/.github/container/Dockerfile.pax @@ -31,6 +31,7 @@ RUN wget https://github.com/bazelbuild/bazelisk/releases/download/v1.17.0/bazeli RUN pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.13.0 ## Install tensorflow-text +## The checkout version must match the TF version. RUN cd ${INSTALL_DIR} && \ git clone http://github.com/tensorflow/text.git && \ cd text && \ @@ -47,14 +48,27 @@ ADD lingvo.patch /opt/ RUN ARCH=`dpkg --print-architecture`; if [ "$ARCH" = "arm64" ] ; then install-lingvo.sh; else pip install lingvo; fi; ADD install-pax.sh /usr/local/bin -RUN install-pax.sh - ENV NVTE_FRAMEWORK=jax ADD install-te.sh /usr/local/bin -RUN install-te.sh - ADD install-flax.sh /usr/local/bin -RUN install-flax.sh + +# Don't defer install pax as on ARM we have this error: +# pip._vendor.resolvelib.resolvers.ResolutionTooDeep +RUN install-pax.sh + +RUN <<"EOF" bash -ex +install-flax.sh --defer +install-te.sh --defer + +if [[ -f /opt/requirements-defer.txt ]]; then + # SKIP_HEAD_INSTALLS avoids having to install jax from Github source so that + # we do not overwrite the jax that was already installed. + SKIP_HEAD_INSTALLS=true pip install -r /opt/requirements-defer.txt +fi +if [[ -f /opt/cleanup.sh ]]; then + bash -ex /opt/cleanup.sh +fi +EOF # Install T5 now, Pip will build the wheel from source, it needs Rust. RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > /tmp/rustup.sh && \ From cdbc223b293e6a355ddee65289607c5e21b6ae7b Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 11 Oct 2023 09:53:01 -0700 Subject: [PATCH 8/9] Try with TF 2.14 --- .github/container/Dockerfile.pax | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/container/Dockerfile.pax b/.github/container/Dockerfile.pax index 9eccf2882..9227f3521 100644 --- a/.github/container/Dockerfile.pax +++ b/.github/container/Dockerfile.pax @@ -28,14 +28,14 @@ RUN wget https://github.com/bazelbuild/bazelisk/releases/download/v1.17.0/bazeli chmod a+x /usr/bin/bazel # force a recent tensorflow_datasets version to have latest protobuf dep -RUN pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.13.0 +RUN pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.14.0 ## Install tensorflow-text ## The checkout version must match the TF version. RUN cd ${INSTALL_DIR} && \ git clone http://github.com/tensorflow/text.git && \ cd text && \ - git checkout v2.13.0 && \ + git checkout v2.14.0 && \ ./oss_scripts/run_build.sh && \ find * | grep '.whl$' && \ pip install ./tensorflow_text-*.whl && \ From a0917d126bc9925d35ef6e99e971f890d406b486 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Thu, 12 Oct 2023 10:25:11 -0700 Subject: [PATCH 9/9] Port "Fixes .github/container/Dockerfile.{t5x,pax} by passing in the repo/refs" --- .github/container/Dockerfile.pax | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/.github/container/Dockerfile.pax b/.github/container/Dockerfile.pax index 9227f3521..f86c00643 100644 --- a/.github/container/Dockerfile.pax +++ b/.github/container/Dockerfile.pax @@ -52,9 +52,14 @@ ENV NVTE_FRAMEWORK=jax ADD install-te.sh /usr/local/bin ADD install-flax.sh /usr/local/bin +ARG REPO_PAXML=https://github.com/google/paxml.git +ARG REPO_PRAXIS=https://github.com/google/praxis.git +ARG REF_PAXML=main +ARG REF_PRAXIS=main + # Don't defer install pax as on ARM we have this error: # pip._vendor.resolvelib.resolvers.ResolutionTooDeep -RUN install-pax.sh +RUN install-pax.sh --from_paxml ${REPO_PAXML} --from_praxis ${REPO_PRAXIS} --ref_paxml ${REF_PAXML} --ref_praxis ${REF_PRAXIS} RUN <<"EOF" bash -ex install-flax.sh --defer @@ -86,10 +91,3 @@ RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > /tmp/rustup.sh & rm -Rf /root/.cache /tmp/* ADD test-pax.sh /usr/local/bin - -# TODO: Utilize these build-args and use them when installing pax -# ARG REPO_PAXML=https://github.com/google/paxml.git -# ARG REPO_PRAXIS=https://github.com/google/praxis.git -# ARG REF_PAXML=main -# ARG REF_PRAXIS=main -# install-pax.sh --defer --from_paxml ${REPO_PAXML} --from_praxis ${REPO_PRAXIS} --ref_paxml ${REF_PAXML} --ref_praxis ${REF_PRAXIS} \ No newline at end of file