diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax
index f0f2759e8..ee9a1dbe1 100644
--- a/.github/container/Dockerfile.jax
+++ b/.github/container/Dockerfile.jax
@@ -37,6 +37,17 @@ RUN --mount=type=ssh \
     --mount=type=secret,id=SSH_KNOWN_HOSTS,target=/root/.ssh/known_hosts \
     git clone "${REPO_XLA}" "${SRC_PATH_XLA}" && cd "${SRC_PATH_XLA}" && git checkout ${REF_XLA}
 
+# TODO: This is a WAR to NCCL errors we observe in TOT. Should be removed when no longer needed
+RUN <<"EOF" bash -ex
...
-echo "-e file://${SRC_PATH_JAX}" >> /opt/pip-tools.d/requirements-jax.in
-echo "$(ls ${SRC_PATH_JAX}/dist/*.whl)" >> /opt/pip-tools.d/requirements-jax.in
-echo "flax" >> /opt/pip-tools.d/requirements-jax.in
+RUN <<"EOF" bash -ex
+echo "-e file://${SRC_PATH_JAX}" >> /opt/pip-tools.d/manifest.jax
+echo "jaxlib @ file://$(ls ${SRC_PATH_JAX}/dist/*.whl)" >> /opt/pip-tools.d/manifest.jax
+EOF
+
+## Flax
+ARG REPO_FLAX
+ARG REF_FLAX
+ARG SRC_PATH_FLAX
+RUN get-source.sh -f ${REPO_FLAX} -r ${REF_FLAX} -d ${SRC_PATH_FLAX} -m /opt/pip-tools.d/manifest.flax
+
+## Transformer engine: check out source and build wheel
+ARG REPO_TE
+ARG REF_TE
+ARG SRC_PATH_TE
+ENV NVTE_FRAMEWORK=jax
+ENV SRC_PATH_TE=${SRC_PATH_TE}
+RUN <<"EOF" bash -ex
+set -o pipefail
+pip install ninja && rm -rf ~/.cache/pip
+get-source.sh -f ${REPO_TE} -r ${REF_TE} -d ${SRC_PATH_TE}
+pushd ${SRC_PATH_TE}
+python setup.py bdist_wheel && rm -rf build
+echo "transformer-engine @ file://$(ls ${SRC_PATH_TE}/dist/*.whl)" >> /opt/pip-tools.d/manifest.te
+EOF
 
 # TODO: properly configure entrypoint
diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml
index 37fa6ca68..cc2adc056 100644
--- a/.github/workflows/_sandbox.yaml
+++ b/.github/workflows/_sandbox.yaml
@@ -1,7 +1,13 @@
 name: "~Sandbox"
 
 on:
-  workflow_dispatch:
+  # workflow_dispatch:
+  # push:
+
+permissions:
+  contents: read  # to fetch code
+  actions: write  # to cancel previous workflows
+  packages: write # to upload container
 
 jobs:
   sandbox:
diff --git a/.github/workflows/nightly-t5x-build.yaml b/.github/workflows/nightly-t5x-build.yaml
index a503111b4..48f62bc43 100644
--- a/.github/workflows/nightly-t5x-build.yaml
+++ b/.github/workflows/nightly-t5x-build.yaml
@@ -71,6 +71,7 @@ jobs:
     runs-on: ubuntu-22.04
     outputs:
       DOCKER_TAG_MEALKIT: ''
+      DOCKER_TAG_FINAL: ''
     steps:
       - name: Generate placeholder warning
         shell: bash -x -e {0}
diff --git a/rosetta/rosetta/projects/t5x/README.md b/rosetta/rosetta/projects/t5x/README.md
index aeec2f688..39401f415 100644
--- a/rosetta/rosetta/projects/t5x/README.md
+++ b/rosetta/rosetta/projects/t5x/README.md
@@ -197,7 +197,6 @@ t5x/contrib/gpu/scripts_gpu/singlenode_ft_frompile.sh \
 
 # Known Issues
 * There is a known sporadic NCCL crash that happens when using the T5x container at node counts greater than or equal to 32 nodes. We will fix this in the next release. The issue is tracked [here](https://github.com/NVIDIA/JAX-Toolbox/issues/194).
-* The T5x nightlies disable `NCCL_NVLS_ENABLE=0` ([doc](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-nvls-enable)). Future releases will re-enable this feature.
 
 # Changelog
 - Added Transformer Engine + FP8 support
diff --git a/rosetta/rosetta/projects/vit/README.md b/rosetta/rosetta/projects/vit/README.md
index 0c5b22a47..a57896480 100644
--- a/rosetta/rosetta/projects/vit/README.md
+++ b/rosetta/rosetta/projects/vit/README.md
@@ -157,5 +157,4 @@ Pre-training was performed on 1 node with a global batch size of 4096. Models we
 
 ## Known Issues
 1. By default, gradient accumulation (GA) sums loss across the microbatches. As a result, loss is scaled up when using gradient accumulation, and training with GA only works when using a scale-invariant optimizer such as Adam or Adafactor. ViT fine-tuning is performed using SGD; thus, GA should not be used when fine-tuning.
-2. The nightlies disable `NCCL_NVLS_ENABLE=0` ([doc](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-nvls-enable)). Future releases will re-enable this feature.