From 934658351fcbe3fd573b10a94ba2ad0f178aa78d Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Mon, 23 Oct 2023 14:23:24 -0700 Subject: [PATCH 01/16] Loosen dali benchmark test as it has always been around 155 (#320) Examples: * [158](https://github.com/NVIDIA/JAX-Toolbox/actions/runs/6532058705/job/17734663330#step:7:187) * [154](https://github.com/NVIDIA/JAX-Toolbox/actions/runs/6523380018/job/17714026959#step:7:184) * [155](https://github.com/NVIDIA/JAX-Toolbox/actions/runs/6516962689/job/17701057693#step:7:184) The original benchmark was done on a workstation with different specs than the runner here. --- rosetta/rosetta/projects/vit/dali_utils_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rosetta/rosetta/projects/vit/dali_utils_test.py b/rosetta/rosetta/projects/vit/dali_utils_test.py index ddee2772f..7449b65e0 100644 --- a/rosetta/rosetta/projects/vit/dali_utils_test.py +++ b/rosetta/rosetta/projects/vit/dali_utils_test.py @@ -56,7 +56,7 @@ def test_baseline_dali_iteration_stats( bps = iter_per_sec(dataset, batch_size=dummy_wds_metadata.batch_size, num_iter=500) - assert bps > 170 + assert bps > (155 * 0.9) def test_dali_cls_preprocessing(dummy_wds_metadata): From c50839183fb69d20ab946cd7312521d796dc2c53 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Mon, 23 Oct 2023 15:27:45 -0700 Subject: [PATCH 02/16] Fixes small grammar typo in t5x readme (#330) --- rosetta/rosetta/projects/t5x/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rosetta/rosetta/projects/t5x/README.md b/rosetta/rosetta/projects/t5x/README.md index ccc88cab8..6a0776fdc 100644 --- a/rosetta/rosetta/projects/t5x/README.md +++ b/rosetta/rosetta/projects/t5x/README.md @@ -22,7 +22,7 @@ WORKSPACE_PATH="" # Path used for run outputs (unspecified = /t5x_home/workspac ## Container We provide the latest fully built, ready-to-use, and verified container here: `ghcr.io/nvidia/t5x:latest-verified`. The verified containers will be updated -periodically, but if you wish to use the bleeding edge (which may come have unexpected behavior), please use `ghcr.io/nvidia/t5x:latest`. +periodically, but if you wish to use the bleeding edge (which may come with unexpected behavior), please use `ghcr.io/nvidia/t5x:latest`. We also provide nightly dated images with the naming pattern [ghcr.io/nvidia/t5x:nightly-YYYY-MM-DD](https://github.com/NVIDIA/JAX-Toolbox/pkgs/container/t5x), but we encourage you to use the latest ones to get the best performance. From 5b70916c96339a4b0627fe483985a4a9eca4a7ed Mon Sep 17 00:00:00 2001 From: "Yu-Hang \"Maxin\" Tang" Date: Tue, 24 Oct 2023 22:06:57 -0700 Subject: [PATCH 03/16] Create Docker manifest V2 schema 2 images in CI (#325) These images will be located in the `ghcr.io/nvidia/jax-toolbox-retrofit` repo. 
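For reference, the split this workflow performs can be reasoned about outside of CI as well. The sketch below is illustrative only (not part of this patch): it shells out to `docker manifest inspect` from Python to report whether an image is an OCI image index and, if so, which per-architecture digests the retrofit job would copy with `skopeo`. The image name used here is a placeholder.

```
import json
import subprocess

def list_arch_digests(image):
    """Return (os, arch, digest) tuples for a multi-arch OCI image index."""
    out = subprocess.run(
        ["docker", "manifest", "inspect", image],
        check=True, capture_output=True, text=True,
    ).stdout
    manifest = json.loads(out)
    # Same media-type check the workflow performs before splitting.
    if manifest.get("mediaType") != "application/vnd.oci.image.index.v1+json":
        print(f"{image} is not an OCI image index; nothing to split")
        return []
    # Drop platform.os == "unknown" entries (attestations), as the workflow does.
    return [
        (m["platform"]["os"], m["platform"]["architecture"], m["digest"])
        for m in manifest["manifests"]
        if m["platform"]["os"] != "unknown"
    ]

if __name__ == "__main__":
    # Placeholder image name; substitute any multi-arch tag published by CI.
    for os_name, arch, digest in list_arch_digests("ghcr.io/nvidia/jax:latest"):
        print(f"{os_name}/{arch}: {digest}")
```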
--- .github/workflows/_publish_container.yaml | 4 - .github/workflows/_retrofit_container.yaml | 98 ++++++++++++++++++++++ .github/workflows/ci.yaml | 68 +++++++++++++++ 3 files changed, 166 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/_retrofit_container.yaml diff --git a/.github/workflows/_publish_container.yaml b/.github/workflows/_publish_container.yaml index 40340cb6f..c2d03c7f7 100644 --- a/.github/workflows/_publish_container.yaml +++ b/.github/workflows/_publish_container.yaml @@ -85,10 +85,6 @@ jobs: docker buildx imagetools create --tag $tag ${{ steps.get-manifests.outputs.manifests }} done - - name: Skopeo Login to GitHub Container Registry - run: | - echo ${{ secrets.GITHUB_TOKEN }} | skopeo login --authfile - ghcr.io - - name: Create single-arch images if: ${{ inputs.EXPOSE_SINGLE_ARCH_IMAGES }} shell: bash -x -e {0} diff --git a/.github/workflows/_retrofit_container.yaml b/.github/workflows/_retrofit_container.yaml new file mode 100644 index 000000000..57301deaf --- /dev/null +++ b/.github/workflows/_retrofit_container.yaml @@ -0,0 +1,98 @@ +name: ~split multi-arch OCI manifests into Docker Image Manifest V2, Schema 2 + +on: + workflow_call: + inputs: + SOURCE_IMAGE: + type: string + description: 'Source docker image:' + required: true + TARGET_TAGS: + type: string + description: 'Target docker tags in docker/metadata-action format:' + required: true + EXPOSE_SINGLE_ARCH_IMAGES: + type: boolean + description: 'Also expose single-arch images:' + required: false + default: true + outputs: + # MULTIARCH_TAG: + # description: "Tags of the multi-arch image published" + # value: ${{ jobs.publish.outputs.MULTIARCH_TAG }} + SINGLEARCH_TAGS: + description: "Tags of the single-arch images published" + value: ${{ jobs.publish.outputs.SINGLEARCH_TAGS }} + +env: + DOCKER_REPOSITORY: 'ghcr.io/nvidia/jax-toolbox-retrofit' + +jobs: + publish: + runs-on: ubuntu-22.04 + outputs: + # MULTIARCH_TAG: ${{ steps.meta.outputs.tags }} + SINGLEARCH_TAGS: ${{ steps.single-arch.outputs.tags }} + steps: + - name: Login to GitHub Container Registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Set docker metadata + id: meta + uses: docker/metadata-action@v4 + with: + images: ${{ env.DOCKER_REPOSITORY }} + flavor: latest=false + tags: ${{ inputs.TARGET_TAGS }} + + - name: Extract manifests from the source manifest list + id: get-manifests + shell: bash -x -e {0} + run: | + SOURCE_REPO=$(echo ${{ inputs.SOURCE_IMAGE }} | cut -d: -f1) + MEDIA_TYPE=$(docker manifest inspect ${{ inputs.SOURCE_IMAGE }} | jq -r '.mediaType') + if [[ ${MEDIA_TYPE} != "application/vnd.oci.image.index.v1+json" ]]; then + echo "This workflow only work with OCI manifest lists" + exit 1 + fi + + MANIFESTS=$( + docker manifest inspect ${{ inputs.SOURCE_IMAGE }} |\ + jq -r '.manifests[] | select(.platform.os != "unknown") | .digest' |\ + xargs -I{} echo ${SOURCE_REPO}@{} |\ + tr '\n' ' ' + ) + + echo "manifests=$MANIFESTS" >> $GITHUB_OUTPUT + + ## Requires skopeo >= v1.6.0, but Actions only has v1.4.0 + # - name: Create Docker v2s2 multi-arch manifest list + # id: multi-arch + # shell: bash -x -e {0} + # run: | + # for tag in $(echo "${{ steps.meta.outputs.tags }}"); do + # skopeo copy --multi-arch all --format v2s2 docker://${{ inputs.SOURCE_IMAGE }} docker://$tag + # done + + - name: Create Docker v2s2 single-arch manifests + id: single-arch + if: ${{ inputs.EXPOSE_SINGLE_ARCH_IMAGES }} + shell: bash -x -e 
{0} + run: | + output_tags="" + # Create new manifest list from extracted manifests + for manifest in ${{ steps.get-manifests.outputs.manifests }}; do + os=$(docker manifest inspect -v $manifest | jq -r '.Descriptor.platform.os') + arch=$(docker manifest inspect -v $manifest | jq -r '.Descriptor.platform.architecture') + for tag in $(echo "${{ steps.meta.outputs.tags }}"); do + single_arch_tag="${tag}-${os}-${arch}" + skopeo copy --format v2s2 docker://$manifest docker://${single_arch_tag} + output_tags="${output_tags} ${single_arch_tag}" + done + done + + echo "tags=${output_tags}" >> $GITHUB_OUTPUT diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index a75dfac8b..f06feb9cc 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -192,6 +192,74 @@ jobs: | ROSETTA(pax) | ${{ needs.build-rosetta-pax.outputs.DOCKER_TAGS }} | EOF + retrofit-containers: + needs: [build-base, build-jax, build-te, build-t5x, build-pax, build-rosetta-t5x, build-rosetta-pax] + if: always() + runs-on: ubuntu-22.04 + env: + DOCKER_REPO: 'ghcr.io/nvidia/jax-toolbox-retrofit' + steps: + - name: Login to GitHub Container Registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + ## Requires skopeo >= v1.6.0, but Actions only has v1.4.0 + # - name: Create Docker v2s2 multi-arch manifest list + # id: multi-arch + # shell: bash -x -e {0} + # run: | + # for tag in $(echo "${{ steps.meta.outputs.tags }}"); do + # skopeo copy --multi-arch all --format v2s2 docker://${{ inputs.SOURCE_IMAGE }} docker://$tag + # done + + - name: Create Docker v2s2 single-arch manifests + id: single-arch + shell: bash -x -e {0} + run: | + + for source in \ + ${{ needs.build-base.outputs.DOCKER_TAGS }} \ + ${{ needs.build-jax.outputs.DOCKER_TAGS }} \ + ${{ needs.build-te.outputs.DOCKER_TAGS }} \ + ${{ needs.build-t5x.outputs.DOCKER_TAGS }} \ + ${{ needs.build-pax.outputs.DOCKER_TAGS }} \ + ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS }} \ + ${{ needs.build-rosetta-pax.outputs.DOCKER_TAGS }} \ + ; do + source_repo=$(echo ${source} | cut -d: -f1) + media_type=$(docker manifest inspect ${source} | jq -r '.mediaType') + if [[ ${media_type} != "application/vnd.oci.image.index.v1+json" ]]; then + echo "Image ${source} is already in Docker format v2s2" + dest=${DOCKER_REPO}:$(echo ${source} | cut -d: -f2) + skopeo copy --format v2s2 docker://${source} docker://${dest} + echo "${dest}" >> $GITHUB_STEP_SUMMARY + else + manifests=$( + docker manifest inspect ${source} |\ + jq -r '.manifests[] | select(.platform.os != "unknown") | .digest' |\ + xargs -I{} echo ${source_repo}@{} |\ + tr '\n' ' ' + ) + + ## registry/org/repo:tag -> repo-tag + # dest_tag=$(echo ${source} | cut -d: -f1 | cut -d/ -f3)-$(echo ${source} | cut -d: -f2) + ## registry/org/repo:tag -> tag + dest_tag=$(echo ${source} | cut -d: -f2) + + for manifest in ${manifests}; do + os=$(docker manifest inspect -v $manifest | jq -r '.Descriptor.platform.os') + arch=$(docker manifest inspect -v $manifest | jq -r '.Descriptor.platform.architecture') + # single_arch_tag="ghcr.io/nvidia/jax-toolbox-retrofit:${{ github.run_id }}-${dest_tag}-${os}-${arch}" + single_arch_tag="${DOCKER_REPO}:${dest_tag}-${os}-${arch}" + skopeo copy --format v2s2 docker://$manifest docker://${single_arch_tag} + echo "${single_arch_tag}" >> $GITHUB_STEP_SUMMARY + done + fi + done + test-distribution: needs: metadata uses: ./.github/workflows/_test_distribution.yaml From 
71216383b8666ccf0874d550e1b855ac68da4e6d Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 25 Oct 2023 14:48:21 -0700 Subject: [PATCH 04/16] Updated t5-large perf (#342) --- rosetta/rosetta/projects/t5x/README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/rosetta/rosetta/projects/t5x/README.md b/rosetta/rosetta/projects/t5x/README.md index 6a0776fdc..aeec2f688 100644 --- a/rosetta/rosetta/projects/t5x/README.md +++ b/rosetta/rosetta/projects/t5x/README.md @@ -80,16 +80,16 @@ For a SLURM+pyxis cluster, [`example*.sub`](./scripts) files provide example slu ## Convergence and performance For our Pile convergence runs, we used a Global batch size of 2304 for XXL and 2016-2048 for all other models, where GBS is defined as #GPUs * BS/GPU / Tensor Parallel(TP). Below are example (tested) hardware topologies on NVIDIA DGX A100 (8x A100-SXM4-80G) and H100-SXM-80G nodes. -| size | GPU | Precision | #GPUs | TP | BS / GPU | Sequences/Sec | Seq/Sec/GPU | Est. Walltime | GPU-days | MNLI 2.0 - matched | SQuAD v1.1 (EM/F1) | Convergence Log | Config | -| ---- | ------------ | --------- | ----- | ----- | -------- | ------------- | ----------- | ------------- | -------- |------------------ | ------------------ | --------------- | ---- | -| T5-v1.1-small | A100 80G SXM | bf16 | 8 | 1 | 256 | ~5712 | 714 | 4.2 days | 33 | 83.06% | 78.33 / 86.63 | [log](https://tensorboard.dev/experiment/lWnHal7PRnOLeZuewyWVxQ/#scalars&_smoothingWeight=0) | [pile](../t5/t5_1_1/examples/small_pile_pretrain.gin) -| T5-v1.1-large | A100 80G SXM | bf16 | 64 | 1 | 32 | ~4853 | 75.8 | 4.8 days | 309 | 89.23% | 86.12 / 93.21 | [log](https://tensorboard.dev/experiment/aOxJBIvTQBeTJ8XGXxaL6Q/#scalars&_smoothingWeight=0) |[pile](../t5/t5_1_1/examples/large_pile_pretrain.gin) -| T5-v1.1-xl | A100 80G SXM | bf16 | 144 | 1 | 8 | ~3021 | 21.0 | 7.9 days | 1,133 | N/A(perf test) | N/A (perf test) | |[pile](../t5/t5_1_1/examples/xl_pile_pretrain.gin) -| T5-v1.1-xl | A100 80G SXM | bf16 | 256 | 1 | 8 | ~4322 | 16.9 | 5.5 days | 1,408 | 91.15% | 89.36 / 95.29 | [log](https://tensorboard.dev/experiment/vuRoEYgkRgWiEtbvgxlOqw/#scalars&_smoothingWeight=0) |[pile](../t5/t5_1_1/examples/xl_pile_pretrain.gin) -| T5-v1.1-xxl | A100 80G SXM | bf16 | 512 | 8 | 36 | ~1887 | 3.69 | 12.6 days | 6,431 |N/A(partial run) | N/A(partial run) | |[pile](../t5/t5_1_1/examples/xxl_pile_pretrain.gin) -| T5-v1.1-large | **H100 80G SXM** | TE-fp8 | 64 | 1 | 32 | ~10156 | **158.7** | **2.3 days** | **147** | 89.1% | 86.36 / 93.5 | [log](https://tensorboard.dev/experiment/QJYnDaaBSeuZtYPXXtAG3Q/#scalars&_smoothingWeight=0) |[pile](../t5/t5_1_1/examples/large_pile_pretrain.gin) -| T5-v1.1-xl | **H100 80G SXM** | TE-fp8 | 144 | 1 | 14 | ~7257 | **50.4** | **3.3 days** | **475** | N/A (perf test) | N/A (perf test) | |[pile](../t5/t5_1_1/examples/xl_pile_pretrain.gin) -| T5-v1.1-xl | **H100 80G SXM** | TE-fp8 | 256 | 1 | 8 | ~9688 | **37.8** | **2.4 days** | **614** | N/A (perf test) | N/A (perf test) | |[pile](../t5/t5_1_1/examples/xl_pile_pretrain.gin) +| size | GPU | Precision | #GPUs | TP | BS / GPU | Sequences/Sec | Seq/Sec/GPU | Est. 
Walltime | GPU-days | MNLI 2.0 - matched | SQuAD v1.1 (EM/F1) | Convergence Log | Config | +| ---- | ------------ | --------- | ----- | ----- | -------- | ------------- | ----------- | ------------- | -------- |------------------ | ------------------ | --------------- | ---- | +| T5-v1.1-small | A100 80G SXM | bf16 | 8 | 1 | 256 | ~5712 | 714 | 4.2 days | 33 | 83.06% | 78.33 / 86.63 | [log](https://tensorboard.dev/experiment/lWnHal7PRnOLeZuewyWVxQ/#scalars&_smoothingWeight=0) | `t5x/contrib/gpu/t5/t5_1_1/examples/small_pile_pretrain.gin` | +| T5-v1.1-large | A100 80G SXM | bf16 | 64 | 1 | 32 | ~4853 | 75.8 | 4.8 days | 309 | 89.23% | 86.12 / 93.21 | [log](https://tensorboard.dev/experiment/aOxJBIvTQBeTJ8XGXxaL6Q/#scalars&_smoothingWeight=0) | `t5x/contrib/gpu/t5/t5_1_1/examples/large_pile_pretrain.gin` | +| T5-v1.1-xl | A100 80G SXM | bf16 | 144 | 1 | 8 | ~3021 | 21.0 | 7.9 days | 1,133 | N/A(perf test) | N/A (perf test) | | `t5x/contrib/gpu/t5/t5_1_1/examples/xl_pile_pretrain.gin` | +| T5-v1.1-xl | A100 80G SXM | bf16 | 256 | 1 | 8 | ~4322 | 16.9 | 5.5 days | 1,408 | 91.15% | 89.36 / 95.29 | [log](https://tensorboard.dev/experiment/vuRoEYgkRgWiEtbvgxlOqw/#scalars&_smoothingWeight=0) | `t5x/contrib/gpu/t5/t5_1_1/examples/xl_pile_pretrain.gin` | +| T5-v1.1-xxl | A100 80G SXM | bf16 | 512 | 8 | 36 | ~1887 | 3.69 | 12.6 days | 6,431 | N/A(partial run) | N/A(partial run) | | `t5x/contrib/gpu/t5/t5_1_1/examples/xxl_pile_pretrain.gin` | +| T5-v1.1-large | **H100 80G SXM** | TE-fp8 | 64 | 1 | 32 | ~11139 | **174.1** | **2.1 days** | **134** | 89.1% | 86.36 / 93.5 | [log](https://tensorboard.dev/experiment/QJYnDaaBSeuZtYPXXtAG3Q/#scalars&_smoothingWeight=0) | `t5x/contrib/gpu/t5/t5_1_1/examples/large_pile_pretrain.gin` | +| T5-v1.1-xl | **H100 80G SXM** | TE-fp8 | 144 | 1 | 14 | ~7257 | **50.4** | **3.3 days** | **475** | N/A (perf test) | N/A (perf test) | | `t5x/contrib/gpu/t5/t5_1_1/examples/xl_pile_pretrain.gin` | +| T5-v1.1-xl | **H100 80G SXM** | TE-fp8 | 256 | 1 | 8 | ~9688 | **37.8** | **2.4 days** | **614** | N/A (perf test) | N/A (perf test) | | `t5x/contrib/gpu/t5/t5_1_1/examples/xl_pile_pretrain.gin` | Note: Convergence (as shown in log) was not necessarily done with the hardware topology listed, but the listed topology is tested. Estimated Walltime is calculated assuming full throughput (seq/sec) continuously. In practice, there are compilation overheads at the beginning of each run/restart (in cluster settings) + checkpointing overheads (if any). From f09bde62b909d287e99cb9807775ebd80578277d Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 25 Oct 2023 15:23:07 -0700 Subject: [PATCH 05/16] Fix pre-submit CI from building full rosetta-t5x multiarch matrix which is not possible yet (#324) CI would erroneously say arm build failed, but it failued b/c there is no ARM t5x image yet. 
This change disables ARM builds for rosetta-t5x until https://github.com/NVIDIA/JAX-Toolbox/pull/252 is in --- .github/workflows/ci.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index f06feb9cc..ef0a54432 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -158,6 +158,7 @@ jobs: BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} BASE_IMAGE: ${{ needs.build-t5x.outputs.DOCKER_TAGS }} BASE_LIBRARY: t5x + PLATFORMS: '["amd64"]' secrets: inherit build-rosetta-pax: From df7a5fea753efbd9eea1ea86b09b570247deb9c0 Mon Sep 17 00:00:00 2001 From: ashors1 <71393111+ashors1@users.noreply.github.com> Date: Thu, 26 Oct 2023 16:01:18 -0700 Subject: [PATCH 06/16] Update Pax README and sub file (#345) - Adds FP8 documentation - Updates perf table - Makes some other minor improvements for readability --- rosetta/rosetta/projects/pax/README.md | 136 +++++++++++------- .../pax/scripts/example_slurm_pile.sub | 10 +- 2 files changed, 87 insertions(+), 59 deletions(-) diff --git a/rosetta/rosetta/projects/pax/README.md b/rosetta/rosetta/projects/pax/README.md index c8c30e9ab..2905d74e5 100644 --- a/rosetta/rosetta/projects/pax/README.md +++ b/rosetta/rosetta/projects/pax/README.md @@ -1,28 +1,28 @@ # Pax -[Pax](https://github.com/google/paxml/tree/main) is a framework developed by Google optimized for running machine learning experiments using JAX. Pax consists of the Paxml and [Praxis](https://github.com/google/praxis/tree/main) repositories. Pax is maintained as a [distribution](../../../docs/DEVELOPMENT.md) within rosetta. This means that we cherry-pick the necessary changes to optimize Pax for GPUs on top of upstream Paxml and Praxis' `main` branches. +[Pax](https://github.com/google/paxml/tree/main) is a framework developed by Google optimized for running machine learning experiments using JAX. Pax consists of the Paxml and [Praxis](https://github.com/google/praxis/tree/main) repositories and is maintained as a [distribution](../../../docs/DEVELOPMENT.md) within Rosetta. This means that we cherry-pick the necessary changes to optimize Pax for GPUs on top of upstream Paxml and Praxis' `main` branches. We also provide support for FP8 training via [Transformer Engine](https://github.com/NVIDIA/TransformerEngine). Any `paxml/*` or `praxis/*` relative directory/file can be found in [google/paxml](https://github.com/google/paxml/tree/main) or [google/praxis](https://github.com/google/praxis/tree/main), respectively, but to -view the most up-to-date version of that directory/file with any GPU-specific patches, please see [Inspecting the source code](#inspecting-the-source-code). +view the most up-to-date version of that directory/file with any GPU-specific patches, please see [Inspecting the Source Code](#inspecting-the-source-code). ## Hardware Specifications -Convergence and performance has been validated on NVIDIA DGX A100 (8x A100 80G) nodes; for details, please refer to the [Configs](#configs) section below. We provide both singlenode and multinode pre-training support. If running on a machine with less than 80G memory, some of the default configurations may run out of memory; if you run out of memory and have more GPUs available, increase your GPU count and decrease your batch size per GPU. +Convergence and performance has been validated on NVIDIA DGX H100 (8x H100 80G) and A100 (8x A100 80G) nodes; for details, please refer to the [Configs](#configs) section below. 
We provide both singlenode and multinode pre-training support. If running on a machine with less than 80G memory, some of the default configurations may run out of memory; if you run out of memory and have more GPUs available, increase your GPU count and decrease your batch size per GPU. ## Containers -We provide a fully built and ready-to-use container which includes the latest optimizations, experimental features, and examples benchmarked for multi-node, multi-GPU training: `nvcr.io/nvidia/jax:23.08-paxml-py3`. This container also provides bfloat16 [Transformer Engine](https://github.com/NVIDIA/TransformerEngine) support. +We provide fully built and ready-to-use containers which include the latest optimizations, experimental features, and examples benchmarked for multi-node, multi-GPU training: `nvcr.io/nvidia/jax:23.10-paxml-py3` (multi-arch), `nvcr.io/nvidia/jax:23.10-paxml-py3-amd64` and `nvcr.io/nvidia/jax:23.10-paxml-py3-arm64`. These containers also provide FP8 support via [Transformer Engine](https://github.com/NVIDIA/TransformerEngine). Verified containers will be updated periodically, but if you wish to use the bleeding edge (which may come with unexpected behavior), please use `ghcr.io/nvidia/pax:latest`. We also provide nightly dated images with the naming pattern `ghcr.io/nvidia/pax:nightly-YYYY-MM-DD`, but we encourage you to use the latest ones for the best performance. For more information on the Pax build and for details on how to manually build the Pax distribution, please refer to [DEVELOPMENT.md](../../../docs/DEVELOPMENT.md). -*Note*: All paths mentioned in subsequent sections are relative to the top-level directory of the Paxml repository. When working interactively with containers, make sure you are in `/opt/paxml` before running any commmands. +*Note*: All paths mentioned in subsequent sections are relative to the top-level directory of the Paxml repository. When working interactively with containers, make sure you navigate to `/opt/paxml` before running any commmands. ### Launching a container Use the following command to launch a container: ``` docker run -ti --gpus=all --net=host --ipc=host -v :/opt/paxml/datasets -v :/opt/paxml/workspace -v :/opt/paxml/vocab -w /opt/paxml /bin/bash ``` -where `DATASET_PATH` is the path to the Pile or Lambada dataset. If these datasets have not yet been downloaded, they can be downloaded inside of the container (see [Downloading The Pile and Lambada Datasets](#Downloading-the-pile-and-lambada-datasets) for more). `WORKSPACE_PATH` is the path to the directory where you would like to store any persistent files, and `VOCAB_PATH` is the path to the pretrained sentencepiece model to use during tokenization (see [Downloading the SentencePiece Model](#Downloading-the-sentencepiece-model) for more). +where `DATASET_PATH` is the path to the Pile or Lambada dataset. If these datasets have not yet been downloaded, they can be downloaded from inside of the container (see [Downloading The Pile and Lambada Datasets](#Downloading-the-pile-and-lambada-datasets) for more). `WORKSPACE_PATH` is the path to the directory where you would like to store any persistent files, and `VOCAB_PATH` is the path to the pretrained SentencePiece model to use during tokenization (see [Downloading the SentencePiece Model](#Downloading-the-sentencepiece-model) for more). ## Downloading The Pile and Lambada Datasets -The given models are trained using The Pile dataset and evaluated using the Lambada dataset. 
The scripts [download_the_pile.py](https://github.com/google/paxml/blob/main/paxml/contrib/gpu/scripts_gpu/download_the_pile.py) and [download_lambada.py](https://github.com/google/paxml/blob/main/paxml/contrib/gpu/scripts_gpu/download_lambada.py) will download The Pile and the Lambada datasets to the `TFDS_DATA_DIR` enviroment variable. To control the location of the downloaded datasets, use the following command prior to running the download scripts: `export TFDS_DATA_DIR=`. After the data has been successfully downloaded, use the same `TFDS_DATA_DIR` when running experiments. +The GPT model configs we provide are trained using The Pile dataset and evaluated using the Lambada dataset. The scripts [download_the_pile.py](https://github.com/google/paxml/blob/main/paxml/contrib/gpu/scripts_gpu/download_the_pile.py) and [download_lambada.py](https://github.com/google/paxml/blob/main/paxml/contrib/gpu/scripts_gpu/download_lambada.py) will download The Pile and Lambada datasets to the `TFDS_DATA_DIR` enviroment variable. To control the location of the downloaded datasets, use the following command prior to running the download scripts: `export TFDS_DATA_DIR=`. After the data has been successfully downloaded, use the same `TFDS_DATA_DIR` when running experiments. ## Downloading the SentencePiece Model Pax models require a pretrained SentencePiece model to tokenize the datasets. The SentencePiece model used in the following experiments is `gs://mlperf-llm-public2/vocab/c4_en_301_5Mexp2_spm.model`. This model was trained using [these instructions](https://github.com/sgpyc/training/blob/paxml-llm-draft/large_language_model/paxml/utils/generate_spm.md). Use the following commands to download the tokenizer locally. This should be done _prior_ to launching the container. @@ -34,7 +34,7 @@ You can then use the following mount to attach the tokenizer to your container: docker run -v ${PWD}/c4_sentencepiece/c4_en_301_5Mexp2_spm.model:/opt/paxml/vocab ... ``` -## Inspecting the source code +## Inspecting the Source Code If you would like to inspect Pax's source code (`paxml/*` and `praxis/*`) to learn more about what is being run, you can do so by inspecting the source within the container. Here are some examples: @@ -44,37 +44,37 @@ cd $(python -c 'import paxml; print(paxml.__path__[0])')/../paxml/contrib/gpu/sc # (Non-interactive): View paxml/contrib/gpu/scripts_gpu/configs.py FILE=paxml/contrib/gpu/scripts_gpu/configs.py -docker run --entrypoint="" --rm $CONTAINER sh -c 'cat $(python -c "import paxml; print(*paxml.__path__)" 2>/dev/null)/../'$FILE +docker run --entrypoint="" --rm sh -c 'cat $(python -c "import paxml; print(*paxml.__path__)" 2>/dev/null)/../'$FILE ``` ## Running a Job -Note that when training with The Pile dataset, you must provide the `TFDS_DATA_DIR` as a command-line argument and a `VOCAB_PATH` (the path to a pretrained sentencepiece model) as an environment variable (see the bash scripts below for examples). +Note that when training with The Pile dataset, you must provide the `TFDS_DATA_DIR` as a command-line argument and a `VOCAB_PATH` (the path to a pretrained SentencePiece model) as an environment variable. See the bash scripts below for examples. ### Quick Runs #### Interactive: Single Node -See [run_pile_singlenode.sh](https://github.com/google/paxml/blob/main/paxml/contrib/gpu/scripts_gpu/run_pile_singlenode.sh) for an example of training a 126m model on a single node using The Pile. 
Once inside of your container, this script can be run interactively using the following command: +See [run_pile_singlenode.sh](https://github.com/google/paxml/blob/main/paxml/contrib/gpu/scripts_gpu/run_pile_singlenode.sh) for an example of training a 126M parameter model on a single node using The Pile. Once inside of your container, this script can be run interactively using the following command: ``` bash paxml/contrib/gpu/scripts_gpu/run_pile_singlenode.sh ``` where `TFDS_DATA_DIR` is the path to The Pile dataset, `VOCAB_PATH` is the path to the pretrained SentencePiece `.model` file, and `LOGDIR` is the relative path of the directory to which to write checkpoints and logging information. `PERCORE_BATCH_SIZE` is the batch size per GPU _prior_ to sharding according to the parallel strategy. See [Customized Runs](#Customized-runs) for more information about this hyperparameter. -For example, to train the 126m model using a percore batch size of 4 on 8 gpus, you can use the following command: +For example, to train the 126M model using a percore batch size of 4 on 8 H100 gpus, you can use the following command: ``` -bash paxml/contrib/gpu/scripts_gpu/run_pile_singlenode.sh /opt/paxml/datasets /opt/paxml/vocab bfloat16 8 4 log_dir +ENABLE_FP8=1 bash paxml/contrib/gpu/scripts_gpu/run_pile_singlenode.sh /opt/paxml/datasets /opt/paxml/vocab bfloat16 8 4 log_dir ``` -See [run_lambada_singlenode.sh](https://github.com/google/paxml/blob/main/paxml/contrib/gpu/scripts_gpu/run_lambada_singlenode.sh) for an example of running zero-shot evaluation on the 126m model using the Lambada dataset. Use the following command to run this script: +See [run_lambada_singlenode.sh](https://github.com/google/paxml/blob/main/paxml/contrib/gpu/scripts_gpu/run_lambada_singlenode.sh) for an example of running zero-shot evaluation on the 126M model using the Lambada dataset. Use the following command to run this script: ``` bash paxml/contrib/gpu/scripts_gpu/run_lambada_singlenode.sh ``` `TFDS_DATA_DIR` should contain the path to the Lambada dataset and `LOGDIR` should match the `LOGDIR` from the pretraining run. #### Multi Node -See [example_slurm_pile.sub](https://github.com/NVIDIA/JAX-Toolbox/blob/main/rosetta/rosetta/projects/pax/scripts/example_slurm_pile.sub) for an example slurm submit file that launches an 8-node run with a 126 million parameter GPT model. +See [example_slurm_pile.sub](https://github.com/NVIDIA/JAX-Toolbox/blob/main/rosetta/rosetta/projects/pax/scripts/example_slurm_pile.sub) for an example slurm submit file that launches an 8-node training run with a 126 million parameter GPT model. To launch `example_slurm_pile.sub`, run the following command: ``` -CONTAINER= BASE_WORKSPACE_DIR= BASE_TFDS_DATA_DIR= BASE_VOCAB_PATH= LOG_DIR_LOCAL= OUTPUT_DIR= PREC=bfloat16 GPUS_PER_NODE=8 PERCORE_BATCH_SIZE=4 sbatch -N 8 -A -p -J paxml/contrib/gpu/scripts_gpu/example_slurm_pile.sub +CONTAINER= BASE_WORKSPACE_DIR= BASE_TFDS_DATA_DIR= BASE_VOCAB_PATH= LOG_DIR_LOCAL= OUTPUT_DIR= PREC=bfloat16 GPUS_PER_NODE=8 PERCORE_BATCH_SIZE=4 ENABLE_FP8= sbatch -N 8 -A -p -J scripts/example_slurm_pile.sub ``` where `BASE_WORKSPACE_DIR`, `BASE_TFDS_DATA_DIR`, and `BASE_VOCAB_PATH` are absolute paths and `LOG_DIR` and `OUTPUT_DIR` are relative to `BASE_WORKSPACE_DIR`. @@ -93,19 +93,26 @@ Paxml uses [Fiddle](https://github.com/google/fiddle/tree/main) for configuring For example, in our `*.sh` scripts, we override the default values of `FPROP_DTYPE`, `ICI_MESH_SHAPE`, and `PERCORE_BATCH_SIZE`. 
We provide a list of some of the frequently overridden hyperparameters, and an explanation of each, below: -- `ICI_MESH_SHAPE`: This refers to the parallelism strategy used on chips connected by a fast network (e.g. NVLink). `ICI_MESH_SHAPE` typically has 3 dimensions, `[data, fsdp, tensor]`, corresponding to data parallelism (DP), fully-sharded data parallelism (FSDP/ZeRO-3), and tensor parallelism (TP), respectively. To use pure data parallelism, you should set `ICI_MESH_SHAPE` to `[NUM_GPUS, 1, 1]`. -- `DCN_MESH_SHAPE`: This refers to the parallelism strategy for machines connected by a datacenter network. This is the generally parallel strategy used _across_ nodes. +- `ICI_MESH_SHAPE`: This refers to the parallelism strategy used on chips connected by a fast network (e.g. NVLink). `ICI_MESH_SHAPE` typically has 3 dimensions, `[data, fsdp, tensor]`, corresponding to data parallelism (DP), fully-sharded data parallelism (FSDP/ZeRO-3), and tensor parallelism (TP), respectively. For example,to use pure data parallelism, you should set `ICI_MESH_SHAPE` to `[NUM_GPUS, 1, 1]`. +- `DCN_MESH_SHAPE`: This refers to the parallelism strategy for machines connected by a datacenter network. In our case, this refers to the parallel strategy used _across_ nodes. It has the same dimensions as `ICI_MESH_SHAPE`. - `PERCORE_BATCH_SIZE`: This is the batch size loaded by each worker _prior_ to sharding the data according to the parallel strategy. We should always have that `GLOBAL_BATCH_SIZE = PERCORE_BATCH_SIZE * NUM_GPUS`, regardless of the parallel strategy. Note that a consequence of this is that `PERCORE_BATCH_SIZE` will not always equal `MICROBATCH_SIZE`, particularly when using tensor parallelism (TP). If using 2-way TP, for example, `MICROBATCH_SIZE` will be twice the `PERCORE_BATCH_SIZE`. If using tensor or pipeline parallelism, `PERCORE_BATCH_SIZE` may be fractional. For example, when using 2-way TP, setting `PERCORE_BATCH_SIZE` to 0.5 will result in a microbatch size of `PERCORE_BATCH_SIZE * TP = 1`. -- `NUM_LAYERS`, `NUM_HEADS`, `MODEL_DIMS`, `HIDDEN_DIMS`: These are hyperparameters of the transformer model. `MODEL_DIMS` refers to the hidden dimension of the transformer, and `HIDDEN_DIMS` refers to the hidden dimension of the transformer feed-forward network. +- `NUM_LAYERS`, `NUM_HEADS`, `MODEL_DIMS`, `HIDDEN_DIMS`: These are hyperparameters of the transformer model. `MODEL_DIMS` refers to the hidden dimension of the transformer and `HIDDEN_DIMS` refers to the hidden dimension of the transformer feed-forward network. We provide three "base" configurations in `paxml/contrib/gpu/scripts_gpu/configs.py`. For more information about these configurations and how to run experiments using them, please refer to the [Configs](#Configs) section below. ### Transformer Engine -Training using Transformer Engine (TE) with bfloat16 precision can be enabled via the environment variable `ENABLE_TE`. To enable TE, simply add the following line to `run_pile_multinode.sh` (or whatever bash script you are using to run experiments): +Training using Transformer Engine (TE) with bfloat16 precision is controlled via the environment variable `ENABLE_TE`. TE is enabled by default in the prebuilt container, but if you would like to disable TE, you can do so by flipping the value of `ENABLE_TE` in the container: ``` -export ENABLE_TE=1 +export ENABLE_TE=0 ``` -Note that packing is currently not supported when using TE. 
All configs disable packing by default, but beware that if packing is manually enabled, training with TE will error. + +FP8 training is controlled via the `ENABLE_FP8` environment variable. To enable FP8 training, set `ENABLE_FP8=1`. For example, the following command trains a 126M model on a single node using FP8: +``` +ENABLE_FP8=1 bash paxml/contrib/gpu/scripts_gpu/run_pile_singlenode.sh /opt/paxml/datasets /opt/paxml/vocab bfloat16 8 4 log_dir +``` + +Note that packing is currently not supported when using TE. All configs disable packing by default, but beware that if packing is manually enabled, training with TE will error. + ## XLA Flags We recommend setting the following XLA flags when running experiments: @@ -116,7 +123,7 @@ We recommend setting the following XLA flags when running experiments: 4. `--xla_gpu_enable_async_reduce_scatter=true`: Allows XLA:GPU to run Reduce Scatter NCCL kernels on a separate CUDA stream to allow overlap with compute kernels 5. `--xla_gpu_enable_async_all_reduce=true`: Allows XLA:GPU to run All Reduce NCCL kernels on a separate CUDA stream to allow overlap with compute kernels. 6. `--xla_gpu_enable_highest_priority_async_stream=true`: Allows XLA to prioritize the launch of NCCL kernels before GeMMs to ensure enough SMs are available for async communication kernels. -7. `--xla_gpu_all_reduce_combine_threshold_bytes=51200`: Combines NCCL All Reduce kernels until threshold size is reached. +7. `--xla_gpu_all_reduce_combine_threshold_bytes=`: Combines NCCL All Reduce kernels until threshold size is reached. For 126M, we recommend setting this value to 33554432. For 5B and 175B, we recommend 51200. 8. `--xla_gpu_enable_triton_gemm=false`: Disallows Triton GeMM kernels; uses CUBLAS GeMM kernels instead. CUBLAS kernels are currently better tuned for GPUs and thus provide better performance 9. `--xla_gpu_cuda_graph_level=0`: Disallows XLA from using CUDA graphs. @@ -124,55 +131,74 @@ These flags are enabled by default in `paxml/contrib/gpu/scripts_gpu/run_pile_mu ``` export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_async_all_gather=true --xla_gpu_enable_async_reduce_scatter=true --xla_gpu_enable_triton_gemm=false" ``` +For the the 126M model, we recommend setting `--xla_gpu_all_reduce_combine_threshold_bytes=33554432`, which is different from the default value in `paxml/contrib/gpu/scripts_gpu/run_pile_multinode.sh`. To overwrite the default XLA flags set in the script, set the `BASE_XLA_FLAGS` environment variable prior to calling `run_pile_multinode` as follows: + +``` +BASE_XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false + --xla_gpu_simplify_all_fp_conversions --xla_gpu_enable_async_all_gather=true + --xla_gpu_enable_async_reduce_scatter=true --xla_gpu_enable_highest_priority_async_stream=true + --xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_all_reduce_combine_threshold_bytes=33554432 + --xla_gpu_graph_level=0 --xla_gpu_enable_async_all_reduce=true" bash run_pile_multinode.sh ... +``` ## Configs -We provide three "base" model configurations in `paxml/contrib/gpu/scripts_gpu/configs.py`. The first is a 126 million parameter GPT model. Convergence using The Pile dataset has been verified with this model. The remaining configs are 5 billion and 175 billion parameter models. Both 5B and 175B are provided for benchmarking purposes and have not been thoroughly tested for convergence to date. 
+We provide three "base" model configurations in `paxml/contrib/gpu/scripts_gpu/configs.py`. The first is a 126 million parameter GPT model. Convergence using The Pile dataset has been verified with this model. The remaining configs are 5 billion and 175 billion parameter models. Both 5B and 175B are provided primarily for benchmarking purposes and been less thoroughly tested for convergence. -The table below describes current performance of the given configs. Experiments were run using NVIDIA DGX A100 (8x A100 80G) nodes. Note that Lambada accuracy reported corresponds to the best accuracy seen across the run. Estimated walltime denotes the aproximate time to train each model to completion (i.e. number of days to reach `MAX_STEPS` number of steps as described in `configs.py`). +The tables below describe current performance of the given configs. Experiments were run using NVIDIA DGX A100 80G and H100 80G nodes. Note that Lambada accuracy reported corresponds to the best accuracy seen across the run. Estimated walltime denotes the aproximate time to train each model to completion (i.e. number of days to reach `MAX_STEPS` number of steps as described in `configs.py`). -| Size | #GPUs | DP | FSDP | TP | BS / GPU | Sequences/Sec (bf16 / TE bf16) | Estimated Walltime (days, bf16 / TE bf16) | Lambada Accuracy | Convergence Log | -| ---- | ----- | -- | ---- | -- | ---------| ---------------| ------------------------- | ---------------- |---------------- | -| 126M | 64 |64 |1 |1 | 4 | 1761.3 / 2339.8 | 1.01 / 0.76 | 0.397 (± 0.012) | [log](https://tensorboard.dev/experiment/RCroDLAUQzGUoudzqD1NmQ/) | -| 5B | 256 | 1 |256 |1 | 8 | 465.45 / 598.83 | 3.82 / 2.97 | N/A | [log](https://tensorboard.dev/experiment/AyXAn8ZDRheUARN1NMJ1sw) | -| 175B | 256 |1 |256 |1 | 6 | 18.29 / 19.62 | 72.92 / 67.97 | N/A | [log](https://tensorboard.dev/experiment/NJnv5LbdQby2PcZGPnTRrA/) | N/A | +### A100 Results -*Note*: Estimated walltime is computed assuming full throughput continuously. In practice, true walltime may be greater due to compilation overheads, interleaved evaluation, and checkpointing. A number of the linked convergence runs were completed using older software; thus, reported throughput does not match current results (notably for 126M and 5B bf16). The most up-to-date throughput numbers are reported in the table. +| Size | GPU | Precision | #GPUs | DP | FSDP | TP | BS / GPU | Sequences/Sec | Est. Walltime (days) | Lambada Accuracy (± standard deviation) | Convergence Log | +| ---- | ----- |----- |----- | -- | ---- | -- | ---------| ---------------| ------------------------- | ---------------- |---------------- | +| 126M | A100 80G SXM | BF16 | 64 |64 |1 |1 | 4 | 1877.20 | 0.95 | 0.397 (± 0.012) | [log](https://tensorboard.dev/experiment/RCroDLAUQzGUoudzqD1NmQ/) | +| 5B | A100 80G SXM | BF16 | 256 | 1 |256 |1 | 8 | 465.45 | 3.82 | N/A | | +| 175B | A100 80G SXM | BF16 | 256 |1 |256 |1 | 6 | 18.29 | 72.92 | N/A | | +| 126M | A100 80G SXM | TE BF16 | 64 |64 |1 |1 | 4 | 2512.2 | 0.71 | N/A | | +| 5B | A100 80G SXM | TE BF16 | 256 | 1 |256 |1 | 8 | 586.82 | 3.02 | N/A | | +| 175B | A100 80G SXM | TE BF16 | 256 |1 |256 |1 | 6 | 19.47 | 68.49 | N/A | | -The runs in 5B convergence log were trained for around 26k (TE) and 45k (no TE) steps at a global batch size of 2048 and a sequence length of 2048, amounting to around 109 billion and 189 billion consumed tokens for TE, non-TE respectively. 
The 175B convergence log was trained for a total of around 700 steps at a global batch size of 1536 and a sequence length of 2048, amounting to around 2.2 billion consumed tokens. Finally, 175B was trained using the [C4 dataset](https://github.com/mlcommons/training/tree/master/large_language_model/paxml#2-dataset), while 126M and 5B were both trained using the Pile. +## H100 results + +| Size | GPU | Precision | #GPUs | DP | FSDP | TP | BS / GPU | Sequences/Sec | Est. Walltime (days) | Lambada Accuracy (± standard deviation) | Convergence Log | +| ---- | ----- |----- |----- | -- | ---- | -- | ---------| ---------------| ------------------------- | ---------------- |---------------- | +| 126M | H100 80G SXM | TE BF16 | 64 |64 |1 |1 | 4 | 4143.21 | 0.43 | 0.425 (± 0.018) | [log](https://tensorboard.dev/experiment/GgDMwODzQjm9kVc9H6259A/) | +| 5B | H100 80G SXM | TE BF16 | 256 | 1 |256 |1 | 8 | 1066.67 | 1.67 | N/A | | +| 175B | H100 80G SXM | TE BF16 | 256 |1 |256 |1 | 6 | 44.01 | 30.35 | N/A | | +| 5B | H100 80G SXM | TE FP8 | 256 | 1 |256 |1 | 8 | 1288.05 | 1.38 | N/A | [log](https://tensorboard.dev/experiment/i5kiGeQpRRapswa68RkYHQ/) | +| 175B | H100 80G SXM | TE FP8 | 256 |1 |256 |1 | 6 | 65.64 | 20.33 | N/A | [log](https://tensorboard.dev/experiment/HvpU324wQYarwgvd9P3Uew/) | + + +*Note*: Estimated walltime is computed assuming full throughput continuously. In practice, true walltime may be greater due to compilation overheads, interleaved evaluation, and checkpointing. A number of the linked convergence runs were completed using older software; thus, throughput reported in the linked logs may not match current results. The most up-to-date throughput numbers are reported in the table. + +5B FP8 was trained for 75,000 steps at a global batch size of 2048 and a sequence length of 2048, amounting to around 300 billion consumed tokens. 175B FP8 was trained for a total of around 1,000 steps at a global batch size of 1536 and a sequence length of 2048, amounting to around 3.14 billion consumed tokens. 175B was trained using the [C4 dataset](https://github.com/mlcommons/training/tree/master/large_language_model/paxml#2-dataset) and restores from an [initial MLPerf checkpoint](https://github.com/mlcommons/training/tree/master/large_language_model/paxml#initial-checkpoint). 126M and 5B were both trained using the Pile. ### Running an Experiment with Base Configs -To run an experiment with any base model configuration with the default parallel strategy reported in the table, copy [run_pile_multinode.sh](https://github.com/google/paxml/blob/main/paxml/contrib/gpu/scripts_gpu/run_pile_multinode.sh) to your workspace and make the following modifications: replace `--fdl_config=paxml.contrib.gpu.scripts_gpu.configs.Pile126M` with the experiment you are interested in running (e.g. `paxml.contrib.gpu.scripts_gpu.configs.GPT5B` or `paxml.contrib.gpu.scripts_gpu.configs.GPT175B`) and remove `--fdl.ICI_MESH_SHAPE="[${TRAIN_GPUS}, 1, 1]"`. The resulting bash script (call it `run_my_model_multinode.sh`) can be passed into `example_slurm_pile.sub` using the following command. This command presumes that `run_my_model_multinode.sh` lives in `BASE_WORKSPACE_DIR`. 
+To run an experiment with any base model configuration with the default parallel strategy reported in the table, copy [run_pile_multinode.sh](https://github.com/google/paxml/blob/main/paxml/contrib/gpu/scripts_gpu/run_pile_multinode.sh) to your workspace and make the following modifications: replace `--fdl_config=paxml.contrib.gpu.scripts_gpu.configs.Pile126M` with the experiment you are interested in running (e.g. `paxml.contrib.gpu.scripts_gpu.configs.GPT5B` or `paxml.contrib.gpu.scripts_gpu.configs.GPT175B`) and remove `--fdl.ICI_MESH_SHAPE="[${NUM_GPUS}, 1, 1]"` and `--fdl.DCN_MESH_SHAPE="[${SLURM_JOB_NUM_NODES}, 1, 1]"`. The resulting bash script (call it `run_my_model_multinode.sh`) can be passed into `example_slurm_pile.sub` using the following command. This command presumes that `run_my_model_multinode.sh` lives in `BASE_WORKSPACE_DIR`. ``` -BASE_SCRIPT=run_my_model_multinode.sh CONTAINER= BASE_WORKSPACE_DIR= BASE_TFDS_DATA_DIR= BASE_VOCAB_PATH= LOG_DIR_LOCAL= OUTPUT_DIR= PREC= GPUS_PER_NODE= PERCORE_BATCH_SIZE= sbatch -N -A -p -J paxml/contrib/gpu/scripts_gpu/example_slurm_pile.sub +BASE_SCRIPT=run_my_model_multinode.sh CONTAINER= BASE_WORKSPACE_DIR= BASE_TFDS_DATA_DIR= BASE_VOCAB_PATH= LOG_DIR_LOCAL= OUTPUT_DIR= PREC= GPUS_PER_NODE= PERCORE_BATCH_SIZE= ENABLE_FP8= sbatch -N -A -p -J scripts/example_slurm_pile.sub +``` +Here, it is assumed that you are running with the number of nodes reported in the table. If using a different node count, scale `DCN_MESH_SHAPE` accordingly. For example, the default value of `DCN_MESH_SHAPE` for `paxml.contrib.gpu.scripts_gpu.configs.GPT5B` is `[1,32,1]`. If running on 16 nodes, adjust `DCN_MESH_SHAPE` as follows: +``` +--fdl.DCN_MESH_SHAPE=[1,16,1] ``` + ## Known Issues -* The Paxml container does not fully support Hopper yet. Future releases will add Hopper support. * Pipeline parallelism is not supported with NVIDIA Transformer Engine enabled in the Paxml container. -* There are known Common Vulnerabilities and Exposures (CVE) that affect the Paxml container related to TensorFlow 2.9.x due to pinning TensorFlow to 2.9.x in Paxml and Lingvo. We will fix these in the next release. The known CVEs are: - * CVE-2023-25668 - * CVE-2023-25658 - * CVE-2023-25663 - * CVE-2023-25664 - * CVE-2023-25664 - * CVE-2023-25672 - * CVE-2023-25674 - * CVE-2023-25660 - * CVE-2023-27579 - * CVE-2023-25671 - * CVE-2023-25659 - * CVE-2023-25662 - * CVE-2023-25675 - * CVE-2023-25801 - * CVE-2023-25670 - * CVE-2023-25669 - * CVE-2023-25665 - * CVE-2023-25673 - * CVE-2023-25666 * The Paxml nightlies disable `NCCL_NVLS_ENABLE=0` ([doc](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-nvls-enable)). Future releases will re-enable this feature. +* The release container has a known XLA bug which affects single-process training in some cases. This bug has been fixed in newer XLA versions. If running into issues with single-process training, try using a Pax nightly container after 10/3. You can also try cherry-picking [this commit](https://github.com/openxla/xla/commit/aa8e7340cb319b9419a097155874bf105da05e1d) in the tested container. +* Infrequent hangs have been observed in multinode settings. Setting `CUDA_MODULE_LOADING=EAGER` helps with these hangs. This environment variable is set by default in `nvcr.io/nvidia/jax:23.10-paxml-py3`, `nvcr.io/nvidia/jax:23.10-paxml-py3-amd64`, and `nvcr.io/nvidia/jax:23.10-paxml-py3-arm64`. +* We currently see unexpected convergence behavior when dropout is used with Transformer Engine. 
Default configs do not enable dropout within transformer layers and thus should be unaffected by this bug, but users may encounter this bug if manually enabling dropout in their models. ## Changelog +### 10/26/2023 +- Enabled BF16 Transformer Engine by default +- Added FP8 Transformer Engine support +- Updated 5B config to disable dropout in transformer layers +- bfloat16 performance + - 126M performance is 6% higher than 8/29, bringing the overall regression with respect to 7/11 to around 10%. We will continue to improve 126M performance in future releases. + ### 8/29/2023 - Added bfloat16 Transformer Engine support - Disabled packing by default in all base configurations for TE compatibility diff --git a/rosetta/rosetta/projects/pax/scripts/example_slurm_pile.sub b/rosetta/rosetta/projects/pax/scripts/example_slurm_pile.sub index 47e122432..1b479cd1e 100644 --- a/rosetta/rosetta/projects/pax/scripts/example_slurm_pile.sub +++ b/rosetta/rosetta/projects/pax/scripts/example_slurm_pile.sub @@ -29,16 +29,18 @@ set -eux # File system and volume glue code #------------------------------------------------------------------------------- # << CHANGE ! >> -CONTAINER="${CONTAINER:-nvcr.io/nvidia/jax:23.08-paxml-py3}" +CONTAINER="${CONTAINER:-nvcr.io/nvidia/jax:23.10-paxml-py3}" # << CHANGE ! >> BASE_WORKSPACE_DIR=${BASE_WORKSPACE_DIR} ## location to write logs and checkpoints to BASE_TFDS_DATA_DIR=${BASE_TFDS_DATA_DIR} BASE_VOCAB_PATH=${BASE_VOCAB_PATH} PAXML_DIR=${PAXML_DIR:-/opt/paxml} +ENABLE_TE=${ENABLE_TE:-1} +ENABLE_FP8=${ENABLE_FP8:-0} # Default env variables for paths required by pax training scripts -WORKSPACE_DIR=/mnt/workspace +WORKSPACE_DIR=/opt/paxml/workspace TFDS_DATA_DIR=/mnt/datasets GPT_VOCAB_PATH=/mnt/vocab @@ -61,7 +63,7 @@ if [[ -z "${BASE_SCRIPT:-}" ]]; then export BASE_SCRIPT="${PAXML_DIR}/paxml/contrib/gpu/scripts_gpu/run_pile_multinode.sh" echo "Using default BASE_SCRIPT=$BASE_SCRIPT" else - export BASE_SCRIPT="/mnt/workspace/${BASE_SCRIPT}" + export BASE_SCRIPT="${WORKSPACE_DIR}/${BASE_SCRIPT}" echo "Using custom BASE_SCRIPT=$BASE_SCRIPT" fi @@ -69,7 +71,7 @@ cmd="$(cat < Date: Thu, 26 Oct 2023 23:46:52 -0700 Subject: [PATCH 07/16] Adds CUDA_MODULE_LOADING=EAGER to core jax container env vars (#329) --- .github/container/Dockerfile.jax | 1 + README.md | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index 9914b28c3..3563d01db 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -52,6 +52,7 @@ ENV XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_a ENV CUDA_DEVICE_MAX_CONNECTIONS=1 ENV NCCL_IB_SL=1 ENV NCCL_NVLS_ENABLE=0 +ENV CUDA_MODULE_LOADING=EAGER COPY --from=jax-builder ${SRC_PATH_JAX}-no-git ${SRC_PATH_JAX} COPY --from=jax-builder ${SRC_PATH_XLA}-no-git ${SRC_PATH_XLA} diff --git a/README.md b/README.md index 8b1b68b98..6e208d35d 100644 --- a/README.md +++ b/README.md @@ -159,6 +159,7 @@ The [JAX image](ghcr.io/nvidia/jax) is embedded with the following flags and env | `CUDA_DEVICE_MAX_CONNECTIONS` | `1` | use a single queue for GPU work to lower latency of stream operations; OK since XLA already orders launches | | `NCCL_IB_SL` | `1` | defines the InfiniBand Service Level ([1](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-ib-sl)) | | `NCCL_NVLS_ENABLE` | `0` | Disables NVLink SHARP ([1](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-nvls-enable)). 
Future releases will re-enable this feature. | +| `CUDA_MODULE_LOADING` | `EAGER` | Disables lazy-loading ([1](https://docs.nvidia.com/cuda/cuda-c-programming-guide/#cuda-environment-variables)) which uses slightly more GPU memory. | ## FAQ (Frequently Asked Questions) From 3ae2d13c1ec2ca2591586ed52967b93d4e6c688c Mon Sep 17 00:00:00 2001 From: ashors1 <71393111+ashors1@users.noreply.github.com> Date: Fri, 27 Oct 2023 13:36:55 -0700 Subject: [PATCH 08/16] Re-enable NVLS in nightly containers (#331) NVLS was disabled due to a known issue in NCCL 2.17 that caused intermittent hangs. The issue has been resolved in NCCL 2.18, so we are safe to re-enable NVLS. --------- Co-authored-by: Terry Kong --- .github/container/Dockerfile.jax | 1 - README.md | 1 - rosetta/rosetta/projects/pax/README.md | 1 - rosetta/rosetta/projects/t5x/README.md | 1 - rosetta/rosetta/projects/vit/README.md | 1 - 5 files changed, 5 deletions(-) diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index 3563d01db..ed55219fe 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -51,7 +51,6 @@ ENV BUILD_DATE=${BUILD_DATE} ENV XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_async_all_gather=true --xla_gpu_enable_async_reduce_scatter=true --xla_gpu_enable_triton_gemm=false" ENV CUDA_DEVICE_MAX_CONNECTIONS=1 ENV NCCL_IB_SL=1 -ENV NCCL_NVLS_ENABLE=0 ENV CUDA_MODULE_LOADING=EAGER COPY --from=jax-builder ${SRC_PATH_JAX}-no-git ${SRC_PATH_JAX} diff --git a/README.md b/README.md index 6e208d35d..8ba466c88 100644 --- a/README.md +++ b/README.md @@ -158,7 +158,6 @@ The [JAX image](ghcr.io/nvidia/jax) is embedded with the following flags and env | -------------------- | ----- | ----------- | | `CUDA_DEVICE_MAX_CONNECTIONS` | `1` | use a single queue for GPU work to lower latency of stream operations; OK since XLA already orders launches | | `NCCL_IB_SL` | `1` | defines the InfiniBand Service Level ([1](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-ib-sl)) | -| `NCCL_NVLS_ENABLE` | `0` | Disables NVLink SHARP ([1](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-nvls-enable)). Future releases will re-enable this feature. | | `CUDA_MODULE_LOADING` | `EAGER` | Disables lazy-loading ([1](https://docs.nvidia.com/cuda/cuda-c-programming-guide/#cuda-environment-variables)) which uses slightly more GPU memory. | ## FAQ (Frequently Asked Questions) diff --git a/rosetta/rosetta/projects/pax/README.md b/rosetta/rosetta/projects/pax/README.md index 2905d74e5..9405abefb 100644 --- a/rosetta/rosetta/projects/pax/README.md +++ b/rosetta/rosetta/projects/pax/README.md @@ -185,7 +185,6 @@ Here, it is assumed that you are running with the number of nodes reported in th ## Known Issues * Pipeline parallelism is not supported with NVIDIA Transformer Engine enabled in the Paxml container. -* The Paxml nightlies disable `NCCL_NVLS_ENABLE=0` ([doc](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-nvls-enable)). Future releases will re-enable this feature. * The release container has a known XLA bug which affects single-process training in some cases. This bug has been fixed in newer XLA versions. If running into issues with single-process training, try using a Pax nightly container after 10/3. You can also try cherry-picking [this commit](https://github.com/openxla/xla/commit/aa8e7340cb319b9419a097155874bf105da05e1d) in the tested container. 
* Infrequent hangs have been observed in multinode settings. Setting `CUDA_MODULE_LOADING=EAGER` helps with these hangs. This environment variable is set by default in `nvcr.io/nvidia/jax:23.10-paxml-py3`, `nvcr.io/nvidia/jax:23.10-paxml-py3-amd64`, and `nvcr.io/nvidia/jax:23.10-paxml-py3-arm64`. * We currently see unexpected convergence behavior when dropout is used with Transformer Engine. Default configs do not enable dropout within transformer layers and thus should be unaffected by this bug, but users may encounter this bug if manually enabling dropout in their models. diff --git a/rosetta/rosetta/projects/t5x/README.md b/rosetta/rosetta/projects/t5x/README.md index aeec2f688..39401f415 100644 --- a/rosetta/rosetta/projects/t5x/README.md +++ b/rosetta/rosetta/projects/t5x/README.md @@ -197,7 +197,6 @@ t5x/contrib/gpu/scripts_gpu/singlenode_ft_frompile.sh \ # Known Issues * There is a known sporadic NCCL crash that happens when using the T5x container at node counts greater than or equal to 32 nodes. We will fix this in the next release. The issue is tracked [here](https://github.com/NVIDIA/JAX-Toolbox/issues/194). -* The T5x nightlies disable `NCCL_NVLS_ENABLE=0` ([doc](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-nvls-enable)). Future releases will re-enable this feature. # Changelog - Added Transformer Engine + FP8 support diff --git a/rosetta/rosetta/projects/vit/README.md b/rosetta/rosetta/projects/vit/README.md index 0c5b22a47..a57896480 100644 --- a/rosetta/rosetta/projects/vit/README.md +++ b/rosetta/rosetta/projects/vit/README.md @@ -157,5 +157,4 @@ Pre-training was performed on 1 node with a global batch size of 4096. Models we ## Known Issues 1. By default, gradient accumulation (GA) sums loss across the microbatches. As a result, loss is scaled up when using gradient accumulation, and training with GA only works when using a scale-invariant optimizer such as Adam or Adafactor. ViT fine-tuning is performed using SGD; thus, GA should not be used when fine-tuning. -2. The nightlies disable `NCCL_NVLS_ENABLE=0` ([doc](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-nvls-enable)). Future releases will re-enable this feature. From e73b63232e91d8b97bc664efa85876f59a15d3f0 Mon Sep 17 00:00:00 2001 From: ashors1 <71393111+ashors1@users.noreply.github.com> Date: Fri, 27 Oct 2023 18:52:15 -0700 Subject: [PATCH 09/16] Update Pax TE patch to point to rebased branch (#348) --- rosetta/patchlist-paxml.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rosetta/patchlist-paxml.txt b/rosetta/patchlist-paxml.txt index 4f1162c76..a7a67f2ab 100644 --- a/rosetta/patchlist-paxml.txt +++ b/rosetta/patchlist-paxml.txt @@ -5,4 +5,4 @@ # - External Pull Requests (These are pull requests with upstream paxml and are of the form "pull/$PULLID/head") # - Note: Only the first column is used as a git-ref, so anything after is a comment -mirror/patch/add_dropout_support_to_te # adds Transformer Engine support (+ dropout support) +pull/46/head # adds Transformer Engine support From c9a8c558a373e2d62ee855a88cc123310cef4f03 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Mon, 30 Oct 2023 15:34:54 -0700 Subject: [PATCH 10/16] Loosens t5x loss tests relative tolerances (#343) Relaxing the relative tolerance on the loss tests since it was leading to too many false positives. For reference, deviation in loss for the t5 model can sometimes be up to 15% at the start of training with real data. 
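For context, these tolerances are consumed by `numpy.testing.assert_allclose`, which accepts a measured loss when `|actual - desired| <= atol + rtol * |desired|`, with the baseline as `desired`. The snippet below is illustrative only (the loss values are made up) and shows a run that the relaxed `rtol=0.10` admits but the original 0.02-0.03 tolerances would reject.

```
import numpy as np
from numpy.testing import assert_allclose

# Hypothetical baseline losses and losses from a new run (values are made up).
baseline_loss = np.array([4.20, 3.85, 3.60])
measured_loss = np.array([4.55, 3.70, 3.62])

# A 10% relative tolerance accepts this run...
assert_allclose(measured_loss, baseline_loss, rtol=0.10)

# ...while the original 3% tolerance would flag it as a regression.
try:
    assert_allclose(measured_loss, baseline_loss, rtol=0.03)
except AssertionError:
    print("rtol=0.03 rejects the same run")
```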
--- .../baselines/test_t5x_mgmn_metrics.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/baselines/test_t5x_mgmn_metrics.py b/.github/workflows/baselines/test_t5x_mgmn_metrics.py index c14fc2332..6205afedc 100644 --- a/.github/workflows/baselines/test_t5x_mgmn_metrics.py +++ b/.github/workflows/baselines/test_t5x_mgmn_metrics.py @@ -7,18 +7,18 @@ from numpy.testing import assert_allclose LOSS_RTOL = { - '1G1N': 0.02, - '1G2N': 0.03, - '1P1G': 0.03, - '1P2G': 0.03, - '1P4G': 0.035, - '1P8G': 0.035, - '2G1N': 0.025, - '2G2N': 0.015, - '4G1N': 0.04, # orig = 0.03 - '4G2N': 0.03, - '8G1N': 0.03, - '8G2N': 0.05 + '1G1N': 0.10, # orig = 0.02 + '1G2N': 0.10, # orig = 0.03 + '1P1G': 0.10, # orig = 0.03 + '1P2G': 0.10, # orig = 0.03 + '1P4G': 0.10, # orig = 0.035 + '1P8G': 0.10, # orig = 0.035 + '2G1N': 0.10, # orig = 0.025 + '2G2N': 0.10, # orig = 0.015 + '4G1N': 0.10, # orig = 0.03 + '4G2N': 0.10, # orig = 0.03 + '8G1N': 0.10, # orig = 0.03 + '8G2N': 0.10, # orig = 0.05 } STEP_TIME_MULT = { "1G1N": 0.95, From b2b6bf38ef0ddc5a38deb1744b4a99d33f3410d6 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 1 Nov 2023 14:36:24 -0700 Subject: [PATCH 11/16] Adds rosetta-t5x TE + no-TE tests that enable the correct configs for testing (#332) - [ ] Add capability to retroactively test with newer test-t5x.sh like in [t5x-wget-test](https://github.com/NVIDIA/JAX-Toolbox/tree/t5x-wget-test) - [ ] Sets `ENABLE_TE=1` in the Dockerfile.t5x which is identical to the logic from before where it was always enabled in rosetta-t5x --- .github/container/test-t5x.sh | 21 +- .github/workflows/_test_t5x_rosetta.yaml | 369 ++++++++++++++++++ .../nightly-rosetta-t5x-build-test.yaml | 6 +- rosetta/Dockerfile.t5x | 1 + 4 files changed, 386 insertions(+), 11 deletions(-) create mode 100644 .github/workflows/_test_t5x_rosetta.yaml diff --git a/.github/container/test-t5x.sh b/.github/container/test-t5x.sh index ffcc85983..ef0f93b00 100755 --- a/.github/container/test-t5x.sh +++ b/.github/container/test-t5x.sh @@ -14,8 +14,9 @@ usage() { echo " OPTIONS DESCRIPTION" echo " -a, --additional-args Additional gin args to pass to t5x/train.py" echo " -b, --batch-size Global batch size (REQUIRED)" - echo " -c --use-contrib-configs If provided uses contrib/gpu configs instead of top-level configs. Notably, gpu configs use adamw instead of adafactor" + echo " -c, --use-contrib-configs If provided uses contrib/gpu configs instead of top-level configs. Notably, gpu configs use adamw instead of adafactor" echo " -d, --dtype Data type, defaults to bfloat16." + echo " --enable-te {0,1} 1 to enable, 0 to disable; defaults to ENABLE_TE in env or 0 if unset" echo " -e, --epochs Number of epochs to run, defaults to 7." echo " --multiprocess Enable the multiprocess GPU mode." echo " -o, --output NAME Name for the output folder, a temporary folder will be created if none specified." @@ -23,7 +24,7 @@ usage() { exit $1 } -args=$(getopt -o a:b:cd:e:o:s:h --long additional-args:,batch-size:,use-contrib-configs,dtype:,epochs:,help,multiprocess,output:,steps-per-epoch: -- "$@") +args=$(getopt -o a:b:cd:e:ho:s: --long additional-args:,batch-size:,use-contrib-configs,dtype:,enable-te:,epochs:,help,multiprocess,output:,steps-per-epoch: -- "$@") if [[ $? 
-ne 0 ]]; then exit 1 fi @@ -38,6 +39,7 @@ EPOCHS=7 MULTIPROCESS=0 OUTPUT=$(mktemp -d) STEPS_PER_EPOCH=100 +ENABLE_TE=${ENABLE_TE:-0} eval set -- "$args" while [ : ]; do @@ -58,10 +60,17 @@ while [ : ]; do DTYPE="$2" shift 2 ;; + --enable-te) + ENABLE_TE="$2" + shift 2 + ;; -e | --epochs) EPOCHS="$2" shift 2 ;; + -h | --help) + usage 1 + ;; --multiprocess) MULTIPROCESS=1 shift 1 @@ -74,9 +83,6 @@ while [ : ]; do STEPS_PER_EPOCH="$2" shift 2 ;; - -h | --help) - usage 1 - ;; --) shift; break @@ -100,6 +106,7 @@ print_var ADDITIONAL_ARGS print_var BATCH_SIZE print_var USE_CONTRIB_CONFIGS print_var DTYPE +print_var ENABLE_TE print_var EPOCHS print_var OUTPUT print_var MULTIPROCESS @@ -176,7 +183,8 @@ EOF ## Launch set -exou pipefail -python -m t5x.train \ + +ENABLE_TE=$ENABLE_TE python -m t5x.train \ --gin_file benchmark.gin \ --gin.MODEL_DIR=\"${OUTPUT}\" \ --gin.network.T5Config.dtype=\"${DTYPE}\" \ @@ -187,5 +195,4 @@ python -m t5x.train \ --gin.CheckpointConfig.save=None \ $ADDITIONAL_ARGS \ $([[ $MULTIPROCESS != 0 ]] && echo --multiprocess_gpu) -set +x echo "Output at ${OUTPUT}" diff --git a/.github/workflows/_test_t5x_rosetta.yaml b/.github/workflows/_test_t5x_rosetta.yaml new file mode 100644 index 000000000..58bb562be --- /dev/null +++ b/.github/workflows/_test_t5x_rosetta.yaml @@ -0,0 +1,369 @@ +name: ~test T5X(Rosetta), MGMN + +on: + workflow_call: + inputs: + T5X_IMAGE: + type: string + description: T5X image from ghcr.io/nvidia/t5x + default: 'ghcr.io/nvidia/t5x:latest' + required: false + ARTIFACT_NAME: + type: string + description: If provided, will prepend a prefix to the artifact name. Helpful if re-running this reusable workflow to prevent clobbering of artifacts + default: "" + required: false + outputs: + TEST_STATUS: + description: 'Summary pass/fail value indicating if results from tests are acceptable' + value: ${{ jobs.publish-test.outputs.STATUS }} + +env: + BATCH_SIZE_PER_GPU: 32 + +jobs: + + single-process-multi-device: + strategy: + matrix: + include: + - TEST_NAME: "1P1G_te-1" + N_GPU: 1 + ADDITIONAL_ARGS: "" + EXTRA_GIN_ARGS: "--gin.train/utils.DatasetConfig.pack=False --gin.train_eval/utils.DatasetConfig.pack=False" + - TEST_NAME: "1P1G_te-0" + N_GPU: 1 + ADDITIONAL_ARGS: "--enable-te 0" + EXTRA_GIN_ARGS: "" + - TEST_NAME: "1P8G_te-1" + N_GPU: 8 + ADDITIONAL_ARGS: "" + EXTRA_GIN_ARGS: "--gin.train/utils.DatasetConfig.pack=False --gin.train_eval/utils.DatasetConfig.pack=False" + fail-fast: false + + runs-on: ubuntu-22.04 + steps: + - name: Print environment variables + run: env + + - name: Setup SSH agent + uses: webfactory/ssh-agent@v0.8.0 + with: + ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + + - name: Setup SSH known hosts + id: ssh-known-hosts + run: | + mkdir -p ~/.ssh + cat >> ~/.ssh/known_hosts << EOF + ${{ vars.SSH_KNOWN_HOSTS }} + EOF + chmod 600 ~/.ssh/known_hosts + echo "FILE=$(realpath ~/.ssh/known_hosts)" >> $GITHUB_OUTPUT + + - name: Labels and metadata + id: meta + shell: bash -x -e {0} + run: | + IMAGE="$(echo ${{inputs.T5X_IMAGE}} | sed 's/\//#/')" + TEST_CASE_NAME=${{ matrix.TEST_NAME }} + JOB_NAME=${{ inputs.ARTIFACT_NAME }}${GITHUB_RUN_ID}-${TEST_CASE_NAME} + LOG_FILE=/nfs/cluster/${JOB_NAME}.log + MODEL_PATH=/nfs/cluster/${JOB_NAME} + BATCH_SIZE=$((${{ env.BATCH_SIZE_PER_GPU }} * ${{ matrix.N_GPU }})) + for var in IMAGE TEST_CASE_NAME JOB_NAME LOG_FILE MODEL_PATH BATCH_SIZE; do + echo "$var=${!var}" >> $GITHUB_OUTPUT + done + + - name: Submit SLURM jobs over SSH + id: submit + shell: bash -O expand_aliases -x -e {0} + run: | + alias 
sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}' + sshx "date && hostname && sinfo" + sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }} + JOB=$(sshx sbatch --parsable << EOF + #!/bin/bash + #SBATCH --job-name=${{ steps.meta.outputs.JOB_NAME }} + #SBATCH --exclusive + #SBATCH --nodes=1 + #SBATCH --tasks=1 + #SBATCH --gpus-per-node=${{ matrix.N_GPU }} + #SBATCH --time=00:30:00 + #SBATCH --output=${{ steps.meta.outputs.LOG_FILE }} + #SBATCH --export="ENROOT_PASSWORD=${{ secrets.GITHUB_TOKEN }}" + time srun \ + --container-image=${{ steps.meta.outputs.IMAGE }} \ + --container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \ + --container-entrypoint \ + bash -c 'wget -P /tmp/ https://raw.githubusercontent.com/NVIDIA/JAX-Toolbox/${{ github.sha }}/.github/container/test-t5x.sh && sleep 10 && bash /tmp/test-t5x.sh \ + --output /output/${{ steps.meta.outputs.TEST_CASE_NAME }} \ + --dtype bfloat16 \ + --batch-size ${{ steps.meta.outputs.BATCH_SIZE }} \ + --epochs 7 \ + --steps-per-epoch 100 \ + --use-contrib-configs \ + ${{ matrix.ADDITIONAL_ARGS }} \ + ${{ matrix.EXTRA_GIN_ARGS != '' && format('--additional-args "{0}"', matrix.EXTRA_GIN_ARGS) || '' }}' + EOF + ) + + set +x + while sshx squeue -j $JOB | grep -q $JOB; do + echo "SLURM Job $JOB is still running." + sleep 15 + done + echo "SLRUM Job $JOB finished." + + # Gather job info + SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1) + SLURM_EXITCODE=$(sshx sacct -j $JOB --format=exitcode --parsable2 --noheader | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g') + echo "SLURM Job state is ${SLURM_STATE}" + echo "SLURM Job exit code is ${SLURM_EXITCODE}" + echo "SLURM_STATE=${SLURM_STATE}" >> "$GITHUB_OUTPUT" + echo "SLURM_EXITCODE=${SLURM_EXITCODE}" >> "$GITHUB_OUTPUT" + + set -x + + - name: Retrieve training logs and upload to TensorBoard server + shell: bash -x -e {0} + run: | + mkdir output/ + rsync -rtz --progress \ + ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \ + output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true + rsync -rtz --progress \ + ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \ + output/ || true + rsync -rtz --progress \ + output/ \ + ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${GITHUB_RUN_ID}/ || true + + - name: Write SLURM job status to file + shell: bash -x -e {0} + run: | + python << EOF + import json + with open("output/${{ steps.meta.outputs.TEST_CASE_NAME }}-status.json", "w") as f: + dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"} + json.dump(dump, f) + EOF + + - name: Upload training logs as artifacts + uses: actions/upload-artifact@v3 + with: + name: ${{ steps.meta.outputs.JOB_NAME }} + path: output/* + + multi-gpu-multi-node: + strategy: + matrix: + include: + - TEST_NAME: "1N1G-te-1" + N_GPU: 1 + N_NODE: 1 + ADDITIONAL_ARGS: "" + EXTRA_GIN_ARGS: "--gin.train/utils.DatasetConfig.pack=False --gin.train_eval/utils.DatasetConfig.pack=False" + - TEST_NAME: "1N8G-te-1" + N_GPU: 8 + N_NODE: 1 + ADDITIONAL_ARGS: "" + EXTRA_GIN_ARGS: "--gin.train/utils.DatasetConfig.pack=False --gin.train_eval/utils.DatasetConfig.pack=False" + - TEST_NAME: "2N8G-te-1" + N_GPU: 8 + N_NODE: 2 + ADDITIONAL_ARGS: "" + EXTRA_GIN_ARGS: "--gin.train/utils.DatasetConfig.pack=False 
--gin.train_eval/utils.DatasetConfig.pack=False" + - TEST_NAME: "2N2G_te-0" + N_GPU: 2 + N_NODE: 2 + ADDITIONAL_ARGS: "--enable-te 0" + EXTRA_GIN_ARGS: "" + fail-fast: false + + runs-on: ubuntu-22.04 + + steps: + - name: Print environment variables + run: env + + - name: Setup SSH agent + uses: webfactory/ssh-agent@v0.8.0 + with: + ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + + - name: Setup SSH known hosts + id: ssh-known-hosts + run: | + mkdir -p ~/.ssh + cat >> ~/.ssh/known_hosts << EOF + ${{ vars.SSH_KNOWN_HOSTS }} + EOF + chmod 600 ~/.ssh/known_hosts + echo "FILE=$(realpath ~/.ssh/known_hosts)" >> $GITHUB_OUTPUT + + - name: Labels and metadata + id: meta + shell: bash -x -e {0} + run: | + IMAGE="$(echo ${{inputs.T5X_IMAGE}} | sed 's/\//#/')" + TEST_CASE_NAME=${{ matrix.TEST_NAME }} + TOTAL_TASKS=$((${{ matrix.N_GPU }} * ${{ matrix.N_NODE }})) + JOB_NAME=${{ inputs.ARTIFACT_NAME }}${GITHUB_RUN_ID}-${TEST_CASE_NAME} + LOG_FILE=/nfs/cluster/${JOB_NAME}.log + MODEL_PATH=/nfs/cluster/${JOB_NAME} + BATCH_SIZE=$((${{ env.BATCH_SIZE_PER_GPU }} * ${{ matrix.N_GPU }} * ${{ matrix.N_NODE }})) + for var in IMAGE TEST_CASE_NAME TOTAL_TASKS JOB_NAME LOG_FILE MODEL_PATH BATCH_SIZE; do + echo "$var=${!var}" >> $GITHUB_OUTPUT + done + + - name: Submit SLURM jobs over SSH + id: submit + shell: bash -O expand_aliases -x -e {0} + run: | + alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}' + sshx "date && hostname && sinfo" + sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }} + JOB=$(sshx sbatch --parsable << EOF + #!/bin/bash + #SBATCH --job-name=${{ steps.meta.outputs.JOB_NAME }} + #SBATCH --exclusive + #SBATCH --nodes=${{ matrix.N_NODE }} + #SBATCH --gpus-per-node=${{ matrix.N_GPU }} + #SBATCH --tasks=${{ steps.meta.outputs.TOTAL_TASKS }} + #SBATCH --tasks-per-node=${{ matrix.N_GPU }} + #SBATCH --time=00:30:00 + #SBATCH --output=${{ steps.meta.outputs.LOG_FILE }} + #SBATCH --export="ENROOT_PASSWORD=${{ secrets.GITHUB_TOKEN }}" + time srun \ + --container-image=${{ steps.meta.outputs.IMAGE }} \ + --container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \ + --container-entrypoint \ + bash -c 'wget -P /tmp/ https://raw.githubusercontent.com/NVIDIA/JAX-Toolbox/${{ github.sha }}/.github/container/test-t5x.sh && sleep 10 && bash /tmp/test-t5x.sh \ + --output /output/${{ steps.meta.outputs.TEST_CASE_NAME }} \ + --dtype bfloat16 \ + --batch-size ${{ steps.meta.outputs.BATCH_SIZE }} \ + --epochs 7 \ + --steps-per-epoch 100 \ + --multiprocess \ + --use-contrib-configs \ + ${{ matrix.ADDITIONAL_ARGS }} \ + ${{ matrix.EXTRA_GIN_ARGS != '' && format('--additional-args "{0}"', matrix.EXTRA_GIN_ARGS) || '' }}' + EOF + ) + + set +x + while sshx squeue -j $JOB | grep -q $JOB; do + echo "SLURM Job $JOB is still running." + sleep 15 + done + echo "SLRUM Job $JOB finished." 
+ + # Gather job info + SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1) + SLURM_EXITCODE=$(sshx sacct -j $JOB --format=exitcode --parsable2 --noheader | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g') + echo "SLURM Job state is ${SLURM_STATE}" + echo "SLURM Job exit code is ${SLURM_EXITCODE}" + echo "SLURM_STATE=${SLURM_STATE}" >> "$GITHUB_OUTPUT" + echo "SLURM_EXITCODE=${SLURM_EXITCODE}" >> "$GITHUB_OUTPUT" + + set -x + + - name: Retrieve training logs and upload to TensorBoard server + shell: bash -x -e {0} + run: | + + mkdir output/ + rsync -rtz --progress \ + ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \ + output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true + rsync -rtz --progress \ + ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \ + output/ || true + rsync -rtz --progress \ + output/ \ + ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${GITHUB_RUN_ID}/ || true + + - name: Write SLURM job status to file + shell: bash -x -e {0} + run: | + python << EOF + import json + with open("output/${{ steps.meta.outputs.TEST_CASE_NAME }}-status.json", "w") as f: + dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"} + json.dump(dump, f) + EOF + + - name: Upload training logs as artifacts + uses: actions/upload-artifact@v3 + with: + name: ${{ steps.meta.outputs.JOB_NAME }} + path: output/* + + publish-test: + needs: [multi-gpu-multi-node, single-process-multi-device] + uses: ./.github/workflows/_publish_badge.yaml + if: success() || failure() + secrets: inherit + with: + ENDPOINT_FILENAME: '${{ inputs.ARTIFACT_NAME }}rosetta-t5x-test-completion-status.json' + PUBLISH: false + SCRIPT: | + EXIT_STATUSES="${{ inputs.ARTIFACT_NAME }}${GITHUB_RUN_ID}-*/*-status.json" + PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l) + FAILED_TESTS=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l) + TOTAL_TESTS=$(ls $EXIT_STATUSES | wc -l) + + cat <>$GITHUB_STEP_SUMMARY + ## T5x MGMN+SPMD Test Status + | Test Case | State | Exit Code | + | --- | --- | --- | + EOF + for i in $EXIT_STATUSES; do + # Files are named ${{ inputs.ARTIFACT_NAME }}-/-status.json + echo "| $(echo $i | cut -d/ -f1 | cut -d- -f2-) | $(jq -r .state $i) | $(jq -r .exitcode $i)" + done | tee -a $GITHUB_STEP_SUMMARY + + echo "Test statuses:" + jq -rc 'input_filename,.' 
$EXIT_STATUSES + + if [[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]] || [[ $PASSED_TESTS -eq $TOTAL_TESTS ]]; then + echo "STATUS=success" >> $GITHUB_OUTPUT + BADGE_COLOR=brightgreen + elif [[ $PASSED_TESTS -eq 0 ]]; then + echo "STATUS=failure" >> $GITHUB_OUTPUT + BADGE_COLOR=red + else + echo "STATUS=failure" >> $GITHUB_OUTPUT + BADGE_COLOR=yellow + fi + echo "LABEL='Completion'" >> $GITHUB_OUTPUT + echo "MESSAGE='${PASSED_TESTS}/${TOTAL_TESTS} passed'" >> $GITHUB_OUTPUT + echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT + + summary: + runs-on: ubuntu-22.04 + + steps: + - name: Generate TensorBoard query URL + run: | + ( + cat << EOF + + ## T5X MGMN training + + [view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars®exInput=${{ inputs.ARTIFACT_NAME }}${GITHUB_RUN_ID}&_smoothingWeight=0&tagFilter=seqs_per) + + EOF + ) | tee $GITHUB_STEP_SUMMARY + + outcome: + needs: publish-test + runs-on: ubuntu-22.04 + if: success() || failure() + steps: + - name: Sets workflow status based on test outputs + run: | + if [[ ${{ needs.publish-test.outputs.STATUS }} != success ]]; then + exit 1 + fi diff --git a/.github/workflows/nightly-rosetta-t5x-build-test.yaml b/.github/workflows/nightly-rosetta-t5x-build-test.yaml index 010dc9e79..1c3aa3a27 100644 --- a/.github/workflows/nightly-rosetta-t5x-build-test.yaml +++ b/.github/workflows/nightly-rosetta-t5x-build-test.yaml @@ -95,12 +95,10 @@ jobs: test-t5x: needs: build - uses: ./.github/workflows/_test_t5x.yaml + uses: ./.github/workflows/_test_t5x_rosetta.yaml if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' with: T5X_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }} - # Disable packing b/c rosetta-t5x images run with TE by default, and TE does not currently support packing - EXTRA_GIN_ARGS: "--gin.train/utils.DatasetConfig.pack=False --gin.train_eval/utils.DatasetConfig.pack=False" secrets: inherit test-vit: @@ -136,7 +134,7 @@ jobs: echo "LABEL='Tests'" >> $GITHUB_OUTPUT if [[ ${{ needs.build.result }} == "success" ]]; then - if [[ $UNIT_STATUS == "success" ]] && [[ $T5X_STATUS == "success" ]] && [[ $VIT_STATUS == "success" ]] then + if [[ $UNIT_STATUS == "success" ]] && [[ $T5X_STATUS == "success" ]] && [[ $VIT_STATUS == "success" ]]; then COLOR=brightgreen MESSAGE="Unit passed / MGMN passed" elif [[ $UNIT_STATUS == "success" ]]; then diff --git a/rosetta/Dockerfile.t5x b/rosetta/Dockerfile.t5x index 79e73b0bd..3878ff5c0 100644 --- a/rosetta/Dockerfile.t5x +++ b/rosetta/Dockerfile.t5x @@ -13,6 +13,7 @@ FROM scratch as flax-mirror-source ADD --keep-git-dir=true https://github.com/google/flax.git#main / FROM ${BASE_IMAGE} AS rosetta +ENV ENABLE_TE=1 ARG GIT_USER_EMAIL ARG GIT_USER_NAME From 4ecb6310370369eb1674648b554d873e3137850d Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 1 Nov 2023 15:09:43 -0700 Subject: [PATCH 12/16] Fix markdown hyperlink for jax package on frontpage readme (#319) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8ba466c88..26adbcda5 100644 --- a/README.md +++ b/README.md @@ -145,7 +145,7 @@ We will update this table as new models become available, so stay tuned. 
## Environment Variables -The [JAX image](ghcr.io/nvidia/jax) is embedded with the following flags and environment variables for performance tuning: +The [JAX image](https://github.com/NVIDIA/JAX-Toolbox/pkgs/container/jax) is embedded with the following flags and environment variables for performance tuning: | XLA Flags | Value | Explanation | | --------- | ----- | ----------- | From 4ad04aca235618393b4ffd1476de4d56d4220d95 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 1 Nov 2023 15:12:42 -0700 Subject: [PATCH 13/16] Adds a --seed option to test-t5x.sh to ensure determinism (#344) To ensure that the tests results for a particular container are reproducible between runs, this change introduces a seed argument that sets the jax seed and dataset seed to 42. It remains configurable, but now there shouldn't be variance given the same container. - Also fixes a typo where --steps-per-epoch wasn't in the usage doc of this script Co-authored-by: NVIDIA Co-authored-by: Yu-Hang "Maxin" Tang --- .github/container/test-t5x.sh | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/.github/container/test-t5x.sh b/.github/container/test-t5x.sh index ef0f93b00..573834b8d 100755 --- a/.github/container/test-t5x.sh +++ b/.github/container/test-t5x.sh @@ -20,11 +20,13 @@ usage() { echo " -e, --epochs Number of epochs to run, defaults to 7." echo " --multiprocess Enable the multiprocess GPU mode." echo " -o, --output NAME Name for the output folder, a temporary folder will be created if none specified." + echo " --seed INT Random seed for deterministim. Defaults to 42." + echo " -s, --steps-per-epoch INT Steps per epoch. Detauls to 100" echo " -h, --help Print usage." exit $1 } -args=$(getopt -o a:b:cd:e:ho:s: --long additional-args:,batch-size:,use-contrib-configs,dtype:,enable-te:,epochs:,help,multiprocess,output:,steps-per-epoch: -- "$@") +args=$(getopt -o a:b:cd:e:ho:s: --long additional-args:,batch-size:,use-contrib-configs,dtype:,enable-te:,epochs:,help,multiprocess,output:,seed:,steps-per-epoch: -- "$@") if [[ $? -ne 0 ]]; then exit 1 fi @@ -38,6 +40,7 @@ DTYPE=bfloat16 EPOCHS=7 MULTIPROCESS=0 OUTPUT=$(mktemp -d) +SEED=42 STEPS_PER_EPOCH=100 ENABLE_TE=${ENABLE_TE:-0} @@ -79,6 +82,10 @@ while [ : ]; do OUTPUT="$2" shift 2 ;; + --seed) + SEED="$2" + shift 2 + ;; -s | --steps-per-epoch) STEPS_PER_EPOCH="$2" shift 2 @@ -193,6 +200,9 @@ ENABLE_TE=$ENABLE_TE python -m t5x.train \ --gin.train.eval_steps=0 \ --gin.train.eval_period=${STEPS_PER_EPOCH} \ --gin.CheckpointConfig.save=None \ + --gin.train/utils.DatasetConfig.seed=${SEED} \ + --gin.train_eval/utils.DatasetConfig.seed=${SEED} \ + --gin.train.random_seed=${SEED} \ $ADDITIONAL_ARGS \ $([[ $MULTIPROCESS != 0 ]] && echo --multiprocess_gpu) echo "Output at ${OUTPUT}" From 96716482970cf2a5570d88df37e03add88cdc45a Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 1 Nov 2023 16:13:52 -0700 Subject: [PATCH 14/16] Dynamic workflow run names (#356) This change introduces the dynamic [run name field](https://github.blog/changelog/2022-09-26-github-actions-dynamic-names-for-workflow-runs/#:~:text=GitHub%20Actions%20customers%20can%20now,visit%20the%20GitHub%20Actions%20community.) `run-name`. It's currently difficult on mobile to find the "workflow_run" that corresponds to a particular date, so hopefully this helps identify which builds were nightly vs which builds were manually triggered. I couldn't find a good way to dynamically look up the `name` field, so for now I copied all of names. 
I also wasn't able to find a "created_at" for the scheduled workflows, so those don't have timestamps for now. __Assumptions__: * "workflow_run" == nightly since "scheduled" events only happen on `main` and `workflow_run` are only run for concrete workflows and not reusable workflows ### TODO - [x] Test the workflow_run codepath - [x] Test the scheduled codepath ![image](https://github.com/NVIDIA/JAX-Toolbox/assets/7576060/4b916452-334a-4a73-9220-9fbadc70462f) --- .github/workflows/cuda-121-jax-pin.yaml | 1 + .github/workflows/cuda-122-jax-pin.yaml | 1 + .github/workflows/nightly-distribution-test.yaml | 1 + .github/workflows/nightly-jax-build.yaml | 1 + .github/workflows/nightly-jax-test-unit.yaml | 1 + .github/workflows/nightly-pax-build.yaml | 1 + .github/workflows/nightly-pax-test-mgmn.yaml | 1 + .github/workflows/nightly-rosetta-pax-build.yaml | 1 + .github/workflows/nightly-rosetta-t5x-build-test.yaml | 1 + .github/workflows/nightly-t5x-build.yaml | 1 + .github/workflows/nightly-t5x-test-mgmn.yaml | 1 + .github/workflows/nightly-te-build.yaml | 1 + .github/workflows/nightly-te-test.yaml | 1 + .github/workflows/pax-cuda-121.yaml | 1 + .github/workflows/weekly-base-build.yaml | 1 + 15 files changed, 15 insertions(+) diff --git a/.github/workflows/cuda-121-jax-pin.yaml b/.github/workflows/cuda-121-jax-pin.yaml index 829d0ae9d..2e7e3c382 100644 --- a/.github/workflows/cuda-121-jax-pin.yaml +++ b/.github/workflows/cuda-121-jax-pin.yaml @@ -1,4 +1,5 @@ name: Nightly Containers on CUDA 12.1 (JAX pinned) +run-name: Nightly Containers on CUDA 12.1 (JAX pinned) (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }}) on: schedule: diff --git a/.github/workflows/cuda-122-jax-pin.yaml b/.github/workflows/cuda-122-jax-pin.yaml index 3ea4f053b..cb12d1037 100644 --- a/.github/workflows/cuda-122-jax-pin.yaml +++ b/.github/workflows/cuda-122-jax-pin.yaml @@ -1,4 +1,5 @@ name: Nightly Containers on CUDA 12.2 (JAX pinned) +run-name: Nightly Containers on CUDA 12.2 (JAX pinned) (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }}) on: schedule: diff --git a/.github/workflows/nightly-distribution-test.yaml b/.github/workflows/nightly-distribution-test.yaml index 16541e1cb..4bbbb393b 100644 --- a/.github/workflows/nightly-distribution-test.yaml +++ b/.github/workflows/nightly-distribution-test.yaml @@ -1,4 +1,5 @@ name: Nightly Distribution test +run-name: Nightly Distribution test (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }}) on: workflow_dispatch: diff --git a/.github/workflows/nightly-jax-build.yaml b/.github/workflows/nightly-jax-build.yaml index fb39e8650..5e513a6f1 100644 --- a/.github/workflows/nightly-jax-build.yaml +++ b/.github/workflows/nightly-jax-build.yaml @@ -1,4 +1,5 @@ name: Nightly JAX build +run-name: Nightly JAX build (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }}) on: schedule: diff --git a/.github/workflows/nightly-jax-test-unit.yaml b/.github/workflows/nightly-jax-test-unit.yaml index c1e48169a..7d70065f7 100644 --- a/.github/workflows/nightly-jax-test-unit.yaml +++ b/.github/workflows/nightly-jax-test-unit.yaml @@ -1,4 +1,5 @@ name: Nightly JAX unit test +run-name: Nightly JAX unit test (${{ github.event_name == 'workflow_run' && format('nightly {0}', 
github.event.workflow_run.created_at) || github.event_name }}) on: workflow_run: diff --git a/.github/workflows/nightly-pax-build.yaml b/.github/workflows/nightly-pax-build.yaml index 98224e98a..64728265a 100644 --- a/.github/workflows/nightly-pax-build.yaml +++ b/.github/workflows/nightly-pax-build.yaml @@ -1,4 +1,5 @@ name: Nightly Pax build +run-name: Nightly Pax build (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }}) on: workflow_run: diff --git a/.github/workflows/nightly-pax-test-mgmn.yaml b/.github/workflows/nightly-pax-test-mgmn.yaml index 5d785163f..db041cd77 100644 --- a/.github/workflows/nightly-pax-test-mgmn.yaml +++ b/.github/workflows/nightly-pax-test-mgmn.yaml @@ -1,4 +1,5 @@ name: Nightly Pax MGMN performance test +run-name: Nightly Pax MGMN performance test (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }}) on: workflow_run: diff --git a/.github/workflows/nightly-rosetta-pax-build.yaml b/.github/workflows/nightly-rosetta-pax-build.yaml index 504c9103a..537d8a78f 100644 --- a/.github/workflows/nightly-rosetta-pax-build.yaml +++ b/.github/workflows/nightly-rosetta-pax-build.yaml @@ -1,4 +1,5 @@ name: Nightly Rosetta Paxml build and test +run-name: Nightly Rosetta Paxml build and test (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }}) on: workflow_run: diff --git a/.github/workflows/nightly-rosetta-t5x-build-test.yaml b/.github/workflows/nightly-rosetta-t5x-build-test.yaml index 1c3aa3a27..09d39867c 100644 --- a/.github/workflows/nightly-rosetta-t5x-build-test.yaml +++ b/.github/workflows/nightly-rosetta-t5x-build-test.yaml @@ -1,4 +1,5 @@ name: Nightly Rosetta T5x build and test +run-name: Nightly Rosetta T5x build and test (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }}) on: workflow_run: diff --git a/.github/workflows/nightly-t5x-build.yaml b/.github/workflows/nightly-t5x-build.yaml index 2a6eda333..089f94069 100644 --- a/.github/workflows/nightly-t5x-build.yaml +++ b/.github/workflows/nightly-t5x-build.yaml @@ -1,4 +1,5 @@ name: Nightly T5X build +run-name: Nightly T5X build (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }}) on: workflow_run: diff --git a/.github/workflows/nightly-t5x-test-mgmn.yaml b/.github/workflows/nightly-t5x-test-mgmn.yaml index d13b68ce3..40fa91819 100644 --- a/.github/workflows/nightly-t5x-test-mgmn.yaml +++ b/.github/workflows/nightly-t5x-test-mgmn.yaml @@ -1,4 +1,5 @@ name: Nightly T5X MGMN performance test +run-name: Nightly T5X MGMN performance test (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }}) on: workflow_run: diff --git a/.github/workflows/nightly-te-build.yaml b/.github/workflows/nightly-te-build.yaml index 3fecd1067..2b9cc3c30 100644 --- a/.github/workflows/nightly-te-build.yaml +++ b/.github/workflows/nightly-te-build.yaml @@ -1,4 +1,5 @@ name: Nightly Transformer Engine build +run-name: Nightly Transformer Engine build (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }}) on: workflow_run: diff --git a/.github/workflows/nightly-te-test.yaml b/.github/workflows/nightly-te-test.yaml index 
c030af044..e4e03881e 100644 --- a/.github/workflows/nightly-te-test.yaml +++ b/.github/workflows/nightly-te-test.yaml @@ -1,4 +1,5 @@ name: Nightly Transformer Engine test +run-name: Nightly Transformer Engine test (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }}) on: workflow_run: diff --git a/.github/workflows/pax-cuda-121.yaml b/.github/workflows/pax-cuda-121.yaml index c468872fe..01330beaa 100644 --- a/.github/workflows/pax-cuda-121.yaml +++ b/.github/workflows/pax-cuda-121.yaml @@ -1,4 +1,5 @@ name: Nightly Containers on CUDA 12.1 +run-name: Nightly Containers on CUDA 12.1 (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }}) on: schedule: diff --git a/.github/workflows/weekly-base-build.yaml b/.github/workflows/weekly-base-build.yaml index fad8d74f4..7211f478e 100644 --- a/.github/workflows/weekly-base-build.yaml +++ b/.github/workflows/weekly-base-build.yaml @@ -1,4 +1,5 @@ name: Weekly base container build +run-name: Weekly base container build (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }}) on: schedule: From bc3e0b5ee41edc6e7c6f12ee4f45cdb61e2cacc2 Mon Sep 17 00:00:00 2001 From: Vladislav Date: Thu, 2 Nov 2023 15:24:49 -0600 Subject: [PATCH 15/16] Fix random failling tests for backend_independent on V100 (#351) Fixes randomly failures in the backend-independent section of JAX unit tests: ``` Cannot find a free accelerator to run the test on, exiting with failure ``` Changes: limit the number of concurrent test jobs even for backend-independent tests, which do create GPU contexts. As a clarification, `--jobs` and `--local_test_jobs` do not make a difference for our particular CI pipeline, since JAX is built in a separate CI job anyway. References (From Reed Wanderman-Milne @ Google): > 1. In particular, you have to set NB_GPUS, JOBS_PER_ACC, and J correctly or you can get that error (I recently got the same error by not setting those correctly) > 2. 
(also I think --jobs should be --local_test_jobs in that code block, no reason to restrict the number of jobs compiling JAX) --- .github/container/test-jax.sh | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/container/test-jax.sh b/.github/container/test-jax.sh index e1a7513b0..80bed4529 100755 --- a/.github/container/test-jax.sh +++ b/.github/container/test-jax.sh @@ -126,24 +126,25 @@ case "${BATTERY}" in large) JOBS_PER_GPU=1 JOBS=$((NGPUS * JOBS_PER_GPU)) - EXTRA_FLAGS="--jobs=${JOBS} --test_env=JAX_TESTS_PER_ACCELERATOR=${JOBS_PER_GPU} --test_env=JAX_EXCLUDE_TEST_TARGETS=PmapTest.testSizeOverflow" + EXTRA_FLAGS="--local_test_jobs=${JOBS} --test_env=JAX_TESTS_PER_ACCELERATOR=${JOBS_PER_GPU} --test_env=JAX_EXCLUDE_TEST_TARGETS=PmapTest.testSizeOverflow" BAZEL_TARGET="${BAZEL_TARGET} //tests:image_test_gpu //tests:scipy_stats_test_gpu" ;; gpu) JOBS_PER_GPU=8 JOBS=$((NGPUS * JOBS_PER_GPU)) - EXTRA_FLAGS="--jobs=${JOBS} --test_env=JAX_TESTS_PER_ACCELERATOR=${JOBS_PER_GPU} --test_env=JAX_EXCLUDE_TEST_TARGETS=PmapTest.testSizeOverflow" + EXTRA_FLAGS="--local_test_jobs=${JOBS} --test_env=JAX_TESTS_PER_ACCELERATOR=${JOBS_PER_GPU} --test_env=JAX_EXCLUDE_TEST_TARGETS=PmapTest.testSizeOverflow" BAZEL_TARGET="${BAZEL_TARGET} //tests:gpu_tests" ;; backend-independent) - JOBS=$NCPUS - EXTRA_FLAGS="--jobs=${JOBS} --test_env=JAX_EXCLUDE_TEST_TARGETS=PmapTest.testSizeOverflow" + JOBS_PER_GPU=4 + JOBS=$(($NGPUS * JOBS_PER_GPU)) + EXTRA_FLAGS="--local_test_jobs=${JOBS} --test_env=JAX_TESTS_PER_ACCELERATOR=${JOBS_PER_GPU} --test_env=JAX_EXCLUDE_TEST_TARGETS=PmapTest.testSizeOverflow" BAZEL_TARGET="${BAZEL_TARGET} //tests:backend_independent_tests" ;; "") JOBS_PER_GPU=4 JOBS=$((NGPUS * JOBS_PER_GPU)) - EXTRA_FLAGS="--jobs=${JOBS} --test_env=JAX_TESTS_PER_ACCELERATOR=${JOBS_PER_GPU}" + EXTRA_FLAGS="--local_test_jobs=${JOBS} --test_env=JAX_TESTS_PER_ACCELERATOR=${JOBS_PER_GPU}" ;; *) echo "Unknown battery ${BATTERY}" From f6aff1635070d94afaa2afd775e4fba34185cdb1 Mon Sep 17 00:00:00 2001 From: ashors1 <71393111+ashors1@users.noreply.github.com> Date: Fri, 3 Nov 2023 10:58:23 -0700 Subject: [PATCH 16/16] Propagate error code in ViT tests (#357) --- rosetta/test-vit.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rosetta/test-vit.sh b/rosetta/test-vit.sh index 87bc210fa..b74d7e347 100755 --- a/rosetta/test-vit.sh +++ b/rosetta/test-vit.sh @@ -136,7 +136,8 @@ with wds.TarWriter(out_tar_path) as dst: EOF -set -x +set -exou pipefail + DATA_PATH="/tmp/dummy_vit_data" python -m generate_dummy_wds --output_tar_path=${DATA_PATH} @@ -151,5 +152,4 @@ python -m t5x.train \ --gin_search_paths=/opt/rosetta \ --gin.CheckpointConfig.save=None \ $([[ $MULTIPROCESS != 0 ]] && echo --multiprocess_gpu) -set +x echo "Output at ${OUTPUT}"
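For context on this last patch: with only `set -x`, the script's exit status is that of its final command, so the trailing `echo` could return 0 even when the `t5x.train` invocation above it failed. `set -exou pipefail` enables `-e` (exit on the first failing command), `-u` (treat unset variables as errors), and `-o pipefail` (a pipeline reports failure if any command in it fails) in addition to the existing `-x` trace, so the non-zero code now propagates to the CI job. A minimal standalone sketch of that behavior, not part of the patch, with `false` standing in for a failing training command:

```
#!/usr/bin/env bash
# Sketch only: demonstrates why `set -e` propagates the failure.
set -exou pipefail

false                          # stands in for a failing `python -m t5x.train ...`
echo "Output at /tmp/output"   # never reached; the script already exited with status 1
```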