diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index 9914b28c3..ed55219fe 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -51,7 +51,7 @@ ENV BUILD_DATE=${BUILD_DATE} ENV XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_async_all_gather=true --xla_gpu_enable_async_reduce_scatter=true --xla_gpu_enable_triton_gemm=false" ENV CUDA_DEVICE_MAX_CONNECTIONS=1 ENV NCCL_IB_SL=1 -ENV NCCL_NVLS_ENABLE=0 +ENV CUDA_MODULE_LOADING=EAGER COPY --from=jax-builder ${SRC_PATH_JAX}-no-git ${SRC_PATH_JAX} COPY --from=jax-builder ${SRC_PATH_XLA}-no-git ${SRC_PATH_XLA} diff --git a/.github/container/test-jax.sh b/.github/container/test-jax.sh index e1a7513b0..80bed4529 100755 --- a/.github/container/test-jax.sh +++ b/.github/container/test-jax.sh @@ -126,24 +126,25 @@ case "${BATTERY}" in large) JOBS_PER_GPU=1 JOBS=$((NGPUS * JOBS_PER_GPU)) - EXTRA_FLAGS="--jobs=${JOBS} --test_env=JAX_TESTS_PER_ACCELERATOR=${JOBS_PER_GPU} --test_env=JAX_EXCLUDE_TEST_TARGETS=PmapTest.testSizeOverflow" + EXTRA_FLAGS="--local_test_jobs=${JOBS} --test_env=JAX_TESTS_PER_ACCELERATOR=${JOBS_PER_GPU} --test_env=JAX_EXCLUDE_TEST_TARGETS=PmapTest.testSizeOverflow" BAZEL_TARGET="${BAZEL_TARGET} //tests:image_test_gpu //tests:scipy_stats_test_gpu" ;; gpu) JOBS_PER_GPU=8 JOBS=$((NGPUS * JOBS_PER_GPU)) - EXTRA_FLAGS="--jobs=${JOBS} --test_env=JAX_TESTS_PER_ACCELERATOR=${JOBS_PER_GPU} --test_env=JAX_EXCLUDE_TEST_TARGETS=PmapTest.testSizeOverflow" + EXTRA_FLAGS="--local_test_jobs=${JOBS} --test_env=JAX_TESTS_PER_ACCELERATOR=${JOBS_PER_GPU} --test_env=JAX_EXCLUDE_TEST_TARGETS=PmapTest.testSizeOverflow" BAZEL_TARGET="${BAZEL_TARGET} //tests:gpu_tests" ;; backend-independent) - JOBS=$NCPUS - EXTRA_FLAGS="--jobs=${JOBS} --test_env=JAX_EXCLUDE_TEST_TARGETS=PmapTest.testSizeOverflow" + JOBS_PER_GPU=4 + JOBS=$(($NGPUS * JOBS_PER_GPU)) + EXTRA_FLAGS="--local_test_jobs=${JOBS} --test_env=JAX_TESTS_PER_ACCELERATOR=${JOBS_PER_GPU} --test_env=JAX_EXCLUDE_TEST_TARGETS=PmapTest.testSizeOverflow" BAZEL_TARGET="${BAZEL_TARGET} //tests:backend_independent_tests" ;; "") JOBS_PER_GPU=4 JOBS=$((NGPUS * JOBS_PER_GPU)) - EXTRA_FLAGS="--jobs=${JOBS} --test_env=JAX_TESTS_PER_ACCELERATOR=${JOBS_PER_GPU}" + EXTRA_FLAGS="--local_test_jobs=${JOBS} --test_env=JAX_TESTS_PER_ACCELERATOR=${JOBS_PER_GPU}" ;; *) echo "Unknown battery ${BATTERY}" diff --git a/.github/container/test-t5x.sh b/.github/container/test-t5x.sh index ffcc85983..573834b8d 100755 --- a/.github/container/test-t5x.sh +++ b/.github/container/test-t5x.sh @@ -14,16 +14,19 @@ usage() { echo " OPTIONS DESCRIPTION" echo " -a, --additional-args Additional gin args to pass to t5x/train.py" echo " -b, --batch-size Global batch size (REQUIRED)" - echo " -c --use-contrib-configs If provided uses contrib/gpu configs instead of top-level configs. Notably, gpu configs use adamw instead of adafactor" + echo " -c, --use-contrib-configs If provided uses contrib/gpu configs instead of top-level configs. Notably, gpu configs use adamw instead of adafactor" echo " -d, --dtype Data type, defaults to bfloat16." + echo " --enable-te {0,1} 1 to enable, 0 to disable; defaults to ENABLE_TE in env or 0 if unset" echo " -e, --epochs Number of epochs to run, defaults to 7." echo " --multiprocess Enable the multiprocess GPU mode." echo " -o, --output NAME Name for the output folder, a temporary folder will be created if none specified." + echo " --seed INT Random seed for deterministim. Defaults to 42." 
+ echo " -s, --steps-per-epoch INT Steps per epoch. Detauls to 100" echo " -h, --help Print usage." exit $1 } -args=$(getopt -o a:b:cd:e:o:s:h --long additional-args:,batch-size:,use-contrib-configs,dtype:,epochs:,help,multiprocess,output:,steps-per-epoch: -- "$@") +args=$(getopt -o a:b:cd:e:ho:s: --long additional-args:,batch-size:,use-contrib-configs,dtype:,enable-te:,epochs:,help,multiprocess,output:,seed:,steps-per-epoch: -- "$@") if [[ $? -ne 0 ]]; then exit 1 fi @@ -37,7 +40,9 @@ DTYPE=bfloat16 EPOCHS=7 MULTIPROCESS=0 OUTPUT=$(mktemp -d) +SEED=42 STEPS_PER_EPOCH=100 +ENABLE_TE=${ENABLE_TE:-0} eval set -- "$args" while [ : ]; do @@ -58,10 +63,17 @@ while [ : ]; do DTYPE="$2" shift 2 ;; + --enable-te) + ENABLE_TE="$2" + shift 2 + ;; -e | --epochs) EPOCHS="$2" shift 2 ;; + -h | --help) + usage 1 + ;; --multiprocess) MULTIPROCESS=1 shift 1 @@ -70,13 +82,14 @@ while [ : ]; do OUTPUT="$2" shift 2 ;; + --seed) + SEED="$2" + shift 2 + ;; -s | --steps-per-epoch) STEPS_PER_EPOCH="$2" shift 2 ;; - -h | --help) - usage 1 - ;; --) shift; break @@ -100,6 +113,7 @@ print_var ADDITIONAL_ARGS print_var BATCH_SIZE print_var USE_CONTRIB_CONFIGS print_var DTYPE +print_var ENABLE_TE print_var EPOCHS print_var OUTPUT print_var MULTIPROCESS @@ -176,7 +190,8 @@ EOF ## Launch set -exou pipefail -python -m t5x.train \ + +ENABLE_TE=$ENABLE_TE python -m t5x.train \ --gin_file benchmark.gin \ --gin.MODEL_DIR=\"${OUTPUT}\" \ --gin.network.T5Config.dtype=\"${DTYPE}\" \ @@ -185,7 +200,9 @@ python -m t5x.train \ --gin.train.eval_steps=0 \ --gin.train.eval_period=${STEPS_PER_EPOCH} \ --gin.CheckpointConfig.save=None \ + --gin.train/utils.DatasetConfig.seed=${SEED} \ + --gin.train_eval/utils.DatasetConfig.seed=${SEED} \ + --gin.train.random_seed=${SEED} \ $ADDITIONAL_ARGS \ $([[ $MULTIPROCESS != 0 ]] && echo --multiprocess_gpu) -set +x echo "Output at ${OUTPUT}" diff --git a/.github/workflows/_publish_container.yaml b/.github/workflows/_publish_container.yaml index 40340cb6f..c2d03c7f7 100644 --- a/.github/workflows/_publish_container.yaml +++ b/.github/workflows/_publish_container.yaml @@ -85,10 +85,6 @@ jobs: docker buildx imagetools create --tag $tag ${{ steps.get-manifests.outputs.manifests }} done - - name: Skopeo Login to GitHub Container Registry - run: | - echo ${{ secrets.GITHUB_TOKEN }} | skopeo login --authfile - ghcr.io - - name: Create single-arch images if: ${{ inputs.EXPOSE_SINGLE_ARCH_IMAGES }} shell: bash -x -e {0} diff --git a/.github/workflows/_retrofit_container.yaml b/.github/workflows/_retrofit_container.yaml new file mode 100644 index 000000000..57301deaf --- /dev/null +++ b/.github/workflows/_retrofit_container.yaml @@ -0,0 +1,98 @@ +name: ~split multi-arch OCI manifests into Docker Image Manifest V2, Schema 2 + +on: + workflow_call: + inputs: + SOURCE_IMAGE: + type: string + description: 'Source docker image:' + required: true + TARGET_TAGS: + type: string + description: 'Target docker tags in docker/metadata-action format:' + required: true + EXPOSE_SINGLE_ARCH_IMAGES: + type: boolean + description: 'Also expose single-arch images:' + required: false + default: true + outputs: + # MULTIARCH_TAG: + # description: "Tags of the multi-arch image published" + # value: ${{ jobs.publish.outputs.MULTIARCH_TAG }} + SINGLEARCH_TAGS: + description: "Tags of the single-arch images published" + value: ${{ jobs.publish.outputs.SINGLEARCH_TAGS }} + +env: + DOCKER_REPOSITORY: 'ghcr.io/nvidia/jax-toolbox-retrofit' + +jobs: + publish: + runs-on: ubuntu-22.04 + outputs: + # MULTIARCH_TAG: ${{ 
steps.meta.outputs.tags }} + SINGLEARCH_TAGS: ${{ steps.single-arch.outputs.tags }} + steps: + - name: Login to GitHub Container Registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Set docker metadata + id: meta + uses: docker/metadata-action@v4 + with: + images: ${{ env.DOCKER_REPOSITORY }} + flavor: latest=false + tags: ${{ inputs.TARGET_TAGS }} + + - name: Extract manifests from the source manifest list + id: get-manifests + shell: bash -x -e {0} + run: | + SOURCE_REPO=$(echo ${{ inputs.SOURCE_IMAGE }} | cut -d: -f1) + MEDIA_TYPE=$(docker manifest inspect ${{ inputs.SOURCE_IMAGE }} | jq -r '.mediaType') + if [[ ${MEDIA_TYPE} != "application/vnd.oci.image.index.v1+json" ]]; then + echo "This workflow only work with OCI manifest lists" + exit 1 + fi + + MANIFESTS=$( + docker manifest inspect ${{ inputs.SOURCE_IMAGE }} |\ + jq -r '.manifests[] | select(.platform.os != "unknown") | .digest' |\ + xargs -I{} echo ${SOURCE_REPO}@{} |\ + tr '\n' ' ' + ) + + echo "manifests=$MANIFESTS" >> $GITHUB_OUTPUT + + ## Requires skopeo >= v1.6.0, but Actions only has v1.4.0 + # - name: Create Docker v2s2 multi-arch manifest list + # id: multi-arch + # shell: bash -x -e {0} + # run: | + # for tag in $(echo "${{ steps.meta.outputs.tags }}"); do + # skopeo copy --multi-arch all --format v2s2 docker://${{ inputs.SOURCE_IMAGE }} docker://$tag + # done + + - name: Create Docker v2s2 single-arch manifests + id: single-arch + if: ${{ inputs.EXPOSE_SINGLE_ARCH_IMAGES }} + shell: bash -x -e {0} + run: | + output_tags="" + # Create new manifest list from extracted manifests + for manifest in ${{ steps.get-manifests.outputs.manifests }}; do + os=$(docker manifest inspect -v $manifest | jq -r '.Descriptor.platform.os') + arch=$(docker manifest inspect -v $manifest | jq -r '.Descriptor.platform.architecture') + for tag in $(echo "${{ steps.meta.outputs.tags }}"); do + single_arch_tag="${tag}-${os}-${arch}" + skopeo copy --format v2s2 docker://$manifest docker://${single_arch_tag} + output_tags="${output_tags} ${single_arch_tag}" + done + done + + echo "tags=${output_tags}" >> $GITHUB_OUTPUT diff --git a/.github/workflows/_test_t5x_rosetta.yaml b/.github/workflows/_test_t5x_rosetta.yaml new file mode 100644 index 000000000..58bb562be --- /dev/null +++ b/.github/workflows/_test_t5x_rosetta.yaml @@ -0,0 +1,369 @@ +name: ~test T5X(Rosetta), MGMN + +on: + workflow_call: + inputs: + T5X_IMAGE: + type: string + description: T5X image from ghcr.io/nvidia/t5x + default: 'ghcr.io/nvidia/t5x:latest' + required: false + ARTIFACT_NAME: + type: string + description: If provided, will prepend a prefix to the artifact name. 
Helpful if re-running this reusable workflow to prevent clobbering of artifacts + default: "" + required: false + outputs: + TEST_STATUS: + description: 'Summary pass/fail value indicating if results from tests are acceptable' + value: ${{ jobs.publish-test.outputs.STATUS }} + +env: + BATCH_SIZE_PER_GPU: 32 + +jobs: + + single-process-multi-device: + strategy: + matrix: + include: + - TEST_NAME: "1P1G_te-1" + N_GPU: 1 + ADDITIONAL_ARGS: "" + EXTRA_GIN_ARGS: "--gin.train/utils.DatasetConfig.pack=False --gin.train_eval/utils.DatasetConfig.pack=False" + - TEST_NAME: "1P1G_te-0" + N_GPU: 1 + ADDITIONAL_ARGS: "--enable-te 0" + EXTRA_GIN_ARGS: "" + - TEST_NAME: "1P8G_te-1" + N_GPU: 8 + ADDITIONAL_ARGS: "" + EXTRA_GIN_ARGS: "--gin.train/utils.DatasetConfig.pack=False --gin.train_eval/utils.DatasetConfig.pack=False" + fail-fast: false + + runs-on: ubuntu-22.04 + steps: + - name: Print environment variables + run: env + + - name: Setup SSH agent + uses: webfactory/ssh-agent@v0.8.0 + with: + ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + + - name: Setup SSH known hosts + id: ssh-known-hosts + run: | + mkdir -p ~/.ssh + cat >> ~/.ssh/known_hosts << EOF + ${{ vars.SSH_KNOWN_HOSTS }} + EOF + chmod 600 ~/.ssh/known_hosts + echo "FILE=$(realpath ~/.ssh/known_hosts)" >> $GITHUB_OUTPUT + + - name: Labels and metadata + id: meta + shell: bash -x -e {0} + run: | + IMAGE="$(echo ${{inputs.T5X_IMAGE}} | sed 's/\//#/')" + TEST_CASE_NAME=${{ matrix.TEST_NAME }} + JOB_NAME=${{ inputs.ARTIFACT_NAME }}${GITHUB_RUN_ID}-${TEST_CASE_NAME} + LOG_FILE=/nfs/cluster/${JOB_NAME}.log + MODEL_PATH=/nfs/cluster/${JOB_NAME} + BATCH_SIZE=$((${{ env.BATCH_SIZE_PER_GPU }} * ${{ matrix.N_GPU }})) + for var in IMAGE TEST_CASE_NAME JOB_NAME LOG_FILE MODEL_PATH BATCH_SIZE; do + echo "$var=${!var}" >> $GITHUB_OUTPUT + done + + - name: Submit SLURM jobs over SSH + id: submit + shell: bash -O expand_aliases -x -e {0} + run: | + alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}' + sshx "date && hostname && sinfo" + sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }} + JOB=$(sshx sbatch --parsable << EOF + #!/bin/bash + #SBATCH --job-name=${{ steps.meta.outputs.JOB_NAME }} + #SBATCH --exclusive + #SBATCH --nodes=1 + #SBATCH --tasks=1 + #SBATCH --gpus-per-node=${{ matrix.N_GPU }} + #SBATCH --time=00:30:00 + #SBATCH --output=${{ steps.meta.outputs.LOG_FILE }} + #SBATCH --export="ENROOT_PASSWORD=${{ secrets.GITHUB_TOKEN }}" + time srun \ + --container-image=${{ steps.meta.outputs.IMAGE }} \ + --container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \ + --container-entrypoint \ + bash -c 'wget -P /tmp/ https://raw.githubusercontent.com/NVIDIA/JAX-Toolbox/${{ github.sha }}/.github/container/test-t5x.sh && sleep 10 && bash /tmp/test-t5x.sh \ + --output /output/${{ steps.meta.outputs.TEST_CASE_NAME }} \ + --dtype bfloat16 \ + --batch-size ${{ steps.meta.outputs.BATCH_SIZE }} \ + --epochs 7 \ + --steps-per-epoch 100 \ + --use-contrib-configs \ + ${{ matrix.ADDITIONAL_ARGS }} \ + ${{ matrix.EXTRA_GIN_ARGS != '' && format('--additional-args "{0}"', matrix.EXTRA_GIN_ARGS) || '' }}' + EOF + ) + + set +x + while sshx squeue -j $JOB | grep -q $JOB; do + echo "SLURM Job $JOB is still running." + sleep 15 + done + echo "SLRUM Job $JOB finished." 
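+          # sacct prints one row per job step; keep the first State row, and reverse-sort the exit codes so that a failing step's non-zero code is reported instead of 0.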
+ + # Gather job info + SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1) + SLURM_EXITCODE=$(sshx sacct -j $JOB --format=exitcode --parsable2 --noheader | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g') + echo "SLURM Job state is ${SLURM_STATE}" + echo "SLURM Job exit code is ${SLURM_EXITCODE}" + echo "SLURM_STATE=${SLURM_STATE}" >> "$GITHUB_OUTPUT" + echo "SLURM_EXITCODE=${SLURM_EXITCODE}" >> "$GITHUB_OUTPUT" + + set -x + + - name: Retrieve training logs and upload to TensorBoard server + shell: bash -x -e {0} + run: | + mkdir output/ + rsync -rtz --progress \ + ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \ + output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true + rsync -rtz --progress \ + ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \ + output/ || true + rsync -rtz --progress \ + output/ \ + ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${GITHUB_RUN_ID}/ || true + + - name: Write SLURM job status to file + shell: bash -x -e {0} + run: | + python << EOF + import json + with open("output/${{ steps.meta.outputs.TEST_CASE_NAME }}-status.json", "w") as f: + dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"} + json.dump(dump, f) + EOF + + - name: Upload training logs as artifacts + uses: actions/upload-artifact@v3 + with: + name: ${{ steps.meta.outputs.JOB_NAME }} + path: output/* + + multi-gpu-multi-node: + strategy: + matrix: + include: + - TEST_NAME: "1N1G-te-1" + N_GPU: 1 + N_NODE: 1 + ADDITIONAL_ARGS: "" + EXTRA_GIN_ARGS: "--gin.train/utils.DatasetConfig.pack=False --gin.train_eval/utils.DatasetConfig.pack=False" + - TEST_NAME: "1N8G-te-1" + N_GPU: 8 + N_NODE: 1 + ADDITIONAL_ARGS: "" + EXTRA_GIN_ARGS: "--gin.train/utils.DatasetConfig.pack=False --gin.train_eval/utils.DatasetConfig.pack=False" + - TEST_NAME: "2N8G-te-1" + N_GPU: 8 + N_NODE: 2 + ADDITIONAL_ARGS: "" + EXTRA_GIN_ARGS: "--gin.train/utils.DatasetConfig.pack=False --gin.train_eval/utils.DatasetConfig.pack=False" + - TEST_NAME: "2N2G_te-0" + N_GPU: 2 + N_NODE: 2 + ADDITIONAL_ARGS: "--enable-te 0" + EXTRA_GIN_ARGS: "" + fail-fast: false + + runs-on: ubuntu-22.04 + + steps: + - name: Print environment variables + run: env + + - name: Setup SSH agent + uses: webfactory/ssh-agent@v0.8.0 + with: + ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + + - name: Setup SSH known hosts + id: ssh-known-hosts + run: | + mkdir -p ~/.ssh + cat >> ~/.ssh/known_hosts << EOF + ${{ vars.SSH_KNOWN_HOSTS }} + EOF + chmod 600 ~/.ssh/known_hosts + echo "FILE=$(realpath ~/.ssh/known_hosts)" >> $GITHUB_OUTPUT + + - name: Labels and metadata + id: meta + shell: bash -x -e {0} + run: | + IMAGE="$(echo ${{inputs.T5X_IMAGE}} | sed 's/\//#/')" + TEST_CASE_NAME=${{ matrix.TEST_NAME }} + TOTAL_TASKS=$((${{ matrix.N_GPU }} * ${{ matrix.N_NODE }})) + JOB_NAME=${{ inputs.ARTIFACT_NAME }}${GITHUB_RUN_ID}-${TEST_CASE_NAME} + LOG_FILE=/nfs/cluster/${JOB_NAME}.log + MODEL_PATH=/nfs/cluster/${JOB_NAME} + BATCH_SIZE=$((${{ env.BATCH_SIZE_PER_GPU }} * ${{ matrix.N_GPU }} * ${{ matrix.N_NODE }})) + for var in IMAGE TEST_CASE_NAME TOTAL_TASKS JOB_NAME LOG_FILE MODEL_PATH BATCH_SIZE; do + echo "$var=${!var}" >> $GITHUB_OUTPUT + done + + - name: Submit SLURM jobs over SSH + id: submit + shell: bash -O expand_aliases -x -e {0} + run: | + alias sshx='ssh -o "ServerAliveInterval 7" ${{ 
secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}' + sshx "date && hostname && sinfo" + sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }} + JOB=$(sshx sbatch --parsable << EOF + #!/bin/bash + #SBATCH --job-name=${{ steps.meta.outputs.JOB_NAME }} + #SBATCH --exclusive + #SBATCH --nodes=${{ matrix.N_NODE }} + #SBATCH --gpus-per-node=${{ matrix.N_GPU }} + #SBATCH --tasks=${{ steps.meta.outputs.TOTAL_TASKS }} + #SBATCH --tasks-per-node=${{ matrix.N_GPU }} + #SBATCH --time=00:30:00 + #SBATCH --output=${{ steps.meta.outputs.LOG_FILE }} + #SBATCH --export="ENROOT_PASSWORD=${{ secrets.GITHUB_TOKEN }}" + time srun \ + --container-image=${{ steps.meta.outputs.IMAGE }} \ + --container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \ + --container-entrypoint \ + bash -c 'wget -P /tmp/ https://raw.githubusercontent.com/NVIDIA/JAX-Toolbox/${{ github.sha }}/.github/container/test-t5x.sh && sleep 10 && bash /tmp/test-t5x.sh \ + --output /output/${{ steps.meta.outputs.TEST_CASE_NAME }} \ + --dtype bfloat16 \ + --batch-size ${{ steps.meta.outputs.BATCH_SIZE }} \ + --epochs 7 \ + --steps-per-epoch 100 \ + --multiprocess \ + --use-contrib-configs \ + ${{ matrix.ADDITIONAL_ARGS }} \ + ${{ matrix.EXTRA_GIN_ARGS != '' && format('--additional-args "{0}"', matrix.EXTRA_GIN_ARGS) || '' }}' + EOF + ) + + set +x + while sshx squeue -j $JOB | grep -q $JOB; do + echo "SLURM Job $JOB is still running." + sleep 15 + done + echo "SLRUM Job $JOB finished." + + # Gather job info + SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1) + SLURM_EXITCODE=$(sshx sacct -j $JOB --format=exitcode --parsable2 --noheader | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g') + echo "SLURM Job state is ${SLURM_STATE}" + echo "SLURM Job exit code is ${SLURM_EXITCODE}" + echo "SLURM_STATE=${SLURM_STATE}" >> "$GITHUB_OUTPUT" + echo "SLURM_EXITCODE=${SLURM_EXITCODE}" >> "$GITHUB_OUTPUT" + + set -x + + - name: Retrieve training logs and upload to TensorBoard server + shell: bash -x -e {0} + run: | + + mkdir output/ + rsync -rtz --progress \ + ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \ + output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true + rsync -rtz --progress \ + ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \ + output/ || true + rsync -rtz --progress \ + output/ \ + ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${GITHUB_RUN_ID}/ || true + + - name: Write SLURM job status to file + shell: bash -x -e {0} + run: | + python << EOF + import json + with open("output/${{ steps.meta.outputs.TEST_CASE_NAME }}-status.json", "w") as f: + dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"} + json.dump(dump, f) + EOF + + - name: Upload training logs as artifacts + uses: actions/upload-artifact@v3 + with: + name: ${{ steps.meta.outputs.JOB_NAME }} + path: output/* + + publish-test: + needs: [multi-gpu-multi-node, single-process-multi-device] + uses: ./.github/workflows/_publish_badge.yaml + if: success() || failure() + secrets: inherit + with: + ENDPOINT_FILENAME: '${{ inputs.ARTIFACT_NAME }}rosetta-t5x-test-completion-status.json' + PUBLISH: false + SCRIPT: | + EXIT_STATUSES="${{ inputs.ARTIFACT_NAME }}${GITHUB_RUN_ID}-*/*-status.json" + PASSED_TESTS=$(jq -r '. 
| select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l) + FAILED_TESTS=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l) + TOTAL_TESTS=$(ls $EXIT_STATUSES | wc -l) + + cat <>$GITHUB_STEP_SUMMARY + ## T5x MGMN+SPMD Test Status + | Test Case | State | Exit Code | + | --- | --- | --- | + EOF + for i in $EXIT_STATUSES; do + # Files are named ${{ inputs.ARTIFACT_NAME }}-/-status.json + echo "| $(echo $i | cut -d/ -f1 | cut -d- -f2-) | $(jq -r .state $i) | $(jq -r .exitcode $i)" + done | tee -a $GITHUB_STEP_SUMMARY + + echo "Test statuses:" + jq -rc 'input_filename,.' $EXIT_STATUSES + + if [[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]] || [[ $PASSED_TESTS -eq $TOTAL_TESTS ]]; then + echo "STATUS=success" >> $GITHUB_OUTPUT + BADGE_COLOR=brightgreen + elif [[ $PASSED_TESTS -eq 0 ]]; then + echo "STATUS=failure" >> $GITHUB_OUTPUT + BADGE_COLOR=red + else + echo "STATUS=failure" >> $GITHUB_OUTPUT + BADGE_COLOR=yellow + fi + echo "LABEL='Completion'" >> $GITHUB_OUTPUT + echo "MESSAGE='${PASSED_TESTS}/${TOTAL_TESTS} passed'" >> $GITHUB_OUTPUT + echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT + + summary: + runs-on: ubuntu-22.04 + + steps: + - name: Generate TensorBoard query URL + run: | + ( + cat << EOF + + ## T5X MGMN training + + [view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars®exInput=${{ inputs.ARTIFACT_NAME }}${GITHUB_RUN_ID}&_smoothingWeight=0&tagFilter=seqs_per) + + EOF + ) | tee $GITHUB_STEP_SUMMARY + + outcome: + needs: publish-test + runs-on: ubuntu-22.04 + if: success() || failure() + steps: + - name: Sets workflow status based on test outputs + run: | + if [[ ${{ needs.publish-test.outputs.STATUS }} != success ]]; then + exit 1 + fi diff --git a/.github/workflows/baselines/test_t5x_mgmn_metrics.py b/.github/workflows/baselines/test_t5x_mgmn_metrics.py index c14fc2332..6205afedc 100644 --- a/.github/workflows/baselines/test_t5x_mgmn_metrics.py +++ b/.github/workflows/baselines/test_t5x_mgmn_metrics.py @@ -7,18 +7,18 @@ from numpy.testing import assert_allclose LOSS_RTOL = { - '1G1N': 0.02, - '1G2N': 0.03, - '1P1G': 0.03, - '1P2G': 0.03, - '1P4G': 0.035, - '1P8G': 0.035, - '2G1N': 0.025, - '2G2N': 0.015, - '4G1N': 0.04, # orig = 0.03 - '4G2N': 0.03, - '8G1N': 0.03, - '8G2N': 0.05 + '1G1N': 0.10, # orig = 0.02 + '1G2N': 0.10, # orig = 0.03 + '1P1G': 0.10, # orig = 0.03 + '1P2G': 0.10, # orig = 0.03 + '1P4G': 0.10, # orig = 0.035 + '1P8G': 0.10, # orig = 0.035 + '2G1N': 0.10, # orig = 0.025 + '2G2N': 0.10, # orig = 0.015 + '4G1N': 0.10, # orig = 0.03 + '4G2N': 0.10, # orig = 0.03 + '8G1N': 0.10, # orig = 0.03 + '8G2N': 0.10, # orig = 0.05 } STEP_TIME_MULT = { "1G1N": 0.95, diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index a75dfac8b..ef0a54432 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -158,6 +158,7 @@ jobs: BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} BASE_IMAGE: ${{ needs.build-t5x.outputs.DOCKER_TAGS }} BASE_LIBRARY: t5x + PLATFORMS: '["amd64"]' secrets: inherit build-rosetta-pax: @@ -192,6 +193,74 @@ jobs: | ROSETTA(pax) | ${{ needs.build-rosetta-pax.outputs.DOCKER_TAGS }} | EOF + retrofit-containers: + needs: [build-base, build-jax, build-te, build-t5x, build-pax, build-rosetta-t5x, build-rosetta-pax] + if: always() + runs-on: ubuntu-22.04 + env: + DOCKER_REPO: 'ghcr.io/nvidia/jax-toolbox-retrofit' + steps: + - name: Login to GitHub Container Registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + 
username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + ## Requires skopeo >= v1.6.0, but Actions only has v1.4.0 + # - name: Create Docker v2s2 multi-arch manifest list + # id: multi-arch + # shell: bash -x -e {0} + # run: | + # for tag in $(echo "${{ steps.meta.outputs.tags }}"); do + # skopeo copy --multi-arch all --format v2s2 docker://${{ inputs.SOURCE_IMAGE }} docker://$tag + # done + + - name: Create Docker v2s2 single-arch manifests + id: single-arch + shell: bash -x -e {0} + run: | + + for source in \ + ${{ needs.build-base.outputs.DOCKER_TAGS }} \ + ${{ needs.build-jax.outputs.DOCKER_TAGS }} \ + ${{ needs.build-te.outputs.DOCKER_TAGS }} \ + ${{ needs.build-t5x.outputs.DOCKER_TAGS }} \ + ${{ needs.build-pax.outputs.DOCKER_TAGS }} \ + ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS }} \ + ${{ needs.build-rosetta-pax.outputs.DOCKER_TAGS }} \ + ; do + source_repo=$(echo ${source} | cut -d: -f1) + media_type=$(docker manifest inspect ${source} | jq -r '.mediaType') + if [[ ${media_type} != "application/vnd.oci.image.index.v1+json" ]]; then + echo "Image ${source} is already in Docker format v2s2" + dest=${DOCKER_REPO}:$(echo ${source} | cut -d: -f2) + skopeo copy --format v2s2 docker://${source} docker://${dest} + echo "${dest}" >> $GITHUB_STEP_SUMMARY + else + manifests=$( + docker manifest inspect ${source} |\ + jq -r '.manifests[] | select(.platform.os != "unknown") | .digest' |\ + xargs -I{} echo ${source_repo}@{} |\ + tr '\n' ' ' + ) + + ## registry/org/repo:tag -> repo-tag + # dest_tag=$(echo ${source} | cut -d: -f1 | cut -d/ -f3)-$(echo ${source} | cut -d: -f2) + ## registry/org/repo:tag -> tag + dest_tag=$(echo ${source} | cut -d: -f2) + + for manifest in ${manifests}; do + os=$(docker manifest inspect -v $manifest | jq -r '.Descriptor.platform.os') + arch=$(docker manifest inspect -v $manifest | jq -r '.Descriptor.platform.architecture') + # single_arch_tag="ghcr.io/nvidia/jax-toolbox-retrofit:${{ github.run_id }}-${dest_tag}-${os}-${arch}" + single_arch_tag="${DOCKER_REPO}:${dest_tag}-${os}-${arch}" + skopeo copy --format v2s2 docker://$manifest docker://${single_arch_tag} + echo "${single_arch_tag}" >> $GITHUB_STEP_SUMMARY + done + fi + done + test-distribution: needs: metadata uses: ./.github/workflows/_test_distribution.yaml diff --git a/.github/workflows/cuda-121-jax-pin.yaml b/.github/workflows/cuda-121-jax-pin.yaml index 829d0ae9d..2e7e3c382 100644 --- a/.github/workflows/cuda-121-jax-pin.yaml +++ b/.github/workflows/cuda-121-jax-pin.yaml @@ -1,4 +1,5 @@ name: Nightly Containers on CUDA 12.1 (JAX pinned) +run-name: Nightly Containers on CUDA 12.1 (JAX pinned) (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }}) on: schedule: diff --git a/.github/workflows/cuda-122-jax-pin.yaml b/.github/workflows/cuda-122-jax-pin.yaml index 3ea4f053b..cb12d1037 100644 --- a/.github/workflows/cuda-122-jax-pin.yaml +++ b/.github/workflows/cuda-122-jax-pin.yaml @@ -1,4 +1,5 @@ name: Nightly Containers on CUDA 12.2 (JAX pinned) +run-name: Nightly Containers on CUDA 12.2 (JAX pinned) (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }}) on: schedule: diff --git a/.github/workflows/nightly-distribution-test.yaml b/.github/workflows/nightly-distribution-test.yaml index 16541e1cb..4bbbb393b 100644 --- a/.github/workflows/nightly-distribution-test.yaml +++ b/.github/workflows/nightly-distribution-test.yaml @@ -1,4 
+1,5 @@ name: Nightly Distribution test +run-name: Nightly Distribution test (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }}) on: workflow_dispatch: diff --git a/.github/workflows/nightly-jax-build.yaml b/.github/workflows/nightly-jax-build.yaml index fb39e8650..5e513a6f1 100644 --- a/.github/workflows/nightly-jax-build.yaml +++ b/.github/workflows/nightly-jax-build.yaml @@ -1,4 +1,5 @@ name: Nightly JAX build +run-name: Nightly JAX build (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }}) on: schedule: diff --git a/.github/workflows/nightly-jax-test-unit.yaml b/.github/workflows/nightly-jax-test-unit.yaml index c1e48169a..7d70065f7 100644 --- a/.github/workflows/nightly-jax-test-unit.yaml +++ b/.github/workflows/nightly-jax-test-unit.yaml @@ -1,4 +1,5 @@ name: Nightly JAX unit test +run-name: Nightly JAX unit test (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }}) on: workflow_run: diff --git a/.github/workflows/nightly-pax-build.yaml b/.github/workflows/nightly-pax-build.yaml index 98224e98a..64728265a 100644 --- a/.github/workflows/nightly-pax-build.yaml +++ b/.github/workflows/nightly-pax-build.yaml @@ -1,4 +1,5 @@ name: Nightly Pax build +run-name: Nightly Pax build (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }}) on: workflow_run: diff --git a/.github/workflows/nightly-pax-test-mgmn.yaml b/.github/workflows/nightly-pax-test-mgmn.yaml index 5d785163f..db041cd77 100644 --- a/.github/workflows/nightly-pax-test-mgmn.yaml +++ b/.github/workflows/nightly-pax-test-mgmn.yaml @@ -1,4 +1,5 @@ name: Nightly Pax MGMN performance test +run-name: Nightly Pax MGMN performance test (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }}) on: workflow_run: diff --git a/.github/workflows/nightly-rosetta-pax-build.yaml b/.github/workflows/nightly-rosetta-pax-build.yaml index 504c9103a..537d8a78f 100644 --- a/.github/workflows/nightly-rosetta-pax-build.yaml +++ b/.github/workflows/nightly-rosetta-pax-build.yaml @@ -1,4 +1,5 @@ name: Nightly Rosetta Paxml build and test +run-name: Nightly Rosetta Paxml build and test (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }}) on: workflow_run: diff --git a/.github/workflows/nightly-rosetta-t5x-build-test.yaml b/.github/workflows/nightly-rosetta-t5x-build-test.yaml index 010dc9e79..09d39867c 100644 --- a/.github/workflows/nightly-rosetta-t5x-build-test.yaml +++ b/.github/workflows/nightly-rosetta-t5x-build-test.yaml @@ -1,4 +1,5 @@ name: Nightly Rosetta T5x build and test +run-name: Nightly Rosetta T5x build and test (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }}) on: workflow_run: @@ -95,12 +96,10 @@ jobs: test-t5x: needs: build - uses: ./.github/workflows/_test_t5x.yaml + uses: ./.github/workflows/_test_t5x_rosetta.yaml if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' with: T5X_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }} - # Disable packing b/c rosetta-t5x images run with TE by default, and TE does not currently support packing - 
EXTRA_GIN_ARGS: "--gin.train/utils.DatasetConfig.pack=False --gin.train_eval/utils.DatasetConfig.pack=False" secrets: inherit test-vit: @@ -136,7 +135,7 @@ jobs: echo "LABEL='Tests'" >> $GITHUB_OUTPUT if [[ ${{ needs.build.result }} == "success" ]]; then - if [[ $UNIT_STATUS == "success" ]] && [[ $T5X_STATUS == "success" ]] && [[ $VIT_STATUS == "success" ]] then + if [[ $UNIT_STATUS == "success" ]] && [[ $T5X_STATUS == "success" ]] && [[ $VIT_STATUS == "success" ]]; then COLOR=brightgreen MESSAGE="Unit passed / MGMN passed" elif [[ $UNIT_STATUS == "success" ]]; then diff --git a/.github/workflows/nightly-t5x-build.yaml b/.github/workflows/nightly-t5x-build.yaml index 2a6eda333..089f94069 100644 --- a/.github/workflows/nightly-t5x-build.yaml +++ b/.github/workflows/nightly-t5x-build.yaml @@ -1,4 +1,5 @@ name: Nightly T5X build +run-name: Nightly T5X build (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }}) on: workflow_run: diff --git a/.github/workflows/nightly-t5x-test-mgmn.yaml b/.github/workflows/nightly-t5x-test-mgmn.yaml index d13b68ce3..40fa91819 100644 --- a/.github/workflows/nightly-t5x-test-mgmn.yaml +++ b/.github/workflows/nightly-t5x-test-mgmn.yaml @@ -1,4 +1,5 @@ name: Nightly T5X MGMN performance test +run-name: Nightly T5X MGMN performance test (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }}) on: workflow_run: diff --git a/.github/workflows/nightly-te-build.yaml b/.github/workflows/nightly-te-build.yaml index 3fecd1067..2b9cc3c30 100644 --- a/.github/workflows/nightly-te-build.yaml +++ b/.github/workflows/nightly-te-build.yaml @@ -1,4 +1,5 @@ name: Nightly Transformer Engine build +run-name: Nightly Transformer Engine build (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }}) on: workflow_run: diff --git a/.github/workflows/nightly-te-test.yaml b/.github/workflows/nightly-te-test.yaml index c030af044..e4e03881e 100644 --- a/.github/workflows/nightly-te-test.yaml +++ b/.github/workflows/nightly-te-test.yaml @@ -1,4 +1,5 @@ name: Nightly Transformer Engine test +run-name: Nightly Transformer Engine test (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }}) on: workflow_run: diff --git a/.github/workflows/pax-cuda-121.yaml b/.github/workflows/pax-cuda-121.yaml index c468872fe..01330beaa 100644 --- a/.github/workflows/pax-cuda-121.yaml +++ b/.github/workflows/pax-cuda-121.yaml @@ -1,4 +1,5 @@ name: Nightly Containers on CUDA 12.1 +run-name: Nightly Containers on CUDA 12.1 (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }}) on: schedule: diff --git a/.github/workflows/weekly-base-build.yaml b/.github/workflows/weekly-base-build.yaml index fad8d74f4..7211f478e 100644 --- a/.github/workflows/weekly-base-build.yaml +++ b/.github/workflows/weekly-base-build.yaml @@ -1,4 +1,5 @@ name: Weekly base container build +run-name: Weekly base container build (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }}) on: schedule: diff --git a/README.md b/README.md index 8b1b68b98..26adbcda5 100644 --- a/README.md +++ b/README.md @@ -145,7 +145,7 @@ We will update this table as new models become available, so stay tuned. 
## Environment Variables -The [JAX image](ghcr.io/nvidia/jax) is embedded with the following flags and environment variables for performance tuning: +The [JAX image](https://github.com/NVIDIA/JAX-Toolbox/pkgs/container/jax) is embedded with the following flags and environment variables for performance tuning: | XLA Flags | Value | Explanation | | --------- | ----- | ----------- | @@ -158,7 +158,7 @@ The [JAX image](ghcr.io/nvidia/jax) is embedded with the following flags and env | -------------------- | ----- | ----------- | | `CUDA_DEVICE_MAX_CONNECTIONS` | `1` | use a single queue for GPU work to lower latency of stream operations; OK since XLA already orders launches | | `NCCL_IB_SL` | `1` | defines the InfiniBand Service Level ([1](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-ib-sl)) | -| `NCCL_NVLS_ENABLE` | `0` | Disables NVLink SHARP ([1](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-nvls-enable)). Future releases will re-enable this feature. | +| `CUDA_MODULE_LOADING` | `EAGER` | Disables lazy-loading ([1](https://docs.nvidia.com/cuda/cuda-c-programming-guide/#cuda-environment-variables)) which uses slightly more GPU memory. | ## FAQ (Frequently Asked Questions) diff --git a/rosetta/Dockerfile.t5x b/rosetta/Dockerfile.t5x index 79e73b0bd..3878ff5c0 100644 --- a/rosetta/Dockerfile.t5x +++ b/rosetta/Dockerfile.t5x @@ -13,6 +13,7 @@ FROM scratch as flax-mirror-source ADD --keep-git-dir=true https://github.com/google/flax.git#main / FROM ${BASE_IMAGE} AS rosetta +ENV ENABLE_TE=1 ARG GIT_USER_EMAIL ARG GIT_USER_NAME diff --git a/rosetta/patchlist-paxml.txt b/rosetta/patchlist-paxml.txt index 4f1162c76..a7a67f2ab 100644 --- a/rosetta/patchlist-paxml.txt +++ b/rosetta/patchlist-paxml.txt @@ -5,4 +5,4 @@ # - External Pull Requests (These are pull requests with upstream paxml and are of the form "pull/$PULLID/head") # - Note: Only the first column is used as a git-ref, so anything after is a comment -mirror/patch/add_dropout_support_to_te # adds Transformer Engine support (+ dropout support) +pull/46/head # adds Transformer Engine support diff --git a/rosetta/rosetta/projects/pax/README.md b/rosetta/rosetta/projects/pax/README.md index c8c30e9ab..9405abefb 100644 --- a/rosetta/rosetta/projects/pax/README.md +++ b/rosetta/rosetta/projects/pax/README.md @@ -1,28 +1,28 @@ # Pax -[Pax](https://github.com/google/paxml/tree/main) is a framework developed by Google optimized for running machine learning experiments using JAX. Pax consists of the Paxml and [Praxis](https://github.com/google/praxis/tree/main) repositories. Pax is maintained as a [distribution](../../../docs/DEVELOPMENT.md) within rosetta. This means that we cherry-pick the necessary changes to optimize Pax for GPUs on top of upstream Paxml and Praxis' `main` branches. +[Pax](https://github.com/google/paxml/tree/main) is a framework developed by Google optimized for running machine learning experiments using JAX. Pax consists of the Paxml and [Praxis](https://github.com/google/praxis/tree/main) repositories and is maintained as a [distribution](../../../docs/DEVELOPMENT.md) within Rosetta. This means that we cherry-pick the necessary changes to optimize Pax for GPUs on top of upstream Paxml and Praxis' `main` branches. We also provide support for FP8 training via [Transformer Engine](https://github.com/NVIDIA/TransformerEngine). 
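The patch list shown above (`rosetta/patchlist-paxml.txt`) is what drives this cherry-picking: the first column of each entry is an ordinary git ref, e.g. `pull/46/head` for an upstream pull request. As a rough sketch of how such a ref can be inspected by hand (illustrative only, not part of the container build; the clone path is arbitrary):

```
git clone https://github.com/google/paxml.git /tmp/paxml
cd /tmp/paxml
# GitHub exposes pull-request heads as fetchable refs
git fetch origin pull/46/head
# show the commits the Transformer Engine patch would bring in
git log --oneline -3 FETCH_HEAD
```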
Any `paxml/*` or `praxis/*` relative directory/file can be found in [google/paxml](https://github.com/google/paxml/tree/main) or [google/praxis](https://github.com/google/praxis/tree/main), respectively, but to -view the most up-to-date version of that directory/file with any GPU-specific patches, please see [Inspecting the source code](#inspecting-the-source-code). +view the most up-to-date version of that directory/file with any GPU-specific patches, please see [Inspecting the Source Code](#inspecting-the-source-code). ## Hardware Specifications -Convergence and performance has been validated on NVIDIA DGX A100 (8x A100 80G) nodes; for details, please refer to the [Configs](#configs) section below. We provide both singlenode and multinode pre-training support. If running on a machine with less than 80G memory, some of the default configurations may run out of memory; if you run out of memory and have more GPUs available, increase your GPU count and decrease your batch size per GPU. +Convergence and performance have been validated on NVIDIA DGX H100 (8x H100 80G) and A100 (8x A100 80G) nodes; for details, please refer to the [Configs](#configs) section below. We provide both singlenode and multinode pre-training support. If running on a machine with less than 80G memory, some of the default configurations may run out of memory; if you run out of memory and have more GPUs available, increase your GPU count and decrease your batch size per GPU. ## Containers -We provide a fully built and ready-to-use container which includes the latest optimizations, experimental features, and examples benchmarked for multi-node, multi-GPU training: `nvcr.io/nvidia/jax:23.08-paxml-py3`. This container also provides bfloat16 [Transformer Engine](https://github.com/NVIDIA/TransformerEngine) support. +We provide fully built and ready-to-use containers which include the latest optimizations, experimental features, and examples benchmarked for multi-node, multi-GPU training: `nvcr.io/nvidia/jax:23.10-paxml-py3` (multi-arch), `nvcr.io/nvidia/jax:23.10-paxml-py3-amd64` and `nvcr.io/nvidia/jax:23.10-paxml-py3-arm64`. These containers also provide FP8 support via [Transformer Engine](https://github.com/NVIDIA/TransformerEngine). Verified containers will be updated periodically, but if you wish to use the bleeding edge (which may come with unexpected behavior), please use `ghcr.io/nvidia/pax:latest`. We also provide nightly dated images with the naming pattern `ghcr.io/nvidia/pax:nightly-YYYY-MM-DD`, but we encourage you to use the latest ones for the best performance. For more information on the Pax build and for details on how to manually build the Pax distribution, please refer to [DEVELOPMENT.md](../../../docs/DEVELOPMENT.md). -*Note*: All paths mentioned in subsequent sections are relative to the top-level directory of the Paxml repository. When working interactively with containers, make sure you are in `/opt/paxml` before running any commmands. +*Note*: All paths mentioned in subsequent sections are relative to the top-level directory of the Paxml repository. When working interactively with containers, make sure you navigate to `/opt/paxml` before running any commands. ### Launching a container Use the following command to launch a container: ``` docker run -ti --gpus=all --net=host --ipc=host -v :/opt/paxml/datasets -v :/opt/paxml/workspace -v :/opt/paxml/vocab -w /opt/paxml /bin/bash ``` -where `DATASET_PATH` is the path to the Pile or Lambada dataset. 
If these datasets have not yet been downloaded, they can be downloaded inside of the container (see [Downloading The Pile and Lambada Datasets](#Downloading-the-pile-and-lambada-datasets) for more). `WORKSPACE_PATH` is the path to the directory where you would like to store any persistent files, and `VOCAB_PATH` is the path to the pretrained sentencepiece model to use during tokenization (see [Downloading the SentencePiece Model](#Downloading-the-sentencepiece-model) for more). +where `DATASET_PATH` is the path to the Pile or Lambada dataset. If these datasets have not yet been downloaded, they can be downloaded from inside of the container (see [Downloading The Pile and Lambada Datasets](#Downloading-the-pile-and-lambada-datasets) for more). `WORKSPACE_PATH` is the path to the directory where you would like to store any persistent files, and `VOCAB_PATH` is the path to the pretrained SentencePiece model to use during tokenization (see [Downloading the SentencePiece Model](#Downloading-the-sentencepiece-model) for more). ## Downloading The Pile and Lambada Datasets -The given models are trained using The Pile dataset and evaluated using the Lambada dataset. The scripts [download_the_pile.py](https://github.com/google/paxml/blob/main/paxml/contrib/gpu/scripts_gpu/download_the_pile.py) and [download_lambada.py](https://github.com/google/paxml/blob/main/paxml/contrib/gpu/scripts_gpu/download_lambada.py) will download The Pile and the Lambada datasets to the `TFDS_DATA_DIR` enviroment variable. To control the location of the downloaded datasets, use the following command prior to running the download scripts: `export TFDS_DATA_DIR=`. After the data has been successfully downloaded, use the same `TFDS_DATA_DIR` when running experiments. +The GPT model configs we provide are trained using The Pile dataset and evaluated using the Lambada dataset. The scripts [download_the_pile.py](https://github.com/google/paxml/blob/main/paxml/contrib/gpu/scripts_gpu/download_the_pile.py) and [download_lambada.py](https://github.com/google/paxml/blob/main/paxml/contrib/gpu/scripts_gpu/download_lambada.py) will download The Pile and Lambada datasets to the `TFDS_DATA_DIR` enviroment variable. To control the location of the downloaded datasets, use the following command prior to running the download scripts: `export TFDS_DATA_DIR=`. After the data has been successfully downloaded, use the same `TFDS_DATA_DIR` when running experiments. ## Downloading the SentencePiece Model Pax models require a pretrained SentencePiece model to tokenize the datasets. The SentencePiece model used in the following experiments is `gs://mlperf-llm-public2/vocab/c4_en_301_5Mexp2_spm.model`. This model was trained using [these instructions](https://github.com/sgpyc/training/blob/paxml-llm-draft/large_language_model/paxml/utils/generate_spm.md). Use the following commands to download the tokenizer locally. This should be done _prior_ to launching the container. @@ -34,7 +34,7 @@ You can then use the following mount to attach the tokenizer to your container: docker run -v ${PWD}/c4_sentencepiece/c4_en_301_5Mexp2_spm.model:/opt/paxml/vocab ... ``` -## Inspecting the source code +## Inspecting the Source Code If you would like to inspect Pax's source code (`paxml/*` and `praxis/*`) to learn more about what is being run, you can do so by inspecting the source within the container. 
Here are some examples: @@ -44,37 +44,37 @@ cd $(python -c 'import paxml; print(paxml.__path__[0])')/../paxml/contrib/gpu/sc # (Non-interactive): View paxml/contrib/gpu/scripts_gpu/configs.py FILE=paxml/contrib/gpu/scripts_gpu/configs.py -docker run --entrypoint="" --rm $CONTAINER sh -c 'cat $(python -c "import paxml; print(*paxml.__path__)" 2>/dev/null)/../'$FILE +docker run --entrypoint="" --rm sh -c 'cat $(python -c "import paxml; print(*paxml.__path__)" 2>/dev/null)/../'$FILE ``` ## Running a Job -Note that when training with The Pile dataset, you must provide the `TFDS_DATA_DIR` as a command-line argument and a `VOCAB_PATH` (the path to a pretrained sentencepiece model) as an environment variable (see the bash scripts below for examples). +Note that when training with The Pile dataset, you must provide the `TFDS_DATA_DIR` as a command-line argument and a `VOCAB_PATH` (the path to a pretrained SentencePiece model) as an environment variable. See the bash scripts below for examples. ### Quick Runs #### Interactive: Single Node -See [run_pile_singlenode.sh](https://github.com/google/paxml/blob/main/paxml/contrib/gpu/scripts_gpu/run_pile_singlenode.sh) for an example of training a 126m model on a single node using The Pile. Once inside of your container, this script can be run interactively using the following command: +See [run_pile_singlenode.sh](https://github.com/google/paxml/blob/main/paxml/contrib/gpu/scripts_gpu/run_pile_singlenode.sh) for an example of training a 126M parameter model on a single node using The Pile. Once inside of your container, this script can be run interactively using the following command: ``` bash paxml/contrib/gpu/scripts_gpu/run_pile_singlenode.sh ``` where `TFDS_DATA_DIR` is the path to The Pile dataset, `VOCAB_PATH` is the path to the pretrained SentencePiece `.model` file, and `LOGDIR` is the relative path of the directory to which to write checkpoints and logging information. `PERCORE_BATCH_SIZE` is the batch size per GPU _prior_ to sharding according to the parallel strategy. See [Customized Runs](#Customized-runs) for more information about this hyperparameter. -For example, to train the 126m model using a percore batch size of 4 on 8 gpus, you can use the following command: +For example, to train the 126M model using a percore batch size of 4 on 8 H100 gpus, you can use the following command: ``` -bash paxml/contrib/gpu/scripts_gpu/run_pile_singlenode.sh /opt/paxml/datasets /opt/paxml/vocab bfloat16 8 4 log_dir +ENABLE_FP8=1 bash paxml/contrib/gpu/scripts_gpu/run_pile_singlenode.sh /opt/paxml/datasets /opt/paxml/vocab bfloat16 8 4 log_dir ``` -See [run_lambada_singlenode.sh](https://github.com/google/paxml/blob/main/paxml/contrib/gpu/scripts_gpu/run_lambada_singlenode.sh) for an example of running zero-shot evaluation on the 126m model using the Lambada dataset. Use the following command to run this script: +See [run_lambada_singlenode.sh](https://github.com/google/paxml/blob/main/paxml/contrib/gpu/scripts_gpu/run_lambada_singlenode.sh) for an example of running zero-shot evaluation on the 126M model using the Lambada dataset. Use the following command to run this script: ``` bash paxml/contrib/gpu/scripts_gpu/run_lambada_singlenode.sh ``` `TFDS_DATA_DIR` should contain the path to the Lambada dataset and `LOGDIR` should match the `LOGDIR` from the pretraining run. 
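Putting the two quick-run commands above together, a minimal interactive session could look like the following sketch (paths are illustrative, and it is assumed here that `run_lambada_singlenode.sh` takes the same positional arguments as `run_pile_singlenode.sh`):

```
cd /opt/paxml
# pretrain the 126M model on The Pile, then run zero-shot Lambada evaluation against the same LOGDIR
bash paxml/contrib/gpu/scripts_gpu/run_pile_singlenode.sh /opt/paxml/datasets /opt/paxml/vocab bfloat16 8 4 log_dir
bash paxml/contrib/gpu/scripts_gpu/run_lambada_singlenode.sh /opt/paxml/datasets /opt/paxml/vocab bfloat16 8 4 log_dir
```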
#### Multi Node -See [example_slurm_pile.sub](https://github.com/NVIDIA/JAX-Toolbox/blob/main/rosetta/rosetta/projects/pax/scripts/example_slurm_pile.sub) for an example slurm submit file that launches an 8-node run with a 126 million parameter GPT model. +See [example_slurm_pile.sub](https://github.com/NVIDIA/JAX-Toolbox/blob/main/rosetta/rosetta/projects/pax/scripts/example_slurm_pile.sub) for an example slurm submit file that launches an 8-node training run with a 126 million parameter GPT model. To launch `example_slurm_pile.sub`, run the following command: ``` -CONTAINER= BASE_WORKSPACE_DIR= BASE_TFDS_DATA_DIR= BASE_VOCAB_PATH= LOG_DIR_LOCAL= OUTPUT_DIR= PREC=bfloat16 GPUS_PER_NODE=8 PERCORE_BATCH_SIZE=4 sbatch -N 8 -A -p -J paxml/contrib/gpu/scripts_gpu/example_slurm_pile.sub +CONTAINER= BASE_WORKSPACE_DIR= BASE_TFDS_DATA_DIR= BASE_VOCAB_PATH= LOG_DIR_LOCAL= OUTPUT_DIR= PREC=bfloat16 GPUS_PER_NODE=8 PERCORE_BATCH_SIZE=4 ENABLE_FP8= sbatch -N 8 -A -p -J scripts/example_slurm_pile.sub ``` where `BASE_WORKSPACE_DIR`, `BASE_TFDS_DATA_DIR`, and `BASE_VOCAB_PATH` are absolute paths and `LOG_DIR` and `OUTPUT_DIR` are relative to `BASE_WORKSPACE_DIR`. @@ -93,19 +93,26 @@ Paxml uses [Fiddle](https://github.com/google/fiddle/tree/main) for configuring For example, in our `*.sh` scripts, we override the default values of `FPROP_DTYPE`, `ICI_MESH_SHAPE`, and `PERCORE_BATCH_SIZE`. We provide a list of some of the frequently overridden hyperparameters, and an explanation of each, below: -- `ICI_MESH_SHAPE`: This refers to the parallelism strategy used on chips connected by a fast network (e.g. NVLink). `ICI_MESH_SHAPE` typically has 3 dimensions, `[data, fsdp, tensor]`, corresponding to data parallelism (DP), fully-sharded data parallelism (FSDP/ZeRO-3), and tensor parallelism (TP), respectively. To use pure data parallelism, you should set `ICI_MESH_SHAPE` to `[NUM_GPUS, 1, 1]`. -- `DCN_MESH_SHAPE`: This refers to the parallelism strategy for machines connected by a datacenter network. This is the generally parallel strategy used _across_ nodes. +- `ICI_MESH_SHAPE`: This refers to the parallelism strategy used on chips connected by a fast network (e.g. NVLink). `ICI_MESH_SHAPE` typically has 3 dimensions, `[data, fsdp, tensor]`, corresponding to data parallelism (DP), fully-sharded data parallelism (FSDP/ZeRO-3), and tensor parallelism (TP), respectively. For example,to use pure data parallelism, you should set `ICI_MESH_SHAPE` to `[NUM_GPUS, 1, 1]`. +- `DCN_MESH_SHAPE`: This refers to the parallelism strategy for machines connected by a datacenter network. In our case, this refers to the parallel strategy used _across_ nodes. It has the same dimensions as `ICI_MESH_SHAPE`. - `PERCORE_BATCH_SIZE`: This is the batch size loaded by each worker _prior_ to sharding the data according to the parallel strategy. We should always have that `GLOBAL_BATCH_SIZE = PERCORE_BATCH_SIZE * NUM_GPUS`, regardless of the parallel strategy. Note that a consequence of this is that `PERCORE_BATCH_SIZE` will not always equal `MICROBATCH_SIZE`, particularly when using tensor parallelism (TP). If using 2-way TP, for example, `MICROBATCH_SIZE` will be twice the `PERCORE_BATCH_SIZE`. If using tensor or pipeline parallelism, `PERCORE_BATCH_SIZE` may be fractional. For example, when using 2-way TP, setting `PERCORE_BATCH_SIZE` to 0.5 will result in a microbatch size of `PERCORE_BATCH_SIZE * TP = 1`. -- `NUM_LAYERS`, `NUM_HEADS`, `MODEL_DIMS`, `HIDDEN_DIMS`: These are hyperparameters of the transformer model. 
`MODEL_DIMS` refers to the hidden dimension of the transformer, and `HIDDEN_DIMS` refers to the hidden dimension of the transformer feed-forward network. +- `NUM_LAYERS`, `NUM_HEADS`, `MODEL_DIMS`, `HIDDEN_DIMS`: These are hyperparameters of the transformer model. `MODEL_DIMS` refers to the hidden dimension of the transformer and `HIDDEN_DIMS` refers to the hidden dimension of the transformer feed-forward network. We provide three "base" configurations in `paxml/contrib/gpu/scripts_gpu/configs.py`. For more information about these configurations and how to run experiments using them, please refer to the [Configs](#Configs) section below. ### Transformer Engine -Training using Transformer Engine (TE) with bfloat16 precision can be enabled via the environment variable `ENABLE_TE`. To enable TE, simply add the following line to `run_pile_multinode.sh` (or whatever bash script you are using to run experiments): +Training using Transformer Engine (TE) with bfloat16 precision is controlled via the environment variable `ENABLE_TE`. TE is enabled by default in the prebuilt container, but if you would like to disable TE, you can do so by flipping the value of `ENABLE_TE` in the container: ``` -export ENABLE_TE=1 +export ENABLE_TE=0 ``` -Note that packing is currently not supported when using TE. All configs disable packing by default, but beware that if packing is manually enabled, training with TE will error. + +FP8 training is controlled via the `ENABLE_FP8` environment variable. To enable FP8 training, set `ENABLE_FP8=1`. For example, the following command trains a 126M model on a single node using FP8: +``` +ENABLE_FP8=1 bash paxml/contrib/gpu/scripts_gpu/run_pile_singlenode.sh /opt/paxml/datasets /opt/paxml/vocab bfloat16 8 4 log_dir +``` + +Note that packing is currently not supported when using TE. All configs disable packing by default, but beware that if packing is manually enabled, training with TE will error. + ## XLA Flags We recommend setting the following XLA flags when running experiments: @@ -116,7 +123,7 @@ We recommend setting the following XLA flags when running experiments: 4. `--xla_gpu_enable_async_reduce_scatter=true`: Allows XLA:GPU to run Reduce Scatter NCCL kernels on a separate CUDA stream to allow overlap with compute kernels 5. `--xla_gpu_enable_async_all_reduce=true`: Allows XLA:GPU to run All Reduce NCCL kernels on a separate CUDA stream to allow overlap with compute kernels. 6. `--xla_gpu_enable_highest_priority_async_stream=true`: Allows XLA to prioritize the launch of NCCL kernels before GeMMs to ensure enough SMs are available for async communication kernels. -7. `--xla_gpu_all_reduce_combine_threshold_bytes=51200`: Combines NCCL All Reduce kernels until threshold size is reached. +7. `--xla_gpu_all_reduce_combine_threshold_bytes=`: Combines NCCL All Reduce kernels until threshold size is reached. For 126M, we recommend setting this value to 33554432. For 5B and 175B, we recommend 51200. 8. `--xla_gpu_enable_triton_gemm=false`: Disallows Triton GeMM kernels; uses CUBLAS GeMM kernels instead. CUBLAS kernels are currently better tuned for GPUs and thus provide better performance 9. `--xla_gpu_cuda_graph_level=0`: Disallows XLA from using CUDA graphs. 
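Of the flags above, `--xla_gpu_all_reduce_combine_threshold_bytes` is the only one whose recommended value depends on the model size. When launching training directly rather than through the `run_pile_*` scripts (which set these flags themselves; see the `BASE_XLA_FLAGS` example that follows), one simple sketch is to append just that flag to the container's default `XLA_FLAGS`, which does not set it:

```
# 126M recommendation; for 5B and 175B use 51200 instead
export XLA_FLAGS="${XLA_FLAGS} --xla_gpu_all_reduce_combine_threshold_bytes=33554432"
```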
@@ -124,55 +131,73 @@ These flags are enabled by default in `paxml/contrib/gpu/scripts_gpu/run_pile_mu ``` export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_async_all_gather=true --xla_gpu_enable_async_reduce_scatter=true --xla_gpu_enable_triton_gemm=false" ``` +For the 126M model, we recommend setting `--xla_gpu_all_reduce_combine_threshold_bytes=33554432`, which is different from the default value in `paxml/contrib/gpu/scripts_gpu/run_pile_multinode.sh`. To overwrite the default XLA flags set in the script, set the `BASE_XLA_FLAGS` environment variable prior to calling `run_pile_multinode` as follows: + +``` +BASE_XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false + --xla_gpu_simplify_all_fp_conversions --xla_gpu_enable_async_all_gather=true + --xla_gpu_enable_async_reduce_scatter=true --xla_gpu_enable_highest_priority_async_stream=true + --xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_all_reduce_combine_threshold_bytes=33554432 + --xla_gpu_graph_level=0 --xla_gpu_enable_async_all_reduce=true" bash run_pile_multinode.sh ... +``` ## Configs -We provide three "base" model configurations in `paxml/contrib/gpu/scripts_gpu/configs.py`. The first is a 126 million parameter GPT model. Convergence using The Pile dataset has been verified with this model. The remaining configs are 5 billion and 175 billion parameter models. Both 5B and 175B are provided for benchmarking purposes and have not been thoroughly tested for convergence to date. +We provide three "base" model configurations in `paxml/contrib/gpu/scripts_gpu/configs.py`. The first is a 126 million parameter GPT model. Convergence using The Pile dataset has been verified with this model. The remaining configs are 5 billion and 175 billion parameter models. Both 5B and 175B are provided primarily for benchmarking purposes and have been less thoroughly tested for convergence. -The table below describes current performance of the given configs. Experiments were run using NVIDIA DGX A100 (8x A100 80G) nodes. Note that Lambada accuracy reported corresponds to the best accuracy seen across the run. Estimated walltime denotes the aproximate time to train each model to completion (i.e. number of days to reach `MAX_STEPS` number of steps as described in `configs.py`). +The tables below describe current performance of the given configs. Experiments were run using NVIDIA DGX A100 80G and H100 80G nodes. Note that Lambada accuracy reported corresponds to the best accuracy seen across the run. Estimated walltime denotes the approximate time to train each model to completion (i.e. number of days to reach `MAX_STEPS` number of steps as described in `configs.py`). -| Size | #GPUs | DP | FSDP | TP | BS / GPU | Sequences/Sec (bf16 / TE bf16) | Estimated Walltime (days, bf16 / TE bf16) | Lambada Accuracy | Convergence Log | -| ---- | ----- | -- | ---- | -- | ---------| ---------------| ------------------------- | ---------------- |---------------- | -| 126M | 64 |64 |1 |1 | 4 | 1761.3 / 2339.8 | 1.01 / 0.76 | 0.397 (± 0.012) | [log](https://tensorboard.dev/experiment/RCroDLAUQzGUoudzqD1NmQ/) | -| 5B | 256 | 1 |256 |1 | 8 | 465.45 / 598.83 | 3.82 / 2.97 | N/A | [log](https://tensorboard.dev/experiment/AyXAn8ZDRheUARN1NMJ1sw) | -| 175B | 256 |1 |256 |1 | 6 | 18.29 / 19.62 | 72.92 / 67.97 | N/A | [log](https://tensorboard.dev/experiment/NJnv5LbdQby2PcZGPnTRrA/) | N/A | +### A100 Results -*Note*: Estimated walltime is computed assuming full throughput continuously. 
In practice, true walltime may be greater due to compilation overheads, interleaved evaluation, and checkpointing. A number of the linked convergence runs were completed using older software; thus, reported throughput does not match current results (notably for 126M and 5B bf16). The most up-to-date throughput numbers are reported in the table. +| Size | GPU | Precision | #GPUs | DP | FSDP | TP | BS / GPU | Sequences/Sec | Est. Walltime (days) | Lambada Accuracy (± standard deviation) | Convergence Log | +| ---- | ----- |----- |----- | -- | ---- | -- | ---------| ---------------| ------------------------- | ---------------- |---------------- | +| 126M | A100 80G SXM | BF16 | 64 |64 |1 |1 | 4 | 1877.20 | 0.95 | 0.397 (± 0.012) | [log](https://tensorboard.dev/experiment/RCroDLAUQzGUoudzqD1NmQ/) | +| 5B | A100 80G SXM | BF16 | 256 | 1 |256 |1 | 8 | 465.45 | 3.82 | N/A | | +| 175B | A100 80G SXM | BF16 | 256 |1 |256 |1 | 6 | 18.29 | 72.92 | N/A | | +| 126M | A100 80G SXM | TE BF16 | 64 |64 |1 |1 | 4 | 2512.2 | 0.71 | N/A | | +| 5B | A100 80G SXM | TE BF16 | 256 | 1 |256 |1 | 8 | 586.82 | 3.02 | N/A | | +| 175B | A100 80G SXM | TE BF16 | 256 |1 |256 |1 | 6 | 19.47 | 68.49 | N/A | | -The runs in 5B convergence log were trained for around 26k (TE) and 45k (no TE) steps at a global batch size of 2048 and a sequence length of 2048, amounting to around 109 billion and 189 billion consumed tokens for TE, non-TE respectively. The 175B convergence log was trained for a total of around 700 steps at a global batch size of 1536 and a sequence length of 2048, amounting to around 2.2 billion consumed tokens. Finally, 175B was trained using the [C4 dataset](https://github.com/mlcommons/training/tree/master/large_language_model/paxml#2-dataset), while 126M and 5B were both trained using the Pile. +### H100 Results + +| Size | GPU | Precision | #GPUs | DP | FSDP | TP | BS / GPU | Sequences/Sec | Est. Walltime (days) | Lambada Accuracy (± standard deviation) | Convergence Log | +| ---- | ----- |----- |----- | -- | ---- | -- | ---------| ---------------| ------------------------- | ---------------- |---------------- | +| 126M | H100 80G SXM | TE BF16 | 64 |64 |1 |1 | 4 | 4143.21 | 0.43 | 0.425 (± 0.018) | [log](https://tensorboard.dev/experiment/GgDMwODzQjm9kVc9H6259A/) | +| 5B | H100 80G SXM | TE BF16 | 256 | 1 |256 |1 | 8 | 1066.67 | 1.67 | N/A | | +| 175B | H100 80G SXM | TE BF16 | 256 |1 |256 |1 | 6 | 44.01 | 30.35 | N/A | | +| 5B | H100 80G SXM | TE FP8 | 256 | 1 |256 |1 | 8 | 1288.05 | 1.38 | N/A | [log](https://tensorboard.dev/experiment/i5kiGeQpRRapswa68RkYHQ/) | +| 175B | H100 80G SXM | TE FP8 | 256 |1 |256 |1 | 6 | 65.64 | 20.33 | N/A | [log](https://tensorboard.dev/experiment/HvpU324wQYarwgvd9P3Uew/) | + + +*Note*: Estimated walltime is computed assuming full throughput continuously. In practice, true walltime may be greater due to compilation overheads, interleaved evaluation, and checkpointing. A number of the linked convergence runs were completed using older software; thus, throughput reported in the linked logs may not match current results. The most up-to-date throughput numbers are reported in the table. + +5B FP8 was trained for 75,000 steps at a global batch size of 2048 and a sequence length of 2048, amounting to around 300 billion consumed tokens. 175B FP8 was trained for a total of around 1,000 steps at a global batch size of 1536 and a sequence length of 2048, amounting to around 3.14 billion consumed tokens.
175B was trained using the [C4 dataset](https://github.com/mlcommons/training/tree/master/large_language_model/paxml#2-dataset) and restores from an [initial MLPerf checkpoint](https://github.com/mlcommons/training/tree/master/large_language_model/paxml#initial-checkpoint). 126M and 5B were both trained using the Pile. ### Running an Experiment with Base Configs -To run an experiment with any base model configuration with the default parallel strategy reported in the table, copy [run_pile_multinode.sh](https://github.com/google/paxml/blob/main/paxml/contrib/gpu/scripts_gpu/run_pile_multinode.sh) to your workspace and make the following modifications: replace `--fdl_config=paxml.contrib.gpu.scripts_gpu.configs.Pile126M` with the experiment you are interested in running (e.g. `paxml.contrib.gpu.scripts_gpu.configs.GPT5B` or `paxml.contrib.gpu.scripts_gpu.configs.GPT175B`) and remove `--fdl.ICI_MESH_SHAPE="[${TRAIN_GPUS}, 1, 1]"`. The resulting bash script (call it `run_my_model_multinode.sh`) can be passed into `example_slurm_pile.sub` using the following command. This command presumes that `run_my_model_multinode.sh` lives in `BASE_WORKSPACE_DIR`. +To run an experiment with any base model configuration with the default parallel strategy reported in the table, copy [run_pile_multinode.sh](https://github.com/google/paxml/blob/main/paxml/contrib/gpu/scripts_gpu/run_pile_multinode.sh) to your workspace and make the following modifications: replace `--fdl_config=paxml.contrib.gpu.scripts_gpu.configs.Pile126M` with the experiment you are interested in running (e.g. `paxml.contrib.gpu.scripts_gpu.configs.GPT5B` or `paxml.contrib.gpu.scripts_gpu.configs.GPT175B`) and remove `--fdl.ICI_MESH_SHAPE="[${NUM_GPUS}, 1, 1]"` and `--fdl.DCN_MESH_SHAPE="[${SLURM_JOB_NUM_NODES}, 1, 1]"`. The resulting bash script (call it `run_my_model_multinode.sh`) can be passed into `example_slurm_pile.sub` using the following command. This command presumes that `run_my_model_multinode.sh` lives in `BASE_WORKSPACE_DIR`. ``` -BASE_SCRIPT=run_my_model_multinode.sh CONTAINER= BASE_WORKSPACE_DIR= BASE_TFDS_DATA_DIR= BASE_VOCAB_PATH= LOG_DIR_LOCAL= OUTPUT_DIR= PREC= GPUS_PER_NODE= PERCORE_BATCH_SIZE= sbatch -N -A -p -J paxml/contrib/gpu/scripts_gpu/example_slurm_pile.sub +BASE_SCRIPT=run_my_model_multinode.sh CONTAINER= BASE_WORKSPACE_DIR= BASE_TFDS_DATA_DIR= BASE_VOCAB_PATH= LOG_DIR_LOCAL= OUTPUT_DIR= PREC= GPUS_PER_NODE= PERCORE_BATCH_SIZE= ENABLE_FP8= sbatch -N -A -p -J scripts/example_slurm_pile.sub +``` +Here, it is assumed that you are running with the number of nodes reported in the table. If using a different node count, scale `DCN_MESH_SHAPE` accordingly. For example, the default value of `DCN_MESH_SHAPE` for `paxml.contrib.gpu.scripts_gpu.configs.GPT5B` is `[1,32,1]`. If running on 16 nodes, adjust `DCN_MESH_SHAPE` as follows: +``` +--fdl.DCN_MESH_SHAPE=[1,16,1] ``` + ## Known Issues -* The Paxml container does not fully support Hopper yet. Future releases will add Hopper support. * Pipeline parallelism is not supported with NVIDIA Transformer Engine enabled in the Paxml container. -* There are known Common Vulnerabilities and Exposures (CVE) that affect the Paxml container related to TensorFlow 2.9.x due to pinning TensorFlow to 2.9.x in Paxml and Lingvo. We will fix these in the next release. 
The known CVEs are: - * CVE-2023-25668 - * CVE-2023-25658 - * CVE-2023-25663 - * CVE-2023-25664 - * CVE-2023-25664 - * CVE-2023-25672 - * CVE-2023-25674 - * CVE-2023-25660 - * CVE-2023-27579 - * CVE-2023-25671 - * CVE-2023-25659 - * CVE-2023-25662 - * CVE-2023-25675 - * CVE-2023-25801 - * CVE-2023-25670 - * CVE-2023-25669 - * CVE-2023-25665 - * CVE-2023-25673 - * CVE-2023-25666 -* The Paxml nightlies disable `NCCL_NVLS_ENABLE=0` ([doc](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-nvls-enable)). Future releases will re-enable this feature. +* The release container has a known XLA bug which affects single-process training in some cases. This bug has been fixed in newer XLA versions. If running into issues with single-process training, try using a Pax nightly container after 10/3. You can also try cherry-picking [this commit](https://github.com/openxla/xla/commit/aa8e7340cb319b9419a097155874bf105da05e1d) in the tested container. +* Infrequent hangs have been observed in multinode settings. Setting `CUDA_MODULE_LOADING=EAGER` helps with these hangs. This environment variable is set by default in `nvcr.io/nvidia/jax:23.10-paxml-py3`, `nvcr.io/nvidia/jax:23.10-paxml-py3-amd64`, and `nvcr.io/nvidia/jax:23.10-paxml-py3-arm64`. +* We currently see unexpected convergence behavior when dropout is used with Transformer Engine. Default configs do not enable dropout within transformer layers and thus should be unaffected by this bug, but users may encounter this bug if manually enabling dropout in their models. ## Changelog +### 10/26/2023 +- Enabled BF16 Transformer Engine by default +- Added FP8 Transformer Engine support +- Updated 5B config to disable dropout in transformer layers +- bfloat16 performance + - 126M performance is 6% higher than 8/29, bringing the overall regression with respect to 7/11 to around 10%. We will continue to improve 126M performance in future releases. + ### 8/29/2023 - Added bfloat16 Transformer Engine support - Disabled packing by default in all base configurations for TE compatibility diff --git a/rosetta/rosetta/projects/pax/scripts/example_slurm_pile.sub b/rosetta/rosetta/projects/pax/scripts/example_slurm_pile.sub index 47e122432..1b479cd1e 100644 --- a/rosetta/rosetta/projects/pax/scripts/example_slurm_pile.sub +++ b/rosetta/rosetta/projects/pax/scripts/example_slurm_pile.sub @@ -29,16 +29,18 @@ set -eux # File system and volume glue code #------------------------------------------------------------------------------- # << CHANGE ! >> -CONTAINER="${CONTAINER:-nvcr.io/nvidia/jax:23.08-paxml-py3}" +CONTAINER="${CONTAINER:-nvcr.io/nvidia/jax:23.10-paxml-py3}" # << CHANGE ! 
>> BASE_WORKSPACE_DIR=${BASE_WORKSPACE_DIR} ## location to write logs and checkpoints to BASE_TFDS_DATA_DIR=${BASE_TFDS_DATA_DIR} BASE_VOCAB_PATH=${BASE_VOCAB_PATH} PAXML_DIR=${PAXML_DIR:-/opt/paxml} +ENABLE_TE=${ENABLE_TE:-1} +ENABLE_FP8=${ENABLE_FP8:-0} # Default env variables for paths required by pax training scripts -WORKSPACE_DIR=/mnt/workspace +WORKSPACE_DIR=/opt/paxml/workspace TFDS_DATA_DIR=/mnt/datasets GPT_VOCAB_PATH=/mnt/vocab @@ -61,7 +63,7 @@ if [[ -z "${BASE_SCRIPT:-}" ]]; then export BASE_SCRIPT="${PAXML_DIR}/paxml/contrib/gpu/scripts_gpu/run_pile_multinode.sh" echo "Using default BASE_SCRIPT=$BASE_SCRIPT" else - export BASE_SCRIPT="/mnt/workspace/${BASE_SCRIPT}" + export BASE_SCRIPT="${WORKSPACE_DIR}/${BASE_SCRIPT}" echo "Using custom BASE_SCRIPT=$BASE_SCRIPT" fi @@ -69,7 +71,7 @@ cmd="$(cat < 170 + assert bps > (155 * 0.9) def test_dali_cls_preprocessing(dummy_wds_metadata): diff --git a/rosetta/test-vit.sh b/rosetta/test-vit.sh index 87bc210fa..b74d7e347 100755 --- a/rosetta/test-vit.sh +++ b/rosetta/test-vit.sh @@ -136,7 +136,8 @@ with wds.TarWriter(out_tar_path) as dst: EOF -set -x +set -exou pipefail + DATA_PATH="/tmp/dummy_vit_data" python -m generate_dummy_wds --output_tar_path=${DATA_PATH} @@ -151,5 +152,4 @@ python -m t5x.train \ --gin_search_paths=/opt/rosetta \ --gin.CheckpointConfig.save=None \ $([[ $MULTIPROCESS != 0 ]] && echo --multiprocess_gpu) -set +x echo "Output at ${OUTPUT}"
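For readers unfamiliar with the `set -exou pipefail` line adopted in `test-vit.sh` above, the standalone sketch below (the echo command is only a placeholder) spells out what each shell option does:
```
#!/bin/bash
# -e: exit immediately when any command returns a non-zero status
# -x: echo each command before it runs, which makes CI logs easier to follow
# -o pipefail: a pipeline fails if any stage fails, not only the last one
# -u: treat expansion of unset variables as an error
set -exou pipefail
echo "strict mode enabled"
```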