Skip to content

Nightly Rosetta T5x build and test #141

Nightly Rosetta T5x build and test

Nightly Rosetta T5x build and test #141

# Workflow identity and triggers.
name: Nightly Rosetta T5x build and test

on:
  # Chained trigger: fires when a "Nightly T5X build" run on main completes.
  # The jobs below inspect the conclusion of that run themselves.
  workflow_run:
    workflows:
      - Nightly T5X build
    types:
      - completed
    branches:
      - main
  # Manual trigger with an overridable base image and an opt-in publish switch.
  workflow_dispatch:
    inputs:
      BASE_IMAGE:
        description: 'T5x image built by NVIDIA/JAX-Toolbox'
        type: string
        required: true
        default: 'ghcr.io/nvidia/upstream-t5x:latest'
      PUBLISH:
        description: "Publish dated images and update the 'latest' tag?"
        type: boolean
        required: false
        default: false
# Workflow-wide constants; the metadata job reads them through the `env`
# context to derive the default base image and the BASE_LIBRARY output.
env:
  DOCKER_REGISTRY: ghcr.io/nvidia
  BASE_LIBRARY: t5x
# Token scopes granted to this workflow.
permissions:
  # to fetch code
  contents: read
  # to cancel previous workflows
  actions: write
  # to upload container
  packages: write
jobs:
# Computes the values shared by the downstream jobs and exposes them as job
# outputs. Runs for manual dispatches and for workflow_run events whose
# triggering run concluded successfully.
metadata:
  if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch'
  runs-on: ubuntu-22.04
  outputs:
    BUILD_DATE: ${{ steps.meta-vars.outputs.BUILD_DATE }}
    BASE_LIBRARY: ${{ steps.meta-vars.outputs.BASE_LIBRARY }}
    BASE_IMAGE: ${{ steps.meta-vars.outputs.BASE_IMAGE }}
    PUBLISH: ${{ steps.meta-vars.outputs.PUBLISH }}
  steps:
    - name: Set build metadata
      id: meta-vars
      shell: bash -x -e {0}
      # Dispatch inputs are handed to the script as environment variables
      # instead of being expanded into the script text, so their content is
      # never parsed as shell code.
      env:
        INPUT_BASE_IMAGE: ${{ inputs.BASE_IMAGE }}
        INPUT_PUBLISH: ${{ inputs.PUBLISH }}
      run: |
        # Build date is stamped in US/Los_Angeles local time.
        BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d')
        # Fall back to the registry's upstream ':latest' image when no
        # BASE_IMAGE input was supplied.
        if [[ -z "${INPUT_BASE_IMAGE}" ]]; then
          BASE_IMAGE="${{ env.DOCKER_REGISTRY }}/upstream-${{ env.BASE_LIBRARY }}:latest"
        else
          BASE_IMAGE="${INPUT_BASE_IMAGE}"
        fi
        echo "BUILD_DATE=${BUILD_DATE}" >> "$GITHUB_OUTPUT"
        echo "BASE_LIBRARY=${{ env.BASE_LIBRARY }}" >> "$GITHUB_OUTPUT"
        echo "BASE_IMAGE=${BASE_IMAGE}" >> "$GITHUB_OUTPUT"
        echo "PUBLISH=${INPUT_PUBLISH}" >> "$GITHUB_OUTPUT"
# Calls the reusable Rosetta build workflow with the values computed by the
# metadata job.
build:
  needs: metadata
  if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch'
  uses: ./.github/workflows/_build_rosetta.yaml
  secrets: inherit
  with:
    BASE_LIBRARY: ${{ needs.metadata.outputs.BASE_LIBRARY }}
    BASE_IMAGE: ${{ needs.metadata.outputs.BASE_IMAGE }}
    BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }}
    # TODO: Can't build ARM until https://github.com/NVIDIA/JAX-Toolbox/pull/252 is available
    PLATFORMS: '["amd64"]'
# Publishes the "nightly" build-status badge through the reusable badge
# workflow: green/passing when the build job succeeded, red/failing otherwise.
publish-build:
  needs: [metadata, build]
  uses: ./.github/workflows/_publish_badge.yaml
  # `&&` binds tighter than `||`, so the event filter is parenthesised as a
  # whole. Without the outer parentheses a manual dispatch satisfied the
  # condition on its own and bypassed the success()/failure() status check.
  # success() || failure() lets the badge report a failed build as well.
  if: ( success() || failure() ) && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch')
  secrets: inherit
  with:
    ENDPOINT_FILENAME: 'rosetta-t5x-build-status.json'
    # Triggered (workflow_run) runs always publish; manual runs only when the
    # PUBLISH input was set.
    PUBLISH: ${{ github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }}
    SCRIPT: |
      if [[ ${{ needs.build.result }} == "success" ]]; then
        BADGE_COLOR=brightgreen
        MSG=passing
      else
        BADGE_COLOR=red
        MSG=failing
      fi
      echo "LABEL='nightly'" >> $GITHUB_OUTPUT
      echo "MESSAGE='${MSG}'" >> $GITHUB_OUTPUT
      echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT
# Hands the image tag(s) produced by the build job to the reusable Rosetta
# test workflow.
test-unit:
  needs: build
  if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch'
  uses: ./.github/workflows/_test_rosetta.yaml
  secrets: inherit
  with:
    ROSETTA_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }}
# Hands the image tag(s) produced by the build job to the reusable T5x test
# workflow, with dataset packing switched off.
test-t5x:
  needs: build
  if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch'
  uses: ./.github/workflows/_test_t5x.yaml
  secrets: inherit
  with:
    T5X_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }}
    # Disable packing b/c rosetta-t5x images run with TE by default, and TE does not currently support packing
    EXTRA_GIN_ARGS: "--gin.train/utils.DatasetConfig.pack=False --gin.train_eval/utils.DatasetConfig.pack=False"
# Runs after test-t5x and calls the reusable results-publishing workflow with
# the build date and the ROSETTA_T5X experiment sub-directory.
publish-t5x:
  needs: [metadata, test-t5x]
  if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch'
  uses: ./.github/workflows/_publish_t5x_results.yaml
  secrets: inherit
  with:
    EXPERIMENT_SUBDIR: ROSETTA_T5X
    BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }}
# Publishes the overall "Tests" badge through the reusable badge workflow.
# The message combines the unit and T5x test statuses; when the build job did
# not succeed the badge reads "n/a" in red.
publish-test:
  needs: [metadata, build, test-unit, test-t5x]
  # always(): evaluated regardless of the outcome of the needed jobs.
  if: ( always() )
  uses: ./.github/workflows/_publish_badge.yaml
  secrets: inherit
  with:
    ENDPOINT_FILENAME: 'rosetta-t5x-overall-test-status.json'
    # Triggered (workflow_run) runs always publish; manual runs only when the
    # PUBLISH input was set.
    PUBLISH: ${{ github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }}
    SCRIPT: |
      UNIT_STATUS=${{ needs.test-unit.outputs.TEST_STATUS }}
      T5X_STATUS=${{ needs.test-t5x.outputs.TEST_STATUS }}
      echo "LABEL='Tests'" >> $GITHUB_OUTPUT
      if [[ ${{ needs.build.result }} == "success" ]]; then
        if [[ $UNIT_STATUS == "success" ]] && [[ $T5X_STATUS == "success" ]]; then
          COLOR=brightgreen
          MESSAGE="Unit passed / MGMN passed"
        elif [[ $UNIT_STATUS == "success" ]]; then
          COLOR=yellow
          MESSAGE="Unit passed / MGMN failed"
        elif [[ $T5X_STATUS == "success" ]]; then
          COLOR=yellow
          MESSAGE="Unit failed / MGMN passed"
        else
          COLOR=red
          MESSAGE="Unit failed / MGMN failed"
        fi
      else
        MESSAGE="n/a"
        COLOR="red"
      fi
      echo "MESSAGE='${MESSAGE}'" >> $GITHUB_OUTPUT
      echo "COLOR='${COLOR}'" >> $GITHUB_OUTPUT
# Re-tags the built image as t5x:latest. Requires both test jobs to report
# TEST_STATUS 'success', and either a successful triggering run or a manual
# dispatch with the PUBLISH input enabled.
publish-latest-container:
  needs: [metadata, build, test-t5x, test-unit]
  if: ( needs.test-unit.outputs.TEST_STATUS == 'success' && needs.test-t5x.outputs.TEST_STATUS == 'success' ) && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH))
  uses: ./.github/workflows/_publish_container.yaml
  secrets: inherit
  with:
    TARGET_IMAGE: t5x
    SOURCE_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }}
    TARGET_TAGS: |
      type=raw,value=latest,priority=1000
# Re-tags the built image as t5x:nightly-<BUILD_DATE>. Depends only on the
# metadata and build jobs, not on the test jobs.
publish-container:
  needs: [metadata, build]
  if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH)
  uses: ./.github/workflows/_publish_container.yaml
  secrets: inherit
  with:
    TARGET_IMAGE: t5x
    SOURCE_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }}
    TARGET_TAGS: |
      type=raw,value=nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900
# Marks this run as failed when the triggering workflow run itself failed.
if-upstream-failed:
  # event_name == 'workflow_run' already excludes manual dispatches, so no
  # separate workflow_dispatch check is needed.
  if: github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'failure'
  runs-on: ubuntu-latest
  steps:
    - run: echo 'Upstream workflow failed, aborting run' && exit 1