# Nightly T5X MGMN performance test (nightly 2023-11-03T09:39:25Z) #299
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Display name of the workflow.
name: Nightly T5X MGMN performance test
# Title of an individual run. For workflow_run events the suffix is
# "nightly <creation timestamp of the triggering run>"; for any other event
# it is the event name. The folded scalar joins these lines with single
# spaces, so the resulting string is a single line.
run-name: >-
  Nightly T5X MGMN performance test
  (${{
  github.event_name == 'workflow_run'
  && format('nightly {0}', github.event.workflow_run.created_at)
  || github.event_name
  }})
# Triggers:
#   * workflow_run      - when the "Nightly T5X build" workflow completes on main
#   * workflow_dispatch - manual run with an explicit container image
on:
  workflow_run:
    workflows:
      - Nightly T5X build
    types:
      - completed
    branches:
      - main
  workflow_dispatch:
    inputs:
      T5X_IMAGE:
        description: T5X container
        type: string
        required: true
        default: 'ghcr.io/nvidia/upstream-t5x:latest'
      PUBLISH:
        description: Publish dated results to tensorboard server?
        type: boolean
        required: false
        default: false
# Token scopes granted to the jobs of this workflow.
permissions:
  # to cancel previous workflows
  actions: write
  # to fetch code
  contents: read
  # to create issues
  issues: write
  # to upload container
  packages: write
# Workflow-wide environment.
env:
  # Image used by the metadata job when no T5X_IMAGE input is supplied.
  DEFAULT_T5X_IMAGE: "ghcr.io/nvidia/upstream-t5x:latest"
jobs:
  # Computes the values shared by the downstream jobs: the build date, the
  # image under test, and the PUBLISH flag.
  metadata:
    runs-on: ubuntu-22.04
    outputs:
      BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }}
      T5X_IMAGE: ${{ steps.date.outputs.T5X_IMAGE }}
      PUBLISH: ${{ steps.date.outputs.PUBLISH }}
    steps:
      - name: Set metadata
        id: date
        shell: bash -x -e {0}
        # Workflow inputs are handed to the script through the environment
        # rather than being spliced into the script text, so their contents
        # are never interpreted as shell syntax or word-split.
        env:
          REQUESTED_T5X_IMAGE: ${{ inputs.T5X_IMAGE }}
          REQUESTED_PUBLISH: ${{ inputs.PUBLISH }}
        run: |
          BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d')
          echo "BUILD_DATE=${BUILD_DATE}" >> "$GITHUB_OUTPUT"
          # Fall back to the workflow-level default when no image was given
          # (inputs are empty for workflow_run events).
          T5X_IMAGE="${REQUESTED_T5X_IMAGE:-${DEFAULT_T5X_IMAGE}}"
          echo "T5X_IMAGE=${T5X_IMAGE}" >> "$GITHUB_OUTPUT"
          echo "PUBLISH=${REQUESTED_PUBLISH}" >> "$GITHUB_OUTPUT"

  # Calls the reusable T5X test workflow against the selected image. Skipped
  # when the triggering upstream workflow did not conclude successfully.
  run-jobs:
    needs: metadata
    uses: ./.github/workflows/_test_t5x.yaml
    if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch'
    with:
      T5X_IMAGE: ${{ needs.metadata.outputs.T5X_IMAGE }}
    secrets: inherit

  # Calls the reusable results-publishing workflow for this build date.
  # NOTE(review): this runs on every manual dispatch regardless of
  # inputs.PUBLISH, although that input is described as controlling
  # publication of dated results - confirm this is intended.
  publish:
    needs: [metadata, run-jobs]
    uses: ./.github/workflows/_publish_t5x_results.yaml
    if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch'
    with:
      BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }}
      EXPERIMENT_SUBDIR: T5X
    secrets: inherit

  # Derives an overall completion status from the per-test status files and
  # passes it to the reusable badge workflow. The `failure()` term keeps this
  # job eligible to run when an earlier job failed.
  publish-completion:
    needs: [metadata, run-jobs]
    uses: ./.github/workflows/_publish_badge.yaml
    if: success() || failure()
    secrets: inherit
    with:
      ENDPOINT_FILENAME: 't5x-test-overall-status.json'
      PUBLISH: ${{ github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }}
      SCRIPT: |
        STATUS=failure
        if [[ ${{ needs.run-jobs.result }} == "success" ]]; then
          EXIT_STATUSES="${GITHUB_RUN_ID}-*/*-status.json"
          PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
          FAILED_TESTS=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
          TOTAL_TESTS=$(ls $EXIT_STATUSES | wc -l)
          echo "Test statuses:"
          jq -rc 'input_filename,.' $EXIT_STATUSES
          # Success requires at least one status file. Without this guard a
          # run that produced no status files (0 passed == 0 total) would be
          # reported as a success.
          if [[ $TOTAL_TESTS -gt 0 ]] && { [[ $FAILED_TESTS -eq 0 ]] || [[ $PASSED_TESTS -eq $TOTAL_TESTS ]]; }; then
            BADGE_COLOR=brightgreen
            STATUS=success
          elif [[ $PASSED_TESTS -eq 0 ]]; then
            BADGE_COLOR=red
          else
            BADGE_COLOR=yellow
          fi
          echo "MESSAGE='${PASSED_TESTS}/${TOTAL_TESTS} passed'" >> $GITHUB_OUTPUT
          echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT
        else
          echo "MESSAGE='n/a'" >> $GITHUB_OUTPUT
          echo "COLOR='red'" >> $GITHUB_OUTPUT
        fi
        echo "LABEL='Completion'" >> $GITHUB_OUTPUT
        echo "STATUS='$STATUS'" >> $GITHUB_OUTPUT

  # Re-tags the tested image as `latest-verified` once the completion status
  # is 'success'. Manual runs additionally require the PUBLISH input.
  publish-verified:
    if: needs.publish-completion.outputs.STATUS == 'success' && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH))
    needs: [metadata, publish-completion]
    uses: ./.github/workflows/_publish_container.yaml
    secrets: inherit
    with:
      SOURCE_IMAGE: ${{ needs.metadata.outputs.T5X_IMAGE }}
      TARGET_IMAGE: upstream-t5x
      TARGET_TAGS: |
        type=raw,value=latest-verified,priority=1000

  # Calls the reusable triage workflow when the completion status is anything
  # other than 'success', comparing the tested image with the last verified one.
  triage:
    needs: [metadata, publish-completion]
    uses: ./.github/workflows/_triage.yaml
    if: needs.publish-completion.outputs.STATUS != 'success' && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch')
    secrets: inherit
    with:
      BROKEN_IMAGE: ${{ needs.metadata.outputs.T5X_IMAGE }}
      BASE_IMAGE: ghcr.io/nvidia/upstream-t5x:latest-verified
      REPO_DIRS: "/opt/t5x /opt/flax"
      FILE_ISSUE: true

  # Fails this run explicitly when the triggering upstream workflow failed.
  # A workflow_run event can never also be a workflow_dispatch event, so no
  # further event-name check is needed.
  if-upstream-failed:
    runs-on: ubuntu-latest
    if: github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'failure'
    steps:
      - run: echo 'Upstream workflow failed, aborting run' && exit 1