Nightly Pax MGMN performance test (nightly 2023-11-05T09:37:17Z) #301
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow title shown in the Actions UI.
name: Nightly Pax MGMN performance test
# Per-run title: "nightly <upstream run creation time>" when started by
# workflow_run, otherwise the name of the triggering event.
# Folded scalar: the lines below join with single spaces into one string.
run-name: >-
  Nightly Pax MGMN performance test (${{
  github.event_name == 'workflow_run'
  && format('nightly {0}', github.event.workflow_run.created_at)
  || github.event_name }})
# Triggers for this workflow.
on:
  # Runs after the "Nightly Pax build" workflow completes on main.
  workflow_run:
    workflows:
      - Nightly Pax build
    types:
      - completed
    branches:
      - main
  # Manual runs, with an overridable container image and a publish switch.
  workflow_dispatch:
    inputs:
      PAX_IMAGE:
        description: Pax container
        type: string
        required: true
        default: 'ghcr.io/nvidia/upstream-pax:latest'
      PUBLISH:
        description: 'Publish dated results to tensorboard server?'
        type: boolean
        required: false
        default: false
# Token permissions requested by this workflow.
permissions:
  # to fetch code
  contents: read
  # to cancel previous workflows
  actions: write
  # to upload container
  packages: write
  # to create issues
  issues: write
# Workflow-wide environment.
env:
  # Image used when no PAX_IMAGE input is supplied (same value as the
  # workflow_dispatch input default).
  DEFAULT_PAX_IMAGE: 'ghcr.io/nvidia/upstream-pax:latest'
jobs: | |
# Computes values shared by the downstream jobs and exposes them as outputs:
#   BUILD_DATE - today's date (YYYY-MM-DD) in the US/Los_Angeles time zone
#   PAX_IMAGE  - the PAX_IMAGE input, or DEFAULT_PAX_IMAGE when it is empty
#   PUBLISH    - the PUBLISH input, passed through unchanged
metadata:
  runs-on: ubuntu-22.04
  outputs:
    BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }}
    PAX_IMAGE: ${{ steps.date.outputs.PAX_IMAGE }}
    PUBLISH: ${{ steps.date.outputs.PUBLISH }}
  steps:
    - name: Set metadata
      id: date
      shell: bash -x -e {0}
      # Inputs reach the script through environment variables rather than
      # being expanded into the script text, so their content cannot be
      # interpreted as shell syntax.
      env:
        REQUESTED_PAX_IMAGE: ${{ inputs.PAX_IMAGE }}
        REQUESTED_PUBLISH: ${{ inputs.PUBLISH }}
      run: |
        BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d')
        echo "BUILD_DATE=${BUILD_DATE}" >> "$GITHUB_OUTPUT"
        # DEFAULT_PAX_IMAGE comes from the workflow-level env block.
        PAX_IMAGE="${REQUESTED_PAX_IMAGE:-${DEFAULT_PAX_IMAGE}}"
        echo "PAX_IMAGE=${PAX_IMAGE}" >> "$GITHUB_OUTPUT"
        echo "PUBLISH=${REQUESTED_PUBLISH}" >> "$GITHUB_OUTPUT"
# Calls the reusable Pax test workflow with the image chosen by `metadata`.
run-jobs:
  # Run for manual dispatches, or when the triggering workflow run succeeded.
  if: >-
    github.event_name == 'workflow_dispatch'
    || (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success')
  needs: metadata
  uses: ./.github/workflows/_test_pax.yaml
  secrets: inherit
  with:
    PAX_IMAGE: ${{ needs.metadata.outputs.PAX_IMAGE }}
# Copies this run's TensorBoard logs into a dated folder on the `tensorboard`
# SSH host and writes a link to them into the job summary.
publish:
  needs: [metadata, run-jobs]
  # Honour the PUBLISH input: nightly (workflow_run) runs always publish,
  # manual runs only when PUBLISH was set. This is the same rule
  # publish-completion uses for its PUBLISH argument.
  if: github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true'
  runs-on: ubuntu-22.04
  steps:
    - name: Setup SSH agent
      # NOTE(review): the action ref was garbled in the source text;
      # webfactory/ssh-agent@v0.8.0 is a reconstruction - confirm the version.
      uses: webfactory/ssh-agent@v0.8.0
      with:
        ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
    - name: Setup SSH known hosts
      id: ssh-known-hosts
      run: |
        mkdir -p ~/.ssh
        cat >> ~/.ssh/known_hosts << EOF
        ${{ vars.SSH_KNOWN_HOSTS }}
        EOF
        chmod 600 ~/.ssh/known_hosts
        echo "FILE=$(realpath ~/.ssh/known_hosts)" >> "$GITHUB_OUTPUT"
    - name: Setup SSH config
      id: ssh-config
      run: |
        mkdir -p ~/.ssh
        cat >> ~/.ssh/config << EOF
        ${{ vars.SSH_CONFIG }}
        EOF
        chmod 600 ~/.ssh/config
    - name: Create dated folder and generate TensorBoard query URL
      id: mkdir
      shell: bash -x -e {0}
      run: |
        FOLDER="${{ needs.metadata.outputs.BUILD_DATE }}/PAX"
        # copy folder
        ssh -T tensorboard mkdir -p /tensorboard-logs/${FOLDER}
        ssh -T tensorboard rsync -rt /tensorboard-logs/${GITHUB_RUN_ID}/ /tensorboard-logs/${FOLDER}/
        # generate query URL; FOLDER is URI-encoded with jq before it is
        # placed in the regexInput query parameter
        (
        cat << EOF
        ## PAX MGMN nightly training: ${{ needs.metadata.outputs.BUILD_DATE }}
        [view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=$(jq -nr --arg url "${FOLDER}" '$url|@uri')&_smoothingWeight=0&tagFilter=seqs_per)
        EOF
        ) | tee "$GITHUB_STEP_SUMMARY"
# Calls the reusable badge workflow with a script that summarises the
# per-test status JSON files into MESSAGE / COLOR / LABEL / STATUS outputs.
# Runs whether the preceding jobs succeeded or failed.
publish-completion:
  needs: [metadata, run-jobs]
  uses: ./.github/workflows/_publish_badge.yaml
  if: success() || failure()
  secrets: inherit
  with:
    ENDPOINT_FILENAME: 'pax-test-completion-status.json'
    # Nightly runs always publish; manual runs only when PUBLISH was set.
    PUBLISH: ${{ github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }}
    # NOTE(review): the values written below include literal single quotes;
    # presumably _publish_badge.yaml accounts for them - verify there.
    SCRIPT: |
      STATUS=failure
      if [[ ${{ needs.run-jobs.result }} == "success" ]]; then
        EXIT_STATUSES="${GITHUB_RUN_ID}-*DP*TP*PP/*-status.json"
        PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
        FAILED_TESTS=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
        TOTAL_TESTS=$(ls $EXIT_STATUSES | wc -l)

        echo "Test statuses:"
        jq -rc 'input_filename,.' $EXIT_STATUSES

        # Success needs at least one status file. Without this guard an
        # empty result set (0 passed == 0 total) would count as all-passed.
        if [[ $TOTAL_TESTS -gt 0 ]] && { [[ $FAILED_TESTS -eq 0 ]] || [[ $PASSED_TESTS -eq $TOTAL_TESTS ]]; }; then
          BADGE_COLOR=brightgreen
          STATUS=success
        elif [[ $PASSED_TESTS -eq 0 ]]; then
          BADGE_COLOR=red
        else
          BADGE_COLOR=yellow
        fi
        echo "MESSAGE='${PASSED_TESTS}/${TOTAL_TESTS} passed'" >> "$GITHUB_OUTPUT"
        echo "COLOR='${BADGE_COLOR}'" >> "$GITHUB_OUTPUT"
      else
        echo "MESSAGE='n/a'" >> "$GITHUB_OUTPUT"
        echo "COLOR='red'" >> "$GITHUB_OUTPUT"
      fi
      echo "LABEL='Completion'" >> "$GITHUB_OUTPUT"
      echo "STATUS='$STATUS'" >> "$GITHUB_OUTPUT"
# Re-tags the tested image as `latest-verified` through the reusable
# container-publishing workflow.
publish-verified:
  needs: [metadata, publish-completion]
  # Requires a successful completion status, and either a successful
  # upstream workflow_run or a manual dispatch with PUBLISH enabled.
  if: >-
    needs.publish-completion.outputs.STATUS == 'success' &&
    ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') ||
    (github.event_name == 'workflow_dispatch' && inputs.PUBLISH))
  uses: ./.github/workflows/_publish_container.yaml
  secrets: inherit
  with:
    SOURCE_IMAGE: ${{ needs.metadata.outputs.PAX_IMAGE }}
    TARGET_IMAGE: upstream-pax
    TARGET_TAGS: |
      type=raw,value=latest-verified,priority=1000
# Calls the reusable triage workflow, passing the tested image as
# BROKEN_IMAGE and the `latest-verified` image as BASE_IMAGE.
triage:
  needs: [metadata, publish-completion]
  # Runs when the completion status is not 'success', for manual dispatches
  # or after a successful upstream workflow_run.
  if: >-
    needs.publish-completion.outputs.STATUS != 'success' &&
    ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') ||
    github.event_name == 'workflow_dispatch')
  uses: ./.github/workflows/_triage.yaml
  secrets: inherit
  with:
    BROKEN_IMAGE: ${{ needs.metadata.outputs.PAX_IMAGE }}
    BASE_IMAGE: ghcr.io/nvidia/upstream-pax:latest-verified
    REPO_DIRS: '/opt/paxml /opt/praxis'
    FILE_ISSUE: true
# Fails this run explicitly when the triggering upstream workflow failed.
if-upstream-failed:
  # A workflow_run event can never also be workflow_dispatch, so checking
  # the event name once is sufficient.
  if: github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'failure'
  runs-on: ubuntu-latest
  steps:
    - run: echo 'Upstream workflow failed, aborting run' && exit 1