diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index ceadde34d..6ddbe7e69 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -180,7 +180,7 @@ jobs: TE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} secrets: inherit - test-t5x: + test-upstream-t5x: needs: build-t5x if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a uses: ./.github/workflows/_test_t5x.yaml @@ -188,7 +188,16 @@ jobs: T5X_IMAGE: ${{ needs.build-t5x.outputs.DOCKER_TAG_FINAL }} secrets: inherit - test-pax: + test-rosetta-t5x: + needs: build-rosetta-t5x + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_t5x_rosetta.yaml + with: + T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} + # Disable packing b/c rosetta-t5x images run with TE by default, and TE does not currently support packing + secrets: inherit + + test-upstream-pax: needs: build-pax if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a uses: ./.github/workflows/_test_pax.yaml @@ -196,10 +205,11 @@ jobs: PAX_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} secrets: inherit - test-vit: - needs: build-rosetta-t5x - uses: ./.github/workflows/_test_vit.yaml + test-rosetta-pax: + needs: build-rosetta-pax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_pax_rosetta.yaml with: - ROSETTA_T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} - secrets: inherit - + PAX_IMAGE: ${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }} + secrets: inherit + diff --git a/.github/workflows/_publish_t5x_results.yaml b/.github/workflows/_publish_t5x_pax_results.yaml similarity index 78% rename from .github/workflows/_publish_t5x_results.yaml rename to .github/workflows/_publish_t5x_pax_results.yaml index 6b9dc6ebb..f79298002 100644 --- a/.github/workflows/_publish_t5x_results.yaml +++ b/.github/workflows/_publish_t5x_pax_results.yaml @@ -1,4 +1,4 @@ -name: ~publish t5x integration test results +name: ~publish t5x/pax integration test results on: workflow_call: @@ -9,12 +9,17 @@ on: required: true EXPERIMENT_SUBDIR: type: string - description: Subdirectory to easily filter experiments, e.g., T5X, ROSETTA_T5X + description: Subdirectory to easily filter experiments, e.g., T5X, ROSETTA_PAX default: T5X required: false + ARTIFACT_NAME: + type: string + description: If provided, will prepend a prefix to the artifact name. Helpful if re-running this reusable workflow to prevent clobbering of artifacts + default: "" + required: false jobs: - publish-t5x: + publish: runs-on: ubuntu-22.04 steps: - name: Setup SSH agent @@ -48,7 +53,7 @@ jobs: FOLDER="${{ inputs.BUILD_DATE }}/${{ inputs.EXPERIMENT_SUBDIR }}" # copy folder ssh -T tensorboard mkdir -p /tensorboard-logs/${FOLDER} - ssh -T tensorboard rsync -rt /tensorboard-logs/${GITHUB_RUN_ID}/ /tensorboard-logs/${FOLDER}/ + ssh -T tensorboard rsync -rt /tensorboard-logs/${{ inputs.ARTIFACT_NAME }}${GITHUB_RUN_ID}/ /tensorboard-logs/${FOLDER}/ # generate query URL ( cat << EOF diff --git a/.github/workflows/_test_pax.yaml b/.github/workflows/_test_pax.yaml index aaca5a4a5..e0e4d1435 100644 --- a/.github/workflows/_test_pax.yaml +++ b/.github/workflows/_test_pax.yaml @@ -13,6 +13,11 @@ on: description: Extra command line args to pass to test-pax.sh default: "" required: false + ARTIFACT_NAME: + type: string + description: If provided, will prepend a prefix to the artifact name. 
Helpful if re-running this reusable workflow to prevent clobbering of artifacts + default: "" + required: false outputs: TEST_STATUS: description: 'Summary pass/fail value indicating if results from tests are acceptable' diff --git a/.github/workflows/_test_pax_rosetta.yaml b/.github/workflows/_test_pax_rosetta.yaml index ca8715adf..804a33a13 100644 --- a/.github/workflows/_test_pax_rosetta.yaml +++ b/.github/workflows/_test_pax_rosetta.yaml @@ -13,6 +13,11 @@ on: description: Extra command line args to pass to test-pax.sh default: "" required: false + ARTIFACT_NAME: + type: string + description: If provided, will prepend a prefix to the artifact name. Helpful if re-running this reusable workflow to prevent clobbering of artifacts + default: "" + required: false outputs: TEST_STATUS: description: 'Summary pass/fail value indicating if results from tests are acceptable' @@ -58,7 +63,7 @@ jobs: MAX_GPUS_PER_NODE=8 NODES=1 GPUS_PER_NODE=8 - JOB_NAME=${GITHUB_RUN_ID}-${TEST_CASE_NAME} + JOB_NAME=${{ inputs.ARTIFACT_NAME }}rosetta-pax-${GITHUB_RUN_ID}-${TEST_CASE_NAME} LOG_FILE=/nfs/cluster/${JOB_NAME}.log MODEL_PATH=/nfs/cluster/${JOB_NAME} for var in IMAGE TEST_CASE_NAME TOTAL_TASKS NODES GPUS_PER_NODE JOB_NAME LOG_FILE MODEL_PATH; do @@ -125,7 +130,7 @@ jobs: output/ || true rsync -rtz --progress \ output/ \ - ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${GITHUB_RUN_ID}/ || true + ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.ARTIFACT_NAME }}rosetta-pax-${GITHUB_RUN_ID}/ || true - name: Write SLURM job status to file shell: bash -x -e {0} run: | @@ -185,7 +190,7 @@ jobs: NODES=$(((TOTAL_TASKS+MAX_GPUS_PER_NODE-1)/MAX_GPUS_PER_NODE)) GPUS_PER_NODE=$((TOTAL_TASKS/NODES)) - JOB_NAME=${GITHUB_RUN_ID}-${TEST_CASE_NAME} + JOB_NAME=${{ inputs.ARTIFACT_NAME }}rosetta-pax-${GITHUB_RUN_ID}-${TEST_CASE_NAME} LOG_FILE=/nfs/cluster/${JOB_NAME}.log MODEL_PATH=/nfs/cluster/${JOB_NAME} for var in IMAGE TEST_CASE_NAME TOTAL_TASKS NODES GPUS_PER_NODE JOB_NAME LOG_FILE MODEL_PATH; do @@ -261,7 +266,7 @@ jobs: output/ || true rsync -rtz --progress \ output/ \ - ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${GITHUB_RUN_ID}/ || true + ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.ARTIFACT_NAME }}rosetta-pax-${GITHUB_RUN_ID}/ || true - name: Write SLURM job status to file shell: bash -x -e {0} @@ -321,7 +326,7 @@ jobs: NODES=$(((TOTAL_TASKS+MAX_GPUS_PER_NODE-1)/MAX_GPUS_PER_NODE)) GPUS_PER_NODE=$((TOTAL_TASKS/NODES)) - JOB_NAME=${GITHUB_RUN_ID}-${TEST_CASE_NAME} + JOB_NAME=${{ inputs.ARTIFACT_NAME }}rosetta-pax-${GITHUB_RUN_ID}-${TEST_CASE_NAME} LOG_FILE=/nfs/cluster/${JOB_NAME}.log MODEL_PATH=/nfs/cluster/${JOB_NAME} for var in IMAGE TEST_CASE_NAME TOTAL_TASKS NODES GPUS_PER_NODE JOB_NAME LOG_FILE MODEL_PATH; do @@ -395,7 +400,7 @@ jobs: output/ || true rsync -rtz --progress \ output/ \ - ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${GITHUB_RUN_ID}/ || true + ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.ARTIFACT_NAME }}rosetta-pax-${GITHUB_RUN_ID}/ || true - name: Write SLURM job status to file shell: bash -x -e {0} @@ -453,7 +458,7 @@ jobs: NODES=$(((TOTAL_TASKS+MAX_GPUS_PER_NODE-1)/MAX_GPUS_PER_NODE)) GPUS_PER_NODE=$((TOTAL_TASKS/NODES)) - JOB_NAME=${GITHUB_RUN_ID}-${TEST_CASE_NAME} + JOB_NAME=${{ inputs.ARTIFACT_NAME
}}rosetta-pax-${GITHUB_RUN_ID}-${TEST_CASE_NAME} LOG_FILE=/nfs/cluster/${JOB_NAME}.log MODEL_PATH=/nfs/cluster/${JOB_NAME} for var in IMAGE TEST_CASE_NAME TOTAL_TASKS NODES GPUS_PER_NODE JOB_NAME LOG_FILE MODEL_PATH; do @@ -530,7 +535,7 @@ jobs: output/ || true rsync -rtz --progress \ output/ \ - ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${GITHUB_RUN_ID}/ || true + ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.ARTIFACT_NAME }}rosetta-pax-${GITHUB_RUN_ID}/ || true - name: Write SLURM job status to file shell: bash -x -e {0} @@ -587,7 +592,7 @@ jobs: NODES=1 GPUS_PER_NODE=8 - JOB_NAME=${GITHUB_RUN_ID}-${TEST_CASE_NAME} + JOB_NAME=${{ inputs.ARTIFACT_NAME }}rosetta-pax-${GITHUB_RUN_ID}-${TEST_CASE_NAME} LOG_FILE=/nfs/cluster/${JOB_NAME}.log MODEL_PATH=/nfs/cluster/${JOB_NAME} for var in IMAGE TEST_CASE_NAME TOTAL_TASKS NODES GPUS_PER_NODE JOB_NAME LOG_FILE MODEL_PATH; do @@ -662,7 +667,7 @@ jobs: output/ || true rsync -rtz --progress \ output/ \ - ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${GITHUB_RUN_ID}/ || true + ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.ARTIFACT_NAME }}rosetta-pax-${GITHUB_RUN_ID}/ || true - name: Write SLURM job status to file shell: bash -x -e {0} @@ -695,15 +700,15 @@ jobs: shell: bash -x {0} run: | pip install pytest pytest-reportlog tensorboard - for i in ${GITHUB_RUN_ID}-*DP*FSDP*TP*PP* ${GITHUB_RUN_ID}-*DP_TE_dropout; do - SUBDIR=$(echo $i | cut -d'-' -f2) + for i in ${{ inputs.ARTIFACT_NAME }}rosetta-pax-${GITHUB_RUN_ID}-*DP*FSDP*TP*PP* ${{ inputs.ARTIFACT_NAME }}rosetta-pax-${GITHUB_RUN_ID}-*DP_TE_dropout; do + SUBDIR=$(echo $i | rev | cut -d'-' -f1 | rev) mv $i/$SUBDIR* . python3 .github/workflows/baselines/summarize_metrics.py $SUBDIR # create result json in baseline format done - echo '## PAX MGMN Test Metrics' >> $GITHUB_STEP_SUMMARY + echo '## Rosetta PAX MGMN Test Metrics' >> $GITHUB_STEP_SUMMARY for i in *_metrics.json; do - echo $i | cut -d'.' -f1 + echo $(basename -- $i _metrics.json) echo '```json' jq . $i echo '```' @@ -724,39 +729,40 @@ jobs: if: ( always() ) secrets: inherit with: - ENDPOINT_FILENAME: 'pax-test-status.json' + ENDPOINT_FILENAME: '${{ inputs.ARTIFACT_NAME }}rosetta-pax-test-status.json' PUBLISH: false SCRIPT: | - EXIT_STATUSES="${GITHUB_RUN_ID}-*DP*FSDP*TP*PP*/*-status.json ${GITHUB_RUN_ID}-*DP_TE_dropout/*-status.json" + EXIT_STATUSES="${{ inputs.ARTIFACT_NAME }}rosetta-pax-${GITHUB_RUN_ID}-*DP*FSDP*TP*PP*/*-status.json ${{ inputs.ARTIFACT_NAME }}rosetta-pax-${GITHUB_RUN_ID}-*DP_TE_dropout/*-status.json" PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l) FAILED_TESTS=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l) TOTAL_TESTS=$(ls $EXIT_STATUSES | wc -l) - echo '## PAX MGMN Test Status' >> $GITHUB_STEP_SUMMARY + cat <<EOF >>$GITHUB_STEP_SUMMARY + ## Pax MGMN+SPMD Test Status + | Test Case | State | Exit Code | + | --- | --- | --- | + EOF for i in $EXIT_STATUSES; do - echo $i | cut -d'.' -f1 - echo '```json' - jq . $i - echo '```' + # Files are named rosetta-pax-<GITHUB_RUN_ID>-<TEST_CASE_NAME>/<TEST_CASE_NAME>-status.json + echo "| $(echo $i | cut -d/ -f1 | cut -d- -f4) | $(jq -r .state $i) | $(jq -r .exitcode $i)" done | tee -a $GITHUB_STEP_SUMMARY echo "Test statuses:" jq -rc 'input_filename,.'
$EXIT_STATUSES - PYTEST_LOG=metrics-test-log/report.jsonl + METRICS_LOG=metrics-test-log/report.jsonl all_outcomes() { - cat $PYTEST_LOG | jq -r '. | select((.["$report_type"] == "TestReport") and (.when == "call")) | .outcome' + cat $METRICS_LOG | jq -r '. | select((.["$report_type"] == "TestReport") and (.when == "call")) | .outcome' } cnt_type() { - cat $PYTEST_LOG | jq '. | select((.["$report_type"] == "TestReport") and (.when == "call") and (.outcome | contains("'${1}'"))) | .outcome' | wc -l + cat $METRICS_LOG | jq '. | select((.["$report_type"] == "TestReport") and (.when == "call") and (.outcome | contains("'${1}'"))) | .outcome' | wc -l } PYTEST_FAILED_TESTS=$(cnt_type failed) PYTEST_PASSED_TESTS=$(cnt_type passed) PYTEST_TOTAL_TESTS=$(all_outcomes | wc -l) if ([[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]] && \ - [[ $PYTEST_FAILED_TESTS -eq 0 ]] && [[ $PYTEST_TOTAL_TESTS -gt 0 ]]) || \ - ([[ $PASSED_TESTS -eq $TOTAL_TESTS ]] && [[ $PYTEST_PASSED_TESTS -eq $PYTEST_TOTAL_TESTS ]]); then + [[ $PYTEST_FAILED_TESTS -eq 0 ]] && [[ $PYTEST_TOTAL_TESTS -gt 0 ]]); then STATUS=success BADGE_COLOR=brightgreen elif [[ $PASSED_TESTS -eq 0 ]] || [[ $PYTEST_PASSED_TESTS -eq 0 ]]; then @@ -781,9 +787,9 @@ jobs: ( cat << EOF - ## PAX MGMN training + ## Rosetta PAX MGMN training - [view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=${GITHUB_RUN_ID}&_smoothingWeight=0&tagFilter=seqs_per) + [view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=${{ inputs.ARTIFACT_NAME }}rosetta-pax-${GITHUB_RUN_ID}&_smoothingWeight=0&tagFilter=seqs_per) EOF ) | tee $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/_test_t5x.yaml b/.github/workflows/_test_t5x.yaml index 033ba9041..3de559b41 100644 --- a/.github/workflows/_test_t5x.yaml +++ b/.github/workflows/_test_t5x.yaml @@ -18,6 +18,11 @@ on: description: Extra gin args to pass to test-t5x.sh default: "" required: false + ARTIFACT_NAME: + type: string + description: If provided, will prepend a prefix to the artifact name.
Helpful if re-running this reusable workflow to prevent clobbering of artifacts + default: "" + required: false outputs: TEST_STATUS: description: 'Summary pass/fail value indicating if results from tests are acceptable' diff --git a/.github/workflows/_test_t5x_rosetta.yaml b/.github/workflows/_test_t5x_rosetta.yaml index 58bb562be..6f23a9748 100644 --- a/.github/workflows/_test_t5x_rosetta.yaml +++ b/.github/workflows/_test_t5x_rosetta.yaml @@ -20,6 +20,7 @@ on: env: BATCH_SIZE_PER_GPU: 32 + VIT_BATCH_SIZE_PER_GPU: 256 jobs: @@ -67,7 +68,7 @@ jobs: run: | IMAGE="$(echo ${{inputs.T5X_IMAGE}} | sed 's/\//#/')" TEST_CASE_NAME=${{ matrix.TEST_NAME }} - JOB_NAME=${{ inputs.ARTIFACT_NAME }}${GITHUB_RUN_ID}-${TEST_CASE_NAME} + JOB_NAME=${{ inputs.ARTIFACT_NAME }}rosetta-T5X-${GITHUB_RUN_ID}-${TEST_CASE_NAME} LOG_FILE=/nfs/cluster/${JOB_NAME}.log MODEL_PATH=/nfs/cluster/${JOB_NAME} BATCH_SIZE=$((${{ env.BATCH_SIZE_PER_GPU }} * ${{ matrix.N_GPU }})) @@ -137,7 +138,7 @@ jobs: output/ || true rsync -rtz --progress \ output/ \ - ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${GITHUB_RUN_ID}/ || true + ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.ARTIFACT_NAME }}rosetta-T5X-${GITHUB_RUN_ID}/ || true - name: Write SLURM job status to file shell: bash -x -e {0} @@ -209,7 +210,7 @@ jobs: IMAGE="$(echo ${{inputs.T5X_IMAGE}} | sed 's/\//#/')" TEST_CASE_NAME=${{ matrix.TEST_NAME }} TOTAL_TASKS=$((${{ matrix.N_GPU }} * ${{ matrix.N_NODE }})) - JOB_NAME=${{ inputs.ARTIFACT_NAME }}${GITHUB_RUN_ID}-${TEST_CASE_NAME} + JOB_NAME=${{ inputs.ARTIFACT_NAME }}rosetta-T5X-${GITHUB_RUN_ID}-${TEST_CASE_NAME} LOG_FILE=/nfs/cluster/${JOB_NAME}.log MODEL_PATH=/nfs/cluster/${JOB_NAME} BATCH_SIZE=$((${{ env.BATCH_SIZE_PER_GPU }} * ${{ matrix.N_GPU }} * ${{ matrix.N_NODE }})) @@ -282,7 +283,244 @@ jobs: output/ || true rsync -rtz --progress \ output/ \ - ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${GITHUB_RUN_ID}/ || true + ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.ARTIFACT_NAME }}rosetta-T5X-${GITHUB_RUN_ID}/ || true + + - name: Write SLURM job status to file + shell: bash -x -e {0} + run: | + python << EOF + import json + with open("output/${{ steps.meta.outputs.TEST_CASE_NAME }}-status.json", "w") as f: + dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"} + json.dump(dump, f) + EOF + + - name: Upload training logs as artifacts + uses: actions/upload-artifact@v3 + with: + name: ${{ steps.meta.outputs.JOB_NAME }} + path: output/* + + vit-single-process-multi-device: + strategy: + matrix: + N_GPU: [8] + fail-fast: false + + runs-on: ubuntu-22.04 + + steps: + - name: Print environment variables + run: env + + - name: Setup SSH agent + uses: webfactory/ssh-agent@v0.8.0 + with: + ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + + - name: Setup SSH known hosts + id: ssh-known-hosts + run: | + mkdir -p ~/.ssh + cat >> ~/.ssh/known_hosts << EOF + ${{ vars.SSH_KNOWN_HOSTS }} + EOF + chmod 600 ~/.ssh/known_hosts + echo "FILE=$(realpath ~/.ssh/known_hosts)" >> $GITHUB_OUTPUT + + - name: Labels and metadata + id: meta + shell: bash -x -e {0} + run: | + IMAGE="$(echo ${{inputs.T5X_IMAGE}} | sed 's/\//#/')" + TEST_CASE_NAME=VIT1P${{ matrix.N_GPU }}G + JOB_NAME=${{ inputs.ARTIFACT_NAME }}rosetta-VIT-${GITHUB_RUN_ID}-${TEST_CASE_NAME} + LOG_FILE=/nfs/cluster/${JOB_NAME}.log +
MODEL_PATH=/nfs/cluster/${JOB_NAME} + BATCH_SIZE=$((${{ env.VIT_BATCH_SIZE_PER_GPU }} * ${{ matrix.N_GPU }})) + for var in IMAGE TEST_CASE_NAME JOB_NAME LOG_FILE MODEL_PATH BATCH_SIZE; do + echo "$var=${!var}" >> $GITHUB_OUTPUT + done + + - name: Submit SLURM jobs over SSH + id: submit + shell: bash -O expand_aliases -x -e {0} + run: | + alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}' + sshx "date && hostname && sinfo" + sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }} + JOB=$(sshx sbatch --parsable << EOF + #!/bin/bash + #SBATCH --job-name=${{ steps.meta.outputs.JOB_NAME }} + #SBATCH --exclusive + #SBATCH --nodes=1 + #SBATCH --tasks=1 + #SBATCH --gpus-per-node=${{ matrix.N_GPU }} + #SBATCH --time=00:30:00 + #SBATCH --output=${{ steps.meta.outputs.LOG_FILE }} + #SBATCH --export="ENROOT_PASSWORD=${{ secrets.GITHUB_TOKEN }},XLA_PYTHON_CLIENT_MEM_FRACTION=0.9" + time srun \ + --container-image=${{ steps.meta.outputs.IMAGE }} \ + --container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \ + --container-entrypoint \ + test-vit.sh \ + --output /output/${{ steps.meta.outputs.TEST_CASE_NAME }} \ + --dtype bfloat16 \ + --batch-size ${{ steps.meta.outputs.BATCH_SIZE }} + EOF + ) + + set +x + while sshx squeue -j $JOB | grep -q $JOB; do + echo "SLURM Job $JOB is still running." + sleep 15 + done + echo "SLURM Job $JOB finished." + + # Gather job info + SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1) + SLURM_EXITCODE=$(sshx sacct -j $JOB --format=exitcode --parsable2 --noheader | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g') + echo "SLURM Job state is ${SLURM_STATE}" + echo "SLURM Job exit code is ${SLURM_EXITCODE}" + echo "SLURM_STATE=${SLURM_STATE}" >> "$GITHUB_OUTPUT" + echo "SLURM_EXITCODE=${SLURM_EXITCODE}" >> "$GITHUB_OUTPUT" + + set -x + + - name: Retrieve training logs and upload to TensorBoard server + shell: bash -x -e {0} + run: | + mkdir output/ + rsync -rtz --progress \ + ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \ + output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true + rsync -rtz --progress \ + ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \ + output/ || true + rsync -rtz --progress \ + output/ \ + ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.ARTIFACT_NAME }}rosetta-VIT-${GITHUB_RUN_ID}/ || true + + - name: Write SLURM job status to file + shell: bash -x -e {0} + run: | + python << EOF + import json + with open("output/${{ steps.meta.outputs.TEST_CASE_NAME }}-status.json", "w") as f: + dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"} + json.dump(dump, f) + EOF + + - name: Upload training logs as artifacts + uses: actions/upload-artifact@v3 + with: + name: ${{ steps.meta.outputs.JOB_NAME }} + path: output/* + + vit-multi-gpu-multi-node: + strategy: + matrix: + N_GPU: [1, 8] + N_NODE: [1, 2] + fail-fast: false + + runs-on: ubuntu-22.04 + + steps: + - name: Print environment variables + run: env + + - name: Setup SSH agent + uses: webfactory/ssh-agent@v0.8.0 + with: + ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + + - name: Setup SSH known hosts + id: ssh-known-hosts + run: | + mkdir -p ~/.ssh + cat >> ~/.ssh/known_hosts << EOF + ${{ vars.SSH_KNOWN_HOSTS }} + EOF + chmod 600 ~/.ssh/known_hosts + echo "FILE=$(realpath
~/.ssh/known_hosts)" >> $GITHUB_OUTPUT + + - name: Labels and metadata + id: meta + shell: bash -x -e {0} + run: | + IMAGE="$(echo ${{inputs.T5X_IMAGE}} | sed 's/\//#/')" + TEST_CASE_NAME=VIT${{ matrix.N_GPU }}G${{ matrix.N_NODE }}N + TOTAL_TASKS=$((${{ matrix.N_GPU }} * ${{ matrix.N_NODE }})) + JOB_NAME=${{ inputs.ARTIFACT_NAME }}rosetta-VIT-${GITHUB_RUN_ID}-${TEST_CASE_NAME} + LOG_FILE=/nfs/cluster/${JOB_NAME}.log + MODEL_PATH=/nfs/cluster/${JOB_NAME} + BATCH_SIZE=$((${{ env.VIT_BATCH_SIZE_PER_GPU }} * ${{ matrix.N_GPU }} * ${{ matrix.N_NODE }})) + for var in IMAGE TEST_CASE_NAME TOTAL_TASKS JOB_NAME LOG_FILE MODEL_PATH BATCH_SIZE; do + echo "$var=${!var}" >> $GITHUB_OUTPUT + done + + - name: Submit SLURM jobs over SSH + id: submit + shell: bash -O expand_aliases -x -e {0} + run: | + alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}' + sshx "date && hostname && sinfo" + sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }} + JOB=$(sshx sbatch --parsable << EOF + #!/bin/bash + #SBATCH --job-name=${{ steps.meta.outputs.JOB_NAME }} + #SBATCH --exclusive + #SBATCH --nodes=${{ matrix.N_NODE }} + #SBATCH --gpus-per-node=${{ matrix.N_GPU }} + #SBATCH --tasks=${{ steps.meta.outputs.TOTAL_TASKS }} + #SBATCH --tasks-per-node=${{ matrix.N_GPU }} + #SBATCH --time=00:30:00 + #SBATCH --output=${{ steps.meta.outputs.LOG_FILE }} + #SBATCH --export="ENROOT_PASSWORD=${{ secrets.GITHUB_TOKEN }},XLA_PYTHON_CLIENT_MEM_FRACTION=0.9" + time srun \ + --container-image=${{ steps.meta.outputs.IMAGE }} \ + --container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \ + --container-entrypoint \ + test-vit.sh \ + --output /output/${{ steps.meta.outputs.TEST_CASE_NAME }} \ + --dtype bfloat16 \ + --batch-size ${{ steps.meta.outputs.BATCH_SIZE }} \ + --multiprocess + EOF + ) + + set +x + while sshx squeue -j $JOB | grep -q $JOB; do + echo "SLURM Job $JOB is still running." + sleep 15 + done + echo "SLURM Job $JOB finished." + + # Gather job info + SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1) + SLURM_EXITCODE=$(sshx sacct -j $JOB --format=exitcode --parsable2 --noheader | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g') + echo "SLURM Job state is ${SLURM_STATE}" + echo "SLURM Job exit code is ${SLURM_EXITCODE}" + echo "SLURM_STATE=${SLURM_STATE}" >> "$GITHUB_OUTPUT" + echo "SLURM_EXITCODE=${SLURM_EXITCODE}" >> "$GITHUB_OUTPUT" + + set -x + + - name: Retrieve training logs and upload to TensorBoard server + shell: bash -x -e {0} + run: | + + mkdir output/ + rsync -rtz --progress \ + ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \ + output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true + rsync -rtz --progress \ + ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \ + output/ || true + rsync -rtz --progress \ + output/ \ + ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.ARTIFACT_NAME }}rosetta-VIT-${GITHUB_RUN_ID}/ || true - name: Write SLURM job status to file shell: bash -x -e {0} @@ -309,21 +547,41 @@ jobs: ENDPOINT_FILENAME: '${{ inputs.ARTIFACT_NAME }}rosetta-t5x-test-completion-status.json' PUBLISH: false SCRIPT: | - EXIT_STATUSES="${{ inputs.ARTIFACT_NAME }}${GITHUB_RUN_ID}-*/*-status.json" - PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l) - FAILED_TESTS=$(jq -r '.
| select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l) - TOTAL_TESTS=$(ls $EXIT_STATUSES | wc -l) + T5X_EXIT_STATUSES="${{ inputs.ARTIFACT_NAME }}rosetta-T5X-${GITHUB_RUN_ID}-*/*-status.json" + T5X_PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $T5X_EXIT_STATUSES | wc -l) + T5X_FAILED_TESTS=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $T5X_EXIT_STATUSES | wc -l) + T5X_TOTAL_TESTS=$(ls $T5X_EXIT_STATUSES | wc -l) cat <<EOF >>$GITHUB_STEP_SUMMARY ## T5x MGMN+SPMD Test Status | Test Case | State | Exit Code | | --- | --- | --- | EOF - for i in $EXIT_STATUSES; do + for i in $T5X_EXIT_STATUSES; do + # Files are named ${{ inputs.ARTIFACT_NAME }}rosetta-T5X-<GITHUB_RUN_ID>-<TEST_CASE_NAME>/<TEST_CASE_NAME>-status.json + echo "| $(echo $i | cut -d/ -f1 | cut -d- -f4-) | $(jq -r .state $i) | $(jq -r .exitcode $i)" + done | tee -a $GITHUB_STEP_SUMMARY + + VIT_EXIT_STATUSES="${{ inputs.ARTIFACT_NAME }}rosetta-VIT-${GITHUB_RUN_ID}-*/*-status.json" + VIT_PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $VIT_EXIT_STATUSES | wc -l) + VIT_FAILED_TESTS=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $VIT_EXIT_STATUSES | wc -l) + VIT_TOTAL_TESTS=$(ls $VIT_EXIT_STATUSES | wc -l) + + cat <<EOF >>$GITHUB_STEP_SUMMARY + ## ViT MGMN+SPMD Test Status + | Test Case | State | Exit Code | + | --- | --- | --- | + EOF + for i in $VIT_EXIT_STATUSES; do # Files are named ${{ inputs.ARTIFACT_NAME }}<GITHUB_RUN_ID>-<TEST_CASE_NAME>/<TEST_CASE_NAME>-status.json - echo "| $(echo $i | cut -d/ -f1 | cut -d- -f2-) | $(jq -r .state $i) | $(jq -r .exitcode $i)" + echo "| $(echo $i | cut -d/ -f1 | cut -d- -f4-) | $(jq -r .state $i) | $(jq -r .exitcode $i)" done | tee -a $GITHUB_STEP_SUMMARY + EXIT_STATUSES="$VIT_EXIT_STATUSES $T5X_EXIT_STATUSES" + PASSED_TESTS=$(( T5X_PASSED_TESTS + VIT_PASSED_TESTS )) + FAILED_TESTS=$(( T5X_FAILED_TESTS + VIT_FAILED_TESTS )) + TOTAL_TESTS=$(( T5X_TOTAL_TESTS + VIT_TOTAL_TESTS )) + echo "Test statuses:" jq -rc 'input_filename,.'
$EXIT_STATUSES @@ -340,6 +598,7 @@ jobs: echo "LABEL='Completion'" >> $GITHUB_OUTPUT echo "MESSAGE='${PASSED_TESTS}/${TOTAL_TESTS} passed'" >> $GITHUB_OUTPUT echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT + summary: runs-on: ubuntu-22.04 @@ -350,9 +609,9 @@ jobs: ( cat << EOF - ## T5X MGMN training + ## Rosetta T5X MGMN training - [view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=${{ inputs.ARTIFACT_NAME }}${GITHUB_RUN_ID}&_smoothingWeight=0&tagFilter=seqs_per) + [view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=${{ inputs.ARTIFACT_NAME }}rosetta-(T5X|VIT)-${GITHUB_RUN_ID}&_smoothingWeight=0&tagFilter=seqs_per) EOF ) | tee $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/_test_vit.yaml b/.github/workflows/_test_vit.yaml deleted file mode 100644 index dbae8721f..000000000 --- a/.github/workflows/_test_vit.yaml +++ /dev/null @@ -1,327 +0,0 @@ -name: ~test ViT, MGMN - -on: - workflow_call: - inputs: - ROSETTA_T5X_IMAGE: - type: string - description: Rosetta image from ghcr.io/nvidia/rosetta-t5x - default: 'ghcr.io/nvidia/rosetta-t5x:latest' - required: false - BATCH_SIZE_PER_GPU: - type: number - description: Batch size per GPU - default: 256 - required: false - outputs: - TEST_STATUS: - description: 'Summary pass/fail value indicating if results from tests are acceptable' - value: ${{ jobs.publish-test.outputs.STATUS }} - -jobs: - - single-process-multi-device: - strategy: - matrix: - N_GPU: [8] - fail-fast: false - - runs-on: ubuntu-22.04 - - steps: - - name: Print environment variables - run: env - - - name: Setup SSH agent - uses: webfactory/ssh-agent@v0.8.0 - with: - ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} - - - name: Setup SSH known hosts - id: ssh-known-hosts - run: | - mkdir -p ~/.ssh - cat >> ~/.ssh/known_hosts << EOF - ${{ vars.SSH_KNOWN_HOSTS }} - EOF - chmod 600 ~/.ssh/known_hosts - echo "FILE=$(realpath ~/.ssh/known_hosts)" >> $GITHUB_OUTPUT - - - name: Labels and metadata - id: meta - shell: bash -x -e {0} - run: | - IMAGE="$(echo ${{inputs.ROSETTA_T5X_IMAGE}} | sed 's/\//#/')" - TEST_CASE_NAME=VIT1P${{ matrix.N_GPU }}G - JOB_NAME=${GITHUB_RUN_ID}-${TEST_CASE_NAME} - LOG_FILE=/nfs/cluster/${JOB_NAME}.log - MODEL_PATH=/nfs/cluster/${JOB_NAME} - BATCH_SIZE=$((${{ inputs.BATCH_SIZE_PER_GPU }} * ${{ matrix.N_GPU }})) - for var in IMAGE TEST_CASE_NAME JOB_NAME LOG_FILE MODEL_PATH BATCH_SIZE; do - echo "$var=${!var}" >> $GITHUB_OUTPUT - done - - - name: Submit SLURM jobs over SSH - id: submit - shell: bash -O expand_aliases -x -e {0} - run: | - alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}' - sshx "date && hostname && sinfo" - sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }} - JOB=$(sshx sbatch --parsable << EOF - #!/bin/bash - #SBATCH --job-name=${{ steps.meta.outputs.JOB_NAME }} - #SBATCH --exclusive - #SBATCH --nodes=1 - #SBATCH --tasks=1 - #SBATCH --gpus-per-node=${{ matrix.N_GPU }} - #SBATCH --time=00:30:00 - #SBATCH --output=${{ steps.meta.outputs.LOG_FILE }} - #SBATCH --export="ENROOT_PASSWORD=${{ secrets.GITHUB_TOKEN }},XLA_PYTHON_CLIENT_MEM_FRACTION=0.9" - time srun \ - --container-image=${{ steps.meta.outputs.IMAGE }} \ - --container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \ - --container-entrypoint \ - test-vit.sh \ - --output /output/${{ steps.meta.outputs.TEST_CASE_NAME }} \ - --dtype bfloat16 \ - --batch-size ${{ steps.meta.outputs.BATCH_SIZE }} - EOF - ) - - set +x - while sshx squeue -j $JOB | grep -q $JOB; do - echo "SLURM Job $JOB is
still running." - sleep 15 - done - echo "SLURM Job $JOB finished." - - # Gather job info - SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1) - SLURM_EXITCODE=$(sshx sacct -j $JOB --format=exitcode --parsable2 --noheader | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g') - echo "SLURM Job state is ${SLURM_STATE}" - echo "SLURM Job exit code is ${SLURM_EXITCODE}" - echo "SLURM_STATE=${SLURM_STATE}" >> "$GITHUB_OUTPUT" - echo "SLURM_EXITCODE=${SLURM_EXITCODE}" >> "$GITHUB_OUTPUT" - - set -x - - - name: Retrieve training logs and upload to TensorBoard server - shell: bash -x -e {0} - run: | - mkdir output/ - rsync -rtz --progress \ - ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \ - output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true - rsync -rtz --progress \ - ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \ - output/ || true - rsync -rtz --progress \ - output/ \ - ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${GITHUB_RUN_ID}/ || true - - - name: Write SLURM job status to file - shell: bash -x -e {0} - run: | - python << EOF - import json - with open("output/${{ steps.meta.outputs.TEST_CASE_NAME }}-status.json", "w") as f: - dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"} - json.dump(dump, f) - EOF - - - name: Upload training logs as artifacts - uses: actions/upload-artifact@v3 - with: - name: ${{ steps.meta.outputs.JOB_NAME }} - path: output/* - - multi-gpu-multi-node: - strategy: - matrix: - N_GPU: [1, 8] - N_NODE: [1, 2] - fail-fast: false - - runs-on: ubuntu-22.04 - - steps: - - name: Print environment variables - run: env - - - name: Setup SSH agent - uses: webfactory/ssh-agent@v0.8.0 - with: - ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} - - - name: Setup SSH known hosts - id: ssh-known-hosts - run: | - mkdir -p ~/.ssh - cat >> ~/.ssh/known_hosts << EOF - ${{ vars.SSH_KNOWN_HOSTS }} - EOF - chmod 600 ~/.ssh/known_hosts - echo "FILE=$(realpath ~/.ssh/known_hosts)" >> $GITHUB_OUTPUT - - - name: Labels and metadata - id: meta - shell: bash -x -e {0} - run: | - IMAGE="$(echo ${{inputs.ROSETTA_T5X_IMAGE}} | sed 's/\//#/')" - TEST_CASE_NAME=VIT${{ matrix.N_GPU }}G${{ matrix.N_NODE }}N - TOTAL_TASKS=$((${{ matrix.N_GPU }} * ${{ matrix.N_NODE }})) - JOB_NAME=${GITHUB_RUN_ID}-${TEST_CASE_NAME} - LOG_FILE=/nfs/cluster/${JOB_NAME}.log - MODEL_PATH=/nfs/cluster/${JOB_NAME} - BATCH_SIZE=$((${{ inputs.BATCH_SIZE_PER_GPU }} * ${{ matrix.N_GPU }} * ${{ matrix.N_NODE }})) - for var in IMAGE TEST_CASE_NAME TOTAL_TASKS JOB_NAME LOG_FILE MODEL_PATH BATCH_SIZE; do - echo "$var=${!var}" >> $GITHUB_OUTPUT - done - - - name: Submit SLURM jobs over SSH - id: submit - shell: bash -O expand_aliases -x -e {0} - run: | - alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}' - sshx "date && hostname && sinfo" - sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }} - JOB=$(sshx sbatch --parsable << EOF - #!/bin/bash - #SBATCH --job-name=${{ steps.meta.outputs.JOB_NAME }} - #SBATCH --exclusive - #SBATCH --nodes=${{ matrix.N_NODE }} - #SBATCH --gpus-per-node=${{ matrix.N_GPU }} - #SBATCH --tasks=${{ steps.meta.outputs.TOTAL_TASKS }} - #SBATCH --tasks-per-node=${{ matrix.N_GPU }} - #SBATCH --time=00:30:00 - #SBATCH --output=${{ steps.meta.outputs.LOG_FILE }} - #SBATCH
--export="ENROOT_PASSWORD=${{ secrets.GITHUB_TOKEN }},XLA_PYTHON_CLIENT_MEM_FRACTION=0.9" - time srun \ - --container-image=${{ steps.meta.outputs.IMAGE }} \ - --container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \ - --container-entrypoint \ - test-vit.sh \ - --output /output/${{ steps.meta.outputs.TEST_CASE_NAME }} \ - --dtype bfloat16 \ - --batch-size ${{ steps.meta.outputs.BATCH_SIZE }} \ - --multiprocess - EOF - ) - - set +x - while sshx squeue -j $JOB | grep -q $JOB; do - echo "SLURM Job $JOB is still running." - sleep 15 - done - echo "SLURM Job $JOB finished." - - # Gather job info - SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1) - SLURM_EXITCODE=$(sshx sacct -j $JOB --format=exitcode --parsable2 --noheader | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g') - echo "SLURM Job state is ${SLURM_STATE}" - echo "SLURM Job exit code is ${SLURM_EXITCODE}" - echo "SLURM_STATE=${SLURM_STATE}" >> "$GITHUB_OUTPUT" - echo "SLURM_EXITCODE=${SLURM_EXITCODE}" >> "$GITHUB_OUTPUT" - - set -x - - - name: Retrieve training logs and upload to TensorBoard server - shell: bash -x -e {0} - run: | - - mkdir output/ - rsync -rtz --progress \ - ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \ - output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true - rsync -rtz --progress \ - ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \ - output/ || true - rsync -rtz --progress \ - output/ \ - ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${GITHUB_RUN_ID}/ || true - - - name: Write SLURM job status to file - shell: bash -x -e {0} - run: | - python << EOF - import json - with open("output/${{ steps.meta.outputs.TEST_CASE_NAME }}-status.json", "w") as f: - dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"} - json.dump(dump, f) - EOF - - - name: Upload training logs as artifacts - uses: actions/upload-artifact@v3 - with: - name: ${{ steps.meta.outputs.JOB_NAME }} - path: output/* - - publish-test: - needs: [multi-gpu-multi-node, single-process-multi-device] - uses: ./.github/workflows/_publish_badge.yaml - if: ( always() ) - secrets: inherit - with: - ENDPOINT_FILENAME: 'vit-test-completion-status.json' - PUBLISH: false - SCRIPT: | - EXIT_STATUSES="${GITHUB_RUN_ID}-*/*-status.json" - PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l) - FAILED_TESTS=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l) - TOTAL_TESTS=$(ls $EXIT_STATUSES | wc -l) - - cat <<EOF >>$GITHUB_STEP_SUMMARY - ## ViT MGMN+SPMD Test Status - | Test Case | State | Exit Code | - | --- | --- | --- | - EOF - for i in $EXIT_STATUSES; do - # Files are named <GITHUB_RUN_ID>-<TEST_CASE_NAME>/<TEST_CASE_NAME>-status.json - echo "| $(echo $i | cut -d/ -f1 | cut -d- -f2) | $(jq -r .state $i) | $(jq -r .exitcode $i)" - done | tee -a $GITHUB_STEP_SUMMARY - - echo "Test statuses:" - jq -rc 'input_filename,.'
$EXIT_STATUSES - - if [[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]] || [[ $PASSED_TESTS -eq $TOTAL_TESTS ]]; then - echo "STATUS=success" >> $GITHUB_OUTPUT - BADGE_COLOR=brightgreen - elif [[ $PASSED_TESTS -eq 0 ]]; then - echo "STATUS=failure" >> $GITHUB_OUTPUT - BADGE_COLOR=red - else - echo "STATUS=failure" >> $GITHUB_OUTPUT - BADGE_COLOR=yellow - fi - echo "LABEL='Completion'" >> $GITHUB_OUTPUT - echo "MESSAGE='${PASSED_TESTS}/${TOTAL_TESTS} passed'" >> $GITHUB_OUTPUT - echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT - - summary: - runs-on: ubuntu-22.04 - - steps: - - name: Generate TensorBoard query URL - run: | - ( - cat << EOF - - ## ViT MGMN training - - [view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=${GITHUB_RUN_ID}/VIT&_smoothingWeight=0&tagFilter=samples_per) - - EOF - ) | tee $GITHUB_STEP_SUMMARY - - - outcome: - needs: publish-test - runs-on: ubuntu-22.04 - if: ( always() ) - steps: - - name: Sets workflow status based on test outputs - run: | - if [[ ${{ needs.publish-test.outputs.STATUS }} != success ]]; then - exit 1 - fi diff --git a/.github/workflows/nightly-pax-test-mgmn.yaml b/.github/workflows/nightly-pax-test-mgmn.yaml index db041cd77..3ae570c3e 100644 --- a/.github/workflows/nightly-pax-test-mgmn.yaml +++ b/.github/workflows/nightly-pax-test-mgmn.yaml @@ -59,50 +59,12 @@ jobs: publish: needs: [metadata, run-jobs] - runs-on: ubuntu-22.04 - steps: - - name: Setup SSH agent - uses: webfactory/ssh-agent@v0.8.0 - with: - ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} - - - name: Setup SSH known hosts - id: ssh-known-hosts - run: | - mkdir -p ~/.ssh - cat >> ~/.ssh/known_hosts << EOF - ${{ vars.SSH_KNOWN_HOSTS }} - EOF - chmod 600 ~/.ssh/known_hosts - echo "FILE=$(realpath ~/.ssh/known_hosts)" >> $GITHUB_OUTPUT - - - name: Setup SSH config - id: ssh-config - run: | - mkdir -p ~/.ssh - cat >> ~/.ssh/config << EOF - ${{ vars.SSH_CONFIG }} - EOF - chmod 600 ~/.ssh/config - - - name: Create dated folder and generate TensorBoard query URL - id: mkdir - shell: bash -x -e {0} - run: | - FOLDER="${{ needs.metadata.outputs.BUILD_DATE }}/PAX" - # copy folder - ssh -T tensorboard mkdir -p /tensorboard-logs/${FOLDER} - ssh -T tensorboard rsync -rt /tensorboard-logs/${GITHUB_RUN_ID}/ /tensorboard-logs/${FOLDER}/ - # generate query URL - ( - cat << EOF - - ## PAX MGMN nightly training: ${{ needs.metadata.outputs.BUILD_DATE }} - - [view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=$(jq -nr --arg url "${FOLDER}" '$url|@uri')&_smoothingWeight=0&tagFilter=seqs_per) - - EOF - ) | tee $GITHUB_STEP_SUMMARY + uses: ./.github/workflows/_publish_t5x_pax_results.yaml + if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' + with: + BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} + EXPERIMENT_SUBDIR: PAX + secrets: inherit publish-completion: needs: [metadata, run-jobs] diff --git a/.github/workflows/nightly-rosetta-pax-build.yaml b/.github/workflows/nightly-rosetta-pax-build.yaml index 0acc36d3d..6a61a9976 100644 --- a/.github/workflows/nightly-rosetta-pax-build.yaml +++ b/.github/workflows/nightly-rosetta-pax-build.yaml @@ -134,6 +134,16 @@ jobs: PAX_IMAGE: ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} secrets: inherit + publish-pax: + needs: [metadata, test-pax] + uses: ./.github/workflows/_publish_t5x_pax_results.yaml + if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name ==
'workflow_dispatch' + with: + BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} + EXPERIMENT_SUBDIR: ROSETTA_PAX + ARTIFACT_NAME: "rosetta-pax-" + secrets: inherit + publish-test: needs: [metadata, amd64, arm64, test-pax] uses: ./.github/workflows/_publish_badge.yaml diff --git a/.github/workflows/nightly-rosetta-t5x-build-test.yaml b/.github/workflows/nightly-rosetta-t5x-build-test.yaml index 9c734b57c..e00f6f6d2 100644 --- a/.github/workflows/nightly-rosetta-t5x-build-test.yaml +++ b/.github/workflows/nightly-rosetta-t5x-build-test.yaml @@ -143,25 +143,18 @@ jobs: T5X_IMAGE: ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} secrets: inherit - test-vit: - needs: [metadata, amd64, arm64] - uses: ./.github/workflows/_test_vit.yaml - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' - with: - ROSETTA_T5X_IMAGE: ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} - secrets: inherit - publish-t5x: - needs: [metadata, test-t5x, test-vit] - uses: ./.github/workflows/_publish_t5x_results.yaml + needs: [metadata, test-t5x] + uses: ./.github/workflows/_publish_t5x_pax_results.yaml if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' with: BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} EXPERIMENT_SUBDIR: ROSETTA_T5X + ARTIFACT_NAME: "rosetta-T5X-" secrets: inherit publish-test: - needs: [metadata, amd64, arm64, test-unit, test-t5x, test-vit] + needs: [metadata, amd64, arm64, test-unit, test-t5x] uses: ./.github/workflows/_publish_badge.yaml if: ( always() ) secrets: inherit @@ -171,18 +164,17 @@ jobs: SCRIPT: | UNIT_STATUS=${{ needs.test-unit.outputs.TEST_STATUS }} T5X_STATUS=${{ needs.test-t5x.outputs.TEST_STATUS }} - VIT_STATUS=${{ needs.test-vit.outputs.TEST_STATUS }} echo "LABEL='Tests'" >> $GITHUB_OUTPUT if [[ ${{ needs.amd64.result }} == "success" && ${{ needs.arm64.result }} == "success" ]]; then - if [[ $UNIT_STATUS == "success" ]] && [[ $T5X_STATUS == "success" ]] && [[ $VIT_STATUS == "success" ]]; then + if [[ $UNIT_STATUS == "success" ]] && [[ $T5X_STATUS == "success" ]]; then COLOR=brightgreen MESSAGE="Unit passed / MGMN passed" elif [[ $UNIT_STATUS == "success" ]]; then COLOR=yellow MESSAGE="Unit passed / MGMN failed" - elif [[ $T5X_STATUS == "success" ]] && [[ $VIT_STATUS == "success" ]]; then + elif [[ $T5X_STATUS == "success" ]]; then COLOR=yellow MESSAGE="Unit failed / MGMN passed" else diff --git a/.github/workflows/nightly-t5x-test-mgmn.yaml b/.github/workflows/nightly-t5x-test-mgmn.yaml index 40fa91819..866711940 100644 --- a/.github/workflows/nightly-t5x-test-mgmn.yaml +++ b/.github/workflows/nightly-t5x-test-mgmn.yaml @@ -59,7 +59,7 @@ jobs: publish: needs: [metadata, run-jobs] - uses: ./.github/workflows/_publish_t5x_results.yaml + uses: ./.github/workflows/_publish_t5x_pax_results.yaml if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' with: BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }}
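
For reference, every nightly pipeline now invokes the renamed reusable publish workflow the same way; a minimal caller sketch, mirroring the publish-t5x and publish-pax hunks above (the job name is illustrative, and the prefix value is an example taken from those hunks):

publish:
  needs: [metadata, run-jobs]
  uses: ./.github/workflows/_publish_t5x_pax_results.yaml
  if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch'
  with:
    BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }}
    EXPERIMENT_SUBDIR: ROSETTA_T5X   # subdirectory used to filter experiments on the TensorBoard host
    ARTIFACT_NAME: "rosetta-T5X-"    # prefix prepended to ${GITHUB_RUN_ID} by the publish job's rsync, preventing re-runs from clobbering earlier artifacts
  secrets: inherit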