From ae8badcd49d5e465543a198a9b9c2dcec82bc8dd Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Tue, 3 Oct 2023 18:01:48 -0700 Subject: [PATCH] Fixes rosetta tests from overwriting artifacts from upstream tests --- .github/workflows/_test_pax.yaml | 11 ++++++++--- .github/workflows/_test_t5x.yaml | 9 +++++++-- .github/workflows/nightly-rosetta-pax-build.yaml | 1 + .github/workflows/nightly-rosetta-t5x-build-test.yaml | 1 + 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/.github/workflows/_test_pax.yaml b/.github/workflows/_test_pax.yaml index 73e6209dd..538c51a78 100644 --- a/.github/workflows/_test_pax.yaml +++ b/.github/workflows/_test_pax.yaml @@ -13,6 +13,11 @@ on: description: Extra command line args to pass to test-pax.sh default: "" required: false + SUFFIX: + type: string + description: If provided, will append a suffix to the artifact name. Helpful if re-running this reusable workflow to prevent clobbering of artifacts + default: "" + required: false outputs: TEST_STATUS: description: 'Summary pass/fail value indicating if results from tests are acceptable' @@ -64,7 +69,7 @@ jobs: NODES=$(((TOTAL_TASKS+MAX_GPUS_PER_NODE-1)/MAX_GPUS_PER_NODE)) GPUS_PER_NODE=$((TOTAL_TASKS/NODES)) - JOB_NAME=${GITHUB_RUN_ID}-${TEST_CASE_NAME} + JOB_NAME=${GITHUB_RUN_ID}-${TEST_CASE_NAME}${{ inputs.SUFFIX }} LOG_FILE=/nfs/cluster/${JOB_NAME}.log MODEL_PATH=/nfs/cluster/${JOB_NAME} for var in IMAGE TEST_CASE_NAME TOTAL_TASKS NODES GPUS_PER_NODE JOB_NAME LOG_FILE MODEL_PATH; do @@ -170,7 +175,7 @@ jobs: shell: bash -x {0} run: | pip install pytest pytest-reportlog tensorboard - for i in ${GITHUB_RUN_ID}-*DP*TP*PP; do + for i in ${GITHUB_RUN_ID}-*DP*TP*PP${{ inputs.SUFFIX }}; do SUBDIR=$(echo $i | cut -d'-' -f2) mv $i/$SUBDIR* . 
python3 .github/workflows/baselines/summarize_metrics.py $SUBDIR # create result json in baseline format @@ -202,7 +207,7 @@ jobs: ENDPOINT_FILENAME: 'pax-test-status.json' PUBLISH: false SCRIPT: | - EXIT_STATUSES="${GITHUB_RUN_ID}-*DP*TP*PP/*-status.json" + EXIT_STATUSES="${GITHUB_RUN_ID}-*DP*TP*PP${{ inputs.SUFFIX }}/*-status.json" PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l) FAILED_TESTS=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l) TOTAL_TESTS=$(ls $EXIT_STATUSES | wc -l) diff --git a/.github/workflows/_test_t5x.yaml b/.github/workflows/_test_t5x.yaml index 5b8b0529e..f8af31d06 100644 --- a/.github/workflows/_test_t5x.yaml +++ b/.github/workflows/_test_t5x.yaml @@ -18,6 +18,11 @@ on: description: Extra gin args to pass to test-t5x.sh default: "" required: false + SUFFIX: + type: string + description: If provided, will append a suffix to the artifact name. Helpful if re-running this reusable workflow to prevent clobbering of artifacts + default: "" + required: false outputs: TEST_STATUS: description: 'Summary pass/fail value indicating if results from tests are acceptable' @@ -58,7 +63,7 @@ jobs: run: | IMAGE="$(echo ${{inputs.T5X_IMAGE}} | sed 's/\//#/')" TEST_CASE_NAME=1P${{ matrix.N_GPU }}G - JOB_NAME=${GITHUB_RUN_ID}-${TEST_CASE_NAME} + JOB_NAME=${GITHUB_RUN_ID}-${TEST_CASE_NAME}${{ inputs.SUFFIX }} LOG_FILE=/nfs/cluster/${JOB_NAME}.log MODEL_PATH=/nfs/cluster/${JOB_NAME} BATCH_SIZE=$((${{ inputs.BATCH_SIZE_PER_GPU }} * ${{ matrix.N_GPU }})) @@ -277,7 +282,7 @@ jobs: ENDPOINT_FILENAME: 't5x-test-completion-status.json' PUBLISH: false SCRIPT: | - EXIT_STATUSES="${GITHUB_RUN_ID}-*/*-status.json" + EXIT_STATUSES="${GITHUB_RUN_ID}-*[PG]*[GN]${{ inputs.SUFFIX }}/*-status.json" PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l) FAILED_TESTS=$(jq -r '. 
| select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l) TOTAL_TESTS=$(ls $EXIT_STATUSES | wc -l) diff --git a/.github/workflows/nightly-rosetta-pax-build.yaml b/.github/workflows/nightly-rosetta-pax-build.yaml index 1c8d23d10..2250348c2 100644 --- a/.github/workflows/nightly-rosetta-pax-build.yaml +++ b/.github/workflows/nightly-rosetta-pax-build.yaml @@ -90,6 +90,7 @@ jobs: with: PAX_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }} EXTRA_TEST_ARGS: "--enable-te --additional-args \"--fdl.PACKED_INPUT=False\"" + SUFFIX: "-rosetta" secrets: inherit publish-test: diff --git a/.github/workflows/nightly-rosetta-t5x-build-test.yaml b/.github/workflows/nightly-rosetta-t5x-build-test.yaml index ed0d0352e..413525d7d 100644 --- a/.github/workflows/nightly-rosetta-t5x-build-test.yaml +++ b/.github/workflows/nightly-rosetta-t5x-build-test.yaml @@ -99,6 +99,7 @@ jobs: T5X_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }} # Disable packing b/c rosetta-t5x images run with TE by default, and TE does not currently support packing EXTRA_GIN_ARGS: "--gin.train/utils.DatasetConfig.pack=False --gin.train_eval/utils.DatasetConfig.pack=False" + SUFFIX: "-rosetta" secrets: inherit publish-t5x: