From ae8badcd49d5e465543a198a9b9c2dcec82bc8dd Mon Sep 17 00:00:00 2001
From: Terry Kong <terryk@nvidia.com>
Date: Tue, 3 Oct 2023 18:01:48 -0700
Subject: [PATCH] Prevents rosetta tests from overwriting artifacts from
 upstream tests

---
 .github/workflows/_test_pax.yaml                      | 11 ++++++++---
 .github/workflows/_test_t5x.yaml                      |  9 +++++++--
 .github/workflows/nightly-rosetta-pax-build.yaml      |  1 +
 .github/workflows/nightly-rosetta-t5x-build-test.yaml |  1 +
 4 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/_test_pax.yaml b/.github/workflows/_test_pax.yaml
index 73e6209dd..538c51a78 100644
--- a/.github/workflows/_test_pax.yaml
+++ b/.github/workflows/_test_pax.yaml
@@ -13,6 +13,11 @@ on:
         description: Extra command line args to pass to test-pax.sh
         default: ""
         required: false
+      SUFFIX:
+        type: string
+        description: If provided, will append a suffix to the artifact name. Helpful if re-running this reusable workflow to prevent clobbering of artifacts
+        default: ""
+        required: false
     outputs:
       TEST_STATUS:
         description: 'Summary pass/fail value indicating if results from tests are acceptable'
@@ -64,7 +69,7 @@ jobs:
           NODES=$(((TOTAL_TASKS+MAX_GPUS_PER_NODE-1)/MAX_GPUS_PER_NODE))
           GPUS_PER_NODE=$((TOTAL_TASKS/NODES))
 
-          JOB_NAME=${GITHUB_RUN_ID}-${TEST_CASE_NAME}
+          JOB_NAME=${GITHUB_RUN_ID}-${TEST_CASE_NAME}${{ inputs.SUFFIX }}
           LOG_FILE=/nfs/cluster/${JOB_NAME}.log
           MODEL_PATH=/nfs/cluster/${JOB_NAME}
           for var in IMAGE TEST_CASE_NAME TOTAL_TASKS NODES GPUS_PER_NODE JOB_NAME LOG_FILE MODEL_PATH; do
@@ -170,7 +175,7 @@ jobs:
         shell: bash -x {0}
         run: |
           pip install pytest pytest-reportlog tensorboard
-          for i in ${GITHUB_RUN_ID}-*DP*TP*PP; do
+          for i in ${GITHUB_RUN_ID}-*DP*TP*PP${{ inputs.SUFFIX }}; do
             SUBDIR=$(echo $i | cut -d'-' -f2)
             mv $i/$SUBDIR* .
             python3 .github/workflows/baselines/summarize_metrics.py $SUBDIR # create result json in baseline format
@@ -202,7 +207,7 @@ jobs:
       ENDPOINT_FILENAME: 'pax-test-status.json'
       PUBLISH: false
       SCRIPT: |
-        EXIT_STATUSES="${GITHUB_RUN_ID}-*DP*TP*PP/*-status.json"
+        EXIT_STATUSES="${GITHUB_RUN_ID}-*DP*TP*PP${{ inputs.SUFFIX }}/*-status.json"
         PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
         FAILED_TESTS=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
         TOTAL_TESTS=$(ls $EXIT_STATUSES | wc -l)
diff --git a/.github/workflows/_test_t5x.yaml b/.github/workflows/_test_t5x.yaml
index 5b8b0529e..f8af31d06 100644
--- a/.github/workflows/_test_t5x.yaml
+++ b/.github/workflows/_test_t5x.yaml
@@ -18,6 +18,11 @@ on:
         description: Extra gin args to pass to test-t5x.sh
         default: ""
         required: false
+      SUFFIX:
+        type: string
+        description: If provided, will append a suffix to the artifact name. Helpful if re-running this reusable workflow to prevent clobbering of artifacts
+        default: ""
+        required: false
     outputs:
       TEST_STATUS:
         description: 'Summary pass/fail value indicating if results from tests are acceptable'
@@ -58,7 +63,7 @@ jobs:
         run: |
           IMAGE="$(echo ${{inputs.T5X_IMAGE}} | sed 's/\//#/')"
           TEST_CASE_NAME=1P${{ matrix.N_GPU }}G
-          JOB_NAME=${GITHUB_RUN_ID}-${TEST_CASE_NAME}
+          JOB_NAME=${GITHUB_RUN_ID}-${TEST_CASE_NAME}${{ inputs.SUFFIX }}
           LOG_FILE=/nfs/cluster/${JOB_NAME}.log
           MODEL_PATH=/nfs/cluster/${JOB_NAME}
           BATCH_SIZE=$((${{ inputs.BATCH_SIZE_PER_GPU }} * ${{ matrix.N_GPU }}))
@@ -277,7 +282,7 @@ jobs:
       ENDPOINT_FILENAME: 't5x-test-completion-status.json'
       PUBLISH: false
       SCRIPT: |
-        EXIT_STATUSES="${GITHUB_RUN_ID}-*/*-status.json"
+        EXIT_STATUSES="${GITHUB_RUN_ID}-*[PG]*[GN]${{ inputs.SUFFIX }}/*-status.json"
         PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
         FAILED_TESTS=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
         TOTAL_TESTS=$(ls $EXIT_STATUSES | wc -l)
diff --git a/.github/workflows/nightly-rosetta-pax-build.yaml b/.github/workflows/nightly-rosetta-pax-build.yaml
index 1c8d23d10..2250348c2 100644
--- a/.github/workflows/nightly-rosetta-pax-build.yaml
+++ b/.github/workflows/nightly-rosetta-pax-build.yaml
@@ -90,6 +90,7 @@ jobs:
     with:
       PAX_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }}
       EXTRA_TEST_ARGS: "--enable-te --additional-args \"--fdl.PACKED_INPUT=False\""
+      SUFFIX: "-rosetta"
     secrets: inherit
 
   publish-test:
diff --git a/.github/workflows/nightly-rosetta-t5x-build-test.yaml b/.github/workflows/nightly-rosetta-t5x-build-test.yaml
index ed0d0352e..413525d7d 100644
--- a/.github/workflows/nightly-rosetta-t5x-build-test.yaml
+++ b/.github/workflows/nightly-rosetta-t5x-build-test.yaml
@@ -99,6 +99,7 @@ jobs:
       T5X_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }}
       # Disable packing b/c rosetta-t5x images run with TE by default, and TE does not currently support packing
       EXTRA_GIN_ARGS: "--gin.train/utils.DatasetConfig.pack=False --gin.train_eval/utils.DatasetConfig.pack=False"
+      SUFFIX: "-rosetta"
     secrets: inherit
 
   publish-t5x: