Skip to content

Commit

Permalink
Add rosetta tests to ci.yaml (#298)
Browse files Browse the repository at this point in the history
Currently, the CI workflow runs MGMN tests only on the 'upstream' containers.
This PR adds MGMN tests for the Rosetta containers to `ci.yaml`.

---------

Co-authored-by: Terry Kong <[email protected]>
Co-authored-by: Yu-Hang "Maxin" Tang <[email protected]>
  • Loading branch information
3 people authored Dec 7, 2023
1 parent f6d52f0 commit 5b2c1b4
Show file tree
Hide file tree
Showing 11 changed files with 365 additions and 438 deletions.
26 changes: 18 additions & 8 deletions .github/workflows/_ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -180,26 +180,36 @@ jobs:
TE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }}
secrets: inherit

test-t5x:
test-upstream-t5x:
needs: build-t5x
if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
uses: ./.github/workflows/_test_t5x.yaml
with:
T5X_IMAGE: ${{ needs.build-t5x.outputs.DOCKER_TAG_FINAL }}
secrets: inherit

test-pax:
# Calls the reusable workflow _test_t5x_rosetta.yaml with the image tag produced by the
# build-rosetta-t5x job; only runs when the ARCHITECTURE input is 'amd64'.
test-rosetta-t5x:
needs: build-rosetta-t5x
if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
uses: ./.github/workflows/_test_t5x_rosetta.yaml
with:
T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}
# Disable packing b/c rosetta-t5x images run with TE by default, and TE does not currently support packing
# NOTE(review): no input that disables packing is actually passed in this `with:` block.
# Presumably the called workflow takes care of it — confirm, or remove the comment above.
secrets: inherit

test-upstream-pax:
needs: build-pax
if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
uses: ./.github/workflows/_test_pax.yaml
with:
PAX_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }}
secrets: inherit

test-vit:
needs: build-rosetta-t5x
uses: ./.github/workflows/_test_vit.yaml
test-rosetta-pax:
needs: build-rosetta-pax
if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
uses: ./.github/workflows/_test_pax_rosetta.yaml
with:
ROSETTA_T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}
secrets: inherit
PAX_IMAGE: ${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }}
secrets: inherit

Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: ~publish t5x integration test results
name: ~publish t5x/pax integration test results

on:
workflow_call:
Expand All @@ -9,12 +9,17 @@ on:
required: true
EXPERIMENT_SUBDIR:
type: string
description: Subdirectory to easily filter experiments, e.g., T5X, ROSETTA_T5X
description: Subdirectory to easily filter experiments, e.g., T5X, ROSETTA_PAX
default: T5X
required: false
ARTIFACT_NAME:
type: string
description: If provided, will prepend a prefix to the artifact name. Helpful if re-running this reusable workflow to prevent clobbering of artifacts
default: ""
required: false

jobs:
publish-t5x:
publish:
runs-on: ubuntu-22.04
steps:
- name: Setup SSH agent
Expand Down Expand Up @@ -48,7 +53,7 @@ jobs:
FOLDER="${{ inputs.BUILD_DATE }}/${{ inputs.EXPERIMENT_SUBDIR }}"
# copy folder
ssh -T tensorboard mkdir -p /tensorboard-logs/${FOLDER}
ssh -T tensorboard rsync -rt /tensorboard-logs/${GITHUB_RUN_ID}/ /tensorboard-logs/${FOLDER}/
ssh -T tensorboard rsync -rt /tensorboard-logs/${{ inputs.ARTIFACT_NAME }}${GITHUB_RUN_ID}/ /tensorboard-logs/${FOLDER}/
# generate query URL
(
cat << EOF
Expand Down
5 changes: 5 additions & 0 deletions .github/workflows/_test_pax.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@ on:
description: Extra command line args to pass to test-pax.sh
default: ""
required: false
ARTIFACT_NAME:
type: string
description: If provided, will prepend a prefix to the artifact name. Helpful if re-running this reusable workflow to prevent clobbering of artifacts
default: ""
required: false
outputs:
TEST_STATUS:
description: 'Summary pass/fail value indicating if results from tests are acceptable'
Expand Down
62 changes: 34 additions & 28 deletions .github/workflows/_test_pax_rosetta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@ on:
description: Extra command line args to pass to test-pax.sh
default: ""
required: false
ARTIFACT_NAME:
type: string
description: If provided, will prepend a prefix to the artifact name. Helpful if re-running this reusable workflow to prevent clobbering of artifacts
default: ""
required: false
outputs:
TEST_STATUS:
description: 'Summary pass/fail value indicating if results from tests are acceptable'
Expand Down Expand Up @@ -58,7 +63,7 @@ jobs:
MAX_GPUS_PER_NODE=8
NODES=1
GPUS_PER_NODE=8
JOB_NAME=${GITHUB_RUN_ID}-${TEST_CASE_NAME}
JOB_NAME=${{ inputs.ARTIFACT_NAME }}rosetta-pax-${GITHUB_RUN_ID}-${TEST_CASE_NAME}
LOG_FILE=/nfs/cluster/${JOB_NAME}.log
MODEL_PATH=/nfs/cluster/${JOB_NAME}
for var in IMAGE TEST_CASE_NAME TOTAL_TASKS NODES GPUS_PER_NODE JOB_NAME LOG_FILE MODEL_PATH; do
Expand Down Expand Up @@ -125,7 +130,7 @@ jobs:
output/ || true
# Best-effort (`|| true`) upload of output/ to the TensorBoard host.
# The destination directory must end in ${GITHUB_RUN_ID}: the other upload steps in this
# workflow and the "view metrics" link all use <ARTIFACT_NAME>rosetta-pax-${GITHUB_RUN_ID},
# so omitting the run ID would mix every run into one shared directory that the link's
# regexInput filter never selects.
rsync -rtz --progress \
output/ \
${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${GITHUB_RUN_ID}/ || true
${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.ARTIFACT_NAME }}rosetta-pax-${GITHUB_RUN_ID}/ || true
- name: Write SLURM job status to file
shell: bash -x -e {0}
run: |
Expand Down Expand Up @@ -185,7 +190,7 @@ jobs:
NODES=$(((TOTAL_TASKS+MAX_GPUS_PER_NODE-1)/MAX_GPUS_PER_NODE))
GPUS_PER_NODE=$((TOTAL_TASKS/NODES))
JOB_NAME=${GITHUB_RUN_ID}-${TEST_CASE_NAME}
JOB_NAME=${{ inputs.ARTIFACT_NAME }}rosetta-pax-${GITHUB_RUN_ID}-${TEST_CASE_NAME}
LOG_FILE=/nfs/cluster/${JOB_NAME}.log
MODEL_PATH=/nfs/cluster/${JOB_NAME}
for var in IMAGE TEST_CASE_NAME TOTAL_TASKS NODES GPUS_PER_NODE JOB_NAME LOG_FILE MODEL_PATH; do
Expand Down Expand Up @@ -261,7 +266,7 @@ jobs:
output/ || true
rsync -rtz --progress \
output/ \
${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${GITHUB_RUN_ID}/ || true
${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.ARTIFACT_NAME }}rosetta-pax-${GITHUB_RUN_ID}/ || true
- name: Write SLURM job status to file
shell: bash -x -e {0}
Expand Down Expand Up @@ -321,7 +326,7 @@ jobs:
NODES=$(((TOTAL_TASKS+MAX_GPUS_PER_NODE-1)/MAX_GPUS_PER_NODE))
GPUS_PER_NODE=$((TOTAL_TASKS/NODES))
JOB_NAME=${GITHUB_RUN_ID}-${TEST_CASE_NAME}
JOB_NAME=${{ inputs.ARTIFACT_NAME }}rosetta-pax-${GITHUB_RUN_ID}-${TEST_CASE_NAME}
LOG_FILE=/nfs/cluster/${JOB_NAME}.log
MODEL_PATH=/nfs/cluster/${JOB_NAME}
for var in IMAGE TEST_CASE_NAME TOTAL_TASKS NODES GPUS_PER_NODE JOB_NAME LOG_FILE MODEL_PATH; do
Expand Down Expand Up @@ -395,7 +400,7 @@ jobs:
output/ || true
rsync -rtz --progress \
output/ \
${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${GITHUB_RUN_ID}/ || true
${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.ARTIFACT_NAME }}rosetta-pax-${GITHUB_RUN_ID}/ || true
- name: Write SLURM job status to file
shell: bash -x -e {0}
Expand Down Expand Up @@ -453,7 +458,7 @@ jobs:
NODES=$(((TOTAL_TASKS+MAX_GPUS_PER_NODE-1)/MAX_GPUS_PER_NODE))
GPUS_PER_NODE=$((TOTAL_TASKS/NODES))
JOB_NAME=${GITHUB_RUN_ID}-${TEST_CASE_NAME}
JOB_NAME=${{ inputs.ARTIFACT_NAME }}rosetta-pax-${GITHUB_RUN_ID}-${TEST_CASE_NAME}
LOG_FILE=/nfs/cluster/${JOB_NAME}.log
MODEL_PATH=/nfs/cluster/${JOB_NAME}
for var in IMAGE TEST_CASE_NAME TOTAL_TASKS NODES GPUS_PER_NODE JOB_NAME LOG_FILE MODEL_PATH; do
Expand Down Expand Up @@ -530,7 +535,7 @@ jobs:
output/ || true
rsync -rtz --progress \
output/ \
${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${GITHUB_RUN_ID}/ || true
${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.ARTIFACT_NAME }}rosetta-pax-${GITHUB_RUN_ID}/ || true
- name: Write SLURM job status to file
shell: bash -x -e {0}
Expand Down Expand Up @@ -587,7 +592,7 @@ jobs:
NODES=1
GPUS_PER_NODE=8
JOB_NAME=${GITHUB_RUN_ID}-${TEST_CASE_NAME}
JOB_NAME=${{ inputs.ARTIFACT_NAME }}rosetta-pax-${GITHUB_RUN_ID}-${TEST_CASE_NAME}
LOG_FILE=/nfs/cluster/${JOB_NAME}.log
MODEL_PATH=/nfs/cluster/${JOB_NAME}
for var in IMAGE TEST_CASE_NAME TOTAL_TASKS NODES GPUS_PER_NODE JOB_NAME LOG_FILE MODEL_PATH; do
Expand Down Expand Up @@ -662,7 +667,7 @@ jobs:
output/ || true
rsync -rtz --progress \
output/ \
${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${GITHUB_RUN_ID}/ || true
${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.ARTIFACT_NAME }}rosetta-pax-${GITHUB_RUN_ID}/ || true
- name: Write SLURM job status to file
shell: bash -x -e {0}
Expand Down Expand Up @@ -695,15 +700,15 @@ jobs:
shell: bash -x {0}
run: |
pip install pytest pytest-reportlog tensorboard
for i in ${GITHUB_RUN_ID}-*DP*FSDP*TP*PP* ${GITHUB_RUN_ID}-*DP_TE_dropout; do
SUBDIR=$(echo $i | cut -d'-' -f2)
for i in ${{ inputs.ARTIFACT_NAME }}rosetta-pax-${GITHUB_RUN_ID}-*DP*FSDP*TP*PP* ${{ inputs.ARTIFACT_NAME }}rosetta-pax-${GITHUB_RUN_ID}-*DP_TE_dropout; do
SUBDIR=$(echo $i | rev | cut -d'-' -f1 | rev)
mv $i/$SUBDIR* .
python3 .github/workflows/baselines/summarize_metrics.py $SUBDIR # create result json in baseline format
done
echo '## PAX MGMN Test Metrics' >> $GITHUB_STEP_SUMMARY
echo '## Rosetta PAX MGMN Test Metrics' >> $GITHUB_STEP_SUMMARY
for i in *_metrics.json; do
echo $i | cut -d'.' -f1
echo $(basename -- $i _metrics.json)
echo '```json'
jq . $i
echo '```'
Expand All @@ -724,39 +729,40 @@ jobs:
if: ( always() )
secrets: inherit
with:
ENDPOINT_FILENAME: 'pax-test-status.json'
ENDPOINT_FILENAME: '${{ inputs.ARTIFACT_NAME }}rosetta-pax-test-status.json'
PUBLISH: false
SCRIPT: |
# Glob patterns (left unquoted on use so the shell expands them) selecting the per-test-case
# *-status.json files. Both patterns are anchored on ${GITHUB_RUN_ID} so that only this
# run's directories are counted, matching the directory names used by the metrics loop.
EXIT_STATUSES="${GITHUB_RUN_ID}-*DP*FSDP*TP*PP*/*-status.json ${GITHUB_RUN_ID}-*DP_TE_dropout/*-status.json"
EXIT_STATUSES="${{ inputs.ARTIFACT_NAME }}rosetta-pax-${GITHUB_RUN_ID}-*DP*FSDP*TP*PP*/*-status.json ${{ inputs.ARTIFACT_NAME }}rosetta-pax-${GITHUB_RUN_ID}-*DP_TE_dropout/*-status.json"
# PASSED: state is COMPLETED and exitcode is "0"; FAILED: everything else; TOTAL: number of status files.
PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
FAILED_TESTS=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
TOTAL_TESTS=$(ls $EXIT_STATUSES | wc -l)
echo '## PAX MGMN Test Status' >> $GITHUB_STEP_SUMMARY
cat <<EOF >>$GITHUB_STEP_SUMMARY
## Pax MGMN+SPMD Test Status
| Test Case | State | Exit Code |
| --- | --- | --- |
EOF
# Emit one Markdown table row (test case, SLURM state, exit code) per status file and
# append the rows to the step summary.
for i in $EXIT_STATUSES; do
echo $i | cut -d'.' -f1
echo '```json'
jq . $i
echo '```'
# Files are named <ARTIFACT_NAME>rosetta-pax-<GHID>-<NAME>/<NAME>-status.json.
# Take the LAST dash-separated field of the directory name instead of a fixed field number,
# so a non-empty ARTIFACT_NAME prefix (which may itself contain dashes) cannot shift it.
# Like the metrics loop, this assumes <NAME> itself contains no dash.
echo "| $(echo $i | cut -d/ -f1 | rev | cut -d- -f1 | rev) | $(jq -r .state $i) | $(jq -r .exitcode $i) |"
done | tee -a $GITHUB_STEP_SUMMARY
echo "Test statuses:"
jq -rc 'input_filename,.' $EXIT_STATUSES
PYTEST_LOG=metrics-test-log/report.jsonl
METRICS_LOG=metrics-test-log/report.jsonl
all_outcomes() {
cat $PYTEST_LOG | jq -r '. | select((.["$report_type"] == "TestReport") and (.when == "call")) | .outcome'
cat $METRICS_LOG | jq -r '. | select((.["$report_type"] == "TestReport") and (.when == "call")) | .outcome'
}
cnt_type() {
cat $PYTEST_LOG | jq '. | select((.["$report_type"] == "TestReport") and (.when == "call") and (.outcome | contains("'${1}'"))) | .outcome' | wc -l
cat $METRICS_LOG | jq '. | select((.["$report_type"] == "TestReport") and (.when == "call") and (.outcome | contains("'${1}'"))) | .outcome' | wc -l
}
PYTEST_FAILED_TESTS=$(cnt_type failed)
PYTEST_PASSED_TESTS=$(cnt_type passed)
PYTEST_TOTAL_TESTS=$(all_outcomes | wc -l)
if ([[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]] && \
[[ $PYTEST_FAILED_TESTS -eq 0 ]] && [[ $PYTEST_TOTAL_TESTS -gt 0 ]]) || \
([[ $PASSED_TESTS -eq $TOTAL_TESTS ]] && [[ $PYTEST_PASSED_TESTS -eq $PYTEST_TOTAL_TESTS ]]); then
[[ $PYTEST_FAILED_TESTS -eq 0 ]] && [[ $PYTEST_TOTAL_TESTS -gt 0 ]]); then
STATUS=success
BADGE_COLOR=brightgreen
elif [[ $PASSED_TESTS -eq 0 ]] || [[ $PYTEST_PASSED_TESTS -eq 0 ]]; then
Expand All @@ -781,9 +787,9 @@ jobs:
(
cat << EOF
## PAX MGMN training
## Rosetta PAX MGMN training
[view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=${GITHUB_RUN_ID}&_smoothingWeight=0&tagFilter=seqs_per)
[view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=${{ inputs.ARTIFACT_NAME }}rosetta-pax-${GITHUB_RUN_ID}&_smoothingWeight=0&tagFilter=seqs_per)
EOF
) | tee $GITHUB_STEP_SUMMARY
Expand Down
5 changes: 5 additions & 0 deletions .github/workflows/_test_t5x.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@ on:
description: Extra gin args to pass to test-t5x.sh
default: ""
required: false
ARTIFACT_NAME:
type: string
description: If provided, will prepend a prefix to the artifact name. Helpful if re-running this reusable workflow to prevent clobbering of artifacts
default: ""
required: false
outputs:
TEST_STATUS:
description: 'Summary pass/fail value indicating if results from tests are acceptable'
Expand Down
Loading

0 comments on commit 5b2c1b4

Please sign in to comment.