From cd8750d569c67887b5c98536ad0dc75e60f82c50 Mon Sep 17 00:00:00 2001
From: Vladislav
Date: Thu, 15 Feb 2024 14:59:13 -0700
Subject: [PATCH 1/2] Check status of SLURM job

---
Reviewer note (ignored by `git am`): line breaks and indentation in this
series were reconstructed from a flattened copy. Hunk line counts match the
`@@` headers, but verify leading whitespace against the target tree (or
apply with `git apply --ignore-whitespace`) before merging.

 .github/workflows/_runner_ondemand_slurm.yaml | 18 +++++++
 .github/workflows/_sandbox.yaml               | 51 +++++++++----------
 2 files changed, 43 insertions(+), 26 deletions(-)

diff --git a/.github/workflows/_runner_ondemand_slurm.yaml b/.github/workflows/_runner_ondemand_slurm.yaml
index b80ce078c..b3b8e836c 100644
--- a/.github/workflows/_runner_ondemand_slurm.yaml
+++ b/.github/workflows/_runner_ondemand_slurm.yaml
@@ -81,6 +81,7 @@ jobs:
 
           # launch runner
           time docker run \
+            --name ${{steps.meta.outputs.JOB_NAME }} \
             --network host \
             --gpus all \
             --privileged \
@@ -98,9 +99,26 @@ jobs:
           echo "SLURM_JOB_ID=$(cat ${SLURM_JOB_ID_FILE})" >> $GITHUB_OUTPUT
 
       - name: Wait for SLURM job to complete
+        id: job-completion
         shell: bash -x -e {0}
         run: |
           while ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} squeue -j${{ steps.submit.outputs.SLURM_JOB_ID }} > /dev/null 2>&1; do echo "wait"; sleep 15; done
+          SLURM_JOB_COMPLETION_STATUS_FILE=$(mktemp)
+          ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} >${SLURM_JOB_COMPLETION_STATUS_FILE} \
+            sacct -XP -j ${{ steps.submit.outputs.SLURM_JOB_ID }}
+
+          echo "JOB COMPLETION STATUS"
+          cat ${SLURM_JOB_COMPLETION_STATUS_FILE}
+
+          SLURM_JOB_COMPLETION_STATE_FILE=$(mktemp)
+          ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} >${SLURM_JOB_COMPLETION_STATE_FILE} \
+            sacct -XP -j ${{ steps.submit.outputs.SLURM_JOB_ID }} -o State
+
+          SLURM_JOB_COMPLETION_STATE=$(cat ${SLURM_JOB_COMPLETION_STATE_FILE} | tail -1)
+
+          if [[ ${SLURM_JOB_COMPLETION_STATE} != "COMPLETED" ]]; then
+            exit 1
+          fi
 
       - name: Remove orphaned SLURM job if the CI job is canceled
         if: cancelled()
diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml
index 7b90b72ca..24179d340 100644
--- a/.github/workflows/_sandbox.yaml
+++ b/.github/workflows/_sandbox.yaml
@@ -4,9 +4,28 @@ on:
   workflow_dispatch:
 
 jobs:
-  sandbox:
-    runs-on: ubuntu-22.04
+  runner:
+    uses: ./.github/workflows/_runner_ondemand_slurm.yaml
+    with:
+      NAME: "A100-${{ github.run_id }}"
+      LABELS: "A100:${{ github.run_id }}"
+      TIME: "01:00:00"
+    secrets: inherit
+
+  test:
+    strategy:
+      fail-fast: false
+      matrix:
+        GPU_ARCH: [A100]
+    # ensures A100 job lands on dedicated runner for this particular job
+    runs-on: [self-hosted, "${{ matrix.GPU_ARCH == 'A100' && format('{0}:{1}', matrix.GPU_ARCH, github.run_id) || matrix.GPU_ARCH }}"]
     steps:
+      - name: Print GPU information
+        run: nvidia-smi
+
+      - name: Check out repository
+        uses: actions/checkout@v4
+
       - name: Login to GitHub Container Registry
         uses: docker/login-action@v3
         with:
@@ -14,28 +33,8 @@ jobs:
           username: ${{ github.repository_owner }}
           password: ${{ secrets.GITHUB_TOKEN }}
 
-      - name: Print usage
+      - name: Run tests
+        shell: bash -x -e {0}
+        continue-on-error: true
         run: |
-          cat << EOF
-          This is an empty workflow file located in the main branch of your
-          repository. It serves as a testing ground for new GitHub Actions on
-          development branches before merging them to the main branch. By
-          defining and overloading this workflow on your development branch,
-          you can test new actions without affecting your main branch, ensuring
-          a smooth integration process once the changes are ready to be merged.
-
-          Usage:
-
-          1. In your development branch, modify the sandbox.yml workflow file
-             to include the new actions you want to test. Make sure to commit
-             the changes to the development branch.
-          2. Navigate to the 'Actions' tab in your repository, select the
-             '~Sandbox' workflow, and choose your development branch from the
-             branch dropdown menu. Click on 'Run workflow' to trigger the
-             workflow on your development branch.
-          3. Once you have tested and verified the new actions in the Sandbox
-             workflow, you can incorporate them into your main workflow(s) and
-             merge the development branch into the main branch. Remember to
-             revert the changes to the sandbox.yml file in the main branch to
-             keep it empty for future testing.
-          EOF
+          docker run "ubuntu:22.04" bash -ec "sleep 1; exit 0"

From 7e8e0849836fbbb0cc1754a2909ef9d1ae0b9e6a Mon Sep 17 00:00:00 2001
From: Vladislav
Date: Thu, 15 Feb 2024 21:54:15 -0700
Subject: [PATCH 2/2] Address Yu-Hang comments

---
Reviewer note (ignored by `git am`): this patch removes
SLURM_JOB_COMPLETION_STATE_FILE, so the job state must be parsed from
SLURM_JOB_COMPLETION_STATUS_FILE. With the `-o` list requested below, State
is the 6th `|`-separated field of the last sacct output line. The added
line previously still read the removed STATE_FILE variable, which expands
to nothing and makes `cat` read stdin instead of the sacct output. Because
that added line changed, the post-image blob id (7ebde5093) in the `index`
line below no longer matches exactly; it is advisory only.

 .github/workflows/_runner_ondemand_slurm.yaml | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/_runner_ondemand_slurm.yaml b/.github/workflows/_runner_ondemand_slurm.yaml
index b3b8e836c..7ebde5093 100644
--- a/.github/workflows/_runner_ondemand_slurm.yaml
+++ b/.github/workflows/_runner_ondemand_slurm.yaml
@@ -99,23 +99,17 @@ jobs:
           echo "SLURM_JOB_ID=$(cat ${SLURM_JOB_ID_FILE})" >> $GITHUB_OUTPUT
 
       - name: Wait for SLURM job to complete
-        id: job-completion
         shell: bash -x -e {0}
         run: |
           while ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} squeue -j${{ steps.submit.outputs.SLURM_JOB_ID }} > /dev/null 2>&1; do echo "wait"; sleep 15; done
           SLURM_JOB_COMPLETION_STATUS_FILE=$(mktemp)
           ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} >${SLURM_JOB_COMPLETION_STATUS_FILE} \
-            sacct -XP -j ${{ steps.submit.outputs.SLURM_JOB_ID }}
+            sacct -XP -j ${{ steps.submit.outputs.SLURM_JOB_ID }} -o JobID,JobName,Partition,Account,AllocCPUS,State,ExitCode
 
           echo "JOB COMPLETION STATUS"
           cat ${SLURM_JOB_COMPLETION_STATUS_FILE}
 
-          SLURM_JOB_COMPLETION_STATE_FILE=$(mktemp)
-          ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} >${SLURM_JOB_COMPLETION_STATE_FILE} \
-            sacct -XP -j ${{ steps.submit.outputs.SLURM_JOB_ID }} -o State
-
-          SLURM_JOB_COMPLETION_STATE=$(cat ${SLURM_JOB_COMPLETION_STATE_FILE} | tail -1)
-
+          SLURM_JOB_COMPLETION_STATE=$( cat ${SLURM_JOB_COMPLETION_STATUS_FILE} | tail -1 | cut -d "|" -f 6 - )
           if [[ ${SLURM_JOB_COMPLETION_STATE} != "COMPLETED" ]]; then
             exit 1
           fi