# Non-regression tests #34
# Notice captured from the GitHub file viewer (not part of the workflow): this file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Workflow header: display name, triggers and concurrency policy.
name: Non-regression tests

on:
  # Allows the workflow to be started manually.
  workflow_dispatch:
  schedule:
    # Sunday to Friday at 21:00 UTC (10pm CET in winter, 11pm CEST in summer).
    - cron: '0 21 * * 0-5'
    # Saturday at 21:00 UTC, i.e. the same time of day as the schedule above.
    # The albert-xxl-single-card job compares `github.event.schedule` against
    # this exact string, so the two must be changed together.
    - cron: '0 21 * * 6'

# All runs of this workflow share one concurrency group. `cancel-in-progress`
# is not set, so a newly triggered run waits instead of running in parallel
# with the one already in progress.
concurrency:
  group: ${{ github.workflow }}
# Job graph:
#   start-runner -> example-diff -> stable-diffusion -> deepspeed -> multi-card
#   -> single-card -> albert-xxl-single-card -> text-generation -> stop-runner
#
# Every test job targets the runner label produced by start-runner and is
# chained to the previous one through `needs`, so the suites run one at a time.
# From stable-diffusion onwards the test jobs use
# `if: !cancelled() && (success() || failure())`, so they still run after an
# earlier job has failed, but not once the run has been cancelled.
jobs:
  # Starts the self-hosted EC2 runner that all test jobs execute on.
  start-runner:
    name: Start self-hosted EC2 runner
    runs-on: ubuntu-22.04
    env:
      AWS_REGION: us-west-2
      EC2_AMI_ID: ami-03549026a9aa06f99
      EC2_INSTANCE_TYPE: dl1.24xlarge
      EC2_SUBNET_ID: subnet-452c913d
      EC2_SECURITY_GROUP: sg-0894f4f70dd6bd778
    # Exposed to the other jobs: `label` is used as their `runs-on`, and both
    # values are handed back to the action in stop-runner.
    outputs:
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ env.AWS_REGION }}
      - name: Start EC2 runner
        id: start-ec2-runner
        uses: philschmid/philschmid-ec2-github-runner@main
        with:
          mode: start
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          ec2-image-id: ${{ env.EC2_AMI_ID }}
          ec2-instance-type: ${{ env.EC2_INSTANCE_TYPE }}
          subnet-id: ${{ env.EC2_SUBNET_ID }}
          security-group-id: ${{ env.EC2_SECURITY_GROUP }}
          aws-resource-tags: >  # optional, requires additional permissions
            [
              {"Key": "Name", "Value": "optimum-habana-ci-slow-tests"},
              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
            ]

  # Runs tests/ci/example_diff_tests.sh inside the Habana PyTorch container.
  example-diff:
    name: Test examples differences
    needs: start-runner  # required to start the main job when the runner is ready
    runs-on: ${{ needs.start-runner.outputs.label }}  # run the job on the newly created runner
    env:
      AWS_REGION: us-west-2
    steps:
      - name: Checkout
        uses: actions/checkout@v2
      - name: Pull image
        run: |
          docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
      - name: Run tests
        # The checked-out repository is mounted at /root/workspace and used as
        # the working directory of the container.
        run: |
          docker run \
          -v "$PWD":/root/workspace \
          --workdir=/root/workspace \
          --runtime=habana \
          -e HABANA_VISIBLE_DEVICES=all \
          -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
          --cap-add=sys_nice \
          --net=host \
          --ipc=host \
          vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \
          /bin/bash tests/ci/example_diff_tests.sh

  # Runs tests/ci/slow_tests_diffusers.sh inside the Habana PyTorch container.
  stable-diffusion:
    name: Test Stable Diffusion
    if: ${{ !cancelled() && (success() || failure()) }}
    needs:
      - start-runner
      - example-diff  # run the job when the previous test job is done
    runs-on: ${{ needs.start-runner.outputs.label }}  # run the job on the newly created runner
    env:
      AWS_REGION: us-west-2
    steps:
      - name: Checkout
        uses: actions/checkout@v2
      - name: Pull image
        run: |
          docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
      - name: Run tests
        run: |
          docker run \
          -v "$PWD":/root/workspace \
          --workdir=/root/workspace \
          --runtime=habana \
          -e HABANA_VISIBLE_DEVICES=all \
          -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
          --cap-add=sys_nice \
          --net=host \
          --ipc=host \
          vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \
          /bin/bash tests/ci/slow_tests_diffusers.sh

  # Runs tests/ci/slow_tests_deepspeed.sh inside the Habana PyTorch container.
  deepspeed:
    name: Test DeepSpeed models
    if: ${{ !cancelled() && (success() || failure()) }}
    needs:
      - start-runner
      - example-diff
      - stable-diffusion  # run the job when the previous test job is done
    runs-on: ${{ needs.start-runner.outputs.label }}  # run the job on the newly created runner
    env:
      AWS_REGION: us-west-2
    steps:
      - name: Checkout
        uses: actions/checkout@v2
      - name: Pull image
        run: |
          docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
      - name: Run tests
        run: |
          docker run \
          -v "$PWD":/root/workspace \
          --workdir=/root/workspace \
          --runtime=habana \
          -e HABANA_VISIBLE_DEVICES=all \
          -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
          --cap-add=sys_nice \
          --net=host \
          --ipc=host \
          vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \
          /bin/bash tests/ci/slow_tests_deepspeed.sh

  # Runs tests/ci/slow_tests_8x.sh inside the Habana PyTorch container.
  multi-card:
    name: Test multi-card models
    if: ${{ !cancelled() && (success() || failure()) }}
    needs:
      - start-runner
      - example-diff
      # Already an ancestor through deepspeed; listed so that every job names
      # the full chain of test jobs that precede it.
      - stable-diffusion
      - deepspeed  # run the job when the previous test job is done
    runs-on: ${{ needs.start-runner.outputs.label }}  # run the job on the newly created runner
    env:
      AWS_REGION: us-west-2
    steps:
      - name: Checkout
        uses: actions/checkout@v2
      - name: Pull image
        run: |
          docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
      - name: Run tests
        run: |
          docker run \
          -v "$PWD":/root/workspace \
          --workdir=/root/workspace \
          --runtime=habana \
          -e HABANA_VISIBLE_DEVICES=all \
          -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
          --cap-add=sys_nice \
          --net=host \
          --ipc=host \
          vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \
          /bin/bash tests/ci/slow_tests_8x.sh

  # Runs tests/ci/slow_tests_1x.sh inside the Habana PyTorch container.
  single-card:
    name: Test single-card models
    if: ${{ !cancelled() && (success() || failure()) }}
    needs:
      - start-runner
      - example-diff
      - stable-diffusion
      - deepspeed
      - multi-card  # run the job when the previous test jobs are done
    runs-on: ${{ needs.start-runner.outputs.label }}  # run the job on the newly created runner
    env:
      AWS_REGION: us-west-2
    steps:
      - name: Checkout
        uses: actions/checkout@v2
      - name: Pull image
        run: |
          docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
      - name: Run tests
        run: |
          docker run \
          -v "$PWD":/root/workspace \
          --workdir=/root/workspace \
          --runtime=habana \
          -e HABANA_VISIBLE_DEVICES=all \
          -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
          --cap-add=sys_nice \
          --net=host \
          --ipc=host \
          vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \
          /bin/bash tests/ci/slow_tests_1x.sh

  # Runs tests/ci/albert_xxl_1x.sh, but only when the run was triggered by the
  # Saturday schedule. The job itself always runs; its steps are gated on
  # `github.event.schedule` matching the Saturday cron string, and any other
  # trigger only prints a notice.
  albert-xxl-single-card:
    name: Test single-card ALBERT XXL
    if: ${{ !cancelled() && (success() || failure()) }}
    needs:
      - start-runner
      - example-diff
      - stable-diffusion
      - deepspeed
      - multi-card
      - single-card  # run the job when the previous test jobs are done
    runs-on: ${{ needs.start-runner.outputs.label }}  # run the job on the newly created runner
    env:
      AWS_REGION: us-west-2
    steps:
      - name: Checkout
        if: github.event.schedule == '0 21 * * 6'
        uses: actions/checkout@v2
      - name: Pull image
        if: github.event.schedule == '0 21 * * 6'
        run: |
          docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
      - name: Run test
        if: github.event.schedule == '0 21 * * 6'
        run: |
          docker run \
          -v "$PWD":/root/workspace \
          --workdir=/root/workspace \
          --runtime=habana \
          -e HABANA_VISIBLE_DEVICES=all \
          -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
          --cap-add=sys_nice \
          --net=host \
          --ipc=host \
          vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \
          /bin/bash tests/ci/albert_xxl_1x.sh
      - name: Warning
        if: github.event.schedule != '0 21 * * 6'
        run: echo "ALBERT XXL 1x is only tested on Saturdays."

  # Runs `make slow_tests_text_generation_example` inside the Habana PyTorch
  # container, passing a Hub token as the make variable TOKEN.
  text-generation:
    name: Test text-generation example
    if: ${{ !cancelled() && (success() || failure()) }}
    needs:
      - start-runner
      - example-diff
      - stable-diffusion
      - deepspeed
      - multi-card
      - single-card
      - albert-xxl-single-card  # run the job when the previous test jobs are done
    runs-on: ${{ needs.start-runner.outputs.label }}  # run the job on the newly created runner
    env:
      AWS_REGION: us-west-2
    steps:
      - name: Checkout
        uses: actions/checkout@v2
      - name: Pull image
        run: |
          docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
      - name: Run tests
        env:
          # The secret is handed to the shell as an environment variable and
          # expanded quoted below, instead of being substituted into the script
          # text where its characters would be parsed as shell syntax.
          HUB_TOKEN: ${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }}
        run: |
          docker run \
          -v "$PWD":/root/workspace \
          --workdir=/root/workspace \
          --runtime=habana \
          -e HABANA_VISIBLE_DEVICES=all \
          -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
          --cap-add=sys_nice \
          --net=host \
          --ipc=host \
          vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \
          make slow_tests_text_generation_example TOKEN="$HUB_TOKEN"

  # Stops the EC2 runner once every test job has finished.
  stop-runner:
    name: Stop self-hosted EC2 runner
    needs:
      - start-runner  # required to get output from the start-runner job
      - example-diff
      - stable-diffusion
      - deepspeed
      - multi-card
      - single-card
      - albert-xxl-single-card
      - text-generation
    runs-on: ubuntu-22.04
    env:
      AWS_REGION: us-west-2
    if: ${{ always() }}  # required to stop the runner even if the error happened in the previous jobs
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ env.AWS_REGION }}
      - name: Stop EC2 runner
        uses: philschmid/philschmid-ec2-github-runner@main
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}