Self-hosted runner (nightly) #818
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Self-hosted runner (nightly) | |
# Note that each job's dependencies go into a corresponding docker file. | |
# | |
# For example for `run_all_tests_torch_cuda_extensions_gpu` the docker image is | |
# `huggingface/transformers-pytorch-deepspeed-latest-gpu`, which can be found at | |
# `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile` | |
on: | |
repository_dispatch: | |
schedule: | |
- cron: "0 16 * * *" | |
env: | |
HF_HOME: /mnt/cache | |
TRANSFORMERS_IS_CI: yes | |
OMP_NUM_THREADS: 8 | |
MKL_NUM_THREADS: 8 | |
RUN_SLOW: yes | |
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} | |
TF_FORCE_GPU_ALLOW_GROWTH: true | |
RUN_PT_TF_CROSS_TESTS: 1 | |
jobs: | |
setup: | |
name: Setup | |
strategy: | |
matrix: | |
machine_type: [single-gpu, multi-gpu] | |
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} | |
container: | |
image: huggingface/transformers-all-latest-torch-nightly-gpu | |
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | |
outputs: | |
matrix: ${{ steps.set-matrix.outputs.matrix }} | |
steps: | |
- name: Update clone | |
working-directory: /transformers | |
run: | | |
git fetch && git checkout ${{ github.sha }} | |
- name: Cleanup | |
working-directory: /transformers | |
run: | | |
rm -rf tests/__pycache__ | |
rm -rf tests/models/__pycache__ | |
rm -rf reports | |
- id: set-matrix | |
name: Identify models to test | |
working-directory: /transformers/tests | |
run: | | |
echo "::set-output name=matrix::$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')" | |
- name: NVIDIA-SMI | |
run: | | |
nvidia-smi | |
run_tests_single_gpu: | |
name: Model tests | |
strategy: | |
fail-fast: false | |
matrix: | |
folders: ${{ fromJson(needs.setup.outputs.matrix) }} | |
machine_type: [single-gpu] | |
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} | |
container: | |
image: huggingface/transformers-all-latest-torch-nightly-gpu | |
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | |
needs: setup | |
steps: | |
- name: Echo folder ${{ matrix.folders }} | |
shell: bash | |
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to | |
# set the artifact folder names (because the character `/` is not allowed). | |
run: | | |
echo "${{ matrix.folders }}" | |
matrix_folders=${{ matrix.folders }} | |
matrix_folders=${matrix_folders/'models/'/'models_'} | |
echo "$matrix_folders" | |
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV | |
- name: Update clone | |
working-directory: /transformers | |
run: git fetch && git checkout ${{ github.sha }} | |
- name: NVIDIA-SMI | |
run: | | |
nvidia-smi | |
- name: Environment | |
working-directory: /transformers | |
run: | | |
python3 utils/print_env.py | |
- name: Run all tests on GPU | |
working-directory: /transformers | |
run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} | |
- name: Failure short reports | |
if: ${{ failure() }} | |
continue-on-error: true | |
run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt | |
- name: Test suite reports artifacts | |
if: ${{ always() }} | |
uses: actions/upload-artifact@v2 | |
with: | |
name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports | |
path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} | |
run_tests_multi_gpu: | |
name: Model tests | |
strategy: | |
fail-fast: false | |
matrix: | |
folders: ${{ fromJson(needs.setup.outputs.matrix) }} | |
machine_type: [multi-gpu] | |
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} | |
container: | |
image: huggingface/transformers-all-latest-torch-nightly-gpu | |
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | |
needs: setup | |
steps: | |
- name: Echo folder ${{ matrix.folders }} | |
shell: bash | |
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to | |
# set the artifact folder names (because the character `/` is not allowed). | |
run: | | |
echo "${{ matrix.folders }}" | |
matrix_folders=${{ matrix.folders }} | |
matrix_folders=${matrix_folders/'models/'/'models_'} | |
echo "$matrix_folders" | |
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV | |
- name: Update clone | |
working-directory: /transformers | |
run: git fetch && git checkout ${{ github.sha }} | |
- name: NVIDIA-SMI | |
run: | | |
nvidia-smi | |
- name: Environment | |
working-directory: /transformers | |
run: | | |
python3 utils/print_env.py | |
- name: Run all tests on GPU | |
working-directory: /transformers | |
run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} | |
- name: Failure short reports | |
if: ${{ failure() }} | |
continue-on-error: true | |
run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt | |
- name: Test suite reports artifacts | |
if: ${{ always() }} | |
uses: actions/upload-artifact@v2 | |
with: | |
name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports | |
path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} | |
run_all_tests_torch_cuda_extensions_gpu: | |
name: Torch CUDA extension tests | |
strategy: | |
fail-fast: false | |
matrix: | |
machine_type: [single-gpu, multi-gpu] | |
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} | |
needs: setup | |
container: | |
image: huggingface/transformers-pytorch-deepspeed-nightly-gpu | |
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | |
steps: | |
- name: Update clone | |
working-directory: /workspace/transformers | |
run: git fetch && git checkout ${{ github.sha }} | |
# To avoid unknown test failures | |
- name: Pre build DeepSpeed *again* | |
working-directory: /workspace | |
run: | | |
python3 -m pip uninstall -y deepspeed | |
rm -rf DeepSpeed | |
git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build | |
DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check | |
- name: NVIDIA-SMI | |
run: | | |
nvidia-smi | |
- name: Environment | |
working-directory: /workspace/transformers | |
run: | | |
python utils/print_env.py | |
- name: Run all tests on GPU | |
working-directory: /workspace/transformers | |
run: | | |
python -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended | |
- name: Failure short reports | |
if: ${{ failure() }} | |
continue-on-error: true | |
run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt | |
- name: Test suite reports artifacts | |
if: ${{ always() }} | |
uses: actions/upload-artifact@v2 | |
with: | |
name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports | |
path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu | |
send_results: | |
name: Send results to webhook | |
runs-on: ubuntu-latest | |
if: always() | |
needs: [setup, run_tests_single_gpu, run_tests_multi_gpu, run_all_tests_torch_cuda_extensions_gpu] | |
steps: | |
- uses: actions/checkout@v2 | |
- uses: actions/download-artifact@v2 | |
- name: Send message to Slack | |
env: | |
CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }} | |
CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }} | |
CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} | |
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} | |
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }} | |
CI_EVENT: nightly-build | |
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change | |
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`. | |
run: | | |
pip install slack_sdk | |
python utils/notification_service.py "${{ needs.setup.outputs.matrix }}" |