Integration tests #832
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Integration tests for the deepjavalibrary/djl-serving docker images.
name: Integration tests

on:
  # Manual run. The optional input is forwarded by the test jobs to
  # docker_name_builder.sh as the DJL version argument.
  workflow_dispatch:
    inputs:
      djl-version:
        description: "The released version of DJL"
        required: false
        default: ""
  # Scheduled run, once a day at 15:00 (GitHub evaluates cron in UTC).
  schedule:
    - cron: "0 15 * * *"

jobs:
# Starts one GPU and one Graviton instance from the scheduler host and
# registers them as self-hosted runners for this repository. The jobs that
# `needs` this one (gpu-test, aarch64-test) target those runner labels.
create-runners:
  runs-on: [self-hosted, scheduler]
  steps:
    - name: Create new GPU instance
      id: create_gpu
      run: |
        # The default `bash -e` shell does not enable pipefail, so a failed
        # curl inside the pipeline would otherwise go unnoticed and an empty
        # or "null" token would be handed to start_instance.sh.
        set -eo pipefail
        cd /home/ubuntu/djl_benchmark_script/scripts
        # jq -r prints the raw string (no quotes); -e exits non-zero on null.
        token=$(curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
          --fail \
          | jq -er '.token')
        ./start_instance.sh action_gpu "$token" djl-serving
    - name: Create new Graviton instance
      id: create_aarch64
      run: |
        # Same failure handling as the GPU step above.
        set -eo pipefail
        cd /home/ubuntu/djl_benchmark_script/scripts
        token=$(curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
          --fail \
          | jq -er '.token')
        ./start_instance.sh action_graviton "$token" djl-serving
  # Instance ids consumed by the stop-runners job.
  # NOTE(review): the step outputs are presumably written by start_instance.sh,
  # which is not part of this repository view - confirm the output names.
  outputs:
    gpu_instance_id: ${{ steps.create_gpu.outputs.action_gpu_instance_id }}
    aarch64_instance_id: ${{ steps.create_aarch64.outputs.action_graviton_instance_id }}
# Runs the container integration tests on a GitHub-hosted runner, once per
# CPU image flavour listed in the matrix.
cpu-test:
  runs-on: ubuntu-latest
  strategy:
    matrix:
      arch: [cpu, cpu-full]
  steps:
    - uses: actions/checkout@v4
    - name: Set up JDK 17
      uses: actions/setup-java@v4
      with:
        distribution: 'corretto'
        java-version: 17
    - uses: actions/cache@v4
      with:
        path: ~/.gradle/caches
        key: ${{ runner.os }}-gradle-${{ hashFiles('**/*.gradle*') }}
        restore-keys: |
          ${{ runner.os }}-gradle-
    - name: Install DJL-Bench
      working-directory: benchmark
      run: ./gradlew installOnLinux
    # NOTE(review): later steps read $DJLSERVING_DOCKER_TAG; presumably this
    # script exports it for the rest of the job - confirm in the script.
    - name: Build container name
      run: ./serving/docker/scripts/docker_name_builder.sh ${{ matrix.arch }} ${{ github.event.inputs.djl-version }}
    - name: Download models and dockers
      working-directory: tests/integration
      run: |
        docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
        ./download_models.sh ${{ matrix.arch }}
    # The two Python-engine steps are skipped for the plain `cpu` image.
    - name: Test Python model
      if: ${{ matrix.arch != 'cpu' }}
      working-directory: tests/integration
      run: |
        ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models ${{ matrix.arch }} \
          serve -m test::Python=file:/opt/ml/model/resnet18_all_batch.zip
        ./test_client.sh image/jpg models/kitten.jpg
        ./test_client.sh tensor/ndlist 1,3,224,224
        ./test_client.sh tensor/npz 1,3,224,224
        docker rm -f $(docker ps -aq)
    - name: Test dynamic batch with Python model
      if: ${{ matrix.arch != 'cpu' }}
      working-directory: tests/integration
      run: |
        echo -en "SERVING_BATCH_SIZE=2\nSERVING_MAX_BATCH_DELAY=30000" > docker_env
        ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models ${{ matrix.arch }} \
          serve -m test::Python=file:/opt/ml/model/resnet18_all_batch.zip
        EXPECT_TIMEOUT=1 ./test_client.sh image/jpg models/kitten.jpg
        ./test_client.sh image/jpg models/kitten.jpg
        docker rm -f $(docker ps -aq)
        rm docker_env
    - name: Test PyTorch model
      working-directory: tests/integration
      run: |
        ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models ${{ matrix.arch }} \
          serve -m test::PyTorch=file:/opt/ml/model/resnet18_all_batch.zip
        ./test_client.sh image/jpg models/kitten.jpg
        docker rm -f $(docker ps -aq)
    - name: Test PyTorch model binary mode
      working-directory: tests/integration
      run: |
        ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models ${{ matrix.arch }} \
          serve -m "test::PyTorch=file:/opt/ml/model/resnet18_all_batch.zip?translatorFactory=ai.djl.translate.NoopServingTranslatorFactory&application=undefined"
        ./test_client.sh tensor/ndlist 1,3,224,224
        ./test_client.sh tensor/npz 1,3,224,224
        docker rm -f $(docker ps -aq)
    - name: Test dynamic batch with PyTorch model
      working-directory: tests/integration
      run: |
        echo -en "SERVING_BATCH_SIZE=2\nSERVING_MAX_BATCH_DELAY=30000" > docker_env
        ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models ${{ matrix.arch }} \
          serve -m test::PyTorch=file:/opt/ml/model/resnet18_all_batch.zip
        EXPECT_TIMEOUT=1 ./test_client.sh image/jpg models/kitten.jpg
        ./test_client.sh image/jpg models/kitten.jpg
        docker rm -f $(docker ps -aq)
        rm docker_env
    - name: Test MxNet model
      working-directory: tests/integration
      run: |
        ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models ${{ matrix.arch }} \
          serve -m test::MXNet=file:/opt/ml/model/ssd_resnet50.zip
        ./test_client.sh image/jpg models/kitten.jpg
        docker rm -f $(docker ps -aq)
    - name: Test ONNX model
      working-directory: tests/integration
      run: |
        ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models ${{ matrix.arch }} \
          serve -m test::OnnxRuntime=file:/opt/ml/model/resnet18-v1-7.zip
        ./test_client.sh image/jpg models/kitten.jpg
        docker rm -f $(docker ps -aq)
    - name: Test TensorFlow model binary mode
      working-directory: tests/integration
      run: |
        ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models ${{ matrix.arch }} \
          serve -m test::TensorFlow=file:/opt/ml/model/resnet50v1.zip?model_name=resnet50
        ./test_client.sh tensor/ndlist 1,224,224,3
        docker rm -f $(docker ps -aq)
    # Print the serving log into the job output when any earlier step failed.
    - name: On fail step
      if: ${{ failure() }}
      working-directory: tests/integration
      run: |
        cat logs/serving.log
    # always(): without it this step is skipped after a failure, which is
    # exactly when the logs are needed. upload-artifact v3 is deprecated; v4
    # needs a unique artifact name per job, which the matrix value provides.
    - name: Upload test logs
      if: ${{ always() }}
      uses: actions/upload-artifact@v4
      with:
        name: ${{ matrix.arch }}-logs
        path: tests/integration/logs/
# Runs the Python and PyTorch container tests against the pytorch-gpu image
# on the self-hosted GPU runner started by create-runners.
gpu-test:
  runs-on: [self-hosted, gpu]
  timeout-minutes: 30
  needs: create-runners
  steps:
    - uses: actions/checkout@v4
    - name: Clean env
      run: |
        yes | docker system prune -a --volumes
        sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
        echo "wait dpkg lock..."
        # Poll until no process holds the dpkg/apt lock files.
        while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
    - name: Set up JDK 17
      uses: actions/setup-java@v4
      with:
        distribution: 'corretto'
        java-version: 17
    - uses: actions/cache@v4
      with:
        path: ~/.gradle/caches
        key: ${{ runner.os }}-gradle-${{ hashFiles('**/*.gradle*') }}
        restore-keys: |
          ${{ runner.os }}-gradle-
    - name: Install DJL-Bench
      working-directory: benchmark
      run: ./gradlew installOnLinux
    # NOTE(review): later steps read $DJLSERVING_DOCKER_TAG; presumably this
    # script exports it for the rest of the job - confirm in the script.
    - name: Build container name
      run: ./serving/docker/scripts/docker_name_builder.sh pytorch-gpu ${{ github.event.inputs.djl-version }}
    - name: Download models and dockers
      working-directory: tests/integration
      run: |
        docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
        mkdir logs
        ./download_models.sh pytorch-gpu
    - name: Test Python model
      working-directory: tests/integration
      run: |
        ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-gpu \
          serve -m test::Python=file:/opt/ml/model/resnet18_all_batch.zip
        ./test_client.sh image/jpg models/kitten.jpg
        docker rm -f $(docker ps -aq)
    - name: Test PyTorch model
      working-directory: tests/integration
      run: |
        ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-gpu \
          serve -m test::PyTorch=file:/opt/ml/model/resnet18_all_batch.zip
        ./test_client.sh image/jpg models/kitten.jpg
        docker rm -f $(docker ps -aq)
    # Print the serving log into the job output when any earlier step failed.
    - name: On fail step
      if: ${{ failure() }}
      working-directory: tests/integration
      run: |
        cat logs/serving.log
    # always(): without it this step is skipped after a failure, which is
    # exactly when the logs are needed. upload-artifact v3 is deprecated.
    - name: Upload test logs
      if: ${{ always() }}
      uses: actions/upload-artifact@v4
      with:
        name: pytorch-gpu-logs
        path: tests/integration/logs/
# Runs the PyTorch, ONNX and telemetry tests against the aarch64 image on the
# self-hosted Graviton runner started by create-runners.
aarch64-test:
  runs-on: [self-hosted, aarch64]
  timeout-minutes: 30
  needs: create-runners
  steps:
    - uses: actions/checkout@v4
    - name: Clean env
      run: |
        yes | docker system prune -a --volumes
        sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
    - name: Set up JDK 17
      uses: actions/setup-java@v4
      with:
        distribution: 'corretto'
        java-version: 17
        architecture: aarch64
    - uses: actions/cache@v4
      with:
        path: ~/.gradle/caches
        key: ${{ runner.os }}-gradle-${{ hashFiles('**/*.gradle*') }}
        restore-keys: |
          ${{ runner.os }}-gradle-
    - name: Install DJL-Bench
      working-directory: benchmark
      run: ./gradlew installOnLinux
    # NOTE(review): later steps read $DJLSERVING_DOCKER_TAG; presumably this
    # script exports it for the rest of the job - confirm in the script.
    - name: Build container name
      run: ./serving/docker/scripts/docker_name_builder.sh aarch64 ${{ github.event.inputs.djl-version }}
    - name: Download models and dockers
      working-directory: tests/integration
      run: |
        docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
        mkdir logs
        ./download_models.sh aarch64
    - name: Test PyTorch model
      working-directory: tests/integration
      run: |
        ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models aarch64 \
          serve -m test::PyTorch=file:/opt/ml/model/resnet18_all_batch.zip
        ./test_client.sh image/jpg models/kitten.jpg
        docker rm -f $(docker ps -aq)
    - name: Test ONNX model
      working-directory: tests/integration
      run: |
        ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models aarch64 \
          serve -m test::OnnxRuntime=file:/opt/ml/model/resnet18-v1-7.zip
        ./test_client.sh image/jpg models/kitten.jpg
        docker rm -f $(docker ps -aq)
    - name: Check telemetry collection
      working-directory: tests/integration
      run: |
        ./test_telemetry.sh logs/telemetry-test
    # Print the serving log into the job output when any earlier step failed.
    - name: On fail step
      if: ${{ failure() }}
      working-directory: tests/integration
      run: |
        cat logs/serving.log
    # always(): without it this step is skipped after a failure, which is
    # exactly when the logs are needed. upload-artifact v3 is deprecated.
    - name: Upload test logs
      if: ${{ always() }}
      uses: actions/upload-artifact@v4
      with:
        name: pytorch-aarch64-logs
        path: tests/integration/logs/
# Stops the instances started by create-runners. `if: always()` makes this run
# even when the test jobs failed or were cancelled.
stop-runners:
  if: always()
  runs-on: [self-hosted, scheduler]
  needs: [create-runners, aarch64-test, gpu-test]
  steps:
    - name: Stop all instances
      run: |
        cd /home/ubuntu/djl_benchmark_script/scripts
        # Attempt every instance even if an earlier stop fails: under the
        # default `bash -e` a failing first call would otherwise abort the
        # script and leave the second instance running.
        status=0
        for instance_id in \
            "${{ needs.create-runners.outputs.gpu_instance_id }}" \
            "${{ needs.create-runners.outputs.aarch64_instance_id }}"; do
          # An empty id means create-runners did not publish one.
          if [ -z "$instance_id" ]; then
            echo "::warning::No instance id published by create-runners; skipping"
            continue
          fi
          ./stop_instance.sh "$instance_id" || status=1
        done
        # Fail the job if any instance could not be stopped.
        exit "$status"