diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml
index a8e6223d40..35a62fdb2e 100644
--- a/.github/workflows/integration.yml
+++ b/.github/workflows/integration.yml
@@ -37,237 +37,238 @@ jobs:
       gpu_instance_id: ${{ steps.create_gpu.outputs.action_gpu_instance_id }}
       aarch64_instance_id: ${{ steps.create_aarch64.outputs.action_graviton_instance_id }}

-  cpu-test:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        arch: [ cpu, cpu-full ]
-    steps:
-      - uses: actions/checkout@v3
-      - name: Set up JDK 17
-        uses: actions/setup-java@v4
-        with:
-          distribution: 'corretto'
-          java-version: 17
-      - uses: actions/cache@v3
-        with:
-          path: ~/.gradle/caches
-          key: ${{ runner.os }}-gradle-${{ hashFiles('**/*.gradle*') }}
-          restore-keys: |
-            ${{ runner.os }}-gradle-
-      - name: Install DJL-Bench
-        working-directory: benchmark
-        run: ./gradlew installOnLinux
-      - name: Build container name
-        run: ./serving/docker/scripts/docker_name_builder.sh ${{ matrix.arch }} ${{ github.event.inputs.djl-version }}
-      - name: Download models and dockers
-        working-directory: tests/integration
-        run: |
-          docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
-          ./download_models.sh ${{ matrix.arch }}
-      - name: Test Python model
-        if: ${{ matrix.arch != 'cpu' }}
-        working-directory: tests/integration
-        run: |
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models ${{ matrix.arch }} \
-          serve -m test::Python=file:/opt/ml/model/resnet18_all_batch.zip
-          ./test_client.sh image/jpg models/kitten.jpg
-          ./test_client.sh tensor/ndlist 1,3,224,224
-          ./test_client.sh tensor/npz 1,3,224,224
-          docker rm -f $(docker ps -aq)
-      - name: Test dynamic batch with Python model
-        if: ${{ matrix.arch != 'cpu' }}
-        working-directory: tests/integration
-        run: |
-          echo -en "SERVING_BATCH_SIZE=2\nSERVING_MAX_BATCH_DELAY=30000" > docker_env
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models ${{ matrix.arch }} \
-          serve -m test::Python=file:/opt/ml/model/resnet18_all_batch.zip
-          EXPECT_TIMEOUT=1 ./test_client.sh image/jpg models/kitten.jpg
-          ./test_client.sh image/jpg models/kitten.jpg
-          docker rm -f $(docker ps -aq)
-          rm docker_env
-      - name: Test PyTorch model
-        working-directory: tests/integration
-        run: |
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models ${{ matrix.arch }} \
-          serve -m test::PyTorch=file:/opt/ml/model/resnet18_all_batch.zip
-          ./test_client.sh image/jpg models/kitten.jpg
-          docker rm -f $(docker ps -aq)
-      - name: Test PyTorch model binary mode
-        working-directory: tests/integration
-        run: |
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models ${{ matrix.arch }} \
-          serve -m "test::PyTorch=file:/opt/ml/model/resnet18_all_batch.zip?translatorFactory=ai.djl.translate.NoopServingTranslatorFactory&application=undefined"
-          ./test_client.sh tensor/ndlist 1,3,224,224
-          ./test_client.sh tensor/npz 1,3,224,224
-          docker rm -f $(docker ps -aq)
-      - name: Test dynamic batch with PyTorch model
-        working-directory: tests/integration
-        run: |
-          echo -en "SERVING_BATCH_SIZE=2\nSERVING_MAX_BATCH_DELAY=30000" > docker_env
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models ${{ matrix.arch }} \
-          serve -m test::PyTorch=file:/opt/ml/model/resnet18_all_batch.zip
-          EXPECT_TIMEOUT=1 ./test_client.sh image/jpg models/kitten.jpg
-          ./test_client.sh image/jpg models/kitten.jpg
-          docker rm -f $(docker ps -aq)
-          rm docker_env
-      - name: Test MxNet model
-        working-directory: tests/integration
-        run: |
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models ${{ matrix.arch }} \
-          serve -m test::MXNet=file:/opt/ml/model/ssd_resnet50.zip
-          ./test_client.sh image/jpg models/kitten.jpg
-          docker rm -f $(docker ps -aq)
-      - name: Test ONNX model
-        working-directory: tests/integration
-        run: |
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models ${{ matrix.arch }} \
-          serve -m test::OnnxRuntime=file:/opt/ml/model/resnet18-v1-7.zip
-          ./test_client.sh image/jpg models/kitten.jpg
-          docker rm -f $(docker ps -aq)
-      - name: Test TensorFlow model binary mode
-        working-directory: tests/integration
-        run: |
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models ${{ matrix.arch }} \
-          serve -m test::TensorFlow=file:/opt/ml/model/resnet50v1.zip?model_name=resnet50
-          ./test_client.sh tensor/ndlist 1,224,224,3
-          docker rm -f $(docker ps -aq)
-      - name: On fail step
-        if: ${{ failure() }}
-        working-directory: tests/integration
-        run: |
-          cat logs/serving.log
-      - name: Upload test logs
-        uses: actions/upload-artifact@v3
-        with:
-          name: ${{ matrix.arch }}-logs
-          path: tests/integration/logs/
-
-  gpu-test:
-    runs-on: [ self-hosted, gpu ]
-    timeout-minutes: 30
-    needs: create-runners
-    steps:
-      - uses: actions/checkout@v3
-      - name: Clean env
-        run: |
-          yes | docker system prune -a --volumes
-          sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
-          echo "wait dpkg lock..."
-          while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
-      - name: Set up JDK 17
-        uses: actions/setup-java@v4
-        with:
-          distribution: 'corretto'
-          java-version: 17
-      - uses: actions/cache@v3
-        with:
-          path: ~/.gradle/caches
-          key: ${{ runner.os }}-gradle-${{ hashFiles('**/*.gradle*') }}
-          restore-keys: |
-            ${{ runner.os }}-gradle-
-      - name: Install DJL-Bench
-        working-directory: benchmark
-        run: ./gradlew installOnLinux
-      - name: Build container name
-        run: ./serving/docker/scripts/docker_name_builder.sh pytorch-gpu ${{ github.event.inputs.djl-version }}
-      - name: Download models and dockers
-        working-directory: tests/integration
-        run: |
-          docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
-          mkdir logs
-          ./download_models.sh pytorch-gpu
-      - name: Test Python model
-        working-directory: tests/integration
-        run: |
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-gpu \
-          serve -m test::Python=file:/opt/ml/model/resnet18_all_batch.zip
-          ./test_client.sh image/jpg models/kitten.jpg
-          docker rm -f $(docker ps -aq)
-      - name: Test PyTorch model
-        working-directory: tests/integration
-        run: |
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-gpu \
-          serve -m test::PyTorch=file:/opt/ml/model/resnet18_all_batch.zip
-          ./test_client.sh image/jpg models/kitten.jpg
-          docker rm -f $(docker ps -aq)
-      - name: On fail step
-        if: ${{ failure() }}
-        working-directory: tests/integration
-        run: |
-          cat logs/serving.log
-      - name: Upload test logs
-        uses: actions/upload-artifact@v3
-        with:
-          name: pytorch-gpu-logs
-          path: tests/integration/logs/
-
-  aarch64-test:
-    runs-on: [ self-hosted, aarch64 ]
-    timeout-minutes: 30
-    needs: create-runners
-    steps:
-      - uses: actions/checkout@v3
-      - name: Clean env
-        run: |
-          yes | docker system prune -a --volumes
-          sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
-      - name: Set up JDK 17
-        uses: actions/setup-java@v4
-        with:
-          distribution: 'corretto'
-          java-version: 17
-          architecture: aarch64
-      - uses: actions/cache@v3
-        with:
-          path: ~/.gradle/caches
-          key: ${{ runner.os }}-gradle-${{ hashFiles('**/*.gradle*') }}
-          restore-keys: |
-            ${{ runner.os }}-gradle-
-      - name: Install DJL-Bench
-        working-directory: benchmark
-        run: ./gradlew installOnLinux
-      - name: Build container name
-        run: ./serving/docker/scripts/docker_name_builder.sh aarch64 ${{ github.event.inputs.djl-version }}
-      - name: Download models and dockers
-        working-directory: tests/integration
-        run: |
-          docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
-          mkdir logs
-          ./download_models.sh aarch64
-      - name: Test PyTorch model
-        working-directory: tests/integration
-        run: |
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models aarch64 \
-          serve -m test::PyTorch=file:/opt/ml/model/resnet18_all_batch.zip
-          ./test_client.sh image/jpg models/kitten.jpg
-          docker rm -f $(docker ps -aq)
-      - name: Test ONNX model
-        working-directory: tests/integration
-        run: |
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models aarch64 \
-          serve -m test::OnnxRuntime=file:/opt/ml/model/resnet18-v1-7.zip
-          ./test_client.sh image/jpg models/kitten.jpg
-          docker rm -f $(docker ps -aq)
-      - name: Check telemetry collection
-        working-directory: tests/integration
-        run: |
-          ./test_telemetry.sh logs/telemetry-test
-      - name: On fail step
-        if: ${{ failure() }}
-        working-directory: tests/integration
-        run: |
-          cat logs/serving.log
-      - name: Upload test logs
-        uses: actions/upload-artifact@v3
-        with:
-          name: pytorch-aarch64-logs
-          path: tests/integration/logs/
+#  cpu-test:
+#    runs-on: ubuntu-latest
+#    strategy:
+#      matrix:
+#        arch: [ cpu, cpu-full ]
+#    steps:
+#      - uses: actions/checkout@v3
+#      - name: Set up JDK 17
+#        uses: actions/setup-java@v4
+#        with:
+#          distribution: 'corretto'
+#          java-version: 17
+#      - uses: actions/cache@v3
+#        with:
+#          path: ~/.gradle/caches
+#          key: ${{ runner.os }}-gradle-${{ hashFiles('**/*.gradle*') }}
+#          restore-keys: |
+#            ${{ runner.os }}-gradle-
+#      - name: Install DJL-Bench
+#        working-directory: benchmark
+#        run: ./gradlew installOnLinux
+#      - name: Build container name
+#        run: ./serving/docker/scripts/docker_name_builder.sh ${{ matrix.arch }} ${{ github.event.inputs.djl-version }}
+#      - name: Download models and dockers
+#        working-directory: tests/integration
+#        run: |
+#          docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
+#          ./download_models.sh ${{ matrix.arch }}
+#      - name: Test Python model
+#        if: ${{ matrix.arch != 'cpu' }}
+#        working-directory: tests/integration
+#        run: |
+#          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models ${{ matrix.arch }} \
+#          serve -m test::Python=file:/opt/ml/model/resnet18_all_batch.zip
+#          ./test_client.sh image/jpg models/kitten.jpg
+#          ./test_client.sh tensor/ndlist 1,3,224,224
+#          ./test_client.sh tensor/npz 1,3,224,224
+#          docker rm -f $(docker ps -aq)
+#      - name: Test dynamic batch with Python model
+#        if: ${{ matrix.arch != 'cpu' }}
+#        working-directory: tests/integration
+#        run: |
+#          echo -en "SERVING_BATCH_SIZE=2\nSERVING_MAX_BATCH_DELAY=30000" > docker_env
+#          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models ${{ matrix.arch }} \
+#          serve -m test::Python=file:/opt/ml/model/resnet18_all_batch.zip
+#          EXPECT_TIMEOUT=1 ./test_client.sh image/jpg models/kitten.jpg
+#          ./test_client.sh image/jpg models/kitten.jpg
+#          docker rm -f $(docker ps -aq)
+#          rm docker_env
+#      - name: Test PyTorch model
+#        working-directory: tests/integration
+#        run: |
+#          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models ${{ matrix.arch }} \
+#          serve -m test::PyTorch=file:/opt/ml/model/resnet18_all_batch.zip
+#          ./test_client.sh image/jpg models/kitten.jpg
+#          docker rm -f $(docker ps -aq)
+#      - name: Test PyTorch model binary mode
+#        working-directory: tests/integration
+#        run: |
+#          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models ${{ matrix.arch }} \
+#          serve -m "test::PyTorch=file:/opt/ml/model/resnet18_all_batch.zip?translatorFactory=ai.djl.translate.NoopServingTranslatorFactory&application=undefined"
+#          ./test_client.sh tensor/ndlist 1,3,224,224
+#          ./test_client.sh tensor/npz 1,3,224,224
+#          docker rm -f $(docker ps -aq)
+#      - name: Test dynamic batch with PyTorch model
+#        working-directory: tests/integration
+#        run: |
+#          echo -en "SERVING_BATCH_SIZE=2\nSERVING_MAX_BATCH_DELAY=30000" > docker_env
+#          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models ${{ matrix.arch }} \
+#          serve -m test::PyTorch=file:/opt/ml/model/resnet18_all_batch.zip
+#          EXPECT_TIMEOUT=1 ./test_client.sh image/jpg models/kitten.jpg
+#          ./test_client.sh image/jpg models/kitten.jpg
+#          docker rm -f $(docker ps -aq)
+#          rm docker_env
+#      - name: Test MxNet model
+#        working-directory: tests/integration
+#        run: |
+#          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models ${{ matrix.arch }} \
+#          serve -m test::MXNet=file:/opt/ml/model/ssd_resnet50.zip
+#          ./test_client.sh image/jpg models/kitten.jpg
+#          docker rm -f $(docker ps -aq)
+#      - name: Test ONNX model
+#        working-directory: tests/integration
+#        run: |
+#          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models ${{ matrix.arch }} \
+#          serve -m test::OnnxRuntime=file:/opt/ml/model/resnet18-v1-7.zip
+#          ./test_client.sh image/jpg models/kitten.jpg
+#          docker rm -f $(docker ps -aq)
+#      - name: Test TensorFlow model binary mode
+#        working-directory: tests/integration
+#        run: |
+#          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models ${{ matrix.arch }} \
+#          serve -m test::TensorFlow=file:/opt/ml/model/resnet50v1.zip?model_name=resnet50
+#          ./test_client.sh tensor/ndlist 1,224,224,3
+#          docker rm -f $(docker ps -aq)
+#      - name: On fail step
+#        if: ${{ failure() }}
+#        working-directory: tests/integration
+#        run: |
+#          cat logs/serving.log
+#      - name: Upload test logs
+#        uses: actions/upload-artifact@v3
+#        with:
+#          name: ${{ matrix.arch }}-logs
+#          path: tests/integration/logs/
+#
+#  gpu-test:
+#    runs-on: [ self-hosted, gpu ]
+#    timeout-minutes: 30
+#    needs: create-runners
+#    steps:
+#      - uses: actions/checkout@v3
+#      - name: Clean env
+#        run: |
+#          yes | docker system prune -a --volumes
+#          sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
+#          echo "wait dpkg lock..."
+#          while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
+#      - name: Set up JDK 17
+#        uses: actions/setup-java@v4
+#        with:
+#          distribution: 'corretto'
+#          java-version: 17
+#      - uses: actions/cache@v3
+#        with:
+#          path: ~/.gradle/caches
+#          key: ${{ runner.os }}-gradle-${{ hashFiles('**/*.gradle*') }}
+#          restore-keys: |
+#            ${{ runner.os }}-gradle-
+#      - name: Install DJL-Bench
+#        working-directory: benchmark
+#        run: ./gradlew installOnLinux
+#      - name: Build container name
+#        run: ./serving/docker/scripts/docker_name_builder.sh pytorch-gpu ${{ github.event.inputs.djl-version }}
+#      - name: Download models and dockers
+#        working-directory: tests/integration
+#        run: |
+#          docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
+#          mkdir logs
+#          ./download_models.sh pytorch-gpu
+#      - name: Test Python model
+#        working-directory: tests/integration
+#        run: |
+#          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-gpu \
+#          serve -m test::Python=file:/opt/ml/model/resnet18_all_batch.zip
+#          ./test_client.sh image/jpg models/kitten.jpg
+#          docker rm -f $(docker ps -aq)
+#      - name: Test PyTorch model
+#        working-directory: tests/integration
+#        run: |
+#          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-gpu \
+#          serve -m test::PyTorch=file:/opt/ml/model/resnet18_all_batch.zip
+#          ./test_client.sh image/jpg models/kitten.jpg
+#          docker rm -f $(docker ps -aq)
+#      - name: On fail step
+#        if: ${{ failure() }}
+#        working-directory: tests/integration
+#        run: |
+#          cat logs/serving.log
+#      - name: Upload test logs
+#        uses: actions/upload-artifact@v3
+#        with:
+#          name: pytorch-gpu-logs
+#          path: tests/integration/logs/
+#
+#  aarch64-test:
+#    runs-on: [ self-hosted, aarch64 ]
+#    timeout-minutes: 30
+#    needs: create-runners
+#    steps:
+#      - uses: actions/checkout@v3
+#      - name: Clean env
+#        run: |
+#          yes | docker system prune -a --volumes
+#          sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
+#      - name: Set up JDK 17
+#        uses: actions/setup-java@v4
+#        with:
+#          distribution: 'corretto'
+#          java-version: 17
+#          architecture: aarch64
+#      - uses: actions/cache@v3
+#        with:
+#          path: ~/.gradle/caches
+#          key: ${{ runner.os }}-gradle-${{ hashFiles('**/*.gradle*') }}
+#          restore-keys: |
+#            ${{ runner.os }}-gradle-
+#      - name: Install DJL-Bench
+#        working-directory: benchmark
+#        run: ./gradlew installOnLinux
+#      - name: Build container name
+#        run: ./serving/docker/scripts/docker_name_builder.sh aarch64 ${{ github.event.inputs.djl-version }}
+#      - name: Download models and dockers
+#        working-directory: tests/integration
+#        run: |
+#          docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
+#          mkdir logs
+#          ./download_models.sh aarch64
+#      - name: Test PyTorch model
+#        working-directory: tests/integration
+#        run: |
+#          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models aarch64 \
+#          serve -m test::PyTorch=file:/opt/ml/model/resnet18_all_batch.zip
+#          ./test_client.sh image/jpg models/kitten.jpg
+#          docker rm -f $(docker ps -aq)
+#      - name: Test ONNX model
+#        working-directory: tests/integration
+#        run: |
+#          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models aarch64 \
+#          serve -m test::OnnxRuntime=file:/opt/ml/model/resnet18-v1-7.zip
+#          ./test_client.sh image/jpg models/kitten.jpg
+#          docker rm -f $(docker ps -aq)
+#      - name: Check telemetry collection
+#        working-directory: tests/integration
+#        run: |
+#          ./test_telemetry.sh logs/telemetry-test
+#      - name: On fail step
+#        if: ${{ failure() }}
+#        working-directory: tests/integration
+#        run: |
+#          cat logs/serving.log
+#      - name: Upload test logs
+#        uses: actions/upload-artifact@v3
+#        with:
+#          name: pytorch-aarch64-logs
+#          path: tests/integration/logs/

   stop-runners:
     if: always()
     runs-on: [ self-hosted, scheduler ]
-    needs: [ create-runners, aarch64-test, gpu-test ]
+    #needs: [ create-runners, aarch64-test, gpu-test ]
+    needs: [ create-runners ]
     steps:
       - name: Stop all instances
         run: |
@@ -276,3 +277,10 @@ jobs:
           ./stop_instance.sh $instance_id
           instance_id=${{ needs.create-runners.outputs.aarch64_instance_id }}
           ./stop_instance.sh $instance_id
+
+  publish-success-metric:
+    if: always()
+    needs: [ stop-runners ]
+    uses: ./.github/workflows/publish-job-success.yml
+    with:
+      metric-name: "DJLServing-IntegrationTests-siddhave-test"
\ No newline at end of file
diff --git a/.github/workflows/publish-job-success.yml b/.github/workflows/publish-job-success.yml
new file mode 100644
index 0000000000..10250fa822
--- /dev/null
+++ b/.github/workflows/publish-job-success.yml
@@ -0,0 +1,27 @@
+name: Publish Job Success Metric to CloudWatch
+
+on:
+  workflow_call:
+    inputs:
+      metric-name:
+        description: "The name of the job to publish a metric for"
+        type: string
+        required: true
+
+jobs:
+  publish-job-success-to-cloudwatch:
+    runs-on: [ self-hosted, scheduler ]
+    steps:
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v2
+        with:
+          aws-region: us-west-2
+      - name: Publish Job Success Metric
+        run: |
+          [[ "${{ job.status }}" == "success" ]]
+          failedBuild=$?  # 0 when job.status is "success", 1 otherwise
+          echo ${{ inputs.metric-name }}
+          aws cloudwatch put-metric-data --namespace GithubCI \
+          --metric-name ${{ inputs.metric-name }} \
+          --value $failedBuild \
+          --unit Count
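
For reference, here is a minimal sketch of how another workflow could call the reusable publish-job-success.yml added above. It mirrors the publish-success-metric job in integration.yml; the trigger, the integration-tests job, the file name example-caller.yml, and the metric name are illustrative assumptions, not part of this patch:

    # example-caller.yml (hypothetical)
    name: Example caller
    on:
      workflow_dispatch:

    jobs:
      integration-tests:
        runs-on: ubuntu-latest
        steps:
          # Placeholder for the real test steps.
          - run: echo "run the integration tests here"

      publish-success-metric:
        # Run even if the upstream job failed, so a metric is always emitted.
        if: always()
        needs: [ integration-tests ]
        uses: ./.github/workflows/publish-job-success.yml
        with:
          metric-name: "ExampleWorkflow-IntegrationTests"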