From 17f1fc9af43fa87e7170c63503e0a6171e28d8df Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Sat, 11 May 2024 19:59:31 +0000
Subject: [PATCH] Deployed 5ed72329 to master with MkDocs 1.6.0 and mike 2.1.1
---
.../modelserving/v1beta1/llm/vllm/index.html | 4 +-
master/search/search_index.json | 2 +-
master/sitemap.xml | 344 +++++++++---------
master/sitemap.xml.gz | Bin 1722 -> 1722 bytes
4 files changed, 175 insertions(+), 175 deletions(-)
diff --git a/master/modelserving/v1beta1/llm/vllm/index.html b/master/modelserving/v1beta1/llm/vllm/index.html
index 8aee8a450..94687965f 100644
--- a/master/modelserving/v1beta1/llm/vllm/index.html
+++ b/master/modelserving/v1beta1/llm/vllm/index.html
@@ -1178,7 +1178,7 @@ Deploy the LLaMA model with vL
command :
- python3
- -m
- - vllm.entrypoints.api_server
+ - vllm.entrypoints.openai.api_server
env :
- name : STORAGE_URI
value : gs://kfserving-examples/llm/huggingface/llama
@@ -1217,7 +1217,7 @@ Benchmarking vLLM Runtime
to find out your ingress IP and port.
python benchmark.py --backend vllm --port ${ INGRESS_PORT } --host ${ INGRESS_HOST } --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer ./tokenizer --request-rate 5
+python benchmark_serving.py --backend openai --port ${ INGRESS_PORT } --host ${ INGRESS_HOST } --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer ./tokenizer --request-rate 5
Expected Output
diff --git a/master/search/search_index.json b/master/search/search_index.json
index 1ccc6cd58..eb4372811 100644
--- a/master/search/search_index.json
+++ b/master/search/search_index.json
@@ -1 +1 @@
-{"config":{"indexing":"full","lang":["en"],"min_search_length":3,"prebuild_index":false,"separator":"[\\s\\-]+"},"docs":[{"location":"","text":"","title":"Home"},{"location":"admin/kubernetes_deployment/","text":"Kubernetes Deployment Installation Guide \u00b6 KServe supports RawDeployment mode to enable InferenceService deployment with Kubernetes resources Deployment , Service , Ingress and Horizontal Pod Autoscaler . Comparing to serverless deployment it unlocks Knative limitations such as mounting multiple volumes, on the other hand Scale down and from Zero is not supported in RawDeployment mode. Kubernetes 1.22 is the minimally required version and please check the following recommended Istio versions for the corresponding Kubernetes version. Recommended Version Matrix \u00b6 Kubernetes Version Recommended Istio Version 1.27 1.18, 1.19 1.28 1.19, 1.20 1.29 1.20, 1.21 1. Install Istio \u00b6 The minimally required Istio version is 1.13 and you can refer to the Istio install guide . Once Istio is installed, create IngressClass resource for istio. apiVersion : networking.k8s.io/v1 kind : IngressClass metadata : name : istio spec : controller : istio.io/ingress-controller Note Istio ingress is recommended, but you can choose to install with other Ingress controllers and create IngressClass resource for your Ingress option. 2. Install Cert Manager \u00b6 The minimally required Cert Manager version is 1.9.0 and you can refer to Cert Manager installation guide . Note Cert manager is required to provision webhook certs for production grade installation, alternatively you can run self signed certs generation script. 3. Install KServe \u00b6 Note The default KServe deployment mode is Serverless which depends on Knative. The following step changes the default deployment mode to RawDeployment before installing KServe. i. 
Install KServe kubectl kubectl apply -f https://github.com/kserve/kserve/releases/download/v0.12.0/kserve.yaml Install KServe default serving runtimes: kubectl kubectl apply -f https://github.com/kserve/kserve/releases/download/v0.12.0/kserve-cluster-resources.yaml ii. Change default deployment mode and ingress option First in ConfigMap inferenceservice-config modify the defaultDeploymentMode in the deploy section, kubectl kubectl patch configmap/inferenceservice-config -n kserve --type = strategic -p '{\"data\": {\"deploy\": \"{\\\"defaultDeploymentMode\\\": \\\"RawDeployment\\\"}\"}}' then modify the ingressClassName in ingress section to point to IngressClass name created in step 1 . ingress : |- { \"ingressClassName\" : \"your-ingress-class\" , }","title":"Kubernetes deployment installation"},{"location":"admin/kubernetes_deployment/#kubernetes-deployment-installation-guide","text":"KServe supports RawDeployment mode to enable InferenceService deployment with Kubernetes resources Deployment , Service , Ingress and Horizontal Pod Autoscaler . Comparing to serverless deployment it unlocks Knative limitations such as mounting multiple volumes, on the other hand Scale down and from Zero is not supported in RawDeployment mode. Kubernetes 1.22 is the minimally required version and please check the following recommended Istio versions for the corresponding Kubernetes version.","title":"Kubernetes Deployment Installation Guide"},{"location":"admin/kubernetes_deployment/#recommended-version-matrix","text":"Kubernetes Version Recommended Istio Version 1.27 1.18, 1.19 1.28 1.19, 1.20 1.29 1.20, 1.21","title":"Recommended Version Matrix"},{"location":"admin/kubernetes_deployment/#1-install-istio","text":"The minimally required Istio version is 1.13 and you can refer to the Istio install guide . Once Istio is installed, create IngressClass resource for istio. 
apiVersion : networking.k8s.io/v1 kind : IngressClass metadata : name : istio spec : controller : istio.io/ingress-controller Note Istio ingress is recommended, but you can choose to install with other Ingress controllers and create IngressClass resource for your Ingress option.","title":"1. Install Istio"},{"location":"admin/kubernetes_deployment/#2-install-cert-manager","text":"The minimally required Cert Manager version is 1.9.0 and you can refer to Cert Manager installation guide . Note Cert manager is required to provision webhook certs for production grade installation, alternatively you can run self signed certs generation script.","title":"2. Install Cert Manager"},{"location":"admin/kubernetes_deployment/#3-install-kserve","text":"Note The default KServe deployment mode is Serverless which depends on Knative. The following step changes the default deployment mode to RawDeployment before installing KServe. i. Install KServe kubectl kubectl apply -f https://github.com/kserve/kserve/releases/download/v0.12.0/kserve.yaml Install KServe default serving runtimes: kubectl kubectl apply -f https://github.com/kserve/kserve/releases/download/v0.12.0/kserve-cluster-resources.yaml ii. Change default deployment mode and ingress option First in ConfigMap inferenceservice-config modify the defaultDeploymentMode in the deploy section, kubectl kubectl patch configmap/inferenceservice-config -n kserve --type = strategic -p '{\"data\": {\"deploy\": \"{\\\"defaultDeploymentMode\\\": \\\"RawDeployment\\\"}\"}}' then modify the ingressClassName in ingress section to point to IngressClass name created in step 1 . ingress : |- { \"ingressClassName\" : \"your-ingress-class\" , }","title":"3. Install KServe"},{"location":"admin/migration/","text":"Migrating from KFServing \u00b6 This doc explains how to migrate existing inference services from KFServing to KServe without downtime. 
Note The migration job will by default delete the leftover KFServing installation after migrating the inference services from serving.kubeflow.org to serving.kserve.io . Migrating from standalone KFServing \u00b6 Install KServe v0.7 using the install YAML This will not affect existing services yet. kubectl apply -f https://raw.githubusercontent.com/kserve/kserve/master/install/v0.7.0/kserve.yaml Run the KServe Migration YAML This will begin the migration. Any errors here may affect your existing services. If you do not want to delete the KFServing resources after migrating, download and edit the env REMOVE_KFSERVING in the YAML before applying it If your KFServing is installed in a namespace other than kfserving-system , then download and set the env KFSERVING_NAMESPACE in the YAML before applying it kubectl apply -f https://raw.githubusercontent.com/kserve/kserve/master/hack/kserve_migration/kserve_migration_job.yaml Clean up the migration resources kubectl delete ClusterRoleBinding cluster-migration-rolebinding kubectl delete ClusterRole cluster-migration-role kubectl delete ServiceAccount cluster-migration-svcaccount -n kserve Migrating from Kubeflow-based KFServing \u00b6 Install Kubeflow-based KServe 0.7 using the install YAML This will not affect existing services yet. kubectl apply -f https://raw.githubusercontent.com/kserve/kserve/master/install/v0.7.0/kserve_kubeflow.yaml Run the KServe Migration YAML for Kubeflow-based installations This will begin the migration. Any errors here may affect your existing services. 
If you do not want to delete the KFServing resources after migrating, download and edit the env REMOVE_KFSERVING in the YAML before applying it kubectl apply -f https://raw.githubusercontent.com/kserve/kserve/master/hack/kserve_migration/kserve_migration_job_kubeflow.yaml Clean up the migration resources kubectl delete ClusterRoleBinding cluster-migration-rolebinding kubectl delete ClusterRole cluster-migration-role kubectl delete ServiceAccount cluster-migration-svcaccount -n kubeflow Update the models web app to use the new InferenceService API group serving.kserve.io Change the deployment image to kserve/models-web-app:v0.7.0-rc0 This is a temporary fix until the next Kubeflow release includes these changes kubectl edit deployment kfserving-models-web-app -n kubeflow Update the cluster role to be able to access the new InferenceService API group serving.kserve.io Edit the apiGroups from serving.kubeflow.org to serving.kserve.io This is a temporary fix until the next Kubeflow release includes these changes kubectl edit clusterrole kfserving-models-web-app-cluster-role","title":"Migrating from KFServing"},{"location":"admin/migration/#migrating-from-kfserving","text":"This doc explains how to migrate existing inference services from KFServing to KServe without downtime. Note The migration job will by default delete the leftover KFServing installation after migrating the inference services from serving.kubeflow.org to serving.kserve.io .","title":"Migrating from KFServing"},{"location":"admin/migration/#migrating-from-standalone-kfserving","text":"Install KServe v0.7 using the install YAML This will not affect existing services yet. kubectl apply -f https://raw.githubusercontent.com/kserve/kserve/master/install/v0.7.0/kserve.yaml Run the KServe Migration YAML This will begin the migration. Any errors here may affect your existing services. 
If you do not want to delete the KFServing resources after migrating, download and edit the env REMOVE_KFSERVING in the YAML before applying it If your KFServing is installed in a namespace other than kfserving-system , then download and set the env KFSERVING_NAMESPACE in the YAML before applying it kubectl apply -f https://raw.githubusercontent.com/kserve/kserve/master/hack/kserve_migration/kserve_migration_job.yaml Clean up the migration resources kubectl delete ClusterRoleBinding cluster-migration-rolebinding kubectl delete ClusterRole cluster-migration-role kubectl delete ServiceAccount cluster-migration-svcaccount -n kserve","title":"Migrating from standalone KFServing"},{"location":"admin/migration/#migrating-from-kubeflow-based-kfserving","text":"Install Kubeflow-based KServe 0.7 using the install YAML This will not affect existing services yet. kubectl apply -f https://raw.githubusercontent.com/kserve/kserve/master/install/v0.7.0/kserve_kubeflow.yaml Run the KServe Migration YAML for Kubeflow-based installations This will begin the migration. Any errors here may affect your existing services. 
If you do not want to delete the KFServing resources after migrating, download and edit the env REMOVE_KFSERVING in the YAML before applying it kubectl apply -f https://raw.githubusercontent.com/kserve/kserve/master/hack/kserve_migration/kserve_migration_job_kubeflow.yaml Clean up the migration resources kubectl delete ClusterRoleBinding cluster-migration-rolebinding kubectl delete ClusterRole cluster-migration-role kubectl delete ServiceAccount cluster-migration-svcaccount -n kubeflow Update the models web app to use the new InferenceService API group serving.kserve.io Change the deployment image to kserve/models-web-app:v0.7.0-rc0 This is a temporary fix until the next Kubeflow release includes these changes kubectl edit deployment kfserving-models-web-app -n kubeflow Update the cluster role to be able to access the new InferenceService API group serving.kserve.io Edit the apiGroups from serving.kubeflow.org to serving.kserve.io This is a temporary fix until the next Kubeflow release includes these changes kubectl edit clusterrole kfserving-models-web-app-cluster-role","title":"Migrating from Kubeflow-based KFServing"},{"location":"admin/modelmesh/","text":"ModelMesh Installation Guide \u00b6 KServe ModelMesh installation enables high-scale, high-density and frequently-changing model serving use cases. A Kubernetes cluster is required. You will need cluster-admin authority. Additionally, kustomize and an etcd server on the Kubernetes cluster are required. 1. Standard Installation \u00b6 You can find the standard installation instructions in the ModelMesh Serving installation guide . This approach assumes you have installed the prerequisites such as etcd and S3-compatible object storage. 2. Quick Installation \u00b6 A quick installation allows you to quickly get ModelMesh Serving up and running without having to manually install the prerequisites. The steps are described in the ModelMesh Serving quick start guide . 
Note ModelMesh Serving is namespace scoped, meaning all of its components must exist within a single namespace and only one instance of ModelMesh Serving can be installed per namespace. For more details, you can check out the ModelMesh Serving getting started guide .","title":"ModelMesh installation"},{"location":"admin/modelmesh/#modelmesh-installation-guide","text":"KServe ModelMesh installation enables high-scale, high-density and frequently-changing model serving use cases. A Kubernetes cluster is required. You will need cluster-admin authority. Additionally, kustomize and an etcd server on the Kubernetes cluster are required.","title":"ModelMesh Installation Guide"},{"location":"admin/modelmesh/#1-standard-installation","text":"You can find the standard installation instructions in the ModelMesh Serving installation guide . This approach assumes you have installed the prerequisites such as etcd and S3-compatible object storage.","title":"1. Standard Installation"},{"location":"admin/modelmesh/#2-quick-installation","text":"A quick installation allows you to quickly get ModelMesh Serving up and running without having to manually install the prerequisites. The steps are described in the ModelMesh Serving quick start guide . Note ModelMesh Serving is namespace scoped, meaning all of its components must exist within a single namespace and only one instance of ModelMesh Serving can be installed per namespace. For more details, you can check out the ModelMesh Serving getting started guide .","title":"2. Quick Installation"},{"location":"admin/serverless/serverless/","text":"Serverless Installation Guide \u00b6 KServe Serverless installation enables autoscaling based on request volume and supports scale down to and from zero. It also supports revision management and canary rollout based on revisions. Kubernetes 1.22 is the minimally required version and please check the following recommended Knative, Istio versions for the corresponding Kubernetes version. 
Recommended Version Matrix \u00b6 Kubernetes Version Recommended Istio Version Recommended Knative Version 1.27 1.18,1.19 1.10,1.11 1.28 1.19,1.20 1.11,1.12.4 1.29 1.20,1.21 1.12.4,1.13.1 1. Install Knative Serving \u00b6 Please refer to Knative Serving install guide . Note If you are looking to use PodSpec fields such as nodeSelector, affinity or tolerations which are now supported in the v1beta1 API spec, you need to turn on the corresponding feature flags in your Knative configuration. Warning Knative 1.13.1 requires Istio 1.20+, gRPC routing does not work with previous Istio releases, see release notes . 2. Install Networking Layer \u00b6 The recommended networking layer for KServe is Istio as currently it works best with KServe, please refer to the Istio install guide . Alternatively you can also choose other networking layers like Kourier or Contour , see how to install Kourier with KServe guide . 3. Install Cert Manager \u00b6 The minimally required Cert Manager version is 1.9.0 and you can refer to Cert Manager . Note Cert manager is required to provision webhook certs for production grade installation, alternatively you can run self signed certs generation script. 4. Install KServe \u00b6 kubectl kubectl apply -f https://github.com/kserve/kserve/releases/download/v0.12.0/kserve.yaml 5. Install KServe Built-in ClusterServingRuntimes \u00b6 kubectl kubectl apply -f https://github.com/kserve/kserve/releases/download/v0.12.0/kserve-cluster-resources.yaml Note ClusterServingRuntimes are required to create InferenceService for built-in model serving runtimes with KServe v0.8.0 or higher.","title":"Serverless installation"},{"location":"admin/serverless/serverless/#serverless-installation-guide","text":"KServe Serverless installation enables autoscaling based on request volume and supports scale down to and from zero. It also supports revision management and canary rollout based on revisions. 
Kubernetes 1.22 is the minimally required version and please check the following recommended Knative, Istio versions for the corresponding Kubernetes version.","title":"Serverless Installation Guide"},{"location":"admin/serverless/serverless/#recommended-version-matrix","text":"Kubernetes Version Recommended Istio Version Recommended Knative Version 1.27 1.18,1.19 1.10,1.11 1.28 1.19,1.20 1.11,1.12.4 1.29 1.20,1.21 1.12.4,1.13.1","title":"Recommended Version Matrix"},{"location":"admin/serverless/serverless/#1-install-knative-serving","text":"Please refer to Knative Serving install guide . Note If you are looking to use PodSpec fields such as nodeSelector, affinity or tolerations which are now supported in the v1beta1 API spec, you need to turn on the corresponding feature flags in your Knative configuration. Warning Knative 1.13.1 requires Istio 1.20+, gRPC routing does not work with previous Istio releases, see release notes .","title":"1. Install Knative Serving"},{"location":"admin/serverless/serverless/#2-install-networking-layer","text":"The recommended networking layer for KServe is Istio as currently it works best with KServe, please refer to the Istio install guide . Alternatively you can also choose other networking layers like Kourier or Contour , see how to install Kourier with KServe guide .","title":"2. Install Networking Layer"},{"location":"admin/serverless/serverless/#3-install-cert-manager","text":"The minimally required Cert Manager version is 1.9.0 and you can refer to Cert Manager . Note Cert manager is required to provision webhook certs for production grade installation, alternatively you can run self signed certs generation script.","title":"3. Install Cert Manager"},{"location":"admin/serverless/serverless/#4-install-kserve","text":"kubectl kubectl apply -f https://github.com/kserve/kserve/releases/download/v0.12.0/kserve.yaml","title":"4. 
Install KServe"},{"location":"admin/serverless/serverless/#5-install-kserve-built-in-clusterservingruntimes","text":"kubectl kubectl apply -f https://github.com/kserve/kserve/releases/download/v0.12.0/kserve-cluster-resources.yaml Note ClusterServingRuntimes are required to create InferenceService for built-in model serving runtimes with KServe v0.8.0 or higher.","title":"5. Install KServe Built-in ClusterServingRuntimes"},{"location":"admin/serverless/kourier_networking/","text":"Deploy InferenceService with Alternative Networking Layer \u00b6 KServe creates the top level Istio Virtual Service for routing to InferenceService components based on the virtual host or path based routing. Now KServe provides an option for disabling the top level virtual service to allow configuring other networking layers Knative supports. For example, Kourier is an alternative networking layer and the following steps show how you can deploy KServe with Kourier . Install Kourier Networking Layer \u00b6 Please refer to the Serverless Installation Guide and change the second step to install Kourier instead of Istio . 
Install the Kourier networking layer: kubectl apply -f https://github.com/knative/net-kourier/releases/download/ ${ KNATIVE_VERSION } /kourier.yaml Configure Knative Serving to use Kourier: kubectl patch configmap/config-network \\ --namespace knative-serving \\ --type merge \\ --patch '{\"data\":{\"ingress-class\":\"kourier.ingress.networking.knative.dev\"}}' Verify Kourier installation: kubectl get pods -n knative-serving && kubectl get pods -n kourier-system Expected Output NAME READY STATUS RESTARTS AGE activator-77db7d9dd7-kbrgr 1 /1 Running 0 10m autoscaler-67dbf79b95-htnp9 1 /1 Running 0 10m controller-684b6bc97f-ffm58 1 /1 Running 0 10m domain-mapping-6d99d99978-ktmrf 1 /1 Running 0 10m domainmapping-webhook-5f998498b6-sddnm 1 /1 Running 0 10m net-kourier-controller-68967d76dc-ncj2n 1 /1 Running 0 10m webhook-97bdc7b4d-nr7qf 1 /1 Running 0 10m NAME READY STATUS RESTARTS AGE 3scale-kourier-gateway-54c49c8ff5-x8tgn 1 /1 Running 0 10m Edit inferenceservice-config configmap to disable Istio top level virtual host: kubectl edit configmap/inferenceservice-config --namespace kserve # Add the flag `\"disableIstioVirtualHost\": true` under the ingress section ingress : | - { \"disableIstioVirtualHost\" : true } Restart the KServe Controller kubectl rollout restart deployment kserve-controller-manager -n kserve Deploy InferenceService for Testing Kourier Gateway \u00b6 Create the InferenceService \u00b6 New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"pmml-demo\" spec : predictor : model : modelFormat : name : pmml storageUri : \"gs://kfserving-examples/models/pmml\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"pmml-demo\" spec : predictor : pmml : storageUri : gs://kfserving-examples/models/pmml kubectl apply -f pmml.yaml Expected Output $ inferenceservice.serving.kserve.io/pmml-demo created Run a Prediction \u00b6 Note that when setting INGRESS_HOST and 
INGRESS_PORT following the determining the ingress IP and ports guide you need to replace istio-ingressgateway with kourier-gateway . For example if you choose to do Port Forward for testing you need to select the kourier-gateway pod as following. kubectl port-forward --namespace kourier-system \\ $( kubectl get pod -n kourier-system -l \"app=3scale-kourier-gateway\" --output = jsonpath = \"{.items[0].metadata.name}\" ) 8080 :8080 export INGRESS_HOST = localhost export INGRESS_PORT = 8080 Make sure that you create a file named pmml-input.json with the following content, under your current terminal path. { \"instances\" : [ [ 5.1 , 3.5 , 1.4 , 0.2 ] ] } Send a prediction request to the InferenceService and check the output. MODEL_NAME = pmml-demo INPUT_PATH = @./pmml-input.json SERVICE_HOSTNAME = $( kubectl get inferenceservice pmml-demo -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict -d $INPUT_PATH Expected Output * Trying 127 .0.0.1... 
* TCP_NODELAY set * Connected to localhost ( 127 .0.0.1 ) port 8080 ( #0) > POST /v1/models/pmml-demo:predict HTTP/1.1 > Host: pmml-demo-predictor-default.default.example.com > User-Agent: curl/7.58.0 > Accept: */* > Content-Length: 45 > Content-Type: application/x-www-form-urlencoded > * upload completely sent off: 45 out of 45 bytes < HTTP/1.1 200 OK < content-length: 144 < content-type: application/json ; charset = UTF-8 < date: Wed, 14 Sep 2022 13 :30:09 GMT < server: envoy < x-envoy-upstream-service-time: 58 < * Connection #0 to host localhost left intact { \"predictions\" : [{ \"Species\" : \"setosa\" , \"Probability_setosa\" : 1 .0, \"Probability_versicolor\" : 0 .0, \"Probability_virginica\" : 0 .0, \"Node_Id\" : \"2\" }]}","title":"Kourier Networking Layer"},{"location":"admin/serverless/kourier_networking/#deploy-inferenceservice-with-alternative-networking-layer","text":"KServe creates the top level Istio Virtual Service for routing to InferenceService components based on the virtual host or path based routing. Now KServe provides an option for disabling the top level virtual service to allow configuring other networking layers Knative supports. For example, Kourier is an alternative networking layer and the following steps show how you can deploy KServe with Kourier .","title":"Deploy InferenceService with Alternative Networking Layer"},{"location":"admin/serverless/kourier_networking/#install-kourier-networking-layer","text":"Please refer to the Serverless Installation Guide and change the second step to install Kourier instead of Istio . 
Install the Kourier networking layer: kubectl apply -f https://github.com/knative/net-kourier/releases/download/ ${ KNATIVE_VERSION } /kourier.yaml Configure Knative Serving to use Kourier: kubectl patch configmap/config-network \\ --namespace knative-serving \\ --type merge \\ --patch '{\"data\":{\"ingress-class\":\"kourier.ingress.networking.knative.dev\"}}' Verify Kourier installation: kubectl get pods -n knative-serving && kubectl get pods -n kourier-system Expected Output NAME READY STATUS RESTARTS AGE activator-77db7d9dd7-kbrgr 1 /1 Running 0 10m autoscaler-67dbf79b95-htnp9 1 /1 Running 0 10m controller-684b6bc97f-ffm58 1 /1 Running 0 10m domain-mapping-6d99d99978-ktmrf 1 /1 Running 0 10m domainmapping-webhook-5f998498b6-sddnm 1 /1 Running 0 10m net-kourier-controller-68967d76dc-ncj2n 1 /1 Running 0 10m webhook-97bdc7b4d-nr7qf 1 /1 Running 0 10m NAME READY STATUS RESTARTS AGE 3scale-kourier-gateway-54c49c8ff5-x8tgn 1 /1 Running 0 10m Edit inferenceservice-config configmap to disable Istio top level virtual host: kubectl edit configmap/inferenceservice-config --namespace kserve # Add the flag `\"disableIstioVirtualHost\": true` under the ingress section ingress : | - { \"disableIstioVirtualHost\" : true } Restart the KServe Controller kubectl rollout restart deployment kserve-controller-manager -n kserve","title":"Install Kourier Networking Layer"},{"location":"admin/serverless/kourier_networking/#deploy-inferenceservice-for-testing-kourier-gateway","text":"","title":"Deploy InferenceService for Testing Kourier Gateway"},{"location":"admin/serverless/kourier_networking/#create-the-inferenceservice","text":"New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"pmml-demo\" spec : predictor : model : modelFormat : name : pmml storageUri : \"gs://kfserving-examples/models/pmml\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"pmml-demo\" spec : predictor : pmml : 
storageUri : gs://kfserving-examples/models/pmml kubectl apply -f pmml.yaml Expected Output $ inferenceservice.serving.kserve.io/pmml-demo created","title":"Create the InferenceService"},{"location":"admin/serverless/kourier_networking/#run-a-prediction","text":"Note that when setting INGRESS_HOST and INGRESS_PORT following the determining the ingress IP and ports guide you need to replace istio-ingressgateway with kourier-gateway . For example if you choose to do Port Forward for testing you need to select the kourier-gateway pod as following. kubectl port-forward --namespace kourier-system \\ $( kubectl get pod -n kourier-system -l \"app=3scale-kourier-gateway\" --output = jsonpath = \"{.items[0].metadata.name}\" ) 8080 :8080 export INGRESS_HOST = localhost export INGRESS_PORT = 8080 Make sure that you create a file named pmml-input.json with the following content, under your current terminal path. { \"instances\" : [ [ 5.1 , 3.5 , 1.4 , 0.2 ] ] } Send a prediction request to the InferenceService and check the output. MODEL_NAME = pmml-demo INPUT_PATH = @./pmml-input.json SERVICE_HOSTNAME = $( kubectl get inferenceservice pmml-demo -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict -d $INPUT_PATH Expected Output * Trying 127 .0.0.1... 
* TCP_NODELAY set * Connected to localhost ( 127 .0.0.1 ) port 8080 ( #0) > POST /v1/models/pmml-demo:predict HTTP/1.1 > Host: pmml-demo-predictor-default.default.example.com > User-Agent: curl/7.58.0 > Accept: */* > Content-Length: 45 > Content-Type: application/x-www-form-urlencoded > * upload completely sent off: 45 out of 45 bytes < HTTP/1.1 200 OK < content-length: 144 < content-type: application/json ; charset = UTF-8 < date: Wed, 14 Sep 2022 13 :30:09 GMT < server: envoy < x-envoy-upstream-service-time: 58 < * Connection #0 to host localhost left intact { \"predictions\" : [{ \"Species\" : \"setosa\" , \"Probability_setosa\" : 1 .0, \"Probability_versicolor\" : 0 .0, \"Probability_virginica\" : 0 .0, \"Node_Id\" : \"2\" }]}","title":"Run a Prediction"},{"location":"admin/serverless/servicemesh/","text":"Secure InferenceService with ServiceMesh \u00b6 A service mesh is a dedicated infrastructure layer that you can add to your InferenceService to allow you to transparently add capabilities like observability, traffic management and security. In this example we show how you can turn on the Istio service mesh mode to provide a uniform and efficient way to secure service-to-service communication in a cluster with TLS encryption, strong identity-based authentication and authorization. Turn on strict mTLS and Authorization Policy \u00b6 For namespace traffic isolation, we lock down the in cluster traffic to only allow requests from the same namespace and enable mTLS for TLS encryption and strong identity-based authentication. Because Knative requests are frequently routed through activator, when turning on mTLS additional traffic rules are required and activator/autoscaler in knative-serving namespace must have sidecar injected as well. For more details please see mTLS in Knative , to understand when requests are forwarded through the activator, see target burst capacity docs. Create the namespace user1 which is used for this example. 
kubectl create namespace user1 When activator is not on the request path, the rule checks if the source namespace of the request is the same as the destination namespace of InferenceService . When activator is on the request path, the rule checks the source namespace knative-serving namespace as the request is proxied through activator. Warning Currently when activator is on the request path, it is not able to check the originated namespace or original identity due to the net-istio issue . apiVersion : security.istio.io/v1beta1 kind : PeerAuthentication metadata : name : default namespace : user1 spec : mtls : mode : STRICT --- apiVersion : security.istio.io/v1beta1 kind : AuthorizationPolicy metadata : name : allow-serving-tests namespace : user1 spec : action : ALLOW rules : # 1. mTLS for service from source \"user1\" namespace to destination service when TargetBurstCapacity=0 without local gateway and activator on the path # Source Service from \"user1\" namespace -> Destination Service in \"user1\" namespace - from : - source : namespaces : [ \"user1\" ] # 2. mTLS for service from source \"user1\" namespace to destination service with activator on the path # Source Service from \"user1\" namespace -> Activator(Knative Serving namespace) -> Destination service in \"user1\" namespace # unfortunately currently we could not lock down the source namespace as Activator does not capture the source namespace when proxying the request, see https://github.com/knative-sandbox/net-istio/issues/554. - from : - source : namespaces : [ \"knative-serving\" ] # 3. 
allow metrics and probes from knative serving namespaces - from : - source : namespaces : [ \"knative-serving\" ] to : - operation : paths : [ \"/metrics\" , \"/healthz\" , \"/ready\" , \"/wait-for-drain\" ] Apply the PeerAuthentication and AuthorizationPolicy rules with auth.yaml : kubectl apply -f auth.yaml Disable Top Level Virtual Service \u00b6 KServe currently creates an Istio top level virtual service to support routing between InferenceService components like predictor, transformer and explainer, as well as support path based routing as an alternative routing with service hosts. In serverless service mesh mode this creates a problem that in order to route through the underlying virtual service created by Knative Service, the top level virtual service is required to route to the Istio Gateway instead of to the InferenceService component on the service mesh directly. By disabling the top level virtual service, it eliminates the extra route to Istio local gateway and the authorization policy can check the source namespace when mTLS is established directly between service to service and activator is not on the request path. To disable the top level virtual service, add the flag \"disableIstioVirtualHost\": true under the ingress config in inferenceservice configmap. kubectl edit configmap/inferenceservice-config --namespace kserve ingress : | - { \"disableIstioVirtualHost\" : true } Deploy InferenceService with Istio sidecar injection \u00b6 First label the namespace with istio-injection=enabled to turn on the sidecar injection for the namespace. 
kubectl label namespace user1 istio-injection = enabled --overwrite Create the InferenceService with and without Knative activator on the path: When autoscaling.knative.dev/targetBurstCapacity is set to 0, Knative removes the activator from the request path so the test service can directly establish the mTLS connection to the InferenceService and the authorization policy can check the original namespace of the request to lock down the traffic for namespace isolation. InferenceService with activator on path InferenceService without activator on path apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"sklearn-iris-burst\" namespace : user1 annotations : \"sidecar.istio.io/inject\" : \"true\" spec : predictor : model : modelFormat : name : sklearn storageUri : \"gs://kfserving-examples/models/sklearn/1.0/model\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"sklearn-iris\" namespace : user1 annotations : \"autoscaling.knative.dev/targetBurstCapacity\" : \"0\" \"sidecar.istio.io/inject\" : \"true\" spec : predictor : model : modelFormat : name : sklearn storageUri : \"gs://kfserving-examples/models/sklearn/1.0/model\" kubectl apply -f sklearn_iris.yaml Expected Output $ inferenceservice.serving.kserve.io/sklearn-iris created $ inferenceservice.serving.kserve.io/sklearn-iris-burst created kubectl get pods -n user1 NAME READY STATUS RESTARTS AGE httpbin-6484879498-qxqj8 2 /2 Running 0 19h sklearn-iris-burst-predictor-default-00001-deployment-5685n46f6 3 /3 Running 0 12h sklearn-iris-predictor-default-00001-deployment-985d5cd46-zzw4x 3 /3 Running 0 12h Run a prediction from the same namespace \u00b6 Deploy a test service in user1 namespace with httpbin.yaml . kubectl apply -f httpbin.yaml Run a prediction request to the sklearn-iris InferenceService without activator on the path, you are expected to get HTTP 200 as the authorization rule allows traffic from the same namespace. 
kubectl exec -it httpbin-6484879498-qxqj8 -c istio-proxy -n user1 -- curl -v sklearn-iris-predictor-default.user1.svc.cluster.local/v1/models/sklearn-iris Expected Output * Connected to sklearn-iris-predictor-default.user1.svc.cluster.local ( 10 .96.137.152 ) port 80 ( #0) > GET /v1/models/sklearn-iris HTTP/1.1 > Host: sklearn-iris-predictor-default.user1.svc.cluster.local > User-Agent: curl/7.81.0 > Accept: */* > * Mark bundle as not supporting multiuse < HTTP/1.1 200 OK < content-length: 36 < content-type: application/json < date: Sat, 26 Nov 2022 01 :45:10 GMT < server: istio-envoy < x-envoy-upstream-service-time: 42 < * Connection #0 to host sklearn-iris-predictor-default.user1.svc.cluster.local left intact { \"name\" : \"sklearn-iris\" , \"ready\" :true } Run a prediction request to the sklearn-iris-burst InferenceService with activator on the path, you are expected to get HTTP 200 as the authorization rule allows traffic from knative-serving namespace. kubectl exec -it httpbin-6484879498-qxqj8 -c istio-proxy -n user1 -- curl -v sklearn-iris-burst-predictor-default.user1.svc.cluster.local/v1/models/sklearn-iris-burst Expected Output * Connected to sklearn-iris-burst-predictor-default.user1.svc.cluster.local ( 10 .96.137.152 ) port 80 ( #0) > GET /v1/models/sklearn-iris-burst HTTP/1.1 > Host: sklearn-iris-burst-predictor-default.user1.svc.cluster.local > User-Agent: curl/7.81.0 > Accept: */* > * Mark bundle as not supporting multiuse < HTTP/1.1 200 OK < content-length: 42 < content-type: application/json < date: Sat, 26 Nov 2022 13 :55:14 GMT < server: istio-envoy < x-envoy-upstream-service-time: 209 < * Connection #0 to host sklearn-iris-burst-predictor-default.user1.svc.cluster.local left intact { \"name\" : \"sklearn-iris-burst\" , \"ready\" :true } Run a prediction from a different namespace \u00b6 Deploy a test service in default namespace with sleep.yaml which is different from the namespace the InferenceService is deployed to. 
kubectl apply -f sleep.yaml When you send a prediction request to the sklearn-iris InferenceService without activator on the request path from a different namespace, you are expected to get HTTP 403 \"RBAC denied\" as the authorization rule only allows the traffic from the same namespace user1 where the InferenceService is deployed. kubectl exec -it sleep-6d6b49d8b8-6ths6 -- curl -v sklearn-iris-predictor-default.user1.svc.cluster.local/v1/models/sklearn-iris Expected Output * Connected to sklearn-iris-predictor-default.user1.svc.cluster.local ( 10 .96.137.152 ) port 80 ( #0) > GET /v1/models/sklearn-iris HTTP/1.1 > Host: sklearn-iris-predictor-default.user1.svc.cluster.local > User-Agent: curl/7.86.0-DEV > Accept: */* > * Mark bundle as not supporting multiuse < HTTP/1.1 403 Forbidden < content-length: 19 < content-type: text/plain < date: Sat, 26 Nov 2022 02 :45:46 GMT < server: envoy < x-envoy-upstream-service-time: 14 < * Connection #0 to host sklearn-iris-predictor-default.user1.svc.cluster.local left intact When you send a prediction request to the sklearn-iris-burst InferenceService with activator on the request path from a different namespace, you actually get HTTP 200 response due to the above limitation as the authorization policy is not able to lock down the traffic only from the same namespace as the request is proxied through activator in knative-serving namespace, we expect to get HTTP 403 once upstream Knative net-istio is fixed. 
kubectl exec -it sleep-6d6b49d8b8-6ths6 -- curl -v sklearn-iris-burst-predictor-default.user1.svc.cluster.local/v1/models/sklearn-iris-burst Expected Output * Connected to sklearn-iris-burst-predictor-default.user1.svc.cluster.local ( 10 .96.137.152 ) port 80 ( #0) > GET /v1/models/sklearn-iris-burst HTTP/1.1 > Host: sklearn-iris-burst-predictor-default.user1.svc.cluster.local > User-Agent: curl/7.86.0-DEV > Accept: */* > * Mark bundle as not supporting multiuse < HTTP/1.1 200 OK < content-length: 42 < content-type: application/json < date: Sat, 26 Nov 2022 13 :59:04 GMT < server: envoy < x-envoy-upstream-service-time: 6 < * Connection #0 to host sklearn-iris-burst-predictor-default.user1.svc.cluster.local left intact { \"name\" : \"sklearn-iris-burst\" , \"ready\" :true }","title":"Istio Service Mesh"},{"location":"admin/serverless/servicemesh/#secure-inferenceservice-with-servicemesh","text":"A service mesh is a dedicated infrastructure layer that you can add to your InferenceService to allow you to transparently add capabilities like observability, traffic management and security. In this example we show how you can turn on the Istio service mesh mode to provide a uniform and efficient way to secure service-to-service communication in a cluster with TLS encryption, strong identity-based authentication and authorization.","title":"Secure InferenceService with ServiceMesh"},{"location":"admin/serverless/servicemesh/#turn-on-strict-mtls-and-authorization-policy","text":"For namespace traffic isolation, we lock down the in cluster traffic to only allow requests from the same namespace and enable mTLS for TLS encryption and strong identity-based authentication. Because Knative requests are frequently routed through activator, when turning on mTLS additional traffic rules are required and activator/autoscaler in knative-serving namespace must have sidecar injected as well. 
For more details please see mTLS in Knative , to understand when requests are forwarded through the activator, see target burst capacity docs. Create the namespace user1 which is used for this example. kubectl create namespace user1 When activator is not on the request path, the rule checks if the source namespace of the request is the same as the destination namespace of InferenceService . When activator is on the request path, the rule checks the source namespace knative-serving namespace as the request is proxied through activator. Warning Currently when activator is on the request path, it is not able to check the originated namespace or original identity due to the net-istio issue . apiVersion : security.istio.io/v1beta1 kind : PeerAuthentication metadata : name : default namespace : user1 spec : mtls : mode : STRICT --- apiVersion : security.istio.io/v1beta1 kind : AuthorizationPolicy metadata : name : allow-serving-tests namespace : user1 spec : action : ALLOW rules : # 1. mTLS for service from source \"user1\" namespace to destination service when TargetBurstCapacity=0 without local gateway and activator on the path # Source Service from \"user1\" namespace -> Destination Service in \"user1\" namespace - from : - source : namespaces : [ \"user1\" ] # 2. mTLS for service from source \"user1\" namespace to destination service with activator on the path # Source Service from \"user1\" namespace -> Activator(Knative Serving namespace) -> Destination service in \"user1\" namespace # unfortunately currently we could not lock down the source namespace as Activator does not capture the source namespace when proxying the request, see https://github.com/knative-sandbox/net-istio/issues/554. - from : - source : namespaces : [ \"knative-serving\" ] # 3. 
allow metrics and probes from knative serving namespaces - from : - source : namespaces : [ \"knative-serving\" ] to : - operation : paths : [ \"/metrics\" , \"/healthz\" , \"/ready\" , \"/wait-for-drain\" ] Apply the PeerAuthentication and AuthorizationPolicy rules with auth.yaml : kubectl apply -f auth.yaml","title":"Turn on strict mTLS and Authorization Policy"},{"location":"admin/serverless/servicemesh/#disable-top-level-virtual-service","text":"KServe currently creates an Istio top level virtual service to support routing between InferenceService components like predictor, transformer and explainer, as well as support path based routing as an alternative routing with service hosts. In serverless service mesh mode this creates a problem that in order to route through the underlying virtual service created by Knative Service, the top level virtual service is required to route to the Istio Gateway instead of to the InferenceService component on the service mesh directly. By disabling the top level virtual service, it eliminates the extra route to Istio local gateway and the authorization policy can check the source namespace when mTLS is established directly between service to service and activator is not on the request path. To disable the top level virtual service, add the flag \"disableIstioVirtualHost\": true under the ingress config in inferenceservice configmap. kubectl edit configmap/inferenceservice-config --namespace kserve ingress : | - { \"disableIstioVirtualHost\" : true }","title":"Disable Top Level Virtual Service"},{"location":"admin/serverless/servicemesh/#deploy-inferenceservice-with-istio-sidecar-injection","text":"First label the namespace with istio-injection=enabled to turn on the sidecar injection for the namespace. 
kubectl label namespace user1 istio-injection = enabled --overwrite Create the InferenceService with and without Knative activator on the path: When autoscaling.knative.dev/targetBurstCapacity is set to 0, Knative removes the activator from the request path so the test service can directly establish the mTLS connection to the InferenceService and the authorization policy can check the original namespace of the request to lock down the traffic for namespace isolation. InferenceService with activator on path InferenceService without activator on path apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"sklearn-iris-burst\" namespace : user1 annotations : \"sidecar.istio.io/inject\" : \"true\" spec : predictor : model : modelFormat : name : sklearn storageUri : \"gs://kfserving-examples/models/sklearn/1.0/model\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"sklearn-iris\" namespace : user1 annotations : \"autoscaling.knative.dev/targetBurstCapacity\" : \"0\" \"sidecar.istio.io/inject\" : \"true\" spec : predictor : model : modelFormat : name : sklearn storageUri : \"gs://kfserving-examples/models/sklearn/1.0/model\" kubectl apply -f sklearn_iris.yaml Expected Output $ inferenceservice.serving.kserve.io/sklearn-iris created $ inferenceservice.serving.kserve.io/sklearn-iris-burst created kubectl get pods -n user1 NAME READY STATUS RESTARTS AGE httpbin-6484879498-qxqj8 2 /2 Running 0 19h sklearn-iris-burst-predictor-default-00001-deployment-5685n46f6 3 /3 Running 0 12h sklearn-iris-predictor-default-00001-deployment-985d5cd46-zzw4x 3 /3 Running 0 12h","title":"Deploy InferenceService with Istio sidecar injection"},{"location":"admin/serverless/servicemesh/#run-a-prediction-from-the-same-namespace","text":"Deploy a test service in user1 namespace with httpbin.yaml . 
kubectl apply -f httpbin.yaml Run a prediction request to the sklearn-iris InferenceService without activator on the path, you are expected to get HTTP 200 as the authorization rule allows traffic from the same namespace. kubectl exec -it httpbin-6484879498-qxqj8 -c istio-proxy -n user1 -- curl -v sklearn-iris-predictor-default.user1.svc.cluster.local/v1/models/sklearn-iris Expected Output * Connected to sklearn-iris-predictor-default.user1.svc.cluster.local ( 10 .96.137.152 ) port 80 ( #0) > GET /v1/models/sklearn-iris HTTP/1.1 > Host: sklearn-iris-predictor-default.user1.svc.cluster.local > User-Agent: curl/7.81.0 > Accept: */* > * Mark bundle as not supporting multiuse < HTTP/1.1 200 OK < content-length: 36 < content-type: application/json < date: Sat, 26 Nov 2022 01 :45:10 GMT < server: istio-envoy < x-envoy-upstream-service-time: 42 < * Connection #0 to host sklearn-iris-predictor-default.user1.svc.cluster.local left intact { \"name\" : \"sklearn-iris\" , \"ready\" :true } Run a prediction request to the sklearn-iris-burst InferenceService with activator on the path, you are expected to get HTTP 200 as the authorization rule allows traffic from knative-serving namespace. 
kubectl exec -it httpbin-6484879498-qxqj8 -c istio-proxy -n user1 -- curl -v sklearn-iris-burst-predictor-default.user1.svc.cluster.local/v1/models/sklearn-iris-burst Expected Output * Connected to sklearn-iris-burst-predictor-default.user1.svc.cluster.local ( 10 .96.137.152 ) port 80 ( #0) > GET /v1/models/sklearn-iris-burst HTTP/1.1 > Host: sklearn-iris-burst-predictor-default.user1.svc.cluster.local > User-Agent: curl/7.81.0 > Accept: */* > * Mark bundle as not supporting multiuse < HTTP/1.1 200 OK < content-length: 42 < content-type: application/json < date: Sat, 26 Nov 2022 13 :55:14 GMT < server: istio-envoy < x-envoy-upstream-service-time: 209 < * Connection #0 to host sklearn-iris-burst-predictor-default.user1.svc.cluster.local left intact { \"name\" : \"sklearn-iris-burst\" , \"ready\" :true }","title":"Run a prediction from the same namespace"},{"location":"admin/serverless/servicemesh/#run-a-prediction-from-a-different-namespace","text":"Deploy a test service in default namespace with sleep.yaml which is different from the namespace the InferenceService is deployed to. kubectl apply -f sleep.yaml When you send a prediction request to the sklearn-iris InferenceService without activator on the request path from a different namespace, you are expected to get HTTP 403 \"RBAC denied\" as the authorization rule only allows the traffic from the same namespace user1 where the InferenceService is deployed. 
kubectl exec -it sleep-6d6b49d8b8-6ths6 -- curl -v sklearn-iris-predictor-default.user1.svc.cluster.local/v1/models/sklearn-iris Expected Output * Connected to sklearn-iris-predictor-default.user1.svc.cluster.local ( 10 .96.137.152 ) port 80 ( #0) > GET /v1/models/sklearn-iris HTTP/1.1 > Host: sklearn-iris-predictor-default.user1.svc.cluster.local > User-Agent: curl/7.86.0-DEV > Accept: */* > * Mark bundle as not supporting multiuse < HTTP/1.1 403 Forbidden < content-length: 19 < content-type: text/plain < date: Sat, 26 Nov 2022 02 :45:46 GMT < server: envoy < x-envoy-upstream-service-time: 14 < * Connection #0 to host sklearn-iris-predictor-default.user1.svc.cluster.local left intact When you send a prediction request to the sklearn-iris-burst InferenceService with activator on the request path from a different namespace, you actually get HTTP 200 response due to the above limitation as the authorization policy is not able to lock down the traffic only from the same namespace as the request is proxied through activator in knative-serving namespace, we expect to get HTTP 403 once upstream Knative net-istio is fixed. 
kubectl exec -it sleep-6d6b49d8b8-6ths6 -- curl -v sklearn-iris-burst-predictor-default.user1.svc.cluster.local/v1/models/sklearn-iris-burst Expected Output * Connected to sklearn-iris-burst-predictor-default.user1.svc.cluster.local ( 10 .96.137.152 ) port 80 ( #0) > GET /v1/models/sklearn-iris-burst HTTP/1.1 > Host: sklearn-iris-burst-predictor-default.user1.svc.cluster.local > User-Agent: curl/7.86.0-DEV > Accept: */* > * Mark bundle as not supporting multiuse < HTTP/1.1 200 OK < content-length: 42 < content-type: application/json < date: Sat, 26 Nov 2022 13 :59:04 GMT < server: envoy < x-envoy-upstream-service-time: 6 < * Connection #0 to host sklearn-iris-burst-predictor-default.user1.svc.cluster.local left intact { \"name\" : \"sklearn-iris-burst\" , \"ready\" :true }","title":"Run a prediction from a different namespace"},{"location":"api/api/","text":"KServe API \u00b6","title":"KServe API"},{"location":"api/api/#kserve-api","text":"","title":"KServe API"},{"location":"blog/_index/","text":"","title":" index"},{"location":"blog/articles/2021-09-27-kfserving-transition/","text":"Authors \u00b6 Dan Sun and Animesh Singh on behalf of the Kubeflow Serving Working Group KFServing is now KServe \u00b6 We are excited to announce the next chapter for KFServing. In coordination with the Kubeflow Project Steering Group, the KFServing GitHub repository has now been transferred to an independent KServe GitHub organization under the stewardship of the Kubeflow Serving Working Group leads. The project has been rebranded from KFServing to KServe , and we are planning to graduate the project from Kubeflow Project later this year. Developed collaboratively by Google, IBM, Bloomberg, NVIDIA, and Seldon in 2019, KFServing was published as open source in early 2019. The project sets out to provide the following features: - A simple, yet powerful, Kubernetes Custom Resource for deploying machine learning (ML) models on production across ML frameworks. 
- Provide performant, standardized inference protocol. - Serverless inference according to live traffic patterns, supporting \u201cScale-to-zero\u201d on both CPUs and GPUs. - Complete story for production ML Model Serving including prediction, pre/post-processing, explainability, and monitoring. - Support for deploying thousands of models at scale and inference graph capability for multiple models. KFServing was created to address the challenges of deploying and monitoring machine learning models on production for organizations. After publishing the open source project, we\u2019ve seen an explosion in demand for the software, leading to strong adoption and community growth. The scope of the project has since increased, and we have developed multiple components along the way, including our own growing body of documentation that needs its own website and independent GitHub organization. What's Next \u00b6 Over the coming weeks, we will be releasing KServe 0.7 outside of the Kubeflow Project and will provide more details on how to migrate from KFServing to KServe with minimal disruptions. KFServing 0.5.x/0.6.x releases will still be supported for six months after the KServe 0.7 release. We are also working on integrating core Kubeflow APIs and standards for the conformance program . For contributors, please follow the KServe developer and doc contribution guide to make code or doc contributions. We are excited to work with you to make KServe better and promote its adoption by more and more users! KServe Key Links \u00b6 Website Github Slack(#kubeflow-kfserving) Contributor Acknowledgement \u00b6 We'd like to thank all the KServe contributors for this transition work! 
Andrews Arokiam Animesh Singh Chin Huang Dan Sun Jagadeesh Jinchi He Nick Hill Paul Van Eck Qianshan Chen Suresh Nakkiran Sukumar Gaonkar Theofilos Papapanagiotou Tommy Li Vedant Padwal Yao Xiao Yuzhui Liu","title":"KFserving Transition"},{"location":"blog/articles/2021-09-27-kfserving-transition/#authors","text":"Dan Sun and Animesh Singh on behalf of the Kubeflow Serving Working Group","title":"Authors"},{"location":"blog/articles/2021-09-27-kfserving-transition/#kfserving-is-now-kserve","text":"We are excited to announce the next chapter for KFServing. In coordination with the Kubeflow Project Steering Group, the KFServing GitHub repository has now been transferred to an independent KServe GitHub organization under the stewardship of the Kubeflow Serving Working Group leads. The project has been rebranded from KFServing to KServe , and we are planning to graduate the project from Kubeflow Project later this year. Developed collaboratively by Google, IBM, Bloomberg, NVIDIA, and Seldon in 2019, KFServing was published as open source in early 2019. The project sets out to provide the following features: - A simple, yet powerful, Kubernetes Custom Resource for deploying machine learning (ML) models on production across ML frameworks. - Provide performant, standardized inference protocol. - Serverless inference according to live traffic patterns, supporting \u201cScale-to-zero\u201d on both CPUs and GPUs. - Complete story for production ML Model Serving including prediction, pre/post-processing, explainability, and monitoring. - Support for deploying thousands of models at scale and inference graph capability for multiple models. KFServing was created to address the challenges of deploying and monitoring machine learning models on production for organizations. After publishing the open source project, we\u2019ve seen an explosion in demand for the software, leading to strong adoption and community growth. 
The scope of the project has since increased, and we have developed multiple components along the way, including our own growing body of documentation that needs its own website and independent GitHub organization.","title":"KFServing is now KServe"},{"location":"blog/articles/2021-09-27-kfserving-transition/#whats-next","text":"Over the coming weeks, we will be releasing KServe 0.7 outside of the Kubeflow Project and will provide more details on how to migrate from KFServing to KServe with minimal disruptions. KFServing 0.5.x/0.6.x releases will still be supported for six months after the KServe 0.7 release. We are also working on integrating core Kubeflow APIs and standards for the conformance program . For contributors, please follow the KServe developer and doc contribution guide to make code or doc contributions. We are excited to work with you to make KServe better and promote its adoption by more and more users!","title":"What's Next"},{"location":"blog/articles/2021-09-27-kfserving-transition/#kserve-key-links","text":"Website Github Slack(#kubeflow-kfserving)","title":"KServe Key Links"},{"location":"blog/articles/2021-09-27-kfserving-transition/#contributor-acknowledgement","text":"We'd like to thank all the KServe contributors for this transition work! Andrews Arokiam Animesh Singh Chin Huang Dan Sun Jagadeesh Jinchi He Nick Hill Paul Van Eck Qianshan Chen Suresh Nakkiran Sukumar Gaonkar Theofilos Papapanagiotou Tommy Li Vedant Padwal Yao Xiao Yuzhui Liu","title":"Contributor Acknowledgement"},{"location":"blog/articles/2021-10-11-KServe-0.7-release/","text":"Authors \u00b6 Dan Sun , Animesh Singh , Yuzhui Liu , Vedant Padwal on behalf of the KServe Working Group. KFServing is now KServe and KServe 0.7 release is available, the release also ensures a smooth user migration experience from KFServing to KServe. What's Changed? 
\u00b6 InferenceService API group is changed from serving.kubeflow.org to serving.kserve.io #1826 , the migration job is created for smooth transition. Python SDK name is changed from kfserving to kserve . KServe Installation manifests #1824 . Models-web-app is separated out of the kserve repository to models-web-app . Docs and examples are moved to separate repository website . KServe images are migrated to kserve docker hub account. v1alpha2 API group is deprecated #1850 . \ud83c\udf08 What's New? \u00b6 ModelMesh project is joining KServe under repository modelmesh-serving ! ModelMesh is designed for high-scale, high-density and frequently-changing model use cases. ModelMesh intelligently loads and unloads AI models to and from memory to strike an intelligent trade-off between responsiveness to users and computational footprint. To learn more about ModelMesh features and components, check out the ModelMesh announcement blog and Join talk at #KubeCon NA to get a deeper dive into ModelMesh and KServe . (Alpha feature) Raw Kubernetes deployment support, Istio/Knative dependency is now optional and please follow the guide to install and turn on RawDeployment mode. KServe now has its own documentation website temporarily hosted on website . Support v1 crd and webhook configuration for Kubernetes 1.22 #1837 . Triton model serving runtime now defaults to 21.09 version #1840 . \ud83d\udc1e What's Fixed? \u00b6 Bug fix for Azure blob storage #1845 . Tar/Zip support for all storage options #1836 . Fix AWS_REGION env variable and add AWS_CA_BUNDLE for S3 #1780 . Torchserve custom package install fix #1619 . Join the community \u00b6 Visit our Website or GitHub Join the Slack(#kubeflow-kfserving) Attend a Biweekly community meeting on Wednesday 9am PST Contribute at developer and doc contribution guide to make code or doc contributions. We are excited to work with you to make KServe better and promote its adoption by more and more users! 
Contributors \u00b6 We would like to thank everyone for their efforts on v0.7 Andrews Arokiam Animesh Singh Chin Huang Dan Sun Jagadeesh Jinchi He Nick Hill Paul Van Eck Qianshan Chen Suresh Nakkiran Sukumar Gaonkar Theofilos Papapanagiotou Tommy Li Vedant Padwal Yao Xiao Yuzhui Liu","title":"KServe 0.7 Release"},{"location":"blog/articles/2021-10-11-KServe-0.7-release/#authors","text":"Dan Sun , Animesh Singh , Yuzhui Liu , Vedant Padwal on behalf of the KServe Working Group. KFServing is now KServe and KServe 0.7 release is available, the release also ensures a smooth user migration experience from KFServing to KServe.","title":"Authors"},{"location":"blog/articles/2021-10-11-KServe-0.7-release/#whats-changed","text":"InferenceService API group is changed from serving.kubeflow.org to serving.kserve.io #1826 , the migration job is created for smooth transition. Python SDK name is changed from kfserving to kserve . KServe Installation manifests #1824 . Models-web-app is separated out of the kserve repository to models-web-app . Docs and examples are moved to separate repository website . KServe images are migrated to kserve docker hub account. v1alpha2 API group is deprecated #1850 .","title":"What's Changed?"},{"location":"blog/articles/2021-10-11-KServe-0.7-release/#whats-new","text":"ModelMesh project is joining KServe under repository modelmesh-serving ! ModelMesh is designed for high-scale, high-density and frequently-changing model use cases. ModelMesh intelligently loads and unloads AI models to and from memory to strike an intelligent trade-off between responsiveness to users and computational footprint. To learn more about ModelMesh features and components, check out the ModelMesh announcement blog and Join talk at #KubeCon NA to get a deeper dive into ModelMesh and KServe . (Alpha feature) Raw Kubernetes deployment support, Istio/Knative dependency is now optional and please follow the guide to install and turn on RawDeployment mode. 
KServe now has its own documentation website temporarily hosted on website . Support v1 crd and webhook configuration for Kubernetes 1.22 #1837 . Triton model serving runtime now defaults to 21.09 version #1840 .","title":"\ud83c\udf08 What's New?"},{"location":"blog/articles/2021-10-11-KServe-0.7-release/#whats-fixed","text":"Bug fix for Azure blob storage #1845 . Tar/Zip support for all storage options #1836 . Fix AWS_REGION env variable and add AWS_CA_BUNDLE for S3 #1780 . Torchserve custom package install fix #1619 .","title":"\ud83d\udc1e What's Fixed?"},{"location":"blog/articles/2021-10-11-KServe-0.7-release/#join-the-community","text":"Visit our Website or GitHub Join the Slack(#kubeflow-kfserving) Attend a Biweekly community meeting on Wednesday 9am PST Contribute at developer and doc contribution guide to make code or doc contributions. We are excited to work with you to make KServe better and promote its adoption by more and more users!","title":"Join the community"},{"location":"blog/articles/2021-10-11-KServe-0.7-release/#contributors","text":"We would like to thank everyone for their efforts on v0.7 Andrews Arokiam Animesh Singh Chin Huang Dan Sun Jagadeesh Jinchi He Nick Hill Paul Van Eck Qianshan Chen Suresh Nakkiran Sukumar Gaonkar Theofilos Papapanagiotou Tommy Li Vedant Padwal Yao Xiao Yuzhui Liu","title":"Contributors"},{"location":"blog/articles/2022-02-18-KServe-0.8-release/","text":"Authors \u00b6 Dan Sun , Paul Van Eck , Vedant Padwal , Andrews Arokiam on behalf of the KServe Working Group. Announcing: KServe v0.8 \u00b6 February 18, 2022 Today, we are pleased to announce the v0.8.0 release of KServe! While the last release was focused on the transition of KFServing to KServe, this release was focused on unifying the InferenceService API for deploying models on KServe and ModelMesh. Note : For current users of KFServing/KServe, please take a few minutes to answer this short survey and provide your feedback! 
Now, let's take a look at some of the changes and additions to KServe. What\u2019s changed? \u00b6 ONNX Runtime Server has been removed from the supported serving runtime list. KServe by default now uses the Triton Inference Server to serve ONNX models. KServe\u2019s PyTorchServer has been removed from the supported serving runtime list. KServe by default now uses TorchServe to serve PyTorch models. A few main KServe SDK class names have been changed: KFModel is renamed to Model KFServer is renamed to ModelServer KFModelRepository is renamed to ModelRepository What's new? \u00b6 Some notable updates are: ClusterServingRuntime and ServingRuntime CRDs are introduced. Learn more below . A new Model Spec was introduced to the InferenceService Predictor Spec as a new way to specify models. Learn more below . Knative 1.0 is now supported and certified for the KServe Serverless installation. gRPC is now supported for transformer to predictor network communication. TorchServe Serving runtime has been updated to 0.5.2 which now supports the KServe V2 REST protocol. ModelMesh now has multi-namespace support, and users can now deploy GCS or HTTP(S) hosted models. To see all release updates, check out the KServe release notes and ModelMesh Serving release notes ! ServingRuntimes and ClusterServingRuntimes \u00b6 This release introduces two new CRDs, ServingRuntimes and ClusterServingRuntimes ; the only difference between the two is that one is namespace-scoped and the other is cluster-scoped. A ServingRuntime defines the templates for Pods that can serve one or more particular model formats. Each ServingRuntime defines key information such as the container image of the runtime and a list of the model formats that the runtime supports. In previous versions of KServe, supported predictor formats and container images were defined in a config map in the control plane namespace. 
The ServingRuntime CRD should allow for improved flexibility and extensibility for defining or customizing runtimes to how you see fit without having to modify any controller code or any resources in the controller namespace. Several out-of-the-box ClusterServingRuntimes are provided with KServe so that users can continue to use KServe how they did before without having to define the runtimes themselves. Example SKLearn ClusterServingRuntime: apiVersion : serving.kserve.io/v1alpha1 kind : ClusterServingRuntime metadata : name : kserve-sklearnserver spec : supportedModelFormats : - name : sklearn version : \"1\" autoSelect : true containers : - name : kserve-container image : kserve/sklearnserver:latest args : - --model_name={{.Name}} - --model_dir=/mnt/models - --http_port=8080 resources : requests : cpu : \"1\" memory : 2Gi limits : cpu : \"1\" memory : 2Gi Updated InferenceService Predictor Spec \u00b6 A new Model spec was also introduced as a part of the Predictor spec for InferenceServices. One of the problems KServe was having was that the InferenceService CRD was becoming unwieldy with each model serving runtime being an object in the Predictor spec. This generated a lot of field duplication in the schema, bloating the overall size of the CRD. If a user wanted to introduce a new model serving framework for KServe to support, the CRD would have to be modified, and subsequently the controller code. Now, with the Model spec, a user can specify a model format and optionally a corresponding version. The KServe control plane will automatically select and use the ClusterServingRuntime or ServingRuntime that supports the given format. Each ServingRuntime maintains a list of supported model formats and versions. If a format has autoselect as true , then that opens the ServingRuntime up for automatic model placement for that model format. 
New Schema Previous Schema apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : example-sklearn-isvc spec : predictor : model : modelFormat : name : sklearn storageUri : s3://bucket/sklearn/mnist.joblib apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : example-sklearn-isvc spec : predictor : sklearn : storageUri : s3://bucket/sklearn/mnist.joblib The previous way of defining predictors is still supported, however, the new approach will be the preferred one going forward. Eventually, the previous schema, with the framework names as keys in the predictor spec, will be removed. ModelMesh Updates \u00b6 ModelMesh has been in the process of integrating as KServe\u2019s multi-model serving backend. With the inclusion of the aforementioned ServingRuntime CRDs and the Predictor Model spec, the two projects are now much more aligned, with continual improvements underway. ModelMesh now supports multi-namespace reconciliation. Previously, the ModelMesh controller would only reconcile against resources deployed in the same namespace as the controller. Now, by default, ModelMesh will be able to handle InferenceService deployments in any \"modelmesh-enabled\" namespace. Learn more here . Also, while ModelMesh previously only supported S3-based storage, we are happy to share that ModelMesh now works with models hosted using GCS and HTTP(S). Join the community \u00b6 Visit our Website or GitHub Join the Slack ( #kubeflow-kfserving ) Attend a biweekly community meeting on Wednesday 9am PST View our developer and doc contribution guides to learn how to make contributions. We are excited to work with you to make KServe better and promote its adoption! 
Thank you for trying out KServe!","title":"KServe 0.8 Release"},{"location":"blog/articles/2022-02-18-KServe-0.8-release/#authors","text":"Dan Sun , Paul Van Eck , Vedant Padwal , Andrews Arokiam on behalf of the KServe Working Group.","title":"Authors"},{"location":"blog/articles/2022-02-18-KServe-0.8-release/#announcing-kserve-v08","text":"February 18, 2022 Today, we are pleased to announce the v0.8.0 release of KServe! While the last release was focused on the transition of KFServing to KServe, this release was focused on unifying the InferenceService API for deploying models on KServe and ModelMesh. Note : For current users of KFServing/KServe, please take a few minutes to answer this short survey and provide your feedback! Now, let's take a look at some of the changes and additions to KServe.","title":"Announcing: KServe v0.8"},{"location":"blog/articles/2022-02-18-KServe-0.8-release/#whats-changed","text":"ONNX Runtime Server has been removed from the supported serving runtime list. KServe by default now uses the Triton Inference Server to serve ONNX models. KServe\u2019s PyTorchServer has been removed from the supported serving runtime list. KServe by default now uses TorchServe to serve PyTorch models. A few main KServe SDK class names have been changed: KFModel is renamed to Model KFServer is renamed to ModelServer KFModelRepository is renamed to ModelRepository","title":"What\u2019s changed?"},{"location":"blog/articles/2022-02-18-KServe-0.8-release/#whats-new","text":"Some notable updates are: ClusterServingRuntime and ServingRuntime CRDs are introduced. Learn more below . A new Model Spec was introduced to the InferenceService Predictor Spec as a new way to specify models. Learn more below . Knative 1.0 is now supported and certified for the KServe Serverless installation. gRPC is now supported for transformer to predictor network communication. TorchServe Serving runtime has been updated to 0.5.2 which now supports the KServe V2 REST protocol. 
ModelMesh now has multi-namespace support, and users can now deploy GCS or HTTP(S) hosted models. To see all release updates, check out the KServe release notes and ModelMesh Serving release notes !","title":"What's new?"},{"location":"blog/articles/2022-02-18-KServe-0.8-release/#servingruntimes-and-clusterservingruntimes","text":"This release introduces two new CRDs ServingRuntimes and ClusterServingRuntimes with the only difference between these two is that one is namespace-scoped and one is cluster-scoped. A ServingRuntime defines the templates for Pods that can serve one or more particular model formats. Each ServingRuntime defines key information such as the container image of the runtime and a list of the model formats that the runtime supports. In previous versions of KServe, supported predictor formats and container images were defined in a config map in the control plane namespace. The ServingRuntime CRD should allow for improved flexibility and extensibility for defining or customizing runtimes to how you see fit without having to modify any controller code or any resources in the controller namespace. Several out-of-the-box ClusterServingRuntimes are provided with KServe so that users can continue to use KServe how they did before without having to define the runtimes themselves. 
Example SKLearn ClusterServingRuntime: apiVersion : serving.kserve.io/v1alpha1 kind : ClusterServingRuntime metadata : name : kserve-sklearnserver spec : supportedModelFormats : - name : sklearn version : \"1\" autoSelect : true containers : - name : kserve-container image : kserve/sklearnserver:latest args : - --model_name={{.Name}} - --model_dir=/mnt/models - --http_port=8080 resources : requests : cpu : \"1\" memory : 2Gi limits : cpu : \"1\" memory : 2Gi","title":"ServingRuntimes and ClusterServingRuntimes"},{"location":"blog/articles/2022-02-18-KServe-0.8-release/#updated-inferenceservice-predictor-spec","text":"A new Model spec was also introduced as a part of the Predictor spec for InferenceServices. One of the problems KServe was having was that the InferenceService CRD was becoming unwieldy with each model serving runtime being an object in the Predictor spec. This generated a lot of field duplication in the schema, bloating the overall size of the CRD. If a user wanted to introduce a new model serving framework for KServe to support, the CRD would have to be modified, and subsequently the controller code. Now, with the Model spec, a user can specify a model format and optionally a corresponding version. The KServe control plane will automatically select and use the ClusterServingRuntime or ServingRuntime that supports the given format. Each ServingRuntime maintains a list of supported model formats and versions. If a format has autoselect as true , then that opens the ServingRuntime up for automatic model placement for that model format. 
New Schema Previous Schema apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : example-sklearn-isvc spec : predictor : model : modelFormat : name : sklearn storageUri : s3://bucket/sklearn/mnist.joblib apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : example-sklearn-isvc spec : predictor : sklearn : storageUri : s3://bucket/sklearn/mnist.joblib The previous way of defining predictors is still supported, however, the new approach will be the preferred one going forward. Eventually, the previous schema, with the framework names as keys in the predictor spec, will be removed.","title":"Updated InferenceService Predictor Spec"},{"location":"blog/articles/2022-02-18-KServe-0.8-release/#modelmesh-updates","text":"ModelMesh has been in the process of integrating as KServe\u2019s multi-model serving backend. With the inclusion of the aforementioned ServingRuntime CRDs and the Predictor Model spec, the two projects are now much more aligned, with continual improvements underway. ModelMesh now supports multi-namespace reconciliation. Previously, the ModelMesh controller would only reconcile against resources deployed in the same namespace as the controller. Now, by default, ModelMesh will be able to handle InferenceService deployments in any \"modelmesh-enabled\" namespace. Learn more here . Also, while ModelMesh previously only supported S3-based storage, we are happy to share that ModelMesh now works with models hosted using GCS and HTTP(S).","title":"ModelMesh Updates"},{"location":"blog/articles/2022-02-18-KServe-0.8-release/#join-the-community","text":"Visit our Website or GitHub Join the Slack ( #kubeflow-kfserving ) Attend a biweekly community meeting on Wednesday 9am PST View our developer and doc contribution guides to learn how to make contributions. We are excited to work with you to make KServe better and promote its adoption! 
Thank you for trying out KServe!","title":"Join the community"},{"location":"blog/articles/2022-07-21-KServe-0.9-release/","text":"Announcing: KServe v0.9.0 \u00b6 Today, we are pleased to announce the v0.9.0 release of KServe! KServe has now fully onboarded to LF AI & Data Foundation as an Incubation Project ! In this release we are excited to introduce the new InferenceGraph feature which has long been asked from the community. Also continuing the effort from the last release for unifying the InferenceService API for deploying models on KServe and ModelMesh, ModelMesh is now fully compatible with KServe InferenceService API! Introduce InferenceGraph \u00b6 The ML Inference system is getting bigger and more complex. It often consists of many models to make a single prediction. The common use cases are image classification and natural language multi-stage processing pipelines. For example, an image classification pipeline needs to run top level classification first then downstream further classification based on previous prediction results. KServe has the unique strength to build the distributed inference graph with its native integration of InferenceServices, standard inference protocol for chaining models and serverless auto-scaling capabilities. KServe leverages these strengths to build the InferenceGraph and enable users to deploy complex ML Inference pipelines to production in a declarative and scalable way. InferenceGraph is made up of a list of routing nodes with each node consisting of a set of routing steps. Each step can either route to an InferenceService or another node defined on the graph which makes the InferenceGraph highly composable. The graph router is deployed behind an HTTP endpoint and can be scaled dynamically based on request volume. The InferenceGraph supports four different types of routing nodes: Sequence , Switch , Ensemble , Splitter . 
Sequence Node : It allows users to define multiple Steps with InferenceServices or Nodes as routing targets in a sequence. The Steps are executed in sequence and the request/response from the previous step and be passed to the next step as input based on configuration. Switch Node : It allows users to define routing conditions and select a Step to execute if it matches the condition. The response is returned as soon as it finds the first step that matches the condition. If no condition is matched, the graph returns the original request. Ensemble Node : A model ensemble requires scoring each model separately and then combines the results into a single prediction response. You can then use different combination methods to produce the final result. Multiple classification trees, for example, are commonly combined using a \"majority vote\" method. Multiple regression trees are often combined using various averaging techniques. Splitter Node : It allows users to split the traffic to multiple targets using a weighted distribution. 
apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"cat-dog-classifier\" spec : predictor : pytorch : resources : requests : cpu : 100m storageUri : gs://kfserving-examples/models/torchserve/cat_dog_classification --- apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"dog-breed-classifier\" spec : predictor : pytorch : resources : requests : cpu : 100m storageUri : gs://kfserving-examples/models/torchserve/dog_breed_classification --- apiVersion : \"serving.kserve.io/v1alpha1\" kind : \"InferenceGraph\" metadata : name : \"dog-breed-pipeline\" spec : nodes : root : routerType : Sequence steps : - serviceName : cat-dog-classifier name : cat_dog_classifier # step name - serviceName : dog-breed-classifier name : dog_breed_classifier data : $request condition : \"[@this].#(predictions.0==\\\"dog\\\")\" Currently InferenceGraph is supported with the Serverless deployment mode. You can try it out following the tutorial . InferenceService API for ModelMesh \u00b6 The InferenceService CRD is now the primary interface for interacting with ModelMesh. Some changes were made to the InferenceService spec to better facilitate ModelMesh\u2019s needs. Storage Spec \u00b6 To unify how model storage is defined for both single and multi-model serving, a new storage spec was added to the predictor model spec. With this storage spec, users can specify a key inside a common secret holding config/credentials for each of the storage backends from which models can be loaded. Example: storage : key : localMinIO # Credential key for the destination storage in the common secret path : sklearn # Model path inside the bucket # schemaPath: null # Optional schema files for payload schema parameters : # Parameters to override the default values inside the common secret. bucket : example-models Learn more here . 
Model Status \u00b6 For further alignment between ModelMesh and KServe, some additions to the InferenceService status were made. There is now a Model Status section which contains information about the model loaded in the predictor. New fields include: states - State information of the predictor's model. activeModelState - The state of the model currently being served by the predictor's endpoints. targetModelState - This will be set only when transitionStatus is not UpToDate , meaning that the target model differs from the currently-active model. transitionStatus - Indicates state of the predictor relative to its current spec. modelCopies - Model copy information of the predictor's model. lastFailureInfo - Details about the most recent error associated with this predictor. Not all of the contained fields will necessarily have a value. Deploying on ModelMesh \u00b6 For deploying InferenceServices on ModelMesh, the ModelMesh and KServe controllers will still require that the user specifies the serving.kserve.io/deploymentMode: ModelMesh annotation. A complete example on an InferenceService with the new storage spec is showing below: apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : example-tensorflow-mnist annotations : serving.kserve.io/deploymentMode : ModelMesh spec : predictor : model : modelFormat : name : tensorflow storage : key : localMinIO path : tensorflow/mnist.savedmodel Other New Features: \u00b6 Support serving MLFlow model format via MLServer serving runtime. Support unified autoscaling target and metric fields for InferenceService components with both Serverless and RawDeployment mode. Support InferenceService ingress class and url domain template configuration for RawDeployment mode. ModelMesh now has a default OpenVINO Model Server ServingRuntime. What\u2019s Changed? \u00b6 The KServe controller manager is changed from StatefulSet to Deployment to support HA mode. 
log4j security vulnerability fix Upgrade TorchServe serving runtime to 0.6.0 Update MLServer serving runtime to 1.0.0 Check out the full release notes for KServe and ModelMesh for more details. Join the community \u00b6 Visit our Website or GitHub Join the Slack ( #kserve ) Attend our community meeting by subscribing to the KServe calendar . View our community github repository to learn how to make contributions. We are excited to work with you to make KServe better and promote its adoption! Thank you for contributing or checking out KServe! \u2013 The KServe Working Group","title":"KServe 0.9 Release"},{"location":"blog/articles/2022-07-21-KServe-0.9-release/#announcing-kserve-v090","text":"Today, we are pleased to announce the v0.9.0 release of KServe! KServe has now fully onboarded to LF AI & Data Foundation as an Incubation Project ! In this release we are excited to introduce the new InferenceGraph feature which has long been asked from the community. Also continuing the effort from the last release for unifying the InferenceService API for deploying models on KServe and ModelMesh, ModelMesh is now fully compatible with KServe InferenceService API!","title":"Announcing: KServe v0.9.0"},{"location":"blog/articles/2022-07-21-KServe-0.9-release/#introduce-inferencegraph","text":"The ML Inference system is getting bigger and more complex. It often consists of many models to make a single prediction. The common use cases are image classification and natural language multi-stage processing pipelines. For example, an image classification pipeline needs to run top level classification first then downstream further classification based on previous prediction results. KServe has the unique strength to build the distributed inference graph with its native integration of InferenceServices, standard inference protocol for chaining models and serverless auto-scaling capabilities. 
KServe leverages these strengths to build the InferenceGraph and enable users to deploy complex ML Inference pipelines to production in a declarative and scalable way. InferenceGraph is made up of a list of routing nodes with each node consisting of a set of routing steps. Each step can either route to an InferenceService or another node defined on the graph which makes the InferenceGraph highly composable. The graph router is deployed behind an HTTP endpoint and can be scaled dynamically based on request volume. The InferenceGraph supports four different types of routing nodes: Sequence , Switch , Ensemble , Splitter . Sequence Node : It allows users to define multiple Steps with InferenceServices or Nodes as routing targets in a sequence. The Steps are executed in sequence and the request/response from the previous step and be passed to the next step as input based on configuration. Switch Node : It allows users to define routing conditions and select a Step to execute if it matches the condition. The response is returned as soon as it finds the first step that matches the condition. If no condition is matched, the graph returns the original request. Ensemble Node : A model ensemble requires scoring each model separately and then combines the results into a single prediction response. You can then use different combination methods to produce the final result. Multiple classification trees, for example, are commonly combined using a \"majority vote\" method. Multiple regression trees are often combined using various averaging techniques. Splitter Node : It allows users to split the traffic to multiple targets using a weighted distribution. 
apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"cat-dog-classifier\" spec : predictor : pytorch : resources : requests : cpu : 100m storageUri : gs://kfserving-examples/models/torchserve/cat_dog_classification --- apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"dog-breed-classifier\" spec : predictor : pytorch : resources : requests : cpu : 100m storageUri : gs://kfserving-examples/models/torchserve/dog_breed_classification --- apiVersion : \"serving.kserve.io/v1alpha1\" kind : \"InferenceGraph\" metadata : name : \"dog-breed-pipeline\" spec : nodes : root : routerType : Sequence steps : - serviceName : cat-dog-classifier name : cat_dog_classifier # step name - serviceName : dog-breed-classifier name : dog_breed_classifier data : $request condition : \"[@this].#(predictions.0==\\\"dog\\\")\" Currently InferenceGraph is supported with the Serverless deployment mode. You can try it out following the tutorial .","title":"Introduce InferenceGraph"},{"location":"blog/articles/2022-07-21-KServe-0.9-release/#inferenceservice-api-for-modelmesh","text":"The InferenceService CRD is now the primary interface for interacting with ModelMesh. Some changes were made to the InferenceService spec to better facilitate ModelMesh\u2019s needs.","title":"InferenceService API for ModelMesh"},{"location":"blog/articles/2022-07-21-KServe-0.9-release/#storage-spec","text":"To unify how model storage is defined for both single and multi-model serving, a new storage spec was added to the predictor model spec. With this storage spec, users can specify a key inside a common secret holding config/credentials for each of the storage backends from which models can be loaded. 
Example: storage : key : localMinIO # Credential key for the destination storage in the common secret path : sklearn # Model path inside the bucket # schemaPath: null # Optional schema files for payload schema parameters : # Parameters to override the default values inside the common secret. bucket : example-models Learn more here .","title":"Storage Spec"},{"location":"blog/articles/2022-07-21-KServe-0.9-release/#model-status","text":"For further alignment between ModelMesh and KServe, some additions to the InferenceService status were made. There is now a Model Status section which contains information about the model loaded in the predictor. New fields include: states - State information of the predictor's model. activeModelState - The state of the model currently being served by the predictor's endpoints. targetModelState - This will be set only when transitionStatus is not UpToDate , meaning that the target model differs from the currently-active model. transitionStatus - Indicates state of the predictor relative to its current spec. modelCopies - Model copy information of the predictor's model. lastFailureInfo - Details about the most recent error associated with this predictor. Not all of the contained fields will necessarily have a value.","title":"Model Status"},{"location":"blog/articles/2022-07-21-KServe-0.9-release/#deploying-on-modelmesh","text":"For deploying InferenceServices on ModelMesh, the ModelMesh and KServe controllers will still require that the user specifies the serving.kserve.io/deploymentMode: ModelMesh annotation. 
A complete example on an InferenceService with the new storage spec is showing below: apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : example-tensorflow-mnist annotations : serving.kserve.io/deploymentMode : ModelMesh spec : predictor : model : modelFormat : name : tensorflow storage : key : localMinIO path : tensorflow/mnist.savedmodel","title":"Deploying on ModelMesh"},{"location":"blog/articles/2022-07-21-KServe-0.9-release/#other-new-features","text":"Support serving MLFlow model format via MLServer serving runtime. Support unified autoscaling target and metric fields for InferenceService components with both Serverless and RawDeployment mode. Support InferenceService ingress class and url domain template configuration for RawDeployment mode. ModelMesh now has a default OpenVINO Model Server ServingRuntime.","title":"Other New Features:"},{"location":"blog/articles/2022-07-21-KServe-0.9-release/#whats-changed","text":"The KServe controller manager is changed from StatefulSet to Deployment to support HA mode. log4j security vulnerability fix Upgrade TorchServe serving runtime to 0.6.0 Update MLServer serving runtime to 1.0.0 Check out the full release notes for KServe and ModelMesh for more details.","title":"What\u2019s Changed?"},{"location":"blog/articles/2022-07-21-KServe-0.9-release/#join-the-community","text":"Visit our Website or GitHub Join the Slack ( #kserve ) Attend our community meeting by subscribing to the KServe calendar . View our community github repository to learn how to make contributions. We are excited to work with you to make KServe better and promote its adoption! Thank you for contributing or checking out KServe! \u2013 The KServe Working Group","title":"Join the community"},{"location":"blog/articles/2023-02-05-KServe-0.10-release/","text":"Announcing: KServe v0.10.0 \u00b6 We are excited to announce KServe 0.10 release. 
In this release we have enabled more KServe networking options, improved KServe telemetry for supported serving runtimes and increased support coverage for Open(aka v2) inference protocol for both standard and ModelMesh InferenceService. KServe Networking Options \u00b6 Istio is now optional for both Serverless and RawDeployment mode. Please see the alternative networking guide for how you can enable other ingress options supported by Knative with Serverless mode. For Istio users, if you want to turn on full service mesh mode to secure InferenceService with mutual TLS and enable the traffic policies, please read the service mesh setup guideline . KServe Telemetry for Serving Runtimes \u00b6 We have instrumented additional latency metrics in KServe Python ServingRuntimes for preprocess , predict and postprocess handlers. In Serverless mode we have extended Knative queue-proxy to enable metrics aggregation for both metrics exposed in queue-proxy and kserve-container from each ServingRuntime . Please read the prometheus metrics setup guideline for how to enable the metrics scraping and aggregations. Open(v2) Inference Protocol Support Coverage \u00b6 As there have been increasing adoptions for KServe v2 Inference Protocol from AMD Inference ServingRuntime which supports FPGAs and OpenVINO which now provides KServe REST and gRPC compatible API, in the issue we have proposed to rename to KServe Open Inference Protocol . In KServe 0.10, we have added Open(v2) inference protocol support for KServe custom runtimes. Now, you can enable v2 REST/gRPC for both custom transformer and predictor with images built by implementing KServe Python SDK API. gRPC enables high performance inference data plane as it is built on top of HTTP/2 and binary data transportation which is more efficient to send over the wire compared to REST. Please see the detailed example for transformer and predictor . from kserve import Model def image_transform ( byte_array ): image_processing = transforms . 
Compose ([ transforms . ToTensor (), transforms . Normalize (( 0.1307 ,), ( 0.3081 ,)) ]) image = Image . open ( io . BytesIO ( byte_array )) tensor = image_processing ( image ) . numpy () return tensor class CustomModel ( Model ): def predict ( self , request : InferRequest , headers : Dict [ str , str ]) -> InferResponse : input_tensors = [ image_transform ( instance ) for instance in request . inputs [ 0 ] . data ] input_tensors = np . asarray ( input_tensors ) output = self . model ( input_tensors ) torch . nn . functional . softmax ( output , dim = 1 ) values , top_5 = torch . topk ( output , 5 ) result = values . flatten () . tolist () response_id = generate_uuid () infer_output = InferOutput ( name = \"output-0\" , shape = list ( values . shape ), datatype = \"FP32\" , data = result ) infer_response = InferResponse ( model_name = self . name , infer_outputs = [ infer_output ], response_id = response_id ) return infer_response class CustomTransformer ( Model ): def preprocess ( self , request : InferRequest , headers : Dict [ str , str ]) -> InferRequest : input_tensors = [ image_transform ( instance ) for instance in request . inputs [ 0 ] . data ] input_tensors = np . asarray ( input_tensors ) infer_inputs = [ InferInput ( name = \"INPUT__0\" , datatype = 'FP32' , shape = list ( input_tensors . shape ), data = input_tensors )] infer_request = InferRequest ( model_name = self . model_name , infer_inputs = infer_inputs ) return infer_request You can use the same Python API type InferRequest and InferResponse for both REST and gRPC protocol. KServe handles the underlying decoding and encoding according to the protocol. Warning A new headers argument is added to the custom handlers to pass http/gRPC headers or other metadata. You can also use this as context dict to pass data between handlers. If you have existing custom transformer or predictor, the headers argument is now required to add to the preprocess , predict and postprocess handlers. 
Please check the following matrix for supported ModelFormats and ServingRuntimes . Model Format v1 Open(v2) REST/gRPC Tensorflow \u2705 TFServing \u2705 Triton PyTorch \u2705 TorchServe \u2705 TorchServe TorchScript \u2705 TorchServe \u2705 Triton ONNX \u274c \u2705 Triton Scikit-learn \u2705 KServe \u2705 MLServer XGBoost \u2705 KServe \u2705 MLServer LightGBM \u2705 KServe \u2705 MLServer MLFlow \u274c \u2705 MLServer Custom \u2705 KServe \u2705 KServe Multi-Arch Image Support \u00b6 KServe control plane images kserve-controller , kserve/agent , kserve/router are now supported for multiple architectures: ppc64le , arm64 , amd64 , s390x . KServe Storage Credentials Support \u00b6 Currently, AWS users need to create a secret with long term/static IAM credentials for downloading models stored in S3. Security best practice is to use IAM role for service account(IRSA) which enables automatic credential rotation and fine-grained access control, see how to setup IRSA . Support Azure Blobs with managed identity . ModelMesh updates \u00b6 ModelMesh has continued to integrate itself as KServe's multi-model serving backend, introducing improvements and features that better align the two projects. For example, it now supports ClusterServingRuntimes, allowing use of cluster-scoped ServingRuntimes, originally introduced in KServe 0.8. Additionally, ModelMesh introduced support for TorchServe enabling users to serve arbitrary PyTorch models (e.g. eager-mode) in the context of distributed-multi-model serving. Other limitations have been addressed as well, such as adding support for BYTES/string type tensors when using the REST inference API for inference requests that require them. Other Changes: \u00b6 For a complete change list please read the release notes from KServe v0.10 and ModelMesh v0.10 . Join the community \u00b6 Visit our Website or GitHub Join the Slack ( #kserve ) Attend our community meeting by subscribing to the KServe calendar . 
View our community github repository to learn how to make contributions. We are excited to work with you to make KServe better and promote its adoption! Thanks for all the contributors who have made the commits to 0.10 release! Steve Larkin Stephan Schielke Curtis Maddalozzo Zhongcheng Lao Dimitris Aragiorgis Pan Li tjandy98 Sukumar Gaonkar Rachit Chauhan Rafael Vasquez Tim Kleinloog Christian Kadner ddelange Lize Cai sangjune.park Suresh Nakkeran Konstantinos Messis Matt Rose Alexa Griffith Jagadeesh J Alex Lembiyeuski Yuki Iwai Andrews Arokiam Xin Fu adilhusain-s Pranav Pandit C1berwiz dilverse Yuan Tang Dan Sun Nick Hill The KServe Working Group","title":"KServe 0.10 Release"},{"location":"blog/articles/2023-02-05-KServe-0.10-release/#announcing-kserve-v0100","text":"We are excited to announce KServe 0.10 release. In this release we have enabled more KServe networking options, improved KServe telemetry for supported serving runtimes and increased support coverage for Open(aka v2) inference protocol for both standard and ModelMesh InferenceService.","title":"Announcing: KServe v0.10.0"},{"location":"blog/articles/2023-02-05-KServe-0.10-release/#kserve-networking-options","text":"Istio is now optional for both Serverless and RawDeployment mode. Please see the alternative networking guide for how you can enable other ingress options supported by Knative with Serverless mode. For Istio users, if you want to turn on full service mesh mode to secure InferenceService with mutual TLS and enable the traffic policies, please read the service mesh setup guideline .","title":"KServe Networking Options"},{"location":"blog/articles/2023-02-05-KServe-0.10-release/#kserve-telemetry-for-serving-runtimes","text":"We have instrumented additional latency metrics in KServe Python ServingRuntimes for preprocess , predict and postprocess handlers. 
In Serverless mode we have extended Knative queue-proxy to enable metrics aggregation for both metrics exposed in queue-proxy and kserve-container from each ServingRuntime . Please read the prometheus metrics setup guideline for how to enable the metrics scraping and aggregations.","title":"KServe Telemetry for Serving Runtimes"},{"location":"blog/articles/2023-02-05-KServe-0.10-release/#openv2-inference-protocol-support-coverage","text":"As there have been increasing adoptions for KServe v2 Inference Protocol from AMD Inference ServingRuntime which supports FPGAs and OpenVINO which now provides KServe REST and gRPC compatible API, in the issue we have proposed to rename to KServe Open Inference Protocol . In KServe 0.10, we have added Open(v2) inference protocol support for KServe custom runtimes. Now, you can enable v2 REST/gRPC for both custom transformer and predictor with images built by implementing KServe Python SDK API. gRPC enables high performance inference data plane as it is built on top of HTTP/2 and binary data transportation which is more efficient to send over the wire compared to REST. Please see the detailed example for transformer and predictor . from kserve import Model def image_transform ( byte_array ): image_processing = transforms . Compose ([ transforms . ToTensor (), transforms . Normalize (( 0.1307 ,), ( 0.3081 ,)) ]) image = Image . open ( io . BytesIO ( byte_array )) tensor = image_processing ( image ) . numpy () return tensor class CustomModel ( Model ): def predict ( self , request : InferRequest , headers : Dict [ str , str ]) -> InferResponse : input_tensors = [ image_transform ( instance ) for instance in request . inputs [ 0 ] . data ] input_tensors = np . asarray ( input_tensors ) output = self . model ( input_tensors ) torch . nn . functional . softmax ( output , dim = 1 ) values , top_5 = torch . topk ( output , 5 ) result = values . flatten () . 
tolist () response_id = generate_uuid () infer_output = InferOutput ( name = \"output-0\" , shape = list ( values . shape ), datatype = \"FP32\" , data = result ) infer_response = InferResponse ( model_name = self . name , infer_outputs = [ infer_output ], response_id = response_id ) return infer_response class CustomTransformer ( Model ): def preprocess ( self , request : InferRequest , headers : Dict [ str , str ]) -> InferRequest : input_tensors = [ image_transform ( instance ) for instance in request . inputs [ 0 ] . data ] input_tensors = np . asarray ( input_tensors ) infer_inputs = [ InferInput ( name = \"INPUT__0\" , datatype = 'FP32' , shape = list ( input_tensors . shape ), data = input_tensors )] infer_request = InferRequest ( model_name = self . model_name , infer_inputs = infer_inputs ) return infer_request You can use the same Python API type InferRequest and InferResponse for both REST and gRPC protocol. KServe handles the underlying decoding and encoding according to the protocol. Warning A new headers argument is added to the custom handlers to pass http/gRPC headers or other metadata. You can also use this as context dict to pass data between handlers. If you have existing custom transformer or predictor, the headers argument is now required to add to the preprocess , predict and postprocess handlers. Please check the following matrix for supported ModelFormats and ServingRuntimes . 
Model Format v1 Open(v2) REST/gRPC Tensorflow \u2705 TFServing \u2705 Triton PyTorch \u2705 TorchServe \u2705 TorchServe TorchScript \u2705 TorchServe \u2705 Triton ONNX \u274c \u2705 Triton Scikit-learn \u2705 KServe \u2705 MLServer XGBoost \u2705 KServe \u2705 MLServer LightGBM \u2705 KServe \u2705 MLServer MLFlow \u274c \u2705 MLServer Custom \u2705 KServe \u2705 KServe","title":"Open(v2) Inference Protocol Support Coverage"},{"location":"blog/articles/2023-02-05-KServe-0.10-release/#multi-arch-image-support","text":"KServe control plane images kserve-controller , kserve/agent , kserve/router are now supported for multiple architectures: ppc64le , arm64 , amd64 , s390x .","title":"Multi-Arch Image Support"},{"location":"blog/articles/2023-02-05-KServe-0.10-release/#kserve-storage-credentials-support","text":"Currently, AWS users need to create a secret with long term/static IAM credentials for downloading models stored in S3. Security best practice is to use IAM role for service account(IRSA) which enables automatic credential rotation and fine-grained access control, see how to setup IRSA . Support Azure Blobs with managed identity .","title":"KServe Storage Credentials Support"},{"location":"blog/articles/2023-02-05-KServe-0.10-release/#modelmesh-updates","text":"ModelMesh has continued to integrate itself as KServe's multi-model serving backend, introducing improvements and features that better align the two projects. For example, it now supports ClusterServingRuntimes, allowing use of cluster-scoped ServingRuntimes, originally introduced in KServe 0.8. Additionally, ModelMesh introduced support for TorchServe enabling users to serve arbitrary PyTorch models (e.g. eager-mode) in the context of distributed-multi-model serving. 
Other limitations have been addressed as well, such as adding support for BYTES/string type tensors when using the REST inference API for inference requests that require them.","title":"ModelMesh updates"},{"location":"blog/articles/2023-02-05-KServe-0.10-release/#other-changes","text":"For a complete change list please read the release notes from KServe v0.10 and ModelMesh v0.10 .","title":"Other Changes:"},{"location":"blog/articles/2023-02-05-KServe-0.10-release/#join-the-community","text":"Visit our Website or GitHub Join the Slack ( #kserve ) Attend our community meeting by subscribing to the KServe calendar . View our community github repository to learn how to make contributions. We are excited to work with you to make KServe better and promote its adoption! Thanks for all the contributors who have made the commits to 0.10 release! Steve Larkin Stephan Schielke Curtis Maddalozzo Zhongcheng Lao Dimitris Aragiorgis Pan Li tjandy98 Sukumar Gaonkar Rachit Chauhan Rafael Vasquez Tim Kleinloog Christian Kadner ddelange Lize Cai sangjune.park Suresh Nakkeran Konstantinos Messis Matt Rose Alexa Griffith Jagadeesh J Alex Lembiyeuski Yuki Iwai Andrews Arokiam Xin Fu adilhusain-s Pranav Pandit C1berwiz dilverse Yuan Tang Dan Sun Nick Hill The KServe Working Group","title":"Join the community"},{"location":"blog/articles/2023-10-08-KServe-0.11-release/","text":"Announcing: KServe v0.11 \u00b6 We are excited to announce the release of KServe 0.11, in this release we introduced Large Language Model (LLM) runtimes, made enhancements to the KServe control plane, Python SDK Open Inference Protocol support and dependency management. For ModelMesh we have added features PVC, HPA, payload logging to ensure feature parity with KServe. Here is a summary of the key changes: KServe Core Inference Enhancements \u00b6 Support path based routing which is served as an alternative way to the host based routing, the URL of the InferenceService could look like http://
/serving// . Please refer to the doc for how to enable path based routing. Introduced priority field for Serving Runtime custom resource to handle the case when you have multiple serving runtimes which support the same model formats, see more details from the serving runtime doc . Introduced Custom Storage Container CRD to allow customized implementations with supported storage URI prefixes, example use cases are private model registry integration: apiVersion : \"serving.kserve.io/v1alpha1\" kind : ClusterStorageContainer metadata : name : default spec : container : name : storage-initializer image : kserve/model-registry:latest resources : requests : memory : 100Mi cpu : 100m limits : memory : 1Gi cpu : \"1\" supportedUriFormats : - prefix : model-registry:// Inference Graph enhancements for improving the API spec to support pod affinity and resource requirement fields. Dependency field with options Soft and Hard is introduced to handle error responses from the inference steps to decide whether to short-circuit the request in case of errors, see the following example with hard dependency with the node steps: apiVersion : serving.kserve.io/v1alpha1 kind : InferenceGraph metadata : name : graph_with_switch_node spec : nodes : root : routerType : Sequence steps : - name : \"rootStep1\" nodeName : node1 dependency : Hard - name : \"rootStep2\" serviceName : {{ success_200_isvc_id }} node1 : routerType : Switch steps : - name : \"node1Step1\" serviceName : {{ error_404_isvc_id }} condition : \"[@this].#(decision_picker==ERROR)\" dependency : Hard For more details please refer to the issue . Improved InferenceService debugging experience by adding the aggregated RoutesReady status and LastDeploymentReady condition to the InferenceService Status to differentiate the endpoint and deployment status. This applies to the serverless mode and for more details refer to the API docs . 
Enhanced Python SDK Dependency Management \u00b6 KServe has adopted poetry to manage python dependencies. You can now install the KServe SDK with locked dependencies using poetry install . While pip install still works, we highly recommend using poetry to ensure predictable dependency management. The KServe SDK is also slimmed down by making the cloud storage dependency optional, if you require storage dependency for custom serving runtimes you can still install with pip install kserve[storage] . KServe Python Runtimes Improvements \u00b6 KServe Python Runtimes including sklearnserver , lgbserver , xgbserver now support the open inference protocol for both REST and gRPC. Logging improvements including adding Uvicorn access logging and a default KServe logger. Postprocess handler has been aligned with open inference protocol, simplifying the underlying transportation protocol complexities. LLM Runtimes \u00b6 TorchServe LLM Runtime \u00b6 KServe now integrates with TorchServe 0.8, offering the support for LLM models that may not fit onto a single GPU. Huggingface Accelerate and Deepspeed are available options to split the model into multiple partitions over multiple GPUs. You can see the detailed example for how to serve the LLM on KServe with TorchServe runtime. vLLM Runtime \u00b6 Serving LLM models can be surprisingly slow even on high end GPUs, vLLM is a fast and easy-to-use LLM inference engine. It can achieve 10x-20x higher throughput than Huggingface transformers. It supports continuous batching for increased throughput and GPU utilization, paged attention to address the memory bottleneck where in the autoregressive decoding process all the attention key value tensors(KV Cache) are kept in the GPU memory to generate next tokens. In the example we show how to deploy vLLM on KServe and expects further integration in KServe 0.12 with proposed generate endpoint for open inference protocol. 
ModelMesh Updates \u00b6 Storing Models on Kubernetes Persistent Volumes (PVC) \u00b6 ModelMesh now allows to directly mount model files onto serving runtimes pods using Kubernetes Persistent Volumes . Depending on the selected storage solution this approach can significantly reduce latency when deploying new predictors, potentially remove the need for additional S3 cloud object storage like AWS S3, GCS, or Azure Blob Storage altogether. Horizontal Pod Autoscaling (HPA) \u00b6 Kubernetes Horizontal Pod Autoscaling can now be used at the serving runtime pod level. With HPA enabled, the ModelMesh controller no longer manages the number of replicas. Instead, a HorizontalPodAutoscaler automatically updates the serving runtime deployment with the number of Pods to best match the demand. Model Metrics, Metrics Dashboard, Payload Event Logging \u00b6 ModelMesh v0.11 introduces a new configuration option to emit a subset of useful metrics at the individual model level. These metrics can help identify outlier or \"heavy hitter\" models and consequently fine-tune the deployments of those inference services, like allocating more resources or increasing the number of replicas for improved responsiveness or avoid frequent cache misses. A new Grafana dashboard was added to display the comprehensive set of Prometheus metrics like model loading and unloading rates, internal queuing delays, capacity and usage, cache state, etc. to monitor the general health of the ModelMesh Serving deployment. The new PayloadProcessor interface can be implemented to log prediction requests and responses, to create data sinks for data visualization, for model quality assessment, or for drift and outlier detection by external monitoring systems. What's Changed? \u00b6 To allow longer InferenceService name due to DNS max length limits from issue , the Default suffix in the inference service component(predictor/transformer/explainer) name has been removed for newly created InferenceServices. 
This affects the client that is using the component url directly instead of the top level InferenceService url. Status.address.url is now consistent for both serverless and raw deployment mode, the url path portion is dropped in serverless mode. Raw bytes are now accepted in v1 protocol, setting the right content-type header to application/json is required to recognize and decode the json payload if content-type is specified. curl -v -H \"Content-Type: application/json\" http://sklearn-iris.kserve-test. ${ CUSTOM_DOMAIN } /v1/models/sklearn-iris:predict -d @./iris-input.json For a complete change list please read the release notes from KServe v0.11 and ModelMesh v0.11 . Join the community \u00b6 Visit our Website or GitHub Join the Slack ( #kserve ) Attend our community meeting by subscribing to the KServe calendar . View our community github repository to learn how to make contributions. We are excited to work with you to make KServe better and promote its adoption! Thanks for all the contributors who have made the commits to 0.11 release! The KServe Working Group","title":"KServe 0.11 Release"},{"location":"blog/articles/2023-10-08-KServe-0.11-release/#announcing-kserve-v011","text":"We are excited to announce the release of KServe 0.11, in this release we introduced Large Language Model (LLM) runtimes, made enhancements to the KServe control plane, Python SDK Open Inference Protocol support and dependency management. For ModelMesh we have added features PVC, HPA, payload logging to ensure feature parity with KServe. Here is a summary of the key changes:","title":"Announcing: KServe v0.11"},{"location":"blog/articles/2023-10-08-KServe-0.11-release/#kserve-core-inference-enhancements","text":"Support path based routing which is served as an alternative way to the host based routing, the URL of the InferenceService could look like http:///serving// . Please refer to the doc for how to enable path based routing.
Introduced priority field for Serving Runtime custom resource to handle the case when you have multiple serving runtimes which support the same model formats, see more details from the serving runtime doc . Introduced Custom Storage Container CRD to allow customized implementations with supported storage URI prefixes, example use cases are private model registry integration: apiVersion : \"serving.kserve.io/v1alpha1\" kind : ClusterStorageContainer metadata : name : default spec : container : name : storage-initializer image : kserve/model-registry:latest resources : requests : memory : 100Mi cpu : 100m limits : memory : 1Gi cpu : \"1\" supportedUriFormats : - prefix : model-registry:// Inference Graph enhancements for improving the API spec to support pod affinity and resource requirement fields. Dependency field with options Soft and Hard is introduced to handle error responses from the inference steps to decide whether to short-circuit the request in case of errors, see the following example with hard dependency with the node steps: apiVersion : serving.kserve.io/v1alpha1 kind : InferenceGraph metadata : name : graph_with_switch_node spec : nodes : root : routerType : Sequence steps : - name : \"rootStep1\" nodeName : node1 dependency : Hard - name : \"rootStep2\" serviceName : {{ success_200_isvc_id }} node1 : routerType : Switch steps : - name : \"node1Step1\" serviceName : {{ error_404_isvc_id }} condition : \"[@this].#(decision_picker==ERROR)\" dependency : Hard For more details please refer to the issue . Improved InferenceService debugging experience by adding the aggregated RoutesReady status and LastDeploymentReady condition to the InferenceService Status to differentiate the endpoint and deployment status. 
This applies to the serverless mode and for more details refer to the API docs .","title":"KServe Core Inference Enhancements"},{"location":"blog/articles/2023-10-08-KServe-0.11-release/#enhanced-python-sdk-dependency-management","text":"KServe has adopted poetry to manage python dependencies. You can now install the KServe SDK with locked dependencies using poetry install . While pip install still works, we highly recommend using poetry to ensure predictable dependency management. The KServe SDK is also slimmed down by making the cloud storage dependency optional, if you require storage dependency for custom serving runtimes you can still install with pip install kserve[storage] .","title":"Enhanced Python SDK Dependency Management"},{"location":"blog/articles/2023-10-08-KServe-0.11-release/#kserve-python-runtimes-improvements","text":"KServe Python Runtimes including sklearnserver , lgbserver , xgbserver now support the open inference protocol for both REST and gRPC. Logging improvements including adding Uvicorn access logging and a default KServe logger. Postprocess handler has been aligned with open inference protocol, simplifying the underlying transportation protocol complexities.","title":"KServe Python Runtimes Improvements"},{"location":"blog/articles/2023-10-08-KServe-0.11-release/#llm-runtimes","text":"","title":"LLM Runtimes"},{"location":"blog/articles/2023-10-08-KServe-0.11-release/#torchserve-llm-runtime","text":"KServe now integrates with TorchServe 0.8, offering the support for LLM models that may not fit onto a single GPU. Huggingface Accelerate and Deepspeed are available options to split the model into multiple partitions over multiple GPUs. 
You can see the detailed example for how to serve the LLM on KServe with TorchServe runtime.","title":"TorchServe LLM Runtime"},{"location":"blog/articles/2023-10-08-KServe-0.11-release/#vllm-runtime","text":"Serving LLM models can be surprisingly slow even on high end GPUs, vLLM is a fast and easy-to-use LLM inference engine. It can achieve 10x-20x higher throughput than Huggingface transformers. It supports continuous batching for increased throughput and GPU utilization, paged attention to address the memory bottleneck where in the autoregressive decoding process all the attention key value tensors(KV Cache) are kept in the GPU memory to generate next tokens. In the example we show how to deploy vLLM on KServe and expect further integration in KServe 0.12 with proposed generate endpoint for open inference protocol.","title":"vLLM Runtime"},{"location":"blog/articles/2023-10-08-KServe-0.11-release/#modelmesh-updates","text":"","title":"ModelMesh Updates"},{"location":"blog/articles/2023-10-08-KServe-0.11-release/#storing-models-on-kubernetes-persistent-volumes-pvc","text":"ModelMesh now allows mounting model files directly onto serving runtime pods using Kubernetes Persistent Volumes . Depending on the selected storage solution this approach can significantly reduce latency when deploying new predictors, potentially removing the need for additional S3 cloud object storage like AWS S3, GCS, or Azure Blob Storage altogether.","title":"Storing Models on Kubernetes Persistent Volumes (PVC)"},{"location":"blog/articles/2023-10-08-KServe-0.11-release/#horizontal-pod-autoscaling-hpa","text":"Kubernetes Horizontal Pod Autoscaling can now be used at the serving runtime pod level. With HPA enabled, the ModelMesh controller no longer manages the number of replicas.
Instead, a HorizontalPodAutoscaler automatically updates the serving runtime deployment with the number of Pods to best match the demand.","title":"Horizontal Pod Autoscaling (HPA)"},{"location":"blog/articles/2023-10-08-KServe-0.11-release/#model-metrics-metrics-dashboard-payload-event-logging","text":"ModelMesh v0.11 introduces a new configuration option to emit a subset of useful metrics at the individual model level. These metrics can help identify outlier or \"heavy hitter\" models and consequently fine-tune the deployments of those inference services, like allocating more resources or increasing the number of replicas for improved responsiveness or avoid frequent cache misses. A new Grafana dashboard was added to display the comprehensive set of Prometheus metrics like model loading and unloading rates, internal queuing delays, capacity and usage, cache state, etc. to monitor the general health of the ModelMesh Serving deployment. The new PayloadProcessor interface can be implemented to log prediction requests and responses, to create data sinks for data visualization, for model quality assessment, or for drift and outlier detection by external monitoring systems.","title":"Model Metrics, Metrics Dashboard, Payload Event Logging"},{"location":"blog/articles/2023-10-08-KServe-0.11-release/#whats-changed","text":"To allow longer InferenceService name due to DNS max length limits from issue , the Default suffix in the inference service component(predictor/transformer/explainer) name has been removed for newly created InferenceServices. This affects the client that is using the component url directly instead of the top level InferenceService url. Status.address.url is now consistent for both serverless and raw deployment mode, the url path portion is dropped in serverless mode. Raw bytes are now accepted in v1 protocol, setting the right content-type header to application/json is required to recognize and decode the json payload if content-type is specified. 
curl -v -H \"Content-Type: application/json\" http://sklearn-iris.kserve-test. ${ CUSTOM_DOMAIN } /v1/models/sklearn-iris:predict -d @./iris-input.json For a complete change list please read the release notes from KServe v0.11 and ModelMesh v0.11 .","title":"What's Changed?"},{"location":"blog/articles/2023-10-08-KServe-0.11-release/#join-the-community","text":"Visit our Website or GitHub Join the Slack ( #kserve ) Attend our community meeting by subscribing to the KServe calendar . View our community github repository to learn how to make contributions. We are excited to work with you to make KServe better and promote its adoption! Thanks for all the contributors who have made the commits to 0.11 release! The KServe Working Group","title":"Join the community"},{"location":"blog/articles/_index/","text":"","title":" index"},{"location":"community/adopters/","text":"Adopters of KServe \u00b6 This page contains a list of organizations who are using KServe either in production, or providing integrations or deployment options with their Cloud or product offerings. If you'd like to be included here, please send a pull request which modifies this file. Please keep the list in alphabetical order. 
Organization Contact Advanced Micro Devices Varun Sharma Amazon Web Services Ellis Tarn Bloomberg Dan Sun Cars24 Swapnesh Khare Charmed Kubeflow from Canonical Daniela Plasencia Cisco Krishna Durai Cloudera Zoram Thanga CoreWeave Peter Salanki Gojek Willem Pienaar Deeploy Tim Kleinloog Halodoc ID Joinal Ahmed IBM Nick Hill Kubeflow on Google Cloud James Liu Inspur Qingshan Chen Max Kelsen Jacob O'Farrell Naver Mark Winter Nuance Jeff Griffith NVIDIA David Goodwin One Convergence Subra Ongole PITS Global Data Recovery Services Pheianox Red Hat Taneem Ibrahim Seldon Clive Cox Patterson Consulting Josh Patterson Samsung SDS Hanbae Seo Striveworks Jordan Yono Zillow Peilun Li Upstage JuHyung Son Intuit Rachit Chauhan","title":"Adopters"},{"location":"community/adopters/#adopters-of-kserve","text":"This page contains a list of organizations who are using KServe either in production, or providing integrations or deployment options with their Cloud or product offerings. If you'd like to be included here, please send a pull request which modifies this file. Please keep the list in alphabetical order.
Organization Contact Advanced Micro Devices Varun Sharma Amazon Web Services Ellis Tarn Bloomberg Dan Sun Cars24 Swapnesh Khare Charmed Kubeflow from Canonical Daniela Plasencia Cisco Krishna Durai Cloudera Zoram Thanga CoreWeave Peter Salanki Gojek Willem Pienaar Deeploy Tim Kleinloog Halodoc ID Joinal Ahmed IBM Nick Hill Kubeflow on Google Cloud James Liu Inspur Qingshan Chen Max Kelsen Jacob O'Farrell Naver Mark Winter Nuance Jeff Griffith NVIDIA David Goodwin One Convergence Subra Ongole PITS Global Data Recovery Services Pheianox Red Hat Taneem Ibrahim Seldon Clive Cox Patterson Consulting Josh Patterson Samsung SDS Hanbae Seo Striveworks Jordan Yono Zillow Peilun Li Upstage JuHyung Son Intuit Rachit Chauhan","title":"Adopters of KServe"},{"location":"community/presentations/","text":"KServe(Formerly KFServing) Presentations and Demos \u00b6 This page contains a list of presentations and demos. If you'd like to add a presentation or demo here, please send a pull request. Presentation/Demo Presenters Distributed Machine Learning Patterns from Manning Publications Yuan Tang KubeCon 2019: Introducing KFServing: Serverless Model Serving on Kubernetes Dan Sun, Ellis Tarn KubeCon 2019: Advanced Model Inferencing Leveraging KNative, Istio & Kubeflow Serving Animesh Singh, Clive Cox KubeflowDojo: KFServing - Production Model Serving Platform Animesh Singh, Tommy Li NVIDIA: Accelerate and Autoscale Deep Learning Inference on GPUs with KFServing Dan Sun, David Goodwin KF Community: KFServing - Enabling Serverless Workloads Across Model Frameworks Ellis Tarn KubeflowDojo: Demo - KFServing End to End through Notebook Animesh Singh, Tommy Li KubeflowDojo: Demo - KFServing with Kafka and Kubeflow Pipelines Animesh Singh Anchor MLOps Podcast: Serving Models with KFServing David Aponte, Demetrios Brinkmann Kubeflow 101: What is KFServing?
Stephanie Wong ICML 2020, Workshop on Challenges in Deploying and Monitoring Machine Learning Systems : Serverless inferencing on Kubernetes Clive Cox Serverless Practitioners Summit 2020: Serverless Machine Learning Inference with KFServing Clive Cox, Yuzhui Liu MLOps Meetup: KServe Live Coding Session Theofilos Papapanagiotou KubeCon AI Days 2021: Serving Machine Learning Models at Scale Using KServe Yuzhui Liu KubeCon 2021: Serving Machine Learning Models at Scale Using KServe Animesh Singh KubeCon China 2021: Accelerate Federated Learning Model Deployment with KServe Fangchi Wang & Jiahao Chen KubeCon AI Days 2022: Exploring ML Model Serving with KServe Alexa Nicole Griffith KubeCon AI Days 2022: Enhancing the Performance Testing Process for gRPC Model Inferencing at Scale Ted Chang, Paul Van Eck KubeCon Edge Days 2022: Model Serving at the Edge Made Easier Paul Van Eck KnativeCon 2022: How We Built an ML inference Platform with Knative Dan Sun KubeCon EU 2023: The state and future of cloud native model serving Dan Sun, Theofilos Papapanagiotou Kubeflow Summit 2023: Scale your Models to Zero with Knative and KServe Jooho Lee Kubeflow Summit 2023: What to choose? ModelMesh vs Model Serving? Vaibhav Jain","title":"Demos and Presentations"},{"location":"community/presentations/#kserveformally-kfserving-presentations-and-demoes","text":"This page contains a list of presentations and demos. If you'd like to add a presentation or demo here, please send a pull request. 
Presentation/Demo Presenters Distributed Machine Learning Patterns from Manning Publications Yuan Tang KubeCon 2019: Introducing KFServing: Serverless Model Serving on Kubernetes Dan Sun, Ellis Tarn KubeCon 2019: Advanced Model Inferencing Leveraging KNative, Istio & Kubeflow Serving Animesh Singh, Clive Cox KubeflowDojo: KFServing - Production Model Serving Platform Animesh Singh, Tommy Li NVIDIA: Accelerate and Autoscale Deep Learning Inference on GPUs with KFServing Dan Sun, David Goodwin KF Community: KFServing - Enabling Serverless Workloads Across Model Frameworks Ellis Tarn KubeflowDojo: Demo - KFServing End to End through Notebook Animesh Singh, Tommy Li KubeflowDojo: Demo - KFServing with Kafka and Kubeflow Pipelines Animesh Singh Anchor MLOps Podcast: Serving Models with KFServing David Aponte, Demetrios Brinkmann Kubeflow 101: What is KFServing? Stephanie Wong ICML 2020, Workshop on Challenges in Deploying and Monitoring Machine Learning Systems : Serverless inferencing on Kubernetes Clive Cox Serverless Practitioners Summit 2020: Serverless Machine Learning Inference with KFServing Clive Cox, Yuzhui Liu MLOps Meetup: KServe Live Coding Session Theofilos Papapanagiotou KubeCon AI Days 2021: Serving Machine Learning Models at Scale Using KServe Yuzhui Liu KubeCon 2021: Serving Machine Learning Models at Scale Using KServe Animesh Singh KubeCon China 2021: Accelerate Federated Learning Model Deployment with KServe Fangchi Wang & Jiahao Chen KubeCon AI Days 2022: Exploring ML Model Serving with KServe Alexa Nicole Griffith KubeCon AI Days 2022: Enhancing the Performance Testing Process for gRPC Model Inferencing at Scale Ted Chang, Paul Van Eck KubeCon Edge Days 2022: Model Serving at the Edge Made Easier Paul Van Eck KnativeCon 2022: How We Built an ML inference Platform with Knative Dan Sun KubeCon EU 2023: The state and future of cloud native model serving Dan Sun, Theofilos Papapanagiotou Kubeflow Summit 2023: Scale your Models to Zero with Knative and 
KServe Jooho Lee Kubeflow Summit 2023: What to choose? ModelMesh vs Model Serving? Vaibhav Jain","title":"KServe(Formerly KFServing) Presentations and Demos"},{"location":"developer/debug/","text":"KServe Debugging Guide \u00b6 Debug KServe InferenceService Status \u00b6 You deployed an InferenceService to KServe, but it is not in ready state. Go through this step by step guide to understand what failed. kubectl get inferenceservices sklearn-iris NAME URL READY DEFAULT TRAFFIC CANARY TRAFFIC AGE model-example False 1m IngressNotConfigured \u00b6 If you see IngressNotConfigured error, this indicates Istio Ingress Gateway probes are failing. kubectl get ksvc NAME URL LATESTCREATED LATESTREADY READY REASON sklearn-iris-predictor-default http://sklearn-iris-predictor-default.default.example.com sklearn-iris-predictor-default-jk794 mnist-sample-predictor-default-jk794 Unknown IngressNotConfigured You can then check Knative networking-istio pod logs for more details. kubectl logs -l app = networking-istio -n knative-serving If you are seeing HTTP 403, then you may have Istio RBAC turned on which blocks the probes to your service.
{ \"level\" : \"error\" , \"ts\" : \"2020-03-26T19:12:00.749Z\" , \"logger\" : \"istiocontroller.ingress-controller.status-manager\" , \"caller\" : \"ingress/status.go:366\" , \"msg\" : \"Probing of http://flowers-sample-predictor-default.kubeflow-jeanarmel-luce.example.com:80/ failed, IP: 10.0.0.29:80, ready: false, error: unexpected status code: want [200], got 403 (depth: 0)\" , \"commit\" : \"6b0e5c6\" , \"knative.dev/controller\" : \"ingress-controller\" , \"stacktrace\" : \"knative.dev/serving/pkg/reconciler/ingress.(*StatusProber).processWorkItem\\n\\t/home/prow/go/src/knative.dev/serving/pkg/reconciler/ingress/status.go:366\\nknative.dev/serving/pkg/reconciler/ingress.(*StatusProber).Start.func1\\n\\t/home/prow/go/src/knative.dev/serving/pkg/reconciler/ingress/status.go:268\" } RevisionMissing Error \u00b6 If you see RevisionMissing error, then your service pods are not in ready state. Knative Service creates Knative Revision which represents a snapshot of the InferenceService code and configuration. Storage Initializer fails to download model \u00b6 kubectl get revision $( kubectl get configuration sklearn-iris-predictor-default --output jsonpath = \"{.status.latestCreatedRevisionName}\" ) NAME CONFIG NAME K8S SERVICE NAME GENERATION READY REASON sklearn-iris-predictor-default-csjpw sklearn-iris-predictor-default sklearn-iris-predictor-default-csjpw 2 Unknown Deploying If you see READY status in Unknown error, this usually indicates that the KServe Storage Initializer init container fails to download the model and you can check the init container logs to see why it fails, note that the pod scales down after sometime if the init container fails . 
kubectl get pod -l serving.kserve.io/inferenceservice = sklearn-iris NAME READY STATUS RESTARTS AGE sklearn-iris-predictor-default-29jks-deployment-5f7d4b9996hzrnc 0 /3 Init:Error 1 10s kubectl logs -l model = sklearn-iris -c storage-initializer [ I 200517 03 :56:19 initializer-entrypoint:13 ] Initializing, args: src_uri [ gs://kfserving-examples/models/sklearn/iris-1 ] dest_path [ [ /mnt/models ] [ I 200517 03 :56:19 storage:35 ] Copying contents of gs://kfserving-examples/models/sklearn/iris-1 to local Traceback ( most recent call last ) : File \"/storage-initializer/scripts/initializer-entrypoint\" , line 14 , in kserve.Storage.download ( src_uri, dest_path ) File \"/usr/local/lib/python3.7/site-packages/kfserving/storage.py\" , line 48 , in download Storage._download_gcs ( uri, out_dir ) File \"/usr/local/lib/python3.7/site-packages/kfserving/storage.py\" , line 116 , in _download_gcs The path or model %s does not exist. \" % (uri)) RuntimeError: Failed to fetch model. The path or model gs://kfserving-examples/models/sklearn/iris-1 does not exist. [I 200517 03:40:19 initializer-entrypoint:13] Initializing, args: src_uri [gs://kfserving-examples/models/sklearn/iris] dest_path[ [/mnt/models] [I 200517 03:40:19 storage:35] Copying contents of gs://kfserving-examples/models/sklearn/iris to local [I 200517 03:40:20 storage:111] Downloading: /mnt/models/model.joblib [I 200517 03:40:20 storage:60] Successfully copied gs://kfserving-examples/models/sklearn/iris to /mnt/models Inference Service in OOM status \u00b6 If you see ExitCode137 from the revision status, this means the revision has failed and this usually happens when the inference service pod is out of memory. To address it, you might need to bump up the memory limit of the InferenceService . 
kubectl get revision $( kubectl get configuration sklearn-iris-predictor-default --output jsonpath = \"{.status.latestCreatedRevisionName}\" ) NAME CONFIG NAME K8S SERVICE NAME GENERATION READY REASON sklearn-iris-predictor-default-84bzf sklearn-iris-predictor-default sklearn-iris-predictor-default-84bzf 8 False ExitCode137s Inference Service fails to start \u00b6 If you see other exit codes from the revision status you can further check the pod status. kubectl get pods -l serving.kserve.io/inferenceservice = sklearn-iris sklearn-iris-predictor-default-rvhmk-deployment-867c6444647tz7n 1 /3 CrashLoopBackOff 3 80s If you see the CrashLoopBackOff , then check the kserve-container log to see more details where it fails, the error log is usually propagated on revision container status also. kubectl logs sklearn-iris-predictor-default-rvhmk-deployment-867c6444647tz7n kserve-container [ I 200517 04 :58:21 storage:35 ] Copying contents of /mnt/models to local Traceback ( most recent call last ) : File \"/usr/local/lib/python3.7/runpy.py\" , line 193 , in _run_module_as_main \"__main__\" , mod_spec ) File \"/usr/local/lib/python3.7/runpy.py\" , line 85 , in _run_code exec ( code, run_globals ) File \"/sklearnserver/sklearnserver/__main__.py\" , line 33 , in model.load () File \"/sklearnserver/sklearnserver/model.py\" , line 36 , in load model_file = next ( path for path in paths if os.path.exists ( path )) StopIteration Inference Service cannot fetch docker images from AWS ECR \u00b6 If you don't see the inference service created at all for custom images from private registries (such as AWS ECR), it might be that the Knative Serving Controller fails to authenticate itself against the registry. failed to resolve image to digest: failed to fetch image information: unsupported status code 401 ; body: Not Authorized You can verify that this is actually the case by spinning up a pod that uses your image. 
The pod should be able to fetch it, if the correct IAM roles are attached, while Knative is not able to. To circumvent this issue you can either skip tag resolution or provide certificates for your registry as detailed in the official knative docs . kubectl -n knative-serving edit configmap config-deployment The resultant yaml will look like something below. apiVersion : v1 kind : ConfigMap metadata : name : config-deployment namespace : knative-serving data : # List of repositories for which tag to digest resolving should be skipped (for AWS ECR: {account_id}.dkr.ecr.{region}.amazonaws.com) registriesSkippingTagResolving : registry.example.com Debug KServe Request flow \u00b6 +----------------------+ +-----------------------+ +--------------------------+ |Istio Virtual Service | |Istio Virtual Service | | K8S Service | | | | | | | |sklearn-iris | |sklearn-iris-predictor | | sklearn-iris-predictor | | +------->|-default +----->| -default-$revision | | | | | | | |KServe Route | |Knative Route | | Knative Revision Service | +----------------------+ +-----------------------+ +------------+-------------+ Knative Ingress Gateway Knative Local Gateway Kube Proxy (Istio gateway) (Istio gateway) | | | +-------------------------------------------------------+ | | Knative Revision Pod | | | | | | +-------------------+ +-----------------+ | | | | | | | | | | |kserve-container |<-----+ Queue Proxy | |<------------------+ | | | | | | | +-------------------+ +--------------^--+ | | | | +-----------------------^-------------------------------+ | scale deployment | +--------+--------+ | pull metrics | Knative | | | Autoscaler |----------- | KPA/HPA | +-----------------+ 1.Traffic arrives through Knative Ingress/Local Gateway for external/internal traffic \u00b6 Istio Gateway resource describes the edge of the mesh receiving incoming or outgoing HTTP/TCP connections. The specification describes a set of ports that should be exposed and the type of protocol to use. 
If you are using Standalone mode, it installs the Gateway in knative-serving namespace, if you are using Kubeflow KServe (KServe installed with Kubeflow), it installs the Gateway in kubeflow namespace e.g on GCP the gateway is protected behind IAP with Istio authentication policy . kubectl get gateway knative-ingress-gateway -n knative-serving -oyaml kind : Gateway metadata : labels : networking.knative.dev/ingress-provider : istio serving.knative.dev/release : v0.12.1 name : knative-ingress-gateway namespace : knative-serving spec : selector : istio : ingressgateway servers : - hosts : - '*' port : name : http number : 80 protocol : HTTP - hosts : - '*' port : name : https number : 443 protocol : HTTPS tls : mode : SIMPLE privateKey : /etc/istio/ingressgateway-certs/tls.key serverCertificate : /etc/istio/ingressgateway-certs/tls.crt The InferenceService request routes to the Istio Ingress Gateway by matching the host and port from the url, by default http is configured, you can configure HTTPS with TLS certificates . 2. KServe Istio virtual service to route for predictor, transformer, explainer. 
\u00b6 kubectl get vs sklearn-iris -oyaml apiVersion : networking.istio.io/v1alpha3 kind : VirtualService metadata : name : sklearn-iris namespace : default gateways : - knative-serving/knative-local-gateway - knative-serving/knative-ingress-gateway hosts : - sklearn-iris.default.svc.cluster.local - sklearn-iris.default.example.com http : - headers : request : set : Host : sklearn-iris-predictor-default.default.svc.cluster.local match : - authority : regex : ^sklearn-iris\\.default(\\.svc(\\.cluster\\.local)?)?(?::\\d{1,5})?$ gateways : - knative-serving/knative-local-gateway - authority : regex : ^sklearn-iris\\.default\\.example\\.com(?::\\d{1,5})?$ gateways : - knative-serving/knative-ingress-gateway route : - destination : host : knative-local-gateway.istio-system.svc.cluster.local port : number : 80 weight : 100 KServe creates the routing rule which by default routes to Predictor if you only have Predictor specified on InferenceService . When Transformer and Explainer are specified on InferenceService the routing rule configures the traffic to route to Transformer or Explainer based on the verb. The request then routes to the second level Knative created virtual service via local gateway with the matching host header. 3. Knative Istio virtual service to route the inference request to the latest ready revision. 
\u00b6 kubectl get vs sklearn-iris-predictor-default-ingress -oyaml apiVersion : networking.istio.io/v1alpha3 kind : VirtualService metadata : name : sklearn-iris-predictor-default-mesh namespace : default spec : gateways : - knative-serving/knative-ingress-gateway - knative-serving/knative-local-gateway hosts : - sklearn-iris-predictor-default.default - sklearn-iris-predictor-default.default.example.com - sklearn-iris-predictor-default.default.svc - sklearn-iris-predictor-default.default.svc.cluster.local http : - match : - authority : prefix : sklearn-iris-predictor-default.default gateways : - knative-serving/knative-local-gateway - authority : prefix : sklearn-iris-predictor-default.default.svc gateways : - knative-serving/knative-local-gateway - authority : prefix : sklearn-iris-predictor-default.default gateways : - knative-serving/knative-local-gateway retries : {} route : - destination : host : sklearn-iris-predictor-default-00001.default.svc.cluster.local port : number : 80 headers : request : set : Knative-Serving-Namespace : default Knative-Serving-Revision : sklearn-iris-predictor-default-00001 weight : 100 - match : - authority : prefix : sklearn-iris-predictor-default.default.example.com gateways : - knative-serving/knative-ingress-gateway retries : {} route : - destination : host : sklearn-iris-predictor-default-00001.default.svc.cluster.local port : number : 80 headers : request : set : Knative-Serving-Namespace : default Knative-Serving-Revision : sklearn-iris-predictor-default-00001 weight : 100 The destination here is the k8s Service for the latest ready Knative Revision and it is reconciled by Knative every time user rolls out a new revision. When a new revision is rolled out and in ready state, the old revision is then scaled down, after configured revision GC time the revision resource is garbage collected if the revision no longer has traffic referenced. 4. 
Kubernetes Service routes the requests to the queue proxy sidecar of the inference service pod on port 8012 . \u00b6 kubectl get svc sklearn-iris-predictor-default-fhmjk-private -oyaml apiVersion : v1 kind : Service metadata : name : sklearn-iris-predictor-default-fhmjk-private namespace : default spec : clusterIP : 10.105.186.18 ports : - name : http port : 80 protocol : TCP targetPort : 8012 - name : queue-metrics port : 9090 protocol : TCP targetPort : queue-metrics - name : http-usermetric port : 9091 protocol : TCP targetPort : http-usermetric - name : http-queueadm port : 8022 protocol : TCP targetPort : 8022 selector : serving.knative.dev/revisionUID : a8f1eafc-3c64-4930-9a01-359f3235333a sessionAffinity : None type : ClusterIP 5. The queue proxy routes to kserve container with max concurrent requests configured with ContainerConcurrency . \u00b6 If the queue proxy has more requests than it can handle, the Knative Autoscaler creates more pods to handle additional requests. 6. Finally The queue proxy routes traffic to the kserve-container for processing the inference requests. \u00b6","title":"Debugging guide"},{"location":"developer/debug/#kserve-debugging-guide","text":"","title":"KServe Debugging Guide"},{"location":"developer/debug/#debug-kserve-inferenceservice-status","text":"You deployed an InferenceService to KServe, but it is not in ready state. Go through this step by step guide to understand what failed. kubectl get inferenceservices sklearn-iris NAME URL READY DEFAULT TRAFFIC CANARY TRAFFIC AGE model-example False 1m","title":"Debug KServe InferenceService Status"},{"location":"developer/debug/#ingressnotconfigured","text":"If you see IngressNotConfigured error, this indicates Istio Ingress Gateway probes are failing. 
kubectl get ksvc NAME URL LATESTCREATED LATESTREADY READY REASON sklearn-iris-predictor-default http://sklearn-iris-predictor-default.default.example.com sklearn-iris-predictor-default-jk794 mnist-sample-predictor-default-jk794 Unknown IngressNotConfigured You can then check Knative networking-istio pod logs for more details. kubectl logs -l app = networking-istio -n knative-serving If you are seeing HTTP 403, then you may have Istio RBAC turned on which blocks the probes to your service. { \"level\" : \"error\" , \"ts\" : \"2020-03-26T19:12:00.749Z\" , \"logger\" : \"istiocontroller.ingress-controller.status-manager\" , \"caller\" : \"ingress/status.go:366\" , \"msg\" : \"Probing of http://flowers-sample-predictor-default.kubeflow-jeanarmel-luce.example.com:80/ failed, IP: 10.0.0.29:80, ready: false, error: unexpected status code: want [200], got 403 (depth: 0)\" , \"commit\" : \"6b0e5c6\" , \"knative.dev/controller\" : \"ingress-controller\" , \"stacktrace\" : \"knative.dev/serving/pkg/reconciler/ingress.(*StatusProber).processWorkItem\\n\\t/home/prow/go/src/knative.dev/serving/pkg/reconciler/ingress/status.go:366\\nknative.dev/serving/pkg/reconciler/ingress.(*StatusProber).Start.func1\\n\\t/home/prow/go/src/knative.dev/serving/pkg/reconciler/ingress/status.go:268\" }","title":"IngressNotConfigured"},{"location":"developer/debug/#revisionmissing-error","text":"If you see RevisionMissing error, then your service pods are not in ready state. 
Knative Service creates Knative Revision which represents a snapshot of the InferenceService code and configuration.","title":"RevisionMissing Error"},{"location":"developer/debug/#storage-initializer-fails-to-download-model","text":"kubectl get revision $( kubectl get configuration sklearn-iris-predictor-default --output jsonpath = \"{.status.latestCreatedRevisionName}\" ) NAME CONFIG NAME K8S SERVICE NAME GENERATION READY REASON sklearn-iris-predictor-default-csjpw sklearn-iris-predictor-default sklearn-iris-predictor-default-csjpw 2 Unknown Deploying If you see READY status in Unknown error, this usually indicates that the KServe Storage Initializer init container fails to download the model and you can check the init container logs to see why it fails, note that the pod scales down after sometime if the init container fails . kubectl get pod -l serving.kserve.io/inferenceservice = sklearn-iris NAME READY STATUS RESTARTS AGE sklearn-iris-predictor-default-29jks-deployment-5f7d4b9996hzrnc 0 /3 Init:Error 1 10s kubectl logs -l model = sklearn-iris -c storage-initializer [ I 200517 03 :56:19 initializer-entrypoint:13 ] Initializing, args: src_uri [ gs://kfserving-examples/models/sklearn/iris-1 ] dest_path [ [ /mnt/models ] [ I 200517 03 :56:19 storage:35 ] Copying contents of gs://kfserving-examples/models/sklearn/iris-1 to local Traceback ( most recent call last ) : File \"/storage-initializer/scripts/initializer-entrypoint\" , line 14 , in kserve.Storage.download ( src_uri, dest_path ) File \"/usr/local/lib/python3.7/site-packages/kfserving/storage.py\" , line 48 , in download Storage._download_gcs ( uri, out_dir ) File \"/usr/local/lib/python3.7/site-packages/kfserving/storage.py\" , line 116 , in _download_gcs The path or model %s does not exist. \" % (uri)) RuntimeError: Failed to fetch model. The path or model gs://kfserving-examples/models/sklearn/iris-1 does not exist. 
[I 200517 03:40:19 initializer-entrypoint:13] Initializing, args: src_uri [gs://kfserving-examples/models/sklearn/iris] dest_path[ [/mnt/models] [I 200517 03:40:19 storage:35] Copying contents of gs://kfserving-examples/models/sklearn/iris to local [I 200517 03:40:20 storage:111] Downloading: /mnt/models/model.joblib [I 200517 03:40:20 storage:60] Successfully copied gs://kfserving-examples/models/sklearn/iris to /mnt/models","title":"Storage Initializer fails to download model"},{"location":"developer/debug/#inference-service-in-oom-status","text":"If you see ExitCode137 from the revision status, this means the revision has failed and this usually happens when the inference service pod is out of memory. To address it, you might need to bump up the memory limit of the InferenceService . kubectl get revision $( kubectl get configuration sklearn-iris-predictor-default --output jsonpath = \"{.status.latestCreatedRevisionName}\" ) NAME CONFIG NAME K8S SERVICE NAME GENERATION READY REASON sklearn-iris-predictor-default-84bzf sklearn-iris-predictor-default sklearn-iris-predictor-default-84bzf 8 False ExitCode137s","title":"Inference Service in OOM status"},{"location":"developer/debug/#inference-service-fails-to-start","text":"If you see other exit codes from the revision status you can further check the pod status. kubectl get pods -l serving.kserve.io/inferenceservice = sklearn-iris sklearn-iris-predictor-default-rvhmk-deployment-867c6444647tz7n 1 /3 CrashLoopBackOff 3 80s If you see the CrashLoopBackOff , then check the kserve-container log to see more details where it fails, the error log is usually propagated on revision container status also. 
kubectl logs sklearn-iris-predictor-default-rvhmk-deployment-867c6444647tz7n kserve-container [ I 200517 04 :58:21 storage:35 ] Copying contents of /mnt/models to local Traceback ( most recent call last ) : File \"/usr/local/lib/python3.7/runpy.py\" , line 193 , in _run_module_as_main \"__main__\" , mod_spec ) File \"/usr/local/lib/python3.7/runpy.py\" , line 85 , in _run_code exec ( code, run_globals ) File \"/sklearnserver/sklearnserver/__main__.py\" , line 33 , in model.load () File \"/sklearnserver/sklearnserver/model.py\" , line 36 , in load model_file = next ( path for path in paths if os.path.exists ( path )) StopIteration","title":"Inference Service fails to start"},{"location":"developer/debug/#inference-service-cannot-fetch-docker-images-from-aws-ecr","text":"If you don't see the inference service created at all for custom images from private registries (such as AWS ECR), it might be that the Knative Serving Controller fails to authenticate itself against the registry. failed to resolve image to digest: failed to fetch image information: unsupported status code 401 ; body: Not Authorized You can verify that this is actually the case by spinning up a pod that uses your image. The pod should be able to fetch it, if the correct IAM roles are attached, while Knative is not able to. To circumvent this issue you can either skip tag resolution or provide certificates for your registry as detailed in the official knative docs . kubectl -n knative-serving edit configmap config-deployment The resultant yaml will look like something below. 
apiVersion : v1 kind : ConfigMap metadata : name : config-deployment namespace : knative-serving data : # List of repositories for which tag to digest resolving should be skipped (for AWS ECR: {account_id}.dkr.ecr.{region}.amazonaws.com) registriesSkippingTagResolving : registry.example.com","title":"Inference Service cannot fetch docker images from AWS ECR"},{"location":"developer/debug/#debug-kserve-request-flow","text":"+----------------------+ +-----------------------+ +--------------------------+ |Istio Virtual Service | |Istio Virtual Service | | K8S Service | | | | | | | |sklearn-iris | |sklearn-iris-predictor | | sklearn-iris-predictor | | +------->|-default +----->| -default-$revision | | | | | | | |KServe Route | |Knative Route | | Knative Revision Service | +----------------------+ +-----------------------+ +------------+-------------+ Knative Ingress Gateway Knative Local Gateway Kube Proxy (Istio gateway) (Istio gateway) | | | +-------------------------------------------------------+ | | Knative Revision Pod | | | | | | +-------------------+ +-----------------+ | | | | | | | | | | |kserve-container |<-----+ Queue Proxy | |<------------------+ | | | | | | | +-------------------+ +--------------^--+ | | | | +-----------------------^-------------------------------+ | scale deployment | +--------+--------+ | pull metrics | Knative | | | Autoscaler |----------- | KPA/HPA | +-----------------+","title":"Debug KServe Request flow"},{"location":"developer/debug/#1traffic-arrives-through-knative-ingresslocal-gateway-for-externalinternal-traffic","text":"Istio Gateway resource describes the edge of the mesh receiving incoming or outgoing HTTP/TCP connections. The specification describes a set of ports that should be exposed and the type of protocol to use. 
If you are using Standalone mode, it installs the Gateway in knative-serving namespace, if you are using Kubeflow KServe (KServe installed with Kubeflow), it installs the Gateway in kubeflow namespace e.g on GCP the gateway is protected behind IAP with Istio authentication policy . kubectl get gateway knative-ingress-gateway -n knative-serving -oyaml kind : Gateway metadata : labels : networking.knative.dev/ingress-provider : istio serving.knative.dev/release : v0.12.1 name : knative-ingress-gateway namespace : knative-serving spec : selector : istio : ingressgateway servers : - hosts : - '*' port : name : http number : 80 protocol : HTTP - hosts : - '*' port : name : https number : 443 protocol : HTTPS tls : mode : SIMPLE privateKey : /etc/istio/ingressgateway-certs/tls.key serverCertificate : /etc/istio/ingressgateway-certs/tls.crt The InferenceService request routes to the Istio Ingress Gateway by matching the host and port from the url, by default http is configured, you can configure HTTPS with TLS certificates .","title":"1.Traffic arrives through Knative Ingress/Local Gateway for external/internal traffic"},{"location":"developer/debug/#2-kserve-istio-virtual-service-to-route-for-predictor-transformer-explainer","text":"kubectl get vs sklearn-iris -oyaml apiVersion : networking.istio.io/v1alpha3 kind : VirtualService metadata : name : sklearn-iris namespace : default gateways : - knative-serving/knative-local-gateway - knative-serving/knative-ingress-gateway hosts : - sklearn-iris.default.svc.cluster.local - sklearn-iris.default.example.com http : - headers : request : set : Host : sklearn-iris-predictor-default.default.svc.cluster.local match : - authority : regex : ^sklearn-iris\\.default(\\.svc(\\.cluster\\.local)?)?(?::\\d{1,5})?$ gateways : - knative-serving/knative-local-gateway - authority : regex : ^sklearn-iris\\.default\\.example\\.com(?::\\d{1,5})?$ gateways : - knative-serving/knative-ingress-gateway route : - destination : host : 
knative-local-gateway.istio-system.svc.cluster.local port : number : 80 weight : 100 KServe creates the routing rule which by default routes to Predictor if you only have Predictor specified on InferenceService . When Transformer and Explainer are specified on InferenceService the routing rule configures the traffic to route to Transformer or Explainer based on the verb. The request then routes to the second level Knative created virtual service via local gateway with the matching host header.","title":"2. KServe Istio virtual service to route for predictor, transformer, explainer."},{"location":"developer/debug/#3-knative-istio-virtual-service-to-route-the-inference-request-to-the-latest-ready-revision","text":"kubectl get vs sklearn-iris-predictor-default-ingress -oyaml apiVersion : networking.istio.io/v1alpha3 kind : VirtualService metadata : name : sklearn-iris-predictor-default-mesh namespace : default spec : gateways : - knative-serving/knative-ingress-gateway - knative-serving/knative-local-gateway hosts : - sklearn-iris-predictor-default.default - sklearn-iris-predictor-default.default.example.com - sklearn-iris-predictor-default.default.svc - sklearn-iris-predictor-default.default.svc.cluster.local http : - match : - authority : prefix : sklearn-iris-predictor-default.default gateways : - knative-serving/knative-local-gateway - authority : prefix : sklearn-iris-predictor-default.default.svc gateways : - knative-serving/knative-local-gateway - authority : prefix : sklearn-iris-predictor-default.default gateways : - knative-serving/knative-local-gateway retries : {} route : - destination : host : sklearn-iris-predictor-default-00001.default.svc.cluster.local port : number : 80 headers : request : set : Knative-Serving-Namespace : default Knative-Serving-Revision : sklearn-iris-predictor-default-00001 weight : 100 - match : - authority : prefix : sklearn-iris-predictor-default.default.example.com gateways : - knative-serving/knative-ingress-gateway retries : 
{} route : - destination : host : sklearn-iris-predictor-default-00001.default.svc.cluster.local port : number : 80 headers : request : set : Knative-Serving-Namespace : default Knative-Serving-Revision : sklearn-iris-predictor-default-00001 weight : 100 The destination here is the k8s Service for the latest ready Knative Revision and it is reconciled by Knative every time user rolls out a new revision. When a new revision is rolled out and in ready state, the old revision is then scaled down, after configured revision GC time the revision resource is garbage collected if the revision no longer has traffic referenced.","title":"3. Knative Istio virtual service to route the inference request to the latest ready revision."},{"location":"developer/debug/#4-kubernetes-service-routes-the-requests-to-the-queue-proxy-sidecar-of-the-inference-service-pod-on-port-8012","text":"kubectl get svc sklearn-iris-predictor-default-fhmjk-private -oyaml apiVersion : v1 kind : Service metadata : name : sklearn-iris-predictor-default-fhmjk-private namespace : default spec : clusterIP : 10.105.186.18 ports : - name : http port : 80 protocol : TCP targetPort : 8012 - name : queue-metrics port : 9090 protocol : TCP targetPort : queue-metrics - name : http-usermetric port : 9091 protocol : TCP targetPort : http-usermetric - name : http-queueadm port : 8022 protocol : TCP targetPort : 8022 selector : serving.knative.dev/revisionUID : a8f1eafc-3c64-4930-9a01-359f3235333a sessionAffinity : None type : ClusterIP","title":"4. Kubernetes Service routes the requests to the queue proxy sidecar of the inference service pod on port 8012."},{"location":"developer/debug/#5-the-queue-proxy-routes-to-kserve-container-with-max-concurrent-requests-configured-with-containerconcurrency","text":"If the queue proxy has more requests than it can handle, the Knative Autoscaler creates more pods to handle additional requests.","title":"5. 
The queue proxy routes to kserve container with max concurrent requests configured with ContainerConcurrency."},{"location":"developer/debug/#6-finally-the-queue-proxy-routes-traffic-to-the-kserve-container-for-processing-the-inference-requests","text":"","title":"6. Finally The queue proxy routes traffic to the kserve-container for processing the inference requests."},{"location":"developer/developer/","text":"Development \u00b6 This doc explains how to setup a development environment so you can get started contributing . Prerequisites \u00b6 Follow the instructions below to set up your development environment. Once you meet these requirements, you can make changes and deploy your own version of kserve ! Before submitting a PR, see also CONTRIBUTING.md . Install requirements \u00b6 You must install these tools: go : KServe controller is written in Go and requires Go 1.20.0+. git : For source control. Go Module : Go's new dependency management system. ko : For development. kubectl : For managing development environments. kustomize To customize YAMLs for different environments, requires v5.0.0+. yq yq is used in the project makefiles to parse and display YAML output, requires yq 4.* . Install Knative on a Kubernetes cluster \u00b6 KServe currently requires Knative Serving for auto-scaling, canary rollout, Istio for traffic routing and ingress. To install Knative components on your Kubernetes cluster, follow the installation guide or alternatively, use the Knative Operators to manage your installation. Observability, tracing and logging are optional but are often very valuable tools for troubleshooting difficult issues, they can be installed via the directions here . If you start from scratch, KServe requires Kubernetes 1.25+, Knative 1.7+, Istio 1.15+. If you already have Istio or Knative (e.g. from a Kubeflow install) then you don't need to install them explicitly, as long as version dependencies are satisfied. 
Note On a local environment, when using minikube or kind as Kubernetes cluster, there has been a reported issue that knative quickstart bootstrap does not work as expected. It is recommended to follow the installation manual from knative using yaml or using knative operator for a better result. Setup your environment \u00b6 To start your environment you'll need to set these environment variables (we recommend adding them to your .bashrc ): GOPATH : If you don't have one, simply pick a directory and add export GOPATH=... $GOPATH/bin on PATH : This is so that tooling installed via go get will work properly. KO_DEFAULTPLATFORMS : If you are using M1 Mac book the value is linux/arm64 . KO_DOCKER_REPO : The docker repository to which developer images should be pushed (e.g. docker.io/ ). Note : Set up a docker repository for pushing images. You can use any container image registry by adjusting the authentication methods and repository paths mentioned in the sections below. Google Container Registry quickstart Docker Hub quickstart Azure Container Registry quickstart Note if you are using docker hub to store your images your KO_DOCKER_REPO variable should be docker.io/ . Currently Docker Hub doesn't let you create subdirs under your username. .bashrc example: export GOPATH = \" $HOME /go\" export PATH = \" ${ PATH } : ${ GOPATH } /bin\" export KO_DOCKER_REPO = 'docker.io/' Checkout your fork \u00b6 The Go tools require that you clone the repository to the src/github.com/kserve/kserve directory in your GOPATH . To check out this repository: Create your own fork of this repo Clone it to your machine: mkdir -p ${ GOPATH } /src/github.com/kserve cd ${ GOPATH } /src/github.com/kserve git clone git@github.com: ${ YOUR_GITHUB_USERNAME } /kserve.git cd kserve git remote add upstream git@github.com:kserve/kserve.git git remote set-url --push upstream no_push Adding the upstream remote sets you up nicely for regularly syncing your fork . 
Once you reach this point you are ready to do a full build and deploy as described below. Deploy KServe \u00b6 Check Knative Serving installation \u00b6 Once you've setup your development environment , you can verify the installation with following: Success $ kubectl -n knative-serving get pods NAME READY STATUS RESTARTS AGE activator-77784645fc-t2pjf 1 /1 Running 0 11d autoscaler-6fddf74d5-z2fzf 1 /1 Running 0 11d autoscaler-hpa-5bf4476cc5-tsbw6 1 /1 Running 0 11d controller-7b8cd7f95c-6jxxj 1 /1 Running 0 11d istio-webhook-866c5bc7f8-t5ztb 1 /1 Running 0 11d networking-istio-54fb8b5d4b-xznwd 1 /1 Running 0 11d webhook-5f5f7bd9b4-cv27c 1 /1 Running 0 11d $ kubectl get gateway -n knative-serving NAME AGE knative-ingress-gateway 11d knative-local-gateway 11d $ kubectl get svc -n istio-system NAME TYPE CLUSTER-IP EXTERNAL-IP PORT ( S ) AGE istio-ingressgateway LoadBalancer 10 .101.196.89 X.X.X.X 15021 :31101/TCP,80:31781/TCP,443:30372/TCP,15443:31067/TCP 11d istiod ClusterIP 10 .101.116.203 15010 /TCP,15012/TCP,443/TCP,15014/TCP,853/TCP 11d Deploy KServe from master branch \u00b6 We suggest using cert manager for provisioning the certificates for the webhook server. Other solutions should also work as long as they put the certificates in the desired location. You can follow the cert manager documentation to install it. If you don't want to install cert manager, you can set the KSERVE_ENABLE_SELF_SIGNED_CA environment variable to true. KSERVE_ENABLE_SELF_SIGNED_CA will execute a script to create a self-signed CA and patch it to the webhook config. export KSERVE_ENABLE_SELF_SIGNED_CA = true After that you can run following command to deploy KServe , you can skip above step if cert manager is already installed. make deploy Optional you can change CPU and memory limits when deploying KServe . 
export KSERVE_CONTROLLER_CPU_LIMIT = export KSERVE_CONTROLLER_MEMORY_LIMIT = make deploy Expected Output $ kubectl get pods -n kserve -l control-plane = kserve-controller-manager NAME READY STATUS RESTARTS AGE kserve-controller-manager-0 2/2 Running 0 13m Note By default it installs to kserve namespace with the published controller manager image from master branch. Deploy KServe with your own version \u00b6 Run the following command to deploy KServe controller and model agent with your local change. make deploy-dev Note deploy-dev builds the image from your local code, publishes to KO_DOCKER_REPO and deploys the kserve-controller-manager and model agent with the image digest to your cluster for testing. Please also ensure you are logged in to KO_DOCKER_REPO from your client machine. Run the following command to deploy model server with your local change. make deploy-dev-sklearn make deploy-dev-xgb Run the following command to deploy explainer with your local change. make deploy-dev-alibi Run the following command to deploy storage initializer with your local change. make deploy-dev-storageInitializer Warning The deploy command publishes the image to KO_DOCKER_REPO with the version latest , it changes the InferenceService configmap to point to the newly built image sha. The built image is only for development and testing purpose, the current limitation is that it changes the image impacted and reset all other images including the kserver-controller-manager to use the default ones. Smoke test after deployment \u00b6 Run the following command to smoke test the deployment kubectl apply -f https://raw.githubusercontent.com/kserve/kserve/master/docs/samples/v1beta1/tensorflow/tensorflow.yaml You should see model serving deployment running under default or your specified namespace. 
$ kubectl get pods -n default -l serving.kserve.io/inferenceservice=flower-sample Expected Output NAME READY STATUS RESTARTS AGE flower-sample-default-htz8r-deployment-8fd979f9b-w2qbv 3/3 Running 0 10s Running unit/integration tests \u00b6 kserver-controller-manager has a few integration tests which requires mock apiserver and etcd, they get installed along with kubebuilder . To run all unit/integration tests: make test Run e2e tests locally \u00b6 To setup from local code, do: ./hack/quick_install.sh make undeploy make deploy-dev Go to python/kserve and install kserve python sdk deps pip3 install -e . [ test ] Then go to test/e2e . Run kubectl create namespace kserve-ci-e2e-test For KIND/minikube: Run export KSERVE_INGRESS_HOST_PORT=localhost:8080 In a different window run kubectl port-forward -n istio-system svc/istio-ingressgateway 8080:80 Note that not all tests will pass as the pytorch test requires gpu. These will show as pending pods at the end or you can add marker to skip the test. Run pytest > testresults.txt Tests may not clean up. To re-run, first do kubectl delete namespace kserve-ci-e2e-test , recreate namespace and run again. Iterating \u00b6 As you make changes to the code-base, there are two special cases to be aware of: If you change an input to generated code , then you must run make manifests . Inputs include: API type definitions in apis/serving Manifests or kustomize patches stored in config . To generate the KServe python/go clients, you should run make generate . If you want to add new dependencies , then you add the imports and the specific version of the dependency module in go.mod . When it encounters an import of a package not provided by any module in go.mod , the go command automatically looks up the module containing the package and adds it to go.mod using the latest version. 
If you want to upgrade the dependency , then you run go get command e.g go get golang.org/x/text to upgrade to the latest version, go get golang.org/x/text@v0.3.0 to upgrade to a specific version. make deploy-dev Contribute to the code \u00b6 See the guidelines for contributing a feature contributing to an existing issue Releases \u00b6 Please check out the documentation here to understand the release schedule and process. Feedback \u00b6 The best place to provide feedback about the KServe code is via a Github issue. See creating a Github issue for guidelines on submitting bugs and feature requests.","title":"How to contribute"},{"location":"developer/developer/#development","text":"This doc explains how to setup a development environment so you can get started contributing .","title":"Development"},{"location":"developer/developer/#prerequisites","text":"Follow the instructions below to set up your development environment. Once you meet these requirements, you can make changes and deploy your own version of kserve ! Before submitting a PR, see also CONTRIBUTING.md .","title":"Prerequisites"},{"location":"developer/developer/#install-requirements","text":"You must install these tools: go : KServe controller is written in Go and requires Go 1.20.0+. git : For source control. Go Module : Go's new dependency management system. ko : For development. kubectl : For managing development environments. kustomize To customize YAMLs for different environments, requires v5.0.0+. yq yq is used in the project makefiles to parse and display YAML output, requires yq 4.* .","title":"Install requirements"},{"location":"developer/developer/#install-knative-on-a-kubernetes-cluster","text":"KServe currently requires Knative Serving for auto-scaling, canary rollout, Istio for traffic routing and ingress. To install Knative components on your Kubernetes cluster, follow the installation guide or alternatively, use the Knative Operators to manage your installation. 
Observability, tracing and logging are optional but are often very valuable tools for troubleshooting difficult issues, they can be installed via the directions here . If you start from scratch, KServe requires Kubernetes 1.25+, Knative 1.7+, Istio 1.15+. If you already have Istio or Knative (e.g. from a Kubeflow install) then you don't need to install them explicitly, as long as version dependencies are satisfied. Note On a local environment, when using minikube or kind as Kubernetes cluster, there has been a reported issue that knative quickstart bootstrap does not work as expected. It is recommended to follow the installation manual from knative using yaml or using knative operator for a better result.","title":"Install Knative on a Kubernetes cluster"},{"location":"developer/developer/#setup-your-environment","text":"To start your environment you'll need to set these environment variables (we recommend adding them to your .bashrc ): GOPATH : If you don't have one, simply pick a directory and add export GOPATH=... $GOPATH/bin on PATH : This is so that tooling installed via go get will work properly. KO_DEFAULTPLATFORMS : If you are using M1 Mac book the value is linux/arm64 . KO_DOCKER_REPO : The docker repository to which developer images should be pushed (e.g. docker.io/ ). Note : Set up a docker repository for pushing images. You can use any container image registry by adjusting the authentication methods and repository paths mentioned in the sections below. Google Container Registry quickstart Docker Hub quickstart Azure Container Registry quickstart Note if you are using docker hub to store your images your KO_DOCKER_REPO variable should be docker.io/ . Currently Docker Hub doesn't let you create subdirs under your username. 
.bashrc example: export GOPATH = \" $HOME /go\" export PATH = \" ${ PATH } : ${ GOPATH } /bin\" export KO_DOCKER_REPO = 'docker.io/'","title":"Setup your environment"},{"location":"developer/developer/#checkout-your-fork","text":"The Go tools require that you clone the repository to the src/github.com/kserve/kserve directory in your GOPATH . To check out this repository: Create your own fork of this repo Clone it to your machine: mkdir -p ${ GOPATH } /src/github.com/kserve cd ${ GOPATH } /src/github.com/kserve git clone git@github.com: ${ YOUR_GITHUB_USERNAME } /kserve.git cd kserve git remote add upstream git@github.com:kserve/kserve.git git remote set-url --push upstream no_push Adding the upstream remote sets you up nicely for regularly syncing your fork . Once you reach this point you are ready to do a full build and deploy as described below.","title":"Checkout your fork"},{"location":"developer/developer/#deploy-kserve","text":"","title":"Deploy KServe"},{"location":"developer/developer/#check-knative-serving-installation","text":"Once you've setup your development environment , you can verify the installation with following: Success $ kubectl -n knative-serving get pods NAME READY STATUS RESTARTS AGE activator-77784645fc-t2pjf 1 /1 Running 0 11d autoscaler-6fddf74d5-z2fzf 1 /1 Running 0 11d autoscaler-hpa-5bf4476cc5-tsbw6 1 /1 Running 0 11d controller-7b8cd7f95c-6jxxj 1 /1 Running 0 11d istio-webhook-866c5bc7f8-t5ztb 1 /1 Running 0 11d networking-istio-54fb8b5d4b-xznwd 1 /1 Running 0 11d webhook-5f5f7bd9b4-cv27c 1 /1 Running 0 11d $ kubectl get gateway -n knative-serving NAME AGE knative-ingress-gateway 11d knative-local-gateway 11d $ kubectl get svc -n istio-system NAME TYPE CLUSTER-IP EXTERNAL-IP PORT ( S ) AGE istio-ingressgateway LoadBalancer 10 .101.196.89 X.X.X.X 15021 :31101/TCP,80:31781/TCP,443:30372/TCP,15443:31067/TCP 11d istiod ClusterIP 10 .101.116.203 15010 /TCP,15012/TCP,443/TCP,15014/TCP,853/TCP 11d","title":"Check Knative Serving 
installation"},{"location":"developer/developer/#deploy-kserve-from-master-branch","text":"We suggest using cert manager for provisioning the certificates for the webhook server. Other solutions should also work as long as they put the certificates in the desired location. You can follow the cert manager documentation to install it. If you don't want to install cert manager, you can set the KSERVE_ENABLE_SELF_SIGNED_CA environment variable to true. KSERVE_ENABLE_SELF_SIGNED_CA will execute a script to create a self-signed CA and patch it to the webhook config. export KSERVE_ENABLE_SELF_SIGNED_CA = true After that you can run following command to deploy KServe , you can skip above step if cert manager is already installed. make deploy Optional you can change CPU and memory limits when deploying KServe . export KSERVE_CONTROLLER_CPU_LIMIT = export KSERVE_CONTROLLER_MEMORY_LIMIT = make deploy Expected Output $ kubectl get pods -n kserve -l control-plane = kserve-controller-manager NAME READY STATUS RESTARTS AGE kserve-controller-manager-0 2/2 Running 0 13m Note By default it installs to kserve namespace with the published controller manager image from master branch.","title":"Deploy KServe from master branch"},{"location":"developer/developer/#deploy-kserve-with-your-own-version","text":"Run the following command to deploy KServe controller and model agent with your local change. make deploy-dev Note deploy-dev builds the image from your local code, publishes to KO_DOCKER_REPO and deploys the kserve-controller-manager and model agent with the image digest to your cluster for testing. Please also ensure you are logged in to KO_DOCKER_REPO from your client machine. Run the following command to deploy model server with your local change. make deploy-dev-sklearn make deploy-dev-xgb Run the following command to deploy explainer with your local change. make deploy-dev-alibi Run the following command to deploy storage initializer with your local change. 
make deploy-dev-storageInitializer Warning The deploy command publishes the image to KO_DOCKER_REPO with the version latest , it changes the InferenceService configmap to point to the newly built image sha. The built image is only for development and testing purpose, the current limitation is that it changes the image impacted and reset all other images including the kserver-controller-manager to use the default ones.","title":"Deploy KServe with your own version"},{"location":"developer/developer/#smoke-test-after-deployment","text":"Run the following command to smoke test the deployment kubectl apply -f https://raw.githubusercontent.com/kserve/kserve/master/docs/samples/v1beta1/tensorflow/tensorflow.yaml You should see model serving deployment running under default or your specified namespace. $ kubectl get pods -n default -l serving.kserve.io/inferenceservice=flower-sample Expected Output NAME READY STATUS RESTARTS AGE flower-sample-default-htz8r-deployment-8fd979f9b-w2qbv 3/3 Running 0 10s","title":"Smoke test after deployment"},{"location":"developer/developer/#running-unitintegration-tests","text":"kserver-controller-manager has a few integration tests which requires mock apiserver and etcd, they get installed along with kubebuilder . To run all unit/integration tests: make test","title":"Running unit/integration tests"},{"location":"developer/developer/#run-e2e-tests-locally","text":"To setup from local code, do: ./hack/quick_install.sh make undeploy make deploy-dev Go to python/kserve and install kserve python sdk deps pip3 install -e . [ test ] Then go to test/e2e . Run kubectl create namespace kserve-ci-e2e-test For KIND/minikube: Run export KSERVE_INGRESS_HOST_PORT=localhost:8080 In a different window run kubectl port-forward -n istio-system svc/istio-ingressgateway 8080:80 Note that not all tests will pass as the pytorch test requires gpu. These will show as pending pods at the end or you can add marker to skip the test. 
Run pytest > testresults.txt Tests may not clean up. To re-run, first do kubectl delete namespace kserve-ci-e2e-test , recreate namespace and run again.","title":"Run e2e tests locally"},{"location":"developer/developer/#iterating","text":"As you make changes to the code-base, there are two special cases to be aware of: If you change an input to generated code , then you must run make manifests . Inputs include: API type definitions in apis/serving Manifests or kustomize patches stored in config . To generate the KServe python/go clients, you should run make generate . If you want to add new dependencies , then you add the imports and the specific version of the dependency module in go.mod . When it encounters an import of a package not provided by any module in go.mod , the go command automatically looks up the module containing the package and adds it to go.mod using the latest version. If you want to upgrade the dependency , then you run go get command e.g go get golang.org/x/text to upgrade to the latest version, go get golang.org/x/text@v0.3.0 to upgrade to a specific version. make deploy-dev","title":"Iterating"},{"location":"developer/developer/#contribute-to-the-code","text":"See the guidelines for contributing a feature contributing to an existing issue","title":"Contribute to the code"},{"location":"developer/developer/#releases","text":"Please check out the documentation here to understand the release schedule and process.","title":"Releases"},{"location":"developer/developer/#feedback","text":"The best place to provide feedback about the KServe code is via a Github issue. See creating a Github issue for guidelines on submitting bugs and feature requests.","title":"Feedback"},{"location":"get_started/","text":"Getting Started with KServe \u00b6 Before you begin \u00b6 Warning KServe Quickstart Environments are for experimentation use only. 
For production installation, see our Administrator's Guide Before you can get started with a KServe Quickstart deployment you must install kind and the Kubernetes CLI. Install Kind (Kubernetes in Docker) \u00b6 You can use kind (Kubernetes in Docker) to run a local Kubernetes cluster with Docker container nodes. Install the Kubernetes CLI \u00b6 The Kubernetes CLI ( kubectl ) , allows you to run commands against Kubernetes clusters. You can use kubectl to deploy applications, inspect and manage cluster resources, and view logs. Install the KServe \"Quickstart\" environment \u00b6 After having kind installed, create a kind cluster with: kind create cluster Then run: kubectl config get-contexts It should list out a list of contexts you have, one of them should be kind-kind . Then run: kubectl config use-context kind-kind to use this context. You can then get started with a local deployment of KServe by using KServe Quick installation script on Kind : curl -s \"https://raw.githubusercontent.com/kserve/kserve/release-0.12/hack/quick_install.sh\" | bash or install via our published Helm Charts: helm install kserve-crd oci://ghcr.io/kserve/charts/kserve-crd --version v0.12.0 helm install kserve oci://ghcr.io/kserve/charts/kserve --version v0.12.0","title":"KServe Quickstart"},{"location":"get_started/#getting-started-with-kserve","text":"","title":"Getting Started with KServe"},{"location":"get_started/#before-you-begin","text":"Warning KServe Quickstart Environments are for experimentation use only. 
For production installation, see our Administrator's Guide Before you can get started with a KServe Quickstart deployment you must install kind and the Kubernetes CLI.","title":"Before you begin"},{"location":"get_started/#install-kind-kubernetes-in-docker","text":"You can use kind (Kubernetes in Docker) to run a local Kubernetes cluster with Docker container nodes.","title":"Install Kind (Kubernetes in Docker)"},{"location":"get_started/#install-the-kubernetes-cli","text":"The Kubernetes CLI ( kubectl ) , allows you to run commands against Kubernetes clusters. You can use kubectl to deploy applications, inspect and manage cluster resources, and view logs.","title":"Install the Kubernetes CLI"},{"location":"get_started/#install-the-kserve-quickstart-environment","text":"After having kind installed, create a kind cluster with: kind create cluster Then run: kubectl config get-contexts It should list out a list of contexts you have, one of them should be kind-kind . Then run: kubectl config use-context kind-kind to use this context. You can then get started with a local deployment of KServe by using KServe Quick installation script on Kind : curl -s \"https://raw.githubusercontent.com/kserve/kserve/release-0.12/hack/quick_install.sh\" | bash or install via our published Helm Charts: helm install kserve-crd oci://ghcr.io/kserve/charts/kserve-crd --version v0.12.0 helm install kserve oci://ghcr.io/kserve/charts/kserve --version v0.12.0","title":"Install the KServe \"Quickstart\" environment"},{"location":"get_started/first_isvc/","text":"Run your first InferenceService \u00b6 In this tutorial, you will deploy an InferenceService with a predictor that will load a scikit-learn model trained with the iris dataset. This dataset has three output class: Iris Setosa, Iris Versicolour, and Iris Virginica. You will then send an inference request to your deployed model in order to get a prediction for the class of iris plant your request corresponds to. 
Since your model is being deployed as an InferenceService, not a raw Kubernetes Service, you just need to provide the storage location of the model and it gets some super powers out of the box . 1. Create a namespace \u00b6 First, create a namespace to use for deploying KServe resources: kubectl create namespace kserve-test 2. Create an InferenceService \u00b6 Next, define a new InferenceService YAML for the model and apply it to the cluster. A new predictor schema was introduced in v0.8.0 . New InferenceServices should be deployed using the new schema. The old schema is provided as reference. New Schema Old Schema kubectl apply -n kserve-test -f - < \"./iris-input.json\" { \"instances\": [ [6.8, 2.8, 4.8, 1.4], [6.0, 3.4, 4.5, 1.6] ] } EOF Depending on your setup, use one of the following commands to curl the InferenceService : Real DNS Magic DNS From Ingress gateway with HOST Header From local cluster gateway If you have configured the DNS, you can directly curl the InferenceService with the URL obtained from the status print. e.g curl -v -H \"Content-Type: application/json\" http://sklearn-iris.kserve-test. ${ CUSTOM_DOMAIN } /v1/models/sklearn-iris:predict -d @./iris-input.json If you don't want to go through the trouble to get a real domain, you can instead use \"magic\" dns xip.io . The key is to get the external IP for your cluster. kubectl get svc istio-ingressgateway --namespace istio-system Look for the EXTERNAL-IP column's value(in this case 35.237.217.209) NAME TYPE CLUSTER-IP EXTERNAL-IP PORT ( S ) AGE istio-ingressgateway LoadBalancer 10 .51.253.94 35 .237.217.209 Next step is to setting up the custom domain: kubectl edit cm config-domain --namespace knative-serving Now in your editor, change example.com to {{external-ip}}.xip.io (make sure to replace {{external-ip}} with the IP you found earlier). 
With the change applied you can now directly curl the URL curl -v -H \"Content-Type: application/json\" http://sklearn-iris.kserve-test.35.237.217.209.xip.io/v1/models/sklearn-iris:predict -d @./iris-input.json If you do not have DNS, you can still curl with the ingress gateway external IP using the HOST Header. SERVICE_HOSTNAME = $( kubectl get inferenceservice sklearn-iris -n kserve-test -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" \"http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/sklearn-iris:predict\" -d @./iris-input.json If you are calling from in cluster you can curl with the internal url with host {{InferenceServiceName}}.{{namespace}} curl -v -H \"Content-Type: application/json\" http://sklearn-iris.kserve-test/v1/models/sklearn-iris:predict -d @./iris-input.json You should see two predictions returned (i.e. {\"predictions\": [1, 1]} ). Both sets of data points sent for inference correspond to the flower with index 1 . In this case, the model predicts that both flowers are \"Iris Versicolour\". 6. 
Run performance test (optional) \u00b6 If you want to load test the deployed model, try deploying the following Kubernetes Job to drive load to the model: # use kubectl create instead of apply because the job template is using generateName which doesn't work with kubectl apply kubectl create -f https://raw.githubusercontent.com/kserve/kserve/release-0.11/docs/samples/v1beta1/sklearn/v1/perf.yaml -n kserve-test Execute the following command to view output: kubectl logs load-test8b58n-rgfxr -n kserve-test Expected Output Requests [ total, rate, throughput ] 30000 , 500 .02, 499 .99 Duration [ total, attack, wait ] 1m0s, 59 .998s, 3 .336ms Latencies [ min, mean, 50 , 90 , 95 , 99 , max ] 1 .743ms, 2 .748ms, 2 .494ms, 3 .363ms, 4 .091ms, 7 .749ms, 46 .354ms Bytes In [ total, mean ] 690000 , 23 .00 Bytes Out [ total, mean ] 2460000 , 82 .00 Success [ ratio ] 100 .00% Status Codes [ code:count ] 200 :30000 Error Set:","title":"First InferenceService"},{"location":"get_started/first_isvc/#run-your-first-inferenceservice","text":"In this tutorial, you will deploy an InferenceService with a predictor that will load a scikit-learn model trained with the iris dataset. This dataset has three output class: Iris Setosa, Iris Versicolour, and Iris Virginica. You will then send an inference request to your deployed model in order to get a prediction for the class of iris plant your request corresponds to. Since your model is being deployed as an InferenceService, not a raw Kubernetes Service, you just need to provide the storage location of the model and it gets some super powers out of the box .","title":"Run your first InferenceService"},{"location":"get_started/first_isvc/#1-create-a-namespace","text":"First, create a namespace to use for deploying KServe resources: kubectl create namespace kserve-test","title":"1. 
Create a namespace"},{"location":"get_started/first_isvc/#2-create-an-inferenceservice","text":"Next, define a new InferenceService YAML for the model and apply it to the cluster. A new predictor schema was introduced in v0.8.0 . New InferenceServices should be deployed using the new schema. The old schema is provided as reference. New Schema Old Schema kubectl apply -n kserve-test -f - < \"./iris-input.json\" { \"instances\": [ [6.8, 2.8, 4.8, 1.4], [6.0, 3.4, 4.5, 1.6] ] } EOF Depending on your setup, use one of the following commands to curl the InferenceService : Real DNS Magic DNS From Ingress gateway with HOST Header From local cluster gateway If you have configured the DNS, you can directly curl the InferenceService with the URL obtained from the status print. e.g curl -v -H \"Content-Type: application/json\" http://sklearn-iris.kserve-test. ${ CUSTOM_DOMAIN } /v1/models/sklearn-iris:predict -d @./iris-input.json If you don't want to go through the trouble to get a real domain, you can instead use \"magic\" dns xip.io . The key is to get the external IP for your cluster. kubectl get svc istio-ingressgateway --namespace istio-system Look for the EXTERNAL-IP column's value(in this case 35.237.217.209) NAME TYPE CLUSTER-IP EXTERNAL-IP PORT ( S ) AGE istio-ingressgateway LoadBalancer 10 .51.253.94 35 .237.217.209 Next step is to setting up the custom domain: kubectl edit cm config-domain --namespace knative-serving Now in your editor, change example.com to {{external-ip}}.xip.io (make sure to replace {{external-ip}} with the IP you found earlier). With the change applied you can now directly curl the URL curl -v -H \"Content-Type: application/json\" http://sklearn-iris.kserve-test.35.237.217.209.xip.io/v1/models/sklearn-iris:predict -d @./iris-input.json If you do not have DNS, you can still curl with the ingress gateway external IP using the HOST Header. 
SERVICE_HOSTNAME = $( kubectl get inferenceservice sklearn-iris -n kserve-test -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" \"http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/sklearn-iris:predict\" -d @./iris-input.json If you are calling from in cluster you can curl with the internal url with host {{InferenceServiceName}}.{{namespace}} curl -v -H \"Content-Type: application/json\" http://sklearn-iris.kserve-test/v1/models/sklearn-iris:predict -d @./iris-input.json You should see two predictions returned (i.e. {\"predictions\": [1, 1]} ). Both sets of data points sent for inference correspond to the flower with index 1 . In this case, the model predicts that both flowers are \"Iris Versicolour\".","title":"5. Perform inference"},{"location":"get_started/first_isvc/#6-run-performance-test-optional","text":"If you want to load test the deployed model, try deploying the following Kubernetes Job to drive load to the model: # use kubectl create instead of apply because the job template is using generateName which doesn't work with kubectl apply kubectl create -f https://raw.githubusercontent.com/kserve/kserve/release-0.11/docs/samples/v1beta1/sklearn/v1/perf.yaml -n kserve-test Execute the following command to view output: kubectl logs load-test8b58n-rgfxr -n kserve-test Expected Output Requests [ total, rate, throughput ] 30000 , 500 .02, 499 .99 Duration [ total, attack, wait ] 1m0s, 59 .998s, 3 .336ms Latencies [ min, mean, 50 , 90 , 95 , 99 , max ] 1 .743ms, 2 .748ms, 2 .494ms, 3 .363ms, 4 .091ms, 7 .749ms, 46 .354ms Bytes In [ total, mean ] 690000 , 23 .00 Bytes Out [ total, mean ] 2460000 , 82 .00 Success [ ratio ] 100 .00% Status Codes [ code:count ] 200 :30000 Error Set:","title":"6. 
Run performance test (optional)"},{"location":"get_started/swagger_ui/","text":"InferenceService Swagger UI \u00b6 KServe ModelServer is built on top of FastAPI , which brings out-of-box support for OpenAPI specification and Swagger UI . Swagger UI allows visualizing and interacting with the KServe InferenceService API directly in the browser , making it easy for exploring the endpoints and validating the outputs without using any command-line tool. Enable Swagger UI \u00b6 Warning Be careful when enabling this for your production InferenceService deployments since the endpoint does not require authentication at this time. Currently, POST request only work for v2 endpoints in the UI. To enable, simply add an extra argument to the InferenceService YAML example from First Inference chapter: kubectl apply -n kserve-test -f - <.github.io/docs/ Where is your Github handle. After a few moments, your changes should be available for public preview at the link provided by MkDocs! This means you can rapidly prototype and share your changes before making a PR! Navigation \u00b6 Navigation in MkDocs uses the \"mkdocs.yml\" file (found in the /docs directory) to organize navigation. For more in-depth information on Navigation, see: https://www.mkdocs.org/user-guide/writing-your-docs/#configure-pages-and-navigation and https://squidfunk.github.io/mkdocs-material/setup/setting-up-navigation/ Content Tabs \u00b6 Content tabs are handy way to organize lots of information in a visually pleasing way. Some documentation from https://squidfunk.github.io/mkdocs-material/reference/content-tabs/#usage is reproduced here: Grouping Code blocks Grouping other content Code blocks are one of the primary targets to be grouped, and can be considered a special case of content tabs, as tabs with a single code block are always rendered without horizontal spacing. 
Example: === \"C\" ``` c #include int main(void) { printf(\"Hello world!\\n\"); return 0; } ``` === \"C++\" ``` c++ #include int main(void) { std::cout << \"Hello world!\" << std::endl; return 0; } ``` Result: C C++ #include int main ( void ) { printf ( \"Hello world! \\n \" ); return 0 ; } #include int main ( void ) { std :: cout << \"Hello world!\" << std :: endl ; return 0 ; } When a content tab contains more than one code block, it is rendered with horizontal spacing. Vertical spacing is never added, but can be achieved by nesting tabs in other blocks. Example: === \"Unordered list\" * Sed sagittis eleifend rutrum * Donec vitae suscipit est * Nulla tempor lobortis orci === \"Ordered list\" 1. Sed sagittis eleifend rutrum 2. Donec vitae suscipit est 3. Nulla tempor lobortis orci Result: Unordered list Ordered list Sed sagittis eleifend rutrum Donec vitae suscipit est Nulla tempor lobortis orci Sed sagittis eleifend rutrum Donec vitae suscipit est Nulla tempor lobortis orci For more information, see: https://squidfunk.github.io/mkdocs-material/reference/content-tabs/#usage File Includes (Content Reuse) \u00b6 KServe strives to reduce duplicative effort by reusing commonly used bits of information, see the docs/snippet directory for some examples. Snippets does not require a specific extension, and as long as a valid file name is specified, it will attempt to process it. Snippets can handle recursive file inclusion. And if Snippets encounters the same file in the current stack, it will avoid re-processing it in order to avoid an infinite loop (or crash on hitting max recursion depth). For more info, see: https://facelessuser.github.io/pymdown-extensions/extensions/snippets/ Admonitions \u00b6 We use the following admonition boxes only. Use admonitions sparingly; too many admonitions can be distracting. Admonitions Formatting Note A Note contains information that is useful, but not essential. A reader can skip a note without bypassing required information. 
If the information suggests an action to take, use a tip instead. Tip A Tip suggests an helpful, but not mandatory, action to take. Warning A Warning draws attention to potential trouble. !!! note A Note contains information that is useful, but not essential. A reader can skip a note without bypassing required information. If the information suggests an action to take, use a tip instead. !!! tip A Tip suggests a helpful, but not mandatory, action to take. !!! warning A Warning draws attention to potential trouble. Icons and Emojis \u00b6 Material for MkDocs supports using Material Icons and Emojis using easy shortcodes. Emojs Formatting :taco: To search a database of Icons and Emojis (all of which can be used on kserve.io), as well as usage information, see: https://squidfunk.github.io/mkdocs-material/reference/icons-emojis/#search Redirects \u00b6 The KServe site uses mkdocs-redirects to \"redirect\" users from a page that may no longer exist (or has been moved) to their desired location. Adding re-directs to the KServe site is done in one centralized place, docs/config/redirects.yml . The format is shown here: plugins: redirects: redirect_maps: ... path_to_old_or_moved_URL : path_to_new_URL","title":"MkDocs Contributions"},{"location":"help/contributor/mkdocs-contributor-guide/#mkdocs-contributions","text":"This is a temporary home for contribution guidelines for the MkDocs branch. When MkDocs becomes \"main\" this will be moved to the appropriate place on the website","title":"MkDocs Contributions"},{"location":"help/contributor/mkdocs-contributor-guide/#install-material-for-mkdocs","text":"kserve.io uses Material for MkDocs to render documentation. Material for MkDocs is Python based and uses pip to install most of it's required packages as well as optional add-ons (which we use). You can choose to install MkDocs locally or using a Docker image. 
pip actually comes pre-installed with Python so it is included in many operating systems (like MacOSx or Ubuntu) but if you don\u2019t have Python, you can install it here: https://www.python.org For some (e.g. folks using RHEL), you may have to use pip3. pip pip3 pip install mkdocs-material mike More detailed instructions can be found here: https://squidfunk.github.io/mkdocs-material/getting-started/#installation pip3 install mkdocs-material mike More detailed instructions can be found here: https://squidfunk.github.io/mkdocs-material/getting-started/#installation","title":"Install Material for MkDocs"},{"location":"help/contributor/mkdocs-contributor-guide/#install-kserve-specific-extensions","text":"KServe uses a number of extensions to MkDocs which can also be installed using pip. If you used pip to install, run the following: pip pip3 pip install mkdocs-material-extensions mkdocs-macros-plugin mkdocs-exclude mkdocs-awesome-pages-plugin mkdocs-redirects pip3 install mkdocs-material-extensions mkdocs-macros-plugin mkdocs-exclude mkdocs-awesome-pages-plugin mkdocs-redirects","title":"Install KServe-Specific Extensions"},{"location":"help/contributor/mkdocs-contributor-guide/#install-dependencies-in-requirementstxt-file","text":"Navigate to root folder and run below command to install required packages and libraries specified in the requirements.txt file. pip pip3 pip install -r requirements.txt pip3 install -r requirements.txt","title":"Install Dependencies in Requirements.txt file"},{"location":"help/contributor/mkdocs-contributor-guide/#setting-up-local-preview","text":"Once you have installed Material for MkDocs and all of the extensions, head over to and clone the repo. In your terminal, find your way over to the location of the cloned repo. Once you are in the main folder and run: Local Preview Local Preview w/ Dirty Reload Local Preview including Blog and Community Site mkdocs serve If you\u2019re only changing a single page in the /docs/ folder (i.e. 
not the homepage or mkdocs.yml) adding the flag --dirtyreload will make the site rebuild super crazy insta-fast. mkdocs serve --dirtyreload First, install the necessary extensions: npm install -g postcss postcss-cli autoprefixer http-server Once you have those npm packages installed, run: ./hack/build-with-blog.sh serve Note Unfortunately, there aren\u2019t live previews for this version of the local preview. After awhile, your terminal should spit out: INFO - Documentation built in 13 .54 seconds [ I 210519 10 :47:10 server:335 ] Serving on http://127.0.0.1:8000 [ I 210519 10 :47:10 handlers:62 ] Start watching changes [ I 210519 10 :47:10 handlers:64 ] Start detecting changes Now access http://127.0.0.1:8000 and you should see the site is built! \ud83c\udf89 Anytime you change any file in your /docs/ repo and hit save, the site will automatically rebuild itself to reflect your changes!","title":"Setting Up Local Preview"},{"location":"help/contributor/mkdocs-contributor-guide/#setting-up-public-preview","text":"If, for whatever reason, you want to share your work before submitting a PR (where Netlify would generate a preview for you), you can deploy your changes as a Github Page easily using the following command: mkdocs gh-deploy --force INFO - Documentation built in 14 .29 seconds WARNING - Version check skipped: No version specified in previous deployment. INFO - Your documentation should shortly be available at: https://.github.io/docs/ Where is your Github handle. After a few moments, your changes should be available for public preview at the link provided by MkDocs! This means you can rapidly prototype and share your changes before making a PR!","title":"Setting Up \"Public\" Preview"},{"location":"help/contributor/mkdocs-contributor-guide/#navigation","text":"Navigation in MkDocs uses the \"mkdocs.yml\" file (found in the /docs directory) to organize navigation. 
For more in-depth information on Navigation, see: https://www.mkdocs.org/user-guide/writing-your-docs/#configure-pages-and-navigation and https://squidfunk.github.io/mkdocs-material/setup/setting-up-navigation/","title":"Navigation"},{"location":"help/contributor/mkdocs-contributor-guide/#content-tabs","text":"Content tabs are a handy way to organize lots of information in a visually pleasing way. Some documentation from https://squidfunk.github.io/mkdocs-material/reference/content-tabs/#usage is reproduced here: Grouping Code blocks Grouping other content Code blocks are one of the primary targets to be grouped, and can be considered a special case of content tabs, as tabs with a single code block are always rendered without horizontal spacing. Example: === \"C\" ``` c #include int main(void) { printf(\"Hello world!\\n\"); return 0; } ``` === \"C++\" ``` c++ #include int main(void) { std::cout << \"Hello world!\" << std::endl; return 0; } ``` Result: C C++ #include int main ( void ) { printf ( \"Hello world! \\n \" ); return 0 ; } #include int main ( void ) { std :: cout << \"Hello world!\" << std :: endl ; return 0 ; } When a content tab contains more than one code block, it is rendered with horizontal spacing. Vertical spacing is never added, but can be achieved by nesting tabs in other blocks. Example: === \"Unordered list\" * Sed sagittis eleifend rutrum * Donec vitae suscipit est * Nulla tempor lobortis orci === \"Ordered list\" 1. Sed sagittis eleifend rutrum 2. Donec vitae suscipit est 3. 
Nulla tempor lobortis orci Result: Unordered list Ordered list Sed sagittis eleifend rutrum Donec vitae suscipit est Nulla tempor lobortis orci Sed sagittis eleifend rutrum Donec vitae suscipit est Nulla tempor lobortis orci For more information, see: https://squidfunk.github.io/mkdocs-material/reference/content-tabs/#usage","title":"Content Tabs"},{"location":"help/contributor/mkdocs-contributor-guide/#file-includes-content-reuse","text":"KServe strives to reduce duplicative effort by reusing commonly used bits of information, see the docs/snippet directory for some examples. Snippets does not require a specific extension, and as long as a valid file name is specified, it will attempt to process it. Snippets can handle recursive file inclusion. And if Snippets encounters the same file in the current stack, it will avoid re-processing it in order to avoid an infinite loop (or crash on hitting max recursion depth). For more info, see: https://facelessuser.github.io/pymdown-extensions/extensions/snippets/","title":"File Includes (Content Reuse)"},{"location":"help/contributor/mkdocs-contributor-guide/#admonitions","text":"We use the following admonition boxes only. Use admonitions sparingly; too many admonitions can be distracting. Admonitions Formatting Note A Note contains information that is useful, but not essential. A reader can skip a note without bypassing required information. If the information suggests an action to take, use a tip instead. Tip A Tip suggests a helpful, but not mandatory, action to take. Warning A Warning draws attention to potential trouble. !!! note A Note contains information that is useful, but not essential. A reader can skip a note without bypassing required information. If the information suggests an action to take, use a tip instead. !!! tip A Tip suggests a helpful, but not mandatory, action to take. !!! 
warning A Warning draws attention to potential trouble.","title":"Admonitions"},{"location":"help/contributor/mkdocs-contributor-guide/#icons-and-emojis","text":"Material for MkDocs supports using Material Icons and Emojis using easy shortcodes. Emojis Formatting :taco: To search a database of Icons and Emojis (all of which can be used on kserve.io), as well as usage information, see: https://squidfunk.github.io/mkdocs-material/reference/icons-emojis/#search","title":"Icons and Emojis"},{"location":"help/contributor/mkdocs-contributor-guide/#redirects","text":"The KServe site uses mkdocs-redirects to \"redirect\" users from a page that may no longer exist (or has been moved) to their desired location. Adding re-directs to the KServe site is done in one centralized place, docs/config/redirects.yml . The format is shown here: plugins: redirects: redirect_maps: ... path_to_old_or_moved_URL : path_to_new_URL","title":"Redirects"},{"location":"help/contributor/templates/template-blog/","text":"Blog template instructions \u00b6 An example template with best-practices that you can use to start drafting an entry to post on the KServe blog. 
Copy a version of this template without the instructions Include a commented-out table with tracking info about reviews and approvals: | YYYY-MM-DD | :+1:, :monocle_face:, :-1: | | | YYYY-MM-DD | :+1:, :monocle_face:, :-1: | --> Blog content body \u00b6 Example step/section 1: \u00b6 Example step/section 2: \u00b6 Example step/section 3: \u00b6 Example section about results \u00b6 Further reading \u00b6 About the author \u00b6 Copy the template \u00b6 | YYYY-MM-DD | :+1:, :monocle_face:, :-1: | | | YYYY-MM-DD | :+1:, :monocle_face:, :-1: | --> # ## Blog content body ### Example step/section 1: ### Example step/section 2: ### Example step/section 3: ### Example section about results ## Further reading ## About the author ","title":"Blog template instructions"},{"location":"help/contributor/templates/template-blog/#blog-template-instructions","text":"An example template with best-practices that you can use to start drafting an entry to post on the KServe blog. Copy a version of this template without the instructions Include a commented-out table with tracking info about reviews and approvals: | YYYY-MM-DD | :+1:, :monocle_face:, :-1: | | | YYYY-MM-DD | :+1:, :monocle_face:, :-1: | -->","title":"Blog template instructions"},{"location":"help/contributor/templates/template-blog/#blog-content-body","text":" ","title":"Blog content body"},{"location":"help/contributor/templates/template-blog/#example-stepsection-1","text":"","title":"Example step/section 1:"},{"location":"help/contributor/templates/template-blog/#example-stepsection-2","text":"","title":"Example step/section 2:"},{"location":"help/contributor/templates/template-blog/#example-stepsection-3","text":"","title":"Example step/section 3:"},{"location":"help/contributor/templates/template-blog/#example-section-about-results","text":"","title":"Example section about results"},{"location":"help/contributor/templates/template-blog/#further-reading","text":"","title":"Further 
reading"},{"location":"help/contributor/templates/template-blog/#about-the-author","text":"","title":"About the author"},{"location":"help/contributor/templates/template-blog/#copy-the-template","text":" | YYYY-MM-DD | :+1:, :monocle_face:, :-1: | | | YYYY-MM-DD | :+1:, :monocle_face:, :-1: | --> # ## Blog content body ### Example step/section 1: ### Example step/section 2: ### Example step/section 3: ### Example section about results ## Further reading ## About the author ","title":"Copy the template"},{"location":"help/contributor/templates/template-concept/","text":"Concept Template \u00b6 Use this template when writing conceptual topics. Conceptual topics explain how things work or what things mean. They provide helpful context to readers. They do not include procedures. Template \u00b6 The following template includes the standard sections that should appear in conceptual topics, including a topic introduction sentence, an overview, and placeholders for additional sections and subsections. Copy and paste the markdown from the template to use it in your topic. This topic describes... Write a sentence or two that describes the topic itself, not the subject of the topic. The goal of the topic sentence is to help readers understand if this topic is for them. For example, \"This topic describes what KServe is and how it works.\" ## Overview Write a few sentences describing the subject of the topic. ## Section Title Write a sentence or two to describe the content in this section. Create more sections as necessary. Optionally, add two or more subsections to each section. Do not skip header levels: H2 >> H3, not H2 >> H4. ### Subsection Title Write a sentence or two to describe the content in this section. ### Subsection Title Write a sentence or two to describe the content in this section. Conceptual Content Samples \u00b6 This section provides common content types that appear in conceptual topics. Copy and paste the markdown to use it in your topic. 
Table \u00b6 Introduce the table with a sentence. For example, \u201cThe following table lists which features are available to a KServe supported ML framework.\u201d Markdown Table Template \u00b6 Header 1 Header 2 Data1 Data2 Data3 Data4 Ordered List \u00b6 Write a sentence or two to introduce the content of the list. For example, \u201cIf you want to fix or add content to a past release, you can find the source files in the following folders.\u201d. Optionally, include bold lead-ins before each list item. Markdown Ordered List Templates \u00b6 Item 1 Item 2 Item 3 Lead-in description: Item 1 Lead-in description: Item 2 Lead-in description: Item 3 Unordered List \u00b6 Write a sentence or two to introduce the content of the list. For example, \u201cYour own path to becoming a KServe contributor can begin in any of the following components:\u201d. Optionally, include bold lead-ins before each list item. Markdown Unordered List Template \u00b6 List item List item List item Lead-in : List item Lead-in : List item Lead-in : List item Note \u00b6 Ensure the text beneath the note is indented as much as note is. Note This is a note. Warning \u00b6 If the note regards an issue that could lead to data loss, the note should be a warning. Warning This is a warning.","title":"Concept Template"},{"location":"help/contributor/templates/template-concept/#concept-template","text":"Use this template when writing conceptual topics. Conceptual topics explain how things work or what things mean. They provide helpful context to readers. They do not include procedures.","title":"Concept Template"},{"location":"help/contributor/templates/template-concept/#template","text":"The following template includes the standard sections that should appear in conceptual topics, including a topic introduction sentence, an overview, and placeholders for additional sections and subsections. Copy and paste the markdown from the template to use it in your topic. This topic describes... 
Write a sentence or two that describes the topic itself, not the subject of the topic. The goal of the topic sentence is to help readers understand if this topic is for them. For example, \"This topic describes what KServe is and how it works.\" ## Overview Write a few sentences describing the subject of the topic. ## Section Title Write a sentence or two to describe the content in this section. Create more sections as necessary. Optionally, add two or more subsections to each section. Do not skip header levels: H2 >> H3, not H2 >> H4. ### Subsection Title Write a sentence or two to describe the content in this section. ### Subsection Title Write a sentence or two to describe the content in this section.","title":"Template"},{"location":"help/contributor/templates/template-concept/#conceptual-content-samples","text":"This section provides common content types that appear in conceptual topics. Copy and paste the markdown to use it in your topic.","title":"Conceptual Content Samples"},{"location":"help/contributor/templates/template-concept/#table","text":"Introduce the table with a sentence. For example, \u201cThe following table lists which features are available to a KServe supported ML framework.\u201d","title":"Table"},{"location":"help/contributor/templates/template-concept/#markdown-table-template","text":"Header 1 Header 2 Data1 Data2 Data3 Data4","title":"Markdown Table Template"},{"location":"help/contributor/templates/template-concept/#ordered-list","text":"Write a sentence or two to introduce the content of the list. For example, \u201cIf you want to fix or add content to a past release, you can find the source files in the following folders.\u201d. 
Optionally, include bold lead-ins before each list item.","title":"Ordered List"},{"location":"help/contributor/templates/template-concept/#markdown-ordered-list-templates","text":"Item 1 Item 2 Item 3 Lead-in description: Item 1 Lead-in description: Item 2 Lead-in description: Item 3","title":"Markdown Ordered List Templates"},{"location":"help/contributor/templates/template-concept/#unordered-list","text":"Write a sentence or two to introduce the content of the list. For example, \u201cYour own path to becoming a KServe contributor can begin in any of the following components:\u201d. Optionally, include bold lead-ins before each list item.","title":"Unordered List"},{"location":"help/contributor/templates/template-concept/#markdown-unordered-list-template","text":"List item List item List item Lead-in : List item Lead-in : List item Lead-in : List item","title":"Markdown Unordered List Template"},{"location":"help/contributor/templates/template-concept/#note","text":"Ensure the text beneath the note is indented as much as note is. Note This is a note.","title":"Note"},{"location":"help/contributor/templates/template-concept/#warning","text":"If the note regards an issue that could lead to data loss, the note should be a warning. Warning This is a warning.","title":"Warning"},{"location":"help/contributor/templates/template-procedure/","text":"Procedure template \u00b6 Use this template when writing procedural (how-to) topics. Procedural topics include detailed steps to perform a task as well as some context about the task. Template \u00b6 The following template includes the standard sections that should appear in procedural topics, including a topic sentence, an overview section, and sections for each task within the procedure. Copy and paste the markdown from the template to use it in your topic. This topic describes... Write a sentence or two that describes the topic itself, not the subject of the topic. 
The goal of the topic sentence is to help readers understand if this topic is for them. For example, \"This topic instructs how to serve a TensorFlow model.\" ## Overview Write a few sentences to describe the subject of the topic, if useful. For example, if the topic is about configuring a broker, you might provide some useful context about brokers. If there are multiple tasks in the procedure and they must be completed in order, create an ordered list that contains each task in the topic. Use bullets for sub-tasks. Include anchor links to the headings for each task. To [task]: 1. [Name of Task 1 (for example, Apply default configuration)](#task-1) 1. [Optional: Name of Task 2](#task-2) !!! note Unless the number of tasks in the procedure is particularly high, do not use numbered lead-ins in the task headings. For example, instead of \"Task 1: Apply default configuration\", use \"Apply default configuration\". ## Prerequisites Use one of the following formats for the Prerequisites section. ### Formatting for two or more prerequisites If there are two or more prerequisites, use the following format. Include links for more information, if necessary. Before you [task], you must have/do: * Prerequisite. See [Link](). * Prerequisite. See [Link](). For example: Before you deploy PyTorch model, you must have: * KServe. See [Installing the KServe](link-to-that-topic). * An Apache Kafka cluster. See [Link to Instructions to Download](link-to-that-topic). ### Format for one prerequisite If there is one prerequisite, use the following format. Include a link for more information, if necessary. Before you [task], you must have/do [prerequisite]. See [Link](link). For example: Before you create the `InferenceService`, you must have a Kubernetes cluster with KServe installed and DNS configured. See the [installation instructions](../../../install/README.md) if you need to create one. ## Task 1 Write a few sentences to describe the task and provide additional context on the task. 
!!! note When writing a single-step procedure, write the step in one sentence and make it a bullet. The signposting is important given readers are strongly inclined to look for numbered steps and bullet points when searching for instructions. If possible, expand the procedure to include at least one more step. Few procedures truly require a single step. [Task]: 1. Step 1 1. Step 2 ## Optional: Task 2 If the task is optional, put \"Optional:\" in the heading. Write a few sentences to describe the task and provide additional context on the task. [Task]: 1. Step 1 2. Step 2 Procedure Content Samples \u00b6 This section provides common content types that appear in procedural topics. Copy and paste the markdown to use it in your topic. \u201cFill-in-the-Fields\u201d Table \u00b6 Where the reader must enter many values in, for example, a YAML file, use a table within the procedure as follows: Open the YAML file. Key1 : Value1 Key2 : Value2 metadata : annotations : # case-sensitive Key3 : Value3 Key4 : Value4 Key5 : Value5 spec : # Configuration specific to this broker. config : Key6 : Value6 Change the relevant values to your needs, using the following table as a guide. Key Value Type Description Key1 String Description Key2 Integer Description Key3 String Description Key4 String Description Key5 Float Description Key6 String Description Table \u00b6 Introduce the table with a sentence. For example, \u201cThe following table lists which features are available to a KServe supported ML framework. Markdown Table Template \u00b6 Header 1 Header 2 Data1 Data2 Data3 Data4 Ordered List \u00b6 Write a sentence or two to introduce the content of the list. For example, \u201cIf you want to fix or add content to a past release, you can find the source files in the following folders.\u201d. Optionally, include bold lead-ins before each list item. 
Markdown Ordered List Templates \u00b6 Item 1 Item 2 Item 3 Lead-in description: Item 1 Lead-in description: Item 2 Lead-in description: Item 3 Unordered List \u00b6 Write a sentence or two to introduce the content of the list. For example, \u201cYour own path to becoming a KServe contributor can begin in any of the following components:\u201d. Optionally, include bold lead-ins before each list item. Markdown Unordered List Template \u00b6 List item List item List item Lead-in : List item Lead-in : List item Lead-in : List item Note \u00b6 Ensure the text beneath the note is indented as much as note is. Note This is a note. Warning \u00b6 If the note regards an issue that could lead to data loss, the note should be a warning. Warning This is a warning. Markdown Embedded Image \u00b6 The following is an embedded image reference in markdown. Tabs \u00b6 Place multiple versions of the same procedure (such as a CLI procedure vs a YAML procedure) within tabs. Indent the opening tabs tags 3 spaces to make the tabs display properly. == \"tab1 name\" This is a stem: 1. This is a step. ``` This is some code. ``` 1. This is another step. == \"tab2 name\" This is a stem: 1. This is a step. ``` This is some code. ``` 1. This is another step. Documenting Code and Code Snippets \u00b6 For instructions on how to format code and code snippets, see the Style Guide.","title":"Procedure template"},{"location":"help/contributor/templates/template-procedure/#procedure-template","text":"Use this template when writing procedural (how-to) topics. Procedural topics include detailed steps to perform a task as well as some context about the task.","title":"Procedure template"},{"location":"help/contributor/templates/template-procedure/#template","text":"The following template includes the standard sections that should appear in procedural topics, including a topic sentence, an overview section, and sections for each task within the procedure. 
Copy and paste the markdown from the template to use it in your topic. This topic describes... Write a sentence or two that describes the topic itself, not the subject of the topic. The goal of the topic sentence is to help readers understand if this topic is for them. For example, \"This topic instructs how to serve a TensorFlow model.\" ## Overview Write a few sentences to describe the subject of the topic, if useful. For example, if the topic is about configuring a broker, you might provide some useful context about brokers. If there are multiple tasks in the procedure and they must be completed in order, create an ordered list that contains each task in the topic. Use bullets for sub-tasks. Include anchor links to the headings for each task. To [task]: 1. [Name of Task 1 (for example, Apply default configuration)](#task-1) 1. [Optional: Name of Task 2](#task-2) !!! note Unless the number of tasks in the procedure is particularly high, do not use numbered lead-ins in the task headings. For example, instead of \"Task 1: Apply default configuration\", use \"Apply default configuration\". ## Prerequisites Use one of the following formats for the Prerequisites section. ### Formatting for two or more prerequisites If there are two or more prerequisites, use the following format. Include links for more information, if necessary. Before you [task], you must have/do: * Prerequisite. See [Link](). * Prerequisite. See [Link](). For example: Before you deploy PyTorch model, you must have: * KServe. See [Installing the KServe](link-to-that-topic). * An Apache Kafka cluster. See [Link to Instructions to Download](link-to-that-topic). ### Format for one prerequisite If there is one prerequisite, use the following format. Include a link for more information, if necessary. Before you [task], you must have/do [prerequisite]. See [Link](link). For example: Before you create the `InferenceService`, you must have a Kubernetes cluster with KServe installed and DNS configured. 
See the [installation instructions](../../../install/README.md) if you need to create one. ## Task 1 Write a few sentences to describe the task and provide additional context on the task. !!! note When writing a single-step procedure, write the step in one sentence and make it a bullet. The signposting is important given readers are strongly inclined to look for numbered steps and bullet points when searching for instructions. If possible, expand the procedure to include at least one more step. Few procedures truly require a single step. [Task]: 1. Step 1 1. Step 2 ## Optional: Task 2 If the task is optional, put \"Optional:\" in the heading. Write a few sentences to describe the task and provide additional context on the task. [Task]: 1. Step 1 2. Step 2","title":"Template"},{"location":"help/contributor/templates/template-procedure/#procedure-content-samples","text":"This section provides common content types that appear in procedural topics. Copy and paste the markdown to use it in your topic.","title":"Procedure Content Samples"},{"location":"help/contributor/templates/template-procedure/#fill-in-the-fields-table","text":"Where the reader must enter many values in, for example, a YAML file, use a table within the procedure as follows: Open the YAML file. Key1 : Value1 Key2 : Value2 metadata : annotations : # case-sensitive Key3 : Value3 Key4 : Value4 Key5 : Value5 spec : # Configuration specific to this broker. config : Key6 : Value6 Change the relevant values to your needs, using the following table as a guide. Key Value Type Description Key1 String Description Key2 Integer Description Key3 String Description Key4 String Description Key5 Float Description Key6 String Description","title":"\u201cFill-in-the-Fields\u201d Table"},{"location":"help/contributor/templates/template-procedure/#table","text":"Introduce the table with a sentence. 
For example, \u201cThe following table lists which features are available to a KServe supported ML framework.\u201d","title":"Table"},{"location":"help/contributor/templates/template-procedure/#markdown-table-template","text":"Header 1 Header 2 Data1 Data2 Data3 Data4","title":"Markdown Table Template"},{"location":"help/contributor/templates/template-procedure/#ordered-list","text":"Write a sentence or two to introduce the content of the list. For example, \u201cIf you want to fix or add content to a past release, you can find the source files in the following folders.\u201d. Optionally, include bold lead-ins before each list item.","title":"Ordered List"},{"location":"help/contributor/templates/template-procedure/#markdown-ordered-list-templates","text":"Item 1 Item 2 Item 3 Lead-in description: Item 1 Lead-in description: Item 2 Lead-in description: Item 3","title":"Markdown Ordered List Templates"},{"location":"help/contributor/templates/template-procedure/#unordered-list","text":"Write a sentence or two to introduce the content of the list. For example, \u201cYour own path to becoming a KServe contributor can begin in any of the following components:\u201d. Optionally, include bold lead-ins before each list item.","title":"Unordered List"},{"location":"help/contributor/templates/template-procedure/#markdown-unordered-list-template","text":"List item List item List item Lead-in : List item Lead-in : List item Lead-in : List item","title":"Markdown Unordered List Template"},{"location":"help/contributor/templates/template-procedure/#note","text":"Ensure the text beneath the note is indented as much as note is. Note This is a note.","title":"Note"},{"location":"help/contributor/templates/template-procedure/#warning","text":"If the note regards an issue that could lead to data loss, the note should be a warning. 
Warning This is a warning.","title":"Warning"},{"location":"help/contributor/templates/template-procedure/#markdown-embedded-image","text":"The following is an embedded image reference in markdown.","title":"Markdown Embedded Image"},{"location":"help/contributor/templates/template-procedure/#tabs","text":"Place multiple versions of the same procedure (such as a CLI procedure vs a YAML procedure) within tabs. Indent the opening tabs tags 3 spaces to make the tabs display properly. == \"tab1 name\" This is a stem: 1. This is a step. ``` This is some code. ``` 1. This is another step. == \"tab2 name\" This is a stem: 1. This is a step. ``` This is some code. ``` 1. This is another step.","title":"Tabs"},{"location":"help/contributor/templates/template-procedure/#documenting-code-and-code-snippets","text":"For instructions on how to format code and code snippets, see the Style Guide.","title":"Documenting Code and Code Snippets"},{"location":"help/contributor/templates/template-troubleshooting/","text":"Troubleshooting template \u00b6 When writing guidance to help to troubleshoot specific errors, the error must include: Error Description: To describe the error very briefly so that users can search for it easily. Symptom: To describe the error in a way that helps users to diagnose their issue. Include error messages or anything else users might see if they encounter this error. Explanation (or cause): To inform users about why they are seeing this error. This can be omitted if the cause of the error is unknown. Solution: To inform the user about how to fix the error. Example Troubleshooting Table \u00b6 Troubleshooting \u00b6 | Error Description | |----------|------------| | Symptom | During the event something breaks. | | Cause | The thing is broken. | | Solution | To solve this issue, do the following: 1. This. 2. That. 
|","title":"Troubleshooting template"},{"location":"help/contributor/templates/template-troubleshooting/#troubleshooting-template","text":"When writing guidance to help to troubleshoot specific errors, the error must include: Error Description: To describe the error very briefly so that users can search for it easily. Symptom: To describe the error in a way that helps users to diagnose their issue. Include error messages or anything else users might see if they encounter this error. Explanation (or cause): To inform users about why they are seeing this error. This can be omitted if the cause of the error is unknown. Solution: To inform the user about how to fix the error.","title":"Troubleshooting template"},{"location":"help/contributor/templates/template-troubleshooting/#example-troubleshooting-table","text":"","title":"Example Troubleshooting Table"},{"location":"help/contributor/templates/template-troubleshooting/#troubleshooting","text":"| Error Description | |----------|------------| | Symptom | During the event something breaks. | | Cause | The thing is broken. | | Solution | To solve this issue, do the following: 1. This. 2. That. |","title":"Troubleshooting"},{"location":"help/style-guide/documenting-code/","text":"Documenting Code \u00b6 Words requiring code formatting \u00b6 Apply code formatting only to special-purpose text: Filenames Path names Fields and values from a YAML file Any text that goes into a CLI CLI names Specify the programming language \u00b6 Specify the language your code is in as part of the code block Specify non-language specific code, like CLI commands, with ```bash. See the following examples for formatting. Correct Incorrect Correct Formatting Incorrect Formatting package main import \"fmt\" func main () { fmt . 
Println ( \"hello world\" ) } package main import \"fmt\" func main () { fmt.Println ( \"hello world\" ) } ```go package main import \"fmt\" func main() { fmt.Println(\"hello world\") } ``` ```bash package main import \"fmt\" func main() { fmt.Println(\"hello world\") } ``` Documenting YAML \u00b6 When documenting YAML, use two steps. Use step 1 to create the YAML file, and step 2 to apply the YAML file. Use kubectl apply for files/objects that the user creates: it works for both \u201ccreate\u201d and \u201cupdate\u201d, and the source of truth is their local files. Use kubectl edit for files which are shipped as part of the KServe software, like the KServe ConfigMaps. Write ```yaml at the beginning of your code block if you are typing YAML code as part of a CLI command. Correct Incorrect Creating or updating a resource: Create a YAML file using the following template: # YAML FILE CONTENTS Apply the YAML file by running the command: kubectl apply -f .yaml Where is the name of the file you created in the previous step. Editing a ConfigMap: kubectl -n edit configmap Example 1: cat < is\u2026\" Single variable \u00b6 Correct Incorrect kubectl get isvc Where is the name of your InferenceService. kubectl get isvc { SERVICE_NAME } {SERVICE_NAME} = The name of your service Multiple variables \u00b6 Correct Incorrect kn create service --revision-name Where: is the name of your Knative Service. is the desired name of your revision. kn create service --revision-name Where is the name of your Knative Service. Where is the desired name of your revision. 
CLI output \u00b6 CLI Output should include the custom css \"{ .bash .no-copy }\" in place of \"bash\" which removes the \"Copy to clipboard button\" on the right side of the code block Correct Incorrect Correct Formatting Incorrect Formatting ```{ .bash .no-copy } ``` ```bash ```","title":"Documenting Code"},{"location":"help/style-guide/documenting-code/#documenting-code","text":"","title":"Documenting Code"},{"location":"help/style-guide/documenting-code/#words-requiring-code-formatting","text":"Apply code formatting only to special-purpose text: Filenames Path names Fields and values from a YAML file Any text that goes into a CLI CLI names","title":"Words requiring code formatting"},{"location":"help/style-guide/documenting-code/#specify-the-programming-language","text":"Specify the language your code is in as part of the code block Specify non-language specific code, like CLI commands, with ```bash. See the following examples for formatting. Correct Incorrect Correct Formatting Incorrect Formatting package main import \"fmt\" func main () { fmt . Println ( \"hello world\" ) } package main import \"fmt\" func main () { fmt.Println ( \"hello world\" ) } ```go package main import \"fmt\" func main() { fmt.Println(\"hello world\") } ``` ```bash package main import \"fmt\" func main() { fmt.Println(\"hello world\") } ```","title":"Specify the programming language"},{"location":"help/style-guide/documenting-code/#documenting-yaml","text":"When documenting YAML, use two steps. Use step 1 to create the YAML file, and step 2 to apply the YAML file. Use kubectl apply for files/objects that the user creates: it works for both \u201ccreate\u201d and \u201cupdate\u201d, and the source of truth is their local files. Use kubectl edit for files which are shipped as part of the KServe software, like the KServe ConfigMaps. Write ```yaml at the beginning of your code block if you are typing YAML code as part of a CLI command. 
Correct Incorrect Creating or updating a resource: Create a YAML file using the following template: # YAML FILE CONTENTS Apply the YAML file by running the command: kubectl apply -f .yaml Where is the name of the file you created in the previous step. Editing a ConfigMap: kubectl -n edit configmap Example 1: cat < is\u2026\"","title":"Referencing variables in code blocks"},{"location":"help/style-guide/documenting-code/#single-variable","text":"Correct Incorrect kubectl get isvc Where is the name of your InferenceService. kubectl get isvc { SERVICE_NAME } {SERVICE_NAME} = The name of your service","title":"Single variable"},{"location":"help/style-guide/documenting-code/#multiple-variables","text":"Correct Incorrect kn create service --revision-name Where: is the name of your Knative Service. is the desired name of your revision. kn create service --revision-name Where is the name of your Knative Service. Where is the desired name of your revision.","title":"Multiple variables"},{"location":"help/style-guide/documenting-code/#cli-output","text":"CLI Output should include the custom css \"{ .bash .no-copy }\" in place of \"bash\" which removes the \"Copy to clipboard button\" on the right side of the code block Correct Incorrect Correct Formatting Incorrect Formatting ```{ .bash .no-copy } ``` ```bash ```","title":"CLI output"},{"location":"help/style-guide/style-and-formatting/","text":"Formatting standards and conventions \u00b6 Titles and headings \u00b6 Use sentence case for titles and headings \u00b6 Only capitalize proper nouns, acronyms, and the first word of the heading. 
Correct Incorrect ## Configure the feature ## Configure the Feature ### Using feature ### Using Feature ### Using HTTPS ### Using https Do not use code formatting inside headings \u00b6 Correct Incorrect ## Configure the class annotation ## Configure the `class` annotation Use imperatives for headings of procedures \u00b6 For consistency, brevity, and to better signpost where action is expected of the reader, make procedure headings imperatives. Correct Incorrect ## Install KServe ## Installation of KServe ### Configure DNS ### Configuring DNS ## Verify the installation ## How to verify the installation Links \u00b6 Describe what the link targets \u00b6 Correct Incorrect For an explanation of what makes a good hyperlink, see this this article . See this article here . Write links in Markdown, not HTML \u00b6 Correct Incorrect [Kafka Broker](../kafka-broker/README.md) Kafka Broker [Kafka Broker](../kafka-broker/README.md){target=_blank} Kafka Broker Include the .md extension in internal links \u00b6 Correct Incorrect [Setting up a custom domain](../serving/using-a-custom-domain.md) [Setting up a custom domain](../serving/using-a-custom-domain) Link to files, not folders \u00b6 Correct Incorrect [Kafka Broker](../kafka-broker/README.md) [Kafka Broker](../kafka-broker/) Ensure the letter case is correct \u00b6 Correct Incorrect [Kafka Broker](../kafka-broker/README.md) [Kafka Broker](../kafka-broker/readme.md) Formatting \u00b6 Use nonbreaking spaces in units of measurement other than percent \u00b6 For most units of measurement, when you specify a number with the unit, use a nonbreaking space between the number and the unit. Don't use spacing when the unit of measurement is percent. 
Correct Incorrect 3   GB 3 GB 4   CPUs 4 CPUs 14% 14   % Use bold for user interface elements \u00b6 Correct Incorrect Click Fork Click \"Fork\" Select Other Select \"Other\" Use tables for definition lists \u00b6 When listing terms and their definitions, use table formatting instead of definition list formatting. Correct Incorrect |Value |Description | |------|---------------------| |Value1|Description of Value1| |Value2|Description of Value2| Value1 : Description of Value1 Value2 : Description of Value2 General style \u00b6 Use upper camel case for KServe API objects \u00b6 Correct Incorrect Explainers explainers Transformer transformer InferenceService Inference Service Only use parentheses for acronym explanations \u00b6 Put an acronym inside parentheses after its explanation. Don\u2019t use parentheses for anything else. Parenthetical statements especially should be avoided because readers skip them. If something is important enough to be in the sentence, it should be fully part of that sentence. Correct Incorrect Custom Resource Definition (CRD) Check your CLI (you should see it there) Knative Serving creates a Revision Knative creates a Revision (a stateless, snapshot in time of your code and configuration) Use the international standard for punctuation inside quotes \u00b6 Correct Incorrect Events are recorded with an associated \"stage\". Events are recorded with an associated \"stage.\" The copy is called a \"fork\". The copy is called a \"fork.\"","title":"Formatting standards and conventions"},{"location":"help/style-guide/style-and-formatting/#formatting-standards-and-conventions","text":"","title":"Formatting standards and conventions"},{"location":"help/style-guide/style-and-formatting/#titles-and-headings","text":"","title":"Titles and headings"},{"location":"help/style-guide/style-and-formatting/#use-sentence-case-for-titles-and-headings","text":"Only capitalize proper nouns, acronyms, and the first word of the heading. 
Correct Incorrect ## Configure the feature ## Configure the Feature ### Using feature ### Using Feature ### Using HTTPS ### Using https","title":"Use sentence case for titles and headings"},{"location":"help/style-guide/style-and-formatting/#do-not-use-code-formatting-inside-headings","text":"Correct Incorrect ## Configure the class annotation ## Configure the `class` annotation","title":"Do not use code formatting inside headings"},{"location":"help/style-guide/style-and-formatting/#use-imperatives-for-headings-of-procedures","text":"For consistency, brevity, and to better signpost where action is expected of the reader, make procedure headings imperatives. Correct Incorrect ## Install KServe ## Installation of KServe ### Configure DNS ### Configuring DNS ## Verify the installation ## How to verify the installation","title":"Use imperatives for headings of procedures"},{"location":"help/style-guide/style-and-formatting/#links","text":"","title":"Links"},{"location":"help/style-guide/style-and-formatting/#describe-what-the-link-targets","text":"Correct Incorrect For an explanation of what makes a good hyperlink, see this this article . 
See this article here .","title":"Describe what the link targets"},{"location":"help/style-guide/style-and-formatting/#write-links-in-markdown-not-html","text":"Correct Incorrect [Kafka Broker](../kafka-broker/README.md) Kafka Broker [Kafka Broker](../kafka-broker/README.md){target=_blank} Kafka Broker ","title":"Write links in Markdown, not HTML"},{"location":"help/style-guide/style-and-formatting/#include-the-md-extension-in-internal-links","text":"Correct Incorrect [Setting up a custom domain](../serving/using-a-custom-domain.md) [Setting up a custom domain](../serving/using-a-custom-domain)","title":"Include the .md extension in internal links"},{"location":"help/style-guide/style-and-formatting/#link-to-files-not-folders","text":"Correct Incorrect [Kafka Broker](../kafka-broker/README.md) [Kafka Broker](../kafka-broker/)","title":"Link to files, not folders"},{"location":"help/style-guide/style-and-formatting/#ensure-the-letter-case-is-correct","text":"Correct Incorrect [Kafka Broker](../kafka-broker/README.md) [Kafka Broker](../kafka-broker/readme.md)","title":"Ensure the letter case is correct"},{"location":"help/style-guide/style-and-formatting/#formatting","text":"","title":"Formatting"},{"location":"help/style-guide/style-and-formatting/#use-nonbreaking-spaces-in-units-of-measurement-other-than-percent","text":"For most units of measurement, when you specify a number with the unit, use a nonbreaking space between the number and the unit. Don't use spacing when the unit of measurement is percent. 
Correct Incorrect 3   GB 3 GB 4   CPUs 4 CPUs 14% 14   %","title":"Use nonbreaking spaces in units of measurement other than percent"},{"location":"help/style-guide/style-and-formatting/#use-bold-for-user-interface-elements","text":"Correct Incorrect Click Fork Click \"Fork\" Select Other Select \"Other\"","title":"Use bold for user interface elements"},{"location":"help/style-guide/style-and-formatting/#use-tables-for-definition-lists","text":"When listing terms and their definitions, use table formatting instead of definition list formatting. Correct Incorrect |Value |Description | |------|---------------------| |Value1|Description of Value1| |Value2|Description of Value2| Value1 : Description of Value1 Value2 : Description of Value2","title":"Use tables for definition lists"},{"location":"help/style-guide/style-and-formatting/#general-style","text":"","title":"General style"},{"location":"help/style-guide/style-and-formatting/#use-upper-camel-case-for-kserve-api-objects","text":"Correct Incorrect Explainers explainers Transformer transformer InferenceService Inference Service","title":"Use upper camel case for KServe API objects"},{"location":"help/style-guide/style-and-formatting/#only-use-parentheses-for-acronym-explanations","text":"Put an acronym inside parentheses after its explanation. Don\u2019t use parentheses for anything else. Parenthetical statements especially should be avoided because readers skip them. If something is important enough to be in the sentence, it should be fully part of that sentence. 
Correct Incorrect Custom Resource Definition (CRD) Check your CLI (you should see it there) Knative Serving creates a Revision Knative creates a Revision (a stateless, snapshot in time of your code and configuration)","title":"Only use parentheses for acronym explanations"},{"location":"help/style-guide/style-and-formatting/#use-the-international-standard-for-punctuation-inside-quotes","text":"Correct Incorrect Events are recorded with an associated \"stage\". Events are recorded with an associated \"stage.\" The copy is called a \"fork\". The copy is called a \"fork.\"","title":"Use the international standard for punctuation inside quotes"},{"location":"help/style-guide/voice-and-language/","text":"Voice and language \u00b6 Use present tense \u00b6 Correct Incorrect This command starts a proxy. This command will start a proxy. Use active voice \u00b6 Correct Incorrect You can explore the API using a browser. The API can be explored using a browser. The YAML file specifies the replica count. The replica count is specified in the YAML file. Use simple and direct language \u00b6 Use simple and direct language. Avoid using unnecessary words, such as \"please\". Correct Incorrect To create a ReplicaSet , ... In order to create a ReplicaSet , ... See the configuration file. Please see the configuration file. View the Pods. With this next command, we'll view the Pods. Address the reader as \"you\", not \"we\" \u00b6 Correct Incorrect You can create a Deployment by ... We can create a Deployment by ... In the preceding output, you can see... In the preceding output, we can see ... This page teaches you how to use pods. In this page, we are going to learn about pods. Avoid jargon, idioms, and Latin \u00b6 Some readers speak English as a second language. Avoid jargon, idioms, and Latin to help make their understanding easier. Correct Incorrect Internally, ... Under the hood, ... Create a new cluster. Turn up a new cluster. Initially, ... Out of the box, ... For example, ... 
e.g., ... Enter through the gateway ... Enter via the gateway ... Avoid statements about the future \u00b6 Avoid making promises or giving hints about the future. If you need to talk about a feature in development, add a boilerplate under the front matter that identifies the information accordingly. Avoid statements that will soon be out of date \u00b6 Avoid using wording that becomes outdated quickly like \"currently\" and \"new\". A feature that is new today is not new for long. Correct Incorrect In version 1.4, ... In the current version, ... The Federation feature provides ... The new Federation feature provides ... Avoid words that assume a specific level of understanding \u00b6 Avoid words such as \"just\", \"simply\", \"easy\", \"easily\", or \"simple\". These words do not add value. Correct Incorrect Include one command in ... Include just one command in ... Run the container ... Simply run the container ... You can remove ... You can easily remove ... These steps ... These simple steps ...","title":"Voice and language"},{"location":"help/style-guide/voice-and-language/#voice-and-language","text":"","title":"Voice and language"},{"location":"help/style-guide/voice-and-language/#use-present-tense","text":"Correct Incorrect This command starts a proxy. This command will start a proxy.","title":"Use present tense"},{"location":"help/style-guide/voice-and-language/#use-active-voice","text":"Correct Incorrect You can explore the API using a browser. The API can be explored using a browser. The YAML file specifies the replica count. The replica count is specified in the YAML file.","title":"Use active voice"},{"location":"help/style-guide/voice-and-language/#use-simple-and-direct-language","text":"Use simple and direct language. Avoid using unnecessary words, such as \"please\". Correct Incorrect To create a ReplicaSet , ... In order to create a ReplicaSet , ... See the configuration file. Please see the configuration file. View the Pods. 
With this next command, we'll view the Pods.","title":"Use simple and direct language"},{"location":"help/style-guide/voice-and-language/#address-the-reader-as-you-not-we","text":"Correct Incorrect You can create a Deployment by ... We can create a Deployment by ... In the preceding output, you can see... In the preceding output, we can see ... This page teaches you how to use pods. In this page, we are going to learn about pods.","title":"Address the reader as \"you\", not \"we\""},{"location":"help/style-guide/voice-and-language/#avoid-jargon-idioms-and-latin","text":"Some readers speak English as a second language. Avoid jargon, idioms, and Latin to help make their understanding easier. Correct Incorrect Internally, ... Under the hood, ... Create a new cluster. Turn up a new cluster. Initially, ... Out of the box, ... For example, ... e.g., ... Enter through the gateway ... Enter via the gateway ...","title":"Avoid jargon, idioms, and Latin"},{"location":"help/style-guide/voice-and-language/#avoid-statements-about-the-future","text":"Avoid making promises or giving hints about the future. If you need to talk about a feature in development, add a boilerplate under the front matter that identifies the information accordingly.","title":"Avoid statements about the future"},{"location":"help/style-guide/voice-and-language/#avoid-statements-that-will-soon-be-out-of-date","text":"Avoid using wording that becomes outdated quickly like \"currently\" and \"new\". A feature that is new today is not new for long. Correct Incorrect In version 1.4, ... In the current version, ... The Federation feature provides ... The new Federation feature provides ...","title":"Avoid statements that will soon be out of date"},{"location":"help/style-guide/voice-and-language/#avoid-words-that-assume-a-specific-level-of-understanding","text":"Avoid words such as \"just\", \"simply\", \"easy\", \"easily\", or \"simple\". These words do not add value. 
Correct Incorrect Include one command in ... Include just one command in ... Run the container ... Simply run the container ... You can remove ... You can easily remove ... These steps ... These simple steps ...","title":"Avoid words that assume a specific level of understanding"},{"location":"modelserving/control_plane/","text":"Control Plane \u00b6 KServe Control Plane : Responsible for reconciling the InferenceService custom resources. It creates the Knative serverless deployment for predictor, transformer, explainer to enable autoscaling based on incoming request workload including scaling down to zero when no traffic is received. When raw deployment mode is enabled, control plane creates Kubernetes deployment, service, ingress, HPA. Control Plane Components \u00b6 KServe Controller : Responsible for creating service, ingress resources, model server container and model agent container for request/response logging , batching and model pulling. Ingress Gateway : Gateway for routing external or internal requests. In Serverless Mode: Knative Serving Controller : Responsible for service revision management, creating network routing resources, serverless container with queue proxy to expose traffic metrics and enforce concurrency limit. Knative Activator : Brings back scaled-to-zero pods and forwards requests. Knative Autoscaler(KPA) : Watches traffic flow to the application, and scales replicas up or down based on configured metrics.","title":"Model Serving Control Plane"},{"location":"modelserving/control_plane/#control-plane","text":"KServe Control Plane : Responsible for reconciling the InferenceService custom resources. It creates the Knative serverless deployment for predictor, transformer, explainer to enable autoscaling based on incoming request workload including scaling down to zero when no traffic is received. 
When raw deployment mode is enabled, control plane creates Kubernetes deployment, service, ingress, HPA.","title":"Control Plane"},{"location":"modelserving/control_plane/#control-plane-components","text":"KServe Controller : Responsible for creating service, ingress resources, model server container and model agent container for request/response logging , batching and model pulling. Ingress Gateway : Gateway for routing external or internal requests. In Serverless Mode: Knative Serving Controller : Responsible for service revision management, creating network routing resources, serverless container with queue proxy to expose traffic metrics and enforce concurrency limit. Knative Activator : Brings back scaled-to-zero pods and forwards requests. Knative Autoscaler(KPA) : Watches traffic flow to the application, and scales replicas up or down based on configured metrics.","title":"Control Plane Components"},{"location":"modelserving/servingruntimes/","text":"Serving Runtimes \u00b6 KServe makes use of two CRDs for defining model serving environments: ServingRuntimes and ClusterServingRuntimes The only difference between the two is that one is namespace-scoped and the other is cluster-scoped. A ServingRuntime defines the templates for Pods that can serve one or more particular model formats. Each ServingRuntime defines key information such as the container image of the runtime and a list of the model formats that the runtime supports. Other configuration settings for the runtime can be conveyed through environment variables in the container specification. These CRDs allow for improved flexibility and extensibility, enabling users to quickly define or customize reusable runtimes without having to modify any controller code or any resources in the controller namespace. 
The following is an example of a ServingRuntime: apiVersion : serving.kserve.io/v1alpha1 kind : ServingRuntime metadata : name : example-runtime spec : supportedModelFormats : - name : example-format version : \"1\" autoSelect : true containers : - name : kserve-container image : examplemodelserver:latest args : - --model_dir=/mnt/models - --http_port=8080 Several out-of-the-box ClusterServingRuntimes are provided with KServe so that users can quickly deploy common model formats without having to define the runtimes themselves. Name Supported Model Formats kserve-lgbserver LightGBM kserve-mlserver SKLearn, XGBoost, LightGBM, MLflow kserve-paddleserver Paddle kserve-pmmlserver PMML kserve-sklearnserver SKLearn kserve-tensorflow-serving TensorFlow kserve-torchserve PyTorch kserve-tritonserver TensorFlow, ONNX, PyTorch, TensorRT kserve-xgbserver XGBoost In addition to these included runtimes, you can extend your KServe installation by adding custom runtimes. This is demonstrated in the example for the AMD Inference Server . Spec Attributes \u00b6 Available attributes in the ServingRuntime spec: Attribute Description multiModel Whether this ServingRuntime is ModelMesh-compatible and intended for multi-model usage (as opposed to KServe single-model serving). 
Defaults to false disabled Disables this runtime containers List of containers associated with the runtime containers[ ].image The container image for the current container containers[ ].command Executable command found in the provided image containers[ ].args List of command line arguments as strings containers[ ].resources Kubernetes limits or requests containers[ ].env List of environment variables to pass to the container containers[ ].imagePullPolicy The container image pull policy containers[ ].workingDir The working directory for current container containers[ ].livenessProbe Probe for checking container liveness containers[ ].readinessProbe Probe for checking container readiness supportedModelFormats List of model types supported by the current runtime supportedModelFormats[ ].name Name of the model format supportedModelFormats[ ].version Version of the model format. Used in validating that a predictor is supported by a runtime. It is recommended to include only the major version here, for example \"1\" rather than \"1.15.4\" supportedModelFormats[ ].autoselect Set to true to allow the ServingRuntime to be used for automatic model placement if this model format is specified with no explicit runtime. The default value is false. supportedModelFormats[ ].priority Priority of this serving runtime for auto selection. This is used to select the serving runtime if more than one serving runtime supports the same model format. The value should be greater than zero. The higher the value, the higher the priority. Priority is not considered if AutoSelect is either false or not specified. Priority can be overridden by specifying the runtime in the InferenceService. storageHelper.disabled Disables the storage helper nodeSelector Influence Kubernetes scheduling to assign pods to nodes affinity Influence Kubernetes scheduling to assign pods to nodes tolerations Allow pods to be scheduled onto nodes with matching taints ModelMesh leverages additional fields not listed here. 
More information here . Note: ServingRuntimes support the use of template variables of the form {{.Variable}} inside the container spec. These should map to fields inside an InferenceService's metadata object . The primary use of this is for passing in InferenceService-specific information, such as a name, to the runtime environment. Several of the out-of-box ClusterServingRuntimes make use of this by having --model_name={{.Name}} inside the runtime container args to ensure that when a user deploys an InferenceService, the name is passed to the server. Using ServingRuntimes \u00b6 ServingRuntimes can be be used both explicitly and implicitly. Explicit: Specify a runtime \u00b6 When users define predictors in their InferenceServices, they can explicitly specify the name of a ClusterServingRuntime or ServingRuntime . For example: apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : example-sklearn-isvc spec : predictor : model : modelFormat : name : sklearn storageUri : s3://bucket/sklearn/mnist.joblib runtime : kserve-mlserver Here, the runtime specified is kserve-mlserver , so the KServe controller will first search the namespace for a ServingRuntime with that name. If none exist, the controller will then search the list of ClusterServingRuntimes. If one is found, the controller will first verify that the modelFormat provided in the predictor is in the list of supportedModelFormats . If it is, then the container and pod information provided by the runtime will be used for model deployment. Implicit: Automatic selection \u00b6 In each entry of the supportedModelFormats list, autoSelect: true can optionally be specified to indicate that the given ServingRuntime can be considered for automatic selection for predictors with the corresponding model format if no runtime is explicitly specified. 
For example, the kserve-sklearnserver ClusterServingRuntime supports SKLearn version 1 and has autoSelect enabled: apiVersion : serving.kserve.io/v1alpha1 kind : ClusterServingRuntime metadata : name : kserve-sklearnserver spec : supportedModelFormats : - name : sklearn version : \"1\" autoSelect : true ... When the following InferenceService is deployed with no runtime specified, the controller will look for a runtime that supports sklearn : apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : example-sklearn-isvc spec : predictor : model : modelFormat : name : sklearn storageUri : s3://bucket/sklearn/mnist.joblib Since kserve-sklearnserver has an entry in its supportedModelFormats list with sklearn and autoSelect: true , this ClusterServingRuntime will be used for model deployment. If a version is also specified: ... spec : predictor : model : modelFormat : name : sklearn version : \"0\" ... Then, then the version of the supportedModelFormat must also match. In this example, kserve-sklearnserver would not be eligible for selection since it only lists support for sklearn version 1 . Priority \u00b6 If more than one serving runtime supports the same model format with same version and also supports the same protocolVersion then, we can optionally specify priority for the serving runtime. Based on the priority the runtime is automatically selected if no runtime is explicitly specified. Note that, priority is valid only if autoSelect is true . Higher value means higher priority. For example, let's consider the serving runtimes mlserver and kserve-sklearnserver . Both the serving runtimes supports the sklearn model format with version 1 and both supports the protocolVersion v2. Also note that autoSelect is enabled in both the serving runtimes. 
apiVersion : serving.kserve.io/v1alpha1 kind : ClusterServingRuntime metadata : name : kserve-sklearnserver spec : protocolVersions : - v1 - v2 supportedModelFormats : - name : sklearn version : \"1\" autoSelect : true priority : 1 ... apiVersion : serving.kserve.io/v1alpha1 kind : ClusterServingRuntime metadata : name : mlserver spec : protocolVersions : - v2 supportedModelFormats : - name : sklearn version : \"1\" autoSelect : true priority : 2 ... When the following InferenceService is deployed with no runtime specified, the controller will look for a runtime that supports sklearn : apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : example-sklearn-isvc spec : predictor : model : protocolVersion : v2 modelFormat : name : sklearn storageUri : s3://bucket/sklearn/mnist.joblib The controller will find the two runtimes kserve-sklearnserver and mlserver as both has an entry in its supportedModelFormats list with sklearn and autoSelect: true . Now the runtime is sorted based on the priority by the controller as there are more than one supported runtime available. Since the mlserver has the higher priority value, this ClusterServingRuntime will be used for model deployment. Constraints of priority The higher priority value means higher precedence. The value must be greater than 0. The priority is valid only if auto select is enabled otherwise the priority is not considered. The serving runtime with priority takes precedence over the serving runtime with priority not specified. Two model formats with same name and same model version cannot have the same priority. If more than one serving runtime supports the model format and none of them specified the priority then, there is no guarantee which runtime will be selected. If multiple versions of a modelFormat are supported by a serving runtime, then it should have the same priority. For example, Below shown serving runtime supports two versions of sklearn. It should have the same priority. 
apiVersion : serving.kserve.io/v1alpha1 kind : ClusterServingRuntime metadata : name : mlserver spec : protocolVersions : - v2 supportedModelFormats : - name : sklearn version : \"0\" autoSelect : true priority : 2 - name : sklearn version : \"1\" autoSelect : true priority : 2 ... Warning If multiple runtimes list the same format and/or version as auto-selectable and the priority is not specified, the runtime is selected based on the creationTimestamp i.e. the most recently created runtime is selected. So there is no guarantee which runtime will be selected. So users and cluster-administrators should enable autoSelect with care. Previous schema \u00b6 Currently, if a user uses the old schema for deploying predictors where you specify a framework/format as a key, then a KServe webhook will automatically map it to one of the out-of-the-box ClusterServingRuntimes . This is for backwards compatibility. For example: Previous Schema Equivalent New Schema apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : example-sklearn-isvc spec : predictor : sklearn : storageUri : s3://bucket/sklearn/mnist.joblib apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : example-sklearn-isvc spec : predictor : model : modelFormat : name : sklearn storageUri : s3://bucket/sklearn/mnist.joblib runtime : kserve-sklearnserver The previous schema would mutate into the new schema where the kserve-sklearnserver ClusterServingRuntime is explicitly specified. Warning The old schema will eventually be removed in favor of the new Model spec, where a user can specify a model format and optionally a corresponding version. In previous versions of KServe, supported predictor formats and container images were defined in a ConfigMap in the control plane namespace. 
Existing InferenceServices upgraded from v0.7, v0.8, v0.9 need to be converted to the new model spec as the predictor configurations are phased out in v0.10.","title":"Serving Runtimes"},{"location":"modelserving/servingruntimes/#serving-runtimes","text":"KServe makes use of two CRDs for defining model serving environments: ServingRuntimes and ClusterServingRuntimes The only difference between the two is that one is namespace-scoped and the other is cluster-scoped. A ServingRuntime defines the templates for Pods that can serve one or more particular model formats. Each ServingRuntime defines key information such as the container image of the runtime and a list of the model formats that the runtime supports. Other configuration settings for the runtime can be conveyed through environment variables in the container specification. These CRDs allow for improved flexibility and extensibility, enabling users to quickly define or customize reusable runtimes without having to modify any controller code or any resources in the controller namespace. The following is an example of a ServingRuntime: apiVersion : serving.kserve.io/v1alpha1 kind : ServingRuntime metadata : name : example-runtime spec : supportedModelFormats : - name : example-format version : \"1\" autoSelect : true containers : - name : kserve-container image : examplemodelserver:latest args : - --model_dir=/mnt/models - --http_port=8080 Several out-of-the-box ClusterServingRuntimes are provided with KServe so that users can quickly deploy common model formats without having to define the runtimes themselves. 
Name Supported Model Formats kserve-lgbserver LightGBM kserve-mlserver SKLearn, XGBoost, LightGBM, MLflow kserve-paddleserver Paddle kserve-pmmlserver PMML kserve-sklearnserver SKLearn kserve-tensorflow-serving TensorFlow kserve-torchserve PyTorch kserve-tritonserver TensorFlow, ONNX, PyTorch, TensorRT kserve-xgbserver XGBoost In addition to these included runtimes, you can extend your KServe installation by adding custom runtimes. This is demonstrated in the example for the AMD Inference Server .","title":"Serving Runtimes"},{"location":"modelserving/servingruntimes/#spec-attributes","text":"Available attributes in the ServingRuntime spec: Attribute Description multiModel Whether this ServingRuntime is ModelMesh-compatible and intended for multi-model usage (as opposed to KServe single-model serving). Defaults to false disabled Disables this runtime containers List of containers associated with the runtime containers[ ].image The container image for the current container containers[ ].command Executable command found in the provided image containers[ ].args List of command line arguments as strings containers[ ].resources Kubernetes limits or requests containers[ ].env List of environment variables to pass to the container containers[ ].imagePullPolicy The container image pull policy containers[ ].workingDir The working directory for current container containers[ ].livenessProbe Probe for checking container liveness containers[ ].readinessProbe Probe for checking container readiness supportedModelFormats List of model types supported by the current runtime supportedModelFormats[ ].name Name of the model format supportedModelFormats[ ].version Version of the model format. Used in validating that a predictor is supported by a runtime. 
It is recommended to include only the major version here, for example \"1\" rather than \"1.15.4\" supportedModelFormats[ ].autoselect Set to true to allow the ServingRuntime to be used for automatic model placement if this model format is specified with no explicit runtime. The default value is false. supportedModelFormats[ ].priority Priority of this serving runtime for auto selection. This is used to select the serving runtime if more than one serving runtime supports the same model format. The value should be greater than zero. The higher the value, the higher the priority. Priority is not considered if AutoSelect is either false or not specified. Priority can be overridden by specifying the runtime in the InferenceService. storageHelper.disabled Disables the storage helper nodeSelector Influence Kubernetes scheduling to assign pods to nodes affinity Influence Kubernetes scheduling to assign pods to nodes tolerations Allow pods to be scheduled onto nodes with matching taints ModelMesh leverages additional fields not listed here. More information here . Note: ServingRuntimes support the use of template variables of the form {{.Variable}} inside the container spec. These should map to fields inside an InferenceService's metadata object . The primary use of this is for passing in InferenceService-specific information, such as a name, to the runtime environment. 
Several of the out-of-box ClusterServingRuntimes make use of this by having --model_name={{.Name}} inside the runtime container args to ensure that when a user deploys an InferenceService, the name is passed to the server.","title":"Spec Attributes"},{"location":"modelserving/servingruntimes/#using-servingruntimes","text":"ServingRuntimes can be used both explicitly and implicitly.","title":"Using ServingRuntimes"},{"location":"modelserving/servingruntimes/#explicit-specify-a-runtime","text":"When users define predictors in their InferenceServices, they can explicitly specify the name of a ClusterServingRuntime or ServingRuntime . For example: apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : example-sklearn-isvc spec : predictor : model : modelFormat : name : sklearn storageUri : s3://bucket/sklearn/mnist.joblib runtime : kserve-mlserver Here, the runtime specified is kserve-mlserver , so the KServe controller will first search the namespace for a ServingRuntime with that name. If none exist, the controller will then search the list of ClusterServingRuntimes. If one is found, the controller will first verify that the modelFormat provided in the predictor is in the list of supportedModelFormats . If it is, then the container and pod information provided by the runtime will be used for model deployment.","title":"Explicit: Specify a runtime"},{"location":"modelserving/servingruntimes/#implicit-automatic-selection","text":"In each entry of the supportedModelFormats list, autoSelect: true can optionally be specified to indicate that the given ServingRuntime can be considered for automatic selection for predictors with the corresponding model format if no runtime is explicitly specified. 
For example, the kserve-sklearnserver ClusterServingRuntime supports SKLearn version 1 and has autoSelect enabled: apiVersion : serving.kserve.io/v1alpha1 kind : ClusterServingRuntime metadata : name : kserve-sklearnserver spec : supportedModelFormats : - name : sklearn version : \"1\" autoSelect : true ... When the following InferenceService is deployed with no runtime specified, the controller will look for a runtime that supports sklearn : apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : example-sklearn-isvc spec : predictor : model : modelFormat : name : sklearn storageUri : s3://bucket/sklearn/mnist.joblib Since kserve-sklearnserver has an entry in its supportedModelFormats list with sklearn and autoSelect: true , this ClusterServingRuntime will be used for model deployment. If a version is also specified: ... spec : predictor : model : modelFormat : name : sklearn version : \"0\" ... Then the version of the supportedModelFormat must also match. In this example, kserve-sklearnserver would not be eligible for selection since it only lists support for sklearn version 1 .","title":"Implicit: Automatic selection"},{"location":"modelserving/servingruntimes/#priority","text":"If more than one serving runtime supports the same model format with same version and also supports the same protocolVersion then, we can optionally specify priority for the serving runtime. Based on the priority the runtime is automatically selected if no runtime is explicitly specified. Note that, priority is valid only if autoSelect is true . Higher value means higher priority. For example, let's consider the serving runtimes mlserver and kserve-sklearnserver . Both serving runtimes support the sklearn model format with version 1 and both support the protocolVersion v2. Also note that autoSelect is enabled in both the serving runtimes. 
apiVersion : serving.kserve.io/v1alpha1 kind : ClusterServingRuntime metadata : name : kserve-sklearnserver spec : protocolVersions : - v1 - v2 supportedModelFormats : - name : sklearn version : \"1\" autoSelect : true priority : 1 ... apiVersion : serving.kserve.io/v1alpha1 kind : ClusterServingRuntime metadata : name : mlserver spec : protocolVersions : - v2 supportedModelFormats : - name : sklearn version : \"1\" autoSelect : true priority : 2 ... When the following InferenceService is deployed with no runtime specified, the controller will look for a runtime that supports sklearn : apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : example-sklearn-isvc spec : predictor : model : protocolVersion : v2 modelFormat : name : sklearn storageUri : s3://bucket/sklearn/mnist.joblib The controller will find the two runtimes kserve-sklearnserver and mlserver as both have an entry in their supportedModelFormats list with sklearn and autoSelect: true . Now the runtime is sorted based on the priority by the controller as there is more than one supported runtime available. Since the mlserver has the higher priority value, this ClusterServingRuntime will be used for model deployment. Constraints of priority The higher priority value means higher precedence. The value must be greater than 0. The priority is valid only if auto select is enabled otherwise the priority is not considered. The serving runtime with priority takes precedence over the serving runtime with priority not specified. Two model formats with same name and same model version cannot have the same priority. If more than one serving runtime supports the model format and none of them specified the priority then, there is no guarantee which runtime will be selected. If multiple versions of a modelFormat are supported by a serving runtime, then it should have the same priority. For example, the serving runtime shown below supports two versions of sklearn. It should have the same priority. 
apiVersion : serving.kserve.io/v1alpha1 kind : ClusterServingRuntime metadata : name : mlserver spec : protocolVersions : - v2 supportedModelFormats : - name : sklearn version : \"0\" autoSelect : true priority : 2 - name : sklearn version : \"1\" autoSelect : true priority : 2 ... Warning If multiple runtimes list the same format and/or version as auto-selectable and the priority is not specified, the runtime is selected based on the creationTimestamp i.e. the most recently created runtime is selected. So there is no guarantee which runtime will be selected. So users and cluster-administrators should enable autoSelect with care.","title":"Priority"},{"location":"modelserving/servingruntimes/#previous-schema","text":"Currently, if a user uses the old schema for deploying predictors where you specify a framework/format as a key, then a KServe webhook will automatically map it to one of the out-of-the-box ClusterServingRuntimes . This is for backwards compatibility. For example: Previous Schema Equivalent New Schema apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : example-sklearn-isvc spec : predictor : sklearn : storageUri : s3://bucket/sklearn/mnist.joblib apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : example-sklearn-isvc spec : predictor : model : modelFormat : name : sklearn storageUri : s3://bucket/sklearn/mnist.joblib runtime : kserve-sklearnserver The previous schema would mutate into the new schema where the kserve-sklearnserver ClusterServingRuntime is explicitly specified. Warning The old schema will eventually be removed in favor of the new Model spec, where a user can specify a model format and optionally a corresponding version. In previous versions of KServe, supported predictor formats and container images were defined in a ConfigMap in the control plane namespace. 
Existing InferenceServices upgraded from v0.7, v0.8, v0.9 need to be converted to the new model spec as the predictor configurations are phased out in v0.10.","title":"Previous schema"},{"location":"modelserving/autoscaling/autoscaling/","text":"Autoscale InferenceService with inference workload \u00b6 InferenceService with target concurrency \u00b6 Create InferenceService \u00b6 Apply the tensorflow example CR with scaling target set to 1. Annotation autoscaling.knative.dev/target is the soft limit rather than a strictly enforced limit, if there is sudden burst of the requests, this value can be exceeded. The scaleTarget and scaleMetric are introduced in version 0.9 of kserve and should be available in both new and old schema. This is the preferred way of defining autoscaling options. New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"flowers-sample\" spec : predictor : scaleTarget : 1 scaleMetric : concurrency model : modelFormat : name : tensorflow storageUri : \"gs://kfserving-examples/models/tensorflow/flowers\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"flowers-sample\" annotations : autoscaling.knative.dev/target : \"1\" spec : predictor : tensorflow : storageUri : \"gs://kfserving-examples/models/tensorflow/flowers\" Apply the autoscale.yaml to create the Autoscale InferenceService. kubectl kubectl apply -f autoscale.yaml Expected Output $ inferenceservice.serving.kserve.io/flowers-sample created Predict InferenceService with concurrent requests \u00b6 The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT Send traffic in 30 seconds spurts maintaining 5 in-flight requests. 
MODEL_NAME = flowers-sample INPUT_PATH = input.json SERVICE_HOSTNAME = $( kubectl get inferenceservice $MODEL_NAME -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) hey -z 30s -c 5 -m POST -host ${ SERVICE_HOSTNAME } -D $INPUT_PATH http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict Expected Output Summary: Total: 30 .0193 secs Slowest: 10 .1458 secs Fastest: 0 .0127 secs Average: 0 .0364 secs Requests/sec: 137 .4449 Total data: 1019122 bytes Size/request: 247 bytes Response time histogram: 0 .013 [ 1 ] | 1 .026 [ 4120 ] | \u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0 2 .039 [ 0 ] | 3 .053 [ 0 ] | 4 .066 [ 0 ] | 5 .079 [ 0 ] | 6 .093 [ 0 ] | 7 .106 [ 0 ] | 8 .119 [ 0 ] | 9 .133 [ 0 ] | 10 .146 [ 5 ] | Latency distribution: 10 % in 0 .0178 secs 25 % in 0 .0188 secs 50 % in 0 .0199 secs 75 % in 0 .0210 secs 90 % in 0 .0231 secs 95 % in 0 .0328 secs 99 % in 0 .1501 secs Details ( average, fastest, slowest ) : DNS+dialup: 0 .0002 secs, 0 .0127 secs, 10 .1458 secs DNS-lookup: 0 .0002 secs, 0 .0000 secs, 0 .1502 secs req write: 0 .0000 secs, 0 .0000 secs, 0 .0020 secs resp wait: 0 .0360 secs, 0 .0125 secs, 9 .9791 secs resp read: 0 .0001 secs, 0 .0000 secs, 0 .0021 secs Status code distribution: [ 200 ] 4126 responses Check the number of running pods now, Kserve uses Knative Serving autoscaler which is based on the average number of in-flight requests per pod(concurrency). As the scaling target is set to 1 and we load the service with 5 concurrent requests, so the autoscaler tries scaling up to 5 pods. Notice that out of all the requests there are 5 requests on the histogram that take around 10s, that's the cold start time cost to initially spawn the pods and download model to be ready to serve. 
The cold start may take longer(to pull the serving image) if the image is not cached on the node that the pod is scheduled on. $ kubectl get pods NAME READY STATUS RESTARTS AGE flowers-sample-default-7kqt6-deployment-75d577dcdb-sr5wd 3 /3 Running 0 42s flowers-sample-default-7kqt6-deployment-75d577dcdb-swnk5 3 /3 Running 0 62s flowers-sample-default-7kqt6-deployment-75d577dcdb-t2njf 3 /3 Running 0 62s flowers-sample-default-7kqt6-deployment-75d577dcdb-vdlp9 3 /3 Running 0 64s flowers-sample-default-7kqt6-deployment-75d577dcdb-vm58d 3 /3 Running 0 42s Check Dashboard \u00b6 View the Knative Serving Scaling dashboards (if configured). kubectl kubectl port-forward --namespace knative-monitoring $( kubectl get pods --namespace knative-monitoring --selector = app = grafana --output = jsonpath = \"{.items..metadata.name}\" ) 3000 InferenceService with target QPS \u00b6 Create the InferenceService \u00b6 Apply the same tensorflow example CR kubectl kubectl apply -f autoscale.yaml Expected Output $ inferenceservice.serving.kserve.io/flowers-sample created Predict InferenceService with target QPS \u00b6 The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT Send 30 seconds of traffic maintaining 50 qps. 
MODEL_NAME = flowers-sample INPUT_PATH = input.json SERVICE_HOSTNAME = $( kubectl get inferenceservice $MODEL_NAME -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) hey -z 30s -q 50 -m POST -host ${ SERVICE_HOSTNAME } -D $INPUT_PATH http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict Expected Output Summary: Total: 30 .0264 secs Slowest: 10 .8113 secs Fastest: 0 .0145 secs Average: 0 .0731 secs Requests/sec: 683 .5644 Total data: 5069675 bytes Size/request: 247 bytes Response time histogram: 0 .014 [ 1 ] | 1 .094 [ 20474 ] | \u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0 2 .174 [ 0 ] | 3 .254 [ 0 ] | 4 .333 [ 0 ] | 5 .413 [ 0 ] | 6 .493 [ 0 ] | 7 .572 [ 0 ] | 8 .652 [ 0 ] | 9 .732 [ 0 ] | 10 .811 [ 50 ] | Latency distribution: 10 % in 0 .0284 secs 25 % in 0 .0334 secs 50 % in 0 .0408 secs 75 % in 0 .0527 secs 90 % in 0 .0765 secs 95 % in 0 .0949 secs 99 % in 0 .1334 secs Details ( average, fastest, slowest ) : DNS+dialup: 0 .0001 secs, 0 .0145 secs, 10 .8113 secs DNS-lookup: 0 .0000 secs, 0 .0000 secs, 0 .0196 secs req write: 0 .0000 secs, 0 .0000 secs, 0 .0031 secs resp wait: 0 .0728 secs, 0 .0144 secs, 10 .7688 secs resp read: 0 .0000 secs, 0 .0000 secs, 0 .0031 secs Status code distribution: [ 200 ] 20525 responses Check the number of running pods now, we are loading the service with 50 requests per second, and from the dashboard you can see that it hits the average concurrency 10 and autoscaler tries scaling up to 10 pods. Check Dashboard \u00b6 View the Knative Serving Scaling dashboards (if configured). 
kubectl port-forward --namespace knative-monitoring $( kubectl get pods --namespace knative-monitoring --selector = app = grafana --output = jsonpath = \"{.items..metadata.name}\" ) 3000 Autoscaler calculates average concurrency over 60 second window so it takes a minute to stabilize at the desired concurrency level, however it also calculates the 6 second panic window and will enter into panic mode if that window reaches 2x target concurrency. From the dashboard you can see that it enters panic mode in which autoscaler operates on shorter and more sensitive window. Once the panic conditions are no longer met for 60 seconds, autoscaler will return back to 60 seconds stable window. Autoscaling on GPU! \u00b6 Autoscaling on GPU is hard with GPU metrics, however thanks to Knative's concurrency based autoscaler scaling on GPU is pretty easy and effective! Create the InferenceService with GPU resource \u00b6 Apply the tensorflow gpu example CR New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"flowers-sample-gpu\" spec : predictor : scaleTarget : 1 scaleMetric : concurrency model : modelFormat : name : tensorflow storageUri : \"gs://kfserving-examples/models/tensorflow/flowers\" runtimeVersion : \"2.6.2-gpu\" resources : limits : nvidia.com/gpu : 1 apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"flowers-sample-gpu\" annotations : autoscaling.knative.dev/target : \"1\" spec : predictor : tensorflow : storageUri : \"gs://kfserving-examples/models/tensorflow/flowers\" runtimeVersion : \"2.6.2-gpu\" resources : limits : nvidia.com/gpu : 1 Apply the autoscale-gpu.yaml . kubectl kubectl apply -f autoscale-gpu.yaml Predict InferenceService with concurrent requests \u00b6 The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT Send 30 seconds of traffic maintaining 5 in-flight requests. 
MODEL_NAME = flowers-sample-gpu INPUT_PATH = input.json SERVICE_HOSTNAME = $( kubectl get inferenceservice $MODEL_NAME -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) hey -z 30s -c 5 -m POST -host ${ SERVICE_HOSTNAME } -D $INPUT_PATH http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict Expected Output Summary: Total: 30 .0152 secs Slowest: 9 .7581 secs Fastest: 0 .0142 secs Average: 0 .0350 secs Requests/sec: 142 .9942 Total data: 948532 bytes Size/request: 221 bytes Response time histogram: 0 .014 [ 1 ] | 0 .989 [ 4286 ] | \u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0 1 .963 [ 0 ] | 2 .937 [ 0 ] | 3 .912 [ 0 ] | 4 .886 [ 0 ] | 5 .861 [ 0 ] | 6 .835 [ 0 ] | 7 .809 [ 0 ] | 8 .784 [ 0 ] | 9 .758 [ 5 ] | Latency distribution: 10 % in 0 .0181 secs 25 % in 0 .0189 secs 50 % in 0 .0198 secs 75 % in 0 .0210 secs 90 % in 0 .0230 secs 95 % in 0 .0276 secs 99 % in 0 .0511 secs Details ( average, fastest, slowest ) : DNS+dialup: 0 .0000 secs, 0 .0142 secs, 9 .7581 secs DNS-lookup: 0 .0000 secs, 0 .0000 secs, 0 .0291 secs req write: 0 .0000 secs, 0 .0000 secs, 0 .0023 secs resp wait: 0 .0348 secs, 0 .0141 secs, 9 .7158 secs resp read: 0 .0001 secs, 0 .0000 secs, 0 .0021 secs Status code distribution: [ 200 ] 4292 responses Autoscaling Customization \u00b6 Autoscaling with ContainerConcurrency \u00b6 ContainerConcurrency determines the number of simultaneous requests that can be processed by each replica of the InferenceService at any given time, it is a hard limit and if the concurrency reaches the hard limit surplus requests will be buffered and must wait until enough capacity is free to execute the requests. 
New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"flowers-sample\" spec : predictor : containerConcurrency : 10 model : modelFormat : name : tensorflow storageUri : \"gs://kfserving-examples/models/tensorflow/flowers\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"flowers-sample\" spec : predictor : containerConcurrency : 10 tensorflow : storageUri : \"gs://kfserving-examples/models/tensorflow/flowers\" Apply the autoscale-custom.yaml . kubectl kubectl apply -f autoscale-custom.yaml Enable scale down to zero \u00b6 KServe by default sets minReplicas to 1, if you want to enable scaling down to zero especially for use cases like serving on GPUs you can set minReplicas to 0 so that the pods automatically scale down to zero when no traffic is received. New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"flowers-sample\" spec : predictor : minReplicas : 0 model : modelFormat : name : tensorflow storageUri : \"gs://kfserving-examples/models/tensorflow/flowers\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"flowers-sample\" spec : predictor : minReplicas : 0 tensorflow : storageUri : \"gs://kfserving-examples/models/tensorflow/flowers\" Apply the scale-down-to-zero.yaml . kubectl kubectl apply -f scale-down-to-zero.yaml Autoscaling configuration at component level \u00b6 Autoscaling options can also be configured at the component level. This allows more flexibility in terms of the autoscaling configuration. In a typical deployment, transformers may require a different autoscaling configuration than a predictor. This feature allows the user to scale individual components as required. 
New Schema Old Schema apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : torch-transformer spec : predictor : scaleTarget : 2 scaleMetric : concurrency model : modelFormat : name : pytorch storageUri : gs://kfserving-examples/models/torchserve/image_classifier transformer : scaleTarget : 8 scaleMetric : rps containers : - image : kserve/image-transformer:latest name : kserve-container command : - \"python\" - \"-m\" - \"model\" args : - --model_name - mnist apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : torch-transformer spec : predictor : scaleTarget : 2 scaleMetric : concurrency pytorch : storageUri : gs://kfserving-examples/models/torchserve/image_classifier transformer : scaleTarget : 8 scaleMetric : rps containers : - image : kserve/image-transformer:latest name : kserve-container command : - \"python\" - \"-m\" - \"model\" args : - --model_name - mnist Apply the autoscale-adv.yaml to create the Autoscale InferenceService. The default for scaleMetric is concurrency and possible values are concurrency , rps , cpu and memory .","title":"Inference Autoscaling"},{"location":"modelserving/autoscaling/autoscaling/#autoscale-inferenceservice-with-inference-workload","text":"","title":"Autoscale InferenceService with inference workload"},{"location":"modelserving/autoscaling/autoscaling/#inferenceservice-with-target-concurrency","text":"","title":"InferenceService with target concurrency"},{"location":"modelserving/autoscaling/autoscaling/#create-inferenceservice","text":"Apply the tensorflow example CR with scaling target set to 1. Annotation autoscaling.knative.dev/target is the soft limit rather than a strictly enforced limit, if there is sudden burst of the requests, this value can be exceeded. The scaleTarget and scaleMetric are introduced in version 0.9 of kserve and should be available in both new and old schema. This is the preferred way of defining autoscaling options. 
New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"flowers-sample\" spec : predictor : scaleTarget : 1 scaleMetric : concurrency model : modelFormat : name : tensorflow storageUri : \"gs://kfserving-examples/models/tensorflow/flowers\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"flowers-sample\" annotations : autoscaling.knative.dev/target : \"1\" spec : predictor : tensorflow : storageUri : \"gs://kfserving-examples/models/tensorflow/flowers\" Apply the autoscale.yaml to create the Autoscale InferenceService. kubectl kubectl apply -f autoscale.yaml Expected Output $ inferenceservice.serving.kserve.io/flowers-sample created","title":"Create InferenceService"},{"location":"modelserving/autoscaling/autoscaling/#predict-inferenceservice-with-concurrent-requests","text":"The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT Send traffic in 30 seconds spurts maintaining 5 in-flight requests. 
MODEL_NAME = flowers-sample INPUT_PATH = input.json SERVICE_HOSTNAME = $( kubectl get inferenceservice $MODEL_NAME -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) hey -z 30s -c 5 -m POST -host ${ SERVICE_HOSTNAME } -D $INPUT_PATH http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict Expected Output Summary: Total: 30 .0193 secs Slowest: 10 .1458 secs Fastest: 0 .0127 secs Average: 0 .0364 secs Requests/sec: 137 .4449 Total data: 1019122 bytes Size/request: 247 bytes Response time histogram: 0 .013 [ 1 ] | 1 .026 [ 4120 ] | \u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0 2 .039 [ 0 ] | 3 .053 [ 0 ] | 4 .066 [ 0 ] | 5 .079 [ 0 ] | 6 .093 [ 0 ] | 7 .106 [ 0 ] | 8 .119 [ 0 ] | 9 .133 [ 0 ] | 10 .146 [ 5 ] | Latency distribution: 10 % in 0 .0178 secs 25 % in 0 .0188 secs 50 % in 0 .0199 secs 75 % in 0 .0210 secs 90 % in 0 .0231 secs 95 % in 0 .0328 secs 99 % in 0 .1501 secs Details ( average, fastest, slowest ) : DNS+dialup: 0 .0002 secs, 0 .0127 secs, 10 .1458 secs DNS-lookup: 0 .0002 secs, 0 .0000 secs, 0 .1502 secs req write: 0 .0000 secs, 0 .0000 secs, 0 .0020 secs resp wait: 0 .0360 secs, 0 .0125 secs, 9 .9791 secs resp read: 0 .0001 secs, 0 .0000 secs, 0 .0021 secs Status code distribution: [ 200 ] 4126 responses Check the number of running pods now, Kserve uses Knative Serving autoscaler which is based on the average number of in-flight requests per pod(concurrency). As the scaling target is set to 1 and we load the service with 5 concurrent requests, so the autoscaler tries scaling up to 5 pods. Notice that out of all the requests there are 5 requests on the histogram that take around 10s, that's the cold start time cost to initially spawn the pods and download model to be ready to serve. 
The cold start may take longer(to pull the serving image) if the image is not cached on the node that the pod is scheduled on. $ kubectl get pods NAME READY STATUS RESTARTS AGE flowers-sample-default-7kqt6-deployment-75d577dcdb-sr5wd 3 /3 Running 0 42s flowers-sample-default-7kqt6-deployment-75d577dcdb-swnk5 3 /3 Running 0 62s flowers-sample-default-7kqt6-deployment-75d577dcdb-t2njf 3 /3 Running 0 62s flowers-sample-default-7kqt6-deployment-75d577dcdb-vdlp9 3 /3 Running 0 64s flowers-sample-default-7kqt6-deployment-75d577dcdb-vm58d 3 /3 Running 0 42s","title":"Predict InferenceService with concurrent requests"},{"location":"modelserving/autoscaling/autoscaling/#check-dashboard","text":"View the Knative Serving Scaling dashboards (if configured). kubectl kubectl port-forward --namespace knative-monitoring $( kubectl get pods --namespace knative-monitoring --selector = app = grafana --output = jsonpath = \"{.items..metadata.name}\" ) 3000","title":"Check Dashboard"},{"location":"modelserving/autoscaling/autoscaling/#inferenceservice-with-target-qps","text":"","title":"InferenceService with target QPS"},{"location":"modelserving/autoscaling/autoscaling/#create-the-inferenceservice","text":"Apply the same tensorflow example CR kubectl kubectl apply -f autoscale.yaml Expected Output $ inferenceservice.serving.kserve.io/flowers-sample created","title":"Create the InferenceService"},{"location":"modelserving/autoscaling/autoscaling/#predict-inferenceservice-with-target-qps","text":"The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT Send 30 seconds of traffic maintaining 50 qps. 
MODEL_NAME = flowers-sample INPUT_PATH = input.json SERVICE_HOSTNAME = $( kubectl get inferenceservice $MODEL_NAME -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) hey -z 30s -q 50 -m POST -host ${ SERVICE_HOSTNAME } -D $INPUT_PATH http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict Expected Output Summary: Total: 30 .0264 secs Slowest: 10 .8113 secs Fastest: 0 .0145 secs Average: 0 .0731 secs Requests/sec: 683 .5644 Total data: 5069675 bytes Size/request: 247 bytes Response time histogram: 0 .014 [ 1 ] | 1 .094 [ 20474 ] | \u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0 2 .174 [ 0 ] | 3 .254 [ 0 ] | 4 .333 [ 0 ] | 5 .413 [ 0 ] | 6 .493 [ 0 ] | 7 .572 [ 0 ] | 8 .652 [ 0 ] | 9 .732 [ 0 ] | 10 .811 [ 50 ] | Latency distribution: 10 % in 0 .0284 secs 25 % in 0 .0334 secs 50 % in 0 .0408 secs 75 % in 0 .0527 secs 90 % in 0 .0765 secs 95 % in 0 .0949 secs 99 % in 0 .1334 secs Details ( average, fastest, slowest ) : DNS+dialup: 0 .0001 secs, 0 .0145 secs, 10 .8113 secs DNS-lookup: 0 .0000 secs, 0 .0000 secs, 0 .0196 secs req write: 0 .0000 secs, 0 .0000 secs, 0 .0031 secs resp wait: 0 .0728 secs, 0 .0144 secs, 10 .7688 secs resp read: 0 .0000 secs, 0 .0000 secs, 0 .0031 secs Status code distribution: [ 200 ] 20525 responses Check the number of running pods now, we are loading the service with 50 requests per second, and from the dashboard you can see that it hits the average concurrency 10 and autoscaler tries scaling up to 10 pods.","title":"Predict InferenceService with target QPS"},{"location":"modelserving/autoscaling/autoscaling/#check-dashboard_1","text":"View the Knative Serving Scaling dashboards (if configured). 
kubectl port-forward --namespace knative-monitoring $( kubectl get pods --namespace knative-monitoring --selector = app = grafana --output = jsonpath = \"{.items..metadata.name}\" ) 3000 Autoscaler calculates average concurrency over 60 second window so it takes a minute to stabilize at the desired concurrency level, however it also calculates the 6 second panic window and will enter into panic mode if that window reaches 2x target concurrency. From the dashboard you can see that it enters panic mode in which autoscaler operates on shorter and more sensitive window. Once the panic conditions are no longer met for 60 seconds, autoscaler will return back to 60 seconds stable window.","title":"Check Dashboard"},{"location":"modelserving/autoscaling/autoscaling/#autoscaling-on-gpu","text":"Autoscaling on GPU is hard with GPU metrics, however thanks to Knative's concurrency based autoscaler scaling on GPU is pretty easy and effective!","title":"Autoscaling on GPU!"},{"location":"modelserving/autoscaling/autoscaling/#create-the-inferenceservice-with-gpu-resource","text":"Apply the tensorflow gpu example CR New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"flowers-sample-gpu\" spec : predictor : scaleTarget : 1 scaleMetric : concurrency model : modelFormat : name : tensorflow storageUri : \"gs://kfserving-examples/models/tensorflow/flowers\" runtimeVersion : \"2.6.2-gpu\" resources : limits : nvidia.com/gpu : 1 apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"flowers-sample-gpu\" annotations : autoscaling.knative.dev/target : \"1\" spec : predictor : tensorflow : storageUri : \"gs://kfserving-examples/models/tensorflow/flowers\" runtimeVersion : \"2.6.2-gpu\" resources : limits : nvidia.com/gpu : 1 Apply the autoscale-gpu.yaml . 
kubectl kubectl apply -f autoscale-gpu.yaml","title":"Create the InferenceService with GPU resource"},{"location":"modelserving/autoscaling/autoscaling/#predict-inferenceservice-with-concurrent-requests_1","text":"The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT Send 30 seconds of traffic maintaining 5 in-flight requests. MODEL_NAME = flowers-sample-gpu INPUT_PATH = input.json SERVICE_HOSTNAME = $( kubectl get inferenceservice $MODEL_NAME -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) hey -z 30s -c 5 -m POST -host ${ SERVICE_HOSTNAME } -D $INPUT_PATH http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict Expected Output Summary: Total: 30 .0152 secs Slowest: 9 .7581 secs Fastest: 0 .0142 secs Average: 0 .0350 secs Requests/sec: 142 .9942 Total data: 948532 bytes Size/request: 221 bytes Response time histogram: 0 .014 [ 1 ] | 0 .989 [ 4286 ] | \u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0 1 .963 [ 0 ] | 2 .937 [ 0 ] | 3 .912 [ 0 ] | 4 .886 [ 0 ] | 5 .861 [ 0 ] | 6 .835 [ 0 ] | 7 .809 [ 0 ] | 8 .784 [ 0 ] | 9 .758 [ 5 ] | Latency distribution: 10 % in 0 .0181 secs 25 % in 0 .0189 secs 50 % in 0 .0198 secs 75 % in 0 .0210 secs 90 % in 0 .0230 secs 95 % in 0 .0276 secs 99 % in 0 .0511 secs Details ( average, fastest, slowest ) : DNS+dialup: 0 .0000 secs, 0 .0142 secs, 9 .7581 secs DNS-lookup: 0 .0000 secs, 0 .0000 secs, 0 .0291 secs req write: 0 .0000 secs, 0 .0000 secs, 0 .0023 secs resp wait: 0 .0348 secs, 0 .0141 secs, 9 .7158 secs resp read: 0 .0001 secs, 0 .0000 secs, 0 .0021 secs Status code distribution: [ 200 ] 4292 responses","title":"Predict InferenceService with concurrent requests"},{"location":"modelserving/autoscaling/autoscaling/#autoscaling-customization","text":"","title":"Autoscaling 
Customization"},{"location":"modelserving/autoscaling/autoscaling/#autoscaling-with-containerconcurrency","text":"ContainerConcurrency determines the number of simultaneous requests that can be processed by each replica of the InferenceService at any given time, it is a hard limit and if the concurrency reaches the hard limit surplus requests will be buffered and must wait until enough capacity is free to execute the requests. New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"flowers-sample\" spec : predictor : containerConcurrency : 10 model : modelFormat : name : tensorflow storageUri : \"gs://kfserving-examples/models/tensorflow/flowers\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"flowers-sample\" spec : predictor : containerConcurrency : 10 tensorflow : storageUri : \"gs://kfserving-examples/models/tensorflow/flowers\" Apply the autoscale-custom.yaml . kubectl kubectl apply -f autoscale-custom.yaml","title":"Autoscaling with ContainerConcurrency"},{"location":"modelserving/autoscaling/autoscaling/#enable-scale-down-to-zero","text":"KServe by default sets minReplicas to 1, if you want to enable scaling down to zero especially for use cases like serving on GPUs you can set minReplicas to 0 so that the pods automatically scale down to zero when no traffic is received. New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"flowers-sample\" spec : predictor : minReplicas : 0 model : modelFormat : name : tensorflow storageUri : \"gs://kfserving-examples/models/tensorflow/flowers\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"flowers-sample\" spec : predictor : minReplicas : 0 tensorflow : storageUri : \"gs://kfserving-examples/models/tensorflow/flowers\" Apply the scale-down-to-zero.yaml . 
kubectl kubectl apply -f scale-down-to-zero.yaml","title":"Enable scale down to zero"},{"location":"modelserving/autoscaling/autoscaling/#autoscaling-configuration-at-component-level","text":"Autoscaling options can also be configured at the component level. This allows more flexibility in terms of the autoscaling configuration. In a typical deployment, transformers may require a different autoscaling configuration than a predictor. This feature allows the user to scale individual components as required. New Schema Old Schema apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : torch-transformer spec : predictor : scaleTarget : 2 scaleMetric : concurrency model : modelFormat : name : pytorch storageUri : gs://kfserving-examples/models/torchserve/image_classifier transformer : scaleTarget : 8 scaleMetric : rps containers : - image : kserve/image-transformer:latest name : kserve-container command : - \"python\" - \"-m\" - \"model\" args : - --model_name - mnist apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : torch-transformer spec : predictor : scaleTarget : 2 scaleMetric : concurrency pytorch : storageUri : gs://kfserving-examples/models/torchserve/image_classifier transformer : scaleTarget : 8 scaleMetric : rps containers : - image : kserve/image-transformer:latest name : kserve-container command : - \"python\" - \"-m\" - \"model\" args : - --model_name - mnist Apply the autoscale-adv.yaml to create the Autoscale InferenceService. The default for scaleMetric is concurrency and possible values are concurrency , rps , cpu and memory .","title":"Autoscaling configuration at component level"},{"location":"modelserving/batcher/batcher/","text":"Inference Batcher \u00b6 This docs explains on how batch prediction for any ML frameworks (TensorFlow, PyTorch, ...) without decreasing the performance. 
This batcher is implemented in the KServe model agent sidecar, so the requests first hit the agent sidecar, when a batch prediction is triggered the request is then sent to the model server container for inference. We use webhook to inject the model agent container in the InferenceService pod to do the batching when batcher is enabled. We use go channels to transfer data between http request handler and batcher go routines. Currently we only implemented batching with KServe v1 HTTP protocol, gRPC is not supported yet. When the number of instances (For example, the number of pictures) reaches the maxBatchSize or the latency meets the maxLatency , a batch prediction will be triggered. Example \u00b6 We first create a pytorch predictor with a batcher. The maxLatency is set to a big value (500 milliseconds) to make us be able to observe the batching process. New Schema Old Schema apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : \"torchserve\" spec : predictor : minReplicas : 1 timeout : 60 batcher : maxBatchSize : 32 maxLatency : 500 model : modelFormat : name : pytorch storageUri : gs://kfserving-examples/models/torchserve/image_classifier/v1 apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : \"torchserve\" spec : predictor : minReplicas : 1 timeout : 60 batcher : maxBatchSize : 32 maxLatency : 500 pytorch : storageUri : gs://kfserving-examples/models/torchserve/image_classifier/v1 maxBatchSize : the max batch size for triggering a prediction. maxLatency : the max latency for triggering a prediction (In milliseconds). timeout : timeout of calling predictor service (In seconds). All of the following fields have default values in the code. You can configure them or not as you wish. maxBatchSize : 32. maxLatency : 500. timeout : 60. kubectl kubectl create -f pytorch-batcher.yaml We can now send requests to the pytorch model using hey. 
The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT MODEL_NAME = mnist INPUT_PATH = @./input.json SERVICE_HOSTNAME = $( kubectl get inferenceservice torchserve -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) hey -z 10s -c 5 -m POST -host \" ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" -D ./input.json \"http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict\" The request will go to the model agent container first, the batcher in sidecar container batches the requests and send the inference request to the predictor container. Note If the interval of sending the two requests is less than maxLatency , the returned batchId will be the same. Expected Output Summary: Total: 10 .5361 secs Slowest: 0 .5759 secs Fastest: 0 .4983 secs Average: 0 .5265 secs Requests/sec: 9 .4912 Total data: 24100 bytes Size/request: 241 bytes Response time histogram: 0 .498 [ 1 ] | \u25a0 0 .506 [ 0 ] | 0 .514 [ 44 ] | \u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0 0 .522 [ 21 ] | \u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0 0 .529 [ 4 ] | \u25a0\u25a0\u25a0\u25a0 0 .537 [ 5 ] | \u25a0\u25a0\u25a0\u25a0\u25a0 0 .545 [ 4 ] | \u25a0\u25a0\u25a0\u25a0 0 .553 [ 0 ] | 0 .560 [ 7 ] | \u25a0\u25a0\u25a0\u25a0\u25a0\u25a0 0 .568 [ 4 ] | \u25a0\u25a0\u25a0\u25a0 0 .576 [ 10 ] | \u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0 Latency distribution: 10 % in 0 .5100 secs 25 % in 0 .5118 secs 50 % in 0 .5149 secs 75 % in 0 .5406 secs 90 % in 0 .5706 secs 95 % in 0 .5733 secs 99 % in 0 .5759 secs Details ( average, fastest, slowest ) : DNS+dialup: 0 .0004 secs, 0 .4983 secs, 0 .5759 secs DNS-lookup: 0 .0001 secs, 0 .0000 secs, 0 .0015 secs req write: 0 
.0002 secs, 0 .0000 secs, 0 .0076 secs resp wait: 0 .5257 secs, 0 .4981 secs, 0 .5749 secs resp read: 0 .0001 secs, 0 .0000 secs, 0 .0009 secs Status code distribution: [ 200 ] 100 responses","title":"Inference Batcher"},{"location":"modelserving/batcher/batcher/#inference-batcher","text":"This doc explains how to enable batch prediction for any ML framework (TensorFlow, PyTorch, ...) without decreasing the performance. This batcher is implemented in the KServe model agent sidecar, so the requests first hit the agent sidecar, when a batch prediction is triggered the request is then sent to the model server container for inference. We use webhook to inject the model agent container in the InferenceService pod to do the batching when batcher is enabled. We use go channels to transfer data between http request handler and batcher go routines. Currently we only implemented batching with KServe v1 HTTP protocol, gRPC is not supported yet. When the number of instances (For example, the number of pictures) reaches the maxBatchSize or the latency meets the maxLatency , a batch prediction will be triggered.","title":"Inference Batcher"},{"location":"modelserving/batcher/batcher/#example","text":"We first create a pytorch predictor with a batcher. The maxLatency is set to a big value (500 milliseconds) to make us be able to observe the batching process. 
New Schema Old Schema apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : \"torchserve\" spec : predictor : minReplicas : 1 timeout : 60 batcher : maxBatchSize : 32 maxLatency : 500 model : modelFormat : name : pytorch storageUri : gs://kfserving-examples/models/torchserve/image_classifier/v1 apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : \"torchserve\" spec : predictor : minReplicas : 1 timeout : 60 batcher : maxBatchSize : 32 maxLatency : 500 pytorch : storageUri : gs://kfserving-examples/models/torchserve/image_classifier/v1 maxBatchSize : the max batch size for triggering a prediction. maxLatency : the max latency for triggering a prediction (In milliseconds). timeout : timeout of calling predictor service (In seconds). All of the following fields have default values in the code. You can configure them or not as you wish. maxBatchSize : 32. maxLatency : 500. timeout : 60. kubectl kubectl create -f pytorch-batcher.yaml We can now send requests to the pytorch model using hey. The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT MODEL_NAME = mnist INPUT_PATH = @./input.json SERVICE_HOSTNAME = $( kubectl get inferenceservice torchserve -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) hey -z 10s -c 5 -m POST -host \" ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" -D ./input.json \"http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict\" The request will go to the model agent container first, the batcher in sidecar container batches the requests and sends the inference request to the predictor container. Note If the interval of sending the two requests is less than maxLatency , the returned batchId will be the same. 
Expected Output Summary: Total: 10 .5361 secs Slowest: 0 .5759 secs Fastest: 0 .4983 secs Average: 0 .5265 secs Requests/sec: 9 .4912 Total data: 24100 bytes Size/request: 241 bytes Response time histogram: 0 .498 [ 1 ] | \u25a0 0 .506 [ 0 ] | 0 .514 [ 44 ] | \u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0 0 .522 [ 21 ] | \u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0 0 .529 [ 4 ] | \u25a0\u25a0\u25a0\u25a0 0 .537 [ 5 ] | \u25a0\u25a0\u25a0\u25a0\u25a0 0 .545 [ 4 ] | \u25a0\u25a0\u25a0\u25a0 0 .553 [ 0 ] | 0 .560 [ 7 ] | \u25a0\u25a0\u25a0\u25a0\u25a0\u25a0 0 .568 [ 4 ] | \u25a0\u25a0\u25a0\u25a0 0 .576 [ 10 ] | \u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0\u25a0 Latency distribution: 10 % in 0 .5100 secs 25 % in 0 .5118 secs 50 % in 0 .5149 secs 75 % in 0 .5406 secs 90 % in 0 .5706 secs 95 % in 0 .5733 secs 99 % in 0 .5759 secs Details ( average, fastest, slowest ) : DNS+dialup: 0 .0004 secs, 0 .4983 secs, 0 .5759 secs DNS-lookup: 0 .0001 secs, 0 .0000 secs, 0 .0015 secs req write: 0 .0002 secs, 0 .0000 secs, 0 .0076 secs resp wait: 0 .5257 secs, 0 .4981 secs, 0 .5749 secs resp read: 0 .0001 secs, 0 .0000 secs, 0 .0009 secs Status code distribution: [ 200 ] 100 responses","title":"Example"},{"location":"modelserving/certificate/kserve/","text":"KServe with Self Signed Certificate Model Registry \u00b6 If you are using a model registry with a self-signed certificate, you must either skip ssl verify or apply the appropriate CA bundle to the storage-initializer to create a connection with the registry. 
This document explains three methods that can be used in KServe, described below: Configure CA bundle for storage-initializer Global configuration Namespace scope configuration(Using storage-config Secret) json annotation Skip SSL Verification (NOTE) This is only available for RawDeployment and ServerlessDeployment . For modelmesh, you should add ca bundle content into certificate parameter in storage-config Configure CA bundle for storage-initializer \u00b6 Global Configuration \u00b6 KServe uses inferenceservice-config ConfigMap for default configuration. If you want to add cabundle cert for every inference service, you can set caBundleConfigMapName in the ConfigMap. Before updating the ConfigMap, you have to create a ConfigMap for CA bundle certificate in the namespace that KServe controller is running and the data key in the ConfigMap must be cabundle.crt . Create CA ConfigMap with the CA bundle cert kubectl create configmap cabundle --from-file=/path/to/cabundle.crt kubectl get configmap cabundle -o yaml apiVersion: v1 data: cabundle.crt: XXXXX kind: ConfigMap metadata: name: cabundle namespace: kserve Update inferenceservice-config ConfigMap storageInitializer: |- { ... \"caBundleConfigMapName\": \"cabundle\", ... } After you update this configuration, please restart KServe controller pod to pick up the change. When you create an inference service, then the ca bundle will be copied to your user namespace and it will be attached to the storage-initializer container. Using storage-config Secret \u00b6 If you want to apply the cabundle only to a specific inferenceservice, you can use a specific annotation or variable( cabundle_configmap ) on the storage-config Secret used by the inferenceservice. In this case, you have to create the cabundle ConfigMap in the user namespace before you create the inferenceservice. 
Create a ConfigMap with the cabundle cert kubectl create configmap local-cabundle --from-file=/path/to/cabundle.crt kubectl get configmap local-cabundle -o yaml apiVersion: v1 data: cabundle.crt: XXXXX kind: ConfigMap metadata: name: local-cabundle namespace: kserve-demo Add an annotation serving.kserve.io/s3-cabundle-configmap to storage-config Secret apiVersion: v1 data: AWS_ACCESS_KEY_ID: VEhFQUNDRVNTS0VZ AWS_SECRET_ACCESS_KEY: VEhFUEFTU1dPUkQ= kind: Secret metadata: annotations: serving.kserve.io/s3-cabundle-configmap: local-cabundle ... name: storage-config namespace: kserve-demo type: Opaque Or, set a variable cabundle_configmap to storage-config Secret apiVersion: v1 stringData: localMinIO: | { \"type\": \"s3\", .... \"cabundle_configmap\": \"local-cabundle\" } kind: Secret metadata: name: storage-config namespace: kserve-demo type: Opaque Skip SSL Verification \u00b6 For testing purposes or when there is no cabundle, you can easily create an SSL connection by disabling SSL verification. This can also be used by adding an annotation or setting a variable in storage-config Secret. Add an annotation( serving.kserve.io/s3-verifyssl ) to storage-config Secret apiVersion: v1 data: AWS_ACCESS_KEY_ID: VEhFQUNDRVNTS0VZ AWS_SECRET_ACCESS_KEY: VEhFUEFTU1dPUkQ= kind: Secret metadata: annotations: serving.kserve.io/s3-verifyssl: \"0\" # 1 is true, 0 is false ... name: storage-config namespace: kserve-demo type: Opaque Or, set a variable ( verify_ssl ) to storage-config Secret apiVersion: v1 stringData: localMinIO: | { \"type\": \"s3\", ... 
\"verify_ssl\": \"0\" # 1 is true, 0 is false (You can set True/true/False/false too) } kind: Secret metadata: name: storage-config namespace: kserve-demo type: Opaque Full Demo Scripts","title":"CA Certificate"},{"location":"modelserving/certificate/kserve/#kserve-with-self-signed-certificate-model-registry","text":"If you are using a model registry with a self-signed certificate, you must either skip ssl verify or apply the appropriate CA bundle to the storage-initializer to create a connection with the registry. This document explains three methods that can be used in KServe, described below: Configure CA bundle for storage-initializer Global configuration Namespace scope configuration(Using storage-config Secret) json annotation Skip SSL Verification (NOTE) This is only available for RawDeployment and ServerlessDeployment . For modelmesh, you should add ca bundle content into certificate parameter in storage-config","title":"KServe with Self Signed Certificate Model Registry"},{"location":"modelserving/certificate/kserve/#configure-ca-bundle-for-storage-initializer","text":"","title":"Configure CA bundle for storage-initializer"},{"location":"modelserving/certificate/kserve/#global-configuration","text":"KServe use inferenceservice-config ConfigMap for default configuration. If you want to add cabundle cert for every inference service, you can set caBundleConfigMapName in the ConfigMap. Before updating the ConfigMap, you have to create a ConfigMap for CA bundle certificate in the namespace that KServe controller is running and the data key in the ConfigMap must be cabundle.crt . Create CA ConfigMap with the CA bundle cert kubectl create configmap cabundle --from-file=/path/to/cabundle.crt kubectl get configmap cabundle -o yaml apiVersion: v1 data: cabundle.crt: XXXXX kind: ConfigMap metadata: name: cabundle namespace: kserve Update inferenceservice-config ConfigMap storageInitializer: |- { ... \"caBundleConfigMapName\": \"cabundle\", ... 
} After you update this configuration, please restart KServe controller pod to pick up the change. When you create an inference service, then the ca bundle will be copied to your user namespace and it will be attached to the storage-initializer container.","title":"Global Configuration"},{"location":"modelserving/certificate/kserve/#using-storage-config-secret","text":"If you want to apply the cabundle only to a specific inferenceservice, you can use a specific annotation or variable( cabundle_configmap ) on the storage-config Secret used by the inferenceservice. In this case, you have to create the cabundle ConfigMap in the user namespace before you create the inferenceservice. Create a ConfigMap with the cabundle cert kubectl create configmap local-cabundle --from-file=/path/to/cabundle.crt kubectl get configmap local-cabundle -o yaml apiVersion: v1 data: cabundle.crt: XXXXX kind: ConfigMap metadata: name: local-cabundle namespace: kserve-demo Add an annotation serving.kserve.io/s3-cabundle-configmap to storage-config Secret apiVersion: v1 data: AWS_ACCESS_KEY_ID: VEhFQUNDRVNTS0VZ AWS_SECRET_ACCESS_KEY: VEhFUEFTU1dPUkQ= kind: Secret metadata: annotations: serving.kserve.io/s3-cabundle-configmap: local-cabundle ... name: storage-config namespace: kserve-demo type: Opaque Or, set a variable cabundle_configmap to storage-config Secret apiVersion: v1 stringData: localMinIO: | { \"type\": \"s3\", .... \"cabundle_configmap\": \"local-cabundle\" } kind: Secret metadata: name: storage-config namespace: kserve-demo type: Opaque","title":"Using storage-config Secret"},{"location":"modelserving/certificate/kserve/#skip-ssl-verification","text":"For testing purposes or when there is no cabundle, you can easily create an SSL connection by disabling SSL verification. This can also be used by adding an annotation or setting a variable in storage-config Secret. 
Add an annotation( serving.kserve.io/s3-verifyssl ) to storage-config Secret apiVersion: v1 data: AWS_ACCESS_KEY_ID: VEhFQUNDRVNTS0VZ AWS_SECRET_ACCESS_KEY: VEhFUEFTU1dPUkQ= kind: Secret metadata: annotations: serving.kserve.io/s3-verifyssl: \"0\" # 1 is true, 0 is false ... name: storage-config namespace: kserve-demo type: Opaque Or, set a variable ( verify_ssl ) to storage-config Secret apiVersion: v1 stringData: localMinIO: | { \"type\": \"s3\", ... \"verify_ssl\": \"0\" # 1 is true, 0 is false (You can set True/true/False/false too) } kind: Secret metadata: name: storage-config namespace: kserve-demo type: Opaque Full Demo Scripts","title":"Skip SSL Verification"},{"location":"modelserving/data_plane/data_plane/","text":"Data Plane \u00b6 The InferenceService Data Plane architecture consists of a static graph of components which coordinate requests for a single model. Advanced features such as Ensembling, A/B testing, and Multi-Arm-Bandits should compose InferenceServices together. Introduction \u00b6 KServe's data plane protocol introduces an inference API that is independent of any specific ML/DL framework and model server. This allows for quick iterations and consistency across Inference Services and supports both easy-to-use and high-performance use cases. By implementing this protocol both inference clients and servers will increase their utility and portability by operating seamlessly on platforms that have standardized around this API. Kserve's inference protocol is endorsed by NVIDIA Triton Inference Server, TensorFlow Serving, and TorchServe. Note: Protocol V2 uses /infer instead of :predict Concepts \u00b6 Component : Each endpoint is composed of multiple components: \"predictor\", \"explainer\", and \"transformer\". The only required component is the predictor, which is the core of the system. As KServe evolves, we plan to increase the number of supported components to enable use cases like Outlier Detection. 
Predictor : The predictor is the workhorse of the InferenceService. It is simply a model and a model server that makes it available at a network endpoint. Explainer : The explainer enables an optional alternate data plane that provides model explanations in addition to predictions. Users may define their own explanation container, which configures with relevant environment variables like prediction endpoint. For common use cases, KServe provides out-of-the-box explainers like Alibi. Transformer : The transformer enables users to define a pre and post processing step before the prediction and explanation workflows. Like the explainer, it is configured with relevant environment variables too. For common use cases, KServe provides out-of-the-box transformers like Feast. Data Plane V1 & V2 \u00b6 KServe supports two versions of its data plane, V1 and V2. V1 protocol offers a standard prediction workflow with HTTP/REST. The second version of the data-plane protocol addresses several issues found with the V1 data-plane protocol, including performance and generality across a large number of model frameworks and servers. Protocol V2 expands the capabilities of V1 by adding gRPC APIs. 
Main changes \u00b6 V2 does not currently support the explain endpoint V2 added Server Readiness/Liveness/Metadata endpoints V2 endpoint paths contain / instead of : V2 renamed :predict endpoint to /infer V2 allows for model versions in the request path (optional) V1 APIs \u00b6 API Verb Path List Models GET /v1/models Model Ready GET /v1/models/ Predict POST /v1/models/:predict Explain POST /v1/models/:explain V2 APIs \u00b6 API Verb Path Inference POST v2/models/[/versions/]/infer Model Metadata GET v2/models/[/versions/] Server Readiness GET v2/health/ready Server Liveness GET v2/health/live Server Metadata GET v2 Model Readiness GET v2/models/[/versions/ ]/ready ** path contents in [] are optional Please see V1 Protocol and V2 Protocol documentation for more information.","title":"Model Serving Data Plane"},{"location":"modelserving/data_plane/data_plane/#data-plane","text":"The InferenceService Data Plane architecture consists of a static graph of components which coordinate requests for a single model. Advanced features such as Ensembling, A/B testing, and Multi-Arm-Bandits should compose InferenceServices together.","title":"Data Plane"},{"location":"modelserving/data_plane/data_plane/#introduction","text":"KServe's data plane protocol introduces an inference API that is independent of any specific ML/DL framework and model server. This allows for quick iterations and consistency across Inference Services and supports both easy-to-use and high-performance use cases. By implementing this protocol both inference clients and servers will increase their utility and portability by operating seamlessly on platforms that have standardized around this API. Kserve's inference protocol is endorsed by NVIDIA Triton Inference Server, TensorFlow Serving, and TorchServe. 
Note: Protocol V2 uses /infer instead of :predict","title":"Introduction"},{"location":"modelserving/data_plane/data_plane/#concepts","text":"Component : Each endpoint is composed of multiple components: \"predictor\", \"explainer\", and \"transformer\". The only required component is the predictor, which is the core of the system. As KServe evolves, we plan to increase the number of supported components to enable use cases like Outlier Detection. Predictor : The predictor is the workhorse of the InferenceService. It is simply a model and a model server that makes it available at a network endpoint. Explainer : The explainer enables an optional alternate data plane that provides model explanations in addition to predictions. Users may define their own explanation container, which configures with relevant environment variables like prediction endpoint. For common use cases, KServe provides out-of-the-box explainers like Alibi. Transformer : The transformer enables users to define a pre and post processing step before the prediction and explanation workflows. Like the explainer, it is configured with relevant environment variables too. For common use cases, KServe provides out-of-the-box transformers like Feast.","title":"Concepts"},{"location":"modelserving/data_plane/data_plane/#data-plane-v1-v2","text":"KServe supports two versions of its data plane, V1 and V2. V1 protocol offers a standard prediction workflow with HTTP/REST. The second version of the data-plane protocol addresses several issues found with the V1 data-plane protocol, including performance and generality across a large number of model frameworks and servers. 
Protocol V2 expands the capabilities of V1 by adding gRPC APIs.","title":"Data Plane V1 & V2"},{"location":"modelserving/data_plane/data_plane/#main-changes","text":"V2 does not currently support the explain endpoint V2 added Server Readiness/Liveness/Metadata endpoints V2 endpoint paths contain / instead of : V2 renamed :predict endpoint to /infer V2 allows for model versions in the request path (optional)","title":"Main changes"},{"location":"modelserving/data_plane/data_plane/#v1-apis","text":"API Verb Path List Models GET /v1/models Model Ready GET /v1/models/ Predict POST /v1/models/:predict Explain POST /v1/models/:explain","title":"V1 APIs"},{"location":"modelserving/data_plane/data_plane/#v2-apis","text":"API Verb Path Inference POST v2/models/[/versions/]/infer Model Metadata GET v2/models/[/versions/] Server Readiness GET v2/health/ready Server Liveness GET v2/health/live Server Metadata GET v2 Model Readiness GET v2/models/[/versions/ ]/ready ** path contents in [] are optional Please see V1 Protocol and V2 Protocol documentation for more information.","title":"V2 APIs"},{"location":"modelserving/data_plane/v1_protocol/","text":"Data Plane (V1) \u00b6 KServe's V1 protocol offers a standardized prediction workflow across all model frameworks. This protocol version is still supported, but it is recommended that users migrate to the V2 protocol for better performance and standardization among serving runtimes. However, if a use case requires a more flexible schema than protocol v2 provides, v1 protocol is still an option. API Verb Path Request Payload Response Payload List Models GET /v1/models {\"models\": []} Model Ready GET /v1/models/ {\"name\": ,\"ready\": $bool} Predict POST /v1/models/:predict {\"instances\": []} ** {\"predictions\": []} Explain POST /v1/models/:explain {\"instances\": []} ** {\"predictions\": [], \"explanations\": []} ** = payload is optional Note: The response payload in V1 protocol is not strictly enforced. 
A custom server can define and return its own response payload. We encourage using the KServe defined response payload for consistency. API Definitions \u00b6 API Definition Predict The \"predict\" API performs inference on a model. The response is the prediction result. All InferenceServices speak the Tensorflow V1 HTTP API . Explain The \"explain\" API is an optional component that provides model explanations in addition to predictions. The standardized explainer interface is identical to the Tensorflow V1 HTTP API with the addition of an \":explain\" verb. Model Ready The \u201cmodel ready\u201d health API indicates if a specific model is ready for inferencing. If the model(s) is downloaded and ready to serve requests, the model ready endpoint returns the list of accessible (s). List Models The \"models\" API exposes a list of models in the model registry.","title":"V1 Inference Protocol"},{"location":"modelserving/data_plane/v1_protocol/#data-plane-v1","text":"KServe's V1 protocol offers a standardized prediction workflow across all model frameworks. This protocol version is still supported, but it is recommended that users migrate to the V2 protocol for better performance and standardization among serving runtimes. However, if a use case requires a more flexible schema than protocol v2 provides, v1 protocol is still an option. API Verb Path Request Payload Response Payload List Models GET /v1/models {\"models\": []} Model Ready GET /v1/models/ {\"name\": ,\"ready\": $bool} Predict POST /v1/models/:predict {\"instances\": []} ** {\"predictions\": []} Explain POST /v1/models/:explain {\"instances\": []} ** {\"predictions\": [], \"explanations\": []} ** = payload is optional Note: The response payload in V1 protocol is not strictly enforced. A custom server can define and return its own response payload. 
We encourage using the KServe defined response payload for consistency.","title":"Data Plane (V1)"},{"location":"modelserving/data_plane/v1_protocol/#api-definitions","text":"API Definition Predict The \"predict\" API performs inference on a model. The response is the prediction result. All InferenceServices speak the Tensorflow V1 HTTP API . Explain The \"explain\" API is an optional component that provides model explanations in addition to predictions. The standardized explainer interface is identical to the Tensorflow V1 HTTP API with the addition of an \":explain\" verb. Model Ready The \u201cmodel ready\u201d health API indicates if a specific model is ready for inferencing. If the model(s) is downloaded and ready to serve requests, the model ready endpoint returns the list of accessible (s). List Models The \"models\" API exposes a list of models in the model registry.","title":"API Definitions"},{"location":"modelserving/data_plane/v2_protocol/","text":"Open Inference Protocol (V2 Inference Protocol) \u00b6 For an inference server to be compliant with this protocol the server must implement the health, metadata, and inference V2 APIs . Optional features that are explicitly noted are not required. A compliant inference server may choose to implement the HTTP/REST API and/or the GRPC API . Check the model serving runtime table / the protocolVersion field in the runtime YAML to ensure V2 protocol is supported for model serving runtime that you are using. Note: For all API descriptions on this page, all strings in all contexts are case-sensitive. The V2 protocol supports an extension mechanism as a required part of the API, but this document does not propose any specific extensions. Any specific extensions will be proposed separately. Note on changes between V1 & V2 \u00b6 V2 protocol does not currently support the explain endpoint like V1 protocol does. If this is a feature you wish to have in the V2 protocol, please submit a github issue . 
HTTP/REST \u00b6 The HTTP/REST API uses JSON because it is widely supported and language independent. In all JSON schemas shown in this document $number, $string, $boolean, $object and $array refer to the fundamental JSON types. #optional indicates an optional JSON field. See also: The HTTP/REST endpoints are defined in rest_predict_v2.yaml API Verb Path Request Payload Response Payload Inference POST v2/models/ [/versions/]/infer $inference_request $inference_response Model Metadata GET v2/models/[/versions/] $metadata_model_response Server Ready GET v2/health/ready $ready_server_response Server Live GET v2/health/live $live_server_response Server Metadata GET v2 $metadata_server_response Model Ready GET v2/models/[/versions/ ]/ready $ready_model_response ** path contents in [] are optional For more information regarding payload contents, see Payload Contents . The versions portion of the Path URLs (in [] ) is shown as optional to allow implementations that don\u2019t support versioning or for cases when the user does not want to specify a specific model version (in which case the server will choose a version based on its own policies). For example, if a model does not implement a version, the Model Metadata request path could look like v2/models/my_model . If the model has been configured to implement a version, the request path could look something like v2/models/my_model/versions/v10 , where the version of the model is v10. API Definitions \u00b6 API Definition Inference The /infer endpoint performs inference on a model. The response is the prediction result. Model Metadata The \"model metadata\" API is a per-model endpoint that returns details about the model passed in the path. Server Ready The \u201cserver ready\u201d health API indicates if all the models are ready for inferencing.
The \u201cserver ready\u201d health API can be used directly to implement the Kubernetes readinessProbe. Server Live The \u201cserver live\u201d health API indicates if the inference server is able to receive and respond to metadata and inference requests. The \u201cserver live\u201d API can be used directly to implement the Kubernetes livenessProbe. Server Metadata The \"server metadata\" API returns details describing the server. Model Ready The \u201cmodel ready\u201d health API indicates if a specific model is ready for inferencing. The model name and (optionally) version must be available in the URL. Health/Readiness/Liveness Probes \u00b6 The Model Readiness probe answers the question \"Did the model download and is it able to serve requests?\" and responds with the available model name(s). The Server Readiness/Liveness probes answer the question \"Is my service and its infrastructure running, healthy, and able to receive and process requests?\" To read more about liveness and readiness probe concepts, visit the Configure Liveness, Readiness and Startup Probes Kubernetes documentation. Payload Contents \u00b6 Model Ready \u00b6 The model ready endpoint returns the readiness probe response for the server along with the name of the model. Model Ready Response JSON Object \u00b6 $ready_model_response = { \"name\" : $string, \"ready\": $bool } Server Ready \u00b6 The server ready endpoint returns the readiness probe response for the server. Server Ready Response JSON Object \u00b6 $ready_server_response = { \"ready\" : $bool, } Server Live \u00b6 The server live endpoint returns the liveness probe response for the server. Server Live Response JSON Object \u00b6 $live_server_response = { \"live\" : $bool, } Server Metadata \u00b6 The server metadata endpoint provides information about the server. A server metadata request is made with an HTTP GET to a server metadata endpoint.
In the corresponding response the HTTP body contains the Server Metadata Response JSON Object or the Server Metadata Response JSON Error Object . Server Metadata Response JSON Object \u00b6 A successful server metadata request is indicated by a 200 HTTP status code. The server metadata response object, identified as $metadata_server_response , is returned in the HTTP body. $metadata_server_response = { \"name\" : $string, \"version\" : $string, \"extensions\" : [ $string, ... ] } \u201cname\u201d : A descriptive name for the server. \"version\" : The server version. \u201cextensions\u201d : The extensions supported by the server. Currently, no standard extensions are defined. Individual inference servers may define and document their own extensions. Server Metadata Response JSON Error Object \u00b6 A failed server metadata request must be indicated by an HTTP error status (typically 400). The HTTP body must contain the $metadata_server_error_response object. $metadata_server_error_response = { \"error\": $string } \u201cerror\u201d : The descriptive message for the error. The per-model metadata endpoint provides information about a model. A model metadata request is made with an HTTP GET to a model metadata endpoint. In the corresponding response the HTTP body contains the Model Metadata Response JSON Object or the Model Metadata Response JSON Error Object . The model name and (optionally) version must be available in the URL. If a version is not provided the server may choose a version based on its own policies or return an error. Model Metadata \u00b6 The per-model metadata endpoint provides information about a model. A model metadata request is made with an HTTP GET to a model metadata endpoint. In the corresponding response the HTTP body contains the Model Metadata Response JSON Object or the Model Metadata Response JSON Error Object . The model name and (optionally) version must be available in the URL. 
If a version is not provided the server may choose a version based on its own policies or return an error. Model Metadata Response JSON Object \u00b6 A successful model metadata request is indicated by a 200 HTTP status code. The metadata response object, identified as $metadata_model_response , is returned in the HTTP body for every successful model metadata request. $metadata_model_response = { \"name\" : $string, \"versions\" : [ $string, ... ] #optional, \"platform\" : $string, \"inputs\" : [ $metadata_tensor, ... ], \"outputs\" : [ $metadata_tensor, ... ] } \u201cname\u201d : The name of the model. \"versions\" : The model versions that may be explicitly requested via the appropriate endpoint. Optional for servers that don\u2019t support versions. Optional for models that don\u2019t allow a version to be explicitly requested. \u201cplatform\u201d : The framework/backend for the model. See Platforms . \u201cinputs\u201d : The inputs required by the model. \u201coutputs\u201d : The outputs produced by the model. Each model input and output tensors\u2019 metadata is described with a $metadata_tensor object . $metadata_tensor = { \"name\" : $string, \"datatype\" : $string, \"shape\" : [ $number, ... ] } \u201cname\u201d : The name of the tensor. \"datatype\" : The data-type of the tensor elements as defined in Tensor Data Types . \"shape\" : The shape of the tensor. Variable-size dimensions are specified as -1. Model Metadata Response JSON Error Object \u00b6 A failed model metadata request must be indicated by an HTTP error status (typically 400). The HTTP body must contain the $metadata_model_error_response object. $metadata_model_error_response = { \"error\": $string } \u201cerror\u201d : The descriptive message for the error. Inference \u00b6 An inference request is made with an HTTP POST to an inference endpoint. In the request the HTTP body contains the Inference Request JSON Object . 
In the corresponding response the HTTP body contains the Inference Response JSON Object or Inference Response JSON Error Object . See Inference Request Examples for some example HTTP/REST requests and responses. Inference Request JSON Object \u00b6 The inference request object, identified as $inference_request , is required in the HTTP body of the POST request. The model name and (optionally) version must be available in the URL. If a version is not provided the server may choose a version based on its own policies or return an error. $inference_request = { \"id\" : $string #optional, \"parameters\" : $parameters #optional, \"inputs\" : [ $request_input, ... ], \"outputs\" : [ $request_output, ... ] #optional } \"id\" : An identifier for this request. Optional, but if specified this identifier must be returned in the response. \"parameters\" : An object containing zero or more parameters for this inference request expressed as key/value pairs. See Parameters for more information. \"inputs\" : The input tensors. Each input is described using the $request_input schema defined in Request Input . \"outputs\" : The output tensors requested for this inference. Each requested output is described using the $request_output schema defined in Request Output . Optional, if not specified all outputs produced by the model will be returned using default $request_output settings. Request Input \u00b6 The $request_input JSON describes an input to the model. If the input is batched, the shape and data must represent the full shape and contents of the entire batch. $request_input = { \"name\" : $string, \"shape\" : [ $number, ... ], \"datatype\" : $string, \"parameters\" : $parameters #optional, \"data\" : $tensor_data } \"name\" : The name of the input tensor. \"shape\" : The shape of the input tensor. Each dimension must be an integer representable as an unsigned 64-bit integer value.
\"datatype\" : The data-type of the input tensor elements as defined in Tensor Data Types . \"parameters\" : An object containing zero or more parameters for this input expressed as key/value pairs. See Parameters for more information. \u201cdata\u201d: The contents of the tensor. See Tensor Data for more information. Request Output \u00b6 The $request_output JSON is used to request which output tensors should be returned from the model. $inference_request_output = { \"name\" : $string, \"parameters\" : $parameters #optional, } \"name\" : The name of the output tensor. \"parameters\" : An object containing zero or more parameters for this output expressed as key/value pairs. See Parameters for more information. Inference Response JSON Object \u00b6 A successful inference request is indicated by a 200 HTTP status code. The inference response object, identified as $inference_response , is returned in the HTTP body. $inference_response = { \"model_name\" : $string, \"model_version\" : $string #optional, \"id\" : $string, \"parameters\" : $parameters #optional, \"outputs\" : [ $response_output, ... ] } \"model_name\" : The name of the model used for inference. \"model_version\" : The specific model version used for inference. Inference servers that do not implement versioning should not provide this field in the response. \"id\" : The \"id\" identifier given in the request, if any. \"parameters\" : An object containing zero or more parameters for this response expressed as key/value pairs. See Parameters for more information. \"outputs\" : The output tensors. Each output is described using the $response_output schema defined in Response Output . Response Output \u00b6 The $response_output JSON describes an output from the model. If the output is batched, the shape and data represents the full shape of the entire batch. $response_output = { \"name\" : $string, \"shape\" : [ $number, ... 
], \"datatype\" : $string, \"parameters\" : $parameters #optional, \"data\" : $tensor_data } \"name\" : The name of the output tensor. \"shape\" : The shape of the output tensor. Each dimension must be an integer representable as an unsigned 64-bit integer value. \"datatype\" : The data-type of the output tensor elements as defined in Tensor Data Types . \"parameters\" : An object containing zero or more parameters for this output expressed as key/value pairs. See Parameters for more information. \u201cdata\u201d: The contents of the tensor. See Tensor Data for more information. Inference Response JSON Error Object \u00b6 A failed inference request must be indicated by an HTTP error status (typically 400). The HTTP body must contain the $inference_error_response object. $inference_error_response = { \"error\": $string } \u201cerror\u201d : The descriptive message for the error. Parameters \u00b6 The $parameters JSON describes zero or more \u201cname\u201d/\u201cvalue\u201d pairs, where the \u201cname\u201d is the name of the parameter and the \u201cvalue\u201d is a $string, $number, or $boolean. $parameters = { $parameter, ... } $parameter = $string : $string | $number | $boolean Currently no parameters are defined. As required a future proposal may define one or more standard parameters to allow portable functionality across different inference servers. A server can implement server-specific parameters to provide non-standard capabilities. Tensor Data \u00b6 Tensor data must be presented in row-major order of the tensor elements. Element values must be given in \"linear\" order without any stride or padding between elements. Tensor elements may be presented in their nature multi-dimensional representation, or as a flattened one-dimensional representation. Tensor data given explicitly is provided in a JSON array. Each element of the array may be an integer, floating-point number, string or boolean value.
The server can decide to coerce each element to the required type or return an error if an unexpected value is received. Note that fp16 and bf16 are problematic to communicate explicitly since there is not a standard fp16/bf16 representation across backends nor typically the programmatic support to create the fp16/bf16 representation for a JSON number. For example, the 2-dimensional matrix: [ 1 2 4 5 ] Can be represented in its natural format as: \"data\" : [ [ 1, 2 ], [ 4, 5 ] ] Or in a flattened one-dimensional representation: \"data\" : [ 1, 2, 4, 5 ] Tensor Data Types \u00b6 Tensor data types are shown in the following table along with the size of each type, in bytes. Data Type Size (bytes) BOOL 1 UINT8 1 UINT16 2 UINT32 4 UINT64 8 INT8 1 INT16 2 INT32 4 INT64 8 FP16 2 FP32 4 FP64 8 BYTES Variable (max 2^32) --- Inference Request Examples \u00b6 The following example shows an inference request to a model with two inputs and one output. The HTTP Content-Length header gives the size of the JSON object. POST /v2/models/mymodel/infer HTTP/1.1 Host: localhost:8000 Content-Type: application/json Content-Length: { \"id\" : \"42\", \"inputs\" : [ { \"name\" : \"input0\", \"shape\" : [ 2, 2 ], \"datatype\" : \"UINT32\", \"data\" : [ 1, 2, 3, 4 ] }, { \"name\" : \"input1\", \"shape\" : [ 1 ], \"datatype\" : \"BOOL\", \"data\" : [ true ] } ], \"outputs\" : [ { \"name\" : \"output0\" } ] } For the above request the inference server must return the \u201coutput0\u201d output tensor. Assuming the model returns a [ 3, 2 ] tensor of data type FP32 the following response would be returned. HTTP/1.1 200 OK Content-Type: application/json Content-Length: { \"id\" : \"42\", \"outputs\" : [ { \"name\" : \"output0\", \"shape\" : [ 3, 2 ], \"datatype\" : \"FP32\", \"data\" : [ 1.0, 1.1, 2.0, 2.1, 3.0, 3.1 ] } ] } gRPC \u00b6 The GRPC API closely follows the concepts defined in the HTTP/REST API.
A compliant server must implement the health, metadata, and inference APIs described in this section. API rpc Endpoint Request Message Response Message Inference ModelInfer ModelInferRequest ModelInferResponse Model Ready ModelReady [ModelReadyRequest] ModelReadyResponse Model Metadata ModelMetadata ModelMetadataRequest ModelMetadataResponse Server Ready ServerReady ServerReadyRequest ServerReadyResponse Server Live ServerLive ServerLiveRequest ServerLiveResponse For more detailed information on each endpoint and its contents, see API Definitions and Message Contents . See also: The gRPC endpoints, request/response messages and contents are defined in grpc_predict_v2.proto API Definitions \u00b6 The GRPC definition of the service is: // // Inference Server GRPC endpoints. // service GRPCInferenceService { // Check liveness of the inference server. rpc ServerLive(ServerLiveRequest) returns (ServerLiveResponse) {} // Check readiness of the inference server. rpc ServerReady(ServerReadyRequest) returns (ServerReadyResponse) {} // Check readiness of a model in the inference server. rpc ModelReady(ModelReadyRequest) returns (ModelReadyResponse) {} // Get server metadata. rpc ServerMetadata(ServerMetadataRequest) returns (ServerMetadataResponse) {} // Get model metadata. rpc ModelMetadata(ModelMetadataRequest) returns (ModelMetadataResponse) {} // Perform inference using a specific model. rpc ModelInfer(ModelInferRequest) returns (ModelInferResponse) {} } Message Contents \u00b6 Health \u00b6 A health request is made using the ServerLive, ServerReady, or ModelReady endpoint. For each of these endpoints errors are indicated by the google.rpc.Status returned for the request. The OK code indicates success and other codes indicate failure. Server Live \u00b6 The ServerLive API indicates if the inference server is able to receive and respond to metadata and inference requests. 
The request and response messages for ServerLive are: message ServerLiveRequest {} message ServerLiveResponse { // True if the inference server is live, false if not live. bool live = 1; } Server Ready \u00b6 The ServerReady API indicates if the server is ready for inferencing. The request and response messages for ServerReady are: message ServerReadyRequest {} message ServerReadyResponse { // True if the inference server is ready, false if not ready. bool ready = 1; } Model Ready \u00b6 The ModelReady API indicates if a specific model is ready for inferencing. The request and response messages for ModelReady are: message ModelReadyRequest { // The name of the model to check for readiness. string name = 1; // The version of the model to check for readiness. If not given the // server will choose a version based on the model and internal policy. string version = 2; } message ModelReadyResponse { // True if the model is ready, false if not ready. bool ready = 1; } Metadata \u00b6 Server Metadata \u00b6 The ServerMetadata API provides information about the server. Errors are indicated by the google.rpc.Status returned for the request. The OK code indicates success and other codes indicate failure. The request and response messages for ServerMetadata are: message ServerMetadataRequest {} message ServerMetadataResponse { // The server name. string name = 1; // The server version. string version = 2; // The extensions supported by the server. repeated string extensions = 3; } Model Metadata \u00b6 The per-model metadata API provides information about a model. Errors are indicated by the google.rpc.Status returned for the request. The OK code indicates success and other codes indicate failure. The request and response messages for ModelMetadata are: message ModelMetadataRequest { // The name of the model. string name = 1; // The version of the model to check for readiness. If not given the // server will choose a version based on the model and internal policy. 
string version = 2; } message ModelMetadataResponse { // Metadata for a tensor. message TensorMetadata { // The tensor name. string name = 1; // The tensor data type. string datatype = 2; // The tensor shape. A variable-size dimension is represented // by a -1 value. repeated int64 shape = 3; } // The model name. string name = 1; // The versions of the model available on the server. repeated string versions = 2; // The model's platform. See Platforms. string platform = 3; // The model's inputs. repeated TensorMetadata inputs = 4; // The model's outputs. repeated TensorMetadata outputs = 5; } Platforms \u00b6 A platform is a string indicating a DL/ML framework or backend. Platform is returned as part of the response to a Model Metadata request but is informational only. The proposed inference APIs are generic relative to the DL/ML framework used by a model and so a client does not need to know the platform of a given model to use the API. Platform names use the format \u201c<project>_<format>\u201d. The following platform names are allowed: tensorrt_plan : A TensorRT model encoded as a serialized engine or \u201cplan\u201d. tensorflow_graphdef : A TensorFlow model encoded as a GraphDef. tensorflow_savedmodel : A TensorFlow model encoded as a SavedModel. onnx_onnxv1 : An ONNX model encoded for ONNX Runtime. pytorch_torchscript : A PyTorch model encoded as TorchScript. mxnet_mxnet : An MXNet model. caffe2_netdef : A Caffe2 model encoded as a NetDef. Inference \u00b6 The ModelInfer API performs inference using the specified model. Errors are indicated by the google.rpc.Status returned for the request. The OK code indicates success and other codes indicate failure. The request and response messages for ModelInfer are: message ModelInferRequest { // An input tensor for an inference request. message InferInputTensor { // The tensor name. string name = 1; // The tensor data type. string datatype = 2; // The tensor shape. repeated int64 shape = 3; // Optional inference input tensor parameters.
map parameters = 4; // The tensor contents using a data-type format. This field must // not be specified if \"raw\" tensor contents are being used for // the inference request. InferTensorContents contents = 5; } // An output tensor requested for an inference request. message InferRequestedOutputTensor { // The tensor name. string name = 1; // Optional requested output tensor parameters. map parameters = 2; } // The name of the model to use for inferencing. string model_name = 1; // The version of the model to use for inference. If not given the // server will choose a version based on the model and internal policy. string model_version = 2; // Optional identifier for the request. If specified will be // returned in the response. string id = 3; // Optional inference parameters. map parameters = 4; // The input tensors for the inference. repeated InferInputTensor inputs = 5; // The requested output tensors for the inference. Optional, if not // specified all outputs produced by the model will be returned. repeated InferRequestedOutputTensor outputs = 6; // The data contained in an input tensor can be represented in \"raw\" // bytes form or in the repeated type that matches the tensor's data // type. To use the raw representation 'raw_input_contents' must be // initialized with data for each tensor in the same order as // 'inputs'. For each tensor, the size of this content must match // what is expected by the tensor's shape and data type. The raw // data must be the flattened, one-dimensional, row-major order of // the tensor elements without any stride or padding between the // elements. Note that the FP16 data type must be represented as raw // content as there is no specific data type for a 16-bit float // type. // // If this field is specified then InferInputTensor::contents must // not be specified for any input tensor. repeated bytes raw_input_contents = 7; } message ModelInferResponse { // An output tensor returned for an inference request. 
message InferOutputTensor { // The tensor name. string name = 1; // The tensor data type. string datatype = 2; // The tensor shape. repeated int64 shape = 3; // Optional output tensor parameters. map parameters = 4; // The tensor contents using a data-type format. This field must // not be specified if \"raw\" tensor contents are being used for // the inference response. InferTensorContents contents = 5; } // The name of the model used for inference. string model_name = 1; // The version of the model used for inference. string model_version = 2; // The id of the inference request if one was specified. string id = 3; // Optional inference response parameters. map parameters = 4; // The output tensors holding inference results. repeated InferOutputTensor outputs = 5; // The data contained in an output tensor can be represented in // \"raw\" bytes form or in the repeated type that matches the // tensor's data type. To use the raw representation 'raw_output_contents' // must be initialized with data for each tensor in the same order as // 'outputs'. For each tensor, the size of this content must match // what is expected by the tensor's shape and data type. The raw // data must be the flattened, one-dimensional, row-major order of // the tensor elements without any stride or padding between the // elements. Note that the FP16 data type must be represented as raw // content as there is no specific data type for a 16-bit float // type. // // If this field is specified then InferOutputTensor::contents must // not be specified for any output tensor. repeated bytes raw_output_contents = 6; } Parameters \u00b6 The Parameters message describes a \u201cname\u201d/\u201dvalue\u201d pair, where the \u201cname\u201d is the name of the parameter and the \u201cvalue\u201d is a boolean, integer, or string corresponding to the parameter. Currently, no parameters are defined. 
As required a future proposal may define one or more standard parameters to allow portable functionality across different inference servers. A server can implement server-specific parameters to provide non-standard capabilities. // // An inference parameter value. // message InferParameter { // The parameter value can be a string, an int64, a boolean // or a message specific to a predefined parameter. oneof parameter_choice { // A boolean parameter value. bool bool_param = 1; // An int64 parameter value. int64 int64_param = 2; // A string parameter value. string string_param = 3; } } Tensor Data \u00b6 In all representations tensor data must be flattened to a one-dimensional, row-major order of the tensor elements. Element values must be given in \"linear\" order without any stride or padding between elements. Using a \"raw\" representation of tensors with ModelInferRequest::raw_input_contents and ModelInferResponse::raw_output_contents will typically allow higher performance due to the way protobuf allocation and reuse interacts with GRPC. For example, see https://github.com/grpc/grpc/issues/23231. An alternative to the \"raw\" representation is to use InferTensorContents to represent the tensor data in a format that matches the tensor's data type. // // The data contained in a tensor represented by the repeated type // that matches the tensor's data type. Protobuf oneof is not used // because oneofs cannot contain repeated fields. // message InferTensorContents { // Representation for BOOL data type. The size must match what is // expected by the tensor's shape. The contents must be the flattened, // one-dimensional, row-major order of the tensor elements. repeated bool bool_contents = 1; // Representation for INT8, INT16, and INT32 data types. The size // must match what is expected by the tensor's shape. The contents // must be the flattened, one-dimensional, row-major order of the // tensor elements. 
repeated int32 int_contents = 2; // Representation for INT64 data types. The size must match what // is expected by the tensor's shape. The contents must be the // flattened, one-dimensional, row-major order of the tensor elements. repeated int64 int64_contents = 3; // Representation for UINT8, UINT16, and UINT32 data types. The size // must match what is expected by the tensor's shape. The contents // must be the flattened, one-dimensional, row-major order of the // tensor elements. repeated uint32 uint_contents = 4; // Representation for UINT64 data types. The size must match what // is expected by the tensor's shape. The contents must be the // flattened, one-dimensional, row-major order of the tensor elements. repeated uint64 uint64_contents = 5; // Representation for FP32 data type. The size must match what is // expected by the tensor's shape. The contents must be the flattened, // one-dimensional, row-major order of the tensor elements. repeated float fp32_contents = 6; // Representation for FP64 data type. The size must match what is // expected by the tensor's shape. The contents must be the flattened, // one-dimensional, row-major order of the tensor elements. repeated double fp64_contents = 7; // Representation for BYTES data type. The size must match what is // expected by the tensor's shape. The contents must be the flattened, // one-dimensional, row-major order of the tensor elements. repeated bytes bytes_contents = 8; } Tensor Data Types \u00b6 Tensor data types are shown in the following table along with the size of each type, in bytes. 
Data Type Size (bytes) BOOL 1 UINT8 1 UINT16 2 UINT32 4 UINT64 8 INT8 1 INT16 2 INT32 4 INT64 8 FP16 2 FP32 4 FP64 8 BYTES Variable (max 2 32 )","title":"Open Inference Protocol (V2 Inference Protocol)"},{"location":"modelserving/data_plane/v2_protocol/#open-inference-protocol-v2-inference-protocol","text":"For an inference server to be compliant with this protocol the server must implement the health, metadata, and inference V2 APIs . Optional features that are explicitly noted are not required. A compliant inference server may choose to implement the HTTP/REST API and/or the GRPC API . Check the model serving runtime table / the protocolVersion field in the runtime YAML to ensure V2 protocol is supported for model serving runtime that you are using. Note: For all API descriptions on this page, all strings in all contexts are case-sensitive. The V2 protocol supports an extension mechanism as a required part of the API, but this document does not propose any specific extensions. Any specific extensions will be proposed separately.","title":"Open Inference Protocol (V2 Inference Protocol)"},{"location":"modelserving/data_plane/v2_protocol/#note-on-changes-between-v1-v2","text":"V2 protocol does not currently support the explain endpoint like V1 protocol does. If this is a feature you wish to have in the V2 protocol, please submit a github issue .","title":"Note on changes between V1 & V2"},{"location":"modelserving/data_plane/v2_protocol/#httprest","text":"The HTTP/REST API uses JSON because it is widely supported and language independent. In all JSON schemas shown in this document $number, $string, $boolean, $object and $array refer to the fundamental JSON types. #optional indicates an optional JSON field. 
See also: The HTTP/REST endpoints are defined in rest_predict_v2.yaml API Verb Path Request Payload Response Payload Inference POST v2/models/ [/versions/]/infer $inference_request $inference_response Model Metadata GET v2/models/[/versions/] $metadata_model_response Server Ready GET v2/health/ready $ready_server_response Server Live GET v2/health/live $live_server_response Server Metadata GET v2 $metadata_server_response Model Ready GET v2/models/[/versions/ ]/ready $ready_model_response ** path contents in [] are optional For more information regarding payload contents, see Payload Contents . The versions portion of the Path URLs (in [] ) is shown as optional to allow implementations that don\u2019t support versioning or for cases when the user does not want to specify a specific model version (in which case the server will choose a version based on its own policies). For example, if a model does not implement a version, the Model Metadata request path could look like v2/models/my_model . If the model has been configured to implement a version, the request path could look something like v2/models/my_model/versions/v10 , where the version of the model is v10.","title":"HTTP/REST"},{"location":"modelserving/data_plane/v2_protocol/#api-definitions","text":"API Definition Inference The /infer endpoint performs inference on a model. The response is the prediction result. Model Metadata The \"model metadata\" API is a per-model endpoint that returns details about the model passed in the path. Server Ready The \u201cserver ready\u201d health API indicates if all the models are ready for inferencing. The \u201cserver ready\u201d health API can be used directly to implement the Kubernetes readinessProbe Server Live The \u201cserver live\u201d health API indicates if the inference server is able to receive and respond to metadata and inference requests. The \u201cserver live\u201d API can be used directly to implement the Kubernetes livenessProbe. 
Server Metadata The \"server metadata\" API returns details describing the server. Model Ready The \u201cmodel ready\u201d health API indicates if a specific model is ready for inferencing. The model name and (optionally) version must be available in the URL.","title":"API Definitions"},{"location":"modelserving/data_plane/v2_protocol/#healthreadinessliveness-probes","text":"The Model Readiness probe answers the question \"Did the model download and is it able to serve requests?\" and responds with the available model name(s). The Server Readiness/Liveness probes answer the question \"Is my service and its infrastructure running, healthy, and able to receive and process requests?\" To read more about liveness and readiness probe concepts, visit the Configure Liveness, Readiness and Startup Probes Kubernetes documentation.","title":"Health/Readiness/Liveness Probes"},{"location":"modelserving/data_plane/v2_protocol/#payload-contents","text":"","title":"Payload Contents"},{"location":"modelserving/data_plane/v2_protocol/#model-ready","text":"The model ready endpoint returns the readiness probe response for the server along with the name of the model.","title":"Model Ready"},{"location":"modelserving/data_plane/v2_protocol/#model-ready-response-json-object","text":"$ready_model_response = { \"name\" : $string, \"ready\": $bool }","title":"Model Ready Response JSON Object"},{"location":"modelserving/data_plane/v2_protocol/#server-ready","text":"The server ready endpoint returns the readiness probe response for the server.","title":"Server Ready"},{"location":"modelserving/data_plane/v2_protocol/#server-ready-response-json-object","text":"$ready_server_response = { \"ready\" : $bool, }","title":"Server Ready Response JSON Object"},{"location":"modelserving/data_plane/v2_protocol/#server-live","text":"The server live endpoint returns the liveness probe response for the server.","title":"Server 
Live"},{"location":"modelserving/data_plane/v2_protocol/#server-live-response-json-objet","text":"$live_server_response = { \"live\" : $bool, }","title":"Server Live Response JSON Objet"},{"location":"modelserving/data_plane/v2_protocol/#server-metadata","text":"The server metadata endpoint provides information about the server. A server metadata request is made with an HTTP GET to a server metadata endpoint. In the corresponding response the HTTP body contains the Server Metadata Response JSON Object or the Server Metadata Response JSON Error Object .","title":"Server Metadata"},{"location":"modelserving/data_plane/v2_protocol/#server-metadata-response-json-object","text":"A successful server metadata request is indicated by a 200 HTTP status code. The server metadata response object, identified as $metadata_server_response , is returned in the HTTP body. $metadata_server_response = { \"name\" : $string, \"version\" : $string, \"extensions\" : [ $string, ... ] } \u201cname\u201d : A descriptive name for the server. \"version\" : The server version. \u201cextensions\u201d : The extensions supported by the server. Currently, no standard extensions are defined. Individual inference servers may define and document their own extensions.","title":"Server Metadata Response JSON Object"},{"location":"modelserving/data_plane/v2_protocol/#server-metadata-response-json-error-object","text":"A failed server metadata request must be indicated by an HTTP error status (typically 400). The HTTP body must contain the $metadata_server_error_response object. $metadata_server_error_response = { \"error\": $string } \u201cerror\u201d : The descriptive message for the error. The per-model metadata endpoint provides information about a model. A model metadata request is made with an HTTP GET to a model metadata endpoint. In the corresponding response the HTTP body contains the Model Metadata Response JSON Object or the Model Metadata Response JSON Error Object . 
The model name and (optionally) version must be available in the URL. If a version is not provided the server may choose a version based on its own policies or return an error.","title":"Server Metadata Response JSON Error Object"},{"location":"modelserving/data_plane/v2_protocol/#model-metadata","text":"The per-model metadata endpoint provides information about a model. A model metadata request is made with an HTTP GET to a model metadata endpoint. In the corresponding response the HTTP body contains the Model Metadata Response JSON Object or the Model Metadata Response JSON Error Object . The model name and (optionally) version must be available in the URL. If a version is not provided the server may choose a version based on its own policies or return an error.","title":"Model Metadata"},{"location":"modelserving/data_plane/v2_protocol/#model-metadata-response-json-object","text":"A successful model metadata request is indicated by a 200 HTTP status code. The metadata response object, identified as $metadata_model_response , is returned in the HTTP body for every successful model metadata request. $metadata_model_response = { \"name\" : $string, \"versions\" : [ $string, ... ] #optional, \"platform\" : $string, \"inputs\" : [ $metadata_tensor, ... ], \"outputs\" : [ $metadata_tensor, ... ] } \u201cname\u201d : The name of the model. \"versions\" : The model versions that may be explicitly requested via the appropriate endpoint. Optional for servers that don\u2019t support versions. Optional for models that don\u2019t allow a version to be explicitly requested. \u201cplatform\u201d : The framework/backend for the model. See Platforms . \u201cinputs\u201d : The inputs required by the model. \u201coutputs\u201d : The outputs produced by the model. Each model input and output tensors\u2019 metadata is described with a $metadata_tensor object . $metadata_tensor = { \"name\" : $string, \"datatype\" : $string, \"shape\" : [ $number, ... 
] } \u201cname\u201d : The name of the tensor. \"datatype\" : The data-type of the tensor elements as defined in Tensor Data Types . \"shape\" : The shape of the tensor. Variable-size dimensions are specified as -1.","title":"Model Metadata Response JSON Object"},{"location":"modelserving/data_plane/v2_protocol/#model-metadata-response-json-error-object","text":"A failed model metadata request must be indicated by an HTTP error status (typically 400). The HTTP body must contain the $metadata_model_error_response object. $metadata_model_error_response = { \"error\": $string } \u201cerror\u201d : The descriptive message for the error.","title":"Model Metadata Response JSON Error Object"},{"location":"modelserving/data_plane/v2_protocol/#inference","text":"An inference request is made with an HTTP POST to an inference endpoint. In the request the HTTP body contains the Inference Request JSON Object . In the corresponding response the HTTP body contains the Inference Response JSON Object or Inference Response JSON Error Object . See Inference Request Examples for some example HTTP/REST requests and responses.","title":"Inference"},{"location":"modelserving/data_plane/v2_protocol/#inference-request-json-object","text":"The inference request object, identified as $inference_request , is required in the HTTP body of the POST request. The model name and (optionally) version must be available in the URL. If a version is not provided the server may choose a version based on its own policies or return an error. $inference_request = { \"id\" : $string #optional, \"parameters\" : $parameters #optional, \"inputs\" : [ $request_input, ... ], \"outputs\" : [ $request_output, ... ] #optional } \"id\" : An identifier for this request. Optional, but if specified this identifier must be returned in the response. \"parameters\" : An object containing zero or more parameters for this inference request expressed as key/value pairs. See Parameters for more information. 
\"inputs\" : The input tensors. Each input is described using the $request_input schema defined in Request Input . \"outputs\" : The output tensors requested for this inference. Each requested output is described using the $request_output schema defined in Request Output . Optional, if not specified all outputs produced by the model will be returned using default $request_output settings.","title":"Inference Request JSON Object"},{"location":"modelserving/data_plane/v2_protocol/#request-input","text":"The $inference_request_input JSON describes an input to the model. If the input is batched, the shape and data must represent the full shape and contents of the entire batch. $inference_request_input = { \"name\" : $string, \"shape\" : [ $number, ... ], \"datatype\" : $string, \"parameters\" : $parameters #optional, \"data\" : $tensor_data } \"name\" : The name of the input tensor. \"shape\" : The shape of the input tensor. Each dimension must be an integer representable as an unsigned 64-bit integer value. \"datatype\" : The data-type of the input tensor elements as defined in Tensor Data Types . \"parameters\" : An object containing zero or more parameters for this input expressed as key/value pairs. See Parameters for more information. \u201cdata\u201d: The contents of the tensor. See Tensor Data for more information.","title":"Request Input"},{"location":"modelserving/data_plane/v2_protocol/#request-output","text":"The $request_output JSON is used to request which output tensors should be returned from the model. $inference_request_output = { \"name\" : $string, \"parameters\" : $parameters #optional, } \"name\" : The name of the output tensor. \"parameters\" : An object containing zero or more parameters for this output expressed as key/value pairs. 
See Parameters for more information.","title":"Request Output"},{"location":"modelserving/data_plane/v2_protocol/#inference-response-json-object","text":"A successful inference request is indicated by a 200 HTTP status code. The inference response object, identified as $inference_response , is returned in the HTTP body. $inference_response = { \"model_name\" : $string, \"model_version\" : $string #optional, \"id\" : $string, \"parameters\" : $parameters #optional, \"outputs\" : [ $response_output, ... ] } \"model_name\" : The name of the model used for inference. \"model_version\" : The specific model version used for inference. Inference servers that do not implement versioning should not provide this field in the response. \"id\" : The \"id\" identifier given in the request, if any. \"parameters\" : An object containing zero or more parameters for this response expressed as key/value pairs. See Parameters for more information. \"outputs\" : The output tensors. Each output is described using the $response_output schema defined in Response Output .","title":"Inference Response JSON Object"},{"location":"modelserving/data_plane/v2_protocol/#response-output","text":"The $response_output JSON describes an output from the model. If the output is batched, the shape and data represent the full shape of the entire batch. $response_output = { \"name\" : $string, \"shape\" : [ $number, ... ], \"datatype\" : $string, \"parameters\" : $parameters #optional, \"data\" : $tensor_data } \"name\" : The name of the output tensor. \"shape\" : The shape of the output tensor. Each dimension must be an integer representable as an unsigned 64-bit integer value. \"datatype\" : The data-type of the output tensor elements as defined in Tensor Data Types . \"parameters\" : An object containing zero or more parameters for this output expressed as key/value pairs. See Parameters for more information. \u201cdata\u201d: The contents of the tensor. 
See Tensor Data for more information.","title":"Response Output"},{"location":"modelserving/data_plane/v2_protocol/#inference-response-json-error-object","text":"A failed inference request must be indicated by an HTTP error status (typically 400). The HTTP body must contain the $inference_error_response object. $inference_error_response = { \"error\": $string } \u201cerror\u201d : The descriptive message for the error.","title":"Inference Response JSON Error Object"},{"location":"modelserving/data_plane/v2_protocol/#parameters","text":"The $parameters JSON describes zero or more \u201cname\u201d/\u201dvalue\u201d pairs, where the \u201cname\u201d is the name of the parameter and the \u201cvalue\u201d is a $string, $number, or $boolean. $parameters = { $parameter, ... } $parameter = $string : $string | $number | $boolean Currently no parameters are defined. As required a future proposal may define one or more standard parameters to allow portable functionality across different inference servers. A server can implement server-specific parameters to provide non-standard capabilities.","title":"Parameters"},{"location":"modelserving/data_plane/v2_protocol/#tensor-data","text":"Tensor data must be presented in row-major order of the tensor elements. Element values must be given in \"linear\" order without any stride or padding between elements. Tensor elements may be presented in their natural multi-dimensional representation, or as a flattened one-dimensional representation. Tensor data given explicitly is provided in a JSON array. Each element of the array may be an integer, floating-point number, string or boolean value. The server can decide to coerce each element to the required type or return an error if an unexpected value is received. Note that fp16 and bf16 are problematic to communicate explicitly since there is not a standard fp16/bf16 representation across backends nor typically the programmatic support to create the fp16/bf16 representation for a JSON number. 
For example, the 2-dimensional matrix: [ 1 2 4 5 ] Can be represented in its natural format as: \"data\" : [ [ 1, 2 ], [ 4, 5 ] ] Or in a flattened one-dimensional representation: \"data\" : [ 1, 2, 4, 5 ]","title":"Tensor Data"},{"location":"modelserving/data_plane/v2_protocol/#tensor-data-types","text":"Tensor data types are shown in the following table along with the size of each type, in bytes. Data Type Size (bytes) BOOL 1 UINT8 1 UINT16 2 UINT32 4 UINT64 8 INT8 1 INT16 2 INT32 4 INT64 8 FP16 2 FP32 4 FP64 8 BYTES Variable (max 2 32 ) ---","title":"Tensor Data Types"},{"location":"modelserving/data_plane/v2_protocol/#inference-request-examples","text":"The following example shows an inference request to a model with two inputs and one output. The HTTP Content-Length header gives the size of the JSON object. POST /v2/models/mymodel/infer HTTP/1.1 Host: localhost:8000 Content-Type: application/json Content-Length: { \"id\" : \"42\", \"inputs\" : [ { \"name\" : \"input0\", \"shape\" : [ 2, 2 ], \"datatype\" : \"UINT32\", \"data\" : [ 1, 2, 3, 4 ] }, { \"name\" : \"input1\", \"shape\" : [ 3 ], \"datatype\" : \"BOOL\", \"data\" : [ true ] } ], \"outputs\" : [ { \"name\" : \"output0\" } ] } For the above request the inference server must return the \u201coutput0\u201d output tensor. Assuming the model returns a [ 3, 2 ] tensor of data type FP32 the following response would be returned. HTTP/1.1 200 OK Content-Type: application/json Content-Length: { \"id\" : \"42\", \"outputs\" : [ { \"name\" : \"output0\", \"shape\" : [ 3, 2 ], \"datatype\" : \"FP32\", \"data\" : [ 1.0, 1.1, 2.0, 2.1, 3.0, 3.1 ] } ] }","title":"Inference Request Examples"},{"location":"modelserving/data_plane/v2_protocol/#grpc","text":"The GRPC API closely follows the concepts defined in the HTTP/REST API. A compliant server must implement the health, metadata, and inference APIs described in this section. 
API rpc Endpoint Request Message Response Message Inference ModelInfer ModelInferRequest ModelInferResponse Model Ready ModelReady [ModelReadyRequest] ModelReadyResponse Model Metadata ModelMetadata ModelMetadataRequest ModelMetadataResponse Server Ready ServerReady ServerReadyRequest ServerReadyResponse Server Live ServerLive ServerLiveRequest ServerLiveResponse For more detailed information on each endpoint and its contents, see API Definitions and Message Contents . See also: The gRPC endpoints, request/response messages and contents are defined in grpc_predict_v2.proto","title":"gRPC"},{"location":"modelserving/data_plane/v2_protocol/#api-definitions_1","text":"The GRPC definition of the service is: // // Inference Server GRPC endpoints. // service GRPCInferenceService { // Check liveness of the inference server. rpc ServerLive(ServerLiveRequest) returns (ServerLiveResponse) {} // Check readiness of the inference server. rpc ServerReady(ServerReadyRequest) returns (ServerReadyResponse) {} // Check readiness of a model in the inference server. rpc ModelReady(ModelReadyRequest) returns (ModelReadyResponse) {} // Get server metadata. rpc ServerMetadata(ServerMetadataRequest) returns (ServerMetadataResponse) {} // Get model metadata. rpc ModelMetadata(ModelMetadataRequest) returns (ModelMetadataResponse) {} // Perform inference using a specific model. rpc ModelInfer(ModelInferRequest) returns (ModelInferResponse) {} }","title":"API Definitions"},{"location":"modelserving/data_plane/v2_protocol/#message-contents","text":"","title":"Message Contents"},{"location":"modelserving/data_plane/v2_protocol/#health","text":"A health request is made using the ServerLive, ServerReady, or ModelReady endpoint. For each of these endpoints errors are indicated by the google.rpc.Status returned for the request. 
The OK code indicates success and other codes indicate failure.","title":"Health"},{"location":"modelserving/data_plane/v2_protocol/#server-live_1","text":"The ServerLive API indicates if the inference server is able to receive and respond to metadata and inference requests. The request and response messages for ServerLive are: message ServerLiveRequest {} message ServerLiveResponse { // True if the inference server is live, false if not live. bool live = 1; }","title":"Server Live"},{"location":"modelserving/data_plane/v2_protocol/#server-ready_1","text":"The ServerReady API indicates if the server is ready for inferencing. The request and response messages for ServerReady are: message ServerReadyRequest {} message ServerReadyResponse { // True if the inference server is ready, false if not ready. bool ready = 1; }","title":"Server Ready"},{"location":"modelserving/data_plane/v2_protocol/#model-ready_1","text":"The ModelReady API indicates if a specific model is ready for inferencing. The request and response messages for ModelReady are: message ModelReadyRequest { // The name of the model to check for readiness. string name = 1; // The version of the model to check for readiness. If not given the // server will choose a version based on the model and internal policy. string version = 2; } message ModelReadyResponse { // True if the model is ready, false if not ready. bool ready = 1; }","title":"Model Ready"},{"location":"modelserving/data_plane/v2_protocol/#metadata","text":"","title":"Metadata"},{"location":"modelserving/data_plane/v2_protocol/#server-metadata_1","text":"The ServerMetadata API provides information about the server. Errors are indicated by the google.rpc.Status returned for the request. The OK code indicates success and other codes indicate failure. The request and response messages for ServerMetadata are: message ServerMetadataRequest {} message ServerMetadataResponse { // The server name. string name = 1; // The server version. 
string version = 2; // The extensions supported by the server. repeated string extensions = 3; }","title":"Server Metadata"},{"location":"modelserving/data_plane/v2_protocol/#model-metadata_1","text":"The per-model metadata API provides information about a model. Errors are indicated by the google.rpc.Status returned for the request. The OK code indicates success and other codes indicate failure. The request and response messages for ModelMetadata are: message ModelMetadataRequest { // The name of the model. string name = 1; // The version of the model to check for readiness. If not given the // server will choose a version based on the model and internal policy. string version = 2; } message ModelMetadataResponse { // Metadata for a tensor. message TensorMetadata { // The tensor name. string name = 1; // The tensor data type. string datatype = 2; // The tensor shape. A variable-size dimension is represented // by a -1 value. repeated int64 shape = 3; } // The model name. string name = 1; // The versions of the model available on the server. repeated string versions = 2; // The model's platform. See Platforms. string platform = 3; // The model's inputs. repeated TensorMetadata inputs = 4; // The model's outputs. repeated TensorMetadata outputs = 5; }","title":"Model Metadata"},{"location":"modelserving/data_plane/v2_protocol/#platforms","text":"A platform is a string indicating a DL/ML framework or backend. Platform is returned as part of the response to a Model Metadata request but is information only. The proposed inference APIs are generic relative to the DL/ML framework used by a model and so a client does not need to know the platform of a given model to use the API. Platform names use the format \u201c _ \u201d. The following platform names are allowed: tensorrt_plan : A TensorRT model encoded as a serialized engine or \u201cplan\u201d. tensorflow_graphdef : A TensorFlow model encoded as a GraphDef. 
tensorflow_savedmodel : A TensorFlow model encoded as a SavedModel. onnx_onnxv1 : An ONNX model encoded for ONNX Runtime. pytorch_torchscript : A PyTorch model encoded as TorchScript. mxnet_mxnet : An MXNet model. caffe2_netdef : A Caffe2 model encoded as a NetDef.","title":"Platforms"},{"location":"modelserving/data_plane/v2_protocol/#inference_1","text":"The ModelInfer API performs inference using the specified model. Errors are indicated by the google.rpc.Status returned for the request. The OK code indicates success and other codes indicate failure. The request and response messages for ModelInfer are: message ModelInferRequest { // An input tensor for an inference request. message InferInputTensor { // The tensor name. string name = 1; // The tensor data type. string datatype = 2; // The tensor shape. repeated int64 shape = 3; // Optional inference input tensor parameters. map parameters = 4; // The tensor contents using a data-type format. This field must // not be specified if \"raw\" tensor contents are being used for // the inference request. InferTensorContents contents = 5; } // An output tensor requested for an inference request. message InferRequestedOutputTensor { // The tensor name. string name = 1; // Optional requested output tensor parameters. map parameters = 2; } // The name of the model to use for inferencing. string model_name = 1; // The version of the model to use for inference. If not given the // server will choose a version based on the model and internal policy. string model_version = 2; // Optional identifier for the request. If specified will be // returned in the response. string id = 3; // Optional inference parameters. map parameters = 4; // The input tensors for the inference. repeated InferInputTensor inputs = 5; // The requested output tensors for the inference. Optional, if not // specified all outputs produced by the model will be returned. 
repeated InferRequestedOutputTensor outputs = 6; // The data contained in an input tensor can be represented in \"raw\" // bytes form or in the repeated type that matches the tensor's data // type. To use the raw representation 'raw_input_contents' must be // initialized with data for each tensor in the same order as // 'inputs'. For each tensor, the size of this content must match // what is expected by the tensor's shape and data type. The raw // data must be the flattened, one-dimensional, row-major order of // the tensor elements without any stride or padding between the // elements. Note that the FP16 data type must be represented as raw // content as there is no specific data type for a 16-bit float // type. // // If this field is specified then InferInputTensor::contents must // not be specified for any input tensor. repeated bytes raw_input_contents = 7; } message ModelInferResponse { // An output tensor returned for an inference request. message InferOutputTensor { // The tensor name. string name = 1; // The tensor data type. string datatype = 2; // The tensor shape. repeated int64 shape = 3; // Optional output tensor parameters. map parameters = 4; // The tensor contents using a data-type format. This field must // not be specified if \"raw\" tensor contents are being used for // the inference response. InferTensorContents contents = 5; } // The name of the model used for inference. string model_name = 1; // The version of the model used for inference. string model_version = 2; // The id of the inference request if one was specified. string id = 3; // Optional inference response parameters. map parameters = 4; // The output tensors holding inference results. repeated InferOutputTensor outputs = 5; // The data contained in an output tensor can be represented in // \"raw\" bytes form or in the repeated type that matches the // tensor's data type. 
To use the raw representation 'raw_output_contents' // must be initialized with data for each tensor in the same order as // 'outputs'. For each tensor, the size of this content must match // what is expected by the tensor's shape and data type. The raw // data must be the flattened, one-dimensional, row-major order of // the tensor elements without any stride or padding between the // elements. Note that the FP16 data type must be represented as raw // content as there is no specific data type for a 16-bit float // type. // // If this field is specified then InferOutputTensor::contents must // not be specified for any output tensor. repeated bytes raw_output_contents = 6; }","title":"Inference"},{"location":"modelserving/data_plane/v2_protocol/#parameters_1","text":"The Parameters message describes a \u201cname\u201d/\u201dvalue\u201d pair, where the \u201cname\u201d is the name of the parameter and the \u201cvalue\u201d is a boolean, integer, or string corresponding to the parameter. Currently, no parameters are defined. As required a future proposal may define one or more standard parameters to allow portable functionality across different inference servers. A server can implement server-specific parameters to provide non-standard capabilities. // // An inference parameter value. // message InferParameter { // The parameter value can be a string, an int64, a boolean // or a message specific to a predefined parameter. oneof parameter_choice { // A boolean parameter value. bool bool_param = 1; // An int64 parameter value. int64 int64_param = 2; // A string parameter value. string string_param = 3; } }","title":"Parameters"},{"location":"modelserving/data_plane/v2_protocol/#tensor-data_1","text":"In all representations tensor data must be flattened to a one-dimensional, row-major order of the tensor elements. Element values must be given in \"linear\" order without any stride or padding between elements. 
Using a \"raw\" representation of tensors with ModelInferRequest::raw_input_contents and ModelInferResponse::raw_output_contents will typically allow higher performance due to the way protobuf allocation and reuse interacts with GRPC. For example, see https://github.com/grpc/grpc/issues/23231. An alternative to the \"raw\" representation is to use InferTensorContents to represent the tensor data in a format that matches the tensor's data type. // // The data contained in a tensor represented by the repeated type // that matches the tensor's data type. Protobuf oneof is not used // because oneofs cannot contain repeated fields. // message InferTensorContents { // Representation for BOOL data type. The size must match what is // expected by the tensor's shape. The contents must be the flattened, // one-dimensional, row-major order of the tensor elements. repeated bool bool_contents = 1; // Representation for INT8, INT16, and INT32 data types. The size // must match what is expected by the tensor's shape. The contents // must be the flattened, one-dimensional, row-major order of the // tensor elements. repeated int32 int_contents = 2; // Representation for INT64 data types. The size must match what // is expected by the tensor's shape. The contents must be the // flattened, one-dimensional, row-major order of the tensor elements. repeated int64 int64_contents = 3; // Representation for UINT8, UINT16, and UINT32 data types. The size // must match what is expected by the tensor's shape. The contents // must be the flattened, one-dimensional, row-major order of the // tensor elements. repeated uint32 uint_contents = 4; // Representation for UINT64 data types. The size must match what // is expected by the tensor's shape. The contents must be the // flattened, one-dimensional, row-major order of the tensor elements. repeated uint64 uint64_contents = 5; // Representation for FP32 data type. The size must match what is // expected by the tensor's shape. 
The contents must be the flattened, // one-dimensional, row-major order of the tensor elements. repeated float fp32_contents = 6; // Representation for FP64 data type. The size must match what is // expected by the tensor's shape. The contents must be the flattened, // one-dimensional, row-major order of the tensor elements. repeated double fp64_contents = 7; // Representation for BYTES data type. The size must match what is // expected by the tensor's shape. The contents must be the flattened, // one-dimensional, row-major order of the tensor elements. repeated bytes bytes_contents = 8; }","title":"Tensor Data"},{"location":"modelserving/data_plane/v2_protocol/#tensor-data-types_1","text":"Tensor data types are shown in the following table along with the size of each type, in bytes. Data Type Size (bytes) BOOL 1 UINT8 1 UINT16 2 UINT32 4 UINT64 8 INT8 1 INT16 2 INT32 4 INT64 8 FP16 2 FP32 4 FP64 8 BYTES Variable (max 2^32)","title":"Tensor Data Types"},{"location":"modelserving/detect/aif/germancredit/","text":"Bias detection on an InferenceService using AIF360 \u00b6 This is an example of how to get bias metrics using AI Fairness 360 (AIF360) on KServe. AI Fairness 360, an LF AI incubation project, is an extensible open source toolkit that can help users examine, report, and mitigate discrimination and bias in machine learning models throughout the AI application lifecycle. We will be using the German Credit dataset maintained by the UC Irvine Machine Learning Repository . The German Credit dataset is a dataset that contains data as to whether or not a creditor gave a loan applicant access to a loan along with data about the applicant. The data includes relevant data on an applicant's credit history, savings, and employment as well as some data on the applicant's demographic such as age, sex, and marital status. 
Data like credit history, savings, and employment can be used by creditors to accurately predict the probability that an applicant will repay their loans, however, data such as age and sex should not be used to decide whether an applicant should be given a loan. We would like to be able to check if these \"protected classes\" are being used in a model's predictions. In this example we will feed the model some predictions and calculate metrics based off of the predictions the model makes. We will be using KServe payload logging capability to collect the metrics. These metrics will give insight as to whether or not the model is biased for or against any protected classes. In this example we will look at the bias our deployed model has on those of age > 25 vs. those of age <= 25 and see if creditors are treating either unfairly. Sample resources for deploying the example can be found here Create the InferenceService \u00b6 Apply the CRD kubectl kubectl apply -f bias.yaml Expected Output $ inferenceservice.serving.kserve.io/german-credit created Deploy the message dumper (sample backend receiver for payload logs) \u00b6 Apply the message-dumper CRD which will collect the logs that are created when running predictions on the inferenceservice. In production setup, instead of message-dumper Kafka can be used to receive payload logs kubectl kubectl apply -f message-dumper.yaml Expected Output service.serving.knative.dev/message-dumper created Run a prediction \u00b6 The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT MODEL_NAME = german-credit SERVICE_HOSTNAME = $( kubectl get inferenceservice ${ MODEL_NAME } -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) python simulate_predicts.py http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict ${ SERVICE_HOSTNAME } Process payload logs for metrics calculation \u00b6 Run json_from_logs.py which will craft a payload that AIF can interpret. 
First, the events logs are taken from the message-dumper and then those logs are parsed to match inputs with outputs. Then the input/outputs pairs are all combined into a list of inputs and a list of outputs for AIF to interpret. A data.json file should have been created in this folder which contains the json payload. python json_from_logs.py Run an explanation \u00b6 Finally, now that we have collected a number of our model's predictions and their corresponding inputs we will send these to the AIF server to calculate the bias metrics. python query_bias.py http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :explain ${ SERVICE_HOSTNAME } input.json Interpreting the results \u00b6 Now let's look at one of the metrics. In this example disparate impact represents the ratio between the probability of applicants of the unprivileged class (age <= 25) getting a loan and the probability of applicants of the privileged class (age > 25) getting a loan P(Y=1|D=unprivileged)/P(Y=1|D=privileged) . Since, in the sample output below, the disparate impact is less than 1 then the probability that an applicant whose age is greater than 25 gets a loan is significantly higher than the probability that an applicant whose age is less than or equal to 25 gets a loan. This in and of itself is not proof that the model is biased, but does hint that there may be some bias and a deeper look may be needed. python query_bias.py http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :explain ${ SERVICE_HOSTNAME } input.json Expected Output Sending bias query... TIME TAKEN: 0 .21137404441833496 base_rate : 0 .9329608938547486 consistency : [ 0 .982122905027933 ] disparate_impact : 0 .52 num_instances : 179 .0 num_negatives : 12 .0 num_positives : 167 .0 statistical_parity_difference : -0.48 Dataset \u00b6 The dataset used in this example is the German Credit dataset maintained by the UC Irvine Machine Learning Repository . Dua, D. and Graff, C. (2019). 
UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.","title":"AIF Bias Detector"},{"location":"modelserving/detect/aif/germancredit/#bias-detection-on-an-inferenceservice-using-aif360","text":"This is an example of how to get bias metrics using AI Fairness 360 (AIF360) on KServe. AI Fairness 360, an LF AI incubation project, is an extensible open source toolkit that can help users examine, report, and mitigate discrimination and bias in machine learning models throughout the AI application lifecycle. We will be using the German Credit dataset maintained by the UC Irvine Machine Learning Repository . The German Credit dataset is a dataset that contains data as to whether or not a creditor gave a loan applicant access to a loan along with data about the applicant. The data includes relevant data on an applicant's credit history, savings, and employment as well as some data on the applicant's demographic such as age, sex, and marital status. Data like credit history, savings, and employment can be used by creditors to accurately predict the probability that an applicant will repay their loans, however, data such as age and sex should not be used to decide whether an applicant should be given a loan. We would like to be able to check if these \"protected classes\" are being used in a model's predictions. In this example we will feed the model some predictions and calculate metrics based off of the predictions the model makes. We will be using KServe payload logging capability to collect the metrics. These metrics will give insight as to whether or not the model is biased for or against any protected classes. In this example we will look at the bias our deployed model has on those of age > 25 vs. those of age <= 25 and see if creditors are treating either unfairly. 
Sample resources for deploying the example can be found here","title":"Bias detection on an InferenceService using AIF360"},{"location":"modelserving/detect/aif/germancredit/#create-the-inferenceservice","text":"Apply the CRD kubectl kubectl apply -f bias.yaml Expected Output $ inferenceservice.serving.kserve.io/german-credit created","title":"Create the InferenceService"},{"location":"modelserving/detect/aif/germancredit/#deploy-the-message-dumper-sample-backend-receiver-for-payload-logs","text":"Apply the message-dumper CRD which will collect the logs that are created when running predictions on the inferenceservice. In production setup, instead of message-dumper Kafka can be used to receive payload logs kubectl kubectl apply -f message-dumper.yaml Expected Output service.serving.knative.dev/message-dumper created","title":"Deploy the message dumper (sample backend receiver for payload logs)"},{"location":"modelserving/detect/aif/germancredit/#run-a-prediction","text":"The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT MODEL_NAME = german-credit SERVICE_HOSTNAME = $( kubectl get inferenceservice ${ MODEL_NAME } -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) python simulate_predicts.py http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict ${ SERVICE_HOSTNAME }","title":"Run a prediction"},{"location":"modelserving/detect/aif/germancredit/#process-payload-logs-for-metrics-calculation","text":"Run json_from_logs.py which will craft a payload that AIF can interpret. First, the events logs are taken from the message-dumper and then those logs are parsed to match inputs with outputs. Then the input/outputs pairs are all combined into a list of inputs and a list of outputs for AIF to interpret. A data.json file should have been created in this folder which contains the json payload. 
python json_from_logs.py","title":"Process payload logs for metrics calculation"},{"location":"modelserving/detect/aif/germancredit/#run-an-explanation","text":"Finally, now that we have collected a number of our model's predictions and their corresponding inputs we will send these to the AIF server to calculate the bias metrics. python query_bias.py http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :explain ${ SERVICE_HOSTNAME } input.json","title":"Run an explanation"},{"location":"modelserving/detect/aif/germancredit/#interpreting-the-results","text":"Now let's look at one of the metrics. In this example disparate impact represents the ratio between the probability of applicants of the unprivileged class (age <= 25) getting a loan and the probability of applicants of the privileged class (age > 25) getting a loan P(Y=1|D=unprivileged)/P(Y=1|D=privileged) . Since, in the sample output below, the disparate impact is less than 1 then the probability that an applicant whose age is greater than 25 gets a loan is significantly higher than the probability that an applicant whose age is less than or equal to 25 gets a loan. This in and of itself is not proof that the model is biased, but does hint that there may be some bias and a deeper look may be needed. python query_bias.py http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :explain ${ SERVICE_HOSTNAME } input.json Expected Output Sending bias query... TIME TAKEN: 0 .21137404441833496 base_rate : 0 .9329608938547486 consistency : [ 0 .982122905027933 ] disparate_impact : 0 .52 num_instances : 179 .0 num_negatives : 12 .0 num_positives : 167 .0 statistical_parity_difference : -0.48","title":"Interpreting the results"},{"location":"modelserving/detect/aif/germancredit/#dataset","text":"The dataset used in this example is the German Credit dataset maintained by the UC Irvine Machine Learning Repository . Dua, D. and Graff, C. (2019). 
UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.","title":"Dataset"},{"location":"modelserving/detect/aif/germancredit/server/","text":"Logistic Regression Model on the German Credit dataset \u00b6 Build a development docker image \u00b6 To build a development image first download these files and move them into the server/ folder - https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data - https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.doc First build your docker image by changing directory to kserve/python and replacing dockeruser with your docker username in the snippet below (running this will take some time). docker build -t dockeruser/aifserver:latest -f aiffairness.Dockerfile . Then push your docker image to your dockerhub repo (this will take some time) docker push dockeruser/aifserver:latest Once your docker image is pushed you can pull the image from dockeruser/aifserver:latest when deploying an inferenceservice by specifying the image in the yaml file.","title":"Logistic Regression Model on the German Credit dataset"},{"location":"modelserving/detect/aif/germancredit/server/#logistic-regression-model-on-the-german-credit-dataset","text":"","title":"Logistic Regression Model on the German Credit dataset"},{"location":"modelserving/detect/aif/germancredit/server/#build-a-development-docker-image","text":"To build a development image first download these files and move them into the server/ folder - https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data - https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.doc First build your docker image by changing directory to kserve/python and replacing dockeruser with your docker username in the snippet below (running this will take some time). 
docker build -t dockeruser/aifserver:latest -f aiffairness.Dockerfile . Then push your docker image to your dockerhub repo (this will take some time) docker push dockeruser/aifserver:latest Once your docker image is pushed you can pull the image from dockeruser/aifserver:latest when deploying an inferenceservice by specifying the image in the yaml file.","title":"Build a development docker image"},{"location":"modelserving/detect/alibi_detect/alibi_detect/","text":"Deploy InferenceService with Alibi Outlier/Drift Detector \u00b6 In order to trust and reliably act on model predictions, it is crucial to monitor the distribution of the incoming requests via various types of detectors. KServe integrates Alibi Detect with the following components: Drift detector checks when the distribution of incoming requests is diverging from a reference distribution such as that of the training data. Outlier detector flags single instances which do not follow the training distribution. The architecture used is shown below and links the payload logging available within KServe with asynchronous processing of those payloads in KNative to detect outliers. CIFAR10 Outlier Detector \u00b6 A CIFAR10 Outlier Detector. Run the notebook demo to test. The notebook requires KNative Eventing >= 0.18. CIFAR10 Drift Detector \u00b6 A CIFAR10 Drift Detector. Run the notebook demo to test. The notebook requires KNative Eventing >= 0.18.","title":"Alibi Detector"},{"location":"modelserving/detect/alibi_detect/alibi_detect/#deploy-inferenceservice-with-alibi-outlierdrift-detector","text":"In order to trust and reliably act on model predictions, it is crucial to monitor the distribution of the incoming requests via various types of detectors. KServe integrates Alibi Detect with the following components: Drift detector checks when the distribution of incoming requests is diverging from a reference distribution such as that of the training data. 
Outlier detector flags single instances which do not follow the training distribution. The architecture used is shown below and links the payload logging available within KServe with asynchronous processing of those payloads in KNative to detect outliers.","title":"Deploy InferenceService with Alibi Outlier/Drift Detector"},{"location":"modelserving/detect/alibi_detect/alibi_detect/#cifar10-outlier-detector","text":"A CIFAR10 Outlier Detector. Run the notebook demo to test. The notebook requires KNative Eventing >= 0.18.","title":"CIFAR10 Outlier Detector"},{"location":"modelserving/detect/alibi_detect/alibi_detect/#cifar10-drift-detector","text":"A CIFAR10 Drift Detector. Run the notebook demo to test. The notebook requires KNative Eventing >= 0.18.","title":"CIFAR10 Drift Detector"},{"location":"modelserving/detect/art/mnist/","text":"Using ART to get adversarial examples for MNIST classifications \u00b6 This is an example to show how adversarially modified inputs can trick models to predict incorrectly to highlight model vulnerability to adversarial attacks. It is using the Adversarial Robustness Toolbox (ART) on KServe. ART provides tools that enable developers to evaluate, defend, and verify ML models and applications against adversarial threats. Apart from giving capabilities to craft adversarial attacks , it also provides algorithms to defend against them. We will be using the MNIST dataset which is a dataset of handwritten digits and find adversarial examples which can make the model predict a classification incorrectly, thereby showing the vulnerability of the model against adversarial attacks. 
Sample resources for deploying the example can be found here To deploy the inferenceservice with v1beta1 API kubectl apply -f art.yaml Then find the url kubectl get inferenceservice NAME URL READY DEFAULT TRAFFIC CANARY TRAFFIC AGE artserver http://artserver.somecluster/v1/models/artserver True 100 40m Explanation \u00b6 The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT MODEL_NAME = artserver SERVICE_HOSTNAME = $( kubectl get inferenceservice ${ MODEL_NAME } -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) python query_explain.py http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ ${ MODEL_NAME } :explain ${ SERVICE_HOSTNAME } After some time you should see a pop up containing the explanation, similar to the image below. If a pop up does not display and the message \"Unable to find an adversarial example.\" appears then an adversarial example could not be found for the image given in a timely manner. If a pop up does display then the image on the left is the original image and the image on the right is the adversarial example. The labels above both images represent what classification the model made for each individual image. The Square Attack method used in this example creates a random update at each iteration and adds this update to the adversarial input if it makes a misclassification more likely (more specifically, if it improves the objective function). Once enough random updates are added together and the model misclassifies then the resulting adversarial input will be returned and displayed. To try a different MNIST example add an integer to the end of the query between 0-9,999. The integer chosen will be the index of the image to be chosen in the MNIST dataset. Or to try a file with custom data add the file path to the end. 
Keep in mind that the data format must be {\"instances\": [, ]} python query_explain.py http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ ${ MODEL_NAME } :explain ${ SERVICE_HOSTNAME } 100 python query_explain.py http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ ${ MODEL_NAME } :explain ${ SERVICE_HOSTNAME } ./input.json Stopping the Inference Service \u00b6 kubectl delete -f art.yaml Build a Development ART Explainer Docker Image \u00b6 If you would like to build a development image for the ART Explainer then follow these instructions Troubleshooting \u00b6 <504> Gateway Timeout <504> - the explainer is probably taking too long and not sending a response back quickly enough. Either there aren't enough resources allocated or the number of samples the explainer is allowed to take needs to be reduced. To fix this go to art.yaml and increase resources. If you see Configuration \"artserver-default\" does not have any ready Revision the container may have taken too long to download. If you run kubectl get revision and see your revision is stuck in ContainerCreating try deleting the inferenceservice and redeploying.","title":"ART Adversarial Detector"},{"location":"modelserving/detect/art/mnist/#using-art-to-get-adversarial-examples-for-mnist-classifications","text":"This is an example to show how adversarially modified inputs can trick models to predict incorrectly to highlight model vulnerability to adversarial attacks. It is using the Adversarial Robustness Toolbox (ART) on KServe. ART provides tools that enable developers to evaluate, defend, and verify ML models and applications against adversarial threats. Apart from giving capabilities to craft adversarial attacks , it also provides algorithms to defend against them. 
We will be using the MNIST dataset which is a dataset of handwritten digits and find adversarial examples which can make the model predict a classification incorrectly, thereby showing the vulnerability of the model against adversarial attacks. Sample resources for deploying the example can be found here To deploy the inferenceservice with v1beta1 API kubectl apply -f art.yaml Then find the url kubectl get inferenceservice NAME URL READY DEFAULT TRAFFIC CANARY TRAFFIC AGE artserver http://artserver.somecluster/v1/models/artserver True 100 40m","title":"Using ART to get adversarial examples for MNIST classifications"},{"location":"modelserving/detect/art/mnist/#explanation","text":"The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT MODEL_NAME = artserver SERVICE_HOSTNAME = $( kubectl get inferenceservice ${ MODEL_NAME } -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) python query_explain.py http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ ${ MODEL_NAME } :explain ${ SERVICE_HOSTNAME } After some time you should see a pop up containing the explanation, similar to the image below. If a pop up does not display and the message \"Unable to find an adversarial example.\" appears then an adversarial example could not be found for the image given in a timely manner. If a pop up does display then the image on the left is the original image and the image on the right is the adversarial example. The labels above both images represent what classification the model made for each individual image. The Square Attack method used in this example creates a random update at each iteration and adds this update to the adversarial input if it makes a misclassification more likely (more specifically, if it improves the objective function). Once enough random updates are added together and the model misclassifies then the resulting adversarial input will be returned and displayed. 
To try a different MNIST example add an integer to the end of the query between 0-9,999. The integer chosen will be the index of the image to be chosen in the MNIST dataset. Or to try a file with custom data add the file path to the end. Keep in mind that the data format must be {\"instances\": [, ]} python query_explain.py http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ ${ MODEL_NAME } :explain ${ SERVICE_HOSTNAME } 100 python query_explain.py http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ ${ MODEL_NAME } :explain ${ SERVICE_HOSTNAME } ./input.json","title":"Explanation"},{"location":"modelserving/detect/art/mnist/#stopping-the-inference-service","text":"kubectl delete -f art.yaml","title":"Stopping the Inference Service"},{"location":"modelserving/detect/art/mnist/#build-a-development-art-explainer-docker-image","text":"If you would like to build a development image for the ART Explainer then follow these instructions","title":"Build a Development ART Explainer Docker Image"},{"location":"modelserving/detect/art/mnist/#troubleshooting","text":"<504> Gateway Timeout <504> - the explainer is probably taking too long and not sending a response back quickly enough. Either there aren't enough resources allocated or the number of samples the explainer is allowed to take needs to be reduced. To fix this go to art.yaml and increase resources. If you see Configuration \"artserver-default\" does not have any ready Revision the container may have taken too long to download. If you run kubectl get revision and see your revision is stuck in ContainerCreating try deleting the inferenceservice and redeploying.","title":"Troubleshooting"},{"location":"modelserving/explainer/explainer/","text":"InferenceService Explainer \u00b6 Model explainability answers the question: \"Why did my model make this prediction\" for a given instance. 
KServe integrates with Alibi Explainer which implements a black-box algorithm by generating a lot of similar looking instances for a given instance and send out to the model server to produce an explanation. Additionally, KServe also integrates with The AI Explainability 360 (AIX360) toolkit, an LF AI Foundation incubation project, which is an open-source library that supports the interpretability and explainability of datasets and machine learning models. The AI Explainability 360 Python package includes a comprehensive set of algorithms that cover different dimensions of explanations along with proxy explainability metrics. In addition to native algorithms, AIX360 also provides algorithms from LIME and Shap. Explainer Examples Deploy Alibi Image Explainer Imagenet Explainer Deploy Alibi Income Explainer Income Explainer Deploy Alibi Text Explainer Alibi Text Explainer","title":"Concept"},{"location":"modelserving/explainer/explainer/#inferenceservice-explainer","text":"Model explainability answers the question: \"Why did my model make this prediction\" for a given instance. KServe integrates with Alibi Explainer which implements a black-box algorithm by generating a lot of similar looking instances for a given instance and send out to the model server to produce an explanation. Additionally, KServe also integrates with The AI Explainability 360 (AIX360) toolkit, an LF AI Foundation incubation project, which is an open-source library that supports the interpretability and explainability of datasets and machine learning models. The AI Explainability 360 Python package includes a comprehensive set of algorithms that cover different dimensions of explanations along with proxy explainability metrics. In addition to native algorithms, AIX360 also provides algorithms from LIME and Shap. 
Explainer Examples Deploy Alibi Image Explainer Imagenet Explainer Deploy Alibi Income Explainer Income Explainer Deploy Alibi Text Explainer Alibi Text Explainer","title":"InferenceService Explainer"},{"location":"modelserving/explainer/aix/mnist/aix/","text":"Using AIX to get explanations for MNIST classifications \u00b6 This is an example of how to explain model predictions using AI Explainability 360 (AIX360) on KServe. We will be using mnist dataset for handwritten digits for this model and explain how the model decides the predicted results. Create the InferenceService with AIX Explainer \u00b6 apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"aix-explainer\" namespace : default spec : predictor : containers : - name : predictor image : aipipeline/rf-predictor:0.4.1 command : [ \"python\" , \"-m\" , \"rfserver\" , \"--model_name\" , \"aix-explainer\" ] imagePullPolicy : Always explainer : containers : - name : explainer image : kserve/aix-explainer:v0.10.1 args : - --model_name - aix-explainer - --explainer_type - LimeImages - --num_samples - \"100\" - --top_labels - \"10\" - --min_weight - \"0.01\" imagePullPolicy : Always resources : limits : cpu : \"1\" memory : 2Gi requests : cpu : \"1\" memory : 2Gi To deploy the InferenceService with v1beta1 API kubectl kubectl apply -f aix-explainer.yaml Then find the url. kubectl kubectl get inferenceservice aix-explainer NAME URL READY PREV LATEST PREVROLLEDOUTREVISION LATESTREADYREVISION AGE aix-explainer http://aix-explainer.default.example.com True 100 aix-explainer-predictor-default-00001 43m Run Explanation \u00b6 The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT , the example code for model training and explainer client can be found here . 
MODEL_NAME = aix-explainer SERVICE_HOSTNAME = $( kubectl get inferenceservice ${ MODEL_NAME } -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) python query_explain.py http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ ${ MODEL_NAME } :explain ${ SERVICE_HOSTNAME } After a bit of time you should see a pop up containing the explanation, similar to the image below. The LIME method used in this example highlights the pixels in red that score above a certain confidence value for indicating a classification. The explanation shown will contain a collection of images that are highlighted paired with a title to describe the context. For each title and image pair, the title will say Positive for Actual to denote that is the classification that LIME is testing for and is the correct label for that image. To give an example, the top-left image with the title \"Positive for 2 Actual 2\" is the image with pixels highlighted that score above a specified confidence level for indicating a classification of 2 (where 2 is also the correct classification). Similarly, the bottom-right image with the title \"Positive for 0 Actual 2\" is the image with pixels highlighted that score above a specified confidence level for indicating a classification of 0 (where 2 is the correct classification). If the model were to incorrectly classify the image as 0, then you could get an explanation of why by looking at the highlighted pixels as being especially troublesome. By raising and lowering the min_weight parameter in the deployment yaml you can test to see which pixels your model believes are the most and least relevant for each classification. To try a different MNIST example add an integer to the end of the query between 0-10,000. The integer chosen will be the index of the image to be chosen in the MNIST dataset. 
python query_explain.py http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ ${ MODEL_NAME } :explain ${ SERVICE_HOSTNAME } 100 To try different parameters with explainer, add another string json argument to specify the parameters. Supported modified parameters: top_labels, segmentation_alg, num_samples, positive_only, and min_weight. python query_explain.py http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ ${ MODEL_NAME } :explain ${ SERVICE_HOSTNAME } 100 '{\"top_labels\":\"10\"}' Stopping the Inference Service \u00b6 kubectl delete -f aix-explainer.yaml Build a Development AIX Model Explainer Docker Image \u00b6 If you would like to build a development image for the AIX Model Explainer then follow these instructions Troubleshooting \u00b6 <504> Gateway Timeout <504> - the explainer is probably taking too long and not sending a response back quickly enough. Either there aren't enough resources allocated or the number of samples the explainer is allowed to take needs to be reduced. To fix this go to aix-explainer.yaml and increase resources. Or to lower the number of allowed samples go to aix-explainer.yaml and add a flag to explainer: command: '--num_samples' (the default number of samples is 1000) If you see Configuration \"aixserver-explainer-default\" does not have any ready Revision the container may have taken too long to download. If you run kubectl get revision and see your revision is stuck in ContainerCreating try deleting the inferenceservice and redeploying.","title":"AIX Explainer"},{"location":"modelserving/explainer/aix/mnist/aix/#using-aix-to-get-explanations-for-mnist-classifications","text":"This is an example of how to explain model predictions using AI Explainability 360 (AIX360) on KServe. 
We will be using mnist dataset for handwritten digits for this model and explain how the model decides the predicted results.","title":"Using AIX to get explanations for MNIST classifications"},{"location":"modelserving/explainer/aix/mnist/aix/#create-the-inferenceservice-with-aix-explainer","text":"apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"aix-explainer\" namespace : default spec : predictor : containers : - name : predictor image : aipipeline/rf-predictor:0.4.1 command : [ \"python\" , \"-m\" , \"rfserver\" , \"--model_name\" , \"aix-explainer\" ] imagePullPolicy : Always explainer : containers : - name : explainer image : kserve/aix-explainer:v0.10.1 args : - --model_name - aix-explainer - --explainer_type - LimeImages - --num_samples - \"100\" - --top_labels - \"10\" - --min_weight - \"0.01\" imagePullPolicy : Always resources : limits : cpu : \"1\" memory : 2Gi requests : cpu : \"1\" memory : 2Gi To deploy the InferenceService with v1beta1 API kubectl kubectl apply -f aix-explainer.yaml Then find the url. kubectl kubectl get inferenceservice aix-explainer NAME URL READY PREV LATEST PREVROLLEDOUTREVISION LATESTREADYREVISION AGE aix-explainer http://aix-explainer.default.example.com True 100 aix-explainer-predictor-default-00001 43m","title":"Create the InferenceService with AIX Explainer"},{"location":"modelserving/explainer/aix/mnist/aix/#run-explanation","text":"The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT , the example code for model training and explainer client can be found here . MODEL_NAME = aix-explainer SERVICE_HOSTNAME = $( kubectl get inferenceservice ${ MODEL_NAME } -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) python query_explain.py http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ ${ MODEL_NAME } :explain ${ SERVICE_HOSTNAME } After a bit of time you should see a pop up containing the explanation, similar to the image below. 
The LIME method used in this example highlights the pixels in red that score above a certain confidence value for indicating a classification. The explanation shown will contain a collection of images that are highlighted paired with a title to describe the context. For each title and image pair, the title will say Positive for Actual to denote that is the classification that LIME is testing for and is the correct label for that image. To give an example, the top-left image with the title \"Positive for 2 Actual 2\" is the image with pixels highlighted that score above a specified confidence level for indicating a classification of 2 (where 2 is also the correct classification). Similarly, the bottom-right image with the title \"Positive for 0 Actual 2\" is the image with pixels highlighted that score above a specified confidence level for indicating a classification of 0 (where 2 is the correct classification). If the model were to incorrectly classify the image as 0, then you could get an explanation of why by looking at the highlighted pixels as being especially troublesome. By raising and lowering the min_weight parameter in the deployment yaml you can test to see which pixels your model believes are the most and least relevant for each classification. To try a different MNIST example add an integer to the end of the query between 0-10,000. The integer chosen will be the index of the image to be chosen in the MNIST dataset. python query_explain.py http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ ${ MODEL_NAME } :explain ${ SERVICE_HOSTNAME } 100 To try different parameters with explainer, add another string json argument to specify the parameters. Supported modified parameters: top_labels, segmentation_alg, num_samples, positive_only, and min_weight. 
python query_explain.py http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ ${ MODEL_NAME } :explain ${ SERVICE_HOSTNAME } 100 '{\"top_labels\":\"10\"}'","title":"Run Explanation"},{"location":"modelserving/explainer/aix/mnist/aix/#stopping-the-inference-service","text":"kubectl delete -f aix-explainer.yaml","title":"Stopping the Inference Service"},{"location":"modelserving/explainer/aix/mnist/aix/#build-a-development-aix-model-explainer-docker-image","text":"If you would like to build a development image for the AIX Model Explainer then follow these instructions","title":"Build a Development AIX Model Explainer Docker Image"},{"location":"modelserving/explainer/aix/mnist/aix/#troubleshooting","text":"<504> Gateway Timeout <504> - the explainer is probably taking too long and not sending a response back quickly enough. Either there aren't enough resources allocated or the number of samples the explainer is allowed to take needs to be reduced. To fix this go to aix-explainer.yaml and increase resources. Or to lower the number of allowed samples go to aix-explainer.yaml and add a flag to explainer: command: '--num_samples' (the default number of samples is 1000) If you see Configuration \"aixserver-explainer-default\" does not have any ready Revision the container may have taken too long to download. If you run kubectl get revision and see your revision is stuck in ContainerCreating try deleting the inferenceservice and redeploying.","title":"Troubleshooting"},{"location":"modelserving/explainer/alibi/cifar10/","text":"CIFAR10 Image Classifier Explanations \u00b6 We will use a Tensorflow classifier built on CIFAR10 image dataset which is a 10 class image dataset to show the example of explanation on image data. 
Create the InferenceService with Alibi Explainer \u00b6 apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"cifar10\" spec : predictor : tensorflow : storageUri : \"gs://seldon-models/tfserving/cifar10/resnet32\" resources : requests : cpu : 0.1 memory : 5Gi limits : memory : 10Gi explainer : alibi : type : AnchorImages storageUri : \"gs://kfserving-examples/models/tensorflow/cifar/explainer-0.9.1\" config : batch_size : \"40\" stop_on_first : \"True\" resources : requests : cpu : 0.1 memory : 5Gi limits : memory : 10Gi Note The InferenceService resource describes: A pretrained tensorflow model stored on a Google bucket An AnchorImage Seldon Alibi Explainer, see the Alibi Docs for further details. Test on notebook \u00b6 Run this example using the Jupyter notebook . Once created you will be able to test the predictions: And then get an explanation for it:","title":"Image Explainer"},{"location":"modelserving/explainer/alibi/cifar10/#cifar10-image-classifier-explanations","text":"We will use a Tensorflow classifier built on CIFAR10 image dataset which is a 10 class image dataset to show the example of explanation on image data.","title":"CIFAR10 Image Classifier Explanations"},{"location":"modelserving/explainer/alibi/cifar10/#create-the-inferenceservice-with-alibi-explainer","text":"apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"cifar10\" spec : predictor : tensorflow : storageUri : \"gs://seldon-models/tfserving/cifar10/resnet32\" resources : requests : cpu : 0.1 memory : 5Gi limits : memory : 10Gi explainer : alibi : type : AnchorImages storageUri : \"gs://kfserving-examples/models/tensorflow/cifar/explainer-0.9.1\" config : batch_size : \"40\" stop_on_first : \"True\" resources : requests : cpu : 0.1 memory : 5Gi limits : memory : 10Gi Note The InferenceService resource describes: A pretrained tensorflow model stored on a Google bucket An AnchorImage Seldon Alibi Explainer, see 
the Alibi Docs for further details.","title":"Create the InferenceService with Alibi Explainer"},{"location":"modelserving/explainer/alibi/cifar10/#test-on-notebook","text":"Run this example using the Jupyter notebook . Once created you will be able to test the predictions: And then get an explanation for it:","title":"Test on notebook"},{"location":"modelserving/explainer/alibi/income/","text":"Example Anchors Tabular Explaination for Income Prediction \u00b6 This example uses a US income dataset to show the example of explanation on tabular data. You can also try out the Jupyter notebook for a visual walkthrough. Create the InferenceService with alibi explainer \u00b6 We can create a InferenceService with a trained sklearn predictor for this dataset and an associated model explainer. The black box explainer algorithm we will use is the Tabular version of Anchors from the Alibi open source library . More details on this algorithm and configuration settings that can be set can be found in the Seldon Alibi documentation . 
The InferenceService is shown below: apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"income\" spec : predictor : minReplicas : 1 sklearn : storageUri : \"gs://kfserving-examples/models/sklearn/1.3/income/model\" resources : requests : cpu : 0.1 memory : 1Gi limits : cpu : 1 memory : 1Gi explainer : minReplicas : 1 alibi : type : AnchorTabular storageUri : \"gs://kfserving-examples/models/sklearn/1.3/income/explainer\" resources : requests : cpu : 0.1 memory : 1Gi limits : cpu : 1 memory : 4Gi Create the InferenceService with above yaml: kubectl kubectl create -f income.yaml The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT MODEL_NAME = income SERVICE_HOSTNAME = $( kubectl get inferenceservice income -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) Run the inference \u00b6 Test the predictor: curl -H \"Host: $SERVICE_HOSTNAME \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict -d '{\"instances\":[[39, 7, 1, 1, 1, 1, 4, 1, 2174, 0, 40, 9]]}' You should receive the response showing the prediction is for low salary: Expected Output { \"predictions\" : [ 0 ]} Run the explanation \u00b6 Now lets get an explanation for this: curl -H \"Host: $SERVICE_HOSTNAME \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :explain -d '{\"instances\":[[39, 7, 1, 1, 1, 1, 4, 1, 2174, 0, 40, 9]]}' The returned explanation will be like: Expected Output { \"names\" : [ \"Marital Status = Never-Married\" , \"Workclass = State-gov\" ], \"precision\" : 0.9724770642201835 , \"coverage\" : 0.0147 , \"raw\" : { \"feature\" : [ 3 , 1 ], \"mean\" : [ 0.9129746835443038 , 0.9724770642201835 ], \"precision\" : [ 0.9129746835443038 , 0.9724770642201835 ], \"coverage\" : [ 0.3327 , 0.0147 ], \"examples\" : [ { \"covered\" : [ [ 30 , \"Self-emp-not-inc\" , \"Bachelors\" , \"Never-Married\" 
, \"Sales\" , \"Unmarried\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 69 , \"Private\" , \"Dropout\" , \"Never-Married\" , \"Blue-Collar\" , \"Husband\" , \"White\" , \"Male\" , 9386 , \"Capital Loss <= 0.00\" , 60 , \"United-States\" ], [ 44 , \"Local-gov\" , \"Bachelors\" , \"Never-Married\" , \"White-Collar\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 52 , \"United-States\" ], [ 59 , \"Private\" , \"High School grad\" , \"Never-Married\" , \"White-Collar\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 50 , \"United-States\" ], [ 55 , \"Private\" , \"Bachelors\" , \"Never-Married\" , \"White-Collar\" , \"Husband\" , \"White\" , \"Male\" , 15024 , \"Capital Loss <= 0.00\" , 55 , \"United-States\" ], [ 32 , \"?\" , \"Bachelors\" , \"Never-Married\" , \"?\" , \"Unmarried\" , \"White\" , \"Female\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 32 , \"United-States\" ], [ 47 , \"Private\" , \"Dropout\" , \"Never-Married\" , \"Blue-Collar\" , \"Unmarried\" , \"Black\" , \"Female\" , 6849 , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 35 , \"Private\" , \"Associates\" , \"Never-Married\" , \"Service\" , \"Not-in-family\" , \"White\" , \"Female\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 65 , \"United-States\" ], [ 32 , \"Private\" , \"High School grad\" , \"Never-Married\" , \"Blue-Collar\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 48 , \"Private\" , \"Masters\" , \"Never-Married\" , \"White-Collar\" , \"Husband\" , \"Black\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 45 , \"United-States\" ] ], \"covered_true\" : [ [ 32 , \"Private\" , \"High School grad\" , \"Never-Married\" , \"Blue-Collar\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 
0.00\" , 40 , \"United-States\" ], [ 44 , \"Local-gov\" , \"Bachelors\" , \"Never-Married\" , \"White-Collar\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 52 , \"United-States\" ], [ 36 , \"Private\" , \"High School grad\" , \"Never-Married\" , \"Blue-Collar\" , \"Unmarried\" , \"White\" , \"Female\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 30 , \"United-States\" ], [ 56 , \"Private\" , \"High School grad\" , \"Never-Married\" , \"Blue-Collar\" , \"Unmarried\" , \"Black\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 49 , \"Local-gov\" , \"High School grad\" , \"Never-Married\" , \"Service\" , \"Unmarried\" , \"White\" , \"Female\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 30 , \"United-States\" ], [ 20 , \"?\" , \"High School grad\" , \"Never-Married\" , \"?\" , \"Own-child\" , \"White\" , \"Female\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 10 , \"United-States\" ], [ 22 , \"?\" , \"High School grad\" , \"Never-Married\" , \"?\" , \"Own-child\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , \"Hours per week > 45.00\" , \"United-States\" ], [ 29 , \"Private\" , \"High School grad\" , \"Never-Married\" , \"Service\" , \"Own-child\" , \"Asian-Pac-Islander\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 25 , \"SE-Asia\" ], [ 45 , \"Local-gov\" , \"Masters\" , \"Never-Married\" , \"Professional\" , \"Unmarried\" , \"White\" , \"Female\" , 1506 , \"Capital Loss <= 0.00\" , 45 , \"United-States\" ], [ 27 , \"Private\" , \"High School grad\" , \"Never-Married\" , \"Blue-Collar\" , \"Not-in-family\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 50 , \"United-States\" ] ], \"covered_false\" : [ [ 29 , \"Private\" , \"Bachelors\" , \"Never-Married\" , \"Service\" , \"Husband\" , \"White\" , \"Male\" , 7298 , \"Capital Loss <= 0.00\" , 42 , 
\"United-States\" ], [ 56 , \"Private\" , \"Associates\" , \"Never-Married\" , \"Sales\" , \"Husband\" , \"White\" , \"Male\" , 15024 , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 47 , \"Private\" , \"Masters\" , \"Never-Married\" , \"Sales\" , \"Not-in-family\" , \"White\" , \"Male\" , 27828 , \"Capital Loss <= 0.00\" , 60 , \"United-States\" ], [ 40 , \"Private\" , \"Associates\" , \"Never-Married\" , \"White-Collar\" , \"Husband\" , \"White\" , \"Male\" , 7688 , \"Capital Loss <= 0.00\" , 44 , \"United-States\" ], [ 55 , \"Self-emp-not-inc\" , \"High School grad\" , \"Never-Married\" , \"White-Collar\" , \"Not-in-family\" , \"White\" , \"Male\" , 34095 , \"Capital Loss <= 0.00\" , 60 , \"United-States\" ], [ 53 , \"Private\" , \"Masters\" , \"Never-Married\" , \"White-Collar\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 48 , \"United-States\" ], [ 47 , \"Federal-gov\" , \"Doctorate\" , \"Never-Married\" , \"Professional\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 53 , \"Private\" , \"High School grad\" , \"Never-Married\" , \"White-Collar\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , 1977 , 40 , \"United-States\" ], [ 46 , \"Private\" , \"Bachelors\" , \"Never-Married\" , \"Sales\" , \"Not-in-family\" , \"White\" , \"Male\" , 8614 , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 44 , \"Local-gov\" , \"Prof-School\" , \"Never-Married\" , \"Professional\" , \"Not-in-family\" , \"White\" , \"Male\" , 10520 , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ] ], \"uncovered_true\" : [], \"uncovered_false\" : [] }, { \"covered\" : [ [ 41 , \"State-gov\" , \"High School grad\" , \"Never-Married\" , \"White-Collar\" , \"Not-in-family\" , \"White\" , \"Female\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 64 , \"State-gov\" , \"High School grad\" , 
\"Never-Married\" , \"Blue-Collar\" , \"Not-in-family\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 33 , \"State-gov\" , \"High School grad\" , \"Never-Married\" , \"Service\" , \"Unmarried\" , \"Black\" , \"Female\" , 1831 , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 35 , \"State-gov\" , \"High School grad\" , \"Never-Married\" , \"Blue-Collar\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 60 , \"United-States\" ], [ 25 , \"State-gov\" , \"Dropout\" , \"Never-Married\" , \"Blue-Collar\" , \"Own-child\" , \"Black\" , \"Female\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 40 , \"State-gov\" , \"Associates\" , \"Never-Married\" , \"Blue-Collar\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 19 , \"State-gov\" , \"High School grad\" , \"Never-Married\" , \"Blue-Collar\" , \"Other-relative\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 20 , \"United-States\" ], [ 44 , \"State-gov\" , \"Dropout\" , \"Never-Married\" , \"Blue-Collar\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 88 , \"United-States\" ], [ 80 , \"State-gov\" , \"Associates\" , \"Never-Married\" , \"Blue-Collar\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 24 , \"United-States\" ], [ 21 , \"State-gov\" , \"High School grad\" , \"Never-Married\" , \"Professional\" , \"Own-child\" , \"White\" , \"Female\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 20 , \"United-States\" ] ], \"covered_true\" : [ [ 22 , \"State-gov\" , \"High School grad\" , \"Never-Married\" , \"Service\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 25 , \"United-States\" ], [ 49 , \"State-gov\" , \"High School grad\" 
, \"Never-Married\" , \"Service\" , \"Not-in-family\" , \"White\" , \"Female\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 22 , \"State-gov\" , \"Bachelors\" , \"Never-Married\" , \"?\" , \"Not-in-family\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 25 , \"United-States\" ], [ 31 , \"State-gov\" , \"Bachelors\" , \"Never-Married\" , \"Professional\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 50 , \"United-States\" ], [ 18 , \"State-gov\" , \"Dropout\" , \"Never-Married\" , \"Blue-Collar\" , \"Not-in-family\" , \"Black\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 56 , \"State-gov\" , \"High School grad\" , \"Never-Married\" , \"Blue-Collar\" , \"Unmarried\" , \"Black\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 26 , \"State-gov\" , \"Dropout\" , \"Never-Married\" , \"Service\" , \"Unmarried\" , \"White\" , \"Female\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 38 , \"State-gov\" , \"Bachelors\" , \"Never-Married\" , \"White-Collar\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 52 , \"State-gov\" , \"High School grad\" , \"Never-Married\" , \"Blue-Collar\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 70 , \"United-States\" ], [ 25 , \"State-gov\" , \"Associates\" , \"Never-Married\" , \"Professional\" , \"Wife\" , \"White\" , \"Female\" , \"Capital Gain <= 0.00\" , 1887 , 40 , \"United-States\" ] ], \"covered_false\" : [ [ 46 , \"State-gov\" , \"Prof-School\" , \"Never-Married\" , \"Professional\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 45 , \"United-States\" ], [ 42 , \"State-gov\" , \"Bachelors\" , \"Never-Married\" , 
\"White-Collar\" , \"Husband\" , \"White\" , \"Male\" , 15024 , \"Capital Loss <= 0.00\" , 50 , \"United-States\" ], [ 46 , \"State-gov\" , \"Prof-School\" , \"Never-Married\" , \"Professional\" , \"Husband\" , \"White\" , \"Male\" , 15024 , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 54 , \"State-gov\" , \"Doctorate\" , \"Never-Married\" , \"White-Collar\" , \"Not-in-family\" , \"White\" , \"Female\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 42 , \"State-gov\" , \"Masters\" , \"Never-Married\" , \"White-Collar\" , \"Not-in-family\" , \"White\" , \"Female\" , 14084 , \"Capital Loss <= 0.00\" , 60 , \"United-States\" ], [ 37 , \"State-gov\" , \"Masters\" , \"Never-Married\" , \"Professional\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 45 , \"United-States\" ] ], \"uncovered_true\" : [], \"uncovered_false\" : [] } ], \"all_precision\" : 0 , \"num_preds\" : 1000101 , \"names\" : [ \"Marital Status = Never-Married\" , \"Workclass = State-gov\" ], \"instance\" : [ [ 39 ], [ 7 ], [ \"28.00 < Age <= 37.00\" ], [ \"28.00 < Age <= 37.00\" ], [ \"28.00 < Age <= 37.00\" ], [ \"28.00 < Age <= 37.00\" ], [ 4 ], [ \"28.00 < Age <= 37.00\" ], [ 2174 ], [ \"Age <= 28.00\" ], [ 40 ], [ 9 ] ], \"prediction\" : 0 } }","title":"Income Explainer"},{"location":"modelserving/explainer/alibi/income/#example-anchors-tabular-explaination-for-income-prediction","text":"This example uses a US income dataset to show the example of explanation on tabular data. You can also try out the Jupyter notebook for a visual walkthrough.","title":"Example Anchors Tabular Explaination for Income Prediction"},{"location":"modelserving/explainer/alibi/income/#create-the-inferenceservice-with-alibi-explainer","text":"We can create a InferenceService with a trained sklearn predictor for this dataset and an associated model explainer. 
The black box explainer algorithm we will use is the Tabular version of Anchors from the Alibi open source library . More details on this algorithm and configuration settings that can be set can be found in the Seldon Alibi documentation . The InferenceService is shown below: apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"income\" spec : predictor : minReplicas : 1 sklearn : storageUri : \"gs://kfserving-examples/models/sklearn/1.3/income/model\" resources : requests : cpu : 0.1 memory : 1Gi limits : cpu : 1 memory : 1Gi explainer : minReplicas : 1 alibi : type : AnchorTabular storageUri : \"gs://kfserving-examples/models/sklearn/1.3/income/explainer\" resources : requests : cpu : 0.1 memory : 1Gi limits : cpu : 1 memory : 4Gi Create the InferenceService with above yaml: kubectl kubectl create -f income.yaml The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT MODEL_NAME = income SERVICE_HOSTNAME = $( kubectl get inferenceservice income -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 )","title":"Create the InferenceService with alibi explainer"},{"location":"modelserving/explainer/alibi/income/#run-the-inference","text":"Test the predictor: curl -H \"Host: $SERVICE_HOSTNAME \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict -d '{\"instances\":[[39, 7, 1, 1, 1, 1, 4, 1, 2174, 0, 40, 9]]}' You should receive the response showing the prediction is for low salary: Expected Output { \"predictions\" : [ 0 ]}","title":"Run the inference"},{"location":"modelserving/explainer/alibi/income/#run-the-explanation","text":"Now lets get an explanation for this: curl -H \"Host: $SERVICE_HOSTNAME \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :explain -d '{\"instances\":[[39, 7, 1, 1, 1, 1, 4, 1, 2174, 0, 40, 9]]}' The returned explanation will be like: Expected 
Output { \"names\" : [ \"Marital Status = Never-Married\" , \"Workclass = State-gov\" ], \"precision\" : 0.9724770642201835 , \"coverage\" : 0.0147 , \"raw\" : { \"feature\" : [ 3 , 1 ], \"mean\" : [ 0.9129746835443038 , 0.9724770642201835 ], \"precision\" : [ 0.9129746835443038 , 0.9724770642201835 ], \"coverage\" : [ 0.3327 , 0.0147 ], \"examples\" : [ { \"covered\" : [ [ 30 , \"Self-emp-not-inc\" , \"Bachelors\" , \"Never-Married\" , \"Sales\" , \"Unmarried\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 69 , \"Private\" , \"Dropout\" , \"Never-Married\" , \"Blue-Collar\" , \"Husband\" , \"White\" , \"Male\" , 9386 , \"Capital Loss <= 0.00\" , 60 , \"United-States\" ], [ 44 , \"Local-gov\" , \"Bachelors\" , \"Never-Married\" , \"White-Collar\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 52 , \"United-States\" ], [ 59 , \"Private\" , \"High School grad\" , \"Never-Married\" , \"White-Collar\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 50 , \"United-States\" ], [ 55 , \"Private\" , \"Bachelors\" , \"Never-Married\" , \"White-Collar\" , \"Husband\" , \"White\" , \"Male\" , 15024 , \"Capital Loss <= 0.00\" , 55 , \"United-States\" ], [ 32 , \"?\" , \"Bachelors\" , \"Never-Married\" , \"?\" , \"Unmarried\" , \"White\" , \"Female\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 32 , \"United-States\" ], [ 47 , \"Private\" , \"Dropout\" , \"Never-Married\" , \"Blue-Collar\" , \"Unmarried\" , \"Black\" , \"Female\" , 6849 , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 35 , \"Private\" , \"Associates\" , \"Never-Married\" , \"Service\" , \"Not-in-family\" , \"White\" , \"Female\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 65 , \"United-States\" ], [ 32 , \"Private\" , \"High School grad\" , \"Never-Married\" , \"Blue-Collar\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 
0.00\" , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 48 , \"Private\" , \"Masters\" , \"Never-Married\" , \"White-Collar\" , \"Husband\" , \"Black\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 45 , \"United-States\" ] ], \"covered_true\" : [ [ 32 , \"Private\" , \"High School grad\" , \"Never-Married\" , \"Blue-Collar\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 44 , \"Local-gov\" , \"Bachelors\" , \"Never-Married\" , \"White-Collar\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 52 , \"United-States\" ], [ 36 , \"Private\" , \"High School grad\" , \"Never-Married\" , \"Blue-Collar\" , \"Unmarried\" , \"White\" , \"Female\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 30 , \"United-States\" ], [ 56 , \"Private\" , \"High School grad\" , \"Never-Married\" , \"Blue-Collar\" , \"Unmarried\" , \"Black\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 49 , \"Local-gov\" , \"High School grad\" , \"Never-Married\" , \"Service\" , \"Unmarried\" , \"White\" , \"Female\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 30 , \"United-States\" ], [ 20 , \"?\" , \"High School grad\" , \"Never-Married\" , \"?\" , \"Own-child\" , \"White\" , \"Female\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 10 , \"United-States\" ], [ 22 , \"?\" , \"High School grad\" , \"Never-Married\" , \"?\" , \"Own-child\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , \"Hours per week > 45.00\" , \"United-States\" ], [ 29 , \"Private\" , \"High School grad\" , \"Never-Married\" , \"Service\" , \"Own-child\" , \"Asian-Pac-Islander\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 25 , \"SE-Asia\" ], [ 45 , \"Local-gov\" , \"Masters\" , \"Never-Married\" , \"Professional\" , \"Unmarried\" , \"White\" , \"Female\" , 1506 , 
\"Capital Loss <= 0.00\" , 45 , \"United-States\" ], [ 27 , \"Private\" , \"High School grad\" , \"Never-Married\" , \"Blue-Collar\" , \"Not-in-family\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 50 , \"United-States\" ] ], \"covered_false\" : [ [ 29 , \"Private\" , \"Bachelors\" , \"Never-Married\" , \"Service\" , \"Husband\" , \"White\" , \"Male\" , 7298 , \"Capital Loss <= 0.00\" , 42 , \"United-States\" ], [ 56 , \"Private\" , \"Associates\" , \"Never-Married\" , \"Sales\" , \"Husband\" , \"White\" , \"Male\" , 15024 , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 47 , \"Private\" , \"Masters\" , \"Never-Married\" , \"Sales\" , \"Not-in-family\" , \"White\" , \"Male\" , 27828 , \"Capital Loss <= 0.00\" , 60 , \"United-States\" ], [ 40 , \"Private\" , \"Associates\" , \"Never-Married\" , \"White-Collar\" , \"Husband\" , \"White\" , \"Male\" , 7688 , \"Capital Loss <= 0.00\" , 44 , \"United-States\" ], [ 55 , \"Self-emp-not-inc\" , \"High School grad\" , \"Never-Married\" , \"White-Collar\" , \"Not-in-family\" , \"White\" , \"Male\" , 34095 , \"Capital Loss <= 0.00\" , 60 , \"United-States\" ], [ 53 , \"Private\" , \"Masters\" , \"Never-Married\" , \"White-Collar\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 48 , \"United-States\" ], [ 47 , \"Federal-gov\" , \"Doctorate\" , \"Never-Married\" , \"Professional\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 53 , \"Private\" , \"High School grad\" , \"Never-Married\" , \"White-Collar\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , 1977 , 40 , \"United-States\" ], [ 46 , \"Private\" , \"Bachelors\" , \"Never-Married\" , \"Sales\" , \"Not-in-family\" , \"White\" , \"Male\" , 8614 , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 44 , \"Local-gov\" , \"Prof-School\" , \"Never-Married\" , \"Professional\" , \"Not-in-family\" , 
\"White\" , \"Male\" , 10520 , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ] ], \"uncovered_true\" : [], \"uncovered_false\" : [] }, { \"covered\" : [ [ 41 , \"State-gov\" , \"High School grad\" , \"Never-Married\" , \"White-Collar\" , \"Not-in-family\" , \"White\" , \"Female\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 64 , \"State-gov\" , \"High School grad\" , \"Never-Married\" , \"Blue-Collar\" , \"Not-in-family\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 33 , \"State-gov\" , \"High School grad\" , \"Never-Married\" , \"Service\" , \"Unmarried\" , \"Black\" , \"Female\" , 1831 , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 35 , \"State-gov\" , \"High School grad\" , \"Never-Married\" , \"Blue-Collar\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 60 , \"United-States\" ], [ 25 , \"State-gov\" , \"Dropout\" , \"Never-Married\" , \"Blue-Collar\" , \"Own-child\" , \"Black\" , \"Female\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 40 , \"State-gov\" , \"Associates\" , \"Never-Married\" , \"Blue-Collar\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 19 , \"State-gov\" , \"High School grad\" , \"Never-Married\" , \"Blue-Collar\" , \"Other-relative\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 20 , \"United-States\" ], [ 44 , \"State-gov\" , \"Dropout\" , \"Never-Married\" , \"Blue-Collar\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 88 , \"United-States\" ], [ 80 , \"State-gov\" , \"Associates\" , \"Never-Married\" , \"Blue-Collar\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 24 , \"United-States\" ], [ 21 , \"State-gov\" , \"High School grad\" , 
\"Never-Married\" , \"Professional\" , \"Own-child\" , \"White\" , \"Female\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 20 , \"United-States\" ] ], \"covered_true\" : [ [ 22 , \"State-gov\" , \"High School grad\" , \"Never-Married\" , \"Service\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 25 , \"United-States\" ], [ 49 , \"State-gov\" , \"High School grad\" , \"Never-Married\" , \"Service\" , \"Not-in-family\" , \"White\" , \"Female\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 22 , \"State-gov\" , \"Bachelors\" , \"Never-Married\" , \"?\" , \"Not-in-family\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 25 , \"United-States\" ], [ 31 , \"State-gov\" , \"Bachelors\" , \"Never-Married\" , \"Professional\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 50 , \"United-States\" ], [ 18 , \"State-gov\" , \"Dropout\" , \"Never-Married\" , \"Blue-Collar\" , \"Not-in-family\" , \"Black\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 56 , \"State-gov\" , \"High School grad\" , \"Never-Married\" , \"Blue-Collar\" , \"Unmarried\" , \"Black\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 26 , \"State-gov\" , \"Dropout\" , \"Never-Married\" , \"Service\" , \"Unmarried\" , \"White\" , \"Female\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 38 , \"State-gov\" , \"Bachelors\" , \"Never-Married\" , \"White-Collar\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 52 , \"State-gov\" , \"High School grad\" , \"Never-Married\" , \"Blue-Collar\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 70 , \"United-States\" ], [ 25 , \"State-gov\" , \"Associates\" , 
\"Never-Married\" , \"Professional\" , \"Wife\" , \"White\" , \"Female\" , \"Capital Gain <= 0.00\" , 1887 , 40 , \"United-States\" ] ], \"covered_false\" : [ [ 46 , \"State-gov\" , \"Prof-School\" , \"Never-Married\" , \"Professional\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 45 , \"United-States\" ], [ 42 , \"State-gov\" , \"Bachelors\" , \"Never-Married\" , \"White-Collar\" , \"Husband\" , \"White\" , \"Male\" , 15024 , \"Capital Loss <= 0.00\" , 50 , \"United-States\" ], [ 46 , \"State-gov\" , \"Prof-School\" , \"Never-Married\" , \"Professional\" , \"Husband\" , \"White\" , \"Male\" , 15024 , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 54 , \"State-gov\" , \"Doctorate\" , \"Never-Married\" , \"White-Collar\" , \"Not-in-family\" , \"White\" , \"Female\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 40 , \"United-States\" ], [ 42 , \"State-gov\" , \"Masters\" , \"Never-Married\" , \"White-Collar\" , \"Not-in-family\" , \"White\" , \"Female\" , 14084 , \"Capital Loss <= 0.00\" , 60 , \"United-States\" ], [ 37 , \"State-gov\" , \"Masters\" , \"Never-Married\" , \"Professional\" , \"Husband\" , \"White\" , \"Male\" , \"Capital Gain <= 0.00\" , \"Capital Loss <= 0.00\" , 45 , \"United-States\" ] ], \"uncovered_true\" : [], \"uncovered_false\" : [] } ], \"all_precision\" : 0 , \"num_preds\" : 1000101 , \"names\" : [ \"Marital Status = Never-Married\" , \"Workclass = State-gov\" ], \"instance\" : [ [ 39 ], [ 7 ], [ \"28.00 < Age <= 37.00\" ], [ \"28.00 < Age <= 37.00\" ], [ \"28.00 < Age <= 37.00\" ], [ \"28.00 < Age <= 37.00\" ], [ 4 ], [ \"28.00 < Age <= 37.00\" ], [ 2174 ], [ \"Age <= 28.00\" ], [ 40 ], [ 9 ] ], \"prediction\" : 0 } }","title":"Run the explanation"},{"location":"modelserving/explainer/alibi/moviesentiment/","text":"Example Anchors Text Explaination for Movie Sentiment \u00b6 This example uses a movie sentiment dataset to show the explanation on text data, for a more visual 
walkthrough please try the Jupyter notebook . Deploy InferenceService with AnchorText Explainer \u00b6 We can create a InferenceService with a trained sklearn predictor for this dataset and an associated explainer. The black box explainer algorithm we will use is the Text version of Anchors from the Alibi open source library . More details on this algorithm and configuration settings that can be set can be found in the Seldon Alibi documentation . The InferenceService is shown below: apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"moviesentiment\" spec : predictor : minReplicas : 1 sklearn : storageUri : \"gs://seldon-models/v1.16.0/sklearn/moviesentiment\" resources : requests : cpu : 0.1 memory : 1Gi limits : cpu : 1 memory : 1Gi explainer : minReplicas : 1 alibi : type : AnchorText resources : requests : cpu : 0.1 memory : 6Gi limits : memory : 6Gi Create this InferenceService: kubectl kubectl create -f moviesentiment.yaml Run Inference and Explanation \u00b6 Set up some environment variables for the model name and cluster entrypoint. 
MODEL_NAME = moviesentiment SERVICE_HOSTNAME = $( kubectl get inferenceservice moviesentiment -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) Test the predictor on an example sentence: curl -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict -d '{\"instances\":[\"a visually flashy but narratively opaque and emotionally vapid exercise .\"]}' You should receive the response showing negative sentiment: Expected Output { \"predictions\" : [ 0 ]} Test on another sentence: curl -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict -d '{\"instances\":[\"a touching , sophisticated film that almost seems like a documentary in the way it captures an italian immigrant family on the brink of major changes .\"]}' You should receive the response showing positive sentiment: Expected Output { \"predictions\" : [ 1 ]} Now lets get an explanation for the first sentence: curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :explain -d '{\"instances\":[\"a visually flashy but narratively opaque and emotionally vapid exercise .\"]}' Expected Output { \"names\" : [ \"exercise\" ], \"precision\" : 1 , \"converage\" : 0.5005 , \"raw\" : { \"feature\" : [ 9 ], \"mean\" : [ 1 ], \"precision\" : [ 1 ], \"coverage\" : [ 0.5005 ], \"examples\" : [ { \"covered\" : [ [ \"a visually UNK UNK UNK opaque and emotionally vapid exercise UNK\" ], [ \"a visually flashy but UNK UNK and emotionally UNK exercise .\" ], [ \"a visually flashy but narratively UNK UNK UNK UNK exercise .\" ], [ \"UNK UNK flashy UNK narratively opaque UNK UNK vapid exercise .\" ], [ \"UNK visually UNK UNK UNK UNK and UNK vapid exercise UNK\" ], [ \"UNK UNK UNK but UNK opaque UNK emotionally UNK exercise UNK\" ], [ \"a UNK flashy 
UNK UNK UNK and emotionally vapid exercise .\" ], [ \"UNK UNK flashy UNK narratively opaque UNK emotionally UNK exercise .\" ], [ \"UNK UNK flashy UNK narratively opaque UNK UNK vapid exercise UNK\" ], [ \"a visually UNK but narratively opaque UNK UNK vapid exercise UNK\" ] ], \"covered_true\" : [ [ \"UNK visually flashy but UNK UNK and emotionally vapid exercise .\" ], [ \"UNK visually UNK UNK UNK UNK and UNK UNK exercise .\" ], [ \"a UNK UNK UNK narratively opaque UNK UNK UNK exercise UNK\" ], [ \"a visually UNK UNK narratively opaque UNK UNK UNK exercise UNK\" ], [ \"a UNK UNK UNK UNK UNK and emotionally vapid exercise UNK\" ], [ \"a UNK flashy UNK narratively UNK and UNK vapid exercise UNK\" ], [ \"UNK visually UNK UNK narratively UNK and emotionally UNK exercise .\" ], [ \"UNK visually flashy UNK narratively opaque UNK emotionally UNK exercise UNK\" ], [ \"UNK UNK flashy UNK UNK UNK and UNK vapid exercise UNK\" ], [ \"a UNK flashy UNK UNK UNK and emotionally vapid exercise .\" ] ], \"covered_false\" : [], \"uncovered_true\" : [], \"uncovered_false\" : [] } ], \"all_precision\" : 0 , \"num_preds\" : 1000101 , \"names\" : [ \"exercise\" ], \"positions\" : [ 63 ], \"instance\" : \"a visually flashy but narratively opaque and emotionally vapid exercise .\" , \"prediction\" : 0 } } This shows the key word \"exercise\" was identified and examples show it in context using the default \"UNK\" placeholder for surrounding words. Custom Configuration \u00b6 You can add custom configuration for the Anchor Text explainer in the 'config' section. 
For example, we can change the text explainer to sample from the corpus rather than use UKN placeholders: apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"moviesentiment\" spec : predictor : sklearn : storageUri : \"gs://seldon-models/v1.16.0/sklearn/moviesentiment\" resources : requests : cpu : 0.1 explainer : alibi : type : AnchorText config : use_unk : \"false\" sample_proba : \"0.5\" resources : requests : cpu : 0.1 If we apply this: kubectl kubectl create -f moviesentiment2.yaml and then ask for an explanation: curl -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :explain -d '{\"instances\":[\"a visually flashy but narratively opaque and emotionally vapid exercise .\"]}' Expected Output { \"names\" : [ \"exercise\" ], \"precision\" : 0.9918032786885246 , \"coverage\" : 0.5072 , \"raw\" : { \"feature\" : [ 9 ], \"mean\" : [ 0.9918032786885246 ], \"precision\" : [ 0.9918032786885246 ], \"coverage\" : [ 0.5072 ], \"examples\" : [ { \"covered\" : [ [ \"each visually playful but enormously opaque and academically vapid exercise .\" ], [ \"each academically trashy but narratively pigmented and profoundly vapid exercise .\" ], [ \"a masterfully flashy but narratively straightforward and verbally disingenuous exercise .\" ], [ \"a visually gaudy but interestingly opaque and emotionally vapid exercise .\" ], [ \"some concurrently flashy but philosophically voxel and emotionally vapid exercise .\" ], [ \"a visually flashy but delightfully sensible and emotionally snobby exercise .\" ], [ \"a surprisingly bland but fantastically seamless and hideously vapid exercise .\" ], [ \"both visually classy but nonetheless robust and musically vapid exercise .\" ], [ \"a visually fancy but narratively robust and emotionally uninformed exercise .\" ], [ \"a visually flashy but tastefully opaque and weirdly vapid exercise .\" ] ], \"covered_true\" 
: [ [ \"another visually flashy but narratively opaque and emotionally vapid exercise .\" ], [ \"the visually classy but narratively opaque and emotionally vapid exercise .\" ], [ \"the visually arty but overshadow yellowish and emotionally vapid exercise .\" ], [ \"a objectively flashy but genuinely straightforward and emotionally vapid exercise .\" ], [ \"a visually flashy but tastefully opaque and weirdly vapid exercise .\" ], [ \"a emotionally crafty but narratively opaque and emotionally vapid exercise .\" ], [ \"some similarly eclectic but narratively dainty and emotionally illogical exercise .\" ], [ \"a nicely flashy but psychologically opaque and emotionally vapid exercise .\" ], [ \"a visually flashy but narratively colorless and emotionally vapid exercise .\" ], [ \"every properly lavish but logistically opaque and someway incomprehensible exercise .\" ] ], \"covered_false\" : [ [ \"another enormously inventive but socially opaque and somewhat idiotic exercise .\" ], [ \"each visually playful but enormously opaque and academically vapid exercise .\" ] ], \"uncovered_true\" : [], \"uncovered_false\" : [] } ], \"all_precision\" : 0 , \"num_preds\" : 1000101 , \"names\" : [ \"exercise\" ], \"positions\" : [ 63 ], \"instance\" : \"a visually flashy but narratively opaque and emotionally vapid exercise .\" , \"prediction\" : 0 } } Run on Notebook \u00b6 You can also run this example on notebook","title":"Text Explainer"},{"location":"modelserving/explainer/alibi/moviesentiment/#example-anchors-text-explaination-for-movie-sentiment","text":"This example uses a movie sentiment dataset to show the explanation on text data, for a more visual walkthrough please try the Jupyter notebook .","title":"Example Anchors Text Explaination for Movie Sentiment"},{"location":"modelserving/explainer/alibi/moviesentiment/#deploy-inferenceservice-with-anchortext-explainer","text":"We can create a InferenceService with a trained sklearn predictor for this dataset and an 
associated explainer. The black box explainer algorithm we will use is the Text version of Anchors from the Alibi open source library . More details on this algorithm and configuration settings that can be set can be found in the Seldon Alibi documentation . The InferenceService is shown below: apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"moviesentiment\" spec : predictor : minReplicas : 1 sklearn : storageUri : \"gs://seldon-models/v1.16.0/sklearn/moviesentiment\" resources : requests : cpu : 0.1 memory : 1Gi limits : cpu : 1 memory : 1Gi explainer : minReplicas : 1 alibi : type : AnchorText resources : requests : cpu : 0.1 memory : 6Gi limits : memory : 6Gi Create this InferenceService: kubectl kubectl create -f moviesentiment.yaml","title":"Deploy InferenceService with AnchorText Explainer"},{"location":"modelserving/explainer/alibi/moviesentiment/#run-inference-and-explanation","text":"Set up some environment variables for the model name and cluster entrypoint. 
MODEL_NAME = moviesentiment SERVICE_HOSTNAME = $( kubectl get inferenceservice moviesentiment -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) Test the predictor on an example sentence: curl -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict -d '{\"instances\":[\"a visually flashy but narratively opaque and emotionally vapid exercise .\"]}' You should receive the response showing negative sentiment: Expected Output { \"predictions\" : [ 0 ]} Test on another sentence: curl -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict -d '{\"instances\":[\"a touching , sophisticated film that almost seems like a documentary in the way it captures an italian immigrant family on the brink of major changes .\"]}' You should receive the response showing positive sentiment: Expected Output { \"predictions\" : [ 1 ]} Now lets get an explanation for the first sentence: curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :explain -d '{\"instances\":[\"a visually flashy but narratively opaque and emotionally vapid exercise .\"]}' Expected Output { \"names\" : [ \"exercise\" ], \"precision\" : 1 , \"converage\" : 0.5005 , \"raw\" : { \"feature\" : [ 9 ], \"mean\" : [ 1 ], \"precision\" : [ 1 ], \"coverage\" : [ 0.5005 ], \"examples\" : [ { \"covered\" : [ [ \"a visually UNK UNK UNK opaque and emotionally vapid exercise UNK\" ], [ \"a visually flashy but UNK UNK and emotionally UNK exercise .\" ], [ \"a visually flashy but narratively UNK UNK UNK UNK exercise .\" ], [ \"UNK UNK flashy UNK narratively opaque UNK UNK vapid exercise .\" ], [ \"UNK visually UNK UNK UNK UNK and UNK vapid exercise UNK\" ], [ \"UNK UNK UNK but UNK opaque UNK emotionally UNK exercise UNK\" ], [ \"a UNK flashy 
UNK UNK UNK and emotionally vapid exercise .\" ], [ \"UNK UNK flashy UNK narratively opaque UNK emotionally UNK exercise .\" ], [ \"UNK UNK flashy UNK narratively opaque UNK UNK vapid exercise UNK\" ], [ \"a visually UNK but narratively opaque UNK UNK vapid exercise UNK\" ] ], \"covered_true\" : [ [ \"UNK visually flashy but UNK UNK and emotionally vapid exercise .\" ], [ \"UNK visually UNK UNK UNK UNK and UNK UNK exercise .\" ], [ \"a UNK UNK UNK narratively opaque UNK UNK UNK exercise UNK\" ], [ \"a visually UNK UNK narratively opaque UNK UNK UNK exercise UNK\" ], [ \"a UNK UNK UNK UNK UNK and emotionally vapid exercise UNK\" ], [ \"a UNK flashy UNK narratively UNK and UNK vapid exercise UNK\" ], [ \"UNK visually UNK UNK narratively UNK and emotionally UNK exercise .\" ], [ \"UNK visually flashy UNK narratively opaque UNK emotionally UNK exercise UNK\" ], [ \"UNK UNK flashy UNK UNK UNK and UNK vapid exercise UNK\" ], [ \"a UNK flashy UNK UNK UNK and emotionally vapid exercise .\" ] ], \"covered_false\" : [], \"uncovered_true\" : [], \"uncovered_false\" : [] } ], \"all_precision\" : 0 , \"num_preds\" : 1000101 , \"names\" : [ \"exercise\" ], \"positions\" : [ 63 ], \"instance\" : \"a visually flashy but narratively opaque and emotionally vapid exercise .\" , \"prediction\" : 0 } } This shows the key word \"exercise\" was identified and examples show it in context using the default \"UNK\" placeholder for surrounding words.","title":"Run Inference and Explanation"},{"location":"modelserving/explainer/alibi/moviesentiment/#custom-configuration","text":"You can add custom configuration for the Anchor Text explainer in the 'config' section. 
For example, we can change the text explainer to sample from the corpus rather than use UKN placeholders: apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"moviesentiment\" spec : predictor : sklearn : storageUri : \"gs://seldon-models/v1.16.0/sklearn/moviesentiment\" resources : requests : cpu : 0.1 explainer : alibi : type : AnchorText config : use_unk : \"false\" sample_proba : \"0.5\" resources : requests : cpu : 0.1 If we apply this: kubectl kubectl create -f moviesentiment2.yaml and then ask for an explanation: curl -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :explain -d '{\"instances\":[\"a visually flashy but narratively opaque and emotionally vapid exercise .\"]}' Expected Output { \"names\" : [ \"exercise\" ], \"precision\" : 0.9918032786885246 , \"coverage\" : 0.5072 , \"raw\" : { \"feature\" : [ 9 ], \"mean\" : [ 0.9918032786885246 ], \"precision\" : [ 0.9918032786885246 ], \"coverage\" : [ 0.5072 ], \"examples\" : [ { \"covered\" : [ [ \"each visually playful but enormously opaque and academically vapid exercise .\" ], [ \"each academically trashy but narratively pigmented and profoundly vapid exercise .\" ], [ \"a masterfully flashy but narratively straightforward and verbally disingenuous exercise .\" ], [ \"a visually gaudy but interestingly opaque and emotionally vapid exercise .\" ], [ \"some concurrently flashy but philosophically voxel and emotionally vapid exercise .\" ], [ \"a visually flashy but delightfully sensible and emotionally snobby exercise .\" ], [ \"a surprisingly bland but fantastically seamless and hideously vapid exercise .\" ], [ \"both visually classy but nonetheless robust and musically vapid exercise .\" ], [ \"a visually fancy but narratively robust and emotionally uninformed exercise .\" ], [ \"a visually flashy but tastefully opaque and weirdly vapid exercise .\" ] ], \"covered_true\" 
: [ [ \"another visually flashy but narratively opaque and emotionally vapid exercise .\" ], [ \"the visually classy but narratively opaque and emotionally vapid exercise .\" ], [ \"the visually arty but overshadow yellowish and emotionally vapid exercise .\" ], [ \"a objectively flashy but genuinely straightforward and emotionally vapid exercise .\" ], [ \"a visually flashy but tastefully opaque and weirdly vapid exercise .\" ], [ \"a emotionally crafty but narratively opaque and emotionally vapid exercise .\" ], [ \"some similarly eclectic but narratively dainty and emotionally illogical exercise .\" ], [ \"a nicely flashy but psychologically opaque and emotionally vapid exercise .\" ], [ \"a visually flashy but narratively colorless and emotionally vapid exercise .\" ], [ \"every properly lavish but logistically opaque and someway incomprehensible exercise .\" ] ], \"covered_false\" : [ [ \"another enormously inventive but socially opaque and somewhat idiotic exercise .\" ], [ \"each visually playful but enormously opaque and academically vapid exercise .\" ] ], \"uncovered_true\" : [], \"uncovered_false\" : [] } ], \"all_precision\" : 0 , \"num_preds\" : 1000101 , \"names\" : [ \"exercise\" ], \"positions\" : [ 63 ], \"instance\" : \"a visually flashy but narratively opaque and emotionally vapid exercise .\" , \"prediction\" : 0 } }","title":"Custom Configuration"},{"location":"modelserving/explainer/alibi/moviesentiment/#run-on-notebook","text":"You can also run this example on notebook","title":"Run on Notebook"},{"location":"modelserving/inference_graph/","text":"Inference Graph \u00b6 Motivation \u00b6 ML inference systems are getting bigger and more complex, they often consist of many models to make a single prediction. Some common use cases are image classification and natural language processing pipelines. 
For example, a face recognition pipeline may need to first locate faces in a image, then compute the features of the faces to match records in a database. An NLP pipeline needs to first run document classification, then perform named entity detection downstream based on the previous classification results. KServe has unique strengths for building a distributed inference graph: an autoscaling graph router, native integration with individual InferenceServices , and a standard inference protocol for chaining models. KServe leverages these strengths to build an InferenceGraph and enable users to deploy complex ML inference pipelines to production in a declarative and scalable way. Concepts \u00b6 InferenceGraph : Made up of a list of routing Nodes , where each Node consists of a set of routing Steps . Each Step can either route to an InferenceService or another Node defined on the graph which makes the InferenceGraph highly composable. The graph router is deployed behind an HTTP endpoint and can be scaled dynamically based on request volume. The InferenceGraph supports four different types of Routing Nodes : Sequence , Switch , Ensemble , Splitter . Sequence Node : Allows users to define multiple Steps with InferenceServices or Nodes as routing targets in a sequence. The Steps are executed in sequence and the request/response from the previous step can be passed to the next step as input based on configuration. Switch Node : Enables users to define routing conditions and select a step to execute if it matches a condition. The response is returned as soon it finds the first step that matches the condition. If no condition is matched, the graph returns the original request. Ensemble Node : A model ensemble requires scoring each model separately and then combining the results into a single prediction response. You can then use different combination methods to produce the final result. 
Multiple classification trees, for example, are commonly combined using a \"majority vote\" method. Multiple regression trees are often combined using various averaging techniques. Splitter Node : Allows users to split the traffic to multiple targets using a weighted distribution. Features \u00b6 Headers Propagation \u00b6 If you want Inference Graph's router to propagate the headers, you passed in the request to Inference Graph, to all the steps in your graph then you can do so using inferenceservice-config config-map in kserve namespace. For example: If you want to propagate a certain header, say \"Custom-Header\", then you can edit the router section of inferenceservice-config config-map like this : { \"image\" : \"kserve/router:v0.11.0\" , \"memoryRequest\" : \"100Mi\" , \"memoryLimit\" : \"1Gi\" , \"cpuRequest\" : \"100m\" , \"cpuLimit\" : \"1\" , \"headers\" : { \"propagate\" :[ \"Custom-Header\" ] } } Once you update this config-map, kserve controller will automatically reconcile Inference Graph to start propagating headers.","title":"Concept"},{"location":"modelserving/inference_graph/#inference-graph","text":"","title":"Inference Graph"},{"location":"modelserving/inference_graph/#motivation","text":"ML inference systems are getting bigger and more complex, they often consist of many models to make a single prediction. Some common use cases are image classification and natural language processing pipelines. For example, a face recognition pipeline may need to first locate faces in a image, then compute the features of the faces to match records in a database. An NLP pipeline needs to first run document classification, then perform named entity detection downstream based on the previous classification results. KServe has unique strengths for building a distributed inference graph: an autoscaling graph router, native integration with individual InferenceServices , and a standard inference protocol for chaining models. 
KServe leverages these strengths to build an InferenceGraph and enable users to deploy complex ML inference pipelines to production in a declarative and scalable way.","title":"Motivation"},{"location":"modelserving/inference_graph/#concepts","text":"InferenceGraph : Made up of a list of routing Nodes , where each Node consists of a set of routing Steps . Each Step can either route to an InferenceService or another Node defined on the graph which makes the InferenceGraph highly composable. The graph router is deployed behind an HTTP endpoint and can be scaled dynamically based on request volume. The InferenceGraph supports four different types of Routing Nodes : Sequence , Switch , Ensemble , Splitter . Sequence Node : Allows users to define multiple Steps with InferenceServices or Nodes as routing targets in a sequence. The Steps are executed in sequence and the request/response from the previous step can be passed to the next step as input based on configuration. Switch Node : Enables users to define routing conditions and select a step to execute if it matches a condition. The response is returned as soon it finds the first step that matches the condition. If no condition is matched, the graph returns the original request. Ensemble Node : A model ensemble requires scoring each model separately and then combining the results into a single prediction response. You can then use different combination methods to produce the final result. Multiple classification trees, for example, are commonly combined using a \"majority vote\" method. Multiple regression trees are often combined using various averaging techniques. 
Splitter Node : Allows users to split the traffic to multiple targets using a weighted distribution.","title":"Concepts"},{"location":"modelserving/inference_graph/#features","text":"","title":"Features"},{"location":"modelserving/inference_graph/#headers-propagation","text":"If you want Inference Graph's router to propagate the headers, you passed in the request to Inference Graph, to all the steps in your graph then you can do so using inferenceservice-config config-map in kserve namespace. For example: If you want to propagate a certain header, say \"Custom-Header\", then you can edit the router section of inferenceservice-config config-map like this : { \"image\" : \"kserve/router:v0.11.0\" , \"memoryRequest\" : \"100Mi\" , \"memoryLimit\" : \"1Gi\" , \"cpuRequest\" : \"100m\" , \"cpuLimit\" : \"1\" , \"headers\" : { \"propagate\" :[ \"Custom-Header\" ] } } Once you update this config-map, kserve controller will automatically reconcile Inference Graph to start propagating headers.","title":"Headers Propagation"},{"location":"modelserving/inference_graph/image_pipeline/","text":"Deploy Image Processing Inference pipeline with InferenceGraph \u00b6 The tutorial demonstrates how to deploy an image processing inference pipeline with multiple stages using InferenceGraph . The example chains the two models, the first model is to classify if an image is a dog or a cat, if it is a dog the second model then does the dog breed classification. InferenceGraph Flow \u00b6 In the InferenceGraph request flow, the image is encoded with base64 format and first sent to the dog-cat-classifier model, the image input for the dog-cat-classifier InferenceService are then forwarded to send to the model on the next stage to classify the breed if the previous model prediction is a dog. Deploy the individual InferenceServices \u00b6 Train the models \u00b6 You can refer to dog-cat classification and dog breed classification to train the image classifier models for different stages. 
Deploy the InferenceServices \u00b6 Before deploying the graph router with InferenceGraph custom resource, you need to first deploy the individual InferenceServices with the models trained from previous step. The models should be packaged with the following commands and then upload to your model storage along with the configuration : torch-model-archiver -f --model-name cat_dog_classification --version 1 .0 \\ --model-file cat_dog_classification_arch.py \\ --serialized-file cat_dog_classification.pth \\ --handler cat_dog_classification_handler.py \\ --extra-files index_to_name.json --export-path model_store torch-model-archiver -f --model-name dog_breed_classification --version 1 .0 \\ --model-file dog_breed_classification_arch.py \\ --serialized-file dog_breed_classification.pth \\ --handler dog_breed_classification_handler.py \\ --extra-files index_to_name.json --export-path model_store You can then deploy the models to KServe with following InferenceService custom resources. New Schema Old Schema kubectl apply -f - <= 1.2 kubectl apply -f https://github.com/knative/eventing/releases/download/knative-v1.9.7/eventing-crds.yaml kubectl apply -f https://github.com/knative/eventing/releases/download/knative-v1.9.7/eventing-core.yaml Install Kafka Event Source . kubectl apply -f https://github.com/knative-sandbox/eventing-kafka/releases/download/knative-v1.9.1/source.yaml Install InferenceService addressable cluster role cat <= 1.2 kubectl apply -f https://github.com/knative/eventing/releases/download/knative-v1.9.7/eventing-crds.yaml kubectl apply -f https://github.com/knative/eventing/releases/download/knative-v1.9.7/eventing-core.yaml Install Kafka Event Source . 
kubectl apply -f https://github.com/knative-sandbox/eventing-kafka/releases/download/knative-v1.9.1/source.yaml Install InferenceService addressable cluster role cat < $tmpdir/patch.txt [{ \"op\": \"replace\", \"path\": \"/data/storageInitializer\", \"value\": '$newValue' }] EOT # Apply the patch to the ConfigMap kubectl patch configmap -n kserve inferenceservice-config --type = json --patch-file = $tmpdir /patch.txt # Restart the KServe controller to apply changes kubectl delete pod -n kserve -l control-plane = kserve-controller-manager Prepare an OCI Image with Model Data \u00b6 To utilize Modelcars for serving models, you need to prepare an OCI (Open Container Initiative) image containing your model data. This process involves creating a Dockerfile and building an OCI image that houses your model in a specific directory. Below are the steps and an example to guide you through this process. Create a Dockerfile: Start by creating a Dockerfile that uses a base image containing the necessary commands like ln (for creating symbolic links) and sleep (for keeping the container running). The Dockerfile should also include steps to create a directory /models for your model data and copy the data into this directory. Here's an example Dockerfile where the data/ directory contains your model data. This data will later be mounted in /mnt/models by the runtime: FROM busybox RUN mkdir /models && chmod 775 /models COPY data/ /models/ Build and Push the Image to a Registry : Once your Dockerfile is ready, use either docker or podman to build and push the image to a container registry like Docker Hub or quay.io docker build -t myuser/mymodel:1.0 . docker push myuser/mymodel:1.0 By completing these steps, you'll have an OCI image ready with your model data, which can then be used with the Modelcars feature in KServe for efficient model serving. 
Using Modelcars \u00b6 With Modelcars enabled and your OCI image containing the model data prepared, integrating this setup into your InferenceService is straightforward. The key step involves specifying the storageURI with the oci:// schema in your InferenceService configuration to point to your OCI image. Here\u2019s an example of how an InferenceService configuration would look when using the Modelcars feature: apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : my-inference-service spec : predictor : model : modelFormat : name : sklearn storageUri : oci://myuser/mymodel:1.0 In order to fully leverage the local caching capabilities of OCI images in the Modelcars setup, it is crucial to use a specific tag for your model image, rather than relying on the default latest tag. For instance, in the provided example, the tag 1.0 is utilized. This approach ensures that the modelcar image is pulled with an IfNotPresent policy, allowing for efficient use of local cache. On the other hand, using the latest tag, or omitting a tag altogether, defaults to an Always pull policy. This means the image would be re-downloaded every time a Pod restarts or scales up, negating the benefits of local caching and potentially leading to increased startup times. Example \u00b6 Let's see how modelcars work by deploying the getting started example by using an OCI image and check how it is different to the startup with a storage-initializer init-container. Assuming you have set up a namespace kserve-test that is KServe enabled, create an InferenceService that uses an oci:// storage URL: kubectl apply -n kserve-test -f - < /proc/38/root/models sklearn-iris-oci-predictor:/mnt$ cd /mnt/models sklearn-iris-oci-predictor:/mnt/models$ ls -l total 8 -rw-r--r-- 1 root root 5408 Jan 26 15 :58 model.joblib As you can see, the runtime can directly access the data coming from the modelcar image, without prior copying it over in another volume. 
Configuration \u00b6 Fine-tuning the behavior of Modelcars in KServe is possible through global configuration settings for inference services. These settings are located in the inferenceservice-config ConfigMap, which resides in the kserve namespace or the namespace where the KServe controller operates. This ConfigMap includes various subconfigurations, with the Modelcars configuration located under the storageInitializer entry. To view the current configuration, use the following command: kubectl get cm -n kserve inferenceservice-config --jsonpath \"{.data.storageInitializer}\" Sample Output { \"image\" : \"kserve/storage-initializer:v0.11.2\" , \"memoryRequest\" : \"100Mi\" , \"memoryLimit\" : \"1Gi\" , \"cpuRequest\" : \"100m\" , \"cpuLimit\" : \"1\" , \"enableDirectPvcVolumeMount\" : false , \"enableModelcar\" : true , \"uidModelcar\" : 1010 } The output is a JSON string representing the configuration. For Modelcars, several keys are available for customization: Key Description Example enableModelcar Enables direct access to an OCI container image using a source URL with an \"oci://\" schema. true cpuModelcar CPU request and limit for the modelcar container. 10m memoryModelcar Memory request and limit for the modelcar container. 15Mi uidModelcar UID under which the modelcar process and the main container run. Set to 0 for root if needed. If not set, the UID of the containers is used. 1042 References \u00b6 Modelcar Design document Original GitHub issue (discusses also some alternative solutions) 12-minute demo Code walkthrough showing the implementation of Modelcars in KServe (for background information)","title":"OCI"},{"location":"modelserving/storage/oci/#serving-models-with-oci-images","text":"KServe's traditional approach for model initialization involves fetching models from sources like S3 buckets or URIs . 
This process is adequate for small models but becomes a bottleneck for larger ones, such as those used for large language models, significantly delaying startup times in auto-scaling scenarios. \"Modelcars\" is a KServe feature designed to address these challenges. It streamlines model fetching using OCI images, offering several advantages: Reduced Startup Times: By avoiding repetitive downloads of large models, startup delays are significantly minimized. Lower Disk Space Usage: The feature decreases the need for duplicated local storage, conserving disk space. Enhanced Performance: Modelcars allows for advanced techniques like pre-fetching images and lazy-loading, improving efficiency. Compatibility and Integration: It seamlessly integrates with existing KServe infrastructure, ensuring ease of adoption. Modelcars represents a step forward in efficient model serving, particularly beneficial for handling large models and dynamic serving environments.","title":"Serving models with OCI images"},{"location":"modelserving/storage/oci/#enabling-modelcars","text":"Modelcars is an experimental feature in KServe and is not enabled by default. To take advantage of this new model serving method, it needs to be activated in the KServe configuration. Follow the steps below to enable Modelcars in your environment. Note Modelcars are currently in an experimental phase. Enable this feature in a test environment first to ensure it meets your requirements before using it in a production setting. Modelcars can be enabled by modifying the storageInitializer configuration in the inferenceservice-config ConfigMap. This can be done manually using kubectl edit or by executing the script provided below, with the current namespace set to the namespace where the kserve-controller-manager is installed (this depends on how KServe is installed). 
# Script to enable Modelcars # Fetch the current storageInitializer configuration config = $( kubectl get configmap inferenceservice-config -n kserve -o jsonpath = '{.data.storageInitializer}' ) # Enable modelcars and set the UID for the containers to run (required for minikube) newValue = $( echo $config | jq -c '. + {\"enableModelcar\": true, \"uidModelcar\": 1010}' ) # Create a temporary directory for the patch file tmpdir = $( mktemp -d ) cat < $tmpdir/patch.txt [{ \"op\": \"replace\", \"path\": \"/data/storageInitializer\", \"value\": '$newValue' }] EOT # Apply the patch to the ConfigMap kubectl patch configmap -n kserve inferenceservice-config --type = json --patch-file = $tmpdir /patch.txt # Restart the KServe controller to apply changes kubectl delete pod -n kserve -l control-plane = kserve-controller-manager","title":"Enabling Modelcars"},{"location":"modelserving/storage/oci/#prepare-an-oci-image-with-model-data","text":"To utilize Modelcars for serving models, you need to prepare an OCI (Open Container Initiative) image containing your model data. This process involves creating a Dockerfile and building an OCI image that houses your model in a specific directory. Below are the steps and an example to guide you through this process. Create a Dockerfile: Start by creating a Dockerfile that uses a base image containing the necessary commands like ln (for creating symbolic links) and sleep (for keeping the container running). The Dockerfile should also include steps to create a directory /model for your model data and copy the data into this directory. Here's an example Dockerfile where the data/ directory contains your model data. 
This data will later be mounted in /mnt/models by the runtime: FROM busybox RUN mkdir /models && chmod 775 /models COPY data/ /models/ Build and Push the Image to a Registry : Once your Dockerfile is ready, use either docker or podman to build and push the image to a container registry like Docker Hub or quay.io docker build -t myuser/mymodel:1.0 . docker push myuser/mymodel:1.0 By completing these steps, you'll have an OCI image ready with your model data, which can then be used with the Modelcars feature in KServe for efficient model serving.","title":"Prepare an OCI Image with Model Data"},{"location":"modelserving/storage/oci/#using-modelcars","text":"With Modelcars enabled and your OCI image containing the model data prepared, integrating this setup into your InferenceService is straightforward. The key step involves specifying the storageUri with the oci:// scheme in your InferenceService configuration to point to your OCI image. Here\u2019s an example of how an InferenceService configuration would look when using the Modelcars feature: apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : my-infeference-service spec : predictor : model : modelFormat : name : sklearn storageUri : oci://myuser/mymodel:1.0 In order to fully leverage the local caching capabilities of OCI images in the Modelcars setup, it is crucial to use a specific tag for your model image, rather than relying on the default latest tag. For instance, in the provided example, the tag 1.0 is utilized. This approach ensures that the modelcar image is pulled with an IfNotPresent policy, allowing for efficient use of the local cache. On the other hand, using the latest tag, or omitting a tag altogether, defaults to an Always pull policy. 
This means the image would be re-downloaded every time a Pod restarts or scales up, negating the benefits of local caching and potentially leading to increased startup times.","title":"Using Modelcars"},{"location":"modelserving/storage/oci/#example","text":"Let's see how Modelcars work by deploying the getting started example using an OCI image and checking how it differs from the startup with a storage-initializer init-container. Assuming you have set up a namespace kserve-test that is KServe enabled, create an InferenceService that uses an oci:// storage URL: kubectl apply -n kserve-test -f - < /proc/38/root/models sklearn-iris-oci-predictor:/mnt$ cd /mnt/models sklearn-iris-oci-predictor:/mnt/models$ ls -l total 8 -rw-r--r-- 1 root root 5408 Jan 26 15 :58 model.joblib As you can see, the runtime can directly access the data coming from the modelcar image, without first copying it over to another volume.","title":"Example"},{"location":"modelserving/storage/oci/#configuration","text":"Fine-tuning the behavior of Modelcars in KServe is possible through global configuration settings for inference services. These settings are located in the inferenceservice-config ConfigMap, which resides in the kserve namespace or the namespace where the KServe controller operates. This ConfigMap includes various subconfigurations, with the Modelcars configuration located under the storageInitializer entry. To view the current configuration, use the following command: kubectl get cm -n kserve inferenceservice-config --jsonpath \"{.data.storageInitializer}\" Sample Output { \"image\" : \"kserve/storage-initializer:v0.11.2\" , \"memoryRequest\" : \"100Mi\" , \"memoryLimit\" : \"1Gi\" , \"cpuRequest\" : \"100m\" , \"cpuLimit\" : \"1\" , \"enableDirectPvcVolumeMount\" : false , \"enableModelcar\" : true , \"uidModelcar\" : 1010 } The output is a JSON string representing the configuration. 
For Modelcars, several keys are available for customization: Key Description Example enableModelcar Enables direct access to an OCI container image using a source URL with an \"oci://\" scheme. true cpuModelcar CPU request and limit for the modelcar container. 10m memoryModelcar Memory request and limit for the modelcar container. 15Mi uidModelcar UID under which the modelcar process and the main container run. Set to 0 for root if needed. If not set, the UID of the containers is used. 1042","title":"Configuration"},{"location":"modelserving/storage/oci/#references","text":"Modelcar Design document Original GitHub issue (also discusses some alternative solutions) 12-minute demo Code walkthrough showing the implementation of Modelcars in KServe (for background information)","title":"References"},{"location":"modelserving/storage/storagecontainers/","text":"Storage Containers \u00b6 KServe downloads models using a storage initializer (initContainer). For example, this is the default storage initializer implementation . KServe introduced ClusterStorageContainer CRD in 0.11 which allows users to specify a custom container spec for a list of supported URI formats. A ClusterStorageContainer defines the container spec for one or more storage URI formats. Here is an example of a ClusterStorageContainer that corresponds to the default storage initializer. Note that this is included in the helm chart . 
apiVersion : \"serving.kserve.io/v1alpha1\" kind : ClusterStorageContainer metadata : name : default spec : container : name : storage-initializer image : kserve/storage-initializer:latest resources : requests : memory : 100Mi cpu : 100m limits : memory : 1Gi cpu : \"1\" supportedUriFormats : - prefix : gs:// - prefix : s3:// - prefix : hdfs:// - prefix : webhdfs:// - regex : \"https://(.+?).blob.core.windows.net/(.+)\" - regex : \"https://(.+?).file.core.windows.net/(.+)\" - regex : \"https?://(.+)/(.+)\" In a ClusterStorageContainer spec, you can specify container resource requests and limits, and a list of supported URI formats that this image supports. KServe can match the URI either with prefix or regex . Warning If a storage URI is supported by two or more ClusterStorageContainer CRs, there is no guarantee which one will be used. Please make sure that the URI format is only supported by one ClusterStorageContainer CR . Custom Protocol Example \u00b6 If you would like to use a custom protocol model-registry:// , for example, you can create a custom image and add a new ClusterStorageContainer CR to make it available to KServe. Create the Custom Storage Initializer Image \u00b6 The first step is to create a custom container image that will be injected into the KServe deployment, as init container, and that will be in charge to download the model. The only requirement is that the Entrypoint of this container image should take (and properly manage) 2 positional arguments: 1. Source URI : identifies the storageUri set in the InferenceService 2. Destination Path : the location where the model should be stored, e.g., /mnt/models Note KServe controller will take care of properly injecting your container image and invoking it with those proper arguments. A more concrete example can be found here , where the storage initializer query an existing model registry service in order to retrieve the original location of the model that the user requested to deploy. 
Create the ClusterStorageContainer CR \u00b6 Once the Custom Storage Initializer image is ready, you just need to create a new ClusterStorageContainer CR to make it available in the cluster. You just need to provide 2 essential information: 1. The container spec definition , this is strictly dependent on your own custom storage initializer image. 2. The supported uri formats for which your custom storage initializer should be injected, in this case just model-registry:// . kubectl kubectl apply -f - < POST /v1/models/sklearn-azure:predict HTTP/1.1 > Host: sklearn-azure.default.example.com > User-Agent: curl/7.68.0 > Accept: */* > Content-Length: 84 > Content-Type: application/x-www-form-urlencoded > * upload completely sent off: 84 out of 84 bytes * Mark bundle as not supporting multiuse < HTTP/1.1 200 OK < content-length: 23 < content-type: application/json ; charset = UTF-8 < date: Mon, 20 Sep 2021 04 :55:50 GMT < server: istio-envoy < x-envoy-upstream-service-time: 6 < * Connection #0 to host localhost left intact { \"predictions\" : [ 1 , 1 ]}","title":"Azure"},{"location":"modelserving/storage/azure/azure/#deploy-inferenceservice-with-saved-model-on-azure","text":"","title":"Deploy InferenceService with saved model on Azure"},{"location":"modelserving/storage/azure/azure/#using-public-azure-blobs","text":"By default, KServe uses anonymous client to download artifacts. To point to an Azure Blob, specify StorageUri to point to an Azure Blob Storage with the format: https://{$STORAGE_ACCOUNT_NAME}.blob.core.windows.net/{$CONTAINER}/{$PATH} e.g. 
https://modelstoreaccount.blob.core.windows.net/model-store/model.joblib","title":"Using Public Azure Blobs"},{"location":"modelserving/storage/azure/azure/#using-private-blobs","text":"KServe supports authenticating using an Azure Service Principle.","title":"Using Private Blobs"},{"location":"modelserving/storage/azure/azure/#create-an-authorized-azure-service-principle","text":"To create an Azure Service Principle follow the steps here . Assign the SP the Storage Blob Data Owner role on your blob (KServe needs this permission as it needs to list contents at the blob path to filter items to download). Details on assigning storage roles here . az ad sp create-for-rbac --name model-store-sp --role \"Storage Blob Data Owner\" \\ --scopes /subscriptions/2662a931-80ae-46f4-adc7-869c1f2bcabf/resourceGroups/cognitive/providers/Microsoft.Storage/storageAccounts/modelstoreaccount","title":"Create an authorized Azure Service Principle"},{"location":"modelserving/storage/azure/azure/#create-azure-secret-and-attach-to-service-account","text":"","title":"Create Azure Secret and attach to Service Account"},{"location":"modelserving/storage/azure/azure/#create-azure-secret","text":"yaml apiVersion : v1 kind : Secret metadata : name : azcreds type : Opaque stringData : # use `stringData` for raw credential string or `data` for base64 encoded string AZ_CLIENT_ID : xxxxx AZ_CLIENT_SECRET : xxxxx AZ_SUBSCRIPTION_ID : xxxxx AZ_TENANT_ID : xxxxx","title":"Create Azure secret"},{"location":"modelserving/storage/azure/azure/#attach-secret-to-a-service-account","text":"yaml apiVersion : v1 kind : ServiceAccount metadata : name : sa secrets : - name : azcreds kubectl kubectl apply -f create-azure-secret.yaml","title":"Attach secret to a service account"},{"location":"modelserving/storage/azure/azure/#deploy-the-model-on-azure-with-inferenceservice","text":"Create the InferenceService with the azure storageUri and the service account with azure credential attached. 
New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"sklearn-azure\" spec : predictor : serviceAccountName : sa model : modelFormat : name : sklearn storageUri : \"https://modelstoreaccount.blob.core.windows.net/model-store/model.joblib\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"sklearn-azure\" spec : predictor : serviceAccountName : sa sklearn : storageUri : \"https://modelstoreaccount.blob.core.windows.net/model-store/model.joblib\" Apply the sklearn-azure.yaml . kubectl kubectl apply -f sklearn-azure.yaml","title":"Deploy the model on Azure with InferenceService"},{"location":"modelserving/storage/azure/azure/#run-a-prediction","text":"Now, the ingress can be accessed at ${INGRESS_HOST}:${INGRESS_PORT} or follow this instruction to find out the ingress IP and port. SERVICE_HOSTNAME = $( kubectl get inferenceservice sklearn-azure -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) MODEL_NAME = sklearn-azure INPUT_PATH = @./input.json curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict -d $INPUT_PATH Expected Output * Trying 127 .0.0.1:8080... 
* TCP_NODELAY set * Connected to localhost ( 127 .0.0.1 ) port 8080 ( #0) > POST /v1/models/sklearn-azure:predict HTTP/1.1 > Host: sklearn-azure.default.example.com > User-Agent: curl/7.68.0 > Accept: */* > Content-Length: 84 > Content-Type: application/x-www-form-urlencoded > * upload completely sent off: 84 out of 84 bytes * Mark bundle as not supporting multiuse < HTTP/1.1 200 OK < content-length: 23 < content-type: application/json ; charset = UTF-8 < date: Mon, 20 Sep 2021 04 :55:50 GMT < server: istio-envoy < x-envoy-upstream-service-time: 6 < * Connection #0 to host localhost left intact { \"predictions\" : [ 1 , 1 ]}","title":"Run a prediction"},{"location":"modelserving/storage/gcs/gcs/","text":"Deploy InferenceService with a saved model on Google Cloud Storage (GCS) \u00b6 Using Public GCS Bucket \u00b6 If no credential is provided, anonymous client will be used to download the artifact from GCS bucket. The uri is in the following format: gs://${BUCKET_ NAME}/${PATH} e.g. gs://kfserving-examples/models/tensorflow/flowers Using Private GCS bucket \u00b6 KServe supports authenticating using Google Service Account Key Create a Service Account Key \u00b6 To create a Service Account Key follow the steps here . 
Base64 encode the generated Service Account Key file Create Google Secret \u00b6 Create secret \u00b6 yaml apiVersion : v1 kind : Secret metadata : name : storage-config type : Opaque stringData : gcs : | { \"type\": \"gs\", \"bucket\": \"mlpipeline\", \"base64_service_account\": \"c2VydmljZWFjY291bnQ=\" # base64 encoded value of the credential file } kubectl kubectl apply -f create-gcs-secret.yaml Deploy the model on GCS with InferenceService \u00b6 Create the InferenceService with the Google service account credential yaml apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : sklearn-gcs spec : predictor : sklearn : storage : key : gcs path : models/tensorflow/flowers parameters : # Parameters to override the default values bucket : kfserving-examples Apply the sklearn-gcs.yaml . kubectl kubectl apply -f sklearn-gcs.yaml Run a prediction \u00b6 Now, the ingress can be accessed at ${INGRESS_HOST}:${INGRESS_PORT} or follow this instruction to find out the ingress IP and port. SERVICE_HOSTNAME = $( kubectl get inferenceservice sklearn-gcs -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) MODEL_NAME = sklearn-gcs INPUT_PATH = @./input.json curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict -d $INPUT_PATH Expected Output * Trying 127 .0.0.1:8080... 
* TCP_NODELAY set * Connected to localhost ( 127 .0.0.1 ) port 8080 ( #0) > POST /v1/models/sklearn-gcs:predict HTTP/1.1 > Host: sklearn-gcs.default.example.com > User-Agent: curl/7.68.0 > Accept: */* > Content-Length: 84 > Content-Type: application/x-www-form-urlencoded > * upload completely sent off: 84 out of 84 bytes * Mark bundle as not supporting multiuse < HTTP/1.1 200 OK < content-length: 23 < content-type: application/json ; charset = UTF-8 < date: Mon, 20 Sep 2021 04 :55:50 GMT < server: istio-envoy < x-envoy-upstream-service-time: 6 < * Connection #0 to host localhost left intact { \"predictions\" : [ 1 , 1 ]}","title":"GCS"},{"location":"modelserving/storage/gcs/gcs/#deploy-inferenceservice-with-a-saved-model-on-google-cloud-storage-gcs","text":"","title":"Deploy InferenceService with a saved model on Google Cloud Storage (GCS)"},{"location":"modelserving/storage/gcs/gcs/#using-public-gcs-bucket","text":"If no credential is provided, anonymous client will be used to download the artifact from GCS bucket. The uri is in the following format: gs://${BUCKET_ NAME}/${PATH} e.g. gs://kfserving-examples/models/tensorflow/flowers","title":"Using Public GCS Bucket"},{"location":"modelserving/storage/gcs/gcs/#using-private-gcs-bucket","text":"KServe supports authenticating using Google Service Account Key","title":"Using Private GCS bucket"},{"location":"modelserving/storage/gcs/gcs/#create-a-service-account-key","text":"To create a Service Account Key follow the steps here . 
Base64 encode the generated Service Account Key file","title":"Create a Service Account Key"},{"location":"modelserving/storage/gcs/gcs/#create-google-secret","text":"","title":"Create Google Secret"},{"location":"modelserving/storage/gcs/gcs/#create-secret","text":"yaml apiVersion : v1 kind : Secret metadata : name : storage-config type : Opaque stringData : gcs : | { \"type\": \"gs\", \"bucket\": \"mlpipeline\", \"base64_service_account\": \"c2VydmljZWFjY291bnQ=\" # base64 encoded value of the credential file } kubectl kubectl apply -f create-gcs-secret.yaml","title":"Create secret"},{"location":"modelserving/storage/gcs/gcs/#deploy-the-model-on-gcs-with-inferenceservice","text":"Create the InferenceService with the Google service account credential yaml apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : sklearn-gcs spec : predictor : sklearn : storage : key : gcs path : models/tensorflow/flowers parameters : # Parameters to override the default values bucket : kfserving-examples Apply the sklearn-gcs.yaml . kubectl kubectl apply -f sklearn-gcs.yaml","title":"Deploy the model on GCS with InferenceService"},{"location":"modelserving/storage/gcs/gcs/#run-a-prediction","text":"Now, the ingress can be accessed at ${INGRESS_HOST}:${INGRESS_PORT} or follow this instruction to find out the ingress IP and port. SERVICE_HOSTNAME = $( kubectl get inferenceservice sklearn-gcs -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) MODEL_NAME = sklearn-gcs INPUT_PATH = @./input.json curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict -d $INPUT_PATH Expected Output * Trying 127 .0.0.1:8080... 
* TCP_NODELAY set * Connected to localhost ( 127 .0.0.1 ) port 8080 ( #0) > POST /v1/models/sklearn-gcs:predict HTTP/1.1 > Host: sklearn-gcs.default.example.com > User-Agent: curl/7.68.0 > Accept: */* > Content-Length: 84 > Content-Type: application/x-www-form-urlencoded > * upload completely sent off: 84 out of 84 bytes * Mark bundle as not supporting multiuse < HTTP/1.1 200 OK < content-length: 23 < content-type: application/json ; charset = UTF-8 < date: Mon, 20 Sep 2021 04 :55:50 GMT < server: istio-envoy < x-envoy-upstream-service-time: 6 < * Connection #0 to host localhost left intact { \"predictions\" : [ 1 , 1 ]}","title":"Run a prediction"},{"location":"modelserving/storage/pvc/pvc/","text":"Deploy InferenceService with a saved model on PVC \u00b6 This doc shows how to store a model in PVC and create InferenceService with a saved model on PVC. Create PV and PVC \u00b6 Refer to the document to create Persistent Volume (PV) and Persistent Volume Claim (PVC), the PVC will be used to store model. This document uses local PV. yaml apiVersion : v1 kind : PersistentVolume metadata : name : task-pv-volume labels : type : local spec : storageClassName : manual capacity : storage : 2Gi accessModes : - ReadWriteOnce hostPath : path : \"/home/ubuntu/mnt/data\" --- apiVersion : v1 kind : PersistentVolumeClaim metadata : name : task-pv-claim spec : storageClassName : manual accessModes : - ReadWriteOnce resources : requests : storage : 1Gi kubectl kubectl apply -f pv-and-pvc.yaml Copy model to PV \u00b6 Run pod model-store-pod and login into container model-store . 
yaml apiVersion : v1 kind : Pod metadata : name : model-store-pod spec : volumes : - name : model-store persistentVolumeClaim : claimName : task-pv-claim containers : - name : model-store image : ubuntu command : [ \"sleep\" ] args : [ \"infinity\" ] volumeMounts : - mountPath : \"/pv\" name : model-store resources : limits : memory : \"1Gi\" cpu : \"1\" kubectl kubectl apply -f pv-model-store.yaml kubectl exec -it model-store-pod -- bash In a different terminal, copy the model from your local machine into the PV, then delete model-store-pod . kubectl kubectl cp model.joblib model-store-pod:/pv/model.joblib -c model-store kubectl delete pod model-store-pod Deploy InferenceService with models on PVC \u00b6 Update the ${PVC_NAME} to the created PVC name and create the InferenceService with the PVC storageUri . New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"sklearn-pvc\" spec : predictor : model : modelFormat : name : sklearn storageUri : \"pvc://${PVC_NAME}/model.joblib\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"sklearn-pvc\" spec : predictor : sklearn : storageUri : \"pvc://${PVC_NAME}/model.joblib\" Apply the sklearn-pvc.yaml . kubectl kubectl apply -f sklearn-pvc.yaml Run a prediction \u00b6 Now, the ingress can be accessed at ${INGRESS_HOST}:${INGRESS_PORT} or follow this instruction to find out the ingress IP and port. SERVICE_HOSTNAME = $( kubectl get inferenceservice sklearn-pvc -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) MODEL_NAME = sklearn-pvc INPUT_PATH = @./input.json curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict -d $INPUT_PATH Expected Output * Trying 127 .0.0.1:8080... 
* TCP_NODELAY set * Connected to localhost ( 127 .0.0.1 ) port 8080 ( #0) > POST /v1/models/sklearn-pvc:predict HTTP/1.1 > Host: sklearn-pvc.default.example.com > User-Agent: curl/7.68.0 > Accept: */* > Content-Length: 84 > Content-Type: application/x-www-form-urlencoded > * upload completely sent off: 84 out of 84 bytes * Mark bundle as not supporting multiuse < HTTP/1.1 200 OK < content-length: 23 < content-type: application/json ; charset = UTF-8 < date: Mon, 20 Sep 2021 04 :55:50 GMT < server: istio-envoy < x-envoy-upstream-service-time: 6 < * Connection #0 to host localhost left intact { \"predictions\" : [ 1 , 1 ]}","title":"PVC"},{"location":"modelserving/storage/pvc/pvc/#deploy-inferenceservice-with-a-saved-model-on-pvc","text":"This doc shows how to store a model in PVC and create InferenceService with a saved model on PVC.","title":"Deploy InferenceService with a saved model on PVC"},{"location":"modelserving/storage/pvc/pvc/#create-pv-and-pvc","text":"Refer to the document to create Persistent Volume (PV) and Persistent Volume Claim (PVC), the PVC will be used to store model. This document uses local PV. yaml apiVersion : v1 kind : PersistentVolume metadata : name : task-pv-volume labels : type : local spec : storageClassName : manual capacity : storage : 2Gi accessModes : - ReadWriteOnce hostPath : path : \"/home/ubuntu/mnt/data\" --- apiVersion : v1 kind : PersistentVolumeClaim metadata : name : task-pv-claim spec : storageClassName : manual accessModes : - ReadWriteOnce resources : requests : storage : 1Gi kubectl kubectl apply -f pv-and-pvc.yaml","title":"Create PV and PVC"},{"location":"modelserving/storage/pvc/pvc/#copy-model-to-pv","text":"Run pod model-store-pod and login into container model-store . 
yaml apiVersion : v1 kind : Pod metadata : name : model-store-pod spec : volumes : - name : model-store persistentVolumeClaim : claimName : task-pv-claim containers : - name : model-store image : ubuntu command : [ \"sleep\" ] args : [ \"infinity\" ] volumeMounts : - mountPath : \"/pv\" name : model-store resources : limits : memory : \"1Gi\" cpu : \"1\" kubectl kubectl apply -f pv-model-store.yaml kubectl exec -it model-store-pod -- bash In a different terminal, copy the model from your local machine into the PV, then delete model-store-pod . kubectl kubectl cp model.joblib model-store-pod:/pv/model.joblib -c model-store kubectl delete pod model-store-pod","title":"Copy model to PV"},{"location":"modelserving/storage/pvc/pvc/#deploy-inferenceservice-with-models-on-pvc","text":"Update the ${PVC_NAME} to the created PVC name and create the InferenceService with the PVC storageUri . New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"sklearn-pvc\" spec : predictor : model : modelFormat : name : sklearn storageUri : \"pvc://${PVC_NAME}/model.joblib\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"sklearn-pvc\" spec : predictor : sklearn : storageUri : \"pvc://${PVC_NAME}/model.joblib\" Apply the sklearn-pvc.yaml . kubectl kubectl apply -f sklearn-pvc.yaml","title":"Deploy InferenceService with models on PVC"},{"location":"modelserving/storage/pvc/pvc/#run-a-prediction","text":"Now, the ingress can be accessed at ${INGRESS_HOST}:${INGRESS_PORT} or follow this instruction to find out the ingress IP and port. 
SERVICE_HOSTNAME = $( kubectl get inferenceservice sklearn-pvc -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) MODEL_NAME = sklearn-pvc INPUT_PATH = @./input.json curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict -d $INPUT_PATH Expected Output * Trying 127 .0.0.1:8080... * TCP_NODELAY set * Connected to localhost ( 127 .0.0.1 ) port 8080 ( #0) > POST /v1/models/sklearn-pvc:predict HTTP/1.1 > Host: sklearn-pvc.default.example.com > User-Agent: curl/7.68.0 > Accept: */* > Content-Length: 84 > Content-Type: application/x-www-form-urlencoded > * upload completely sent off: 84 out of 84 bytes * Mark bundle as not supporting multiuse < HTTP/1.1 200 OK < content-length: 23 < content-type: application/json ; charset = UTF-8 < date: Mon, 20 Sep 2021 04 :55:50 GMT < server: istio-envoy < x-envoy-upstream-service-time: 6 < * Connection #0 to host localhost left intact { \"predictions\" : [ 1 , 1 ]}","title":"Run a prediction"},{"location":"modelserving/storage/s3/s3/","text":"Deploy InferenceService with a saved model on S3 \u00b6 There are two supported methods for configuring credentials for AWS S3 storage: AWS IAM Role for Service Account (Recommended) AWS IAM User Credentials Global configuration options for S3 credentials can be found in the inferenceservice configmap, and will be used as a backup if the relevant annotations aren't found on the secret or service account. Create Service Account with IAM Role \u00b6 Create an IAM Role and configure according to the AWS Documentation . KServe will read the annotations on the Service Account in order to inject the proper environment variables on the storage initializer container. 
Create Service Account \u00b6 yaml apiVersion : v1 kind : ServiceAccount metadata : name : sa annotations : eks.amazonaws.com/role-arn : arn:aws:iam::123456789012:role/s3access # replace with your IAM role ARN serving.kserve.io/s3-endpoint : s3.amazonaws.com # replace with your s3 endpoint e.g minio-service.kubeflow:9000 serving.kserve.io/s3-usehttps : \"1\" # by default 1, if testing with minio you can set to 0 serving.kserve.io/s3-region : \"us-east-2\" serving.kserve.io/s3-useanoncredential : \"false\" # omitting this is the same as false, if true will ignore provided credential and use anonymous credentials kubectl kubectl apply -f create-s3-sa.yaml Create S3 Secret and attach to Service Account \u00b6 Create a secret with your S3 user credential , KServe reads the secret annotations to inject the S3 environment variables on storage initializer or model agent to download the models from S3 storage. Create S3 secret \u00b6 yaml apiVersion : v1 kind : Secret metadata : name : s3creds annotations : serving.kserve.io/s3-endpoint : s3.amazonaws.com # replace with your s3 endpoint e.g minio-service.kubeflow:9000 serving.kserve.io/s3-usehttps : \"1\" # by default 1, if testing with minio you can set to 0 serving.kserve.io/s3-region : \"us-east-2\" serving.kserve.io/s3-useanoncredential : \"false\" # omitting this is the same as false, if true will ignore provided credential and use anonymous credentials type : Opaque stringData : # use `stringData` for raw credential string or `data` for base64 encoded string AWS_ACCESS_KEY_ID : XXXX AWS_SECRET_ACCESS_KEY : XXXXXXXX Attach secret to a service account \u00b6 yaml apiVersion : v1 kind : ServiceAccount metadata : name : sa secrets : - name : s3creds kubectl kubectl apply -f create-s3-secret.yaml Note If you are running kserve with istio sidecars enabled, there can be a race condition between the istio proxy being ready and the agent pulling models. 
This will result in a tcp dial connection refused error when the agent tries to download from s3. To resolve it, istio allows the blocking of other containers in a pod until the proxy container is ready. You can enable this by setting proxy.holdApplicationUntilProxyStarts: true in istio-sidecar-injector configmap. The proxy.holdApplicationUntilProxyStarts flag was introduced in Istio 1.7 as an experimental feature and is turned off by default. Deploy the model on S3 with InferenceService \u00b6 Create the InferenceService with the s3 storageUri and the service account with s3 credential attached. New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"mnist-s3\" spec : predictor : serviceAccountName : sa model : modelFormat : name : tensorflow storageUri : \"s3://kserve-examples/mnist\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"mnist-s3\" spec : predictor : serviceAccountName : sa tensorflow : storageUri : \"s3://kserve-examples/mnist\" Apply the mnist-s3.yaml . kubectl kubectl apply -f mnist-s3.yaml Run a prediction \u00b6 Now, the ingress can be accessed at ${INGRESS_HOST}:${INGRESS_PORT} or follow this instruction to find out the ingress IP and port. SERVICE_HOSTNAME = $( kubectl get inferenceservice mnist-s3 -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) MODEL_NAME = mnist-s3 INPUT_PATH = @./input.json curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict -d $INPUT_PATH Expected Output Note: Unnecessary use of -X or --request, POST is already inferred. * Trying 35 .237.217.209... 
* TCP_NODELAY set * Connected to mnist-s3.default.35.237.217.209.xip.io ( 35 .237.217.209 ) port 80 ( #0) > POST /v1/models/mnist-s3:predict HTTP/1.1 > Host: mnist-s3.default.35.237.217.209.xip.io > User-Agent: curl/7.55.1 > Accept: */* > Content-Length: 2052 > Content-Type: application/x-www-form-urlencoded > Expect: 100 -continue > < HTTP/1.1 100 Continue * We are completely uploaded and fine < HTTP/1.1 200 OK < content-length: 251 < content-type: application/json < date: Sun, 04 Apr 2021 20 :06:27 GMT < x-envoy-upstream-service-time: 5 < server: istio-envoy < * Connection #0 to host mnist-s3.default.35.237.217.209.xip.io left intact { \"predictions\" : [ { \"predictions\" : [ 0 .327352405, 2 .00153053e-07, 0 .0113353515, 0 .203903764, 3 .62863029e-05, 0 .416683704, 0 .000281196437, 8 .36911859e-05, 0 .0403052084, 1 .82206513e-05 ] , \"classes\" : 5 } ] }","title":"S3"},{"location":"modelserving/storage/s3/s3/#deploy-inferenceservice-with-a-saved-model-on-s3","text":"There are two supported methods for configuring credentials for AWS S3 storage: AWS IAM Role for Service Account (Recommended) AWS IAM User Credentials Global configuration options for S3 credentials can be found in the inferenceservice configmap, and will be used as a backup if the relevant annotations aren't found on the secret or service account.","title":"Deploy InferenceService with a saved model on S3"},{"location":"modelserving/storage/s3/s3/#create-service-account-with-iam-role","text":"Create an IAM Role and configure according to the AWS Documentation . 
KServe will read the annotations on the Service Account in order to inject the proper environment variables on the storage initializer container.","title":"Create Service Account with IAM Role"},{"location":"modelserving/storage/s3/s3/#create-service-account","text":"yaml apiVersion : v1 kind : ServiceAccount metadata : name : sa annotations : eks.amazonaws.com/role-arn : arn:aws:iam::123456789012:role/s3access # replace with your IAM role ARN serving.kserve.io/s3-endpoint : s3.amazonaws.com # replace with your s3 endpoint e.g minio-service.kubeflow:9000 serving.kserve.io/s3-usehttps : \"1\" # by default 1, if testing with minio you can set to 0 serving.kserve.io/s3-region : \"us-east-2\" serving.kserve.io/s3-useanoncredential : \"false\" # omitting this is the same as false, if true will ignore provided credential and use anonymous credentials kubectl kubectl apply -f create-s3-sa.yaml","title":"Create Service Account"},{"location":"modelserving/storage/s3/s3/#create-s3-secret-and-attach-to-service-account","text":"Create a secret with your S3 user credential , KServe reads the secret annotations to inject the S3 environment variables on storage initializer or model agent to download the models from S3 storage.","title":"Create S3 Secret and attach to Service Account"},{"location":"modelserving/storage/s3/s3/#create-s3-secret","text":"yaml apiVersion : v1 kind : Secret metadata : name : s3creds annotations : serving.kserve.io/s3-endpoint : s3.amazonaws.com # replace with your s3 endpoint e.g minio-service.kubeflow:9000 serving.kserve.io/s3-usehttps : \"1\" # by default 1, if testing with minio you can set to 0 serving.kserve.io/s3-region : \"us-east-2\" serving.kserve.io/s3-useanoncredential : \"false\" # omitting this is the same as false, if true will ignore provided credential and use anonymous credentials type : Opaque stringData : # use `stringData` for raw credential string or `data` for base64 encoded string AWS_ACCESS_KEY_ID : XXXX AWS_SECRET_ACCESS_KEY : 
XXXXXXXX","title":"Create S3 secret"},{"location":"modelserving/storage/s3/s3/#attach-secret-to-a-service-account","text":"yaml apiVersion : v1 kind : ServiceAccount metadata : name : sa secrets : - name : s3creds kubectl kubectl apply -f create-s3-secret.yaml Note If you are running kserve with istio sidecars enabled, there can be a race condition between the istio proxy being ready and the agent pulling models. This will result in a tcp dial connection refused error when the agent tries to download from s3. To resolve it, istio allows the blocking of other containers in a pod until the proxy container is ready. You can enable this by setting proxy.holdApplicationUntilProxyStarts: true in istio-sidecar-injector configmap. The proxy.holdApplicationUntilProxyStarts flag was introduced in Istio 1.7 as an experimental feature and is turned off by default.","title":"Attach secret to a service account"},{"location":"modelserving/storage/s3/s3/#deploy-the-model-on-s3-with-inferenceservice","text":"Create the InferenceService with the s3 storageUri and the service account with s3 credential attached. New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"mnist-s3\" spec : predictor : serviceAccountName : sa model : modelFormat : name : tensorflow storageUri : \"s3://kserve-examples/mnist\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"mnist-s3\" spec : predictor : serviceAccountName : sa tensorflow : storageUri : \"s3://kserve-examples/mnist\" Apply the mnist-s3.yaml . kubectl kubectl apply -f mnist-s3.yaml","title":"Deploy the model on S3 with InferenceService"},{"location":"modelserving/storage/s3/s3/#run-a-prediction","text":"Now, the ingress can be accessed at ${INGRESS_HOST}:${INGRESS_PORT} or follow this instruction to find out the ingress IP and port. 
SERVICE_HOSTNAME = $( kubectl get inferenceservice mnist-s3 -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) MODEL_NAME = mnist-s3 INPUT_PATH = @./input.json curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict -d $INPUT_PATH Expected Output Note: Unnecessary use of -X or --request, POST is already inferred. * Trying 35 .237.217.209... * TCP_NODELAY set * Connected to mnist-s3.default.35.237.217.209.xip.io ( 35 .237.217.209 ) port 80 ( #0) > POST /v1/models/mnist-s3:predict HTTP/1.1 > Host: mnist-s3.default.35.237.217.209.xip.io > User-Agent: curl/7.55.1 > Accept: */* > Content-Length: 2052 > Content-Type: application/x-www-form-urlencoded > Expect: 100 -continue > < HTTP/1.1 100 Continue * We are completely uploaded and fine < HTTP/1.1 200 OK < content-length: 251 < content-type: application/json < date: Sun, 04 Apr 2021 20 :06:27 GMT < x-envoy-upstream-service-time: 5 < server: istio-envoy < * Connection #0 to host mnist-s3.default.35.237.217.209.xip.io left intact { \"predictions\" : [ { \"predictions\" : [ 0 .327352405, 2 .00153053e-07, 0 .0113353515, 0 .203903764, 3 .62863029e-05, 0 .416683704, 0 .000281196437, 8 .36911859e-05, 0 .0403052084, 1 .82206513e-05 ] , \"classes\" : 5 } ] }","title":"Run a prediction"},{"location":"modelserving/storage/uri/uri/","text":"Deploy InferenceService with a saved model from a URI \u00b6 This doc guides to specify a model object via the URI (Uniform Resource Identifier) of the model object exposed via an http or https endpoint. This storageUri option supports single file models, like sklearn which is specified by a joblib file, or artifacts (e.g. tar or zip ) which contain all the necessary dependencies for other model types (e.g. tensorflow or pytorch ). Here, we'll show examples from both of the above. 
Create HTTP/HTTPS header Secret and attach to Service account \u00b6 The HTTP/HTTPS service request headers can be defined as secret and attached to service account. This is optional. yaml apiVersion : v1 kind : Secret metadata : name : mysecret type : Opaque data : https-host : ZXhhbXBsZS5jb20= headers : |- ewoiYWNjb3VudC1uYW1lIjogInNvbWVfYWNjb3VudF9uYW1lIiwKInNlY3JldC1rZXkiOiAic29tZV9zZWNyZXRfa2V5Igp9 --- apiVersion : v1 kind : ServiceAccount metadata : name : sa secrets : - name : mysecret kubectl kubectl apply -f create-uri-secret.yaml Note The serviceAccountName specified in your predictor in your inference service. These headers will be applied to any http/https requests that have the same host. The header and host should be base64 encoded format. example.com # echo -n \"example.com\" | base64 ZXhhbXBsZS5jb20= --- { \"account-name\": \"some_account_name\", \"secret-key\": \"some_secret_key\" } # echo -n '{\\n\"account-name\": \"some_account_name\",\\n\"secret-key\": \"some_secret_key\"\\n}' | base64 ewoiYWNjb3VudC1uYW1lIjogInNvbWVfYWNjb3VudF9uYW1lIiwKInNlY3JldC1rZXkiOiAic29tZV9zZWNyZXRfa2V5Igp9 Sklearn \u00b6 Train and freeze the model \u00b6 Here, we'll train a simple iris model. Please note that KServe requires sklearn==0.20.3 . python from sklearn import svm from sklearn import datasets import joblib def train ( X , y ): clf = svm . SVC ( gamma = 'auto' ) clf . fit ( X , y ) return clf def freeze ( clf , path = '../frozen' ): joblib . dump ( clf , f ' { path } /model.joblib' ) return True if __name__ == '__main__' : iris = datasets . load_iris () X , y = iris . data , iris . target clf = train ( X , y ) freeze ( clf ) Now, the frozen model object can be put it somewhere on the web to expose it. For instance, pushing the model.joblib file to some repo on GitHub. 
Specify and create the InferenceService \u00b6 New Schema Old Schema apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : sklearn-from-uri spec : predictor : model : modelFormat : name : sklearn storageUri : https://github.com/tduffy000/kfserving-uri-examples/blob/master/sklearn/frozen/model.joblib?raw=true apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : sklearn-from-uri spec : predictor : sklearn : storageUri : https://github.com/tduffy000/kfserving-uri-examples/blob/master/sklearn/frozen/model.joblib?raw=true Apply the sklearn-from-uri.yaml . kubectl kubectl apply -f sklearn-from-uri.yaml Run a prediction \u00b6 Now, the ingress can be accessed at ${INGRESS_HOST}:${INGRESS_PORT} or follow this instruction to find out the ingress IP and port. An example payload below: { \"instances\" : [ [ 6.8 , 2.8 , 4.8 , 1.4 ], [ 6.0 , 3.4 , 4.5 , 1.6 ] ] } SERVICE_HOSTNAME = $( kubectl get inferenceservice sklearn-from-uri -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) MODEL_NAME = sklearn-from-uri INPUT_PATH = @./input.json curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict -d $INPUT_PATH Expected Output $ * Trying 127 .0.0.1:8080... 
* TCP_NODELAY set * Connected to localhost ( 127 .0.0.1 ) port 8080 ( #0) > POST /v1/models/sklearn-from-uri:predict HTTP/1.1 > Host: sklearn-from-uri.default.example.com > User-Agent: curl/7.68.0 > Accept: */* > Content-Length: 76 > Content-Type: application/x-www-form-urlencoded > * upload completely sent off: 76 out of 76 bytes * Mark bundle as not supporting multiuse < HTTP/1.1 200 OK < content-length: 23 < content-type: application/json ; charset = UTF-8 < date: Mon, 06 Sep 2021 15 :52:55 GMT < server: istio-envoy < x-envoy-upstream-service-time: 7 < * Connection #0 to host localhost left intact { \"predictions\" : [ 1 , 1 ]} Tensorflow \u00b6 This will serve as an example of the ability to also pull in a tarball containing all of the required model dependencies, for instance tensorflow requires multiple files in a strict directory structure in order to be servable. Train and freeze the model \u00b6 python from sklearn import datasets import numpy as np import tensorflow as tf def _ohe ( targets ): y = np . zeros (( 150 , 3 )) for i , label in enumerate ( targets ): y [ i , label ] = 1.0 return y def train ( X , y , epochs , batch_size = 16 ): model = tf . keras . Sequential ([ tf . keras . layers . InputLayer ( input_shape = ( 4 ,)), tf . keras . layers . Dense ( 16 , activation = tf . nn . relu ), tf . keras . layers . Dense ( 16 , activation = tf . nn . relu ), tf . keras . layers . Dense ( 3 , activation = 'softmax' ) ]) model . compile ( tf . keras . optimizers . RMSprop ( learning_rate = 0.001 ), loss = 'categorical_crossentropy' , metrics = [ 'accuracy' ]) model . fit ( X , y , epochs = epochs ) return model def freeze ( model , path = '../frozen' ): model . save ( f ' { path } /0001' ) return True if __name__ == '__main__' : iris = datasets . load_iris () X , targets = iris . data , iris . target y = _ohe ( targets ) model = train ( X , y , epochs = 50 ) freeze ( model ) The post-training procedure here is a bit different. 
Instead of directly pushing the frozen output to some URI, we'll need to package them into a tarball. To do so, cd ../frozen tar -cvf artifacts.tar 0001 / gzip < artifacts.tar > artifacts.tgz Where we assume the 0001/ directory has the structure: |-- 0001/ |-- saved_model.pb |-- variables/ |--- variables.data-00000-of-00001 |--- variables.index Note Building the tarball from the directory specifying a version number is required for tensorflow . Specify and create the InferenceService \u00b6 And again, if everything went to plan we should be able to pull down the tarball and expose the endpoint. yaml apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : tensorflow-from-uri-gzip spec : predictor : tensorflow : storageUri : https://raw.githubusercontent.com/tduffy000/kfserving-uri-examples/master/tensorflow/frozen/model_artifacts.tar.gz kubectl kubectl apply -f tensorflow-from-uri-gzip.yaml Run a prediction \u00b6 Again, the ingress can be accessed at ${INGRESS_HOST}:${INGRESS_PORT} or follow this instruction to find out the ingress IP and port. An example payload below: { \"instances\" : [ [ 6.8 , 2.8 , 4.8 , 1.4 ], [ 6.0 , 3.4 , 4.5 , 1.6 ] ] } SERVICE_HOSTNAME = $( kubectl get inferenceservice tensorflow-from-uri-gzip -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) MODEL_NAME = tensorflow-from-uri-gzip INPUT_PATH = @./input.json curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict -d $INPUT_PATH Expected Output $ * Trying 10 .0.1.16... 
* TCP_NODELAY set % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0 * Connected to 10 .0.1.16 ( 10 .0.1.16 ) port 30749 ( #0) > POST /v1/models/tensorflow-from-uri:predict HTTP/1.1 > Host: tensorflow-from-uri.default.example.com > User-Agent: curl/7.58.0 > Accept: */* > Content-Length: 86 > Content-Type: application/x-www-form-urlencoded > } [ 86 bytes data ] * upload completely sent off: 86 out of 86 bytes < HTTP/1.1 200 OK < content-length: 112 < content-type: application/json < date: Thu, 06 Aug 2020 23 :21:19 GMT < x-envoy-upstream-service-time: 151 < server: istio-envoy < { [ 112 bytes data ] 100 198 100 112 100 86 722 554 --:--:-- --:--:-- --:--:-- 1285 * Connection #0 to host 10.0.1.16 left intact { \"predictions\" : [ [ 0 .0204100646, 0 .680984616, 0 .298605353 ] , [ 0 .0296604875, 0 .658412039, 0 .311927497 ] ] }","title":"URI"},{"location":"modelserving/storage/uri/uri/#deploy-inferenceservice-with-a-saved-model-from-a-uri","text":"This doc guides to specify a model object via the URI (Uniform Resource Identifier) of the model object exposed via an http or https endpoint. This storageUri option supports single file models, like sklearn which is specified by a joblib file, or artifacts (e.g. tar or zip ) which contain all the necessary dependencies for other model types (e.g. tensorflow or pytorch ). Here, we'll show examples from both of the above.","title":"Deploy InferenceService with a saved model from a URI"},{"location":"modelserving/storage/uri/uri/#create-httphttps-header-secret-and-attach-to-service-account","text":"The HTTP/HTTPS service request headers can be defined as secret and attached to service account. This is optional. 
yaml apiVersion : v1 kind : Secret metadata : name : mysecret type : Opaque data : https-host : ZXhhbXBsZS5jb20= headers : |- ewoiYWNjb3VudC1uYW1lIjogInNvbWVfYWNjb3VudF9uYW1lIiwKInNlY3JldC1rZXkiOiAic29tZV9zZWNyZXRfa2V5Igp9 --- apiVersion : v1 kind : ServiceAccount metadata : name : sa secrets : - name : mysecret kubectl kubectl apply -f create-uri-secret.yaml Note The serviceAccountName specified in your predictor in your inference service. These headers will be applied to any http/https requests that have the same host. The header and host should be base64 encoded format. example.com # echo -n \"example.com\" | base64 ZXhhbXBsZS5jb20= --- { \"account-name\": \"some_account_name\", \"secret-key\": \"some_secret_key\" } # echo -n '{\\n\"account-name\": \"some_account_name\",\\n\"secret-key\": \"some_secret_key\"\\n}' | base64 ewoiYWNjb3VudC1uYW1lIjogInNvbWVfYWNjb3VudF9uYW1lIiwKInNlY3JldC1rZXkiOiAic29tZV9zZWNyZXRfa2V5Igp9","title":"Create HTTP/HTTPS header Secret and attach to Service account"},{"location":"modelserving/storage/uri/uri/#sklearn","text":"","title":"Sklearn"},{"location":"modelserving/storage/uri/uri/#train-and-freeze-the-model","text":"Here, we'll train a simple iris model. Please note that KServe requires sklearn==0.20.3 . python from sklearn import svm from sklearn import datasets import joblib def train ( X , y ): clf = svm . SVC ( gamma = 'auto' ) clf . fit ( X , y ) return clf def freeze ( clf , path = '../frozen' ): joblib . dump ( clf , f ' { path } /model.joblib' ) return True if __name__ == '__main__' : iris = datasets . load_iris () X , y = iris . data , iris . target clf = train ( X , y ) freeze ( clf ) Now, the frozen model object can be put it somewhere on the web to expose it. 
For instance, pushing the model.joblib file to some repo on GitHub.","title":"Train and freeze the model"},{"location":"modelserving/storage/uri/uri/#specify-and-create-the-inferenceservice","text":"New Schema Old Schema apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : sklearn-from-uri spec : predictor : model : modelFormat : name : sklearn storageUri : https://github.com/tduffy000/kfserving-uri-examples/blob/master/sklearn/frozen/model.joblib?raw=true apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : sklearn-from-uri spec : predictor : sklearn : storageUri : https://github.com/tduffy000/kfserving-uri-examples/blob/master/sklearn/frozen/model.joblib?raw=true Apply the sklearn-from-uri.yaml . kubectl kubectl apply -f sklearn-from-uri.yaml","title":"Specify and create the InferenceService"},{"location":"modelserving/storage/uri/uri/#run-a-prediction","text":"Now, the ingress can be accessed at ${INGRESS_HOST}:${INGRESS_PORT} or follow this instruction to find out the ingress IP and port. An example payload below: { \"instances\" : [ [ 6.8 , 2.8 , 4.8 , 1.4 ], [ 6.0 , 3.4 , 4.5 , 1.6 ] ] } SERVICE_HOSTNAME = $( kubectl get inferenceservice sklearn-from-uri -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) MODEL_NAME = sklearn-from-uri INPUT_PATH = @./input.json curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict -d $INPUT_PATH Expected Output $ * Trying 127 .0.0.1:8080... 
* TCP_NODELAY set * Connected to localhost ( 127 .0.0.1 ) port 8080 ( #0) > POST /v1/models/sklearn-from-uri:predict HTTP/1.1 > Host: sklearn-from-uri.default.example.com > User-Agent: curl/7.68.0 > Accept: */* > Content-Length: 76 > Content-Type: application/x-www-form-urlencoded > * upload completely sent off: 76 out of 76 bytes * Mark bundle as not supporting multiuse < HTTP/1.1 200 OK < content-length: 23 < content-type: application/json ; charset = UTF-8 < date: Mon, 06 Sep 2021 15 :52:55 GMT < server: istio-envoy < x-envoy-upstream-service-time: 7 < * Connection #0 to host localhost left intact { \"predictions\" : [ 1 , 1 ]}","title":"Run a prediction"},{"location":"modelserving/storage/uri/uri/#tensorflow","text":"This will serve as an example of the ability to also pull in a tarball containing all of the required model dependencies, for instance tensorflow requires multiple files in a strict directory structure in order to be servable.","title":"Tensorflow"},{"location":"modelserving/storage/uri/uri/#train-and-freeze-the-model_1","text":"python from sklearn import datasets import numpy as np import tensorflow as tf def _ohe ( targets ): y = np . zeros (( 150 , 3 )) for i , label in enumerate ( targets ): y [ i , label ] = 1.0 return y def train ( X , y , epochs , batch_size = 16 ): model = tf . keras . Sequential ([ tf . keras . layers . InputLayer ( input_shape = ( 4 ,)), tf . keras . layers . Dense ( 16 , activation = tf . nn . relu ), tf . keras . layers . Dense ( 16 , activation = tf . nn . relu ), tf . keras . layers . Dense ( 3 , activation = 'softmax' ) ]) model . compile ( tf . keras . optimizers . RMSprop ( learning_rate = 0.001 ), loss = 'categorical_crossentropy' , metrics = [ 'accuracy' ]) model . fit ( X , y , epochs = epochs ) return model def freeze ( model , path = '../frozen' ): model . save ( f ' { path } /0001' ) return True if __name__ == '__main__' : iris = datasets . load_iris () X , targets = iris . data , iris . 
target y = _ohe ( targets ) model = train ( X , y , epochs = 50 ) freeze ( model ) The post-training procedure here is a bit different. Instead of directly pushing the frozen output to some URI, we'll need to package them into a tarball. To do so, cd ../frozen tar -cvf artifacts.tar 0001 / gzip < artifacts.tar > artifacts.tgz Where we assume the 0001/ directory has the structure: |-- 0001/ |-- saved_model.pb |-- variables/ |--- variables.data-00000-of-00001 |--- variables.index Note Building the tarball from the directory specifying a version number is required for tensorflow .","title":"Train and freeze the model"},{"location":"modelserving/storage/uri/uri/#specify-and-create-the-inferenceservice_1","text":"And again, if everything went to plan we should be able to pull down the tarball and expose the endpoint. yaml apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : tensorflow-from-uri-gzip spec : predictor : tensorflow : storageUri : https://raw.githubusercontent.com/tduffy000/kfserving-uri-examples/master/tensorflow/frozen/model_artifacts.tar.gz kubectl kubectl apply -f tensorflow-from-uri-gzip.yaml","title":"Specify and create the InferenceService"},{"location":"modelserving/storage/uri/uri/#run-a-prediction_1","text":"Again, the ingress can be accessed at ${INGRESS_HOST}:${INGRESS_PORT} or follow this instruction to find out the ingress IP and port. An example payload below: { \"instances\" : [ [ 6.8 , 2.8 , 4.8 , 1.4 ], [ 6.0 , 3.4 , 4.5 , 1.6 ] ] } SERVICE_HOSTNAME = $( kubectl get inferenceservice tensorflow-from-uri-gzip -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) MODEL_NAME = tensorflow-from-uri-gzip INPUT_PATH = @./input.json curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict -d $INPUT_PATH Expected Output $ * Trying 10 .0.1.16... 
* TCP_NODELAY set % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0 * Connected to 10 .0.1.16 ( 10 .0.1.16 ) port 30749 ( #0) > POST /v1/models/tensorflow-from-uri:predict HTTP/1.1 > Host: tensorflow-from-uri.default.example.com > User-Agent: curl/7.58.0 > Accept: */* > Content-Length: 86 > Content-Type: application/x-www-form-urlencoded > } [ 86 bytes data ] * upload completely sent off: 86 out of 86 bytes < HTTP/1.1 200 OK < content-length: 112 < content-type: application/json < date: Thu, 06 Aug 2020 23 :21:19 GMT < x-envoy-upstream-service-time: 151 < server: istio-envoy < { [ 112 bytes data ] 100 198 100 112 100 86 722 554 --:--:-- --:--:-- --:--:-- 1285 * Connection #0 to host 10.0.1.16 left intact { \"predictions\" : [ [ 0 .0204100646, 0 .680984616, 0 .298605353 ] , [ 0 .0296604875, 0 .658412039, 0 .311927497 ] ] }","title":"Run a prediction"},{"location":"modelserving/v1beta1/serving_runtime/","text":"Model Serving Runtimes \u00b6 KServe provides a simple Kubernetes CRD to enable deploying single or multiple trained models onto model serving runtimes such as TFServing , TorchServe , Triton Inference Server . In addition ModelServer is the Python model serving runtime implemented in KServe itself with prediction v1 protocol, MLServer implements the prediction v2 protocol with both REST and gRPC. These model serving runtimes are able to provide out-of-the-box model serving, but you could also choose to build your own model server for more complex use case. KServe provides basic API primitives to allow you easily build custom model serving runtime, you can use other tools like BentoML to build your custom model serving image. After models are deployed with InferenceService, you get all the following serverless features provided by KServe. 
Scale to and from Zero Request based Autoscaling on CPU/GPU Revision Management Optimized Container Batching Request/Response logging Traffic management Security with AuthN/AuthZ Distributed Tracing Out-of-the-box metrics Ingress/Egress control The table below identifies each of the model serving runtimes supported by KServe. The HTTP and gRPC columns indicate the prediction protocol version that the serving runtime supports. The KServe prediction protocol is noted as either \"v1\" or \"v2\". Some serving runtimes also support their own prediction protocol, these are noted with an * . The default serving runtime version column defines the source and version of the serving runtime - MLServer, KServe or its own. These versions can also be found in the runtime kustomization YAML . All KServe native model serving runtimes use the current KServe release version (v0.12). The supported framework version column lists the major version of the model that is supported. These can also be found in the respective runtime YAML under the supportedModelFormats field. For model frameworks using the KServe serving runtime, the specific default version can be found in kserve/python . In a given serving runtime directory the pyproject.toml file contains the exact model framework version used. For example, in kserve/python/lgbserver the pyproject.toml file sets the model framework version to 3.3.2, lightgbm ~= 3.3.2 . 
Model Serving Runtime Exported model HTTP gRPC Default Serving Runtime Version Supported Framework (Major) Version(s) Examples Custom ModelServer -- v1, v2 v2 -- -- Custom Model LightGBM MLServer Saved LightGBM Model v2 v2 v1.3.2 (MLServer) 3 LightGBM Iris V2 LightGBM ModelServer Saved LightGBM Model v1, v2 v2 v0.12 (KServe) 3 LightGBM Iris MLFlow ModelServer Saved MLFlow Model v2 v2 v1.3.2 (MLServer) 1 MLFLow wine-classifier PMML ModelServer PMML v1, v2 v2 v0.12 (KServe) 3, 4 ( PMML4.4.1 ) SKLearn PMML SKLearn MLServer Pickled Model v2 v2 v1.3.2 (MLServer) 1 SKLearn Iris V2 SKLearn ModelServer Pickled Model v1, v2 v2 v0.12 (KServe) 1.3 SKLearn Iris TFServing TensorFlow SavedModel v1 *tensorflow 2.6.2 ( TFServing Versions ) 2 TensorFlow flower TorchServe Eager Model/TorchScript v1, v2, *torchserve *torchserve 0.8.2 (TorchServe) 2 TorchServe mnist Triton Inference Server TensorFlow,TorchScript,ONNX v2 v2 23.05-py3 (Triton) 8 (TensorRT), 1, 2 (TensorFlow), 2 (PyTorch), 2 (Triton) Compatibility Matrix Torchscript cifar XGBoost MLServer Saved Model v2 v2 v1.3.2 (MLServer) 1 XGBoost Iris V2 XGBoost ModelServer Saved Model v1, v2 v2 v0.12 (KServe) 1 XGBoost Iris HuggingFace ModelServer Saved Model / Huggingface Hub Model_Id v1, v2 -- v0.12 (KServe) 4 ( Transformers ) -- HuggingFace VLLM ModelServer Saved Model / Huggingface Hub Model_Id v2 -- v0.12 (KServe) 0 ( Vllm ) -- *tensorflow - Tensorflow implements its own prediction protocol in addition to KServe's. See: Tensorflow Serving Prediction API documentation *torchserve - PyTorch implements its own prediction protocol in addition to KServe's. See: Torchserve gRPC API documentation Note The model serving runtime version can be overwritten with the runtimeVersion field on InferenceService yaml and we highly recommend setting this field for production services. 
apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"torchscript-cifar\" spec : predictor : triton : storageUri : \"gs://kfserving-examples/models/torchscript\" runtimeVersion : 21.08-py3","title":"Overview"},{"location":"modelserving/v1beta1/serving_runtime/#model-serving-runtimes","text":"KServe provides a simple Kubernetes CRD to enable deploying single or multiple trained models onto model serving runtimes such as TFServing , TorchServe , Triton Inference Server . In addition ModelServer is the Python model serving runtime implemented in KServe itself with prediction v1 protocol, MLServer implements the prediction v2 protocol with both REST and gRPC. These model serving runtimes are able to provide out-of-the-box model serving, but you could also choose to build your own model server for more complex use case. KServe provides basic API primitives to allow you easily build custom model serving runtime, you can use other tools like BentoML to build your custom model serving image. After models are deployed with InferenceService, you get all the following serverless features provided by KServe. Scale to and from Zero Request based Autoscaling on CPU/GPU Revision Management Optimized Container Batching Request/Response logging Traffic management Security with AuthN/AuthZ Distributed Tracing Out-of-the-box metrics Ingress/Egress control The table below identifies each of the model serving runtimes supported by KServe. The HTTP and gRPC columns indicate the prediction protocol version that the serving runtime supports. The KServe prediction protocol is noted as either \"v1\" or \"v2\". Some serving runtimes also support their own prediction protocol, these are noted with an * . The default serving runtime version column defines the source and version of the serving runtime - MLServer, KServe or its own. These versions can also be found in the runtime kustomization YAML . 
All KServe native model serving runtimes use the current KServe release version (v0.12). The supported framework version column lists the major version of the model that is supported. These can also be found in the respective runtime YAML under the supportedModelFormats field. For model frameworks using the KServe serving runtime, the specific default version can be found in kserve/python . In a given serving runtime directory the pyproject.toml file contains the exact model framework version used. For example, in kserve/python/lgbserver the pyproject.toml file sets the model framework version to 3.3.2, lightgbm ~= 3.3.2 . Model Serving Runtime Exported model HTTP gRPC Default Serving Runtime Version Supported Framework (Major) Version(s) Examples Custom ModelServer -- v1, v2 v2 -- -- Custom Model LightGBM MLServer Saved LightGBM Model v2 v2 v1.3.2 (MLServer) 3 LightGBM Iris V2 LightGBM ModelServer Saved LightGBM Model v1, v2 v2 v0.12 (KServe) 3 LightGBM Iris MLFlow ModelServer Saved MLFlow Model v2 v2 v1.3.2 (MLServer) 1 MLFLow wine-classifier PMML ModelServer PMML v1, v2 v2 v0.12 (KServe) 3, 4 ( PMML4.4.1 ) SKLearn PMML SKLearn MLServer Pickled Model v2 v2 v1.3.2 (MLServer) 1 SKLearn Iris V2 SKLearn ModelServer Pickled Model v1, v2 v2 v0.12 (KServe) 1.3 SKLearn Iris TFServing TensorFlow SavedModel v1 *tensorflow 2.6.2 ( TFServing Versions ) 2 TensorFlow flower TorchServe Eager Model/TorchScript v1, v2, *torchserve *torchserve 0.8.2 (TorchServe) 2 TorchServe mnist Triton Inference Server TensorFlow,TorchScript,ONNX v2 v2 23.05-py3 (Triton) 8 (TensoRT), 1, 2 (TensorFlow), 2 (PyTorch), 2 (Triton) Compatibility Matrix Torchscript cifar XGBoost MLServer Saved Model v2 v2 v1.3.2 (MLServer) 1 XGBoost Iris V2 XGBoost ModelServer Saved Model v1, v2 v2 v0.12 (KServe) 1 XGBoost Iris HuggingFace ModelServer Saved Model / Huggingface Hub Model_Id v1, v2 -- v0.12 (KServe) 4 ( Transformers ) -- HuggingFace VLLM ModelServer Saved Model / Huggingface Hub Model_Id v2 -- v0.12 
(KServe) 0 ( Vllm ) -- *tensorflow - Tensorflow implements its own prediction protocol in addition to KServe's. See: Tensorflow Serving Prediction API documentation *torchserve - PyTorch implements its own prediction protocol in addition to KServe's. See: Torchserve gRPC API documentation Note The model serving runtime version can be overwritten with the runtimeVersion field on InferenceService yaml and we highly recommend setting this field for production services. apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"torchscript-cifar\" spec : predictor : triton : storageUri : \"gs://kfserving-examples/models/torchscript\" runtimeVersion : 21.08-py3","title":"Model Serving Runtimes"},{"location":"modelserving/v1beta1/amd/","text":"AMD Inference Server \u00b6 The AMD Inference Server is an easy-to-use inferencing solution specially designed for AMD CPUs, GPUs, and FPGAs. It can be deployed as a standalone executable or on a Kubernetes cluster with KServe or used to create custom applications by linking to its C++ API. This example demonstrates how to deploy a Tensorflow GraphDef model on KServe with the AMD Inference Server to run inference on AMD EPYC CPUs . Prerequisites \u00b6 This example was tested on an Ubuntu 18.04 host machine using the Bash shell. These instructions assume: You have a machine with a modern version of Docker (>=18.09) and sufficient disk space to build the image You have a Kubernetes cluster set up KServe has been installed on the Kubernetes cluster Some familiarity with Kubernetes / KServe Refer to the installation instructions for these tools to install them if needed. Set up the image \u00b6 This example uses the AMD ZenDNN backend to run inference on TensorFlow models on AMD EPYC CPUs. Build the image \u00b6 To build a Docker image for the AMD Inference Server that uses this backend, download the TF_v2.9_ZenDNN_v3.3_C++_API.zip package from ZenDNN. 
You must agree to the EULA to download this package. You need a modern version of Docker (at least 18.09) to build this image. # clone the inference server repository git clone https://github.com/Xilinx/inference-server.git # place the downloaded ZenDNN zip in the repository mv TF_v2.9_ZenDNN_v3.3_C++_API.zip ./inference-server/ # build the image cd inference-server ./amdinfer dockerize --production --tfzendnn = ./TF_v2.9_ZenDNN_v3.3_C++_API.zip This builds an image on your host: /amdinfer:latest . To use with KServe, you need to upload this image to a Docker registry server such as on a local server . You will also need to update the YAML files in this example to use this image. More documentation for building a ZenDNN image for KServe is available: ZenDNN + AMD Inference Server and KServe + AMD Inference Server . Set up the model \u00b6 In this example, you will use an MNIST Tensorflow model . The AMD Inference Server also supports PyTorch, ONNX and Vitis AI models with the appropriate Docker images. To prepare new models, look at the KServe + AMD Inference Server documentation for more information about the expected model format. Make an inference \u00b6 The AMD Inference Server can be used in single model serving mode in KServe. The code snippets below use the environment variables INGRESS_HOST and INGRESS_PORT to make requests to the cluster. Find the ingress host and port for making requests to your cluster and set these values appropriately. Add the ClusterServingRuntime \u00b6 To use the AMD Inference Server with KServe, add it as a serving runtime . A ClusterServingRuntime configuration file is included in this example. 
To apply it: # update the kserve-amdserver.yaml to use the right image # if you have a different image name, you'll need to edit it manually sed -i \"s// $( whoami ) \\/amdinfer:latest/\" kserve-amdserver.yaml kubectl apply -f kserve-amdserver.yaml Single model serving \u00b6 Once the AMD Inference Server has been added as a serving runtime, you can start a service that uses it. # download the inference service file and input data curl -O https://raw.githubusercontent.com/kserve/website/master/docs/modelserving/v1beta1/amd/single_model.yaml curl -O https://raw.githubusercontent.com/kserve/website/master/docs/modelserving/v1beta1/amd/input.json # create the inference service kubectl apply -f single_model.yaml # wait for service to be ready kubectl wait --for = condition = ready isvc -l app = example-amdserver-runtime-isvc export SERVICE_HOSTNAME = $( kubectl get inferenceservice example-amdserver-runtime-isvc -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) Make a request with REST \u00b6 Once the service is ready, you can make requests to it. Assuming that INGRESS_HOST , INGRESS_PORT , and SERVICE_HOSTNAME have been defined as above, the following command runs an inference over REST to the example MNIST model. export MODEL_NAME = mnist export INPUT_DATA = @./input.json curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v2/models/ ${ MODEL_NAME } /infer -d ${ INPUT_DATA } This shows the response from the server in KServe's v2 API format. 
For this example, it will be similar to: Expected Output { \"id\" : \"\" , \"model_name\" : \"TFModel\" , \"outputs\" : [ { \"data\" : [ 0.11987821012735367 , 0.18648317456245422 , -0.83796119689941406 , -0.088459312915802002 , 0.030454874038696289 , 0.074872657656669617 , -1.1334009170532227 , -0.046301722526550293 , -0.31683838367462158 , 0.32014602422714233 ], \"datatype\" : \"FP32\" , \"name\" : \"input-0\" , \"parameters\" :{}, \"shape\" :[ 10 ] } ] } For MNIST, the data indicates the likely classification for the input image, which is the number 9. In this response, the index with the highest value is the last one, indicating that the image was correctly classified as nine.","title":"AMD"},{"location":"modelserving/v1beta1/amd/#amd-inference-server","text":"The AMD Inference Server is an easy-to-use inferencing solution specially designed for AMD CPUs, GPUs, and FPGAs. It can be deployed as a standalone executable or on a Kubernetes cluster with KServe or used to create custom applications by linking to its C++ API. This example demonstrates how to deploy a Tensorflow GraphDef model on KServe with the AMD Inference Server to run inference on AMD EPYC CPUs .","title":"AMD Inference Server"},{"location":"modelserving/v1beta1/amd/#prerequisites","text":"This example was tested on an Ubuntu 18.04 host machine using the Bash shell. 
These instructions assume: You have a machine with a modern version of Docker (>=18.09) and sufficient disk space to build the image You have a Kubernetes cluster set up KServe has been installed on the Kubernetes cluster Some familiarity with Kubernetes / KServe Refer to the installation instructions for these tools to install them if needed.","title":"Prerequisites"},{"location":"modelserving/v1beta1/amd/#set-up-the-image","text":"This example uses the AMD ZenDNN backend to run inference on TensorFlow models on AMD EPYC CPUs.","title":"Set up the image"},{"location":"modelserving/v1beta1/amd/#build-the-image","text":"To build a Docker image for the AMD Inference Server that uses this backend, download the TF_v2.9_ZenDNN_v3.3_C++_API.zip package from ZenDNN. You must agree to the EULA to download this package. You need a modern version of Docker (at least 18.09) to build this image. # clone the inference server repository git clone https://github.com/Xilinx/inference-server.git # place the downloaded ZenDNN zip in the repository mv TF_v2.9_ZenDNN_v3.3_C++_API.zip ./inference-server/ # build the image cd inference-server ./amdinfer dockerize --production --tfzendnn = ./TF_v2.9_ZenDNN_v3.3_C++_API.zip This builds an image on your host: /amdinfer:latest . To use with KServe, you need to upload this image to a Docker registry server such as on a local server . You will also need to update the YAML files in this example to use this image. More documentation for building a ZenDNN image for KServe is available: ZenDNN + AMD Inference Server and KServe + AMD Inference Server .","title":"Build the image"},{"location":"modelserving/v1beta1/amd/#set-up-the-model","text":"In this example, you will use an MNIST Tensorflow model . The AMD Inference Server also supports PyTorch, ONNX and Vitis AI models with the appropriate Docker images. 
To prepare new models, look at the KServe + AMD Inference Server documentation for more information about the expected model format.","title":"Set up the model"},{"location":"modelserving/v1beta1/amd/#make-an-inference","text":"The AMD Inference Server can be used in single model serving mode in KServe. The code snippets below use the environment variables INGRESS_HOST and INGRESS_PORT to make requests to the cluster. Find the ingress host and port for making requests to your cluster and set these values appropriately.","title":"Make an inference"},{"location":"modelserving/v1beta1/amd/#add-the-clusterservingruntime","text":"To use the AMD Inference Server with KServe, add it as a serving runtime . A ClusterServingRuntime configuration file is included in this example. To apply it: # update the kserve-amdserver.yaml to use the right image # if you have a different image name, you'll need to edit it manually sed -i \"s// $( whoami ) \\/amdinfer:latest/\" kserve-amdserver.yaml kubectl apply -f kserve-amdserver.yaml","title":"Add the ClusterServingRuntime"},{"location":"modelserving/v1beta1/amd/#single-model-serving","text":"Once the AMD Inference Server has been added as a serving runtime, you can start a service that uses it. # download the inference service file and input data curl -O https://raw.githubusercontent.com/kserve/website/master/docs/modelserving/v1beta1/amd/single_model.yaml curl -O https://raw.githubusercontent.com/kserve/website/master/docs/modelserving/v1beta1/amd/input.json # create the inference service kubectl apply -f single_model.yaml # wait for service to be ready kubectl wait --for = condition = ready isvc -l app = example-amdserver-runtime-isvc export SERVICE_HOSTNAME = $( kubectl get inferenceservice example-amdserver-runtime-isvc -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 )","title":"Single model serving"},{"location":"modelserving/v1beta1/amd/#make-a-request-with-rest","text":"Once the service is ready, you can make requests to it. 
Assuming that INGRESS_HOST , INGRESS_PORT , and SERVICE_HOSTNAME have been defined as above, the following command runs an inference over REST to the example MNIST model. export MODEL_NAME = mnist export INPUT_DATA = @./input.json curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v2/models/ ${ MODEL_NAME } /infer -d ${ INPUT_DATA } This shows the response from the server in KServe's v2 API format. For this example, it will be similar to: Expected Output { \"id\" : \"\" , \"model_name\" : \"TFModel\" , \"outputs\" : [ { \"data\" : [ 0.11987821012735367 , 0.18648317456245422 , -0.83796119689941406 , -0.088459312915802002 , 0.030454874038696289 , 0.074872657656669617 , -1.1334009170532227 , -0.046301722526550293 , -0.31683838367462158 , 0.32014602422714233 ], \"datatype\" : \"FP32\" , \"name\" : \"input-0\" , \"parameters\" :{}, \"shape\" :[ 10 ] } ] } For MNIST, the data indicates the likely classification for the input image, which is the number 9. In this response, the index with the highest value is the last one, indicating that the image was correctly classified as nine.","title":"Make a request with REST"},{"location":"modelserving/v1beta1/custom/custom_model/","text":"Deploy Custom Python Serving Runtime with InferenceService \u00b6 When the out-of-the-box Serving Runtime does not fit your need, you can choose to build your own model server using KServe ModelServer API to deploy as Custom Serving Runtime on KServe. Setup \u00b6 Install pack CLI to build your custom model server image. 
Create and Deploy Custom REST ServingRuntime \u00b6 Implement Custom Model using KServe API \u00b6 KServe.Model base class mainly defines three handlers preprocess , predict and postprocess , these handlers are executed in sequence, the output of the preprocess is passed to predict as the input, the predictor handler executes the inference for your model, the postprocess handler then turns the raw prediction result into user-friendly inference response. There is an additional load handler which is used for writing custom code to load your model into the memory from local file system or remote model storage, a general good practice is to call the load handler in the model server class __init__ function, so your model is loaded on startup and ready to serve prediction requests. import argparse from torchvision import models from typing import Dict , Union import torch import numpy as np from kserve import Model , ModelServer class AlexNetModel ( Model ): def __init__ ( self , name : str ): super () . __init__ ( name ) self . name = name self . load () def load ( self ): self . model = models . alexnet ( pretrained = True ) self . model . eval () self . ready = True def predict ( self , payload : Dict , headers : Dict [ str , str ] = None ) -> Dict : img_data = payload [ \"instances\" ][ 0 ][ \"image\" ][ \"b64\" ] raw_img_data = base64 . b64decode ( img_data ) input_image = Image . open ( io . BytesIO ( raw_img_data )) preprocess = transforms . Compose ([ transforms . Resize ( 256 ), transforms . CenterCrop ( 224 ), transforms . ToTensor (), transforms . Normalize ( mean = [ 0.485 , 0.456 , 0.406 ], std = [ 0.229 , 0.224 , 0.225 ]), ]) input_tensor = preprocess ( input_image ) . unsqueeze ( 0 ) output = self . model ( input_tensor ) torch . nn . functional . softmax ( output , dim = 1 ) values , top_5 = torch . topk ( output , 5 ) result = values . flatten () . 
tolist () response_id = generate_uuid () return { \"predictions\" : result } if __name__ == \"__main__\" : model = AlexNetModel ( \"custom-model\" ) ModelServer () . start ([ model ]) Build Custom Serving Image with BuildPacks \u00b6 Buildpacks allows you to transform your inference code into images that can be deployed on KServe without needing to define the Dockerfile . Buildpacks automatically determines the python application and then installs the dependencies from the requirements.txt file, it looks at the Procfile to determine how to start the model server. Here we are showing how to build the serving image manually with pack , you can also choose to use kpack to run the image build on the cloud and continuously build/deploy new versions from your source git repository. You can use pack cli to build and push the custom model server image pack build --builder = heroku/buildpacks:20 ${ DOCKER_USER } /custom-model:v1 docker push ${ DOCKER_USER } /custom-model:v1 Note: If your buildpack command fails, make sure you have a runtime.txt file with the correct python version specified. See the custom model server runtime.txt file as an example. Deploy Locally and Test \u00b6 Launch the docker image built from last step with buildpack . docker run -ePORT = 8080 -p8080:8080 ${ DOCKER_USER } /custom-model:v1 Send a test inference request locally with input.json curl -H \"Content-Type: application/json\" localhost:8080/v1/models/custom-model:predict -d @./input.json Expected Output { \"predictions\" : [[ 14.861763000488281 , 13.94291877746582 , 13.924378395080566 , 12.182709693908691 , 12.00634765625 ]]} Deploy the REST Custom Serving Runtime on KServe \u00b6 apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : custom-model spec : predictor : containers : - name : kserve-container image : ${DOCKER_USER}/custom-model:v1 In the custom.yaml file edit the container image and replace ${DOCKER_USER} with your Docker Hub username. 
Arguments \u00b6 You can supply additional command arguments on the container spec to configure the model server. --workers : Spawn the specified number of uvicorn workers(multi-processing) of the model server, the default value is 1, this option is often used to help increase the resource utilization of the container. --http_port : The http port model server is listening on, the default REST port is 8080. --model_name : The model name deployed in the model server, the default name the same as the service name. --max_asyncio_workers : Max number of workers to spawn for python async io loop, by default it is min(32,cpu.limit + 4) . --enable_latency_logging : Whether to log latency metrics per request, the default is True. --configure_logging : Whether to configure KServe and Uvicorn logging, the default is True. In this case you may want to set the KServe ModelServer 's log_config parameter to pass a dictionary containing all the logging directives and configurations (see the Python upstream docs for more info). The alternative is to use the --log_config_file argument described below. --log_config_file : The path of the Python config file configuration to use (either a json or a yaml file). This file allows to override the default Uvicorn configuration shipped with KServe. The default is None. --access_log_format : A string representing the access log format configuration to use. The functionality is provided by the asgi-logger library and it allows to override only the uvicorn.access 's format configuration with a richer set of fields (output hardcoded to stdout ). This limitation is currently due to the ASGI specs that don't describe how access logging should be implemented in detail (please refer to this Uvicorn github issue for more info). By default is None. Environment Variables \u00b6 You can supply additional environment variables on the container spec. STORAGE_URI : load a model from a storage system supported by KServe e.g. pvc:// s3:// . 
This acts the same as storageUri when using a built-in predictor. The data will be available at /mnt/models in the container. For example, the following STORAGE_URI: \"pvc://my_model/model.onnx\" will be accessible at /mnt/models/model.onnx PROTOCOL : specify the protocol version supported by the model e.g V1 . This acts the same as protocolVersion when using a built-in predictor. KSERVE_LOGLEVEL : sets the kserve and kserve_trace 's logger verbosity. Default is INFO . Apply the yaml to deploy the InferenceService on KServe kubectl kubectl apply -f custom.yaml Expected Output $ inferenceservice.serving.kserve.io/custom-model created Run a Prediction \u00b6 The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT MODEL_NAME = custom-model INPUT_PATH = @./input.json SERVICE_HOSTNAME = $( kubectl get inferenceservice ${ MODEL_NAME } -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ ${ MODEL_NAME } :predict -d $INPUT_PATH Expected Output * Trying 169 .47.250.204... 
* TCP_NODELAY set * Connected to 169 .47.250.204 ( 169 .47.250.204 ) port 80 ( #0) > POST /v1/models/custom-model:predict HTTP/1.1 > Host: custom-model.default.example.com > User-Agent: curl/7.64.1 > Accept: */* > Content-Length: 105339 > Content-Type: application/x-www-form-urlencoded > Expect: 100 -continue > < HTTP/1.1 100 Continue * We are completely uploaded and fine < HTTP/1.1 200 OK < content-length: 232 < content-type: text/html ; charset = UTF-8 < date: Wed, 26 Feb 2020 15 :19:15 GMT < server: istio-envoy < x-envoy-upstream-service-time: 213 < * Connection #0 to host 169.47.250.204 left intact { \"predictions\" : [[ 14 .861762046813965, 13 .942917823791504, 13 .9243803024292, 12 .182711601257324, 12 .00634765625 ]]} Delete the InferenceService \u00b6 kubectl delete -f custom.yaml Create and Deploy Custom gRPC ServingRuntime \u00b6 KServe gRPC ServingRuntimes enables high performance inference data plane which implements the Open(v2) Inference Protocol : gRPC is built on top of HTTP/2 for addressing the shortcomings of head-of-line-blocking and pipelining , gRPC transports binary data format with Protobuf which is efficient to send over the wire. Compared to REST it has limited support for browser and the message is not human-readable which requires additional debugging tools. Implement Custom Model using KServe API \u00b6 For Open(v2) Inference Protocol , KServe provides InferRequest and InferResponse API object for predict , preprocess , postprocess handlers to abstract away the implementation details of REST/gRPC decoding and encoding over the wire. 
model_grpc.py import io from typing import Dict import torch from kserve import InferRequest , InferResponse , InferOutput , Model , ModelServer from kserve.utils.utils import generate_uuid from PIL import Image from torchvision import models , transforms # This custom predictor example implements the custom model following KServe v2 inference gRPC protocol, # the input can be raw image bytes or image tensor which is pre-processed by transformer # and then passed to predictor, the output is the prediction response. class AlexNetModel ( Model ): def __init__ ( self , name : str ): super () . __init__ ( name ) self . name = name self . load () self . model = None self . ready = False def load ( self ): self . model = models . alexnet ( pretrained = True ) self . model . eval () self . ready = True def predict ( self , payload : InferRequest , headers : Dict [ str , str ] = None ) -> InferResponse : req = payload . inputs [ 0 ] input_image = Image . open ( io . BytesIO ( req . data [ 0 ])) preprocess = transforms . Compose ([ transforms . Resize ( 256 ), transforms . CenterCrop ( 224 ), transforms . ToTensor (), transforms . Normalize ( mean = [ 0.485 , 0.456 , 0.406 ], std = [ 0.229 , 0.224 , 0.225 ]), ]) input_tensor = preprocess ( input_image ) input_tensor = input_tensor . unsqueeze ( 0 ) output = self . model ( input_tensor ) torch . nn . functional . softmax ( output , dim = 1 ) values , top_5 = torch . topk ( output , 5 ) result = values . flatten () . tolist () response_id = generate_uuid () infer_output = InferOutput ( name = \"output-0\" , shape = list ( values . shape ), datatype = \"FP32\" , data = result ) infer_response = InferResponse ( model_name = self . name , infer_outputs = [ infer_output ], response_id = response_id ) return infer_response if __name__ == \"__main__\" : model = AlexNetModel ( \"custom-model\" ) model . load () ModelServer () . 
start ([ model ]) Build Custom Serving Image with BuildPacks \u00b6 Similar to building the REST custom image, you can also use pack cli to build and push the custom gRPC model server image pack build --builder = heroku/buildpacks:20 ${ DOCKER_USER } /custom-model-grpc:v1 docker push ${ DOCKER_USER } /custom-model-grpc:v1 Note: If your buildpack command fails, make sure you have a runtime.txt file with the correct python version specified. See the custom model server runtime.txt file as an example. Deploy Locally and Test \u00b6 Launch the docker image built from last step with buildpack . docker run -ePORT = 8081 -p8081:8081 ${ DOCKER_USER } /custom-model-grpc:v1 Send a test inference request locally using InferenceServerClient grpc_test_client.py from kserve import InferRequest , InferInput , InferenceServerClient import json import base64 import os client = InferenceServerClient ( url = os . environ . get ( \"INGRESS_HOST\" , \"localhost\" ) + \":\" + os . environ . get ( \"INGRESS_PORT\" , \"8081\" ), channel_args = (( 'grpc.ssl_target_name_override' , os . environ . get ( \"SERVICE_HOSTNAME\" , \"\" )),)) json_file = open ( \"./input.json\" ) data = json . load ( json_file ) infer_input = InferInput ( name = \"input-0\" , shape = [ 1 ], datatype = \"BYTES\" , data = [ base64 . b64decode ( data [ \"instances\" ][ 0 ][ \"image\" ][ \"b64\" ])]) request = InferRequest ( infer_inputs = [ infer_input ], model_name = \"custom-model\" ) res = client . 
infer ( infer_request = request ) print ( res ) python grpc_test_client.py Expected Output id : \"df27b8a5-f13e-4c7a-af61-20bdb55b6523\" outputs { name : \"output-0\" datatype : \"FP32\" shape : 1 shape : 5 contents { fp32_contents : 14.9756203 fp32_contents : 14.036808 fp32_contents : 13.9660349 fp32_contents : 12.2522783 fp32_contents : 12.0862684 } } model_name : \"custom-model\" id : \"df27b8a5-f13e-4c7a-af61-20bdb55b6523\" outputs { name : \"output-0\" datatype : \"FP32\" shape : 1 shape : 5 contents { fp32_contents : 14.9756203 fp32_contents : 14.036808 fp32_contents : 13.9660349 fp32_contents : 12.2522783 fp32_contents : 12.0862684 } } Deploy the gRPC Custom Serving Runtime on KServe \u00b6 Create the InferenceService yaml and expose the gRPC port by specifying on ports section, currently only one port is allowed to expose and by default HTTP port is exposed. apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : custom-model-grpc spec : predictor : containers : - name : kserve-container image : ${DOCKER_USER}/custom-model-grpc:v1 ports : - name : h2c containerPort : 8081 protocol : TCP In the custom_grpc.yaml file edit the container image and replace ${DOCKER_USER} with your Docker Hub username. Arguments \u00b6 You can supply additional command arguments on the container spec to configure the model server. --grpc_port : the gRPC port the model server is listening on, the default gRPC port is 8081. --model_name : the model name deployed in the model server, the default name is the same as the service name. --enable_latency_logging : whether to log latency metrics per request, the default is True. 
Apply the yaml to deploy the InferenceService on KServe kubectl kubectl apply -f custom_grpc.yaml Expected Output $ inferenceservice.serving.kserve.io/custom-model-grpc created Run a gRPC Prediction \u00b6 The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT MODEL_NAME = custom-model SERVICE_HOSTNAME = $( kubectl get inferenceservice custom-model-grpc -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) Send an inference request to the gRPC service using InferenceServerClient grpc_test_client.py . python grpc_test_client.py Expected Output id : \"df27b8a5-f13e-4c7a-af61-20bdb55b6523\" outputs { name : \"output-0\" datatype : \"FP32\" shape : 1 shape : 5 contents { fp32_contents : 14.9756203 fp32_contents : 14.036808 fp32_contents : 13.9660349 fp32_contents : 12.2522783 fp32_contents : 12.0862684 } } model_name : \"custom-model\" id : \"df27b8a5-f13e-4c7a-af61-20bdb55b6523\" outputs { name : \"output-0\" datatype : \"FP32\" shape : 1 shape : 5 contents { fp32_contents : 14.9756203 fp32_contents : 14.036808 fp32_contents : 13.9660349 fp32_contents : 12.2522783 fp32_contents : 12.0862684 } } Parallel Model Inference \u00b6 By default, the models are loaded in the same process and inference is executed in the same process as the HTTP or gRPC server, if you are hosting multiple models the inference can only be run for one model at a time which limits the concurrency when you share the container for the models. KServe integrates RayServe which provides a programmable API to deploy models as separate python workers so the inference can be performed in parallel when serving multiple custom models. import kserve from typing import Dict from ray import serve @serve . deployment ( name = \"custom-model\" , num_replicas = 2 ) class AlexNetModel ( kserve . Model ): def __init__ ( self ): self . name = \"custom-model\" super () . __init__ ( self . name ) self . 
load () def load ( self ): ... def predict ( self , request : Dict ) -> Dict : ... if __name__ == \"__main__\" : kserve . ModelServer () . start ({ \"custom-model\" : AlexNetModel }) fractional gpu example @serve . deployment ( name = \"custom-model\" , num_replicas = 2 , ray_actor_options = { \"num_cpus\" : 1 , \"num_gpus\" : 0.5 }) class AlexNetModel ( kserve . Model ): def __init__ ( self ): self . name = \"custom-model\" super () . __init__ ( self . name ) self . load () def load ( self ): ... def predict ( self , request : Dict ) -> Dict : ... if __name__ == \"__main__\" : ray . init ( num_cpus = 2 , num_gpus = 1 ) kserve . ModelServer () . start ({ \"custom-model\" : AlexNetModel }) The more details for ray fractional cpu and gpu can be found here . The full code example can be found here . Modify the Procfile to web: python -m model_remote and then run the above pack command, it builds the serving image which launches each model as separate python worker and web server routes to the model workers by name.","title":"How to write a custom predictor"},{"location":"modelserving/v1beta1/custom/custom_model/#deploy-custom-python-serving-runtime-with-inferenceservice","text":"When the out-of-the-box Serving Runtime does not fit your need, you can choose to build your own model server using KServe ModelServer API to deploy as Custom Serving Runtime on KServe.","title":"Deploy Custom Python Serving Runtime with InferenceService"},{"location":"modelserving/v1beta1/custom/custom_model/#setup","text":"Install pack CLI to build your custom model server image.","title":"Setup"},{"location":"modelserving/v1beta1/custom/custom_model/#create-and-deploy-custom-rest-servingruntime","text":"","title":"Create and Deploy Custom REST ServingRuntime"},{"location":"modelserving/v1beta1/custom/custom_model/#implement-custom-model-using-kserve-api","text":"KServe.Model base class mainly defines three handlers preprocess , predict and postprocess , these handlers are executed in 
sequence, the output of the preprocess is passed to predict as the input, the predictor handler executes the inference for your model, the postprocess handler then turns the raw prediction result into user-friendly inference response. There is an additional load handler which is used for writing custom code to load your model into the memory from local file system or remote model storage, a general good practice is to call the load handler in the model server class __init__ function, so your model is loaded on startup and ready to serve prediction requests. import argparse from torchvision import models from typing import Dict , Union import torch import numpy as np from kserve import Model , ModelServer class AlexNetModel ( Model ): def __init__ ( self , name : str ): super () . __init__ ( name ) self . name = name self . load () def load ( self ): self . model = models . alexnet ( pretrained = True ) self . model . eval () self . ready = True def predict ( self , payload : Dict , headers : Dict [ str , str ] = None ) -> Dict : img_data = payload [ \"instances\" ][ 0 ][ \"image\" ][ \"b64\" ] raw_img_data = base64 . b64decode ( img_data ) input_image = Image . open ( io . BytesIO ( raw_img_data )) preprocess = transforms . Compose ([ transforms . Resize ( 256 ), transforms . CenterCrop ( 224 ), transforms . ToTensor (), transforms . Normalize ( mean = [ 0.485 , 0.456 , 0.406 ], std = [ 0.229 , 0.224 , 0.225 ]), ]) input_tensor = preprocess ( input_image ) . unsqueeze ( 0 ) output = self . model ( input_tensor ) torch . nn . functional . softmax ( output , dim = 1 ) values , top_5 = torch . topk ( output , 5 ) result = values . flatten () . tolist () response_id = generate_uuid () return { \"predictions\" : result } if __name__ == \"__main__\" : model = AlexNetModel ( \"custom-model\" ) ModelServer () . 
start ([ model ])","title":"Implement Custom Model using KServe API"},{"location":"modelserving/v1beta1/custom/custom_model/#build-custom-serving-image-with-buildpacks","text":"Buildpacks allows you to transform your inference code into images that can be deployed on KServe without needing to define the Dockerfile . Buildpacks automatically determines the python application and then installs the dependencies from the requirements.txt file, it looks at the Procfile to determine how to start the model server. Here we are showing how to build the serving image manually with pack , you can also choose to use kpack to run the image build on the cloud and continuously build/deploy new versions from your source git repository. You can use pack cli to build and push the custom model server image pack build --builder = heroku/buildpacks:20 ${ DOCKER_USER } /custom-model:v1 docker push ${ DOCKER_USER } /custom-model:v1 Note: If your buildpack command fails, make sure you have a runtime.txt file with the correct python version specified. See the custom model server runtime.txt file as an example.","title":"Build Custom Serving Image with BuildPacks"},{"location":"modelserving/v1beta1/custom/custom_model/#deploy-locally-and-test","text":"Launch the docker image built from last step with buildpack . 
docker run -ePORT = 8080 -p8080:8080 ${ DOCKER_USER } /custom-model:v1 Send a test inference request locally with input.json curl -H \"Content-Type: application/json\" localhost:8080/v1/models/custom-model:predict -d @./input.json Expected Output { \"predictions\" : [[ 14.861763000488281 , 13.94291877746582 , 13.924378395080566 , 12.182709693908691 , 12.00634765625 ]]}","title":"Deploy Locally and Test"},{"location":"modelserving/v1beta1/custom/custom_model/#deploy-the-rest-custom-serving-runtime-on-kserve","text":"apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : custom-model spec : predictor : containers : - name : kserve-container image : ${DOCKER_USER}/custom-model:v1 In the custom.yaml file edit the container image and replace ${DOCKER_USER} with your Docker Hub username.","title":"Deploy the REST Custom Serving Runtime on KServe"},{"location":"modelserving/v1beta1/custom/custom_model/#arguments","text":"You can supply additional command arguments on the container spec to configure the model server. --workers : Spawn the specified number of uvicorn workers(multi-processing) of the model server, the default value is 1, this option is often used to help increase the resource utilization of the container. --http_port : The http port model server is listening on, the default REST port is 8080. --model_name : The model name deployed in the model server, the default name the same as the service name. --max_asyncio_workers : Max number of workers to spawn for python async io loop, by default it is min(32,cpu.limit + 4) . --enable_latency_logging : Whether to log latency metrics per request, the default is True. --configure_logging : Whether to configure KServe and Uvicorn logging, the default is True. In this case you may want to set the KServe ModelServer 's log_config parameter to pass a dictionary containing all the logging directives and configurations (see the Python upstream docs for more info). 
The alternative is to use the --log_config_file argument described below. --log_config_file : The path of the Python config file configuration to use (either a json or a yaml file). This file allows to override the default Uvicorn configuration shipped with KServe. The default is None. --access_log_format : A string representing the access log format configuration to use. The functionality is provided by the asgi-logger library and it allows to override only the uvicorn.access 's format configuration with a richer set of fields (output hardcoded to stdout ). This limitation is currently due to the ASGI specs that don't describe how access logging should be implemented in detail (please refer to this Uvicorn github issue for more info). By default is None.","title":"Arguments"},{"location":"modelserving/v1beta1/custom/custom_model/#environment-variables","text":"You can supply additional environment variables on the container spec. STORAGE_URI : load a model from a storage system supported by KServe e.g. pvc:// s3:// . This acts the same as storageUri when using a built-in predictor. The data will be available at /mnt/models in the container. For example, the following STORAGE_URI: \"pvc://my_model/model.onnx\" will be accessible at /mnt/models/model.onnx PROTOCOL : specify the protocol version supported by the model e.g V1 . This acts the same as protocolVersion when using a built-in predictor. KSERVE_LOGLEVEL : sets the kserve and kserve_trace 's logger verbosity. Default is INFO . 
Apply the yaml to deploy the InferenceService on KServe kubectl kubectl apply -f custom.yaml Expected Output $ inferenceservice.serving.kserve.io/custom-model created","title":"Environment Variables"},{"location":"modelserving/v1beta1/custom/custom_model/#run-a-prediction","text":"The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT MODEL_NAME = custom-model INPUT_PATH = @./input.json SERVICE_HOSTNAME = $( kubectl get inferenceservice ${ MODEL_NAME } -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ ${ MODEL_NAME } :predict -d $INPUT_PATH Expected Output * Trying 169 .47.250.204... * TCP_NODELAY set * Connected to 169 .47.250.204 ( 169 .47.250.204 ) port 80 ( #0) > POST /v1/models/custom-model:predict HTTP/1.1 > Host: custom-model.default.example.com > User-Agent: curl/7.64.1 > Accept: */* > Content-Length: 105339 > Content-Type: application/x-www-form-urlencoded > Expect: 100 -continue > < HTTP/1.1 100 Continue * We are completely uploaded and fine < HTTP/1.1 200 OK < content-length: 232 < content-type: text/html ; charset = UTF-8 < date: Wed, 26 Feb 2020 15 :19:15 GMT < server: istio-envoy < x-envoy-upstream-service-time: 213 < * Connection #0 to host 169.47.250.204 left intact { \"predictions\" : [[ 14 .861762046813965, 13 .942917823791504, 13 .9243803024292, 12 .182711601257324, 12 .00634765625 ]]}","title":"Run a Prediction"},{"location":"modelserving/v1beta1/custom/custom_model/#delete-the-inferenceservice","text":"kubectl delete -f custom.yaml","title":"Delete the InferenceService"},{"location":"modelserving/v1beta1/custom/custom_model/#create-and-deploy-custom-grpc-servingruntime","text":"KServe gRPC ServingRuntimes enables high performance inference data plane which implements the Open(v2) Inference Protocol : gRPC is built on top of HTTP/2 for addressing the shortcomings of 
head-of-line-blocking and pipelining , gRPC transports binary data format with Protobuf which is efficient to send over the wire. Compared to REST it has limited support for browser and the message is not human-readable which requires additional debugging tools.","title":"Create and Deploy Custom gRPC ServingRuntime"},{"location":"modelserving/v1beta1/custom/custom_model/#implement-custom-model-using-kserve-api_1","text":"For Open(v2) Inference Protocol , KServe provides InferRequest and InferResponse API object for predict , preprocess , postprocess handlers to abstract away the implementation details of REST/gRPC decoding and encoding over the wire. model_grpc.py import io from typing import Dict import torch from kserve import InferRequest , InferResponse , InferOutput , Model , ModelServer from kserve.utils.utils import generate_uuid from PIL import Image from torchvision import models , transforms # This custom predictor example implements the custom model following KServe v2 inference gRPC protocol, # the input can be raw image bytes or image tensor which is pre-processed by transformer # and then passed to predictor, the output is the prediction response. class AlexNetModel ( Model ): def __init__ ( self , name : str ): super () . __init__ ( name ) self . name = name self . load () self . model = None self . ready = False def load ( self ): self . model = models . alexnet ( pretrained = True ) self . model . eval () self . ready = True def predict ( self , payload : InferRequest , headers : Dict [ str , str ] = None ) -> InferResponse : req = payload . inputs [ 0 ] input_image = Image . open ( io . BytesIO ( req . data [ 0 ])) preprocess = transforms . Compose ([ transforms . Resize ( 256 ), transforms . CenterCrop ( 224 ), transforms . ToTensor (), transforms . Normalize ( mean = [ 0.485 , 0.456 , 0.406 ], std = [ 0.229 , 0.224 , 0.225 ]), ]) input_tensor = preprocess ( input_image ) input_tensor = input_tensor . unsqueeze ( 0 ) output = self . 
model ( input_tensor ) torch . nn . functional . softmax ( output , dim = 1 ) values , top_5 = torch . topk ( output , 5 ) result = values . flatten () . tolist () response_id = generate_uuid () infer_output = InferOutput ( name = \"output-0\" , shape = list ( values . shape ), datatype = \"FP32\" , data = result ) infer_response = InferResponse ( model_name = self . name , infer_outputs = [ infer_output ], response_id = response_id ) return infer_response if __name__ == \"__main__\" : model = AlexNetModel ( \"custom-model\" ) model . load () ModelServer () . start ([ model ])","title":"Implement Custom Model using KServe API"},{"location":"modelserving/v1beta1/custom/custom_model/#build-custom-serving-image-with-buildpacks_1","text":"Similar to building the REST custom image, you can also use pack cli to build and push the custom gRPC model server image pack build --builder = heroku/buildpacks:20 ${ DOCKER_USER } /custom-model-grpc:v1 docker push ${ DOCKER_USER } /custom-model-grpc:v1 Note: If your buildpack command fails, make sure you have a runtimes.txt file with the correct python version specified. See the custom model server runtime.txt file as an example.","title":"Build Custom Serving Image with BuildPacks"},{"location":"modelserving/v1beta1/custom/custom_model/#deploy-locally-and-test_1","text":"Launch the docker image built from last step with buildpack . docker run -ePORT = 8081 -p8081:8081 ${ DOCKER_USER } /custom-model-grpc:v1 Send a test inference request locally using InferenceServerClient grpc_test_client.py from kserve import InferRequest , InferInput , InferenceServerClient import json import base64 import os client = InferenceServerClient ( url = os . environ . get ( \"INGRESS_HOST\" , \"localhost\" ) + \":\" + os . environ . get ( \"INGRESS_PORT\" , \"8081\" ), channel_args = (( 'grpc.ssl_target_name_override' , os . environ . get ( \"SERVICE_HOSTNAME\" , \"\" )),)) json_file = open ( \"./input.json\" ) data = json . 
load ( json_file ) infer_input = InferInput ( name = \"input-0\" , shape = [ 1 ], datatype = \"BYTES\" , data = [ base64 . b64decode ( data [ \"instances\" ][ 0 ][ \"image\" ][ \"b64\" ])]) request = InferRequest ( infer_inputs = [ infer_input ], model_name = \"custom-model\" ) res = client . infer ( infer_request = request ) print ( res ) python grpc_test_client.py Expected Output id : \"df27b8a5-f13e-4c7a-af61-20bdb55b6523\" outputs { name : \"output-0\" datatype : \"FP32\" shape : 1 shape : 5 contents { fp32_contents : 14.9756203 fp32_contents : 14.036808 fp32_contents : 13.9660349 fp32_contents : 12.2522783 fp32_contents : 12.0862684 } } model_name : \"custom-model\" id : \"df27b8a5-f13e-4c7a-af61-20bdb55b6523\" outputs { name : \"output-0\" datatype : \"FP32\" shape : 1 shape : 5 contents { fp32_contents : 14.9756203 fp32_contents : 14.036808 fp32_contents : 13.9660349 fp32_contents : 12.2522783 fp32_contents : 12.0862684 } }","title":"Deploy Locally and Test"},{"location":"modelserving/v1beta1/custom/custom_model/#deploy-the-grpc-custom-serving-runtime-on-kserve","text":"Create the InferenceService yaml and expose the gRPC port by specifying on ports section, currently only one port is allowed to expose and by default HTTP port is exposed. apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : custom-model-grpc spec : predictor : containers : - name : kserve-container image : ${DOCKER_USER}/custom-model-grpc:v1 ports : - name : h2c containerPort : 8081 protocol : TCP In the custom_grpc.yaml file edit the container image and replace ${DOCKER_USER} with your Docker Hub username.","title":"Deploy the gRPC Custom Serving Runtime on KServe"},{"location":"modelserving/v1beta1/custom/custom_model/#arguments_1","text":"You can supply additional command arguments on the container spec to configure the model server. 
--grpc_port : the gRPC port the model server is listening on, the default gRPC port is 8081. --model_name : the model name deployed in the model server, the default name is the same as the service name. --enable_latency_logging : whether to log latency metrics per request, the default is True. Apply the yaml to deploy the InferenceService on KServe kubectl kubectl apply -f custom_grpc.yaml Expected Output $ inferenceservice.serving.kserve.io/custom-model-grpc created","title":"Arguments"},{"location":"modelserving/v1beta1/custom/custom_model/#run-a-grpc-prediction","text":"The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT MODEL_NAME = custom-model SERVICE_HOSTNAME = $( kubectl get inferenceservice custom-model-grpc -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) Send an inference request to the gRPC service using InferenceServerClient grpc_test_client.py . python grpc_test_client.py Expected Output id : \"df27b8a5-f13e-4c7a-af61-20bdb55b6523\" outputs { name : \"output-0\" datatype : \"FP32\" shape : 1 shape : 5 contents { fp32_contents : 14.9756203 fp32_contents : 14.036808 fp32_contents : 13.9660349 fp32_contents : 12.2522783 fp32_contents : 12.0862684 } } model_name : \"custom-model\" id : \"df27b8a5-f13e-4c7a-af61-20bdb55b6523\" outputs { name : \"output-0\" datatype : \"FP32\" shape : 1 shape : 5 contents { fp32_contents : 14.9756203 fp32_contents : 14.036808 fp32_contents : 13.9660349 fp32_contents : 12.2522783 fp32_contents : 12.0862684 } }","title":"Run a gRPC Prediction"},{"location":"modelserving/v1beta1/custom/custom_model/#parallel-model-inference","text":"By default, the models are loaded in the same process and inference is executed in the same process as the HTTP or gRPC server, if you are hosting multiple models the inference can only be run for one model at a time which limits the concurrency when you share the container for the models. 
KServe integrates RayServe which provides a programmable API to deploy models as separate python workers so the inference can be performed in parallel when serving multiple custom models. import kserve from typing import Dict from ray import serve @serve . deployment ( name = \"custom-model\" , num_replicas = 2 ) class AlexNetModel ( kserve . Model ): def __init__ ( self ): self . name = \"custom-model\" super () . __init__ ( self . name ) self . load () def load ( self ): ... def predict ( self , request : Dict ) -> Dict : ... if __name__ == \"__main__\" : kserve . ModelServer () . start ({ \"custom-model\" : AlexNetModel }) fractional gpu example @serve . deployment ( name = \"custom-model\" , num_replicas = 2 , ray_actor_options = { \"num_cpus\" : 1 , \"num_gpus\" : 0.5 }) class AlexNetModel ( kserve . Model ): def __init__ ( self ): self . name = \"custom-model\" super () . __init__ ( self . name ) self . load () def load ( self ): ... def predict ( self , request : Dict ) -> Dict : ... if __name__ == \"__main__\" : ray . init ( num_cpus = 2 , num_gpus = 1 ) kserve . ModelServer () . start ({ \"custom-model\" : AlexNetModel }) The more details for ray fractional cpu and gpu can be found here . The full code example can be found here . Modify the Procfile to web: python -m model_remote and then run the above pack command, it builds the serving image which launches each model as separate python worker and web server routes to the model workers by name.","title":"Parallel Model Inference"},{"location":"modelserving/v1beta1/lightgbm/","text":"Deploy LightGBM model with InferenceService \u00b6 Train a LightGBM model \u00b6 To test the LightGBM Server, first you need to train a simple LightGBM model with following python code. import lightgbm as lgb from sklearn.datasets import load_iris import os model_dir = \".\" BST_FILE = \"model.bst\" iris = load_iris () y = iris [ 'target' ] X = iris [ 'data' ] dtrain = lgb . 
Dataset ( X , label = y , feature_names = iris [ 'feature_names' ]) params = { 'objective' : 'multiclass' , 'metric' : 'softmax' , 'num_class' : 3 } lgb_model = lgb . train ( params = params , train_set = dtrain ) model_file = os . path . join ( model_dir , BST_FILE ) lgb_model . save_model ( model_file ) Deploy LightGBM model with V1 protocol \u00b6 Test the model locally \u00b6 Install and run the LightGBM Server using the trained model locally and test the prediction. python -m lgbserver --model_dir /path/to/model_dir --model_name lgb After the LightGBM Server is up locally we can then test the model by sending an inference request. import requests request = { 'sepal_width_(cm)' : { 0 : 3.5 }, 'petal_length_(cm)' : { 0 : 1.4 }, 'petal_width_(cm)' : { 0 : 0.2 }, 'sepal_length_(cm)' : { 0 : 5.1 } } formData = { 'inputs' : [ request ] } res = requests . post ( 'http://localhost:8080/v1/models/lgb:predict' , json = formData ) print ( res ) print ( res . text ) Deploy with InferenceService \u00b6 To deploy the model on Kubernetes you can create the InferenceService by specifying the modelFormat with lightgbm and storageUri . New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"lightgbm-iris\" spec : predictor : model : modelFormat : name : lightgbm storageUri : \"gs://kfserving-examples/models/lightgbm/iris\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"lightgbm-iris\" spec : predictor : lightgbm : storageUri : \"gs://kfserving-examples/models/lightgbm/iris\" Apply the above yaml to create the InferenceService kubectl apply -f lightgbm.yaml Expected Output $ inferenceservice.serving.kserve.io/lightgbm-iris created Test the deployed model \u00b6 To test the deployed model the first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT , then run the following curl command to send the inference request to the InferenceService . 
MODEL_NAME = lightgbm-iris INPUT_PATH = @./iris-input.json SERVICE_HOSTNAME = $( kubectl get inferenceservice lightgbm-iris -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict -d $INPUT_PATH Expected Output * Trying 169 .63.251.68... * TCP_NODELAY set * Connected to 169 .63.251.68 ( 169 .63.251.68 ) port 80 ( #0) > POST /models/lightgbm-iris:predict HTTP/1.1 > Host: lightgbm-iris.default.svc.cluster.local > User-Agent: curl/7.60.0 > Accept: */* > Content-Length: 76 > Content-Type: application/x-www-form-urlencoded > * upload completely sent off: 76 out of 76 bytes < HTTP/1.1 200 OK < content-length: 27 < content-type: application/json ; charset = UTF-8 < date: Tue, 21 May 2019 22 :40:09 GMT < server: istio-envoy < x-envoy-upstream-service-time: 13032 < * Connection #0 to host 169.63.251.68 left intact { \"predictions\" : [[ 0 .9, 0 .05, 0 .05 ]]} Deploy the model with Open Inference Protocol \u00b6 Test the model locally \u00b6 Once you've got your model serialized model.bst , we can then use KServe LightGBM Server to create a local model server. Note This step is optional and just meant for testing, feel free to jump straight to deploying with InferenceService . Pre-requisites \u00b6 Firstly, to use kserve lightgbm server locally, you will first need to install the lgbserver runtime package in your local environment. Clone the KServe repository and navigate into the directory. git clone https://github.com/kserve/kserve Install lgbserver runtime. KServe uses Poetry as the dependency management tool. Make sure you have already installed poetry . cd python/lgbserver poetry install Serving model locally \u00b6 The lgbserver package takes three arguments. --model_dir : The model directory path where the model is stored. 
--model_name : The name of the model deployed in the model server, the default value is model . This is optional. --nthread : Number of threads to use by LightGBM. This is optional and the default value is 1. With the lgbserver runtime package installed locally, you should now be ready to start our server as: python3 lgbserver --model_dir /path/to/model_dir --model_name lightgbm-v2-iris Deploy InferenceService with REST endpoint \u00b6 To deploy the LightGBM model with Open Inference Protocol, you need to set the protocolVersion field to v2 . Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"lightgbm-v2-iris\" spec : predictor : model : modelFormat : name : lightgbm runtime : kserve-lgbserver protocolVersion : v2 storageUri : \"gs://kfserving-examples/models/lightgbm/v2/iris\" Note For V2 protocol (open inference protocol) if runtime field is not provided then, by default mlserver runtime is used. Apply the InferenceService yaml to get the REST endpoint kubectl kubectl apply -f lightgbm-v2.yaml Expected Output $ inferenceservice.serving.kserve.io/lightgbm-v2-iris created Test the deployed model with curl \u00b6 You can now test your deployed model by sending a sample request. Note that this request needs to follow the V2 Dataplane protocol . You can see an example payload below. Create a file named iris-input-v2.json with the sample input. { \"inputs\" : [ { \"name\" : \"input-0\" , \"shape\" : [ 2 , 4 ], \"datatype\" : \"FP32\" , \"data\" : [ [ 6.8 , 2.8 , 4.8 , 1.4 ], [ 6.0 , 3.4 , 4.5 , 1.6 ] ] } ] } Now, assuming that your ingress can be accessed at ${INGRESS_HOST}:${INGRESS_PORT} or you can follow this instruction to find out your ingress IP and port. 
You can use curl to send the inference request as: SERVICE_HOSTNAME = $( kubectl get inferenceservice lightgbm-v2-iris -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v \\ -H \"Host: ${ SERVICE_HOSTNAME } \" \\ -H \"Content-Type: application/json\" \\ -d @./iris-input-v2.json \\ http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v2/models/lightgbm-v2-iris/infer Expected Output { \"model_name\" : \"lightgbm-v2-iris\" , \"model_version\" : null , \"id\" : \"96253e27-83cf-4262-b279-1bd4b18d7922\" , \"parameters\" : null , \"outputs\" :[ { \"name\" : \"predict\" , \"shape\" :[ 2 , 3 ], \"datatype\" : \"FP64\" , \"parameters\" : null , \"data\" : [ 8.796664107010673e-06 , 0.9992300031041593 , 0.0007612002317336916 , 4.974786820804187e-06 , 0.9999919650711493 , 3.0601420299625077e-06 ] } ] } Create the InferenceService with gRPC endpoint \u00b6 Create the inference service yaml and expose the gRPC port, currently only one port is allowed to expose either HTTP or gRPC port and by default HTTP port is exposed. Note Currently, KServe only supports exposing either HTTP or gRPC port. By default, HTTP port is exposed. 
Serverless RawDeployment apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"lightgbm-v2-iris-grpc\" spec : predictor : model : modelFormat : name : lightgbm protocolVersion : v2 runtime : kserve-lgbserver storageUri : \"gs://kfserving-examples/models/lightgbm/v2/iris\" ports : - name : h2c # knative expects grpc port name to be 'h2c' protocol : TCP containerPort : 8081 apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"lightgbm-v2-iris-grpc\" spec : predictor : model : modelFormat : name : lightgbm protocolVersion : v2 runtime : kserve-lgbserver storageUri : \"gs://kfserving-examples/models/lightgbm/v2/iris\" ports : - name : grpc-port # Istio requires the port name to be in the format <protocol>[-<suffix>] protocol : TCP containerPort : 8081 Note For V2 protocol (open inference protocol) if runtime field is not provided then, by default mlserver runtime is used. Apply the InferenceService yaml to get the gRPC endpoint kubectl kubectl apply -f lightgbm-v2-grpc.yaml Test the deployed model with grpcurl \u00b6 After the gRPC InferenceService becomes ready, grpcurl , can be used to send gRPC requests to the InferenceService . # download the proto file curl -O https://raw.githubusercontent.com/kserve/open-inference-protocol/main/specification/protocol/open_inference_grpc.proto INPUT_PATH = iris-input-v2-grpc.json PROTO_FILE = open_inference_grpc.proto SERVICE_HOSTNAME = $( kubectl get inferenceservice lightgbm-v2-iris-grpc -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) Determine the ingress IP and port and set INGRESS_HOST and INGRESS_PORT . Now, you can use grpcurl to send the inference requests. The gRPC APIs follow the KServe prediction V2 protocol / Open Inference Protocol . 
For example, ServerReady API can be used to check if the server is ready: grpcurl \ -plaintext \ -proto ${ PROTO_FILE } \ -authority ${ SERVICE_HOSTNAME } \ ${ INGRESS_HOST } : ${ INGRESS_PORT } \ inference.GRPCInferenceService.ServerReady Expected Output { \"ready\" : true } You can test the deployed model by sending a sample request with the below payload. Notice that the input format differs from that in the previous REST endpoint example. Prepare the inference input inside the file named iris-input-v2-grpc.json . { \"model_name\" : \"lightgbm-v2-iris-grpc\" , \"inputs\" : [ { \"name\" : \"input-0\" , \"shape\" : [ 2 , 4 ], \"datatype\" : \"FP32\" , \"contents\" : { \"fp32_contents\" : [ 6.8 , 2.8 , 4.8 , 1.4 , 6.0 , 3.4 , 4.5 , 1.6 ] } } ] } ModelInfer API takes input following the ModelInferRequest schema defined in the grpc_predict_v2.proto file. Notice that the input file differs from that used in the previous curl example. grpcurl \ -vv \ -plaintext \ -proto ${ PROTO_FILE } \ -authority ${ SERVICE_HOSTNAME } \ -d @ \ ${ INGRESS_HOST } : ${ INGRESS_PORT } \ inference.GRPCInferenceService.ModelInfer \ <<< $( cat \" $INPUT_PATH \" ) Expected Output Resolved method descriptor: // The ModelInfer API performs inference using the specified model. Errors are // indicated by the google.rpc.Status returned for the request. The OK code // indicates success and other codes indicate failure. 
rpc ModelInfer ( .inference.ModelInferRequest ) returns ( .inference.ModelInferResponse ) ; Request metadata to send: ( empty ) Response headers received: accept-encoding: identity,gzip content-type: application/grpc date: Sun, 25 Sep 2022 10 :25:05 GMT grpc-accept-encoding: identity,deflate,gzip server: istio-envoy x-envoy-upstream-service-time: 99 Estimated response size: 91 bytes Response contents: { \"modelName\" : \"lightgbm-v2-iris-grpc\" , \"outputs\" : [ { \"name\" : \"predict\" , \"datatype\" : \"FP64\" , \"shape\" : [ \"2\" , \"3\" ] , \"contents\" : { \"fp64Contents\" : [ 8 .796664107010673e-06, 0 .9992300031041593, 0 .0007612002317336916, 4 .974786820804187e-06, 0 .9999919650711493, 3 .0601420299625077e-06 ] } } ] }","title":"LightGBM"},{"location":"modelserving/v1beta1/lightgbm/#deploy-lightgbm-model-with-inferenceservice","text":"","title":"Deploy LightGBM model with InferenceService"},{"location":"modelserving/v1beta1/lightgbm/#train-a-lightgbm-model","text":"To test the LightGBM Server, first you need to train a simple LightGBM model with following python code. import lightgbm as lgb from sklearn.datasets import load_iris import os model_dir = \".\" BST_FILE = \"model.bst\" iris = load_iris () y = iris [ 'target' ] X = iris [ 'data' ] dtrain = lgb . Dataset ( X , label = y , feature_names = iris [ 'feature_names' ]) params = { 'objective' : 'multiclass' , 'metric' : 'softmax' , 'num_class' : 3 } lgb_model = lgb . train ( params = params , train_set = dtrain ) model_file = os . path . join ( model_dir , BST_FILE ) lgb_model . save_model ( model_file )","title":"Train a LightGBM model"},{"location":"modelserving/v1beta1/lightgbm/#deploy-lightgbm-model-with-v1-protocol","text":"","title":"Deploy LightGBM model with V1 protocol"},{"location":"modelserving/v1beta1/lightgbm/#test-the-model-locally","text":"Install and run the LightGBM Server using the trained model locally and test the prediction. 
python -m lgbserver --model_dir /path/to/model_dir --model_name lgb After the LightGBM Server is up locally we can then test the model by sending an inference request. import requests request = { 'sepal_width_(cm)' : { 0 : 3.5 }, 'petal_length_(cm)' : { 0 : 1.4 }, 'petal_width_(cm)' : { 0 : 0.2 }, 'sepal_length_(cm)' : { 0 : 5.1 } } formData = { 'inputs' : [ request ] } res = requests . post ( 'http://localhost:8080/v1/models/lgb:predict' , json = formData ) print ( res ) print ( res . text )","title":"Test the model locally"},{"location":"modelserving/v1beta1/lightgbm/#deploy-with-inferenceservice","text":"To deploy the model on Kubernetes you can create the InferenceService by specifying the modelFormat with lightgbm and storageUri . New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"lightgbm-iris\" spec : predictor : model : modelFormat : name : lightgbm storageUri : \"gs://kfserving-examples/models/lightgbm/iris\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"lightgbm-iris\" spec : predictor : lightgbm : storageUri : \"gs://kfserving-examples/models/lightgbm/iris\" Apply the above yaml to create the InferenceService kubectl apply -f lightgbm.yaml Expected Output $ inferenceservice.serving.kserve.io/lightgbm-iris created","title":"Deploy with InferenceService"},{"location":"modelserving/v1beta1/lightgbm/#test-the-deployed-model","text":"To test the deployed model the first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT , then run the following curl command to send the inference request to the InferenceService . 
MODEL_NAME = lightgbm-iris INPUT_PATH = @./iris-input.json SERVICE_HOSTNAME = $( kubectl get inferenceservice lightgbm-iris -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict -d $INPUT_PATH Expected Output * Trying 169 .63.251.68... * TCP_NODELAY set * Connected to 169 .63.251.68 ( 169 .63.251.68 ) port 80 ( #0) > POST /models/lightgbm-iris:predict HTTP/1.1 > Host: lightgbm-iris.default.svc.cluster.local > User-Agent: curl/7.60.0 > Accept: */* > Content-Length: 76 > Content-Type: application/x-www-form-urlencoded > * upload completely sent off: 76 out of 76 bytes < HTTP/1.1 200 OK < content-length: 27 < content-type: application/json ; charset = UTF-8 < date: Tue, 21 May 2019 22 :40:09 GMT < server: istio-envoy < x-envoy-upstream-service-time: 13032 < * Connection #0 to host 169.63.251.68 left intact { \"predictions\" : [[ 0 .9, 0 .05, 0 .05 ]]}","title":"Test the deployed model"},{"location":"modelserving/v1beta1/lightgbm/#deploy-the-model-with-open-inference-protocol","text":"","title":"Deploy the model with Open Inference Protocol"},{"location":"modelserving/v1beta1/lightgbm/#test-the-model-locally_1","text":"Once you've got your model serialized model.bst , we can then use KServe LightGBM Server to create a local model server. Note This step is optional and just meant for testing, feel free to jump straight to deploying with InferenceService .","title":"Test the model locally"},{"location":"modelserving/v1beta1/lightgbm/#pre-requisites","text":"Firstly, to use kserve lightgbm server locally, you will first need to install the lgbserver runtime package in your local environment. Clone the KServe repository and navigate into the directory. git clone https://github.com/kserve/kserve Install lgbserver runtime. KServe uses Poetry as the dependency management tool. 
Make sure you have already installed poetry . cd python/lgbserver poetry install","title":"Pre-requisites"},{"location":"modelserving/v1beta1/lightgbm/#serving-model-locally","text":"The lgbserver package takes three arguments. --model_dir : The model directory path where the model is stored. --model_name : The name of the model deployed in the model server, the default value is model . This is optional. --nthread : Number of threads to use by LightGBM. This is optional and the default value is 1. With the lgbserver runtime package installed locally, you should now be ready to start our server as: python3 lgbserver --model_dir /path/to/model_dir --model_name lightgbm-v2-iris","title":"Serving model locally"},{"location":"modelserving/v1beta1/lightgbm/#deploy-inferenceservice-with-rest-endpoint","text":"To deploy the LightGBM model with Open Inference Protocol, you need to set the protocolVersion field to v2 . Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"lightgbm-v2-iris\" spec : predictor : model : modelFormat : name : lightgbm runtime : kserve-lgbserver protocolVersion : v2 storageUri : \"gs://kfserving-examples/models/lightgbm/v2/iris\" Note For V2 protocol (open inference protocol) if runtime field is not provided then, by default mlserver runtime is used. Apply the InferenceService yaml to get the REST endpoint kubectl kubectl apply -f lightgbm-v2.yaml Expected Output $ inferenceservice.serving.kserve.io/lightgbm-v2-iris created","title":"Deploy InferenceService with REST endpoint"},{"location":"modelserving/v1beta1/lightgbm/#test-the-deployed-model-with-curl","text":"You can now test your deployed model by sending a sample request. Note that this request needs to follow the V2 Dataplane protocol . You can see an example payload below. Create a file named iris-input-v2.json with the sample input. 
{ \"inputs\" : [ { \"name\" : \"input-0\" , \"shape\" : [ 2 , 4 ], \"datatype\" : \"FP32\" , \"data\" : [ [ 6.8 , 2.8 , 4.8 , 1.4 ], [ 6.0 , 3.4 , 4.5 , 1.6 ] ] } ] } Now, assuming that your ingress can be accessed at ${INGRESS_HOST}:${INGRESS_PORT} or you can follow this instruction to find out your ingress IP and port. You can use curl to send the inference request as: SERVICE_HOSTNAME = $( kubectl get inferenceservice lightgbm-v2-iris -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v \\ -H \"Host: ${ SERVICE_HOSTNAME } \" \\ -H \"Content-Type: application/json\" \\ -d @./iris-input-v2.json \\ http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v2/models/lightgbm-v2-iris/infer Expected Output { \"model_name\" : \"lightgbm-v2-iris\" , \"model_version\" : null , \"id\" : \"96253e27-83cf-4262-b279-1bd4b18d7922\" , \"parameters\" : null , \"outputs\" :[ { \"name\" : \"predict\" , \"shape\" :[ 2 , 3 ], \"datatype\" : \"FP64\" , \"parameters\" : null , \"data\" : [ 8.796664107010673e-06 , 0.9992300031041593 , 0.0007612002317336916 , 4.974786820804187e-06 , 0.9999919650711493 , 3.0601420299625077e-06 ] } ] }","title":"Test the deployed model with curl"},{"location":"modelserving/v1beta1/lightgbm/#create-the-inferenceservice-with-grpc-endpoint","text":"Create the inference service yaml and expose the gRPC port, currently only one port is allowed to expose either HTTP or gRPC port and by default HTTP port is exposed. Note Currently, KServe only supports exposing either HTTP or gRPC port. By default, HTTP port is exposed. 
Serverless RawDeployment apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"lightgbm-v2-iris-grpc\" spec : predictor : model : modelFormat : name : lightgbm protocolVersion : v2 runtime : kserve-lgbserver storageUri : \"gs://kfserving-examples/models/lightgbm/v2/iris\" ports : - name : h2c # knative expects grpc port name to be 'h2c' protocol : TCP containerPort : 8081 apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"lightgbm-v2-iris-grpc\" spec : predictor : model : modelFormat : name : lightgbm protocolVersion : v2 runtime : kserve-lgbserver storageUri : \"gs://kfserving-examples/models/lightgbm/v2/iris\" ports : - name : grpc-port # Istio requires the port name to be in the format <protocol>[-<suffix>] protocol : TCP containerPort : 8081 Note For V2 protocol (open inference protocol) if runtime field is not provided then, by default mlserver runtime is used. Apply the InferenceService yaml to get the gRPC endpoint kubectl kubectl apply -f lightgbm-v2-grpc.yaml","title":"Create the InferenceService with gRPC endpoint"},{"location":"modelserving/v1beta1/lightgbm/#test-the-deployed-model-with-grpcurl","text":"After the gRPC InferenceService becomes ready, grpcurl can be used to send gRPC requests to the InferenceService . # download the proto file curl -O https://raw.githubusercontent.com/kserve/open-inference-protocol/main/specification/protocol/open_inference_grpc.proto INPUT_PATH = iris-input-v2-grpc.json PROTO_FILE = open_inference_grpc.proto SERVICE_HOSTNAME = $( kubectl get inferenceservice lightgbm-v2-iris-grpc -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) Determine the ingress IP and port and set INGRESS_HOST and INGRESS_PORT . Now, you can use grpcurl to send the inference requests. The gRPC APIs follow the KServe prediction V2 protocol / Open Inference Protocol . 
For example, ServerReady API can be used to check if the server is ready: grpcurl \\ -plaintext \\ -proto ${ PROTO_FILE } \\ -authority ${ SERVICE_HOSTNAME } \\ ${ INGRESS_HOST } : ${ INGRESS_PORT } \\ inference.GRPCInferenceService.ServerReady Expected Output { \"ready\" : true } You can test the deployed model by sending a sample request with the below payload. Notice that the input format differs from that in the previous REST endpoint example. Prepare the inference input inside the file named iris-input-v2-grpc.json . { \"model_name\" : \"lightgbm-v2-iris-grpc\" , \"inputs\" : [ { \"name\" : \"input-0\" , \"shape\" : [ 2 , 4 ], \"datatype\" : \"FP32\" , \"contents\" : { \"fp32_contents\" : [ 6.8 , 2.8 , 4.8 , 1.4 , 6.0 , 3.4 , 4.5 , 1.6 ] } } ] } ModelInfer API takes input following the ModelInferRequest schema defined in the grpc_predict_v2.proto file. Notice that the input file differs from that used in the previous curl example. grpcurl \\ -vv \\ -plaintext \\ -proto ${ PROTO_FILE } \\ -authority ${ SERVICE_HOSTNAME } \\ -d @ \\ ${ INGRESS_HOST } : ${ INGRESS_PORT } \\ inference.GRPCInferenceService.ModelInfer \\ <<< $( cat \" $INPUT_PATH \" ) Expected Output Resolved method descriptor: // The ModelInfer API performs inference using the specified model. Errors are // indicated by the google.rpc.Status returned for the request. The OK code // indicates success and other codes indicate failure. 
rpc ModelInfer ( .inference.ModelInferRequest ) returns ( .inference.ModelInferResponse ) ; Request metadata to send: ( empty ) Response headers received: accept-encoding: identity,gzip content-type: application/grpc date: Sun, 25 Sep 2022 10 :25:05 GMT grpc-accept-encoding: identity,deflate,gzip server: istio-envoy x-envoy-upstream-service-time: 99 Estimated response size: 91 bytes Response contents: { \"modelName\" : \"lightgbm-v2-iris-grpc\" , \"outputs\" : [ { \"name\" : \"predict\" , \"datatype\" : \"FP64\" , \"shape\" : [ \"2\" , \"3\" ] , \"contents\" : { \"fp64Contents\" : [ 8 .796664107010673e-06, 0 .9992300031041593, 0 .0007612002317336916, 4 .974786820804187e-06, 0 .9999919650711493, 3 .0601420299625077e-06 ] } } ] }","title":"Test the deployed model with grpcurl"},{"location":"modelserving/v1beta1/llm/huggingface/","text":"Deploy the Llama2 model with Hugging Face LLM Serving Runtime \u00b6 The Hugging Face LLM serving runtime implements a runtime that can serve Hugging Face LLM model out of the box. In this example, we deploy a Llama2 model from Hugging Face by running an InferenceService with Hugging Face Serving runtime . Based on the performance requirement for large language models, KServe chooses to perform the inference using a more optimized inference engine like vLLM for text generation models. Serve the Hugging Face LLM model using vLLM \u00b6 KServe Hugging Face runtime by default uses vLLM to serve the LLM models for faster inference, higher throughput than Hugging Face API, implemented with paged attention, continuous batching, optimized CUDA kernel. You can still use --disable_vllm flag to fall back to perform the inference using Hugging Face API. Yaml kubectl apply -f - < 1 else 0.5 l1_ratio = float ( sys . argv [ 2 ]) if len ( sys . argv ) > 2 else 0.5 with mlflow . start_run (): lr = ElasticNet ( alpha = alpha , l1_ratio = l1_ratio , random_state = 42 ) lr . fit ( train_x , train_y ) predicted_qualities = lr . 
predict ( test_x ) ( rmse , mae , r2 ) = eval_metrics ( test_y , predicted_qualities ) print ( \"Elasticnet model (alpha= %f , l1_ratio= %f ):\" % ( alpha , l1_ratio )) print ( \" RMSE: %s \" % rmse ) print ( \" MAE: %s \" % mae ) print ( \" R2: %s \" % r2 ) mlflow . log_param ( \"alpha\" , alpha ) mlflow . log_param ( \"l1_ratio\" , l1_ratio ) mlflow . log_metric ( \"rmse\" , rmse ) mlflow . log_metric ( \"r2\" , r2 ) mlflow . log_metric ( \"mae\" , mae ) tracking_url_type_store = urlparse ( mlflow . get_tracking_uri ()) . scheme model_signature = infer_signature ( train_x , train_y ) # Model registry does not work with file store if tracking_url_type_store != \"file\" : # Register the model # There are other ways to use the Model Registry, # which depends on the use case, # please refer to the doc for more information: # https://mlflow.org/docs/latest/model-registry.html#api-workflow mlflow . sklearn . log_model ( lr , \"model\" , registered_model_name = \"ElasticnetWineModel\" , signature = model_signature , ) else : mlflow . sklearn . log_model ( lr , \"model\" , signature = model_signature ) The training script will also serialise our trained model, leveraging the MLflow Model format. model/ \u251c\u2500\u2500 MLmodel \u251c\u2500\u2500 model.pkl \u251c\u2500\u2500 conda.yaml \u2514\u2500\u2500 requirements.txt Testing locally \u00b6 Once you've got your model serialised model.pkl , we can then use MLServer to spin up a local server. For more details on MLServer, feel free to check the MLflow example doc . Note this step is optional and just meant for testing, feel free to jump straight to deploying with InferenceService . Pre-requisites \u00b6 Firstly, to use MLServer locally, you will first need to install the mlserver package in your local environment, as well as the MLflow runtime. 
pip install mlserver mlserver-mlflow Model settings \u00b6 The next step will be providing some model settings so that MLServer knows: The inference runtime to serve your model (i.e. mlserver_mlflow.MLflowRuntime ) The model's name and version These can be specified through environment variables or by creating a local model-settings.json file: { \"name\" : \"mlflow-wine-classifier\" , \"version\" : \"v1.0.0\" , \"implementation\" : \"mlserver_mlflow.MLflowRuntime\" } Start the model server locally \u00b6 With the mlserver package installed locally and a local model-settings.json file, you should now be ready to start our server as: mlserver start . Deploy with InferenceService \u00b6 When you deploy the model with InferenceService, KServe injects sensible defaults so that it runs out-of-the-box without any further configuration. However, you can still override these defaults by providing a model-settings.json file similar to your local one. You can even provide a set of model-settings.json files to load multiple models . To use v2 protocol for inference with the deployed model you set the protocolVersion field to v2 , in this example your model artifacts have already been uploaded to a \"GCS model repository\" and can be accessed as gs://kfserving-examples/models/mlflow/wine . New Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"mlflow-v2-wine-classifier\" spec : predictor : model : modelFormat : name : mlflow protocolVersion : v2 storageUri : \"gs://kfserving-examples/models/mlflow/wine\" kubectl kubectl apply -f mlflow-new.yaml Testing deployed model \u00b6 You can now test your deployed model by sending a sample request. Note that this request needs to follow the V2 Dataplane protocol . 
You can see an example payload below: { \"parameters\" : { \"content_type\" : \"pd\" }, \"inputs\" : [ { \"name\" : \"fixed acidity\" , \"shape\" : [ 1 ], \"datatype\" : \"FP32\" , \"data\" : [ 7.4 ] }, { \"name\" : \"volatile acidity\" , \"shape\" : [ 1 ], \"datatype\" : \"FP32\" , \"data\" : [ 0.7000 ] }, { \"name\" : \"citric acid\" , \"shape\" : [ 1 ], \"datatype\" : \"FP32\" , \"data\" : [ 0 ] }, { \"name\" : \"residual sugar\" , \"shape\" : [ 1 ], \"datatype\" : \"FP32\" , \"data\" : [ 1.9 ] }, { \"name\" : \"chlorides\" , \"shape\" : [ 1 ], \"datatype\" : \"FP32\" , \"data\" : [ 0.076 ] }, { \"name\" : \"free sulfur dioxide\" , \"shape\" : [ 1 ], \"datatype\" : \"FP32\" , \"data\" : [ 11 ] }, { \"name\" : \"total sulfur dioxide\" , \"shape\" : [ 1 ], \"datatype\" : \"FP32\" , \"data\" : [ 34 ] }, { \"name\" : \"density\" , \"shape\" : [ 1 ], \"datatype\" : \"FP32\" , \"data\" : [ 0.9978 ] }, { \"name\" : \"pH\" , \"shape\" : [ 1 ], \"datatype\" : \"FP32\" , \"data\" : [ 3.51 ] }, { \"name\" : \"sulphates\" , \"shape\" : [ 1 ], \"datatype\" : \"FP32\" , \"data\" : [ 0.56 ] }, { \"name\" : \"alcohol\" , \"shape\" : [ 1 ], \"datatype\" : \"FP32\" , \"data\" : [ 9.4 ] } ] } Now, assuming that your ingress can be accessed at ${INGRESS_HOST}:${INGRESS_PORT} or you can follow this instruction to find out your ingress IP and port. 
You can use curl to send the inference request as: SERVICE_HOSTNAME = $( kubectl get inferenceservice mlflow-v2-wine-classifier -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v \\ -H \"Host: ${ SERVICE_HOSTNAME } \" \\ -H \"Content-Type: application/json\" \\ -d @./mlflow-input.json \\ http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v2/models/mlflow-v2-wine-classifier/infer Expected Output { \"model_name\" : \"mlflow-v2-wine-classifier\" , \"model_version\" : null , \"id\" : \"699cf11c-e843-444e-9dc3-000d991052cc\" , \"parameters\" : null , \"outputs\" :[ { \"name\" : \"predict\" , \"shape\" :[ 1 ], \"datatype\" : \"FP64\" , \"parameters\" : null , \"data\" :[ 5.576883936610762 ] } ] }","title":"MLFlow"},{"location":"modelserving/v1beta1/mlflow/v2/#deploy-mlflow-models-with-inferenceservice","text":"This example walks you through how to deploy a mlflow model leveraging the KServe InferenceService CRD and how to send the inference request using V2 Dataplane .","title":"Deploy MLflow models with InferenceService"},{"location":"modelserving/v1beta1/mlflow/v2/#training","text":"The first step is to train a sample sklearn model and save as mlflow model format by calling mlflow log_model API. # Original source code and more details can be found in: # https://www.mlflow.org/docs/latest/tutorials-and-examples/tutorial.html # The data set used in this example is from # http://archive.ics.uci.edu/ml/datasets/Wine+Quality # P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. # Modeling wine preferences by data mining from physicochemical properties. # In Decision Support Systems, Elsevier, 47(4):547-553, 2009. 
import warnings import sys import pandas as pd import numpy as np from sklearn.metrics import mean_squared_error , mean_absolute_error , r2_score from sklearn.model_selection import train_test_split from sklearn.linear_model import ElasticNet from urllib.parse import urlparse import mlflow import mlflow.sklearn from mlflow.models.signature import infer_signature import logging logging . basicConfig ( level = logging . WARN ) logger = logging . getLogger ( __name__ ) def eval_metrics ( actual , pred ): rmse = np . sqrt ( mean_squared_error ( actual , pred )) mae = mean_absolute_error ( actual , pred ) r2 = r2_score ( actual , pred ) return rmse , mae , r2 if __name__ == \"__main__\" : warnings . filterwarnings ( \"ignore\" ) np . random . seed ( 40 ) # Read the wine-quality csv file from the URL csv_url = ( \"http://archive.ics.uci.edu/ml\" \"/machine-learning-databases/wine-quality/winequality-red.csv\" ) try : data = pd . read_csv ( csv_url , sep = \";\" ) except Exception as e : logger . exception ( \"Unable to download training & test CSV, \" \"check your internet connection. Error: %s \" , e , ) # Split the data into training and test sets. (0.75, 0.25) split. train , test = train_test_split ( data ) # The predicted column is \"quality\" which is a scalar from [3, 9] train_x = train . drop ([ \"quality\" ], axis = 1 ) test_x = test . drop ([ \"quality\" ], axis = 1 ) train_y = train [[ \"quality\" ]] test_y = test [[ \"quality\" ]] alpha = float ( sys . argv [ 1 ]) if len ( sys . argv ) > 1 else 0.5 l1_ratio = float ( sys . argv [ 2 ]) if len ( sys . argv ) > 2 else 0.5 with mlflow . start_run (): lr = ElasticNet ( alpha = alpha , l1_ratio = l1_ratio , random_state = 42 ) lr . fit ( train_x , train_y ) predicted_qualities = lr . 
predict ( test_x ) ( rmse , mae , r2 ) = eval_metrics ( test_y , predicted_qualities ) print ( \"Elasticnet model (alpha= %f , l1_ratio= %f ):\" % ( alpha , l1_ratio )) print ( \" RMSE: %s \" % rmse ) print ( \" MAE: %s \" % mae ) print ( \" R2: %s \" % r2 ) mlflow . log_param ( \"alpha\" , alpha ) mlflow . log_param ( \"l1_ratio\" , l1_ratio ) mlflow . log_metric ( \"rmse\" , rmse ) mlflow . log_metric ( \"r2\" , r2 ) mlflow . log_metric ( \"mae\" , mae ) tracking_url_type_store = urlparse ( mlflow . get_tracking_uri ()) . scheme model_signature = infer_signature ( train_x , train_y ) # Model registry does not work with file store if tracking_url_type_store != \"file\" : # Register the model # There are other ways to use the Model Registry, # which depends on the use case, # please refer to the doc for more information: # https://mlflow.org/docs/latest/model-registry.html#api-workflow mlflow . sklearn . log_model ( lr , \"model\" , registered_model_name = \"ElasticnetWineModel\" , signature = model_signature , ) else : mlflow . sklearn . log_model ( lr , \"model\" , signature = model_signature ) The training script will also serialise our trained model, leveraging the MLflow Model format. model/ \u251c\u2500\u2500 MLmodel \u251c\u2500\u2500 model.pkl \u251c\u2500\u2500 conda.yaml \u2514\u2500\u2500 requirements.txt","title":"Training"},{"location":"modelserving/v1beta1/mlflow/v2/#testing-locally","text":"Once you've got your model serialised model.pkl , we can then use MLServer to spin up a local server. For more details on MLServer, feel free to check the MLflow example doc . Note this step is optional and just meant for testing, feel free to jump straight to deploying with InferenceService .","title":"Testing locally"},{"location":"modelserving/v1beta1/mlflow/v2/#pre-requisites","text":"Firstly, to use MLServer locally, you will first need to install the mlserver package in your local environment, as well as the MLflow runtime. 
pip install mlserver mlserver-mlflow","title":"Pre-requisites"},{"location":"modelserving/v1beta1/mlflow/v2/#model-settings","text":"The next step will be providing some model settings so that MLServer knows: The inference runtime to serve your model (i.e. mlserver_mlflow.MLflowRuntime ) The model's name and version These can be specified through environment variables or by creating a local model-settings.json file: { \"name\" : \"mlflow-wine-classifier\" , \"version\" : \"v1.0.0\" , \"implementation\" : \"mlserver_mlflow.MLflowRuntime\" }","title":"Model settings"},{"location":"modelserving/v1beta1/mlflow/v2/#start-the-model-server-locally","text":"With the mlserver package installed locally and a local model-settings.json file, you should now be ready to start our server as: mlserver start .","title":"Start the model server locally"},{"location":"modelserving/v1beta1/mlflow/v2/#deploy-with-inferenceservice","text":"When you deploy the model with InferenceService, KServe injects sensible defaults so that it runs out-of-the-box without any further configuration. However, you can still override these defaults by providing a model-settings.json file similar to your local one. You can even provide a set of model-settings.json files to load multiple models . To use v2 protocol for inference with the deployed model you set the protocolVersion field to v2 , in this example your model artifacts have already been uploaded to a \"GCS model repository\" and can be accessed as gs://kfserving-examples/models/mlflow/wine . 
New Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"mlflow-v2-wine-classifier\" spec : predictor : model : modelFormat : name : mlflow protocolVersion : v2 storageUri : \"gs://kfserving-examples/models/mlflow/wine\" kubectl kubectl apply -f mlflow-new.yaml","title":"Deploy with InferenceService"},{"location":"modelserving/v1beta1/mlflow/v2/#testing-deployed-model","text":"You can now test your deployed model by sending a sample request. Note that this request needs to follow the V2 Dataplane protocol . You can see an example payload below: { \"parameters\" : { \"content_type\" : \"pd\" }, \"inputs\" : [ { \"name\" : \"fixed acidity\" , \"shape\" : [ 1 ], \"datatype\" : \"FP32\" , \"data\" : [ 7.4 ] }, { \"name\" : \"volatile acidity\" , \"shape\" : [ 1 ], \"datatype\" : \"FP32\" , \"data\" : [ 0.7000 ] }, { \"name\" : \"citric acid\" , \"shape\" : [ 1 ], \"datatype\" : \"FP32\" , \"data\" : [ 0 ] }, { \"name\" : \"residual sugar\" , \"shape\" : [ 1 ], \"datatype\" : \"FP32\" , \"data\" : [ 1.9 ] }, { \"name\" : \"chlorides\" , \"shape\" : [ 1 ], \"datatype\" : \"FP32\" , \"data\" : [ 0.076 ] }, { \"name\" : \"free sulfur dioxide\" , \"shape\" : [ 1 ], \"datatype\" : \"FP32\" , \"data\" : [ 11 ] }, { \"name\" : \"total sulfur dioxide\" , \"shape\" : [ 1 ], \"datatype\" : \"FP32\" , \"data\" : [ 34 ] }, { \"name\" : \"density\" , \"shape\" : [ 1 ], \"datatype\" : \"FP32\" , \"data\" : [ 0.9978 ] }, { \"name\" : \"pH\" , \"shape\" : [ 1 ], \"datatype\" : \"FP32\" , \"data\" : [ 3.51 ] }, { \"name\" : \"sulphates\" , \"shape\" : [ 1 ], \"datatype\" : \"FP32\" , \"data\" : [ 0.56 ] }, { \"name\" : \"alcohol\" , \"shape\" : [ 1 ], \"datatype\" : \"FP32\" , \"data\" : [ 9.4 ] } ] } Now, assuming that your ingress can be accessed at ${INGRESS_HOST}:${INGRESS_PORT} or you can follow this instruction to find out your ingress IP and port. 
You can use curl to send the inference request as: SERVICE_HOSTNAME = $( kubectl get inferenceservice mlflow-v2-wine-classifier -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v \\ -H \"Host: ${ SERVICE_HOSTNAME } \" \\ -H \"Content-Type: application/json\" \\ -d @./mlflow-input.json \\ http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v2/models/mlflow-v2-wine-classifier/infer Expected Output { \"model_name\" : \"mlflow-v2-wine-classifier\" , \"model_version\" : null , \"id\" : \"699cf11c-e843-444e-9dc3-000d991052cc\" , \"parameters\" : null , \"outputs\" :[ { \"name\" : \"predict\" , \"shape\" :[ 1 ], \"datatype\" : \"FP64\" , \"parameters\" : null , \"data\" :[ 5.576883936610762 ] } ] }","title":"Testing deployed model"},{"location":"modelserving/v1beta1/onnx/","text":"Deploy InferenceService with ONNX model \u00b6 Setup \u00b6 Your ~/.kube/config should point to a cluster with KServe installed . Your cluster's Istio Ingress gateway must be network accessible . Create the InferenceService \u00b6 New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"style-sample\" spec : predictor : model : protocolVersion : v2 modelFormat : name : onnx storageUri : \"gs://kfserving-examples/models/onnx\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"style-sample\" spec : predictor : onnx : storageUri : \"gs://kfserving-examples/models/onnx\" Note For the default kserve installation, While using new schema, you must specify protocolVersion as v2 for onnx. Otherwise, you will get a no runtime found error. Expected Output $ inferenceservice.serving.kserve.io/style-sample configured Run a sample inference \u00b6 Setup env vars The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT export ISVC_NAME=style-sample export SERVICE_HOSTNAME=$(kubectl get inferenceservice ${ISVC_NAME} -o jsonpath='{.status.url}' | cut -d \"/\" -f 3) 2. 
Verify the service is healthy curl -v -H \"Host:${SERVICE_HOSTNAME}\" http://localhost:8080//v2/health/ready 3. Install dependencies pip install -r requirements.txt 4. Run the sample notebook in jupyter jupyter notebook Uploading your own model \u00b6 The sample model for the example in this readme is already uploaded and available for use. However if you would like to modify the example to use your own ONNX model, all you need to do is to upload your model as model.onnx to S3, GCS or an Azure Blob.","title":"ONNX"},{"location":"modelserving/v1beta1/onnx/#deploy-inferenceservice-with-onnx-model","text":"","title":"Deploy InferenceService with ONNX model"},{"location":"modelserving/v1beta1/onnx/#setup","text":"Your ~/.kube/config should point to a cluster with KServe installed . Your cluster's Istio Ingress gateway must be network accessible .","title":"Setup"},{"location":"modelserving/v1beta1/onnx/#create-the-inferenceservice","text":"New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"style-sample\" spec : predictor : model : protocolVersion : v2 modelFormat : name : onnx storageUri : \"gs://kfserving-examples/models/onnx\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"style-sample\" spec : predictor : onnx : storageUri : \"gs://kfserving-examples/models/onnx\" Note For the default kserve installation, While using new schema, you must specify protocolVersion as v2 for onnx. Otherwise, you will get a no runtime found error. 
Expected Output $ inferenceservice.serving.kserve.io/style-sample configured","title":"Create the InferenceService"},{"location":"modelserving/v1beta1/onnx/#run-a-sample-inference","text":"Setup env vars The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT export ISVC_NAME=style-sample export SERVICE_HOSTNAME=$(kubectl get inferenceservice ${ISVC_NAME} -o jsonpath='{.status.url}' | cut -d \"/\" -f 3) 2. Verify the service is healthy curl -v -H \"Host:${SERVICE_HOSTNAME}\" http://localhost:8080//v2/health/ready 3. Install dependencies pip install -r requirements.txt 4. Run the sample notebook in jupyter jupyter notebook","title":"Run a sample inference"},{"location":"modelserving/v1beta1/onnx/#uploading-your-own-model","text":"The sample model for the example in this readme is already uploaded and available for use. However if you would like to modify the example to use your own ONNX model, all you need to do is to upload your model as model.onnx to S3, GCS or an Azure Blob.","title":"Uploading your own model"},{"location":"modelserving/v1beta1/paddle/","text":"Deploy Paddle model with InferenceService \u00b6 In this example, we use a trained paddle resnet50 model to classify images by running an inference service with Paddle predictor. 
Deploy Paddle model with V1 protocol \u00b6 Create the InferenceService \u00b6 New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"paddle-resnet50\" spec : predictor : model : modelFormat : name : paddle storageUri : \"gs://kfserving-examples/models/paddle/resnet\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"paddle-resnet50\" spec : predictor : paddle : storageUri : \"gs://kfserving-examples/models/paddle/resnet\" Apply the above yaml to create the InferenceService kubectl apply -f paddle.yaml Expected Output $ inferenceservice.serving.kserve.io/paddle-resnet50 created Run a Prediction \u00b6 The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT . You can see an example payload jay.json as the sample input to test the model. MODEL_NAME = paddle-resnet50 SERVICE_HOSTNAME = $( kubectl get inferenceservice ${ MODEL_NAME } -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" \\ -H \"Content-Type: application/json\" \\ -d @./jay.json \\ http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ ${ MODEL_NAME } :predict Expected Output * Trying 127 .0.0.1:80... 
* TCP_NODELAY set * Connected to localhost ( 127 .0.0.1 ) port 80 ( #0) > POST /v1/models/paddle-resnet50:predict HTTP/1.1 > Host: paddle-resnet50.default.example.com > User-Agent: curl/7.68.0 > Accept: */* > Content-Length: 3010209 > Content-Type: application/x-www-form-urlencoded > Expect: 100 -continue > * Mark bundle as not supporting multiuse < HTTP/1.1 100 Continue * We are completely uploaded and fine * Mark bundle as not supporting multiuse < HTTP/1.1 200 OK < content-length: 23399 < content-type: application/json ; charset = UTF-8 < date: Mon, 17 May 2021 03 :34:58 GMT < server: istio-envoy < x-envoy-upstream-service-time: 511 < { \"predictions\" : [[ 6 .736678770380422e-09, 1 .1535990829258935e-08, 5 .142250714129659e-08, 6 .647170636142619e-08, 4 .094492567219277e-08, 1 .3402451770616608e-07, 9 .355561303436843e-08, 2 .8935891904779965e-08, 6 .845367295227334e-08, 7 .680615965455218e-08, 2 .0334689452283783e-06, 1 .1085678579547675e-06, 2 .3477592492326949e-07, 6 .582037030966603e-07, 0 .00012373103527352214, 4 .2878804151769145e-07, 6 .419959845516132e-06, 0 .9993496537208557, 7 .372002437477931e-05, 3 .101135735050775e-05, 5 .6028093240456656e-06, 2 .1862508674530545e-06, 1 .9544044604913324e-08, 3 .728893887000595e-07, 4 .2903633357127546e-07, 1 .8251179767503345e-07, 7 .159925985433802e-08, 9 .231618136595898e-09, 6 .469241498052725e-07, 7 .031690341108288e-09, 4 .451231561120039e-08, 1 .2455971898361895e-07, 9 .44632745358831e-08, 4 .347704418705689e-08, 4 .658220120745682e-07, 6 .797721141538204e-08, 2 .1060276367279585e-07, 2 .2605123106700376e-08, 1 .4311490303953178e-07, 7 .951298641728499e-08, 1 .2341783417468832e-07, 1 .0921713737843675e-06, 1 .5243892448779661e-05, 3 .1173343018053856e-07, 2 .4152058131221565e-07, 6 .863762536113427e-08, 8 .467682022228473e-08, 9 .4246772164297e-08, 1 .0219210366813058e-08, 3 .3770753304906975e-08, 3 .6928835100979995e-08, 1 .3694031508748594e-07, 1 .0674284567357972e-07, 2 .599483650556067e-07, 3 
.4866405940192635e-07, 3 .132053549848024e-08, 3 .574873232992104e-07, 6 .64843895492595e-08, 3 .1638955988455564e-07, 1 .2095878219042788e-06, 8 .66409024524728e-08, 4 .0144172430700564e-08, 1 .2544761318622477e-07, 3 .3201178695208e-08, 1 .9731444922399533e-07, 3 .806405572959193e-07, 1 .3827865075199952e-07, 2 .300225965257141e-08, 7 .14422512260171e-08, 2 .851114544455413e-08, 2 .982567437470607e-08, 8 .936032713791064e-08, 6 .22388370175031e-07, 6 .478838798784636e-08, 1 .3663023423760023e-07, 9 .973181391842445e-08, 2 .5761554667269593e-08, 4 .130220077058766e-08, 3 .9384463690339544e-08, 1 .2158079698565416e-07, 4 .302821707824478e-06, 1 .8179063090428826e-06, 1 .8520155435908237e-06, 1 .6246107179540559e-06, 1 .6448313544970006e-05, 1 .0544916221988387e-05, 3 .993061909568496e-06, 2 .646479799750523e-07, 1 .9193475964129902e-05, 4 .803242745765601e-07, 1 .696285067964709e-07, 4 .550505764200352e-06, 4 .235929372953251e-05, 4 .443338639248395e-06, 5 .104009687784128e-06, 1 .3506396498996764e-05, 4 .1758724478313525e-07, 4 .494491463447048e-07, 3 .156698369366495e-07, 1 .0557599807725637e-06, 1 .336463917311903e-08, 1 .3893659556174498e-08, 6 .770379457066156e-08, 1 .4129696523923485e-07, 7 .170518756538513e-08, 7 .934466594861078e-08, 2 .639154317307657e-08, 2 .6134321373660896e-08, 7 .196725881897237e-09, 2 .1752363466021052e-08, 6 .684639686227456e-08, 3 .417795824134373e-08, 1 .6228275967478112e-07, 4 .107114648377319e-07, 6 .472135396506928e-07, 2 .951379372007068e-07, 5 .653474133282543e-09, 4 .830144462175667e-08, 8 .887481861563629e-09, 3 .7306168820805397e-08, 1 .7784264727538357e-08, 4 .641905082536368e-09, 3 .413118676576232e-08, 1 .937393818707278e-07, 1 .2980176506971475e-06, 3 .5641004814124244e-08, 2 .149332445355867e-08, 3 .055293689158134e-07, 1 .5532516783878236e-07, 1 .4520978766086046e-06, 3 .488464628276233e-08, 3 .825438398052938e-05, 4 .5088432898410247e-07, 4 .1766969616219285e-07, 6 .770622462681786e-07, 1 .4142248971893423e-07, 1 
.4235997696232516e-05, 6 .293820433711517e-07, 4 .762866865348769e-06, 9 .024900577969674e-07, 9 .058987870957935e-07, 1 .5713684433649178e-06, 1 .5720647184025438e-07, 1 .818536503606083e-07, 7 .193188622522939e-08, 1 .1952824934269302e-06, 8 .874837362782273e-07, 2 .0870831463071227e-07, 9 .906239029078279e-08, 7 .793621747964607e-09, 1 .0058498389753368e-07, 4 .2059440374941914e-07, 1 .843624630737395e-07, 1 .6437947181202617e-07, 7 .025352743994517e-08, 2 .570448600636155e-07, 7 .586877615040066e-08, 7 .841313731660193e-07, 2 .495309274763713e-07, 5 .157681925993529e-08, 4 .0674127177453556e-08, 7 .531796519799627e-09, 4 .797485431140558e-08, 1 .7419973019627832e-08, 1 .7958679165985814e-07, 1 .2566392371127222e-08, 8 .975440124459055e-08, 3 .26965476915575e-08, 1 .1208359751435637e-07, 3 .906746215420753e-08, 4 .6769045525252295e-08, 1 .8523553535487736e-07, 1 .4833052830454108e-07, 1 .2279349448363064e-07, 1 .0729105497375713e-06, 3 .6538490011395197e-09, 1 .6198403329781286e-07, 1 .6190719875908144e-08, 1 .2004933580556099e-07, 1 .4800277448046018e-08, 4 .02294837442696e-08, 2 .15060893538066e-07, 1 .1925696696835075e-07, 4 .8982514044837444e-08, 7 .608920071788816e-08, 2 .3137479487900237e-08, 8 .521050176568679e-08, 9 .586213423062873e-08, 1 .3351650807180704e-07, 3 .021699157557123e-08, 4 .423876376336011e-08, 2 .610667060309879e-08, 2 .3977091245797055e-07, 1 .3192564551900432e-07, 1 .6734931662654162e-08, 1 .588336999702733e-07, 4 .0643516285854275e-07, 8 .753454494581092e-08, 8 .366999395548191e-07, 3 .437598650180007e-08, 7 .847892646850596e-08, 8 .526394701391382e-09, 9 .601382799928615e-08, 5 .258924034023948e-07, 1 .3557448141909845e-07, 1 .0307226716577134e-07, 1 .0429813457335513e-08, 5 .187714435805901e-08, 2 .187001335585137e-08, 1 .1791439824548888e-08, 2 .98065643278278e-08, 4 .338393466696289e-08, 2 .9991046091026874e-08, 2 .8507610494443725e-08, 3 .058665143385042e-08, 6 .441099031917474e-08, 1 .5364101102477434e-08, 1 
.5973883549236234e-08, 2 .5736850872704053e-08, 1 .0903765712555469e-07, 3 .2118737891551064e-08, 6 .819742992547617e-09, 1 .9251311300649832e-07, 5 .8258109447706374e-08, 1 .8765761922168167e-07, 4 .0070790419122204e-07, 1 .5791577823165426e-08, 1 .950158434738114e-07, 1 .0142063189277906e-08, 2 .744815041921811e-08, 1 .2843531571604672e-08, 3 .7297493094001766e-08, 7 .407496838141014e-08, 4 .20607833007125e-08, 1 .6924804668860816e-08, 1 .459203531339881e-07, 4 .344977000414474e-08, 1 .7191403856031684e-07, 3 .5817443233554513e-08, 8 .440249388286247e-09, 4 .194829728021432e-08, 2 .514032360068086e-08, 2 .8340199520471288e-08, 8 .747196034164517e-08, 8 .277125651545703e-09, 1 .1676293709683705e-08, 1 .4548514570833504e-07, 7 .200282148289716e-09, 2 .623600948936655e-06, 5 .675736929333652e-07, 1 .9483527466945816e-06, 6 .752595282932816e-08, 8 .168475318370838e-08, 1 .0933046468153407e-07, 1 .670913718498923e-07, 3 .1387276777650186e-08, 2 .973524537708272e-08, 5 .752163900751839e-08, 5 .850877471402782e-08, 3 .2544622285968217e-07, 3 .330221431951941e-08, 4 .186786668469722e-07, 1 .5085906568401697e-07, 2 .3346819943981245e-07, 2 .86402780602657e-07, 2 .2940319865938363e-07, 1 .8537603807544656e-07, 3 .151798182443599e-07, 1 .1075967449869495e-06, 1 .5369782602192572e-07, 1 .9237509718550427e-07, 1 .64044664074936e-07, 2 .900835340824415e-07, 1 .246654903752642e-07, 5 .802622027317739e-08, 5 .186220519703966e-08, 6 .0094205167615655e-09, 1 .2333241272699524e-07, 1 .3798474185477971e-07, 1 .7370231830682314e-07, 5 .617761189569137e-07, 5 .1604470030497396e-08, 4 .813277598714194e-08, 8 .032698417537176e-08, 2 .0645263703045202e-06, 5 .638597713186755e-07, 8 .794199857220519e-07, 3 .4785980460583232e-06, 2 .972389268052211e-07, 3 .3904532870110415e-07, 9 .469074058188198e-08, 3 .754845678827223e-08, 1 .5679037801419327e-07, 8 .203105039683578e-08, 6 .847962641387539e-09, 1 .8251624211984563e-08, 6 .050240841659615e-08, 3 .956342808919544e-08, 1 
.0699947949888156e-07, 3 .2566634899922065e-07, 3 .5369430406717584e-07, 7 .326295303755614e-08, 4 .85765610847011e-07, 7 .717713401689252e-07, 3 .4567779749750116e-08, 3 .246204585138912e-07, 3 .1608601602783892e-06, 5 .33099466792919e-08, 3 .645687343123427e-07, 5 .48158936908294e-07, 4 .62306957160763e-08, 1 .3466177506415988e-07, 4 .3529482240955986e-08, 1 .6404105451783835e-07, 2 .463695381038633e-08, 5 .958712634424046e-08, 9 .493651020875404e-08, 5 .523462576206839e-08, 5 .7412357534758485e-08, 1 .1850350347231142e-05, 5 .8263944993086625e-06, 7 .4208674050169066e-06, 9 .127966222877149e-07, 2 .0019581370434025e-06, 1 .033498961078294e-06, 3 .5146850763112525e-08, 2 .058995278275688e-06, 3 .5655509122989315e-07, 6 .873234070781109e-08, 2 .1935298022413008e-09, 5 .560363547374436e-08, 3 .3266996979364194e-07, 1 .307369217329324e-07, 2 .718762992515167e-08, 1 .0462929189714032e-08, 7 .466680358447775e-07, 6 .923166040451179e-08, 1 .6145664361033596e-08, 8 .568521003837759e-09, 4 .76221018175238e-09, 1 .233977116044116e-07, 8 .340628632197422e-09, 3 .2649041248333788e-09, 5 .0632489312363305e-09, 4 .0704994930251814e-09, 1 .2043538610839732e-08, 5 .105608380517879e-09, 7 .267142887457112e-09, 1 .184516307262129e-07, 7 .53557927168913e-08, 6 .386964201965384e-08, 1 .6212936770898523e-08, 2 .610429419291904e-07, 6 .979425393183192e-07, 6 .647513117741255e-08, 7 .717492849224072e-07, 6 .651206945207377e-07, 3 .324495310152997e-07, 3 .707282019149716e-07, 3 .99564243025452e-07, 6 .411632114122767e-08, 7 .107352217872176e-08, 1 .6380016631956096e-07, 6 .876800995314625e-08, 3 .462474467141874e-07, 2 .0256503319160402e-07, 6 .19610148078209e-07, 2 .6841073363925716e-08, 6 .720335363752383e-07, 1 .1348340649419697e-06, 1 .8397931853542104e-06, 6 .397251581802266e-07, 7 .257533241045167e-08, 4 .2213909523525217e-07, 3 .9657925299252383e-07, 1 .4037439655112394e-07, 3 .249856774800719e-07, 1 .5857655455420172e-07, 1 .1122217102865761e-07, 7 .391420808744442e-08, 3 
.42322238111592e-07, 5 .39796154441774e-08, 8 .517296379295658e-08, 4 .061009803990601e-06, 1 .4478755474556237e-05, 7 .317032757470088e-09, 6 .9484960008026064e-09, 4 .468917325084476e-08, 9 .23141172393116e-08, 5 .411982328951126e-08, 2 .2242811326123046e-07, 1 .7609554703312824e-08, 2 .0906279374344194e-08, 3 .6797682678724186e-09, 6 .177919686933819e-08, 1 .7920288541972695e-07, 2 .6279179721200308e-08, 2 .6988200119149042e-08, 1 .6432807115052128e-07, 1 .2827612749788386e-07, 4 .468908798571647e-08, 6 .316552969565237e-08, 1 .9461760203398626e-08, 2 .087125849925542e-08, 2 .2414580413965268e-08, 2 .4765244077684656e-08, 6 .785398465325443e-09, 2 .4248794971981624e-08, 4 .554979504689527e-09, 2 .8977037658250993e-08, 2 .0402325162649504e-08, 1 .600950270130852e-07, 2 .0199709638291097e-07, 1 .611188515937556e-08, 5 .964113825029926e-08, 4 .098318573397819e-09, 3 .9080127578472457e-08, 7 .511338218080255e-09, 5 .965624154669058e-07, 1 .6478223585636442e-07, 1 .4106989354445432e-08, 3 .2855584919389e-08, 3 .3387166364917675e-09, 1 .220043444050134e-08, 4 .624639160510924e-08, 6 .842309385746148e-09, 1 .74262879681919e-08, 4 .6611329906909305e-08, 9 .331947836699328e-08, 1 .2306078644996887e-07, 1 .2359445022980253e-08, 1 .1173199254699284e-08, 2 .7724862405875683e-08, 2 .419210147763806e-07, 3 .451186785241589e-07, 2 .593766978975509e-08, 9 .964568192799561e-08, 9 .797809674694236e-09, 1 .9085564417764544e-07, 3 .972706252852731e-08, 2 .6639204619982593e-08, 6 .874148805735558e-09, 3 .146993776681484e-08, 2 .4086594407890516e-07, 1 .3126927456141857e-07, 2 .1254339799270383e-07, 2 .050203384840188e-08, 3 .694976058454813e-08, 6 .563175816154398e-07, 2 .560050127442537e-08, 2 .6882981174480847e-08, 6 .880636078676616e-07, 2 .0092733166166e-07, 2 .788039665801989e-08, 2 .628409134786125e-08, 5 .1678345158734373e-08, 1 .8935413947929192e-07, 4 .61852835087484e-07, 1 .1086777718105623e-08, 1 .4542604276357451e-07, 2 .8737009216683873e-08, 6 .105167926762078e-07, 1 
.2016463379893594e-08, 1 .3944705301582871e-07, 2 .093712758721722e-08, 4 .3801410498645055e-08, 1 .966320795077081e-08, 6 .654448991838535e-09, 1 .1149590584125235e-08, 6 .424939158478082e-08, 6 .971554888934861e-09, 3 .260019587614238e-09, 1 .4260189473702667e-08, 2 .7895078247297533e-08, 8 .11578289017234e-08, 2 .5995715802196173e-08, 2 .2855578762914774e-08, 1 .055962854934478e-07, 8 .145542551574181e-08, 3 .7793686402665116e-08, 4 .881891513264236e-08, 2 .342062366267328e-08, 1 .059935517133681e-08, 3 .604105103249822e-08, 5 .062430830093945e-08, 3 .6804440384230475e-08, 1 .501580193519203e-09, 1 .4475033367489232e-06, 1 .076210423889279e-06, 1 .304991315009829e-07, 3 .073601462233455e-08, 1 .7184021317007137e-08, 2 .0421090596300928e-08, 7 .904992216367646e-09, 1 .6902052379919041e-07, 1 .2416506933732308e-08, 5 .4758292122869534e-08, 2 .6250422280327257e-08, 1 .3261367115546818e-08, 6 .29807459517906e-08, 1 .270998595259698e-08, 2 .0171681569536304e-07, 4 .386637186826192e-08, 6 .962349630157405e-08, 2 .9565120485131047e-07, 7 .925131626507209e-07, 2 .0868920103112032e-07, 1 .7341794489311724e-07, 4 .2942417621816276e-08, 4 .213406956665722e-09, 8 .824785169281313e-08, 1 .7341569957807224e-08, 7 .321587247588468e-08, 1 .7941774288487977e-08, 1 .1245148101579616e-07, 4 .242405395871174e-07, 8 .259573469615589e-09, 1 .1336403105133286e-07, 8 .268798978861014e-08, 2 .2186977588489754e-08, 1 .9539720952366224e-08, 1 .0675703876472653e-08, 3 .288517547161973e-08, 2 .4340963022950746e-08, 6 .639137239972115e-08, 5 .604687380866835e-09, 1 .386604697728444e-08, 6 .675873720496384e-08, 1 .1355886009312144e-08, 3 .132159633878473e-07, 3 .12451788886392e-08, 1 .502181845580708e-07, 1 .3461754377885882e-08, 1 .8882955998833495e-07, 4 .645742279762999e-08, 4 .6453880742092224e-08, 7 .714453964524637e-09, 3 .5857155467056145e-08, 7 .60832108426257e-09, 4 .221501370693659e-08, 4 .3407251126836854e-09, 1 .340157496088068e-08, 8 .565600495558101e-08, 1 .7045413969185574e-08, 
5 .4221903411644234e-08, 3 .021912675649219e-08, 6 .153376119755194e-08, 3 .938857240370908e-09, 4 .135628017820636e-08, 1 .781920389021252e-08, 4 .3105885083605244e-08, 3 .903354972578654e-09, 7 .663085455078544e-08, 1 .1890405993142394e-08, 9 .304217840622186e-09, 1 .0968062014171664e-09, 1 .0536767902635802e-08, 1 .1516804221400889e-07, 8 .134522886393825e-07, 5 .952623993721318e-08, 2 .806350174466843e-08, 1 .2833099027886874e-08, 1 .0605690192733164e-07, 7 .872949936427176e-07, 2 .7501393162765453e-08, 3 .936289072470345e-09, 2 .0519442145428002e-08, 7 .394815870753746e-09, 3 .598397313453461e-08, 2 .5378517065632877e-08, 4 .698972233541099e-08, 7 .54952989012736e-09, 6 .322805461422831e-07, 5 .582006412652163e-09, 1 .29640980617296e-07, 1 .5874988434916304e-08, 3 .3837810775594335e-08, 6 .474512037613067e-09, 9 .121148281110436e-08, 1 .3918511676536127e-08, 8 .230025549949005e-09, 2 .7061290097663004e-08, 2 .6095918315149902e-08, 5 .722363471960534e-09, 6 .963475698285038e-07, 4 .685091781198025e-08, 9 .590579885809802e-09, 2 .099205858030473e-07, 3 .082160660028421e-08, 3 .563162565001221e-08, 7 .326312925215461e-07, 2 .1759731225756695e-06, 2 .407518309155421e-07, 2 .974515780351794e-07, 2 .529018416908002e-08, 7 .667950718825978e-09, 2 .663289251358947e-07, 3 .4358880185436647e-08, 2 .3130198201215535e-08, 3 .1239693498719134e-08, 2 .8691621878351725e-07, 3 .895845068768722e-08, 2 .4184130253956937e-08, 1 .1582445225144511e-08, 5 .1545349322168477e-08, 2 .034345492063494e-08, 8 .201963197507212e-08, 1 .164153573540716e-08, 5 .496356720868789e-07, 1 .1682151246361627e-08, 4 .7576914852243135e-08, 1 .6349824605299546e-08, 4 .090862759653646e-08, 2 .1271189609706198e-07, 1 .6697286753242224e-07, 3 .989708119433999e-08, 2 .852450279533514e-06, 1 .2500372292834072e-07, 2 .4846613655427063e-07, 1 .245429093188477e-08, 2 .9700272463628608e-08, 4 .250991558762962e-09, 1 .61443480806156e-07, 2 .6386018703306036e-07, 7 .638056409575711e-09, 3 .4455793773702226e-09, 
7 .273289526210647e-08, 1 .7631434090503717e-08, 7 .58661311550668e-09, 2 .1547013062672704e-08, 1 .2675349125856883e-07, 2 .5637149292379036e-08, 3 .500976220038865e-08, 6 .472243541111311e-08, 8 .387915251262257e-09, 3 .069512288789156e-08, 7 .520387867998579e-08, 1 .5724964441687916e-07, 1 .9634005354873807e-07, 1 .2290831818972947e-07, 1 .112118730439704e-09, 1 .546895944670723e-08, 9 .91701032404535e-09, 6 .882473257974198e-07, 8 .267616635748709e-08, 4 .469531234008173e-08, 2 .075201344098332e-08, 8 .649378457903367e-08, 5 .202766573120243e-08, 4 .5564942041664835e-08, 2 .0319955496006514e-08, 8 .705182352741758e-09, 6 .452066969586667e-08, 2 .1777438519166026e-08, 1 .030954166481024e-08, 3 .211904342492744e-08, 2 .3336936294526822e-07, 8 .054096056753224e-09, 1 .9623354319264763e-07, 1 .2888089884199871e-07, 1 .5392496166555247e-08, 1 .401903038100727e-09, 5 .696818305978013e-08, 6 .080025372057207e-09, 1 .0782793324892737e-08, 2 .4260730313585555e-08, 1 .9388659566743627e-08, 2 .2970310453729326e-07, 1 .9971754028347277e-08, 2 .8477993296860404e-08, 5 .2273552597625894e-08, 2 .7392806600801123e-07, 9 .857291161097237e-08, 3 .12910977129377e-08, 4 .151442212219081e-08, 5 .251196366629074e-09, 1 .580681100676884e-06, 8 .547603442821128e-07, 1 .068913135782168e-08, 1 .0621830597301596e-06, 7 .737313012512459e-08, 6 .394216711669287e-08, 1 .1698345758759388e-07, 1 .0486609625104393e-07, 2 .1161000063329993e-07, 1 .53396815250062e-08, 5 .094453570109181e-08, 1 .4005379966874898e-08, 2 .6282036102998063e-08, 8 .778433624456738e-08, 7 .772066545896905e-09, 4 .228875383205377e-08, 3 .3243779284930497e-07, 7 .729244799747903e-08, 7 .636901111496286e-10, 5 .989500806435899e-08, 1 .326090597331131e-07, 1 .2853634245857393e-07, 8 .844242671557367e-09, 1 .0194374766570036e-07, 2 .493779334145074e-07, 1 .6547971881664125e-07, 1 .1762754326127833e-08, 1 .1496195639892903e-07, 2 .9342709240154363e-07, 1 .326124099421122e-08, 8 .630262726683213e-08, 5 .7394842656322e-08, 1 
.1094081031615133e-07, 2 .2933713239581266e-07, 3 .4706170026765903e-07, 1 .4751107357824367e-07, 1 .502495017291494e-08, 6 .454319390059027e-08, 5 .164533689594464e-08, 6 .23741556182722e-08, 1 .293601457064142e-07, 1 .4052071506398534e-08, 5 .386946000385251e-08, 2 .0827554791935654e-08, 1 .3040637902861363e-08, 1 .0578981601838677e-07, 1 .5079727688771527e-08, 8 .92632726845477e-07, 4 .6374381668101705e-08, 7 .481006036869076e-07, 5 .883147302654379e-09, 2 .8707685117979054e-09, 8 .381598490814213e-07, 7 .341958596640552e-09, 1 .4245998158912698e-08, 1 .0926417104428765e-07, 1 .1308178216040687e-07, 2 .52339901862797e-07, 1 .1782835684925885e-07, 4 .6678056975224536e-08, 2 .7959197179683315e-09, 3 .4363861090014325e-08, 1 .4674496640054713e-07, 3 .5396915620822256e-08, 2 .0581127557761647e-07, 7 .18387909159901e-08, 2 .7693943138729082e-08, 4 .5493386835460115e-08, 1 .9559182717898693e-08, 1 .5359708172013598e-08, 1 .2336623278486059e-08, 2 .9570605519779747e-08, 2 .877552560676122e-07, 9 .051845495378075e-07, 2 .3732602016934834e-07, 1 .6521676471370483e-08, 1 .5478875070584763e-08, 3 .526786329643983e-08, 3 .616410637619083e-08, 1 .61590953950963e-08, 7 .65007328595857e-08, 1 .9661483108279754e-08, 4 .917534823789538e-08, 1 .1712612746350715e-07, 1 .0889253054813253e-08, 1 .494120169809321e-06, 1 .018585660261806e-08, 3 .7575969003000864e-08, 2 .097097784314883e-08, 3 .368558054717141e-08, 4 .845588819080149e-09, 6 .039624622644624e-07, 1 .037331109898787e-08, 2 .841650257323636e-07, 4 .4990630954089283e-07, 3 .463186004637464e-08, 7 .720684180867465e-08, 1 .471122175189521e-07, 1 .1601575522490748e-07, 4 .007488030310924e-07, 3 .025649775167949e-08, 6 .706784461130155e-08, 2 .0128741340386114e-08, 1 .5987744461654074e-09, 4 .1919822280078733e-08, 1 .3167154477855547e-08, 3 .231814815762846e-08, 9 .247659704669786e-08, 1 .3075300842047e-07, 1 .0574301256838226e-07, 3 .762165334819656e-08, 1 .0942246575496029e-07, 7 .001474955359299e-08, 2 .742706151082075e-08, 
2 .0766625752344225e-08, 4 .5403403703403455e-08, 3 .39040298058535e-08, 1 .0469661759771043e-07, 2 .8271578855765256e-08, 3 .406226767310727e-07, 5 .146206945028098e-07, 6 .740708613506285e-07, 6 .382248063374618e-09, 3 .63878704945364e-08, 3 .626059807970705e-08, 1 .6065602892467723e-07, 3 .639055989879125e-07, 6 .232691696084203e-09, 4 .805490050330263e-08, 3 .372633727849461e-08, 6 .328880317596486e-07, 6 .480631498106959e-08, 2 .1165197949812864e-07, 8 .38779143919055e-08, 1 .7589144363228115e-08, 2 .729027670511641e-09, 2 .144795097080987e-08, 7 .861271456022223e-08, 2 .0118186228046397e-08, 2 .8407685093156942e-08, 2 .4922530883486615e-07, 2 .0156670998972004e-08, 2 .6551649767725394e-08, 2 .7848242822869906e-08, 6 .907123761834555e-09, 1 .880543720744754e-08, 1 .3006903998302732e-08, 3 .685918272822164e-07, 3 .967941211158177e-07, 2 .7592133022835696e-08, 2 .5228947819755376e-08, 1 .547002881352455e-07, 3 .689306637966183e-08, 1 .440177199718562e-09, 2 .1504929392790473e-08, 5 .068111263994979e-08, 5 .081711407228795e-08, 1 .171875219085905e-08, 5 .409278358570191e-08, 7 .138276600926474e-07, 2 .5237213208129106e-07, 7 .072044638789521e-08, 7 .199763984999663e-08, 1 .2525473103153217e-08, 3 .4803417747752974e-07, 1 .9591827538079087e-07, 1 .2404700555634918e-07, 1 .234617457157583e-07, 1 .9201337408958352e-08, 1 .9895249181445251e-07, 3 .7876677794201896e-08, 1 .0629785052174157e-08, 1 .2437127772102485e-08, 2 .1861892207653e-07, 2 .6181456291851646e-07, 1 .112900775979142e-07, 1 .0776630432474121e-07, 6 .380325157095967e-09, 3 .895085143312826e-09, 1 .5762756788717525e-07, 2 .909027019271093e-09, 1 .0381050685737137e-08, 2 .8135211493918177e-08, 1 .0778002490496874e-08, 1 .3605974125141529e-08, 2 .9236465692861202e-08, 1 .9189795352758665e-07, 2 .199506354827463e-07, 1 .326399790002597e-08, 4 .9004846403022384e-08, 2 .980837132682268e-09, 8 .926045680368588e-09, 1 .0996975774446582e-08, 7 .71560149104289e-09, 7 .454491246505768e-09, 5 .086162246925596e-08, 
1 .5129764108223753e-07, 1 .1960075596562092e-08, 1 .1323334270230134e-08, 9 .391332156383214e-09, 9 .585701832293125e-08, 1 .905532798218701e-08, 1 .8105303922766325e-08, 6 .179227796110354e-08, 6 .389401363549041e-08, 1 .1853179771037503e-08, 9 .37277544466042e-09, 1 .2332148457971925e-07, 1 .6522022860954166e-08, 1 .246116454467483e-07, 4 .196171854431441e-09, 3 .996593278543514e-08, 1 .2554556505506298e-08, 1 .4302138140465104e-08, 6 .631793780798034e-09, 5 .964224669696705e-09, 5 .556936244488497e-09, 1 .4192455921602232e-07, 1 .7613080771639034e-08, 3 .380189639301534e-07, 7 .85651934620546e-08, 2 .966783085867064e-08, 2 .8992105853831163e-06, 1 .3787366697215475e-06, 5 .313622430946907e-09, 2 .512852859126724e-08, 8 .406627216572815e-08, 4 .492839167369311e-08, 5 .408793057881667e-08, 2 .4239175999696272e-08, 4 .016805235096399e-07, 4 .1083545454512205e-08, 5 .4153481698904216e-08, 8 .640767212853007e-09, 5 .773256717134245e-08, 2 .6443152023603034e-07, 8 .953217047746875e-07, 2 .7994001783326894e-08, 5 .889480014786841e-09, 4 .1788819515886644e-08, 2 .8880645430717777e-08, 2 .135752907861388e-08, 2 .3024175277441827e-07, 8 .786625471657317e-08, 2 .0697297209437693e-09, 2 .236410523437371e-08, 3 .203276310870251e-09, 1 .176874686592555e-08, 6 .963571053120177e-08, 2 .271932153519174e-08, 7 .360382525689602e-09, 6 .922528772435044e-09, 3 .213871480056696e-08, 1 .370577820125618e-07, 1 .9815049157045905e-08, 1 .0578956377571558e-08, 2 .7049420481262132e-08, 2 .9755937713815683e-09, 2 .1773699288019088e-08, 1 .09755387001087e-08, 1 .991872444762066e-08, 2 .3882098076910552e-08, 2 .1357365653784655e-08, 6 .109098560358461e-09, 1 .1890497475519624e-08, 1 .1459891702259029e-08, 3 .73173456580389e-08, 1 .572620256240498e-08, 3 .404023374287135e-08, 3 .6921580459647885e-08, 9 .281765045443535e-08, 1 .2323201303843234e-07, 4 .2347593876002065e-08, 1 .7423728237986325e-08, 5 .8113389656000436e-08, 3 .931436154402945e-08, 2 .3690461148362374e-08, 1 
.792850135018398e-08, 1 .440664210150544e-08, 7 .019830494670032e-09, 6 .041522482291839e-08, 4 .867479930226182e-08, 1 .0685319296044327e-08, 1 .0051243393149889e-08, 4 .2426261614991745e-08, 2 .607815297039906e-08, 5 .136670200300 841e-09, 1 .69729952315123e-09, 1 .9131586981302462e-08, 2 .111743526711507e-07, 1 .337269672774255e-08, 2 .0002481448955223e-08, 1 .0454256482717028e-07, 2 .8144228281234973e-08, 2 .1344791889532644e-07, 2 .1046110632028103e-08, 1 .9114453664315079e-07, 3 .957693550660224e-08, 2 .931631826186276e-08, 1 .105203111251285e-07, 4 .84007678380749e-08, 5 .583606110803885e-08, 1 .2130111315400427e-07, 1 .77621615193857e-08, 2 .5610853882085394e-08, 1 .203865309662433e-07, 4 .674859610531712e-09, 1 .5916098661250544e-08, 3 .147594185293201e-08, 6 .147686093527227e-08, 2 .204641802450169e-08, 3 .257763410147163e-07, 1 .198914532096751e-07, 2 .3818989802748547e-07, 1 .4909986134625797e-08, 5 .10168831624469e-08, 5 .5142201915714395e-08, 2 .288550327023131e-08, 5 .714110073995471e-08, 5 .185095801607531e-07, 4 .977285783525076e-08, 1 .1049896109227575e-08, 1 .264099296349741e-07, 8 .174881571676451e-08 ]]} * Connection #0 to host localhost left intact Deploy the model with Open Inference Protocol \u00b6 Test the Model locally \u00b6 Once you've got your model serialised model.pdmodel , we can then use KServe Paddle Server to spin up a local server. Note This step is optional and just meant for testing, feel free to jump straight to deploying with InferenceService . Using KServe PaddleServer \u00b6 Pre-requisites \u00b6 Firstly, to use KServe Paddle server locally, you will first need to install the paddleserver runtime package in your local environment. Clone the KServe repository and navigate into the directory. git clone https://github.com/kserve/kserve Install paddleserver runtime. Kserve uses Poetry as the dependency management tool. Make sure you have already installed poetry . 
cd python/paddleserver poetry install Serving model locally \u00b6 The paddleserver package takes two arguments. --model_dir : The model directory path where the model is stored. --model_name : The name of the model deployed in the model server, the default value is model . This is optional. With the paddleserver runtime package installed locally, you should now be ready to start our server as: python3 paddleserver --model_dir /path/to/model_dir --model_name paddle-v2-resnet50 Deploy the Model with REST endpoint through InferenceService \u00b6 Lastly, you will use KServe to deploy the trained model onto Kubernetes. For this, you will just need to use version v1beta1 of the InferenceService CRD and set the protocolVersion field to v2 . Yaml apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"paddle-v2-resnet50\" spec : predictor : model : modelFormat : name : paddle protocolVersion : v2 runtime : kserve-paddleserver storageUri : \"gs://kfserving-examples/models/paddle/resnet\" Apply the InferenceService yaml to get the REST endpoint kubectl kubectl apply -f paddle-v2-resnet.yaml Test the Deployed Model \u00b6 You can now test your deployed model by sending a sample request. Note that this request needs to follow the Open Inference Protocol . You can use the example payload jay-v2.json as the sample input to test the model. Determine the ingress IP and port and set INGRESS_HOST and INGRESS_PORT . 
Now, you can use curl to send the inference request as: SERVICE_HOSTNAME = $( kubectl get inferenceservice paddle-v2-resnet50 -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v \\ -H \"Host: ${ SERVICE_HOSTNAME } \" \\ -H \"Content-Type: application/json\" \\ -d @./jay-v2.json \\ http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v2/models/paddle-v2-resnet50/infer Expected Output { \"model_name\" : \"paddle-v2-resnet50\" , \"model_version\" : null , \"id\" : \"afa5ec0b-a5c7-454b-a464-53ba931b22df\" , \"parameters\" : null , \"outputs\" :[{ \"name\" : \"output-0\" , \"shape\" :[ 1 , 1000 ], \"datatype\" : \"FP32\" , \"parameters\" : null , \"data\" :[ 6.736678770380422e-9 , 1.1535990829258935e-8 , 5.142250714129659e-8 , 6.647170636142619e-8 , 4.094492567219277e-8 , 1.3402451770616608e-7 , 9.355561303436843e-8 , 2.8935891904779965e-8 , 6.845367295227334e-8 , 7.680615965455218e-8 , 0.0000020334689452283783 , 0.0000011085678579547675 , 2.3477592492326949e-7 , 6.582037030966603e-7 , 0.00012373103527352214 , 4.2878804151769145e-7 , 0.000006419959845516132 , 0.9993496537208557 , 0.00007372002437477931 , 0.00003101135735050775 , 0.0000056028093240456656 , 0.0000021862508674530545 , 1.9544044604913324e-8 , 3.728893887000595e-7 , 4.2903633357127546e-7 , 1.8251179767503345e-7 , 7.159925985433802e-8 , 9.231618136595898e-9 , 6.469241498052725e-7 , 7.031690341108288e-9 , 4.451231561120039e-8 , 1.2455971898361895e-7 , 9.44632745358831e-8 , 4.347704418705689e-8 , 4.658220120745682e-7 , 6.797721141538204e-8 , 2.1060276367279585e-7 , 2.2605123106700376e-8 , 1.4311490303953178e-7 , 7.951298641728499e-8 , 1.2341783417468832e-7 , 0.0000010921713737843675 , 0.000015243892448779661 , 3.1173343018053856e-7 , 2.4152058131221565e-7 , 6.863762536113427e-8 , 8.467682022228473e-8 , 9.4246772164297e-8 , 1.0219210366813058e-8 , 3.3770753304906975e-8 , 3.6928835100979995e-8 , 1.3694031508748594e-7 , 1.0674284567357972e-7 , 2.599483650556067e-7 , 3.4866405940192635e-7 , 3.132053549848024e-8 , 
3.574873232992104e-7 , 6.64843895492595e-8 , 3.1638955988455564e-7 , 0.0000012095878219042788 , 8.66409024524728e-8 , 4.0144172430700564e-8 , 1.2544761318622477e-7 , 3.3201178695208e-8 , 1.9731444922399533e-7 , 3.806405572959193e-7 , 1.3827865075199952e-7 , 2.300225965257141e-8 , 7.14422512260171e-8 , 2.851114544455413e-8 , 2.982567437470607e-8 , 8.936032713791064e-8 , 6.22388370175031e-7 , 6.478838798784636e-8 , 1.3663023423760023e-7 , 9.973181391842445e-8 , 2.5761554667269593e-8 , 4.130220077058766e-8 , 3.9384463690339544e-8 , 1.2158079698565416e-7 , 0.000004302821707824478 , 0.0000018179063090428826 , 0.0000018520155435908237 , 0.0000016246107179540559 , 0.000016448313544970006 , 0.000010544916221988387 , 0.000003993061909568496 , 2.646479799750523e-7 , 0.000019193475964129902 , 4.803242745765601e-7 , 1.696285067964709e-7 , 0.000004550505764200352 , 0.00004235929372953251 , 0.000004443338639248395 , 0.000005104009687784128 , 0.000013506396498996764 , 4.1758724478313525e-7 , 4.494491463447048e-7 , 3.156698369366495e-7 , 0.0000010557599807725637 , 1.336463917311903e-8 , 1.3893659556174498e-8 , 6.770379457066156e-8 , 1.4129696523923485e-7 , 7.170518756538513e-8 , 7.934466594861078e-8 , 2.639154317307657e-8 , 2.6134321373660896e-8 , 7.196725881897237e-9 , 2.1752363466021052e-8 , 6.684639686227456e-8 , 3.417795824134373e-8 , 1.6228275967478112e-7 , 4.107114648377319e-7 , 6.472135396506928e-7 , 2.951379372007068e-7 , 5.653474133282543e-9 , 4.830144462175667e-8 , 8.887481861563629e-9 , 3.7306168820805397e-8 , 1.7784264727538357e-8 , 4.641905082536368e-9 , 3.413118676576232e-8 , 1.937393818707278e-7 , 0.0000012980176506971475 , 3.5641004814124244e-8 , 2.149332445355867e-8 , 3.055293689158134e-7 , 1.5532516783878236e-7 , 0.0000014520978766086046 , 3.488464628276233e-8 , 0.00003825438398052938 , 4.5088432898410247e-7 , 4.1766969616219285e-7 , 6.770622462681786e-7 , 1.4142248971893423e-7 , 0.000014235997696232516 , 6.293820433711517e-7 , 0.000004762866865348769 , 
9.024900577969674e-7 , 9.058987870957935e-7 , 0.0000015713684433649178 , 1.5720647184025438e-7 , 1.818536503606083e-7 , 7.193188622522939e-8 , 0.0000011952824934269302 , 8.874837362782273e-7 , 2.0870831463071227e-7 , 9.906239029078279e-8 , 7.793621747964607e-9 , 1.0058498389753368e-7 , 4.2059440374941914e-7 , 1.843624630737395e-7 , 1.6437947181202617e-7 , 7.025352743994517e-8 , 2.570448600636155e-7 , 7.586877615040066e-8 , 7.841313731660193e-7 , 2.495309274763713e-7 , 5.157681925993529e-8 , 4.0674127177453556e-8 , 7.531796519799627e-9 , 4.797485431140558e-8 , 1.7419973019627832e-8 , 1.7958679165985814e-7 , 1.2566392371127222e-8 , 8.975440124459055e-8 , 3.26965476915575e-8 , 1.1208359751435637e-7 , 3.906746215420753e-8 , 4.6769045525252295e-8 , 1.8523553535487736e-7 , 1.4833052830454108e-7 , 1.2279349448363064e-7 , 0.0000010729105497375713 , 3.6538490011395197e-9 , 1.6198403329781286e-7 , 1.6190719875908144e-8 , 1.2004933580556099e-7 , 1.4800277448046018e-8 , 4.02294837442696e-8 , 2.15060893538066e-7 , 1.1925696696835075e-7 , 4.8982514044837444e-8 , 7.608920071788816e-8 , 2.3137479487900237e-8 , 8.521050176568679e-8 , 9.586213423062873e-8 , 1.3351650807180704e-7 , 3.021699157557123e-8 , 4.423876376336011e-8 , 2.610667060309879e-8 , 2.3977091245797055e-7 , 1.3192564551900432e-7 , 1.6734931662654162e-8 , 1.588336999702733e-7 , 4.0643516285854275e-7 , 8.753454494581092e-8 , 8.366999395548191e-7 , 3.437598650180007e-8 , 7.847892646850596e-8 , 8.526394701391382e-9 , 9.601382799928615e-8 , 5.258924034023948e-7 , 1.3557448141909845e-7 , 1.0307226716577134e-7 , 1.0429813457335513e-8 , 5.187714435805901e-8 , 2.187001335585137e-8 , 1.1791439824548888e-8 , 2.98065643278278e-8 , 4.338393466696289e-8 , 2.9991046091026874e-8 , 2.8507610494443725e-8 , 3.058665143385042e-8 , 6.441099031917474e-8 , 1.5364101102477434e-8 , 1.5973883549236234e-8 , 2.5736850872704053e-8 , 1.0903765712555469e-7 , 3.2118737891551064e-8 , 6.819742992547617e-9 , 1.9251311300649832e-7 , 
5.8258109447706374e-8 , 1.8765761922168167e-7 , 4.0070790419122204e-7 , 1.5791577823165426e-8 , 1.950158434738114e-7 , 1.0142063189277906e-8 , 2.744815041921811e-8 , 1.2843531571604672e-8 , 3.7297493094001766e-8 , 7.407496838141014e-8 , 4.20607833007125e-8 , 1.6924804668860816e-8 , 1.459203531339881e-7 , 4.344977000414474e-8 , 1.7191403856031684e-7 , 3.5817443233554513e-8 , 8.440249388286247e-9 , 4.194829728021432e-8 , 2.514032360068086e-8 , 2.8340199520471288e-8 , 8.747196034164517e-8 , 8.277125651545703e-9 , 1.1676293709683705e-8 , 1.4548514570833504e-7 , 7.200282148289716e-9 , 0.000002623600948936655 , 5.675736929333652e-7 , 0.0000019483527466945816 , 6.752595282932816e-8 , 8.168475318370838e-8 , 1.0933046468153407e-7 , 1.670913718498923e-7 , 3.1387276777650186e-8 , 2.973524537708272e-8 , 5.752163900751839e-8 , 5.850877471402782e-8 , 3.2544622285968217e-7 , 3.330221431951941e-8 , 4.186786668469722e-7 , 1.5085906568401697e-7 , 2.3346819943981245e-7 , 2.86402780602657e-7 , 2.2940319865938363e-7 , 1.8537603807544656e-7 , 3.151798182443599e-7 , 0.0000011075967449869495 , 1.5369782602192572e-7 , 1.9237509718550427e-7 , 1.64044664074936e-7 , 2.900835340824415e-7 , 1.246654903752642e-7 , 5.802622027317739e-8 , 5.186220519703966e-8 , 6.0094205167615655e-9 , 1.2333241272699524e-7 , 1.3798474185477971e-7 , 1.7370231830682314e-7 , 5.617761189569137e-7 , 5.1604470030497396e-8 , 4.813277598714194e-8 , 8.032698417537176e-8 , 0.0000020645263703045202 , 5.638597713186755e-7 , 8.794199857220519e-7 , 0.0000034785980460583232 , 2.972389268052211e-7 , 3.3904532870110415e-7 , 9.469074058188198e-8 , 3.754845678827223e-8 , 1.5679037801419327e-7 , 8.203105039683578e-8 , 6.847962641387539e-9 , 1.8251624211984563e-8 , 6.050240841659615e-8 , 3.956342808919544e-8 , 1.0699947949888156e-7 , 3.2566634899922065e-7 , 3.5369430406717584e-7 , 7.326295303755614e-8 , 4.85765610847011e-7 , 7.717713401689252e-7 , 3.4567779749750116e-8 , 3.246204585138912e-7 , 0.0000031608601602783892 , 
5.33099466792919e-8 , 3.645687343123427e-7 , 5.48158936908294e-7 , 4.62306957160763e-8 , 1.3466177506415988e-7 , 4.3529482240955986e-8 , 1.6404105451783835e-7 , 2.463695381038633e-8 , 5.958712634424046e-8 , 9.493651020875404e-8 , 5.523462576206839e-8 , 5.7412357534758485e-8 , 0.000011850350347231142 , 0.0000058263944993086625 , 0.0000074208674050169066 , 9.127966222877149e-7 , 0.0000020019581370434025 , 0.000001033498961078294 , 3.5146850763112525e-8 , 0.000002058995278275688 , 3.5655509122989315e-7 , 6.873234070781109e-8 , 2.1935298022413008e-9 , 5.560363547374436e-8 , 3.3266996979364194e-7 , 1.307369217329324e-7 , 2.718762992515167e-8 , 1.0462929189714032e-8 , 7.466680358447775e-7 , 6.923166040451179e-8 , 1.6145664361033596e-8 , 8.568521003837759e-9 , 4.76221018175238e-9 , 1.233977116044116e-7 , 8.340628632197422e-9 , 3.2649041248333788e-9 , 5.0632489312363305e-9 , 4.0704994930251814e-9 , 1.2043538610839732e-8 , 5.105608380517879e-9 , 7.267142887457112e-9 , 1.184516307262129e-7 , 7.53557927168913e-8 , 6.386964201965384e-8 , 1.6212936770898523e-8 , 2.610429419291904e-7 , 6.979425393183192e-7 , 6.647513117741255e-8 , 7.717492849224072e-7 , 6.651206945207377e-7 , 3.324495310152997e-7 , 3.707282019149716e-7 , 3.99564243025452e-7 , 6.411632114122767e-8 , 7.107352217872176e-8 , 1.6380016631956096e-7 , 6.876800995314625e-8 , 3.462474467141874e-7 , 2.0256503319160402e-7 , 6.19610148078209e-7 , 2.6841073363925716e-8 , 6.720335363752383e-7 , 0.0000011348340649419697 , 0.0000018397931853542104 , 6.397251581802266e-7 , 7.257533241045167e-8 , 4.2213909523525217e-7 , 3.9657925299252383e-7 , 1.4037439655112394e-7 , 3.249856774800719e-7 , 1.5857655455420172e-7 , 1.1122217102865761e-7 , 7.391420808744442e-8 , 3.42322238111592e-7 , 5.39796154441774e-8 , 8.517296379295658e-8 , 0.000004061009803990601 , 0.000014478755474556237 , 7.317032757470088e-9 , 6.9484960008026064e-9 , 4.468917325084476e-8 , 9.23141172393116e-8 , 5.411982328951126e-8 , 2.2242811326123046e-7 , 
1.7609554703312824e-8 , 2.0906279374344194e-8 , 3.6797682678724186e-9 , 6.177919686933819e-8 , 1.7920288541972695e-7 , 2.6279179721200308e-8 , 2.6988200119149042e-8 , 1.6432807115052128e-7 , 1.2827612749788386e-7 , 4.468908798571647e-8 , 6.316552969565237e-8 , 1.9461760203398626e-8 , 2.087125849925542e-8 , 2.2414580413965268e-8 , 2.4765244077684656e-8 , 6.785398465325443e-9 , 2.4248794971981624e-8 , 4.554979504689527e-9 , 2.8977037658250993e-8 , 2.0402325162649504e-8 , 1.600950270130852e-7 , 2.0199709638291097e-7 , 1.611188515937556e-8 , 5.964113825029926e-8 , 4.098318573397819e-9 , 3.9080127578472457e-8 , 7.511338218080255e-9 , 5.965624154669058e-7 , 1.6478223585636442e-7 , 1.4106989354445432e-8 , 3.2855584919389e-8 , 3.3387166364917675e-9 , 1.220043444050134e-8 , 4.624639160510924e-8 , 6.842309385746148e-9 , 1.74262879681919e-8 , 4.6611329906909305e-8 , 9.331947836699328e-8 , 1.2306078644996887e-7 , 1.2359445022980253e-8 , 1.1173199254699284e-8 , 2.7724862405875683e-8 , 2.419210147763806e-7 , 3.451186785241589e-7 , 2.593766978975509e-8 , 9.964568192799561e-8 , 9.797809674694236e-9 , 1.9085564417764544e-7 , 3.972706252852731e-8 , 2.6639204619982593e-8 , 6.874148805735558e-9 , 3.146993776681484e-8 , 2.4086594407890516e-7 , 1.3126927456141857e-7 , 2.1254339799270383e-7 , 2.050203384840188e-8 , 3.694976058454813e-8 , 6.563175816154398e-7 , 2.560050127442537e-8 , 2.6882981174480847e-8 , 6.880636078676616e-7 , 2.0092733166166e-7 , 2.788039665801989e-8 , 2.628409134786125e-8 , 5.1678345158734373e-8 , 1.8935413947929192e-7 , 4.61852835087484e-7 , 1.1086777718105623e-8 , 1.4542604276357451e-7 , 2.8737009216683873e-8 , 6.105167926762078e-7 , 1.2016463379893594e-8 , 1.3944705301582871e-7 , 2.093712758721722e-8 , 4.3801410498645055e-8 , 1.966320795077081e-8 , 6.654448991838535e-9 , 1.1149590584125235e-8 , 6.424939158478082e-8 , 6.971554888934861e-9 , 3.260019587614238e-9 , 1.4260189473702667e-8 , 2.7895078247297533e-8 , 8.11578289017234e-8 , 2.5995715802196173e-8 , 
2.2855578762914774e-8 , 1.055962854934478e-7 , 8.145542551574181e-8 , 3.7793686402665116e-8 , 4.881891513264236e-8 , 2.342062366267328e-8 , 1.059935517133681e-8 , 3.604105103249822e-8 , 5.062430830093945e-8 , 3.6804440384230475e-8 , 1.501580193519203e-9 , 0.0000014475033367489232 , 0.000001076210423889279 , 1.304991315009829e-7 , 3.073601462233455e-8 , 1.7184021317007137e-8 , 2.0421090596300928e-8 , 7.904992216367646e-9 , 1.6902052379919041e-7 , 1.2416506933732308e-8 , 5.4758292122869534e-8 , 2.6250422280327257e-8 , 1.3261367115546818e-8 , 6.29807459517906e-8 , 1.270998595259698e-8 , 2.0171681569536304e-7 , 4.386637186826192e-8 , 6.962349630157405e-8 , 2.9565120485131047e-7 , 7.925131626507209e-7 , 2.0868920103112032e-7 , 1.7341794489311724e-7 , 4.2942417621816276e-8 , 4.213406956665722e-9 , 8.824785169281313e-8 , 1.7341569957807224e-8 , 7.321587247588468e-8 , 1.7941774288487977e-8 , 1.1245148101579616e-7 , 4.242405395871174e-7 , 8.259573469615589e-9 , 1.1336403105133286e-7 , 8.268798978861014e-8 , 2.2186977588489754e-8 , 1.9539720952366224e-8 , 1.0675703876472653e-8 , 3.288517547161973e-8 , 2.4340963022950746e-8 , 6.639137239972115e-8 , 5.604687380866835e-9 , 1.386604697728444e-8 , 6.675873720496384e-8 , 1.1355886009312144e-8 , 3.132159633878473e-7 , 3.12451788886392e-8 , 1.502181845580708e-7 , 1.3461754377885882e-8 , 1.8882955998833495e-7 , 4.645742279762999e-8 , 4.6453880742092224e-8 , 7.714453964524637e-9 , 3.5857155467056145e-8 , 7.60832108426257e-9 , 4.221501370693659e-8 , 4.3407251126836854e-9 , 1.340157496088068e-8 , 8.565600495558101e-8 , 1.7045413969185574e-8 , 5.4221903411644234e-8 , 3.021912675649219e-8 , 6.153376119755194e-8 , 3.938857240370908e-9 , 4.135628017820636e-8 , 1.781920389021252e-8 , 4.3105885083605244e-8 , 3.903354972578654e-9 , 7.663085455078544e-8 , 1.1890405993142394e-8 , 9.304217840622186e-9 , 1.0968062014171664e-9 , 1.0536767902635802e-8 , 1.1516804221400889e-7 , 8.134522886393825e-7 , 5.952623993721318e-8 , 2.806350174466843e-8 , 
1.2833099027886874e-8 , 1.0605690192733164e-7 , 7.872949936427176e-7 , 2.7501393162765453e-8 , 3.936289072470345e-9 , 2.0519442145428002e-8 , 7.394815870753746e-9 , 3.598397313453461e-8 , 2.5378517065632877e-8 , 4.698972233541099e-8 , 7.54952989012736e-9 , 6.322805461422831e-7 , 5.582006412652163e-9 , 1.29640980617296e-7 , 1.5874988434916304e-8 , 3.3837810775594335e-8 , 6.474512037613067e-9 , 9.121148281110436e-8 , 1.3918511676536127e-8 , 8.230025549949005e-9 , 2.7061290097663004e-8 , 2.6095918315149902e-8 , 5.722363471960534e-9 , 6.963475698285038e-7 , 4.685091781198025e-8 , 9.590579885809802e-9 , 2.099205858030473e-7 , 3.082160660028421e-8 , 3.563162565001221e-8 , 7.326312925215461e-7 , 0.0000021759731225756695 , 2.407518309155421e-7 , 2.974515780351794e-7 , 2.529018416908002e-8 , 7.667950718825978e-9 , 2.663289251358947e-7 , 3.4358880185436647e-8 , 2.3130198201215535e-8 , 3.1239693498719134e-8 , 2.8691621878351725e-7 , 3.895845068768722e-8 , 2.4184130253956937e-8 , 1.1582445225144511e-8 , 5.1545349322168477e-8 , 2.034345492063494e-8 , 8.201963197507212e-8 , 1.164153573540716e-8 , 5.496356720868789e-7 , 1.1682151246361627e-8 , 4.7576914852243135e-8 , 1.6349824605299546e-8 , 4.090862759653646e-8 , 2.1271189609706198e-7 , 1.6697286753242224e-7 , 3.989708119433999e-8 , 0.000002852450279533514 , 1.2500372292834072e-7 , 2.4846613655427063e-7 , 1.245429093188477e-8 , 2.9700272463628608e-8 , 4.250991558762962e-9 , 1.61443480806156e-7 , 2.6386018703306036e-7 , 7.638056409575711e-9 , 3.4455793773702226e-9 , 7.273289526210647e-8 , 1.7631434090503717e-8 , 7.58661311550668e-9 , 2.1547013062672704e-8 , 1.2675349125856883e-7 , 2.5637149292379036e-8 , 3.500976220038865e-8 , 6.472243541111311e-8 , 8.387915251262257e-9 , 3.069512288789156e-8 , 7.520387867998579e-8 , 1.5724964441687916e-7 , 1.9634005354873807e-7 , 1.2290831818972947e-7 , 1.112118730439704e-9 , 1.546895944670723e-8 , 9.91701032404535e-9 , 6.882473257974198e-7 , 8.267616635748709e-8 , 4.469531234008173e-8 , 
2.075201344098332e-8 , 8.649378457903367e-8 , 5.202766573120243e-8 , 4.5564942041664835e-8 , 2.0319955496006514e-8 , 8.705182352741758e-9 , 6.452066969586667e-8 , 2.1777438519166026e-8 , 1.030954166481024e-8 , 3.211904342492744e-8 , 2.3336936294526822e-7 , 8.054096056753224e-9 , 1.9623354319264763e-7 , 1.2888089884199871e-7 , 1.5392496166555247e-8 , 1.401903038100727e-9 , 5.696818305978013e-8 , 6.080025372057207e-9 , 1.0782793324892737e-8 , 2.4260730313585555e-8 , 1.9388659566743627e-8 , 2.2970310453729326e-7 , 1.9971754028347277e-8 , 2.8477993296860404e-8 , 5.2273552597625894e-8 , 2.7392806600801123e-7 , 9.857291161097237e-8 , 3.12910977129377e-8 , 4.151442212219081e-8 , 5.251196366629074e-9 , 0.000001580681100676884 , 8.547603442821128e-7 , 1.068913135782168e-8 , 0.0000010621830597301596 , 7.737313012512459e-8 , 6.394216711669287e-8 , 1.1698345758759388e-7 , 1.0486609625104393e-7 , 2.1161000063329993e-7 , 1.53396815250062e-8 , 5.094453570109181e-8 , 1.4005379966874898e-8 , 2.6282036102998063e-8 , 8.778433624456738e-8 , 7.772066545896905e-9 , 4.228875383205377e-8 , 3.3243779284930497e-7 , 7.729244799747903e-8 , 7.636901111496286e-10 , 5.989500806435899e-8 , 1.326090597331131e-7 , 1.2853634245857393e-7 , 8.844242671557367e-9 , 1.0194374766570036e-7 , 2.493779334145074e-7 , 1.6547971881664125e-7 , 1.1762754326127833e-8 , 1.1496195639892903e-7 , 2.9342709240154363e-7 , 1.326124099421122e-8 , 8.630262726683213e-8 , 5.7394842656322e-8 , 1.1094081031615133e-7 , 2.2933713239581266e-7 , 3.4706170026765903e-7 , 1.4751107357824367e-7 , 1.502495017291494e-8 , 6.454319390059027e-8 , 5.164533689594464e-8 , 6.23741556182722e-8 , 1.293601457064142e-7 , 1.4052071506398534e-8 , 5.386946000385251e-8 , 2.0827554791935654e-8 , 1.3040637902861363e-8 , 1.0578981601838677e-7 , 1.5079727688771527e-8 , 8.92632726845477e-7 , 4.6374381668101705e-8 , 7.481006036869076e-7 , 5.883147302654379e-9 , 2.8707685117979054e-9 , 8.381598490814213e-7 , 7.341958596640552e-9 , 1.4245998158912698e-8 , 
1.0926417104428765e-7 , 1.1308178216040687e-7 , 2.52339901862797e-7 , 1.1782835684925885e-7 , 4.6678056975224536e-8 , 2.7959197179683315e-9 , 3.4363861090014325e-8 , 1.4674496640054713e-7 , 3.5396915620822256e-8 , 2.0581127557761647e-7 , 7.18387909159901e-8 , 2.7693943138729082e-8 , 4.5493386835460115e-8 , 1.9559182717898693e-8 , 1.5359708172013598e-8 , 1.2336623278486059e-8 , 2.9570605519779747e-8 , 2.877552560676122e-7 , 9.051845495378075e-7 , 2.3732602016934834e-7 , 1.6521676471370483e-8 , 1.5478875070584763e-8 , 3.526786329643983e-8 , 3.616410637619083e-8 , 1.61590953950963e-8 , 7.65007328595857e-8 , 1.9661483108279754e-8 , 4.917534823789538e-8 , 1.1712612746350715e-7 , 1.0889253054813253e-8 , 0.000001494120169809321 , 1.018585660261806e-8 , 3.7575969003000864e-8 , 2.097097784314883e-8 , 3.368558054717141e-8 , 4.845588819080149e-9 , 6.039624622644624e-7 , 1.037331109898787e-8 , 2.841650257323636e-7 , 4.4990630954089283e-7 , 3.463186004637464e-8 , 7.720684180867465e-8 , 1.471122175189521e-7 , 1.1601575522490748e-7 , 4.007488030310924e-7 , 3.025649775167949e-8 , 6.706784461130155e-8 , 2.0128741340386114e-8 , 1.5987744461654074e-9 , 4.1919822280078733e-8 , 1.3167154477855547e-8 , 3.231814815762846e-8 , 9.247659704669786e-8 , 1.3075300842047e-7 , 1.0574301256838226e-7 , 3.762165334819656e-8 , 1.0942246575496029e-7 , 7.001474955359299e-8 , 2.742706151082075e-8 , 2.0766625752344225e-8 , 4.5403403703403455e-8 , 3.39040298058535e-8 , 1.0469661759771043e-7 , 2.8271578855765256e-8 , 3.406226767310727e-7 , 5.146206945028098e-7 , 6.740708613506285e-7 , 6.382248063374618e-9 , 3.63878704945364e-8 , 3.626059807970705e-8 , 1.6065602892467723e-7 , 3.639055989879125e-7 , 6.232691696084203e-9 , 4.805490050330263e-8 , 3.372633727849461e-8 , 6.328880317596486e-7 , 6.480631498106959e-8 , 2.1165197949812864e-7 , 8.38779143919055e-8 , 1.7589144363228115e-8 , 2.729027670511641e-9 , 2.144795097080987e-8 , 7.861271456022223e-8 , 2.0118186228046397e-8 , 2.8407685093156942e-8 , 
2.4922530883486615e-7 , 2.0156670998972004e-8 , 2.6551649767725394e-8 , 2.7848242822869906e-8 , 6.907123761834555e-9 , 1.880543720744754e-8 , 1.3006903998302732e-8 , 3.685918272822164e-7 , 3.967941211158177e-7 , 2.7592133022835696e-8 , 2.5228947819755376e-8 , 1.547002881352455e-7 , 3.689306637966183e-8 , 1.440177199718562e-9 , 2.1504929392790473e-8 , 5.068111263994979e-8 , 5.081711407228795e-8 , 1.171875219085905e-8 , 5.409278358570191e-8 , 7.138276600926474e-7 , 2.5237213208129106e-7 , 7.072044638789521e-8 , 7.199763984999663e-8 , 1.2525473103153217e-8 , 3.4803417747752974e-7 , 1.9591827538079087e-7 , 1.2404700555634918e-7 , 1.234617457157583e-7 , 1.9201337408958352e-8 , 1.9895249181445251e-7 , 3.7876677794201896e-8 , 1.0629785052174157e-8 , 1.2437127772102485e-8 , 2.1861892207653e-7 , 2.6181456291851646e-7 , 1.112900775979142e-7 , 1.0776630432474121e-7 , 6.380325157095967e-9 , 3.895085143312826e-9 , 1.5762756788717525e-7 , 2.909027019271093e-9 , 1.0381050685737137e-8 , 2.8135211493918177e-8 , 1.0778002490496874e-8 , 1.3605974125141529e-8 , 2.9236465692861202e-8 , 1.9189795352758665e-7 , 2.199506354827463e-7 , 1.326399790002597e-8 , 4.9004846403022384e-8 , 2.980837132682268e-9 , 8.926045680368588e-9 , 1.0996975774446582e-8 , 7.71560149104289e-9 , 7.454491246505768e-9 , 5.086162246925596e-8 , 1.5129764108223753e-7 , 1.1960075596562092e-8 , 1.1323334270230134e-8 , 9.391332156383214e-9 , 9.585701832293125e-8 , 1.905532798218701e-8 , 1.8105303922766325e-8 , 6.179227796110354e-8 , 6.389401363549041e-8 , 1.1853179771037503e-8 , 9.37277544466042e-9 , 1.2332148457971925e-7 , 1.6522022860954166e-8 , 1.246116454467483e-7 , 4.196171854431441e-9 , 3.996593278543514e-8 , 1.2554556505506298e-8 , 1.4302138140465104e-8 , 6.631793780798034e-9 , 5.964224669696705e-9 , 5.556936244488497e-9 , 1.4192455921602232e-7 , 1.7613080771639034e-8 , 3.380189639301534e-7 , 7.85651934620546e-8 , 2.966783085867064e-8 , 0.0000028992105853831163 , 0.0000013787366697215475 , 5.313622430946907e-9 , 
2.512852859126724e-8 , 8.406627216572815e-8 , 4.492839167369311e-8 , 5.408793057881667e-8 , 2.4239175999696272e-8 , 4.016805235096399e-7 , 4.1083545454512205e-8 , 5.4153481698904216e-8 , 8.640767212853007e-9 , 5.773256717134245e-8 , 2.6443152023603034e-7 , 8.953217047746875e-7 , 2.7994001783326894e-8 , 5.889480014786841e-9 , 4.1788819515886644e-8 , 2.8880645430717777e-8 , 2.135752907861388e-8 , 2.3024175277441827e-7 , 8.786625471657317e-8 , 2.0697297209437693e-9 , 2.236410523437371e-8 , 3.203276310870251e-9 , 1.176874686592555e-8 , 6.963571053120177e-8 , 2.271932153519174e-8 , 7.360382525689602e-9 , 6.922528772435044e-9 , 3.213871480056696e-8 , 1.370577820125618e-7 , 1.9815049157045905e-8 , 1.0578956377571558e-8 , 2.7049420481262132e-8 , 2.9755937713815683e-9 , 2.1773699288019088e-8 , 1.09755387001087e-8 , 1.991872444762066e-8 , 2.3882098076910552e-8 , 2.1357365653784655e-8 , 6.109098560358461e-9 , 1.1890497475519624e-8 , 1.1459891702259029e-8 , 3.73173456580389e-8 , 1.572620256240498e-8 , 3.404023374287135e-8 , 3.6921580459647885e-8 , 9.281765045443535e-8 , 1.2323201303843234e-7 , 4.2347593876002065e-8 , 1.7423728237986325e-8 , 5.8113389656000436e-8 , 3.931436154402945e-8 , 2.3690461148362374e-8 , 1.792850135018398e-8 , 1.440664210150544e-8 , 7.019830494670032e-9 , 6.041522482291839e-8 , 4.867479930226182e-8 , 1.0685319296044327e-8 , 1.0051243393149889e-8 , 4.2426261614991745e-8 , 2.607815297039906e-8 , 5.136670200300841e-9 , 1.69729952315123e-9 , 1.9131586981302462e-8 , 2.111743526711507e-7 , 1.337269672774255e-8 , 2.0002481448955223e-8 , 1.0454256482717028e-7 , 2.8144228281234973e-8 , 2.1344791889532644e-7 , 2.1046110632028103e-8 , 1.9114453664315079e-7 , 3.957693550660224e-8 , 2.931631826186276e-8 , 1.105203111251285e-7 , 4.84007678380749e-8 , 5.583606110803885e-8 , 1.2130111315400427e-7 , 1.77621615193857e-8 , 2.5610853882085394e-8 , 1.203865309662433e-7 , 4.674859610531712e-9 , 1.5916098661250544e-8 , 3.147594185293201e-8 , 6.147686093527227e-8 , 
2.204641802450169e-8 , 3.257763410147163e-7 , 1.198914532096751e-7 , 2.3818989802748547e-7 , 1.4909986134625797e-8 , 5.10168831624469e-8 , 5.5142201915714395e-8 , 2.288550327023131e-8 , 5.714110073995471e-8 , 5.185095801607531e-7 , 4.977285783525076e-8 , 1.1049896109227575e-8 , 1.264099296349741e-7 , 8.174881571676451e-8 ]}]} Deploy the Model with GRPC endpoint through InferenceService \u00b6 Create the inference service resource and expose the gRPC port using the below yaml. Note Currently, KServe only supports exposing either HTTP or gRPC port. By default, HTTP port is exposed. Serverless RawDeployment apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"paddle-v2-resnet50-grpc\" spec : predictor : model : modelFormat : name : paddle protocolVersion : v2 runtime : kserve-paddleserver storageUri : \"gs://kfserving-examples/models/paddle/resnet\" ports : - name : h2c # knative expects grpc port name to be 'h2c' protocol : TCP containerPort : 8081 apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"paddle-v2-resnet50-grpc\" spec : predictor : model : modelFormat : name : paddle protocolVersion : v2 runtime : kserve-paddleserver storageUri : \"gs://kfserving-examples/models/paddle/resnet\" ports : - name : grpc-port # Istio requires the port name to be in the format [-] protocol : TCP containerPort : 8081 Apply the InferenceService yaml to get the gRPC endpoint kubectl kubectl apply -f paddle-v2-grpc.yaml Test the deployed model with grpcurl \u00b6 After the gRPC InferenceService becomes ready, grpcurl , can be used to send gRPC requests to the InferenceService . 
# download the proto file curl -O https://raw.githubusercontent.com/kserve/open-inference-protocol/main/specification/protocol/open_inference_grpc.proto INPUT_PATH = jay-v2-grpc.json PROTO_FILE = open_inference_grpc.proto SERVICE_HOSTNAME = $( kubectl get inferenceservice paddle-v2-resnet50-grpc -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) Determine the ingress IP and port and set INGRESS_HOST and INGRESS_PORT . Now, you can use curl to send the inference requests. The gRPC APIs follows the KServe prediction V2 protocol / Open Inference Protocol . For example, ServerReady API can be used to check if the server is ready: grpcurl \\ -plaintext \\ -proto ${ PROTO_FILE } \\ -authority ${ SERVICE_HOSTNAME } \\ ${ INGRESS_HOST } : ${ INGRESS_PORT } \\ inference.GRPCInferenceService.ServerReady Expected Output { \"ready\" : true } You can use the example payload jay-v2-grpc.json as the sample input to test the model. Notice that the input format differs from the in the previous REST endpoint example. ModelInfer API takes input following the ModelInferRequest schema defined in the grpc_predict_v2.proto file. grpcurl \\ -vv \\ -plaintext \\ -proto ${ PROTO_FILE } \\ -authority ${ SERVICE_HOSTNAME } \\ -d @ \\ ${ INGRESS_HOST } : ${ INGRESS_PORT } \\ inference.GRPCInferenceService.ModelInfer \\ <<< $( cat \" $INPUT_PATH \" ) Expected Output Resolved method descriptor: // The ModelInfer API performs inference using the specified model. Errors are // indicated by the google.rpc.Status returned for the request. The OK code // indicates success and other codes indicate failure. 
rpc ModelInfer ( .inference.ModelInferRequest ) returns ( .inference.ModelInferResponse ) ; Request metadata to send: ( empty ) Response headers received: content-type: application/grpc date: Tue, 10 Oct 2023 14 :55:27 GMT grpc-accept-encoding: identity, deflate, gzip server: istio-envoy x-envoy-upstream-service-time: 190 Estimated response size: 4093 bytes Response contents: { \"modelName\" : \"paddle-v2-resnet50-grpc\" , \"id\" : \"97db0c56-95d2-4171-afd5-f7609a87032d\" , \"outputs\" : [ { \"name\" : \"output-0\" , \"datatype\" : \"FP32\" , \"shape\" : [ \"1\" , \"1000\" ] , \"contents\" : { \"fp32Contents\" : [ 6 .7366788e-9,1.1535991e-8,5.1422507e-8,6.6471706e-8,4.0944926e-8,1.3402452e-7,9.355561e-8,2.8935892e-8,6.845367e-8,7.680616e-8,0.000002033469,0.0000011085679,2.3477592e-7,6.582037e-7,0.00012373104,4.2878804e-7,0.00000641996,0.99934965,0.000073720024,0.000031011357,0.0000056028093,0.0000021862509,1.9544045e-8,3.728894e-7,4.2903633e-7,1.825118e-7,7.159926e-8,9.231618e-9,6.4692415e-7,7.0316903e-9,4.4512316e-8,1.2455972e-7,9.4463275e-8,4.3477044e-8,4.65822e-7,6.797721e-8,2.1060276e-7,2.2605123e-8,1.431149e-7,7.9512986e-8,1.2341783e-7,0.0000010921714,0.000015243892,3.1173343e-7,2.4152058e-7,6.8637625e-8,8.467682e-8,9.424677e-8,1.021921e-8,3.3770753e-8,3.6928835e-8,1.3694032e-7,1.06742846e-7,2.5994837e-7,3.4866406e-7,3.1320535e-8,3.5748732e-7,6.648439e-8,3.1638956e-7,0.0000012095878,8.66409e-8,4.0144172e-8,1.2544761e-7,3.320118e-8,1.9731445e-7,3.8064056e-7,1.3827865e-7,2.300226e-8,7.144225e-8,2.8511145e-8,2.9825674e-8,8.936033e-8,6.2238837e-7,6.478839e-8,1.3663023e-7,9.9731814e-8,2.5761555e-8,4.13022e-8,3.9384464e-8,1.215808e-7,0.0000043028217,0.0000018179063,0.0000018520155,0.0000016246107,0.000016448314,0.000010544916,0.000003993062,2.6464798e-7,0.000019193476,4.803243e-7,1.696285e-7,0.0000045505058,0.000042359294,0.0000044433386,0.0000051040097,0.0000135063965,4.1758724e-7,4.4944915e-7,3.1566984e-7,0.00000105576,1.3364639e-8,1.389366e-8,6.7703795e-8,1.412969
7e-7,7.170519e-8,7.9344666e-8,2.6391543e-8,2.6134321e-8,7.196726e-9,2.1752363e-8,6.68464e-8,3.417796e-8,1.6228276e-7,4.1071146e-7,6.4721354e-7,2.9513794e-7,5.653474e-9,4.8301445e-8,8.887482e-9,3.730617e-8,1.7784265e-8,4.641905e-9,3.4131187e-8,1.9373938e-7,0.0000012980177,3.5641005e-8,2.1493324e-8,3.0552937e-7,1.5532517e-7,0.0000014520979,3.4884646e-8,0.000038254384,4.5088433e-7,4.176697e-7,6.7706225e-7,1.4142249e-7,0.000014235998,6.2938204e-7,0.000004762867,9.0249006e-7,9.058988e-7,0.0000015713684,1.5720647e-7,1.8185365e-7,7.1931886e-8,0.0000011952825,8.8748374e-7,2.0870831e-7,9.906239e-8,7.793622e-9,1.00584984e-7,4.205944e-7,1.8436246e-7,1.6437947e-7,7.025353e-8,2.5704486e-7,7.5868776e-8,7.841314e-7,2.4953093e-7,5.157682e-8,4.0674127e-8,7.5317965e-9,4.7974854e-8,1.7419973e-8,1.7958679e-7,1.2566392e-8,8.97544e-8,3.2696548e-8,1.120836e-7,3.9067462e-8,4.6769046e-8,1.8523554e-7,1.4833053e-7,1.227935e-7,0.0000010729105,3.653849e-9,1.6198403e-7,1.619072e-8,1.2004934e-7,1.48002774e-8,4.0229484e-8,2.150609e-7,1.1925697e-7,4.8982514e-8,7.60892e-8,2.313748e-8,8.52105e-8,9.5862134e-8,1.3351651e-7,3.021699e-8,4.4238764e-8,2.610667e-8,2.397709e-7,1.3192565e-7,1.6734932e-8,1.588337e-7,4.0643516e-7,8.7534545e-8,8.3669994e-7,3.4375987e-8,7.847893e-8,8.526395e-9,9.601383e-8,5.258924e-7,1.3557448e-7,1.0307227e-7,1.04298135e-8,5.1877144e-8,2.1870013e-8,1.179144e-8,2.9806564e-8,4.3383935e-8,2.9991046e-8,2.850761e-8,3.058665e-8,6.441099e-8,1.5364101e-8,1.5973884e-8,2.573685e-8,1.0903766e-7,3.2118738e-8,6.819743e-9,1.9251311e-7,5.825811e-8,1.8765762e-7,4.007079e-7,1.5791578e-8,1.9501584e-7,1.0142063e-8,2.744815e-8,1.2843532e-8,3.7297493e-8,7.407497e-8,4.2060783e-8,1.6924805e-8,1.4592035e-7,4.344977e-8,1.7191404e-7,3.5817443e-8,8.440249e-9,4.1948297e-8,2.5140324e-8,2.83402e-8,8.747196e-8,8.277126e-9,1.1676294e-8,1.4548515e-7,7.200282e-9,0.000002623601,5.675737e-7,0.0000019483527,6.752595e-8,8.168475e-8,1.09330465e-7,1.6709137e-7,3.1387277e-8,2.9735245e-8,5.752164e-8,5.8508775e-8,3.254462
2e-7,3.3302214e-8,4.1867867e-7,1.5085907e-7,2.334682e-7,2.8640278e-7,2.294032e-7,1.8537604e-7,3.1517982e-7,0.0000011075967,1.5369783e-7,1.923751e-7, 1 .6404466e-7,2.9008353e-7,1.2466549e-7,5.802622e-8,5.1862205e-8,6.0094205e-9,1.2333241e-7,1.3798474e-7,1.7370232e-7,5.617761e-7,5.160447e-8,4.8132776e-8,8.0326984e-8,0.0000020645264,5.6385977e-7,8.7942e-7,0.000003478598,2.9723893e-7,3.3904533e-7,9.469074e-8,3.7548457e-8,1.5679038e-7,8.203105e-8,6.8479626e-9,1.8251624e-8,6.050241e-8,3.9563428e-8,1.0699948e-7,3.2566635e-7,3.536943e-7,7.326295e-8,4.857656e-7,7.7177134e-7,3.456778e-8,3.2462046e-7,0.0000031608602,5.3309947e-8,3.6456873e-7,5.4815894e-7,4.6230696e-8,1.3466178e-7,4.3529482e-8,1.6404105e-7,2.4636954e-8,5.9587126e-8,9.493651e-8,5.5234626e-8,5.7412358e-8,0.00001185035,0.0000058263945,0.0000074208674,9.127966e-7,0.0000020019581,0.000001033499,3.514685e-8,0.0000020589953,3.565551e-7,6.873234e-8,2.1935298e-9,5.5603635e-8,3.3266997e-7,1.3073692e-7,2.718763e-8,1.0462929e-8,7.4666804e-7,6.923166e-8,1.6145664e-8,8.568521e-9,4.76221e-9,1.2339771e-7,8.340629e-9,3.2649041e-9,5.063249e-9,4.0704995e-9,1.2043539e-8,5.1056084e-9,7.267143e-9,1.1845163e-7,7.535579e-8,6.386964e-8,1.6212937e-8,2.6104294e-7,6.9794254e-7,6.647513e-8,7.717493e-7,6.651207e-7,3.3244953e-7,3.707282e-7,3.9956424e-7,6.411632e-8,7.107352e-8,1.6380017e-7,6.876801e-8,3.4624745e-7,2.0256503e-7,6.1961015e-7,2.6841073e-8,6.7203354e-7,0.0000011348341,0.0000018397932,6.3972516e-7,7.257533e-8,4.221391e-7,3.9657925e-7,1.403744e-7,3.2498568e-7,1.5857655e-7,1.1122217e-7,7.391421e-8,3.4232224e-7,5.3979615e-8,8.5172964e-8,0.00000406101,0.0000144787555,7.3170328e-9,6.948496e-9,4.4689173e-8,9.231412e-8,5.4119823e-8,2.2242811e-7,1.7609555e-8,2.090628e-8,3.6797683e-9,6.17792e-8,1.7920289e-7,2.627918e-8,2.69882e-8,1.6432807e-7,1.2827613e-7,4.4689088e-8,6.316553e-8,1.946176e-8,2.0871258e-8,2.241458e-8,2.4765244e-8,6.7853985e-9,2.4248795e-8,4.5549795e-9,2.8977038e-8,2.0402325e-8,1.6009503e-7,2.019971e-7,1.6111885e-8,5.964114e
-8,4.0983186e-9,3.9080128e-8,7.511338e-9,5.965624e-7,1.6478224e-7,1.4106989e-8,3.2855585e-8,3.3387166e-9,1.2200434e-8,4.624639e-8,6.8423094e-9,1.7426288e-8,4.661133e-8,9.331948e-8,1.2306079e-7,1.2359445e-8,1.1173199e-8,2.7724862e-8,2.4192101e-7,3.4511868e-7,2.593767e-8,9.964568e-8,9.79781e-9,1.9085564e-7,3.9727063e-8,2.6639205e-8,6.874149e-9,3.1469938e-8,2.4086594e-7,1.3126927e-7,2.125434e-7,2.0502034e-8,3.694976e-8,6.563176e-7,2.5600501e-8,2.6882981e-8,6.880636e-7,2.0092733e-7,2.7880397e-8,2.6284091e-8,5.1678345e-8,1.8935414e-7,4.6185284e-7,1.1086778e-8,1.4542604e-7,2.873701e-8,6.105168e-7,1.2016463e-8,1.3944705e-7,2.0937128e-8,4.380141e-8,1.9663208e-8,6.654449e-9,1.1149591e-8,6.424939e-8,6.971555e-9,3.2600196e-9,1.42601895e-8,2.7895078e-8,8.115783e-8,2.5995716e-8,2.2855579e-8,1.05596285e-7,8.1455426e-8,3.7793686e-8,4.8818915e-8,2.3420624e-8,1.0599355e-8,3.604105e-8,5.062431e-8,3.680444e-8,1.5015802e-9,0.0000014475033,0.0000010762104,1.3049913e-7,3.0736015e-8,1.7184021e-8,2.042109e-8,7.904992e-9,1.6902052e-7,1.2416507e-8,5.4758292e-8,2.6250422e-8,1.3261367e-8,6.2980746e-8,1.2709986e-8,2.0171682e-7,4.3866372e-8,6.9623496e-8,2.956512e-7,7.9251316e-7,2.086892e-7,1.7341794e-7,4.2942418e-8,4.213407e-9,8.824785e-8,1.734157e-8,7.321587e-8,1.7941774e-8,1.1245148e-7,4.2424054e-7,8.2595735e-9,1.1336403e-7,8.268799e-8,2.2186978e-8,1.9539721e-8,1.0675704e-8,3.2885175e-8,2.4340963e-8,6.639137e-8,5.6046874e-9,1.3866047e-8,6.675874e-8,1.1355886e-8,3.1321596e-7,3.124518e-8,1.5021818e-7,1.3461754e-8,1.8882956e-7,4.6457423e-8,4.645388e-8,7.714454e-9,3.5857155e-8,7.608321e-9,4.2215014e-8,4.340725e-9,1.3401575e-8,8.5656005e-8,1.7045414e-8,5.4221903e-8,3.0219127e-8,6.153376e-8,3.9388572e-9, 4 
.135628e-8,1.7819204e-8,4.3105885e-8,3.903355e-9,7.6630855e-8,1.1890406e-8,9.304218e-9,1.0968062e-9,1.0536768e-8,1.1516804e-7,8.134523e-7,5.952624e-8,2.8063502e-8,1.2833099e-8,1.060569e-7,7.87295e-7,2.7501393e-8,3.936289e-9,2.0519442e-8,7.394816e-9,3.5983973e-8,2.5378517e-8,4.6989722e-8,7.54953e-9,6.3228055e-7,5.5820064e-9,1.2964098e-7,1.5874988e-8,3.383781e-8,6.474512e-9,9.121148e-8,1.3918512e-8,8.2300255e-9,2.706129e-8,2.6095918e-8,5.7223635e-9,6.9634757e-7,4.6850918e-8,9.59058e-9,2.0992059e-7,3.0821607e-8,3.5631626e-8,7.326313e-7,0.0000021759731,2.4075183e-7,2.9745158e-7,2.5290184e-8,7.667951e-9,2.6632893e-7,3.435888e-8,2.3130198e-8,3.1239693e-8,2.8691622e-7,3.895845e-8,2.418413e-8,1.1582445e-8,5.154535e-8,2.0343455e-8,8.201963e-8,1.1641536e-8,5.496357e-7,1.1682151e-8,4.7576915e-8,1.6349825e-8,4.0908628e-8,2.127119e-7,1.6697287e-7,3.989708e-8,0.0000028524503,1.2500372e-7,2.4846614e-7,1.2454291e-8,2.9700272e-8,4.2509916e-9,1.6144348e-7,2.638602e-7,7.638056e-9,3.4455794e-9,7.2732895e-8,1.7631434e-8,7.586613e-9,2.1547013e-8,1.2675349e-7,2.563715e-8,3.5009762e-8,6.4722435e-8,8.387915e-9,3.0695123e-8,7.520388e-8,1.5724964e-7,1.9634005e-7,1.2290832e-7,1.1121187e-9,1.546896e-8,9.91701e-9,6.882473e-7,8.2676166e-8,4.4695312e-8,2.0752013e-8,8.6493785e-8,5.2027666e-8,4.5564942e-8,2.0319955e-8,8.705182e-9,6.452067e-8,2.1777439e-8,1.0309542e-8,3.2119043e-8,2.3336936e-7,8.054096e-9,1.9623354e-7,1.288809e-7,1.5392496e-8,1.401903e-9,5.6968183e-8,6.0800254e-9,1.0782793e-8,2.426073e-8,1.938866e-8,2.297031e-7,1.9971754e-8,2.8477993e-8,5.2273553e-8,2.7392807e-7,9.857291e-8,3.1291098e-8,4.1514422e-8,5.2511964e-9,0.0000015806811,8.5476034e-7,1.0689131e-8,0.0000010621831,7.737313e-8,6.394217e-8,1.1698346e-7,1.04866096e-7,2.1161e-7,1.5339682e-8,5.0944536e-8,1.400538e-8,2.6282036e-8,8.7784336e-8,7.7720665e-9,4.2288754e-8,3.324378e-7,7.729245e-8,7.636901e-10,5.989501e-8,1.3260906e-7,1.2853634e-7,8.844243e-9,1.0194375e-7,2.4937793e-7,1.6547972e-7,1.1762754e-8,1.14961956e-7,2.934271e-7,1.32
61241e-8,8.630263e-8,5.7394843e-8,1.1094081e-7,2.2933713e-7,3.470617e-7,1.4751107e-7,1.502495e-8,6.4543194e-8,5.1645337e-8,6.2374156e-8,1.2936015e-7,1.40520715e-8,5.386946e-8,2.0827555e-8,1.3040638e-8,1.05789816e-7,1.5079728e-8,8.926327e-7,4.637438e-8,7.481006e-7,5.8831473e-9,2.8707685e-9,8.3815985e-7,7.3419586e-9,1.4245998e-8,1.0926417e-7,1.1308178e-7,2.523399e-7,1.1782836e-7,4.6678057e-8,2.7959197e-9,3.436386e-8,1.4674497e-7,3.5396916e-8,2.0581128e-7,7.183879e-8,2.7693943e-8,4.5493387e-8,1.9559183e-8,1.5359708e-8,1.2336623e-8,2.9570606e-8,2.8775526e-7,9.0518455e-7,2.3732602e-7,1.6521676e-8,1.5478875e-8,3.5267863e-8,3.6164106e-8,1.6159095e-8,7.650073e-8,1.9661483e-8,4.917535e-8,1.1712613e-7,1.0889253e-8,0.0000014941202,1.0185857e-8,3.757597e-8,2.0970978e-8,3.368558e-8,4.845589e-9,6.0396246e-7,1.0373311e-8,2.8416503e-7,4.499063e-7,3.463186e-8,7.720684e-8,1.4711222e-7,1.16015755e-7,4.007488e-7,3.0256498e-8,6.7067845e-8,2.0128741e-8,1.5987744e-9,4.1919822e-8,1.31671545e-8,3.2318148e-8,9.24766e-8,1.3075301e-7,1.0574301e-7,3.7621653e-8,1.09422466e-7,7.001475e-8,2.7427062e-8,2.0766626e-8,4.5403404e-8,3.390403e-8,1.0469662e-7,2.8271579e-8,3.4062268e-7,5.146207e-7,6.7407086e-7,6.382248e-9,3.638787e-8,3.6260598e-8,1.6065603e-7,3.639056e-7,6.2326917e-9,4.80549e-8,3.3726337e-8,6.3288803e-7,6.4806315e-8,2.1165198e-7,8.3877914e-8,1.7589144e-8,2.7290277e-9,2.1447951e-8,7.8612715e-8,2.0118186e-8,2.8407685e-8,2.492253e-7,2.0156671e-8,2.655165e-8,2.7848243e-8,6.9071238e-9,1.8805437e-8,1.3006904e-8,3.6859183e-7,3.9679412e-7,2.7592133e-8,2.5228948e-8,1.5470029e-7,3.6893066e-8,1.4401772e-9,2.150493e-8,5.0681113e-8,5.0817114e-8,1.1718752e-8,5.4092784e-8,7.1382766e-7,2.5237213e-7,7.0720446e-8,7.199764e-8,1.2525473e-8,3.4803418e-7,1.9591828e-7,1.24047e-7,1.2346175e-7,1.9201337e-8,1.9895249e-7,3.7876678e-8,1.0629785e-8,1.2437128e-8,2.1861892e-7,2.6181456e-7,1.1129008e-7,1.07766304e-7,6.380325e-9,3.895085e-9,1.5762757e-7,2.909027e-9,1.0381051e-8,2.8135211e-8,1.07780025e-8,1.3605974e-8,2.92
36466e-8,1.9189795e-7,2.1995064e-7,1.3263998e-8,4.9004846e-8,2.9808371e-9,8.926046e-9,1.0996976e-8,7.7156015e-9,7.454491e-9,5.0861622e-8,1.5129764e-7,1.1960076e-8,1.1323334e-8,9.391332e-9,9.585702e-8,1.9055328e-8,1.8105304e-8,6.179228e-8,6.3894014e-8,1.185318e-8,9.3727754e-9,1.2332148e-7,1.6522023e-8,1.2461165e-7,4.196172e-9,3.9965933e-8,1.25545565e-8,1.4302138e-8,6.631794e-9,5.9642247e-9,5.5569362e-9,1.4192456e-7,1.761308e-8,3.3801896e-7,7.856519e-8,2.966783e-8,0.0000028992106,0.0000013787367,5.3136224e-9,2.5128529e-8,8.406627e-8,4.492839e-8,5.408793e-8,2.4239176e-8,4.0168052e-7,4.1083545e-8,5.415348e-8,8.640767e-9,5.7732567e-8,2.6443152e-7,8.953217e-7,2.7994002e-8,5.88948e-9,4.178882e-8,2.8880645e-8,2.1357529e-8,2.3024175e-7,8.7866255e-8,2.0697297e-9,2.2364105e-8,3.2032763e-9,1.1768747e-8,6.963571e-8,2.2719322e-8,7.3603825e-9,6.9225288e-9,3.2138715e-8,1.3705778e-7,1.981505e-8,1.0578956e-8,2.704942e-8,2.9755938e-9,2.17737e-8,1.0975539e-8,1.9918724e-8,2.3882098e-8,2.1357366e-8,6.1090986e-9,1.18904975e-8,1.1459892e-8,3.7317346e-8,1.5726203e-8,3.4040234e-8,3.692158e-8,9.281765e-8,1.2323201e-7,4.2347594e-8,1.7423728e-8,5.811339e-8,3.931436e-8,2.3690461e-8,1.7928501e-8,1.4406642e-8,7.0198305e-9,6.0415225e-8,4.86748e-8,1.0685319e-8,1.0051243e-8,4.242626e-8,2.6078153e-8,5.13667e-9,1.6972995e-9,1.9131587e-8,2.1117435e-7,1.3372697e-8,2.0002481e-8,1.04542565e-7,2.8144228e-8,2.1344792e-7,2.104611e-8,1.9114454e-7,3.9576936e-8,2.9316318e-8,1.1052031e-7,4.8400768e-8,5.583606e-8,1.2130111e-7,1.7762162e-8,2.5610854e-8,1.2038653e-7,4.6748596e-9,1.5916099e-8,3.1475942e-8,6.147686e-8,2.2046418e-8,3.2577634e-7,1.1989145e-7,2.381899e-7,1.4909986e-8,5.1016883e-8,5.5142202e-8,2.2885503e-8,5.71411e-8,5.185096e-7,4.9772858e-8,1.1049896e-8,1.2640993e-7,8.1748816e-8 ]} } ] } Response trailers received: ( empty ) Sent 1 request and received 1 response","title":"Paddle"},{"location":"modelserving/v1beta1/paddle/#deploy-paddle-model-with-inferenceservice","text":"In this example, we use a 
trained paddle resnet50 model to classify images by running an inference service with Paddle predictor.","title":"Deploy Paddle model with InferenceService"},{"location":"modelserving/v1beta1/paddle/#deploy-paddle-model-with-v1-protocol","text":"","title":"Deploy Paddle model with V1 protocol"},{"location":"modelserving/v1beta1/paddle/#create-the-inferenceservice","text":"New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"paddle-resnet50\" spec : predictor : model : modelFormat : name : paddle storageUri : \"gs://kfserving-examples/models/paddle/resnet\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"paddle-resnet50\" spec : predictor : paddle : storageUri : \"gs://kfserving-examples/models/paddle/resnet\" Apply the above yaml to create the InferenceService kubectl apply -f paddle.yaml Expected Output $ inferenceservice.serving.kserve.io/paddle-resnet50 created","title":"Create the InferenceService"},{"location":"modelserving/v1beta1/paddle/#run-a-prediction","text":"The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT . You can see an example payload jay.json as the sample input to test the model. MODEL_NAME = paddle-resnet50 SERVICE_HOSTNAME = $( kubectl get inferenceservice ${ MODEL_NAME } -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" \\ -H \"Content-Type: application/json\" \\ -d @./jay.json \\ http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ ${ MODEL_NAME } :predict Expected Output * Trying 127 .0.0.1:80... 
* TCP_NODELAY set * Connected to localhost ( 127 .0.0.1 ) port 80 ( #0) > POST /v1/models/paddle-resnet50:predict HTTP/1.1 > Host: paddle-resnet50.default.example.com > User-Agent: curl/7.68.0 > Accept: */* > Content-Length: 3010209 > Content-Type: application/x-www-form-urlencoded > Expect: 100 -continue > * Mark bundle as not supporting multiuse < HTTP/1.1 100 Continue * We are completely uploaded and fine * Mark bundle as not supporting multiuse < HTTP/1.1 200 OK < content-length: 23399 < content-type: application/json ; charset = UTF-8 < date: Mon, 17 May 2021 03 :34:58 GMT < server: istio-envoy < x-envoy-upstream-service-time: 511 < { \"predictions\" : [[ 6 .736678770380422e-09, 1 .1535990829258935e-08, 5 .142250714129659e-08, 6 .647170636142619e-08, 4 .094492567219277e-08, 1 .3402451770616608e-07, 9 .355561303436843e-08, 2 .8935891904779965e-08, 6 .845367295227334e-08, 7 .680615965455218e-08, 2 .0334689452283783e-06, 1 .1085678579547675e-06, 2 .3477592492326949e-07, 6 .582037030966603e-07, 0 .00012373103527352214, 4 .2878804151769145e-07, 6 .419959845516132e-06, 0 .9993496537208557, 7 .372002437477931e-05, 3 .101135735050775e-05, 5 .6028093240456656e-06, 2 .1862508674530545e-06, 1 .9544044604913324e-08, 3 .728893887000595e-07, 4 .2903633357127546e-07, 1 .8251179767503345e-07, 7 .159925985433802e-08, 9 .231618136595898e-09, 6 .469241498052725e-07, 7 .031690341108288e-09, 4 .451231561120039e-08, 1 .2455971898361895e-07, 9 .44632745358831e-08, 4 .347704418705689e-08, 4 .658220120745682e-07, 6 .797721141538204e-08, 2 .1060276367279585e-07, 2 .2605123106700376e-08, 1 .4311490303953178e-07, 7 .951298641728499e-08, 1 .2341783417468832e-07, 1 .0921713737843675e-06, 1 .5243892448779661e-05, 3 .1173343018053856e-07, 2 .4152058131221565e-07, 6 .863762536113427e-08, 8 .467682022228473e-08, 9 .4246772164297e-08, 1 .0219210366813058e-08, 3 .3770753304906975e-08, 3 .6928835100979995e-08, 1 .3694031508748594e-07, 1 .0674284567357972e-07, 2 .599483650556067e-07, 3 
.4866405940192635e-07, 3 .132053549848024e-08, 3 .574873232992104e-07, 6 .64843895492595e-08, 3 .1638955988455564e-07, 1 .2095878219042788e-06, 8 .66409024524728e-08, 4 .0144172430700564e-08, 1 .2544761318622477e-07, 3 .3201178695208e-08, 1 .9731444922399533e-07, 3 .806405572959193e-07, 1 .3827865075199952e-07, 2 .300225965257141e-08, 7 .14422512260171e-08, 2 .851114544455413e-08, 2 .982567437470607e-08, 8 .936032713791064e-08, 6 .22388370175031e-07, 6 .478838798784636e-08, 1 .3663023423760023e-07, 9 .973181391842445e-08, 2 .5761554667269593e-08, 4 .130220077058766e-08, 3 .9384463690339544e-08, 1 .2158079698565416e-07, 4 .302821707824478e-06, 1 .8179063090428826e-06, 1 .8520155435908237e-06, 1 .6246107179540559e-06, 1 .6448313544970006e-05, 1 .0544916221988387e-05, 3 .993061909568496e-06, 2 .646479799750523e-07, 1 .9193475964129902e-05, 4 .803242745765601e-07, 1 .696285067964709e-07, 4 .550505764200352e-06, 4 .235929372953251e-05, 4 .443338639248395e-06, 5 .104009687784128e-06, 1 .3506396498996764e-05, 4 .1758724478313525e-07, 4 .494491463447048e-07, 3 .156698369366495e-07, 1 .0557599807725637e-06, 1 .336463917311903e-08, 1 .3893659556174498e-08, 6 .770379457066156e-08, 1 .4129696523923485e-07, 7 .170518756538513e-08, 7 .934466594861078e-08, 2 .639154317307657e-08, 2 .6134321373660896e-08, 7 .196725881897237e-09, 2 .1752363466021052e-08, 6 .684639686227456e-08, 3 .417795824134373e-08, 1 .6228275967478112e-07, 4 .107114648377319e-07, 6 .472135396506928e-07, 2 .951379372007068e-07, 5 .653474133282543e-09, 4 .830144462175667e-08, 8 .887481861563629e-09, 3 .7306168820805397e-08, 1 .7784264727538357e-08, 4 .641905082536368e-09, 3 .413118676576232e-08, 1 .937393818707278e-07, 1 .2980176506971475e-06, 3 .5641004814124244e-08, 2 .149332445355867e-08, 3 .055293689158134e-07, 1 .5532516783878236e-07, 1 .4520978766086046e-06, 3 .488464628276233e-08, 3 .825438398052938e-05, 4 .5088432898410247e-07, 4 .1766969616219285e-07, 6 .770622462681786e-07, 1 .4142248971893423e-07, 1 
.4235997696232516e-05, 6 .293820433711517e-07, 4 .762866865348769e-06, 9 .024900577969674e-07, 9 .058987870957935e-07, 1 .5713684433649178e-06, 1 .5720647184025438e-07, 1 .818536503606083e-07, 7 .193188622522939e-08, 1 .1952824934269302e-06, 8 .874837362782273e-07, 2 .0870831463071227e-07, 9 .906239029078279e-08, 7 .793621747964607e-09, 1 .0058498389753368e-07, 4 .2059440374941914e-07, 1 .843624630737395e-07, 1 .6437947181202617e-07, 7 .025352743994517e-08, 2 .570448600636155e-07, 7 .586877615040066e-08, 7 .841313731660193e-07, 2 .495309274763713e-07, 5 .157681925993529e-08, 4 .0674127177453556e-08, 7 .531796519799627e-09, 4 .797485431140558e-08, 1 .7419973019627832e-08, 1 .7958679165985814e-07, 1 .2566392371127222e-08, 8 .975440124459055e-08, 3 .26965476915575e-08, 1 .1208359751435637e-07, 3 .906746215420753e-08, 4 .6769045525252295e-08, 1 .8523553535487736e-07, 1 .4833052830454108e-07, 1 .2279349448363064e-07, 1 .0729105497375713e-06, 3 .6538490011395197e-09, 1 .6198403329781286e-07, 1 .6190719875908144e-08, 1 .2004933580556099e-07, 1 .4800277448046018e-08, 4 .02294837442696e-08, 2 .15060893538066e-07, 1 .1925696696835075e-07, 4 .8982514044837444e-08, 7 .608920071788816e-08, 2 .3137479487900237e-08, 8 .521050176568679e-08, 9 .586213423062873e-08, 1 .3351650807180704e-07, 3 .021699157557123e-08, 4 .423876376336011e-08, 2 .610667060309879e-08, 2 .3977091245797055e-07, 1 .3192564551900432e-07, 1 .6734931662654162e-08, 1 .588336999702733e-07, 4 .0643516285854275e-07, 8 .753454494581092e-08, 8 .366999395548191e-07, 3 .437598650180007e-08, 7 .847892646850596e-08, 8 .526394701391382e-09, 9 .601382799928615e-08, 5 .258924034023948e-07, 1 .3557448141909845e-07, 1 .0307226716577134e-07, 1 .0429813457335513e-08, 5 .187714435805901e-08, 2 .187001335585137e-08, 1 .1791439824548888e-08, 2 .98065643278278e-08, 4 .338393466696289e-08, 2 .9991046091026874e-08, 2 .8507610494443725e-08, 3 .058665143385042e-08, 6 .441099031917474e-08, 1 .5364101102477434e-08, 1 
.5973883549236234e-08, 2 .5736850872704053e-08, 1 .0903765712555469e-07, 3 .2118737891551064e-08, 6 .819742992547617e-09, 1 .9251311300649832e-07, 5 .8258109447706374e-08, 1 .8765761922168167e-07, 4 .0070790419122204e-07, 1 .5791577823165426e-08, 1 .950158434738114e-07, 1 .0142063189277906e-08, 2 .744815041921811e-08, 1 .2843531571604672e-08, 3 .7297493094001766e-08, 7 .407496838141014e-08, 4 .20607833007125e-08, 1 .6924804668860816e-08, 1 .459203531339881e-07, 4 .344977000414474e-08, 1 .7191403856031684e-07, 3 .5817443233554513e-08, 8 .440249388286247e-09, 4 .194829728021432e-08, 2 .514032360068086e-08, 2 .8340199520471288e-08, 8 .747196034164517e-08, 8 .277125651545703e-09, 1 .1676293709683705e-08, 1 .4548514570833504e-07, 7 .200282148289716e-09, 2 .623600948936655e-06, 5 .675736929333652e-07, 1 .9483527466945816e-06, 6 .752595282932816e-08, 8 .168475318370838e-08, 1 .0933046468153407e-07, 1 .670913718498923e-07, 3 .1387276777650186e-08, 2 .973524537708272e-08, 5 .752163900751839e-08, 5 .850877471402782e-08, 3 .2544622285968217e-07, 3 .330221431951941e-08, 4 .186786668469722e-07, 1 .5085906568401697e-07, 2 .3346819943981245e-07, 2 .86402780602657e-07, 2 .2940319865938363e-07, 1 .8537603807544656e-07, 3 .151798182443599e-07, 1 .1075967449869495e-06, 1 .5369782602192572e-07, 1 .9237509718550427e-07, 1 .64044664074936e-07, 2 .900835340824415e-07, 1 .246654903752642e-07, 5 .802622027317739e-08, 5 .186220519703966e-08, 6 .0094205167615655e-09, 1 .2333241272699524e-07, 1 .3798474185477971e-07, 1 .7370231830682314e-07, 5 .617761189569137e-07, 5 .1604470030497396e-08, 4 .813277598714194e-08, 8 .032698417537176e-08, 2 .0645263703045202e-06, 5 .638597713186755e-07, 8 .794199857220519e-07, 3 .4785980460583232e-06, 2 .972389268052211e-07, 3 .3904532870110415e-07, 9 .469074058188198e-08, 3 .754845678827223e-08, 1 .5679037801419327e-07, 8 .203105039683578e-08, 6 .847962641387539e-09, 1 .8251624211984563e-08, 6 .050240841659615e-08, 3 .956342808919544e-08, 1 
.0699947949888156e-07, 3 .2566634899922065e-07, 3 .5369430406717584e-07, 7 .326295303755614e-08, 4 .85765610847011e-07, 7 .717713401689252e-07, 3 .4567779749750116e-08, 3 .246204585138912e-07, 3 .1608601602783892e-06, 5 .33099466792919e-08, 3 .645687343123427e-07, 5 .48158936908294e-07, 4 .62306957160763e-08, 1 .3466177506415988e-07, 4 .3529482240955986e-08, 1 .6404105451783835e-07, 2 .463695381038633e-08, 5 .958712634424046e-08, 9 .493651020875404e-08, 5 .523462576206839e-08, 5 .7412357534758485e-08, 1 .1850350347231142e-05, 5 .8263944993086625e-06, 7 .4208674050169066e-06, 9 .127966222877149e-07, 2 .0019581370434025e-06, 1 .033498961078294e-06, 3 .5146850763112525e-08, 2 .058995278275688e-06, 3 .5655509122989315e-07, 6 .873234070781109e-08, 2 .1935298022413008e-09, 5 .560363547374436e-08, 3 .3266996979364194e-07, 1 .307369217329324e-07, 2 .718762992515167e-08, 1 .0462929189714032e-08, 7 .466680358447775e-07, 6 .923166040451179e-08, 1 .6145664361033596e-08, 8 .568521003837759e-09, 4 .76221018175238e-09, 1 .233977116044116e-07, 8 .340628632197422e-09, 3 .2649041248333788e-09, 5 .0632489312363305e-09, 4 .0704994930251814e-09, 1 .2043538610839732e-08, 5 .105608380517879e-09, 7 .267142887457112e-09, 1 .184516307262129e-07, 7 .53557927168913e-08, 6 .386964201965384e-08, 1 .6212936770898523e-08, 2 .610429419291904e-07, 6 .979425393183192e-07, 6 .647513117741255e-08, 7 .717492849224072e-07, 6 .651206945207377e-07, 3 .324495310152997e-07, 3 .707282019149716e-07, 3 .99564243025452e-07, 6 .411632114122767e-08, 7 .107352217872176e-08, 1 .6380016631956096e-07, 6 .876800995314625e-08, 3 .462474467141874e-07, 2 .0256503319160402e-07, 6 .19610148078209e-07, 2 .6841073363925716e-08, 6 .720335363752383e-07, 1 .1348340649419697e-06, 1 .8397931853542104e-06, 6 .397251581802266e-07, 7 .257533241045167e-08, 4 .2213909523525217e-07, 3 .9657925299252383e-07, 1 .4037439655112394e-07, 3 .249856774800719e-07, 1 .5857655455420172e-07, 1 .1122217102865761e-07, 7 .391420808744442e-08, 3 
.42322238111592e-07, 5 .39796154441774e-08, 8 .517296379295658e-08, 4 .061009803990601e-06, 1 .4478755474556237e-05, 7 .317032757470088e-09, 6 .9484960008026064e-09, 4 .468917325084476e-08, 9 .23141172393116e-08, 5 .411982328951126e-08, 2 .2242811326123046e-07, 1 .7609554703312824e-08, 2 .0906279374344194e-08, 3 .6797682678724186e-09, 6 .177919686933819e-08, 1 .7920288541972695e-07, 2 .6279179721200308e-08, 2 .6988200119149042e-08, 1 .6432807115052128e-07, 1 .2827612749788386e-07, 4 .468908798571647e-08, 6 .316552969565237e-08, 1 .9461760203398626e-08, 2 .087125849925542e-08, 2 .2414580413965268e-08, 2 .4765244077684656e-08, 6 .785398465325443e-09, 2 .4248794971981624e-08, 4 .554979504689527e-09, 2 .8977037658250993e-08, 2 .0402325162649504e-08, 1 .600950270130852e-07, 2 .0199709638291097e-07, 1 .611188515937556e-08, 5 .964113825029926e-08, 4 .098318573397819e-09, 3 .9080127578472457e-08, 7 .511338218080255e-09, 5 .965624154669058e-07, 1 .6478223585636442e-07, 1 .4106989354445432e-08, 3 .2855584919389e-08, 3 .3387166364917675e-09, 1 .220043444050134e-08, 4 .624639160510924e-08, 6 .842309385746148e-09, 1 .74262879681919e-08, 4 .6611329906909305e-08, 9 .331947836699328e-08, 1 .2306078644996887e-07, 1 .2359445022980253e-08, 1 .1173199254699284e-08, 2 .7724862405875683e-08, 2 .419210147763806e-07, 3 .451186785241589e-07, 2 .593766978975509e-08, 9 .964568192799561e-08, 9 .797809674694236e-09, 1 .9085564417764544e-07, 3 .972706252852731e-08, 2 .6639204619982593e-08, 6 .874148805735558e-09, 3 .146993776681484e-08, 2 .4086594407890516e-07, 1 .3126927456141857e-07, 2 .1254339799270383e-07, 2 .050203384840188e-08, 3 .694976058454813e-08, 6 .563175816154398e-07, 2 .560050127442537e-08, 2 .6882981174480847e-08, 6 .880636078676616e-07, 2 .0092733166166e-07, 2 .788039665801989e-08, 2 .628409134786125e-08, 5 .1678345158734373e-08, 1 .8935413947929192e-07, 4 .61852835087484e-07, 1 .1086777718105623e-08, 1 .4542604276357451e-07, 2 .8737009216683873e-08, 6 .105167926762078e-07, 1 
.2016463379893594e-08, 1 .3944705301582871e-07, 2 .093712758721722e-08, 4 .3801410498645055e-08, 1 .966320795077081e-08, 6 .654448991838535e-09, 1 .1149590584125235e-08, 6 .424939158478082e-08, 6 .971554888934861e-09, 3 .260019587614238e-09, 1 .4260189473702667e-08, 2 .7895078247297533e-08, 8 .11578289017234e-08, 2 .5995715802196173e-08, 2 .2855578762914774e-08, 1 .055962854934478e-07, 8 .145542551574181e-08, 3 .7793686402665116e-08, 4 .881891513264236e-08, 2 .342062366267328e-08, 1 .059935517133681e-08, 3 .604105103249822e-08, 5 .062430830093945e-08, 3 .6804440384230475e-08, 1 .501580193519203e-09, 1 .4475033367489232e-06, 1 .076210423889279e-06, 1 .304991315009829e-07, 3 .073601462233455e-08, 1 .7184021317007137e-08, 2 .0421090596300928e-08, 7 .904992216367646e-09, 1 .6902052379919041e-07, 1 .2416506933732308e-08, 5 .4758292122869534e-08, 2 .6250422280327257e-08, 1 .3261367115546818e-08, 6 .29807459517906e-08, 1 .270998595259698e-08, 2 .0171681569536304e-07, 4 .386637186826192e-08, 6 .962349630157405e-08, 2 .9565120485131047e-07, 7 .925131626507209e-07, 2 .0868920103112032e-07, 1 .7341794489311724e-07, 4 .2942417621816276e-08, 4 .213406956665722e-09, 8 .824785169281313e-08, 1 .7341569957807224e-08, 7 .321587247588468e-08, 1 .7941774288487977e-08, 1 .1245148101579616e-07, 4 .242405395871174e-07, 8 .259573469615589e-09, 1 .1336403105133286e-07, 8 .268798978861014e-08, 2 .2186977588489754e-08, 1 .9539720952366224e-08, 1 .0675703876472653e-08, 3 .288517547161973e-08, 2 .4340963022950746e-08, 6 .639137239972115e-08, 5 .604687380866835e-09, 1 .386604697728444e-08, 6 .675873720496384e-08, 1 .1355886009312144e-08, 3 .132159633878473e-07, 3 .12451788886392e-08, 1 .502181845580708e-07, 1 .3461754377885882e-08, 1 .8882955998833495e-07, 4 .645742279762999e-08, 4 .6453880742092224e-08, 7 .714453964524637e-09, 3 .5857155467056145e-08, 7 .60832108426257e-09, 4 .221501370693659e-08, 4 .3407251126836854e-09, 1 .340157496088068e-08, 8 .565600495558101e-08, 1 .7045413969185574e-08, 
5 .4221903411644234e-08, 3 .021912675649219e-08, 6 .153376119755194e-08, 3 .938857240370908e-09, 4 .135628017820636e-08, 1 .781920389021252e-08, 4 .3105885083605244e-08, 3 .903354972578654e-09, 7 .663085455078544e-08, 1 .1890405993142394e-08, 9 .304217840622186e-09, 1 .0968062014171664e-09, 1 .0536767902635802e-08, 1 .1516804221400889e-07, 8 .134522886393825e-07, 5 .952623993721318e-08, 2 .806350174466843e-08, 1 .2833099027886874e-08, 1 .0605690192733164e-07, 7 .872949936427176e-07, 2 .7501393162765453e-08, 3 .936289072470345e-09, 2 .0519442145428002e-08, 7 .394815870753746e-09, 3 .598397313453461e-08, 2 .5378517065632877e-08, 4 .698972233541099e-08, 7 .54952989012736e-09, 6 .322805461422831e-07, 5 .582006412652163e-09, 1 .29640980617296e-07, 1 .5874988434916304e-08, 3 .3837810775594335e-08, 6 .474512037613067e-09, 9 .121148281110436e-08, 1 .3918511676536127e-08, 8 .230025549949005e-09, 2 .7061290097663004e-08, 2 .6095918315149902e-08, 5 .722363471960534e-09, 6 .963475698285038e-07, 4 .685091781198025e-08, 9 .590579885809802e-09, 2 .099205858030473e-07, 3 .082160660028421e-08, 3 .563162565001221e-08, 7 .326312925215461e-07, 2 .1759731225756695e-06, 2 .407518309155421e-07, 2 .974515780351794e-07, 2 .529018416908002e-08, 7 .667950718825978e-09, 2 .663289251358947e-07, 3 .4358880185436647e-08, 2 .3130198201215535e-08, 3 .1239693498719134e-08, 2 .8691621878351725e-07, 3 .895845068768722e-08, 2 .4184130253956937e-08, 1 .1582445225144511e-08, 5 .1545349322168477e-08, 2 .034345492063494e-08, 8 .201963197507212e-08, 1 .164153573540716e-08, 5 .496356720868789e-07, 1 .1682151246361627e-08, 4 .7576914852243135e-08, 1 .6349824605299546e-08, 4 .090862759653646e-08, 2 .1271189609706198e-07, 1 .6697286753242224e-07, 3 .989708119433999e-08, 2 .852450279533514e-06, 1 .2500372292834072e-07, 2 .4846613655427063e-07, 1 .245429093188477e-08, 2 .9700272463628608e-08, 4 .250991558762962e-09, 1 .61443480806156e-07, 2 .6386018703306036e-07, 7 .638056409575711e-09, 3 .4455793773702226e-09, 
7 .273289526210647e-08, 1 .7631434090503717e-08, 7 .58661311550668e-09, 2 .1547013062672704e-08, 1 .2675349125856883e-07, 2 .5637149292379036e-08, 3 .500976220038865e-08, 6 .472243541111311e-08, 8 .387915251262257e-09, 3 .069512288789156e-08, 7 .520387867998579e-08, 1 .5724964441687916e-07, 1 .9634005354873807e-07, 1 .2290831818972947e-07, 1 .112118730439704e-09, 1 .546895944670723e-08, 9 .91701032404535e-09, 6 .882473257974198e-07, 8 .267616635748709e-08, 4 .469531234008173e-08, 2 .075201344098332e-08, 8 .649378457903367e-08, 5 .202766573120243e-08, 4 .5564942041664835e-08, 2 .0319955496006514e-08, 8 .705182352741758e-09, 6 .452066969586667e-08, 2 .1777438519166026e-08, 1 .030954166481024e-08, 3 .211904342492744e-08, 2 .3336936294526822e-07, 8 .054096056753224e-09, 1 .9623354319264763e-07, 1 .2888089884199871e-07, 1 .5392496166555247e-08, 1 .401903038100727e-09, 5 .696818305978013e-08, 6 .080025372057207e-09, 1 .0782793324892737e-08, 2 .4260730313585555e-08, 1 .9388659566743627e-08, 2 .2970310453729326e-07, 1 .9971754028347277e-08, 2 .8477993296860404e-08, 5 .2273552597625894e-08, 2 .7392806600801123e-07, 9 .857291161097237e-08, 3 .12910977129377e-08, 4 .151442212219081e-08, 5 .251196366629074e-09, 1 .580681100676884e-06, 8 .547603442821128e-07, 1 .068913135782168e-08, 1 .0621830597301596e-06, 7 .737313012512459e-08, 6 .394216711669287e-08, 1 .1698345758759388e-07, 1 .0486609625104393e-07, 2 .1161000063329993e-07, 1 .53396815250062e-08, 5 .094453570109181e-08, 1 .4005379966874898e-08, 2 .6282036102998063e-08, 8 .778433624456738e-08, 7 .772066545896905e-09, 4 .228875383205377e-08, 3 .3243779284930497e-07, 7 .729244799747903e-08, 7 .636901111496286e-10, 5 .989500806435899e-08, 1 .326090597331131e-07, 1 .2853634245857393e-07, 8 .844242671557367e-09, 1 .0194374766570036e-07, 2 .493779334145074e-07, 1 .6547971881664125e-07, 1 .1762754326127833e-08, 1 .1496195639892903e-07, 2 .9342709240154363e-07, 1 .326124099421122e-08, 8 .630262726683213e-08, 5 .7394842656322e-08, 1 
.1094081031615133e-07, 2 .2933713239581266e-07, 3 .4706170026765903e-07, 1 .4751107357824367e-07, 1 .502495017291494e-08, 6 .454319390059027e-08, 5 .164533689594464e-08, 6 .23741556182722e-08, 1 .293601457064142e-07, 1 .4052071506398534e-08, 5 .386946000385251e-08, 2 .0827554791935654e-08, 1 .3040637902861363e-08, 1 .0578981601838677e-07, 1 .5079727688771527e-08, 8 .92632726845477e-07, 4 .6374381668101705e-08, 7 .481006036869076e-07, 5 .883147302654379e-09, 2 .8707685117979054e-09, 8 .381598490814213e-07, 7 .341958596640552e-09, 1 .4245998158912698e-08, 1 .0926417104428765e-07, 1 .1308178216040687e-07, 2 .52339901862797e-07, 1 .1782835684925885e-07, 4 .6678056975224536e-08, 2 .7959197179683315e-09, 3 .4363861090014325e-08, 1 .4674496640054713e-07, 3 .5396915620822256e-08, 2 .0581127557761647e-07, 7 .18387909159901e-08, 2 .7693943138729082e-08, 4 .5493386835460115e-08, 1 .9559182717898693e-08, 1 .5359708172013598e-08, 1 .2336623278486059e-08, 2 .9570605519779747e-08, 2 .877552560676122e-07, 9 .051845495378075e-07, 2 .3732602016934834e-07, 1 .6521676471370483e-08, 1 .5478875070584763e-08, 3 .526786329643983e-08, 3 .616410637619083e-08, 1 .61590953950963e-08, 7 .65007328595857e-08, 1 .9661483108279754e-08, 4 .917534823789538e-08, 1 .1712612746350715e-07, 1 .0889253054813253e-08, 1 .494120169809321e-06, 1 .018585660261806e-08, 3 .7575969003000864e-08, 2 .097097784314883e-08, 3 .368558054717141e-08, 4 .845588819080149e-09, 6 .039624622644624e-07, 1 .037331109898787e-08, 2 .841650257323636e-07, 4 .4990630954089283e-07, 3 .463186004637464e-08, 7 .720684180867465e-08, 1 .471122175189521e-07, 1 .1601575522490748e-07, 4 .007488030310924e-07, 3 .025649775167949e-08, 6 .706784461130155e-08, 2 .0128741340386114e-08, 1 .5987744461654074e-09, 4 .1919822280078733e-08, 1 .3167154477855547e-08, 3 .231814815762846e-08, 9 .247659704669786e-08, 1 .3075300842047e-07, 1 .0574301256838226e-07, 3 .762165334819656e-08, 1 .0942246575496029e-07, 7 .001474955359299e-08, 2 .742706151082075e-08, 
2 .0766625752344225e-08, 4 .5403403703403455e-08, 3 .39040298058535e-08, 1 .0469661759771043e-07, 2 .8271578855765256e-08, 3 .406226767310727e-07, 5 .146206945028098e-07, 6 .740708613506285e-07, 6 .382248063374618e-09, 3 .63878704945364e-08, 3 .626059807970705e-08, 1 .6065602892467723e-07, 3 .639055989879125e-07, 6 .232691696084203e-09, 4 .805490050330263e-08, 3 .372633727849461e-08, 6 .328880317596486e-07, 6 .480631498106959e-08, 2 .1165197949812864e-07, 8 .38779143919055e-08, 1 .7589144363228115e-08, 2 .729027670511641e-09, 2 .144795097080987e-08, 7 .861271456022223e-08, 2 .0118186228046397e-08, 2 .8407685093156942e-08, 2 .4922530883486615e-07, 2 .0156670998972004e-08, 2 .6551649767725394e-08, 2 .7848242822869906e-08, 6 .907123761834555e-09, 1 .880543720744754e-08, 1 .3006903998302732e-08, 3 .685918272822164e-07, 3 .967941211158177e-07, 2 .7592133022835696e-08, 2 .5228947819755376e-08, 1 .547002881352455e-07, 3 .689306637966183e-08, 1 .440177199718562e-09, 2 .1504929392790473e-08, 5 .068111263994979e-08, 5 .081711407228795e-08, 1 .171875219085905e-08, 5 .409278358570191e-08, 7 .138276600926474e-07, 2 .5237213208129106e-07, 7 .072044638789521e-08, 7 .199763984999663e-08, 1 .2525473103153217e-08, 3 .4803417747752974e-07, 1 .9591827538079087e-07, 1 .2404700555634918e-07, 1 .234617457157583e-07, 1 .9201337408958352e-08, 1 .9895249181445251e-07, 3 .7876677794201896e-08, 1 .0629785052174157e-08, 1 .2437127772102485e-08, 2 .1861892207653e-07, 2 .6181456291851646e-07, 1 .112900775979142e-07, 1 .0776630432474121e-07, 6 .380325157095967e-09, 3 .895085143312826e-09, 1 .5762756788717525e-07, 2 .909027019271093e-09, 1 .0381050685737137e-08, 2 .8135211493918177e-08, 1 .0778002490496874e-08, 1 .3605974125141529e-08, 2 .9236465692861202e-08, 1 .9189795352758665e-07, 2 .199506354827463e-07, 1 .326399790002597e-08, 4 .9004846403022384e-08, 2 .980837132682268e-09, 8 .926045680368588e-09, 1 .0996975774446582e-08, 7 .71560149104289e-09, 7 .454491246505768e-09, 5 .086162246925596e-08, 
1 .5129764108223753e-07, 1 .1960075596562092e-08, 1 .1323334270230134e-08, 9 .391332156383214e-09, 9 .585701832293125e-08, 1 .905532798218701e-08, 1 .8105303922766325e-08, 6 .179227796110354e-08, 6 .389401363549041e-08, 1 .1853179771037503e-08, 9 .37277544466042e-09, 1 .2332148457971925e-07, 1 .6522022860954166e-08, 1 .246116454467483e-07, 4 .196171854431441e-09, 3 .996593278543514e-08, 1 .2554556505506298e-08, 1 .4302138140465104e-08, 6 .631793780798034e-09, 5 .964224669696705e-09, 5 .556936244488497e-09, 1 .4192455921602232e-07, 1 .7613080771639034e-08, 3 .380189639301534e-07, 7 .85651934620546e-08, 2 .966783085867064e-08, 2 .8992105853831163e-06, 1 .3787366697215475e-06, 5 .313622430946907e-09, 2 .512852859126724e-08, 8 .406627216572815e-08, 4 .492839167369311e-08, 5 .408793057881667e-08, 2 .4239175999696272e-08, 4 .016805235096399e-07, 4 .1083545454512205e-08, 5 .4153481698904216e-08, 8 .640767212853007e-09, 5 .773256717134245e-08, 2 .6443152023603034e-07, 8 .953217047746875e-07, 2 .7994001783326894e-08, 5 .889480014786841e-09, 4 .1788819515886644e-08, 2 .8880645430717777e-08, 2 .135752907861388e-08, 2 .3024175277441827e-07, 8 .786625471657317e-08, 2 .0697297209437693e-09, 2 .236410523437371e-08, 3 .203276310870251e-09, 1 .176874686592555e-08, 6 .963571053120177e-08, 2 .271932153519174e-08, 7 .360382525689602e-09, 6 .922528772435044e-09, 3 .213871480056696e-08, 1 .370577820125618e-07, 1 .9815049157045905e-08, 1 .0578956377571558e-08, 2 .7049420481262132e-08, 2 .9755937713815683e-09, 2 .1773699288019088e-08, 1 .09755387001087e-08, 1 .991872444762066e-08, 2 .3882098076910552e-08, 2 .1357365653784655e-08, 6 .109098560358461e-09, 1 .1890497475519624e-08, 1 .1459891702259029e-08, 3 .73173456580389e-08, 1 .572620256240498e-08, 3 .404023374287135e-08, 3 .6921580459647885e-08, 9 .281765045443535e-08, 1 .2323201303843234e-07, 4 .2347593876002065e-08, 1 .7423728237986325e-08, 5 .8113389656000436e-08, 3 .931436154402945e-08, 2 .3690461148362374e-08, 1 
.792850135018398e-08, 1 .440664210150544e-08, 7 .019830494670032e-09, 6 .041522482291839e-08, 4 .867479930226182e-08, 1 .0685319296044327e-08, 1 .0051243393149889e-08, 4 .2426261614991745e-08, 2 .607815297039906e-08, 5 .136670200300 841e-09, 1 .69729952315123e-09, 1 .9131586981302462e-08, 2 .111743526711507e-07, 1 .337269672774255e-08, 2 .0002481448955223e-08, 1 .0454256482717028e-07, 2 .8144228281234973e-08, 2 .1344791889532644e-07, 2 .1046110632028103e-08, 1 .9114453664315079e-07, 3 .957693550660224e-08, 2 .931631826186276e-08, 1 .105203111251285e-07, 4 .84007678380749e-08, 5 .583606110803885e-08, 1 .2130111315400427e-07, 1 .77621615193857e-08, 2 .5610853882085394e-08, 1 .203865309662433e-07, 4 .674859610531712e-09, 1 .5916098661250544e-08, 3 .147594185293201e-08, 6 .147686093527227e-08, 2 .204641802450169e-08, 3 .257763410147163e-07, 1 .198914532096751e-07, 2 .3818989802748547e-07, 1 .4909986134625797e-08, 5 .10168831624469e-08, 5 .5142201915714395e-08, 2 .288550327023131e-08, 5 .714110073995471e-08, 5 .185095801607531e-07, 4 .977285783525076e-08, 1 .1049896109227575e-08, 1 .264099296349741e-07, 8 .174881571676451e-08 ]]} * Connection #0 to host localhost left intact","title":"Run a Prediction"},{"location":"modelserving/v1beta1/paddle/#deploy-the-model-with-open-inference-protocol","text":"","title":"Deploy the model with Open Inference Protocol"},{"location":"modelserving/v1beta1/paddle/#test-the-model-locally","text":"Once you've got your model serialised model.pdmodel , we can then use KServe Paddle Server to spin up a local server. 
Note This step is optional and just meant for testing, feel free to jump straight to deploying with InferenceService .","title":"Test the Model locally"},{"location":"modelserving/v1beta1/paddle/#using-kserve-paddleserver","text":"","title":"Using KServe PaddleServer"},{"location":"modelserving/v1beta1/paddle/#pre-requisites","text":"Firstly, to use KServe Paddle server locally, you will first need to install the paddleserver runtime package in your local environment. Clone the KServe repository and navigate into the directory. git clone https://github.com/kserve/kserve Install paddleserver runtime. Kserve uses Poetry as the dependency management tool. Make sure you have already installed poetry . cd python/paddleserver poetry install","title":"Pre-requisites"},{"location":"modelserving/v1beta1/paddle/#serving-model-locally","text":"The paddleserver package takes two arguments. --model_dir : The model directory path where the model is stored. --model_name : The name of the model deployed in the model server, the default value is model . This is optional. With the paddleserver runtime package installed locally, you should now be ready to start our server as: python3 paddleserver --model_dir /path/to/model_dir --model_name paddle-v2-resnet50","title":"Serving model locally"},{"location":"modelserving/v1beta1/paddle/#deploy-the-model-with-rest-endpoint-through-inferenceservice","text":"Lastly, you will use KServe to deploy the trained model onto Kubernetes. For this, you will just need to use version v1beta1 of the InferenceService CRD and set the protocolVersion field to v2 . 
Yaml apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"paddle-v2-resnet50\" spec : predictor : model : modelFormat : name : paddle protocolVersion : v2 runtime : kserve-paddleserver storageUri : \"gs://kfserving-examples/models/paddle/resnet\" Apply the InferenceService yaml to get the REST endpoint kubectl kubectl apply -f paddle-v2-resnet.yaml","title":"Deploy the Model with REST endpoint through InferenceService"},{"location":"modelserving/v1beta1/paddle/#test-the-deployed-model","text":"You can now test your deployed model by sending a sample request. Note that this request needs to follow the Open Inference Protocol . You can use the example payload jay-v2.json as the sample input to test the model. Determine the ingress IP and port and set INGRESS_HOST and INGRESS_PORT . Now, you can use curl to send the inference request as: SERVICE_HOSTNAME = $( kubectl get inferenceservice paddle-v2-resnet50 -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v \\ -H \"Host: ${ SERVICE_HOSTNAME } \" \\ -H \"Content-Type: application/json\" \\ -d @./jay-v2.json \\ http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v2/models/paddle-v2-resnet50/infer Expected Output { \"model_name\" : \"paddle-v2-resnet50\" , \"model_version\" : null , \"id\" : \"afa5ec0b-a5c7-454b-a464-53ba931b22df\" , \"parameters\" : null , \"outputs\" :[{ \"name\" : \"output-0\" , \"shape\" :[ 1 , 1000 ], \"datatype\" : \"FP32\" , \"parameters\" : null , \"data\" :[ 6.736678770380422e-9 , 1.1535990829258935e-8 , 5.142250714129659e-8 , 6.647170636142619e-8 , 4.094492567219277e-8 , 1.3402451770616608e-7 , 9.355561303436843e-8 , 2.8935891904779965e-8 , 6.845367295227334e-8 , 7.680615965455218e-8 , 0.0000020334689452283783 , 0.0000011085678579547675 , 2.3477592492326949e-7 , 6.582037030966603e-7 , 0.00012373103527352214 , 4.2878804151769145e-7 , 0.000006419959845516132 , 0.9993496537208557 , 0.00007372002437477931 , 0.00003101135735050775 , 0.0000056028093240456656 , 
0.0000021862508674530545 , 1.9544044604913324e-8 , 3.728893887000595e-7 , 4.2903633357127546e-7 , 1.8251179767503345e-7 , 7.159925985433802e-8 , 9.231618136595898e-9 , 6.469241498052725e-7 , 7.031690341108288e-9 , 4.451231561120039e-8 , 1.2455971898361895e-7 , 9.44632745358831e-8 , 4.347704418705689e-8 , 4.658220120745682e-7 , 6.797721141538204e-8 , 2.1060276367279585e-7 , 2.2605123106700376e-8 , 1.4311490303953178e-7 , 7.951298641728499e-8 , 1.2341783417468832e-7 , 0.0000010921713737843675 , 0.000015243892448779661 , 3.1173343018053856e-7 , 2.4152058131221565e-7 , 6.863762536113427e-8 , 8.467682022228473e-8 , 9.4246772164297e-8 , 1.0219210366813058e-8 , 3.3770753304906975e-8 , 3.6928835100979995e-8 , 1.3694031508748594e-7 , 1.0674284567357972e-7 , 2.599483650556067e-7 , 3.4866405940192635e-7 , 3.132053549848024e-8 , 3.574873232992104e-7 , 6.64843895492595e-8 , 3.1638955988455564e-7 , 0.0000012095878219042788 , 8.66409024524728e-8 , 4.0144172430700564e-8 , 1.2544761318622477e-7 , 3.3201178695208e-8 , 1.9731444922399533e-7 , 3.806405572959193e-7 , 1.3827865075199952e-7 , 2.300225965257141e-8 , 7.14422512260171e-8 , 2.851114544455413e-8 , 2.982567437470607e-8 , 8.936032713791064e-8 , 6.22388370175031e-7 , 6.478838798784636e-8 , 1.3663023423760023e-7 , 9.973181391842445e-8 , 2.5761554667269593e-8 , 4.130220077058766e-8 , 3.9384463690339544e-8 , 1.2158079698565416e-7 , 0.000004302821707824478 , 0.0000018179063090428826 , 0.0000018520155435908237 , 0.0000016246107179540559 , 0.000016448313544970006 , 0.000010544916221988387 , 0.000003993061909568496 , 2.646479799750523e-7 , 0.000019193475964129902 , 4.803242745765601e-7 , 1.696285067964709e-7 , 0.000004550505764200352 , 0.00004235929372953251 , 0.000004443338639248395 , 0.000005104009687784128 , 0.000013506396498996764 , 4.1758724478313525e-7 , 4.494491463447048e-7 , 3.156698369366495e-7 , 0.0000010557599807725637 , 1.336463917311903e-8 , 1.3893659556174498e-8 , 6.770379457066156e-8 , 1.4129696523923485e-7 , 
7.170518756538513e-8 , 7.934466594861078e-8 , 2.639154317307657e-8 , 2.6134321373660896e-8 , 7.196725881897237e-9 , 2.1752363466021052e-8 , 6.684639686227456e-8 , 3.417795824134373e-8 , 1.6228275967478112e-7 , 4.107114648377319e-7 , 6.472135396506928e-7 , 2.951379372007068e-7 , 5.653474133282543e-9 , 4.830144462175667e-8 , 8.887481861563629e-9 , 3.7306168820805397e-8 , 1.7784264727538357e-8 , 4.641905082536368e-9 , 3.413118676576232e-8 , 1.937393818707278e-7 , 0.0000012980176506971475 , 3.5641004814124244e-8 , 2.149332445355867e-8 , 3.055293689158134e-7 , 1.5532516783878236e-7 , 0.0000014520978766086046 , 3.488464628276233e-8 , 0.00003825438398052938 , 4.5088432898410247e-7 , 4.1766969616219285e-7 , 6.770622462681786e-7 , 1.4142248971893423e-7 , 0.000014235997696232516 , 6.293820433711517e-7 , 0.000004762866865348769 , 9.024900577969674e-7 , 9.058987870957935e-7 , 0.0000015713684433649178 , 1.5720647184025438e-7 , 1.818536503606083e-7 , 7.193188622522939e-8 , 0.0000011952824934269302 , 8.874837362782273e-7 , 2.0870831463071227e-7 , 9.906239029078279e-8 , 7.793621747964607e-9 , 1.0058498389753368e-7 , 4.2059440374941914e-7 , 1.843624630737395e-7 , 1.6437947181202617e-7 , 7.025352743994517e-8 , 2.570448600636155e-7 , 7.586877615040066e-8 , 7.841313731660193e-7 , 2.495309274763713e-7 , 5.157681925993529e-8 , 4.0674127177453556e-8 , 7.531796519799627e-9 , 4.797485431140558e-8 , 1.7419973019627832e-8 , 1.7958679165985814e-7 , 1.2566392371127222e-8 , 8.975440124459055e-8 , 3.26965476915575e-8 , 1.1208359751435637e-7 , 3.906746215420753e-8 , 4.6769045525252295e-8 , 1.8523553535487736e-7 , 1.4833052830454108e-7 , 1.2279349448363064e-7 , 0.0000010729105497375713 , 3.6538490011395197e-9 , 1.6198403329781286e-7 , 1.6190719875908144e-8 , 1.2004933580556099e-7 , 1.4800277448046018e-8 , 4.02294837442696e-8 , 2.15060893538066e-7 , 1.1925696696835075e-7 , 4.8982514044837444e-8 , 7.608920071788816e-8 , 2.3137479487900237e-8 , 8.521050176568679e-8 , 9.586213423062873e-8 , 
1.3351650807180704e-7 , 3.021699157557123e-8 , 4.423876376336011e-8 , 2.610667060309879e-8 , 2.3977091245797055e-7 , 1.3192564551900432e-7 , 1.6734931662654162e-8 , 1.588336999702733e-7 , 4.0643516285854275e-7 , 8.753454494581092e-8 , 8.366999395548191e-7 , 3.437598650180007e-8 , 7.847892646850596e-8 , 8.526394701391382e-9 , 9.601382799928615e-8 , 5.258924034023948e-7 , 1.3557448141909845e-7 , 1.0307226716577134e-7 , 1.0429813457335513e-8 , 5.187714435805901e-8 , 2.187001335585137e-8 , 1.1791439824548888e-8 , 2.98065643278278e-8 , 4.338393466696289e-8 , 2.9991046091026874e-8 , 2.8507610494443725e-8 , 3.058665143385042e-8 , 6.441099031917474e-8 , 1.5364101102477434e-8 , 1.5973883549236234e-8 , 2.5736850872704053e-8 , 1.0903765712555469e-7 , 3.2118737891551064e-8 , 6.819742992547617e-9 , 1.9251311300649832e-7 , 5.8258109447706374e-8 , 1.8765761922168167e-7 , 4.0070790419122204e-7 , 1.5791577823165426e-8 , 1.950158434738114e-7 , 1.0142063189277906e-8 , 2.744815041921811e-8 , 1.2843531571604672e-8 , 3.7297493094001766e-8 , 7.407496838141014e-8 , 4.20607833007125e-8 , 1.6924804668860816e-8 , 1.459203531339881e-7 , 4.344977000414474e-8 , 1.7191403856031684e-7 , 3.5817443233554513e-8 , 8.440249388286247e-9 , 4.194829728021432e-8 , 2.514032360068086e-8 , 2.8340199520471288e-8 , 8.747196034164517e-8 , 8.277125651545703e-9 , 1.1676293709683705e-8 , 1.4548514570833504e-7 , 7.200282148289716e-9 , 0.000002623600948936655 , 5.675736929333652e-7 , 0.0000019483527466945816 , 6.752595282932816e-8 , 8.168475318370838e-8 , 1.0933046468153407e-7 , 1.670913718498923e-7 , 3.1387276777650186e-8 , 2.973524537708272e-8 , 5.752163900751839e-8 , 5.850877471402782e-8 , 3.2544622285968217e-7 , 3.330221431951941e-8 , 4.186786668469722e-7 , 1.5085906568401697e-7 , 2.3346819943981245e-7 , 2.86402780602657e-7 , 2.2940319865938363e-7 , 1.8537603807544656e-7 , 3.151798182443599e-7 , 0.0000011075967449869495 , 1.5369782602192572e-7 , 1.9237509718550427e-7 , 1.64044664074936e-7 , 2.900835340824415e-7 
, 1.246654903752642e-7 , 5.802622027317739e-8 , 5.186220519703966e-8 , 6.0094205167615655e-9 , 1.2333241272699524e-7 , 1.3798474185477971e-7 , 1.7370231830682314e-7 , 5.617761189569137e-7 , 5.1604470030497396e-8 , 4.813277598714194e-8 , 8.032698417537176e-8 , 0.0000020645263703045202 , 5.638597713186755e-7 , 8.794199857220519e-7 , 0.0000034785980460583232 , 2.972389268052211e-7 , 3.3904532870110415e-7 , 9.469074058188198e-8 , 3.754845678827223e-8 , 1.5679037801419327e-7 , 8.203105039683578e-8 , 6.847962641387539e-9 , 1.8251624211984563e-8 , 6.050240841659615e-8 , 3.956342808919544e-8 , 1.0699947949888156e-7 , 3.2566634899922065e-7 , 3.5369430406717584e-7 , 7.326295303755614e-8 , 4.85765610847011e-7 , 7.717713401689252e-7 , 3.4567779749750116e-8 , 3.246204585138912e-7 , 0.0000031608601602783892 , 5.33099466792919e-8 , 3.645687343123427e-7 , 5.48158936908294e-7 , 4.62306957160763e-8 , 1.3466177506415988e-7 , 4.3529482240955986e-8 , 1.6404105451783835e-7 , 2.463695381038633e-8 , 5.958712634424046e-8 , 9.493651020875404e-8 , 5.523462576206839e-8 , 5.7412357534758485e-8 , 0.000011850350347231142 , 0.0000058263944993086625 , 0.0000074208674050169066 , 9.127966222877149e-7 , 0.0000020019581370434025 , 0.000001033498961078294 , 3.5146850763112525e-8 , 0.000002058995278275688 , 3.5655509122989315e-7 , 6.873234070781109e-8 , 2.1935298022413008e-9 , 5.560363547374436e-8 , 3.3266996979364194e-7 , 1.307369217329324e-7 , 2.718762992515167e-8 , 1.0462929189714032e-8 , 7.466680358447775e-7 , 6.923166040451179e-8 , 1.6145664361033596e-8 , 8.568521003837759e-9 , 4.76221018175238e-9 , 1.233977116044116e-7 , 8.340628632197422e-9 , 3.2649041248333788e-9 , 5.0632489312363305e-9 , 4.0704994930251814e-9 , 1.2043538610839732e-8 , 5.105608380517879e-9 , 7.267142887457112e-9 , 1.184516307262129e-7 , 7.53557927168913e-8 , 6.386964201965384e-8 , 1.6212936770898523e-8 , 2.610429419291904e-7 , 6.979425393183192e-7 , 6.647513117741255e-8 , 7.717492849224072e-7 , 6.651206945207377e-7 , 
3.324495310152997e-7 , 3.707282019149716e-7 , 3.99564243025452e-7 , 6.411632114122767e-8 , 7.107352217872176e-8 , 1.6380016631956096e-7 , 6.876800995314625e-8 , 3.462474467141874e-7 , 2.0256503319160402e-7 , 6.19610148078209e-7 , 2.6841073363925716e-8 , 6.720335363752383e-7 , 0.0000011348340649419697 , 0.0000018397931853542104 , 6.397251581802266e-7 , 7.257533241045167e-8 , 4.2213909523525217e-7 , 3.9657925299252383e-7 , 1.4037439655112394e-7 , 3.249856774800719e-7 , 1.5857655455420172e-7 , 1.1122217102865761e-7 , 7.391420808744442e-8 , 3.42322238111592e-7 , 5.39796154441774e-8 , 8.517296379295658e-8 , 0.000004061009803990601 , 0.000014478755474556237 , 7.317032757470088e-9 , 6.9484960008026064e-9 , 4.468917325084476e-8 , 9.23141172393116e-8 , 5.411982328951126e-8 , 2.2242811326123046e-7 , 1.7609554703312824e-8 , 2.0906279374344194e-8 , 3.6797682678724186e-9 , 6.177919686933819e-8 , 1.7920288541972695e-7 , 2.6279179721200308e-8 , 2.6988200119149042e-8 , 1.6432807115052128e-7 , 1.2827612749788386e-7 , 4.468908798571647e-8 , 6.316552969565237e-8 , 1.9461760203398626e-8 , 2.087125849925542e-8 , 2.2414580413965268e-8 , 2.4765244077684656e-8 , 6.785398465325443e-9 , 2.4248794971981624e-8 , 4.554979504689527e-9 , 2.8977037658250993e-8 , 2.0402325162649504e-8 , 1.600950270130852e-7 , 2.0199709638291097e-7 , 1.611188515937556e-8 , 5.964113825029926e-8 , 4.098318573397819e-9 , 3.9080127578472457e-8 , 7.511338218080255e-9 , 5.965624154669058e-7 , 1.6478223585636442e-7 , 1.4106989354445432e-8 , 3.2855584919389e-8 , 3.3387166364917675e-9 , 1.220043444050134e-8 , 4.624639160510924e-8 , 6.842309385746148e-9 , 1.74262879681919e-8 , 4.6611329906909305e-8 , 9.331947836699328e-8 , 1.2306078644996887e-7 , 1.2359445022980253e-8 , 1.1173199254699284e-8 , 2.7724862405875683e-8 , 2.419210147763806e-7 , 3.451186785241589e-7 , 2.593766978975509e-8 , 9.964568192799561e-8 , 9.797809674694236e-9 , 1.9085564417764544e-7 , 3.972706252852731e-8 , 2.6639204619982593e-8 , 6.874148805735558e-9 , 
3.146993776681484e-8 , 2.4086594407890516e-7 , 1.3126927456141857e-7 , 2.1254339799270383e-7 , 2.050203384840188e-8 , 3.694976058454813e-8 , 6.563175816154398e-7 , 2.560050127442537e-8 , 2.6882981174480847e-8 , 6.880636078676616e-7 , 2.0092733166166e-7 , 2.788039665801989e-8 , 2.628409134786125e-8 , 5.1678345158734373e-8 , 1.8935413947929192e-7 , 4.61852835087484e-7 , 1.1086777718105623e-8 , 1.4542604276357451e-7 , 2.8737009216683873e-8 , 6.105167926762078e-7 , 1.2016463379893594e-8 , 1.3944705301582871e-7 , 2.093712758721722e-8 , 4.3801410498645055e-8 , 1.966320795077081e-8 , 6.654448991838535e-9 , 1.1149590584125235e-8 , 6.424939158478082e-8 , 6.971554888934861e-9 , 3.260019587614238e-9 , 1.4260189473702667e-8 , 2.7895078247297533e-8 , 8.11578289017234e-8 , 2.5995715802196173e-8 , 2.2855578762914774e-8 , 1.055962854934478e-7 , 8.145542551574181e-8 , 3.7793686402665116e-8 , 4.881891513264236e-8 , 2.342062366267328e-8 , 1.059935517133681e-8 , 3.604105103249822e-8 , 5.062430830093945e-8 , 3.6804440384230475e-8 , 1.501580193519203e-9 , 0.0000014475033367489232 , 0.000001076210423889279 , 1.304991315009829e-7 , 3.073601462233455e-8 , 1.7184021317007137e-8 , 2.0421090596300928e-8 , 7.904992216367646e-9 , 1.6902052379919041e-7 , 1.2416506933732308e-8 , 5.4758292122869534e-8 , 2.6250422280327257e-8 , 1.3261367115546818e-8 , 6.29807459517906e-8 , 1.270998595259698e-8 , 2.0171681569536304e-7 , 4.386637186826192e-8 , 6.962349630157405e-8 , 2.9565120485131047e-7 , 7.925131626507209e-7 , 2.0868920103112032e-7 , 1.7341794489311724e-7 , 4.2942417621816276e-8 , 4.213406956665722e-9 , 8.824785169281313e-8 , 1.7341569957807224e-8 , 7.321587247588468e-8 , 1.7941774288487977e-8 , 1.1245148101579616e-7 , 4.242405395871174e-7 , 8.259573469615589e-9 , 1.1336403105133286e-7 , 8.268798978861014e-8 , 2.2186977588489754e-8 , 1.9539720952366224e-8 , 1.0675703876472653e-8 , 3.288517547161973e-8 , 2.4340963022950746e-8 , 6.639137239972115e-8 , 5.604687380866835e-9 , 1.386604697728444e-8 , 
6.675873720496384e-8 , 1.1355886009312144e-8 , 3.132159633878473e-7 , 3.12451788886392e-8 , 1.502181845580708e-7 , 1.3461754377885882e-8 , 1.8882955998833495e-7 , 4.645742279762999e-8 , 4.6453880742092224e-8 , 7.714453964524637e-9 , 3.5857155467056145e-8 , 7.60832108426257e-9 , 4.221501370693659e-8 , 4.3407251126836854e-9 , 1.340157496088068e-8 , 8.565600495558101e-8 , 1.7045413969185574e-8 , 5.4221903411644234e-8 , 3.021912675649219e-8 , 6.153376119755194e-8 , 3.938857240370908e-9 , 4.135628017820636e-8 , 1.781920389021252e-8 , 4.3105885083605244e-8 , 3.903354972578654e-9 , 7.663085455078544e-8 , 1.1890405993142394e-8 , 9.304217840622186e-9 , 1.0968062014171664e-9 , 1.0536767902635802e-8 , 1.1516804221400889e-7 , 8.134522886393825e-7 , 5.952623993721318e-8 , 2.806350174466843e-8 , 1.2833099027886874e-8 , 1.0605690192733164e-7 , 7.872949936427176e-7 , 2.7501393162765453e-8 , 3.936289072470345e-9 , 2.0519442145428002e-8 , 7.394815870753746e-9 , 3.598397313453461e-8 , 2.5378517065632877e-8 , 4.698972233541099e-8 , 7.54952989012736e-9 , 6.322805461422831e-7 , 5.582006412652163e-9 , 1.29640980617296e-7 , 1.5874988434916304e-8 , 3.3837810775594335e-8 , 6.474512037613067e-9 , 9.121148281110436e-8 , 1.3918511676536127e-8 , 8.230025549949005e-9 , 2.7061290097663004e-8 , 2.6095918315149902e-8 , 5.722363471960534e-9 , 6.963475698285038e-7 , 4.685091781198025e-8 , 9.590579885809802e-9 , 2.099205858030473e-7 , 3.082160660028421e-8 , 3.563162565001221e-8 , 7.326312925215461e-7 , 0.0000021759731225756695 , 2.407518309155421e-7 , 2.974515780351794e-7 , 2.529018416908002e-8 , 7.667950718825978e-9 , 2.663289251358947e-7 , 3.4358880185436647e-8 , 2.3130198201215535e-8 , 3.1239693498719134e-8 , 2.8691621878351725e-7 , 3.895845068768722e-8 , 2.4184130253956937e-8 , 1.1582445225144511e-8 , 5.1545349322168477e-8 , 2.034345492063494e-8 , 8.201963197507212e-8 , 1.164153573540716e-8 , 5.496356720868789e-7 , 1.1682151246361627e-8 , 4.7576914852243135e-8 , 1.6349824605299546e-8 , 
4.090862759653646e-8 , 2.1271189609706198e-7 , 1.6697286753242224e-7 , 3.989708119433999e-8 , 0.000002852450279533514 , 1.2500372292834072e-7 , 2.4846613655427063e-7 , 1.245429093188477e-8 , 2.9700272463628608e-8 , 4.250991558762962e-9 , 1.61443480806156e-7 , 2.6386018703306036e-7 , 7.638056409575711e-9 , 3.4455793773702226e-9 , 7.273289526210647e-8 , 1.7631434090503717e-8 , 7.58661311550668e-9 , 2.1547013062672704e-8 , 1.2675349125856883e-7 , 2.5637149292379036e-8 , 3.500976220038865e-8 , 6.472243541111311e-8 , 8.387915251262257e-9 , 3.069512288789156e-8 , 7.520387867998579e-8 , 1.5724964441687916e-7 , 1.9634005354873807e-7 , 1.2290831818972947e-7 , 1.112118730439704e-9 , 1.546895944670723e-8 , 9.91701032404535e-9 , 6.882473257974198e-7 , 8.267616635748709e-8 , 4.469531234008173e-8 , 2.075201344098332e-8 , 8.649378457903367e-8 , 5.202766573120243e-8 , 4.5564942041664835e-8 , 2.0319955496006514e-8 , 8.705182352741758e-9 , 6.452066969586667e-8 , 2.1777438519166026e-8 , 1.030954166481024e-8 , 3.211904342492744e-8 , 2.3336936294526822e-7 , 8.054096056753224e-9 , 1.9623354319264763e-7 , 1.2888089884199871e-7 , 1.5392496166555247e-8 , 1.401903038100727e-9 , 5.696818305978013e-8 , 6.080025372057207e-9 , 1.0782793324892737e-8 , 2.4260730313585555e-8 , 1.9388659566743627e-8 , 2.2970310453729326e-7 , 1.9971754028347277e-8 , 2.8477993296860404e-8 , 5.2273552597625894e-8 , 2.7392806600801123e-7 , 9.857291161097237e-8 , 3.12910977129377e-8 , 4.151442212219081e-8 , 5.251196366629074e-9 , 0.000001580681100676884 , 8.547603442821128e-7 , 1.068913135782168e-8 , 0.0000010621830597301596 , 7.737313012512459e-8 , 6.394216711669287e-8 , 1.1698345758759388e-7 , 1.0486609625104393e-7 , 2.1161000063329993e-7 , 1.53396815250062e-8 , 5.094453570109181e-8 , 1.4005379966874898e-8 , 2.6282036102998063e-8 , 8.778433624456738e-8 , 7.772066545896905e-9 , 4.228875383205377e-8 , 3.3243779284930497e-7 , 7.729244799747903e-8 , 7.636901111496286e-10 , 5.989500806435899e-8 , 1.326090597331131e-7 , 
1.2853634245857393e-7 , 8.844242671557367e-9 , 1.0194374766570036e-7 , 2.493779334145074e-7 , 1.6547971881664125e-7 , 1.1762754326127833e-8 , 1.1496195639892903e-7 , 2.9342709240154363e-7 , 1.326124099421122e-8 , 8.630262726683213e-8 , 5.7394842656322e-8 , 1.1094081031615133e-7 , 2.2933713239581266e-7 , 3.4706170026765903e-7 , 1.4751107357824367e-7 , 1.502495017291494e-8 , 6.454319390059027e-8 , 5.164533689594464e-8 , 6.23741556182722e-8 , 1.293601457064142e-7 , 1.4052071506398534e-8 , 5.386946000385251e-8 , 2.0827554791935654e-8 , 1.3040637902861363e-8 , 1.0578981601838677e-7 , 1.5079727688771527e-8 , 8.92632726845477e-7 , 4.6374381668101705e-8 , 7.481006036869076e-7 , 5.883147302654379e-9 , 2.8707685117979054e-9 , 8.381598490814213e-7 , 7.341958596640552e-9 , 1.4245998158912698e-8 , 1.0926417104428765e-7 , 1.1308178216040687e-7 , 2.52339901862797e-7 , 1.1782835684925885e-7 , 4.6678056975224536e-8 , 2.7959197179683315e-9 , 3.4363861090014325e-8 , 1.4674496640054713e-7 , 3.5396915620822256e-8 , 2.0581127557761647e-7 , 7.18387909159901e-8 , 2.7693943138729082e-8 , 4.5493386835460115e-8 , 1.9559182717898693e-8 , 1.5359708172013598e-8 , 1.2336623278486059e-8 , 2.9570605519779747e-8 , 2.877552560676122e-7 , 9.051845495378075e-7 , 2.3732602016934834e-7 , 1.6521676471370483e-8 , 1.5478875070584763e-8 , 3.526786329643983e-8 , 3.616410637619083e-8 , 1.61590953950963e-8 , 7.65007328595857e-8 , 1.9661483108279754e-8 , 4.917534823789538e-8 , 1.1712612746350715e-7 , 1.0889253054813253e-8 , 0.000001494120169809321 , 1.018585660261806e-8 , 3.7575969003000864e-8 , 2.097097784314883e-8 , 3.368558054717141e-8 , 4.845588819080149e-9 , 6.039624622644624e-7 , 1.037331109898787e-8 , 2.841650257323636e-7 , 4.4990630954089283e-7 , 3.463186004637464e-8 , 7.720684180867465e-8 , 1.471122175189521e-7 , 1.1601575522490748e-7 , 4.007488030310924e-7 , 3.025649775167949e-8 , 6.706784461130155e-8 , 2.0128741340386114e-8 , 1.5987744461654074e-9 , 4.1919822280078733e-8 , 1.3167154477855547e-8 , 
3.231814815762846e-8 , 9.247659704669786e-8 , 1.3075300842047e-7 , 1.0574301256838226e-7 , 3.762165334819656e-8 , 1.0942246575496029e-7 , 7.001474955359299e-8 , 2.742706151082075e-8 , 2.0766625752344225e-8 , 4.5403403703403455e-8 , 3.39040298058535e-8 , 1.0469661759771043e-7 , 2.8271578855765256e-8 , 3.406226767310727e-7 , 5.146206945028098e-7 , 6.740708613506285e-7 , 6.382248063374618e-9 , 3.63878704945364e-8 , 3.626059807970705e-8 , 1.6065602892467723e-7 , 3.639055989879125e-7 , 6.232691696084203e-9 , 4.805490050330263e-8 , 3.372633727849461e-8 , 6.328880317596486e-7 , 6.480631498106959e-8 , 2.1165197949812864e-7 , 8.38779143919055e-8 , 1.7589144363228115e-8 , 2.729027670511641e-9 , 2.144795097080987e-8 , 7.861271456022223e-8 , 2.0118186228046397e-8 , 2.8407685093156942e-8 , 2.4922530883486615e-7 , 2.0156670998972004e-8 , 2.6551649767725394e-8 , 2.7848242822869906e-8 , 6.907123761834555e-9 , 1.880543720744754e-8 , 1.3006903998302732e-8 , 3.685918272822164e-7 , 3.967941211158177e-7 , 2.7592133022835696e-8 , 2.5228947819755376e-8 , 1.547002881352455e-7 , 3.689306637966183e-8 , 1.440177199718562e-9 , 2.1504929392790473e-8 , 5.068111263994979e-8 , 5.081711407228795e-8 , 1.171875219085905e-8 , 5.409278358570191e-8 , 7.138276600926474e-7 , 2.5237213208129106e-7 , 7.072044638789521e-8 , 7.199763984999663e-8 , 1.2525473103153217e-8 , 3.4803417747752974e-7 , 1.9591827538079087e-7 , 1.2404700555634918e-7 , 1.234617457157583e-7 , 1.9201337408958352e-8 , 1.9895249181445251e-7 , 3.7876677794201896e-8 , 1.0629785052174157e-8 , 1.2437127772102485e-8 , 2.1861892207653e-7 , 2.6181456291851646e-7 , 1.112900775979142e-7 , 1.0776630432474121e-7 , 6.380325157095967e-9 , 3.895085143312826e-9 , 1.5762756788717525e-7 , 2.909027019271093e-9 , 1.0381050685737137e-8 , 2.8135211493918177e-8 , 1.0778002490496874e-8 , 1.3605974125141529e-8 , 2.9236465692861202e-8 , 1.9189795352758665e-7 , 2.199506354827463e-7 , 1.326399790002597e-8 , 4.9004846403022384e-8 , 2.980837132682268e-9 , 
8.926045680368588e-9 , 1.0996975774446582e-8 , 7.71560149104289e-9 , 7.454491246505768e-9 , 5.086162246925596e-8 , 1.5129764108223753e-7 , 1.1960075596562092e-8 , 1.1323334270230134e-8 , 9.391332156383214e-9 , 9.585701832293125e-8 , 1.905532798218701e-8 , 1.8105303922766325e-8 , 6.179227796110354e-8 , 6.389401363549041e-8 , 1.1853179771037503e-8 , 9.37277544466042e-9 , 1.2332148457971925e-7 , 1.6522022860954166e-8 , 1.246116454467483e-7 , 4.196171854431441e-9 , 3.996593278543514e-8 , 1.2554556505506298e-8 , 1.4302138140465104e-8 , 6.631793780798034e-9 , 5.964224669696705e-9 , 5.556936244488497e-9 , 1.4192455921602232e-7 , 1.7613080771639034e-8 , 3.380189639301534e-7 , 7.85651934620546e-8 , 2.966783085867064e-8 , 0.0000028992105853831163 , 0.0000013787366697215475 , 5.313622430946907e-9 , 2.512852859126724e-8 , 8.406627216572815e-8 , 4.492839167369311e-8 , 5.408793057881667e-8 , 2.4239175999696272e-8 , 4.016805235096399e-7 , 4.1083545454512205e-8 , 5.4153481698904216e-8 , 8.640767212853007e-9 , 5.773256717134245e-8 , 2.6443152023603034e-7 , 8.953217047746875e-7 , 2.7994001783326894e-8 , 5.889480014786841e-9 , 4.1788819515886644e-8 , 2.8880645430717777e-8 , 2.135752907861388e-8 , 2.3024175277441827e-7 , 8.786625471657317e-8 , 2.0697297209437693e-9 , 2.236410523437371e-8 , 3.203276310870251e-9 , 1.176874686592555e-8 , 6.963571053120177e-8 , 2.271932153519174e-8 , 7.360382525689602e-9 , 6.922528772435044e-9 , 3.213871480056696e-8 , 1.370577820125618e-7 , 1.9815049157045905e-8 , 1.0578956377571558e-8 , 2.7049420481262132e-8 , 2.9755937713815683e-9 , 2.1773699288019088e-8 , 1.09755387001087e-8 , 1.991872444762066e-8 , 2.3882098076910552e-8 , 2.1357365653784655e-8 , 6.109098560358461e-9 , 1.1890497475519624e-8 , 1.1459891702259029e-8 , 3.73173456580389e-8 , 1.572620256240498e-8 , 3.404023374287135e-8 , 3.6921580459647885e-8 , 9.281765045443535e-8 , 1.2323201303843234e-7 , 4.2347593876002065e-8 , 1.7423728237986325e-8 , 5.8113389656000436e-8 , 3.931436154402945e-8 , 
2.3690461148362374e-8 , 1.792850135018398e-8 , 1.440664210150544e-8 , 7.019830494670032e-9 , 6.041522482291839e-8 , 4.867479930226182e-8 , 1.0685319296044327e-8 , 1.0051243393149889e-8 , 4.2426261614991745e-8 , 2.607815297039906e-8 , 5.136670200300841e-9 , 1.69729952315123e-9 , 1.9131586981302462e-8 , 2.111743526711507e-7 , 1.337269672774255e-8 , 2.0002481448955223e-8 , 1.0454256482717028e-7 , 2.8144228281234973e-8 , 2.1344791889532644e-7 , 2.1046110632028103e-8 , 1.9114453664315079e-7 , 3.957693550660224e-8 , 2.931631826186276e-8 , 1.105203111251285e-7 , 4.84007678380749e-8 , 5.583606110803885e-8 , 1.2130111315400427e-7 , 1.77621615193857e-8 , 2.5610853882085394e-8 , 1.203865309662433e-7 , 4.674859610531712e-9 , 1.5916098661250544e-8 , 3.147594185293201e-8 , 6.147686093527227e-8 , 2.204641802450169e-8 , 3.257763410147163e-7 , 1.198914532096751e-7 , 2.3818989802748547e-7 , 1.4909986134625797e-8 , 5.10168831624469e-8 , 5.5142201915714395e-8 , 2.288550327023131e-8 , 5.714110073995471e-8 , 5.185095801607531e-7 , 4.977285783525076e-8 , 1.1049896109227575e-8 , 1.264099296349741e-7 , 8.174881571676451e-8 ]}]}","title":"Test the Deployed Model"},{"location":"modelserving/v1beta1/paddle/#deploy-the-model-with-grpc-endpoint-through-inferenceservice","text":"Create the inference service resource and expose the gRPC port using the below yaml. Note Currently, KServe only supports exposing either HTTP or gRPC port. By default, HTTP port is exposed. 
Serverless RawDeployment apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"paddle-v2-resnet50-grpc\" spec : predictor : model : modelFormat : name : paddle protocolVersion : v2 runtime : kserve-paddleserver storageUri : \"gs://kfserving-examples/models/paddle/resnet\" ports : - name : h2c # knative expects grpc port name to be 'h2c' protocol : TCP containerPort : 8081 apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"paddle-v2-resnet50-grpc\" spec : predictor : model : modelFormat : name : paddle protocolVersion : v2 runtime : kserve-paddleserver storageUri : \"gs://kfserving-examples/models/paddle/resnet\" ports : - name : grpc-port # Istio requires the port name to be in the format <protocol>[-<suffix>] protocol : TCP containerPort : 8081 Apply the InferenceService yaml to get the gRPC endpoint kubectl kubectl apply -f paddle-v2-grpc.yaml","title":"Deploy the Model with GRPC endpoint through InferenceService"},{"location":"modelserving/v1beta1/paddle/#test-the-deployed-model-with-grpcurl","text":"After the gRPC InferenceService becomes ready, grpcurl can be used to send gRPC requests to the InferenceService . # download the proto file curl -O https://raw.githubusercontent.com/kserve/open-inference-protocol/main/specification/protocol/open_inference_grpc.proto INPUT_PATH = jay-v2-grpc.json PROTO_FILE = open_inference_grpc.proto SERVICE_HOSTNAME = $( kubectl get inferenceservice paddle-v2-resnet50-grpc -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) Determine the ingress IP and port and set INGRESS_HOST and INGRESS_PORT . Now, you can use grpcurl to send the inference requests. The gRPC APIs follow the KServe prediction V2 protocol / Open Inference Protocol . 
For example, the ServerReady API can be used to check if the server is ready: grpcurl \\ -plaintext \\ -proto ${ PROTO_FILE } \\ -authority ${ SERVICE_HOSTNAME } \\ ${ INGRESS_HOST } : ${ INGRESS_PORT } \\ inference.GRPCInferenceService.ServerReady Expected Output { \"ready\" : true } You can use the example payload jay-v2-grpc.json as the sample input to test the model. Notice that the input format differs from the one in the previous REST endpoint example. The ModelInfer API takes input following the ModelInferRequest schema defined in the open_inference_grpc.proto file. grpcurl \\ -vv \\ -plaintext \\ -proto ${ PROTO_FILE } \\ -authority ${ SERVICE_HOSTNAME } \\ -d @ \\ ${ INGRESS_HOST } : ${ INGRESS_PORT } \\ inference.GRPCInferenceService.ModelInfer \\ <<< $( cat \" $INPUT_PATH \" ) Expected Output Resolved method descriptor: // The ModelInfer API performs inference using the specified model. Errors are // indicated by the google.rpc.Status returned for the request. The OK code // indicates success and other codes indicate failure. 
rpc ModelInfer ( .inference.ModelInferRequest ) returns ( .inference.ModelInferResponse ) ; Request metadata to send: ( empty ) Response headers received: content-type: application/grpc date: Tue, 10 Oct 2023 14 :55:27 GMT grpc-accept-encoding: identity, deflate, gzip server: istio-envoy x-envoy-upstream-service-time: 190 Estimated response size: 4093 bytes Response contents: { \"modelName\" : \"paddle-v2-resnet50-grpc\" , \"id\" : \"97db0c56-95d2-4171-afd5-f7609a87032d\" , \"outputs\" : [ { \"name\" : \"output-0\" , \"datatype\" : \"FP32\" , \"shape\" : [ \"1\" , \"1000\" ] , \"contents\" : { \"fp32Contents\" : [ 6 .7366788e-9,1.1535991e-8,5.1422507e-8,6.6471706e-8,4.0944926e-8,1.3402452e-7,9.355561e-8,2.8935892e-8,6.845367e-8,7.680616e-8,0.000002033469,0.0000011085679,2.3477592e-7,6.582037e-7,0.00012373104,4.2878804e-7,0.00000641996,0.99934965,0.000073720024,0.000031011357,0.0000056028093,0.0000021862509,1.9544045e-8,3.728894e-7,4.2903633e-7,1.825118e-7,7.159926e-8,9.231618e-9,6.4692415e-7,7.0316903e-9,4.4512316e-8,1.2455972e-7,9.4463275e-8,4.3477044e-8,4.65822e-7,6.797721e-8,2.1060276e-7,2.2605123e-8,1.431149e-7,7.9512986e-8,1.2341783e-7,0.0000010921714,0.000015243892,3.1173343e-7,2.4152058e-7,6.8637625e-8,8.467682e-8,9.424677e-8,1.021921e-8,3.3770753e-8,3.6928835e-8,1.3694032e-7,1.06742846e-7,2.5994837e-7,3.4866406e-7,3.1320535e-8,3.5748732e-7,6.648439e-8,3.1638956e-7,0.0000012095878,8.66409e-8,4.0144172e-8,1.2544761e-7,3.320118e-8,1.9731445e-7,3.8064056e-7,1.3827865e-7,2.300226e-8,7.144225e-8,2.8511145e-8,2.9825674e-8,8.936033e-8,6.2238837e-7,6.478839e-8,1.3663023e-7,9.9731814e-8,2.5761555e-8,4.13022e-8,3.9384464e-8,1.215808e-7,0.0000043028217,0.0000018179063,0.0000018520155,0.0000016246107,0.000016448314,0.000010544916,0.000003993062,2.6464798e-7,0.000019193476,4.803243e-7,1.696285e-7,0.0000045505058,0.000042359294,0.0000044433386,0.0000051040097,0.0000135063965,4.1758724e-7,4.4944915e-7,3.1566984e-7,0.00000105576,1.3364639e-8,1.389366e-8,6.7703795e-8,1.412969
7e-7,7.170519e-8,7.9344666e-8,2.6391543e-8,2.6134321e-8,7.196726e-9,2.1752363e-8,6.68464e-8,3.417796e-8,1.6228276e-7,4.1071146e-7,6.4721354e-7,2.9513794e-7,5.653474e-9,4.8301445e-8,8.887482e-9,3.730617e-8,1.7784265e-8,4.641905e-9,3.4131187e-8,1.9373938e-7,0.0000012980177,3.5641005e-8,2.1493324e-8,3.0552937e-7,1.5532517e-7,0.0000014520979,3.4884646e-8,0.000038254384,4.5088433e-7,4.176697e-7,6.7706225e-7,1.4142249e-7,0.000014235998,6.2938204e-7,0.000004762867,9.0249006e-7,9.058988e-7,0.0000015713684,1.5720647e-7,1.8185365e-7,7.1931886e-8,0.0000011952825,8.8748374e-7,2.0870831e-7,9.906239e-8,7.793622e-9,1.00584984e-7,4.205944e-7,1.8436246e-7,1.6437947e-7,7.025353e-8,2.5704486e-7,7.5868776e-8,7.841314e-7,2.4953093e-7,5.157682e-8,4.0674127e-8,7.5317965e-9,4.7974854e-8,1.7419973e-8,1.7958679e-7,1.2566392e-8,8.97544e-8,3.2696548e-8,1.120836e-7,3.9067462e-8,4.6769046e-8,1.8523554e-7,1.4833053e-7,1.227935e-7,0.0000010729105,3.653849e-9,1.6198403e-7,1.619072e-8,1.2004934e-7,1.48002774e-8,4.0229484e-8,2.150609e-7,1.1925697e-7,4.8982514e-8,7.60892e-8,2.313748e-8,8.52105e-8,9.5862134e-8,1.3351651e-7,3.021699e-8,4.4238764e-8,2.610667e-8,2.397709e-7,1.3192565e-7,1.6734932e-8,1.588337e-7,4.0643516e-7,8.7534545e-8,8.3669994e-7,3.4375987e-8,7.847893e-8,8.526395e-9,9.601383e-8,5.258924e-7,1.3557448e-7,1.0307227e-7,1.04298135e-8,5.1877144e-8,2.1870013e-8,1.179144e-8,2.9806564e-8,4.3383935e-8,2.9991046e-8,2.850761e-8,3.058665e-8,6.441099e-8,1.5364101e-8,1.5973884e-8,2.573685e-8,1.0903766e-7,3.2118738e-8,6.819743e-9,1.9251311e-7,5.825811e-8,1.8765762e-7,4.007079e-7,1.5791578e-8,1.9501584e-7,1.0142063e-8,2.744815e-8,1.2843532e-8,3.7297493e-8,7.407497e-8,4.2060783e-8,1.6924805e-8,1.4592035e-7,4.344977e-8,1.7191404e-7,3.5817443e-8,8.440249e-9,4.1948297e-8,2.5140324e-8,2.83402e-8,8.747196e-8,8.277126e-9,1.1676294e-8,1.4548515e-7,7.200282e-9,0.000002623601,5.675737e-7,0.0000019483527,6.752595e-8,8.168475e-8,1.09330465e-7,1.6709137e-7,3.1387277e-8,2.9735245e-8,5.752164e-8,5.8508775e-8,3.254462
2e-7,3.3302214e-8,4.1867867e-7,1.5085907e-7,2.334682e-7,2.8640278e-7,2.294032e-7,1.8537604e-7,3.1517982e-7,0.0000011075967,1.5369783e-7,1.923751e-7, 1 .6404466e-7,2.9008353e-7,1.2466549e-7,5.802622e-8,5.1862205e-8,6.0094205e-9,1.2333241e-7,1.3798474e-7,1.7370232e-7,5.617761e-7,5.160447e-8,4.8132776e-8,8.0326984e-8,0.0000020645264,5.6385977e-7,8.7942e-7,0.000003478598,2.9723893e-7,3.3904533e-7,9.469074e-8,3.7548457e-8,1.5679038e-7,8.203105e-8,6.8479626e-9,1.8251624e-8,6.050241e-8,3.9563428e-8,1.0699948e-7,3.2566635e-7,3.536943e-7,7.326295e-8,4.857656e-7,7.7177134e-7,3.456778e-8,3.2462046e-7,0.0000031608602,5.3309947e-8,3.6456873e-7,5.4815894e-7,4.6230696e-8,1.3466178e-7,4.3529482e-8,1.6404105e-7,2.4636954e-8,5.9587126e-8,9.493651e-8,5.5234626e-8,5.7412358e-8,0.00001185035,0.0000058263945,0.0000074208674,9.127966e-7,0.0000020019581,0.000001033499,3.514685e-8,0.0000020589953,3.565551e-7,6.873234e-8,2.1935298e-9,5.5603635e-8,3.3266997e-7,1.3073692e-7,2.718763e-8,1.0462929e-8,7.4666804e-7,6.923166e-8,1.6145664e-8,8.568521e-9,4.76221e-9,1.2339771e-7,8.340629e-9,3.2649041e-9,5.063249e-9,4.0704995e-9,1.2043539e-8,5.1056084e-9,7.267143e-9,1.1845163e-7,7.535579e-8,6.386964e-8,1.6212937e-8,2.6104294e-7,6.9794254e-7,6.647513e-8,7.717493e-7,6.651207e-7,3.3244953e-7,3.707282e-7,3.9956424e-7,6.411632e-8,7.107352e-8,1.6380017e-7,6.876801e-8,3.4624745e-7,2.0256503e-7,6.1961015e-7,2.6841073e-8,6.7203354e-7,0.0000011348341,0.0000018397932,6.3972516e-7,7.257533e-8,4.221391e-7,3.9657925e-7,1.403744e-7,3.2498568e-7,1.5857655e-7,1.1122217e-7,7.391421e-8,3.4232224e-7,5.3979615e-8,8.5172964e-8,0.00000406101,0.0000144787555,7.3170328e-9,6.948496e-9,4.4689173e-8,9.231412e-8,5.4119823e-8,2.2242811e-7,1.7609555e-8,2.090628e-8,3.6797683e-9,6.17792e-8,1.7920289e-7,2.627918e-8,2.69882e-8,1.6432807e-7,1.2827613e-7,4.4689088e-8,6.316553e-8,1.946176e-8,2.0871258e-8,2.241458e-8,2.4765244e-8,6.7853985e-9,2.4248795e-8,4.5549795e-9,2.8977038e-8,2.0402325e-8,1.6009503e-7,2.019971e-7,1.6111885e-8,5.964114e
-8,4.0983186e-9,3.9080128e-8,7.511338e-9,5.965624e-7,1.6478224e-7,1.4106989e-8,3.2855585e-8,3.3387166e-9,1.2200434e-8,4.624639e-8,6.8423094e-9,1.7426288e-8,4.661133e-8,9.331948e-8,1.2306079e-7,1.2359445e-8,1.1173199e-8,2.7724862e-8,2.4192101e-7,3.4511868e-7,2.593767e-8,9.964568e-8,9.79781e-9,1.9085564e-7,3.9727063e-8,2.6639205e-8,6.874149e-9,3.1469938e-8,2.4086594e-7,1.3126927e-7,2.125434e-7,2.0502034e-8,3.694976e-8,6.563176e-7,2.5600501e-8,2.6882981e-8,6.880636e-7,2.0092733e-7,2.7880397e-8,2.6284091e-8,5.1678345e-8,1.8935414e-7,4.6185284e-7,1.1086778e-8,1.4542604e-7,2.873701e-8,6.105168e-7,1.2016463e-8,1.3944705e-7,2.0937128e-8,4.380141e-8,1.9663208e-8,6.654449e-9,1.1149591e-8,6.424939e-8,6.971555e-9,3.2600196e-9,1.42601895e-8,2.7895078e-8,8.115783e-8,2.5995716e-8,2.2855579e-8,1.05596285e-7,8.1455426e-8,3.7793686e-8,4.8818915e-8,2.3420624e-8,1.0599355e-8,3.604105e-8,5.062431e-8,3.680444e-8,1.5015802e-9,0.0000014475033,0.0000010762104,1.3049913e-7,3.0736015e-8,1.7184021e-8,2.042109e-8,7.904992e-9,1.6902052e-7,1.2416507e-8,5.4758292e-8,2.6250422e-8,1.3261367e-8,6.2980746e-8,1.2709986e-8,2.0171682e-7,4.3866372e-8,6.9623496e-8,2.956512e-7,7.9251316e-7,2.086892e-7,1.7341794e-7,4.2942418e-8,4.213407e-9,8.824785e-8,1.734157e-8,7.321587e-8,1.7941774e-8,1.1245148e-7,4.2424054e-7,8.2595735e-9,1.1336403e-7,8.268799e-8,2.2186978e-8,1.9539721e-8,1.0675704e-8,3.2885175e-8,2.4340963e-8,6.639137e-8,5.6046874e-9,1.3866047e-8,6.675874e-8,1.1355886e-8,3.1321596e-7,3.124518e-8,1.5021818e-7,1.3461754e-8,1.8882956e-7,4.6457423e-8,4.645388e-8,7.714454e-9,3.5857155e-8,7.608321e-9,4.2215014e-8,4.340725e-9,1.3401575e-8,8.5656005e-8,1.7045414e-8,5.4221903e-8,3.0219127e-8,6.153376e-8,3.9388572e-9, 4 
.135628e-8,1.7819204e-8,4.3105885e-8,3.903355e-9,7.6630855e-8,1.1890406e-8,9.304218e-9,1.0968062e-9,1.0536768e-8,1.1516804e-7,8.134523e-7,5.952624e-8,2.8063502e-8,1.2833099e-8,1.060569e-7,7.87295e-7,2.7501393e-8,3.936289e-9,2.0519442e-8,7.394816e-9,3.5983973e-8,2.5378517e-8,4.6989722e-8,7.54953e-9,6.3228055e-7,5.5820064e-9,1.2964098e-7,1.5874988e-8,3.383781e-8,6.474512e-9,9.121148e-8,1.3918512e-8,8.2300255e-9,2.706129e-8,2.6095918e-8,5.7223635e-9,6.9634757e-7,4.6850918e-8,9.59058e-9,2.0992059e-7,3.0821607e-8,3.5631626e-8,7.326313e-7,0.0000021759731,2.4075183e-7,2.9745158e-7,2.5290184e-8,7.667951e-9,2.6632893e-7,3.435888e-8,2.3130198e-8,3.1239693e-8,2.8691622e-7,3.895845e-8,2.418413e-8,1.1582445e-8,5.154535e-8,2.0343455e-8,8.201963e-8,1.1641536e-8,5.496357e-7,1.1682151e-8,4.7576915e-8,1.6349825e-8,4.0908628e-8,2.127119e-7,1.6697287e-7,3.989708e-8,0.0000028524503,1.2500372e-7,2.4846614e-7,1.2454291e-8,2.9700272e-8,4.2509916e-9,1.6144348e-7,2.638602e-7,7.638056e-9,3.4455794e-9,7.2732895e-8,1.7631434e-8,7.586613e-9,2.1547013e-8,1.2675349e-7,2.563715e-8,3.5009762e-8,6.4722435e-8,8.387915e-9,3.0695123e-8,7.520388e-8,1.5724964e-7,1.9634005e-7,1.2290832e-7,1.1121187e-9,1.546896e-8,9.91701e-9,6.882473e-7,8.2676166e-8,4.4695312e-8,2.0752013e-8,8.6493785e-8,5.2027666e-8,4.5564942e-8,2.0319955e-8,8.705182e-9,6.452067e-8,2.1777439e-8,1.0309542e-8,3.2119043e-8,2.3336936e-7,8.054096e-9,1.9623354e-7,1.288809e-7,1.5392496e-8,1.401903e-9,5.6968183e-8,6.0800254e-9,1.0782793e-8,2.426073e-8,1.938866e-8,2.297031e-7,1.9971754e-8,2.8477993e-8,5.2273553e-8,2.7392807e-7,9.857291e-8,3.1291098e-8,4.1514422e-8,5.2511964e-9,0.0000015806811,8.5476034e-7,1.0689131e-8,0.0000010621831,7.737313e-8,6.394217e-8,1.1698346e-7,1.04866096e-7,2.1161e-7,1.5339682e-8,5.0944536e-8,1.400538e-8,2.6282036e-8,8.7784336e-8,7.7720665e-9,4.2288754e-8,3.324378e-7,7.729245e-8,7.636901e-10,5.989501e-8,1.3260906e-7,1.2853634e-7,8.844243e-9,1.0194375e-7,2.4937793e-7,1.6547972e-7,1.1762754e-8,1.14961956e-7,2.934271e-7,1.32
61241e-8,8.630263e-8,5.7394843e-8,1.1094081e-7,2.2933713e-7,3.470617e-7,1.4751107e-7,1.502495e-8,6.4543194e-8,5.1645337e-8,6.2374156e-8,1.2936015e-7,1.40520715e-8,5.386946e-8,2.0827555e-8,1.3040638e-8,1.05789816e-7,1.5079728e-8,8.926327e-7,4.637438e-8,7.481006e-7,5.8831473e-9,2.8707685e-9,8.3815985e-7,7.3419586e-9,1.4245998e-8,1.0926417e-7,1.1308178e-7,2.523399e-7,1.1782836e-7,4.6678057e-8,2.7959197e-9,3.436386e-8,1.4674497e-7,3.5396916e-8,2.0581128e-7,7.183879e-8,2.7693943e-8,4.5493387e-8,1.9559183e-8,1.5359708e-8,1.2336623e-8,2.9570606e-8,2.8775526e-7,9.0518455e-7,2.3732602e-7,1.6521676e-8,1.5478875e-8,3.5267863e-8,3.6164106e-8,1.6159095e-8,7.650073e-8,1.9661483e-8,4.917535e-8,1.1712613e-7,1.0889253e-8,0.0000014941202,1.0185857e-8,3.757597e-8,2.0970978e-8,3.368558e-8,4.845589e-9,6.0396246e-7,1.0373311e-8,2.8416503e-7,4.499063e-7,3.463186e-8,7.720684e-8,1.4711222e-7,1.16015755e-7,4.007488e-7,3.0256498e-8,6.7067845e-8,2.0128741e-8,1.5987744e-9,4.1919822e-8,1.31671545e-8,3.2318148e-8,9.24766e-8,1.3075301e-7,1.0574301e-7,3.7621653e-8,1.09422466e-7,7.001475e-8,2.7427062e-8,2.0766626e-8,4.5403404e-8,3.390403e-8,1.0469662e-7,2.8271579e-8,3.4062268e-7,5.146207e-7,6.7407086e-7,6.382248e-9,3.638787e-8,3.6260598e-8,1.6065603e-7,3.639056e-7,6.2326917e-9,4.80549e-8,3.3726337e-8,6.3288803e-7,6.4806315e-8,2.1165198e-7,8.3877914e-8,1.7589144e-8,2.7290277e-9,2.1447951e-8,7.8612715e-8,2.0118186e-8,2.8407685e-8,2.492253e-7,2.0156671e-8,2.655165e-8,2.7848243e-8,6.9071238e-9,1.8805437e-8,1.3006904e-8,3.6859183e-7,3.9679412e-7,2.7592133e-8,2.5228948e-8,1.5470029e-7,3.6893066e-8,1.4401772e-9,2.150493e-8,5.0681113e-8,5.0817114e-8,1.1718752e-8,5.4092784e-8,7.1382766e-7,2.5237213e-7,7.0720446e-8,7.199764e-8,1.2525473e-8,3.4803418e-7,1.9591828e-7,1.24047e-7,1.2346175e-7,1.9201337e-8,1.9895249e-7,3.7876678e-8,1.0629785e-8,1.2437128e-8,2.1861892e-7,2.6181456e-7,1.1129008e-7,1.07766304e-7,6.380325e-9,3.895085e-9,1.5762757e-7,2.909027e-9,1.0381051e-8,2.8135211e-8,1.07780025e-8,1.3605974e-8,2.92
36466e-8,1.9189795e-7,2.1995064e-7,1.3263998e-8,4.9004846e-8,2.9808371e-9,8.926046e-9,1.0996976e-8,7.7156015e-9,7.454491e-9,5.0861622e-8,1.5129764e-7,1.1960076e-8,1.1323334e-8,9.391332e-9,9.585702e-8,1.9055328e-8,1.8105304e-8,6.179228e-8,6.3894014e-8,1.185318e-8,9.3727754e-9,1.2332148e-7,1.6522023e-8,1.2461165e-7,4.196172e-9,3.9965933e-8,1.25545565e-8,1.4302138e-8,6.631794e-9,5.9642247e-9,5.5569362e-9,1.4192456e-7,1.761308e-8,3.3801896e-7,7.856519e-8,2.966783e-8,0.0000028992106,0.0000013787367,5.3136224e-9,2.5128529e-8,8.406627e-8,4.492839e-8,5.408793e-8,2.4239176e-8,4.0168052e-7,4.1083545e-8,5.415348e-8,8.640767e-9,5.7732567e-8,2.6443152e-7,8.953217e-7,2.7994002e-8,5.88948e-9,4.178882e-8,2.8880645e-8,2.1357529e-8,2.3024175e-7,8.7866255e-8,2.0697297e-9,2.2364105e-8,3.2032763e-9,1.1768747e-8,6.963571e-8,2.2719322e-8,7.3603825e-9,6.9225288e-9,3.2138715e-8,1.3705778e-7,1.981505e-8,1.0578956e-8,2.704942e-8,2.9755938e-9,2.17737e-8,1.0975539e-8,1.9918724e-8,2.3882098e-8,2.1357366e-8,6.1090986e-9,1.18904975e-8,1.1459892e-8,3.7317346e-8,1.5726203e-8,3.4040234e-8,3.692158e-8,9.281765e-8,1.2323201e-7,4.2347594e-8,1.7423728e-8,5.811339e-8,3.931436e-8,2.3690461e-8,1.7928501e-8,1.4406642e-8,7.0198305e-9,6.0415225e-8,4.86748e-8,1.0685319e-8,1.0051243e-8,4.242626e-8,2.6078153e-8,5.13667e-9,1.6972995e-9,1.9131587e-8,2.1117435e-7,1.3372697e-8,2.0002481e-8,1.04542565e-7,2.8144228e-8,2.1344792e-7,2.104611e-8,1.9114454e-7,3.9576936e-8,2.9316318e-8,1.1052031e-7,4.8400768e-8,5.583606e-8,1.2130111e-7,1.7762162e-8,2.5610854e-8,1.2038653e-7,4.6748596e-9,1.5916099e-8,3.1475942e-8,6.147686e-8,2.2046418e-8,3.2577634e-7,1.1989145e-7,2.381899e-7,1.4909986e-8,5.1016883e-8,5.5142202e-8,2.2885503e-8,5.71411e-8,5.185096e-7,4.9772858e-8,1.1049896e-8,1.2640993e-7,8.1748816e-8 ]} } ] } Response trailers received: ( empty ) Sent 1 request and received 1 response","title":"Test the deployed model with grpcurl"},{"location":"modelserving/v1beta1/pmml/","text":"Deploy PMML model with InferenceService 
\u00b6 PMML, or predictive model markup language, is an XML format for describing data mining and statistical models, including inputs to the models, transformations used to prepare data for data mining, and the parameters that define the models themselves. In this example we show how you can serve the PMML format model on InferenceService . Deploy PMML model with V1 protocol \u00b6 Create the InferenceService \u00b6 New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"pmml-demo\" spec : predictor : model : modelFormat : name : pmml storageUri : \"gs://kfserving-examples/models/pmml\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"pmml-demo\" spec : predictor : pmml : storageUri : gs://kfserving-examples/models/pmml Create the InferenceService with above yaml kubectl apply -f pmml.yaml Expected Output $ inferenceservice.serving.kserve.io/pmml-demo created Warning The pmmlserver is based on Py4J and that doesn't support multi-process mode, so we can't set spec.predictor.containerConcurrency . If you want to scale the PMMLServer to improve prediction performance, you should set the InferenceService's resources.limits.cpu to 1 and scale the replica size. Run a prediction \u00b6 The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT . You can see an example payload below. Create a file named iris-input.json with the sample input. 
{ \"instances\" : [ [ 5.1 , 3.5 , 1.4 , 0.2 ] ] } MODEL_NAME = pmml-demo INPUT_PATH = @./iris-input.json SERVICE_HOSTNAME = $( kubectl get inferenceservice pmml-demo -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict -d $INPUT_PATH Expected Output * TCP_NODELAY set * Connected to localhost ( ::1 ) port 8081 ( #0) > POST /v1/models/pmml-demo:predict HTTP/1.1 > Host: pmml-demo.default.example.com > User-Agent: curl/7.64.1 > Accept: */* > Content-Length: 45 > Content-Type: application/x-www-form-urlencoded > * upload completely sent off: 45 out of 45 bytes < HTTP/1.1 200 OK < content-length: 39 < content-type: application/json ; charset = UTF-8 < date: Sun, 18 Oct 2020 15 :50:02 GMT < server: istio-envoy < x-envoy-upstream-service-time: 12 < * Connection #0 to host localhost left intact { \"predictions\" : [{ 'Species' : 'setosa' , 'Probability_setosa' : 1 .0, 'Probability_versicolor' : 0 .0, 'Probability_virginica' : 0 .0, 'Node_Id' : '2' }]} * Closing connection 0 Deploy the model with Open Inference Protocol \u00b6 Test the Model locally \u00b6 Once you've got your model serialised model.pmml , we can then use KServe Pmml Server to spin up a local server. Note This step is optional and just meant for testing, feel free to jump straight to deploying with InferenceService . Using KServe PMMLServer \u00b6 Pre-requisites \u00b6 Firstly, to use KServe pmml server locally, you will first need to install the pmmlserver runtime package in your local environment. Install OpenJdk-11 . Clone the KServe repository and navigate into the directory. git clone https://github.com/kserve/kserve Install pmmlserver runtime. Kserve uses Poetry as the dependency management tool. Make sure you have already installed poetry . 
cd python/pmmlserver poetry install Serving model locally \u00b6 The pmmlserver package takes two arguments. --model_dir : The model directory path where the model is stored. --model_name : The name of the model deployed in the model server, the default value is model . This is optional. With the pmmlserver runtime package installed locally, you should now be ready to start our server as: python3 pmmlserver --model_dir /path/to/model_dir --model_name pmml-v2-iris Deploy the Model with REST endpoint through InferenceService \u00b6 Lastly, you will use KServe to deploy the trained model onto Kubernetes. For this, you will just need to use version v1beta1 of the InferenceService CRD and set the protocolVersion field to v2 . Yaml apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"pmml-v2-iris\" spec : predictor : model : modelFormat : name : pmml protocolVersion : v2 runtime : kserve-pmmlserver storageUri : \"gs://kfserving-examples/models/pmml\" Warning The pmmlserver is based on Py4J and that doesn't support multi-process mode, so we can't set spec.predictor.containerConcurrency . If you want to scale the PMMLServer to improve prediction performance, you should set the InferenceService's resources.limits.cpu to 1 and scale the replica size. kubectl kubectl apply -f pmml-v2-iris.yaml Test the Deployed Model \u00b6 You can now test your deployed model by sending a sample request. Note that this request needs to follow the Open Inference Protocol . You can see an example payload below. Create a file named iris-input-v2.json with the sample input. { \"inputs\" : [ { \"name\" : \"input-0\" , \"shape\" : [ 2 , 4 ], \"datatype\" : \"FP32\" , \"data\" : [ [ 6.8 , 2.8 , 4.8 , 1.4 ], [ 6.0 , 3.4 , 4.5 , 1.6 ] ] } ] } Determine the ingress IP and port and set INGRESS_HOST and INGRESS_PORT . 
Now, you can use curl to send the inference request as: SERVICE_HOSTNAME = $( kubectl get inferenceservice pmml-v2-iris -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v \\ -H \"Host: ${ SERVICE_HOSTNAME } \" \\ -H \"Content-Type: application/json\" \\ -d @./iris-input-v2.json \\ http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v2/models/pmml-v2-iris/infer Expected Output { \"model_name\" : \"pmml-v2-iris\" , \"model_version\" : null , \"id\" : \"a187a478-c614-46ce-a7de-2f07871f43f3\" , \"parameters\" : null , \"outputs\" : [ { \"name\" : \"Species\" , \"shape\" : [ 2 ], \"datatype\" : \"BYTES\" , \"parameters\" : null , \"data\" : [ \"versicolor\" , \"versicolor\" ] }, { \"name\" : \"Probability_setosa\" , \"shape\" : [ 2 ], \"datatype\" : \"FP64\" , \"parameters\" : null , \"data\" : [ 0 , 0 ] }, { \"name\" : \"Probability_versicolor\" , \"shape\" : [ 2 ], \"datatype\" : \"FP64\" , \"parameters\" : null , \"data\" : [ 0.9074074074074074 , 0.9074074074074074 ] }, { \"name\" : \"Probability_virginica\" , \"shape\" : [ 2 ], \"datatype\" : \"FP64\" , \"parameters\" : null , \"data\" : [ 0.09259259259259259 , 0.09259259259259259 ] }, { \"name\" : \"Node_Id\" , \"shape\" : [ 2 ], \"datatype\" : \"BYTES\" , \"parameters\" : null , \"data\" : [ \"6\" , \"6\" ] } ] } Deploy the Model with GRPC endpoint through InferenceService \u00b6 Create the inference service resource and expose the gRPC port using the below yaml. Note Currently, KServe only supports exposing either HTTP or gRPC port. By default, HTTP port is exposed. 
Serverless RawDeployment apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"pmml-v2-iris-grpc\" spec : predictor : model : modelFormat : name : pmml protocolVersion : v2 runtime : kserve-pmmlserver storageUri : \"gs://kfserving-examples/models/pmml\" ports : - name : h2c # knative expects grpc port name to be 'h2c' protocol : TCP containerPort : 8081 apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"pmml-v2-iris-grpc\" spec : predictor : model : modelFormat : name : pmml protocolVersion : v2 runtime : kserve-pmmlserver storageUri : \"gs://kfserving-examples/models/pmml\" ports : - name : grpc-port # Istio requires the port name to be in the format [-] protocol : TCP containerPort : 8081 Warning The pmmlserver is based on Py4J and that doesn't support multi-process mode, so we can't set spec.predictor.containerConcurrency . If you want to scale the PMMLServer to improve prediction performance, you should set the InferenceService's resources.limits.cpu to 1 and scale the replica size. Apply the InferenceService yaml to get the gRPC endpoint kubectl kubectl apply -f pmml-v2-grpc.yaml Test the deployed model with grpcurl \u00b6 After the gRPC InferenceService becomes ready, grpcurl , can be used to send gRPC requests to the InferenceService . # download the proto file curl -O https://raw.githubusercontent.com/kserve/open-inference-protocol/main/specification/protocol/open_inference_grpc.proto INPUT_PATH = iris-input-v2-grpc.json PROTO_FILE = open_inference_grpc.proto SERVICE_HOSTNAME = $( kubectl get inferenceservice pmml-v2-iris-grpc -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) Determine the ingress IP and port and set INGRESS_HOST and INGRESS_PORT . Now, you can use curl to send the inference requests. The gRPC APIs follows the KServe prediction V2 protocol / Open Inference Protocol . 
For example, ServerReady API can be used to check if the server is ready: grpcurl \\ -plaintext \\ -proto ${ PROTO_FILE } \\ -authority ${ SERVICE_HOSTNAME } \\ ${ INGRESS_HOST } : ${ INGRESS_PORT } \\ inference.GRPCInferenceService.ServerReady Expected Output { \"ready\" : true } You can test the deployed model by sending a sample request with the below payload. Notice that the input format differs from the in the previous REST endpoint example. Prepare the inference input inside the file named iris-input-v2-grpc.json . { \"model_name\" : \"pmml-v2-iris-grpc\" , \"inputs\" : [ { \"name\" : \"input-0\" , \"shape\" : [ 2 , 4 ], \"datatype\" : \"FP32\" , \"contents\" : { \"fp32_contents\" : [ 6.8 , 2.8 , 4.8 , 1.4 , 6.0 , 3.4 , 4.5 , 1.6 ] } } ] } ModelInfer API takes input following the ModelInferRequest schema defined in the grpc_predict_v2.proto file. grpcurl \\ -vv \\ -plaintext \\ -proto ${ PROTO_FILE } \\ -authority ${ SERVICE_HOSTNAME } \\ -d @ \\ ${ INGRESS_HOST } : ${ INGRESS_PORT } \\ inference.GRPCInferenceService.ModelInfer \\ <<< $( cat \" $INPUT_PATH \" ) Expected Output Resolved method descriptor: // The ModelInfer API performs inference using the specified model. Errors are // indicated by the google.rpc.Status returned for the request. The OK code // indicates success and other codes indicate failure. 
rpc ModelInfer ( .inference.ModelInferRequest ) returns ( .inference.ModelInferResponse ) ; Request metadata to send: ( empty ) Response headers received: content-type: application/grpc date: Mon, 09 Oct 2023 11 :07:26 GMT grpc-accept-encoding: identity, deflate, gzip server: istio-envoy x-envoy-upstream-service-time: 16 Estimated response size: 83 bytes Response contents: { \"model_name\" : \"pmml-v2-iris\" , \"model_version\" : null, \"id\" : \"a187a478-c614-46ce-a7de-2f07871f43f3\" , \"parameters\" : null, \"outputs\" : [ { \"name\" : \"Species\" , \"shape\" : [ 2 ] , \"datatype\" : \"BYTES\" , \"parameters\" : null, \"data\" : [ \"versicolor\" , \"versicolor\" ] } , { \"name\" : \"Probability_setosa\" , \"shape\" : [ 2 ] , \"datatype\" : \"FP64\" , \"parameters\" : null, \"data\" : [ 0 , 0 ] } , { \"name\" : \"Probability_versicolor\" , \"shape\" : [ 2 ] , \"datatype\" : \"FP64\" , \"parameters\" : null, \"data\" : [ 0 .9074074074074074, 0 .9074074074074074 ] } , { \"name\" : \"Probability_virginica\" , \"shape\" : [ 2 ] , \"datatype\" : \"FP64\" , \"parameters\" : null, \"data\" : [ 0 .09259259259259259, 0 .09259259259259259 ] } , { \"name\" : \"Node_Id\" , \"shape\" : [ 2 ] , \"datatype\" : \"BYTES\" , \"parameters\" : null, \"data\" : [ \"6\" , \"6\" ] } ] } Response trailers received: ( empty ) Sent 1 request and received 1 response","title":"PMML"},{"location":"modelserving/v1beta1/pmml/#deploy-pmml-model-with-inferenceservice","text":"PMML, or predictive model markup language, is an XML format for describing data mining and statistical models, including inputs to the models, transformations used to prepare data for data mining, and the parameters that define the models themselves. 
In this example we show how you can serve the PMML format model on InferenceService .","title":"Deploy PMML model with InferenceService"},{"location":"modelserving/v1beta1/pmml/#deploy-pmml-model-with-v1-protocol","text":"","title":"Deploy PMML model with V1 protocol"},{"location":"modelserving/v1beta1/pmml/#create-the-inferenceservice","text":"New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"pmml-demo\" spec : predictor : model : modelFormat : name : pmml storageUri : \"gs://kfserving-examples/models/pmml\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"pmml-demo\" spec : predictor : pmml : storageUri : gs://kfserving-examples/models/pmml Create the InferenceService with above yaml kubectl apply -f pmml.yaml Expected Output $ inferenceservice.serving.kserve.io/pmml-demo created Warning The pmmlserver is based on Py4J and that doesn't support multi-process mode, so we can't set spec.predictor.containerConcurrency . If you want to scale the PMMLServer to improve prediction performance, you should set the InferenceService's resources.limits.cpu to 1 and scale the replica size.","title":"Create the InferenceService"},{"location":"modelserving/v1beta1/pmml/#run-a-prediction","text":"The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT . You can see an example payload below. Create a file named iris-input.json with the sample input. 
{ \"instances\" : [ [ 5.1 , 3.5 , 1.4 , 0.2 ] ] } MODEL_NAME = pmml-demo INPUT_PATH = @./iris-input.json SERVICE_HOSTNAME = $( kubectl get inferenceservice pmml-demo -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict -d $INPUT_PATH Expected Output * TCP_NODELAY set * Connected to localhost ( ::1 ) port 8081 ( #0) > POST /v1/models/pmml-demo:predict HTTP/1.1 > Host: pmml-demo.default.example.com > User-Agent: curl/7.64.1 > Accept: */* > Content-Length: 45 > Content-Type: application/x-www-form-urlencoded > * upload completely sent off: 45 out of 45 bytes < HTTP/1.1 200 OK < content-length: 39 < content-type: application/json ; charset = UTF-8 < date: Sun, 18 Oct 2020 15 :50:02 GMT < server: istio-envoy < x-envoy-upstream-service-time: 12 < * Connection #0 to host localhost left intact { \"predictions\" : [{ 'Species' : 'setosa' , 'Probability_setosa' : 1 .0, 'Probability_versicolor' : 0 .0, 'Probability_virginica' : 0 .0, 'Node_Id' : '2' }]} * Closing connection 0","title":"Run a prediction"},{"location":"modelserving/v1beta1/pmml/#deploy-the-model-with-open-inference-protocol","text":"","title":"Deploy the model with Open Inference Protocol"},{"location":"modelserving/v1beta1/pmml/#test-the-model-locally","text":"Once you've got your model serialised model.pmml , we can then use KServe Pmml Server to spin up a local server. Note This step is optional and just meant for testing, feel free to jump straight to deploying with InferenceService .","title":"Test the Model locally"},{"location":"modelserving/v1beta1/pmml/#using-kserve-pmmlserver","text":"","title":"Using KServe PMMLServer"},{"location":"modelserving/v1beta1/pmml/#pre-requisites","text":"Firstly, to use KServe pmml server locally, you will first need to install the pmmlserver runtime package in your local environment. Install OpenJdk-11 . 
Clone the KServe repository and navigate into the directory. git clone https://github.com/kserve/kserve Install pmmlserver runtime. Kserve uses Poetry as the dependency management tool. Make sure you have already installed poetry . cd python/pmmlserver poetry install","title":"Pre-requisites"},{"location":"modelserving/v1beta1/pmml/#serving-model-locally","text":"The pmmlserver package takes two arguments. --model_dir : The model directory path where the model is stored. --model_name : The name of the model deployed in the model server, the default value is model . This is optional. With the pmmlserver runtime package installed locally, you should now be ready to start our server as: python3 pmmlserver --model_dir /path/to/model_dir --model_name pmml-v2-iris","title":"Serving model locally"},{"location":"modelserving/v1beta1/pmml/#deploy-the-model-with-rest-endpoint-through-inferenceservice","text":"Lastly, you will use KServe to deploy the trained model onto Kubernetes. For this, you will just need to use version v1beta1 of the InferenceService CRD and set the protocolVersion field to v2 . Yaml apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"pmml-v2-iris\" spec : predictor : model : modelFormat : name : pmml protocolVersion : v2 runtime : kserve-pmmlserver storageUri : \"gs://kfserving-examples/models/pmml\" Warning The pmmlserver is based on Py4J and that doesn't support multi-process mode, so we can't set spec.predictor.containerConcurrency . If you want to scale the PMMLServer to improve prediction performance, you should set the InferenceService's resources.limits.cpu to 1 and scale the replica size. kubectl kubectl apply -f pmml-v2-iris.yaml","title":"Deploy the Model with REST endpoint through InferenceService"},{"location":"modelserving/v1beta1/pmml/#test-the-deployed-model","text":"You can now test your deployed model by sending a sample request. Note that this request needs to follow the Open Inference Protocol . 
You can see an example payload below. Create a file named iris-input-v2.json with the sample input. { \"inputs\" : [ { \"name\" : \"input-0\" , \"shape\" : [ 2 , 4 ], \"datatype\" : \"FP32\" , \"data\" : [ [ 6.8 , 2.8 , 4.8 , 1.4 ], [ 6.0 , 3.4 , 4.5 , 1.6 ] ] } ] } Determine the ingress IP and port and set INGRESS_HOST and INGRESS_PORT . Now, you can use curl to send the inference request as: SERVICE_HOSTNAME = $( kubectl get inferenceservice pmml-v2-iris -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v \\ -H \"Host: ${ SERVICE_HOSTNAME } \" \\ -H \"Content-Type: application/json\" \\ -d @./iris-input-v2.json \\ http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v2/models/pmml-v2-iris/infer Expected Output { \"model_name\" : \"pmml-v2-iris\" , \"model_version\" : null , \"id\" : \"a187a478-c614-46ce-a7de-2f07871f43f3\" , \"parameters\" : null , \"outputs\" : [ { \"name\" : \"Species\" , \"shape\" : [ 2 ], \"datatype\" : \"BYTES\" , \"parameters\" : null , \"data\" : [ \"versicolor\" , \"versicolor\" ] }, { \"name\" : \"Probability_setosa\" , \"shape\" : [ 2 ], \"datatype\" : \"FP64\" , \"parameters\" : null , \"data\" : [ 0 , 0 ] }, { \"name\" : \"Probability_versicolor\" , \"shape\" : [ 2 ], \"datatype\" : \"FP64\" , \"parameters\" : null , \"data\" : [ 0.9074074074074074 , 0.9074074074074074 ] }, { \"name\" : \"Probability_virginica\" , \"shape\" : [ 2 ], \"datatype\" : \"FP64\" , \"parameters\" : null , \"data\" : [ 0.09259259259259259 , 0.09259259259259259 ] }, { \"name\" : \"Node_Id\" , \"shape\" : [ 2 ], \"datatype\" : \"BYTES\" , \"parameters\" : null , \"data\" : [ \"6\" , \"6\" ] } ] }","title":"Test the Deployed Model"},{"location":"modelserving/v1beta1/pmml/#deploy-the-model-with-grpc-endpoint-through-inferenceservice","text":"Create the inference service resource and expose the gRPC port using the below yaml. Note Currently, KServe only supports exposing either HTTP or gRPC port. By default, HTTP port is exposed. 
Serverless RawDeployment apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"pmml-v2-iris-grpc\" spec : predictor : model : modelFormat : name : pmml protocolVersion : v2 runtime : kserve-pmmlserver storageUri : \"gs://kfserving-examples/models/pmml\" ports : - name : h2c # knative expects grpc port name to be 'h2c' protocol : TCP containerPort : 8081 apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"pmml-v2-iris-grpc\" spec : predictor : model : modelFormat : name : pmml protocolVersion : v2 runtime : kserve-pmmlserver storageUri : \"gs://kfserving-examples/models/pmml\" ports : - name : grpc-port # Istio requires the port name to be in the format [-] protocol : TCP containerPort : 8081 Warning The pmmlserver is based on Py4J and that doesn't support multi-process mode, so we can't set spec.predictor.containerConcurrency . If you want to scale the PMMLServer to improve prediction performance, you should set the InferenceService's resources.limits.cpu to 1 and scale the replica size. Apply the InferenceService yaml to get the gRPC endpoint kubectl kubectl apply -f pmml-v2-grpc.yaml","title":"Deploy the Model with GRPC endpoint through InferenceService"},{"location":"modelserving/v1beta1/pmml/#test-the-deployed-model-with-grpcurl","text":"After the gRPC InferenceService becomes ready, grpcurl , can be used to send gRPC requests to the InferenceService . # download the proto file curl -O https://raw.githubusercontent.com/kserve/open-inference-protocol/main/specification/protocol/open_inference_grpc.proto INPUT_PATH = iris-input-v2-grpc.json PROTO_FILE = open_inference_grpc.proto SERVICE_HOSTNAME = $( kubectl get inferenceservice pmml-v2-iris-grpc -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) Determine the ingress IP and port and set INGRESS_HOST and INGRESS_PORT . Now, you can use curl to send the inference requests. 
The gRPC APIs follows the KServe prediction V2 protocol / Open Inference Protocol . For example, ServerReady API can be used to check if the server is ready: grpcurl \\ -plaintext \\ -proto ${ PROTO_FILE } \\ -authority ${ SERVICE_HOSTNAME } \\ ${ INGRESS_HOST } : ${ INGRESS_PORT } \\ inference.GRPCInferenceService.ServerReady Expected Output { \"ready\" : true } You can test the deployed model by sending a sample request with the below payload. Notice that the input format differs from the in the previous REST endpoint example. Prepare the inference input inside the file named iris-input-v2-grpc.json . { \"model_name\" : \"pmml-v2-iris-grpc\" , \"inputs\" : [ { \"name\" : \"input-0\" , \"shape\" : [ 2 , 4 ], \"datatype\" : \"FP32\" , \"contents\" : { \"fp32_contents\" : [ 6.8 , 2.8 , 4.8 , 1.4 , 6.0 , 3.4 , 4.5 , 1.6 ] } } ] } ModelInfer API takes input following the ModelInferRequest schema defined in the grpc_predict_v2.proto file. grpcurl \\ -vv \\ -plaintext \\ -proto ${ PROTO_FILE } \\ -authority ${ SERVICE_HOSTNAME } \\ -d @ \\ ${ INGRESS_HOST } : ${ INGRESS_PORT } \\ inference.GRPCInferenceService.ModelInfer \\ <<< $( cat \" $INPUT_PATH \" ) Expected Output Resolved method descriptor: // The ModelInfer API performs inference using the specified model. Errors are // indicated by the google.rpc.Status returned for the request. The OK code // indicates success and other codes indicate failure. 
rpc ModelInfer ( .inference.ModelInferRequest ) returns ( .inference.ModelInferResponse ) ; Request metadata to send: ( empty ) Response headers received: content-type: application/grpc date: Mon, 09 Oct 2023 11 :07:26 GMT grpc-accept-encoding: identity, deflate, gzip server: istio-envoy x-envoy-upstream-service-time: 16 Estimated response size: 83 bytes Response contents: { \"model_name\" : \"pmml-v2-iris\" , \"model_version\" : null, \"id\" : \"a187a478-c614-46ce-a7de-2f07871f43f3\" , \"parameters\" : null, \"outputs\" : [ { \"name\" : \"Species\" , \"shape\" : [ 2 ] , \"datatype\" : \"BYTES\" , \"parameters\" : null, \"data\" : [ \"versicolor\" , \"versicolor\" ] } , { \"name\" : \"Probability_setosa\" , \"shape\" : [ 2 ] , \"datatype\" : \"FP64\" , \"parameters\" : null, \"data\" : [ 0 , 0 ] } , { \"name\" : \"Probability_versicolor\" , \"shape\" : [ 2 ] , \"datatype\" : \"FP64\" , \"parameters\" : null, \"data\" : [ 0 .9074074074074074, 0 .9074074074074074 ] } , { \"name\" : \"Probability_virginica\" , \"shape\" : [ 2 ] , \"datatype\" : \"FP64\" , \"parameters\" : null, \"data\" : [ 0 .09259259259259259, 0 .09259259259259259 ] } , { \"name\" : \"Node_Id\" , \"shape\" : [ 2 ] , \"datatype\" : \"BYTES\" , \"parameters\" : null, \"data\" : [ \"6\" , \"6\" ] } ] } Response trailers received: ( empty ) Sent 1 request and received 1 response","title":"Test the deployed model with grpcurl"},{"location":"modelserving/v1beta1/rollout/canary-example/","text":"Canary Rollout Example \u00b6 Setup \u00b6 Your ~/.kube/config should point to a cluster with KServe installed . Your cluster's Istio Ingress gateway must be network accessible . Create the InferenceService \u00b6 Complete steps 1-3 in the First Inference Service tutorial. Set up a namespace (if not already created), and create an InferenceService. After rolling out the first model, 100% traffic goes to the initial model with service revision 1. 
Run kubectl get isvc sklearn-iris in the command line to see the amount of traffic routing to the InferenceService under the LATEST column. NAME URL READY PREV LATEST PREVROLLEDOUTREVISION LATESTREADYREVISION AGE sklearn-iris http://sklearn-iris.kserve-test.example.com True 100 sklearn-iris-predictor-default-00001 46s 2m39s 70s Update the InferenceService with the canary rollout strategy \u00b6 Add the canaryTrafficPercent field to the predictor component and update the storageUri to use a new/updated model. NOTE: A new predictor schema was introduced in v0.8.0 . New InferenceServices should be deployed using the new schema. The old schema is provided as reference. New Schema Old Schema kubectl apply -n kserve-test -f - <[-] protocol : TCP containerPort : 8081 Note For V2 protocol (open inference protocol) if runtime field is not provided then, by default mlserver runtime is used. Apply the InferenceService yaml to get the gRPC endpoint kubectl kubectl apply -f sklearn-v2-grpc.yaml Test the deployed model with grpcurl \u00b6 After the gRPC InferenceService becomes ready, grpcurl , can be used to send gRPC requests to the InferenceService . # download the proto file curl -O https://raw.githubusercontent.com/kserve/open-inference-protocol/main/specification/protocol/open_inference_grpc.proto INPUT_PATH = iris-input-v2-grpc.json PROTO_FILE = open_inference_grpc.proto SERVICE_HOSTNAME = $( kubectl get inferenceservice sklearn-v2-iris-grpc -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) Determine the ingress IP and port and set INGRESS_HOST and INGRESS_PORT . Now, you can use curl to send the inference requests. The gRPC APIs follows the KServe prediction V2 protocol / Open Inference Protocol . 
For example, ServerReady API can be used to check if the server is ready: grpcurl \\ -plaintext \\ -proto ${ PROTO_FILE } \\ -authority ${ SERVICE_HOSTNAME } \\ ${ INGRESS_HOST } : ${ INGRESS_PORT } \\ inference.GRPCInferenceService.ServerReady Expected Output { \"ready\" : true } You can test the deployed model by sending a sample request with the below payload. Notice that the input format differs from the in the previous REST endpoint example. Prepare the inference input inside the file named iris-input-v2-grpc.json . { \"model_name\" : \"sklearn-v2-iris-grpc\" , \"inputs\" : [ { \"name\" : \"input-0\" , \"shape\" : [ 2 , 4 ], \"datatype\" : \"FP32\" , \"contents\" : { \"fp32_contents\" : [ 6.8 , 2.8 , 4.8 , 1.4 , 6.0 , 3.4 , 4.5 , 1.6 ] } } ] } ModelInfer API takes input following the ModelInferRequest schema defined in the grpc_predict_v2.proto file. grpcurl \\ -vv \\ -plaintext \\ -proto ${ PROTO_FILE } \\ -authority ${ SERVICE_HOSTNAME } \\ -d @ \\ ${ INGRESS_HOST } : ${ INGRESS_PORT } \\ inference.GRPCInferenceService.ModelInfer \\ <<< $( cat \" $INPUT_PATH \" ) Expected Output Resolved method descriptor: // The ModelInfer API performs inference using the specified model. Errors are // indicated by the google.rpc.Status returned for the request. The OK code // indicates success and other codes indicate failure. 
rpc ModelInfer ( .inference.ModelInferRequest ) returns ( .inference.ModelInferResponse ) ; Request metadata to send: ( empty ) Response headers received: content-type: application/grpc date: Mon, 09 Oct 2023 11 :07:26 GMT grpc-accept-encoding: identity, deflate, gzip server: istio-envoy x-envoy-upstream-service-time: 16 Estimated response size: 83 bytes Response contents: { \"modelName\" : \"sklearn-v2-iris-grpc\" , \"id\" : \"41738561-7219-4e4a-984d-5fe19bed6298\" , \"outputs\" : [ { \"name\" : \"output-0\" , \"datatype\" : \"INT32\" , \"shape\" : [ \"2\" ] , \"contents\" : { \"intContents\" : [ 1 , 1 ] } } ] } Response trailers received: ( empty ) Sent 1 request and received 1 response","title":"Scikit-learn"},{"location":"modelserving/v1beta1/sklearn/v2/#deploy-scikit-learn-models-with-inferenceservice","text":"This example walks you through how to deploy a scikit-learn model leveraging the v1beta1 version of the InferenceService CRD. Note that, by default the v1beta1 version will expose your model through an API compatible with the existing V1 Dataplane. This example will show you how to serve a model through Open Inference Protocol .","title":"Deploy Scikit-learn models with InferenceService"},{"location":"modelserving/v1beta1/sklearn/v2/#train-the-model","text":"The first step will be to train a sample scikit-learn model. Note that this model will be then saved as model.joblib . from sklearn import svm from sklearn import datasets from joblib import dump iris = datasets . load_iris () X , y = iris . data , iris . target clf = svm . SVC ( gamma = 'scale' ) clf . fit ( X , y ) dump ( clf , 'model.joblib' )","title":"Train the Model"},{"location":"modelserving/v1beta1/sklearn/v2/#test-the-model-locally","text":"Once you've got your model serialised model.joblib , we can then use KServe Sklearn Server to spin up a local server. 
Note This step is optional and just meant for testing, feel free to jump straight to deploying with InferenceService .","title":"Test the Model locally"},{"location":"modelserving/v1beta1/sklearn/v2/#using-kserve-sklearnserver","text":"","title":"Using KServe SklearnServer"},{"location":"modelserving/v1beta1/sklearn/v2/#pre-requisites","text":"Firstly, to use KServe sklearn server locally, you will first need to install the sklearnserver runtime package in your local environment. Clone the KServe repository and navigate into the directory. git clone https://github.com/kserve/kserve Install sklearnserver runtime. Kserve uses Poetry as the dependency management tool. Make sure you have already installed poetry . cd python/sklearnserver poetry install","title":"Pre-requisites"},{"location":"modelserving/v1beta1/sklearn/v2/#serving-model-locally","text":"The sklearnserver package takes two arguments. --model_dir : The model directory path where the model is stored. --model_name : The name of the model deployed in the model server, the default value is model . This is optional. With the sklearnserver runtime package installed locally, you should now be ready to start our server as: python3 sklearnserver --model_dir /path/to/model_dir --model_name sklearn-v2-iris","title":"Serving model locally"},{"location":"modelserving/v1beta1/sklearn/v2/#deploy-the-model-with-rest-endpoint-through-inferenceservice","text":"Lastly, you will use KServe to deploy the trained model onto Kubernetes. For this, you will just need to use version v1beta1 of the InferenceService CRD and set the protocolVersion field to v2 . 
Yaml apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"sklearn-v2-iris\" spec : predictor : model : modelFormat : name : sklearn protocolVersion : v2 runtime : kserve-sklearnserver storageUri : \"gs://kfserving-examples/models/sklearn/1.0/model\" Note For V2 protocol (open inference protocol) if runtime field is not provided then, by default mlserver runtime is used. Note that this makes the following assumptions: Your model weights (i.e. your model.joblib file) have already been uploaded to a \"model repository\" (GCS in this example) and can be accessed as gs://seldon-models/sklearn/iris . There is a K8s cluster available, accessible through kubectl . KServe has already been installed in your cluster . kubectl kubectl apply -f sklearn.yaml","title":"Deploy the Model with REST endpoint through InferenceService"},{"location":"modelserving/v1beta1/sklearn/v2/#test-the-deployed-model","text":"You can now test your deployed model by sending a sample request. Note that this request needs to follow the Open Inference Protocol . You can see an example payload below. Create a file named iris-input-v2.json with the sample input. { \"inputs\" : [ { \"name\" : \"input-0\" , \"shape\" : [ 2 , 4 ], \"datatype\" : \"FP32\" , \"data\" : [ [ 6.8 , 2.8 , 4.8 , 1.4 ], [ 6.0 , 3.4 , 4.5 , 1.6 ] ] } ] } Determine the ingress IP and port and set INGRESS_HOST and INGRESS_PORT . 
Now, you can use curl to send the inference request as: SERVICE_HOSTNAME = $( kubectl get inferenceservice sklearn-v2-iris -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v \\ -H \"Host: ${ SERVICE_HOSTNAME } \" \\ -H \"Content-Type: application/json\" \\ -d @./iris-input-v2.json \\ http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v2/models/sklearn-v2-iris/infer Expected Output { \"id\" : \"823248cc-d770-4a51-9606-16803395569c\" , \"model_name\" : \"sklearn-v2-iris\" , \"model_version\" : \"v1.0.0\" , \"outputs\" : [ { \"data\" : [ 1 , 1 ], \"datatype\" : \"INT64\" , \"name\" : \"predict\" , \"parameters\" : null , \"shape\" : [ 2 ] } ] }","title":"Test the Deployed Model"},{"location":"modelserving/v1beta1/sklearn/v2/#deploy-the-model-with-grpc-endpoint-through-inferenceservice","text":"Create the inference service resource and expose the gRPC port using the below yaml. Note Currently, KServe only supports exposing either HTTP or gRPC port. By default, HTTP port is exposed. Serverless RawDeployment apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"sklearn-v2-iris-grpc\" spec : predictor : model : modelFormat : name : sklearn protocolVersion : v2 runtime : kserve-sklearnserver storageUri : \"gs://kfserving-examples/models/sklearn/1.0/model\" ports : - name : h2c # knative expects grpc port name to be 'h2c' protocol : TCP containerPort : 8081 apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"sklearn-v2-iris-grpc\" spec : predictor : model : modelFormat : name : sklearn protocolVersion : v2 runtime : kserve-sklearnserver storageUri : \"gs://kfserving-examples/models/sklearn/1.0/model\" ports : - name : grpc-port # Istio requires the port name to be in the format [-] protocol : TCP containerPort : 8081 Note For V2 protocol (open inference protocol) if runtime field is not provided then, by default mlserver runtime is used. 
Apply the InferenceService yaml to get the gRPC endpoint kubectl kubectl apply -f sklearn-v2-grpc.yaml","title":"Deploy the Model with GRPC endpoint through InferenceService"},{"location":"modelserving/v1beta1/sklearn/v2/#test-the-deployed-model-with-grpcurl","text":"After the gRPC InferenceService becomes ready, grpcurl can be used to send gRPC requests to the InferenceService . # download the proto file curl -O https://raw.githubusercontent.com/kserve/open-inference-protocol/main/specification/protocol/open_inference_grpc.proto INPUT_PATH = iris-input-v2-grpc.json PROTO_FILE = open_inference_grpc.proto SERVICE_HOSTNAME = $( kubectl get inferenceservice sklearn-v2-iris-grpc -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) Determine the ingress IP and port and set INGRESS_HOST and INGRESS_PORT . Now, you can use grpcurl to send the inference requests. The gRPC APIs follow the KServe prediction V2 protocol / Open Inference Protocol . For example, ServerReady API can be used to check if the server is ready: grpcurl \\ -plaintext \\ -proto ${ PROTO_FILE } \\ -authority ${ SERVICE_HOSTNAME } \\ ${ INGRESS_HOST } : ${ INGRESS_PORT } \\ inference.GRPCInferenceService.ServerReady Expected Output { \"ready\" : true } You can test the deployed model by sending a sample request with the below payload. Notice that the input format differs from the one in the previous REST endpoint example. Prepare the inference input inside the file named iris-input-v2-grpc.json . { \"model_name\" : \"sklearn-v2-iris-grpc\" , \"inputs\" : [ { \"name\" : \"input-0\" , \"shape\" : [ 2 , 4 ], \"datatype\" : \"FP32\" , \"contents\" : { \"fp32_contents\" : [ 6.8 , 2.8 , 4.8 , 1.4 , 6.0 , 3.4 , 4.5 , 1.6 ] } } ] } ModelInfer API takes input following the ModelInferRequest schema defined in the grpc_predict_v2.proto file. 
grpcurl \\ -vv \\ -plaintext \\ -proto ${ PROTO_FILE } \\ -authority ${ SERVICE_HOSTNAME } \\ -d @ \\ ${ INGRESS_HOST } : ${ INGRESS_PORT } \\ inference.GRPCInferenceService.ModelInfer \\ <<< $( cat \" $INPUT_PATH \" ) Expected Output Resolved method descriptor: // The ModelInfer API performs inference using the specified model. Errors are // indicated by the google.rpc.Status returned for the request. The OK code // indicates success and other codes indicate failure. rpc ModelInfer ( .inference.ModelInferRequest ) returns ( .inference.ModelInferResponse ) ; Request metadata to send: ( empty ) Response headers received: content-type: application/grpc date: Mon, 09 Oct 2023 11 :07:26 GMT grpc-accept-encoding: identity, deflate, gzip server: istio-envoy x-envoy-upstream-service-time: 16 Estimated response size: 83 bytes Response contents: { \"modelName\" : \"sklearn-v2-iris-grpc\" , \"id\" : \"41738561-7219-4e4a-984d-5fe19bed6298\" , \"outputs\" : [ { \"name\" : \"output-0\" , \"datatype\" : \"INT32\" , \"shape\" : [ \"2\" ] , \"contents\" : { \"intContents\" : [ 1 , 1 ] } } ] } Response trailers received: ( empty ) Sent 1 request and received 1 response","title":"Test the deployed model with grpcurl"},{"location":"modelserving/v1beta1/spark/","text":"Deploy Spark MLlib model with PMML InferenceService \u00b6 Setup \u00b6 Install pyspark 3.0.x and pyspark2pmml pip install pyspark~ = 3 .0.0 pip install pyspark2pmml Get JPMML-SparkML jar Train a Spark MLlib model and export to PMML file \u00b6 Launch pyspark with --jars to specify the location of the JPMML-SparkML uber-JAR pyspark --jars ./jpmml-sparkml-executable-1.6.3.jar Fitting a Spark ML pipeline: from pyspark.ml import Pipeline from pyspark.ml.classification import DecisionTreeClassifier from pyspark.ml.feature import RFormula df = spark . read . 
csv ( \"Iris.csv\" , header = True , inferSchema = True ) formula = RFormula ( formula = \"Species ~ .\" ) classifier = DecisionTreeClassifier () pipeline = Pipeline ( stages = [ formula , classifier ]) pipelineModel = pipeline . fit ( df ) from pyspark2pmml import PMMLBuilder pmmlBuilder = PMMLBuilder ( sc , df , pipelineModel ) pmmlBuilder . buildFile ( \"DecisionTreeIris.pmml\" ) Upload the DecisionTreeIris.pmml to a GCS bucket. gsutil cp ./DecisionTreeIris.pmml gs:// $BUCKET_NAME /sparkpmml/model.pmml Test the Model locally \u00b6 For testing the model locally, please refer to the pmml server documentation . Deploy Spark MLlib model with V1 protocol \u00b6 Create the InferenceService with PMMLServer \u00b6 Create the InferenceService with pmml predictor and specify the storageUri with the bucket location you uploaded the model to New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"spark-pmml\" spec : predictor : model : modelFormat : name : pmml storageUri : gs://kfserving-examples/models/sparkpmml apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"spark-pmml\" spec : predictor : pmml : storageUri : gs://kfserving-examples/models/sparkpmml Warning The pmmlserver is based on Py4J and that doesn't support multi-process mode, so we can't set spec.predictor.containerConcurrency . If you want to scale the PMMLServer to improve prediction performance, you should set the InferenceService's resources.limits.cpu to 1 and scale the replica size. Apply the InferenceService custom resource kubectl apply -f spark_pmml.yaml Expected Output $ inferenceservice.serving.kserve.io/spark-pmml created Wait for the InferenceService to be ready kubectl wait --for = condition = Ready inferenceservice spark-pmml $ inferenceservice.serving.kserve.io/spark-pmml condition met Run a prediction \u00b6 The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT . 
You can see an example payload below. Create a file named iris-input.json with the sample input. { \"instances\" : [ [ 5.1 , 3.5 , 1.4 , 0.2 ] ] } MODEL_NAME = spark-pmml INPUT_PATH = @./iris-input.json SERVICE_HOSTNAME = $( kubectl get inferenceservice spark-pmml -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict -d $INPUT_PATH Expected Output * Connected to spark-pmml.default.35.237.217.209.xip.io ( 35 .237.217.209 ) port 80 ( #0) > POST /v1/models/spark-pmml:predict HTTP/1.1 > Host: spark-pmml.default.35.237.217.209.xip.io > User-Agent: curl/7.73.0 > Accept: */* > Content-Length: 45 > Content-Type: application/x-www-form-urlencoded > * upload completely sent off: 45 out of 45 bytes * Mark bundle as not supporting multiuse < HTTP/1.1 200 OK < content-length: 39 < content-type: application/json ; charset = UTF-8 < date: Sun, 07 Mar 2021 19 :32:50 GMT < server: istio-envoy < x-envoy-upstream-service-time: 14 < * Connection #0 to host spark-pmml.default.35.237.217.209.xip.io left intact { \"predictions\" : [[ 1 .0, 0 .0, 1 .0, 0 .0 ]]} Deploy the model with Open Inference Protocol \u00b6 Deploy the Model with REST endpoint through InferenceService \u00b6 Lastly, you will use KServe to deploy the trained model onto Kubernetes. For this, you will just need to use version v1beta1 of the InferenceService CRD and set the protocolVersion field to v2 . Yaml apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"spark-v2-iris\" spec : predictor : model : modelFormat : name : pmml protocolVersion : v2 runtime : kserve-pmmlserver storageUri : \"gs://kfserving-examples/models/sparkpmml\" Warning The pmmlserver is based on Py4J and that doesn't support multi-process mode, so we can't set spec.predictor.containerConcurrency . 
If you want to scale the PMMLServer to improve prediction performance, you should set the InferenceService's resources.limits.cpu to 1 and scale the replica size. kubectl kubectl apply -f spark-v2-iris.yaml Test the Deployed Model \u00b6 You can now test your deployed model by sending a sample request. Note that this request needs to follow the Open Inference Protocol . You can see an example payload below. Create a file named iris-input-v2.json with the sample input. { \"inputs\" : [ { \"name\" : \"input-0\" , \"shape\" : [ 2 , 4 ], \"datatype\" : \"FP32\" , \"data\" : [ [ 6.8 , 2.8 , 4.8 , 1.4 ], [ 6.0 , 3.4 , 4.5 , 1.6 ] ] } ] } Determine the ingress IP and port and set INGRESS_HOST and INGRESS_PORT . Now, you can use curl to send the inference request as: SERVICE_HOSTNAME = $( kubectl get inferenceservice spark-v2-iris -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v \\ -H \"Host: ${ SERVICE_HOSTNAME } \" \\ -H \"Content-Type: application/json\" \\ -d @./iris-input-v2.json \\ http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v2/models/spark-v2-iris/infer Expected Output { \"model_name\" : \"spark-v2-iris\" , \"model_version\" : null , \"id\" : \"a187a478-c614-46ce-a7de-2f07871f43f3\" , \"parameters\" : null , \"outputs\" : [ { \"name\" : \"Species\" , \"shape\" : [ 2 ], \"datatype\" : \"BYTES\" , \"parameters\" : null , \"data\" : [ \"versicolor\" , \"versicolor\" ] }, { \"name\" : \"Probability_setosa\" , \"shape\" : [ 2 ], \"datatype\" : \"FP64\" , \"parameters\" : null , \"data\" : [ 0 , 0 ] }, { \"name\" : \"Probability_versicolor\" , \"shape\" : [ 2 ], \"datatype\" : \"FP64\" , \"parameters\" : null , \"data\" : [ 0.9074074074074074 , 0.9074074074074074 ] }, { \"name\" : \"Probability_virginica\" , \"shape\" : [ 2 ], \"datatype\" : \"FP64\" , \"parameters\" : null , \"data\" : [ 0.09259259259259259 , 0.09259259259259259 ] }, { \"name\" : \"Node_Id\" , \"shape\" : [ 2 ], \"datatype\" : \"BYTES\" , \"parameters\" : null , \"data\" : [ \"6\" , \"6\" ] 
} ] } Deploy the Model with GRPC endpoint through InferenceService \u00b6 Create the inference service resource and expose the gRPC port using the below yaml. Note Currently, KServe only supports exposing either HTTP or gRPC port. By default, HTTP port is exposed. Serverless RawDeployment apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"spark-v2-iris-grpc\" spec : predictor : model : modelFormat : name : pmml protocolVersion : v2 runtime : kserve-pmmlserver storageUri : \"gs://kfserving-examples/models/sparkpmml\" ports : - name : h2c # knative expects grpc port name to be 'h2c' protocol : TCP containerPort : 8081 apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"spark-v2-iris-grpc\" spec : predictor : model : modelFormat : name : pmml protocolVersion : v2 runtime : kserve-pmmlserver storageUri : \"gs://kfserving-examples/models/sparkpmml\" ports : - name : grpc-port # Istio requires the port name to be in the format [-] protocol : TCP containerPort : 8081 Warning The pmmlserver is based on Py4J and that doesn't support multi-process mode, so we can't set spec.predictor.containerConcurrency . If you want to scale the PMMLServer to improve prediction performance, you should set the InferenceService's resources.limits.cpu to 1 and scale the replica size. Apply the InferenceService yaml to get the gRPC endpoint kubectl kubectl apply -f spark-v2-grpc.yaml Test the deployed model with grpcurl \u00b6 After the gRPC InferenceService becomes ready, grpcurl , can be used to send gRPC requests to the InferenceService . 
# download the proto file curl -O https://raw.githubusercontent.com/kserve/open-inference-protocol/main/specification/protocol/open_inference_grpc.proto INPUT_PATH = iris-input-v2-grpc.json PROTO_FILE = open_inference_grpc.proto SERVICE_HOSTNAME = $( kubectl get inferenceservice spark-v2-iris-grpc -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) Determine the ingress IP and port and set INGRESS_HOST and INGRESS_PORT . Now, you can use grpcurl to send the inference requests. The gRPC APIs follow the KServe prediction V2 protocol / Open Inference Protocol . For example, ServerReady API can be used to check if the server is ready: grpcurl \\ -plaintext \\ -proto ${ PROTO_FILE } \\ -authority ${ SERVICE_HOSTNAME } \\ ${ INGRESS_HOST } : ${ INGRESS_PORT } \\ inference.GRPCInferenceService.ServerReady Expected Output { \"ready\" : true } You can test the deployed model by sending a sample request with the below payload. Notice that the input format differs from the one in the previous REST endpoint example. Prepare the inference input inside the file named iris-input-v2-grpc.json . { \"model_name\" : \"spark-v2-iris-grpc\" , \"inputs\" : [ { \"name\" : \"input-0\" , \"shape\" : [ 2 , 4 ], \"datatype\" : \"FP32\" , \"contents\" : { \"fp32_contents\" : [ 6.8 , 2.8 , 4.8 , 1.4 , 6.0 , 3.4 , 4.5 , 1.6 ] } } ] } ModelInfer API takes input following the ModelInferRequest schema defined in the grpc_predict_v2.proto file. grpcurl \\ -vv \\ -plaintext \\ -proto ${ PROTO_FILE } \\ -authority ${ SERVICE_HOSTNAME } \\ -d @ \\ ${ INGRESS_HOST } : ${ INGRESS_PORT } \\ inference.GRPCInferenceService.ModelInfer \\ <<< $( cat \" $INPUT_PATH \" ) Expected Output Resolved method descriptor: // The ModelInfer API performs inference using the specified model. Errors are // indicated by the google.rpc.Status returned for the request. The OK code // indicates success and other codes indicate failure. 
rpc ModelInfer ( .inference.ModelInferRequest ) returns ( .inference.ModelInferResponse ) ; Request metadata to send: ( empty ) Response headers received: content-type: application/grpc date: Mon, 09 Oct 2023 11 :07:26 GMT grpc-accept-encoding: identity, deflate, gzip server: istio-envoy x-envoy-upstream-service-time: 16 Estimated response size: 83 bytes Response contents: { \"model_name\" : \"spark-v2-iris\" , \"model_version\" : null, \"id\" : \"a187a478-c614-46ce-a7de-2f07871f43f3\" , \"parameters\" : null, \"outputs\" : [ { \"name\" : \"Species\" , \"shape\" : [ 2 ] , \"datatype\" : \"BYTES\" , \"parameters\" : null, \"data\" : [ \"versicolor\" , \"versicolor\" ] } , { \"name\" : \"Probability_setosa\" , \"shape\" : [ 2 ] , \"datatype\" : \"FP64\" , \"parameters\" : null, \"data\" : [ 0 , 0 ] } , { \"name\" : \"Probability_versicolor\" , \"shape\" : [ 2 ] , \"datatype\" : \"FP64\" , \"parameters\" : null, \"data\" : [ 0 .9074074074074074, 0 .9074074074074074 ] } , { \"name\" : \"Probability_virginica\" , \"shape\" : [ 2 ] , \"datatype\" : \"FP64\" , \"parameters\" : null, \"data\" : [ 0 .09259259259259259, 0 .09259259259259259 ] } , { \"name\" : \"Node_Id\" , \"shape\" : [ 2 ] , \"datatype\" : \"BYTES\" , \"parameters\" : null, \"data\" : [ \"6\" , \"6\" ] } ] } Response trailers received: ( empty ) Sent 1 request and received 1 response","title":"Spark MLlib"},{"location":"modelserving/v1beta1/spark/#deploy-spark-mllib-model-with-pmml-inferenceservice","text":"","title":"Deploy Spark MLlib model with PMML InferenceService"},{"location":"modelserving/v1beta1/spark/#setup","text":"Install pyspark 3.0.x and pyspark2pmml pip install pyspark~ = 3 .0.0 pip install pyspark2pmml Get JPMML-SparkML jar","title":"Setup"},{"location":"modelserving/v1beta1/spark/#train-a-spark-mllib-model-and-export-to-pmml-file","text":"Launch pyspark with --jars to specify the location of the JPMML-SparkML uber-JAR pyspark --jars ./jpmml-sparkml-executable-1.6.3.jar Fitting a Spark ML 
pipeline: from pyspark.ml import Pipeline from pyspark.ml.classification import DecisionTreeClassifier from pyspark.ml.feature import RFormula df = spark . read . csv ( \"Iris.csv\" , header = True , inferSchema = True ) formula = RFormula ( formula = \"Species ~ .\" ) classifier = DecisionTreeClassifier () pipeline = Pipeline ( stages = [ formula , classifier ]) pipelineModel = pipeline . fit ( df ) from pyspark2pmml import PMMLBuilder pmmlBuilder = PMMLBuilder ( sc , df , pipelineModel ) pmmlBuilder . buildFile ( \"DecisionTreeIris.pmml\" ) Upload the DecisionTreeIris.pmml to a GCS bucket. gsutil cp ./DecisionTreeIris.pmml gs:// $BUCKET_NAME /sparkpmml/model.pmml","title":"Train a Spark MLlib model and export to PMML file"},{"location":"modelserving/v1beta1/spark/#test-the-model-locally","text":"For testing the model locally, please refer to the pmml server documentation .","title":"Test the Model locally"},{"location":"modelserving/v1beta1/spark/#deploy-spark-mllib-model-with-v1-protocol","text":"","title":"Deploy Spark MLlib model with V1 protocol"},{"location":"modelserving/v1beta1/spark/#create-the-inferenceservice-with-pmmlserver","text":"Create the InferenceService with pmml predictor and specify the storageUri with the bucket location you uploaded the model to New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"spark-pmml\" spec : predictor : model : modelFormat : name : pmml storageUri : gs://kfserving-examples/models/sparkpmml apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"spark-pmml\" spec : predictor : pmml : storageUri : gs://kfserving-examples/models/sparkpmml Warning The pmmlserver is based on Py4J and that doesn't support multi-process mode, so we can't set spec.predictor.containerConcurrency . If you want to scale the PMMLServer to improve prediction performance, you should set the InferenceService's resources.limits.cpu to 1 and scale the replica size. 
Apply the InferenceService custom resource kubectl apply -f spark_pmml.yaml Expected Output $ inferenceservice.serving.kserve.io/spark-pmml created Wait for the InferenceService to be ready kubectl wait --for = condition = Ready inferenceservice spark-pmml $ inferenceservice.serving.kserve.io/spark-pmml condition met","title":"Create the InferenceService with PMMLServer"},{"location":"modelserving/v1beta1/spark/#run-a-prediction","text":"The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT . You can see an example payload below. Create a file named iris-input.json with the sample input. { \"instances\" : [ [ 5.1 , 3.5 , 1.4 , 0.2 ] ] } MODEL_NAME = spark-pmml INPUT_PATH = @./iris-input.json SERVICE_HOSTNAME = $( kubectl get inferenceservice spark-pmml -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict -d $INPUT_PATH Expected Output * Connected to spark-pmml.default.35.237.217.209.xip.io ( 35 .237.217.209 ) port 80 ( #0) > POST /v1/models/spark-pmml:predict HTTP/1.1 > Host: spark-pmml.default.35.237.217.209.xip.io > User-Agent: curl/7.73.0 > Accept: */* > Content-Length: 45 > Content-Type: application/x-www-form-urlencoded > * upload completely sent off: 45 out of 45 bytes * Mark bundle as not supporting multiuse < HTTP/1.1 200 OK < content-length: 39 < content-type: application/json ; charset = UTF-8 < date: Sun, 07 Mar 2021 19 :32:50 GMT < server: istio-envoy < x-envoy-upstream-service-time: 14 < * Connection #0 to host spark-pmml.default.35.237.217.209.xip.io left intact { \"predictions\" : [[ 1 .0, 0 .0, 1 .0, 0 .0 ]]}","title":"Run a prediction"},{"location":"modelserving/v1beta1/spark/#deploy-the-model-with-open-inference-protocol","text":"","title":"Deploy the model with Open Inference 
Protocol"},{"location":"modelserving/v1beta1/spark/#deploy-the-model-with-rest-endpoint-through-inferenceservice","text":"Lastly, you will use KServe to deploy the trained model onto Kubernetes. For this, you will just need to use version v1beta1 of the InferenceService CRD and set the protocolVersion field to v2 . Yaml apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"spark-v2-iris\" spec : predictor : model : modelFormat : name : pmml protocolVersion : v2 runtime : kserve-pmmlserver storageUri : \"gs://kfserving-examples/models/sparkpmml\" Warning The pmmlserver is based on Py4J and that doesn't support multi-process mode, so we can't set spec.predictor.containerConcurrency . If you want to scale the PMMLServer to improve prediction performance, you should set the InferenceService's resources.limits.cpu to 1 and scale the replica size. kubectl kubectl apply -f spark-v2-iris.yaml","title":"Deploy the Model with REST endpoint through InferenceService"},{"location":"modelserving/v1beta1/spark/#test-the-deployed-model","text":"You can now test your deployed model by sending a sample request. Note that this request needs to follow the Open Inference Protocol . You can see an example payload below. Create a file named iris-input-v2.json with the sample input. { \"inputs\" : [ { \"name\" : \"input-0\" , \"shape\" : [ 2 , 4 ], \"datatype\" : \"FP32\" , \"data\" : [ [ 6.8 , 2.8 , 4.8 , 1.4 ], [ 6.0 , 3.4 , 4.5 , 1.6 ] ] } ] } Determine the ingress IP and port and set INGRESS_HOST and INGRESS_PORT . 
Now, you can use curl to send the inference request as: SERVICE_HOSTNAME = $( kubectl get inferenceservice spark-v2-iris -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v \\ -H \"Host: ${ SERVICE_HOSTNAME } \" \\ -H \"Content-Type: application/json\" \\ -d @./iris-input-v2.json \\ http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v2/models/spark-v2-iris/infer Expected Output { \"model_name\" : \"spark-v2-iris\" , \"model_version\" : null , \"id\" : \"a187a478-c614-46ce-a7de-2f07871f43f3\" , \"parameters\" : null , \"outputs\" : [ { \"name\" : \"Species\" , \"shape\" : [ 2 ], \"datatype\" : \"BYTES\" , \"parameters\" : null , \"data\" : [ \"versicolor\" , \"versicolor\" ] }, { \"name\" : \"Probability_setosa\" , \"shape\" : [ 2 ], \"datatype\" : \"FP64\" , \"parameters\" : null , \"data\" : [ 0 , 0 ] }, { \"name\" : \"Probability_versicolor\" , \"shape\" : [ 2 ], \"datatype\" : \"FP64\" , \"parameters\" : null , \"data\" : [ 0.9074074074074074 , 0.9074074074074074 ] }, { \"name\" : \"Probability_virginica\" , \"shape\" : [ 2 ], \"datatype\" : \"FP64\" , \"parameters\" : null , \"data\" : [ 0.09259259259259259 , 0.09259259259259259 ] }, { \"name\" : \"Node_Id\" , \"shape\" : [ 2 ], \"datatype\" : \"BYTES\" , \"parameters\" : null , \"data\" : [ \"6\" , \"6\" ] } ] }","title":"Test the Deployed Model"},{"location":"modelserving/v1beta1/spark/#deploy-the-model-with-grpc-endpoint-through-inferenceservice","text":"Create the inference service resource and expose the gRPC port using the below yaml. Note Currently, KServe only supports exposing either HTTP or gRPC port. By default, HTTP port is exposed. 
Serverless RawDeployment apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"spark-v2-iris-grpc\" spec : predictor : model : modelFormat : name : pmml protocolVersion : v2 runtime : kserve-pmmlserver storageUri : \"gs://kfserving-examples/models/sparkpmml\" ports : - name : h2c # knative expects grpc port name to be 'h2c' protocol : TCP containerPort : 8081 apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"spark-v2-iris-grpc\" spec : predictor : model : modelFormat : name : pmml protocolVersion : v2 runtime : kserve-pmmlserver storageUri : \"gs://kfserving-examples/models/sparkpmml\" ports : - name : grpc-port # Istio requires the port name to be in the format [-] protocol : TCP containerPort : 8081 Warning The pmmlserver is based on Py4J and that doesn't support multi-process mode, so we can't set spec.predictor.containerConcurrency . If you want to scale the PMMLServer to improve prediction performance, you should set the InferenceService's resources.limits.cpu to 1 and scale the replica size. Apply the InferenceService yaml to get the gRPC endpoint kubectl kubectl apply -f spark-v2-grpc.yaml","title":"Deploy the Model with GRPC endpoint through InferenceService"},{"location":"modelserving/v1beta1/spark/#test-the-deployed-model-with-grpcurl","text":"After the gRPC InferenceService becomes ready, grpcurl , can be used to send gRPC requests to the InferenceService . # download the proto file curl -O https://raw.githubusercontent.com/kserve/open-inference-protocol/main/specification/protocol/open_inference_grpc.proto INPUT_PATH = iris-input-v2-grpc.json PROTO_FILE = open_inference_grpc.proto SERVICE_HOSTNAME = $( kubectl get inferenceservice spark-v2-iris-grpc -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) Determine the ingress IP and port and set INGRESS_HOST and INGRESS_PORT . Now, you can use curl to send the inference requests. 
The gRPC APIs follow the KServe prediction V2 protocol / Open Inference Protocol . For example, ServerReady API can be used to check if the server is ready: grpcurl \\ -plaintext \\ -proto ${ PROTO_FILE } \\ -authority ${ SERVICE_HOSTNAME } \\ ${ INGRESS_HOST } : ${ INGRESS_PORT } \\ inference.GRPCInferenceService.ServerReady Expected Output { \"ready\" : true } You can test the deployed model by sending a sample request with the below payload. Notice that the input format differs from the one in the previous REST endpoint example. Prepare the inference input inside the file named iris-input-v2-grpc.json . { \"model_name\" : \"spark-v2-iris-grpc\" , \"inputs\" : [ { \"name\" : \"input-0\" , \"shape\" : [ 2 , 4 ], \"datatype\" : \"FP32\" , \"contents\" : { \"fp32_contents\" : [ 6.8 , 2.8 , 4.8 , 1.4 , 6.0 , 3.4 , 4.5 , 1.6 ] } } ] } ModelInfer API takes input following the ModelInferRequest schema defined in the grpc_predict_v2.proto file. grpcurl \\ -vv \\ -plaintext \\ -proto ${ PROTO_FILE } \\ -authority ${ SERVICE_HOSTNAME } \\ -d @ \\ ${ INGRESS_HOST } : ${ INGRESS_PORT } \\ inference.GRPCInferenceService.ModelInfer \\ <<< $( cat \" $INPUT_PATH \" ) Expected Output Resolved method descriptor: // The ModelInfer API performs inference using the specified model. Errors are // indicated by the google.rpc.Status returned for the request. The OK code // indicates success and other codes indicate failure. 
rpc ModelInfer ( .inference.ModelInferRequest ) returns ( .inference.ModelInferResponse ) ; Request metadata to send: ( empty ) Response headers received: content-type: application/grpc date: Mon, 09 Oct 2023 11 :07:26 GMT grpc-accept-encoding: identity, deflate, gzip server: istio-envoy x-envoy-upstream-service-time: 16 Estimated response size: 83 bytes Response contents: { \"model_name\" : \"spark-v2-iris\" , \"model_version\" : null, \"id\" : \"a187a478-c614-46ce-a7de-2f07871f43f3\" , \"parameters\" : null, \"outputs\" : [ { \"name\" : \"Species\" , \"shape\" : [ 2 ] , \"datatype\" : \"BYTES\" , \"parameters\" : null, \"data\" : [ \"versicolor\" , \"versicolor\" ] } , { \"name\" : \"Probability_setosa\" , \"shape\" : [ 2 ] , \"datatype\" : \"FP64\" , \"parameters\" : null, \"data\" : [ 0 , 0 ] } , { \"name\" : \"Probability_versicolor\" , \"shape\" : [ 2 ] , \"datatype\" : \"FP64\" , \"parameters\" : null, \"data\" : [ 0 .9074074074074074, 0 .9074074074074074 ] } , { \"name\" : \"Probability_virginica\" , \"shape\" : [ 2 ] , \"datatype\" : \"FP64\" , \"parameters\" : null, \"data\" : [ 0 .09259259259259259, 0 .09259259259259259 ] } , { \"name\" : \"Node_Id\" , \"shape\" : [ 2 ] , \"datatype\" : \"BYTES\" , \"parameters\" : null, \"data\" : [ \"6\" , \"6\" ] } ] } Response trailers received: ( empty ) Sent 1 request and received 1 response","title":"Test the deployed model with grpcurl"},{"location":"modelserving/v1beta1/tensorflow/","text":"Deploy Tensorflow Model with InferenceService \u00b6 Create the HTTP InferenceService \u00b6 Create an InferenceService yaml which specifies the framework tensorflow and storageUri that is pointed to a saved tensorflow model , and name it as tensorflow.yaml . 
New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"flower-sample\" spec : predictor : model : modelFormat : name : tensorflow storageUri : \"gs://kfserving-examples/models/tensorflow/flowers\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"flower-sample\" spec : predictor : tensorflow : storageUri : \"gs://kfserving-examples/models/tensorflow/flowers\" Apply the tensorflow.yaml to create the InferenceService , by default it exposes a HTTP/REST endpoint. kubectl kubectl apply -f tensorflow.yaml Expected Output $ inferenceservice.serving.kserve.io/flower-sample created Wait for the InferenceService to be in ready state kubectl get isvc flower-sample NAME URL READY PREV LATEST PREVROLLEDOUTREVISION LATESTREADYREVISION AGE flower-sample http://flower-sample.default.example.com True 100 flower-sample-predictor-default-n9zs6 7m15s Run a prediction \u00b6 The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT , the inference request input file can be downloaded here . 
MODEL_NAME = flower-sample INPUT_PATH = @./input.json SERVICE_HOSTNAME = $( kubectl get inferenceservice ${ MODEL_NAME } -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict -d $INPUT_PATH Expected Output * Connected to localhost ( ::1 ) port 8080 ( #0) > POST /v1/models/tensorflow-sample:predict HTTP/1.1 > Host: tensorflow-sample.default.example.com > User-Agent: curl/7.73.0 > Accept: */* > Content-Length: 16201 > Content-Type: application/x-www-form-urlencoded > * upload completely sent off: 16201 out of 16201 bytes * Mark bundle as not supporting multiuse < HTTP/1.1 200 OK < content-length: 222 < content-type: application/json < date: Sun, 31 Jan 2021 01 :01:50 GMT < x-envoy-upstream-service-time: 280 < server: istio-envoy < { \"predictions\" : [ { \"scores\" : [ 0 .999114931, 9 .20987877e-05, 0 .000136786213, 0 .000337257545, 0 .000300532585, 1 .84813616e-05 ] , \"prediction\" : 0 , \"key\" : \" 1\" } ] } Canary Rollout \u00b6 Canary rollout is a great way to control the risk of rolling out a new model by first moving a small percent of the traffic to it and then gradually increase the percentage. To run a canary rollout, you can apply the canary.yaml with the canaryTrafficPercent field specified. New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"flower-sample\" spec : predictor : canaryTrafficPercent : 20 model : modelFormat : name : tensorflow storageUri : \"gs://kfserving-examples/models/tensorflow/flowers-2\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"flower-sample\" spec : predictor : canaryTrafficPercent : 20 tensorflow : storageUri : \"gs://kfserving-examples/models/tensorflow/flowers-2\" Apply the canary.yaml to create the Canary InferenceService. 
kubectl kubectl apply -f canary.yaml To verify if the traffic split percentage is applied correctly, you can run the following command: kubectl kubectl get isvc flower-sample NAME URL READY PREV LATEST PREVROLLEDOUTREVISION LATESTREADYREVISION AGE flower-sample http://flower-sample.default.example.com True 80 20 flower-sample-predictor-default-n9zs6 flower-sample-predictor-default-2kwtr 7m15s As you can see the traffic is split between the last rolled out revision and the current latest ready revision, KServe automatically tracks the last rolled out(stable) revision for you so you do not need to maintain both default and canary on the InferenceService as in v1alpha2. Create the gRPC InferenceService \u00b6 Create InferenceService which exposes the gRPC port and by default it listens on port 9000. New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"flower-grpc\" spec : predictor : model : modelFormat : name : tensorflow storageUri : \"gs://kfserving-examples/models/tensorflow/flowers\" ports : - containerPort : 9000 name : h2c protocol : TCP apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"flower-grpc\" spec : predictor : tensorflow : storageUri : \"gs://kfserving-examples/models/tensorflow/flowers\" ports : - containerPort : 9000 name : h2c protocol : TCP Apply grpc.yaml to create the gRPC InferenceService. kubectl kubectl apply -f grpc.yaml Expected Output $ inferenceservice.serving.kserve.io/flower-grpc created Run a prediction \u00b6 We use a python gRPC client for the prediction, so you need to create a python virtual environment and install the tensorflow-serving-api . # The prediction script is written in TensorFlow 1.x pip install tensorflow-serving-api> = 1 .14.0,< 2 .0.0 Run the gRPC prediction script . 
MODEL_NAME = flower-grpc INPUT_PATH = ./input.json SERVICE_HOSTNAME = $( kubectl get inferenceservice ${ MODEL_NAME } -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) python grpc_client.py --host $INGRESS_HOST --port $INGRESS_PORT --model $MODEL_NAME --hostname $SERVICE_HOSTNAME --input_path $INPUT_PATH Expected Output outputs { key : \"key\" value { dtype : DT_STRING tensor_shape { dim { size : 1 } } string_val : \" 1\" } } outputs { key : \"prediction\" value { dtype : DT_INT64 tensor_shape { dim { size : 1 } } int64_val : 0 } } outputs { key : \"scores\" value { dtype : DT_FLOAT tensor_shape { dim { size : 1 } dim { size : 6 } } float_val : 0.9991149306297302 float_val : 9.209887502947822e-05 float_val : 0.00013678647519554943 float_val : 0.0003372581850271672 float_val : 0.0003005331673193723 float_val : 1.848137799242977e-05 } } model_spec { name : \"flowers-sample\" version { value : 1 } signature_name : \"serving_default\" }","title":"Tensorflow"},{"location":"modelserving/v1beta1/tensorflow/#deploy-tensorflow-model-with-inferenceservice","text":"","title":"Deploy Tensorflow Model with InferenceService"},{"location":"modelserving/v1beta1/tensorflow/#create-the-http-inferenceservice","text":"Create an InferenceService yaml which specifies the framework tensorflow and storageUri that is pointed to a saved tensorflow model , and name it as tensorflow.yaml . 
New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"flower-sample\" spec : predictor : model : modelFormat : name : tensorflow storageUri : \"gs://kfserving-examples/models/tensorflow/flowers\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"flower-sample\" spec : predictor : tensorflow : storageUri : \"gs://kfserving-examples/models/tensorflow/flowers\" Apply the tensorflow.yaml to create the InferenceService , by default it exposes a HTTP/REST endpoint. kubectl kubectl apply -f tensorflow.yaml Expected Output $ inferenceservice.serving.kserve.io/flower-sample created Wait for the InferenceService to be in ready state kubectl get isvc flower-sample NAME URL READY PREV LATEST PREVROLLEDOUTREVISION LATESTREADYREVISION AGE flower-sample http://flower-sample.default.example.com True 100 flower-sample-predictor-default-n9zs6 7m15s","title":"Create the HTTP InferenceService"},{"location":"modelserving/v1beta1/tensorflow/#run-a-prediction","text":"The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT , the inference request input file can be downloaded here . 
MODEL_NAME = flower-sample INPUT_PATH = @./input.json SERVICE_HOSTNAME = $( kubectl get inferenceservice ${ MODEL_NAME } -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict -d $INPUT_PATH Expected Output * Connected to localhost ( ::1 ) port 8080 ( #0) > POST /v1/models/tensorflow-sample:predict HTTP/1.1 > Host: tensorflow-sample.default.example.com > User-Agent: curl/7.73.0 > Accept: */* > Content-Length: 16201 > Content-Type: application/x-www-form-urlencoded > * upload completely sent off: 16201 out of 16201 bytes * Mark bundle as not supporting multiuse < HTTP/1.1 200 OK < content-length: 222 < content-type: application/json < date: Sun, 31 Jan 2021 01 :01:50 GMT < x-envoy-upstream-service-time: 280 < server: istio-envoy < { \"predictions\" : [ { \"scores\" : [ 0 .999114931, 9 .20987877e-05, 0 .000136786213, 0 .000337257545, 0 .000300532585, 1 .84813616e-05 ] , \"prediction\" : 0 , \"key\" : \" 1\" } ] }","title":"Run a prediction"},{"location":"modelserving/v1beta1/tensorflow/#canary-rollout","text":"Canary rollout is a great way to control the risk of rolling out a new model by first moving a small percent of the traffic to it and then gradually increase the percentage. To run a canary rollout, you can apply the canary.yaml with the canaryTrafficPercent field specified. 
New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"flower-sample\" spec : predictor : canaryTrafficPercent : 20 model : modelFormat : name : tensorflow storageUri : \"gs://kfserving-examples/models/tensorflow/flowers-2\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"flower-sample\" spec : predictor : canaryTrafficPercent : 20 tensorflow : storageUri : \"gs://kfserving-examples/models/tensorflow/flowers-2\" Apply the canary.yaml to create the Canary InferenceService. kubectl kubectl apply -f canary.yaml To verify if the traffic split percentage is applied correctly, you can run the following command: kubectl kubectl get isvc flower-sample NAME URL READY PREV LATEST PREVROLLEDOUTREVISION LATESTREADYREVISION AGE flower-sample http://flower-sample.default.example.com True 80 20 flower-sample-predictor-default-n9zs6 flower-sample-predictor-default-2kwtr 7m15s As you can see the traffic is split between the last rolled out revision and the current latest ready revision, KServe automatically tracks the last rolled out(stable) revision for you so you do not need to maintain both default and canary on the InferenceService as in v1alpha2.","title":"Canary Rollout"},{"location":"modelserving/v1beta1/tensorflow/#create-the-grpc-inferenceservice","text":"Create InferenceService which exposes the gRPC port and by default it listens on port 9000. 
New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"flower-grpc\" spec : predictor : model : modelFormat : name : tensorflow storageUri : \"gs://kfserving-examples/models/tensorflow/flowers\" ports : - containerPort : 9000 name : h2c protocol : TCP apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"flower-grpc\" spec : predictor : tensorflow : storageUri : \"gs://kfserving-examples/models/tensorflow/flowers\" ports : - containerPort : 9000 name : h2c protocol : TCP Apply grpc.yaml to create the gRPC InferenceService. kubectl kubectl apply -f grpc.yaml Expected Output $ inferenceservice.serving.kserve.io/flower-grpc created","title":"Create the gRPC InferenceService"},{"location":"modelserving/v1beta1/tensorflow/#run-a-prediction_1","text":"We use a python gRPC client for the prediction, so you need to create a python virtual environment and install the tensorflow-serving-api . # The prediction script is written in TensorFlow 1.x pip install tensorflow-serving-api> = 1 .14.0,< 2 .0.0 Run the gRPC prediction script . 
MODEL_NAME = flower-grpc INPUT_PATH = ./input.json SERVICE_HOSTNAME = $( kubectl get inferenceservice ${ MODEL_NAME } -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) python grpc_client.py --host $INGRESS_HOST --port $INGRESS_PORT --model $MODEL_NAME --hostname $SERVICE_HOSTNAME --input_path $INPUT_PATH Expected Output outputs { key : \"key\" value { dtype : DT_STRING tensor_shape { dim { size : 1 } } string_val : \" 1\" } } outputs { key : \"prediction\" value { dtype : DT_INT64 tensor_shape { dim { size : 1 } } int64_val : 0 } } outputs { key : \"scores\" value { dtype : DT_FLOAT tensor_shape { dim { size : 1 } dim { size : 6 } } float_val : 0.9991149306297302 float_val : 9.209887502947822e-05 float_val : 0.00013678647519554943 float_val : 0.0003372581850271672 float_val : 0.0003005331673193723 float_val : 1.848137799242977e-05 } } model_spec { name : \"flowers-sample\" version { value : 1 } signature_name : \"serving_default\" }","title":"Run a prediction"},{"location":"modelserving/v1beta1/torchserve/","text":"Deploy a PyTorch Model with TorchServe InferenceService \u00b6 In this example, we deploy a trained PyTorch MNIST model to predict handwritten digits by running an InferenceService with TorchServe runtime which is the default installed serving runtime for PyTorch models. Model interpretability is also an important aspect which helps to understand which of the input features were important for a particular classification. Captum is a model interpretability library. In this example, TorchServe explain endpoint is implemented with Captum's state-of-the-art algorithm, including integrated gradients to provide users with an easy way to understand which features are contributing to the model output. You can refer to the Captum Tutorial for more examples. Create Model Storage with a Model Archive File and Config \u00b6 The KServe/TorchServe integration expects the following model store layout. 
\u251c\u2500\u2500 config \u2502 \u251c\u2500\u2500 config.properties \u251c\u2500\u2500 model-store \u2502 \u251c\u2500\u2500 densenet_161.mar \u2502 \u251c\u2500\u2500 mnist.mar TorchServe provides a utility to package all the model artifacts into a single TorchServe Model Archive File (MAR) . After model artifacts are packaged into a MAR file, upload it to the model-store under the model storage path. You can store your model and dependent files on remote storage or local persistent volume. The MNIST model and dependent files can be obtained from here . Note For remote storage you can choose to start the example using the prebuilt MNIST MAR file stored on KServe example GCS bucket gs://kfserving-examples/models/torchserve/image_classifier , or generate the MAR file with torch-model-archiver and create the model store on remote storage according to the above layout. torch-model-archiver --model-name mnist --version 1.0 \\ --model-file model-archiver/model-store/mnist/mnist.py \\ --serialized-file model-archiver/model-store/mnist/mnist_cnn.pt \\ --handler model-archiver/model-store/mnist/mnist_handler.py For PVC users, please refer to model archive file generation for auto generation of MAR files with the model and dependent files. TorchServe uses a config.properties file to store configuration. Please see here for more details on the properties supported by the configuration file. 
The following is a sample file for KServe: inference_address=http://0.0.0.0:8085 management_address=http://0.0.0.0:8085 metrics_address=http://0.0.0.0:8082 grpc_inference_port=7070 grpc_management_port=7071 enable_metrics_api=true metrics_format=prometheus number_of_netty_threads=4 job_queue_size=10 enable_envvars_config=true install_py_dep_per_model=true model_store=/mnt/models/model-store model_snapshot={\"name\":\"startup.cfg\",\"modelCount\":1,\"models\":{\"mnist\":{\"1.0\":{\"defaultVersion\":true,\"marName\":\"mnist.mar\",\"minWorkers\":1,\"maxWorkers\":5,\"batchSize\":1,\"maxBatchDelay\":10,\"responseTimeout\":120}}}} The KServe/TorchServe integration supports KServe v1/v2 REST protocol. In the config.properties , we need to turn on the flag enable_envvars_config to enable setting the KServe envelope using an environment variable. Warning The previous service_envelope property has been deprecated; in the config.properties file, use the flag enable_envvars_config=true to enable setting the service envelope at runtime. The requests are converted from KServe inference request format to TorchServe request format and sent to the inference_address configured via local socket. Deploy PyTorch Model with V1 REST Protocol \u00b6 Create the TorchServe InferenceService \u00b6 KServe by default selects the TorchServe runtime when you specify the model format pytorch in the new model spec. New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"torchserve\" spec : predictor : model : modelFormat : name : pytorch storageUri : gs://kfserving-examples/models/torchserve/image_classifier/v1 apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"torchserve\" spec : predictor : pytorch : storageUri : gs://kfserving-examples/models/torchserve/image_classifier/v1 For deploying the model on CPU, apply the following torchserve.yaml to create the InferenceService . 
kubectl kubectl apply -f torchserve.yaml New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"torchserve\" spec : predictor : model : modelFormat : name : pytorch storageUri : gs://kfserving-examples/models/torchserve/image_classifier/v1 resources : limits : memory : 4Gi nvidia.com/gpu : \"1\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"torchserve\" spec : predictor : pytorch : storageUri : gs://kfserving-examples/models/torchserve/image_classifier/v1 resources : limits : memory : 4Gi nvidia.com/gpu : \"1\" For deploying the model on GPU, apply the gpu.yaml to create the GPU InferenceService . kubectl kubectl apply -f gpu.yaml Expected Output $ inferenceservice.serving.kserve.io/torchserve created Model Inference \u00b6 The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT . MODEL_NAME = mnist SERVICE_HOSTNAME = $( kubectl get inferenceservice torchserve -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) You can use image converter to convert the images to base64 byte array, for other models please refer to input request . curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ ${ MODEL_NAME } :predict -d @./mnist.json Expected Output * Trying 52 .89.19.61... 
* Connected to a881f5a8c676a41edbccdb0a394a80d6-2069247558.us-west-2.elb.amazonaws.com ( 52 .89.19.61 ) port 80 ( #0) > PUT /v1/models/mnist HTTP/1.1 > Host: torchserve.kserve-test.example.com > User-Agent: curl/7.47.0 > Accept: */* > Content-Length: 167 > Expect: 100 -continue > < HTTP/1.1 100 Continue * We are completely uploaded and fine < HTTP/1.1 200 OK < cache-control: no-cache ; no-store, must-revalidate, private < content-length: 1 < date: Tue, 27 Oct 2020 08 :26:19 GMT < expires: Thu, 01 Jan 1970 00 :00:00 UTC < pragma: no-cache < x-request-id: b10cfc9f-cd0f-4cda-9c6c-194c2cdaa517 < x-envoy-upstream-service-time: 6 < server: istio-envoy < * Connection #0 to host a881f5a8c676a41edbccdb0a394a80d6-2069247558.us-west-2.elb.amazonaws.com left intact { \"predictions\" : [ \"2\" ]} Model Explanation \u00b6 To get model explanation: curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/mnist:explain -d @./mnist.json Expected Output { \"explanations\" : [[[[ 0.0005394675730469475 , -0.0022280013123036043 , -0.003416480100841055 , -0.0051329881112415965 , -0.009973864160829985 , -0.004112560908882716 , -0.009223458030656112 , -0.0006676354577291628 , -0.005249806664413386 , -0.0009790519227372953 , -0.0026914653993121195 , -0.0069470097151383995 , -0.00693530415962956 , -0.005973878697847718 , -0.00425042437288857 , 0.0032867281838150977 , -0.004297780258633562 , -0.005643196661192014 , -0.00653025019738562 , -0.0047062916121001185 , -0.0018656628277792628 , -0.0016757477204072532 , -0.0010410417081844845 , -0.0019093520822156726 , -0.004451403461006374 , -0.0008552767257773671 , -0.0027638888169885267 , -0.0 ], [ 0.006971297052106784 , 0.007316855222185687 , 0.012144494329150574 , 0.011477799383288441 , 0.006846725347670252 , 0.01149386176451476 , 0.0045351987881190655 , 0.007038361889638708 , 0.0035855377023272157 , 0.003031419502053957 , -0.0008611575226775316 , 
-0.0011085224745969223 , -0.0050840743637658534 , 0.009855491784340777 , 0.007220680811043034 , 0.011374285598070253 , 0.007147725481709019 , 0.0037114580912849457 , 0.00030763245479291384 , 0.0018305492665953394 , 0.010106224395114147 , 0.012932881164284687 , 0.008862892007714321 , 0.0070960526615982435 , -0.0015931137903787505 , 0.0036495747329455906 , 0.0002593849391051298 , -0.0 ], [ 0.006467265785857396 , -0.00041793201228071674 , 0.004900316089756856 , 0.002308395474823997 , 0.007859295399592283 , 0.003916404948969494 , 0.005630750246437249 , 0.0043712538044184375 , 0.006128530599133763 , -0.009446321309831246 , -0.014173645867037036 , -0.0062988650915794565 , -0.011473838941118539 , -0.009049151947644047 , -0.0007625645864610934 , -0.013721416630061238 , -0.0005580156670410108 , 0.0033404383756480784 , -0.006693278798487951 , -0.003705084551144756 , 0.005100375089529131 , 5.5276874714401074e-05 , 0.007221745280359063 , -0.00573598303916232 , -0.006836169033785967 , 0.0025401608627538936 , 9.303533912921196e-05 , -0.0 ], [ 0.005914399808621816 , 0.00452643561023696 , 0.003968242261515448 , 0.010422786058967673 , 0.007728358107899074 , 0.01147115923288383 , 0.005683869479056691 , 0.011150670502307374 , 0.008742555292485278 , 0.0032882897575743754 , 0.014841138421861584 , 0.011741228362482451 , 0.0004296862879259221 , -0.0035118140680654854 , -0.006152254410078331 , -0.004925121936901983 , -2.3611205202801947e-06 , 0.029347073037039074 , 0.02901626308947743 , 0.023379353021343398 , 0.004027157620197582 , -0.01677662249919171 , -0.013497255736128979 , 0.006957482854214602 , 0.0018321766800746145 , 0.008277034396684563 , 0.002733405455464871 , -0.0 ], [ 0.0049579739156640065 , -0.002168016158233997 , 0.0020644317321723642 , 0.0020912464240293825 , 0.004719691119907336 , 0.007879231202446626 , 0.010594445898145937 , 0.006533067778982801 , 0.002290214592708113 , -0.0036651114968251986 , 0.010753227423379443 , 0.006402706020466243 , -0.047075193909339695 , 
-0.08108259303568185 , -0.07646875196692542 , -0.1681834845371156 , -0.1610307396135756 , -0.12010309927453829 , -0.016148831320070896 , -0.009541525999486027 , 0.04575604594761406 , 0.031470966329886635 , 0.02452149438024385 , 0.016594078577569567 , 0.012213591301610382 , -0.002230875840404426 , 0.0036704051254298374 , -0.0 ], [ 0.006410107592414739 , 0.005578283890924384 , 0.001977103461731095 , 0.008935476507124939 , 0.0011305055729953436 , 0.0004946313900665659 , -0.0040266029554395935 , -0.004270765544167256 , -0.010832150944943138 , -0.01653511868336456 , -0.011121302103373972 , -0.42038514526905024 , -0.22874576003118394 , -0.16752936178907055 , -0.17021699697722079 , -0.09998584936787697 , -0.09041117495322142 , -0.10230248444795721 , -0.15260897522094888 , 0.07770835838531896 , -0.0813761125123066 , 0.027556910053932963 , 0.036305965104261866 , 0.03407793793894619 , 0.01212761779302579 , 0.006695133380685627 , 0.005331392748588556 , -0.0 ], [ 0.008342680065996267 , -0.00029249776150416367 , 0.002782130291086583 , 0.0027793744856745373 , 0.0020525102690845407 , 0.003679269934110004 , 0.009373846012918791 , -0.0031751745946300403 , -0.009042846256743316 , 0.0074141593032070775 , -0.02796812516561052 , -0.593171583786029 , -0.4830164472795136 , -0.353860128479443 , -0.256482708704862 , 0.11515586314578445 , 0.12700563162828346 , 0.0022342450630152204 , -0.24673707669992118 , -0.012878340813781437 , 0.16866821780196756 , 0.009739033161051434 , -0.000827843726513152 , -0.0002137320694585577 , -0.004179480126338929 , 0.008454049232317358 , -0.002767934266266998 , -0.0 ], [ 0.007070382982749552 , 0.005342127805750565 , -0.000983984198542354 , 0.007910101170274493 , 0.001266267696096404 , 0.0038575136843053844 , 0.006941130321773131 , -0.015195182020687892 , -0.016954974010578504 , -0.031186444096787943 , -0.031754626467747966 , 0.038918845112017694 , 0.06248943950328597 , 0.07703301092601872 , 0.0438493628024275 , -0.0482404449771698 , -0.08718650815999045 , 
-0.0014764704694506415 , -0.07426336448916614 , -0.10378029666564882 , 0.008572087846793842 , -0.00017173413848283343 , 0.010058893270893113 , 0.0028410498666004377 , 0.002008290211806285 , 0.011905375389931099 , 0.006071375802943992 , -0.0 ], [ 0.0076080165949142685 , -0.0017127333725310495 , 0.00153128150106188 , 0.0033391793764531563 , 0.005373442509691564 , 0.007207746020295443 , 0.007422946703693544 , -0.00699779191449194 , 0.002395328253696969 , -0.011682618874195954 , -0.012737004464649057 , -0.05379966383523857 , -0.07174960461749053 , -0.03027341304050314 , 0.0019411862216381327 , -0.0205575129473766 , -0.04617091711614171 , -0.017655308106959804 , -0.009297162816368814 , -0.03358572117988279 , -0.1626068444778013 , -0.015874364762085157 , -0.0013736074085577258 , -0.014763439328689378 , 0.00631805792697278 , 0.0021769414283267273 , 0.0023061635006792498 , -0.0 ], [ 0.005569931813561535 , 0.004363218328087518 , 0.00025609463218383973 , 0.009577483244680675 , 0.007257755916229399 , 0.00976284778532342 , -0.006388840235419147 , -0.009017880790555707 , -0.015308709334434867 , -0.016743935775597355 , -0.04372596546189275 , -0.03523469356755156 , -0.017257810114846107 , 0.011960489902313411 , 0.01529079831828911 , -0.020076559119468443 , -0.042792547669901516 , -0.0029492027218867116 , -0.011109560582516062 , -0.12985858077848939 , -0.2262858575494602 , -0.003391725540087574 , -0.03063368684328981 , -0.01353486587575121 , 0.0011140822443932317 , 0.006583451102528798 , 0.005667533945285076 , -0.0 ], [ 0.004056272267155598 , -0.0006394041203204911 , 0.004664893926197093 , 0.010593032387298614 , 0.014750931538689989 , 0.015428721146282149 , 0.012167820222401367 , 0.017604752451202518 , 0.01038886849969188 , 0.020544326931163263 , -0.0004206566917812794 , -0.0037463581359232674 , -0.0024656693040735075 , 0.0026061897697624353 , -0.05186055271869177 , -0.09158655048397382 , 0.022976389912563913 , -0.19851635458461808 , -0.11801281807622972 , -0.29127727790584423 , 
-0.017138655663803876 , -0.04395515676468641 , -0.019241432506341576 , 0.0011342298743447392 , 0.0030625771422964584 , -0.0002867924892991192 , -0.0017908808807543712 , -0.0 ], [ 0.0030114260660488892 , 0.0020246448273580006 , -0.003293361220376816 , 0.0036965043883218584 , 0.00013185761728146236 , -0.004355610866966878 , -0.006432601921104354 , -0.004148701459814858 , 0.005974553907915845 , -0.0001399233607281906 , 0.010392944122965082 , 0.015693249298693028 , 0.0459528427528407 , -0.013921539948093455 , -0.06615556518538708 , 0.02921438991320325 , -0.16345220625101778 , -0.002130491295590408 , -0.11449749664916867 , -0.030980255589300607 , -0.04804122537359171 , -0.05144994776295644 , 0.005122827412776085 , 0.006464862173908011 , 0.008624278272940246 , 0.0037316228508156427 , 0.0036947794337026706 , -0.0 ], [ 0.0038173843228389405 , -0.0017091931226819494 , -0.0030871869816778068 , 0.002115642501535999 , -0.006926441921580917 , -0.003023077828426468 , -0.014451359520861637 , -0.0020793048380231397 , -0.010948003939342523 , -0.0014460716966395166 , -0.01656990336897737 , 0.003052317148320358 , -0.0026729564809943513 , -0.06360067057346147 , 0.07780985635080599 , -0.1436689936630281 , -0.040817177623437874 , -0.04373367754296477 , -0.18337299150349698 , 0.025295182977407064 , -0.03874921104331938 , -0.002353901742617205 , 0.011772560401335033 , 0.012480994515707569 , 0.006498422579824301 , 0.00632320984076023 , 0.003407169765754805 , -0.0 ], [ 0.00944355257990139 , 0.009242583578688485 , 0.005069860444386138 , 0.012666191449103024 , 0.00941789912565746 , 0.004720427012836104 , 0.007597687789204113 , 0.008679266528089945 , 0.00889322771021875 , -0.0008577904940828809 , 0.0022973860384607604 , 0.025328230809207493 , -0.09908781123080951 , -0.07836626399832172 , -0.1546141264726177 , -0.2582207272050766 , -0.2297524599578219 , -0.29561835103416967 , 0.12048787956671528 , -0.06279365699861471 , -0.03832012404275233 , 0.022910264999199934 , 0.005803508497672737 , 
-0.003858461926053348 , 0.0039451232171312765 , 0.003858476747495933 , 0.0013034515558609956 , -0.0 ], [ 0.009725756015628606 , -0.0004001101998876524 , 0.006490722835571152 , 0.00800808023631959 , 0.0065880711806331265 , -0.0010264326176194034 , -0.0018914305972878344 , -0.008822522194658438 , -0.016650520788128117 , -0.03254382594389507 , -0.014795713101569494 , -0.05826499837818885 , -0.05165369567511702 , -0.13384277337594377 , -0.22572641373340493 , -0.21584739544668635 , -0.2366836351939208 , 0.14937824076489659 , -0.08127414932170171 , -0.06720440139736879 , -0.0038552732903526744 , 0.0107597891707803 , -5.67453590118174e-05 , 0.0020161340511396244 , -0.000783322694907436 , -0.0006397207517995289 , -0.005291639205010064 , -0.0 ], [ 0.008627543242777584 , 0.007700097300051849 , 0.0020430960246806138 , 0.012949015733198586 , 0.008428709579953574 , 0.001358177022953576 , 0.00421863939925833 , 0.002657580000868709 , -0.007339431957237175 , 0.02008439775442315 , -0.0033717631758033114 , -0.05176633249899187 , -0.013790328758662772 , -0.39102366157050594 , -0.167341447585844 , -0.04813367828213947 , 0.1367781582239039 , -0.04672809260566293 , -0.03237784669978756 , 0.03218068777925178 , 0.02415063765016493 , -0.017849899351200002 , -0.002975675228088795 , -0.004819438014786686 , 0.005106898651831245 , 0.0024278620704227456 , 6.784303333368138e-05 , -0.0 ], [ 0.009644258527009343 , -0.001331907219439711 , -0.0014639718434477777 , 0.008481926798958248 , 0.010278031715467508 , 0.003625808326891529 , -0.01121188617599796 , -0.0010634587872994379 , -0.0002603820881968461 , -0.017985648016990465 , -0.06446652745470374 , 0.07726063173046191 , -0.24739929795334742 , -0.2701855018480216 , -0.08888614776216278 , 0.1373325760136816 , -0.02316068912438066 , -0.042164834956711514 , 0.0009266091344106458 , 0.03141872420427644 , 0.011587728430225652 , 0.0004755143243520787 , 0.005860642609620605 , 0.008979633931394438 , 0.005061734169974005 , 0.003932710387086098 , 
0.0015489986106803626 , -0.0 ], [ 0.010998736164377534 , 0.009378969800902604 , 0.00030577045264713074 , 0.0159329353530375 , 0.014849508018911006 , -0.0026513365659554225 , 0.002923303082126996 , 0.01917908707828847 , -0.02338288107991566 , -0.05706674679291175 , 0.009526265752669624 , -0.19945255386401284 , -0.10725519695909647 , -0.3222906835083537 , -0.03857038318412844 , -0.013279804965996065 , -0.046626023244262085 , -0.029299060237210447 , -0.043269580558906555 , -0.03768510002290657 , -0.02255977771908117 , -0.02632588166863199 , -0.014417349488098566 , -0.003077271951572957 , -0.0004973277708010661 , 0.0003475839139671271 , -0.0014522783025903258 , -0.0 ], [ 0.012215315671616316 , -0.001693194176229889 , 0.011365785434529038 , 0.0036964574178487792 , -0.010126738168635003 , -0.025554378647710443 , 0.006538003839811914 , -0.03181759044467965 , -0.016424751042854728 , 0.06177539736110035 , -0.43801735323216856 , -0.29991040815937386 , -0.2516019795363623 , 0.037789523540809 , -0.010948746374759491 , -0.0633901687126727 , -0.005976006160777705 , 0.006035133605976937 , -0.04961632526071937 , -0.04142116972831476 , -0.07558952727782252 , -0.04165176179187153 , -0.02021603856619006 , -0.0027365663096057032 , -0.011145473712733575 , 0.0003566937349350848 , -0.00546472985268321 , -0.0 ], [ 0.008009386447317503 , 0.006831207743885825 , 0.0051306149795546365 , 0.016239014770865052 , 0.020925441734273218 , 0.028344800173195076 , -0.004805080609285047 , -0.01880521614501033 , -0.1272329010865855 , -0.39835936819190537 , -0.09113694760349819 , -0.04061591094832608 , -0.12677021961235907 , 0.015567707226741051 , -0.005615051546243333 , -0.06454044862001587 , 0.0195457674752272 , -0.04219686517155871 , -0.08060569979524296 , 0.027234494361702787 , -0.009152881336047056 , -0.030865118003992217 , -0.005770311060090559 , 0.002905833371986098 , 5.606663556872091e-05 , 0.003209538083839772 , -0.0018588810743365345 , -0.0 ], [ 0.007587008852984699 , -0.0021213639853557625 , 
0.0007709558092903736 , 0.013883256128746423 , 0.017328713012428214 , 0.03645357525636198 , -0.04043993335238427 , 0.05730125171252314 , -0.2563293727512057 , -0.11438826083879326 , 0.02662382809034687 , 0.03525271352483709 , 0.04745678120172762 , 0.0336360484090392 , -0.002916635707204059 , -0.17950855098650784 , -0.44161773297052964 , -0.4512180227831197 , -0.4940283106297913 , -0.1970108671285798 , 0.04344323143078066 , -0.012005120444897523 , 0.00987576109166055 , -0.0018336757466252476 , 0.0004913959502151706 , -0.0005409724034216215 , -0.005039223900868212 , -0.0 ], [ 0.00637876531169957 , 0.005189469227685454 , 0.0007676355246000376 , 0.018378100865097655 , 0.015739815031394887 , -0.035524983116512455 , 0.03781006978038308 , 0.28859052096740495 , 0.0726464110153121 , -0.026768468497420147 , 0.06278766200288134 , 0.17897045813699355 , -0.13780371920803108 , -0.14176458123649577 , -0.1733103177731656 , -0.3106508869296763 , 0.04788355140275794 , 0.04235327890285105 , -0.031266625292514394 , -0.016263819217960652 , -0.031388328800811355 , -0.01791363975905968 , -0.012025067979443894 , 0.008335083985905805 , -0.0014386677797296231 , 0.0055376544652972854 , 0.002241522815466253 , -0.0 ], [ 0.007455256326741617 , -0.0009475207572210404 , 0.0020288385162615286 , 0.015399640135796092 , 0.021133843188103074 , -0.019846405097622234 , -0.003162485751163173 , -0.14199005055318842 , -0.044200898667146035 , -0.013395459413208084 , 0.11019680479230103 , -0.014057216041764874 , -0.12553853334447865 , -0.05992513534766256 , 0.06467942189539834 , 0.08866056095907732 , -0.1451321508061849 , -0.07382491447758655 , -0.046961739981080476 , 0.0008943713493160624 , 0.03231044103656507 , 0.00036034241706501196 , -0.011387669277619417 , -0.00014602449257226195 , -0.0021863729003374116 , 0.0018817840156005856 , 0.0037909804578166286 , -0.0 ], [ 0.006511855618626698 , 0.006236866054439829 , -0.001440571166157676 , 0.012795776609942026 , 0.011530545030403624 , 0.03495489377257363 , 
0.04792403136095304 , 0.049378583599065225 , 0.03296101702085617 , -0.0005351385876652296 , 0.017744115897640366 , 0.0011656622496764954 , 0.0232845869823761 , -0.0561191397060232 , -0.02854070511118366 , -0.028614174047247348 , -0.007763531086362863 , 0.01823079560098924 , 0.021961392405283622 , -0.009666681805706179 , 0.009547046884328725 , -0.008729943263791338 , 0.006408909680578429 , 0.009794327096359952 , -0.0025825219195515304 , 0.007063559189211571 , 0.007867244119267047 , -0.0 ], [ 0.007936663546039311 , -0.00010710180170593153 , 0.002716512705673228 , 0.0038633557307721487 , -0.0014877316616940372 , -0.0004788143065635909 , 0.012508842248031202 , 0.0045381104608414645 , -0.010650910516128294 , -0.013785341529644855 , -0.034287643221318206 , -0.022152707546335495 , -0.047056481347685974 , -0.032166744564720455 , -0.021551611335278546 , -0.002174962503376043 , 0.024344287130424306 , 0.015579272560525105 , 0.010958169741952194 , -0.010607232913436921 , -0.005548369726118836 , -0.0014630046444242706 , 0.013144180105016433 , 0.0031349366359021916 , 0.0010984887428255974 , 0.005426941473328394 , 0.006566511860044785 , -0.0 ], [ 0.0005529184874606495 , 0.00026139355020588705 , -0.002887623443531047 , 0.0013988462990850632 , 0.00203365139495493 , -0.007276926701775218 , -0.004010419939595932 , 0.017521952161185662 , 0.0006996977433557911 , 0.02083134683611201 , 0.013690533534289498 , -0.005466724359976675 , -0.008857712321334327 , 0.017408578822635818 , 0.0076439343049154425 , 0.0017861314923539985 , 0.007465865707523924 , 0.008034420825988495 , 0.003976298558337994 , 0.00411970637898539 , -0.004572592545819698 , 0.0029563907011979935 , -0.0006382227820088148 , 0.0015153753877889707 , -0.0052626601797995595 , 0.0025664706985019416 , 0.005161751034260073 , -0.0 ], [ 0.0009424280561998445 , -0.0012942360298110595 , 0.0011900868416523343 , 0.000984424113178899 , 0.0020988269382781564 , -0.005870080062890889 , -0.004950484744457169 , 0.003117643454332697 , 
-0.002509563565777083 , 0.005831604884101081 , 0.009531085216183116 , 0.010030206821909806 , 0.005858190171099734 , 4.9344529936340524e-05 , -0.004027895832421331 , 0.0025436439920587606 , 0.00531153867563076 , 0.00495942692369508 , 0.009215148318606382 , 0.00010011928317543458 , 0.0060051362999805355 , -0.0008195376963202741 , 0.0041728603512658224 , -0.0017597169567888774 , -0.0010577007775543158 , 0.00046033327178068433 , -0.0007674196306044449 , -0.0 ], [ -0.0 , -0.0 , 0.0013386963856532302 , 0.00035183178922260837 , 0.0030610334903526204 , 8.951834979315781e-05 , 0.0023676793550483524 , -0.0002900551076915047 , -0.00207019445286608 , -7.61697478482574e-05 , 0.0012150086715244216 , 0.009831239281792168 , 0.003479667642621962 , 0.0070584324334114525 , 0.004161851261339585 , 0.0026146296354490665 , -9.194746959222099e-05 , 0.0013583866966571571 , 0.0016821551239318913 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 ]]]]} Deploy PyTorch Model with V1 gRPC Protocol \u00b6 Note : Since kserve has no grpc client methods for v1, we are using torchserve's grpc v1 client Create the InferenceService \u00b6 For deploying the InferenceService with gRPC protocol you need to expose the gRPC port on InferenceService. Here 7070 is torchserve gRPC port. Apply the following mnist_grpc.yaml to create the InferenceService . 
kubectl kubectl apply -f mnist_grpc.yaml Expected Output inferenceservice.serving.kserve.io/torchserve-grpc created New Schema Old Schema apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : \"torchserve-grpc\" spec : predictor : model : modelFormat : name : pytorch storageUri : gs://kfserving-examples/models/torchserve/image_classifier/v1 ports : - containerPort : 7070 name : h2c protocol : TCP apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : \"torchserve-grpc\" spec : predictor : pytorch : storageUri : gs://kfserving-examples/models/torchserve/image_classifier/v1 ports : - containerPort : 7070 name : h2c protocol : TCP Run Inference with TorchServe gRPC protocol \u00b6 Install gRPC python dependencies: pip install -U grpcio protobuf grpcio-tools Download TorchServe's inference and management proto: mkdir -p proto/v1 INFERENCE_PROTO_FILE_PATH = https://raw.githubusercontent.com/pytorch/serve/master/frontend/server/src/main/resources/proto/inference.proto MANAGEMENT_PROTO_FILE_PATH = https://raw.githubusercontent.com/pytorch/serve/master/frontend/server/src/main/resources/proto/management.proto curl -s -L ${ INFERENCE_PROTO_FILE_PATH } > ./proto/v1/inference.proto curl -s -L ${ MANAGEMENT_PROTO_FILE_PATH } > ./proto/v1/management.proto Generate python gRPC client stub using the proto files: python -m grpc_tools.protoc --proto_path = proto/v1/ --python_out = . --grpc_python_out = . proto/v1/inference.proto proto/v1/management.proto You can use image converter to convert the images to base64 byte array, for other models please refer to input request . Run gRPC Inference using torchserve_grpc_client.py with mnist.json as an example prediction input. 
MODEL_NAME = mnist INPUT_PATH = mnist.json SERVICE_HOSTNAME = $( kubectl get inferenceservice torchserve-grpc -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) python torchserve_grpc_client.py --api_name infer --model $MODEL_NAME --input_path $INPUT_PATH --hostname $SERVICE_HOSTNAME Expected Output { \"predictions\" : [ 2 ] } Deploy PyTorch model with Open Inference REST Protocol \u00b6 Create the InferenceService \u00b6 KServe by default selects the TorchServe runtime when you specify the model format pytorch on new model spec and enables the KServe v1 inference protocol. To enable v2 open inference protocol, specify the protocolVersion field with the value v2 . New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"torchserve-mnist-v2\" spec : predictor : model : modelFormat : name : pytorch protocolVersion : v2 storageUri : gs://kfserving-examples/models/torchserve/image_classifier/v2 apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"torchserve-mnist-v2\" spec : predictor : pytorch : protocolVersion : v2 storageUri : gs://kfserving-examples/models/torchserve/image_classifier/v2 For deploying the model on CPU, apply the mnist_v2.yaml to create the InferenceService . kubectl kubectl apply -f mnist_v2.yaml Expected Output $ inferenceservice.serving.kserve.io/torchserve-mnist-v2 created Model Inference \u00b6 The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT . MODEL_NAME = mnist SERVICE_HOSTNAME = $( kubectl get inferenceservice torchserve-mnist-v2 -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) You can send both byte array and tensor with v2 protocol, for byte array use image converter to convert the image to byte array input. Here we use the mnist_v2_bytes.json file to run an example inference. 
curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v2/models/ ${ MODEL_NAME } /infer -d @./mnist_v2_bytes.json Expected Output { \"id\" : \"d3b15cad-50a2-4eaf-80ce-8b0a428bd298\" , \"model_name\" : \"mnist\" , \"model_version\" : \"1.0\" , \"outputs\" : [{ \"name\" : \"predict\" , \"shape\" : [], \"datatype\" : \"INT64\" , \"data\" : [ 1 ]}]} For tensor input use the tensor image converter to convert the image to tensor input and here we use the mnist_v2.json file to run an example inference. curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v2/models/ ${ MODEL_NAME } /infer -d @./mnist_v2.json Expected Output { \"id\" : \"2266ec1e-f600-40af-97b5-7429b8195a80\" , \"model_name\" : \"mnist\" , \"model_version\" : \"1.0\" , \"outputs\" : [{ \"name\" : \"predict\" , \"shape\" : [], \"datatype\" : \"INT64\" , \"data\" : [ 1 ]}]} Model Explanation \u00b6 To get the model explanation with v2 explain endpoint: MODEL_NAME = mnist curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v2/models/mnist/explain -d @./mnist_v2.json Expected Output { \"id\" : \"d3b15cad-50a2-4eaf-80ce-8b0a428bd298\" , \"model_name\" : \"mnist\" , \"model_version\" : \"1.0\" , \"outputs\" : [{ \"name\" : \"explain\" , \"shape\" : [ 1 , 28 , 28 ], \"datatype\" : \"FP64\" , \"datae-05 , 0.01024804634283815 , 0.0009971135240970147 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , 0.0 , -0.0 , 0.0 , 0.0 , -0.0 , 0.0004501048968956462 , -0.0019630535686311007 , -0.0006664793297549408 , 0.0020157403539278907 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , 0.0 , -0.0 , -0.0022144569383238466 , 0.008361583574785395 , 0.00314019428604999 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0028943544591141838 , -0.0031301383432286406 , 
0.002113252872926688 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , 0.0 , -0.0 , -0.0 , -0.0010321050605717045 , 0.008905753926369048 , 0.002846438277738756 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , 0.0 , -0.005305288883499087 , -0.00192711009725932 , 0.0012090042768467344 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0011945156500241256 , 0.005654442715832439 , 0.0020132075345016807 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , 0.0 , -0.0014689356969061985 , 0.0010743412638183228 , 0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , -0.0 , 0.0 , -0.0 , -0.0 , -0.0017047980586912376 , 0.00290660517425009 , -0.0007805869640505143 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , 0.0 , -0.0 , -0.0 , 5.541 * Co nne c t io n # 0 t o hos t localhos t le ft i nta c t 725422148614e-05 , 0.0014516114512869852 , 0.0002827701966546988 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0014401407633627265 , 0.0023812497776698745 , 0.002146825301700187 , -0.0 , -0.0 , 0.0 , -0.0 , 0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , 0.0011500529125940918 , 0.0002865015572973405 , 0.0029798151042282686 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0017750295500283872 , 0.0008339859126060243 , -0.00377073933577687 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , -0.0 , 0.0 , 0.0 , -0.0006093176894575109 , -0.00046905787892409935 , 0.0034053218511795034 , 0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , 0.0 , -0.0 , -0.0 , -0.0007450011768391558 , 0.001298767372877851 , -0.008499247640112315 , -6.145166131400234e-05 , -0.0 , -0.0 , -0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , 0.0 , -0.0 , 0.0 , 0.0011809726042792137 , -0.001838476328106708 , 0.00541110661116898 , 0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , -0.0 , 0.0 , -0.002139234224224006 , 0.0003259163407641124 , -0.005276118873855287 , -0.001950984007438105 , -9.545670742026532e-07 , 0.0 , -0.0 , 
0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0007772404228681039 , -0.0001517956264720738 , 0.0064814848131711815 , -0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , 8.098064985902114e-05 , -0.00249042660692983 , -0.0020718619200672302 , -5.341117902942147e-05 , -0.00045564724429915073 , 0.0 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0 , 0.0022750983476959733 , 0.0017164060958460778 , 0.0003221344707738082 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0015560282678744543 , 9.107238495871273e-05 , 0.0008772841497928399 , 0.0006502978626355868 , -0.004128780767525651 , 0.0006030386900152659 , 0.0 , -0.0 , 0.0 , -0.0 , -0.0 , 0.0 , 0.0 , -0.0 , -0.0 , 0.0 , -0.0 , -0.0 , 0.0 , 0.0 , 0.001395995791096219 , 0.0026791526689584344 , 0.0023995008266391488 , -0.0004496096312746451 , 0.003101832450753724 , 0.007494536066960778 , 0.0028641187148287965 , -0.0030525907182629075 , 0.003420222396518567 , 0.0014924018363498125 , -0.0009357388301326025 , 0.0007856228933169799 , -0.0018433973914981437 , 1.6031856831240914e-05 , 0.0 , 0.0 , -0.0 , -0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , 0.0 , -0.0006999018502034005 , 0.004382250870697946 , -0.0035419313267119365 , -0.0028896748092595375 , -0.00048734542493666705 , -0.0060873452419295 , 0.000388224990424471 , 0.002533641537585585 , -0.004352836563597573 , -0.0006079418766875505 , -0.0038101334053377753 , -0.000828441340357984 , 0.0 , -0.0 , 0.0 , 0.0 , -0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , 0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , 0.0010901530866342661 , -0.013135008038845744 , 0.0004734518707654666 , 0.002050423283568135 , -0.006609451922460863 , 0.0023647861820124366 , 0.0046789204256194 , -0.0018122527412311837 , 0.002137538353955849 , 0.0 , -0.0 , -0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , 0.0 , -0.0 , -0.0 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , 0.0 , -0.0 , -0.0 , 
0.0 , 0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , 0.0 , 0.0 , -0.0 , -0.0 , 0.0 , -0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , 0.0 , -0.0 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , 0.0 , 0.0 , -0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , -0.0 , 0.0 , -0.0 , -0.0 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 ]}]} Deploy PyTorch Model with Open Inference gRPC Protocol \u00b6 Create the InferenceService \u00b6 For deploying the InferenceService with Open Inference gRPC Protocol you need to expose the gRPC port on InferenceService. Here 8081 is kserve gRPC port. Apply the following mnist_grpc_v2.yaml to create the InferenceService . kubectl kubectl apply -f mnist_grpc_v2.yaml Expected Output $inferenceservice .serving.kserve.io/torchserve-grpc-v2 created New Schema Old Schema apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : \"torchserve-grpc-v2\" spec : predictor : model : modelFormat : name : pytorch protocolVersion : grpc-v2 storageUri : gs://kfserving-examples/models/torchserve/image_classifier/v2 ports : - containerPort : 8081 name : h2c protocol : TCP apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : \"torchserve-grpc-v2\" spec : predictor : pytorch : protocolVersion : grpc-v2 storageUri : gs://kfserving-examples/models/torchserve/image_classifier/v2 ports : - containerPort : 8081 name : h2c protocol : TCP Run gRPC Inference \u00b6 The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT . 
Then download Open Inference gRPC proto file: mkdir -p proto/v2 PROTO_FILE_PATH = https://raw.githubusercontent.com/kserve/kserve/master/python/kserve/kserve/protocol/grpc/grpc_predict_v2.proto curl -s -L ${ PROTO_FILE_PATH } > ./proto/v2/grpc_predict_v2.proto Run the inference test with grpcurl: INPUT_PATH = ./mnist_v2_grpc_tensor.json PROTO_FILE = proto/v2/grpc_predict_v2.proto SERVICE_HOSTNAME = $( kubectl get inferenceservice torchserve-grpc-v2 -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) grpcurl -v -plaintext -proto ${ PROTO_FILE } -authority ${ SERVICE_HOSTNAME } -d @ ${ INGRESS_HOST } : ${ INGRESS_PORT } inference.GRPCInferenceService.ModelInfer <<< $( cat \" $INPUT_PATH \" ) Expected Output Resolved method descriptor: // The ModelInfer API performs inference using the specified model. Errors are // indicated by the google.rpc.Status returned for the request. The OK code // indicates success and other codes indicate failure. rpc ModelInfer ( .inference.ModelInferRequest ) returns ( .inference.ModelInferResponse ) ; Request metadata to send: ( empty ) Response headers received: content-type: application/grpc date: Wed, 11 Oct 2023 13 :36:30 GMT grpc-accept-encoding: identity, deflate, gzip server: istio-envoy x-envoy-upstream-service-time: 581 Response contents: { \"modelName\" : \"mnist\" , \"id\" : \"d3b15cad-50a2-4eaf-80ce-8b0a428bd298\" , \"outputs\" : [ { \"name\" : \"input-0\" , \"datatype\" : \"INT64\" , \"shape\" : [ \"1\" ] , \"contents\" : { \"int64Contents\" : [ \"1\" ] } } ] } Response trailers received: ( empty ) Sent 1 request and received 1 response Autoscaling \u00b6 One of the main serverless inference features is to automatically scale the replicas of an InferenceService matching the incoming workload. KServe by default enables Knative Pod Autoscaler which watches traffic flow and scales up and down based on the configured metrics. 
Knative Autoscaler \u00b6 KServe supports the implementation of Knative Pod Autoscaler (KPA) and Kubernetes\u2019 Horizontal Pod Autoscaler (HPA) . The features and limitations of each of these Autoscalers are listed below. Note If you want to use Kubernetes Horizontal Pod Autoscaler (HPA), you must install HPA extension Knative Pod Autoscaler (KPA) Part of the Knative Serving core and enabled by default once Knative Serving is installed. Supports scale to zero functionality. Does not support CPU-based autoscaling. Horizontal Pod Autoscaler (HPA) Not part of the Knative Serving core, and must be enabled after Knative Serving installation. Does not support scale to zero functionality. Supports CPU-based autoscaling. Create InferenceService with Concurrency Target \u00b6 Hard/Soft Autoscaling Limit \u00b6 You can configure InferenceService with annotation autoscaling.knative.dev/target for a soft limit. The soft limit is a targeted limit rather than a strictly enforced bound, particularly if there is a sudden burst of requests, this value can be exceeded. New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"torchserve\" annotations : autoscaling.knative.dev/target : \"10\" spec : predictor : model : modelFormat : name : pytorch storageUri : \"gs://kfserving-examples/models/torchserve/image_classifier/v1\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"torchserve\" annotations : autoscaling.knative.dev/target : \"10\" spec : predictor : pytorch : storageUri : \"gs://kfserving-examples/models/torchserve/image_classifier/v1\" You can also configure InferenceService with field containerConcurrency with a hard limit. The hard limit is an enforced upper bound. If concurrency reaches the hard limit, surplus requests will be buffered and must wait until enough capacity is free to execute the requests. 
New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"torchserve\" spec : predictor : containerConcurrency : 10 model : modelFormat : name : pytorch storageUri : \"gs://kfserving-examples/models/torchserve/image_classifier/v1\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"torchserve\" spec : predictor : containerConcurrency : 10 pytorch : storageUri : \"gs://kfserving-examples/models/torchserve/image_classifier/v1\" After specifying the soft or hard limits of the scaling target, you can now deploy the InferenceService with autoscaling.yaml . kubectl kubectl apply -f autoscaling.yaml Expected Output $ inferenceservice.serving.kserve.io/torchserve created Run Inference with Concurrent Requests \u00b6 The first step is to install the hey load generator and then send the concurrent requests to the InferenceService . go get -u github.com/rakyll/hey MODEL_NAME = mnist SERVICE_HOSTNAME = $( kubectl get inferenceservice torchserve -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) hey -m POST -z 30s -D ./mnist.json -host ${ SERVICE_HOSTNAME } http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ ${ MODEL_NAME } :predict Check Pod Autoscaling \u00b6 hey by default generates 50 requests concurrently, so you can see that the InferenceService scales to 5 pods as the container concurrency target is set to 10. 
Expected Output NAME READY STATUS RESTARTS AGE torchserve-predictor-default-cj2d8-deployment-69444c9c74-67qwb 2 /2 Terminating 0 103s torchserve-predictor-default-cj2d8-deployment-69444c9c74-nnxk8 2 /2 Terminating 0 95s torchserve-predictor-default-cj2d8-deployment-69444c9c74-rq8jq 2 /2 Running 0 50m torchserve-predictor-default-cj2d8-deployment-69444c9c74-tsrwr 2 /2 Running 0 113s torchserve-predictor-default-cj2d8-deployment-69444c9c74-vvpjl 2 /2 Running 0 109s torchserve-predictor-default-cj2d8-deployment-69444c9c74-xvn7t 2 /2 Terminating 0 103s Canary Rollout \u00b6 Canary rollout is a deployment strategy when you release a new version of model to a small percent of the production traffic. Create InferenceService with Canary Model \u00b6 After the above experiments, now let's see how you can rollout a new model without moving full traffic to the new model by default. New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"torchserve\" annotations : serving.kserve.io/enable-tag-routing : \"true\" spec : predictor : canaryTrafficPercent : 20 model : modelFormat : name : pytorch storageUri : \"gs://kfserving-examples/models/torchserve/image_classifier/v2\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"torchserve\" annotations : serving.kserve.io/enable-tag-routing : \"true\" spec : predictor : canaryTrafficPercent : 20 pytorch : storageUri : \"gs://kfserving-examples/models/torchserve/image_classifier/v2\" In this example we change the storageUri to the v2 version with canaryTrafficPercent field and then apply the canary.yaml . 
kubectl kubectl apply -f canary.yaml Expected Output kubectl get revisions -l serving.kserve.io/inferenceservice = torchserve NAME CONFIG NAME K8S SERVICE NAME GENERATION READY REASON ACTUAL REPLICAS DESIRED REPLICAS torchserve-predictor-default-00001 torchserve-predictor-default 1 True 1 1 torchserve-predictor-default-00002 torchserve-predictor-default 2 True 1 1 kubectl get pods -l serving.kserve.io/inferenceservice = torchserve NAME READY STATUS RESTARTS AGE torchserve-predictor-default-00001-deployment-7d99979c99-p49gk 2 /2 Running 0 28m torchserve-predictor-default-00002-deployment-c6fcc65dd-rjknq 2 /2 Running 0 3m37s Check Traffic Status \u00b6 After the canary model is rolled out, the traffic should be split between the canary model revision and the \"stable\" revision which was rolled out with 100% percent traffic, now check the traffic split from the InferenceService traffic status: kubectl get isvc torchserve -ojsonpath = '{.status.components}' Expected Output { \"predictor\" : { \"address\" : { \"url\" : \"http://torchserve-predictor-default.default.svc.cluster.local\" }, \"latestCreatedRevision\" : \"torchserve-predictor-default-00002\" , \"latestReadyRevision\" : \"torchserve-predictor-default-00002\" , \"latestRolledoutRevision\" : \"torchserve-predictor-default-00001\" , \"traffic\" : [ { \"latestRevision\" : true , \"percent\" : 20 , \"revisionName\" : \"torchserve-predictor-default-00002\" , \"tag\" : \"latest\" , \"url\" : \"http://latest-torchserve-predictor-default.default.example.com\" }, { \"latestRevision\" : false , \"percent\" : 80 , \"revisionName\" : \"torchserve-predictor-default-00001\" , \"tag\" : \"prev\" , \"url\" : \"http://prev-torchserve-predictor-default.default.example.com\" } ], \"url\" : \"http://torchserve-predictor-default.default.example.com\" } } Traffic Rollout \u00b6 Run the following curl requests a few times to the InferenceService , you can see that requests are sent to the two revisions with 20/80 splits. 
MODEL_NAME = mnist SERVICE_HOSTNAME = $( kubectl get inferenceservice torchserve -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) for i in { 1 ..10 } ; do curl -H \"Host: ${ SERVICE_HOSTNAME } \" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ ${ MODEL_NAME } :predict -d @./mnist.json ; done Expected Output { \"predictions\" : [ 2 ]} Handling connection for 8080 { \"predictions\" : [ 2 ]} Handling connection for 8080 { \"predictions\" : [ 2 ]} Handling connection for 8080 500: Internal Server Error 500: Internal Server ErrorHandling connection for 8080 500: Internal Server Error 500: Internal Server ErrorHandling connection for 8080 { \"predictions\" : [ 2 ]} Handling connection for 8080 { \"predictions\" : [ 2 ]} Handling connection for 8080 { \"predictions\" : [ 2 ]} Handling connection for 8080 { \"predictions\" : [ 2 ]} Handling connection for 8080 You can see that a request fails when it hits the canary revision. This is because the new revision requires the v2 inference input mnist_v2.json , which is a breaking change. In addition, the traffic is randomly split between the two revisions according to the specified traffic percentage. In this case you should roll out the canary model with canaryTrafficPercent set to 0 and use the latest tagged URL to test the canary model before moving the full traffic to the new model. 
kubectl kubectl patch isvc torchserve --type = 'json' -p '[{\"op\": \"replace\", \"path\": \"/spec/predictor/canaryTrafficPercent\", \"value\": 0}]' curl -v -H \"Host: latest-torchserve-predictor-default.default.example.com\" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ ${ MODEL_NAME } :predict -d @./mnist.json Expected Output { \"id\" : \"d3b15cad-50a2-4eaf-80ce-8b0a428bd298\" , \"model_name\" : \"mnist\" , \"model_version\" : \"1.0\" , \"outputs\" : [{ \"name\" : \"predict\" , \"shape\" : [ 1 ], \"datatype\" : \"INT64\" , \"data\" : [ 1 ]}]} After the new model is tested and verified, you can now bump the canaryTrafficPercent to 100 to fully rollout the traffic to the new revision and now the latestRolledoutRevision becomes torchserve-predictor-default-00002 and previousRolledoutRevision becomes torchserve-predictor-default-00001 . kubectl kubectl patch isvc torchserve --type = 'json' -p '[{\"op\": \"replace\", \"path\": \"/spec/predictor/canaryTrafficPercent\", \"value\": 100}]' Check the traffic status: kubectl get isvc torchserve -ojsonpath = '{.status.components}' Expected Output { \"predictor\" : { \"address\" : { \"url\" : \"http://torchserve-predictor-default.default.svc.cluster.local\" }, \"latestCreatedRevision\" : \"torchserve-predictor-default-00002\" , \"latestReadyRevision\" : \"torchserve-predictor-default-00002\" , \"latestRolledoutRevision\" : \"torchserve-predictor-default-00002\" , \"previousRolledoutRevision\" : \"torchserve-predictor-default-00001\" , \"traffic\" : [ { \"latestRevision\" : true , \"percent\" : 100 , \"revisionName\" : \"torchserve-predictor-default-00002\" , \"tag\" : \"latest\" , \"url\" : \"http://latest-torchserve-predictor-default.default.example.com\" }, ], \"url\" : \"http://torchserve-predictor-default.default.example.com\" } } Rollback the Model \u00b6 In case the new model version does not work after the traffic is moved to the new revision, you can still patch the 
canaryTrafficPercent to 0 and move the traffic back to the previously rolled model which is torchserve-predictor-default-00001 . kubectl kubectl patch isvc torchserve --type = 'json' -p '[{\"op\": \"replace\", \"path\": \"/spec/predictor/canaryTrafficPercent\", \"value\": 0}]' Check the traffic status: kubectl get isvc torchserve -ojsonpath = '{.status.components}' Expected Output { \"predictor\" : { \"address\" : { \"url\" : \"http://torchserve-predictor-default.default.svc.cluster.local\" }, \"latestCreatedRevision\" : \"torchserve-predictor-default-00002\" , \"latestReadyRevision\" : \"torchserve-predictor-default-00002\" , \"latestRolledoutRevision\" : \"torchserve-predictor-default-00001\" , \"previousRolledoutRevision\" : \"torchserve-predictor-default-00001\" , \"traffic\" : [ { \"latestRevision\" : true , \"percent\" : 0 , \"revisionName\" : \"torchserve-predictor-default-00002\" , \"tag\" : \"latest\" , \"url\" : \"http://latest-torchserve-predictor-default.default.example.com\" }, { \"latestRevision\" : false , \"percent\" : 100 , \"revisionName\" : \"torchserve-predictor-default-00001\" , \"tag\" : \"prev\" , \"url\" : \"http://prev-torchserve-predictor-default.default.example.com\" } ], \"url\" : \"http://torchserve-predictor-default.default.example.com\" } } Monitoring \u00b6 Metrics Exposure and Grafana Dashboard Setup","title":"PyTorch"},{"location":"modelserving/v1beta1/torchserve/#deploy-a-pytorch-model-with-torchserve-inferenceservice","text":"In this example, we deploy a trained PyTorch MNIST model to predict handwritten digits by running an InferenceService with TorchServe runtime which is the default installed serving runtime for PyTorch models. Model interpretability is also an important aspect which helps to understand which of the input features were important for a particular classification. Captum is a model interpretability library. 
In this example, the TorchServe explain endpoint is implemented with Captum's state-of-the-art algorithms, including integrated gradients, to provide users with an easy way to understand which features are contributing to the model output. You can refer to the Captum Tutorial for more examples.","title":"Deploy a PyTorch Model with TorchServe InferenceService"},{"location":"modelserving/v1beta1/torchserve/#create-model-storage-with-a-model-archive-file-and-config","text":"The KServe/TorchServe integration expects the following model store layout. \u251c\u2500\u2500 config \u2502 \u251c\u2500\u2500 config.properties \u251c\u2500\u2500 model-store \u2502 \u251c\u2500\u2500 densenet_161.mar \u2502 \u251c\u2500\u2500 mnist.mar TorchServe provides a utility to package all the model artifacts into a single TorchServe Model Archive File (MAR) . After the model artifacts are packaged into a MAR file, you then upload it to the model-store under the model storage path. You can store your model and dependent files on remote storage or a local persistent volume. The MNIST model and dependent files can be obtained from here . Note For remote storage you can choose to start the example using the prebuilt MNIST MAR file stored on the KServe example GCS bucket gs://kfserving-examples/models/torchserve/image_classifier , or generate the MAR file with torch-model-archiver and create the model store on remote storage according to the above layout. torch-model-archiver --model-name mnist --version 1 .0 \\ --model-file model-archiver/model-store/mnist/mnist.py \\ --serialized-file model-archiver/model-store/mnist/mnist_cnn.pt \\ --handler model-archiver/model-store/mnist/mnist_handler.py \\ For PVC users, please refer to model archive file generation for auto generation of MAR files with the model and dependent files. TorchServe uses a config.properties file to store configuration. Please see here for more details on the properties supported by the configuration file. 
The following is a sample file for KServe: inference_address=http://0.0.0.0:8085 management_address=http://0.0.0.0:8085 metrics_address=http://0.0.0.0:8082 grpc_inference_port=7070 grpc_management_port=7071 enable_metrics_api=true metrics_format=prometheus number_of_netty_threads=4 job_queue_size=10 enable_envvars_config=true install_py_dep_per_model=true model_store=/mnt/models/model-store model_snapshot={\"name\":\"startup.cfg\",\"modelCount\":1,\"models\":{\"mnist\":{\"1.0\":{\"defaultVersion\":true,\"marName\":\"mnist.mar\",\"minWorkers\":1,\"maxWorkers\":5,\"batchSize\":1,\"maxBatchDelay\":10,\"responseTimeout\":120}}}} The KServe/TorchServe integration supports the KServe v1/v2 REST protocols. In the config.properties , we need to turn on the flag enable_envvars_config to enable setting the KServe envelope using an environment variable. Warning The previous service_envelope property has been deprecated; in the config.properties file, use the flag enable_envvars_config=true to enable setting the service envelope at runtime. The requests are converted from KServe inference request format to TorchServe request format and sent to the inference_address configured via local socket.","title":"Create Model Storage with a Model Archive File and Config"},{"location":"modelserving/v1beta1/torchserve/#deploy-pytorch-model-with-v1-rest-protocol","text":"","title":"Deploy PyTorch Model with V1 REST Protocol"},{"location":"modelserving/v1beta1/torchserve/#create-the-torchserve-inferenceservice","text":"KServe by default selects the TorchServe runtime when you specify the model format pytorch on new model spec. 
New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"torchserve\" spec : predictor : model : modelFormat : name : pytorch storageUri : gs://kfserving-examples/models/torchserve/image_classifier/v1 apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"torchserve\" spec : predictor : pytorch : storageUri : gs://kfserving-examples/models/torchserve/image_classifier/v1 For deploying the model on CPU, apply the following torchserve.yaml to create the InferenceService . kubectl kubectl apply -f torchserve.yaml New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"torchserve\" spec : predictor : model : modelFormat : name : pytorch storageUri : gs://kfserving-examples/models/torchserve/image_classifier/v1 resources : limits : memory : 4Gi nvidia.com/gpu : \"1\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"torchserve\" spec : predictor : pytorch : storageUri : gs://kfserving-examples/models/torchserve/image_classifier/v1 resources : limits : memory : 4Gi nvidia.com/gpu : \"1\" For deploying the model on GPU, apply the gpu.yaml to create the GPU InferenceService . kubectl kubectl apply -f gpu.yaml Expected Output $ inferenceservice.serving.kserve.io/torchserve created","title":"Create the TorchServe InferenceService"},{"location":"modelserving/v1beta1/torchserve/#model-inference","text":"The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT . MODEL_NAME = mnist SERVICE_HOSTNAME = $( kubectl get inferenceservice torchserve -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) You can use image converter to convert the images to base64 byte array, for other models please refer to input request . 
curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ ${ MODEL_NAME } :predict -d @./mnist.json Expected Output * Trying 52 .89.19.61... * Connected to a881f5a8c676a41edbccdb0a394a80d6-2069247558.us-west-2.elb.amazonaws.com ( 52 .89.19.61 ) port 80 ( #0) > PUT /v1/models/mnist HTTP/1.1 > Host: torchserve.kserve-test.example.com > User-Agent: curl/7.47.0 > Accept: */* > Content-Length: 167 > Expect: 100 -continue > < HTTP/1.1 100 Continue * We are completely uploaded and fine < HTTP/1.1 200 OK < cache-control: no-cache ; no-store, must-revalidate, private < content-length: 1 < date: Tue, 27 Oct 2020 08 :26:19 GMT < expires: Thu, 01 Jan 1970 00 :00:00 UTC < pragma: no-cache < x-request-id: b10cfc9f-cd0f-4cda-9c6c-194c2cdaa517 < x-envoy-upstream-service-time: 6 < server: istio-envoy < * Connection #0 to host a881f5a8c676a41edbccdb0a394a80d6-2069247558.us-west-2.elb.amazonaws.com left intact { \"predictions\" : [ \"2\" ]}","title":"Model Inference"},{"location":"modelserving/v1beta1/torchserve/#model-explanation","text":"To get model explanation: curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/mnist:explain -d @./mnist.json Expected Output { \"explanations\" : [[[[ 0.0005394675730469475 , -0.0022280013123036043 , -0.003416480100841055 , -0.0051329881112415965 , -0.009973864160829985 , -0.004112560908882716 , -0.009223458030656112 , -0.0006676354577291628 , -0.005249806664413386 , -0.0009790519227372953 , -0.0026914653993121195 , -0.0069470097151383995 , -0.00693530415962956 , -0.005973878697847718 , -0.00425042437288857 , 0.0032867281838150977 , -0.004297780258633562 , -0.005643196661192014 , -0.00653025019738562 , -0.0047062916121001185 , -0.0018656628277792628 , -0.0016757477204072532 , -0.0010410417081844845 , -0.0019093520822156726 , -0.004451403461006374 , -0.0008552767257773671 , 
-0.0027638888169885267 , -0.0 ], [ 0.006971297052106784 , 0.007316855222185687 , 0.012144494329150574 , 0.011477799383288441 , 0.006846725347670252 , 0.01149386176451476 , 0.0045351987881190655 , 0.007038361889638708 , 0.0035855377023272157 , 0.003031419502053957 , -0.0008611575226775316 , -0.0011085224745969223 , -0.0050840743637658534 , 0.009855491784340777 , 0.007220680811043034 , 0.011374285598070253 , 0.007147725481709019 , 0.0037114580912849457 , 0.00030763245479291384 , 0.0018305492665953394 , 0.010106224395114147 , 0.012932881164284687 , 0.008862892007714321 , 0.0070960526615982435 , -0.0015931137903787505 , 0.0036495747329455906 , 0.0002593849391051298 , -0.0 ], [ 0.006467265785857396 , -0.00041793201228071674 , 0.004900316089756856 , 0.002308395474823997 , 0.007859295399592283 , 0.003916404948969494 , 0.005630750246437249 , 0.0043712538044184375 , 0.006128530599133763 , -0.009446321309831246 , -0.014173645867037036 , -0.0062988650915794565 , -0.011473838941118539 , -0.009049151947644047 , -0.0007625645864610934 , -0.013721416630061238 , -0.0005580156670410108 , 0.0033404383756480784 , -0.006693278798487951 , -0.003705084551144756 , 0.005100375089529131 , 5.5276874714401074e-05 , 0.007221745280359063 , -0.00573598303916232 , -0.006836169033785967 , 0.0025401608627538936 , 9.303533912921196e-05 , -0.0 ], [ 0.005914399808621816 , 0.00452643561023696 , 0.003968242261515448 , 0.010422786058967673 , 0.007728358107899074 , 0.01147115923288383 , 0.005683869479056691 , 0.011150670502307374 , 0.008742555292485278 , 0.0032882897575743754 , 0.014841138421861584 , 0.011741228362482451 , 0.0004296862879259221 , -0.0035118140680654854 , -0.006152254410078331 , -0.004925121936901983 , -2.3611205202801947e-06 , 0.029347073037039074 , 0.02901626308947743 , 0.023379353021343398 , 0.004027157620197582 , -0.01677662249919171 , -0.013497255736128979 , 0.006957482854214602 , 0.0018321766800746145 , 0.008277034396684563 , 0.002733405455464871 , -0.0 ], [ 0.0049579739156640065 , 
-0.002168016158233997 , 0.0020644317321723642 , 0.0020912464240293825 , 0.004719691119907336 , 0.007879231202446626 , 0.010594445898145937 , 0.006533067778982801 , 0.002290214592708113 , -0.0036651114968251986 , 0.010753227423379443 , 0.006402706020466243 , -0.047075193909339695 , -0.08108259303568185 , -0.07646875196692542 , -0.1681834845371156 , -0.1610307396135756 , -0.12010309927453829 , -0.016148831320070896 , -0.009541525999486027 , 0.04575604594761406 , 0.031470966329886635 , 0.02452149438024385 , 0.016594078577569567 , 0.012213591301610382 , -0.002230875840404426 , 0.0036704051254298374 , -0.0 ], [ 0.006410107592414739 , 0.005578283890924384 , 0.001977103461731095 , 0.008935476507124939 , 0.0011305055729953436 , 0.0004946313900665659 , -0.0040266029554395935 , -0.004270765544167256 , -0.010832150944943138 , -0.01653511868336456 , -0.011121302103373972 , -0.42038514526905024 , -0.22874576003118394 , -0.16752936178907055 , -0.17021699697722079 , -0.09998584936787697 , -0.09041117495322142 , -0.10230248444795721 , -0.15260897522094888 , 0.07770835838531896 , -0.0813761125123066 , 0.027556910053932963 , 0.036305965104261866 , 0.03407793793894619 , 0.01212761779302579 , 0.006695133380685627 , 0.005331392748588556 , -0.0 ], [ 0.008342680065996267 , -0.00029249776150416367 , 0.002782130291086583 , 0.0027793744856745373 , 0.0020525102690845407 , 0.003679269934110004 , 0.009373846012918791 , -0.0031751745946300403 , -0.009042846256743316 , 0.0074141593032070775 , -0.02796812516561052 , -0.593171583786029 , -0.4830164472795136 , -0.353860128479443 , -0.256482708704862 , 0.11515586314578445 , 0.12700563162828346 , 0.0022342450630152204 , -0.24673707669992118 , -0.012878340813781437 , 0.16866821780196756 , 0.009739033161051434 , -0.000827843726513152 , -0.0002137320694585577 , -0.004179480126338929 , 0.008454049232317358 , -0.002767934266266998 , -0.0 ], [ 0.007070382982749552 , 0.005342127805750565 , -0.000983984198542354 , 0.007910101170274493 , 0.001266267696096404 
, 0.0038575136843053844 , 0.006941130321773131 , -0.015195182020687892 , -0.016954974010578504 , -0.031186444096787943 , -0.031754626467747966 , 0.038918845112017694 , 0.06248943950328597 , 0.07703301092601872 , 0.0438493628024275 , -0.0482404449771698 , -0.08718650815999045 , -0.0014764704694506415 , -0.07426336448916614 , -0.10378029666564882 , 0.008572087846793842 , -0.00017173413848283343 , 0.010058893270893113 , 0.0028410498666004377 , 0.002008290211806285 , 0.011905375389931099 , 0.006071375802943992 , -0.0 ], [ 0.0076080165949142685 , -0.0017127333725310495 , 0.00153128150106188 , 0.0033391793764531563 , 0.005373442509691564 , 0.007207746020295443 , 0.007422946703693544 , -0.00699779191449194 , 0.002395328253696969 , -0.011682618874195954 , -0.012737004464649057 , -0.05379966383523857 , -0.07174960461749053 , -0.03027341304050314 , 0.0019411862216381327 , -0.0205575129473766 , -0.04617091711614171 , -0.017655308106959804 , -0.009297162816368814 , -0.03358572117988279 , -0.1626068444778013 , -0.015874364762085157 , -0.0013736074085577258 , -0.014763439328689378 , 0.00631805792697278 , 0.0021769414283267273 , 0.0023061635006792498 , -0.0 ], [ 0.005569931813561535 , 0.004363218328087518 , 0.00025609463218383973 , 0.009577483244680675 , 0.007257755916229399 , 0.00976284778532342 , -0.006388840235419147 , -0.009017880790555707 , -0.015308709334434867 , -0.016743935775597355 , -0.04372596546189275 , -0.03523469356755156 , -0.017257810114846107 , 0.011960489902313411 , 0.01529079831828911 , -0.020076559119468443 , -0.042792547669901516 , -0.0029492027218867116 , -0.011109560582516062 , -0.12985858077848939 , -0.2262858575494602 , -0.003391725540087574 , -0.03063368684328981 , -0.01353486587575121 , 0.0011140822443932317 , 0.006583451102528798 , 0.005667533945285076 , -0.0 ], [ 0.004056272267155598 , -0.0006394041203204911 , 0.004664893926197093 , 0.010593032387298614 , 0.014750931538689989 , 0.015428721146282149 , 0.012167820222401367 , 0.017604752451202518 , 
0.01038886849969188 , 0.020544326931163263 , -0.0004206566917812794 , -0.0037463581359232674 , -0.0024656693040735075 , 0.0026061897697624353 , -0.05186055271869177 , -0.09158655048397382 , 0.022976389912563913 , -0.19851635458461808 , -0.11801281807622972 , -0.29127727790584423 , -0.017138655663803876 , -0.04395515676468641 , -0.019241432506341576 , 0.0011342298743447392 , 0.0030625771422964584 , -0.0002867924892991192 , -0.0017908808807543712 , -0.0 ], [ 0.0030114260660488892 , 0.0020246448273580006 , -0.003293361220376816 , 0.0036965043883218584 , 0.00013185761728146236 , -0.004355610866966878 , -0.006432601921104354 , -0.004148701459814858 , 0.005974553907915845 , -0.0001399233607281906 , 0.010392944122965082 , 0.015693249298693028 , 0.0459528427528407 , -0.013921539948093455 , -0.06615556518538708 , 0.02921438991320325 , -0.16345220625101778 , -0.002130491295590408 , -0.11449749664916867 , -0.030980255589300607 , -0.04804122537359171 , -0.05144994776295644 , 0.005122827412776085 , 0.006464862173908011 , 0.008624278272940246 , 0.0037316228508156427 , 0.0036947794337026706 , -0.0 ], [ 0.0038173843228389405 , -0.0017091931226819494 , -0.0030871869816778068 , 0.002115642501535999 , -0.006926441921580917 , -0.003023077828426468 , -0.014451359520861637 , -0.0020793048380231397 , -0.010948003939342523 , -0.0014460716966395166 , -0.01656990336897737 , 0.003052317148320358 , -0.0026729564809943513 , -0.06360067057346147 , 0.07780985635080599 , -0.1436689936630281 , -0.040817177623437874 , -0.04373367754296477 , -0.18337299150349698 , 0.025295182977407064 , -0.03874921104331938 , -0.002353901742617205 , 0.011772560401335033 , 0.012480994515707569 , 0.006498422579824301 , 0.00632320984076023 , 0.003407169765754805 , -0.0 ], [ 0.00944355257990139 , 0.009242583578688485 , 0.005069860444386138 , 0.012666191449103024 , 0.00941789912565746 , 0.004720427012836104 , 0.007597687789204113 , 0.008679266528089945 , 0.00889322771021875 , -0.0008577904940828809 , 
0.0022973860384607604 , 0.025328230809207493 , -0.09908781123080951 , -0.07836626399832172 , -0.1546141264726177 , -0.2582207272050766 , -0.2297524599578219 , -0.29561835103416967 , 0.12048787956671528 , -0.06279365699861471 , -0.03832012404275233 , 0.022910264999199934 , 0.005803508497672737 , -0.003858461926053348 , 0.0039451232171312765 , 0.003858476747495933 , 0.0013034515558609956 , -0.0 ], [ 0.009725756015628606 , -0.0004001101998876524 , 0.006490722835571152 , 0.00800808023631959 , 0.0065880711806331265 , -0.0010264326176194034 , -0.0018914305972878344 , -0.008822522194658438 , -0.016650520788128117 , -0.03254382594389507 , -0.014795713101569494 , -0.05826499837818885 , -0.05165369567511702 , -0.13384277337594377 , -0.22572641373340493 , -0.21584739544668635 , -0.2366836351939208 , 0.14937824076489659 , -0.08127414932170171 , -0.06720440139736879 , -0.0038552732903526744 , 0.0107597891707803 , -5.67453590118174e-05 , 0.0020161340511396244 , -0.000783322694907436 , -0.0006397207517995289 , -0.005291639205010064 , -0.0 ], [ 0.008627543242777584 , 0.007700097300051849 , 0.0020430960246806138 , 0.012949015733198586 , 0.008428709579953574 , 0.001358177022953576 , 0.00421863939925833 , 0.002657580000868709 , -0.007339431957237175 , 0.02008439775442315 , -0.0033717631758033114 , -0.05176633249899187 , -0.013790328758662772 , -0.39102366157050594 , -0.167341447585844 , -0.04813367828213947 , 0.1367781582239039 , -0.04672809260566293 , -0.03237784669978756 , 0.03218068777925178 , 0.02415063765016493 , -0.017849899351200002 , -0.002975675228088795 , -0.004819438014786686 , 0.005106898651831245 , 0.0024278620704227456 , 6.784303333368138e-05 , -0.0 ], [ 0.009644258527009343 , -0.001331907219439711 , -0.0014639718434477777 , 0.008481926798958248 , 0.010278031715467508 , 0.003625808326891529 , -0.01121188617599796 , -0.0010634587872994379 , -0.0002603820881968461 , -0.017985648016990465 , -0.06446652745470374 , 0.07726063173046191 , -0.24739929795334742 , 
-0.2701855018480216 , -0.08888614776216278 , 0.1373325760136816 , -0.02316068912438066 , -0.042164834956711514 , 0.0009266091344106458 , 0.03141872420427644 , 0.011587728430225652 , 0.0004755143243520787 , 0.005860642609620605 , 0.008979633931394438 , 0.005061734169974005 , 0.003932710387086098 , 0.0015489986106803626 , -0.0 ], [ 0.010998736164377534 , 0.009378969800902604 , 0.00030577045264713074 , 0.0159329353530375 , 0.014849508018911006 , -0.0026513365659554225 , 0.002923303082126996 , 0.01917908707828847 , -0.02338288107991566 , -0.05706674679291175 , 0.009526265752669624 , -0.19945255386401284 , -0.10725519695909647 , -0.3222906835083537 , -0.03857038318412844 , -0.013279804965996065 , -0.046626023244262085 , -0.029299060237210447 , -0.043269580558906555 , -0.03768510002290657 , -0.02255977771908117 , -0.02632588166863199 , -0.014417349488098566 , -0.003077271951572957 , -0.0004973277708010661 , 0.0003475839139671271 , -0.0014522783025903258 , -0.0 ], [ 0.012215315671616316 , -0.001693194176229889 , 0.011365785434529038 , 0.0036964574178487792 , -0.010126738168635003 , -0.025554378647710443 , 0.006538003839811914 , -0.03181759044467965 , -0.016424751042854728 , 0.06177539736110035 , -0.43801735323216856 , -0.29991040815937386 , -0.2516019795363623 , 0.037789523540809 , -0.010948746374759491 , -0.0633901687126727 , -0.005976006160777705 , 0.006035133605976937 , -0.04961632526071937 , -0.04142116972831476 , -0.07558952727782252 , -0.04165176179187153 , -0.02021603856619006 , -0.0027365663096057032 , -0.011145473712733575 , 0.0003566937349350848 , -0.00546472985268321 , -0.0 ], [ 0.008009386447317503 , 0.006831207743885825 , 0.0051306149795546365 , 0.016239014770865052 , 0.020925441734273218 , 0.028344800173195076 , -0.004805080609285047 , -0.01880521614501033 , -0.1272329010865855 , -0.39835936819190537 , -0.09113694760349819 , -0.04061591094832608 , -0.12677021961235907 , 0.015567707226741051 , -0.005615051546243333 , -0.06454044862001587 , 0.0195457674752272 
, -0.04219686517155871 , -0.08060569979524296 , 0.027234494361702787 , -0.009152881336047056 , -0.030865118003992217 , -0.005770311060090559 , 0.002905833371986098 , 5.606663556872091e-05 , 0.003209538083839772 , -0.0018588810743365345 , -0.0 ], [ 0.007587008852984699 , -0.0021213639853557625 , 0.0007709558092903736 , 0.013883256128746423 , 0.017328713012428214 , 0.03645357525636198 , -0.04043993335238427 , 0.05730125171252314 , -0.2563293727512057 , -0.11438826083879326 , 0.02662382809034687 , 0.03525271352483709 , 0.04745678120172762 , 0.0336360484090392 , -0.002916635707204059 , -0.17950855098650784 , -0.44161773297052964 , -0.4512180227831197 , -0.4940283106297913 , -0.1970108671285798 , 0.04344323143078066 , -0.012005120444897523 , 0.00987576109166055 , -0.0018336757466252476 , 0.0004913959502151706 , -0.0005409724034216215 , -0.005039223900868212 , -0.0 ], [ 0.00637876531169957 , 0.005189469227685454 , 0.0007676355246000376 , 0.018378100865097655 , 0.015739815031394887 , -0.035524983116512455 , 0.03781006978038308 , 0.28859052096740495 , 0.0726464110153121 , -0.026768468497420147 , 0.06278766200288134 , 0.17897045813699355 , -0.13780371920803108 , -0.14176458123649577 , -0.1733103177731656 , -0.3106508869296763 , 0.04788355140275794 , 0.04235327890285105 , -0.031266625292514394 , -0.016263819217960652 , -0.031388328800811355 , -0.01791363975905968 , -0.012025067979443894 , 0.008335083985905805 , -0.0014386677797296231 , 0.0055376544652972854 , 0.002241522815466253 , -0.0 ], [ 0.007455256326741617 , -0.0009475207572210404 , 0.0020288385162615286 , 0.015399640135796092 , 0.021133843188103074 , -0.019846405097622234 , -0.003162485751163173 , -0.14199005055318842 , -0.044200898667146035 , -0.013395459413208084 , 0.11019680479230103 , -0.014057216041764874 , -0.12553853334447865 , -0.05992513534766256 , 0.06467942189539834 , 0.08866056095907732 , -0.1451321508061849 , -0.07382491447758655 , -0.046961739981080476 , 0.0008943713493160624 , 0.03231044103656507 , 
0.00036034241706501196 , -0.011387669277619417 , -0.00014602449257226195 , -0.0021863729003374116 , 0.0018817840156005856 , 0.0037909804578166286 , -0.0 ], [ 0.006511855618626698 , 0.006236866054439829 , -0.001440571166157676 , 0.012795776609942026 , 0.011530545030403624 , 0.03495489377257363 , 0.04792403136095304 , 0.049378583599065225 , 0.03296101702085617 , -0.0005351385876652296 , 0.017744115897640366 , 0.0011656622496764954 , 0.0232845869823761 , -0.0561191397060232 , -0.02854070511118366 , -0.028614174047247348 , -0.007763531086362863 , 0.01823079560098924 , 0.021961392405283622 , -0.009666681805706179 , 0.009547046884328725 , -0.008729943263791338 , 0.006408909680578429 , 0.009794327096359952 , -0.0025825219195515304 , 0.007063559189211571 , 0.007867244119267047 , -0.0 ], [ 0.007936663546039311 , -0.00010710180170593153 , 0.002716512705673228 , 0.0038633557307721487 , -0.0014877316616940372 , -0.0004788143065635909 , 0.012508842248031202 , 0.0045381104608414645 , -0.010650910516128294 , -0.013785341529644855 , -0.034287643221318206 , -0.022152707546335495 , -0.047056481347685974 , -0.032166744564720455 , -0.021551611335278546 , -0.002174962503376043 , 0.024344287130424306 , 0.015579272560525105 , 0.010958169741952194 , -0.010607232913436921 , -0.005548369726118836 , -0.0014630046444242706 , 0.013144180105016433 , 0.0031349366359021916 , 0.0010984887428255974 , 0.005426941473328394 , 0.006566511860044785 , -0.0 ], [ 0.0005529184874606495 , 0.00026139355020588705 , -0.002887623443531047 , 0.0013988462990850632 , 0.00203365139495493 , -0.007276926701775218 , -0.004010419939595932 , 0.017521952161185662 , 0.0006996977433557911 , 0.02083134683611201 , 0.013690533534289498 , -0.005466724359976675 , -0.008857712321334327 , 0.017408578822635818 , 0.0076439343049154425 , 0.0017861314923539985 , 0.007465865707523924 , 0.008034420825988495 , 0.003976298558337994 , 0.00411970637898539 , -0.004572592545819698 , 0.0029563907011979935 , -0.0006382227820088148 , 
0.0015153753877889707 , -0.0052626601797995595 , 0.0025664706985019416 , 0.005161751034260073 , -0.0 ], [ 0.0009424280561998445 , -0.0012942360298110595 , 0.0011900868416523343 , 0.000984424113178899 , 0.0020988269382781564 , -0.005870080062890889 , -0.004950484744457169 , 0.003117643454332697 , -0.002509563565777083 , 0.005831604884101081 , 0.009531085216183116 , 0.010030206821909806 , 0.005858190171099734 , 4.9344529936340524e-05 , -0.004027895832421331 , 0.0025436439920587606 , 0.00531153867563076 , 0.00495942692369508 , 0.009215148318606382 , 0.00010011928317543458 , 0.0060051362999805355 , -0.0008195376963202741 , 0.0041728603512658224 , -0.0017597169567888774 , -0.0010577007775543158 , 0.00046033327178068433 , -0.0007674196306044449 , -0.0 ], [ -0.0 , -0.0 , 0.0013386963856532302 , 0.00035183178922260837 , 0.0030610334903526204 , 8.951834979315781e-05 , 0.0023676793550483524 , -0.0002900551076915047 , -0.00207019445286608 , -7.61697478482574e-05 , 0.0012150086715244216 , 0.009831239281792168 , 0.003479667642621962 , 0.0070584324334114525 , 0.004161851261339585 , 0.0026146296354490665 , -9.194746959222099e-05 , 0.0013583866966571571 , 0.0016821551239318913 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 ]]]]}","title":"Model Explanation"},{"location":"modelserving/v1beta1/torchserve/#deploy-pytorch-model-with-v1-grpc-protocol","text":"Note : Since kserve has no grpc client methods for v1, we are using torchserve's grpc v1 client","title":"Deploy PyTorch Model with V1 gRPC Protocol"},{"location":"modelserving/v1beta1/torchserve/#create-the-inferenceservice","text":"For deploying the InferenceService with gRPC protocol you need to expose the gRPC port on InferenceService. Here 7070 is torchserve gRPC port. Apply the following mnist_grpc.yaml to create the InferenceService . 
kubectl kubectl apply -f mnist_grpc.yaml Expected Output inferenceservice.serving.kserve.io/torchserve-grpc created New Schema Old Schema apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : \"torchserve-grpc\" spec : predictor : model : modelFormat : name : pytorch storageUri : gs://kfserving-examples/models/torchserve/image_classifier/v1 ports : - containerPort : 7070 name : h2c protocol : TCP apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : \"torchserve-grpc\" spec : predictor : pytorch : storageUri : gs://kfserving-examples/models/torchserve/image_classifier/v1 ports : - containerPort : 7070 name : h2c protocol : TCP","title":"Create the InferenceService"},{"location":"modelserving/v1beta1/torchserve/#run-inference-with-torchserve-grpc-protocol","text":"Install gRPC python dependencies: pip install -U grpcio protobuf grpcio-tools Download TorchServe's inference and management proto: mkdir -p proto/v1 INFERENCE_PROTO_FILE_PATH = https://raw.githubusercontent.com/pytorch/serve/master/frontend/server/src/main/resources/proto/inference.proto MANAGEMENT_PROTO_FILE_PATH = https://raw.githubusercontent.com/pytorch/serve/master/frontend/server/src/main/resources/proto/management.proto curl -s -L ${ INFERENCE_PROTO_FILE_PATH } > ./proto/v1/inference.proto curl -s -L ${ MANAGEMENT_PROTO_FILE_PATH } > ./proto/v1/management.proto Generate python gRPC client stub using the proto files: python -m grpc_tools.protoc --proto_path = proto/v1/ --python_out = . --grpc_python_out = . proto/v1/inference.proto proto/v1/management.proto You can use image converter to convert the images to base64 byte array, for other models please refer to input request . Run gRPC Inference using torchserve_grpc_client.py with mnist.json as an example prediction input. 
MODEL_NAME = mnist INPUT_PATH = mnist.json SERVICE_HOSTNAME = $( kubectl get inferenceservice torchserve-grpc -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) python torchserve_grpc_client.py --api_name infer --model $MODEL_NAME --input_path $INPUT_PATH --hostname $SERVICE_HOSTNAME Expected Output { \"predictions\" : [ 2 ] }","title":"Run Inference with TorchServe gRPC protocol"},{"location":"modelserving/v1beta1/torchserve/#deploy-pytorch-model-with-open-inference-rest-protocol","text":"","title":"Deploy PyTorch model with Open Inference REST Protocol"},{"location":"modelserving/v1beta1/torchserve/#create-the-inferenceservice_1","text":"KServe by default selects the TorchServe runtime when you specify the model format pytorch on new model spec and enables the KServe v1 inference protocol. To enable v2 open inference protocol, specify the protocolVersion field with the value v2 . New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"torchserve-mnist-v2\" spec : predictor : model : modelFormat : name : pytorch protocolVersion : v2 storageUri : gs://kfserving-examples/models/torchserve/image_classifier/v2 apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"torchserve-mnist-v2\" spec : predictor : pytorch : protocolVersion : v2 storageUri : gs://kfserving-examples/models/torchserve/image_classifier/v2 For deploying the model on CPU, apply the mnist_v2.yaml to create the InferenceService . kubectl kubectl apply -f mnist_v2.yaml Expected Output $ inferenceservice.serving.kserve.io/torchserve-mnist-v2 created","title":"Create the InferenceService"},{"location":"modelserving/v1beta1/torchserve/#model-inference_1","text":"The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT . 
MODEL_NAME = mnist SERVICE_HOSTNAME = $( kubectl get inferenceservice torchserve-mnist-v2 -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) You can send both byte array and tensor with v2 protocol, for byte array use image converter to convert the image to byte array input. Here we use the mnist_v2_bytes.json file to run an example inference. curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v2/models/ ${ MODEL_NAME } /infer -d @./mnist_v2_bytes.json Expected Output { \"id\" : \"d3b15cad-50a2-4eaf-80ce-8b0a428bd298\" , \"model_name\" : \"mnist\" , \"model_version\" : \"1.0\" , \"outputs\" : [{ \"name\" : \"predict\" , \"shape\" : [], \"datatype\" : \"INT64\" , \"data\" : [ 1 ]}]} For tensor input use the tensor image converter to convert the image to tensor input and here we use the mnist_v2.json file to run an example inference. curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v2/models/ ${ MODEL_NAME } /infer -d @./mnist_v2.json Expected Output { \"id\" : \"2266ec1e-f600-40af-97b5-7429b8195a80\" , \"model_name\" : \"mnist\" , \"model_version\" : \"1.0\" , \"outputs\" : [{ \"name\" : \"predict\" , \"shape\" : [], \"datatype\" : \"INT64\" , \"data\" : [ 1 ]}]}","title":"Model Inference"},{"location":"modelserving/v1beta1/torchserve/#model-explanation_1","text":"To get the model explanation with v2 explain endpoint: MODEL_NAME = mnist curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v2/models/mnist/explain -d @./mnist_v2.json Expected Output { \"id\" : \"d3b15cad-50a2-4eaf-80ce-8b0a428bd298\" , \"model_name\" : \"mnist\" , \"model_version\" : \"1.0\" , \"outputs\" : [{ \"name\" : \"explain\" , \"shape\" : [ 1 , 28 , 28 ], \"datatype\" : \"FP64\" , \"datae-05 , 0.01024804634283815 , 0.0009971135240970147 , -0.0 , -0.0 , 0.0 , 0.0 
, 0.0 , -0.0 , -0.0 , 0.0 , -0.0 , 0.0 , 0.0 , -0.0 , 0.0004501048968956462 , -0.0019630535686311007 , -0.0006664793297549408 , 0.0020157403539278907 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , 0.0 , -0.0 , -0.0022144569383238466 , 0.008361583574785395 , 0.00314019428604999 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0028943544591141838 , -0.0031301383432286406 , 0.002113252872926688 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , 0.0 , -0.0 , -0.0 , -0.0010321050605717045 , 0.008905753926369048 , 0.002846438277738756 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , 0.0 , -0.005305288883499087 , -0.00192711009725932 , 0.0012090042768467344 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0011945156500241256 , 0.005654442715832439 , 0.0020132075345016807 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , 0.0 , -0.0014689356969061985 , 0.0010743412638183228 , 0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , -0.0 , 0.0 , -0.0 , -0.0 , -0.0017047980586912376 , 0.00290660517425009 , -0.0007805869640505143 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , 0.0 , -0.0 , -0.0 , 5.541 * Co nne c t io n # 0 t o hos t localhos t le ft i nta c t 725422148614e-05 , 0.0014516114512869852 , 0.0002827701966546988 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0014401407633627265 , 0.0023812497776698745 , 0.002146825301700187 , -0.0 , -0.0 , 0.0 , -0.0 , 0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , 0.0011500529125940918 , 0.0002865015572973405 , 0.0029798151042282686 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0017750295500283872 , 0.0008339859126060243 , -0.00377073933577687 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , -0.0 , 0.0 , 0.0 , -0.0006093176894575109 , -0.00046905787892409935 , 0.0034053218511795034 , 0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , 0.0 , -0.0 , -0.0 , 
-0.0007450011768391558 , 0.001298767372877851 , -0.008499247640112315 , -6.145166131400234e-05 , -0.0 , -0.0 , -0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , 0.0 , -0.0 , 0.0 , 0.0011809726042792137 , -0.001838476328106708 , 0.00541110661116898 , 0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , -0.0 , 0.0 , -0.002139234224224006 , 0.0003259163407641124 , -0.005276118873855287 , -0.001950984007438105 , -9.545670742026532e-07 , 0.0 , -0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0007772404228681039 , -0.0001517956264720738 , 0.0064814848131711815 , -0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , 8.098064985902114e-05 , -0.00249042660692983 , -0.0020718619200672302 , -5.341117902942147e-05 , -0.00045564724429915073 , 0.0 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0 , 0.0022750983476959733 , 0.0017164060958460778 , 0.0003221344707738082 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0015560282678744543 , 9.107238495871273e-05 , 0.0008772841497928399 , 0.0006502978626355868 , -0.004128780767525651 , 0.0006030386900152659 , 0.0 , -0.0 , 0.0 , -0.0 , -0.0 , 0.0 , 0.0 , -0.0 , -0.0 , 0.0 , -0.0 , -0.0 , 0.0 , 0.0 , 0.001395995791096219 , 0.0026791526689584344 , 0.0023995008266391488 , -0.0004496096312746451 , 0.003101832450753724 , 0.007494536066960778 , 0.0028641187148287965 , -0.0030525907182629075 , 0.003420222396518567 , 0.0014924018363498125 , -0.0009357388301326025 , 0.0007856228933169799 , -0.0018433973914981437 , 1.6031856831240914e-05 , 0.0 , 0.0 , -0.0 , -0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , 0.0 , -0.0006999018502034005 , 0.004382250870697946 , -0.0035419313267119365 , -0.0028896748092595375 , -0.00048734542493666705 , -0.0060873452419295 , 0.000388224990424471 , 0.002533641537585585 , -0.004352836563597573 , -0.0006079418766875505 , -0.0038101334053377753 , -0.000828441340357984 , 0.0 , -0.0 , 0.0 , 0.0 , -0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , 0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , 
0.0010901530866342661 , -0.013135008038845744 , 0.0004734518707654666 , 0.002050423283568135 , -0.006609451922460863 , 0.0023647861820124366 , 0.0046789204256194 , -0.0018122527412311837 , 0.002137538353955849 , 0.0 , -0.0 , -0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , 0.0 , -0.0 , -0.0 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , 0.0 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , 0.0 , 0.0 , -0.0 , -0.0 , 0.0 , -0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , 0.0 , -0.0 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , -0.0 , 0.0 , 0.0 , -0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , -0.0 , 0.0 , -0.0 , -0.0 , -0.0 , -0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 ]}]}","title":"Model Explanation"},{"location":"modelserving/v1beta1/torchserve/#deploy-pytorch-model-with-open-inference-grpc-protocol","text":"","title":"Deploy PyTorch Model with Open Inference gRPC Protocol"},{"location":"modelserving/v1beta1/torchserve/#create-the-inferenceservice_2","text":"For deploying the InferenceService with Open Inference gRPC Protocol you need to expose the gRPC port on InferenceService. Here 8081 is kserve gRPC port. Apply the following mnist_grpc_v2.yaml to create the InferenceService . 
kubectl kubectl apply -f mnist_grpc_v2.yaml Expected Output $ inferenceservice.serving.kserve.io/torchserve-grpc-v2 created New Schema Old Schema apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : \"torchserve-grpc-v2\" spec : predictor : model : modelFormat : name : pytorch protocolVersion : grpc-v2 storageUri : gs://kfserving-examples/models/torchserve/image_classifier/v2 ports : - containerPort : 8081 name : h2c protocol : TCP apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : \"torchserve-grpc-v2\" spec : predictor : pytorch : protocolVersion : grpc-v2 storageUri : gs://kfserving-examples/models/torchserve/image_classifier/v2 ports : - containerPort : 8081 name : h2c protocol : TCP","title":"Create the InferenceService"},{"location":"modelserving/v1beta1/torchserve/#run-grpc-inference","text":"The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT . Then download Open Inference gRPC proto file: mkdir -p proto/v2 PROTO_FILE_PATH = https://raw.githubusercontent.com/kserve/kserve/master/python/kserve/kserve/protocol/grpc/grpc_predict_v2.proto curl -s -L ${ PROTO_FILE_PATH } > ./proto/v2/grpc_predict_v2.proto Run the inference test with grpcurl: INPUT_PATH = ./mnist_v2_grpc_tensor.json PROTO_FILE = proto/v2/grpc_predict_v2.proto SERVICE_HOSTNAME = $( kubectl get inferenceservice torchserve-grpc-v2 -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) grpcurl -v -plaintext -proto ${ PROTO_FILE } -authority ${ SERVICE_HOSTNAME } -d @ ${ INGRESS_HOST } : ${ INGRESS_PORT } inference.GRPCInferenceService.ModelInfer <<< $( cat \" $INPUT_PATH \" ) Expected Output Resolved method descriptor: // The ModelInfer API performs inference using the specified model. Errors are // indicated by the google.rpc.Status returned for the request. The OK code // indicates success and other codes indicate failure. 
rpc ModelInfer ( .inference.ModelInferRequest ) returns ( .inference.ModelInferResponse ) ; Request metadata to send: ( empty ) Response headers received: content-type: application/grpc date: Wed, 11 Oct 2023 13 :36:30 GMT grpc-accept-encoding: identity, deflate, gzip server: istio-envoy x-envoy-upstream-service-time: 581 Response contents: { \"modelName\" : \"mnist\" , \"id\" : \"d3b15cad-50a2-4eaf-80ce-8b0a428bd298\" , \"outputs\" : [ { \"name\" : \"input-0\" , \"datatype\" : \"INT64\" , \"shape\" : [ \"1\" ] , \"contents\" : { \"int64Contents\" : [ \"1\" ] } } ] } Response trailers received: ( empty ) Sent 1 request and received 1 response","title":"Run gRPC Inference"},{"location":"modelserving/v1beta1/torchserve/#autoscaling","text":"One of the main serverless inference features is to automatically scale the replicas of an InferenceService matching the incoming workload. KServe by default enables Knative Pod Autoscaler which watches traffic flow and scales up and down based on the configured metrics.","title":"Autoscaling"},{"location":"modelserving/v1beta1/torchserve/#knative-autoscaler","text":"KServe supports the implementation of Knative Pod Autoscaler (KPA) and Kubernetes\u2019 Horizontal Pod Autoscaler (HPA) . The features and limitations of each of these Autoscalers are listed below. Note If you want to use Kubernetes Horizontal Pod Autoscaler (HPA), you must install HPA extension Knative Pod Autoscaler (KPA) Part of the Knative Serving core and enabled by default once Knative Serving is installed. Supports scale to zero functionality. Does not support CPU-based autoscaling. Horizontal Pod Autoscaler (HPA) Not part of the Knative Serving core, and must be enabled after Knative Serving installation. Does not support scale to zero functionality. 
Supports CPU-based autoscaling.","title":"Knative Autoscaler"},{"location":"modelserving/v1beta1/torchserve/#create-inferenceservice-with-concurrency-target","text":"","title":"Create InferenceService with Concurrency Target"},{"location":"modelserving/v1beta1/torchserve/#hardsoft-autoscaling-limit","text":"You can configure InferenceService with annotation autoscaling.knative.dev/target for a soft limit. The soft limit is a targeted limit rather than a strictly enforced bound, particularly if there is a sudden burst of requests, this value can be exceeded. New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"torchserve\" annotations : autoscaling.knative.dev/target : \"10\" spec : predictor : model : modelFormat : name : pytorch storageUri : \"gs://kfserving-examples/models/torchserve/image_classifier/v1\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"torchserve\" annotations : autoscaling.knative.dev/target : \"10\" spec : predictor : pytorch : storageUri : \"gs://kfserving-examples/models/torchserve/image_classifier/v1\" You can also configure InferenceService with field containerConcurrency with a hard limit. The hard limit is an enforced upper bound. If concurrency reaches the hard limit, surplus requests will be buffered and must wait until enough capacity is free to execute the requests. 
New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"torchserve\" spec : predictor : containerConcurrency : 10 model : modelFormat : name : pytorch storageUri : \"gs://kfserving-examples/models/torchserve/image_classifier/v1\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"torchserve\" spec : predictor : containerConcurrency : 10 pytorch : storageUri : \"gs://kfserving-examples/models/torchserve/image_classifier/v1\" After specifying the soft or hard limits of the scaling target, you can now deploy the InferenceService with autoscaling.yaml . kubectl kubectl apply -f autoscaling.yaml Expected Output $ inferenceservice.serving.kserve.io/torchserve created","title":"Hard/Soft Autoscaling Limit"},{"location":"modelserving/v1beta1/torchserve/#run-inference-with-concurrent-requests","text":"The first step is to install the hey load generator and then send the concurrent requests to the InferenceService . go get -u github.com/rakyll/hey MODEL_NAME = mnist SERVICE_HOSTNAME = $( kubectl get inferenceservice torchserve -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) hey -m POST -z 30s -D ./mnist.json -host ${ SERVICE_HOSTNAME } http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ ${ MODEL_NAME } :predict","title":"Run Inference with Concurrent Requests"},{"location":"modelserving/v1beta1/torchserve/#check-pod-autoscaling","text":"hey by default generates 50 requests concurrently, so you can see that the InferenceService scales to 5 pods as the container concurrency target is set to 10. 
Expected Output NAME READY STATUS RESTARTS AGE torchserve-predictor-default-cj2d8-deployment-69444c9c74-67qwb 2 /2 Terminating 0 103s torchserve-predictor-default-cj2d8-deployment-69444c9c74-nnxk8 2 /2 Terminating 0 95s torchserve-predictor-default-cj2d8-deployment-69444c9c74-rq8jq 2 /2 Running 0 50m torchserve-predictor-default-cj2d8-deployment-69444c9c74-tsrwr 2 /2 Running 0 113s torchserve-predictor-default-cj2d8-deployment-69444c9c74-vvpjl 2 /2 Running 0 109s torchserve-predictor-default-cj2d8-deployment-69444c9c74-xvn7t 2 /2 Terminating 0 103s","title":"Check Pod Autoscaling"},{"location":"modelserving/v1beta1/torchserve/#canary-rollout","text":"Canary rollout is a deployment strategy when you release a new version of model to a small percent of the production traffic.","title":"Canary Rollout"},{"location":"modelserving/v1beta1/torchserve/#create-inferenceservice-with-canary-model","text":"After the above experiments, now let's see how you can rollout a new model without moving full traffic to the new model by default. New Schema Old Schema apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"torchserve\" annotations : serving.kserve.io/enable-tag-routing : \"true\" spec : predictor : canaryTrafficPercent : 20 model : modelFormat : name : pytorch storageUri : \"gs://kfserving-examples/models/torchserve/image_classifier/v2\" apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"torchserve\" annotations : serving.kserve.io/enable-tag-routing : \"true\" spec : predictor : canaryTrafficPercent : 20 pytorch : storageUri : \"gs://kfserving-examples/models/torchserve/image_classifier/v2\" In this example we change the storageUri to the v2 version with canaryTrafficPercent field and then apply the canary.yaml . 
kubectl kubectl apply -f canary.yaml Expected Output kubectl get revisions -l serving.kserve.io/inferenceservice = torchserve NAME CONFIG NAME K8S SERVICE NAME GENERATION READY REASON ACTUAL REPLICAS DESIRED REPLICAS torchserve-predictor-default-00001 torchserve-predictor-default 1 True 1 1 torchserve-predictor-default-00002 torchserve-predictor-default 2 True 1 1 kubectl get pods -l serving.kserve.io/inferenceservice = torchserve NAME READY STATUS RESTARTS AGE torchserve-predictor-default-00001-deployment-7d99979c99-p49gk 2 /2 Running 0 28m torchserve-predictor-default-00002-deployment-c6fcc65dd-rjknq 2 /2 Running 0 3m37s","title":"Create InferenceService with Canary Model"},{"location":"modelserving/v1beta1/torchserve/#check-traffic-status","text":"After the canary model is rolled out, the traffic should be split between the canary model revision and the \"stable\" revision which was rolled out with 100% percent traffic, now check the traffic split from the InferenceService traffic status: kubectl get isvc torchserve -ojsonpath = '{.status.components}' Expected Output { \"predictor\" : { \"address\" : { \"url\" : \"http://torchserve-predictor-default.default.svc.cluster.local\" }, \"latestCreatedRevision\" : \"torchserve-predictor-default-00002\" , \"latestReadyRevision\" : \"torchserve-predictor-default-00002\" , \"latestRolledoutRevision\" : \"torchserve-predictor-default-00001\" , \"traffic\" : [ { \"latestRevision\" : true , \"percent\" : 20 , \"revisionName\" : \"torchserve-predictor-default-00002\" , \"tag\" : \"latest\" , \"url\" : \"http://latest-torchserve-predictor-default.default.example.com\" }, { \"latestRevision\" : false , \"percent\" : 80 , \"revisionName\" : \"torchserve-predictor-default-00001\" , \"tag\" : \"prev\" , \"url\" : \"http://prev-torchserve-predictor-default.default.example.com\" } ], \"url\" : \"http://torchserve-predictor-default.default.example.com\" } }","title":"Check Traffic 
Status"},{"location":"modelserving/v1beta1/torchserve/#traffic-rollout","text":"Run the following curl requests a few times to the InferenceService , you can see that requests are sent to the two revisions with 20/80 splits. MODEL_NAME = mnist SERVICE_HOSTNAME = $( kubectl get inferenceservice torchserve -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) for i in { 1 ..10 } ; do curl -H \"Host: ${ SERVICE_HOSTNAME } \" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ ${ MODEL_NAME } :predict -d @./mnist.json ; done Expected Output { \"predictions\" : [ 2 ]} Handling connection for 8080 { \"predictions\" : [ 2 ]} Handling connection for 8080 { \"predictions\" : [ 2 ]} Handling connection for 8080 500: Internal Server Error 500: Internal Server ErrorHandling connection for 8080 500: Internal Server Error 500: Internal Server ErrorHandling connection for 8080 { \"predictions\" : [ 2 ]} Handling connection for 8080 { \"predictions\" : [ 2 ]} Handling connection for 8080 { \"predictions\" : [ 2 ]} Handling connection for 8080 { \"predictions\" : [ 2 ]} Handling connection for 8080 You can notice that when the request hits the canary revision it fails, this is because the new revision requires the v2 inference input mnist_v2.json which is a breaking change, in addition the traffic is randomly split between the two revisions according to the specified traffic percentage. In this case you should rollout the canary model with 0 canaryTrafficPercent and use the latest tagged url to test the canary model before moving the full traffic to the new model. 
kubectl kubectl patch isvc torchserve --type = 'json' -p '[{\"op\": \"replace\", \"path\": \"/spec/predictor/canaryTrafficPercent\", \"value\": 0}]' curl -v -H \"Host: latest-torchserve-predictor-default.default.example.com\" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ ${ MODEL_NAME } :predict -d @./mnist.json Expected Output { \"id\" : \"d3b15cad-50a2-4eaf-80ce-8b0a428bd298\" , \"model_name\" : \"mnist\" , \"model_version\" : \"1.0\" , \"outputs\" : [{ \"name\" : \"predict\" , \"shape\" : [ 1 ], \"datatype\" : \"INT64\" , \"data\" : [ 1 ]}]} After the new model is tested and verified, you can now bump the canaryTrafficPercent to 100 to fully rollout the traffic to the new revision and now the latestRolledoutRevision becomes torchserve-predictor-default-00002 and previousRolledoutRevision becomes torchserve-predictor-default-00001 . kubectl kubectl patch isvc torchserve --type = 'json' -p '[{\"op\": \"replace\", \"path\": \"/spec/predictor/canaryTrafficPercent\", \"value\": 100}]' Check the traffic status: kubectl get isvc torchserve -ojsonpath = '{.status.components}' Expected Output { \"predictor\" : { \"address\" : { \"url\" : \"http://torchserve-predictor-default.default.svc.cluster.local\" }, \"latestCreatedRevision\" : \"torchserve-predictor-default-00002\" , \"latestReadyRevision\" : \"torchserve-predictor-default-00002\" , \"latestRolledoutRevision\" : \"torchserve-predictor-default-00002\" , \"previousRolledoutRevision\" : \"torchserve-predictor-default-00001\" , \"traffic\" : [ { \"latestRevision\" : true , \"percent\" : 100 , \"revisionName\" : \"torchserve-predictor-default-00002\" , \"tag\" : \"latest\" , \"url\" : \"http://latest-torchserve-predictor-default.default.example.com\" }, ], \"url\" : \"http://torchserve-predictor-default.default.example.com\" } }","title":"Traffic Rollout"},{"location":"modelserving/v1beta1/torchserve/#rollback-the-model","text":"In case the new model version does not 
work after the traffic is moved to the new revision, you can still patch the canaryTrafficPercent to 0 and move the traffic back to the previously rolled model which is torchserve-predictor-default-00001 . kubectl kubectl patch isvc torchserve --type = 'json' -p '[{\"op\": \"replace\", \"path\": \"/spec/predictor/canaryTrafficPercent\", \"value\": 0}]' Check the traffic status: kubectl get isvc torchserve -ojsonpath = '{.status.components}' Expected Output { \"predictor\" : { \"address\" : { \"url\" : \"http://torchserve-predictor-default.default.svc.cluster.local\" }, \"latestCreatedRevision\" : \"torchserve-predictor-default-00002\" , \"latestReadyRevision\" : \"torchserve-predictor-default-00002\" , \"latestRolledoutRevision\" : \"torchserve-predictor-default-00001\" , \"previousRolledoutRevision\" : \"torchserve-predictor-default-00001\" , \"traffic\" : [ { \"latestRevision\" : true , \"percent\" : 0 , \"revisionName\" : \"torchserve-predictor-default-00002\" , \"tag\" : \"latest\" , \"url\" : \"http://latest-torchserve-predictor-default.default.example.com\" }, { \"latestRevision\" : false , \"percent\" : 100 , \"revisionName\" : \"torchserve-predictor-default-00001\" , \"tag\" : \"prev\" , \"url\" : \"http://prev-torchserve-predictor-default.default.example.com\" } ], \"url\" : \"http://torchserve-predictor-default.default.example.com\" } }","title":"Rollback the Model"},{"location":"modelserving/v1beta1/torchserve/#monitoring","text":"Metrics Exposure and Grafana Dashboard Setup","title":"Monitoring"},{"location":"modelserving/v1beta1/torchserve/bert/","text":"TorchServe example with Huggingface bert model \u00b6 In this example we will show how to serve Huggingface Transformers with TorchServe on KServe. Model archive file creation \u00b6 Clone pytorch/serve repository, navigate to examples/Huggingface_Transformers and follow the steps for creating the MAR file including serialized model and other dependent files. 
TorchServe supports both eager model and torchscript and here we save as the pretrained model. torch-model-archiver --model-name BERTSeqClassification --version 1 .0 \\ --serialized-file Transformer_model/pytorch_model.bin \\ --handler ./Transformer_handler_generalized.py \\ --extra-files \"Transformer_model/config.json,./setup_config.json,./Seq_classification_artifacts/index_to_name.json\" Create the InferenceService \u00b6 Apply the CRD kubectl apply -f bert.yaml Expected Output $inferenceservice .serving.kserve.io/torchserve-bert created Run a prediction \u00b6 The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT MODEL_NAME = torchserve-bert SERVICE_HOSTNAME = $( kubectl get inferenceservice ${ MODEL_NAME } -n -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/BERTSeqClassification:predict -d ./sample_text.txt Expected Output * Trying 44 .239.20.204... * Connected to a881f5a8c676a41edbccdb0a394a80d6-2069247558.us-west-2.elb.amazonaws.com ( 44 .239.20.204 ) port 80 ( #0) > PUT /v1/models/BERTSeqClassification:predict HTTP/1.1 > Host: torchserve-bert.kserve-test.example.com > User-Agent: curl/7.47.0 > Accept: */* > Content-Length: 79 > Expect: 100 -continue > < HTTP/1.1 100 Continue * We are completely uploaded and fine < HTTP/1.1 200 OK < cache-control: no-cache ; no-store, must-revalidate, private < content-length: 8 < date: Wed, 04 Nov 2020 10 :54:49 GMT < expires: Thu, 01 Jan 1970 00 :00:00 UTC < pragma: no-cache < x-request-id: 4b54d3ac-185f-444c-b344-b8a785fdeb50 < x-envoy-upstream-service-time: 2085 < server: istio-envoy < * Connection #0 to host torchserve-bert.kserve-test.example.com left intact Accepted Captum Explanations \u00b6 In order to understand the word importances and attributions when we make an explanation request, we use Captum Insights for the Huggingface Transformers pre-trained model. 
MODEL_NAME = torchserve-bert SERVICE_HOSTNAME = $( kubectl get inferenceservice ${ MODEL_NAME } -n -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/BERTSeqClassification:explain -d ./sample_text.txt Expected output * Trying ::1:8080... * Connected to localhost ( ::1 ) port 8080 ( #0) > POST /v1/models/BERTSeqClassification:explain HTTP/1.1 > Host: torchserve-bert.default.example.com > User-Agent: curl/7.73.0 > Accept: */* > Content-Length: 84 > Content-Type: application/x-www-form-urlencoded >Handling connection for 8080 * upload completely sent off: 84 out of 84 bytes * Mark bundle as not supporting multiuse < HTTP/1.1 200 OK < content-length: 292 < content-type: application/json ; charset = UTF-8 < date: Sun, 27 Dec 2020 05 :53:52 GMT < server: istio-envoy < x-envoy-upstream-service-time: 5769 < * Connection #0 to host localhost left intact { \"explanations\" : [{ \"importances\" : [ 0 .0, -0.6324463574494716, -0.033115653530477414, 0 .2681695752722339, -0.29124745608778546, 0 .5422589681903883, -0.3848768219546909, 0 .0 ] , \"words\" : [ \"[CLS]\" , \"bloomberg\" , \"has\" , \"reported\" , \"on\" , \"the\" , \"economy\" , \"[SEP]\" ] , \"delta\" : -0.0007350619859377225 }]}","title":"TorchServe example with Huggingface bert model"},{"location":"modelserving/v1beta1/torchserve/bert/#torchserve-example-with-huggingface-bert-model","text":"In this example we will show how to serve Huggingface Transformers with TorchServe on KServe.","title":"TorchServe example with Huggingface bert model"},{"location":"modelserving/v1beta1/torchserve/bert/#model-archive-file-creation","text":"Clone pytorch/serve repository, navigate to examples/Huggingface_Transformers and follow the steps for creating the MAR file including serialized model and other dependent files. TorchServe supports both eager model and torchscript and here we save as the pretrained model. 
torch-model-archiver --model-name BERTSeqClassification --version 1 .0 \\ --serialized-file Transformer_model/pytorch_model.bin \\ --handler ./Transformer_handler_generalized.py \\ --extra-files \"Transformer_model/config.json,./setup_config.json,./Seq_classification_artifacts/index_to_name.json\"","title":"Model archive file creation"},{"location":"modelserving/v1beta1/torchserve/bert/#create-the-inferenceservice","text":"Apply the CRD kubectl apply -f bert.yaml Expected Output $inferenceservice .serving.kserve.io/torchserve-bert created","title":"Create the InferenceService"},{"location":"modelserving/v1beta1/torchserve/bert/#run-a-prediction","text":"The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT MODEL_NAME = torchserve-bert SERVICE_HOSTNAME = $( kubectl get inferenceservice ${ MODEL_NAME } -n -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/BERTSeqClassification:predict -d ./sample_text.txt Expected Output * Trying 44 .239.20.204... 
* Connected to a881f5a8c676a41edbccdb0a394a80d6-2069247558.us-west-2.elb.amazonaws.com ( 44 .239.20.204 ) port 80 ( #0) > PUT /v1/models/BERTSeqClassification:predict HTTP/1.1 > Host: torchserve-bert.kserve-test.example.com > User-Agent: curl/7.47.0 > Accept: */* > Content-Length: 79 > Expect: 100 -continue > < HTTP/1.1 100 Continue * We are completely uploaded and fine < HTTP/1.1 200 OK < cache-control: no-cache ; no-store, must-revalidate, private < content-length: 8 < date: Wed, 04 Nov 2020 10 :54:49 GMT < expires: Thu, 01 Jan 1970 00 :00:00 UTC < pragma: no-cache < x-request-id: 4b54d3ac-185f-444c-b344-b8a785fdeb50 < x-envoy-upstream-service-time: 2085 < server: istio-envoy < * Connection #0 to host torchserve-bert.kserve-test.example.com left intact Accepted","title":"Run a prediction"},{"location":"modelserving/v1beta1/torchserve/bert/#captum-explanations","text":"In order to understand the word importances and attributions when we make an explanation request, we use Captum Insights for the Huggingface Transformers pre-trained model. MODEL_NAME = torchserve-bert SERVICE_HOSTNAME = $( kubectl get inferenceservice ${ MODEL_NAME } -n -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/BERTSeqClassification:explain -d ./sample_text.txt Expected output * Trying ::1:8080... 
* Connected to localhost ( ::1 ) port 8080 ( #0) > POST /v1/models/BERTSeqClassification:explain HTTP/1.1 > Host: torchserve-bert.default.example.com > User-Agent: curl/7.73.0 > Accept: */* > Content-Length: 84 > Content-Type: application/x-www-form-urlencoded >Handling connection for 8080 * upload completely sent off: 84 out of 84 bytes * Mark bundle as not supporting multiuse < HTTP/1.1 200 OK < content-length: 292 < content-type: application/json ; charset = UTF-8 < date: Sun, 27 Dec 2020 05 :53:52 GMT < server: istio-envoy < x-envoy-upstream-service-time: 5769 < * Connection #0 to host localhost left intact { \"explanations\" : [{ \"importances\" : [ 0 .0, -0.6324463574494716, -0.033115653530477414, 0 .2681695752722339, -0.29124745608778546, 0 .5422589681903883, -0.3848768219546909, 0 .0 ] , \"words\" : [ \"[CLS]\" , \"bloomberg\" , \"has\" , \"reported\" , \"on\" , \"the\" , \"economy\" , \"[SEP]\" ] , \"delta\" : -0.0007350619859377225 }]}","title":"Captum Explanations"},{"location":"modelserving/v1beta1/torchserve/metrics/","text":"Expose TorchServe Metrics \u00b6 This tutorial sets up Prometheus and Grafana on the cluster with TorchServe metrics. Install Istio with Grafana and Prometheus \u00b6 Note: Make sure to enable prometheus and grafana while installing istio. After installation Grafana and Prometheus can be accessed from the below links # Grafana istioctl dashboard grafana # Prometheus istioctl dashboard prometheus Create the InferenceService \u00b6 Enable prometheus scraping by adding annotations to deployment yaml, by default the torchserve's metrics port is 8082. 
New Schema Old Schema apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : \"torch-metrics\" annotations : prometheus.io/scrape : 'true' prometheus.io/port : '8082' spec : predictor : model : modelFormat : name : pytorch storageUri : gs://kfserving-examples/models/torchserve/image_classifier/v1 apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : \"torch-metrics\" annotations : prometheus.io/scrape : 'true' prometheus.io/port : '8082' spec : predictor : pytorch : storageUri : gs://kfserving-examples/models/torchserve/image_classifier/v1 Apply the CRD kubectl apply -f metrics.yaml Expected Output $ inferenceservice.serving.kserve.io/torch-metrics created Run a prediction \u00b6 The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT MODEL_NAME = mnist SERVICE_HOSTNAME = $( kubectl get inferenceservice torch-metrics -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ ${ MODEL_NAME } :predict -d @./mnist.json Expected Output * Trying 52 .89.19.61... 
* Connected to a881f5a8c676a41edbccdb0a394a80d6-2069247558.us-west-2.elb.amazonaws.com ( 52 .89.19.61 ) port 80 ( #0) > PUT /v1/models/mnist:predict HTTP/1.1 > Host: torch-metrics.kserve-test.example.com > User-Agent: curl/7.47.0 > Accept: */* > Content-Length: 272 > Expect: 100 -continue > < HTTP/1.1 100 Continue * We are completely uploaded and fine < HTTP/1.1 200 OK < cache-control: no-cache ; no-store, must-revalidate, private < content-length: 1 < date: Fri, 23 Oct 2020 13 :01:09 GMT < expires: Thu, 01 Jan 1970 00 :00:00 UTC < pragma: no-cache < x-request-id: 8881f2b9-462e-4e2d-972f-90b4eb083e53 < x-envoy-upstream-service-time: 5018 < server: istio-envoy < * Connection #0 to host a881f5a8c676a41edbccdb0a394a80d6-2069247558.us-west-2.elb.amazonaws.com left intact { \"predictions\" : [ \"2\" ]} Check the dashboard \u00b6 Prometheus graph view \u00b6 Navigate to prometheus page Add a query in the prometheus page Grafana dashboard \u00b6 Navigate to grafana page Add a dashboard from the top left + symbol Click add query and enter the query For Exposing grafana and prometheus under istio ingress please refer to remotely accessing telemetry addons Apply below deployment for a demo setup. 
apiVersion : networking.istio.io/v1alpha3 kind : Gateway metadata : name : grafana-gateway namespace : istio-system spec : selector : istio : ingressgateway servers : - port : number : 80 name : http-grafana protocol : HTTP hosts : - \"grafana.example.com\" --- apiVersion : networking.istio.io/v1alpha3 kind : VirtualService metadata : name : grafana-vs namespace : istio-system spec : hosts : - \"grafana.example.com\" gateways : - grafana-gateway http : - route : - destination : host : grafana port : number : 3000 --- apiVersion : networking.istio.io/v1alpha3 kind : DestinationRule metadata : name : grafana namespace : istio-system spec : host : grafana trafficPolicy : tls : mode : DISABLE --- All requests with hostname grafana.example.com are redirected to grafana.","title":"Expose TorchServe Metrics"},{"location":"modelserving/v1beta1/torchserve/metrics/#expose-torchserve-metrics","text":"This tutorial sets up Prometheus and Grafana on the cluster with TorchServe metrics.","title":"Expose TorchServe Metrics"},{"location":"modelserving/v1beta1/torchserve/metrics/#install-istio-with-grafana-and-prometheus","text":"Note: Make sure to enable prometheus and grafana while installing istio. After installation Grafana and Prometheus can be accessed from the below links # Grafana istioctl dashboard grafana # Prometheus istioctl dashboard prometheus","title":"Install Istio with Grafana and Prometheus"},{"location":"modelserving/v1beta1/torchserve/metrics/#create-the-inferenceservice","text":"Enable prometheus scraping by adding annotations to deployment yaml, by default the torchserve's metrics port is 8082. 
New Schema Old Schema apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : \"torch-metrics\" annotations : prometheus.io/scrape : 'true' prometheus.io/port : '8082' spec : predictor : model : modelFormat : name : pytorch storageUri : gs://kfserving-examples/models/torchserve/image_classifier/v1 apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : \"torch-metrics\" annotations : prometheus.io/scrape : 'true' prometheus.io/port : '8082' spec : predictor : pytorch : storageUri : gs://kfserving-examples/models/torchserve/image_classifier/v1 Apply the CRD kubectl apply -f metrics.yaml Expected Output $ inferenceservice.serving.kserve.io/torch-metrics created","title":"Create the InferenceService"},{"location":"modelserving/v1beta1/torchserve/metrics/#run-a-prediction","text":"The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT MODEL_NAME = mnist SERVICE_HOSTNAME = $( kubectl get inferenceservice torch-metrics -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ ${ MODEL_NAME } :predict -d @./mnist.json Expected Output * Trying 52 .89.19.61... 
* Connected to a881f5a8c676a41edbccdb0a394a80d6-2069247558.us-west-2.elb.amazonaws.com ( 52 .89.19.61 ) port 80 ( #0) > PUT /v1/models/mnist:predict HTTP/1.1 > Host: torch-metrics.kserve-test.example.com > User-Agent: curl/7.47.0 > Accept: */* > Content-Length: 272 > Expect: 100 -continue > < HTTP/1.1 100 Continue * We are completely uploaded and fine < HTTP/1.1 200 OK < cache-control: no-cache ; no-store, must-revalidate, private < content-length: 1 < date: Fri, 23 Oct 2020 13 :01:09 GMT < expires: Thu, 01 Jan 1970 00 :00:00 UTC < pragma: no-cache < x-request-id: 8881f2b9-462e-4e2d-972f-90b4eb083e53 < x-envoy-upstream-service-time: 5018 < server: istio-envoy < * Connection #0 to host a881f5a8c676a41edbccdb0a394a80d6-2069247558.us-west-2.elb.amazonaws.com left intact { \"predictions\" : [ \"2\" ]}","title":"Run a prediction"},{"location":"modelserving/v1beta1/torchserve/metrics/#check-the-dashboard","text":"","title":"Check the dashboard"},{"location":"modelserving/v1beta1/torchserve/metrics/#prometheus-graph-view","text":"Navigate to prometheus page Add a query in the prometheus page","title":"Prometheus graph view"},{"location":"modelserving/v1beta1/torchserve/metrics/#grafana-dashboard","text":"Navigate to grafana page Add a dashboard from the top left + symbol Click add query and enter the query For Exposing grafana and prometheus under istio ingress please refer to remotely accessing telemetry addons Apply below deployment for a demo setup. 
apiVersion : networking.istio.io/v1alpha3 kind : Gateway metadata : name : grafana-gateway namespace : istio-system spec : selector : istio : ingressgateway servers : - port : number : 80 name : http-grafana protocol : HTTP hosts : - \"grafana.example.com\" --- apiVersion : networking.istio.io/v1alpha3 kind : VirtualService metadata : name : grafana-vs namespace : istio-system spec : hosts : - \"grafana.example.com\" gateways : - grafana-gateway http : - route : - destination : host : grafana port : number : 3000 --- apiVersion : networking.istio.io/v1alpha3 kind : DestinationRule metadata : name : grafana namespace : istio-system spec : host : grafana trafficPolicy : tls : mode : DISABLE --- All requests with hostname grafana.example.com are redirected to grafana.","title":"Grafana dashboard"},{"location":"modelserving/v1beta1/torchserve/model-archiver/","text":"Generate model archiver files for torchserve \u00b6 Setup \u00b6 Your ~/.kube/config should point to a cluster with KServe installed . Your cluster's Istio Ingress gateway must be network accessible . 1. Create PV and PVC \u00b6 Create a Persistent volume and volume claim. This document uses amazonEBS PV. For AWS EFS storage you can refer to AWS EFS storage 1.1 Create PV \u00b6 Edit volume id in pv.yaml file kubectl apply -f pv.yaml Expected Output $ persistentvolume/model-pv-volume created 1.2 Create PVC \u00b6 kubectl apply -f pvc.yaml Expected Output $ persistentvolumeclaim/model-pv-claim created 2 Create model store files layout and copy to PV \u00b6 We create a pod with the PV attached to copy the model files and config.properties for generating model archive file. 2.1 Create pod for copying model store files to PV \u00b6 kubectl apply -f pvpod.yaml Expected Output $ pod/model-store-pod created 2.2 Create model store file layout on PV \u00b6 2.2.1 Create properties.json file \u00b6 This file has model-name, version, model-file name, serialized-file name, extra-files, handlers, workers etc. of the models. 
[ { \"model-name\" : \"mnist\" , \"version\" : \"1.0\" , \"model-file\" : \"\" , \"serialized-file\" : \"mnist_cnn.pt\" , \"extra-files\" : \"\" , \"handler\" : \"mnist_handler.py\" , \"min-workers\" : 1 , \"max-workers\" : 3 , \"batch-size\" : 1 , \"max-batch-delay\" : 100 , \"response-timeout\" : 120 , \"requirements\" : \"\" }, { \"model-name\" : \"densenet_161\" , \"version\" : \"1.0\" , \"model-file\" : \"\" , \"serialized-file\" : \"densenet161-8d451a50.pth\" , \"extra-files\" : \"index_to_name.json\" , \"handler\" : \"image_classifier\" , \"min-workers\" : 1 , \"max-workers\" : 3 , \"batch-size\" : 1 , \"max-batch-delay\" : 100 , \"response-timeout\" : 120 , \"requirements\" : \"\" } ] 2.2.2 Copy model and its dependent Files \u00b6 Copy all the model and dependent files to the PV in the structure given below. An empty config folder, a model-store folder containing model name as folder name. Within that model folder, the files required to build the marfile. \u251c\u2500\u2500 config \u251c\u2500\u2500 model-store \u2502 \u251c\u2500\u2500 densenet_161 \u2502 \u2502 \u251c\u2500\u2500 densenet161-8d451a50.pth \u2502 \u2502 \u251c\u2500\u2500 index_to_name.json \u2502 \u2502 \u2514\u2500\u2500 model.py \u2502 \u251c\u2500\u2500 mnist \u2502 \u2502 \u251c\u2500\u2500 mnist_cnn.pt \u2502 \u2502 \u251c\u2500\u2500 mnist_handler.py \u2502 \u2502 \u2514\u2500\u2500 mnist.py \u2502 \u2514\u2500\u2500 properties.json 2.2.3 Create folders for model-store and config in PV \u00b6 kubectl exec -it model-store-pod -c model-store -n kserve-test -- mkdir /pv/model-store/ kubectl exec -it model-store-pod -c model-store -n kserve-test -- mkdir /pv/config/ 2.3 Copy model files and config.properties to the PV \u00b6 kubectl cp model-store/* model-store-pod:/pv/model-store/ -c model-store -n kserve-test kubectl cp config.properties model-store-pod:/pv/config/ -c model-store -n kserve-test 2.4 Delete pv pod \u00b6 Since amazon EBS provide only ReadWriteOnce mode, we have to 
unbind the PV for use of model archiver. kubectl delete pod model-store-pod -n kserve-test 3 Generate model archive file and server configuration file \u00b6 3.1 Create model archive pod and run model archive file generation script \u00b6 kubectl apply -f model-archiver.yaml -n kserve-test 3.2 Check the output and delete model archive pod \u00b6 Verify mar files and config.properties kubectl exec -it margen-pod -n kserve-test -- ls -lR /home/model-server/model-store kubectl exec -it margen-pod -n kserve-test -- cat /home/model-server/config/config.properties 3.3 Delete model archiver \u00b6 kubectl delete -f model-archiver.yaml -n kserve-test","title":"Generate model archiver files for torchserve"},{"location":"modelserving/v1beta1/torchserve/model-archiver/#generate-model-archiver-files-for-torchserve","text":"","title":"Generate model archiver files for torchserve"},{"location":"modelserving/v1beta1/torchserve/model-archiver/#setup","text":"Your ~/.kube/config should point to a cluster with KServe installed . Your cluster's Istio Ingress gateway must be network accessible .","title":"Setup"},{"location":"modelserving/v1beta1/torchserve/model-archiver/#1-create-pv-and-pvc","text":"Create a Persistent volume and volume claim. This document uses amazonEBS PV. For AWS EFS storage you can refer to AWS EFS storage","title":"1. 
Create PV and PVC"},{"location":"modelserving/v1beta1/torchserve/model-archiver/#11-create-pv","text":"Edit volume id in pv.yaml file kubectl apply -f pv.yaml Expected Output $ persistentvolume/model-pv-volume created","title":"1.1 Create PV"},{"location":"modelserving/v1beta1/torchserve/model-archiver/#12-create-pvc","text":"kubectl apply -f pvc.yaml Expected Output $ persistentvolumeclaim/model-pv-claim created","title":"1.2 Create PVC"},{"location":"modelserving/v1beta1/torchserve/model-archiver/#2-create-model-store-files-layout-and-copy-to-pv","text":"We create a pod with the PV attached to copy the model files and config.properties for generating model archive file.","title":"2 Create model store files layout and copy to PV"},{"location":"modelserving/v1beta1/torchserve/model-archiver/#21-create-pod-for-copying-model-store-files-to-pv","text":"kubectl apply -f pvpod.yaml Expected Output $ pod/model-store-pod created","title":"2.1 Create pod for copying model store files to PV"},{"location":"modelserving/v1beta1/torchserve/model-archiver/#22-create-model-store-file-layout-on-pv","text":"","title":"2.2 Create model store file layout on PV"},{"location":"modelserving/v1beta1/torchserve/model-archiver/#221-create-propertiesjson-file","text":"This file has model-name, version, model-file name, serialized-file name, extra-files, handlers, workers etc. of the models. 
[ { \"model-name\" : \"mnist\" , \"version\" : \"1.0\" , \"model-file\" : \"\" , \"serialized-file\" : \"mnist_cnn.pt\" , \"extra-files\" : \"\" , \"handler\" : \"mnist_handler.py\" , \"min-workers\" : 1 , \"max-workers\" : 3 , \"batch-size\" : 1 , \"max-batch-delay\" : 100 , \"response-timeout\" : 120 , \"requirements\" : \"\" }, { \"model-name\" : \"densenet_161\" , \"version\" : \"1.0\" , \"model-file\" : \"\" , \"serialized-file\" : \"densenet161-8d451a50.pth\" , \"extra-files\" : \"index_to_name.json\" , \"handler\" : \"image_classifier\" , \"min-workers\" : 1 , \"max-workers\" : 3 , \"batch-size\" : 1 , \"max-batch-delay\" : 100 , \"response-timeout\" : 120 , \"requirements\" : \"\" } ]","title":"2.2.1 Create properties.json file"},{"location":"modelserving/v1beta1/torchserve/model-archiver/#222-copy-model-and-its-dependent-files","text":"Copy all the model and dependent files to the PV in the structure given below. An empty config folder, a model-store folder containing model name as folder name. Within that model folder, the files required to build the marfile. 
\u251c\u2500\u2500 config \u251c\u2500\u2500 model-store \u2502 \u251c\u2500\u2500 densenet_161 \u2502 \u2502 \u251c\u2500\u2500 densenet161-8d451a50.pth \u2502 \u2502 \u251c\u2500\u2500 index_to_name.json \u2502 \u2502 \u2514\u2500\u2500 model.py \u2502 \u251c\u2500\u2500 mnist \u2502 \u2502 \u251c\u2500\u2500 mnist_cnn.pt \u2502 \u2502 \u251c\u2500\u2500 mnist_handler.py \u2502 \u2502 \u2514\u2500\u2500 mnist.py \u2502 \u2514\u2500\u2500 properties.json","title":"2.2.2 Copy model and its dependent Files"},{"location":"modelserving/v1beta1/torchserve/model-archiver/#223-create-folders-for-model-store-and-config-in-pv","text":"kubectl exec -it model-store-pod -c model-store -n kserve-test -- mkdir /pv/model-store/ kubectl exec -it model-store-pod -c model-store -n kserve-test -- mkdir /pv/config/","title":"2.2.3 Create folders for model-store and config in PV"},{"location":"modelserving/v1beta1/torchserve/model-archiver/#23-copy-model-files-and-configproperties-to-the-pv","text":"kubectl cp model-store/* model-store-pod:/pv/model-store/ -c model-store -n kserve-test kubectl cp config.properties model-store-pod:/pv/config/ -c model-store -n kserve-test","title":"2.3 Copy model files and config.properties to the PV"},{"location":"modelserving/v1beta1/torchserve/model-archiver/#24-delete-pv-pod","text":"Since amazon EBS provide only ReadWriteOnce mode, we have to unbind the PV for use of model archiver. 
kubectl delete pod model-store-pod -n kserve-test","title":"2.4 Delete pv pod"},{"location":"modelserving/v1beta1/torchserve/model-archiver/#3-generate-model-archive-file-and-server-configuration-file","text":"","title":"3 Generate model archive file and server configuration file"},{"location":"modelserving/v1beta1/torchserve/model-archiver/#31-create-model-archive-pod-and-run-model-archive-file-generation-script","text":"kubectl apply -f model-archiver.yaml -n kserve-test","title":"3.1 Create model archive pod and run model archive file generation script"},{"location":"modelserving/v1beta1/torchserve/model-archiver/#32-check-the-output-and-delete-model-archive-pod","text":"Verify mar files and config.properties kubectl exec -it margen-pod -n kserve-test -- ls -lR /home/model-server/model-store kubectl exec -it margen-pod -n kserve-test -- cat /home/model-server/config/config.properties","title":"3.2 Check the output and delete model archive pod"},{"location":"modelserving/v1beta1/torchserve/model-archiver/#33-delete-model-archiver","text":"kubectl delete -f model-archiver.yaml -n kserve-test","title":"3.3 Delete model archiver"},{"location":"modelserving/v1beta1/torchserve/model-archiver/model-archiver-image/","text":"Model archiver for torchserve \u00b6 Steps: Modify config in entrypoint for default config (optional) Build docker image Push docker image to repo docker build --file Dockerfile -t margen:latest . docker tag margen:latest { username } /margen:latest docker push { username } /margen:latest","title":"Model archiver for torchserve"},{"location":"modelserving/v1beta1/torchserve/model-archiver/model-archiver-image/#model-archiver-for-torchserve","text":"Steps: Modify config in entrypoint for default config (optional) Build docker image Push docker image to repo docker build --file Dockerfile -t margen:latest . 
docker tag margen:latest { username } /margen:latest docker push { username } /margen:latest","title":"Model archiver for torchserve"},{"location":"modelserving/v1beta1/torchserve/model-archiver/model-store/","text":"Model archiver for torchserve \u00b6 Place all the files required to generate marfile in the model folder \u00b6","title":"Model archiver for torchserve"},{"location":"modelserving/v1beta1/torchserve/model-archiver/model-store/#model-archiver-for-torchserve","text":"","title":"Model archiver for torchserve"},{"location":"modelserving/v1beta1/torchserve/model-archiver/model-store/#place-all-the-file-required-to-grenerate-marfile-in-the-model-folder","text":"","title":"Place all the files required to generate marfile in the model folder"},{"location":"modelserving/v1beta1/transformer/collocation/","text":"Collocate transformer and predictor in same pod \u00b6 KServe by default deploys the Transformer and Predictor as separate services, allowing you to deploy them on different devices and scale them independently. Nevertheless, there are certain situations where you might prefer to collocate the transformer and predictor within the same pod. Here are a few scenarios: If your transformer is tightly coupled with the predictor and you want to perform canary deployment together. If you want to reduce sidecar resources. If you want to reduce networking latency. Before you begin \u00b6 Your ~/.kube/config should point to a cluster with KServe installed . Your cluster's Istio Ingress gateway must be network accessible . You can find the code samples on kserve repository. Deploy the InferenceService \u00b6 Since, the predictor and the transformer are in the same pod, they need to listen on different ports to avoid conflict. Transformer is configured to listen on port 8080 (REST) and 8081 (GRPC) while, Predictor listens on port 8085 (REST). Transformer calls Predictor on port 8085 via local socket. Deploy the Inferenceservice using the below command. 
cat < POST /v1/models/mnist:predict HTTP/1.1 > Host: custom-transformer-collocation.default.example.com > User-Agent: curl/7.85.0 > Accept: */* > Content-Type: application/json > Content-Length: 427 > * Mark bundle as not supporting multiuse < HTTP/1.1 200 OK < content-length: 19 < content-type: application/json < date: Sat, 02 Dec 2023 09 :13:16 GMT < server: istio-envoy < x-envoy-upstream-service-time: 315 < * Connection #0 to host localhost left intact { \"predictions\" : [ 2 ]}","title":"Collocate transformer and predictor"},{"location":"modelserving/v1beta1/transformer/collocation/#collocate-transformer-and-predictor-in-same-pod","text":"KServe by default deploys the Transformer and Predictor as separate services, allowing you to deploy them on different devices and scale them independently. Nevertheless, there are certain situations where you might prefer to collocate the transformer and predictor within the same pod. Here are a few scenarios: If your transformer is tightly coupled with the predictor and you want to perform canary deployment together. If you want to reduce sidecar resources. If you want to reduce networking latency.","title":"Collocate transformer and predictor in same pod"},{"location":"modelserving/v1beta1/transformer/collocation/#before-you-begin","text":"Your ~/.kube/config should point to a cluster with KServe installed . Your cluster's Istio Ingress gateway must be network accessible . You can find the code samples on kserve repository.","title":"Before you begin"},{"location":"modelserving/v1beta1/transformer/collocation/#deploy-the-inferenceservice","text":"Since, the predictor and the transformer are in the same pod, they need to listen on different ports to avoid conflict. Transformer is configured to listen on port 8080 (REST) and 8081 (GRPC) while, Predictor listens on port 8085 (REST). Transformer calls Predictor on port 8085 via local socket. Deploy the Inferenceservice using the below command. 
cat < POST /v1/models/mnist:predict HTTP/1.1 > Host: custom-transformer-collocation.default.example.com > User-Agent: curl/7.85.0 > Accept: */* > Content-Type: application/json > Content-Length: 427 > * Mark bundle as not supporting multiuse < HTTP/1.1 200 OK < content-length: 19 < content-type: application/json < date: Sat, 02 Dec 2023 09 :13:16 GMT < server: istio-envoy < x-envoy-upstream-service-time: 315 < * Connection #0 to host localhost left intact { \"predictions\" : [ 2 ]}","title":"Run a prediction"},{"location":"modelserving/v1beta1/transformer/feast/","text":"Deploy InferenceService with Transformer using Feast online feature store \u00b6 Transformer is an InferenceService component which does pre/post processing alongside with model inference. In this example, instead of typical input transformation of raw data to tensors, we demonstrate a use case of online feature augmentation as part of preprocessing. We use a Feast Transformer to gather online features, run inference with a SKLearn predictor, and leave post processing as pass-through. Before you begin \u00b6 Your ~/.kube/config should point to a cluster with KServe installed . Your cluster's Istio Ingress gateway must be network accessible . You can find the code samples on kserve repository. Note This example uses Feast version 0.30.2 Create the Redis server \u00b6 This example uses the Redis as the online store. Deploy the Redis server using the below command. cat < or entity_id_name : The name of the entity ID for which to retrieve features from the Feast feature store feature_refs : The feature references for the features to be retrieved Build Transformer docker image \u00b6 The driver transformer dockerfile can be found in the code example directory. Checkout the feast code example and under the example directory run the commands as following: docker build -t $USERNAME /driver-transformer:latest -f driver_transformer.Dockerfile . 
docker push $USERNAME /driver-transformer:latest Create the InferenceService \u00b6 In the Feast Transformer image we packaged the driver transformer class so KServe knows to use the preprocess implementation to augment inputs with online features before making model inference requests. Then the InferenceService uses SKLearn to serve the driver ranking model , which is trained with Feast offline features, available in a gcs bucket specified under storageUri . Update the container's image field and the feast_serving_url argument to create the InferenceService , which includes a Feast Transformer and a SKLearn Predictor. New Schema Old Schema cat < POST /v1/models/sklearn-driver-transformer:predict HTTP/1.1 > Host: sklearn-driver-transformer.default.example.com > User-Agent: curl/7.85.0 > Accept: */* > Content-Length: 57 > Content-Type: application/x-www-form-urlencoded > * Mark bundle as not supporting multiuse < HTTP/1.1 200 OK < content-length: 115 < content-type: application/json < date: Thu, 30 Mar 2023 09 :46:52 GMT < server: istio-envoy < x-envoy-upstream-service-time: 112 < * Connection #0 to host 1.2.3.4 left intact { \"predictions\" : [ 0 .45905828209879473,1.5118208033011165,0.21514156911776539,0.5555778492605103,0.49638665080127176 ]}","title":"Feast"},{"location":"modelserving/v1beta1/transformer/feast/#deploy-inferenceservice-with-transformer-using-feast-online-feature-store","text":"Transformer is an InferenceService component which does pre/post processing alongside with model inference. In this example, instead of typical input transformation of raw data to tensors, we demonstrate a use case of online feature augmentation as part of preprocessing. 
We use a Feast Transformer to gather online features, run inference with a SKLearn predictor, and leave post processing as pass-through.","title":"Deploy InferenceService with Transformer using Feast online feature store"},{"location":"modelserving/v1beta1/transformer/feast/#before-you-begin","text":"Your ~/.kube/config should point to a cluster with KServe installed . Your cluster's Istio Ingress gateway must be network accessible . You can find the code samples on kserve repository. Note This example uses Feast version 0.30.2","title":"Before you begin"},{"location":"modelserving/v1beta1/transformer/feast/#create-the-redis-server","text":"This example uses the Redis as the online store. Deploy the Redis server using the below command. cat < or entity_id_name : The name of the entity ID for which to retrieve features from the Feast feature store feature_refs : The feature references for the features to be retrieved","title":"Extend the Model class and implement pre/post processing functions"},{"location":"modelserving/v1beta1/transformer/feast/#build-transformer-docker-image","text":"The driver transformer dockerfile can be found in the code example directory. Checkout the feast code example and under the example directory run the commands as following: docker build -t $USERNAME /driver-transformer:latest -f driver_transformer.Dockerfile . docker push $USERNAME /driver-transformer:latest","title":"Build Transformer docker image"},{"location":"modelserving/v1beta1/transformer/feast/#create-the-inferenceservice","text":"In the Feast Transformer image we packaged the driver transformer class so KServe knows to use the preprocess implementation to augment inputs with online features before making model inference requests. Then the InferenceService uses SKLearn to serve the driver ranking model , which is trained with Feast offline features, available in a gcs bucket specified under storageUri . 
Update the container's image field and the feast_serving_url argument to create the InferenceService , which includes a Feast Transformer and a SKLearn Predictor. New Schema Old Schema cat < POST /v1/models/sklearn-driver-transformer:predict HTTP/1.1 > Host: sklearn-driver-transformer.default.example.com > User-Agent: curl/7.85.0 > Accept: */* > Content-Length: 57 > Content-Type: application/x-www-form-urlencoded > * Mark bundle as not supporting multiuse < HTTP/1.1 200 OK < content-length: 115 < content-type: application/json < date: Thu, 30 Mar 2023 09 :46:52 GMT < server: istio-envoy < x-envoy-upstream-service-time: 112 < * Connection #0 to host 1.2.3.4 left intact { \"predictions\" : [ 0 .45905828209879473,1.5118208033011165,0.21514156911776539,0.5555778492605103,0.49638665080127176 ]}","title":"Run a prediction"},{"location":"modelserving/v1beta1/transformer/torchserve_image_transformer/","text":"Deploy Transformer with InferenceService \u00b6 Transformer is an InferenceService component which does pre/post processing alongside with model inference. It usually takes raw input and transforms them to the input tensors model server expects. In this example we demonstrate an example of running inference with a custom Transformer communicating by REST and gRPC protocol. Create Custom Image Transformer \u00b6 Implement pre/post processing with KServe Model API \u00b6 KServe.Model base class mainly defines three handlers preprocess , predict and postprocess , these handlers are executed in sequence where the output of the preprocess handler is passed to the predict handler as the input. When predictor_host is passed, the predict handler makes a call to the predictor and gets back a response which is then passed to the postprocess handler. KServe automatically fills in the predictor_host for Transformer and hands over the call to the Predictor . 
By default transformer makes a REST call to predictor, to make a gRPC call to predictor, you can pass the --protocol argument with value grpc-v2 . To implement a Transformer you can derive from the base Model class and then overwrite the preprocess and postprocess handler to have your own customized transformation logic. For Open(v2) Inference Protocol , KServe provides InferRequest and InferResponse API object for predict , preprocess , postprocess handlers to abstract away the implementation details of REST/gRPC decoding and encoding over the wire. from kserve import Model , ModelServer , model_server , InferInput , InferRequest from typing import Dict from PIL import Image import torchvision.transforms as transforms import logging import io import base64 logging . basicConfig ( level = kserve . constants . KSERVE_LOGLEVEL ) def image_transform ( byte_array ): \"\"\"converts the input image of Bytes Array into Tensor Args: instance (dict): The request input for image bytes. Returns: list: Returns converted tensor as input for predict handler with v1/v2 inference protocol. \"\"\" image_processing = transforms . Compose ([ transforms . ToTensor (), transforms . Normalize (( 0.1307 ,), ( 0.3081 ,)) ]) image = Image . open ( io . BytesIO ( byte_array )) tensor = image_processing ( image ) . numpy () return tensor # for v1 REST predictor the preprocess handler converts to input image bytes to float tensor dict in v1 inference REST protocol format class ImageTransformer ( kserve . Model ): def __init__ ( self , name : str , predictor_host : str , headers : Dict [ str , str ] = None ): super () . __init__ ( name ) self . predictor_host = predictor_host self . 
ready = True def preprocess ( self , inputs : Dict , headers : Dict [ str , str ] = None ) -> Dict : return { 'instances' : [ image_transform ( instance ) for instance in inputs [ 'instances' ]]} def postprocess ( self , inputs : Dict , headers : Dict [ str , str ] = None ) -> Dict : return inputs # for v2 gRPC predictor the preprocess handler converts the input image bytes tensor to float tensor in v2 inference protocol format class ImageTransformer ( kserve . Model ): def __init__ ( self , name : str , predictor_host : str , protocol : str , headers : Dict [ str , str ] = None ): super () . __init__ ( name ) self . predictor_host = predictor_host self . protocol = protocol self . ready = True def preprocess ( self , request : InferRequest , headers : Dict [ str , str ] = None ) -> InferRequest : input_tensors = [ image_transform ( instance ) for instance in request . inputs [ 0 ] . data ] input_tensors = np . asarray ( input_tensors ) infer_inputs = [ InferInput ( name = \"INPUT__0\" , datatype = 'FP32' , shape = list ( input_tensors . shape ), data = input_tensors )] infer_request = InferRequest ( model_name = self . model_name , infer_inputs = infer_inputs ) return infer_request Please see the code example here . Transformer Server Entrypoint \u00b6 For single model you just create a transformer object and register that to the model server. if __name__ == \"__main__\" : model = ImageTransformer ( args . model_name , predictor_host = args . predictor_host , protocol = args . protocol ) ModelServer () . start ( models = [ model ]) For multi-model case if all the models can share the same transformer you can register the same transformer for different models, or different transformers if each model requires its own transformation. if __name__ == \"__main__\" : for model_name in model_names : transformer = ImageTransformer ( model_name , predictor_host = args . predictor_host ) models . append ( transformer ) kserve . ModelServer () . 
start ( models = models ) Build Transformer docker image \u00b6 Under kserve/python directory, build the transformer docker image using Dockerfile cd python docker build -t $DOCKER_USER /image-transformer:latest -f transformer.Dockerfile . docker push { username } /image-transformer:latest Deploy the InferenceService with REST Predictor \u00b6 Create the InferenceService \u00b6 By default InferenceService uses TorchServe to serve the PyTorch models and the models can be loaded from a model repository in cloud storage according to TorchServe model repository layout. In this example, the model repository contains a MNIST model, but you can store more than one model there. New Schema Old Schema apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : torch-transformer spec : predictor : model : modelFormat : name : pytorch storageUri : gs://kfserving-examples/models/torchserve/image_classifier/v1 transformer : containers : - image : kserve/image-transformer:latest name : kserve-container command : - \"python\" - \"-m\" - \"model\" args : - --model_name - mnist apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : torch-transformer spec : predictor : pytorch : storageUri : gs://kfserving-examples/models/torchserve/image_classifier/v1 transformer : containers : - image : kserve/image-transformer:latest name : kserve-container command : - \"python\" - \"-m\" - \"model\" args : - --model_name - mnist Note STORAGE_URI is a build-in environment variable used to inject the storage initializer for custom container just like StorageURI field for prepackaged predictors. The downloaded artifacts are stored under /mnt/models . Apply the InferenceService transformer-new.yaml kubectl apply -f transformer-new.yaml Expected Output $ inferenceservice.serving.kserve.io/torch-transformer created Run a prediction \u00b6 First, download the request input payload . 
Then, determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT . SERVICE_NAME = torch-transformer MODEL_NAME = mnist INPUT_PATH = @./input.json SERVICE_HOSTNAME = $( kubectl get inferenceservice $SERVICE_NAME -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" -d $INPUT_PATH http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict Expected Output > POST /v1/models/mnist:predict HTTP/1.1 > Host: torch-transformer.default.example.com > User-Agent: curl/7.73.0 > Accept: */* > Content-Length: 401 > Content-Type: application/x-www-form-urlencoded > * upload completely sent off: 401 out of 401 bytes Handling connection for 8080 * Mark bundle as not supporting multiuse < HTTP/1.1 200 OK < content-length: 20 < content-type: application/json ; charset = UTF-8 < date: Tue, 12 Jan 2021 09 :52:30 GMT < server: istio-envoy < x-envoy-upstream-service-time: 83 < * Connection #0 to host localhost left intact { \"predictions\" : [ 2 ]} Deploy the InferenceService calling Predictor with gRPC protocol \u00b6 Comparing with REST, gRPC is faster due to the tight packing of the Protocol Buffer and the use of HTTP/2 by gRPC. In many cases, gRPC can be more efficient communication protocol between Transformer and Predictor as you may need to transmit large tensors between them. Create InferenceService \u00b6 Create the InferenceService with following yaml which includes a Transformer and a Triton Predictor. As KServe by default uses TorchServe serving runtime for PyTorch model, here you need to override the serving runtime to kserve-tritonserver for using the gRPC protocol. The transformer calls out to predictor with V2 gRPC Protocol by specifying the --protocol argument. 
New Schema Old Schema apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : torch-grpc-transformer spec : predictor : model : modelFormat : name : pytorch storageUri : gs://kfserving-examples/models/torchscript runtime : kserve-tritonserver runtimeVersion : 20.10-py3 ports : - name : h2c protocol : TCP containerPort : 9000 transformer : containers : - image : kserve/image-transformer:latest name : kserve-container command : - \"python\" - \"-m\" - \"model\" args : - --model_name - cifar10 - --protocol - grpc-v2 apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : torch-grpc-transformer spec : predictor : triton : storageUri : gs://kfserving-examples/models/torchscript runtimeVersion : 20.10-py3 ports : - name : h2c protocol : TCP containerPort : 9000 transformer : containers : - image : kserve/image-transformer:latest name : kserve-container command : - \"python\" - \"-m\" - \"model\" args : - --model_name - cifar10 - --protocol - grpc-v2 Apply the InferenceService grpc_transformer.yaml kubectl apply -f grpc_transformer.yaml Expected Output $ inferenceservice.serving.kserve.io/torch-grpc-transformer created Run a prediction \u00b6 First, download the request input payload . Then, determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT SERVICE_NAME = torch-grpc-transformer MODEL_NAME = cifar10 INPUT_PATH = @./image.json SERVICE_HOSTNAME = $( kubectl get inferenceservice $SERVICE_NAME -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" -d $INPUT_PATH http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict Expected Output * Trying ::1... 
* TCP_NODELAY set * Connected to localhost ( ::1 ) port 8080 ( #0) > POST /v1/models/cifar10:predict HTTP/1.1 > Host: torch-transformer.default.example.com > User-Agent: curl/7.64.1 > Accept: */* > Content-Length: 3394 > Content-Type: application/x-www-form-urlencoded > Expect: 100 -continue > Handling connection for 8080 < HTTP/1.1 100 Continue * We are completely uploaded and fine < HTTP/1.1 200 OK < content-length: 222 < content-type: application/json ; charset = UTF-8 < date: Thu, 03 Feb 2022 01 :50:07 GMT < server: istio-envoy < x-envoy-upstream-service-time: 73 < * Connection #0 to host localhost left intact { \"predictions\" : [[ -1.192867636680603, -0.35750141739845276, -2.3665435314178467, 3 .9186441898345947, -2.0592284202575684, 4 .091977119445801, 0 .1266237050294876, -1.8284690380096436, 2 .628898859024048, -4.255198001861572 ]]} * Closing connection 0 Performance Comparison between gRPC and REST \u00b6 From the following latency stats of both transformer and predictor you can see that the transformer to predictor call takes longer time(92ms vs 55ms) for REST than gRPC, REST takes more time serializing and deserializing 3*32*32 shape tensor and with gRPC it is transmitted as tightly packed numpy array serialized bytes. 
# from REST v1 transformer log 2023 -01-09 07 :15:55.263 79476 root INFO [ __call__ () :128 ] requestId: N.A., preprocess_ms: 6 .083965302, explain_ms: 0 , predict_ms: 92 .653036118, postprocess_ms: 0 .007867813 # from REST v1 predictor log 2023 -01-09 07 :16:02.581 79402 root INFO [ __call__ () :128 ] requestId: N.A., preprocess_ms: 13 .532876968, explain_ms: 0 , predict_ms: 48 .450231552, postprocess_ms: 0 .006914139 # from gRPC v2 transformer log 2023 -01-09 07 :27:52.172 79715 root INFO [ __call__ () :128 ] requestId: N.A., preprocess_ms: 2 .567052841, explain_ms: 0 , predict_ms: 55 .0532341, postprocess_ms: 0 .101804733 # from gRPC v2 predictor log 2023 -01-09 07 :27:52.171 79711 root INFO [ __call__ () :128 ] requestId: , preprocess_ms: 0 .067949295, explain_ms: 0 , predict_ms: 51 .237106323, postprocess_ms: 0 .049114227","title":"How to write a custom transformer"},{"location":"modelserving/v1beta1/transformer/torchserve_image_transformer/#deploy-transformer-with-inferenceservice","text":"Transformer is an InferenceService component which does pre/post processing alongside with model inference. It usually takes raw input and transforms them to the input tensors model server expects. In this example we demonstrate an example of running inference with a custom Transformer communicating by REST and gRPC protocol.","title":"Deploy Transformer with InferenceService"},{"location":"modelserving/v1beta1/transformer/torchserve_image_transformer/#create-custom-image-transformer","text":"","title":"Create Custom Image Transformer"},{"location":"modelserving/v1beta1/transformer/torchserve_image_transformer/#implement-prepost-processing-with-kserve-model-api","text":"KServe.Model base class mainly defines three handlers preprocess , predict and postprocess , these handlers are executed in sequence where the output of the preprocess handler is passed to the predict handler as the input. 
When predictor_host is passed, the predict handler makes a call to the predictor and gets back a response which is then passed to the postprocess handler. KServe automatically fills in the predictor_host for Transformer and hands over the call to the Predictor . By default transformer makes a REST call to predictor, to make a gRPC call to predictor, you can pass the --protocol argument with value grpc-v2 . To implement a Transformer you can derive from the base Model class and then overwrite the preprocess and postprocess handler to have your own customized transformation logic. For Open(v2) Inference Protocol , KServe provides InferRequest and InferResponse API object for predict , preprocess , postprocess handlers to abstract away the implementation details of REST/gRPC decoding and encoding over the wire. from kserve import Model , ModelServer , model_server , InferInput , InferRequest from typing import Dict from PIL import Image import torchvision.transforms as transforms import logging import io import base64 logging . basicConfig ( level = kserve . constants . KSERVE_LOGLEVEL ) def image_transform ( byte_array ): \"\"\"converts the input image of Bytes Array into Tensor Args: instance (dict): The request input for image bytes. Returns: list: Returns converted tensor as input for predict handler with v1/v2 inference protocol. \"\"\" image_processing = transforms . Compose ([ transforms . ToTensor (), transforms . Normalize (( 0.1307 ,), ( 0.3081 ,)) ]) image = Image . open ( io . BytesIO ( byte_array )) tensor = image_processing ( image ) . numpy () return tensor # for v1 REST predictor the preprocess handler converts to input image bytes to float tensor dict in v1 inference REST protocol format class ImageTransformer ( kserve . Model ): def __init__ ( self , name : str , predictor_host : str , headers : Dict [ str , str ] = None ): super () . __init__ ( name ) self . predictor_host = predictor_host self . 
ready = True def preprocess ( self , inputs : Dict , headers : Dict [ str , str ] = None ) -> Dict : return { 'instances' : [ image_transform ( instance ) for instance in inputs [ 'instances' ]]} def postprocess ( self , inputs : Dict , headers : Dict [ str , str ] = None ) -> Dict : return inputs # for v2 gRPC predictor the preprocess handler converts the input image bytes tensor to float tensor in v2 inference protocol format class ImageTransformer ( kserve . Model ): def __init__ ( self , name : str , predictor_host : str , protocol : str , headers : Dict [ str , str ] = None ): super () . __init__ ( name ) self . predictor_host = predictor_host self . protocol = protocol self . ready = True def preprocess ( self , request : InferRequest , headers : Dict [ str , str ] = None ) -> InferRequest : input_tensors = [ image_transform ( instance ) for instance in request . inputs [ 0 ] . data ] input_tensors = np . asarray ( input_tensors ) infer_inputs = [ InferInput ( name = \"INPUT__0\" , datatype = 'FP32' , shape = list ( input_tensors . shape ), data = input_tensors )] infer_request = InferRequest ( model_name = self . model_name , infer_inputs = infer_inputs ) return infer_request Please see the code example here .","title":"Implement pre/post processing with KServe Model API"},{"location":"modelserving/v1beta1/transformer/torchserve_image_transformer/#transformer-server-entrypoint","text":"For single model you just create a transformer object and register that to the model server. if __name__ == \"__main__\" : model = ImageTransformer ( args . model_name , predictor_host = args . predictor_host , protocol = args . protocol ) ModelServer () . start ( models = [ model ]) For multi-model case if all the models can share the same transformer you can register the same transformer for different models, or different transformers if each model requires its own transformation. 
if __name__ == \"__main__\" : for model_name in model_names : transformer = ImageTransformer ( model_name , predictor_host = args . predictor_host ) models . append ( transformer ) kserve . ModelServer () . start ( models = models )","title":"Transformer Server Entrypoint"},{"location":"modelserving/v1beta1/transformer/torchserve_image_transformer/#build-transformer-docker-image","text":"Under kserve/python directory, build the transformer docker image using Dockerfile cd python docker build -t $DOCKER_USER /image-transformer:latest -f transformer.Dockerfile . docker push { username } /image-transformer:latest","title":"Build Transformer docker image"},{"location":"modelserving/v1beta1/transformer/torchserve_image_transformer/#deploy-the-inferenceservice-with-rest-predictor","text":"","title":"Deploy the InferenceService with REST Predictor"},{"location":"modelserving/v1beta1/transformer/torchserve_image_transformer/#create-the-inferenceservice","text":"By default InferenceService uses TorchServe to serve the PyTorch models and the models can be loaded from a model repository in cloud storage according to TorchServe model repository layout. In this example, the model repository contains a MNIST model, but you can store more than one model there. 
New Schema Old Schema apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : torch-transformer spec : predictor : model : modelFormat : name : pytorch storageUri : gs://kfserving-examples/models/torchserve/image_classifier/v1 transformer : containers : - image : kserve/image-transformer:latest name : kserve-container command : - \"python\" - \"-m\" - \"model\" args : - --model_name - mnist apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : torch-transformer spec : predictor : pytorch : storageUri : gs://kfserving-examples/models/torchserve/image_classifier/v1 transformer : containers : - image : kserve/image-transformer:latest name : kserve-container command : - \"python\" - \"-m\" - \"model\" args : - --model_name - mnist Note STORAGE_URI is a built-in environment variable used to inject the storage initializer for custom container just like StorageURI field for prepackaged predictors. The downloaded artifacts are stored under /mnt/models . Apply the InferenceService transformer-new.yaml kubectl apply -f transformer-new.yaml Expected Output $ inferenceservice.serving.kserve.io/torch-transformer created","title":"Create the InferenceService"},{"location":"modelserving/v1beta1/transformer/torchserve_image_transformer/#run-a-prediction","text":"First, download the request input payload . Then, determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT .
SERVICE_NAME = torch-transformer MODEL_NAME = mnist INPUT_PATH = @./input.json SERVICE_HOSTNAME = $( kubectl get inferenceservice $SERVICE_NAME -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" -d $INPUT_PATH http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict Expected Output > POST /v1/models/mnist:predict HTTP/1.1 > Host: torch-transformer.default.example.com > User-Agent: curl/7.73.0 > Accept: */* > Content-Length: 401 > Content-Type: application/x-www-form-urlencoded > * upload completely sent off: 401 out of 401 bytes Handling connection for 8080 * Mark bundle as not supporting multiuse < HTTP/1.1 200 OK < content-length: 20 < content-type: application/json ; charset = UTF-8 < date: Tue, 12 Jan 2021 09 :52:30 GMT < server: istio-envoy < x-envoy-upstream-service-time: 83 < * Connection #0 to host localhost left intact { \"predictions\" : [ 2 ]}","title":"Run a prediction"},{"location":"modelserving/v1beta1/transformer/torchserve_image_transformer/#deploy-the-inferenceservice-calling-predictor-with-grpc-protocol","text":"Comparing with REST, gRPC is faster due to the tight packing of the Protocol Buffer and the use of HTTP/2 by gRPC. In many cases, gRPC can be more efficient communication protocol between Transformer and Predictor as you may need to transmit large tensors between them.","title":"Deploy the InferenceService calling Predictor with gRPC protocol"},{"location":"modelserving/v1beta1/transformer/torchserve_image_transformer/#create-inferenceservice","text":"Create the InferenceService with following yaml which includes a Transformer and a Triton Predictor. As KServe by default uses TorchServe serving runtime for PyTorch model, here you need to override the serving runtime to kserve-tritonserver for using the gRPC protocol. The transformer calls out to predictor with V2 gRPC Protocol by specifying the --protocol argument. 
New Schema Old Schema apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : torch-grpc-transformer spec : predictor : model : modelFormat : name : pytorch storageUri : gs://kfserving-examples/models/torchscript runtime : kserve-tritonserver runtimeVersion : 20.10-py3 ports : - name : h2c protocol : TCP containerPort : 9000 transformer : containers : - image : kserve/image-transformer:latest name : kserve-container command : - \"python\" - \"-m\" - \"model\" args : - --model_name - cifar10 - --protocol - grpc-v2 apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : torch-grpc-transformer spec : predictor : triton : storageUri : gs://kfserving-examples/models/torchscript runtimeVersion : 20.10-py3 ports : - name : h2c protocol : TCP containerPort : 9000 transformer : containers : - image : kserve/image-transformer:latest name : kserve-container command : - \"python\" - \"-m\" - \"model\" args : - --model_name - cifar10 - --protocol - grpc-v2 Apply the InferenceService grpc_transformer.yaml kubectl apply -f grpc_transformer.yaml Expected Output $ inferenceservice.serving.kserve.io/torch-grpc-transformer created","title":"Create InferenceService"},{"location":"modelserving/v1beta1/transformer/torchserve_image_transformer/#run-a-prediction_1","text":"First, download the request input payload . Then, determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT SERVICE_NAME = torch-grpc-transformer MODEL_NAME = cifar10 INPUT_PATH = @./image.json SERVICE_HOSTNAME = $( kubectl get inferenceservice $SERVICE_NAME -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" -d $INPUT_PATH http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict Expected Output * Trying ::1... 
* TCP_NODELAY set * Connected to localhost ( ::1 ) port 8080 ( #0) > POST /v1/models/cifar10:predict HTTP/1.1 > Host: torch-transformer.default.example.com > User-Agent: curl/7.64.1 > Accept: */* > Content-Length: 3394 > Content-Type: application/x-www-form-urlencoded > Expect: 100 -continue > Handling connection for 8080 < HTTP/1.1 100 Continue * We are completely uploaded and fine < HTTP/1.1 200 OK < content-length: 222 < content-type: application/json ; charset = UTF-8 < date: Thu, 03 Feb 2022 01 :50:07 GMT < server: istio-envoy < x-envoy-upstream-service-time: 73 < * Connection #0 to host localhost left intact { \"predictions\" : [[ -1.192867636680603, -0.35750141739845276, -2.3665435314178467, 3 .9186441898345947, -2.0592284202575684, 4 .091977119445801, 0 .1266237050294876, -1.8284690380096436, 2 .628898859024048, -4.255198001861572 ]]} * Closing connection 0","title":"Run a prediction"},{"location":"modelserving/v1beta1/transformer/torchserve_image_transformer/#performance-comparison-between-grpc-and-rest","text":"From the following latency stats of both transformer and predictor you can see that the transformer to predictor call takes longer time(92ms vs 55ms) for REST than gRPC, REST takes more time serializing and deserializing 3*32*32 shape tensor and with gRPC it is transmitted as tightly packed numpy array serialized bytes. 
# from REST v1 transformer log 2023 -01-09 07 :15:55.263 79476 root INFO [ __call__ () :128 ] requestId: N.A., preprocess_ms: 6 .083965302, explain_ms: 0 , predict_ms: 92 .653036118, postprocess_ms: 0 .007867813 # from REST v1 predictor log 2023 -01-09 07 :16:02.581 79402 root INFO [ __call__ () :128 ] requestId: N.A., preprocess_ms: 13 .532876968, explain_ms: 0 , predict_ms: 48 .450231552, postprocess_ms: 0 .006914139 # from gRPC v2 transformer log 2023 -01-09 07 :27:52.172 79715 root INFO [ __call__ () :128 ] requestId: N.A., preprocess_ms: 2 .567052841, explain_ms: 0 , predict_ms: 55 .0532341, postprocess_ms: 0 .101804733 # from gRPC v2 predictor log 2023 -01-09 07 :27:52.171 79711 root INFO [ __call__ () :128 ] requestId: , preprocess_ms: 0 .067949295, explain_ms: 0 , predict_ms: 51 .237106323, postprocess_ms: 0 .049114227","title":"Performance Comparison between gRPC and REST"},{"location":"modelserving/v1beta1/triton/bert/","text":"QA Inference with BERT model using Triton Inference Server \u00b6 Bidirectional Embedding Representations from Transformers (BERT), is a method of pre-training language representations which obtains state-of-the-art results on a wide array of Natural Language Processing (NLP) tasks. This example demonstrates Inference on Question Answering (QA) task with BERT Base/Large model The use of fine-tuned NVIDIA BERT models Deploy Transformer for preprocess using BERT tokenizer Deploy BERT model on Triton Inference Server Inference with V2 KServe protocol We can run inference on a fine-tuned BERT model for tasks like Question Answering. Here we use a BERT model fine-tuned on a SQuaD 2.0 Dataset which contains 100,000+ question-answer pairs on 500+ articles combined with over 50,000 new, unanswerable questions. Setup \u00b6 Your cluster's Istio Ingress gateway must be network accessible .
Skip tag resolution for nvcr.io which requires auth to resolve triton inference server image digest kubectl patch cm config-deployment --patch '{\"data\":{\"registriesSkippingTagResolving\":\"nvcr.io\"}}' -n knative-serving Increase progress deadline since pulling triton image and big bert model may longer than default timeout for 120s, this setting requires knative 0.15.0+ kubectl patch cm config-deployment --patch '{\"data\":{\"progressDeadline\": \"600s\"}}' -n knative-serving Create Custom Transformer for BERT Tokenizer \u00b6 Extend ModelServer base and Implement pre/postprocess \u00b6 The preprocess handler converts the paragraph and the question to BERT input using BERT tokenizer The predict handler calls Triton Inference Server using PYTHON REST API The postprocess handler converts raw prediction to the answer with the probability class BertTransformer ( kserve . Model ): def __init__ ( self , name : str , predictor_host : str ): super () . __init__ ( name ) self . short_paragraph_text = \"The Apollo program was the third United States human spaceflight program. First conceived as a three-man spacecraft to follow the one-man Project Mercury which put the first Americans in space, Apollo was dedicated to President John F. Kennedy's national goal of landing a man on the Moon. The first manned flight of Apollo was in 1968. Apollo ran from 1961 to 1972 followed by the Apollo-Soyuz Test Project a joint Earth orbit mission with the Soviet Union in 1975.\" self . predictor_host = predictor_host self . tokenizer = tokenization . FullTokenizer ( vocab_file = \"/mnt/models/vocab.txt\" , do_lower_case = True ) self . model_name = \"bert_tf_v2_large_fp16_128_v2\" self . triton_client = None def preprocess ( self , inputs : Dict ) -> Dict : self . doc_tokens = data_processing . convert_doc_tokens ( self . short_paragraph_text ) self . features = data_processing . convert_examples_to_features ( self . doc_tokens , inputs [ \"instances\" ][ 0 ], self . 
tokenizer , 128 , 128 , 64 ) return self . features def predict ( self , features : Dict ) -> Dict : if not self . triton_client : self . triton_client = httpclient . InferenceServerClient ( url = self . predictor_host , verbose = True ) unique_ids = np . zeros ([ 1 , 1 ], dtype = np . int32 ) segment_ids = features [ \"segment_ids\" ] . reshape ( 1 , 128 ) input_ids = features [ \"input_ids\" ] . reshape ( 1 , 128 ) input_mask = features [ \"input_mask\" ] . reshape ( 1 , 128 ) inputs = [] inputs . append ( httpclient . InferInput ( 'unique_ids' , [ 1 , 1 ], \"INT32\" )) inputs . append ( httpclient . InferInput ( 'segment_ids' , [ 1 , 128 ], \"INT32\" )) inputs . append ( httpclient . InferInput ( 'input_ids' , [ 1 , 128 ], \"INT32\" )) inputs . append ( httpclient . InferInput ( 'input_mask' , [ 1 , 128 ], \"INT32\" )) inputs [ 0 ] . set_data_from_numpy ( unique_ids ) inputs [ 1 ] . set_data_from_numpy ( segment_ids ) inputs [ 2 ] . set_data_from_numpy ( input_ids ) inputs [ 3 ] . set_data_from_numpy ( input_mask ) outputs = [] outputs . append ( httpclient . InferRequestedOutput ( 'start_logits' , binary_data = False )) outputs . append ( httpclient . InferRequestedOutput ( 'end_logits' , binary_data = False )) result = self . triton_client . infer ( self . model_name , inputs , outputs = outputs ) return result . get_response () def postprocess ( self , result : Dict ) -> Dict : end_logits = result [ 'outputs' ][ 0 ][ 'data' ] start_logits = result [ 'outputs' ][ 1 ][ 'data' ] n_best_size = 20 # The maximum length of an answer that can be generated. This is needed # because the start and end predictions are not conditioned on one another max_answer_length = 30 ( prediction , nbest_json , scores_diff_json ) = \\ data_processing . get_predictions ( self . doc_tokens , self . 
features , start_logits , end_logits , n_best_size , max_answer_length ) return { \"predictions\" : prediction , \"prob\" : nbest_json [ 0 ][ 'probability' ] * 100.0 } Please find the code example here . Build Transformer docker image \u00b6 Build the KServe Transformer image with above code cd bert_tokenizer_v2 docker build -t $USER /bert_transformer-v2:latest . --rm Or you can use the prebuild image kfserving/bert-transformer-v2:latest Create the InferenceService \u00b6 Add above custom KServe Transformer image and Triton Predictor to the InferenceService spec apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"bert-v2\" spec : transformer : containers : - name : kserve-container image : kfserving/bert-transformer-v2:latest command : - \"python\" - \"-m\" - \"bert_transformer_v2\" env : - name : STORAGE_URI value : \"gs://kfserving-examples/models/triton/bert-transformer\" predictor : triton : runtimeVersion : 20.10-py3 resources : limits : cpu : \"1\" memory : 8Gi requests : cpu : \"1\" memory : 8Gi storageUri : \"gs://kfserving-examples/models/triton/bert\" Apply the InferenceService yaml. 
kubectl apply -f bert_v1beta1.yaml Expected Output $ inferenceservice.serving.kserve.io/bert-v2 created Check the InferenceService \u00b6 kubectl get inferenceservice bert-v2 NAME URL READY AGE bert-v2 http://bert-v2.default.35.229.120.99.xip.io True 71s you will see both transformer and predictor are created and in ready state kubectl get revision -l serving.kserve.io/inferenceservice = bert-v2 NAME CONFIG NAME K8S SERVICE NAME GENERATION READY REASON bert-v2-predictor-default-plhgs bert-v2-predictor-default bert-v2-predictor-default-plhgs 1 True bert-v2-transformer-default-sd6nc bert-v2-transformer-default bert-v2-transformer-default-sd6nc 1 True Run a Prediction \u00b6 The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT Send a question request with following input, the transformer expects sending a list of instances or inputs and preprocess then converts the inputs to expected tensor sending to Triton Inference Server . { \"instances\" : [ \"What President is credited with the original notion of putting Americans in space?\" ] } MODEL_NAME = bert-v2 INPUT_PATH = @./input.json SERVICE_HOSTNAME = $( kubectl get inferenceservices bert-v2 -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" -d $INPUT_PATH http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict Expected output { \"predictions\" : \"John F. Kennedy\" , \"prob\" : 77 .91848979818604 }","title":"Tensorflow"},{"location":"modelserving/v1beta1/triton/bert/#qa-inference-with-bert-model-using-triton-inference-server","text":"Bidirectional Embedding Representations from Transformers (BERT), is a method of pre-training language representations which obtains state-of-the-art results on a wide array of Natural Language Processing (NLP) tasks. 
This example demonstrates Inference on Question Answering (QA) task with BERT Base/Large model The use of fine-tuned NVIDIA BERT models Deploy Transformer for preprocess using BERT tokenizer Deploy BERT model on Triton Inference Server Inference with V2 KServe protocol We can run inference on a fine-tuned BERT model for tasks like Question Answering. Here we use a BERT model fine-tuned on a SQuaD 2.0 Dataset which contains 100,000+ question-answer pairs on 500+ articles combined with over 50,000 new, unanswerable questions.","title":"QA Inference with BERT model using Triton Inference Server"},{"location":"modelserving/v1beta1/triton/bert/#setup","text":"Your cluster's Istio Ingress gateway must be network accessible . Skip tag resolution for nvcr.io which requires auth to resolve triton inference server image digest kubectl patch cm config-deployment --patch '{\"data\":{\"registriesSkippingTagResolving\":\"nvcr.io\"}}' -n knative-serving Increase progress deadline since pulling triton image and big bert model may longer than default timeout for 120s, this setting requires knative 0.15.0+ kubectl patch cm config-deployment --patch '{\"data\":{\"progressDeadline\": \"600s\"}}' -n knative-serving","title":"Setup"},{"location":"modelserving/v1beta1/triton/bert/#create-custom-transformer-for-bert-tokenizer","text":"","title":"Create Custom Transformer for BERT Tokenizer"},{"location":"modelserving/v1beta1/triton/bert/#extend-modelserver-base-and-implement-prepostprocess","text":"The preprocess handler converts the paragraph and the question to BERT input using BERT tokenizer The predict handler calls Triton Inference Server using PYTHON REST API The postprocess handler converts raw prediction to the answer with the probability class BertTransformer ( kserve . Model ): def __init__ ( self , name : str , predictor_host : str ): super () . __init__ ( name ) self . short_paragraph_text = \"The Apollo program was the third United States human spaceflight program. 
First conceived as a three-man spacecraft to follow the one-man Project Mercury which put the first Americans in space, Apollo was dedicated to President John F. Kennedy's national goal of landing a man on the Moon. The first manned flight of Apollo was in 1968. Apollo ran from 1961 to 1972 followed by the Apollo-Soyuz Test Project a joint Earth orbit mission with the Soviet Union in 1975.\" self . predictor_host = predictor_host self . tokenizer = tokenization . FullTokenizer ( vocab_file = \"/mnt/models/vocab.txt\" , do_lower_case = True ) self . model_name = \"bert_tf_v2_large_fp16_128_v2\" self . triton_client = None def preprocess ( self , inputs : Dict ) -> Dict : self . doc_tokens = data_processing . convert_doc_tokens ( self . short_paragraph_text ) self . features = data_processing . convert_examples_to_features ( self . doc_tokens , inputs [ \"instances\" ][ 0 ], self . tokenizer , 128 , 128 , 64 ) return self . features def predict ( self , features : Dict ) -> Dict : if not self . triton_client : self . triton_client = httpclient . InferenceServerClient ( url = self . predictor_host , verbose = True ) unique_ids = np . zeros ([ 1 , 1 ], dtype = np . int32 ) segment_ids = features [ \"segment_ids\" ] . reshape ( 1 , 128 ) input_ids = features [ \"input_ids\" ] . reshape ( 1 , 128 ) input_mask = features [ \"input_mask\" ] . reshape ( 1 , 128 ) inputs = [] inputs . append ( httpclient . InferInput ( 'unique_ids' , [ 1 , 1 ], \"INT32\" )) inputs . append ( httpclient . InferInput ( 'segment_ids' , [ 1 , 128 ], \"INT32\" )) inputs . append ( httpclient . InferInput ( 'input_ids' , [ 1 , 128 ], \"INT32\" )) inputs . append ( httpclient . InferInput ( 'input_mask' , [ 1 , 128 ], \"INT32\" )) inputs [ 0 ] . set_data_from_numpy ( unique_ids ) inputs [ 1 ] . set_data_from_numpy ( segment_ids ) inputs [ 2 ] . set_data_from_numpy ( input_ids ) inputs [ 3 ] . set_data_from_numpy ( input_mask ) outputs = [] outputs . append ( httpclient . 
InferRequestedOutput ( 'start_logits' , binary_data = False )) outputs . append ( httpclient . InferRequestedOutput ( 'end_logits' , binary_data = False )) result = self . triton_client . infer ( self . model_name , inputs , outputs = outputs ) return result . get_response () def postprocess ( self , result : Dict ) -> Dict : end_logits = result [ 'outputs' ][ 0 ][ 'data' ] start_logits = result [ 'outputs' ][ 1 ][ 'data' ] n_best_size = 20 # The maximum length of an answer that can be generated. This is needed # because the start and end predictions are not conditioned on one another max_answer_length = 30 ( prediction , nbest_json , scores_diff_json ) = \\ data_processing . get_predictions ( self . doc_tokens , self . features , start_logits , end_logits , n_best_size , max_answer_length ) return { \"predictions\" : prediction , \"prob\" : nbest_json [ 0 ][ 'probability' ] * 100.0 } Please find the code example here .","title":"Extend ModelServer base and Implement pre/postprocess"},{"location":"modelserving/v1beta1/triton/bert/#build-transformer-docker-image","text":"Build the KServe Transformer image with above code cd bert_tokenizer_v2 docker build -t $USER /bert_transformer-v2:latest . 
--rm Or you can use the prebuild image kfserving/bert-transformer-v2:latest","title":"Build Transformer docker image"},{"location":"modelserving/v1beta1/triton/bert/#create-the-inferenceservice","text":"Add above custom KServe Transformer image and Triton Predictor to the InferenceService spec apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"bert-v2\" spec : transformer : containers : - name : kserve-container image : kfserving/bert-transformer-v2:latest command : - \"python\" - \"-m\" - \"bert_transformer_v2\" env : - name : STORAGE_URI value : \"gs://kfserving-examples/models/triton/bert-transformer\" predictor : triton : runtimeVersion : 20.10-py3 resources : limits : cpu : \"1\" memory : 8Gi requests : cpu : \"1\" memory : 8Gi storageUri : \"gs://kfserving-examples/models/triton/bert\" Apply the InferenceService yaml. kubectl apply -f bert_v1beta1.yaml Expected Output $ inferenceservice.serving.kserve.io/bert-v2 created","title":"Create the InferenceService"},{"location":"modelserving/v1beta1/triton/bert/#check-the-inferenceservice","text":"kubectl get inferenceservice bert-v2 NAME URL READY AGE bert-v2 http://bert-v2.default.35.229.120.99.xip.io True 71s you will see both transformer and predictor are created and in ready state kubectl get revision -l serving.kserve.io/inferenceservice = bert-v2 NAME CONFIG NAME K8S SERVICE NAME GENERATION READY REASON bert-v2-predictor-default-plhgs bert-v2-predictor-default bert-v2-predictor-default-plhgs 1 True bert-v2-transformer-default-sd6nc bert-v2-transformer-default bert-v2-transformer-default-sd6nc 1 True","title":"Check the InferenceService"},{"location":"modelserving/v1beta1/triton/bert/#run-a-prediction","text":"The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT Send a question request with following input, the transformer expects sending a list of instances or inputs and preprocess then converts the inputs to expected tensor 
sending to Triton Inference Server . { \"instances\" : [ \"What President is credited with the original notion of putting Americans in space?\" ] } MODEL_NAME = bert-v2 INPUT_PATH = @./input.json SERVICE_HOSTNAME = $( kubectl get inferenceservices bert-v2 -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" -d $INPUT_PATH http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ $MODEL_NAME :predict Expected output { \"predictions\" : \"John F. Kennedy\" , \"prob\" : 77 .91848979818604 }","title":"Run a Prediction"},{"location":"modelserving/v1beta1/triton/huggingface/","text":"Serve the Huggingface model using Triton Inference Runtime \u00b6 Nvidia Triton Inference Server is a robust serving runtime thanks to its optimized performance, scalability, and flexibility. Combined with the expansive library of Hugging Face, which offers state-of-the-art natural language processing capabilities, it opens up immense possibilities for deploying production-ready Hugging Face transformer based models. By harnessing the power of these tools, here we'll show you how KServe can help further simplify the Triton Inference containers deployment and make efficient use of GPUs by automatically wiring up the open inference protocol between pre/post processing(tokenization) and model inference on triton inference container. Export the Model to Triton format \u00b6 Export the Hugging Face models to supported model formats Torchscript or ONNX in triton model repository layout . For more details, please refer to triton model configuration . Deploy InferenceService with Triton and Hugging Face Runtime \u00b6 Create an InferenceService with triton predictor by specifying the storageUri with the Hugging Face model stored on cloud storage according to triton model repository layout.
The KServe transformer container is created using the KServe Hugging Face runtime for the tokenization step to encode the text tokens and decode the token ids from the output of the triton inference container. The Hugging Face tokenizing container and triton inference container can communicate with either REST or gRPC protocol by specifying the --predictor_protocol=v2 or --predictor_protocol=grpc-v2 . Yaml kubectl apply -f - </ / [config.pbtxt] [ ...] / / ... / [config.pbtxt] [ ...] / / For example in your model repository bucket gs://kfserving-examples/models/torchscript , the layout can be torchscript/ cifar/ config.pbtxt 1/ model.pt The config.pbtxt defines a model configuration that provides the required and optional information for the model. A minimal model configuration must specify name, platform, max_batch_size, input, and output. Due to the absence of names for inputs and outputs in a TorchScript model, the name attribute of both the inputs and outputs in the configuration must follow a specific naming convention i.e. \u201c<name>__<index>\u201d. Where <name> can be any string and <index> refers to the position of the corresponding input/output. This means if there are two inputs and two outputs they must be named as: INPUT__0 , INPUT__1 and OUTPUT__0 , OUTPUT__1 such that INPUT__0 refers to first input and INPUT__1 refers to the second input, etc. name : \"cifar\" platform : \"pytorch_libtorch\" max_batch_size : 1 input [ { name : \"INPUT__0\" data_type : TYPE_FP32 dims : [ 3 , 32 , 32 ] } ] output [ { name : \"OUTPUT__0\" data_type : TYPE_FP32 dims : [ 10 ] } ] instance_group [ { count : 1 kind : KIND_CPU } ] instance_group provides multiple instances of a model so that multiple inference requests for that model can be handled simultaneously.
instance_group [ { count: 4 kind: KIND_CPU } ] To schedule the model on GPU you would need to change the instance_group with GPU kind instance_group [ { count: 1 kind: KIND_GPU } ] For more details, please refer to triton model configuration . Inference with HTTP endpoint \u00b6 Create the InferenceService \u00b6 Create the inference service yaml with the above specified model repository uri. apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : torchscript-cifar10 spec : predictor : triton : storageUri : gs://kfserving-examples/models/torchscript runtimeVersion : 20.10-py3 env : - name : OMP_NUM_THREADS value : \"1\" Warning Setting OMP_NUM_THREADS or MKL_NUM_THREADS envs are critical for performance, these environment variables are used to control the intra-op parallelism for TorchScript model inference, the number of CPU threads defaults to the number of CPU cores. Please refer to CPU threading & TorchScript Inference for more details. kubectl kubectl apply -f torchscript.yaml Expected Output $ inferenceservice.serving.kserve.io/torchscript-cifar10 created Run a prediction with curl \u00b6 The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT The latest Triton Inference Server already switched to use KServe prediction V2 protocol , so the input request needs to follow the V2 schema with the specified data type, shape. 
# download the input file curl -O https://raw.githubusercontent.com/kserve/kserve/master/docs/samples/v1beta1/triton/torchscript/input.json MODEL_NAME = cifar10 INPUT_PATH = @./input.json SERVICE_HOSTNAME = $( kubectl get inferenceservice torchscript-cifar10 -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v2/models/ ${ MODEL_NAME } /infer -d $INPUT_PATH Expected Output * Connected to torchscript-cifar.default.svc.cluster.local ( 10 .51.242.87 ) port 80 ( #0) > POST /v2/models/cifar10/infer HTTP/1.1 > Host: torchscript-cifar.default.svc.cluster.local > User-Agent: curl/7.47.0 > Accept: */* > Content-Length: 110765 > Content-Type: application/x-www-form-urlencoded > Expect: 100 -continue > < HTTP/1.1 100 Continue * We are completely uploaded and fine < HTTP/1.1 200 OK < content-length: 315 < content-type: application/json < date: Sun, 11 Oct 2020 21 :26:51 GMT < x-envoy-upstream-service-time: 8 < server: istio-envoy < * Connection #0 to host torchscript-cifar.default.svc.cluster.local left intact { \"model_name\" : \"cifar10\" , \"model_version\" : \"1\" , \"outputs\" : [{ \"name\" : \"OUTPUT__0\" , \"datatype\" : \"FP32\" , \"shape\" : [ 1 ,10 ] , \"data\" : [ -2.0964810848236086,-0.13700756430625916,-0.5095657706260681,2.795621395111084,-0.5605481863021851,1.9934231042861939,1.1288187503814698,-1.4043136835098267,0.6004879474639893,-2.1237082481384279 ]}]} Run a performance test \u00b6 QPS rate --rate can be changed in the perf.yaml . 
kubectl create -f perf.yaml Requests [ total, rate, throughput ] 6000 , 100 .02, 100 .01 Duration [ total, attack, wait ] 59 .995s, 59 .99s, 4 .961ms Latencies [ min, mean, 50 , 90 , 95 , 99 , max ] 4 .222ms, 5 .7ms, 5 .548ms, 6 .384ms, 6 .743ms, 9 .286ms, 25 .85ms Bytes In [ total, mean ] 1890000 , 315 .00 Bytes Out [ total, mean ] 665874000 , 110979 .00 Success [ ratio ] 100 .00% Status Codes [ code:count ] 200 :6000 Error Set: Inference with gRPC endpoint \u00b6 Create the InferenceService \u00b6 Create the inference service yaml and expose the gRPC port, currently only one port is allowed to expose either HTTP or gRPC port and by default HTTP port is exposed. apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : torchscript-cifar10 spec : predictor : triton : storageUri : gs://kfserving-examples/models/torchscript runtimeVersion : 20.10-py3 ports : - containerPort : 9000 name : h2c protocol : TCP env : - name : OMP_NUM_THREADS value : \"1\" Apply the gRPC InferenceService yaml and then you can call the model with tritonclient python library after InferenceService is ready. kubectl apply -f torchscript_grpc.yaml Run a prediction with grpcurl \u00b6 After the gRPC InferenceService becomes ready, grpcurl , can be used to send gRPC requests to the InferenceService . # download the proto file curl -O https://raw.githubusercontent.com/kserve/kserve/master/docs/predict-api/v2/grpc_predict_v2.proto # download the input json file curl -O https://raw.githubusercontent.com/kserve/website/main/docs/modelserving/v1beta1/triton/torchscript/input-grpc.json INPUT_PATH = input-grpc.json PROTO_FILE = grpc_predict_v2.proto SERVICE_HOSTNAME = $( kubectl get inferenceservice torchscript-cifar10 -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) The gRPC APIs follow the KServe prediction V2 protocol . 
For example, ServerReady API can be used to check if the server is ready: grpcurl \\ -plaintext \\ -proto ${ PROTO_FILE } \\ -authority ${ SERVICE_HOSTNAME } \" \\ ${ INGRESS_HOST } : ${ INGRESS_PORT } \\ inference.GRPCInferenceService.ServerReady Expected Output { \"ready\" : true } ModelInfer API takes input following the ModelInferRequest schema defined in the grpc_predict_v2.proto file. Notice that the input file differs from that used in the previous curl example. grpcurl \\ -vv \\ -plaintext \\ -proto ${ PROTO_FILE } \\ -H \"Host: ${ SERVICE_HOSTNAME } \" \\ -d @ \\ ${ INGRESS_HOST } : ${ INGRESS_PORT } \\ inference.GRPCInferenceService.ModelInfer \\ <<< $( cat \" $INPUT_PATH \" ) Expected Output Resolved method descriptor: // The ModelInfer API performs inference using the specified model. Errors are // indicated by the google.rpc.Status returned for the request. The OK code // indicates success and other codes indicate failure. rpc ModelInfer ( .inference.ModelInferRequest ) returns ( .inference.ModelInferResponse ) ; Request metadata to send: host: torchscript-cifar10.default.example.com Response headers received: accept-encoding: identity,gzip content-type: application/grpc date: Fri, 12 Aug 2022 01 :49:53 GMT grpc-accept-encoding: identity,deflate,gzip server: istio-envoy x-envoy-upstream-service-time: 16 Response contents: { \"modelName\" : \"cifar10\" , \"modelVersion\" : \"1\" , \"outputs\" : [ { \"name\" : \"OUTPUT__0\" , \"datatype\" : \"FP32\" , \"shape\" : [ \"1\" , \"10\" ] } ] , \"rawOutputContents\" : [ \"wCwGwOJLDL7icgK/dusyQAqAD799KP8/In2QP4zAs7+WuRk/2OoHwA==\" ] } Response trailers received: ( empty ) Sent 1 request and received 1 response The content of output tensor is encoded in rawOutputContents field. It can be base64 decoded and loaded into a Numpy array with the given datatype and shape. Alternatively, Triton also provides Python client library which has many examples showing how to interact with the KServe V2 gPRC protocol. 
Add Transformer to the InferenceService \u00b6 Triton Inference Server expects tensors as input data, often times a pre-processing step is required before making the prediction call when the user is sending in request with raw input format. Transformer component can be specified on InferenceService spec for user implemented pre/post processing code. User is responsible to create a python class which extends from KServe Model base class which implements preprocess handler to transform raw input format to tensor format according to V2 prediction protocol, postprocess handle is to convert raw prediction response to a more user friendly response. Implement pre/post processing functions \u00b6 image_transformer_v2.py import kserve from typing import Dict from PIL import Image import torchvision.transforms as transforms import logging import io import numpy as np import base64 logging . basicConfig ( level = kserve . constants . KSERVE_LOGLEVEL ) transform = transforms . Compose ( [ transforms . ToTensor (), transforms . Normalize (( 0.5 , 0.5 , 0.5 ), ( 0.5 , 0.5 , 0.5 ))]) def image_transform ( instance ): byte_array = base64 . b64decode ( instance [ 'image_bytes' ][ 'b64' ]) image = Image . open ( io . BytesIO ( byte_array )) a = np . asarray ( image ) im = Image . fromarray ( a ) res = transform ( im ) logging . info ( res ) return res . tolist () class ImageTransformerV2 ( kserve . Model ): def __init__ ( self , name : str , predictor_host : str , protocol : str ): super () . __init__ ( name ) self . predictor_host = predictor_host self . protocol = protocol def preprocess ( self , inputs : Dict ) -> Dict : return { 'inputs' : [ { 'name' : 'INPUT__0' , 'shape' : [ 1 , 3 , 32 , 32 ], 'datatype' : \"FP32\" , 'data' : [ image_transform ( instance ) for instance in inputs [ 'instances' ]] } ] } def postprocess ( self , results : Dict ) -> Dict : return { output [ \"name\" ]: np . array ( output [ \"data\" ]) . reshape ( output [ \"shape\" ]) . 
tolist () for output in results [ \"outputs\" ]} Please find the code example and Dockerfile . Build Transformer docker image \u00b6 docker build -t $DOCKER_USER /image-transformer-v2:latest -f transformer.Dockerfile . --rm Create the InferenceService with Transformer \u00b6 Please use the YAML file to create the InferenceService, which adds the image transformer component with the docker image built from above. apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : torch-transfomer spec : predictor : triton : storageUri : gs://kfserving-examples/models/torchscript runtimeVersion : 20.10-py3 env : - name : OMP_NUM_THREADS value : \"1\" transformer : containers : - image : kfserving/image-transformer-v2:latest name : kserve-container command : - \"python\" - \"-m\" - \"image_transformer_v2\" args : - --model_name - cifar10 - --protocol - v2 kubectl apply -f torch_transformer.yaml Expected Output $ inferenceservice.serving.kserve.io/torch-transfomer created Run a prediction with curl \u00b6 The transformer does not enforce a specific schema like predictor but the general recommendation is to send in as a list of object(dict): \"instances\": | { \"instances\" : [ { \"image_bytes\" : { \"b64\" : \"aW1hZ2UgYnl0ZXM=\" }, \"caption\" : \"seaside\" }, { \"image_bytes\" : { \"b64\" : \"YXdlc29tZSBpbWFnZSBieXRlcw==\" }, \"caption\" : \"mountains\" } ] } # download the input file curl -O https://raw.githubusercontent.com/kserve/kserve/master/docs/samples/v1beta1/triton/torchscript/image.json SERVICE_NAME = torch-transfomer MODEL_NAME = cifar10 INPUT_PATH = @./image.json SERVICE_HOSTNAME = $( kubectl get inferenceservice $SERVICE_NAME -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ ${ MODEL_NAME } :predict -d $INPUT_PATH Expected Output > POST /v1/models/cifar10:predict HTTP/1.1 > Host: 
torch-transformer.kserve-triton.example.com > User-Agent: curl/7.68.0 > Accept: */* > Content-Length: 3400 > Content-Type: application/x-www-form-urlencoded > Expect: 100 -continue > * Mark bundle as not supporting multiuse < HTTP/1.1 100 Continue * We are completely uploaded and fine * Mark bundle as not supporting multiuse < HTTP/1.1 200 OK < content-length: 219 < content-type: application/json ; charset = UTF-8 < date: Sat, 19 Mar 2022 12 :15:54 GMT < server: istio-envoy < x-envoy-upstream-service-time: 41 < { \"OUTPUT__0\" : [[ -2.0964810848236084, -0.137007474899292, -0.5095658302307129, 2 .795621395111084, -0.560547947883606, 1 .9934231042861938, 1 .1288189888000488, - 4043136835098267 , 0 .600488007068634, -2.1237082481384277 ]]} %","title":"Torchscript"},{"location":"modelserving/v1beta1/triton/torchscript/#predict-on-a-triton-inferenceservice-with-torchscript-model","text":"While Python is a suitable and preferred language for many scenarios requiring dynamism and ease of iteration, there are equally many situations where precisely these properties of Python are unfavorable. One environment in which the latter often applies is production \u2013 the land of low latencies and strict deployment requirements. 
For production scenarios, C++ is very often the language of choice, The following example will outline the path PyTorch provides to go from an existing Python model to a serialized representation that can be loaded and executed purely from C++ like Triton Inference Server, with no dependency on Python.","title":"Predict on a Triton InferenceService with TorchScript model"},{"location":"modelserving/v1beta1/triton/torchscript/#setup","text":"Make sure you have installed KServe Skip tag resolution for nvcr.io which requires auth to resolve triton inference server image digest kubectl patch cm config-deployment --patch '{\"data\":{\"registriesSkippingTagResolving\":\"nvcr.io\"}}' -n knative-serving Increase progress deadline since pulling triton image and big bert model may longer than default timeout for 120s, this setting requires knative 0.15.0+ kubectl patch cm config-deployment --patch '{\"data\":{\"progressDeadline\": \"600s\"}}' -n knative-serving","title":"Setup"},{"location":"modelserving/v1beta1/triton/torchscript/#export-as-torchscript-model","text":"A PyTorch model\u2019s journey from Python to C++ is enabled by Torch Script , a representation of a PyTorch model that can be understood, compiled and serialized by the Torch Script compiler. If you are starting out from an existing PyTorch model written in the vanilla eager API, you must first convert your model to Torch Script. Convert the above model via Tracing and serialize the script module to a file import torch # Use torch.jit.trace to generate a torch.jit.ScriptModule via tracing. example = torch . rand ( 1 , 3 , 32 , 32 ) traced_script_module = torch . jit . trace ( net , example ) traced_script_module . save ( \"model.pt\" )","title":"Export as Torchscript Model"},{"location":"modelserving/v1beta1/triton/torchscript/#store-your-trained-model-on-cloud-storage-in-a-model-repository","text":"Once the model is exported as TorchScript model file, the next step is to upload the model to a GCS bucket. 
Triton supports loading multiple models so it expects a model repository which follows a required layout in the bucket. / / [config.pbtxt] [ ...] / / ... / [config.pbtxt] [ ...] / / For example in your model repository bucket gs://kfserving-examples/models/torchscript , the layout can be torchscript/ cifar/ config.pbtxt 1/ model.pt The config.pbtxt defines a model configuration that provides the required and optional information for the model. A minimal model configuration must specify name, platform, max_batch_size, input, and output. Due to the absence of names for inputs and outputs in a TorchScript model, the name attribute of both the inputs and outputs in the configuration must follow a specific naming convention i.e. \u201c __ \u201d. Where can be any string and refers to the position of the corresponding input/output. This means if there are two inputs and two outputs they must be named as: INPUT__0 , INPUT__1 and OUTPUT__0 , OUTPUT__1 such that INPUT__0 refers to first input and INPUT__1 refers to the second input, etc. na me : \"cifar\" pla tf orm : \"pytorch_libtorch\" max_ba t ch_size : 1 i n pu t [ { na me : \"INPUT__0\" da ta _ t ype : TYPE_FP 32 dims : [ 3 , 32 , 32 ] } ] ou t pu t [ { na me : \"OUTPUT__0\" da ta _ t ype : TYPE_FP 32 dims : [ 10 ] } ] i nstan ce_group [ { cou nt : 1 ki n d : KIND_CPU } ] instance_group provides multiple instances of a model so that multiple inference requests for that model can be handled simultaneously. 
instance_group [ { count: 4 kind: KIND_CPU } ] To schedule the model on GPU you would need to change the instance_group with GPU kind instance_group [ { count: 1 kind: KIND_GPU } ] For more details, please refer to triton model configuration .","title":"Store your trained model on cloud storage in a Model Repository"},{"location":"modelserving/v1beta1/triton/torchscript/#inference-with-http-endpoint","text":"","title":"Inference with HTTP endpoint"},{"location":"modelserving/v1beta1/triton/torchscript/#create-the-inferenceservice","text":"Create the inference service yaml with the above specified model repository uri. apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : torchscript-cifar10 spec : predictor : triton : storageUri : gs://kfserving-examples/models/torchscript runtimeVersion : 20.10-py3 env : - name : OMP_NUM_THREADS value : \"1\" Warning Setting OMP_NUM_THREADS or MKL_NUM_THREADS envs are critical for performance, these environment variables are used to control the intra-op parallelism for TorchScript model inference, the number of CPU threads defaults to the number of CPU cores. Please refer to CPU threading & TorchScript Inference for more details. kubectl kubectl apply -f torchscript.yaml Expected Output $ inferenceservice.serving.kserve.io/torchscript-cifar10 created","title":"Create the InferenceService"},{"location":"modelserving/v1beta1/triton/torchscript/#run-a-prediction-with-curl","text":"The first step is to determine the ingress IP and ports and set INGRESS_HOST and INGRESS_PORT The latest Triton Inference Server already switched to use KServe prediction V2 protocol , so the input request needs to follow the V2 schema with the specified data type, shape. 
# download the input file curl -O https://raw.githubusercontent.com/kserve/kserve/master/docs/samples/v1beta1/triton/torchscript/input.json MODEL_NAME = cifar10 INPUT_PATH = @./input.json SERVICE_HOSTNAME = $( kubectl get inferenceservice torchscript-cifar10 -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v2/models/ ${ MODEL_NAME } /infer -d $INPUT_PATH Expected Output * Connected to torchscript-cifar.default.svc.cluster.local ( 10 .51.242.87 ) port 80 ( #0) > POST /v2/models/cifar10/infer HTTP/1.1 > Host: torchscript-cifar.default.svc.cluster.local > User-Agent: curl/7.47.0 > Accept: */* > Content-Length: 110765 > Content-Type: application/x-www-form-urlencoded > Expect: 100 -continue > < HTTP/1.1 100 Continue * We are completely uploaded and fine < HTTP/1.1 200 OK < content-length: 315 < content-type: application/json < date: Sun, 11 Oct 2020 21 :26:51 GMT < x-envoy-upstream-service-time: 8 < server: istio-envoy < * Connection #0 to host torchscript-cifar.default.svc.cluster.local left intact { \"model_name\" : \"cifar10\" , \"model_version\" : \"1\" , \"outputs\" : [{ \"name\" : \"OUTPUT__0\" , \"datatype\" : \"FP32\" , \"shape\" : [ 1 ,10 ] , \"data\" : [ -2.0964810848236086,-0.13700756430625916,-0.5095657706260681,2.795621395111084,-0.5605481863021851,1.9934231042861939,1.1288187503814698,-1.4043136835098267,0.6004879474639893,-2.1237082481384279 ]}]}","title":"Run a prediction with curl"},{"location":"modelserving/v1beta1/triton/torchscript/#run-a-performance-test","text":"QPS rate --rate can be changed in the perf.yaml . 
kubectl create -f perf.yaml Requests [ total, rate, throughput ] 6000 , 100 .02, 100 .01 Duration [ total, attack, wait ] 59 .995s, 59 .99s, 4 .961ms Latencies [ min, mean, 50 , 90 , 95 , 99 , max ] 4 .222ms, 5 .7ms, 5 .548ms, 6 .384ms, 6 .743ms, 9 .286ms, 25 .85ms Bytes In [ total, mean ] 1890000 , 315 .00 Bytes Out [ total, mean ] 665874000 , 110979 .00 Success [ ratio ] 100 .00% Status Codes [ code:count ] 200 :6000 Error Set:","title":"Run a performance test"},{"location":"modelserving/v1beta1/triton/torchscript/#inference-with-grpc-endpoint","text":"","title":"Inference with gRPC endpoint"},{"location":"modelserving/v1beta1/triton/torchscript/#create-the-inferenceservice_1","text":"Create the inference service yaml and expose the gRPC port, currently only one port is allowed to expose either HTTP or gRPC port and by default HTTP port is exposed. apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : torchscript-cifar10 spec : predictor : triton : storageUri : gs://kfserving-examples/models/torchscript runtimeVersion : 20.10-py3 ports : - containerPort : 9000 name : h2c protocol : TCP env : - name : OMP_NUM_THREADS value : \"1\" Apply the gRPC InferenceService yaml and then you can call the model with tritonclient python library after InferenceService is ready. kubectl apply -f torchscript_grpc.yaml","title":"Create the InferenceService"},{"location":"modelserving/v1beta1/triton/torchscript/#run-a-prediction-with-grpcurl","text":"After the gRPC InferenceService becomes ready, grpcurl , can be used to send gRPC requests to the InferenceService . 
# download the proto file curl -O https://raw.githubusercontent.com/kserve/kserve/master/docs/predict-api/v2/grpc_predict_v2.proto # download the input json file curl -O https://raw.githubusercontent.com/kserve/website/main/docs/modelserving/v1beta1/triton/torchscript/input-grpc.json INPUT_PATH = input-grpc.json PROTO_FILE = grpc_predict_v2.proto SERVICE_HOSTNAME = $( kubectl get inferenceservice torchscript-cifar10 -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) The gRPC APIs follow the KServe prediction V2 protocol . For example, ServerReady API can be used to check if the server is ready: grpcurl \\ -plaintext \\ -proto ${ PROTO_FILE } \\ -authority ${ SERVICE_HOSTNAME } \" \\ ${ INGRESS_HOST } : ${ INGRESS_PORT } \\ inference.GRPCInferenceService.ServerReady Expected Output { \"ready\" : true } ModelInfer API takes input following the ModelInferRequest schema defined in the grpc_predict_v2.proto file. Notice that the input file differs from that used in the previous curl example. grpcurl \\ -vv \\ -plaintext \\ -proto ${ PROTO_FILE } \\ -H \"Host: ${ SERVICE_HOSTNAME } \" \\ -d @ \\ ${ INGRESS_HOST } : ${ INGRESS_PORT } \\ inference.GRPCInferenceService.ModelInfer \\ <<< $( cat \" $INPUT_PATH \" ) Expected Output Resolved method descriptor: // The ModelInfer API performs inference using the specified model. Errors are // indicated by the google.rpc.Status returned for the request. The OK code // indicates success and other codes indicate failure. 
rpc ModelInfer ( .inference.ModelInferRequest ) returns ( .inference.ModelInferResponse ) ; Request metadata to send: host: torchscript-cifar10.default.example.com Response headers received: accept-encoding: identity,gzip content-type: application/grpc date: Fri, 12 Aug 2022 01 :49:53 GMT grpc-accept-encoding: identity,deflate,gzip server: istio-envoy x-envoy-upstream-service-time: 16 Response contents: { \"modelName\" : \"cifar10\" , \"modelVersion\" : \"1\" , \"outputs\" : [ { \"name\" : \"OUTPUT__0\" , \"datatype\" : \"FP32\" , \"shape\" : [ \"1\" , \"10\" ] } ] , \"rawOutputContents\" : [ \"wCwGwOJLDL7icgK/dusyQAqAD799KP8/In2QP4zAs7+WuRk/2OoHwA==\" ] } Response trailers received: ( empty ) Sent 1 request and received 1 response The content of output tensor is encoded in rawOutputContents field. It can be base64 decoded and loaded into a Numpy array with the given datatype and shape. Alternatively, Triton also provides Python client library which has many examples showing how to interact with the KServe V2 gPRC protocol.","title":"Run a prediction with grpcurl"},{"location":"modelserving/v1beta1/triton/torchscript/#add-transformer-to-the-inferenceservice","text":"Triton Inference Server expects tensors as input data, often times a pre-processing step is required before making the prediction call when the user is sending in request with raw input format. Transformer component can be specified on InferenceService spec for user implemented pre/post processing code. 
User is responsible to create a python class which extends from KServe Model base class which implements preprocess handler to transform raw input format to tensor format according to V2 prediction protocol, postprocess handle is to convert raw prediction response to a more user friendly response.","title":"Add Transformer to the InferenceService"},{"location":"modelserving/v1beta1/triton/torchscript/#implement-prepost-processing-functions","text":"image_transformer_v2.py import kserve from typing import Dict from PIL import Image import torchvision.transforms as transforms import logging import io import numpy as np import base64 logging . basicConfig ( level = kserve . constants . KSERVE_LOGLEVEL ) transform = transforms . Compose ( [ transforms . ToTensor (), transforms . Normalize (( 0.5 , 0.5 , 0.5 ), ( 0.5 , 0.5 , 0.5 ))]) def image_transform ( instance ): byte_array = base64 . b64decode ( instance [ 'image_bytes' ][ 'b64' ]) image = Image . open ( io . BytesIO ( byte_array )) a = np . asarray ( image ) im = Image . fromarray ( a ) res = transform ( im ) logging . info ( res ) return res . tolist () class ImageTransformerV2 ( kserve . Model ): def __init__ ( self , name : str , predictor_host : str , protocol : str ): super () . __init__ ( name ) self . predictor_host = predictor_host self . protocol = protocol def preprocess ( self , inputs : Dict ) -> Dict : return { 'inputs' : [ { 'name' : 'INPUT__0' , 'shape' : [ 1 , 3 , 32 , 32 ], 'datatype' : \"FP32\" , 'data' : [ image_transform ( instance ) for instance in inputs [ 'instances' ]] } ] } def postprocess ( self , results : Dict ) -> Dict : return { output [ \"name\" ]: np . array ( output [ \"data\" ]) . reshape ( output [ \"shape\" ]) . 
tolist () for output in results [ \"outputs\" ]} Please find the code example and Dockerfile .","title":"Implement pre/post processing functions"},{"location":"modelserving/v1beta1/triton/torchscript/#build-transformer-docker-image","text":"docker build -t $DOCKER_USER /image-transformer-v2:latest -f transformer.Dockerfile . --rm","title":"Build Transformer docker image"},{"location":"modelserving/v1beta1/triton/torchscript/#create-the-inferenceservice-with-transformer","text":"Please use the YAML file to create the InferenceService, which adds the image transformer component with the docker image built from above. apiVersion : serving.kserve.io/v1beta1 kind : InferenceService metadata : name : torch-transfomer spec : predictor : triton : storageUri : gs://kfserving-examples/models/torchscript runtimeVersion : 20.10-py3 env : - name : OMP_NUM_THREADS value : \"1\" transformer : containers : - image : kfserving/image-transformer-v2:latest name : kserve-container command : - \"python\" - \"-m\" - \"image_transformer_v2\" args : - --model_name - cifar10 - --protocol - v2 kubectl apply -f torch_transformer.yaml Expected Output $ inferenceservice.serving.kserve.io/torch-transfomer created","title":"Create the InferenceService with Transformer"},{"location":"modelserving/v1beta1/triton/torchscript/#run-a-prediction-with-curl_1","text":"The transformer does not enforce a specific schema like predictor but the general recommendation is to send in as a list of object(dict): \"instances\": | { \"instances\" : [ { \"image_bytes\" : { \"b64\" : \"aW1hZ2UgYnl0ZXM=\" }, \"caption\" : \"seaside\" }, { \"image_bytes\" : { \"b64\" : \"YXdlc29tZSBpbWFnZSBieXRlcw==\" }, \"caption\" : \"mountains\" } ] } # download the input file curl -O https://raw.githubusercontent.com/kserve/kserve/master/docs/samples/v1beta1/triton/torchscript/image.json SERVICE_NAME = torch-transfomer MODEL_NAME = cifar10 INPUT_PATH = @./image.json SERVICE_HOSTNAME = $( kubectl get inferenceservice $SERVICE_NAME 
-o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v -H \"Host: ${ SERVICE_HOSTNAME } \" -H \"Content-Type: application/json\" http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v1/models/ ${ MODEL_NAME } :predict -d $INPUT_PATH Expected Output > POST /v1/models/cifar10:predict HTTP/1.1 > Host: torch-transformer.kserve-triton.example.com > User-Agent: curl/7.68.0 > Accept: */* > Content-Length: 3400 > Content-Type: application/x-www-form-urlencoded > Expect: 100 -continue > * Mark bundle as not supporting multiuse < HTTP/1.1 100 Continue * We are completely uploaded and fine * Mark bundle as not supporting multiuse < HTTP/1.1 200 OK < content-length: 219 < content-type: application/json ; charset = UTF-8 < date: Sat, 19 Mar 2022 12 :15:54 GMT < server: istio-envoy < x-envoy-upstream-service-time: 41 < { \"OUTPUT__0\" : [[ -2.0964810848236084, -0.137007474899292, -0.5095658302307129, 2 .795621395111084, -0.560547947883606, 1 .9934231042861938, 1 .1288189888000488, - 4043136835098267 , 0 .600488007068634, -2.1237082481384277 ]]} %","title":"Run a prediction with curl"},{"location":"modelserving/v1beta1/xgboost/","text":"Deploying XGBoost models with InferenceService \u00b6 This example walks you through how to deploy a xgboost model using KServe's InferenceService CRD. Note that, by default it exposes your model through an API compatible with the existing V1 Dataplane. This example will show you how to serve a model through an API compatible with the Open Inference Protocol . Train the Model \u00b6 The first step will be to train a sample xgboost model. We will save this model as model.bst . import xgboost as xgb from sklearn.datasets import load_iris import os model_dir = \".\" BST_FILE = \"model.bst\" iris = load_iris () y = iris [ 'target' ] X = iris [ 'data' ] dtrain = xgb . DMatrix ( X , label = y ) param = { 'max_depth' : 6 , 'eta' : 0.1 , 'silent' : 1 , 'nthread' : 4 , 'num_class' : 10 , 'objective' : 'multi:softmax' } xgb_model = xgb . 
train ( params = param , dtrain = dtrain ) model_file = os . path . join (( model_dir ), BST_FILE ) xgb_model . save_model ( model_file ) Test the model locally \u00b6 Once you've got your model serialized model.bst , we can then use KServe XGBoost Server to spin up a local server. Note This step is optional and just meant for testing, feel free to jump straight to deploying with InferenceService . Pre-requisites \u00b6 Firstly, to use kserve xgboost server locally, you will first need to install the xgbserver runtime package in your local environment. Clone the Kserve repository and navigate into the directory. git clone https://github.com/kserve/kserve Install xgbserver runtime. Kserve uses Poetry as the dependency management tool. Make sure you have already installed poetry . cd python/xgbserver poetry install Serving model locally \u00b6 The xgbserver package takes three arguments. --model_dir : The model directory path where the model is stored. --model_name : The name of the model deployed in the model server, the default value is model . This is optional. --nthread : Number of threads to use by LightGBM. This is optional and the default value is 1. With the xgbserver runtime package installed locally, you should now be ready to start our server as: python3 xgbserver --model_dir /path/to/model_dir --model_name xgboost-v2-iris Deploy the Model with REST endpoint through InferenceService \u00b6 Lastly, we use KServe to deploy our trained model on Kubernetes. For this, we use the InferenceService CRD and set the protocolVersion field to v2 . Yaml apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"xgboost-v2-iris\" spec : predictor : model : modelFormat : name : xgboost protocolVersion : v2 runtime : kserve-xgbserver storageUri : \"gs://kfserving-examples/models/xgboost/iris\" Note For V2 protocol (open inference protocol) if runtime field is not provided then, by default mlserver runtime is used. 
Assuming that we've got a cluster accessible through kubectl with KServe already installed, we can deploy our model as: kubectl apply -f xgboost.yaml Test the Deployed Model \u00b6 We can now test our deployed model by sending a sample request. Note that this request needs to follow the Open Inference Protocol . You can see an example payload below. Create a file named iris-input-v2.json with the sample input. { \"inputs\" : [ { \"name\" : \"input-0\" , \"shape\" : [ 2 , 4 ], \"datatype\" : \"FP32\" , \"data\" : [ [ 6.8 , 2.8 , 4.8 , 1.4 ], [ 6.0 , 3.4 , 4.5 , 1.6 ] ] } ] } Determine the ingress IP and port and set INGRESS_HOST and INGRESS_PORT . Now, you can use curl to send the inference request as: SERVICE_HOSTNAME = $( kubectl get inferenceservice xgboost-v2-iris -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v \\ -H \"Host: ${ SERVICE_HOSTNAME } \" \\ -H \"Content-Type: application/json\" \\ -d @./iris-input-v2.json \\ http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v2/models/xgboost-v2-iris/infer The output will be something similar to: Expected Output { \"id\" : \"4e546709-0887-490a-abd6-00cbc4c26cf4\" , \"model_name\" : \"xgboost-v2-iris\" , \"model_version\" : \"v1.0.0\" , \"outputs\" : [ { \"data\" : [ 1.0 , 1.0 ], \"datatype\" : \"FP32\" , \"name\" : \"predict\" , \"parameters\" : null , \"shape\" : [ 2 ] } ] } Deploy the Model with GRPC endpoint through InferenceService \u00b6 Create the inference service resource and expose the gRPC port using the below yaml. Note Currently, KServe only supports exposing either HTTP or gRPC port. By default, HTTP port is exposed. 
Serverless RawDeployment apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"xgboost-v2-iris-grpc\" spec : predictor : model : modelFormat : name : xgboost protocolVersion : v2 runtime : kserve-xgbserver storageUri : \"gs://kfserving-examples/models/xgboost/iris\" ports : - name : h2c # knative expects grpc port name to be 'h2c' protocol : TCP containerPort : 8081 apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"xgboost-v2-iris-grpc\" spec : predictor : model : modelFormat : name : xgboost protocolVersion : v2 runtime : kserve-xgbserver storageUri : \"gs://kfserving-examples/models/xgboost/iris\" ports : - name : grpc-port # Istio requires the port name to be in the format [-] protocol : TCP containerPort : 8081 Note For V2 protocol (open inference protocol) if runtime field is not provided then, by default mlserver runtime is used. Apply the InferenceService yaml to get the gRPC endpoint kubectl kubectl apply -f xgboost-v2-grpc.yaml Test the deployed model with grpcurl \u00b6 After the gRPC InferenceService becomes ready, grpcurl , can be used to send gRPC requests to the InferenceService . # download the proto file curl -O https://raw.githubusercontent.com/kserve/open-inference-protocol/main/specification/protocol/open_inference_grpc.proto INPUT_PATH = iris-input-v2-grpc.json PROTO_FILE = open_inference_grpc.proto SERVICE_HOSTNAME = $( kubectl get inferenceservice xgboost-iris-v2 -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) Determine the ingress IP and port and set INGRESS_HOST and INGRESS_PORT . Now, you can use curl to send the inference requests. The gRPC APIs follows the KServe prediction V2 protocol / Open Inference Protocol . 
For example, ServerReady API can be used to check if the server is ready: grpcurl \\ -plaintext \\ -proto ${ PROTO_FILE } \\ -authority ${ SERVICE_HOSTNAME } \\ ${ INGRESS_HOST } : ${ INGRESS_PORT } \\ inference.GRPCInferenceService.ServerReady Expected Output { \"ready\" : true } You can test the deployed model by sending a sample request with the below payload. Notice that the input format differs from the in the previous REST endpoint example. Prepare the inference input inside the file named iris-input-v2-grpc.json . { \"model_name\" : \"xgboost-v2-iris-grpc\" , \"inputs\" : [ { \"name\" : \"input-0\" , \"shape\" : [ 2 , 4 ], \"datatype\" : \"FP32\" , \"contents\" : { \"fp32_contents\" : [ 6.8 , 2.8 , 4.8 , 1.4 , 6.0 , 3.4 , 4.5 , 1.6 ] } } ] } ModelInfer API takes input following the ModelInferRequest schema defined in the grpc_predict_v2.proto file. grpcurl \\ -vv \\ -plaintext \\ -proto ${ PROTO_FILE } \\ -authority ${ SERVICE_HOSTNAME } \\ -d @ \\ ${ INGRESS_HOST } : ${ INGRESS_PORT } \\ inference.GRPCInferenceService.ModelInfer \\ <<< $( cat \" $INPUT_PATH \" ) Expected Output Resolved method descriptor: // The ModelInfer API performs inference using the specified model. Errors are // indicated by the google.rpc.Status returned for the request. The OK code // indicates success and other codes indicate failure. 
rpc ModelInfer ( .inference.ModelInferRequest ) returns ( .inference.ModelInferResponse ) ; Request metadata to send: ( empty ) Response headers received: content-type: application/grpc date: Mon, 09 Oct 2023 11 :07:26 GMT grpc-accept-encoding: identity, deflate, gzip server: istio-envoy x-envoy-upstream-service-time: 16 Estimated response size: 83 bytes Response contents: { \"modelName\" : \"xgboost-v2-iris-grpc\" , \"id\" : \"41738561-7219-4e4a-984d-5fe19bed6298\" , \"outputs\" : [ { \"name\" : \"output-0\" , \"datatype\" : \"INT32\" , \"shape\" : [ \"2\" ] , \"contents\" : { \"intContents\" : [ 1 , 1 ] } } ] } Response trailers received: ( empty ) Sent 1 request and received 1 response","title":"XGBoost"},{"location":"modelserving/v1beta1/xgboost/#deploying-xgboost-models-with-inferenceservice","text":"This example walks you through how to deploy a xgboost model using KServe's InferenceService CRD. Note that, by default it exposes your model through an API compatible with the existing V1 Dataplane. This example will show you how to serve a model through an API compatible with the Open Inference Protocol .","title":"Deploying XGBoost models with InferenceService"},{"location":"modelserving/v1beta1/xgboost/#train-the-model","text":"The first step will be to train a sample xgboost model. We will save this model as model.bst . import xgboost as xgb from sklearn.datasets import load_iris import os model_dir = \".\" BST_FILE = \"model.bst\" iris = load_iris () y = iris [ 'target' ] X = iris [ 'data' ] dtrain = xgb . DMatrix ( X , label = y ) param = { 'max_depth' : 6 , 'eta' : 0.1 , 'silent' : 1 , 'nthread' : 4 , 'num_class' : 10 , 'objective' : 'multi:softmax' } xgb_model = xgb . train ( params = param , dtrain = dtrain ) model_file = os . path . join (( model_dir ), BST_FILE ) xgb_model . 
save_model ( model_file )","title":"Train the Model"},{"location":"modelserving/v1beta1/xgboost/#test-the-model-locally","text":"Once you've got your model serialized model.bst , we can then use KServe XGBoost Server to spin up a local server. Note This step is optional and just meant for testing, feel free to jump straight to deploying with InferenceService .","title":"Test the model locally"},{"location":"modelserving/v1beta1/xgboost/#pre-requisites","text":"To use the KServe XGBoost server locally, you will first need to install the xgbserver runtime package in your local environment. Clone the KServe repository and navigate into the directory. git clone https://github.com/kserve/kserve Install xgbserver runtime. KServe uses Poetry as the dependency management tool. Make sure you have already installed poetry . cd python/xgbserver poetry install","title":"Pre-requisites"},{"location":"modelserving/v1beta1/xgboost/#serving-model-locally","text":"The xgbserver package takes three arguments. --model_dir : The model directory path where the model is stored. --model_name : The name of the model deployed in the model server, the default value is model . This is optional. --nthread : Number of threads to use by XGBoost. This is optional and the default value is 1. With the xgbserver runtime package installed locally, you should now be ready to start our server as: python3 xgbserver --model_dir /path/to/model_dir --model_name xgboost-v2-iris","title":"Serving model locally"},{"location":"modelserving/v1beta1/xgboost/#deploy-the-model-with-rest-endpoint-through-inferenceservice","text":"Lastly, we use KServe to deploy our trained model on Kubernetes. For this, we use the InferenceService CRD and set the protocolVersion field to v2 . 
Yaml apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"xgboost-v2-iris\" spec : predictor : model : modelFormat : name : xgboost protocolVersion : v2 runtime : kserve-xgbserver storageUri : \"gs://kfserving-examples/models/xgboost/iris\" Note For V2 protocol (open inference protocol) if runtime field is not provided then, by default mlserver runtime is used. Assuming that we've got a cluster accessible through kubectl with KServe already installed, we can deploy our model as: kubectl apply -f xgboost.yaml","title":"Deploy the Model with REST endpoint through InferenceService"},{"location":"modelserving/v1beta1/xgboost/#test-the-deployed-model","text":"We can now test our deployed model by sending a sample request. Note that this request needs to follow the Open Inference Protocol . You can see an example payload below. Create a file named iris-input-v2.json with the sample input. { \"inputs\" : [ { \"name\" : \"input-0\" , \"shape\" : [ 2 , 4 ], \"datatype\" : \"FP32\" , \"data\" : [ [ 6.8 , 2.8 , 4.8 , 1.4 ], [ 6.0 , 3.4 , 4.5 , 1.6 ] ] } ] } Determine the ingress IP and port and set INGRESS_HOST and INGRESS_PORT . 
Now, you can use curl to send the inference request as: SERVICE_HOSTNAME = $( kubectl get inferenceservice xgboost-v2-iris -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) curl -v \\ -H \"Host: ${ SERVICE_HOSTNAME } \" \\ -H \"Content-Type: application/json\" \\ -d @./iris-input-v2.json \\ http:// ${ INGRESS_HOST } : ${ INGRESS_PORT } /v2/models/xgboost-v2-iris/infer The output will be something similar to: Expected Output { \"id\" : \"4e546709-0887-490a-abd6-00cbc4c26cf4\" , \"model_name\" : \"xgboost-v2-iris\" , \"model_version\" : \"v1.0.0\" , \"outputs\" : [ { \"data\" : [ 1.0 , 1.0 ], \"datatype\" : \"FP32\" , \"name\" : \"predict\" , \"parameters\" : null , \"shape\" : [ 2 ] } ] }","title":"Test the Deployed Model"},{"location":"modelserving/v1beta1/xgboost/#deploy-the-model-with-grpc-endpoint-through-inferenceservice","text":"Create the inference service resource and expose the gRPC port using the below yaml. Note Currently, KServe only supports exposing either HTTP or gRPC port. By default, HTTP port is exposed. Serverless RawDeployment apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"xgboost-v2-iris-grpc\" spec : predictor : model : modelFormat : name : xgboost protocolVersion : v2 runtime : kserve-xgbserver storageUri : \"gs://kfserving-examples/models/xgboost/iris\" ports : - name : h2c # knative expects grpc port name to be 'h2c' protocol : TCP containerPort : 8081 apiVersion : \"serving.kserve.io/v1beta1\" kind : \"InferenceService\" metadata : name : \"xgboost-v2-iris-grpc\" spec : predictor : model : modelFormat : name : xgboost protocolVersion : v2 runtime : kserve-xgbserver storageUri : \"gs://kfserving-examples/models/xgboost/iris\" ports : - name : grpc-port # Istio requires the port name to be in the format [-] protocol : TCP containerPort : 8081 Note For V2 protocol (open inference protocol) if runtime field is not provided then, by default mlserver runtime is used. 
Apply the InferenceService yaml to get the gRPC endpoint kubectl kubectl apply -f xgboost-v2-grpc.yaml","title":"Deploy the Model with GRPC endpoint through InferenceService"},{"location":"modelserving/v1beta1/xgboost/#test-the-deployed-model-with-grpcurl","text":"After the gRPC InferenceService becomes ready, grpcurl , can be used to send gRPC requests to the InferenceService . # download the proto file curl -O https://raw.githubusercontent.com/kserve/open-inference-protocol/main/specification/protocol/open_inference_grpc.proto INPUT_PATH = iris-input-v2-grpc.json PROTO_FILE = open_inference_grpc.proto SERVICE_HOSTNAME = $( kubectl get inferenceservice xgboost-v2-iris-grpc -o jsonpath = '{.status.url}' | cut -d \"/\" -f 3 ) Determine the ingress IP and port and set INGRESS_HOST and INGRESS_PORT . Now, you can use grpcurl to send the inference requests. The gRPC APIs follow the KServe prediction V2 protocol / Open Inference Protocol . For example, ServerReady API can be used to check if the server is ready: grpcurl \\ -plaintext \\ -proto ${ PROTO_FILE } \\ -authority ${ SERVICE_HOSTNAME } \\ ${ INGRESS_HOST } : ${ INGRESS_PORT } \\ inference.GRPCInferenceService.ServerReady Expected Output { \"ready\" : true } You can test the deployed model by sending a sample request with the below payload. Notice that the input format differs from the one in the previous REST endpoint example. Prepare the inference input inside the file named iris-input-v2-grpc.json . { \"model_name\" : \"xgboost-v2-iris-grpc\" , \"inputs\" : [ { \"name\" : \"input-0\" , \"shape\" : [ 2 , 4 ], \"datatype\" : \"FP32\" , \"contents\" : { \"fp32_contents\" : [ 6.8 , 2.8 , 4.8 , 1.4 , 6.0 , 3.4 , 4.5 , 1.6 ] } } ] } ModelInfer API takes input following the ModelInferRequest schema defined in the grpc_predict_v2.proto file. 
grpcurl \\ -vv \\ -plaintext \\ -proto ${ PROTO_FILE } \\ -authority ${ SERVICE_HOSTNAME } \\ -d @ \\ ${ INGRESS_HOST } : ${ INGRESS_PORT } \\ inference.GRPCInferenceService.ModelInfer \\ <<< $( cat \" $INPUT_PATH \" ) Expected Output Resolved method descriptor: // The ModelInfer API performs inference using the specified model. Errors are // indicated by the google.rpc.Status returned for the request. The OK code // indicates success and other codes indicate failure. rpc ModelInfer ( .inference.ModelInferRequest ) returns ( .inference.ModelInferResponse ) ; Request metadata to send: ( empty ) Response headers received: content-type: application/grpc date: Mon, 09 Oct 2023 11 :07:26 GMT grpc-accept-encoding: identity, deflate, gzip server: istio-envoy x-envoy-upstream-service-time: 16 Estimated response size: 83 bytes Response contents: { \"modelName\" : \"xgboost-v2-iris-grpc\" , \"id\" : \"41738561-7219-4e4a-984d-5fe19bed6298\" , \"outputs\" : [ { \"name\" : \"output-0\" , \"datatype\" : \"INT32\" , \"shape\" : [ \"2\" ] , \"contents\" : { \"intContents\" : [ 1 , 1 ] } } ] } Response trailers received: ( empty ) Sent 1 request and received 1 response","title":"Test the deployed model with grpcurl"},{"location":"python_runtime_api/docs/","text":"KServe Python Runtime API \u00b6 KServe's python runtime API implements a standardized python model server API following open inference protocol . It encapsulates data plane API definitions and storage retrieval for models. It provides many functionalities, including among others: Implements the data plane API following open inference protocol. Provide extensible model server and model API. Allow customizing pre-processing, prediction and post-processing handlers. Readiness and liveness Handlers. Installation \u00b6 KServe Python SDK can be installed by pip or poetry . pip install \u00b6 pip install kserve Poetry \u00b6 Checkout KServe GitHub repository and Install via poetry . 
cd kserve/python/kserve poetry install API Reference \u00b6 Please refer to API docs . Storage API \u00b6 The storage API is used by KServe Storage Initializer which supports the following cloud storage providers. The storage package is optional and can be installed via pip install kserve [ storage ] Google Cloud Storage with a prefix: \"gs://\" By default, it uses GOOGLE_APPLICATION_CREDENTIALS environment variable for user authentication. If GOOGLE_APPLICATION_CREDENTIALS is not provided, anonymous client will be used to download the artifacts. S3 Compatible Object Storage with a prefix \"s3://\" For static credentials it uses S3_ENDPOINT , AWS_ACCESS_KEY_ID , and AWS_SECRET_ACCESS_KEY environment variables for authentication. Azure Blob Storage with the format: https://{$STORAGE_ACCOUNT_NAME}.blob.core.windows.net/{$CONTAINER}/{$PATH} By default, it uses anonymous client to download the artifacts. For example, https://kfserving.blob.core.windows.net/triton/simple_string/ Persistent Volume Claim (PVC) with the format pvc://{$pvcname}/[path] . The pvcname is the name of the PVC that contains the model. The [path] is the relative path to the model on the PVC. For example, pvc://mypvcname/model/path/on/pvc Generic URI over either HTTP prefixed with http:// or HTTPS prefixed with https:// . For example: https://<some_url>.com/model.joblib http://<some_url>.com/model.zip","title":"Python Runtime Server SDK"},{"location":"python_runtime_api/docs/#kserve-python-runtime-api","text":"KServe's python runtime API implements a standardized python model server API following open inference protocol . It encapsulates data plane API definitions and storage retrieval for models. It provides many functionalities, including among others: Implements the data plane API following open inference protocol. Provide extensible model server and model API. Allow customizing pre-processing, prediction and post-processing handlers. 
Readiness and liveness Handlers.","title":"KServe Python Runtime API"},{"location":"python_runtime_api/docs/#installation","text":"KServe Python SDK can be installed by pip or poetry .","title":"Installation"},{"location":"python_runtime_api/docs/#pip-install","text":"pip install kserve","title":"pip install"},{"location":"python_runtime_api/docs/#poetry","text":"Checkout KServe GitHub repository and Install via poetry . cd kserve/python/kserve poetry install","title":"Poetry"},{"location":"python_runtime_api/docs/#api-reference","text":"Please refer to API docs .","title":"API Reference"},{"location":"python_runtime_api/docs/#storage-api","text":"The storage API is used by KServe Storage Initializer which supports the following cloud storage providers. The storage package is optional and can be installed via pip install kserve [ storage ] Google Cloud Storage with a prefix: \"gs://\" By default, it uses GOOGLE_APPLICATION_CREDENTIALS environment variable for user authentication. If GOOGLE_APPLICATION_CREDENTIALS is not provided, anonymous client will be used to download the artifacts. S3 Compatible Object Storage with a prefix \"s3://\" For static credentials it uses S3_ENDPOINT , AWS_ACCESS_KEY_ID , and AWS_SECRET_ACCESS_KEY environment variables for authentication. Azure Blob Storage with the format: https://{$STORAGE_ACCOUNT_NAME}.blob.core.windows.net/{$CONTAINER}/{$PATH} By default, it uses anonymous client to download the artifacts. For example, https://kfserving.blob.core.windows.net/triton/simple_string/ Persistent Volume Claim (PVC) with the format pvc://{$pvcname}/[path] . The pvcname is the name of the PVC that contains the model. The [path] is the relative path to the model on the PVC. For example, pvc://mypvcname/model/path/on/pvc Generic URI over either HTTP prefixed with http:// or HTTPS prefixed with https:// . 
For example: https://.com/model.joblib http://.com/model.zip","title":"Storage API"},{"location":"python_runtime_api/docs/api/","text":"KServe Python Serving Runtime API \u00b6 ModelServer \u00b6 Source code in kserve/model_server.pyclass ModelServer : def __init__ ( self , http_port : int = args . http_port , grpc_port : int = args . grpc_port , workers : int = args . workers , max_threads : int = args . max_threads , max_asyncio_workers : int = args . max_asyncio_workers , registered_models : ModelRepository = ModelRepository (), enable_grpc : bool = args . enable_grpc , enable_docs_url : bool = args . enable_docs_url , enable_latency_logging : bool = args . enable_latency_logging , configure_logging : bool = args . configure_logging , log_config : Optional [ Union [ Dict , str ]] = args . log_config_file , access_log_format : str = args . access_log_format , ): \"\"\"KServe ModelServer Constructor Args: http_port: HTTP port. Default: ``8080``. grpc_port: GRPC port. Default: ``8081``. workers: Number of uvicorn workers. Default: ``1``. max_threads: Max number of gRPC processing threads. Default: ``4`` max_asyncio_workers: Max number of AsyncIO threads. Default: ``None`` registered_models: Model repository with registered models. enable_grpc: Whether to turn on grpc server. Default: ``True`` enable_docs_url: Whether to turn on ``/docs`` Swagger UI. Default: ``False``. enable_latency_logging: Whether to log latency metric. Default: ``True``. configure_logging: Whether to configure KServe and Uvicorn logging. Default: ``True``. log_config: File path or dict containing log config. Default: ``None``. access_log_format: Format to set for the access log (provided by asgi-logger). Default: ``None`` \"\"\" self . registered_models = registered_models self . http_port = http_port self . grpc_port = grpc_port self . workers = workers self . max_threads = max_threads self . max_asyncio_workers = max_asyncio_workers self . enable_grpc = enable_grpc self . 
enable_docs_url = enable_docs_url self . enable_latency_logging = enable_latency_logging self . dataplane = DataPlane ( model_registry = registered_models ) self . model_repository_extension = ModelRepositoryExtension ( model_registry = self . registered_models ) self . _grpc_server = None self . _rest_server = None if self . enable_grpc : self . _grpc_server = GRPCServer ( grpc_port , self . dataplane , self . model_repository_extension ) # Logs can be passed as a path to a file or a dictConfig. # We rely on Uvicorn to configure the loggers for us. if configure_logging : self . log_config = ( log_config if log_config is not None else KSERVE_LOG_CONFIG ) else : # By setting log_config to None we tell Uvicorn not to configure logging self . log_config = None self . access_log_format = access_log_format self . _custom_exception_handler = None def start ( self , models : Union [ List [ BaseKServeModel ], Dict [ str , Deployment ]] ) -> None : \"\"\"Start the model server with a set of registered models. Args: models: a list of models to register to the model server. \"\"\" if isinstance ( models , list ): for model in models : if isinstance ( model , BaseKServeModel ): self . register_model ( model ) # pass whether to log request latency into the model model . enable_latency_logging = self . enable_latency_logging else : raise RuntimeError ( \"Model type should be 'BaseKServeModel'\" ) elif isinstance ( models , dict ): if all ([ isinstance ( v , Deployment ) for v in models . values ()]): # TODO: make this port number a variable rayserve . start ( detached = True , http_options = { \"host\" : \"0.0.0.0\" , \"port\" : 9071 } ) for key in models : models [ key ] . deploy () handle = models [ key ] . get_handle () self . register_model_handle ( key , handle ) else : raise RuntimeError ( \"Model type should be RayServe Deployment\" ) else : raise RuntimeError ( \"Unknown model collection types\" ) if self . 
max_asyncio_workers is None : # formula as suggest in https://bugs.python.org/issue35279 self . max_asyncio_workers = min ( 32 , utils . cpu_count () + 4 ) logger . info ( f \"Setting max asyncio worker threads as { self . max_asyncio_workers } \" ) asyncio . get_event_loop () . set_default_executor ( concurrent . futures . ThreadPoolExecutor ( max_workers = self . max_asyncio_workers ) ) async def serve (): logger . info ( f \"Starting uvicorn with { self . workers } workers\" ) loop = asyncio . get_event_loop () if sys . platform not in [ \"win32\" , \"win64\" ]: sig_list = [ signal . SIGINT , signal . SIGTERM , signal . SIGQUIT ] else : sig_list = [ signal . SIGINT , signal . SIGTERM ] for sig in sig_list : loop . add_signal_handler ( sig , lambda s = sig : asyncio . create_task ( self . stop ( sig = s )) ) if self . _custom_exception_handler is None : loop . set_exception_handler ( self . default_exception_handler ) else : loop . set_exception_handler ( self . _custom_exception_handler ) if self . workers == 1 : self . _rest_server = UvicornServer ( self . http_port , [], self . dataplane , self . model_repository_extension , self . enable_docs_url , log_config = self . log_config , access_log_format = self . access_log_format , ) await self . _rest_server . run () else : # Since py38 MacOS/Windows defaults to use spawn for starting multiprocessing. # https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods # Spawn does not work with FastAPI/uvicorn in multiprocessing mode, use fork for multiprocessing # https://github.com/tiangolo/fastapi/issues/1586 serversocket = socket . socket ( socket . AF_INET , socket . SOCK_STREAM ) serversocket . setsockopt ( socket . SOL_SOCKET , socket . SO_REUSEADDR , 1 ) serversocket . bind (( \"0.0.0.0\" , self . http_port )) serversocket . listen ( 5 ) multiprocessing . set_start_method ( \"fork\" ) self . _rest_server = UvicornServer ( self . http_port , [ serversocket ], self . dataplane , self . 
model_repository_extension , self . enable_docs_url , log_config = self . log_config , access_log_format = self . access_log_format , ) for _ in range ( self . workers ): p = Process ( target = self . _rest_server . run_sync ) p . start () async def servers_task (): servers = [ serve ()] if self . enable_grpc : servers . append ( self . _grpc_server . start ( self . max_threads )) await asyncio . gather ( * servers ) asyncio . run ( servers_task ()) async def stop ( self , sig : Optional [ int ] = None ): \"\"\"Stop the instances of REST and gRPC model servers. Args: sig: The signal to stop the server. Default: ``None``. \"\"\" logger . info ( \"Stopping the model server\" ) if self . _rest_server : logger . info ( \"Stopping the rest server\" ) await self . _rest_server . stop () if self . _grpc_server : logger . info ( \"Stopping the grpc server\" ) await self . _grpc_server . stop ( sig ) for model_name in list ( self . registered_models . get_models () . keys ()): self . registered_models . unload ( model_name ) def register_exception_handler ( self , handler : Callable [[ asyncio . events . AbstractEventLoop , Dict [ str , Any ]], None ], ): \"\"\"Add a custom handler as the event loop exception handler. If a handler is not provided, the default exception handler will be set. handler should be a callable object, it should have a signature matching '(loop, context)', where 'loop' will be a reference to the active event loop, 'context' will be a dict object (see `call_exception_handler()` documentation for details about context). \"\"\" self . _custom_exception_handler = handler def default_exception_handler ( self , loop : asyncio . events . AbstractEventLoop , context : Dict [ str , Any ] ): \"\"\"Default exception handler for event loop. This is called when an exception occurs and no exception handler is set. By default, this will shut down the server gracefully. 
This can be called by a custom exception handler that wants to defer to the default handler behavior. \"\"\" # gracefully shutdown the server loop . run_until_complete ( self . stop ()) loop . default_exception_handler ( context ) def register_model_handle ( self , name : str , model_handle : DeploymentHandle ): \"\"\"Register a model handle to the model server. Args: name: The name of the model handle. model_handle: The model handle object. \"\"\" self . registered_models . update_handle ( name , model_handle ) logger . info ( \"Registering model handle: %s \" , name ) def register_model ( self , model : BaseKServeModel ): \"\"\"Register a model to the model server. Args: model: The model object. \"\"\" if not model . name : raise Exception ( \"Failed to register model, model.name must be provided.\" ) self . registered_models . update ( model ) logger . info ( \"Registering model: %s \" , model . name ) __init__ ( http_port = args . http_port , grpc_port = args . grpc_port , workers = args . workers , max_threads = args . max_threads , max_asyncio_workers = args . max_asyncio_workers , registered_models = ModelRepository (), enable_grpc = args . enable_grpc , enable_docs_url = args . enable_docs_url , enable_latency_logging = args . enable_latency_logging , configure_logging = args . configure_logging , log_config = args . log_config_file , access_log_format = args . access_log_format ) \u00b6 KServe ModelServer Constructor Parameters: Name Type Description Default http_port int HTTP port. Default: 8080 . http_port grpc_port int GRPC port. Default: 8081 . grpc_port workers int Number of uvicorn workers. Default: 1 . workers max_threads int Max number of gRPC processing threads. Default: 4 max_threads max_asyncio_workers int Max number of AsyncIO threads. Default: None max_asyncio_workers registered_models ModelRepository Model repository with registered models. ModelRepository () enable_grpc bool Whether to turn on grpc server. 
Default: True enable_grpc enable_docs_url bool Whether to turn on /docs Swagger UI. Default: False . enable_docs_url enable_latency_logging bool Whether to log latency metric. Default: True . enable_latency_logging configure_logging bool Whether to configure KServe and Uvicorn logging. Default: True . configure_logging log_config Optional [ Union [ Dict , str ]] File path or dict containing log config. Default: None . log_config_file access_log_format str Format to set for the access log (provided by asgi-logger). Default: None access_log_format Source code in kserve/model_server.py 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 def __init__ ( self , http_port : int = args . http_port , grpc_port : int = args . grpc_port , workers : int = args . workers , max_threads : int = args . max_threads , max_asyncio_workers : int = args . max_asyncio_workers , registered_models : ModelRepository = ModelRepository (), enable_grpc : bool = args . enable_grpc , enable_docs_url : bool = args . enable_docs_url , enable_latency_logging : bool = args . enable_latency_logging , configure_logging : bool = args . configure_logging , log_config : Optional [ Union [ Dict , str ]] = args . log_config_file , access_log_format : str = args . access_log_format , ): \"\"\"KServe ModelServer Constructor Args: http_port: HTTP port. Default: ``8080``. grpc_port: GRPC port. Default: ``8081``. workers: Number of uvicorn workers. Default: ``1``. max_threads: Max number of gRPC processing threads. Default: ``4`` max_asyncio_workers: Max number of AsyncIO threads. Default: ``None`` registered_models: Model repository with registered models. enable_grpc: Whether to turn on grpc server. Default: ``True`` enable_docs_url: Whether to turn on ``/docs`` Swagger UI. Default: ``False``. 
enable_latency_logging: Whether to log latency metric. Default: ``True``. configure_logging: Whether to configure KServe and Uvicorn logging. Default: ``True``. log_config: File path or dict containing log config. Default: ``None``. access_log_format: Format to set for the access log (provided by asgi-logger). Default: ``None`` \"\"\" self . registered_models = registered_models self . http_port = http_port self . grpc_port = grpc_port self . workers = workers self . max_threads = max_threads self . max_asyncio_workers = max_asyncio_workers self . enable_grpc = enable_grpc self . enable_docs_url = enable_docs_url self . enable_latency_logging = enable_latency_logging self . dataplane = DataPlane ( model_registry = registered_models ) self . model_repository_extension = ModelRepositoryExtension ( model_registry = self . registered_models ) self . _grpc_server = None self . _rest_server = None if self . enable_grpc : self . _grpc_server = GRPCServer ( grpc_port , self . dataplane , self . model_repository_extension ) # Logs can be passed as a path to a file or a dictConfig. # We rely on Uvicorn to configure the loggers for us. if configure_logging : self . log_config = ( log_config if log_config is not None else KSERVE_LOG_CONFIG ) else : # By setting log_config to None we tell Uvicorn not to configure logging self . log_config = None self . access_log_format = access_log_format self . _custom_exception_handler = None default_exception_handler ( loop , context ) \u00b6 Default exception handler for event loop. This is called when an exception occurs and no exception handler is set. By default, this will shut down the server gracefully. This can be called by a custom exception handler that wants to defer to the default handler behavior. Source code in kserve/model_server.py 348 349 350 351 352 353 354 355 356 357 358 359 360 def default_exception_handler ( self , loop : asyncio . events . 
AbstractEventLoop , context : Dict [ str , Any ] ): \"\"\"Default exception handler for event loop. This is called when an exception occurs and no exception handler is set. By default, this will shut down the server gracefully. This can be called by a custom exception handler that wants to defer to the default handler behavior. \"\"\" # gracefully shutdown the server loop . run_until_complete ( self . stop ()) loop . default_exception_handler ( context ) register_exception_handler ( handler ) \u00b6 Add a custom handler as the event loop exception handler. If a handler is not provided, the default exception handler will be set. handler should be a callable object, it should have a signature matching '(loop, context)', where 'loop' will be a reference to the active event loop, 'context' will be a dict object (see call_exception_handler() documentation for details about context). Source code in kserve/model_server.py 334 335 336 337 338 339 340 341 342 343 344 345 346 def register_exception_handler ( self , handler : Callable [[ asyncio . events . AbstractEventLoop , Dict [ str , Any ]], None ], ): \"\"\"Add a custom handler as the event loop exception handler. If a handler is not provided, the default exception handler will be set. handler should be a callable object, it should have a signature matching '(loop, context)', where 'loop' will be a reference to the active event loop, 'context' will be a dict object (see `call_exception_handler()` documentation for details about context). \"\"\" self . _custom_exception_handler = handler register_model ( model ) \u00b6 Register a model to the model server. Parameters: Name Type Description Default model BaseKServeModel The model object. required Source code in kserve/model_server.py 372 373 374 375 376 377 378 379 380 381 def register_model ( self , model : BaseKServeModel ): \"\"\"Register a model to the model server. Args: model: The model object. \"\"\" if not model . 
name : raise Exception ( \"Failed to register model, model.name must be provided.\" ) self . registered_models . update ( model ) logger . info ( \"Registering model: %s \" , model . name ) register_model_handle ( name , model_handle ) \u00b6 Register a model handle to the model server. Parameters: Name Type Description Default name str The name of the model handle. required model_handle DeploymentHandle The model handle object. required Source code in kserve/model_server.py 362 363 364 365 366 367 368 369 370 def register_model_handle ( self , name : str , model_handle : DeploymentHandle ): \"\"\"Register a model handle to the model server. Args: name: The name of the model handle. model_handle: The model handle object. \"\"\" self . registered_models . update_handle ( name , model_handle ) logger . info ( \"Registering model handle: %s \" , name ) start ( models ) \u00b6 Start the model server with a set of registered models. Parameters: Name Type Description Default models Union [ List [ BaseKServeModel ], Dict [ str , Deployment ]] a list of models to register to the model server. required Source code in kserve/model_server.py 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 def start ( self , models : Union [ List [ BaseKServeModel ], Dict [ str , Deployment ]] ) -> None : \"\"\"Start the model server with a set of registered models. Args: models: a list of models to register to the model server. \"\"\" if isinstance ( models , list ): for model in models : if isinstance ( model , BaseKServeModel ): self . register_model ( model ) # pass whether to log request latency into the model model . enable_latency_logging = self . 
enable_latency_logging else : raise RuntimeError ( \"Model type should be 'BaseKServeModel'\" ) elif isinstance ( models , dict ): if all ([ isinstance ( v , Deployment ) for v in models . values ()]): # TODO: make this port number a variable rayserve . start ( detached = True , http_options = { \"host\" : \"0.0.0.0\" , \"port\" : 9071 } ) for key in models : models [ key ] . deploy () handle = models [ key ] . get_handle () self . register_model_handle ( key , handle ) else : raise RuntimeError ( \"Model type should be RayServe Deployment\" ) else : raise RuntimeError ( \"Unknown model collection types\" ) if self . max_asyncio_workers is None : # formula as suggest in https://bugs.python.org/issue35279 self . max_asyncio_workers = min ( 32 , utils . cpu_count () + 4 ) logger . info ( f \"Setting max asyncio worker threads as { self . max_asyncio_workers } \" ) asyncio . get_event_loop () . set_default_executor ( concurrent . futures . ThreadPoolExecutor ( max_workers = self . max_asyncio_workers ) ) async def serve (): logger . info ( f \"Starting uvicorn with { self . workers } workers\" ) loop = asyncio . get_event_loop () if sys . platform not in [ \"win32\" , \"win64\" ]: sig_list = [ signal . SIGINT , signal . SIGTERM , signal . SIGQUIT ] else : sig_list = [ signal . SIGINT , signal . SIGTERM ] for sig in sig_list : loop . add_signal_handler ( sig , lambda s = sig : asyncio . create_task ( self . stop ( sig = s )) ) if self . _custom_exception_handler is None : loop . set_exception_handler ( self . default_exception_handler ) else : loop . set_exception_handler ( self . _custom_exception_handler ) if self . workers == 1 : self . _rest_server = UvicornServer ( self . http_port , [], self . dataplane , self . model_repository_extension , self . enable_docs_url , log_config = self . log_config , access_log_format = self . access_log_format , ) await self . _rest_server . 
run () else : # Since py38 MacOS/Windows defaults to use spawn for starting multiprocessing. # https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods # Spawn does not work with FastAPI/uvicorn in multiprocessing mode, use fork for multiprocessing # https://github.com/tiangolo/fastapi/issues/1586 serversocket = socket . socket ( socket . AF_INET , socket . SOCK_STREAM ) serversocket . setsockopt ( socket . SOL_SOCKET , socket . SO_REUSEADDR , 1 ) serversocket . bind (( \"0.0.0.0\" , self . http_port )) serversocket . listen ( 5 ) multiprocessing . set_start_method ( \"fork\" ) self . _rest_server = UvicornServer ( self . http_port , [ serversocket ], self . dataplane , self . model_repository_extension , self . enable_docs_url , log_config = self . log_config , access_log_format = self . access_log_format , ) for _ in range ( self . workers ): p = Process ( target = self . _rest_server . run_sync ) p . start () async def servers_task (): servers = [ serve ()] if self . enable_grpc : servers . append ( self . _grpc_server . start ( self . max_threads )) await asyncio . gather ( * servers ) asyncio . run ( servers_task ()) stop ( sig = None ) async \u00b6 Stop the instances of REST and gRPC model servers. Parameters: Name Type Description Default sig Optional [ int ] The signal to stop the server. Default: None . None Source code in kserve/model_server.py 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 async def stop ( self , sig : Optional [ int ] = None ): \"\"\"Stop the instances of REST and gRPC model servers. Args: sig: The signal to stop the server. Default: ``None``. \"\"\" logger . info ( \"Stopping the model server\" ) if self . _rest_server : logger . info ( \"Stopping the rest server\" ) await self . _rest_server . stop () if self . _grpc_server : logger . info ( \"Stopping the grpc server\" ) await self . _grpc_server . stop ( sig ) for model_name in list ( self . registered_models . get_models () . keys ()): self . 
registered_models . unload ( model_name ) BaseKServeModel \u00b6 Bases: ABC A base class to inherit all of the kserve models from. This class implements the expectations of model repository and model server. Source code in kserve/model.py 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 class BaseKServeModel ( ABC ): \"\"\" A base class to inherit all of the kserve models from. This class implements the expectations of model repository and model server. \"\"\" def __init__ ( self , name : str ): \"\"\" Adds the required attributes Args: name: The name of the model. \"\"\" self . name = name self . ready = False def stop ( self ): \"\"\"Stop handler can be overridden to perform model teardown\"\"\" pass __init__ ( name ) \u00b6 Adds the required attributes Parameters: Name Type Description Default name str The name of the model. required Source code in kserve/model.py 53 54 55 56 57 58 59 60 61 def __init__ ( self , name : str ): \"\"\" Adds the required attributes Args: name: The name of the model. \"\"\" self . name = name self . ready = False stop () \u00b6 Stop handler can be overridden to perform model teardown Source code in kserve/model.py 63 64 65 def stop ( self ): \"\"\"Stop handler can be overridden to perform model teardown\"\"\" pass Model \u00b6 Bases: BaseKServeModel Source code in kserve/model.pyclass Model ( BaseKServeModel ): def __init__ ( self , name : str , predictor_config : Optional [ PredictorConfig ] = None ): \"\"\"KServe Model Public Interface Model is intended to be subclassed to implement the model handlers. Args: name: The name of the model. predictor_config: The configurations for http call to the predictor. \"\"\" super () . __init__ ( name ) # The predictor config member fields are kept for backwards compatibility as they could be set outside self . protocol = ( predictor_config . predictor_protocol if predictor_config else PredictorProtocol . REST_V1 . value ) self . predictor_host = ( predictor_config . 
predictor_host if predictor_config else None ) # The default timeout matches what is set in generated Istio virtual service resources. # We generally don't want things to time out at the request level here, # timeouts should be handled elsewhere in the system. self . timeout = ( predictor_config . predictor_request_timeout_seconds if predictor_config else 600 ) self . use_ssl = predictor_config . predictor_use_ssl if predictor_config else False self . explainer_host = None self . _http_client_instance = None self . _grpc_client_stub = None self . enable_latency_logging = False async def __call__ ( self , body : Union [ Dict , CloudEvent , InferRequest ], verb : InferenceVerb = InferenceVerb . PREDICT , headers : Dict [ str , str ] = None , ) -> Union [ Dict , InferResponse , List [ str ]]: \"\"\"Method to call predictor or explainer with the given input. Args: body: Request body. verb: The inference verb for predict/generate/explain headers: Request headers. Returns: Response output from preprocess -> predict/generate/explain -> postprocess \"\"\" request_id = headers . get ( \"x-request-id\" , \"N.A.\" ) if headers else \"N.A.\" # latency vars preprocess_ms = 0 explain_ms = 0 predict_ms = 0 postprocess_ms = 0 prom_labels = get_labels ( self . name ) with PRE_HIST_TIME . labels ( ** prom_labels ) . time (): start = time . time () payload = ( await self . preprocess ( body , headers ) if inspect . iscoroutinefunction ( self . preprocess ) else self . preprocess ( body , headers ) ) preprocess_ms = get_latency_ms ( start , time . time ()) payload = self . validate ( payload ) if verb == InferenceVerb . EXPLAIN : with EXPLAIN_HIST_TIME . labels ( ** prom_labels ) . time (): start = time . time () response = ( ( await self . explain ( payload , headers )) if inspect . iscoroutinefunction ( self . explain ) else self . explain ( payload , headers ) ) explain_ms = get_latency_ms ( start , time . time ()) elif verb == InferenceVerb . PREDICT : with PREDICT_HIST_TIME . 
labels ( ** prom_labels ) . time (): start = time . time () response = ( ( await self . predict ( payload , headers )) if inspect . iscoroutinefunction ( self . predict ) else self . predict ( payload , headers ) ) predict_ms = get_latency_ms ( start , time . time ()) else : raise NotImplementedError with POST_HIST_TIME . labels ( ** prom_labels ) . time (): start = time . time () response = ( await self . postprocess ( response , headers ) if inspect . iscoroutinefunction ( self . postprocess ) else self . postprocess ( response , headers ) ) postprocess_ms = get_latency_ms ( start , time . time ()) if self . enable_latency_logging is True : trace_logger . info ( f \"requestId: { request_id } , preprocess_ms: { preprocess_ms } , \" f \"explain_ms: { explain_ms } , predict_ms: { predict_ms } , \" f \"postprocess_ms: { postprocess_ms } \" ) return response @property def _http_client ( self ): if self . _http_client_instance is None : self . _http_client_instance = httpx . AsyncClient () return self . _http_client_instance @property def _grpc_client ( self ): if self . _grpc_client_stub is None : # requires appending the port to the predictor host for gRPC to work if \":\" not in self . predictor_host : port = 443 if self . use_ssl else 80 self . predictor_host = f \" { self . predictor_host } : { port } \" if self . use_ssl : _channel = grpc . aio . secure_channel ( self . predictor_host , grpc . ssl_channel_credentials () ) else : _channel = grpc . aio . insecure_channel ( self . predictor_host ) self . _grpc_client_stub = grpc_predict_v2_pb2_grpc . GRPCInferenceServiceStub ( _channel ) return self . _grpc_client_stub def validate ( self , payload ): if isinstance ( payload , ModelInferRequest ): return payload if isinstance ( payload , InferRequest ): return payload # TODO: validate the request if self.get_input_types() defines the input types. if self . protocol == PredictorProtocol . REST_V2 . 
value : if \"inputs\" in payload and not isinstance ( payload [ \"inputs\" ], list ): raise InvalidInput ( 'Expected \"inputs\" to be a list' ) elif self . protocol == PredictorProtocol . REST_V1 . value : if ( isinstance ( payload , Dict ) and \"instances\" in payload and not isinstance ( payload [ \"instances\" ], list ) ): raise InvalidInput ( 'Expected \"instances\" to be a list' ) return payload def load ( self ) -> bool : \"\"\"Load handler can be overridden to load the model from storage. The `self.ready` should be set to True after the model is loaded. The flag is used for model health check. Returns: bool: True if model is ready, False otherwise \"\"\" self . ready = True return self . ready def get_input_types ( self ) -> List [ Dict ]: # Override this function to return appropriate input format expected by your model. # Refer https://kserve.github.io/website/0.9/modelserving/inference_api/#model-metadata-response-json-object # Eg. # return [{ \"name\": \"\", \"datatype\": \"INT32\", \"shape\": [1,5], }] return [] def get_output_types ( self ) -> List [ Dict ]: # Override this function to return appropriate output format returned by your model. # Refer https://kserve.github.io/website/0.9/modelserving/inference_api/#model-metadata-response-json-object # Eg. # return [{ \"name\": \"\", \"datatype\": \"INT32\", \"shape\": [1,5], }] return [] async def preprocess ( self , payload : Union [ Dict , InferRequest ], headers : Dict [ str , str ] = None ) -> Union [ Dict , InferRequest ]: \"\"\"`preprocess` handler can be overridden for data or feature transformation. The model decodes the request body to `Dict` for v1 endpoints and `InferRequest` for v2 endpoints. Args: payload: Payload of the request. headers: Request headers. Returns: A Dict or InferRequest in KServe Model Transformer mode which is transmitted on the wire to predictor. Tensors in KServe Predictor mode which is passed to predict handler for performing the inference. 
\"\"\" return payload async def postprocess ( self , result : Union [ Dict , InferResponse ], headers : Dict [ str , str ] = None ) -> Union [ Dict , InferResponse ]: \"\"\"The `postprocess` handler can be overridden for inference result or response transformation. The predictor sends back the inference result in `Dict` for v1 endpoints and `InferResponse` for v2 endpoints. Args: result: The inference result passed from `predict` handler or the HTTP response from predictor. headers: Request headers. Returns: A Dict or InferResponse after post-process to return back to the client. \"\"\" return result async def _http_predict ( self , payload : Union [ Dict , InferRequest ], headers : Dict [ str , str ] = None ) -> Dict : protocol = \"https\" if self . use_ssl else \"http\" predict_url = PREDICTOR_URL_FORMAT . format ( protocol , self . predictor_host , self . name ) if self . protocol == PredictorProtocol . REST_V2 . value : predict_url = PREDICTOR_V2_URL_FORMAT . format ( protocol , self . predictor_host , self . name ) # Adjusting headers. Inject content type if not exist. # Also, removing host, as the header is the one passed to transformer and contains transformer's host predict_headers = { \"Content-Type\" : \"application/json\" } if headers is not None : if \"x-request-id\" in headers : predict_headers [ \"x-request-id\" ] = headers [ \"x-request-id\" ] if \"x-b3-traceid\" in headers : predict_headers [ \"x-b3-traceid\" ] = headers [ \"x-b3-traceid\" ] if isinstance ( payload , InferRequest ): payload = payload . to_rest () data = orjson . dumps ( payload ) try : response = await self . _http_client . post ( predict_url , timeout = self . timeout , headers = predict_headers , content = data ) except Exception as exc : request_id = predict_headers . get ( \"x-request-id\" , \"N.A.\" ) logger . error ( f \"Could not send a request to predictor at url { predict_url } \" f \"for { request_id =} \" f \"due to exception { exc } \" ) raise exc if not response . 
is_success : message = ( \" {error_message} , ' {0.status_code} {0.reason_phrase} ' for url ' {0.url} '\" ) error_message = \"\" if ( \"content-type\" in response . headers and response . headers [ \"content-type\" ] == \"application/json\" ): error_message = response . json () if \"error\" in error_message : error_message = error_message [ \"error\" ] message = message . format ( response , error_message = error_message ) raise HTTPStatusError ( message , request = response . request , response = response ) return orjson . loads ( response . content ) async def _grpc_predict ( self , payload : Union [ ModelInferRequest , InferRequest ], headers : Dict [ str , str ] = None , ) -> ModelInferResponse : if isinstance ( payload , InferRequest ): payload = payload . to_grpc () async_result = await self . _grpc_client . ModelInfer ( request = payload , timeout = self . timeout , metadata = ( ( \"request_type\" , \"grpc_v2\" ), ( \"response_type\" , \"grpc_v2\" ), ( \"x-request-id\" , headers . get ( \"x-request-id\" , \"\" )), ), ) return async_result async def predict ( self , payload : Union [ Dict , InferRequest , ModelInferRequest ], headers : Dict [ str , str ] = None , ) -> Union [ Dict , InferResponse , AsyncIterator [ Any ]]: \"\"\"The `predict` handler can be overridden for performing the inference. By default, the predict handler makes call to predictor for the inference step. Args: payload: Model inputs passed from `preprocess` handler. headers: Request headers. Returns: Inference result or a Response from the predictor. Raises: HTTPStatusError when getting back an error response from the predictor. \"\"\" if not self . predictor_host : raise NotImplementedError ( \"Could not find predictor_host.\" ) if self . protocol == PredictorProtocol . GRPC_V2 . value : res = await self . _grpc_predict ( payload , headers ) return InferResponse . from_grpc ( res ) else : res = await self . 
_http_predict ( payload , headers ) # return an InferResponse if this is REST V2, otherwise just return the dictionary return ( InferResponse . from_rest ( self . name , res ) if is_v2 ( PredictorProtocol ( self . protocol )) else res ) async def explain ( self , payload : Dict , headers : Dict [ str , str ] = None ) -> Dict : \"\"\"`explain` handler can be overridden to implement the model explanation. The default implementation makes call to the explainer if ``explainer_host`` is specified. Args: payload: Explainer model inputs passed from preprocess handler. headers: Request headers. Returns: An Explanation for the inference result. Raises: HTTPStatusError when getting back an error response from the explainer. \"\"\" if self . explainer_host is None : raise NotImplementedError ( \"Could not find explainer_host.\" ) protocol = \"https\" if self . use_ssl else \"http\" # Currently explainer only supports the kserve v1 endpoints explain_url = EXPLAINER_URL_FORMAT . format ( protocol , self . explainer_host , self . name ) response = await self . _http_client . post ( url = explain_url , timeout = self . timeout , content = orjson . dumps ( payload ) ) response . raise_for_status () return orjson . loads ( response . content ) __call__ ( body , verb = InferenceVerb . PREDICT , headers = None ) async \u00b6 Method to call predictor or explainer with the given input. Parameters: Name Type Description Default body Union [ Dict , CloudEvent , InferRequest ] Request body. required verb InferenceVerb The inference verb for predict/generate/explain PREDICT headers Dict [ str , str ] Request headers. 
None Returns: Type Description Union [ Dict , InferResponse , List [ str ]] Response output from preprocess -> predict/generate/explain -> postprocess Source code in kserve/model.py 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 async def __call__ ( self , body : Union [ Dict , CloudEvent , InferRequest ], verb : InferenceVerb = InferenceVerb . PREDICT , headers : Dict [ str , str ] = None , ) -> Union [ Dict , InferResponse , List [ str ]]: \"\"\"Method to call predictor or explainer with the given input. Args: body: Request body. verb: The inference verb for predict/generate/explain headers: Request headers. Returns: Response output from preprocess -> predict/generate/explain -> postprocess \"\"\" request_id = headers . get ( \"x-request-id\" , \"N.A.\" ) if headers else \"N.A.\" # latency vars preprocess_ms = 0 explain_ms = 0 predict_ms = 0 postprocess_ms = 0 prom_labels = get_labels ( self . name ) with PRE_HIST_TIME . labels ( ** prom_labels ) . time (): start = time . time () payload = ( await self . preprocess ( body , headers ) if inspect . iscoroutinefunction ( self . preprocess ) else self . preprocess ( body , headers ) ) preprocess_ms = get_latency_ms ( start , time . time ()) payload = self . validate ( payload ) if verb == InferenceVerb . EXPLAIN : with EXPLAIN_HIST_TIME . labels ( ** prom_labels ) . time (): start = time . time () response = ( ( await self . explain ( payload , headers )) if inspect . iscoroutinefunction ( self . explain ) else self . explain ( payload , headers ) ) explain_ms = get_latency_ms ( start , time . time ()) elif verb == InferenceVerb . PREDICT : with PREDICT_HIST_TIME . labels ( ** prom_labels ) . time (): start = time . time () response = ( ( await self . 
predict ( payload , headers )) if inspect . iscoroutinefunction ( self . predict ) else self . predict ( payload , headers ) ) predict_ms = get_latency_ms ( start , time . time ()) else : raise NotImplementedError with POST_HIST_TIME . labels ( ** prom_labels ) . time (): start = time . time () response = ( await self . postprocess ( response , headers ) if inspect . iscoroutinefunction ( self . postprocess ) else self . postprocess ( response , headers ) ) postprocess_ms = get_latency_ms ( start , time . time ()) if self . enable_latency_logging is True : trace_logger . info ( f \"requestId: { request_id } , preprocess_ms: { preprocess_ms } , \" f \"explain_ms: { explain_ms } , predict_ms: { predict_ms } , \" f \"postprocess_ms: { postprocess_ms } \" ) return response __init__ ( name , predictor_config = None ) \u00b6 KServe Model Public Interface Model is intended to be subclassed to implement the model handlers. Parameters: Name Type Description Default name str The name of the model. required predictor_config Optional [ PredictorConfig ] The configurations for http call to the predictor. None Source code in kserve/model.py 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 def __init__ ( self , name : str , predictor_config : Optional [ PredictorConfig ] = None ): \"\"\"KServe Model Public Interface Model is intended to be subclassed to implement the model handlers. Args: name: The name of the model. predictor_config: The configurations for http call to the predictor. \"\"\" super () . __init__ ( name ) # The predictor config member fields are kept for backwards compatibility as they could be set outside self . protocol = ( predictor_config . predictor_protocol if predictor_config else PredictorProtocol . REST_V1 . value ) self . predictor_host = ( predictor_config . 
predictor_host if predictor_config else None ) # The default timeout matches what is set in generated Istio virtual service resources. # We generally don't want things to time out at the request level here, # timeouts should be handled elsewhere in the system. self . timeout = ( predictor_config . predictor_request_timeout_seconds if predictor_config else 600 ) self . use_ssl = predictor_config . predictor_use_ssl if predictor_config else False self . explainer_host = None self . _http_client_instance = None self . _grpc_client_stub = None self . enable_latency_logging = False explain ( payload , headers = None ) async \u00b6 explain handler can be overridden to implement the model explanation. The default implementation makes call to the explainer if explainer_host is specified. Parameters: Name Type Description Default payload Dict Explainer model inputs passed from preprocess handler. required headers Dict [ str , str ] Request headers. None Returns: Type Description Dict An Explanation for the inference result. Source code in kserve/model.py 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 async def explain ( self , payload : Dict , headers : Dict [ str , str ] = None ) -> Dict : \"\"\"`explain` handler can be overridden to implement the model explanation. The default implementation makes call to the explainer if ``explainer_host`` is specified. Args: payload: Explainer model inputs passed from preprocess handler. headers: Request headers. Returns: An Explanation for the inference result. Raises: HTTPStatusError when getting back an error response from the explainer. \"\"\" if self . explainer_host is None : raise NotImplementedError ( \"Could not find explainer_host.\" ) protocol = \"https\" if self . use_ssl else \"http\" # Currently explainer only supports the kserve v1 endpoints explain_url = EXPLAINER_URL_FORMAT . format ( protocol , self . explainer_host , self . name ) response = await self . 
_http_client . post ( url = explain_url , timeout = self . timeout , content = orjson . dumps ( payload ) ) response . raise_for_status () return orjson . loads ( response . content ) load () \u00b6 Load handler can be overridden to load the model from storage. The self.ready should be set to True after the model is loaded. The flag is used for model health check. Returns: Name Type Description bool bool True if model is ready, False otherwise Source code in kserve/model.py 260 261 262 263 264 265 266 267 268 def load ( self ) -> bool : \"\"\"Load handler can be overridden to load the model from storage. The `self.ready` should be set to True after the model is loaded. The flag is used for model health check. Returns: bool: True if model is ready, False otherwise \"\"\" self . ready = True return self . ready postprocess ( result , headers = None ) async \u00b6 The postprocess handler can be overridden for inference result or response transformation. The predictor sends back the inference result in Dict for v1 endpoints and InferResponse for v2 endpoints. Parameters: Name Type Description Default result Union [ Dict , InferResponse ] The inference result passed from predict handler or the HTTP response from predictor. required headers Dict [ str , str ] Request headers. None Returns: Type Description Union [ Dict , InferResponse ] A Dict or InferResponse after post-process to return back to the client. Source code in kserve/model.py 303 304 305 306 307 308 309 310 311 312 313 314 315 316 async def postprocess ( self , result : Union [ Dict , InferResponse ], headers : Dict [ str , str ] = None ) -> Union [ Dict , InferResponse ]: \"\"\"The `postprocess` handler can be overridden for inference result or response transformation. The predictor sends back the inference result in `Dict` for v1 endpoints and `InferResponse` for v2 endpoints. Args: result: The inference result passed from `predict` handler or the HTTP response from predictor. headers: Request headers. 
Returns: A Dict or InferResponse after post-process to return back to the client. \"\"\" return result predict ( payload , headers = None ) async \u00b6 The predict handler can be overridden for performing the inference. By default, the predict handler makes call to predictor for the inference step. Parameters: Name Type Description Default payload Union [ Dict , InferRequest , ModelInferRequest ] Model inputs passed from preprocess handler. required headers Dict [ str , str ] Request headers. None Returns: Type Description Union [ Dict , InferResponse , AsyncIterator [ Any ]] Inference result or a Response from the predictor. Source code in kserve/model.py 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 async def predict ( self , payload : Union [ Dict , InferRequest , ModelInferRequest ], headers : Dict [ str , str ] = None , ) -> Union [ Dict , InferResponse , AsyncIterator [ Any ]]: \"\"\"The `predict` handler can be overridden for performing the inference. By default, the predict handler makes call to predictor for the inference step. Args: payload: Model inputs passed from `preprocess` handler. headers: Request headers. Returns: Inference result or a Response from the predictor. Raises: HTTPStatusError when getting back an error response from the predictor. \"\"\" if not self . predictor_host : raise NotImplementedError ( \"Could not find predictor_host.\" ) if self . protocol == PredictorProtocol . GRPC_V2 . value : res = await self . _grpc_predict ( payload , headers ) return InferResponse . from_grpc ( res ) else : res = await self . _http_predict ( payload , headers ) # return an InferResponse if this is REST V2, otherwise just return the dictionary return ( InferResponse . from_rest ( self . name , res ) if is_v2 ( PredictorProtocol ( self . 
protocol )) else res ) preprocess ( payload , headers = None ) async \u00b6 preprocess handler can be overridden for data or feature transformation. The model decodes the request body to Dict for v1 endpoints and InferRequest for v2 endpoints. Parameters: Name Type Description Default payload Union [ Dict , InferRequest ] Payload of the request. required headers Dict [ str , str ] Request headers. None Returns: Type Description Union [ Dict , InferRequest ] A Dict or InferRequest in KServe Model Transformer mode which is transmitted on the wire to predictor. Union [ Dict , InferRequest ] Tensors in KServe Predictor mode which is passed to predict handler for performing the inference. Source code in kserve/model.py 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 async def preprocess ( self , payload : Union [ Dict , InferRequest ], headers : Dict [ str , str ] = None ) -> Union [ Dict , InferRequest ]: \"\"\"`preprocess` handler can be overridden for data or feature transformation. The model decodes the request body to `Dict` for v1 endpoints and `InferRequest` for v2 endpoints. Args: payload: Payload of the request. headers: Request headers. Returns: A Dict or InferRequest in KServe Model Transformer mode which is transmitted on the wire to predictor. Tensors in KServe Predictor mode which is passed to predict handler for performing the inference. \"\"\" return payload PredictorConfig \u00b6 Source code in kserve/model.py 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 class PredictorConfig : def __init__ ( self , predictor_host : str , predictor_protocol : str = PredictorProtocol . REST_V1 . 
value , predictor_use_ssl : bool = False , predictor_request_timeout_seconds : int = 600 , ): \"\"\"The configuration for the http call to the predictor Args: predictor_host: The host name of the predictor predictor_protocol: The inference protocol used for predictor http call predictor_use_ssl: Enable using ssl for http connection to the predictor predictor_request_timeout_seconds: The request timeout seconds for the predictor http call \"\"\" self . predictor_host = predictor_host self . predictor_protocol = predictor_protocol self . predictor_use_ssl = predictor_use_ssl self . predictor_request_timeout_seconds = predictor_request_timeout_seconds __init__ ( predictor_host , predictor_protocol = PredictorProtocol . REST_V1 . value , predictor_use_ssl = False , predictor_request_timeout_seconds = 600 ) \u00b6 The configuration for the http call to the predictor Parameters: Name Type Description Default predictor_host str The host name of the predictor required predictor_protocol str The inference protocol used for predictor http call REST_V1 .value predictor_use_ssl bool Enable using ssl for http connection to the predictor False predictor_request_timeout_seconds int The request timeout seconds for the predictor http call 600 Source code in kserve/model.py 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 def __init__ ( self , predictor_host : str , predictor_protocol : str = PredictorProtocol . REST_V1 . value , predictor_use_ssl : bool = False , predictor_request_timeout_seconds : int = 600 , ): \"\"\"The configuration for the http call to the predictor Args: predictor_host: The host name of the predictor predictor_protocol: The inference protocol used for predictor http call predictor_use_ssl: Enable using ssl for http connection to the predictor predictor_request_timeout_seconds: The request timeout seconds for the predictor http call \"\"\" self . predictor_host = predictor_host self . predictor_protocol = predictor_protocol self . 
predictor_use_ssl = predictor_use_ssl self . predictor_request_timeout_seconds = predictor_request_timeout_seconds InferInput \u00b6 Source code in kserve/protocol/infer_type.pyclass InferInput : _name : str _shape : List [ int ] _datatype : str _parameters : Dict def __init__ ( self , name : str , shape : List [ int ], datatype : str , data : Union [ List , np . ndarray , InferTensorContents ] = None , parameters : Optional [ Union [ Dict , MessageMap [ str , InferParameter ]]] = None , ): \"\"\"An object of InferInput class is used to describe the input tensor of an inference request. Args: name: The name of the inference input whose data will be described by this object. shape : The shape of the associated inference input. datatype : The data type of the associated inference input. data : The data of the inference input. When data is not set, raw_data is used for gRPC to transmit with numpy array bytes by using `set_data_from_numpy`. parameters : The additional inference parameters. \"\"\" self . _name = name self . _shape = shape self . _datatype = datatype . upper () self . _parameters = parameters self . _data = data self . _raw_data = None @property def name ( self ) -> str : \"\"\"Get the name of inference input associated with this object. Returns: The name of the inference input \"\"\" return self . _name @property def datatype ( self ) -> str : \"\"\"Get the datatype of inference input associated with this object. Returns: The datatype of the inference input. \"\"\" return self . _datatype @property def data ( self ) -> Union [ List , np . ndarray , InferTensorContents ]: \"\"\"Get the data of the inference input associated with this object. Returns: The data of the inference input. \"\"\" return self . _data @property def shape ( self ) -> List [ int ]: \"\"\"Get the shape of inference input associated with this object. Returns: The shape of the inference input. \"\"\" return self . 
_shape @property def parameters ( self ) -> Union [ Dict , MessageMap [ str , InferParameter ], None ]: \"\"\"Get the parameters of the inference input associated with this object. Returns: The additional inference parameters \"\"\" return self . _parameters def set_shape ( self , shape : List [ int ]): \"\"\"Set the shape of inference input. Args: shape : The shape of the associated inference input. \"\"\" self . _shape = shape def as_string ( self ) -> List [ List [ str ]]: if self . datatype == \"BYTES\" : return [ s . decode ( \"utf-8\" ) for li in self . _data for s in li ] else : raise InvalidInput ( f \"invalid datatype { self . datatype } in the input\" ) def as_numpy ( self ) -> np . ndarray : \"\"\"Decode the inference input data as numpy array. Returns: A numpy array of the inference input data \"\"\" dtype = to_np_dtype ( self . datatype ) if dtype is None : raise InvalidInput ( f \"invalid datatype { dtype } in the input\" ) if self . _raw_data is not None : np_array = np . frombuffer ( self . _raw_data , dtype = dtype ) return np_array . reshape ( self . _shape ) else : np_array = np . array ( self . _data , dtype = dtype ) return np_array . reshape ( self . _shape ) def set_data_from_numpy ( self , input_tensor : np . ndarray , binary_data : bool = True ): \"\"\"Set the tensor data from the specified numpy array for input associated with this object. Args: input_tensor : The tensor data in numpy array format. binary_data : Indicates whether to set data for the input in binary format or explicit tensor within JSON. The default value is True, which means the data will be delivered as binary data with gRPC or in the HTTP body after the JSON object for REST. Raises: InferenceError if failed to set data for the tensor. \"\"\" if not isinstance ( input_tensor , ( np . ndarray ,)): raise InferenceError ( \"input_tensor must be a numpy array\" ) dtype = from_np_dtype ( input_tensor . dtype ) if self . 
_datatype != dtype : raise InferenceError ( \"got unexpected datatype {} from numpy array, expected {} \" . format ( dtype , self . _datatype ) ) valid_shape = True if len ( self . _shape ) != len ( input_tensor . shape ): valid_shape = False else : for i in range ( len ( self . _shape )): if self . _shape [ i ] != input_tensor . shape [ i ]: valid_shape = False if not valid_shape : raise InferenceError ( \"got unexpected numpy array shape [ {} ], expected [ {} ]\" . format ( str ( input_tensor . shape )[ 1 : - 1 ], str ( self . _shape )[ 1 : - 1 ] ) ) if not binary_data : if self . _parameters : self . _parameters . pop ( \"binary_data_size\" , None ) self . _raw_data = None if self . _datatype == \"BYTES\" : self . _data = [] try : if input_tensor . size > 0 : for obj in np . nditer ( input_tensor , flags = [ \"refs_ok\" ], order = \"C\" ): # We need to convert the object to string using utf-8, # if we want to use the binary_data=False. JSON requires # the input to be a UTF-8 string. if input_tensor . dtype == np . object_ : if type ( obj . item ()) == bytes : self . _data . append ( str ( obj . item (), encoding = \"utf-8\" )) else : self . _data . append ( str ( obj . item ())) else : self . _data . append ( str ( obj . item (), encoding = \"utf-8\" )) except UnicodeDecodeError : raise InferenceError ( f 'Failed to encode \" { obj . item () } \" using UTF-8. Please use binary_data=True, if' \" you want to pass a byte array.\" ) else : self . _data = [ val . item () for val in input_tensor . flatten ()] else : self . _data = None if self . _datatype == \"BYTES\" : serialized_output = serialize_byte_tensor ( input_tensor ) if serialized_output . size > 0 : self . _raw_data = serialized_output . item () else : self . _raw_data = b \"\" else : self . _raw_data = input_tensor . tobytes () if self . _parameters is None : self . _parameters = { \"binary_data_size\" : len ( self . _raw_data )} else : self . _parameters [ \"binary_data_size\" ] = len ( self . 
_raw_data ) def __eq__ ( self , other ): if not isinstance ( other , InferInput ): return False if self . name != other . name : return False if self . shape != other . shape : return False if self . datatype != other . datatype : return False if self . parameters != other . parameters : return False if self . data != other . data : return False return True data : Union [ List , np . ndarray , InferTensorContents ] property \u00b6 Get the data of the inference input associated with this object. Returns: Type Description Union [ List , ndarray , InferTensorContents ] The data of the inference input. datatype : str property \u00b6 Get the datatype of inference input associated with this object. Returns: Type Description str The datatype of the inference input. name : str property \u00b6 Get the name of inference input associated with this object. Returns: Type Description str The name of the inference input parameters : Union [ Dict , MessageMap [ str , InferParameter ], None ] property \u00b6 Get the parameters of the inference input associated with this object. Returns: Type Description Union [ Dict , MessageMap [ str , InferParameter ], None] The additional inference parameters shape : List [ int ] property \u00b6 Get the shape of inference input associated with this object. Returns: Type Description List [ int ] The shape of the inference input. __init__ ( name , shape , datatype , data = None , parameters = None ) \u00b6 An object of InferInput class is used to describe the input tensor of an inference request. Parameters: Name Type Description Default name str The name of the inference input whose data will be described by this object. required shape The shape of the associated inference input. required datatype The data type of the associated inference input. required data The data of the inference input. When data is not set, raw_data is used for gRPC to transmit with numpy array bytes by using set_data_from_numpy . 
None parameters The additional inference parameters. None Source code in kserve/protocol/infer_type.py 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 def __init__ ( self , name : str , shape : List [ int ], datatype : str , data : Union [ List , np . ndarray , InferTensorContents ] = None , parameters : Optional [ Union [ Dict , MessageMap [ str , InferParameter ]]] = None , ): \"\"\"An object of InferInput class is used to describe the input tensor of an inference request. Args: name: The name of the inference input whose data will be described by this object. shape : The shape of the associated inference input. datatype : The data type of the associated inference input. data : The data of the inference input. When data is not set, raw_data is used for gRPC to transmit with numpy array bytes by using `set_data_from_numpy`. parameters : The additional inference parameters. \"\"\" self . _name = name self . _shape = shape self . _datatype = datatype . upper () self . _parameters = parameters self . _data = data self . _raw_data = None as_numpy () \u00b6 Decode the inference input data as numpy array. Returns: Type Description ndarray A numpy array of the inference input data Source code in kserve/protocol/infer_type.py 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 def as_numpy ( self ) -> np . ndarray : \"\"\"Decode the inference input data as numpy array. Returns: A numpy array of the inference input data \"\"\" dtype = to_np_dtype ( self . datatype ) if dtype is None : raise InvalidInput ( f \"invalid datatype { dtype } in the input\" ) if self . _raw_data is not None : np_array = np . frombuffer ( self . _raw_data , dtype = dtype ) return np_array . reshape ( self . _shape ) else : np_array = np . array ( self . _data , dtype = dtype ) return np_array . reshape ( self . 
_shape ) set_data_from_numpy ( input_tensor , binary_data = True ) \u00b6 Set the tensor data from the specified numpy array for input associated with this object. Parameters: Name Type Description Default input_tensor The tensor data in numpy array format. required binary_data Indicates whether to set data for the input in binary format or explicit tensor within JSON. The default value is True, which means the data will be delivered as binary data with gRPC or in the HTTP body after the JSON object for REST. True Source code in kserve/protocol/infer_type.py 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 def set_data_from_numpy ( self , input_tensor : np . ndarray , binary_data : bool = True ): \"\"\"Set the tensor data from the specified numpy array for input associated with this object. Args: input_tensor : The tensor data in numpy array format. binary_data : Indicates whether to set data for the input in binary format or explicit tensor within JSON. The default value is True, which means the data will be delivered as binary data with gRPC or in the HTTP body after the JSON object for REST. Raises: InferenceError if failed to set data for the tensor. \"\"\" if not isinstance ( input_tensor , ( np . ndarray ,)): raise InferenceError ( \"input_tensor must be a numpy array\" ) dtype = from_np_dtype ( input_tensor . dtype ) if self . _datatype != dtype : raise InferenceError ( \"got unexpected datatype {} from numpy array, expected {} \" . format ( dtype , self . _datatype ) ) valid_shape = True if len ( self . _shape ) != len ( input_tensor . shape ): valid_shape = False else : for i in range ( len ( self . _shape )): if self . _shape [ i ] != input_tensor . 
shape [ i ]: valid_shape = False if not valid_shape : raise InferenceError ( \"got unexpected numpy array shape [ {} ], expected [ {} ]\" . format ( str ( input_tensor . shape )[ 1 : - 1 ], str ( self . _shape )[ 1 : - 1 ] ) ) if not binary_data : if self . _parameters : self . _parameters . pop ( \"binary_data_size\" , None ) self . _raw_data = None if self . _datatype == \"BYTES\" : self . _data = [] try : if input_tensor . size > 0 : for obj in np . nditer ( input_tensor , flags = [ \"refs_ok\" ], order = \"C\" ): # We need to convert the object to string using utf-8, # if we want to use the binary_data=False. JSON requires # the input to be a UTF-8 string. if input_tensor . dtype == np . object_ : if type ( obj . item ()) == bytes : self . _data . append ( str ( obj . item (), encoding = \"utf-8\" )) else : self . _data . append ( str ( obj . item ())) else : self . _data . append ( str ( obj . item (), encoding = \"utf-8\" )) except UnicodeDecodeError : raise InferenceError ( f 'Failed to encode \" { obj . item () } \" using UTF-8. Please use binary_data=True, if' \" you want to pass a byte array.\" ) else : self . _data = [ val . item () for val in input_tensor . flatten ()] else : self . _data = None if self . _datatype == \"BYTES\" : serialized_output = serialize_byte_tensor ( input_tensor ) if serialized_output . size > 0 : self . _raw_data = serialized_output . item () else : self . _raw_data = b \"\" else : self . _raw_data = input_tensor . tobytes () if self . _parameters is None : self . _parameters = { \"binary_data_size\" : len ( self . _raw_data )} else : self . _parameters [ \"binary_data_size\" ] = len ( self . _raw_data ) set_shape ( shape ) \u00b6 Set the shape of inference input. Parameters: Name Type Description Default shape The shape of the associated inference input. required Source code in kserve/protocol/infer_type.py 152 153 154 155 156 157 158 def set_shape ( self , shape : List [ int ]): \"\"\"Set the shape of inference input. 
Args: shape : The shape of the associated inference input. \"\"\" self . _shape = shape InferOutput \u00b6 Source code in kserve/protocol/infer_type.pyclass InferOutput : def __init__ ( self , name : str , shape : List [ int ], datatype : str , data : Union [ List , np . ndarray , InferTensorContents ] = None , parameters : Optional [ Union [ Dict , MessageMap [ str , InferParameter ]]] = None , ): \"\"\"An object of InferOutput class is used to describe the output tensor for an inference response. Args: name : The name of inference output whose data will be described by this object. shape : The shape of the associated inference output. datatype : The data type of the associated inference output. data : The data of the inference output. When data is not set, raw_data is used for gRPC with numpy array bytes by calling set_data_from_numpy. parameters : The additional inference parameters. \"\"\" self . _name = name self . _shape = shape self . _datatype = datatype . upper () self . _parameters = parameters self . _data = data self . _raw_data = None @property def name ( self ) -> str : \"\"\"Get the name of inference output associated with this object. Returns: The name of inference output. \"\"\" return self . _name @property def datatype ( self ) -> str : \"\"\"Get the data type of inference output associated with this object. Returns: The data type of inference output. \"\"\" return self . _datatype @property def data ( self ) -> Union [ List , np . ndarray , InferTensorContents ]: \"\"\"Get the data of inference output associated with this object. Returns: The data of inference output. \"\"\" return self . _data @property def shape ( self ) -> List [ int ]: \"\"\"Get the shape of inference output associated with this object. Returns: The shape of inference output \"\"\" return self . _shape @property def parameters ( self ) -> Union [ Dict , MessageMap [ str , InferParameter ], None ]: \"\"\"Get the parameters of inference output associated with this object. 
Returns: The additional inference parameters associated with the inference output. \"\"\" return self . _parameters @parameters . setter def parameters ( self , params : Union [ Dict , MessageMap [ str , InferParameter ]]): self . _parameters = params def set_shape ( self , shape : List [ int ]): \"\"\"Set the shape of inference output. Args: shape: The shape of the associated inference output. \"\"\" self . _shape = shape def as_numpy ( self ) -> numpy . ndarray : \"\"\"Decode the tensor output data as numpy array. Returns: The numpy array of the associated inference output data. \"\"\" dtype = to_np_dtype ( self . datatype ) if dtype is None : raise InvalidInput ( \"invalid datatype in the input\" ) if self . _raw_data is not None : np_array = np . frombuffer ( self . _raw_data , dtype = dtype ) return np_array . reshape ( self . _shape ) else : np_array = np . array ( self . _data , dtype = dtype ) return np_array . reshape ( self . _shape ) def set_data_from_numpy ( self , output_tensor : np . ndarray , binary_data : bool = True ): \"\"\"Set the tensor data from the specified numpy array for the inference output associated with this object. Args: output_tensor : The tensor data in numpy array format. binary_data : Indicates whether to set data for the input in binary format or explicit tensor within JSON. The default value is True, which means the data will be delivered as binary data with gRPC or in the HTTP body after the JSON object for REST. Raises: InferenceError if failed to set data for the output tensor. \"\"\" if not isinstance ( output_tensor , ( np . ndarray ,)): raise InferenceError ( \"input_tensor must be a numpy array\" ) dtype = from_np_dtype ( output_tensor . dtype ) if self . _datatype != dtype : raise InferenceError ( \"got unexpected datatype {} from numpy array, expected {} \" . format ( dtype , self . _datatype ) ) valid_shape = True if len ( self . _shape ) != len ( output_tensor . 
shape ): valid_shape = False else : for i in range ( len ( self . _shape )): if self . _shape [ i ] != output_tensor . shape [ i ]: valid_shape = False if not valid_shape : raise InferenceError ( \"got unexpected numpy array shape [ {} ], expected [ {} ]\" . format ( str ( output_tensor . shape )[ 1 : - 1 ], str ( self . _shape )[ 1 : - 1 ] ) ) if not binary_data : if self . _parameters : self . _parameters . pop ( \"binary_data_size\" , None ) self . _raw_data = None if self . _datatype == \"BYTES\" : self . _data = [] try : if output_tensor . size > 0 : for obj in np . nditer ( output_tensor , flags = [ \"refs_ok\" ], order = \"C\" ): # We need to convert the object to string using utf-8, # if we want to use the binary_data=False. JSON requires # the input to be a UTF-8 string. if output_tensor . dtype == np . object_ : if type ( obj . item ()) == bytes : self . _data . append ( str ( obj . item (), encoding = \"utf-8\" )) else : self . _data . append ( str ( obj . item ())) else : self . _data . append ( str ( obj . item (), encoding = \"utf-8\" )) except UnicodeDecodeError : raise InferenceError ( f 'Failed to encode \" { obj . item () } \" using UTF-8. Please use binary_data=True, if' \" you want to pass a byte array.\" ) else : self . _data = [ val . item () for val in output_tensor . flatten ()] else : self . _data = None if self . _datatype == \"BYTES\" : serialized_output = serialize_byte_tensor ( output_tensor ) if serialized_output . size > 0 : self . _raw_data = serialized_output . item () else : self . _raw_data = b \"\" else : self . _raw_data = output_tensor . tobytes () if self . _parameters is None : self . _parameters = { \"binary_data_size\" : len ( self . _raw_data )} else : self . _parameters [ \"binary_data_size\" ] = len ( self . _raw_data ) def __eq__ ( self , other ): if not isinstance ( other , InferOutput ): return False if self . name != other . name : return False if self . shape != other . shape : return False if self . 
datatype != other . datatype : return False if self . parameters != other . parameters : return False if self . data != other . data : return False return True data : Union [ List , np . ndarray , InferTensorContents ] property \u00b6 Get the data of inference output associated with this object. Returns: Type Description Union [ List , ndarray , InferTensorContents ] The data of inference output. datatype : str property \u00b6 Get the data type of inference output associated with this object. Returns: Type Description str The data type of inference output. name : str property \u00b6 Get the name of inference output associated with this object. Returns: Type Description str The name of inference output. parameters : Union [ Dict , MessageMap [ str , InferParameter ], None ] property writable \u00b6 Get the parameters of inference output associated with this object. Returns: Type Description Union [ Dict , MessageMap [ str , InferParameter ], None] The additional inference parameters associated with the inference output. shape : List [ int ] property \u00b6 Get the shape of inference output associated with this object. Returns: Type Description List [ int ] The shape of inference output __init__ ( name , shape , datatype , data = None , parameters = None ) \u00b6 An object of InferOutput class is used to describe the output tensor for an inference response. Parameters: Name Type Description Default name The name of inference output whose data will be described by this object. required shape The shape of the associated inference output. required datatype The data type of the associated inference output. required data The data of the inference output. When data is not set, raw_data is used for gRPC with numpy array bytes by calling set_data_from_numpy. None parameters The additional inference parameters. 
None Source code in kserve/protocol/infer_type.py 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 def __init__ ( self , name : str , shape : List [ int ], datatype : str , data : Union [ List , np . ndarray , InferTensorContents ] = None , parameters : Optional [ Union [ Dict , MessageMap [ str , InferParameter ]]] = None , ): \"\"\"An object of InferOutput class is used to describe the output tensor for an inference response. Args: name : The name of inference output whose data will be described by this object. shape : The shape of the associated inference output. datatype : The data type of the associated inference output. data : The data of the inference output. When data is not set, raw_data is used for gRPC with numpy array bytes by calling set_data_from_numpy. parameters : The additional inference parameters. \"\"\" self . _name = name self . _shape = shape self . _datatype = datatype . upper () self . _parameters = parameters self . _data = data self . _raw_data = None as_numpy () \u00b6 Decode the tensor output data as numpy array. Returns: Type Description ndarray The numpy array of the associated inference output data. Source code in kserve/protocol/infer_type.py 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 def as_numpy ( self ) -> numpy . ndarray : \"\"\"Decode the tensor output data as numpy array. Returns: The numpy array of the associated inference output data. \"\"\" dtype = to_np_dtype ( self . datatype ) if dtype is None : raise InvalidInput ( \"invalid datatype in the input\" ) if self . _raw_data is not None : np_array = np . frombuffer ( self . _raw_data , dtype = dtype ) return np_array . reshape ( self . _shape ) else : np_array = np . array ( self . _data , dtype = dtype ) return np_array . reshape ( self . 
_shape ) set_data_from_numpy ( output_tensor , binary_data = True ) \u00b6 Set the tensor data from the specified numpy array for the inference output associated with this object. Parameters: Name Type Description Default output_tensor The tensor data in numpy array format. required binary_data Indicates whether to set data for the input in binary format or explicit tensor within JSON. The default value is True, which means the data will be delivered as binary data with gRPC or in the HTTP body after the JSON object for REST. True Source code in kserve/protocol/infer_type.py 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 def set_data_from_numpy ( self , output_tensor : np . ndarray , binary_data : bool = True ): \"\"\"Set the tensor data from the specified numpy array for the inference output associated with this object. Args: output_tensor : The tensor data in numpy array format. binary_data : Indicates whether to set data for the input in binary format or explicit tensor within JSON. The default value is True, which means the data will be delivered as binary data with gRPC or in the HTTP body after the JSON object for REST. Raises: InferenceError if failed to set data for the output tensor. \"\"\" if not isinstance ( output_tensor , ( np . ndarray ,)): raise InferenceError ( \"input_tensor must be a numpy array\" ) dtype = from_np_dtype ( output_tensor . dtype ) if self . _datatype != dtype : raise InferenceError ( \"got unexpected datatype {} from numpy array, expected {} \" . format ( dtype , self . _datatype ) ) valid_shape = True if len ( self . _shape ) != len ( output_tensor . shape ): valid_shape = False else : for i in range ( len ( self . _shape )): if self . _shape [ i ] != output_tensor . 
shape [ i ]: valid_shape = False if not valid_shape : raise InferenceError ( \"got unexpected numpy array shape [ {} ], expected [ {} ]\" . format ( str ( output_tensor . shape )[ 1 : - 1 ], str ( self . _shape )[ 1 : - 1 ] ) ) if not binary_data : if self . _parameters : self . _parameters . pop ( \"binary_data_size\" , None ) self . _raw_data = None if self . _datatype == \"BYTES\" : self . _data = [] try : if output_tensor . size > 0 : for obj in np . nditer ( output_tensor , flags = [ \"refs_ok\" ], order = \"C\" ): # We need to convert the object to string using utf-8, # if we want to use the binary_data=False. JSON requires # the input to be a UTF-8 string. if output_tensor . dtype == np . object_ : if type ( obj . item ()) == bytes : self . _data . append ( str ( obj . item (), encoding = \"utf-8\" )) else : self . _data . append ( str ( obj . item ())) else : self . _data . append ( str ( obj . item (), encoding = \"utf-8\" )) except UnicodeDecodeError : raise InferenceError ( f 'Failed to encode \" { obj . item () } \" using UTF-8. Please use binary_data=True, if' \" you want to pass a byte array.\" ) else : self . _data = [ val . item () for val in output_tensor . flatten ()] else : self . _data = None if self . _datatype == \"BYTES\" : serialized_output = serialize_byte_tensor ( output_tensor ) if serialized_output . size > 0 : self . _raw_data = serialized_output . item () else : self . _raw_data = b \"\" else : self . _raw_data = output_tensor . tobytes () if self . _parameters is None : self . _parameters = { \"binary_data_size\" : len ( self . _raw_data )} else : self . _parameters [ \"binary_data_size\" ] = len ( self . _raw_data ) set_shape ( shape ) \u00b6 Set the shape of inference output. Parameters: Name Type Description Default shape List [ int ] The shape of the associated inference output. 
required Source code in kserve/protocol/infer_type.py 547 548 549 550 551 552 553 def set_shape ( self , shape : List [ int ]): \"\"\"Set the shape of inference output. Args: shape: The shape of the associated inference output. \"\"\" self . _shape = shape InferRequest \u00b6 Source code in kserve/protocol/infer_type.pyclass InferRequest : id : Optional [ str ] model_name : str parameters : Optional [ Dict ] inputs : List [ InferInput ] from_grpc : bool def __init__ ( self , model_name : str , infer_inputs : List [ InferInput ], request_id : Optional [ str ] = None , raw_inputs = None , from_grpc : Optional [ bool ] = False , parameters : Optional [ Union [ Dict , MessageMap [ str , InferParameter ]]] = None , ): \"\"\"InferRequest Data Model. Args: model_name: The model name. infer_inputs: The inference inputs for the model. request_id: The id for the inference request. raw_inputs: The binary data for the inference inputs. from_grpc: Indicate if the data model is constructed from gRPC request. parameters: The additional inference parameters. \"\"\" self . id = request_id self . model_name = model_name self . inputs = infer_inputs self . parameters = parameters self . from_grpc = from_grpc if raw_inputs : for i , raw_input in enumerate ( raw_inputs ): self . inputs [ i ] . _raw_data = raw_input @classmethod def from_grpc ( cls , request : ModelInferRequest ): \"\"\"The class method to construct the InferRequest from a ModelInferRequest\"\"\" infer_inputs = [ InferInput ( name = input_tensor . name , shape = list ( input_tensor . shape ), datatype = input_tensor . datatype , data = get_content ( input_tensor . datatype , input_tensor . contents ), parameters = input_tensor . parameters , ) for input_tensor in request . inputs ] return cls ( request_id = request . id , model_name = request . model_name , infer_inputs = infer_inputs , raw_inputs = request . raw_input_contents , from_grpc = True , parameters = request . 
parameters , ) def to_rest ( self ) -> Dict : \"\"\"Converts the InferRequest object to v2 REST InferRequest Dict. Returns: The InferRequest Dict converted from InferRequest object. \"\"\" infer_inputs = [] for infer_input in self . inputs : datatype = infer_input . datatype if isinstance ( infer_input . datatype , numpy . dtype ): datatype = from_np_dtype ( infer_input . datatype ) infer_input_dict = { \"name\" : infer_input . name , \"shape\" : infer_input . shape , \"datatype\" : datatype , } if infer_input . parameters : infer_input_dict [ \"parameters\" ] = to_http_parameters ( infer_input . parameters ) if isinstance ( infer_input . data , numpy . ndarray ): infer_input . set_data_from_numpy ( infer_input . data , binary_data = False ) infer_input_dict [ \"data\" ] = infer_input . data else : infer_input_dict [ \"data\" ] = infer_input . data infer_inputs . append ( infer_input_dict ) infer_request = { \"id\" : self . id if self . id else str ( uuid . uuid4 ()), \"inputs\" : infer_inputs , } if self . parameters : infer_request [ \"parameters\" ] = to_http_parameters ( self . parameters ) return infer_request def to_grpc ( self ) -> ModelInferRequest : \"\"\"Converts the InferRequest object to gRPC ModelInferRequest type. Returns: The ModelInferResponse gRPC type converted from InferRequest object. \"\"\" infer_inputs = [] raw_input_contents = [] for infer_input in self . inputs : if isinstance ( infer_input . data , numpy . ndarray ): infer_input . set_data_from_numpy ( infer_input . data , binary_data = True ) infer_input_dict = { \"name\" : infer_input . name , \"shape\" : infer_input . shape , \"datatype\" : infer_input . datatype , } if infer_input . parameters : infer_input_dict [ \"parameters\" ] = to_grpc_parameters ( infer_input . parameters ) if infer_input . _raw_data is not None : raw_input_contents . append ( infer_input . _raw_data ) else : if not isinstance ( infer_input . 
data , List ): raise InvalidInput ( \"input data is not a List\" ) infer_input_dict [ \"contents\" ] = {} data_key = GRPC_CONTENT_DATATYPE_MAPPINGS . get ( infer_input . datatype , None ) if data_key is not None : infer_input . _data = [ bytes ( val , \"utf-8\" ) if isinstance ( val , str ) else val for val in infer_input . data ] # str to byte conversion for grpc proto infer_input_dict [ \"contents\" ][ data_key ] = infer_input . data else : raise InvalidInput ( \"invalid input datatype\" ) infer_inputs . append ( infer_input_dict ) return ModelInferRequest ( id = self . id , model_name = self . model_name , inputs = infer_inputs , raw_input_contents = raw_input_contents , parameters = to_grpc_parameters ( self . parameters ) if self . parameters else None , ) def as_dataframe ( self ) -> pd . DataFrame : \"\"\"Decode the tensor inputs as pandas dataframe. Returns: The inference input data as pandas dataframe \"\"\" dfs = [] for input in self . inputs : input_data = input . data if input . datatype == \"BYTES\" : input_data = [ str ( val , \"utf-8\" ) if isinstance ( val , bytes ) else val for val in input . data ] dfs . append ( pd . DataFrame ( input_data , columns = [ input . name ])) return pd . concat ( dfs , axis = 1 ) def __eq__ ( self , other ): if not isinstance ( other , InferRequest ): return False if self . model_name != other . model_name : return False if self . id != other . id : return False if self . from_grpc != other . from_grpc : return False if self . parameters != other . parameters : return False if self . inputs != other . inputs : return False return True __init__ ( model_name , infer_inputs , request_id = None , raw_inputs = None , from_grpc = False , parameters = None ) \u00b6 InferRequest Data Model. Parameters: Name Type Description Default model_name str The model name. required infer_inputs List [ InferInput ] The inference inputs for the model. required request_id Optional [ str ] The id for the inference request. 
None raw_inputs The binary data for the inference inputs. None from_grpc Optional [ bool ] Indicate if the data model is constructed from gRPC request. False parameters Optional [ Union [ Dict , MessageMap [ str , InferParameter ]]] The additional inference parameters. None Source code in kserve/protocol/infer_type.py 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 def __init__ ( self , model_name : str , infer_inputs : List [ InferInput ], request_id : Optional [ str ] = None , raw_inputs = None , from_grpc : Optional [ bool ] = False , parameters : Optional [ Union [ Dict , MessageMap [ str , InferParameter ]]] = None , ): \"\"\"InferRequest Data Model. Args: model_name: The model name. infer_inputs: The inference inputs for the model. request_id: The id for the inference request. raw_inputs: The binary data for the inference inputs. from_grpc: Indicate if the data model is constructed from gRPC request. parameters: The additional inference parameters. \"\"\" self . id = request_id self . model_name = model_name self . inputs = infer_inputs self . parameters = parameters self . from_grpc = from_grpc if raw_inputs : for i , raw_input in enumerate ( raw_inputs ): self . inputs [ i ] . _raw_data = raw_input as_dataframe () \u00b6 Decode the tensor inputs as pandas dataframe. Returns: Type Description DataFrame The inference input data as pandas dataframe Source code in kserve/protocol/infer_type.py 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 def as_dataframe ( self ) -> pd . DataFrame : \"\"\"Decode the tensor inputs as pandas dataframe. Returns: The inference input data as pandas dataframe \"\"\" dfs = [] for input in self . inputs : input_data = input . data if input . datatype == \"BYTES\" : input_data = [ str ( val , \"utf-8\" ) if isinstance ( val , bytes ) else val for val in input . data ] dfs . append ( pd . DataFrame ( input_data , columns = [ input . name ])) return pd . 
concat ( dfs , axis = 1 ) from_grpc ( request ) classmethod \u00b6 The class method to construct the InferRequest from a ModelInferRequest Source code in kserve/protocol/infer_type.py 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 @classmethod def from_grpc ( cls , request : ModelInferRequest ): \"\"\"The class method to construct the InferRequest from a ModelInferRequest\"\"\" infer_inputs = [ InferInput ( name = input_tensor . name , shape = list ( input_tensor . shape ), datatype = input_tensor . datatype , data = get_content ( input_tensor . datatype , input_tensor . contents ), parameters = input_tensor . parameters , ) for input_tensor in request . inputs ] return cls ( request_id = request . id , model_name = request . model_name , infer_inputs = infer_inputs , raw_inputs = request . raw_input_contents , from_grpc = True , parameters = request . parameters , ) to_grpc () \u00b6 Converts the InferRequest object to gRPC ModelInferRequest type. Returns: Type Description ModelInferRequest The ModelInferResponse gRPC type converted from InferRequest object. Source code in kserve/protocol/infer_type.py 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 def to_grpc ( self ) -> ModelInferRequest : \"\"\"Converts the InferRequest object to gRPC ModelInferRequest type. Returns: The ModelInferResponse gRPC type converted from InferRequest object. \"\"\" infer_inputs = [] raw_input_contents = [] for infer_input in self . inputs : if isinstance ( infer_input . data , numpy . ndarray ): infer_input . set_data_from_numpy ( infer_input . data , binary_data = True ) infer_input_dict = { \"name\" : infer_input . name , \"shape\" : infer_input . shape , \"datatype\" : infer_input . datatype , } if infer_input . parameters : infer_input_dict [ \"parameters\" ] = to_grpc_parameters ( infer_input . 
parameters ) if infer_input . _raw_data is not None : raw_input_contents . append ( infer_input . _raw_data ) else : if not isinstance ( infer_input . data , List ): raise InvalidInput ( \"input data is not a List\" ) infer_input_dict [ \"contents\" ] = {} data_key = GRPC_CONTENT_DATATYPE_MAPPINGS . get ( infer_input . datatype , None ) if data_key is not None : infer_input . _data = [ bytes ( val , \"utf-8\" ) if isinstance ( val , str ) else val for val in infer_input . data ] # str to byte conversion for grpc proto infer_input_dict [ \"contents\" ][ data_key ] = infer_input . data else : raise InvalidInput ( \"invalid input datatype\" ) infer_inputs . append ( infer_input_dict ) return ModelInferRequest ( id = self . id , model_name = self . model_name , inputs = infer_inputs , raw_input_contents = raw_input_contents , parameters = to_grpc_parameters ( self . parameters ) if self . parameters else None , ) to_rest () \u00b6 Converts the InferRequest object to v2 REST InferRequest Dict. Returns: Type Description Dict The InferRequest Dict converted from InferRequest object. Source code in kserve/protocol/infer_type.py 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 def to_rest ( self ) -> Dict : \"\"\"Converts the InferRequest object to v2 REST InferRequest Dict. Returns: The InferRequest Dict converted from InferRequest object. \"\"\" infer_inputs = [] for infer_input in self . inputs : datatype = infer_input . datatype if isinstance ( infer_input . datatype , numpy . dtype ): datatype = from_np_dtype ( infer_input . datatype ) infer_input_dict = { \"name\" : infer_input . name , \"shape\" : infer_input . shape , \"datatype\" : datatype , } if infer_input . parameters : infer_input_dict [ \"parameters\" ] = to_http_parameters ( infer_input . parameters ) if isinstance ( infer_input . data , numpy . ndarray ): infer_input . set_data_from_numpy ( infer_input . 
data , binary_data = False ) infer_input_dict [ \"data\" ] = infer_input . data else : infer_input_dict [ \"data\" ] = infer_input . data infer_inputs . append ( infer_input_dict ) infer_request = { \"id\" : self . id if self . id else str ( uuid . uuid4 ()), \"inputs\" : infer_inputs , } if self . parameters : infer_request [ \"parameters\" ] = to_http_parameters ( self . parameters ) return infer_request InferResponse \u00b6 Source code in kserve/protocol/infer_type.pyclass InferResponse : id : str model_name : str model_version : Optional [ str ] parameters : Optional [ Dict ] outputs : List [ InferOutput ] from_grpc : bool def __init__ ( self , response_id : str , model_name : str , infer_outputs : List [ InferOutput ], model_version : Optional [ str ] = None , raw_outputs = None , from_grpc : Optional [ bool ] = False , parameters : Optional [ Union [ Dict , MessageMap [ str , InferParameter ]]] = None , ): \"\"\"The InferResponse Data Model Args: response_id: The id of the inference response. model_name: The name of the model. infer_outputs: The inference outputs of the inference response. model_version: The version of the model. raw_outputs: The raw binary data of the inference outputs. from_grpc: Indicate if the InferResponse is constructed from a gRPC response. parameters: The additional inference parameters. \"\"\" self . id = response_id self . model_name = model_name self . model_version = model_version self . outputs = infer_outputs self . parameters = parameters self . from_grpc = from_grpc if raw_outputs : for i , raw_output in enumerate ( raw_outputs ): self . outputs [ i ] . _raw_data = raw_output @classmethod def from_grpc ( cls , response : ModelInferResponse ) -> \"InferResponse\" : \"\"\"The class method to construct the InferResponse object from gRPC message type.\"\"\" infer_outputs = [ InferOutput ( name = output . name , shape = list ( output . shape ), datatype = output . datatype , data = get_content ( output . datatype , output . 
contents ), parameters = output . parameters , ) for output in response . outputs ] return cls ( model_name = response . model_name , model_version = response . model_version , response_id = response . id , parameters = response . parameters , infer_outputs = infer_outputs , raw_outputs = response . raw_output_contents , from_grpc = True , ) @classmethod def from_rest ( cls , model_name : str , response : Dict ) -> \"InferResponse\" : \"\"\"The class method to construct the InferResponse object from REST message type.\"\"\" infer_outputs = [ InferOutput ( name = output [ \"name\" ], shape = list ( output [ \"shape\" ]), datatype = output [ \"datatype\" ], data = output [ \"data\" ], parameters = output . get ( \"parameters\" , None ), ) for output in response [ \"outputs\" ] ] return cls ( model_name = model_name , model_version = response . get ( \"model_version\" , None ), response_id = response . get ( \"id\" , None ), parameters = response . get ( \"parameters\" , None ), infer_outputs = infer_outputs , ) def to_rest ( self ) -> Dict : \"\"\"Converts the InferResponse object to v2 REST InferResponse dict. Returns: The InferResponse Dict. \"\"\" infer_outputs = [] for i , infer_output in enumerate ( self . outputs ): infer_output_dict = { \"name\" : infer_output . name , \"shape\" : infer_output . shape , \"datatype\" : infer_output . datatype , } if infer_output . parameters : infer_output_dict [ \"parameters\" ] = to_http_parameters ( infer_output . parameters ) if isinstance ( infer_output . data , numpy . ndarray ): infer_output . set_data_from_numpy ( infer_output . data , binary_data = False ) infer_output_dict [ \"data\" ] = infer_output . data elif isinstance ( infer_output . _raw_data , bytes ): infer_output_dict [ \"data\" ] = infer_output . as_numpy () . tolist () else : infer_output_dict [ \"data\" ] = infer_output . data infer_outputs . append ( infer_output_dict ) res = { \"id\" : self . id , \"model_name\" : self . 
model_name , \"model_version\" : self . model_version , \"outputs\" : infer_outputs , } if self . parameters : res [ \"parameters\" ] = to_http_parameters ( self . parameters ) return res def to_grpc ( self ) -> ModelInferResponse : \"\"\"Converts the InferResponse object to gRPC ModelInferResponse type. Returns: The ModelInferResponse gRPC message. \"\"\" infer_outputs = [] raw_output_contents = [] for infer_output in self . outputs : if isinstance ( infer_output . data , numpy . ndarray ): infer_output . set_data_from_numpy ( infer_output . data , binary_data = True ) infer_output_dict = { \"name\" : infer_output . name , \"shape\" : infer_output . shape , \"datatype\" : infer_output . datatype , } if infer_output . parameters : infer_output_dict [ \"parameters\" ] = to_grpc_parameters ( infer_output . parameters ) if infer_output . _raw_data is not None : raw_output_contents . append ( infer_output . _raw_data ) else : if not isinstance ( infer_output . data , List ): raise InvalidInput ( \"output data is not a List\" ) infer_output_dict [ \"contents\" ] = {} data_key = GRPC_CONTENT_DATATYPE_MAPPINGS . get ( infer_output . datatype , None ) if data_key is not None : infer_output . _data = [ bytes ( val , \"utf-8\" ) if isinstance ( val , str ) else val for val in infer_output . data ] # str to byte conversion for grpc proto infer_output_dict [ \"contents\" ][ data_key ] = infer_output . data else : raise InvalidInput ( \"to_grpc: invalid output datatype\" ) infer_outputs . append ( infer_output_dict ) return ModelInferResponse ( id = self . id , model_name = self . model_name , model_version = self . model_version , outputs = infer_outputs , raw_output_contents = raw_output_contents , parameters = to_grpc_parameters ( self . parameters ) if self . parameters else None , ) def __eq__ ( self , other ): if not isinstance ( other , InferResponse ): return False if self . model_name != other . model_name : return False if self . model_version != other . 
model_version : return False if self . id != other . id : return False if self . from_grpc != other . from_grpc : return False if self . parameters != other . parameters : return False if self . outputs != other . outputs : return False return True __init__ ( response_id , model_name , infer_outputs , model_version = None , raw_outputs = None , from_grpc = False , parameters = None ) \u00b6 The InferResponse Data Model Parameters: Name Type Description Default response_id str The id of the inference response. required model_name str The name of the model. required infer_outputs List [ InferOutput ] The inference outputs of the inference response. required model_version Optional [ str ] The version of the model. None raw_outputs The raw binary data of the inference outputs. None from_grpc Optional [ bool ] Indicate if the InferResponse is constructed from a gRPC response. False parameters Optional [ Union [ Dict , MessageMap [ str , InferParameter ]]] The additional inference parameters. None Source code in kserve/protocol/infer_type.py 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 def __init__ ( self , response_id : str , model_name : str , infer_outputs : List [ InferOutput ], model_version : Optional [ str ] = None , raw_outputs = None , from_grpc : Optional [ bool ] = False , parameters : Optional [ Union [ Dict , MessageMap [ str , InferParameter ]]] = None , ): \"\"\"The InferResponse Data Model Args: response_id: The id of the inference response. model_name: The name of the model. infer_outputs: The inference outputs of the inference response. model_version: The version of the model. raw_outputs: The raw binary data of the inference outputs. from_grpc: Indicate if the InferResponse is constructed from a gRPC response. parameters: The additional inference parameters. \"\"\" self . id = response_id self . model_name = model_name self . model_version = model_version self . 
outputs = infer_outputs self . parameters = parameters self . from_grpc = from_grpc if raw_outputs : for i , raw_output in enumerate ( raw_outputs ): self . outputs [ i ] . _raw_data = raw_output from_grpc ( response ) classmethod \u00b6 The class method to construct the InferResponse object from gRPC message type. Source code in kserve/protocol/infer_type.py 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 @classmethod def from_grpc ( cls , response : ModelInferResponse ) -> \"InferResponse\" : \"\"\"The class method to construct the InferResponse object from gRPC message type.\"\"\" infer_outputs = [ InferOutput ( name = output . name , shape = list ( output . shape ), datatype = output . datatype , data = get_content ( output . datatype , output . contents ), parameters = output . parameters , ) for output in response . outputs ] return cls ( model_name = response . model_name , model_version = response . model_version , response_id = response . id , parameters = response . parameters , infer_outputs = infer_outputs , raw_outputs = response . raw_output_contents , from_grpc = True , ) from_rest ( model_name , response ) classmethod \u00b6 The class method to construct the InferResponse object from REST message type. Source code in kserve/protocol/infer_type.py 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 @classmethod def from_rest ( cls , model_name : str , response : Dict ) -> \"InferResponse\" : \"\"\"The class method to construct the InferResponse object from REST message type.\"\"\" infer_outputs = [ InferOutput ( name = output [ \"name\" ], shape = list ( output [ \"shape\" ]), datatype = output [ \"datatype\" ], data = output [ \"data\" ], parameters = output . get ( \"parameters\" , None ), ) for output in response [ \"outputs\" ] ] return cls ( model_name = model_name , model_version = response . get ( \"model_version\" , None ), response_id = response . 
get ( \"id\" , None ), parameters = response . get ( \"parameters\" , None ), infer_outputs = infer_outputs , ) to_grpc () \u00b6 Converts the InferResponse object to gRPC ModelInferResponse type. Returns: Type Description ModelInferResponse The ModelInferResponse gRPC message. Source code in kserve/protocol/infer_type.py 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 def to_grpc ( self ) -> ModelInferResponse : \"\"\"Converts the InferResponse object to gRPC ModelInferResponse type. Returns: The ModelInferResponse gRPC message. \"\"\" infer_outputs = [] raw_output_contents = [] for infer_output in self . outputs : if isinstance ( infer_output . data , numpy . ndarray ): infer_output . set_data_from_numpy ( infer_output . data , binary_data = True ) infer_output_dict = { \"name\" : infer_output . name , \"shape\" : infer_output . shape , \"datatype\" : infer_output . datatype , } if infer_output . parameters : infer_output_dict [ \"parameters\" ] = to_grpc_parameters ( infer_output . parameters ) if infer_output . _raw_data is not None : raw_output_contents . append ( infer_output . _raw_data ) else : if not isinstance ( infer_output . data , List ): raise InvalidInput ( \"output data is not a List\" ) infer_output_dict [ \"contents\" ] = {} data_key = GRPC_CONTENT_DATATYPE_MAPPINGS . get ( infer_output . datatype , None ) if data_key is not None : infer_output . _data = [ bytes ( val , \"utf-8\" ) if isinstance ( val , str ) else val for val in infer_output . data ] # str to byte conversion for grpc proto infer_output_dict [ \"contents\" ][ data_key ] = infer_output . data else : raise InvalidInput ( \"to_grpc: invalid output datatype\" ) infer_outputs . append ( infer_output_dict ) return ModelInferResponse ( id = self . id , model_name = self . model_name , model_version = self . 
model_version , outputs = infer_outputs , raw_output_contents = raw_output_contents , parameters = to_grpc_parameters ( self . parameters ) if self . parameters else None , ) to_rest () \u00b6 Converts the InferResponse object to v2 REST InferResponse dict. Returns: Type Description Dict The InferResponse Dict. Source code in kserve/protocol/infer_type.py 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 def to_rest ( self ) -> Dict : \"\"\"Converts the InferResponse object to v2 REST InferResponse dict. Returns: The InferResponse Dict. \"\"\" infer_outputs = [] for i , infer_output in enumerate ( self . outputs ): infer_output_dict = { \"name\" : infer_output . name , \"shape\" : infer_output . shape , \"datatype\" : infer_output . datatype , } if infer_output . parameters : infer_output_dict [ \"parameters\" ] = to_http_parameters ( infer_output . parameters ) if isinstance ( infer_output . data , numpy . ndarray ): infer_output . set_data_from_numpy ( infer_output . data , binary_data = False ) infer_output_dict [ \"data\" ] = infer_output . data elif isinstance ( infer_output . _raw_data , bytes ): infer_output_dict [ \"data\" ] = infer_output . as_numpy () . tolist () else : infer_output_dict [ \"data\" ] = infer_output . data infer_outputs . append ( infer_output_dict ) res = { \"id\" : self . id , \"model_name\" : self . model_name , \"model_version\" : self . model_version , \"outputs\" : infer_outputs , } if self . parameters : res [ \"parameters\" ] = to_http_parameters ( self . parameters ) return res serialize_byte_tensor ( input_tensor ) \u00b6 Serializes a bytes tensor into a flat numpy array of length prepended bytes. The numpy array should use dtype of np.object_. For np.bytes_, numpy will remove trailing zeros at the end of byte sequence and because of this it should be avoided. Args: input_tensor : np.array of the bytes tensor to serialize. 
Returns: serialized_bytes_tensor : The 1-D numpy array of type uint8 containing the serialized bytes in 'C' order. Source code in kserve/protocol/infer_type.py 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 def serialize_byte_tensor ( input_tensor : numpy . ndarray ): \"\"\" Serializes a bytes tensor into a flat numpy array of length prepended bytes. The numpy array should use dtype of np.object_. For np.bytes_, numpy will remove trailing zeros at the end of byte sequence and because of this it should be avoided. Args: input_tensor : np.array of the bytes tensor to serialize. Returns: serialized_bytes_tensor : The 1-D numpy array of type uint8 containing the serialized bytes in 'C' order. \"\"\" if input_tensor . size == 0 : return () # If the input is a tensor of string/bytes objects, then must flatten those # into a 1-dimensional array containing the 4-byte byte size followed by the # actual element bytes. All elements are concatenated together in \"C\" order. if ( input_tensor . dtype == np . object_ ) or ( input_tensor . dtype . type == np . bytes_ ): flattened_ls = [] for obj in np . nditer ( input_tensor , flags = [ \"refs_ok\" ], order = \"C\" ): # If directly passing bytes to BYTES type, # don't convert it to str as Python will encode the # bytes which may distort the meaning if input_tensor . dtype == np . object_ : if type ( obj . item ()) == bytes : s = obj . item () else : s = str ( obj . item ()) . encode ( \"utf-8\" ) else : s = obj . item () flattened_ls . append ( struct . pack ( \" Dict [ str , InferParameter ]: \"\"\" Converts REST parameters to GRPC InferParameter objects :param parameters: parameters to be converted. :return: converted parameters as Dict[str, InferParameter] :raises InvalidInput: if the parameter type is not supported. \"\"\" grpc_params : Dict [ str , InferParameter ] = {} for key , val in parameters . 
items (): if isinstance ( val , str ): grpc_params [ key ] = InferParameter ( string_param = val ) elif isinstance ( val , bool ): grpc_params [ key ] = InferParameter ( bool_param = val ) elif isinstance ( val , int ): grpc_params [ key ] = InferParameter ( int64_param = val ) elif isinstance ( val , InferParameter ): grpc_params [ key ] = val else : raise InvalidInput ( f \"to_grpc: invalid parameter value: { val } \" ) return grpc_params to_http_parameters ( parameters ) \u00b6 Converts GRPC InferParameter parameters to REST parameters :param parameters: parameters to be converted. :return: converted parameters as Dict[str, Union[str, bool, int]] Source code in kserve/protocol/infer_type.py 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 def to_http_parameters ( parameters : Union [ dict , MessageMap [ str , InferParameter ]] ) -> Dict [ str , Union [ str , bool , int ]]: \"\"\" Converts GRPC InferParameter parameters to REST parameters :param parameters: parameters to be converted. :return: converted parameters as Dict[str, Union[str, bool, int]] \"\"\" http_params : Dict [ str , Union [ str , bool , int ]] = {} for key , val in parameters . items (): if isinstance ( val , InferParameter ): if val . HasField ( \"bool_param\" ): http_params [ key ] = val . bool_param elif val . HasField ( \"int64_param\" ): http_params [ key ] = val . int64_param elif val . HasField ( \"string_param\" ): http_params [ key ] = val . 
string_param else : http_params [ key ] = val return http_params","title":"KServe Python Serving Runtime API"},{"location":"python_runtime_api/docs/api/#kserve-python-serving-runtime-api","text":"","title":"KServe Python Serving Runtime API"},{"location":"python_runtime_api/docs/api/#kserve.model_server.ModelServer","text":"Source code in kserve/model_server.py 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 class ModelServer : def __init__ ( self , http_port : int = args . http_port , grpc_port : int = args . grpc_port , workers : int = args . workers , max_threads : int = args . max_threads , max_asyncio_workers : int = args . max_asyncio_workers , registered_models : ModelRepository = ModelRepository (), enable_grpc : bool = args . enable_grpc , enable_docs_url : bool = args . enable_docs_url , enable_latency_logging : bool = args . enable_latency_logging , configure_logging : bool = args . configure_logging , log_config : Optional [ Union [ Dict , str ]] = args . log_config_file , access_log_format : str = args . access_log_format , ): \"\"\"KServe ModelServer Constructor Args: http_port: HTTP port. Default: ``8080``. 
grpc_port: GRPC port. Default: ``8081``. workers: Number of uvicorn workers. Default: ``1``. max_threads: Max number of gRPC processing threads. Default: ``4`` max_asyncio_workers: Max number of AsyncIO threads. Default: ``None`` registered_models: Model repository with registered models. enable_grpc: Whether to turn on grpc server. Default: ``True`` enable_docs_url: Whether to turn on ``/docs`` Swagger UI. Default: ``False``. enable_latency_logging: Whether to log latency metric. Default: ``True``. configure_logging: Whether to configure KServe and Uvicorn logging. Default: ``True``. log_config: File path or dict containing log config. Default: ``None``. access_log_format: Format to set for the access log (provided by asgi-logger). Default: ``None`` \"\"\" self . registered_models = registered_models self . http_port = http_port self . grpc_port = grpc_port self . workers = workers self . max_threads = max_threads self . max_asyncio_workers = max_asyncio_workers self . enable_grpc = enable_grpc self . enable_docs_url = enable_docs_url self . enable_latency_logging = enable_latency_logging self . dataplane = DataPlane ( model_registry = registered_models ) self . model_repository_extension = ModelRepositoryExtension ( model_registry = self . registered_models ) self . _grpc_server = None self . _rest_server = None if self . enable_grpc : self . _grpc_server = GRPCServer ( grpc_port , self . dataplane , self . model_repository_extension ) # Logs can be passed as a path to a file or a dictConfig. # We rely on Uvicorn to configure the loggers for us. if configure_logging : self . log_config = ( log_config if log_config is not None else KSERVE_LOG_CONFIG ) else : # By setting log_config to None we tell Uvicorn not to configure logging self . log_config = None self . access_log_format = access_log_format self . 
_custom_exception_handler = None def start ( self , models : Union [ List [ BaseKServeModel ], Dict [ str , Deployment ]] ) -> None : \"\"\"Start the model server with a set of registered models. Args: models: a list of models to register to the model server. \"\"\" if isinstance ( models , list ): for model in models : if isinstance ( model , BaseKServeModel ): self . register_model ( model ) # pass whether to log request latency into the model model . enable_latency_logging = self . enable_latency_logging else : raise RuntimeError ( \"Model type should be 'BaseKServeModel'\" ) elif isinstance ( models , dict ): if all ([ isinstance ( v , Deployment ) for v in models . values ()]): # TODO: make this port number a variable rayserve . start ( detached = True , http_options = { \"host\" : \"0.0.0.0\" , \"port\" : 9071 } ) for key in models : models [ key ] . deploy () handle = models [ key ] . get_handle () self . register_model_handle ( key , handle ) else : raise RuntimeError ( \"Model type should be RayServe Deployment\" ) else : raise RuntimeError ( \"Unknown model collection types\" ) if self . max_asyncio_workers is None : # formula as suggest in https://bugs.python.org/issue35279 self . max_asyncio_workers = min ( 32 , utils . cpu_count () + 4 ) logger . info ( f \"Setting max asyncio worker threads as { self . max_asyncio_workers } \" ) asyncio . get_event_loop () . set_default_executor ( concurrent . futures . ThreadPoolExecutor ( max_workers = self . max_asyncio_workers ) ) async def serve (): logger . info ( f \"Starting uvicorn with { self . workers } workers\" ) loop = asyncio . get_event_loop () if sys . platform not in [ \"win32\" , \"win64\" ]: sig_list = [ signal . SIGINT , signal . SIGTERM , signal . SIGQUIT ] else : sig_list = [ signal . SIGINT , signal . SIGTERM ] for sig in sig_list : loop . add_signal_handler ( sig , lambda s = sig : asyncio . create_task ( self . stop ( sig = s )) ) if self . _custom_exception_handler is None : loop . 
set_exception_handler ( self . default_exception_handler ) else : loop . set_exception_handler ( self . _custom_exception_handler ) if self . workers == 1 : self . _rest_server = UvicornServer ( self . http_port , [], self . dataplane , self . model_repository_extension , self . enable_docs_url , log_config = self . log_config , access_log_format = self . access_log_format , ) await self . _rest_server . run () else : # Since py38 MacOS/Windows defaults to use spawn for starting multiprocessing. # https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods # Spawn does not work with FastAPI/uvicorn in multiprocessing mode, use fork for multiprocessing # https://github.com/tiangolo/fastapi/issues/1586 serversocket = socket . socket ( socket . AF_INET , socket . SOCK_STREAM ) serversocket . setsockopt ( socket . SOL_SOCKET , socket . SO_REUSEADDR , 1 ) serversocket . bind (( \"0.0.0.0\" , self . http_port )) serversocket . listen ( 5 ) multiprocessing . set_start_method ( \"fork\" ) self . _rest_server = UvicornServer ( self . http_port , [ serversocket ], self . dataplane , self . model_repository_extension , self . enable_docs_url , log_config = self . log_config , access_log_format = self . access_log_format , ) for _ in range ( self . workers ): p = Process ( target = self . _rest_server . run_sync ) p . start () async def servers_task (): servers = [ serve ()] if self . enable_grpc : servers . append ( self . _grpc_server . start ( self . max_threads )) await asyncio . gather ( * servers ) asyncio . run ( servers_task ()) async def stop ( self , sig : Optional [ int ] = None ): \"\"\"Stop the instances of REST and gRPC model servers. Args: sig: The signal to stop the server. Default: ``None``. \"\"\" logger . info ( \"Stopping the model server\" ) if self . _rest_server : logger . info ( \"Stopping the rest server\" ) await self . _rest_server . stop () if self . _grpc_server : logger . info ( \"Stopping the grpc server\" ) await self . 
_grpc_server . stop ( sig ) for model_name in list ( self . registered_models . get_models () . keys ()): self . registered_models . unload ( model_name ) def register_exception_handler ( self , handler : Callable [[ asyncio . events . AbstractEventLoop , Dict [ str , Any ]], None ], ): \"\"\"Add a custom handler as the event loop exception handler. If a handler is not provided, the default exception handler will be set. handler should be a callable object, it should have a signature matching '(loop, context)', where 'loop' will be a reference to the active event loop, 'context' will be a dict object (see `call_exception_handler()` documentation for details about context). \"\"\" self . _custom_exception_handler = handler def default_exception_handler ( self , loop : asyncio . events . AbstractEventLoop , context : Dict [ str , Any ] ): \"\"\"Default exception handler for event loop. This is called when an exception occurs and no exception handler is set. By default, this will shut down the server gracefully. This can be called by a custom exception handler that wants to defer to the default handler behavior. \"\"\" # gracefully shutdown the server loop . run_until_complete ( self . stop ()) loop . default_exception_handler ( context ) def register_model_handle ( self , name : str , model_handle : DeploymentHandle ): \"\"\"Register a model handle to the model server. Args: name: The name of the model handle. model_handle: The model handle object. \"\"\" self . registered_models . update_handle ( name , model_handle ) logger . info ( \"Registering model handle: %s \" , name ) def register_model ( self , model : BaseKServeModel ): \"\"\"Register a model to the model server. Args: model: The model object. \"\"\" if not model . name : raise Exception ( \"Failed to register model, model.name must be provided.\" ) self . registered_models . update ( model ) logger . info ( \"Registering model: %s \" , model . 
name )","title":"ModelServer"},{"location":"python_runtime_api/docs/api/#kserve.model_server.ModelServer.__init__","text":"KServe ModelServer Constructor Parameters: Name Type Description Default http_port int HTTP port. Default: 8080 . http_port grpc_port int GRPC port. Default: 8081 . grpc_port workers int Number of uvicorn workers. Default: 1 . workers max_threads int Max number of gRPC processing threads. Default: 4 max_threads max_asyncio_workers int Max number of AsyncIO threads. Default: None max_asyncio_workers registered_models ModelRepository Model repository with registered models. ModelRepository () enable_grpc bool Whether to turn on grpc server. Default: True enable_grpc enable_docs_url bool Whether to turn on /docs Swagger UI. Default: False . enable_docs_url enable_latency_logging bool Whether to log latency metric. Default: True . enable_latency_logging configure_logging bool Whether to configure KServe and Uvicorn logging. Default: True . configure_logging log_config Optional [ Union [ Dict , str ]] File path or dict containing log config. Default: None . log_config_file access_log_format str Format to set for the access log (provided by asgi-logger). Default: None access_log_format Source code in kserve/model_server.py 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 def __init__ ( self , http_port : int = args . http_port , grpc_port : int = args . grpc_port , workers : int = args . workers , max_threads : int = args . max_threads , max_asyncio_workers : int = args . max_asyncio_workers , registered_models : ModelRepository = ModelRepository (), enable_grpc : bool = args . enable_grpc , enable_docs_url : bool = args . enable_docs_url , enable_latency_logging : bool = args . enable_latency_logging , configure_logging : bool = args . 
configure_logging , log_config : Optional [ Union [ Dict , str ]] = args . log_config_file , access_log_format : str = args . access_log_format , ): \"\"\"KServe ModelServer Constructor Args: http_port: HTTP port. Default: ``8080``. grpc_port: GRPC port. Default: ``8081``. workers: Number of uvicorn workers. Default: ``1``. max_threads: Max number of gRPC processing threads. Default: ``4`` max_asyncio_workers: Max number of AsyncIO threads. Default: ``None`` registered_models: Model repository with registered models. enable_grpc: Whether to turn on grpc server. Default: ``True`` enable_docs_url: Whether to turn on ``/docs`` Swagger UI. Default: ``False``. enable_latency_logging: Whether to log latency metric. Default: ``True``. configure_logging: Whether to configure KServe and Uvicorn logging. Default: ``True``. log_config: File path or dict containing log config. Default: ``None``. access_log_format: Format to set for the access log (provided by asgi-logger). Default: ``None`` \"\"\" self . registered_models = registered_models self . http_port = http_port self . grpc_port = grpc_port self . workers = workers self . max_threads = max_threads self . max_asyncio_workers = max_asyncio_workers self . enable_grpc = enable_grpc self . enable_docs_url = enable_docs_url self . enable_latency_logging = enable_latency_logging self . dataplane = DataPlane ( model_registry = registered_models ) self . model_repository_extension = ModelRepositoryExtension ( model_registry = self . registered_models ) self . _grpc_server = None self . _rest_server = None if self . enable_grpc : self . _grpc_server = GRPCServer ( grpc_port , self . dataplane , self . model_repository_extension ) # Logs can be passed as a path to a file or a dictConfig. # We rely on Uvicorn to configure the loggers for us. if configure_logging : self . 
log_config = ( log_config if log_config is not None else KSERVE_LOG_CONFIG ) else : # By setting log_config to None we tell Uvicorn not to configure logging self . log_config = None self . access_log_format = access_log_format self . _custom_exception_handler = None","title":"__init__"},{"location":"python_runtime_api/docs/api/#kserve.model_server.ModelServer.default_exception_handler","text":"Default exception handler for event loop. This is called when an exception occurs and no exception handler is set. By default, this will shut down the server gracefully. This can be called by a custom exception handler that wants to defer to the default handler behavior. Source code in kserve/model_server.py 348 349 350 351 352 353 354 355 356 357 358 359 360 def default_exception_handler ( self , loop : asyncio . events . AbstractEventLoop , context : Dict [ str , Any ] ): \"\"\"Default exception handler for event loop. This is called when an exception occurs and no exception handler is set. By default, this will shut down the server gracefully. This can be called by a custom exception handler that wants to defer to the default handler behavior. \"\"\" # gracefully shutdown the server loop . run_until_complete ( self . stop ()) loop . default_exception_handler ( context )","title":"default_exception_handler"},{"location":"python_runtime_api/docs/api/#kserve.model_server.ModelServer.register_exception_handler","text":"Add a custom handler as the event loop exception handler. If a handler is not provided, the default exception handler will be set. handler should be a callable object, it should have a signature matching '(loop, context)', where 'loop' will be a reference to the active event loop, 'context' will be a dict object (see call_exception_handler() documentation for details about context). Source code in kserve/model_server.py 334 335 336 337 338 339 340 341 342 343 344 345 346 def register_exception_handler ( self , handler : Callable [[ asyncio . events . 
AbstractEventLoop , Dict [ str , Any ]], None ], ): \"\"\"Add a custom handler as the event loop exception handler. If a handler is not provided, the default exception handler will be set. handler should be a callable object, it should have a signature matching '(loop, context)', where 'loop' will be a reference to the active event loop, 'context' will be a dict object (see `call_exception_handler()` documentation for details about context). \"\"\" self . _custom_exception_handler = handler","title":"register_exception_handler"},{"location":"python_runtime_api/docs/api/#kserve.model_server.ModelServer.register_model","text":"Register a model to the model server. Parameters: Name Type Description Default model BaseKServeModel The model object. required Source code in kserve/model_server.py 372 373 374 375 376 377 378 379 380 381 def register_model ( self , model : BaseKServeModel ): \"\"\"Register a model to the model server. Args: model: The model object. \"\"\" if not model . name : raise Exception ( \"Failed to register model, model.name must be provided.\" ) self . registered_models . update ( model ) logger . info ( \"Registering model: %s \" , model . name )","title":"register_model"},{"location":"python_runtime_api/docs/api/#kserve.model_server.ModelServer.register_model_handle","text":"Register a model handle to the model server. Parameters: Name Type Description Default name str The name of the model handle. required model_handle DeploymentHandle The model handle object. required Source code in kserve/model_server.py 362 363 364 365 366 367 368 369 370 def register_model_handle ( self , name : str , model_handle : DeploymentHandle ): \"\"\"Register a model handle to the model server. Args: name: The name of the model handle. model_handle: The model handle object. \"\"\" self . registered_models . update_handle ( name , model_handle ) logger . 
info ( \"Registering model handle: %s \" , name )","title":"register_model_handle"},{"location":"python_runtime_api/docs/api/#kserve.model_server.ModelServer.start","text":"Start the model server with a set of registered models. Parameters: Name Type Description Default models Union [ List [ BaseKServeModel ], Dict [ str , Deployment ]] a list of models to register to the model server. required Source code in kserve/model_server.py 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 def start ( self , models : Union [ List [ BaseKServeModel ], Dict [ str , Deployment ]] ) -> None : \"\"\"Start the model server with a set of registered models. Args: models: a list of models to register to the model server. \"\"\" if isinstance ( models , list ): for model in models : if isinstance ( model , BaseKServeModel ): self . register_model ( model ) # pass whether to log request latency into the model model . enable_latency_logging = self . enable_latency_logging else : raise RuntimeError ( \"Model type should be 'BaseKServeModel'\" ) elif isinstance ( models , dict ): if all ([ isinstance ( v , Deployment ) for v in models . values ()]): # TODO: make this port number a variable rayserve . start ( detached = True , http_options = { \"host\" : \"0.0.0.0\" , \"port\" : 9071 } ) for key in models : models [ key ] . deploy () handle = models [ key ] . get_handle () self . register_model_handle ( key , handle ) else : raise RuntimeError ( \"Model type should be RayServe Deployment\" ) else : raise RuntimeError ( \"Unknown model collection types\" ) if self . max_asyncio_workers is None : # formula as suggest in https://bugs.python.org/issue35279 self . 
max_asyncio_workers = min ( 32 , utils . cpu_count () + 4 ) logger . info ( f \"Setting max asyncio worker threads as { self . max_asyncio_workers } \" ) asyncio . get_event_loop () . set_default_executor ( concurrent . futures . ThreadPoolExecutor ( max_workers = self . max_asyncio_workers ) ) async def serve (): logger . info ( f \"Starting uvicorn with { self . workers } workers\" ) loop = asyncio . get_event_loop () if sys . platform not in [ \"win32\" , \"win64\" ]: sig_list = [ signal . SIGINT , signal . SIGTERM , signal . SIGQUIT ] else : sig_list = [ signal . SIGINT , signal . SIGTERM ] for sig in sig_list : loop . add_signal_handler ( sig , lambda s = sig : asyncio . create_task ( self . stop ( sig = s )) ) if self . _custom_exception_handler is None : loop . set_exception_handler ( self . default_exception_handler ) else : loop . set_exception_handler ( self . _custom_exception_handler ) if self . workers == 1 : self . _rest_server = UvicornServer ( self . http_port , [], self . dataplane , self . model_repository_extension , self . enable_docs_url , log_config = self . log_config , access_log_format = self . access_log_format , ) await self . _rest_server . run () else : # Since py38 MacOS/Windows defaults to use spawn for starting multiprocessing. # https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods # Spawn does not work with FastAPI/uvicorn in multiprocessing mode, use fork for multiprocessing # https://github.com/tiangolo/fastapi/issues/1586 serversocket = socket . socket ( socket . AF_INET , socket . SOCK_STREAM ) serversocket . setsockopt ( socket . SOL_SOCKET , socket . SO_REUSEADDR , 1 ) serversocket . bind (( \"0.0.0.0\" , self . http_port )) serversocket . listen ( 5 ) multiprocessing . set_start_method ( \"fork\" ) self . _rest_server = UvicornServer ( self . http_port , [ serversocket ], self . dataplane , self . model_repository_extension , self . enable_docs_url , log_config = self . 
log_config , access_log_format = self . access_log_format , ) for _ in range ( self . workers ): p = Process ( target = self . _rest_server . run_sync ) p . start () async def servers_task (): servers = [ serve ()] if self . enable_grpc : servers . append ( self . _grpc_server . start ( self . max_threads )) await asyncio . gather ( * servers ) asyncio . run ( servers_task ())","title":"start"},{"location":"python_runtime_api/docs/api/#kserve.model_server.ModelServer.stop","text":"Stop the instances of REST and gRPC model servers. Parameters: Name Type Description Default sig Optional [ int ] The signal to stop the server. Default: None . None Source code in kserve/model_server.py 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 async def stop ( self , sig : Optional [ int ] = None ): \"\"\"Stop the instances of REST and gRPC model servers. Args: sig: The signal to stop the server. Default: ``None``. \"\"\" logger . info ( \"Stopping the model server\" ) if self . _rest_server : logger . info ( \"Stopping the rest server\" ) await self . _rest_server . stop () if self . _grpc_server : logger . info ( \"Stopping the grpc server\" ) await self . _grpc_server . stop ( sig ) for model_name in list ( self . registered_models . get_models () . keys ()): self . registered_models . unload ( model_name )","title":"stop"},{"location":"python_runtime_api/docs/api/#kserve.model.BaseKServeModel","text":"Bases: ABC A base class to inherit all of the kserve models from. This class implements the expectations of model repository and model server. Source code in kserve/model.py 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 class BaseKServeModel ( ABC ): \"\"\" A base class to inherit all of the kserve models from. This class implements the expectations of model repository and model server. \"\"\" def __init__ ( self , name : str ): \"\"\" Adds the required attributes Args: name: The name of the model. \"\"\" self . name = name self . 
ready = False def stop ( self ): \"\"\"Stop handler can be overridden to perform model teardown\"\"\" pass","title":"BaseKServeModel"},{"location":"python_runtime_api/docs/api/#kserve.model.BaseKServeModel.__init__","text":"Adds the required attributes Parameters: Name Type Description Default name str The name of the model. required Source code in kserve/model.py 53 54 55 56 57 58 59 60 61 def __init__ ( self , name : str ): \"\"\" Adds the required attributes Args: name: The name of the model. \"\"\" self . name = name self . ready = False","title":"__init__"},{"location":"python_runtime_api/docs/api/#kserve.model.BaseKServeModel.stop","text":"Stop handler can be overridden to perform model teardown Source code in kserve/model.py 63 64 65 def stop ( self ): \"\"\"Stop handler can be overridden to perform model teardown\"\"\" pass","title":"stop"},{"location":"python_runtime_api/docs/api/#kserve.model.Model","text":"Bases: BaseKServeModel Source code in kserve/model.pyclass Model ( BaseKServeModel ): def __init__ ( self , name : str , predictor_config : Optional [ PredictorConfig ] = None ): \"\"\"KServe Model Public Interface Model is intended to be subclassed to implement the model handlers. Args: name: The name of the model. predictor_config: The configurations for http call to the predictor. \"\"\" super () . __init__ ( name ) # The predictor config member fields are kept for backwards compatibility as they could be set outside self . protocol = ( predictor_config . predictor_protocol if predictor_config else PredictorProtocol . REST_V1 . value ) self . predictor_host = ( predictor_config . predictor_host if predictor_config else None ) # The default timeout matches what is set in generated Istio virtual service resources. # We generally don't want things to time out at the request level here, # timeouts should be handled elsewhere in the system. self . timeout = ( predictor_config . predictor_request_timeout_seconds if predictor_config else 600 ) self . 
use_ssl = predictor_config . predictor_use_ssl if predictor_config else False self . explainer_host = None self . _http_client_instance = None self . _grpc_client_stub = None self . enable_latency_logging = False async def __call__ ( self , body : Union [ Dict , CloudEvent , InferRequest ], verb : InferenceVerb = InferenceVerb . PREDICT , headers : Dict [ str , str ] = None , ) -> Union [ Dict , InferResponse , List [ str ]]: \"\"\"Method to call predictor or explainer with the given input. Args: body: Request body. verb: The inference verb for predict/generate/explain headers: Request headers. Returns: Response output from preprocess -> predict/generate/explain -> postprocess \"\"\" request_id = headers . get ( \"x-request-id\" , \"N.A.\" ) if headers else \"N.A.\" # latency vars preprocess_ms = 0 explain_ms = 0 predict_ms = 0 postprocess_ms = 0 prom_labels = get_labels ( self . name ) with PRE_HIST_TIME . labels ( ** prom_labels ) . time (): start = time . time () payload = ( await self . preprocess ( body , headers ) if inspect . iscoroutinefunction ( self . preprocess ) else self . preprocess ( body , headers ) ) preprocess_ms = get_latency_ms ( start , time . time ()) payload = self . validate ( payload ) if verb == InferenceVerb . EXPLAIN : with EXPLAIN_HIST_TIME . labels ( ** prom_labels ) . time (): start = time . time () response = ( ( await self . explain ( payload , headers )) if inspect . iscoroutinefunction ( self . explain ) else self . explain ( payload , headers ) ) explain_ms = get_latency_ms ( start , time . time ()) elif verb == InferenceVerb . PREDICT : with PREDICT_HIST_TIME . labels ( ** prom_labels ) . time (): start = time . time () response = ( ( await self . predict ( payload , headers )) if inspect . iscoroutinefunction ( self . predict ) else self . predict ( payload , headers ) ) predict_ms = get_latency_ms ( start , time . time ()) else : raise NotImplementedError with POST_HIST_TIME . labels ( ** prom_labels ) . 
time (): start = time . time () response = ( await self . postprocess ( response , headers ) if inspect . iscoroutinefunction ( self . postprocess ) else self . postprocess ( response , headers ) ) postprocess_ms = get_latency_ms ( start , time . time ()) if self . enable_latency_logging is True : trace_logger . info ( f \"requestId: { request_id } , preprocess_ms: { preprocess_ms } , \" f \"explain_ms: { explain_ms } , predict_ms: { predict_ms } , \" f \"postprocess_ms: { postprocess_ms } \" ) return response @property def _http_client ( self ): if self . _http_client_instance is None : self . _http_client_instance = httpx . AsyncClient () return self . _http_client_instance @property def _grpc_client ( self ): if self . _grpc_client_stub is None : # requires appending the port to the predictor host for gRPC to work if \":\" not in self . predictor_host : port = 443 if self . use_ssl else 80 self . predictor_host = f \" { self . predictor_host } : { port } \" if self . use_ssl : _channel = grpc . aio . secure_channel ( self . predictor_host , grpc . ssl_channel_credentials () ) else : _channel = grpc . aio . insecure_channel ( self . predictor_host ) self . _grpc_client_stub = grpc_predict_v2_pb2_grpc . GRPCInferenceServiceStub ( _channel ) return self . _grpc_client_stub def validate ( self , payload ): if isinstance ( payload , ModelInferRequest ): return payload if isinstance ( payload , InferRequest ): return payload # TODO: validate the request if self.get_input_types() defines the input types. if self . protocol == PredictorProtocol . REST_V2 . value : if \"inputs\" in payload and not isinstance ( payload [ \"inputs\" ], list ): raise InvalidInput ( 'Expected \"inputs\" to be a list' ) elif self . protocol == PredictorProtocol . REST_V1 . 
value : if ( isinstance ( payload , Dict ) and \"instances\" in payload and not isinstance ( payload [ \"instances\" ], list ) ): raise InvalidInput ( 'Expected \"instances\" to be a list' ) return payload def load ( self ) -> bool : \"\"\"Load handler can be overridden to load the model from storage. The `self.ready` should be set to True after the model is loaded. The flag is used for model health check. Returns: bool: True if model is ready, False otherwise \"\"\" self . ready = True return self . ready def get_input_types ( self ) -> List [ Dict ]: # Override this function to return appropriate input format expected by your model. # Refer https://kserve.github.io/website/0.9/modelserving/inference_api/#model-metadata-response-json-object # Eg. # return [{ \"name\": \"\", \"datatype\": \"INT32\", \"shape\": [1,5], }] return [] def get_output_types ( self ) -> List [ Dict ]: # Override this function to return appropriate output format returned by your model. # Refer https://kserve.github.io/website/0.9/modelserving/inference_api/#model-metadata-response-json-object # Eg. # return [{ \"name\": \"\", \"datatype\": \"INT32\", \"shape\": [1,5], }] return [] async def preprocess ( self , payload : Union [ Dict , InferRequest ], headers : Dict [ str , str ] = None ) -> Union [ Dict , InferRequest ]: \"\"\"`preprocess` handler can be overridden for data or feature transformation. The model decodes the request body to `Dict` for v1 endpoints and `InferRequest` for v2 endpoints. Args: payload: Payload of the request. headers: Request headers. Returns: A Dict or InferRequest in KServe Model Transformer mode which is transmitted on the wire to predictor. Tensors in KServe Predictor mode which is passed to predict handler for performing the inference. 
\"\"\" return payload async def postprocess ( self , result : Union [ Dict , InferResponse ], headers : Dict [ str , str ] = None ) -> Union [ Dict , InferResponse ]: \"\"\"The `postprocess` handler can be overridden for inference result or response transformation. The predictor sends back the inference result in `Dict` for v1 endpoints and `InferResponse` for v2 endpoints. Args: result: The inference result passed from `predict` handler or the HTTP response from predictor. headers: Request headers. Returns: A Dict or InferResponse after post-process to return back to the client. \"\"\" return result async def _http_predict ( self , payload : Union [ Dict , InferRequest ], headers : Dict [ str , str ] = None ) -> Dict : protocol = \"https\" if self . use_ssl else \"http\" predict_url = PREDICTOR_URL_FORMAT . format ( protocol , self . predictor_host , self . name ) if self . protocol == PredictorProtocol . REST_V2 . value : predict_url = PREDICTOR_V2_URL_FORMAT . format ( protocol , self . predictor_host , self . name ) # Adjusting headers. Inject content type if not exist. # Also, removing host, as the header is the one passed to transformer and contains transformer's host predict_headers = { \"Content-Type\" : \"application/json\" } if headers is not None : if \"x-request-id\" in headers : predict_headers [ \"x-request-id\" ] = headers [ \"x-request-id\" ] if \"x-b3-traceid\" in headers : predict_headers [ \"x-b3-traceid\" ] = headers [ \"x-b3-traceid\" ] if isinstance ( payload , InferRequest ): payload = payload . to_rest () data = orjson . dumps ( payload ) try : response = await self . _http_client . post ( predict_url , timeout = self . timeout , headers = predict_headers , content = data ) except Exception as exc : request_id = predict_headers . get ( \"x-request-id\" , \"N.A.\" ) logger . error ( f \"Could not send a request to predictor at url { predict_url } \" f \"for { request_id =} \" f \"due to exception { exc } \" ) raise exc if not response . 
is_success : message = ( \" {error_message} , ' {0.status_code} {0.reason_phrase} ' for url ' {0.url} '\" ) error_message = \"\" if ( \"content-type\" in response . headers and response . headers [ \"content-type\" ] == \"application/json\" ): error_message = response . json () if \"error\" in error_message : error_message = error_message [ \"error\" ] message = message . format ( response , error_message = error_message ) raise HTTPStatusError ( message , request = response . request , response = response ) return orjson . loads ( response . content ) async def _grpc_predict ( self , payload : Union [ ModelInferRequest , InferRequest ], headers : Dict [ str , str ] = None , ) -> ModelInferResponse : if isinstance ( payload , InferRequest ): payload = payload . to_grpc () async_result = await self . _grpc_client . ModelInfer ( request = payload , timeout = self . timeout , metadata = ( ( \"request_type\" , \"grpc_v2\" ), ( \"response_type\" , \"grpc_v2\" ), ( \"x-request-id\" , headers . get ( \"x-request-id\" , \"\" )), ), ) return async_result async def predict ( self , payload : Union [ Dict , InferRequest , ModelInferRequest ], headers : Dict [ str , str ] = None , ) -> Union [ Dict , InferResponse , AsyncIterator [ Any ]]: \"\"\"The `predict` handler can be overridden for performing the inference. By default, the predict handler makes call to predictor for the inference step. Args: payload: Model inputs passed from `preprocess` handler. headers: Request headers. Returns: Inference result or a Response from the predictor. Raises: HTTPStatusError when getting back an error response from the predictor. \"\"\" if not self . predictor_host : raise NotImplementedError ( \"Could not find predictor_host.\" ) if self . protocol == PredictorProtocol . GRPC_V2 . value : res = await self . _grpc_predict ( payload , headers ) return InferResponse . from_grpc ( res ) else : res = await self . 
_http_predict ( payload , headers ) # return an InferResponse if this is REST V2, otherwise just return the dictionary return ( InferResponse . from_rest ( self . name , res ) if is_v2 ( PredictorProtocol ( self . protocol )) else res ) async def explain ( self , payload : Dict , headers : Dict [ str , str ] = None ) -> Dict : \"\"\"`explain` handler can be overridden to implement the model explanation. The default implementation makes call to the explainer if ``explainer_host`` is specified. Args: payload: Explainer model inputs passed from preprocess handler. headers: Request headers. Returns: An Explanation for the inference result. Raises: HTTPStatusError when getting back an error response from the explainer. \"\"\" if self . explainer_host is None : raise NotImplementedError ( \"Could not find explainer_host.\" ) protocol = \"https\" if self . use_ssl else \"http\" # Currently explainer only supports the kserve v1 endpoints explain_url = EXPLAINER_URL_FORMAT . format ( protocol , self . explainer_host , self . name ) response = await self . _http_client . post ( url = explain_url , timeout = self . timeout , content = orjson . dumps ( payload ) ) response . raise_for_status () return orjson . loads ( response . content )","title":"Model"},{"location":"python_runtime_api/docs/api/#kserve.model.Model.__call__","text":"Method to call predictor or explainer with the given input. Parameters: Name Type Description Default body Union [ Dict , CloudEvent , InferRequest ] Request body. required verb InferenceVerb The inference verb for predict/generate/explain PREDICT headers Dict [ str , str ] Request headers. 
None Returns: Type Description Union [ Dict , InferResponse , List [ str ]] Response output from preprocess -> predict/generate/explain -> postprocess Source code in kserve/model.py 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 async def __call__ ( self , body : Union [ Dict , CloudEvent , InferRequest ], verb : InferenceVerb = InferenceVerb . PREDICT , headers : Dict [ str , str ] = None , ) -> Union [ Dict , InferResponse , List [ str ]]: \"\"\"Method to call predictor or explainer with the given input. Args: body: Request body. verb: The inference verb for predict/generate/explain headers: Request headers. Returns: Response output from preprocess -> predict/generate/explain -> postprocess \"\"\" request_id = headers . get ( \"x-request-id\" , \"N.A.\" ) if headers else \"N.A.\" # latency vars preprocess_ms = 0 explain_ms = 0 predict_ms = 0 postprocess_ms = 0 prom_labels = get_labels ( self . name ) with PRE_HIST_TIME . labels ( ** prom_labels ) . time (): start = time . time () payload = ( await self . preprocess ( body , headers ) if inspect . iscoroutinefunction ( self . preprocess ) else self . preprocess ( body , headers ) ) preprocess_ms = get_latency_ms ( start , time . time ()) payload = self . validate ( payload ) if verb == InferenceVerb . EXPLAIN : with EXPLAIN_HIST_TIME . labels ( ** prom_labels ) . time (): start = time . time () response = ( ( await self . explain ( payload , headers )) if inspect . iscoroutinefunction ( self . explain ) else self . explain ( payload , headers ) ) explain_ms = get_latency_ms ( start , time . time ()) elif verb == InferenceVerb . PREDICT : with PREDICT_HIST_TIME . labels ( ** prom_labels ) . time (): start = time . time () response = ( ( await self . 
predict ( payload , headers )) if inspect . iscoroutinefunction ( self . predict ) else self . predict ( payload , headers ) ) predict_ms = get_latency_ms ( start , time . time ()) else : raise NotImplementedError with POST_HIST_TIME . labels ( ** prom_labels ) . time (): start = time . time () response = ( await self . postprocess ( response , headers ) if inspect . iscoroutinefunction ( self . postprocess ) else self . postprocess ( response , headers ) ) postprocess_ms = get_latency_ms ( start , time . time ()) if self . enable_latency_logging is True : trace_logger . info ( f \"requestId: { request_id } , preprocess_ms: { preprocess_ms } , \" f \"explain_ms: { explain_ms } , predict_ms: { predict_ms } , \" f \"postprocess_ms: { postprocess_ms } \" ) return response","title":"__call__"},{"location":"python_runtime_api/docs/api/#kserve.model.Model.__init__","text":"KServe Model Public Interface Model is intended to be subclassed to implement the model handlers. Parameters: Name Type Description Default name str The name of the model. required predictor_config Optional [ PredictorConfig ] The configurations for http call to the predictor. None Source code in kserve/model.py 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 def __init__ ( self , name : str , predictor_config : Optional [ PredictorConfig ] = None ): \"\"\"KServe Model Public Interface Model is intended to be subclassed to implement the model handlers. Args: name: The name of the model. predictor_config: The configurations for http call to the predictor. \"\"\" super () . __init__ ( name ) # The predictor config member fields are kept for backwards compatibility as they could be set outside self . protocol = ( predictor_config . predictor_protocol if predictor_config else PredictorProtocol . REST_V1 . value ) self . predictor_host = ( predictor_config . 
predictor_host if predictor_config else None ) # The default timeout matches what is set in generated Istio virtual service resources. # We generally don't want things to time out at the request level here, # timeouts should be handled elsewhere in the system. self . timeout = ( predictor_config . predictor_request_timeout_seconds if predictor_config else 600 ) self . use_ssl = predictor_config . predictor_use_ssl if predictor_config else False self . explainer_host = None self . _http_client_instance = None self . _grpc_client_stub = None self . enable_latency_logging = False","title":"__init__"},{"location":"python_runtime_api/docs/api/#kserve.model.Model.explain","text":"explain handler can be overridden to implement the model explanation. The default implementation makes call to the explainer if explainer_host is specified. Parameters: Name Type Description Default payload Dict Explainer model inputs passed from preprocess handler. required headers Dict [ str , str ] Request headers. None Returns: Type Description Dict An Explanation for the inference result. Source code in kserve/model.py 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 async def explain ( self , payload : Dict , headers : Dict [ str , str ] = None ) -> Dict : \"\"\"`explain` handler can be overridden to implement the model explanation. The default implementation makes call to the explainer if ``explainer_host`` is specified. Args: payload: Explainer model inputs passed from preprocess handler. headers: Request headers. Returns: An Explanation for the inference result. Raises: HTTPStatusError when getting back an error response from the explainer. \"\"\" if self . explainer_host is None : raise NotImplementedError ( \"Could not find explainer_host.\" ) protocol = \"https\" if self . use_ssl else \"http\" # Currently explainer only supports the kserve v1 endpoints explain_url = EXPLAINER_URL_FORMAT . format ( protocol , self . 
explainer_host , self . name ) response = await self . _http_client . post ( url = explain_url , timeout = self . timeout , content = orjson . dumps ( payload ) ) response . raise_for_status () return orjson . loads ( response . content )","title":"explain"},{"location":"python_runtime_api/docs/api/#kserve.model.Model.load","text":"Load handler can be overridden to load the model from storage. The self.ready should be set to True after the model is loaded. The flag is used for model health check. Returns: Name Type Description bool bool True if model is ready, False otherwise Source code in kserve/model.py 260 261 262 263 264 265 266 267 268 def load ( self ) -> bool : \"\"\"Load handler can be overridden to load the model from storage. The `self.ready` should be set to True after the model is loaded. The flag is used for model health check. Returns: bool: True if model is ready, False otherwise \"\"\" self . ready = True return self . ready","title":"load"},{"location":"python_runtime_api/docs/api/#kserve.model.Model.postprocess","text":"The postprocess handler can be overridden for inference result or response transformation. The predictor sends back the inference result in Dict for v1 endpoints and InferResponse for v2 endpoints. Parameters: Name Type Description Default result Union [ Dict , InferResponse ] The inference result passed from predict handler or the HTTP response from predictor. required headers Dict [ str , str ] Request headers. None Returns: Type Description Union [ Dict , InferResponse ] A Dict or InferResponse after post-process to return back to the client. Source code in kserve/model.py 303 304 305 306 307 308 309 310 311 312 313 314 315 316 async def postprocess ( self , result : Union [ Dict , InferResponse ], headers : Dict [ str , str ] = None ) -> Union [ Dict , InferResponse ]: \"\"\"The `postprocess` handler can be overridden for inference result or response transformation. 
The predictor sends back the inference result in `Dict` for v1 endpoints and `InferResponse` for v2 endpoints. Args: result: The inference result passed from `predict` handler or the HTTP response from predictor. headers: Request headers. Returns: A Dict or InferResponse after post-process to return back to the client. \"\"\" return result","title":"postprocess"},{"location":"python_runtime_api/docs/api/#kserve.model.Model.predict","text":"The predict handler can be overridden for performing the inference. By default, the predict handler makes call to predictor for the inference step. Parameters: Name Type Description Default payload Union [ Dict , InferRequest , ModelInferRequest ] Model inputs passed from preprocess handler. required headers Dict [ str , str ] Request headers. None Returns: Type Description Union [ Dict , InferResponse , AsyncIterator [ Any ]] Inference result or a Response from the predictor. Source code in kserve/model.py 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 async def predict ( self , payload : Union [ Dict , InferRequest , ModelInferRequest ], headers : Dict [ str , str ] = None , ) -> Union [ Dict , InferResponse , AsyncIterator [ Any ]]: \"\"\"The `predict` handler can be overridden for performing the inference. By default, the predict handler makes call to predictor for the inference step. Args: payload: Model inputs passed from `preprocess` handler. headers: Request headers. Returns: Inference result or a Response from the predictor. Raises: HTTPStatusError when getting back an error response from the predictor. \"\"\" if not self . predictor_host : raise NotImplementedError ( \"Could not find predictor_host.\" ) if self . protocol == PredictorProtocol . GRPC_V2 . value : res = await self . _grpc_predict ( payload , headers ) return InferResponse . from_grpc ( res ) else : res = await self . 
_http_predict ( payload , headers ) # return an InferResponse if this is REST V2, otherwise just return the dictionary return ( InferResponse . from_rest ( self . name , res ) if is_v2 ( PredictorProtocol ( self . protocol )) else res )","title":"predict"},{"location":"python_runtime_api/docs/api/#kserve.model.Model.preprocess","text":"preprocess handler can be overridden for data or feature transformation. The model decodes the request body to Dict for v1 endpoints and InferRequest for v2 endpoints. Parameters: Name Type Description Default payload Union [ Dict , InferRequest ] Payload of the request. required headers Dict [ str , str ] Request headers. None Returns: Type Description Union [ Dict , InferRequest ] A Dict or InferRequest in KServe Model Transformer mode which is transmitted on the wire to predictor. Union [ Dict , InferRequest ] Tensors in KServe Predictor mode which is passed to predict handler for performing the inference. Source code in kserve/model.py 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 async def preprocess ( self , payload : Union [ Dict , InferRequest ], headers : Dict [ str , str ] = None ) -> Union [ Dict , InferRequest ]: \"\"\"`preprocess` handler can be overridden for data or feature transformation. The model decodes the request body to `Dict` for v1 endpoints and `InferRequest` for v2 endpoints. Args: payload: Payload of the request. headers: Request headers. Returns: A Dict or InferRequest in KServe Model Transformer mode which is transmitted on the wire to predictor. Tensors in KServe Predictor mode which is passed to predict handler for performing the inference. \"\"\" return payload","title":"preprocess"},{"location":"python_runtime_api/docs/api/#kserve.model.PredictorConfig","text":"Source code in kserve/model.py 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 class PredictorConfig : def __init__ ( self , predictor_host : str , predictor_protocol : str = PredictorProtocol . REST_V1 . 
value , predictor_use_ssl : bool = False , predictor_request_timeout_seconds : int = 600 , ): \"\"\"The configuration for the http call to the predictor Args: predictor_host: The host name of the predictor predictor_protocol: The inference protocol used for predictor http call predictor_use_ssl: Enable using ssl for http connection to the predictor predictor_request_timeout_seconds: The request timeout seconds for the predictor http call \"\"\" self . predictor_host = predictor_host self . predictor_protocol = predictor_protocol self . predictor_use_ssl = predictor_use_ssl self . predictor_request_timeout_seconds = predictor_request_timeout_seconds","title":"PredictorConfig"},{"location":"python_runtime_api/docs/api/#kserve.model.PredictorConfig.__init__","text":"The configuration for the http call to the predictor Parameters: Name Type Description Default predictor_host str The host name of the predictor required predictor_protocol str The inference protocol used for predictor http call REST_V1 .value predictor_use_ssl bool Enable using ssl for http connection to the predictor False predictor_request_timeout_seconds int The request timeout seconds for the predictor http call 600 Source code in kserve/model.py 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 def __init__ ( self , predictor_host : str , predictor_protocol : str = PredictorProtocol . REST_V1 . value , predictor_use_ssl : bool = False , predictor_request_timeout_seconds : int = 600 , ): \"\"\"The configuration for the http call to the predictor Args: predictor_host: The host name of the predictor predictor_protocol: The inference protocol used for predictor http call predictor_use_ssl: Enable using ssl for http connection to the predictor predictor_request_timeout_seconds: The request timeout seconds for the predictor http call \"\"\" self . predictor_host = predictor_host self . predictor_protocol = predictor_protocol self . predictor_use_ssl = predictor_use_ssl self . 
predictor_request_timeout_seconds = predictor_request_timeout_seconds","title":"__init__"},{"location":"python_runtime_api/docs/api/#kserve.protocol.infer_type.InferInput","text":"Source code in kserve/protocol/infer_type.pyclass InferInput : _name : str _shape : List [ int ] _datatype : str _parameters : Dict def __init__ ( self , name : str , shape : List [ int ], datatype : str , data : Union [ List , np . ndarray , InferTensorContents ] = None , parameters : Optional [ Union [ Dict , MessageMap [ str , InferParameter ]]] = None , ): \"\"\"An object of InferInput class is used to describe the input tensor of an inference request. Args: name: The name of the inference input whose data will be described by this object. shape : The shape of the associated inference input. datatype : The data type of the associated inference input. data : The data of the inference input. When data is not set, raw_data is used for gRPC to transmit with numpy array bytes by using `set_data_from_numpy`. parameters : The additional inference parameters. \"\"\" self . _name = name self . _shape = shape self . _datatype = datatype . upper () self . _parameters = parameters self . _data = data self . _raw_data = None @property def name ( self ) -> str : \"\"\"Get the name of inference input associated with this object. Returns: The name of the inference input \"\"\" return self . _name @property def datatype ( self ) -> str : \"\"\"Get the datatype of inference input associated with this object. Returns: The datatype of the inference input. \"\"\" return self . _datatype @property def data ( self ) -> Union [ List , np . ndarray , InferTensorContents ]: \"\"\"Get the data of the inference input associated with this object. Returns: The data of the inference input. \"\"\" return self . _data @property def shape ( self ) -> List [ int ]: \"\"\"Get the shape of inference input associated with this object. Returns: The shape of the inference input. \"\"\" return self . 
_shape @property def parameters ( self ) -> Union [ Dict , MessageMap [ str , InferParameter ], None ]: \"\"\"Get the parameters of the inference input associated with this object. Returns: The additional inference parameters \"\"\" return self . _parameters def set_shape ( self , shape : List [ int ]): \"\"\"Set the shape of inference input. Args: shape : The shape of the associated inference input. \"\"\" self . _shape = shape def as_string ( self ) -> List [ List [ str ]]: if self . datatype == \"BYTES\" : return [ s . decode ( \"utf-8\" ) for li in self . _data for s in li ] else : raise InvalidInput ( f \"invalid datatype { self . datatype } in the input\" ) def as_numpy ( self ) -> np . ndarray : \"\"\"Decode the inference input data as numpy array. Returns: A numpy array of the inference input data \"\"\" dtype = to_np_dtype ( self . datatype ) if dtype is None : raise InvalidInput ( f \"invalid datatype { dtype } in the input\" ) if self . _raw_data is not None : np_array = np . frombuffer ( self . _raw_data , dtype = dtype ) return np_array . reshape ( self . _shape ) else : np_array = np . array ( self . _data , dtype = dtype ) return np_array . reshape ( self . _shape ) def set_data_from_numpy ( self , input_tensor : np . ndarray , binary_data : bool = True ): \"\"\"Set the tensor data from the specified numpy array for input associated with this object. Args: input_tensor : The tensor data in numpy array format. binary_data : Indicates whether to set data for the input in binary format or explicit tensor within JSON. The default value is True, which means the data will be delivered as binary data with gRPC or in the HTTP body after the JSON object for REST. Raises: InferenceError if failed to set data for the tensor. \"\"\" if not isinstance ( input_tensor , ( np . ndarray ,)): raise InferenceError ( \"input_tensor must be a numpy array\" ) dtype = from_np_dtype ( input_tensor . dtype ) if self . 
_datatype != dtype : raise InferenceError ( \"got unexpected datatype {} from numpy array, expected {} \" . format ( dtype , self . _datatype ) ) valid_shape = True if len ( self . _shape ) != len ( input_tensor . shape ): valid_shape = False else : for i in range ( len ( self . _shape )): if self . _shape [ i ] != input_tensor . shape [ i ]: valid_shape = False if not valid_shape : raise InferenceError ( \"got unexpected numpy array shape [ {} ], expected [ {} ]\" . format ( str ( input_tensor . shape )[ 1 : - 1 ], str ( self . _shape )[ 1 : - 1 ] ) ) if not binary_data : if self . _parameters : self . _parameters . pop ( \"binary_data_size\" , None ) self . _raw_data = None if self . _datatype == \"BYTES\" : self . _data = [] try : if input_tensor . size > 0 : for obj in np . nditer ( input_tensor , flags = [ \"refs_ok\" ], order = \"C\" ): # We need to convert the object to string using utf-8, # if we want to use the binary_data=False. JSON requires # the input to be a UTF-8 string. if input_tensor . dtype == np . object_ : if type ( obj . item ()) == bytes : self . _data . append ( str ( obj . item (), encoding = \"utf-8\" )) else : self . _data . append ( str ( obj . item ())) else : self . _data . append ( str ( obj . item (), encoding = \"utf-8\" )) except UnicodeDecodeError : raise InferenceError ( f 'Failed to encode \" { obj . item () } \" using UTF-8. Please use binary_data=True, if' \" you want to pass a byte array.\" ) else : self . _data = [ val . item () for val in input_tensor . flatten ()] else : self . _data = None if self . _datatype == \"BYTES\" : serialized_output = serialize_byte_tensor ( input_tensor ) if serialized_output . size > 0 : self . _raw_data = serialized_output . item () else : self . _raw_data = b \"\" else : self . _raw_data = input_tensor . tobytes () if self . _parameters is None : self . _parameters = { \"binary_data_size\" : len ( self . _raw_data )} else : self . _parameters [ \"binary_data_size\" ] = len ( self . 
_raw_data ) def __eq__ ( self , other ): if not isinstance ( other , InferInput ): return False if self . name != other . name : return False if self . shape != other . shape : return False if self . datatype != other . datatype : return False if self . parameters != other . parameters : return False if self . data != other . data : return False return True","title":"InferInput"},{"location":"python_runtime_api/docs/api/#kserve.protocol.infer_type.InferInput.data","text":"Get the data of the inference input associated with this object. Returns: Type Description Union [ List , ndarray , InferTensorContents ] The data of the inference input.","title":"data"},{"location":"python_runtime_api/docs/api/#kserve.protocol.infer_type.InferInput.datatype","text":"Get the datatype of inference input associated with this object. Returns: Type Description str The datatype of the inference input.","title":"datatype"},{"location":"python_runtime_api/docs/api/#kserve.protocol.infer_type.InferInput.name","text":"Get the name of inference input associated with this object. Returns: Type Description str The name of the inference input","title":"name"},{"location":"python_runtime_api/docs/api/#kserve.protocol.infer_type.InferInput.parameters","text":"Get the parameters of the inference input associated with this object. Returns: Type Description Union [ Dict , MessageMap [ str , InferParameter ], None] The additional inference parameters","title":"parameters"},{"location":"python_runtime_api/docs/api/#kserve.protocol.infer_type.InferInput.shape","text":"Get the shape of inference input associated with this object. Returns: Type Description List [ int ] The shape of the inference input.","title":"shape"},{"location":"python_runtime_api/docs/api/#kserve.protocol.infer_type.InferInput.__init__","text":"An object of InferInput class is used to describe the input tensor of an inference request. 
Parameters: Name Type Description Default name str The name of the inference input whose data will be described by this object. required shape The shape of the associated inference input. required datatype The data type of the associated inference input. required data The data of the inference input. When data is not set, raw_data is used for gRPC to transmit with numpy array bytes by using set_data_from_numpy . None parameters The additional inference parameters. None Source code in kserve/protocol/infer_type.py 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 def __init__ ( self , name : str , shape : List [ int ], datatype : str , data : Union [ List , np . ndarray , InferTensorContents ] = None , parameters : Optional [ Union [ Dict , MessageMap [ str , InferParameter ]]] = None , ): \"\"\"An object of InferInput class is used to describe the input tensor of an inference request. Args: name: The name of the inference input whose data will be described by this object. shape : The shape of the associated inference input. datatype : The data type of the associated inference input. data : The data of the inference input. When data is not set, raw_data is used for gRPC to transmit with numpy array bytes by using `set_data_from_numpy`. parameters : The additional inference parameters. \"\"\" self . _name = name self . _shape = shape self . _datatype = datatype . upper () self . _parameters = parameters self . _data = data self . _raw_data = None","title":"__init__"},{"location":"python_runtime_api/docs/api/#kserve.protocol.infer_type.InferInput.as_numpy","text":"Decode the inference input data as numpy array. Returns: Type Description ndarray A numpy array of the inference input data Source code in kserve/protocol/infer_type.py 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 def as_numpy ( self ) -> np . ndarray : \"\"\"Decode the inference input data as numpy array. 
Returns: A numpy array of the inference input data \"\"\" dtype = to_np_dtype ( self . datatype ) if dtype is None : raise InvalidInput ( f \"invalid datatype { dtype } in the input\" ) if self . _raw_data is not None : np_array = np . frombuffer ( self . _raw_data , dtype = dtype ) return np_array . reshape ( self . _shape ) else : np_array = np . array ( self . _data , dtype = dtype ) return np_array . reshape ( self . _shape )","title":"as_numpy"},{"location":"python_runtime_api/docs/api/#kserve.protocol.infer_type.InferInput.set_data_from_numpy","text":"Set the tensor data from the specified numpy array for input associated with this object. Parameters: Name Type Description Default input_tensor The tensor data in numpy array format. required binary_data Indicates whether to set data for the input in binary format or explicit tensor within JSON. The default value is True, which means the data will be delivered as binary data with gRPC or in the HTTP body after the JSON object for REST. True Source code in kserve/protocol/infer_type.py 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 def set_data_from_numpy ( self , input_tensor : np . ndarray , binary_data : bool = True ): \"\"\"Set the tensor data from the specified numpy array for input associated with this object. Args: input_tensor : The tensor data in numpy array format. binary_data : Indicates whether to set data for the input in binary format or explicit tensor within JSON. The default value is True, which means the data will be delivered as binary data with gRPC or in the HTTP body after the JSON object for REST. Raises: InferenceError if failed to set data for the tensor. \"\"\" if not isinstance ( input_tensor , ( np . 
ndarray ,)): raise InferenceError ( \"input_tensor must be a numpy array\" ) dtype = from_np_dtype ( input_tensor . dtype ) if self . _datatype != dtype : raise InferenceError ( \"got unexpected datatype {} from numpy array, expected {} \" . format ( dtype , self . _datatype ) ) valid_shape = True if len ( self . _shape ) != len ( input_tensor . shape ): valid_shape = False else : for i in range ( len ( self . _shape )): if self . _shape [ i ] != input_tensor . shape [ i ]: valid_shape = False if not valid_shape : raise InferenceError ( \"got unexpected numpy array shape [ {} ], expected [ {} ]\" . format ( str ( input_tensor . shape )[ 1 : - 1 ], str ( self . _shape )[ 1 : - 1 ] ) ) if not binary_data : if self . _parameters : self . _parameters . pop ( \"binary_data_size\" , None ) self . _raw_data = None if self . _datatype == \"BYTES\" : self . _data = [] try : if input_tensor . size > 0 : for obj in np . nditer ( input_tensor , flags = [ \"refs_ok\" ], order = \"C\" ): # We need to convert the object to string using utf-8, # if we want to use the binary_data=False. JSON requires # the input to be a UTF-8 string. if input_tensor . dtype == np . object_ : if type ( obj . item ()) == bytes : self . _data . append ( str ( obj . item (), encoding = \"utf-8\" )) else : self . _data . append ( str ( obj . item ())) else : self . _data . append ( str ( obj . item (), encoding = \"utf-8\" )) except UnicodeDecodeError : raise InferenceError ( f 'Failed to encode \" { obj . item () } \" using UTF-8. Please use binary_data=True, if' \" you want to pass a byte array.\" ) else : self . _data = [ val . item () for val in input_tensor . flatten ()] else : self . _data = None if self . _datatype == \"BYTES\" : serialized_output = serialize_byte_tensor ( input_tensor ) if serialized_output . size > 0 : self . _raw_data = serialized_output . item () else : self . _raw_data = b \"\" else : self . _raw_data = input_tensor . tobytes () if self . _parameters is None : self . 
_parameters = { \"binary_data_size\" : len ( self . _raw_data )} else : self . _parameters [ \"binary_data_size\" ] = len ( self . _raw_data )","title":"set_data_from_numpy"},{"location":"python_runtime_api/docs/api/#kserve.protocol.infer_type.InferInput.set_shape","text":"Set the shape of inference input. Parameters: Name Type Description Default shape The shape of the associated inference input. required Source code in kserve/protocol/infer_type.py 152 153 154 155 156 157 158 def set_shape ( self , shape : List [ int ]): \"\"\"Set the shape of inference input. Args: shape : The shape of the associated inference input. \"\"\" self . _shape = shape","title":"set_shape"},{"location":"python_runtime_api/docs/api/#kserve.protocol.infer_type.InferOutput","text":"Source code in kserve/protocol/infer_type.pyclass InferOutput : def __init__ ( self , name : str , shape : List [ int ], datatype : str , data : Union [ List , np . ndarray , InferTensorContents ] = None , parameters : Optional [ Union [ Dict , MessageMap [ str , InferParameter ]]] = None , ): \"\"\"An object of InferOutput class is used to describe the output tensor for an inference response. Args: name : The name of inference output whose data will be described by this object. shape : The shape of the associated inference output. datatype : The data type of the associated inference output. data : The data of the inference output. When data is not set, raw_data is used for gRPC with numpy array bytes by calling set_data_from_numpy. parameters : The additional inference parameters. \"\"\" self . _name = name self . _shape = shape self . _datatype = datatype . upper () self . _parameters = parameters self . _data = data self . _raw_data = None @property def name ( self ) -> str : \"\"\"Get the name of inference output associated with this object. Returns: The name of inference output. \"\"\" return self . 
_name @property def datatype ( self ) -> str : \"\"\"Get the data type of inference output associated with this object. Returns: The data type of inference output. \"\"\" return self . _datatype @property def data ( self ) -> Union [ List , np . ndarray , InferTensorContents ]: \"\"\"Get the data of inference output associated with this object. Returns: The data of inference output. \"\"\" return self . _data @property def shape ( self ) -> List [ int ]: \"\"\"Get the shape of inference output associated with this object. Returns: The shape of inference output \"\"\" return self . _shape @property def parameters ( self ) -> Union [ Dict , MessageMap [ str , InferParameter ], None ]: \"\"\"Get the parameters of inference output associated with this object. Returns: The additional inference parameters associated with the inference output. \"\"\" return self . _parameters @parameters . setter def parameters ( self , params : Union [ Dict , MessageMap [ str , InferParameter ]]): self . _parameters = params def set_shape ( self , shape : List [ int ]): \"\"\"Set the shape of inference output. Args: shape: The shape of the associated inference output. \"\"\" self . _shape = shape def as_numpy ( self ) -> numpy . ndarray : \"\"\"Decode the tensor output data as numpy array. Returns: The numpy array of the associated inference output data. \"\"\" dtype = to_np_dtype ( self . datatype ) if dtype is None : raise InvalidInput ( \"invalid datatype in the input\" ) if self . _raw_data is not None : np_array = np . frombuffer ( self . _raw_data , dtype = dtype ) return np_array . reshape ( self . _shape ) else : np_array = np . array ( self . _data , dtype = dtype ) return np_array . reshape ( self . _shape ) def set_data_from_numpy ( self , output_tensor : np . ndarray , binary_data : bool = True ): \"\"\"Set the tensor data from the specified numpy array for the inference output associated with this object. Args: output_tensor : The tensor data in numpy array format. 
binary_data : Indicates whether to set data for the input in binary format or explicit tensor within JSON. The default value is True, which means the data will be delivered as binary data with gRPC or in the HTTP body after the JSON object for REST. Raises: InferenceError if failed to set data for the output tensor. \"\"\" if not isinstance ( output_tensor , ( np . ndarray ,)): raise InferenceError ( \"input_tensor must be a numpy array\" ) dtype = from_np_dtype ( output_tensor . dtype ) if self . _datatype != dtype : raise InferenceError ( \"got unexpected datatype {} from numpy array, expected {} \" . format ( dtype , self . _datatype ) ) valid_shape = True if len ( self . _shape ) != len ( output_tensor . shape ): valid_shape = False else : for i in range ( len ( self . _shape )): if self . _shape [ i ] != output_tensor . shape [ i ]: valid_shape = False if not valid_shape : raise InferenceError ( \"got unexpected numpy array shape [ {} ], expected [ {} ]\" . format ( str ( output_tensor . shape )[ 1 : - 1 ], str ( self . _shape )[ 1 : - 1 ] ) ) if not binary_data : if self . _parameters : self . _parameters . pop ( \"binary_data_size\" , None ) self . _raw_data = None if self . _datatype == \"BYTES\" : self . _data = [] try : if output_tensor . size > 0 : for obj in np . nditer ( output_tensor , flags = [ \"refs_ok\" ], order = \"C\" ): # We need to convert the object to string using utf-8, # if we want to use the binary_data=False. JSON requires # the input to be a UTF-8 string. if output_tensor . dtype == np . object_ : if type ( obj . item ()) == bytes : self . _data . append ( str ( obj . item (), encoding = \"utf-8\" )) else : self . _data . append ( str ( obj . item ())) else : self . _data . append ( str ( obj . item (), encoding = \"utf-8\" )) except UnicodeDecodeError : raise InferenceError ( f 'Failed to encode \" { obj . item () } \" using UTF-8. Please use binary_data=True, if' \" you want to pass a byte array.\" ) else : self . _data = [ val . 
item () for val in output_tensor . flatten ()] else : self . _data = None if self . _datatype == \"BYTES\" : serialized_output = serialize_byte_tensor ( output_tensor ) if serialized_output . size > 0 : self . _raw_data = serialized_output . item () else : self . _raw_data = b \"\" else : self . _raw_data = output_tensor . tobytes () if self . _parameters is None : self . _parameters = { \"binary_data_size\" : len ( self . _raw_data )} else : self . _parameters [ \"binary_data_size\" ] = len ( self . _raw_data ) def __eq__ ( self , other ): if not isinstance ( other , InferOutput ): return False if self . name != other . name : return False if self . shape != other . shape : return False if self . datatype != other . datatype : return False if self . parameters != other . parameters : return False if self . data != other . data : return False return True","title":"InferOutput"},{"location":"python_runtime_api/docs/api/#kserve.protocol.infer_type.InferOutput.data","text":"Get the data of inference output associated with this object. Returns: Type Description Union [ List , ndarray , InferTensorContents ] The data of inference output.","title":"data"},{"location":"python_runtime_api/docs/api/#kserve.protocol.infer_type.InferOutput.datatype","text":"Get the data type of inference output associated with this object. Returns: Type Description str The data type of inference output.","title":"datatype"},{"location":"python_runtime_api/docs/api/#kserve.protocol.infer_type.InferOutput.name","text":"Get the name of inference output associated with this object. Returns: Type Description str The name of inference output.","title":"name"},{"location":"python_runtime_api/docs/api/#kserve.protocol.infer_type.InferOutput.parameters","text":"Get the parameters of inference output associated with this object. 
Returns: Type Description Union [ Dict , MessageMap [ str , InferParameter ], None] The additional inference parameters associated with the inference output.","title":"parameters"},{"location":"python_runtime_api/docs/api/#kserve.protocol.infer_type.InferOutput.shape","text":"Get the shape of inference output associated with this object. Returns: Type Description List [ int ] The shape of inference output","title":"shape"},{"location":"python_runtime_api/docs/api/#kserve.protocol.infer_type.InferOutput.__init__","text":"An object of InferOutput class is used to describe the output tensor for an inference response. Parameters: Name Type Description Default name The name of inference output whose data will be described by this object. required shape The shape of the associated inference output. required datatype The data type of the associated inference output. required data The data of the inference output. When data is not set, raw_data is used for gRPC with numpy array bytes by calling set_data_from_numpy. None parameters The additional inference parameters. None Source code in kserve/protocol/infer_type.py 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 def __init__ ( self , name : str , shape : List [ int ], datatype : str , data : Union [ List , np . ndarray , InferTensorContents ] = None , parameters : Optional [ Union [ Dict , MessageMap [ str , InferParameter ]]] = None , ): \"\"\"An object of InferOutput class is used to describe the output tensor for an inference response. Args: name : The name of inference output whose data will be described by this object. shape : The shape of the associated inference output. datatype : The data type of the associated inference output. data : The data of the inference output. When data is not set, raw_data is used for gRPC with numpy array bytes by calling set_data_from_numpy. parameters : The additional inference parameters. \"\"\" self . _name = name self . 
_shape = shape self . _datatype = datatype . upper () self . _parameters = parameters self . _data = data self . _raw_data = None","title":"__init__"},{"location":"python_runtime_api/docs/api/#kserve.protocol.infer_type.InferOutput.as_numpy","text":"Decode the tensor output data as numpy array. Returns: Type Description ndarray The numpy array of the associated inference output data. Source code in kserve/protocol/infer_type.py 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 def as_numpy ( self ) -> numpy . ndarray : \"\"\"Decode the tensor output data as numpy array. Returns: The numpy array of the associated inference output data. \"\"\" dtype = to_np_dtype ( self . datatype ) if dtype is None : raise InvalidInput ( \"invalid datatype in the input\" ) if self . _raw_data is not None : np_array = np . frombuffer ( self . _raw_data , dtype = dtype ) return np_array . reshape ( self . _shape ) else : np_array = np . array ( self . _data , dtype = dtype ) return np_array . reshape ( self . _shape )","title":"as_numpy"},{"location":"python_runtime_api/docs/api/#kserve.protocol.infer_type.InferOutput.set_data_from_numpy","text":"Set the tensor data from the specified numpy array for the inference output associated with this object. Parameters: Name Type Description Default output_tensor The tensor data in numpy array format. required binary_data Indicates whether to set data for the input in binary format or explicit tensor within JSON. The default value is True, which means the data will be delivered as binary data with gRPC or in the HTTP body after the JSON object for REST. 
True Source code in kserve/protocol/infer_type.py 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 def set_data_from_numpy ( self , output_tensor : np . ndarray , binary_data : bool = True ): \"\"\"Set the tensor data from the specified numpy array for the inference output associated with this object. Args: output_tensor : The tensor data in numpy array format. binary_data : Indicates whether to set data for the input in binary format or explicit tensor within JSON. The default value is True, which means the data will be delivered as binary data with gRPC or in the HTTP body after the JSON object for REST. Raises: InferenceError if failed to set data for the output tensor. \"\"\" if not isinstance ( output_tensor , ( np . ndarray ,)): raise InferenceError ( \"input_tensor must be a numpy array\" ) dtype = from_np_dtype ( output_tensor . dtype ) if self . _datatype != dtype : raise InferenceError ( \"got unexpected datatype {} from numpy array, expected {} \" . format ( dtype , self . _datatype ) ) valid_shape = True if len ( self . _shape ) != len ( output_tensor . shape ): valid_shape = False else : for i in range ( len ( self . _shape )): if self . _shape [ i ] != output_tensor . shape [ i ]: valid_shape = False if not valid_shape : raise InferenceError ( \"got unexpected numpy array shape [ {} ], expected [ {} ]\" . format ( str ( output_tensor . shape )[ 1 : - 1 ], str ( self . _shape )[ 1 : - 1 ] ) ) if not binary_data : if self . _parameters : self . _parameters . pop ( \"binary_data_size\" , None ) self . _raw_data = None if self . _datatype == \"BYTES\" : self . _data = [] try : if output_tensor . size > 0 : for obj in np . 
nditer ( output_tensor , flags = [ \"refs_ok\" ], order = \"C\" ): # We need to convert the object to string using utf-8, # if we want to use the binary_data=False. JSON requires # the input to be a UTF-8 string. if output_tensor . dtype == np . object_ : if type ( obj . item ()) == bytes : self . _data . append ( str ( obj . item (), encoding = \"utf-8\" )) else : self . _data . append ( str ( obj . item ())) else : self . _data . append ( str ( obj . item (), encoding = \"utf-8\" )) except UnicodeDecodeError : raise InferenceError ( f 'Failed to encode \" { obj . item () } \" using UTF-8. Please use binary_data=True, if' \" you want to pass a byte array.\" ) else : self . _data = [ val . item () for val in output_tensor . flatten ()] else : self . _data = None if self . _datatype == \"BYTES\" : serialized_output = serialize_byte_tensor ( output_tensor ) if serialized_output . size > 0 : self . _raw_data = serialized_output . item () else : self . _raw_data = b \"\" else : self . _raw_data = output_tensor . tobytes () if self . _parameters is None : self . _parameters = { \"binary_data_size\" : len ( self . _raw_data )} else : self . _parameters [ \"binary_data_size\" ] = len ( self . _raw_data )","title":"set_data_from_numpy"},{"location":"python_runtime_api/docs/api/#kserve.protocol.infer_type.InferOutput.set_shape","text":"Set the shape of inference output. Parameters: Name Type Description Default shape List [ int ] The shape of the associated inference output. required Source code in kserve/protocol/infer_type.py 547 548 549 550 551 552 553 def set_shape ( self , shape : List [ int ]): \"\"\"Set the shape of inference output. Args: shape: The shape of the associated inference output. \"\"\" self . 
_shape = shape","title":"set_shape"},{"location":"python_runtime_api/docs/api/#kserve.protocol.infer_type.InferRequest","text":"Source code in kserve/protocol/infer_type.pyclass InferRequest : id : Optional [ str ] model_name : str parameters : Optional [ Dict ] inputs : List [ InferInput ] from_grpc : bool def __init__ ( self , model_name : str , infer_inputs : List [ InferInput ], request_id : Optional [ str ] = None , raw_inputs = None , from_grpc : Optional [ bool ] = False , parameters : Optional [ Union [ Dict , MessageMap [ str , InferParameter ]]] = None , ): \"\"\"InferRequest Data Model. Args: model_name: The model name. infer_inputs: The inference inputs for the model. request_id: The id for the inference request. raw_inputs: The binary data for the inference inputs. from_grpc: Indicate if the data model is constructed from gRPC request. parameters: The additional inference parameters. \"\"\" self . id = request_id self . model_name = model_name self . inputs = infer_inputs self . parameters = parameters self . from_grpc = from_grpc if raw_inputs : for i , raw_input in enumerate ( raw_inputs ): self . inputs [ i ] . _raw_data = raw_input @classmethod def from_grpc ( cls , request : ModelInferRequest ): \"\"\"The class method to construct the InferRequest from a ModelInferRequest\"\"\" infer_inputs = [ InferInput ( name = input_tensor . name , shape = list ( input_tensor . shape ), datatype = input_tensor . datatype , data = get_content ( input_tensor . datatype , input_tensor . contents ), parameters = input_tensor . parameters , ) for input_tensor in request . inputs ] return cls ( request_id = request . id , model_name = request . model_name , infer_inputs = infer_inputs , raw_inputs = request . raw_input_contents , from_grpc = True , parameters = request . parameters , ) def to_rest ( self ) -> Dict : \"\"\"Converts the InferRequest object to v2 REST InferRequest Dict. Returns: The InferRequest Dict converted from InferRequest object. 
\"\"\" infer_inputs = [] for infer_input in self . inputs : datatype = infer_input . datatype if isinstance ( infer_input . datatype , numpy . dtype ): datatype = from_np_dtype ( infer_input . datatype ) infer_input_dict = { \"name\" : infer_input . name , \"shape\" : infer_input . shape , \"datatype\" : datatype , } if infer_input . parameters : infer_input_dict [ \"parameters\" ] = to_http_parameters ( infer_input . parameters ) if isinstance ( infer_input . data , numpy . ndarray ): infer_input . set_data_from_numpy ( infer_input . data , binary_data = False ) infer_input_dict [ \"data\" ] = infer_input . data else : infer_input_dict [ \"data\" ] = infer_input . data infer_inputs . append ( infer_input_dict ) infer_request = { \"id\" : self . id if self . id else str ( uuid . uuid4 ()), \"inputs\" : infer_inputs , } if self . parameters : infer_request [ \"parameters\" ] = to_http_parameters ( self . parameters ) return infer_request def to_grpc ( self ) -> ModelInferRequest : \"\"\"Converts the InferRequest object to gRPC ModelInferRequest type. Returns: The ModelInferResponse gRPC type converted from InferRequest object. \"\"\" infer_inputs = [] raw_input_contents = [] for infer_input in self . inputs : if isinstance ( infer_input . data , numpy . ndarray ): infer_input . set_data_from_numpy ( infer_input . data , binary_data = True ) infer_input_dict = { \"name\" : infer_input . name , \"shape\" : infer_input . shape , \"datatype\" : infer_input . datatype , } if infer_input . parameters : infer_input_dict [ \"parameters\" ] = to_grpc_parameters ( infer_input . parameters ) if infer_input . _raw_data is not None : raw_input_contents . append ( infer_input . _raw_data ) else : if not isinstance ( infer_input . data , List ): raise InvalidInput ( \"input data is not a List\" ) infer_input_dict [ \"contents\" ] = {} data_key = GRPC_CONTENT_DATATYPE_MAPPINGS . get ( infer_input . datatype , None ) if data_key is not None : infer_input . 
_data = [ bytes ( val , \"utf-8\" ) if isinstance ( val , str ) else val for val in infer_input . data ] # str to byte conversion for grpc proto infer_input_dict [ \"contents\" ][ data_key ] = infer_input . data else : raise InvalidInput ( \"invalid input datatype\" ) infer_inputs . append ( infer_input_dict ) return ModelInferRequest ( id = self . id , model_name = self . model_name , inputs = infer_inputs , raw_input_contents = raw_input_contents , parameters = to_grpc_parameters ( self . parameters ) if self . parameters else None , ) def as_dataframe ( self ) -> pd . DataFrame : \"\"\"Decode the tensor inputs as pandas dataframe. Returns: The inference input data as pandas dataframe \"\"\" dfs = [] for input in self . inputs : input_data = input . data if input . datatype == \"BYTES\" : input_data = [ str ( val , \"utf-8\" ) if isinstance ( val , bytes ) else val for val in input . data ] dfs . append ( pd . DataFrame ( input_data , columns = [ input . name ])) return pd . concat ( dfs , axis = 1 ) def __eq__ ( self , other ): if not isinstance ( other , InferRequest ): return False if self . model_name != other . model_name : return False if self . id != other . id : return False if self . from_grpc != other . from_grpc : return False if self . parameters != other . parameters : return False if self . inputs != other . inputs : return False return True","title":"InferRequest"},{"location":"python_runtime_api/docs/api/#kserve.protocol.infer_type.InferRequest.__init__","text":"InferRequest Data Model. Parameters: Name Type Description Default model_name str The model name. required infer_inputs List [ InferInput ] The inference inputs for the model. required request_id Optional [ str ] The id for the inference request. None raw_inputs The binary data for the inference inputs. None from_grpc Optional [ bool ] Indicate if the data model is constructed from gRPC request. 
False parameters Optional [ Union [ Dict , MessageMap [ str , InferParameter ]]] The additional inference parameters. None Source code in kserve/protocol/infer_type.py 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 def __init__ ( self , model_name : str , infer_inputs : List [ InferInput ], request_id : Optional [ str ] = None , raw_inputs = None , from_grpc : Optional [ bool ] = False , parameters : Optional [ Union [ Dict , MessageMap [ str , InferParameter ]]] = None , ): \"\"\"InferRequest Data Model. Args: model_name: The model name. infer_inputs: The inference inputs for the model. request_id: The id for the inference request. raw_inputs: The binary data for the inference inputs. from_grpc: Indicate if the data model is constructed from gRPC request. parameters: The additional inference parameters. \"\"\" self . id = request_id self . model_name = model_name self . inputs = infer_inputs self . parameters = parameters self . from_grpc = from_grpc if raw_inputs : for i , raw_input in enumerate ( raw_inputs ): self . inputs [ i ] . _raw_data = raw_input","title":"__init__"},{"location":"python_runtime_api/docs/api/#kserve.protocol.infer_type.InferRequest.as_dataframe","text":"Decode the tensor inputs as pandas dataframe. Returns: Type Description DataFrame The inference input data as pandas dataframe Source code in kserve/protocol/infer_type.py 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 def as_dataframe ( self ) -> pd . DataFrame : \"\"\"Decode the tensor inputs as pandas dataframe. Returns: The inference input data as pandas dataframe \"\"\" dfs = [] for input in self . inputs : input_data = input . data if input . datatype == \"BYTES\" : input_data = [ str ( val , \"utf-8\" ) if isinstance ( val , bytes ) else val for val in input . data ] dfs . append ( pd . DataFrame ( input_data , columns = [ input . name ])) return pd . 
concat ( dfs , axis = 1 )","title":"as_dataframe"},{"location":"python_runtime_api/docs/api/#kserve.protocol.infer_type.InferRequest.from_grpc","text":"The class method to construct the InferRequest from a ModelInferRequest Source code in kserve/protocol/infer_type.py 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 @classmethod def from_grpc ( cls , request : ModelInferRequest ): \"\"\"The class method to construct the InferRequest from a ModelInferRequest\"\"\" infer_inputs = [ InferInput ( name = input_tensor . name , shape = list ( input_tensor . shape ), datatype = input_tensor . datatype , data = get_content ( input_tensor . datatype , input_tensor . contents ), parameters = input_tensor . parameters , ) for input_tensor in request . inputs ] return cls ( request_id = request . id , model_name = request . model_name , infer_inputs = infer_inputs , raw_inputs = request . raw_input_contents , from_grpc = True , parameters = request . parameters , )","title":"from_grpc"},{"location":"python_runtime_api/docs/api/#kserve.protocol.infer_type.InferRequest.to_grpc","text":"Converts the InferRequest object to gRPC ModelInferRequest type. Returns: Type Description ModelInferRequest The ModelInferResponse gRPC type converted from InferRequest object. Source code in kserve/protocol/infer_type.py 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 def to_grpc ( self ) -> ModelInferRequest : \"\"\"Converts the InferRequest object to gRPC ModelInferRequest type. Returns: The ModelInferResponse gRPC type converted from InferRequest object. \"\"\" infer_inputs = [] raw_input_contents = [] for infer_input in self . inputs : if isinstance ( infer_input . data , numpy . ndarray ): infer_input . set_data_from_numpy ( infer_input . data , binary_data = True ) infer_input_dict = { \"name\" : infer_input . 
name , \"shape\" : infer_input . shape , \"datatype\" : infer_input . datatype , } if infer_input . parameters : infer_input_dict [ \"parameters\" ] = to_grpc_parameters ( infer_input . parameters ) if infer_input . _raw_data is not None : raw_input_contents . append ( infer_input . _raw_data ) else : if not isinstance ( infer_input . data , List ): raise InvalidInput ( \"input data is not a List\" ) infer_input_dict [ \"contents\" ] = {} data_key = GRPC_CONTENT_DATATYPE_MAPPINGS . get ( infer_input . datatype , None ) if data_key is not None : infer_input . _data = [ bytes ( val , \"utf-8\" ) if isinstance ( val , str ) else val for val in infer_input . data ] # str to byte conversion for grpc proto infer_input_dict [ \"contents\" ][ data_key ] = infer_input . data else : raise InvalidInput ( \"invalid input datatype\" ) infer_inputs . append ( infer_input_dict ) return ModelInferRequest ( id = self . id , model_name = self . model_name , inputs = infer_inputs , raw_input_contents = raw_input_contents , parameters = to_grpc_parameters ( self . parameters ) if self . parameters else None , )","title":"to_grpc"},{"location":"python_runtime_api/docs/api/#kserve.protocol.infer_type.InferRequest.to_rest","text":"Converts the InferRequest object to v2 REST InferRequest Dict. Returns: Type Description Dict The InferRequest Dict converted from InferRequest object. Source code in kserve/protocol/infer_type.py 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 def to_rest ( self ) -> Dict : \"\"\"Converts the InferRequest object to v2 REST InferRequest Dict. Returns: The InferRequest Dict converted from InferRequest object. \"\"\" infer_inputs = [] for infer_input in self . inputs : datatype = infer_input . datatype if isinstance ( infer_input . datatype , numpy . dtype ): datatype = from_np_dtype ( infer_input . datatype ) infer_input_dict = { \"name\" : infer_input . 
name , \"shape\" : infer_input . shape , \"datatype\" : datatype , } if infer_input . parameters : infer_input_dict [ \"parameters\" ] = to_http_parameters ( infer_input . parameters ) if isinstance ( infer_input . data , numpy . ndarray ): infer_input . set_data_from_numpy ( infer_input . data , binary_data = False ) infer_input_dict [ \"data\" ] = infer_input . data else : infer_input_dict [ \"data\" ] = infer_input . data infer_inputs . append ( infer_input_dict ) infer_request = { \"id\" : self . id if self . id else str ( uuid . uuid4 ()), \"inputs\" : infer_inputs , } if self . parameters : infer_request [ \"parameters\" ] = to_http_parameters ( self . parameters ) return infer_request","title":"to_rest"},{"location":"python_runtime_api/docs/api/#kserve.protocol.infer_type.InferResponse","text":"Source code in kserve/protocol/infer_type.py 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 class InferResponse : id : str model_name : str model_version : Optional [ str ] parameters : Optional [ Dict ] outputs : List [ InferOutput ] from_grpc : bool def __init__ ( self , response_id : str , model_name : str , infer_outputs : List [ InferOutput ], model_version : Optional [ str ] = None , raw_outputs = None , from_grpc : Optional [ bool ] = False , parameters : Optional [ Union [ Dict , 
MessageMap [ str , InferParameter ]]] = None , ): \"\"\"The InferResponse Data Model Args: response_id: The id of the inference response. model_name: The name of the model. infer_outputs: The inference outputs of the inference response. model_version: The version of the model. raw_outputs: The raw binary data of the inference outputs. from_grpc: Indicate if the InferResponse is constructed from a gRPC response. parameters: The additional inference parameters. \"\"\" self . id = response_id self . model_name = model_name self . model_version = model_version self . outputs = infer_outputs self . parameters = parameters self . from_grpc = from_grpc if raw_outputs : for i , raw_output in enumerate ( raw_outputs ): self . outputs [ i ] . _raw_data = raw_output @classmethod def from_grpc ( cls , response : ModelInferResponse ) -> \"InferResponse\" : \"\"\"The class method to construct the InferResponse object from gRPC message type.\"\"\" infer_outputs = [ InferOutput ( name = output . name , shape = list ( output . shape ), datatype = output . datatype , data = get_content ( output . datatype , output . contents ), parameters = output . parameters , ) for output in response . outputs ] return cls ( model_name = response . model_name , model_version = response . model_version , response_id = response . id , parameters = response . parameters , infer_outputs = infer_outputs , raw_outputs = response . raw_output_contents , from_grpc = True , ) @classmethod def from_rest ( cls , model_name : str , response : Dict ) -> \"InferResponse\" : \"\"\"The class method to construct the InferResponse object from REST message type.\"\"\" infer_outputs = [ InferOutput ( name = output [ \"name\" ], shape = list ( output [ \"shape\" ]), datatype = output [ \"datatype\" ], data = output [ \"data\" ], parameters = output . get ( \"parameters\" , None ), ) for output in response [ \"outputs\" ] ] return cls ( model_name = model_name , model_version = response . 
get ( \"model_version\" , None ), response_id = response . get ( \"id\" , None ), parameters = response . get ( \"parameters\" , None ), infer_outputs = infer_outputs , ) def to_rest ( self ) -> Dict : \"\"\"Converts the InferResponse object to v2 REST InferResponse dict. Returns: The InferResponse Dict. \"\"\" infer_outputs = [] for i , infer_output in enumerate ( self . outputs ): infer_output_dict = { \"name\" : infer_output . name , \"shape\" : infer_output . shape , \"datatype\" : infer_output . datatype , } if infer_output . parameters : infer_output_dict [ \"parameters\" ] = to_http_parameters ( infer_output . parameters ) if isinstance ( infer_output . data , numpy . ndarray ): infer_output . set_data_from_numpy ( infer_output . data , binary_data = False ) infer_output_dict [ \"data\" ] = infer_output . data elif isinstance ( infer_output . _raw_data , bytes ): infer_output_dict [ \"data\" ] = infer_output . as_numpy () . tolist () else : infer_output_dict [ \"data\" ] = infer_output . data infer_outputs . append ( infer_output_dict ) res = { \"id\" : self . id , \"model_name\" : self . model_name , \"model_version\" : self . model_version , \"outputs\" : infer_outputs , } if self . parameters : res [ \"parameters\" ] = to_http_parameters ( self . parameters ) return res def to_grpc ( self ) -> ModelInferResponse : \"\"\"Converts the InferResponse object to gRPC ModelInferResponse type. Returns: The ModelInferResponse gRPC message. \"\"\" infer_outputs = [] raw_output_contents = [] for infer_output in self . outputs : if isinstance ( infer_output . data , numpy . ndarray ): infer_output . set_data_from_numpy ( infer_output . data , binary_data = True ) infer_output_dict = { \"name\" : infer_output . name , \"shape\" : infer_output . shape , \"datatype\" : infer_output . datatype , } if infer_output . parameters : infer_output_dict [ \"parameters\" ] = to_grpc_parameters ( infer_output . parameters ) if infer_output . 
_raw_data is not None : raw_output_contents . append ( infer_output . _raw_data ) else : if not isinstance ( infer_output . data , List ): raise InvalidInput ( \"output data is not a List\" ) infer_output_dict [ \"contents\" ] = {} data_key = GRPC_CONTENT_DATATYPE_MAPPINGS . get ( infer_output . datatype , None ) if data_key is not None : infer_output . _data = [ bytes ( val , \"utf-8\" ) if isinstance ( val , str ) else val for val in infer_output . data ] # str to byte conversion for grpc proto infer_output_dict [ \"contents\" ][ data_key ] = infer_output . data else : raise InvalidInput ( \"to_grpc: invalid output datatype\" ) infer_outputs . append ( infer_output_dict ) return ModelInferResponse ( id = self . id , model_name = self . model_name , model_version = self . model_version , outputs = infer_outputs , raw_output_contents = raw_output_contents , parameters = to_grpc_parameters ( self . parameters ) if self . parameters else None , ) def __eq__ ( self , other ): if not isinstance ( other , InferResponse ): return False if self . model_name != other . model_name : return False if self . model_version != other . model_version : return False if self . id != other . id : return False if self . from_grpc != other . from_grpc : return False if self . parameters != other . parameters : return False if self . outputs != other . outputs : return False return True","title":"InferResponse"},{"location":"python_runtime_api/docs/api/#kserve.protocol.infer_type.InferResponse.__init__","text":"The InferResponse Data Model Parameters: Name Type Description Default response_id str The id of the inference response. required model_name str The name of the model. required infer_outputs List [ InferOutput ] The inference outputs of the inference response. required model_version Optional [ str ] The version of the model. None raw_outputs The raw binary data of the inference outputs. 
None from_grpc Optional [ bool ] Indicate if the InferResponse is constructed from a gRPC response. False parameters Optional [ Union [ Dict , MessageMap [ str , InferParameter ]]] The additional inference parameters. None Source code in kserve/protocol/infer_type.py 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 def __init__ ( self , response_id : str , model_name : str , infer_outputs : List [ InferOutput ], model_version : Optional [ str ] = None , raw_outputs = None , from_grpc : Optional [ bool ] = False , parameters : Optional [ Union [ Dict , MessageMap [ str , InferParameter ]]] = None , ): \"\"\"The InferResponse Data Model Args: response_id: The id of the inference response. model_name: The name of the model. infer_outputs: The inference outputs of the inference response. model_version: The version of the model. raw_outputs: The raw binary data of the inference outputs. from_grpc: Indicate if the InferResponse is constructed from a gRPC response. parameters: The additional inference parameters. \"\"\" self . id = response_id self . model_name = model_name self . model_version = model_version self . outputs = infer_outputs self . parameters = parameters self . from_grpc = from_grpc if raw_outputs : for i , raw_output in enumerate ( raw_outputs ): self . outputs [ i ] . _raw_data = raw_output","title":"__init__"},{"location":"python_runtime_api/docs/api/#kserve.protocol.infer_type.InferResponse.from_grpc","text":"The class method to construct the InferResponse object from gRPC message type. Source code in kserve/protocol/infer_type.py 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 @classmethod def from_grpc ( cls , response : ModelInferResponse ) -> \"InferResponse\" : \"\"\"The class method to construct the InferResponse object from gRPC message type.\"\"\" infer_outputs = [ InferOutput ( name = output . name , shape = list ( output . 
shape ), datatype = output . datatype , data = get_content ( output . datatype , output . contents ), parameters = output . parameters , ) for output in response . outputs ] return cls ( model_name = response . model_name , model_version = response . model_version , response_id = response . id , parameters = response . parameters , infer_outputs = infer_outputs , raw_outputs = response . raw_output_contents , from_grpc = True , )","title":"from_grpc"},{"location":"python_runtime_api/docs/api/#kserve.protocol.infer_type.InferResponse.from_rest","text":"The class method to construct the InferResponse object from REST message type. Source code in kserve/protocol/infer_type.py 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 @classmethod def from_rest ( cls , model_name : str , response : Dict ) -> \"InferResponse\" : \"\"\"The class method to construct the InferResponse object from REST message type.\"\"\" infer_outputs = [ InferOutput ( name = output [ \"name\" ], shape = list ( output [ \"shape\" ]), datatype = output [ \"datatype\" ], data = output [ \"data\" ], parameters = output . get ( \"parameters\" , None ), ) for output in response [ \"outputs\" ] ] return cls ( model_name = model_name , model_version = response . get ( \"model_version\" , None ), response_id = response . get ( \"id\" , None ), parameters = response . get ( \"parameters\" , None ), infer_outputs = infer_outputs , )","title":"from_rest"},{"location":"python_runtime_api/docs/api/#kserve.protocol.infer_type.InferResponse.to_grpc","text":"Converts the InferResponse object to gRPC ModelInferResponse type. Returns: Type Description ModelInferResponse The ModelInferResponse gRPC message. 
Source code in kserve/protocol/infer_type.py 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 def to_grpc ( self ) -> ModelInferResponse : \"\"\"Converts the InferResponse object to gRPC ModelInferResponse type. Returns: The ModelInferResponse gRPC message. \"\"\" infer_outputs = [] raw_output_contents = [] for infer_output in self . outputs : if isinstance ( infer_output . data , numpy . ndarray ): infer_output . set_data_from_numpy ( infer_output . data , binary_data = True ) infer_output_dict = { \"name\" : infer_output . name , \"shape\" : infer_output . shape , \"datatype\" : infer_output . datatype , } if infer_output . parameters : infer_output_dict [ \"parameters\" ] = to_grpc_parameters ( infer_output . parameters ) if infer_output . _raw_data is not None : raw_output_contents . append ( infer_output . _raw_data ) else : if not isinstance ( infer_output . data , List ): raise InvalidInput ( \"output data is not a List\" ) infer_output_dict [ \"contents\" ] = {} data_key = GRPC_CONTENT_DATATYPE_MAPPINGS . get ( infer_output . datatype , None ) if data_key is not None : infer_output . _data = [ bytes ( val , \"utf-8\" ) if isinstance ( val , str ) else val for val in infer_output . data ] # str to byte conversion for grpc proto infer_output_dict [ \"contents\" ][ data_key ] = infer_output . data else : raise InvalidInput ( \"to_grpc: invalid output datatype\" ) infer_outputs . append ( infer_output_dict ) return ModelInferResponse ( id = self . id , model_name = self . model_name , model_version = self . model_version , outputs = infer_outputs , raw_output_contents = raw_output_contents , parameters = to_grpc_parameters ( self . parameters ) if self . 
parameters else None , )","title":"to_grpc"},{"location":"python_runtime_api/docs/api/#kserve.protocol.infer_type.InferResponse.to_rest","text":"Converts the InferResponse object to v2 REST InferResponse dict. Returns: Type Description Dict The InferResponse Dict. Source code in kserve/protocol/infer_type.py 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 def to_rest ( self ) -> Dict : \"\"\"Converts the InferResponse object to v2 REST InferResponse dict. Returns: The InferResponse Dict. \"\"\" infer_outputs = [] for i , infer_output in enumerate ( self . outputs ): infer_output_dict = { \"name\" : infer_output . name , \"shape\" : infer_output . shape , \"datatype\" : infer_output . datatype , } if infer_output . parameters : infer_output_dict [ \"parameters\" ] = to_http_parameters ( infer_output . parameters ) if isinstance ( infer_output . data , numpy . ndarray ): infer_output . set_data_from_numpy ( infer_output . data , binary_data = False ) infer_output_dict [ \"data\" ] = infer_output . data elif isinstance ( infer_output . _raw_data , bytes ): infer_output_dict [ \"data\" ] = infer_output . as_numpy () . tolist () else : infer_output_dict [ \"data\" ] = infer_output . data infer_outputs . append ( infer_output_dict ) res = { \"id\" : self . id , \"model_name\" : self . model_name , \"model_version\" : self . model_version , \"outputs\" : infer_outputs , } if self . parameters : res [ \"parameters\" ] = to_http_parameters ( self . parameters ) return res","title":"to_rest"},{"location":"python_runtime_api/docs/api/#kserve.protocol.infer_type.serialize_byte_tensor","text":"Serializes a bytes tensor into a flat numpy array of length prepended bytes. The numpy array should use dtype of np.object_. For np.bytes_, numpy will remove trailing zeros at the end of byte sequence and because of this it should be avoided. 
Args: input_tensor : np.array of the bytes tensor to serialize. Returns: serialized_bytes_tensor : The 1-D numpy array of type uint8 containing the serialized bytes in 'C' order. Source code in kserve/protocol/infer_type.py 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 def serialize_byte_tensor ( input_tensor : numpy . ndarray ): \"\"\" Serializes a bytes tensor into a flat numpy array of length prepended bytes. The numpy array should use dtype of np.object_. For np.bytes_, numpy will remove trailing zeros at the end of byte sequence and because of this it should be avoided. Args: input_tensor : np.array of the bytes tensor to serialize. Returns: serialized_bytes_tensor : The 1-D numpy array of type uint8 containing the serialized bytes in 'C' order. \"\"\" if input_tensor . size == 0 : return () # If the input is a tensor of string/bytes objects, then must flatten those # into a 1-dimensional array containing the 4-byte byte size followed by the # actual element bytes. All elements are concatenated together in \"C\" order. if ( input_tensor . dtype == np . object_ ) or ( input_tensor . dtype . type == np . bytes_ ): flattened_ls = [] for obj in np . nditer ( input_tensor , flags = [ \"refs_ok\" ], order = \"C\" ): # If directly passing bytes to BYTES type, # don't convert it to str as Python will encode the # bytes which may distort the meaning if input_tensor . dtype == np . object_ : if type ( obj . item ()) == bytes : s = obj . item () else : s = str ( obj . item ()) . encode ( \"utf-8\" ) else : s = obj . item () flattened_ls . append ( struct . pack ( \" Dict [ str , InferParameter ]: \"\"\" Converts REST parameters to GRPC InferParameter objects :param parameters: parameters to be converted. :return: converted parameters as Dict[str, InferParameter] :raises InvalidInput: if the parameter type is not supported. \"\"\" grpc_params : Dict [ str , InferParameter ] = {} for key , val in parameters . 
items (): if isinstance ( val , str ): grpc_params [ key ] = InferParameter ( string_param = val ) elif isinstance ( val , bool ): grpc_params [ key ] = InferParameter ( bool_param = val ) elif isinstance ( val , int ): grpc_params [ key ] = InferParameter ( int64_param = val ) elif isinstance ( val , InferParameter ): grpc_params [ key ] = val else : raise InvalidInput ( f \"to_grpc: invalid parameter value: { val } \" ) return grpc_params","title":"to_grpc_parameters"},{"location":"python_runtime_api/docs/api/#kserve.protocol.infer_type.to_http_parameters","text":"Converts GRPC InferParameter parameters to REST parameters :param parameters: parameters to be converted. :return: converted parameters as Dict[str, Union[str, bool, int]] Source code in kserve/protocol/infer_type.py 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 def to_http_parameters ( parameters : Union [ dict , MessageMap [ str , InferParameter ]] ) -> Dict [ str , Union [ str , bool , int ]]: \"\"\" Converts GRPC InferParameter parameters to REST parameters :param parameters: parameters to be converted. :return: converted parameters as Dict[str, Union[str, bool, int]] \"\"\" http_params : Dict [ str , Union [ str , bool , int ]] = {} for key , val in parameters . items (): if isinstance ( val , InferParameter ): if val . HasField ( \"bool_param\" ): http_params [ key ] = val . bool_param elif val . HasField ( \"int64_param\" ): http_params [ key ] = val . int64_param elif val . HasField ( \"string_param\" ): http_params [ key ] = val . 
string_param else : http_params [ key ] = val return http_params","title":"to_http_parameters"},{"location":"reference/api/","text":"Packages: serving.kserve.io/v1alpha1 serving.kserve.io/v1beta1 serving.kserve.io/v1alpha1 Package v1alpha1 contains API Schema definitions for the serving v1alpha1 API group Resource Types: BuiltInAdapter ( Appears on: ServingRuntimeSpec ) Field Description serverType ServerType ServerType must be one of the supported built-in types such as \u201ctriton\u201d or \u201cmlserver\u201d, and the runtime\u2019s container must have the same name runtimeManagementPort int Port which the runtime server listens for model management requests memBufferBytes int Fixed memory overhead to subtract from runtime container\u2019s memory allocation to determine model capacity modelLoadingTimeoutMillis int Timeout for model loading operations in milliseconds env []Kubernetes core/v1.EnvVar Environment variables used to control other aspects of the built-in adapter\u2019s behaviour (uncommon) ClusterServingRuntime ClusterServingRuntime is the Schema for the servingruntimes API Field Description metadata Kubernetes meta/v1.ObjectMeta Refer to the Kubernetes API documentation for the fields of the metadata field. spec ServingRuntimeSpec supportedModelFormats []SupportedModelFormat Model formats and version supported by this runtime multiModel bool (Optional) Whether this ServingRuntime is intended for multi-model usage or not. disabled bool (Optional) Set to true to disable use of this runtime protocolVersions []github.com/kserve/kserve/pkg/constants.InferenceServiceProtocol (Optional) Supported protocol versions (i.e. v1 or v2 or grpc-v1 or grpc-v2) ServingRuntimePodSpec ServingRuntimePodSpec (Members of ServingRuntimePodSpec are embedded into this type.) 
grpcEndpoint string (Optional) Grpc endpoint for internal model-management (implementing mmesh.ModelRuntime gRPC service) Assumed to be single-model runtime if omitted grpcDataEndpoint string (Optional) Grpc endpoint for inferencing httpDataEndpoint string (Optional) HTTP endpoint for inferencing replicas uint16 (Optional) Configure the number of replicas in the Deployment generated by this ServingRuntime If specified, this overrides the podsPerRuntime configuration value storageHelper StorageHelper (Optional) Configuration for this runtime\u2019s use of the storage helper (model puller) It is enabled unless explicitly disabled builtInAdapter BuiltInAdapter (Optional) Provide the details about built-in runtime adapter status ServingRuntimeStatus ClusterStorageContainer Field Description metadata Kubernetes meta/v1.ObjectMeta Refer to the Kubernetes API documentation for the fields of the metadata field. spec StorageContainerSpec container Kubernetes core/v1.Container Container spec for the storage initializer init container supportedUriFormats []SupportedUriFormat List of URI formats that this container supports disabled bool (Optional) InferenceGraph InferenceGraph is the Schema for the InferenceGraph API for multiple models Field Description metadata Kubernetes meta/v1.ObjectMeta Refer to the Kubernetes API documentation for the fields of the metadata field. spec InferenceGraphSpec nodes map[string]kserve.io/serving/pkg/apis/serving/v1alpha1.InferenceRouter Map of InferenceGraph router nodes Each node defines the router which can be different routing types resources Kubernetes core/v1.ResourceRequirements (Optional) affinity Kubernetes core/v1.Affinity (Optional) timeout int64 (Optional) TimeoutSeconds specifies the number of seconds to wait before timing out a request to the component. minReplicas int (Optional) Minimum number of replicas, defaults to 1 but can be set to 0 to enable scale-to-zero. 
maxReplicas int (Optional) Maximum number of replicas for autoscaling. scaleTarget int (Optional) ScaleTarget specifies the integer target value of the metric type the Autoscaler watches for. concurrency and rps targets are supported by Knative Pod Autoscaler ( https://knative.dev/docs/serving/autoscaling/autoscaling-targets/ ). scaleMetric ScaleMetric (Optional) ScaleMetric defines the scaling metric type watched by autoscaler possible values are concurrency, rps, cpu, memory. concurrency, rps are supported via Knative Pod Autoscaler( https://knative.dev/docs/serving/autoscaling/autoscaling-metrics ). status InferenceGraphStatus InferenceGraphSpec ( Appears on: InferenceGraph ) InferenceGraphSpec defines the InferenceGraph spec Field Description nodes map[string]kserve.io/serving/pkg/apis/serving/v1alpha1.InferenceRouter Map of InferenceGraph router nodes Each node defines the router which can be different routing types resources Kubernetes core/v1.ResourceRequirements (Optional) affinity Kubernetes core/v1.Affinity (Optional) timeout int64 (Optional) TimeoutSeconds specifies the number of seconds to wait before timing out a request to the component. minReplicas int (Optional) Minimum number of replicas, defaults to 1 but can be set to 0 to enable scale-to-zero. maxReplicas int (Optional) Maximum number of replicas for autoscaling. scaleTarget int (Optional) ScaleTarget specifies the integer target value of the metric type the Autoscaler watches for. concurrency and rps targets are supported by Knative Pod Autoscaler ( https://knative.dev/docs/serving/autoscaling/autoscaling-targets/ ). scaleMetric ScaleMetric (Optional) ScaleMetric defines the scaling metric type watched by autoscaler possible values are concurrency, rps, cpu, memory. concurrency, rps are supported via Knative Pod Autoscaler( https://knative.dev/docs/serving/autoscaling/autoscaling-metrics ). 
InferenceGraphStatus ( Appears on: InferenceGraph ) InferenceGraphStatus defines the InferenceGraph conditions and status Field Description Status knative.dev/pkg/apis/duck/v1.Status (Members of Status are embedded into this type.) Conditions for InferenceGraph url knative.dev/pkg/apis.URL (Optional) Url for the InferenceGraph InferenceRouter ( Appears on: InferenceGraphSpec ) InferenceRouter defines the router for each InferenceGraph node with one or multiple steps kind: InferenceGraph metadata: name: canary-route spec: nodes: root: routerType: Splitter routes: - service: mymodel1 weight: 20 - service: mymodel2 weight: 80 kind: InferenceGraph metadata: name: abtest spec: nodes: mymodel: routerType: Switch routes: - service: mymodel1 condition: \"{ .input.userId == 1 }\" - service: mymodel2 condition: \"{ .input.userId == 2 }\" Scoring a case using a model ensemble consists of scoring it using each model separately, then combining the results into a single scoring result using one of the pre-defined combination methods. Tree Ensemble constitutes a case where simple algorithms for combining results of either classification or regression trees are well known. Multiple classification trees, for example, are commonly combined using a \u201cmajority-vote\u201d method. Multiple regression trees are often combined using various averaging techniques. e.g tagging models with segment identifiers and weights to be used for their combination in these ways. kind: InferenceGraph metadata: name: ensemble spec: nodes: root: routerType: Sequence routes: - service: feast - nodeName: ensembleModel data: $response ensembleModel: routerType: Ensemble routes: - service: sklearn-model - service: xgboost-model Scoring a case using a sequence, or chain of models allows the output of one model to be passed in as input to the subsequent models. 
kind: InferenceGraph metadata: name: model-chainer spec: nodes: root: routerType: Sequence routes: - service: mymodel-s1 - service: mymodel-s2 data: $response - service: mymodel-s3 data: $response In the flow described below, the pre_processing node base64 encodes the image and passes it to two model nodes in the flow. The encoded data is available to both these nodes for classification. The second node i.e. dog-breed-classification takes the original input from the pre_processing node along-with the response from the cat-dog-classification node to do further classification of the dog breed if required. kind: InferenceGraph metadata: name: dog-breed-classification spec: nodes: root: routerType: Sequence routes: - service: cat-dog-classifier - nodeName: breed-classifier data: $request breed-classifier: routerType: Switch routes: - service: dog-breed-classifier condition: { .predictions.class == \"dog\" } - service: cat-breed-classifier condition: { .predictions.class == \"cat\" } Field Description routerType InferenceRouterType RouterType Sequence: chain multiple inference steps with input/output from previous step Splitter: randomly routes to the target service according to the weight Ensemble: routes the request to multiple models and then merge the responses Switch: routes the request to one of the steps based on condition steps []InferenceStep (Optional) Steps defines destinations for the current router node InferenceRouterType ( string alias) ( Appears on: InferenceRouter ) InferenceRouterType constant for inference routing types Value Description \"Ensemble\" Ensemble router routes the requests to multiple models and then merge the responses \"Sequence\" Sequence Default type only route to one destination \"Splitter\" Splitter router randomly routes the requests to the named service according to the weight \"Switch\" Switch routes the request to the model based on certain condition InferenceStep ( Appears on: InferenceRouter ) InferenceStep defines the 
inference target of the current step with condition, weights and data. Field Description name string (Optional) Unique name for the step within this node InferenceTarget InferenceTarget (Members of InferenceTarget are embedded into this type.) Node or service used to process this step data string (Optional) request data sent to the next route with input/output from the previous step $request $response.predictions weight int64 (Optional) the weight for split of the traffic, only used for Split Router when weight is specified all the routing targets should sum to 100 condition string (Optional) routing based on the condition dependency InferenceStepDependencyType (Optional) to decide whether a step is a hard or a soft dependency in the Inference Graph InferenceStepDependencyType ( string alias) ( Appears on: InferenceStep ) InferenceStepDependencyType constant for inference step dependency Value Description \"Hard\" Hard \"Soft\" Soft InferenceTarget ( Appears on: InferenceStep ) Exactly one InferenceTarget field must be specified Field Description nodeName string (Optional) The node name for routing as next step serviceName string named reference for InferenceService serviceUrl string (Optional) InferenceService URL, mutually exclusive with ServiceName ModelSpec ( Appears on: TrainedModelSpec ) ModelSpec describes a TrainedModel Field Description storageUri string Storage URI for the model repository framework string Machine Learning The values could be: \u201ctensorflow\u201d,\u201cpytorch\u201d,\u201csklearn\u201d,\u201connx\u201d,\u201cxgboost\u201d, \u201cmyawesomeinternalframework\u201d etc. memory k8s.io/apimachinery/pkg/api/resource.Quantity Maximum memory this model will consume, this field is used to decide if a model server has enough memory to load this model. 
ScaleMetric ( string alias) ( Appears on: InferenceGraphSpec ) ScaleMetric enum ServerType ( string alias) ( Appears on: BuiltInAdapter ) ServerType constant for specifying the runtime name Value Description \"mlserver\" Model server is MLServer \"ovms\" Model server is OpenVino Model Server \"triton\" Model server is Triton ServingRuntime ServingRuntime is the Schema for the servingruntimes API Field Description metadata Kubernetes meta/v1.ObjectMeta Refer to the Kubernetes API documentation for the fields of the metadata field. spec ServingRuntimeSpec supportedModelFormats []SupportedModelFormat Model formats and version supported by this runtime multiModel bool (Optional) Whether this ServingRuntime is intended for multi-model usage or not. disabled bool (Optional) Set to true to disable use of this runtime protocolVersions []github.com/kserve/kserve/pkg/constants.InferenceServiceProtocol (Optional) Supported protocol versions (i.e. v1 or v2 or grpc-v1 or grpc-v2) ServingRuntimePodSpec ServingRuntimePodSpec (Members of ServingRuntimePodSpec are embedded into this type.) 
grpcEndpoint string (Optional) Grpc endpoint for internal model-management (implementing mmesh.ModelRuntime gRPC service) Assumed to be single-model runtime if omitted grpcDataEndpoint string (Optional) Grpc endpoint for inferencing httpDataEndpoint string (Optional) HTTP endpoint for inferencing replicas uint16 (Optional) Configure the number of replicas in the Deployment generated by this ServingRuntime If specified, this overrides the podsPerRuntime configuration value storageHelper StorageHelper (Optional) Configuration for this runtime\u2019s use of the storage helper (model puller) It is enabled unless explicitly disabled builtInAdapter BuiltInAdapter (Optional) Provide the details about built-in runtime adapter status ServingRuntimeStatus ServingRuntimePodSpec ( Appears on: ServingRuntimeSpec ) Field Description containers []Kubernetes core/v1.Container List of containers belonging to the pod. Containers cannot currently be added or removed. There must be at least one container in a Pod. Cannot be updated. volumes []Kubernetes core/v1.Volume (Optional) List of volumes that can be mounted by containers belonging to the pod. More info: https://kubernetes.io/docs/concepts/storage/volumes nodeSelector map[string]string (Optional) NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node\u2019s labels for the pod to be scheduled on that node. More info: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ affinity Kubernetes core/v1.Affinity (Optional) If specified, the pod\u2019s scheduling constraints tolerations []Kubernetes core/v1.Toleration (Optional) If specified, the pod\u2019s tolerations. labels map[string]string (Optional) Labels that will be added to the pod. More info: http://kubernetes.io/docs/user-guide/labels annotations map[string]string (Optional) Annotations that will be added to the pod. 
More info: http://kubernetes.io/docs/user-guide/annotations imagePullSecrets []Kubernetes core/v1.LocalObjectReference (Optional) ImagePullSecrets is an optional list of references to secrets in the same namespace to use for pulling any of the images used by this PodSpec. If specified, these secrets will be passed to individual puller implementations for them to use. For example, in the case of docker, only DockerConfig type secrets are honored. More info: https://kubernetes.io/docs/concepts/containers/images#specifying-imagepullsecrets-on-a-pod ServingRuntimeSpec ( Appears on: ClusterServingRuntime , ServingRuntime , SupportedRuntime ) ServingRuntimeSpec defines the desired state of ServingRuntime. This spec is currently provisional and is subject to change as details regarding single-model serving and multi-model serving are hammered out. Field Description supportedModelFormats []SupportedModelFormat Model formats and version supported by this runtime multiModel bool (Optional) Whether this ServingRuntime is intended for multi-model usage or not. disabled bool (Optional) Set to true to disable use of this runtime protocolVersions []github.com/kserve/kserve/pkg/constants.InferenceServiceProtocol (Optional) Supported protocol versions (i.e. v1 or v2 or grpc-v1 or grpc-v2) ServingRuntimePodSpec ServingRuntimePodSpec (Members of ServingRuntimePodSpec are embedded into this type.) 
grpcEndpoint string (Optional) Grpc endpoint for internal model-management (implementing mmesh.ModelRuntime gRPC service) Assumed to be single-model runtime if omitted grpcDataEndpoint string (Optional) Grpc endpoint for inferencing httpDataEndpoint string (Optional) HTTP endpoint for inferencing replicas uint16 (Optional) Configure the number of replicas in the Deployment generated by this ServingRuntime If specified, this overrides the podsPerRuntime configuration value storageHelper StorageHelper (Optional) Configuration for this runtime\u2019s use of the storage helper (model puller) It is enabled unless explicitly disabled builtInAdapter BuiltInAdapter (Optional) Provide the details about built-in runtime adapter ServingRuntimeStatus ( Appears on: ClusterServingRuntime , ServingRuntime ) ServingRuntimeStatus defines the observed state of ServingRuntime StorageContainerSpec ( Appears on: ClusterStorageContainer ) StorageContainerSpec defines the container spec for the storage initializer init container, and the protocols it supports. Field Description container Kubernetes core/v1.Container Container spec for the storage initializer init container supportedUriFormats []SupportedUriFormat List of URI formats that this container supports StorageHelper ( Appears on: ServingRuntimeSpec ) Field Description disabled bool (Optional) SupportedModelFormat ( Appears on: ServingRuntimeSpec ) Field Description name string Name of the model format. version string (Optional) Version of the model format. Used in validating that a predictor is supported by a runtime. Can be \u201cmajor\u201d, \u201cmajor.minor\u201d or \u201cmajor.minor.patch\u201d. autoSelect bool (Optional) Set to true to allow the ServingRuntime to be used for automatic model placement if this model format is specified with no explicit runtime. priority int32 (Optional) Priority of this serving runtime for auto selection. 
This is used to select the serving runtime if more than one serving runtime supports the same model format. The value should be greater than zero. The higher the value, the higher the priority. Priority is not considered if AutoSelect is either false or not specified. Priority can be overridden by specifying the runtime in the InferenceService. SupportedRuntime SupportedRuntime is the schema for supported runtime result of automatic selection Field Description Name string Spec ServingRuntimeSpec SupportedUriFormat ( Appears on: StorageContainerSpec ) SupportedUriFormat can be either prefix or regex. Todo: Add validation that only one of them is set. Field Description prefix string regex string TrainedModel TrainedModel is the Schema for the TrainedModel API Field Description metadata Kubernetes meta/v1.ObjectMeta Refer to the Kubernetes API documentation for the fields of the metadata field. spec TrainedModelSpec inferenceService string parent inference service to deploy to model ModelSpec Predictor model spec status TrainedModelStatus TrainedModelSpec ( Appears on: TrainedModel ) TrainedModelSpec defines the TrainedModel spec Field Description inferenceService string parent inference service to deploy to model ModelSpec Predictor model spec TrainedModelStatus ( Appears on: TrainedModel ) TrainedModelStatus defines the observed state of TrainedModel Field Description Status knative.dev/pkg/apis/duck/v1.Status (Members of Status are embedded into this type.) Conditions for trained model url knative.dev/pkg/apis.URL URL holds the url that will distribute traffic over the provided traffic targets. 
For v1: http[s]://{route-name}.{route-namespace}.{cluster-level-suffix}/v1/models/ :predict For v2: http[s]://{route-name}.{route-namespace}.{cluster-level-suffix}/v2/models/ /infer address knative.dev/pkg/apis/duck/v1.Addressable Addressable endpoint for the deployed trained model http:// /v1/models/ .metadata.name Generated with gen-crd-api-reference-docs on git commit 426fe21d . serving.kserve.io/v1beta1 Package v1beta1 contains API Schema definitions for the serving v1beta1 API group Resource Types: ARTExplainerSpec ( Appears on: ExplainerSpec ) ARTExplainerType defines the arguments for configuring an ART Explanation Server Field Description type ARTExplainerType The type of ART explainer ExplainerExtensionSpec ExplainerExtensionSpec (Members of ExplainerExtensionSpec are embedded into this type.) Contains fields shared across all explainers ARTExplainerType ( string alias) ( Appears on: ARTExplainerSpec ) Value Description \"SquareAttack\" AlibiExplainerSpec ( Appears on: ExplainerSpec ) AlibiExplainerSpec defines the arguments for configuring an Alibi Explanation Server Field Description type AlibiExplainerType The type of Alibi explainer Valid values are: - \u201cAnchorTabular\u201d; - \u201cAnchorImages\u201d; - \u201cAnchorText\u201d; - \u201cCounterfactuals\u201d; - \u201cContrastive\u201d; ExplainerExtensionSpec ExplainerExtensionSpec (Members of ExplainerExtensionSpec are embedded into this type.) 
Contains fields shared across all explainers AlibiExplainerType ( string alias) ( Appears on: AlibiExplainerSpec ) AlibiExplainerType is the explanation method Value Description \"AnchorImages\" \"AnchorTabular\" \"AnchorText\" \"Contrastive\" \"Counterfactuals\" Batcher ( Appears on: ComponentExtensionSpec ) Batcher specifies optional payload batching available for all components Field Description maxBatchSize int (Optional) Specifies the max number of requests to trigger a batch maxLatency int (Optional) Specifies the max latency to trigger a batch timeout int (Optional) Specifies the timeout of a batch Component Component interface is implemented by all specs that contain component implementations, e.g. PredictorSpec, ExplainerSpec, TransformerSpec. ComponentExtensionSpec ( Appears on: ExplainerSpec , PredictorSpec , TransformerSpec ) ComponentExtensionSpec defines the deployment configuration for a given InferenceService component Field Description minReplicas int (Optional) Minimum number of replicas, defaults to 1 but can be set to 0 to enable scale-to-zero. maxReplicas int (Optional) Maximum number of replicas for autoscaling. scaleTarget int (Optional) ScaleTarget specifies the integer target value of the metric type the Autoscaler watches for. concurrency and rps targets are supported by Knative Pod Autoscaler ( https://knative.dev/docs/serving/autoscaling/autoscaling-targets/ ). scaleMetric ScaleMetric (Optional) ScaleMetric defines the scaling metric type watched by autoscaler possible values are concurrency, rps, cpu, memory. concurrency, rps are supported via Knative Pod Autoscaler( https://knative.dev/docs/serving/autoscaling/autoscaling-metrics ). containerConcurrency int64 (Optional) ContainerConcurrency specifies how many requests can be processed concurrently, this sets the hard limit of the container concurrency( https://knative.dev/docs/serving/autoscaling/concurrency ). 
timeout int64 (Optional) TimeoutSeconds specifies the number of seconds to wait before timing out a request to the component. canaryTrafficPercent int64 (Optional) CanaryTrafficPercent defines the traffic split percentage between the candidate revision and the last ready revision logger LoggerSpec (Optional) Activate request/response logging and logger configurations batcher Batcher (Optional) Activate request batching and batching configurations labels map[string]string (Optional) Labels that will be added to the component pod. More info: http://kubernetes.io/docs/user-guide/labels annotations map[string]string (Optional) Annotations that will be added to the component pod. More info: http://kubernetes.io/docs/user-guide/annotations ComponentImplementation ComponentImplementation interface is implemented by predictor, transformer, and explainer implementations ComponentStatusSpec ( Appears on: InferenceServiceStatus ) ComponentStatusSpec describes the state of the component Field Description latestReadyRevision string (Optional) Latest revision name that is in ready state latestCreatedRevision string (Optional) Latest revision name that is created previousRolledoutRevision string (Optional) Previous revision name that is rolled out with 100 percent traffic latestRolledoutRevision string (Optional) Latest revision name that is rolled out with 100 percent traffic traffic []knative.dev/serving/pkg/apis/serving/v1.TrafficTarget (Optional) Traffic holds the configured traffic distribution for latest ready revision and previous rolled out revision. url knative.dev/pkg/apis.URL (Optional) URL holds the primary url that will distribute traffic over the provided traffic targets. This will be one of the REST or gRPC endpoints that are available. It generally has the form http[s]://{route-name}.{route-namespace}.{cluster-level-suffix} restUrl knative.dev/pkg/apis.URL (Optional) REST endpoint of the component if available. 
grpcUrl knative.dev/pkg/apis.URL (Optional) gRPC endpoint of the component if available. address knative.dev/pkg/apis/duck/v1.Addressable (Optional) Addressable endpoint for the InferenceService ComponentType ( string alias) ComponentType contains the different types of components of the service Value Description \"explainer\" \"predictor\" \"transformer\" CustomExplainer CustomExplainer defines arguments for configuring a custom explainer. Field Description PodSpec Kubernetes core/v1.PodSpec (Members of PodSpec are embedded into this type.) CustomPredictor CustomPredictor defines arguments for configuring a custom server. Field Description PodSpec Kubernetes core/v1.PodSpec (Members of PodSpec are embedded into this type.) CustomTransformer CustomTransformer defines arguments for configuring a custom transformer. Field Description PodSpec Kubernetes core/v1.PodSpec (Members of PodSpec are embedded into this type.) DeployConfig Field Description defaultDeploymentMode string ExplainerConfig ( Appears on: ExplainersConfig ) Field Description image string explainer docker image name defaultImageVersion string default explainer docker image version ExplainerExtensionSpec ( Appears on: ARTExplainerSpec , AlibiExplainerSpec ) ExplainerExtensionSpec defines configuration shared across all explainer frameworks Field Description storageUri string The location of a trained explanation model runtimeVersion string Defaults to latest Explainer Version config map[string]string Inline custom parameter settings for explainer Container Kubernetes core/v1.Container (Members of Container are embedded into this type.) (Optional) Container enables overrides for the predictor. Each framework will have different defaults that are populated in the underlying container spec. 
storage StorageSpec (Optional) Storage Spec for model location ExplainerSpec ( Appears on: InferenceServiceSpec ) ExplainerSpec defines the container spec for a model explanation server, The following fields follow a \u201c1-of\u201d semantic. Users must specify exactly one spec. Field Description alibi AlibiExplainerSpec Spec for alibi explainer art ARTExplainerSpec Spec for ART explainer PodSpec PodSpec (Members of PodSpec are embedded into this type.) This spec is dual purpose. 1) Users may choose to provide a full PodSpec for their custom explainer. The field PodSpec.Containers is mutually exclusive with other explainers (i.e. Alibi). 2) Users may choose to provide a Explainer (i.e. Alibi) and specify PodSpec overrides in the PodSpec. They must not provide PodSpec.Containers in this case. ComponentExtensionSpec ComponentExtensionSpec (Members of ComponentExtensionSpec are embedded into this type.) Component extension defines the deployment configurations for explainer ExplainersConfig ( Appears on: InferenceServicesConfig ) Field Description alibi ExplainerConfig art ExplainerConfig FailureInfo ( Appears on: ModelStatus ) Field Description location string (Optional) Name of component to which the failure relates (usually Pod name) reason FailureReason (Optional) High level class of failure message string (Optional) Detailed error message modelRevisionName string (Optional) Internal Revision/ID of model, tied to specific Spec contents time Kubernetes meta/v1.Time (Optional) Time failure occurred or was discovered exitCode int32 (Optional) Exit status from the last termination of the container FailureReason ( string alias) ( Appears on: FailureInfo ) FailureReason enum Value Description \"InvalidPredictorSpec\" The current Predictor Spec is invalid or unsupported \"ModelLoadFailed\" The model failed to load within a ServingRuntime container \"NoSupportingRuntime\" There are no ServingRuntime which support the specified model type \"RuntimeDisabled\" The 
ServingRuntime is disabled \"RuntimeNotRecognized\" There is no ServingRuntime defined with the specified runtime name \"RuntimeUnhealthy\" Corresponding ServingRuntime containers failed to start or are unhealthy HuggingFaceRuntimeSpec ( Appears on: PredictorSpec ) HuggingFaceRuntimeSpec defines arguments for configuring HuggingFace model serving. Field Description PredictorExtensionSpec PredictorExtensionSpec (Members of PredictorExtensionSpec are embedded into this type.) Contains fields shared across all predictors InferenceService InferenceService is the Schema for the InferenceServices API Field Description metadata Kubernetes meta/v1.ObjectMeta Refer to the Kubernetes API documentation for the fields of the metadata field. spec InferenceServiceSpec predictor PredictorSpec Predictor defines the model serving spec explainer ExplainerSpec (Optional) Explainer defines the model explanation service spec, explainer service calls to predictor or transformer if it is specified. transformer TransformerSpec (Optional) Transformer defines the pre/post processing before and after the predictor call, transformer service calls to predictor service. status InferenceServiceStatus InferenceServiceSpec ( Appears on: InferenceService ) InferenceServiceSpec is the top level type for this resource Field Description predictor PredictorSpec Predictor defines the model serving spec explainer ExplainerSpec (Optional) Explainer defines the model explanation service spec, explainer service calls to predictor or transformer if it is specified. transformer TransformerSpec (Optional) Transformer defines the pre/post processing before and after the predictor call, transformer service calls to predictor service. InferenceServiceStatus ( Appears on: InferenceService ) InferenceServiceStatus defines the observed state of InferenceService Field Description Status knative.dev/pkg/apis/duck/v1.Status (Members of Status are embedded into this type.) 
Conditions for the InferenceService - PredictorReady: predictor readiness condition; - TransformerReady: transformer readiness condition; - ExplainerReady: explainer readiness condition; - RoutesReady (serverless mode only): aggregated routing condition, i.e. endpoint readiness condition; - LatestDeploymentReady (serverless mode only): aggregated configuration condition, i.e. latest deployment readiness condition; - Ready: aggregated condition; address knative.dev/pkg/apis/duck/v1.Addressable (Optional) Addressable endpoint for the InferenceService url knative.dev/pkg/apis.URL (Optional) URL holds the url that will distribute traffic over the provided traffic targets. It generally has the form http[s]://{route-name}.{route-namespace}.{cluster-level-suffix} components map[kserve.io/serving/pkg/apis/serving/v1beta1.ComponentType]kserve.io/serving/pkg/apis/serving/v1beta1.ComponentStatusSpec Statuses for the components of the InferenceService modelStatus ModelStatus Model related statuses InferenceServicesConfig Field Description explainers ExplainersConfig Explainer configurations IngressConfig Field Description ingressGateway string ingressService string localGateway string localGatewayService string ingressDomain string ingressClassName string domainTemplate string urlScheme string disableIstioVirtualHost bool pathTemplate string disableIngressCreation bool LightGBMSpec ( Appears on: PredictorSpec ) LightGBMSpec defines arguments for configuring LightGBMSpec model serving. Field Description PredictorExtensionSpec PredictorExtensionSpec (Members of PredictorExtensionSpec are embedded into this type.) Contains fields shared across all predictors LoggerSpec ( Appears on: ComponentExtensionSpec ) LoggerSpec specifies optional payload logging available for all components Field Description url string (Optional) URL to send logging events mode LoggerType (Optional) Specifies the scope of the loggers. 
Valid values are: - \u201call\u201d (default): log both request and response; - \u201crequest\u201d: log only request; - \u201cresponse\u201d: log only response LoggerType ( string alias) ( Appears on: LoggerSpec ) LoggerType controls the scope of log publishing Value Description \"all\" Logger mode to log both request and response \"request\" Logger mode to log only request \"response\" Logger mode to log only response ModelCopies ( Appears on: ModelStatus ) Field Description failedCopies int How many copies of this predictor\u2019s models failed to load recently totalCopies int (Optional) Total number copies of this predictor\u2019s models that are currently loaded ModelFormat ( Appears on: ModelSpec ) Field Description name string Name of the model format. version string (Optional) Version of the model format. Used in validating that a predictor is supported by a runtime. Can be \u201cmajor\u201d, \u201cmajor.minor\u201d or \u201cmajor.minor.patch\u201d. ModelRevisionStates ( Appears on: ModelStatus ) Field Description activeModelState ModelState High level state string: Pending, Standby, Loading, Loaded, FailedToLoad targetModelState ModelState ModelSpec ( Appears on: PredictorSpec ) Field Description modelFormat ModelFormat ModelFormat being served. runtime string (Optional) Specific ClusterServingRuntime/ServingRuntime name to use for deployment. PredictorExtensionSpec PredictorExtensionSpec (Members of PredictorExtensionSpec are embedded into this type.) 
ModelState ( string alias) ( Appears on: ModelRevisionStates ) ModelState enum Value Description \"FailedToLoad\" All copies of the model failed to load \"Loaded\" At least one copy of the model is loaded \"Loading\" Model is loading \"Pending\" Model is not yet registered \"Standby\" Model is available but not loaded (will load when used) ModelStatus ( Appears on: InferenceServiceStatus ) Field Description transitionStatus TransitionStatus Whether the available predictor endpoints reflect the current Spec or is in transition states ModelRevisionStates (Optional) State information of the predictor\u2019s model. lastFailureInfo FailureInfo (Optional) Details of last failure, when load of target model is failed or blocked. copies ModelCopies (Optional) Model copy information of the predictor\u2019s model. ONNXRuntimeSpec ( Appears on: PredictorSpec ) ONNXRuntimeSpec defines arguments for configuring ONNX model serving. Field Description PredictorExtensionSpec PredictorExtensionSpec (Members of PredictorExtensionSpec are embedded into this type.) Contains fields shared across all predictors PMMLSpec ( Appears on: PredictorSpec ) PMMLSpec defines arguments for configuring PMML model serving. Field Description PredictorExtensionSpec PredictorExtensionSpec (Members of PredictorExtensionSpec are embedded into this type.) Contains fields shared across all predictors PaddleServerSpec ( Appears on: PredictorSpec ) Field Description PredictorExtensionSpec PredictorExtensionSpec (Members of PredictorExtensionSpec are embedded into this type.) PodSpec ( Appears on: ExplainerSpec , PredictorSpec , TransformerSpec ) PodSpec is a description of a pod. Field Description volumes []Kubernetes core/v1.Volume (Optional) List of volumes that can be mounted by containers belonging to the pod. More info: https://kubernetes.io/docs/concepts/storage/volumes initContainers []Kubernetes core/v1.Container List of initialization containers belonging to the pod. 
Init containers are executed in order prior to containers being started. If any init container fails, the pod is considered to have failed and is handled according to its restartPolicy. The name for an init container or normal container must be unique among all containers. Init containers may not have Lifecycle actions, Readiness probes, Liveness probes, or Startup probes. The resourceRequirements of an init container are taken into account during scheduling by finding the highest request/limit for each resource type, and then using the max of that value or the sum of the normal containers. Limits are applied to init containers in a similar fashion. Init containers cannot currently be added or removed. Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/init-containers/ containers []Kubernetes core/v1.Container List of containers belonging to the pod. Containers cannot currently be added or removed. There must be at least one container in a Pod. Cannot be updated. ephemeralContainers []Kubernetes core/v1.EphemeralContainer (Optional) List of ephemeral containers run in this pod. Ephemeral containers may be run in an existing pod to perform user-initiated actions such as debugging. This list cannot be specified when creating a pod, and it cannot be modified by updating the pod spec. In order to add an ephemeral container to an existing pod, use the pod\u2019s ephemeralcontainers subresource. This field is beta-level and available on clusters that haven\u2019t disabled the EphemeralContainers feature gate. restartPolicy Kubernetes core/v1.RestartPolicy (Optional) Restart policy for all containers within the pod. One of Always, OnFailure, Never. Default to Always. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#restart-policy terminationGracePeriodSeconds int64 (Optional) Optional duration in seconds the pod needs to terminate gracefully. May be decreased in delete request. 
Value must be a non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). If this value is nil, the default grace period will be used instead. The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. Defaults to 30 seconds. activeDeadlineSeconds int64 (Optional) Optional duration in seconds the pod may be active on the node relative to StartTime before the system will actively try to mark it failed and kill associated containers. Value must be a positive integer. dnsPolicy Kubernetes core/v1.DNSPolicy (Optional) Set DNS policy for the pod. Defaults to \u201cClusterFirst\u201d. Valid values are \u2018ClusterFirstWithHostNet\u2019, \u2018ClusterFirst\u2019, \u2018Default\u2019 or \u2018None\u2019. DNS parameters given in DNSConfig will be merged with the policy selected with DNSPolicy. To have DNS options set along with hostNetwork, you have to specify DNS policy explicitly to \u2018ClusterFirstWithHostNet\u2019. nodeSelector map[string]string (Optional) NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node\u2019s labels for the pod to be scheduled on that node. More info: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ serviceAccountName string (Optional) ServiceAccountName is the name of the ServiceAccount to use to run this pod. More info: https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/ serviceAccount string (Optional) DeprecatedServiceAccount is a deprecated alias for ServiceAccountName. Deprecated: Use serviceAccountName instead. automountServiceAccountToken bool (Optional) AutomountServiceAccountToken indicates whether a service account token should be automatically mounted. 
nodeName string (Optional) NodeName is a request to schedule this pod onto a specific node. If it is non-empty, the scheduler simply schedules this pod onto that node, assuming that it fits resource requirements. hostNetwork bool (Optional) Host networking requested for this pod. Use the host\u2019s network namespace. If this option is set, the ports that will be used must be specified. Default to false. hostPID bool (Optional) Use the host\u2019s pid namespace. Optional: Default to false. hostIPC bool (Optional) Use the host\u2019s ipc namespace. Optional: Default to false. shareProcessNamespace bool (Optional) Share a single process namespace between all of the containers in a pod. When this is set containers will be able to view and signal processes from other containers in the same pod, and the first process in each container will not be assigned PID 1. HostPID and ShareProcessNamespace cannot both be set. Optional: Default to false. securityContext Kubernetes core/v1.PodSecurityContext (Optional) SecurityContext holds pod-level security attributes and common container settings. Optional: Defaults to empty. See type description for default values of each field. imagePullSecrets []Kubernetes core/v1.LocalObjectReference (Optional) ImagePullSecrets is an optional list of references to secrets in the same namespace to use for pulling any of the images used by this PodSpec. If specified, these secrets will be passed to individual puller implementations for them to use. For example, in the case of docker, only DockerConfig type secrets are honored. More info: https://kubernetes.io/docs/concepts/containers/images#specifying-imagepullsecrets-on-a-pod hostname string (Optional) Specifies the hostname of the Pod. If not specified, the pod\u2019s hostname will be set to a system-defined value. subdomain string (Optional) If specified, the fully qualified Pod hostname will be \u201c<hostname>.<subdomain>.<pod namespace>.svc.<cluster domain>\u201d. If not specified, the pod will not have a domainname at all. 
affinity Kubernetes core/v1.Affinity (Optional) If specified, the pod\u2019s scheduling constraints schedulerName string (Optional) If specified, the pod will be dispatched by specified scheduler. If not specified, the pod will be dispatched by default scheduler. tolerations []Kubernetes core/v1.Toleration (Optional) If specified, the pod\u2019s tolerations. hostAliases []Kubernetes core/v1.HostAlias (Optional) HostAliases is an optional list of hosts and IPs that will be injected into the pod\u2019s hosts file if specified. This is only valid for non-hostNetwork pods. priorityClassName string (Optional) If specified, indicates the pod\u2019s priority. \u201csystem-node-critical\u201d and \u201csystem-cluster-critical\u201d are two special keywords which indicate the highest priorities with the former being the highest priority. Any other name must be defined by creating a PriorityClass object with that name. If not specified, the pod priority will be default or zero if there is no default. priority int32 (Optional) The priority value. Various system components use this field to find the priority of the pod. When Priority Admission Controller is enabled, it prevents users from setting this field. The admission controller populates this field from PriorityClassName. The higher the value, the higher the priority. dnsConfig Kubernetes core/v1.PodDNSConfig (Optional) Specifies the DNS parameters of a pod. Parameters specified here will be merged to the generated DNS configuration based on DNSPolicy. readinessGates []Kubernetes core/v1.PodReadinessGate (Optional) If specified, all readiness gates will be evaluated for pod readiness. 
A pod is ready when all its containers are ready AND all conditions specified in the readiness gates have status equal to \u201cTrue\u201d More info: https://git.k8s.io/enhancements/keps/sig-network/580-pod-readiness-gates runtimeClassName string (Optional) RuntimeClassName refers to a RuntimeClass object in the node.k8s.io group, which should be used to run this pod. If no RuntimeClass resource matches the named class, the pod will not be run. If unset or empty, the \u201clegacy\u201d RuntimeClass will be used, which is an implicit class with an empty definition that uses the default runtime handler. More info: https://git.k8s.io/enhancements/keps/sig-node/585-runtime-class This is a beta feature as of Kubernetes v1.14. enableServiceLinks bool (Optional) EnableServiceLinks indicates whether information about services should be injected into pod\u2019s environment variables, matching the syntax of Docker links. Optional: Defaults to true. preemptionPolicy Kubernetes core/v1.PreemptionPolicy (Optional) PreemptionPolicy is the Policy for preempting pods with lower priority. One of Never, PreemptLowerPriority. Defaults to PreemptLowerPriority if unset. This field is beta-level, gated by the NonPreemptingPriority feature-gate. overhead Kubernetes core/v1.ResourceList (Optional) Overhead represents the resource overhead associated with running a pod for a given RuntimeClass. This field will be autopopulated at admission time by the RuntimeClass admission controller. If the RuntimeClass admission controller is enabled, overhead must not be set in Pod create requests. The RuntimeClass admission controller will reject Pod create requests which have the overhead already set. If RuntimeClass is configured and selected in the PodSpec, Overhead will be set to the value defined in the corresponding RuntimeClass, otherwise it will remain unset and treated as zero. 
More info: https://git.k8s.io/enhancements/keps/sig-node/688-pod-overhead/README.md This field is beta-level as of Kubernetes v1.18, and is only honored by servers that enable the PodOverhead feature. topologySpreadConstraints []Kubernetes core/v1.TopologySpreadConstraint (Optional) TopologySpreadConstraints describes how a group of pods ought to spread across topology domains. Scheduler will schedule pods in a way which abides by the constraints. All topologySpreadConstraints are ANDed. setHostnameAsFQDN bool (Optional) If true the pod\u2019s hostname will be configured as the pod\u2019s FQDN, rather than the leaf name (the default). In Linux containers, this means setting the FQDN in the hostname field of the kernel (the nodename field of struct utsname). In Windows containers, this means setting the registry value of hostname for the registry key HKEY_LOCAL_MACHINE\\SYSTEM\\CurrentControlSet\\Services\\Tcpip\\Parameters to FQDN. If a pod does not have FQDN, this has no effect. Default to false. os Kubernetes core/v1.PodOS (Optional) Specifies the OS of the containers in the pod. Some pod and container fields are restricted if this is set. 
If the OS field is set to linux, the following fields must be unset: - securityContext.windowsOptions If the OS field is set to windows, following fields must be unset: - spec.hostPID - spec.hostIPC - spec.securityContext.seLinuxOptions - spec.securityContext.seccompProfile - spec.securityContext.fsGroup - spec.securityContext.fsGroupChangePolicy - spec.securityContext.sysctls - spec.shareProcessNamespace - spec.securityContext.runAsUser - spec.securityContext.runAsGroup - spec.securityContext.supplementalGroups - spec.containers[*].securityContext.seLinuxOptions - spec.containers[*].securityContext.seccompProfile - spec.containers[*].securityContext.capabilities - spec.containers[*].securityContext.readOnlyRootFilesystem - spec.containers[*].securityContext.privileged - spec.containers[*].securityContext.allowPrivilegeEscalation - spec.containers[*].securityContext.procMount - spec.containers[*].securityContext.runAsUser - spec.containers[*].securityContext.runAsGroup This is an alpha field and requires the IdentifyPodOS feature hostUsers bool (Optional) Use the host\u2019s user namespace. Optional: Default to true. If set to true or not present, the pod will be run in the host user namespace, useful for when the pod needs a feature only available to the host user namespace, such as loading a kernel module with CAP_SYS_MODULE. When set to false, a new userns is created for the pod. Setting false is useful for mitigating container breakout vulnerabilities even allowing users to run their containers as root without actually having root privileges on the host. This field is alpha-level and is only honored by servers that enable the UserNamespacesSupport feature. schedulingGates []Kubernetes core/v1.PodSchedulingGate (Optional) SchedulingGates is an opaque list of values that if specified will block scheduling the pod. If schedulingGates is not empty, the pod will stay in the SchedulingGated state and the scheduler will not attempt to schedule the pod. 
SchedulingGates can only be set at pod creation time, and be removed only afterwards. This is a beta feature enabled by the PodSchedulingReadiness feature gate. resourceClaims []Kubernetes core/v1.PodResourceClaim (Optional) ResourceClaims defines which ResourceClaims must be allocated and reserved before the Pod is allowed to start. The resources will be made available to those containers which consume them by name. This is an alpha field and requires enabling the DynamicResourceAllocation feature gate. This field is immutable. PredictorExtensionSpec ( Appears on: HuggingFaceRuntimeSpec , LightGBMSpec , ModelSpec , ONNXRuntimeSpec , PMMLSpec , PaddleServerSpec , SKLearnSpec , TFServingSpec , TorchServeSpec , TritonSpec , XGBoostSpec ) PredictorExtensionSpec defines configuration shared across all predictor frameworks Field Description storageUri string (Optional) This field points to the location of the trained model which is mounted onto the pod. runtimeVersion string (Optional) Runtime version of the predictor docker image protocolVersion github.com/kserve/kserve/pkg/constants.InferenceServiceProtocol (Optional) Protocol version to use by the predictor (i.e. v1 or v2 or grpc-v1 or grpc-v2) Container Kubernetes core/v1.Container (Members of Container are embedded into this type.) (Optional) Container enables overrides for the predictor. Each framework will have different defaults that are populated in the underlying container spec. storage StorageSpec (Optional) Storage Spec for model location PredictorImplementation PredictorImplementation defines common functions for all predictors e.g Tensorflow, Triton, etc PredictorSpec ( Appears on: InferenceServiceSpec ) PredictorSpec defines the configuration for a predictor, The following fields follow a \u201c1-of\u201d semantic. Users must specify exactly one spec. 
Field Description sklearn SKLearnSpec Spec for SKLearn model server xgboost XGBoostSpec Spec for XGBoost model server tensorflow TFServingSpec Spec for TFServing ( https://github.com/tensorflow/serving ) pytorch TorchServeSpec Spec for TorchServe ( https://pytorch.org/serve ) triton TritonSpec Spec for Triton Inference Server ( https://github.com/triton-inference-server/server ) onnx ONNXRuntimeSpec Spec for ONNX runtime ( https://github.com/microsoft/onnxruntime ) huggingface HuggingFaceRuntimeSpec Spec for HuggingFace runtime ( https://github.com/huggingface ) pmml PMMLSpec Spec for PMML ( http://dmg.org/pmml/v4-1/GeneralStructure.html ) lightgbm LightGBMSpec Spec for LightGBM model server paddle PaddleServerSpec Spec for Paddle model server ( https://github.com/PaddlePaddle/Serving ) model ModelSpec Model spec for any arbitrary framework. PodSpec PodSpec (Members of PodSpec are embedded into this type.) This spec is dual purpose. 1) Provide a full PodSpec for custom predictor. The field PodSpec.Containers is mutually exclusive with other predictors (i.e. TFServing). 2) Provide a predictor (i.e. TFServing) and specify PodSpec overrides, you must not provide PodSpec.Containers in this case. ComponentExtensionSpec ComponentExtensionSpec (Members of ComponentExtensionSpec are embedded into this type.) Component extension defines the deployment configurations for a predictor SKLearnSpec ( Appears on: PredictorSpec ) SKLearnSpec defines arguments for configuring SKLearn model serving. Field Description PredictorExtensionSpec PredictorExtensionSpec (Members of PredictorExtensionSpec are embedded into this type.) Contains fields shared across all predictors ScaleMetric ( string alias) ( Appears on: ComponentExtensionSpec ) ScaleMetric enum Value Description \"cpu\" \"concurrency\" \"memory\" \"rps\" StorageSpec ( Appears on: ExplainerExtensionSpec , PredictorExtensionSpec ) Field Description path string (Optional) The path to the model object in the storage. 
It cannot co-exist with the storageURI. schemaPath string (Optional) The path to the model schema file in the storage. parameters map[string]string (Optional) Parameters to override the default storage credentials and config. key string (Optional) The Storage Key in the secret for this model. TFServingSpec ( Appears on: PredictorSpec ) TFServingSpec defines arguments for configuring Tensorflow model serving. Field Description PredictorExtensionSpec PredictorExtensionSpec (Members of PredictorExtensionSpec are embedded into this type.) Contains fields shared across all predictors TorchServeSpec ( Appears on: PredictorSpec ) TorchServeSpec defines arguments for configuring PyTorch model serving. Field Description PredictorExtensionSpec PredictorExtensionSpec (Members of PredictorExtensionSpec are embedded into this type.) Contains fields shared across all predictors TransformerSpec ( Appears on: InferenceServiceSpec ) TransformerSpec defines transformer service for pre/post processing Field Description PodSpec PodSpec (Members of PodSpec are embedded into this type.) This spec is dual purpose. 1) Provide a full PodSpec for custom transformer. The field PodSpec.Containers is mutually exclusive with other transformers. 2) Provide a transformer and specify PodSpec overrides, you must not provide PodSpec.Containers in this case. ComponentExtensionSpec ComponentExtensionSpec (Members of ComponentExtensionSpec are embedded into this type.) Component extension defines the deployment configurations for a transformer TransitionStatus ( string alias) ( Appears on: ModelStatus ) TransitionStatus enum Value Description \"BlockedByFailedLoad\" Target model failed to load \"InProgress\" Waiting for target model to reach state of active model \"InvalidSpec\" Target predictor spec failed validation \"UpToDate\" Predictor is up-to-date (reflects current spec) TritonSpec ( Appears on: PredictorSpec ) TritonSpec defines arguments for configuring Triton model serving. 
Field Description PredictorExtensionSpec PredictorExtensionSpec (Members of PredictorExtensionSpec are embedded into this type.) Contains fields shared across all predictors XGBoostSpec ( Appears on: PredictorSpec ) XGBoostSpec defines arguments for configuring XGBoost model serving. Field Description PredictorExtensionSpec PredictorExtensionSpec (Members of PredictorExtensionSpec are embedded into this type.) Contains fields shared across all predictors Generated with gen-crd-api-reference-docs on git commit 426fe21d .","title":"Control Plane API"},{"location":"reference/swagger-ui/","text":"Open Inference Protocol API Specification \u00b6 REST \u00b6 GRPC \u00b6 ServerLive \u00b6 The ServerLive API indicates if the inference server is able to receive and respond to metadata and inference requests. rpc inference.GRPCInferenceService/ServerLive( ServerLiveRequest ) returns ServerLiveResponse ServerReady \u00b6 The ServerReady API indicates if the server is ready for inferencing. rpc inference.GRPCInferenceService/ServerReady( ServerReadyRequest ) returns ServerReadyResponse ModelReady \u00b6 The ModelReady API indicates if a specific model is ready for inferencing. rpc inference.GRPCInferenceService/ModelReady( ModelReadyRequest ) returns ModelReadyResponse ServerMetadata \u00b6 The ServerMetadata API provides information about the server. Errors are indicated by the google.rpc.Status returned for the request. The OK code indicates success and other codes indicate failure. rpc inference.GRPCInferenceService/ServerMetadata( ServerMetadataRequest ) returns ServerMetadataResponse ModelMetadata \u00b6 The per-model metadata API provides information about a model. Errors are indicated by the google.rpc.Status returned for the request. The OK code indicates success and other codes indicate failure. 
rpc inference.GRPCInferenceService/ModelMetadata( ModelMetadataRequest ) returns ModelMetadataResponse ModelInfer \u00b6 The ModelInfer API performs inference using the specified model. Errors are indicated by the google.rpc.Status returned for the request. The OK code indicates success and other codes indicate failure. rpc inference.GRPCInferenceService/ModelInfer( ModelInferRequest ) returns ModelInferResponse Messages \u00b6 InferParameter \u00b6 An inference parameter value. The Parameters message describes a \u201cname\u201d/\u201dvalue\u201d pair, where the \u201cname\u201d is the name of the parameter and the \u201cvalue\u201d is a boolean, integer, or string corresponding to the parameter. Field Type Description oneof parameter_choice.bool_param bool A boolean parameter value. oneof parameter_choice.int64_param int64 An int64 parameter value. oneof parameter_choice.string_param string A string parameter value. InferTensorContents \u00b6 The data contained in a tensor represented by the repeated type that matches the tensor's data type. Protobuf oneof is not used because oneofs cannot contain repeated fields. Field Type Description bool_contents repeated bool Representation for BOOL data type. The size must match what is expected by the tensor's shape. The contents must be the flattened, one-dimensional, row-major order of the tensor elements. int_contents repeated int32 Representation for INT8, INT16, and INT32 data types. The size must match what is expected by the tensor's shape. The contents must be the flattened, one-dimensional, row-major order of the tensor elements. int64_contents repeated int64 Representation for INT64 data types. The size must match what is expected by the tensor's shape. The contents must be the flattened, one-dimensional, row-major order of the tensor elements. uint_contents repeated uint32 Representation for UINT8, UINT16, and UINT32 data types. The size must match what is expected by the tensor's shape. 
The contents must be the flattened, one-dimensional, row-major order of the tensor elements. uint64_contents repeated uint64 Representation for UINT64 data types. The size must match what is expected by the tensor's shape. The contents must be the flattened, one-dimensional, row-major order of the tensor elements. fp32_contents repeated float Representation for FP32 data type. The size must match what is expected by the tensor's shape. The contents must be the flattened, one-dimensional, row-major order of the tensor elements. fp64_contents repeated double Representation for FP64 data type. The size must match what is expected by the tensor's shape. The contents must be the flattened, one-dimensional, row-major order of the tensor elements. bytes_contents repeated bytes Representation for BYTES data type. The size must match what is expected by the tensor's shape. The contents must be the flattened, one-dimensional, row-major order of the tensor elements. ModelInferRequest \u00b6 Field Type Description model_name string The name of the model to use for inferencing. model_version string The version of the model to use for inference. If not given the server will choose a version based on the model and internal policy. id string Optional identifier for the request. If specified will be returned in the response. parameters map ModelInferRequest.ParametersEntry Optional inference parameters. inputs repeated ModelInferRequest.InferInputTensor The input tensors for the inference. outputs repeated ModelInferRequest.InferRequestedOutputTensor The requested output tensors for the inference. Optional, if not specified all outputs produced by the model will be returned. raw_input_contents repeated bytes The data contained in an input tensor can be represented in \"raw\" bytes form or in the repeated type that matches the tensor's data type. To use the raw representation 'raw_input_contents' must be initialized with data for each tensor in the same order as 'inputs'. 
For each tensor, the size of this content must match what is expected by the tensor's shape and data type. The raw data must be the flattened, one-dimensional, row-major order of the tensor elements without any stride or padding between the elements. Note that the FP16 and BF16 data types must be represented as raw content as there is no specific data type for a 16-bit float type. If this field is specified then InferInputTensor::contents must not be specified for any input tensor. ModelInferRequest.InferInputTensor \u00b6 An input tensor for an inference request. Field Type Description name string The tensor name. datatype string The tensor data type. shape repeated int64 The tensor shape. parameters map ModelInferRequest.InferInputTensor.ParametersEntry Optional inference input tensor parameters. contents InferTensorContents The tensor contents using a data-type format. This field must not be specified if \"raw\" tensor contents are being used for the inference request. ModelInferRequest.InferInputTensor.ParametersEntry \u00b6 Field Type Description key string N/A value InferParameter N/A ModelInferRequest.InferRequestedOutputTensor \u00b6 An output tensor requested for an inference request. Field Type Description name string The tensor name. parameters map ModelInferRequest.InferRequestedOutputTensor.ParametersEntry Optional requested output tensor parameters. ModelInferRequest.InferRequestedOutputTensor.ParametersEntry \u00b6 Field Type Description key string N/A value InferParameter N/A ModelInferRequest.ParametersEntry \u00b6 Field Type Description key string N/A value InferParameter N/A ModelInferResponse \u00b6 Field Type Description model_name string The name of the model used for inference. model_version string The version of the model used for inference. id string The id of the inference request if one was specified. parameters map ModelInferResponse.ParametersEntry Optional inference response parameters. 
outputs repeated ModelInferResponse.InferOutputTensor The output tensors holding inference results. raw_output_contents repeated bytes The data contained in an output tensor can be represented in \"raw\" bytes form or in the repeated type that matches the tensor's data type. To use the raw representation 'raw_output_contents' must be initialized with data for each tensor in the same order as 'outputs'. For each tensor, the size of this content must match what is expected by the tensor's shape and data type. The raw data must be the flattened, one-dimensional, row-major order of the tensor elements without any stride or padding between the elements. Note that the FP16 and BF16 data types must be represented as raw content as there is no specific data type for a 16-bit float type. If this field is specified then InferOutputTensor::contents must not be specified for any output tensor. ModelInferResponse.InferOutputTensor \u00b6 An output tensor returned for an inference request. Field Type Description name string The tensor name. datatype string The tensor data type. shape repeated int64 The tensor shape. parameters map ModelInferResponse.InferOutputTensor.ParametersEntry Optional output tensor parameters. contents InferTensorContents The tensor contents using a data-type format. This field must not be specified if \"raw\" tensor contents are being used for the inference response. ModelInferResponse.InferOutputTensor.ParametersEntry \u00b6 Field Type Description key string N/A value InferParameter N/A ModelInferResponse.ParametersEntry \u00b6 Field Type Description key string N/A value InferParameter N/A ModelMetadataRequest \u00b6 Field Type Description name string The name of the model. version string The version of the model to check for readiness. If not given the server will choose a version based on the model and internal policy. ModelMetadataResponse \u00b6 Field Type Description name string The model name. 
versions repeated string The versions of the model available on the server. platform string The model's platform. See Platforms. inputs repeated ModelMetadataResponse.TensorMetadata The model's inputs. outputs repeated ModelMetadataResponse.TensorMetadata The model's outputs. ModelMetadataResponse.TensorMetadata \u00b6 Metadata for a tensor. Field Type Description name string The tensor name. datatype string The tensor data type. shape repeated int64 The tensor shape. A variable-size dimension is represented by a -1 value. ModelReadyRequest \u00b6 Field Type Description name string The name of the model to check for readiness. version string The version of the model to check for readiness. If not given the server will choose a version based on the model and internal policy. ModelReadyResponse \u00b6 Field Type Description ready bool True if the model is ready, false if not ready. ServerLiveRequest \u00b6 ServerLiveResponse \u00b6 Field Type Description live bool True if the inference server is live, false if not live. ServerMetadataRequest \u00b6 ServerMetadataResponse \u00b6 Field Type Description name string The server name. version string The server version. extensions repeated string The extensions supported by the server. ServerReadyRequest \u00b6 ServerReadyResponse \u00b6 Field Type Description ready bool True if the inference server is ready, false if not ready.","title":"Open Inference Protocol API Spec"},{"location":"reference/swagger-ui/#open-inference-protocol-api-specification","text":"","title":"Open Inference Protocol API Specification"},{"location":"reference/swagger-ui/#rest","text":"","title":"REST"},{"location":"reference/swagger-ui/#grpc","text":"","title":"GRPC"},{"location":"reference/swagger-ui/#serverlive","text":"The ServerLive API indicates if the inference server is able to receive and respond to metadata and inference requests. 
rpc inference.GRPCInferenceService/ServerLive( ServerLiveRequest ) returns ServerLiveResponse","title":"ServerLive"},{"location":"reference/swagger-ui/#serverready","text":"The ServerReady API indicates if the server is ready for inferencing. rpc inference.GRPCInferenceService/ServerReady( ServerReadyRequest ) returns ServerReadyResponse","title":"ServerReady"},{"location":"reference/swagger-ui/#modelready","text":"The ModelReady API indicates if a specific model is ready for inferencing. rpc inference.GRPCInferenceService/ModelReady( ModelReadyRequest ) returns ModelReadyResponse","title":"ModelReady"},{"location":"reference/swagger-ui/#servermetadata","text":"The ServerMetadata API provides information about the server. Errors are indicated by the google.rpc.Status returned for the request. The OK code indicates success and other codes indicate failure. rpc inference.GRPCInferenceService/ServerMetadata( ServerMetadataRequest ) returns ServerMetadataResponse","title":"ServerMetadata"},{"location":"reference/swagger-ui/#modelmetadata","text":"The per-model metadata API provides information about a model. Errors are indicated by the google.rpc.Status returned for the request. The OK code indicates success and other codes indicate failure. rpc inference.GRPCInferenceService/ModelMetadata( ModelMetadataRequest ) returns ModelMetadataResponse","title":"ModelMetadata"},{"location":"reference/swagger-ui/#modelinfer","text":"The ModelInfer API performs inference using the specified model. Errors are indicated by the google.rpc.Status returned for the request. The OK code indicates success and other codes indicate failure. rpc inference.GRPCInferenceService/ModelInfer( ModelInferRequest ) returns ModelInferResponse","title":"ModelInfer"},{"location":"reference/swagger-ui/#messages","text":"","title":"Messages"},{"location":"reference/swagger-ui/#inferparameter","text":"An inference parameter value. 
The Parameters message describes a \u201cname\u201d/\u201dvalue\u201d pair, where the \u201cname\u201d is the name of the parameter and the \u201cvalue\u201d is a boolean, integer, or string corresponding to the parameter. Field Type Description oneof parameter_choice.bool_param bool A boolean parameter value. oneof parameter_choice.int64_param int64 An int64 parameter value. oneof parameter_choice.string_param string A string parameter value.","title":"InferParameter"},{"location":"reference/swagger-ui/#infertensorcontents","text":"The data contained in a tensor represented by the repeated type that matches the tensor's data type. Protobuf oneof is not used because oneofs cannot contain repeated fields. Field Type Description bool_contents repeated bool Representation for BOOL data type. The size must match what is expected by the tensor's shape. The contents must be the flattened, one-dimensional, row-major order of the tensor elements. int_contents repeated int32 Representation for INT8, INT16, and INT32 data types. The size must match what is expected by the tensor's shape. The contents must be the flattened, one-dimensional, row-major order of the tensor elements. int64_contents repeated int64 Representation for INT64 data types. The size must match what is expected by the tensor's shape. The contents must be the flattened, one-dimensional, row-major order of the tensor elements. uint_contents repeated uint32 Representation for UINT8, UINT16, and UINT32 data types. The size must match what is expected by the tensor's shape. The contents must be the flattened, one-dimensional, row-major order of the tensor elements. uint64_contents repeated uint64 Representation for UINT64 data types. The size must match what is expected by the tensor's shape. The contents must be the flattened, one-dimensional, row-major order of the tensor elements. fp32_contents repeated float Representation for FP32 data type. The size must match what is expected by the tensor's shape. 
The contents must be the flattened, one-dimensional, row-major order of the tensor elements. fp64_contents repeated double Representation for FP64 data type. The size must match what is expected by the tensor's shape. The contents must be the flattened, one-dimensional, row-major order of the tensor elements. bytes_contents repeated bytes Representation for BYTES data type. The size must match what is expected by the tensor's shape. The contents must be the flattened, one-dimensional, row-major order of the tensor elements.","title":"InferTensorContents"},{"location":"reference/swagger-ui/#modelinferrequest","text":"Field Type Description model_name string The name of the model to use for inferencing. model_version string The version of the model to use for inference. If not given the server will choose a version based on the model and internal policy. id string Optional identifier for the request. If specified will be returned in the response. parameters map ModelInferRequest.ParametersEntry Optional inference parameters. inputs repeated ModelInferRequest.InferInputTensor The input tensors for the inference. outputs repeated ModelInferRequest.InferRequestedOutputTensor The requested output tensors for the inference. Optional, if not specified all outputs produced by the model will be returned. raw_input_contents repeated bytes The data contained in an input tensor can be represented in \"raw\" bytes form or in the repeated type that matches the tensor's data type. To use the raw representation 'raw_input_contents' must be initialized with data for each tensor in the same order as 'inputs'. For each tensor, the size of this content must match what is expected by the tensor's shape and data type. The raw data must be the flattened, one-dimensional, row-major order of the tensor elements without any stride or padding between the elements. 
Note that the FP16 and BF16 data types must be represented as raw content as there is no specific data type for a 16-bit float type. If this field is specified then InferInputTensor::contents must not be specified for any input tensor. |","title":"ModelInferRequest"},{"location":"reference/swagger-ui/#modelinferrequestinferinputtensor","text":"An input tensor for an inference request. Field Type Description name string The tensor name. datatype string The tensor data type. shape repeated int64 The tensor shape. parameters map ModelInferRequest.InferInputTensor.ParametersEntry Optional inference input tensor parameters. contents InferTensorContents The tensor contents using a data-type format. This field must not be specified if \"raw\" tensor contents are being used for the inference request.","title":"ModelInferRequest.InferInputTensor"},{"location":"reference/swagger-ui/#modelinferrequestinferinputtensorparametersentry","text":"Field Type Description key string N/A value InferParameter N/A","title":"ModelInferRequest.InferInputTensor.ParametersEntry"},{"location":"reference/swagger-ui/#modelinferrequestinferrequestedoutputtensor","text":"An output tensor requested for an inference request. Field Type Description name string The tensor name. 
parameters map ModelInferRequest.InferRequestedOutputTensor.ParametersEntry Optional requested output tensor parameters.","title":"ModelInferRequest.InferRequestedOutputTensor"},{"location":"reference/swagger-ui/#modelinferrequestinferrequestedoutputtensorparametersentry","text":"Field Type Description key string N/A value InferParameter N/A","title":"ModelInferRequest.InferRequestedOutputTensor.ParametersEntry"},{"location":"reference/swagger-ui/#modelinferrequestparametersentry","text":"Field Type Description key string N/A value InferParameter N/A","title":"ModelInferRequest.ParametersEntry"},{"location":"reference/swagger-ui/#modelinferresponse","text":"Field Type Description model_name string The name of the model used for inference. model_version string The version of the model used for inference. id string The id of the inference request if one was specified. parameters map ModelInferResponse.ParametersEntry Optional inference response parameters. outputs repeated ModelInferResponse.InferOutputTensor The output tensors holding inference results. raw_output_contents repeated bytes The data contained in an output tensor can be represented in \"raw\" bytes form or in the repeated type that matches the tensor's data type. To use the raw representation 'raw_output_contents' must be initialized with data for each tensor in the same order as 'outputs'. For each tensor, the size of this content must match what is expected by the tensor's shape and data type. The raw data must be the flattened, one-dimensional, row-major order of the tensor elements without any stride or padding between the elements. Note that the FP16 and BF16 data types must be represented as raw content as there is no specific data type for a 16-bit float type. If this field is specified then InferOutputTensor::contents must not be specified for any output tensor. 
|","title":"ModelInferResponse"},{"location":"reference/swagger-ui/#modelinferresponseinferoutputtensor","text":"An output tensor returned for an inference request. Field Type Description name string The tensor name. datatype string The tensor data type. shape repeated int64 The tensor shape. parameters map ModelInferResponse.InferOutputTensor.ParametersEntry Optional output tensor parameters. contents InferTensorContents The tensor contents using a data-type format. This field must not be specified if \"raw\" tensor contents are being used for the inference response.","title":"ModelInferResponse.InferOutputTensor"},{"location":"reference/swagger-ui/#modelinferresponseinferoutputtensorparametersentry","text":"Field Type Description key string N/A value InferParameter N/A","title":"ModelInferResponse.InferOutputTensor.ParametersEntry"},{"location":"reference/swagger-ui/#modelinferresponseparametersentry","text":"Field Type Description key string N/A value InferParameter N/A","title":"ModelInferResponse.ParametersEntry"},{"location":"reference/swagger-ui/#modelmetadatarequest","text":"Field Type Description name string The name of the model. version string The version of the model to check for readiness. If not given the server will choose a version based on the model and internal policy.","title":"ModelMetadataRequest"},{"location":"reference/swagger-ui/#modelmetadataresponse","text":"Field Type Description name string The model name. versions repeated string The versions of the model available on the server. platform string The model's platform. See Platforms. inputs repeated ModelMetadataResponse.TensorMetadata The model's inputs. outputs repeated ModelMetadataResponse.TensorMetadata The model's outputs.","title":"ModelMetadataResponse"},{"location":"reference/swagger-ui/#modelmetadataresponsetensormetadata","text":"Metadata for a tensor. Field Type Description name string The tensor name. datatype string The tensor data type. 
shape repeated int64 The tensor shape. A variable-size dimension is represented by a -1 value.","title":"ModelMetadataResponse.TensorMetadata"},{"location":"reference/swagger-ui/#modelreadyrequest","text":"Field Type Description name string The name of the model to check for readiness. version string The version of the model to check for readiness. If not given the server will choose a version based on the model and internal policy.","title":"ModelReadyRequest"},{"location":"reference/swagger-ui/#modelreadyresponse","text":"Field Type Description ready bool True if the model is ready, false if not ready.","title":"ModelReadyResponse"},{"location":"reference/swagger-ui/#serverliverequest","text":"","title":"ServerLiveRequest"},{"location":"reference/swagger-ui/#serverliveresponse","text":"Field Type Description live bool True if the inference server is live, false if not live.","title":"ServerLiveResponse"},{"location":"reference/swagger-ui/#servermetadatarequest","text":"","title":"ServerMetadataRequest"},{"location":"reference/swagger-ui/#servermetadataresponse","text":"Field Type Description name string The server name. version string The server version. extensions repeated string The extensions supported by the server.","title":"ServerMetadataResponse"},{"location":"reference/swagger-ui/#serverreadyrequest","text":"","title":"ServerReadyRequest"},{"location":"reference/swagger-ui/#serverreadyresponse","text":"Field Type Description ready bool True if the inference server is ready, false if not ready.","title":"ServerReadyResponse"},{"location":"reference/v2_inference/","text":"ServerLive \u00b6 The ServerLive API indicates if the inference server is able to receive and respond to metadata and inference requests. rpc inference.GRPCInferenceService/ServerLive( ServerLiveRequest ) returns ServerLiveResponse ServerReady \u00b6 The ServerReady API indicates if the server is ready for inferencing. 
rpc inference.GRPCInferenceService/ServerReady( ServerReadyRequest ) returns ServerReadyResponse ModelReady \u00b6 The ModelReady API indicates if a specific model is ready for inferencing. rpc inference.GRPCInferenceService/ModelReady( ModelReadyRequest ) returns ModelReadyResponse ServerMetadata \u00b6 The ServerMetadata API provides information about the server. Errors are indicated by the google.rpc.Status returned for the request. The OK code indicates success and other codes indicate failure. rpc inference.GRPCInferenceService/ServerMetadata( ServerMetadataRequest ) returns ServerMetadataResponse ModelMetadata \u00b6 The per-model metadata API provides information about a model. Errors are indicated by the google.rpc.Status returned for the request. The OK code indicates success and other codes indicate failure. rpc inference.GRPCInferenceService/ModelMetadata( ModelMetadataRequest ) returns ModelMetadataResponse ModelInfer \u00b6 The ModelInfer API performs inference using the specified model. Errors are indicated by the google.rpc.Status returned for the request. The OK code indicates success and other codes indicate failure. rpc inference.GRPCInferenceService/ModelInfer( ModelInferRequest ) returns ModelInferResponse Messages \u00b6 InferParameter \u00b6 An inference parameter value. The Parameters message describes a \u201cname\u201d/\u201cvalue\u201d pair, where the \u201cname\u201d is the name of the parameter and the \u201cvalue\u201d is a boolean, integer, or string corresponding to the parameter. Field Type Description oneof parameter_choice.bool_param bool A boolean parameter value. oneof parameter_choice.int64_param int64 An int64 parameter value. oneof parameter_choice.string_param string A string parameter value. InferTensorContents \u00b6 The data contained in a tensor represented by the repeated type that matches the tensor's data type. Protobuf oneof is not used because oneofs cannot contain repeated fields. 
Field Type Description bool_contents repeated bool Representation for BOOL data type. The size must match what is expected by the tensor's shape. The contents must be the flattened, one-dimensional, row-major order of the tensor elements. int_contents repeated int32 Representation for INT8, INT16, and INT32 data types. The size must match what is expected by the tensor's shape. The contents must be the flattened, one-dimensional, row-major order of the tensor elements. int64_contents repeated int64 Representation for INT64 data types. The size must match what is expected by the tensor's shape. The contents must be the flattened, one-dimensional, row-major order of the tensor elements. uint_contents repeated uint32 Representation for UINT8, UINT16, and UINT32 data types. The size must match what is expected by the tensor's shape. The contents must be the flattened, one-dimensional, row-major order of the tensor elements. uint64_contents repeated uint64 Representation for UINT64 data types. The size must match what is expected by the tensor's shape. The contents must be the flattened, one-dimensional, row-major order of the tensor elements. fp32_contents repeated float Representation for FP32 data type. The size must match what is expected by the tensor's shape. The contents must be the flattened, one-dimensional, row-major order of the tensor elements. fp64_contents repeated double Representation for FP64 data type. The size must match what is expected by the tensor's shape. The contents must be the flattened, one-dimensional, row-major order of the tensor elements. bytes_contents repeated bytes Representation for BYTES data type. The size must match what is expected by the tensor's shape. The contents must be the flattened, one-dimensional, row-major order of the tensor elements. ModelInferRequest \u00b6 Field Type Description model_name string The name of the model to use for inferencing. model_version string The version of the model to use for inference. 
If not given the server will choose a version based on the model and internal policy. id string Optional identifier for the request. If specified will be returned in the response. parameters map ModelInferRequest.ParametersEntry Optional inference parameters. inputs repeated ModelInferRequest.InferInputTensor The input tensors for the inference. outputs repeated ModelInferRequest.InferRequestedOutputTensor The requested output tensors for the inference. Optional, if not specified all outputs produced by the model will be returned. raw_input_contents repeated bytes The data contained in an input tensor can be represented in \"raw\" bytes form or in the repeated type that matches the tensor's data type. To use the raw representation 'raw_input_contents' must be initialized with data for each tensor in the same order as 'inputs'. For each tensor, the size of this content must match what is expected by the tensor's shape and data type. The raw data must be the flattened, one-dimensional, row-major order of the tensor elements without any stride or padding between the elements. Note that the FP16 and BF16 data types must be represented as raw content as there is no specific data type for a 16-bit float type. If this field is specified then InferInputTensor::contents must not be specified for any input tensor. | ModelInferRequest.InferInputTensor \u00b6 An input tensor for an inference request. Field Type Description name string The tensor name. datatype string The tensor data type. shape repeated int64 The tensor shape. parameters map ModelInferRequest.InferInputTensor.ParametersEntry Optional inference input tensor parameters. contents InferTensorContents The tensor contents using a data-type format. This field must not be specified if \"raw\" tensor contents are being used for the inference request. 
ModelInferRequest.InferInputTensor.ParametersEntry \u00b6 Field Type Description key string N/A value InferParameter N/A ModelInferRequest.InferRequestedOutputTensor \u00b6 An output tensor requested for an inference request. Field Type Description name string The tensor name. parameters map ModelInferRequest.InferRequestedOutputTensor.ParametersEntry Optional requested output tensor parameters. ModelInferRequest.InferRequestedOutputTensor.ParametersEntry \u00b6 Field Type Description key string N/A value InferParameter N/A ModelInferRequest.ParametersEntry \u00b6 Field Type Description key string N/A value InferParameter N/A ModelInferResponse \u00b6 Field Type Description model_name string The name of the model used for inference. model_version string The version of the model used for inference. id string The id of the inference request if one was specified. parameters map ModelInferResponse.ParametersEntry Optional inference response parameters. outputs repeated ModelInferResponse.InferOutputTensor The output tensors holding inference results. raw_output_contents repeated bytes The data contained in an output tensor can be represented in \"raw\" bytes form or in the repeated type that matches the tensor's data type. To use the raw representation 'raw_output_contents' must be initialized with data for each tensor in the same order as 'outputs'. For each tensor, the size of this content must match what is expected by the tensor's shape and data type. The raw data must be the flattened, one-dimensional, row-major order of the tensor elements without any stride or padding between the elements. Note that the FP16 and BF16 data types must be represented as raw content as there is no specific data type for a 16-bit float type. If this field is specified then InferOutputTensor::contents must not be specified for any output tensor. | ModelInferResponse.InferOutputTensor \u00b6 An output tensor returned for an inference request. 
Field Type Description name string The tensor name. datatype string The tensor data type. shape repeated int64 The tensor shape. parameters map ModelInferResponse.InferOutputTensor.ParametersEntry Optional output tensor parameters. contents InferTensorContents The tensor contents using a data-type format. This field must not be specified if \"raw\" tensor contents are being used for the inference response. ModelInferResponse.InferOutputTensor.ParametersEntry \u00b6 Field Type Description key string N/A value InferParameter N/A ModelInferResponse.ParametersEntry \u00b6 Field Type Description key string N/A value InferParameter N/A ModelMetadataRequest \u00b6 Field Type Description name string The name of the model. version string The version of the model to check for readiness. If not given the server will choose a version based on the model and internal policy. ModelMetadataResponse \u00b6 Field Type Description name string The model name. versions repeated string The versions of the model available on the server. platform string The model's platform. See Platforms. inputs repeated ModelMetadataResponse.TensorMetadata The model's inputs. outputs repeated ModelMetadataResponse.TensorMetadata The model's outputs. ModelMetadataResponse.TensorMetadata \u00b6 Metadata for a tensor. Field Type Description name string The tensor name. datatype string The tensor data type. shape repeated int64 The tensor shape. A variable-size dimension is represented by a -1 value. ModelReadyRequest \u00b6 Field Type Description name string The name of the model to check for readiness. version string The version of the model to check for readiness. If not given the server will choose a version based on the model and internal policy. ModelReadyResponse \u00b6 Field Type Description ready bool True if the model is ready, false if not ready. ServerLiveRequest \u00b6 ServerLiveResponse \u00b6 Field Type Description live bool True if the inference server is live, false if not live. 
ServerMetadataRequest \u00b6 ServerMetadataResponse \u00b6 Field Type Description name string The server name. version string The server version. extensions repeated string The extensions supported by the server. ServerReadyRequest \u00b6 ServerReadyResponse \u00b6 Field Type Description ready bool True if the inference server is ready, false if not ready.","title":"Index"},{"location":"reference/v2_inference/#serverlive","text":"The ServerLive API indicates if the inference server is able to receive and respond to metadata and inference requests. rpc inference.GRPCInferenceService/ServerLive( ServerLiveRequest ) returns ServerLiveResponse","title":"ServerLive"},{"location":"reference/v2_inference/#serverready","text":"The ServerReady API indicates if the server is ready for inferencing. rpc inference.GRPCInferenceService/ServerReady( ServerReadyRequest ) returns ServerReadyResponse","title":"ServerReady"},{"location":"reference/v2_inference/#modelready","text":"The ModelReady API indicates if a specific model is ready for inferencing. rpc inference.GRPCInferenceService/ModelReady( ModelReadyRequest ) returns ModelReadyResponse","title":"ModelReady"},{"location":"reference/v2_inference/#servermetadata","text":"The ServerMetadata API provides information about the server. Errors are indicated by the google.rpc.Status returned for the request. The OK code indicates success and other codes indicate failure. rpc inference.GRPCInferenceService/ServerMetadata( ServerMetadataRequest ) returns ServerMetadataResponse","title":"ServerMetadata"},{"location":"reference/v2_inference/#modelmetadata","text":"The per-model metadata API provides information about a model. Errors are indicated by the google.rpc.Status returned for the request. The OK code indicates success and other codes indicate failure. 
rpc inference.GRPCInferenceService/ModelMetadata( ModelMetadataRequest ) returns ModelMetadataResponse","title":"ModelMetadata"},{"location":"reference/v2_inference/#modelinfer","text":"The ModelInfer API performs inference using the specified model. Errors are indicated by the google.rpc.Status returned for the request. The OK code indicates success and other codes indicate failure. rpc inference.GRPCInferenceService/ModelInfer( ModelInferRequest ) returns ModelInferResponse","title":"ModelInfer"},{"location":"reference/v2_inference/#messages","text":"","title":"Messages"},{"location":"reference/v2_inference/#inferparameter","text":"An inference parameter value. The Parameters message describes a \u201cname\u201d/\u201cvalue\u201d pair, where the \u201cname\u201d is the name of the parameter and the \u201cvalue\u201d is a boolean, integer, or string corresponding to the parameter. Field Type Description oneof parameter_choice.bool_param bool A boolean parameter value. oneof parameter_choice.int64_param int64 An int64 parameter value. oneof parameter_choice.string_param string A string parameter value.","title":"InferParameter"},{"location":"reference/v2_inference/#infertensorcontents","text":"The data contained in a tensor represented by the repeated type that matches the tensor's data type. Protobuf oneof is not used because oneofs cannot contain repeated fields. Field Type Description bool_contents repeated bool Representation for BOOL data type. The size must match what is expected by the tensor's shape. The contents must be the flattened, one-dimensional, row-major order of the tensor elements. int_contents repeated int32 Representation for INT8, INT16, and INT32 data types. The size must match what is expected by the tensor's shape. The contents must be the flattened, one-dimensional, row-major order of the tensor elements. int64_contents repeated int64 Representation for INT64 data types. The size must match what is expected by the tensor's shape. 
The contents must be the flattened, one-dimensional, row-major order of the tensor elements. uint_contents repeated uint32 Representation for UINT8, UINT16, and UINT32 data types. The size must match what is expected by the tensor's shape. The contents must be the flattened, one-dimensional, row-major order of the tensor elements. uint64_contents repeated uint64 Representation for UINT64 data types. The size must match what is expected by the tensor's shape. The contents must be the flattened, one-dimensional, row-major order of the tensor elements. fp32_contents repeated float Representation for FP32 data type. The size must match what is expected by the tensor's shape. The contents must be the flattened, one-dimensional, row-major order of the tensor elements. fp64_contents repeated double Representation for FP64 data type. The size must match what is expected by the tensor's shape. The contents must be the flattened, one-dimensional, row-major order of the tensor elements. bytes_contents repeated bytes Representation for BYTES data type. The size must match what is expected by the tensor's shape. The contents must be the flattened, one-dimensional, row-major order of the tensor elements.","title":"InferTensorContents"},{"location":"reference/v2_inference/#modelinferrequest","text":"Field Type Description model_name string The name of the model to use for inferencing. model_version string The version of the model to use for inference. If not given the server will choose a version based on the model and internal policy. id string Optional identifier for the request. If specified will be returned in the response. parameters map ModelInferRequest.ParametersEntry Optional inference parameters. inputs repeated ModelInferRequest.InferInputTensor The input tensors for the inference. outputs repeated ModelInferRequest.InferRequestedOutputTensor The requested output tensors for the inference. Optional, if not specified all outputs produced by the model will be returned. 
raw_input_contents repeated bytes The data contained in an input tensor can be represented in \"raw\" bytes form or in the repeated type that matches the tensor's data type. To use the raw representation 'raw_input_contents' must be initialized with data for each tensor in the same order as 'inputs'. For each tensor, the size of this content must match what is expected by the tensor's shape and data type. The raw data must be the flattened, one-dimensional, row-major order of the tensor elements without any stride or padding between the elements. Note that the FP16 and BF16 data types must be represented as raw content as there is no specific data type for a 16-bit float type. If this field is specified then InferInputTensor::contents must not be specified for any input tensor. |","title":"ModelInferRequest"},{"location":"reference/v2_inference/#modelinferrequestinferinputtensor","text":"An input tensor for an inference request. Field Type Description name string The tensor name. datatype string The tensor data type. shape repeated int64 The tensor shape. parameters map ModelInferRequest.InferInputTensor.ParametersEntry Optional inference input tensor parameters. contents InferTensorContents The tensor contents using a data-type format. This field must not be specified if \"raw\" tensor contents are being used for the inference request.","title":"ModelInferRequest.InferInputTensor"},{"location":"reference/v2_inference/#modelinferrequestinferinputtensorparametersentry","text":"Field Type Description key string N/A value InferParameter N/A","title":"ModelInferRequest.InferInputTensor.ParametersEntry"},{"location":"reference/v2_inference/#modelinferrequestinferrequestedoutputtensor","text":"An output tensor requested for an inference request. Field Type Description name string The tensor name. 
parameters map ModelInferRequest.InferRequestedOutputTensor.ParametersEntry Optional requested output tensor parameters.","title":"ModelInferRequest.InferRequestedOutputTensor"},{"location":"reference/v2_inference/#modelinferrequestinferrequestedoutputtensorparametersentry","text":"Field Type Description key string N/A value InferParameter N/A","title":"ModelInferRequest.InferRequestedOutputTensor.ParametersEntry"},{"location":"reference/v2_inference/#modelinferrequestparametersentry","text":"Field Type Description key string N/A value InferParameter N/A","title":"ModelInferRequest.ParametersEntry"},{"location":"reference/v2_inference/#modelinferresponse","text":"Field Type Description model_name string The name of the model used for inference. model_version string The version of the model used for inference. id string The id of the inference request if one was specified. parameters map ModelInferResponse.ParametersEntry Optional inference response parameters. outputs repeated ModelInferResponse.InferOutputTensor The output tensors holding inference results. raw_output_contents repeated bytes The data contained in an output tensor can be represented in \"raw\" bytes form or in the repeated type that matches the tensor's data type. To use the raw representation 'raw_output_contents' must be initialized with data for each tensor in the same order as 'outputs'. For each tensor, the size of this content must match what is expected by the tensor's shape and data type. The raw data must be the flattened, one-dimensional, row-major order of the tensor elements without any stride or padding between the elements. Note that the FP16 and BF16 data types must be represented as raw content as there is no specific data type for a 16-bit float type. If this field is specified then InferOutputTensor::contents must not be specified for any output tensor. 
|","title":"ModelInferResponse"},{"location":"reference/v2_inference/#modelinferresponseinferoutputtensor","text":"An output tensor returned for an inference request. Field Type Description name string The tensor name. datatype string The tensor data type. shape repeated int64 The tensor shape. parameters map ModelInferResponse.InferOutputTensor.ParametersEntry Optional output tensor parameters. contents InferTensorContents The tensor contents using a data-type format. This field must not be specified if \"raw\" tensor contents are being used for the inference response.","title":"ModelInferResponse.InferOutputTensor"},{"location":"reference/v2_inference/#modelinferresponseinferoutputtensorparametersentry","text":"Field Type Description key string N/A value InferParameter N/A","title":"ModelInferResponse.InferOutputTensor.ParametersEntry"},{"location":"reference/v2_inference/#modelinferresponseparametersentry","text":"Field Type Description key string N/A value InferParameter N/A","title":"ModelInferResponse.ParametersEntry"},{"location":"reference/v2_inference/#modelmetadatarequest","text":"Field Type Description name string The name of the model. version string The version of the model to check for readiness. If not given the server will choose a version based on the model and internal policy.","title":"ModelMetadataRequest"},{"location":"reference/v2_inference/#modelmetadataresponse","text":"Field Type Description name string The model name. versions repeated string The versions of the model available on the server. platform string The model's platform. See Platforms. inputs repeated ModelMetadataResponse.TensorMetadata The model's inputs. outputs repeated ModelMetadataResponse.TensorMetadata The model's outputs.","title":"ModelMetadataResponse"},{"location":"reference/v2_inference/#modelmetadataresponsetensormetadata","text":"Metadata for a tensor. Field Type Description name string The tensor name. datatype string The tensor data type. 
shape repeated int64 The tensor shape. A variable-size dimension is represented by a -1 value.","title":"ModelMetadataResponse.TensorMetadata"},{"location":"reference/v2_inference/#modelreadyrequest","text":"Field Type Description name string The name of the model to check for readiness. version string The version of the model to check for readiness. If not given the server will choose a version based on the model and internal policy.","title":"ModelReadyRequest"},{"location":"reference/v2_inference/#modelreadyresponse","text":"Field Type Description ready bool True if the model is ready, false if not ready.","title":"ModelReadyResponse"},{"location":"reference/v2_inference/#serverliverequest","text":"","title":"ServerLiveRequest"},{"location":"reference/v2_inference/#serverliveresponse","text":"Field Type Description live bool True if the inference server is live, false if not live.","title":"ServerLiveResponse"},{"location":"reference/v2_inference/#servermetadatarequest","text":"","title":"ServerMetadataRequest"},{"location":"reference/v2_inference/#servermetadataresponse","text":"Field Type Description name string The server name. version string The server version. 
extensions repeated string The extensions supported by the server.","title":"ServerMetadataResponse"},{"location":"reference/v2_inference/#serverreadyrequest","text":"","title":"ServerReadyRequest"},{"location":"reference/v2_inference/#serverreadyresponse","text":"Field Type Description ready bool True if the inference server is ready, false if not ready.","title":"ServerReadyResponse"},{"location":"reference/v2_inference/template/","text":"{{range $file := .Files}} {{range $service := .Services -}} {{range .Methods -}} {{.Name}} \u00b6 {{ .Description}} rpc {{$file.Package}}.{{$service.Name}}/{{.Name}}( {{.RequestLongType}} ) returns {{.ResponseLongType}} {{end}} {{end}} Messages \u00b6 {{range .Messages}} {{.LongName}} \u00b6 {{.Description}} {{if .HasFields}} | Field | Type | Description | | ----- | ---- | ----------- | {{range .Fields -}} | {{if .IsOneof}} oneof {{.OneofDecl}}.{{end}}{{.Name}} | {{if .IsMap}}map {{else if .Label}}{{.Label}} {{end}}{{.LongType}} | {{if .Description}}{{nobr .Description}}{{else}}N/A{{end}} | {{end}} {{end}} {{end}} {{end}}","title":"Template"},{"location":"reference/v2_inference/template/#name","text":"{{ .Description}} rpc {{$file.Package}}.{{$service.Name}}/{{.Name}}( {{.RequestLongType}} ) returns {{.ResponseLongType}} {{end}} {{end}}","title":"{{.Name}}"},{"location":"reference/v2_inference/template/#messages","text":"{{range .Messages}}","title":"Messages"},{"location":"reference/v2_inference/template/#longname","text":"{{.Description}} {{if .HasFields}} | Field | Type | Description | | ----- | ---- | ----------- | {{range .Fields -}} | {{if .IsOneof}} oneof {{.OneofDecl}}.{{end}}{{.Name}} | {{if .IsMap}}map {{else if .Label}}{{.Label}} {{end}}{{.LongType}} | {{if .Description}}{{nobr .Description}}{{else}}N/A{{end}} | {{end}} {{end}} {{end}} {{end}}","title":"{{.LongName}}"},{"location":"sdk_docs/sdk_doc/","text":"KServe Python SDK \u00b6 Python SDK for KServe controller plane client and data plane serving runtime API. 
Installation \u00b6 KServe Python SDK can be installed by pip or poetry . pip install \u00b6 pip install kserve Poetry \u00b6 Checkout KServe GitHub repository and Install via poetry . cd kserve/python/kserve poetry install KServe Serving Runtime API \u00b6 KServe's python serving runtime API implements the open inference protocol using FastAPI , see Serving Runtime API docs for more details. KServe Client API \u00b6 KServe's python client interacts with KServe control plane APIs for executing operations on a remote KServe cluster, such as creating, patching and deleting of a InferenceService instance. Getting Started \u00b6 Please see the Sample for Python SDK Client to get started. KServe Client API Reference \u00b6 Class Method Description KServeClient set_credentials Set Credentials KServeClient create Create InferenceService KServeClient get Get or watch the specified InferenceService or all InferenceServices in the namespace KServeClient patch Patch the specified InferenceService KServeClient replace Replace the specified InferenceService KServeClient delete Delete the specified InferenceService KServeClient wait_isvc_ready Wait for the InferenceService to be ready KServeClient is_isvc_ready Check if the InferenceService is ready Reference for Generated Data Models \u00b6 KnativeAddressable KnativeCondition KnativeURL KnativeVolatileTime NetUrlUserinfo V1beta1AIXExplainerSpec V1beta1AlibiExplainerSpec V1beta1Batcher V1beta1ComponentExtensionSpec V1beta1ComponentStatusSpec V1beta1CustomExplainer V1beta1CustomPredictor V1beta1CustomTransformer V1beta1ExplainerSpec V1beta1InferenceService V1beta1InferenceServiceList V1beta1InferenceServiceSpec V1beta1InferenceServiceStatus V1alpha1InferenceGraph V1alpha1InferenceGraphList V1alpha1InferenceGraphSpec V1alpha1InferenceGraphStatus V1beta1LightGBMSpec V1beta1LoggerSpec V1beta1ModelSpec V1beta1ModelStatus V1beta1ONNXRuntimeSpec V1beta1PaddleServerSpec V1beta1PMMLSpec V1beta1PodSpec V1beta1PredictorExtensionSpec 
V1beta1PredictorSpec V1beta1SKLearnSpec V1beta1TFServingSpec V1beta1TorchServeSpec V1beta1TransformerSpec V1beta1TritonSpec V1beta1XGBoostSpec","title":"Python Client SDK"},{"location":"sdk_docs/sdk_doc/#kserve-python-sdk","text":"Python SDK for KServe control plane client and data plane serving runtime API.","title":"KServe Python SDK"},{"location":"sdk_docs/sdk_doc/#installation","text":"KServe Python SDK can be installed by pip or poetry .","title":"Installation"},{"location":"sdk_docs/sdk_doc/#pip-install","text":"pip install kserve","title":"pip install"},{"location":"sdk_docs/sdk_doc/#poetry","text":"Checkout KServe GitHub repository and Install via poetry . cd kserve/python/kserve poetry install","title":"Poetry"},{"location":"sdk_docs/sdk_doc/#kserve-serving-runtime-api","text":"KServe's python serving runtime API implements the open inference protocol using FastAPI , see Serving Runtime API docs for more details.","title":"KServe Serving Runtime API"},{"location":"sdk_docs/sdk_doc/#kserve-client-api","text":"KServe's python client interacts with KServe control plane APIs for executing operations on a remote KServe cluster, such as creating, patching and deleting of a InferenceService instance.","title":"KServe Client API"},{"location":"sdk_docs/sdk_doc/#getting-started","text":"Please see the Sample for Python SDK Client to get started.","title":"Getting Started"},{"location":"sdk_docs/sdk_doc/#kserve-client-api-reference","text":"Class Method Description KServeClient set_credentials Set Credentials KServeClient create Create InferenceService KServeClient get Get or watch the specified InferenceService or all InferenceServices in the namespace KServeClient patch Patch the specified InferenceService KServeClient replace Replace the specified InferenceService KServeClient delete Delete the specified InferenceService KServeClient wait_isvc_ready Wait for the InferenceService to be ready KServeClient is_isvc_ready Check if the InferenceService is 
ready","title":"KServe Client API Reference"},{"location":"sdk_docs/sdk_doc/#reference-for-generated-data-models","text":"KnativeAddressable KnativeCondition KnativeURL KnativeVolatileTime NetUrlUserinfo V1beta1AIXExplainerSpec V1beta1AlibiExplainerSpec V1beta1Batcher V1beta1ComponentExtensionSpec V1beta1ComponentStatusSpec V1beta1CustomExplainer V1beta1CustomPredictor V1beta1CustomTransformer V1beta1ExplainerSpec V1beta1InferenceService V1beta1InferenceServiceList V1beta1InferenceServiceSpec V1beta1InferenceServiceStatus V1alpha1InferenceGraph V1alpha1InferenceGraphList V1alpha1InferenceGraphSpec V1alpha1InferenceGraphStatus V1beta1LightGBMSpec V1beta1LoggerSpec V1beta1ModelSpec V1beta1ModelStatus V1beta1ONNXRuntimeSpec V1beta1PaddleServerSpec V1beta1PMMLSpec V1beta1PodSpec V1beta1PredictorExtensionSpec V1beta1PredictorSpec V1beta1SKLearnSpec V1beta1TFServingSpec V1beta1TorchServeSpec V1beta1TransformerSpec V1beta1TritonSpec V1beta1XGBoostSpec","title":"Reference for Generated Data Models"},{"location":"sdk_docs/docs/KServeClient/","text":"KServeClient \u00b6 KServeClient(config_file=None, context=None, client_configuration=None, persist_config=True) User can loads authentication and cluster information from kube-config file and stores them in kubernetes.client.configuration. Parameters are as following: parameter Description config_file Name of the kube-config file. Defaults to ~/.kube/config . Note that for the case that the SDK is running in cluster and you want to operate KServe in another remote cluster, user must set config_file to load kube-config file explicitly, e.g. KServeClient(config_file=\"~/.kube/config\") . context Set the active context. If is set to None, current_context from config file will be used. client_configuration The kubernetes.client.Configuration to set configs to. persist_config If True, config file will be updated when changed (e.g GCP token refresh). 
The APIs for KServeClient are as following: Class Method Description KServeClient set_credentials Set Credentials KServeClient create Create InferenceService KServeClient get Get or watch the specified InferenceService or all InferenceServices in the namespace KServeClient patch Patch the specified InferenceService KServeClient replace Replace the specified InferenceService KServeClient delete Delete the specified InferenceService KServeClient wait_isvc_ready Wait for the InferenceService to be ready KServeClient is_isvc_ready Check if the InferenceService is ready set_credentials \u00b6 set_credentials(storage_type, namespace=None, credentials_file=None, service_account='kfserving-service-credentials', **kwargs): Create or update a Secret and Service Account for GCS and S3 for the provided credentials. Once the Service Account is applied, it may be used in the Service Account field of a InferenceService's V1beta1ModelSpec . Example \u00b6 Example for creating GCP credentials. from kserve import KServeClient kserve = KServeClient () kserve . set_credentials ( storage_type = 'GCS' , namespace = 'kubeflow' , credentials_file = '/tmp/gcp.json' , service_account = 'user_specified_sa_name' ) The API supports specifying a Service Account by service_account , or using default Service Account kfserving-service-credentials , if the Service Account does not exist, the API will create it and attach the created secret with the Service Account, if exists, only patch it to attach the created Secret. Example for creating S3 credentials. from kserve import KServeClient kserve = KServeClient () kserve . set_credentials ( storage_type = 'S3' , namespace = 'kubeflow' , credentials_file = '/tmp/awcredentials' , s3_profile = 'default' , s3_endpoint = 's3.us-west-amazonaws.com' , s3_region = 'us-west-2' , s3_use_https = '1' , s3_verify_ssl = '0' ) Example for creating Azure credentials. from kserve import KServeClient kserve = KServeClient () kserve . 
set_credentials ( storage_type = 'Azure' , namespace = 'kubeflow' , credentials_file = '/path/azure_credentials.json' ) The created or patched Secret and Service Account will be shown as following: INFO:kfserving.api.set_credentials:Created Secret: kfserving-secret-6tv6l in namespace kubeflow INFO:kfserving.api.set_credentials:Created (or Patched) Service account: kfserving-service-credentials in namespace kubeflow Parameters \u00b6 Name Type Storage Type Description storage_type str All Required. Valid values: GCS, S3 or Azure namespace str All Optional. The kubernetes namespace. Defaults to current or default namespace. credentials_file str All Optional. The path for the credentials file. The default file for GCS is ~/.config/gcloud/application_default_credentials.json , see the instructions on creating the GCS credentials file. For S3 is ~/.aws/credentials , see the instructions on creating the S3 credentials file. For Azure is ~/.azure/azure_credentials.json , see the instructions on creating the Azure credentials file. service_account str All Optional. The name of service account. Supports specifying the service_account , or using default Service Account kfserving-service-credentials . If the Service Account does not exist, the API will create it and attach the created Secret with the Service Account, if exists, only patch it to attach the created Secret. s3_endpoint str S3 only Optional. The S3 endpoint. s3_region str S3 only Optional. The S3 region By default, regional endpoint is used for S3. s3_use_https str S3 only Optional. HTTPS is used to access S3 by default, unless s3_use_https=0 s3_verify_ssl str S3 only Optional. 
If HTTPS is used, SSL verification could be disabled with s3_verify_ssl=0 create \u00b6 create(inferenceservice, namespace=None, watch=False, timeout_seconds=600) Create the provided InferenceService in the specified namespace Example \u00b6 from kubernetes import client from kserve import KServeClient from kserve import constants from kserve import V1beta1PredictorSpec from kserve import V1beta1TFServingSpec from kserve import V1beta1InferenceServiceSpec from kserve import V1beta1InferenceService default_model_spec = V1beta1InferenceServiceSpec ( predictor = V1beta1PredictorSpec ( tensorflow = V1beta1TFServingSpec ( storage_uri = 'gs://kfserving-examples/models/tensorflow/flowers' ))) isvc = V1beta1InferenceService ( api_version = constants . KSERVE_V1BETA1 , kind = constants . KSERVE_KIND , metadata = client . V1ObjectMeta ( name = 'flower-sample' , namespace = 'kserve-models' ), spec = default_model_spec ) kserve = KServeClient () kserve . create ( isvc ) # The API also supports watching the created InferenceService status till it's READY. # kserve.create(isvc, watch=True) Parameters \u00b6 Name Type Description Notes inferenceservice V1beta1InferenceService InferenceService defination Required namespace str Namespace for InferenceService deploying to. If the namespace is not defined, will align with InferenceService definition, or use current or default namespace if namespace is not specified in InferenceService definition. Optional watch bool Watch the created InferenceService if True , otherwise will return the created InferenceService object. Stop watching if InferenceService reaches the optional specified timeout_seconds or once the InferenceService overall status READY is True . Optional timeout_seconds int Timeout seconds for watching. Defaults to 600. 
Optional Return type \u00b6 object get \u00b6 get(name=None, namespace=None, watch=False, timeout_seconds=600) Get the created InferenceService in the specified namespace Example \u00b6 from kserve import KServeClient kserve = KServeClient () kserve . get ( 'flower-sample' , namespace = 'kubeflow' ) The API also support watching the specified InferenceService or all InferenceService in the namespace. from kserve import KServeClient kserve = KServeClient () kserve . get ( 'flower-sample' , namespace = 'kubeflow' , watch = True , timeout_seconds = 120 ) The outputs will be as following. Stop watching if InferenceService reaches the optional specified timeout_seconds or once the InferenceService overall status READY is True . NAME READY DEFAULT_TRAFFIC CANARY_TRAFFIC URL flower-sample Unknown http://flower-sample.kubeflow.example.com flower-sample Unknown 90 10 http://flower-sample.kubeflow.example.com flower-sample True 90 10 http://flower-sample.kubeflow.example.com Parameters \u00b6 Name Type Description Notes name str InferenceService name. If the name is not specified, it will get or watch all InferenceServices in the namespace. Optional. namespace str The InferenceService's namespace. Defaults to current or default namespace. Optional watch bool Watch the specified InferenceService or all InferenceService in the namespace if True , otherwise will return object for the specified InferenceService or all InferenceService in the namespace. Stop watching if InferenceService reaches the optional specified timeout_seconds or once the speficed InferenceService overall status READY is True (Only if the name is speficed). Optional timeout_seconds int Timeout seconds for watching. Defaults to 600. Optional Return type \u00b6 object patch \u00b6 patch(name, inferenceservice, namespace=None, watch=False, timeout_seconds=600) Patch the created InferenceService in the specified namespace. 
Note that if you want to set the field from existing value to None , patch API may not work, you need to use replace API to remove the field value. Example \u00b6 from kubernetes import client from kserve import constants from kserve import V1beta1PredictorSpec from kserve import V1beta1TFServingSpec from kserve import V1beta1InferenceServiceSpec from kserve import V1beta1InferenceService from kserve import KServeClient service_name = 'flower-sample' kserve = KServeClient () default_model_spec = V1beta1InferenceServiceSpec ( predictor = V1beta1PredictorSpec ( tensorflow = V1beta1TFServingSpec ( storage_uri = 'gs://kfserving-examples/models/tensorflow/flowers' ))) isvc = V1beta1InferenceService ( api_version = constants . KSERVE_V1BETA1 , kind = constants . KSERVE_KIND , metadata = client . V1ObjectMeta ( name = service_name , namespace = 'kserve-models' ), spec = default_model_spec ) kserve . create ( isvc ) kserve . wait_isvc_ready ( service_name , namespace = 'kserve-models' ) canary_model_spec = V1beta1InferenceServiceSpec ( predictor = V1beta1PredictorSpec ( canary_traffic_percent = 10 , tensorflow = V1beta1TFServingSpec ( storage_uri = 'gs://kfserving-examples/models/tensorflow/flowers-2' ))) isvc = V1beta1InferenceService ( api_version = constants . KSERVE_V1BETA1 , kind = constants . KSERVE_KIND , metadata = client . V1ObjectMeta ( name = 'flower-sample' , namespace = 'kserve-models' ), spec = canary_model_spec ) kserve . patch ( service_name , isvc ) # The API also supports watching the patached InferenceService status till it's READY. # kserve.patch('flower-sample', isvc, watch=True) Parameters \u00b6 Name Type Description Notes inferenceservice V1beta1InferenceService InferenceService defination Required namespace str The InferenceService's namespace for patching. If the namespace is not defined, will align with InferenceService definition, or use current or default namespace if namespace is not specified in InferenceService definition. 
Optional watch bool Watch the patched InferenceService if True , otherwise will return the patched InferenceService object. Stop watching if InferenceService reaches the optional specified timeout_seconds or once the InferenceService overall status READY is True . Optional timeout_seconds int Timeout seconds for watching. Defaults to 600. Optional Return type \u00b6 object replace \u00b6 replace(name, inferenceservice, namespace=None, watch=False, timeout_seconds=600) Replace the created InferenceService in the specified namespace. Generally use the replace API to update whole InferenceService or remove a field such as canary or other components of the InferenceService. Example \u00b6 from kubernetes import client from kserve import constants from kserve import V1beta1PredictorSpec from kserve import V1beta1TFServingSpec from kserve import V1beta1InferenceServiceSpec from kserve import V1beta1InferenceService from kserve import KServeClient service_name = 'flower-sample' kserve = KServeClient () default_model_spec = V1beta1InferenceServiceSpec ( predictor = V1beta1PredictorSpec ( tensorflow = V1beta1TFServingSpec ( storage_uri = 'gs://kfserving-examples/models/tensorflow/flowers' ))) isvc = V1beta1InferenceService ( api_version = constants . KSERVE_V1BETA1 , kind = constants . KSERVE_KIND , metadata = client . V1ObjectMeta ( name = service_name , namespace = 'kserve-models' ), spec = default_model_spec ) kserve . create ( isvc ) kserve . wait_isvc_ready ( service_name , namespace = 'kserve-models' ) canary_model_spec = V1beta1InferenceServiceSpec ( predictor = V1beta1PredictorSpec ( canary_traffic_percent = 0 , tensorflow = V1beta1TFServingSpec ( storage_uri = 'gs://kfserving-examples/models/tensorflow/flowers-2' ))) isvc = V1beta1InferenceService ( api_version = constants . KSERVE_V1BETA1 , kind = constants . KSERVE_KIND , metadata = client . V1ObjectMeta ( name = service_name , namespace = 'kserve-models' ), spec = canary_model_spec ) kserve . 
replace ( service_name , isvc ) # The API also supports watching the replaced InferenceService status till it's READY. # kserve.replace('flower-sample', isvc, watch=True) Parameters \u00b6 Name Type Description Notes inferenceservice V1beta1InferenceService InferenceService defination Required namespace str The InferenceService's namespace. If the namespace is not defined, will align with InferenceService definition, or use current or default namespace if namespace is not specified in InferenceService definition. Optional watch bool Watch the patched InferenceService if True , otherwise will return the replaced InferenceService object. Stop watching if InferenceService reaches the optional specified timeout_seconds or once the InferenceService overall status READY is True . Optional timeout_seconds int Timeout seconds for watching. Defaults to 600. Optional Return type \u00b6 object delete \u00b6 delete(name, namespace=None) Delete the created InferenceService in the specified namespace Example \u00b6 from kserve import KServeClient kserve = KServeClient () kserve . delete ( 'flower-sample' , namespace = 'kubeflow' ) Parameters \u00b6 Name Type Description Notes name str InferenceService name namespace str The inferenceservice's namespace. Defaults to current or default namespace. Optional Return type \u00b6 object wait_isvc_ready \u00b6 wait_isvc_ready(name, namespace=None, watch=False, timeout_seconds=600, polling_interval=10): Wait for the InferenceService to be ready. Example \u00b6 from kserve import KServeClient kserve = KServeClient () kserve . wait_isvc_ready ( 'flower-sample' , namespace = 'kubeflow' ) Parameters \u00b6 Name Type Description Notes name str The InferenceService name. namespace str The InferenceService namespace. Defaults to current or default namespace. Optional watch bool Watch the specified InferenceService if True . Optional timeout_seconds int How long to wait for the InferenceService, default wait for 600 seconds. 
Optional polling_interval int How often to poll for the status of the InferenceService. Optional Return type \u00b6 object is_isvc_ready \u00b6 is_isvc_ready(name, namespace=None) Returns True if the InferenceService is ready; false otherwise. Example \u00b6 from kserve import KServeClient kserve = KServeClient () kserve . is_isvc_ready ( 'flower-sample' , namespace = 'kubeflow' ) Parameters \u00b6 Name Type Description Notes name str The InferenceService name. namespace str The InferenceService namespace. Defaults to current or default namespace. Optional Return type \u00b6 Bool","title":"KServeClient"},{"location":"sdk_docs/docs/KServeClient/#kserveclient","text":"KServeClient(config_file=None, context=None, client_configuration=None, persist_config=True) User can loads authentication and cluster information from kube-config file and stores them in kubernetes.client.configuration. Parameters are as following: parameter Description config_file Name of the kube-config file. Defaults to ~/.kube/config . Note that for the case that the SDK is running in cluster and you want to operate KServe in another remote cluster, user must set config_file to load kube-config file explicitly, e.g. KServeClient(config_file=\"~/.kube/config\") . context Set the active context. If is set to None, current_context from config file will be used. client_configuration The kubernetes.client.Configuration to set configs to. persist_config If True, config file will be updated when changed (e.g GCP token refresh). 
The APIs for KServeClient are as following: Class Method Description KServeClient set_credentials Set Credentials KServeClient create Create InferenceService KServeClient get Get or watch the specified InferenceService or all InferenceServices in the namespace KServeClient patch Patch the specified InferenceService KServeClient replace Replace the specified InferenceService KServeClient delete Delete the specified InferenceService KServeClient wait_isvc_ready Wait for the InferenceService to be ready KServeClient is_isvc_ready Check if the InferenceService is ready","title":"KServeClient"},{"location":"sdk_docs/docs/KServeClient/#set_credentials","text":"set_credentials(storage_type, namespace=None, credentials_file=None, service_account='kfserving-service-credentials', **kwargs): Create or update a Secret and Service Account for GCS and S3 for the provided credentials. Once the Service Account is applied, it may be used in the Service Account field of a InferenceService's V1beta1ModelSpec .","title":"set_credentials"},{"location":"sdk_docs/docs/KServeClient/#example","text":"Example for creating GCP credentials. from kserve import KServeClient kserve = KServeClient () kserve . set_credentials ( storage_type = 'GCS' , namespace = 'kubeflow' , credentials_file = '/tmp/gcp.json' , service_account = 'user_specified_sa_name' ) The API supports specifying a Service Account by service_account , or using default Service Account kfserving-service-credentials , if the Service Account does not exist, the API will create it and attach the created secret with the Service Account, if exists, only patch it to attach the created Secret. Example for creating S3 credentials. from kserve import KServeClient kserve = KServeClient () kserve . 
set_credentials ( storage_type = 'S3' , namespace = 'kubeflow' , credentials_file = '/tmp/awcredentials' , s3_profile = 'default' , s3_endpoint = 's3.us-west-amazonaws.com' , s3_region = 'us-west-2' , s3_use_https = '1' , s3_verify_ssl = '0' ) Example for creating Azure credentials. from kserve import KServeClient kserve = KServeClient () kserve . set_credentials ( storage_type = 'Azure' , namespace = 'kubeflow' , credentials_file = '/path/azure_credentials.json' ) The created or patched Secret and Service Account will be shown as following: INFO:kfserving.api.set_credentials:Created Secret: kfserving-secret-6tv6l in namespace kubeflow INFO:kfserving.api.set_credentials:Created (or Patched) Service account: kfserving-service-credentials in namespace kubeflow","title":"Example"},{"location":"sdk_docs/docs/KServeClient/#parameters","text":"Name Type Storage Type Description storage_type str All Required. Valid values: GCS, S3 or Azure namespace str All Optional. The kubernetes namespace. Defaults to current or default namespace. credentials_file str All Optional. The path for the credentials file. The default file for GCS is ~/.config/gcloud/application_default_credentials.json , see the instructions on creating the GCS credentials file. For S3 is ~/.aws/credentials , see the instructions on creating the S3 credentials file. For Azure is ~/.azure/azure_credentials.json , see the instructions on creating the Azure credentials file. service_account str All Optional. The name of service account. Supports specifying the service_account , or using default Service Account kfserving-service-credentials . If the Service Account does not exist, the API will create it and attach the created Secret with the Service Account, if exists, only patch it to attach the created Secret. s3_endpoint str S3 only Optional. The S3 endpoint. s3_region str S3 only Optional. The S3 region By default, regional endpoint is used for S3. s3_use_https str S3 only Optional. 
HTTPS is used to access S3 by default, unless s3_use_https=0 s3_verify_ssl str S3 only Optional. If HTTPS is used, SSL verification could be disabled with s3_verify_ssl=0","title":"Parameters"},{"location":"sdk_docs/docs/KServeClient/#create","text":"create(inferenceservice, namespace=None, watch=False, timeout_seconds=600) Create the provided InferenceService in the specified namespace","title":"create"},{"location":"sdk_docs/docs/KServeClient/#example_1","text":"from kubernetes import client from kserve import KServeClient from kserve import constants from kserve import V1beta1PredictorSpec from kserve import V1beta1TFServingSpec from kserve import V1beta1InferenceServiceSpec from kserve import V1beta1InferenceService default_model_spec = V1beta1InferenceServiceSpec ( predictor = V1beta1PredictorSpec ( tensorflow = V1beta1TFServingSpec ( storage_uri = 'gs://kfserving-examples/models/tensorflow/flowers' ))) isvc = V1beta1InferenceService ( api_version = constants . KSERVE_V1BETA1 , kind = constants . KSERVE_KIND , metadata = client . V1ObjectMeta ( name = 'flower-sample' , namespace = 'kserve-models' ), spec = default_model_spec ) kserve = KServeClient () kserve . create ( isvc ) # The API also supports watching the created InferenceService status till it's READY. # kserve.create(isvc, watch=True)","title":"Example"},{"location":"sdk_docs/docs/KServeClient/#parameters_1","text":"Name Type Description Notes inferenceservice V1beta1InferenceService InferenceService defination Required namespace str Namespace for InferenceService deploying to. If the namespace is not defined, will align with InferenceService definition, or use current or default namespace if namespace is not specified in InferenceService definition. Optional watch bool Watch the created InferenceService if True , otherwise will return the created InferenceService object. 
Stop watching if InferenceService reaches the optional specified timeout_seconds or once the InferenceService overall status READY is True . Optional timeout_seconds int Timeout seconds for watching. Defaults to 600. Optional","title":"Parameters"},{"location":"sdk_docs/docs/KServeClient/#return-type","text":"object","title":"Return type"},{"location":"sdk_docs/docs/KServeClient/#get","text":"get(name=None, namespace=None, watch=False, timeout_seconds=600) Get the created InferenceService in the specified namespace","title":"get"},{"location":"sdk_docs/docs/KServeClient/#example_2","text":"from kserve import KServeClient kserve = KServeClient () kserve . get ( 'flower-sample' , namespace = 'kubeflow' ) The API also supports watching the specified InferenceService or all InferenceService in the namespace. from kserve import KServeClient kserve = KServeClient () kserve . get ( 'flower-sample' , namespace = 'kubeflow' , watch = True , timeout_seconds = 120 ) The outputs will be as following. Stop watching if InferenceService reaches the optional specified timeout_seconds or once the InferenceService overall status READY is True . NAME READY DEFAULT_TRAFFIC CANARY_TRAFFIC URL flower-sample Unknown http://flower-sample.kubeflow.example.com flower-sample Unknown 90 10 http://flower-sample.kubeflow.example.com flower-sample True 90 10 http://flower-sample.kubeflow.example.com","title":"Example"},{"location":"sdk_docs/docs/KServeClient/#parameters_2","text":"Name Type Description Notes name str InferenceService name. If the name is not specified, it will get or watch all InferenceServices in the namespace. Optional. namespace str The InferenceService's namespace. Defaults to current or default namespace. Optional watch bool Watch the specified InferenceService or all InferenceService in the namespace if True , otherwise will return object for the specified InferenceService or all InferenceService in the namespace. 
Stop watching if InferenceService reaches the optional specified timeout_seconds or once the specified InferenceService overall status READY is True (Only if the name is specified). Optional timeout_seconds int Timeout seconds for watching. Defaults to 600. Optional","title":"Parameters"},{"location":"sdk_docs/docs/KServeClient/#return-type_1","text":"object","title":"Return type"},{"location":"sdk_docs/docs/KServeClient/#patch","text":"patch(name, inferenceservice, namespace=None, watch=False, timeout_seconds=600) Patch the created InferenceService in the specified namespace. Note that if you want to set the field from existing value to None , patch API may not work, you need to use replace API to remove the field value.","title":"patch"},{"location":"sdk_docs/docs/KServeClient/#example_3","text":"from kubernetes import client from kserve import constants from kserve import V1beta1PredictorSpec from kserve import V1beta1TFServingSpec from kserve import V1beta1InferenceServiceSpec from kserve import V1beta1InferenceService from kserve import KServeClient service_name = 'flower-sample' kserve = KServeClient () default_model_spec = V1beta1InferenceServiceSpec ( predictor = V1beta1PredictorSpec ( tensorflow = V1beta1TFServingSpec ( storage_uri = 'gs://kfserving-examples/models/tensorflow/flowers' ))) isvc = V1beta1InferenceService ( api_version = constants . KSERVE_V1BETA1 , kind = constants . KSERVE_KIND , metadata = client . V1ObjectMeta ( name = service_name , namespace = 'kserve-models' ), spec = default_model_spec ) kserve . create ( isvc ) kserve . wait_isvc_ready ( service_name , namespace = 'kserve-models' ) canary_model_spec = V1beta1InferenceServiceSpec ( predictor = V1beta1PredictorSpec ( canary_traffic_percent = 10 , tensorflow = V1beta1TFServingSpec ( storage_uri = 'gs://kfserving-examples/models/tensorflow/flowers-2' ))) isvc = V1beta1InferenceService ( api_version = constants . KSERVE_V1BETA1 , kind = constants . KSERVE_KIND , metadata = client . 
V1ObjectMeta ( name = 'flower-sample' , namespace = 'kserve-models' ), spec = canary_model_spec ) kserve . patch ( service_name , isvc ) # The API also supports watching the patched InferenceService status till it's READY. # kserve.patch('flower-sample', isvc, watch=True)","title":"Example"},{"location":"sdk_docs/docs/KServeClient/#parameters_3","text":"Name Type Description Notes inferenceservice V1beta1InferenceService InferenceService definition Required namespace str The InferenceService's namespace for patching. If the namespace is not defined, will align with InferenceService definition, or use current or default namespace if namespace is not specified in InferenceService definition. Optional watch bool Watch the patched InferenceService if True , otherwise will return the patched InferenceService object. Stop watching if InferenceService reaches the optional specified timeout_seconds or once the InferenceService overall status READY is True . Optional timeout_seconds int Timeout seconds for watching. Defaults to 600. Optional","title":"Parameters"},{"location":"sdk_docs/docs/KServeClient/#return-type_2","text":"object","title":"Return type"},{"location":"sdk_docs/docs/KServeClient/#replace","text":"replace(name, inferenceservice, namespace=None, watch=False, timeout_seconds=600) Replace the created InferenceService in the specified namespace. 
Generally use the replace API to update whole InferenceService or remove a field such as canary or other components of the InferenceService.","title":"replace"},{"location":"sdk_docs/docs/KServeClient/#example_4","text":"from kubernetes import client from kserve import constants from kserve import V1beta1PredictorSpec from kserve import V1beta1TFServingSpec from kserve import V1beta1InferenceServiceSpec from kserve import V1beta1InferenceService from kserve import KServeClient service_name = 'flower-sample' kserve = KServeClient () default_model_spec = V1beta1InferenceServiceSpec ( predictor = V1beta1PredictorSpec ( tensorflow = V1beta1TFServingSpec ( storage_uri = 'gs://kfserving-examples/models/tensorflow/flowers' ))) isvc = V1beta1InferenceService ( api_version = constants . KSERVE_V1BETA1 , kind = constants . KSERVE_KIND , metadata = client . V1ObjectMeta ( name = service_name , namespace = 'kserve-models' ), spec = default_model_spec ) kserve . create ( isvc ) kserve . wait_isvc_ready ( service_name , namespace = 'kserve-models' ) canary_model_spec = V1beta1InferenceServiceSpec ( predictor = V1beta1PredictorSpec ( canary_traffic_percent = 0 , tensorflow = V1beta1TFServingSpec ( storage_uri = 'gs://kfserving-examples/models/tensorflow/flowers-2' ))) isvc = V1beta1InferenceService ( api_version = constants . KSERVE_V1BETA1 , kind = constants . KSERVE_KIND , metadata = client . V1ObjectMeta ( name = service_name , namespace = 'kserve-models' ), spec = canary_model_spec ) kserve . replace ( service_name , isvc ) # The API also supports watching the replaced InferenceService status till it's READY. # kserve.replace('flower-sample', isvc, watch=True)","title":"Example"},{"location":"sdk_docs/docs/KServeClient/#parameters_4","text":"Name Type Description Notes inferenceservice V1beta1InferenceService InferenceService defination Required namespace str The InferenceService's namespace. 
If the namespace is not defined, will align with InferenceService definition, or use current or default namespace if namespace is not specified in InferenceService definition. Optional watch bool Watch the patched InferenceService if True , otherwise will return the replaced InferenceService object. Stop watching if InferenceService reaches the optional specified timeout_seconds or once the InferenceService overall status READY is True . Optional timeout_seconds int Timeout seconds for watching. Defaults to 600. Optional","title":"Parameters"},{"location":"sdk_docs/docs/KServeClient/#return-type_3","text":"object","title":"Return type"},{"location":"sdk_docs/docs/KServeClient/#delete","text":"delete(name, namespace=None) Delete the created InferenceService in the specified namespace","title":"delete"},{"location":"sdk_docs/docs/KServeClient/#example_5","text":"from kserve import KServeClient kserve = KServeClient () kserve . delete ( 'flower-sample' , namespace = 'kubeflow' )","title":"Example"},{"location":"sdk_docs/docs/KServeClient/#parameters_5","text":"Name Type Description Notes name str InferenceService name namespace str The inferenceservice's namespace. Defaults to current or default namespace. Optional","title":"Parameters"},{"location":"sdk_docs/docs/KServeClient/#return-type_4","text":"object","title":"Return type"},{"location":"sdk_docs/docs/KServeClient/#wait_isvc_ready","text":"wait_isvc_ready(name, namespace=None, watch=False, timeout_seconds=600, polling_interval=10): Wait for the InferenceService to be ready.","title":"wait_isvc_ready"},{"location":"sdk_docs/docs/KServeClient/#example_6","text":"from kserve import KServeClient kserve = KServeClient () kserve . wait_isvc_ready ( 'flower-sample' , namespace = 'kubeflow' )","title":"Example"},{"location":"sdk_docs/docs/KServeClient/#parameters_6","text":"Name Type Description Notes name str The InferenceService name. namespace str The InferenceService namespace. 
Defaults to current or default namespace. Optional watch bool Watch the specified InferenceService if True . Optional timeout_seconds int How long to wait for the InferenceService, default wait for 600 seconds. Optional polling_interval int How often to poll for the status of the InferenceService. Optional","title":"Parameters"},{"location":"sdk_docs/docs/KServeClient/#return-type_5","text":"object","title":"Return type"},{"location":"sdk_docs/docs/KServeClient/#is_isvc_ready","text":"is_isvc_ready(name, namespace=None) Returns True if the InferenceService is ready; false otherwise.","title":"is_isvc_ready"},{"location":"sdk_docs/docs/KServeClient/#example_7","text":"from kserve import KServeClient kserve = KServeClient () kserve . is_isvc_ready ( 'flower-sample' , namespace = 'kubeflow' )","title":"Example"},{"location":"sdk_docs/docs/KServeClient/#parameters_7","text":"Name Type Description Notes name str The InferenceService name. namespace str The InferenceService namespace. Defaults to current or default namespace. Optional","title":"Parameters"},{"location":"sdk_docs/docs/KServeClient/#return-type_6","text":"Bool","title":"Return type"},{"location":"sdk_docs/docs/KnativeAddressable/","text":"KnativeAddressable \u00b6 Properties \u00b6 Name Type Description Notes url KnativeURL [optional] [Back to Model list] [Back to API list] [Back to README]","title":"KnativeAddressable"},{"location":"sdk_docs/docs/KnativeAddressable/#knativeaddressable","text":"","title":"KnativeAddressable"},{"location":"sdk_docs/docs/KnativeAddressable/#properties","text":"Name Type Description Notes url KnativeURL [optional] [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/KnativeCondition/","text":"KnativeCondition \u00b6 Conditions defines a readiness condition for a Knative resource. 
See: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#typical-status-properties Properties \u00b6 Name Type Description Notes last_transition_time KnativeVolatileTime LastTransitionTime is the last time the condition transitioned from one status to another. We use VolatileTime in place of metav1.Time to exclude this from creating equality.Semantic differences (all other things held constant). [optional] message str A human readable message indicating details about the transition. [optional] reason str The reason for the condition's last transition. [optional] severity str Severity with which to treat failures of this type of condition. When this is not specified, it defaults to Error. [optional] status str Status of the condition, one of True, False, Unknown. type str Type of condition. [Back to Model list] [Back to API list] [Back to README]","title":"KnativeCondition"},{"location":"sdk_docs/docs/KnativeCondition/#knativecondition","text":"Conditions defines a readiness condition for a Knative resource. See: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#typical-status-properties","title":"KnativeCondition"},{"location":"sdk_docs/docs/KnativeCondition/#properties","text":"Name Type Description Notes last_transition_time KnativeVolatileTime LastTransitionTime is the last time the condition transitioned from one status to another. We use VolatileTime in place of metav1.Time to exclude this from creating equality.Semantic differences (all other things held constant). [optional] message str A human readable message indicating details about the transition. [optional] reason str The reason for the condition's last transition. [optional] severity str Severity with which to treat failures of this type of condition. When this is not specified, it defaults to Error. [optional] status str Status of the condition, one of True, False, Unknown. 
type str Type of condition. [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/KnativeStatus/","text":"KnativeStatus \u00b6 Properties \u00b6 Name Type Description Notes conditions list[KnativeCondition] Conditions the latest available observations of a resource's current state. [optional] observed_generation int ObservedGeneration is the 'Generation' of the Service that was last processed by the controller. [optional] [Back to Model list] [Back to API list] [Back to README]","title":"KnativeStatus"},{"location":"sdk_docs/docs/KnativeStatus/#knativestatus","text":"","title":"KnativeStatus"},{"location":"sdk_docs/docs/KnativeStatus/#properties","text":"Name Type Description Notes conditions list[KnativeCondition] Conditions the latest available observations of a resource's current state. [optional] observed_generation int ObservedGeneration is the 'Generation' of the Service that was last processed by the controller. [optional] [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/KnativeURL/","text":"KnativeURL \u00b6 URL is an alias of url.URL. It has custom json marshal methods that enable it to be used in K8s CRDs such that the CRD resource will have the URL but operator code can can work with url.URL struct Properties \u00b6 Name Type Description Notes force_query bool encoded path hint (see EscapedPath method) fragment str encoded query values, without '?' host str username and password information opaque str path str host or host:port raw_path str path (relative paths may omit leading slash) raw_query str append a query ('?') even if RawQuery is empty scheme str user NetUrlUserinfo encoded opaque data [Back to Model list] [Back to API list] [Back to README]","title":"KnativeURL"},{"location":"sdk_docs/docs/KnativeURL/#knativeurl","text":"URL is an alias of url.URL. 
It has custom json marshal methods that enable it to be used in K8s CRDs such that the CRD resource will have the URL but operator code can can work with url.URL struct","title":"KnativeURL"},{"location":"sdk_docs/docs/KnativeURL/#properties","text":"Name Type Description Notes force_query bool encoded path hint (see EscapedPath method) fragment str encoded query values, without '?' host str username and password information opaque str path str host or host:port raw_path str path (relative paths may omit leading slash) raw_query str append a query ('?') even if RawQuery is empty scheme str user NetUrlUserinfo encoded opaque data [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/KnativeVolatileTime/","text":"KnativeVolatileTime \u00b6 VolatileTime wraps metav1.Time Properties \u00b6 Name Type Description Notes time datetime [Back to Model list] [Back to API list] [Back to README]","title":"KnativeVolatileTime"},{"location":"sdk_docs/docs/KnativeVolatileTime/#knativevolatiletime","text":"VolatileTime wraps metav1.Time","title":"KnativeVolatileTime"},{"location":"sdk_docs/docs/KnativeVolatileTime/#properties","text":"Name Type Description Notes time datetime [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/NetUrlUserinfo/","text":"NetUrlUserinfo \u00b6 Properties \u00b6 Name Type Description Notes password str password_set bool username str [Back to Model list] [Back to API list] [Back to README]","title":"NetUrlUserinfo"},{"location":"sdk_docs/docs/NetUrlUserinfo/#neturluserinfo","text":"","title":"NetUrlUserinfo"},{"location":"sdk_docs/docs/NetUrlUserinfo/#properties","text":"Name Type Description Notes password str password_set bool username str [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/V1Time/","text":"V1Time \u00b6 Properties \u00b6 Name Type Description Notes [Back to Model list] [Back 
to API list] [Back to README]","title":"V1Time"},{"location":"sdk_docs/docs/V1Time/#v1time","text":"","title":"V1Time"},{"location":"sdk_docs/docs/V1Time/#properties","text":"Name Type Description Notes [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/V1alpha1BuiltInAdapter/","text":"V1alpha1BuiltInAdapter \u00b6 Properties \u00b6 Name Type Description Notes env list[V1EnvVar] Environment variables used to control other aspects of the built-in adapter's behaviour (uncommon) [optional] mem_buffer_bytes int Fixed memory overhead to subtract from runtime container's memory allocation to determine model capacity [optional] model_loading_timeout_millis int Timeout for model loading operations in milliseconds [optional] runtime_management_port int Port which the runtime server listens for model management requests [optional] server_type str ServerType must be one of the supported built-in types such as "triton" or "mlserver", and the runtime's container must have the same name [optional] [Back to Model list] [Back to API list] [Back to README]","title":"V1alpha1BuiltInAdapter"},{"location":"sdk_docs/docs/V1alpha1BuiltInAdapter/#v1alpha1builtinadapter","text":"","title":"V1alpha1BuiltInAdapter"},{"location":"sdk_docs/docs/V1alpha1BuiltInAdapter/#properties","text":"Name Type Description Notes env list[V1EnvVar] Environment variables used to control other aspects of the built-in adapter's behaviour (uncommon) [optional] mem_buffer_bytes int Fixed memory overhead to subtract from runtime container's memory allocation to determine model capacity [optional] model_loading_timeout_millis int Timeout for model loading operations in milliseconds [optional] runtime_management_port int Port which the runtime server listens for model management requests [optional] server_type str ServerType must be one of the supported built-in types such as "triton" or "mlserver", and the runtime's container must have the same name 
[optional] [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/V1alpha1ClusterServingRuntime/","text":"V1alpha1ClusterServingRuntime \u00b6 ClusterServingRuntime is the Schema for the servingruntimes API Properties \u00b6 Name Type Description Notes api_version str APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources [optional] kind str Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds [optional] metadata V1ObjectMeta [optional] spec V1alpha1ServingRuntimeSpec [optional] status object ServingRuntimeStatus defines the observed state of ServingRuntime [optional] [Back to Model list] [Back to API list] [Back to README]","title":"V1alpha1ClusterServingRuntime"},{"location":"sdk_docs/docs/V1alpha1ClusterServingRuntime/#v1alpha1clusterservingruntime","text":"ClusterServingRuntime is the Schema for the servingruntimes API","title":"V1alpha1ClusterServingRuntime"},{"location":"sdk_docs/docs/V1alpha1ClusterServingRuntime/#properties","text":"Name Type Description Notes api_version str APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources [optional] kind str Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. 
Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds [optional] metadata V1ObjectMeta [optional] spec V1alpha1ServingRuntimeSpec [optional] status object ServingRuntimeStatus defines the observed state of ServingRuntime [optional] [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/V1alpha1ClusterServingRuntimeList/","text":"V1alpha1ClusterServingRuntimeList \u00b6 ServingRuntimeList contains a list of ServingRuntime Properties \u00b6 Name Type Description Notes api_version str APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources [optional] items list[V1alpha1ClusterServingRuntime] kind str Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds [optional] metadata V1ListMeta [optional] [Back to Model list] [Back to API list] [Back to README]","title":"V1alpha1ClusterServingRuntimeList"},{"location":"sdk_docs/docs/V1alpha1ClusterServingRuntimeList/#v1alpha1clusterservingruntimelist","text":"ServingRuntimeList contains a list of ServingRuntime","title":"V1alpha1ClusterServingRuntimeList"},{"location":"sdk_docs/docs/V1alpha1ClusterServingRuntimeList/#properties","text":"Name Type Description Notes api_version str APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. 
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources [optional] items list[V1alpha1ClusterServingRuntime] kind str Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds [optional] metadata V1ListMeta [optional] [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/V1alpha1Container/","text":"V1alpha1Container \u00b6 Properties \u00b6 Name Type Description Notes args list[str] [optional] command list[str] [optional] env list[V1EnvVar] [optional] image str [optional] image_pull_policy str [optional] liveness_probe V1Probe [optional] name str [optional] readiness_probe V1Probe [optional] resources V1ResourceRequirements [optional] working_dir str [optional] [Back to Model list] [Back to API list] [Back to README]","title":"V1alpha1Container"},{"location":"sdk_docs/docs/V1alpha1Container/#v1alpha1container","text":"","title":"V1alpha1Container"},{"location":"sdk_docs/docs/V1alpha1Container/#properties","text":"Name Type Description Notes args list[str] [optional] command list[str] [optional] env list[V1EnvVar] [optional] image str [optional] image_pull_policy str [optional] liveness_probe V1Probe [optional] name str [optional] readiness_probe V1Probe [optional] resources V1ResourceRequirements [optional] working_dir str [optional] [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/V1alpha1InferenceGraph/","text":"V1alpha1InferenceGraph \u00b6 InferenceGraph is the Schema for the InferenceGraph API for multiple models Properties \u00b6 Name Type Description Notes api_version str APIVersion defines the versioned schema of this representation of an object. 
Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources [optional] kind str Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds [optional] metadata V1ObjectMeta [optional] spec V1alpha1InferenceGraphSpec [optional] status V1alpha1InferenceGraphStatus [optional] [Back to Model list] [Back to API list] [Back to README]","title":"V1alpha1InferenceGraph"},{"location":"sdk_docs/docs/V1alpha1InferenceGraph/#v1alpha1inferencegraph","text":"InferenceGraph is the Schema for the InferenceGraph API for multiple models","title":"V1alpha1InferenceGraph"},{"location":"sdk_docs/docs/V1alpha1InferenceGraph/#properties","text":"Name Type Description Notes api_version str APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources [optional] kind str Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds [optional] metadata V1ObjectMeta [optional] spec V1alpha1InferenceGraphSpec [optional] status V1alpha1InferenceGraphStatus [optional] [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/V1alpha1InferenceGraphList/","text":"V1alpha1InferenceGraphList \u00b6 InferenceGraphList contains a list of InferenceGraph Properties \u00b6 Name Type Description Notes api_version str APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources [optional] items list[V1alpha1InferenceGraph] kind str Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds [optional] metadata V1ListMeta [optional] [Back to Model list] [Back to API list] [Back to README]","title":"V1alpha1InferenceGraphList"},{"location":"sdk_docs/docs/V1alpha1InferenceGraphList/#v1alpha1inferencegraphlist","text":"InferenceGraphList contains a list of InferenceGraph","title":"V1alpha1InferenceGraphList"},{"location":"sdk_docs/docs/V1alpha1InferenceGraphList/#properties","text":"Name Type Description Notes api_version str APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. 
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources [optional] items list[V1alpha1InferenceGraph] kind str Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds [optional] metadata V1ListMeta [optional] [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/V1alpha1InferenceGraphSpec/","text":"V1alpha1InferenceGraphSpec \u00b6 InferenceGraphSpec defines the InferenceGraph spec Properties \u00b6 Name Type Description Notes nodes dict(str, V1alpha1InferenceRouter) Map of InferenceGraph router nodes Each node defines the router which can be different routing types [Back to Model list] [Back to API list] [Back to README]","title":"V1alpha1InferenceGraphSpec"},{"location":"sdk_docs/docs/V1alpha1InferenceGraphSpec/#v1alpha1inferencegraphspec","text":"InferenceGraphSpec defines the InferenceGraph spec","title":"V1alpha1InferenceGraphSpec"},{"location":"sdk_docs/docs/V1alpha1InferenceGraphSpec/#properties","text":"Name Type Description Notes nodes dict(str, V1alpha1InferenceRouter) Map of InferenceGraph router nodes Each node defines the router which can be different routing types [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/V1alpha1InferenceGraphStatus/","text":"V1alpha1InferenceGraphStatus \u00b6 InferenceGraphStatus defines the InferenceGraph conditions and status Properties \u00b6 Name Type Description Notes annotations dict(str, str) Annotations is additional Status fields for the Resource to save some additional State as well as convey more information to the user. 
This is roughly akin to Annotations on any k8s resource, just the reconciler conveying richer information outwards. [optional] conditions list[KnativeCondition] Conditions the latest available observations of a resource's current state. [optional] observed_generation int ObservedGeneration is the 'Generation' of the Service that was last processed by the controller. [optional] url KnativeURL [optional] [Back to Model list] [Back to API list] [Back to README]","title":"V1alpha1InferenceGraphStatus"},{"location":"sdk_docs/docs/V1alpha1InferenceGraphStatus/#v1alpha1inferencegraphstatus","text":"InferenceGraphStatus defines the InferenceGraph conditions and status","title":"V1alpha1InferenceGraphStatus"},{"location":"sdk_docs/docs/V1alpha1InferenceGraphStatus/#properties","text":"Name Type Description Notes annotations dict(str, str) Annotations is additional Status fields for the Resource to save some additional State as well as convey more information to the user. This is roughly akin to Annotations on any k8s resource, just the reconciler conveying richer information outwards. [optional] conditions list[KnativeCondition] Conditions the latest available observations of a resource's current state. [optional] observed_generation int ObservedGeneration is the 'Generation' of the Service that was last processed by the controller. 
[optional] url KnativeURL [optional] [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/V1alpha1InferenceRouter/","text":"V1alpha1InferenceRouter \u00b6 InferenceRouter defines the router for each InferenceGraph node with one or multiple steps yaml kind: InferenceGraph metadata: name: canary-route spec: nodes: root: routerType: Splitter routes: - service: mymodel1 weight: 20 - service: mymodel2 weight: 80 yaml kind: InferenceGraph metadata: name: abtest spec: nodes: mymodel: routerType: Switch routes: - service: mymodel1 condition: \\\"{ .input.userId == 1 }\\\" - service: mymodel2 condition: \\\"{ .input.userId == 2 }\\\" Scoring a case using a model ensemble consists of scoring it using each model separately, then combining the results into a single scoring result using one of the pre-defined combination methods. Tree Ensemble constitutes a case where simple algorithms for combining results of either classification or regression trees are well known. Multiple classification trees, for example, are commonly combined using a \"majority-vote\" method. Multiple regression trees are often combined using various averaging techniques. e.g tagging models with segment identifiers and weights to be used for their combination in these ways. yaml kind: InferenceGraph metadata: name: ensemble spec: nodes: root: routerType: Sequence routes: - service: feast - nodeName: ensembleModel data: $response ensembleModel: routerType: Ensemble routes: - service: sklearn-model - service: xgboost-model Scoring a case using a sequence, or chain of models allows the output of one model to be passed in as input to the subsequent models. 
yaml kind: InferenceGraph metadata: name: model-chainer spec: nodes: root: routerType: Sequence routes: - service: mymodel-s1 - service: mymodel-s2 data: $response - service: mymodel-s3 data: $response In the flow described below, the pre_processing node base64 encodes the image and passes it to two model nodes in the flow. The encoded data is available to both these nodes for classification. The second node i.e. dog-breed-classification takes the original input from the pre_processing node along-with the response from the cat-dog-classification node to do further classification of the dog breed if required. yaml kind: InferenceGraph metadata: name: dog-breed-classification spec: nodes: root: routerType: Sequence routes: - service: cat-dog-classifier - nodeName: breed-classifier data: $request breed-classifier: routerType: Switch routes: - service: dog-breed-classifier condition: { .predictions.class == \\\"dog\\\" } - service: cat-breed-classifier condition: { .predictions.class == \\\"cat\\\" } Properties \u00b6 Name Type Description Notes router_type str RouterType - `Sequence:` chain multiple inference steps with input/output from previous step - `Splitter:` randomly routes to the target service according to the weight - `Ensemble:` routes the request to multiple models and then merge the responses - `Switch:` routes the request to one of the steps based on condition [default to ''] steps list[V1alpha1InferenceStep] Steps defines destinations for the current router node [optional] [Back to Model list] [Back to API list] [Back to README]","title":"V1alpha1InferenceRouter"},{"location":"sdk_docs/docs/V1alpha1InferenceRouter/#v1alpha1inferencerouter","text":"InferenceRouter defines the router for each InferenceGraph node with one or multiple steps yaml kind: InferenceGraph metadata: name: canary-route spec: nodes: root: routerType: Splitter routes: - service: mymodel1 weight: 20 - service: mymodel2 weight: 80 yaml kind: InferenceGraph metadata: name: abtest spec: 
nodes: mymodel: routerType: Switch routes: - service: mymodel1 condition: \\\"{ .input.userId == 1 }\\\" - service: mymodel2 condition: \\\"{ .input.userId == 2 }\\\" Scoring a case using a model ensemble consists of scoring it using each model separately, then combining the results into a single scoring result using one of the pre-defined combination methods. Tree Ensemble constitutes a case where simple algorithms for combining results of either classification or regression trees are well known. Multiple classification trees, for example, are commonly combined using a \"majority-vote\" method. Multiple regression trees are often combined using various averaging techniques. e.g tagging models with segment identifiers and weights to be used for their combination in these ways. yaml kind: InferenceGraph metadata: name: ensemble spec: nodes: root: routerType: Sequence routes: - service: feast - nodeName: ensembleModel data: $response ensembleModel: routerType: Ensemble routes: - service: sklearn-model - service: xgboost-model Scoring a case using a sequence, or chain of models allows the output of one model to be passed in as input to the subsequent models. yaml kind: InferenceGraph metadata: name: model-chainer spec: nodes: root: routerType: Sequence routes: - service: mymodel-s1 - service: mymodel-s2 data: $response - service: mymodel-s3 data: $response In the flow described below, the pre_processing node base64 encodes the image and passes it to two model nodes in the flow. The encoded data is available to both these nodes for classification. The second node i.e. dog-breed-classification takes the original input from the pre_processing node along-with the response from the cat-dog-classification node to do further classification of the dog breed if required. 
yaml kind: InferenceGraph metadata: name: dog-breed-classification spec: nodes: root: routerType: Sequence routes: - service: cat-dog-classifier - nodeName: breed-classifier data: $request breed-classifier: routerType: Switch routes: - service: dog-breed-classifier condition: { .predictions.class == \\\"dog\\\" } - service: cat-breed-classifier condition: { .predictions.class == \\\"cat\\\" }","title":"V1alpha1InferenceRouter"},{"location":"sdk_docs/docs/V1alpha1InferenceRouter/#properties","text":"Name Type Description Notes router_type str RouterType - `Sequence:` chain multiple inference steps with input/output from previous step - `Splitter:` randomly routes to the target service according to the weight - `Ensemble:` routes the request to multiple models and then merge the responses - `Switch:` routes the request to one of the steps based on condition [default to ''] steps list[V1alpha1InferenceStep] Steps defines destinations for the current router node [optional] [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/V1alpha1InferenceStep/","text":"V1alpha1InferenceStep \u00b6 InferenceStep defines the inference target of the current step with condition, weights and data. 
Properties \u00b6 Name Type Description Notes condition str routing based on the condition [optional] data str request data sent to the next route with input/output from the previous step $request $response.predictions [optional] name str Unique name for the step within this node [optional] node_name str The node name for routing as next step [optional] service_name str named reference for InferenceService [optional] service_url str InferenceService URL, mutually exclusive with ServiceName [optional] weight int the weight for split of the traffic, only used for Split Router when weight is specified all the routing targets should be sum to 100 [optional] [Back to Model list] [Back to API list] [Back to README]","title":"V1alpha1InferenceStep"},{"location":"sdk_docs/docs/V1alpha1InferenceStep/#v1alpha1inferencestep","text":"InferenceStep defines the inference target of the current step with condition, weights and data.","title":"V1alpha1InferenceStep"},{"location":"sdk_docs/docs/V1alpha1InferenceStep/#properties","text":"Name Type Description Notes condition str routing based on the condition [optional] data str request data sent to the next route with input/output from the previous step $request $response.predictions [optional] name str Unique name for the step within this node [optional] node_name str The node name for routing as next step [optional] service_name str named reference for InferenceService [optional] service_url str InferenceService URL, mutually exclusive with ServiceName [optional] weight int the weight for split of the traffic, only used for Split Router when weight is specified all the routing targets should be sum to 100 [optional] [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/V1alpha1InferenceTarget/","text":"V1alpha1InferenceTarget \u00b6 Exactly one InferenceTarget field must be specified Properties \u00b6 Name Type Description Notes node_name str The node name for routing as next 
step [optional] service_name str named reference for InferenceService [optional] service_url str InferenceService URL, mutually exclusive with ServiceName [optional] [Back to Model list] [Back to API list] [Back to README]","title":"V1alpha1InferenceTarget"},{"location":"sdk_docs/docs/V1alpha1InferenceTarget/#v1alpha1inferencetarget","text":"Exactly one InferenceTarget field must be specified","title":"V1alpha1InferenceTarget"},{"location":"sdk_docs/docs/V1alpha1InferenceTarget/#properties","text":"Name Type Description Notes node_name str The node name for routing as next step [optional] service_name str named reference for InferenceService [optional] service_url str InferenceService URL, mutually exclusive with ServiceName [optional] [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/V1alpha1ServingRuntime/","text":"V1alpha1ServingRuntime \u00b6 ServingRuntime is the Schema for the servingruntimes API Properties \u00b6 Name Type Description Notes api_version str APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources [optional] kind str Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds [optional] metadata V1ObjectMeta [optional] spec V1alpha1ServingRuntimeSpec [optional] status object ServingRuntimeStatus defines the observed state of ServingRuntime [optional] [Back to Model list] [Back to API list] [Back to README]","title":"V1alpha1ServingRuntime"},{"location":"sdk_docs/docs/V1alpha1ServingRuntime/#v1alpha1servingruntime","text":"ServingRuntime is the Schema for the servingruntimes API","title":"V1alpha1ServingRuntime"},{"location":"sdk_docs/docs/V1alpha1ServingRuntime/#properties","text":"Name Type Description Notes api_version str APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources [optional] kind str Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds [optional] metadata V1ObjectMeta [optional] spec V1alpha1ServingRuntimeSpec [optional] status object ServingRuntimeStatus defines the observed state of ServingRuntime [optional] [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/V1alpha1ServingRuntimeList/","text":"V1alpha1ServingRuntimeList \u00b6 ServingRuntimeList contains a list of ServingRuntime Properties \u00b6 Name Type Description Notes api_version str APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. 
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources [optional] items list[V1alpha1ServingRuntime] kind str Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds [optional] metadata V1ListMeta [optional] [Back to Model list] [Back to API list] [Back to README]","title":"V1alpha1ServingRuntimeList"},{"location":"sdk_docs/docs/V1alpha1ServingRuntimeList/#v1alpha1servingruntimelist","text":"ServingRuntimeList contains a list of ServingRuntime","title":"V1alpha1ServingRuntimeList"},{"location":"sdk_docs/docs/V1alpha1ServingRuntimeList/#properties","text":"Name Type Description Notes api_version str APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources [optional] items list[V1alpha1ServingRuntime] kind str Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds [optional] metadata V1ListMeta [optional] [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/V1alpha1ServingRuntimePodSpec/","text":"V1alpha1ServingRuntimePodSpec \u00b6 Properties \u00b6 Name Type Description Notes affinity V1Affinity [optional] containers list[V1Container] List of containers belonging to the pod. Containers cannot currently be added or removed. There must be at least one container in a Pod. 
Cannot be updated. node_selector dict(str, str) NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node's labels for the pod to be scheduled on that node. More info: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ [optional] tolerations list[V1Toleration] If specified, the pod's tolerations. [optional] volumes list[V1Volume] List of volumes that can be mounted by containers belonging to the pod. More info: https://kubernetes.io/docs/concepts/storage/volumes [optional] [Back to Model list] [Back to API list] [Back to README]","title":"V1alpha1ServingRuntimePodSpec"},{"location":"sdk_docs/docs/V1alpha1ServingRuntimePodSpec/#v1alpha1servingruntimepodspec","text":"","title":"V1alpha1ServingRuntimePodSpec"},{"location":"sdk_docs/docs/V1alpha1ServingRuntimePodSpec/#properties","text":"Name Type Description Notes affinity V1Affinity [optional] containers list[V1Container] List of containers belonging to the pod. Containers cannot currently be added or removed. There must be at least one container in a Pod. Cannot be updated. node_selector dict(str, str) NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node's labels for the pod to be scheduled on that node. More info: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ [optional] tolerations list[V1Toleration] If specified, the pod's tolerations. [optional] volumes list[V1Volume] List of volumes that can be mounted by containers belonging to the pod. More info: https://kubernetes.io/docs/concepts/storage/volumes [optional] [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/V1alpha1ServingRuntimeSpec/","text":"V1alpha1ServingRuntimeSpec \u00b6 ServingRuntimeSpec defines the desired state of ServingRuntime. 
This spec is currently provisional and is subject to change as details regarding single-model serving and multi-model serving are hammered out. Properties \u00b6 Name Type Description Notes affinity V1Affinity [optional] built_in_adapter V1alpha1BuiltInAdapter [optional] containers list[V1Container] List of containers belonging to the pod. Containers cannot currently be added or removed. There must be at least one container in a Pod. Cannot be updated. disabled bool Set to true to disable use of this runtime [optional] grpc_data_endpoint str Grpc endpoint for inferencing [optional] grpc_endpoint str Grpc endpoint for internal model-management (implementing mmesh.ModelRuntime gRPC service) Assumed to be single-model runtime if omitted [optional] http_data_endpoint str HTTP endpoint for inferencing [optional] multi_model bool Whether this ServingRuntime is intended for multi-model usage or not. [optional] node_selector dict(str, str) NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node's labels for the pod to be scheduled on that node. More info: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ [optional] protocol_versions list[str] Supported protocol versions (i.e. v1 or v2 or grpc-v1 or grpc-v2) [optional] replicas int Configure the number of replicas in the Deployment generated by this ServingRuntime If specified, this overrides the podsPerRuntime configuration value [optional] storage_helper V1alpha1StorageHelper [optional] supported_model_formats list[V1alpha1SupportedModelFormat] Model formats and version supported by this runtime [optional] tolerations list[V1Toleration] If specified, the pod's tolerations. [optional] volumes list[V1Volume] List of volumes that can be mounted by containers belonging to the pod. 
More info: https://kubernetes.io/docs/concepts/storage/volumes [optional] [Back to Model list] [Back to API list] [Back to README]","title":"V1alpha1ServingRuntimeSpec"},{"location":"sdk_docs/docs/V1alpha1ServingRuntimeSpec/#v1alpha1servingruntimespec","text":"ServingRuntimeSpec defines the desired state of ServingRuntime. This spec is currently provisional and is subject to change as details regarding single-model serving and multi-model serving are hammered out.","title":"V1alpha1ServingRuntimeSpec"},{"location":"sdk_docs/docs/V1alpha1ServingRuntimeSpec/#properties","text":"Name Type Description Notes affinity V1Affinity [optional] built_in_adapter V1alpha1BuiltInAdapter [optional] containers list[V1Container] List of containers belonging to the pod. Containers cannot currently be added or removed. There must be at least one container in a Pod. Cannot be updated. disabled bool Set to true to disable use of this runtime [optional] grpc_data_endpoint str Grpc endpoint for inferencing [optional] grpc_endpoint str Grpc endpoint for internal model-management (implementing mmesh.ModelRuntime gRPC service) Assumed to be single-model runtime if omitted [optional] http_data_endpoint str HTTP endpoint for inferencing [optional] multi_model bool Whether this ServingRuntime is intended for multi-model usage or not. [optional] node_selector dict(str, str) NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node's labels for the pod to be scheduled on that node. More info: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ [optional] protocol_versions list[str] Supported protocol versions (i.e. 
v1 or v2 or grpc-v1 or grpc-v2) [optional] replicas int Configure the number of replicas in the Deployment generated by this ServingRuntime If specified, this overrides the podsPerRuntime configuration value [optional] storage_helper V1alpha1StorageHelper [optional] supported_model_formats list[V1alpha1SupportedModelFormat] Model formats and version supported by this runtime [optional] tolerations list[V1Toleration] If specified, the pod's tolerations. [optional] volumes list[V1Volume] List of volumes that can be mounted by containers belonging to the pod. More info: https://kubernetes.io/docs/concepts/storage/volumes [optional] [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/V1alpha1StorageHelper/","text":"V1alpha1StorageHelper \u00b6 Properties \u00b6 Name Type Description Notes disabled bool [optional] [Back to Model list] [Back to API list] [Back to README]","title":"V1alpha1StorageHelper"},{"location":"sdk_docs/docs/V1alpha1StorageHelper/#v1alpha1storagehelper","text":"","title":"V1alpha1StorageHelper"},{"location":"sdk_docs/docs/V1alpha1StorageHelper/#properties","text":"Name Type Description Notes disabled bool [optional] [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/V1alpha1SupportedModelFormat/","text":"V1alpha1SupportedModelFormat \u00b6 Properties \u00b6 Name Type Description Notes auto_select bool Set to true to allow the ServingRuntime to be used for automatic model placement if this model format is specified with no explicit runtime. [optional] name str Name of the model format. [optional] [default to ''] version str Version of the model format. Used in validating that a predictor is supported by a runtime. Can be "major", "major.minor" or "major.minor.patch". 
[optional] [Back to Model list] [Back to API list] [Back to README]","title":"V1alpha1SupportedModelFormat"},{"location":"sdk_docs/docs/V1alpha1SupportedModelFormat/#v1alpha1supportedmodelformat","text":"","title":"V1alpha1SupportedModelFormat"},{"location":"sdk_docs/docs/V1alpha1SupportedModelFormat/#properties","text":"Name Type Description Notes auto_select bool Set to true to allow the ServingRuntime to be used for automatic model placement if this model format is specified with no explicit runtime. [optional] name str Name of the model format. [optional] [default to ''] version str Version of the model format. Used in validating that a predictor is supported by a runtime. Can be "major", "major.minor" or "major.minor.patch". [optional] [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/V1beta1AIXExplainerSpec/","text":"V1beta1AIXExplainerSpec \u00b6 AIXExplainerSpec defines the arguments for configuring an AIX Explanation Server Properties \u00b6 Name Type Description Notes args list[str] Arguments to the entrypoint. The docker image's CMD is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell [optional] command list[str] Entrypoint array. Not executed within a shell. The docker image's ENTRYPOINT is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. 
If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell [optional] config dict(str, str) Inline custom parameter settings for explainer [optional] env list[V1EnvVar] List of environment variables to set in the container. Cannot be updated. [optional] env_from list[V1EnvFromSource] List of sources to populate environment variables in the container. The keys defined within a source must be a C_IDENTIFIER. All invalid keys will be reported as an event when the container is starting. When a key exists in multiple sources, the value associated with the last source will take precedence. Values defined by an Env with a duplicate key will take precedence. Cannot be updated. [optional] image str Docker image name. More info: https://kubernetes.io/docs/concepts/containers/images This field is optional to allow higher level config management to default or override container images in workload controllers like Deployments and StatefulSets. [optional] image_pull_policy str Image pull policy. One of Always, Never, IfNotPresent. Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images [optional] lifecycle V1Lifecycle [optional] liveness_probe V1Probe [optional] name str Name of the container specified as a DNS_LABEL. Each container in a pod must have a unique name (DNS_LABEL). Cannot be updated. [default to ''] ports list[V1ContainerPort] List of ports to expose from the container. 
Exposing a port here gives the system additional information about the network connections a container uses, but is primarily informational. Not specifying a port here DOES NOT prevent that port from being exposed. Any port which is listening on the default "0.0.0.0" address inside a container will be accessible from the network. Cannot be updated. [optional] readiness_probe V1Probe [optional] resources V1ResourceRequirements [optional] runtime_version str Defaults to latest Explainer Version [optional] security_context V1SecurityContext [optional] startup_probe V1Probe [optional] stdin bool Whether this container should allocate a buffer for stdin in the container runtime. If this is not set, reads from stdin in the container will always result in EOF. Default is false. [optional] stdin_once bool Whether the container runtime should close the stdin channel after it has been opened by a single attach. When stdin is true the stdin stream will remain open across multiple attach sessions. If stdinOnce is set to true, stdin is opened on container start, is empty until the first client attaches to stdin, and then remains open and accepts data until the client disconnects, at which time stdin is closed and remains closed until the container is restarted. If this flag is false, a container processes that reads from stdin will never receive an EOF. Default is false [optional] storage V1beta1StorageSpec [optional] storage_uri str The location of a trained explanation model [optional] termination_message_path str Optional: Path at which the file to which the container's termination message will be written is mounted into the container's filesystem. Message written is intended to be brief final status, such as an assertion failure message. Will be truncated by the node if greater than 4096 bytes. The total message length across all containers will be limited to 12kb. Defaults to /dev/termination-log. Cannot be updated. 
[optional] termination_message_policy str Indicate how the termination message should be populated. File will use the contents of terminationMessagePath to populate the container status message on both success and failure. FallbackToLogsOnError will use the last chunk of container log output if the termination message file is empty and the container exited with an error. The log output is limited to 2048 bytes or 80 lines, whichever is smaller. Defaults to File. Cannot be updated. [optional] tty bool Whether this container should allocate a TTY for itself, also requires 'stdin' to be true. Default is false. [optional] type str The type of AIX explainer [default to ''] volume_devices list[V1VolumeDevice] volumeDevices is the list of block devices to be used by the container. [optional] volume_mounts list[V1VolumeMount] Pod volumes to mount into the container's filesystem. Cannot be updated. [optional] working_dir str Container's working directory. If not specified, the container runtime's default will be used, which might be configured in the container image. Cannot be updated. [optional] [Back to Model list] [Back to API list] [Back to README]","title":"V1beta1AIXExplainerSpec"},{"location":"sdk_docs/docs/V1beta1AIXExplainerSpec/#v1beta1aixexplainerspec","text":"AIXExplainerSpec defines the arguments for configuring an AIX Explanation Server","title":"V1beta1AIXExplainerSpec"},{"location":"sdk_docs/docs/V1beta1AIXExplainerSpec/#properties","text":"Name Type Description Notes args list[str] Arguments to the entrypoint. The docker image's CMD is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". 
Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell [optional] command list[str] Entrypoint array. Not executed within a shell. The docker image's ENTRYPOINT is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell [optional] config dict(str, str) Inline custom parameter settings for explainer [optional] env list[V1EnvVar] List of environment variables to set in the container. Cannot be updated. [optional] env_from list[V1EnvFromSource] List of sources to populate environment variables in the container. The keys defined within a source must be a C_IDENTIFIER. All invalid keys will be reported as an event when the container is starting. When a key exists in multiple sources, the value associated with the last source will take precedence. Values defined by an Env with a duplicate key will take precedence. Cannot be updated. [optional] image str Docker image name. More info: https://kubernetes.io/docs/concepts/containers/images This field is optional to allow higher level config management to default or override container images in workload controllers like Deployments and StatefulSets. [optional] image_pull_policy str Image pull policy. One of Always, Never, IfNotPresent. Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. 
Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images [optional] lifecycle V1Lifecycle [optional] liveness_probe V1Probe [optional] name str Name of the container specified as a DNS_LABEL. Each container in a pod must have a unique name (DNS_LABEL). Cannot be updated. [default to ''] ports list[V1ContainerPort] List of ports to expose from the container. Exposing a port here gives the system additional information about the network connections a container uses, but is primarily informational. Not specifying a port here DOES NOT prevent that port from being exposed. Any port which is listening on the default "0.0.0.0" address inside a container will be accessible from the network. Cannot be updated. [optional] readiness_probe V1Probe [optional] resources V1ResourceRequirements [optional] runtime_version str Defaults to latest Explainer Version [optional] security_context V1SecurityContext [optional] startup_probe V1Probe [optional] stdin bool Whether this container should allocate a buffer for stdin in the container runtime. If this is not set, reads from stdin in the container will always result in EOF. Default is false. [optional] stdin_once bool Whether the container runtime should close the stdin channel after it has been opened by a single attach. When stdin is true the stdin stream will remain open across multiple attach sessions. If stdinOnce is set to true, stdin is opened on container start, is empty until the first client attaches to stdin, and then remains open and accepts data until the client disconnects, at which time stdin is closed and remains closed until the container is restarted. If this flag is false, a container processes that reads from stdin will never receive an EOF. 
Default is false [optional] storage V1beta1StorageSpec [optional] storage_uri str The location of a trained explanation model [optional] termination_message_path str Optional: Path at which the file to which the container's termination message will be written is mounted into the container's filesystem. Message written is intended to be brief final status, such as an assertion failure message. Will be truncated by the node if greater than 4096 bytes. The total message length across all containers will be limited to 12kb. Defaults to /dev/termination-log. Cannot be updated. [optional] termination_message_policy str Indicate how the termination message should be populated. File will use the contents of terminationMessagePath to populate the container status message on both success and failure. FallbackToLogsOnError will use the last chunk of container log output if the termination message file is empty and the container exited with an error. The log output is limited to 2048 bytes or 80 lines, whichever is smaller. Defaults to File. Cannot be updated. [optional] tty bool Whether this container should allocate a TTY for itself, also requires 'stdin' to be true. Default is false. [optional] type str The type of AIX explainer [default to ''] volume_devices list[V1VolumeDevice] volumeDevices is the list of block devices to be used by the container. [optional] volume_mounts list[V1VolumeMount] Pod volumes to mount into the container's filesystem. Cannot be updated. [optional] working_dir str Container's working directory. If not specified, the container runtime's default will be used, which might be configured in the container image. Cannot be updated. 
[optional] [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/V1beta1ARTExplainerSpec/","text":"V1beta1ARTExplainerSpec \u00b6 ARTExplainerType defines the arguments for configuring an ART Explanation Server Properties \u00b6 Name Type Description Notes args list[str] Arguments to the entrypoint. The docker image's CMD is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell [optional] command list[str] Entrypoint array. Not executed within a shell. The docker image's ENTRYPOINT is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell [optional] config dict(str, str) Inline custom parameter settings for explainer [optional] env list[V1EnvVar] List of environment variables to set in the container. Cannot be updated. [optional] env_from list[V1EnvFromSource] List of sources to populate environment variables in the container. 
The keys defined within a source must be a C_IDENTIFIER. All invalid keys will be reported as an event when the container is starting. When a key exists in multiple sources, the value associated with the last source will take precedence. Values defined by an Env with a duplicate key will take precedence. Cannot be updated. [optional] image str Docker image name. More info: https://kubernetes.io/docs/concepts/containers/images This field is optional to allow higher level config management to default or override container images in workload controllers like Deployments and StatefulSets. [optional] image_pull_policy str Image pull policy. One of Always, Never, IfNotPresent. Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images [optional] lifecycle V1Lifecycle [optional] liveness_probe V1Probe [optional] name str Name of the container specified as a DNS_LABEL. Each container in a pod must have a unique name (DNS_LABEL). Cannot be updated. [default to ''] ports list[V1ContainerPort] List of ports to expose from the container. Exposing a port here gives the system additional information about the network connections a container uses, but is primarily informational. Not specifying a port here DOES NOT prevent that port from being exposed. Any port which is listening on the default "0.0.0.0" address inside a container will be accessible from the network. Cannot be updated. [optional] readiness_probe V1Probe [optional] resources V1ResourceRequirements [optional] runtime_version str Defaults to latest Explainer Version [optional] security_context V1SecurityContext [optional] startup_probe V1Probe [optional] stdin bool Whether this container should allocate a buffer for stdin in the container runtime. If this is not set, reads from stdin in the container will always result in EOF. Default is false. 
[optional] stdin_once bool Whether the container runtime should close the stdin channel after it has been opened by a single attach. When stdin is true the stdin stream will remain open across multiple attach sessions. If stdinOnce is set to true, stdin is opened on container start, is empty until the first client attaches to stdin, and then remains open and accepts data until the client disconnects, at which time stdin is closed and remains closed until the container is restarted. If this flag is false, a container processes that reads from stdin will never receive an EOF. Default is false [optional] storage V1beta1StorageSpec [optional] storage_uri str The location of a trained explanation model [optional] termination_message_path str Optional: Path at which the file to which the container's termination message will be written is mounted into the container's filesystem. Message written is intended to be brief final status, such as an assertion failure message. Will be truncated by the node if greater than 4096 bytes. The total message length across all containers will be limited to 12kb. Defaults to /dev/termination-log. Cannot be updated. [optional] termination_message_policy str Indicate how the termination message should be populated. File will use the contents of terminationMessagePath to populate the container status message on both success and failure. FallbackToLogsOnError will use the last chunk of container log output if the termination message file is empty and the container exited with an error. The log output is limited to 2048 bytes or 80 lines, whichever is smaller. Defaults to File. Cannot be updated. [optional] tty bool Whether this container should allocate a TTY for itself, also requires 'stdin' to be true. Default is false. [optional] type str The type of ART explainer [default to ''] volume_devices list[V1VolumeDevice] volumeDevices is the list of block devices to be used by the container. 
[optional] volume_mounts list[V1VolumeMount] Pod volumes to mount into the container's filesystem. Cannot be updated. [optional] working_dir str Container's working directory. If not specified, the container runtime's default will be used, which might be configured in the container image. Cannot be updated. [optional] [Back to Model list] [Back to API list] [Back to README]","title":"V1beta1ARTExplainerSpec"},{"location":"sdk_docs/docs/V1beta1ARTExplainerSpec/#v1beta1artexplainerspec","text":"ARTExplainerType defines the arguments for configuring an ART Explanation Server","title":"V1beta1ARTExplainerSpec"},{"location":"sdk_docs/docs/V1beta1ARTExplainerSpec/#properties","text":"Name Type Description Notes args list[str] Arguments to the entrypoint. The docker image's CMD is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell [optional] command list[str] Entrypoint array. Not executed within a shell. The docker image's ENTRYPOINT is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. 
More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell [optional] config dict(str, str) Inline custom parameter settings for explainer [optional] env list[V1EnvVar] List of environment variables to set in the container. Cannot be updated. [optional] env_from list[V1EnvFromSource] List of sources to populate environment variables in the container. The keys defined within a source must be a C_IDENTIFIER. All invalid keys will be reported as an event when the container is starting. When a key exists in multiple sources, the value associated with the last source will take precedence. Values defined by an Env with a duplicate key will take precedence. Cannot be updated. [optional] image str Docker image name. More info: https://kubernetes.io/docs/concepts/containers/images This field is optional to allow higher level config management to default or override container images in workload controllers like Deployments and StatefulSets. [optional] image_pull_policy str Image pull policy. One of Always, Never, IfNotPresent. Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images [optional] lifecycle V1Lifecycle [optional] liveness_probe V1Probe [optional] name str Name of the container specified as a DNS_LABEL. Each container in a pod must have a unique name (DNS_LABEL). Cannot be updated. [default to ''] ports list[V1ContainerPort] List of ports to expose from the container. Exposing a port here gives the system additional information about the network connections a container uses, but is primarily informational. Not specifying a port here DOES NOT prevent that port from being exposed. Any port which is listening on the default "0.0.0.0" address inside a container will be accessible from the network. Cannot be updated. 
[optional] readiness_probe V1Probe [optional] resources V1ResourceRequirements [optional] runtime_version str Defaults to latest Explainer Version [optional] security_context V1SecurityContext [optional] startup_probe V1Probe [optional] stdin bool Whether this container should allocate a buffer for stdin in the container runtime. If this is not set, reads from stdin in the container will always result in EOF. Default is false. [optional] stdin_once bool Whether the container runtime should close the stdin channel after it has been opened by a single attach. When stdin is true the stdin stream will remain open across multiple attach sessions. If stdinOnce is set to true, stdin is opened on container start, is empty until the first client attaches to stdin, and then remains open and accepts data until the client disconnects, at which time stdin is closed and remains closed until the container is restarted. If this flag is false, a container processes that reads from stdin will never receive an EOF. Default is false [optional] storage V1beta1StorageSpec [optional] storage_uri str The location of a trained explanation model [optional] termination_message_path str Optional: Path at which the file to which the container's termination message will be written is mounted into the container's filesystem. Message written is intended to be brief final status, such as an assertion failure message. Will be truncated by the node if greater than 4096 bytes. The total message length across all containers will be limited to 12kb. Defaults to /dev/termination-log. Cannot be updated. [optional] termination_message_policy str Indicate how the termination message should be populated. File will use the contents of terminationMessagePath to populate the container status message on both success and failure. FallbackToLogsOnError will use the last chunk of container log output if the termination message file is empty and the container exited with an error. 
The log output is limited to 2048 bytes or 80 lines, whichever is smaller. Defaults to File. Cannot be updated. [optional] tty bool Whether this container should allocate a TTY for itself, also requires 'stdin' to be true. Default is false. [optional] type str The type of ART explainer [default to ''] volume_devices list[V1VolumeDevice] volumeDevices is the list of block devices to be used by the container. [optional] volume_mounts list[V1VolumeMount] Pod volumes to mount into the container's filesystem. Cannot be updated. [optional] working_dir str Container's working directory. If not specified, the container runtime's default will be used, which might be configured in the container image. Cannot be updated. [optional] [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/V1beta1AlibiExplainerSpec/","text":"V1beta1AlibiExplainerSpec \u00b6 AlibiExplainerSpec defines the arguments for configuring an Alibi Explanation Server Properties \u00b6 Name Type Description Notes args list[str] Arguments to the entrypoint. The docker image's CMD is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell [optional] command list[str] Entrypoint array. Not executed within a shell. The docker image's ENTRYPOINT is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. 
If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell [optional] config dict(str, str) Inline custom parameter settings for explainer [optional] env list[V1EnvVar] List of environment variables to set in the container. Cannot be updated. [optional] env_from list[V1EnvFromSource] List of sources to populate environment variables in the container. The keys defined within a source must be a C_IDENTIFIER. All invalid keys will be reported as an event when the container is starting. When a key exists in multiple sources, the value associated with the last source will take precedence. Values defined by an Env with a duplicate key will take precedence. Cannot be updated. [optional] image str Docker image name. More info: https://kubernetes.io/docs/concepts/containers/images This field is optional to allow higher level config management to default or override container images in workload controllers like Deployments and StatefulSets. [optional] image_pull_policy str Image pull policy. One of Always, Never, IfNotPresent. Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images [optional] lifecycle V1Lifecycle [optional] liveness_probe V1Probe [optional] name str Name of the container specified as a DNS_LABEL. Each container in a pod must have a unique name (DNS_LABEL). Cannot be updated. [default to ''] ports list[V1ContainerPort] List of ports to expose from the container. 
Exposing a port here gives the system additional information about the network connections a container uses, but is primarily informational. Not specifying a port here DOES NOT prevent that port from being exposed. Any port which is listening on the default "0.0.0.0" address inside a container will be accessible from the network. Cannot be updated. [optional] readiness_probe V1Probe [optional] resources V1ResourceRequirements [optional] runtime_version str Defaults to latest Explainer Version [optional] security_context V1SecurityContext [optional] startup_probe V1Probe [optional] stdin bool Whether this container should allocate a buffer for stdin in the container runtime. If this is not set, reads from stdin in the container will always result in EOF. Default is false. [optional] stdin_once bool Whether the container runtime should close the stdin channel after it has been opened by a single attach. When stdin is true the stdin stream will remain open across multiple attach sessions. If stdinOnce is set to true, stdin is opened on container start, is empty until the first client attaches to stdin, and then remains open and accepts data until the client disconnects, at which time stdin is closed and remains closed until the container is restarted. If this flag is false, a container processes that reads from stdin will never receive an EOF. Default is false [optional] storage V1beta1StorageSpec [optional] storage_uri str The location of a trained explanation model [optional] termination_message_path str Optional: Path at which the file to which the container's termination message will be written is mounted into the container's filesystem. Message written is intended to be brief final status, such as an assertion failure message. Will be truncated by the node if greater than 4096 bytes. The total message length across all containers will be limited to 12kb. Defaults to /dev/termination-log. Cannot be updated. 
[optional] termination_message_policy str Indicate how the termination message should be populated. File will use the contents of terminationMessagePath to populate the container status message on both success and failure. FallbackToLogsOnError will use the last chunk of container log output if the termination message file is empty and the container exited with an error. The log output is limited to 2048 bytes or 80 lines, whichever is smaller. Defaults to File. Cannot be updated. [optional] tty bool Whether this container should allocate a TTY for itself, also requires 'stdin' to be true. Default is false. [optional] type str The type of Alibi explainer Valid values are: - "AnchorTabular"; - "AnchorImages"; - "AnchorText"; - "Counterfactuals"; - "Contrastive"; [default to ''] volume_devices list[V1VolumeDevice] volumeDevices is the list of block devices to be used by the container. [optional] volume_mounts list[V1VolumeMount] Pod volumes to mount into the container's filesystem. Cannot be updated. [optional] working_dir str Container's working directory. If not specified, the container runtime's default will be used, which might be configured in the container image. Cannot be updated. [optional] [Back to Model list] [Back to API list] [Back to README]","title":"V1beta1AlibiExplainerSpec"},{"location":"sdk_docs/docs/V1beta1AlibiExplainerSpec/#v1beta1alibiexplainerspec","text":"AlibiExplainerSpec defines the arguments for configuring an Alibi Explanation Server","title":"V1beta1AlibiExplainerSpec"},{"location":"sdk_docs/docs/V1beta1AlibiExplainerSpec/#properties","text":"Name Type Description Notes args list[str] Arguments to the entrypoint. The docker image's CMD is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. 
"$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell [optional] command list[str] Entrypoint array. Not executed within a shell. The docker image's ENTRYPOINT is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell [optional] config dict(str, str) Inline custom parameter settings for explainer [optional] env list[V1EnvVar] List of environment variables to set in the container. Cannot be updated. [optional] env_from list[V1EnvFromSource] List of sources to populate environment variables in the container. The keys defined within a source must be a C_IDENTIFIER. All invalid keys will be reported as an event when the container is starting. When a key exists in multiple sources, the value associated with the last source will take precedence. Values defined by an Env with a duplicate key will take precedence. Cannot be updated. [optional] image str Docker image name. More info: https://kubernetes.io/docs/concepts/containers/images This field is optional to allow higher level config management to default or override container images in workload controllers like Deployments and StatefulSets. [optional] image_pull_policy str Image pull policy. One of Always, Never, IfNotPresent. 
Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images [optional] lifecycle V1Lifecycle [optional] liveness_probe V1Probe [optional] name str Name of the container specified as a DNS_LABEL. Each container in a pod must have a unique name (DNS_LABEL). Cannot be updated. [default to ''] ports list[V1ContainerPort] List of ports to expose from the container. Exposing a port here gives the system additional information about the network connections a container uses, but is primarily informational. Not specifying a port here DOES NOT prevent that port from being exposed. Any port which is listening on the default "0.0.0.0" address inside a container will be accessible from the network. Cannot be updated. [optional] readiness_probe V1Probe [optional] resources V1ResourceRequirements [optional] runtime_version str Defaults to latest Explainer Version [optional] security_context V1SecurityContext [optional] startup_probe V1Probe [optional] stdin bool Whether this container should allocate a buffer for stdin in the container runtime. If this is not set, reads from stdin in the container will always result in EOF. Default is false. [optional] stdin_once bool Whether the container runtime should close the stdin channel after it has been opened by a single attach. When stdin is true the stdin stream will remain open across multiple attach sessions. If stdinOnce is set to true, stdin is opened on container start, is empty until the first client attaches to stdin, and then remains open and accepts data until the client disconnects, at which time stdin is closed and remains closed until the container is restarted. If this flag is false, a container processes that reads from stdin will never receive an EOF. 
Default is false [optional] storage V1beta1StorageSpec [optional] storage_uri str The location of a trained explanation model [optional] termination_message_path str Optional: Path at which the file to which the container's termination message will be written is mounted into the container's filesystem. Message written is intended to be brief final status, such as an assertion failure message. Will be truncated by the node if greater than 4096 bytes. The total message length across all containers will be limited to 12kb. Defaults to /dev/termination-log. Cannot be updated. [optional] termination_message_policy str Indicate how the termination message should be populated. File will use the contents of terminationMessagePath to populate the container status message on both success and failure. FallbackToLogsOnError will use the last chunk of container log output if the termination message file is empty and the container exited with an error. The log output is limited to 2048 bytes or 80 lines, whichever is smaller. Defaults to File. Cannot be updated. [optional] tty bool Whether this container should allocate a TTY for itself, also requires 'stdin' to be true. Default is false. [optional] type str The type of Alibi explainer Valid values are: - "AnchorTabular"; - "AnchorImages"; - "AnchorText"; - "Counterfactuals"; - "Contrastive"; [default to ''] volume_devices list[V1VolumeDevice] volumeDevices is the list of block devices to be used by the container. [optional] volume_mounts list[V1VolumeMount] Pod volumes to mount into the container's filesystem. Cannot be updated. [optional] working_dir str Container's working directory. If not specified, the container runtime's default will be used, which might be configured in the container image. Cannot be updated. 
[optional] [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/V1beta1Batcher/","text":"V1beta1Batcher \u00b6 Batcher specifies optional payload batching available for all components Properties \u00b6 Name Type Description Notes max_batch_size int Specifies the max number of requests to trigger a batch [optional] max_latency int Specifies the max latency to trigger a batch [optional] timeout int Specifies the timeout of a batch [optional] [Back to Model list] [Back to API list] [Back to README]","title":"V1beta1Batcher"},{"location":"sdk_docs/docs/V1beta1Batcher/#v1beta1batcher","text":"Batcher specifies optional payload batching available for all components","title":"V1beta1Batcher"},{"location":"sdk_docs/docs/V1beta1Batcher/#properties","text":"Name Type Description Notes max_batch_size int Specifies the max number of requests to trigger a batch [optional] max_latency int Specifies the max latency to trigger a batch [optional] timeout int Specifies the timeout of a batch [optional] [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/V1beta1ComponentExtensionSpec/","text":"V1beta1ComponentExtensionSpec \u00b6 ComponentExtensionSpec defines the deployment configuration for a given InferenceService component Properties \u00b6 Name Type Description Notes batcher V1beta1Batcher [optional] canary_traffic_percent int CanaryTrafficPercent defines the traffic split percentage between the candidate revision and the last ready revision [optional] container_concurrency int ContainerConcurrency specifies how many requests can be processed concurrently, this sets the hard limit of the container concurrency(https://knative.dev/docs/serving/autoscaling/concurrency). [optional] logger V1beta1LoggerSpec [optional] max_replicas int Maximum number of replicas for autoscaling. 
[optional] min_replicas int Minimum number of replicas, defaults to 1 but can be set to 0 to enable scale-to-zero. [optional] scale_metric str ScaleMetric defines the scaling metric type watched by autoscaler possible values are concurrency, rps, cpu, memory. concurrency, rps are supported via Knative Pod Autoscaler(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics). [optional] scale_target int ScaleTarget specifies the integer target value of the metric type the Autoscaler watches for. concurrency and rps targets are supported by Knative Pod Autoscaler (https://knative.dev/docs/serving/autoscaling/autoscaling-targets/). [optional] timeout int TimeoutSeconds specifies the number of seconds to wait before timing out a request to the component. [optional] [Back to Model list] [Back to API list] [Back to README]","title":"V1beta1ComponentExtensionSpec"},{"location":"sdk_docs/docs/V1beta1ComponentExtensionSpec/#v1beta1componentextensionspec","text":"ComponentExtensionSpec defines the deployment configuration for a given InferenceService component","title":"V1beta1ComponentExtensionSpec"},{"location":"sdk_docs/docs/V1beta1ComponentExtensionSpec/#properties","text":"Name Type Description Notes batcher V1beta1Batcher [optional] canary_traffic_percent int CanaryTrafficPercent defines the traffic split percentage between the candidate revision and the last ready revision [optional] container_concurrency int ContainerConcurrency specifies how many requests can be processed concurrently, this sets the hard limit of the container concurrency(https://knative.dev/docs/serving/autoscaling/concurrency). [optional] logger V1beta1LoggerSpec [optional] max_replicas int Maximum number of replicas for autoscaling. [optional] min_replicas int Minimum number of replicas, defaults to 1 but can be set to 0 to enable scale-to-zero. [optional] scale_metric str ScaleMetric defines the scaling metric type watched by autoscaler possible values are concurrency, rps, cpu, memory. 
concurrency, rps are supported via Knative Pod Autoscaler(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics). [optional] scale_target int ScaleTarget specifies the integer target value of the metric type the Autoscaler watches for. concurrency and rps targets are supported by Knative Pod Autoscaler (https://knative.dev/docs/serving/autoscaling/autoscaling-targets/). [optional] timeout int TimeoutSeconds specifies the number of seconds to wait before timing out a request to the component. [optional] [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/V1beta1ComponentStatusSpec/","text":"V1beta1ComponentStatusSpec \u00b6 ComponentStatusSpec describes the state of the component Properties \u00b6 Name Type Description Notes address KnativeAddressable [optional] grpc_url KnativeURL [optional] latest_created_revision str Latest revision name that is created [optional] latest_ready_revision str Latest revision name that is in ready state [optional] latest_rolledout_revision str Latest revision name that is rolled out with 100 percent traffic [optional] previous_rolledout_revision str Previous revision name that is rolled out with 100 percent traffic [optional] rest_url KnativeURL [optional] traffic list[KnativeDevServingPkgApisServingV1TrafficTarget] Traffic holds the configured traffic distribution for latest ready revision and previous rolled out revision. 
[optional] url KnativeURL [optional] [Back to Model list] [Back to API list] [Back to README]","title":"V1beta1ComponentStatusSpec"},{"location":"sdk_docs/docs/V1beta1ComponentStatusSpec/#v1beta1componentstatusspec","text":"ComponentStatusSpec describes the state of the component","title":"V1beta1ComponentStatusSpec"},{"location":"sdk_docs/docs/V1beta1ComponentStatusSpec/#properties","text":"Name Type Description Notes address KnativeAddressable [optional] grpc_url KnativeURL [optional] latest_created_revision str Latest revision name that is created [optional] latest_ready_revision str Latest revision name that is in ready state [optional] latest_rolledout_revision str Latest revision name that is rolled out with 100 percent traffic [optional] previous_rolledout_revision str Previous revision name that is rolled out with 100 percent traffic [optional] rest_url KnativeURL [optional] traffic list[KnativeDevServingPkgApisServingV1TrafficTarget] Traffic holds the configured traffic distribution for latest ready revision and previous rolled out revision. [optional] url KnativeURL [optional] [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/V1beta1CustomExplainer/","text":"V1beta1CustomExplainer \u00b6 CustomExplainer defines arguments for configuring a custom explainer. Properties \u00b6 Name Type Description Notes active_deadline_seconds int Optional duration in seconds the pod may be active on the node relative to StartTime before the system will actively try to mark it failed and kill associated containers. Value must be a positive integer. [optional] affinity V1Affinity [optional] automount_service_account_token bool AutomountServiceAccountToken indicates whether a service account token should be automatically mounted. [optional] containers list[V1Container] List of containers belonging to the pod. Containers cannot currently be added or removed. There must be at least one container in a Pod. 
Cannot be updated. dns_config V1PodDNSConfig [optional] dns_policy str Set DNS policy for the pod. Defaults to "ClusterFirst". Valid values are 'ClusterFirstWithHostNet', 'ClusterFirst', 'Default' or 'None'. DNS parameters given in DNSConfig will be merged with the policy selected with DNSPolicy. To have DNS options set along with hostNetwork, you have to specify DNS policy explicitly to 'ClusterFirstWithHostNet'. [optional] enable_service_links bool EnableServiceLinks indicates whether information about services should be injected into pod's environment variables, matching the syntax of Docker links. Optional: Defaults to true. [optional] ephemeral_containers list[V1EphemeralContainer] List of ephemeral containers run in this pod. Ephemeral containers may be run in an existing pod to perform user-initiated actions such as debugging. This list cannot be specified when creating a pod, and it cannot be modified by updating the pod spec. In order to add an ephemeral container to an existing pod, use the pod's ephemeralcontainers subresource. This field is alpha-level and is only honored by servers that enable the EphemeralContainers feature. [optional] host_aliases list[V1HostAlias] HostAliases is an optional list of hosts and IPs that will be injected into the pod's hosts file if specified. This is only valid for non-hostNetwork pods. [optional] host_ipc bool Use the host's ipc namespace. Optional: Default to false. [optional] host_network bool Host networking requested for this pod. Use the host's network namespace. If this option is set, the ports that will be used must be specified. Default to false. [optional] host_pid bool Use the host's pid namespace. Optional: Default to false. [optional] hostname str Specifies the hostname of the Pod If not specified, the pod's hostname will be set to a system-defined value. 
[optional] image_pull_secrets list[V1LocalObjectReference] ImagePullSecrets is an optional list of references to secrets in the same namespace to use for pulling any of the images used by this PodSpec. If specified, these secrets will be passed to individual puller implementations for them to use. For example, in the case of docker, only DockerConfig type secrets are honored. More info: https://kubernetes.io/docs/concepts/containers/images#specifying-imagepullsecrets-on-a-pod [optional] init_containers list[V1Container] List of initialization containers belonging to the pod. Init containers are executed in order prior to containers being started. If any init container fails, the pod is considered to have failed and is handled according to its restartPolicy. The name for an init container or normal container must be unique among all containers. Init containers may not have Lifecycle actions, Readiness probes, Liveness probes, or Startup probes. The resourceRequirements of an init container are taken into account during scheduling by finding the highest request/limit for each resource type, and then using the max of of that value or the sum of the normal containers. Limits are applied to init containers in a similar fashion. Init containers cannot currently be added or removed. Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/init-containers/ [optional] node_name str NodeName is a request to schedule this pod onto a specific node. If it is non-empty, the scheduler simply schedules this pod onto that node, assuming that it fits resource requirements. [optional] node_selector dict(str, str) NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node's labels for the pod to be scheduled on that node. 
More info: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ [optional] overhead dict(str, ResourceQuantity) Overhead represents the resource overhead associated with running a pod for a given RuntimeClass. This field will be autopopulated at admission time by the RuntimeClass admission controller. If the RuntimeClass admission controller is enabled, overhead must not be set in Pod create requests. The RuntimeClass admission controller will reject Pod create requests which have the overhead already set. If RuntimeClass is configured and selected in the PodSpec, Overhead will be set to the value defined in the corresponding RuntimeClass, otherwise it will remain unset and treated as zero. More info: https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/688-pod-overhead This field is beta-level as of Kubernetes v1.18, and is only honored by servers that enable the PodOverhead feature. [optional] preemption_policy str PreemptionPolicy is the Policy for preempting pods with lower priority. One of Never, PreemptLowerPriority. Defaults to PreemptLowerPriority if unset. This field is beta-level, gated by the NonPreemptingPriority feature-gate. [optional] priority int The priority value. Various system components use this field to find the priority of the pod. When Priority Admission Controller is enabled, it prevents users from setting this field. The admission controller populates this field from PriorityClassName. The higher the value, the higher the priority. [optional] priority_class_name str If specified, indicates the pod's priority. "system-node-critical" and "system-cluster-critical" are two special keywords which indicate the highest priorities with the former being the highest priority. Any other name must be defined by creating a PriorityClass object with that name. If not specified, the pod priority will be default or zero if there is no default. 
[optional] readiness_gates list[V1PodReadinessGate] If specified, all readiness gates will be evaluated for pod readiness. A pod is ready when all its containers are ready AND all conditions specified in the readiness gates have status equal to "True" More info: https://git.k8s.io/enhancements/keps/sig-network/580-pod-readiness-gates [optional] restart_policy str Restart policy for all containers within the pod. One of Always, OnFailure, Never. Default to Always. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#restart-policy [optional] runtime_class_name str RuntimeClassName refers to a RuntimeClass object in the node.k8s.io group, which should be used to run this pod. If no RuntimeClass resource matches the named class, the pod will not be run. If unset or empty, the "legacy" RuntimeClass will be used, which is an implicit class with an empty definition that uses the default runtime handler. More info: https://git.k8s.io/enhancements/keps/sig-node/585-runtime-class This is a beta feature as of Kubernetes v1.14. [optional] scheduler_name str If specified, the pod will be dispatched by specified scheduler. If not specified, the pod will be dispatched by default scheduler. [optional] security_context V1PodSecurityContext [optional] service_account str DeprecatedServiceAccount is a depreciated alias for ServiceAccountName. Deprecated: Use serviceAccountName instead. [optional] service_account_name str ServiceAccountName is the name of the ServiceAccount to use to run this pod. More info: https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/ [optional] set_hostname_as_fqdn bool If true the pod's hostname will be configured as the pod's FQDN, rather than the leaf name (the default). In Linux containers, this means setting the FQDN in the hostname field of the kernel (the nodename field of struct utsname). 
In Windows containers, this means setting the registry value of hostname for the registry key HKEY_LOCAL_MACHINE\\SYSTEM\\CurrentControlSet\\Services\\Tcpip\\Parameters to FQDN. If a pod does not have FQDN, this has no effect. Default to false. [optional] share_process_namespace bool Share a single process namespace between all of the containers in a pod. When this is set containers will be able to view and signal processes from other containers in the same pod, and the first process in each container will not be assigned PID 1. HostPID and ShareProcessNamespace cannot both be set. Optional: Default to false. [optional] subdomain str If specified, the fully qualified Pod hostname will be "<hostname>.<subdomain>.<pod namespace>.svc.<cluster domain>". If not specified, the pod will not have a domainname at all. [optional] termination_grace_period_seconds int Optional duration in seconds the pod needs to terminate gracefully. May be decreased in delete request. Value must be non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). If this value is nil, the default grace period will be used instead. The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. Defaults to 30 seconds. [optional] tolerations list[V1Toleration] If specified, the pod's tolerations. [optional] topology_spread_constraints list[V1TopologySpreadConstraint] TopologySpreadConstraints describes how a group of pods ought to spread across topology domains. Scheduler will schedule pods in a way which abides by the constraints. All topologySpreadConstraints are ANDed. [optional] volumes list[V1Volume] List of volumes that can be mounted by containers belonging to the pod.
More info: https://kubernetes.io/docs/concepts/storage/volumes [optional] [Back to Model list] [Back to API list] [Back to README]","title":"V1beta1CustomExplainer"},{"location":"sdk_docs/docs/V1beta1CustomExplainer/#v1beta1customexplainer","text":"CustomExplainer defines arguments for configuring a custom explainer.","title":"V1beta1CustomExplainer"},{"location":"sdk_docs/docs/V1beta1CustomExplainer/#properties","text":"Name Type Description Notes active_deadline_seconds int Optional duration in seconds the pod may be active on the node relative to StartTime before the system will actively try to mark it failed and kill associated containers. Value must be a positive integer. [optional] affinity V1Affinity [optional] automount_service_account_token bool AutomountServiceAccountToken indicates whether a service account token should be automatically mounted. [optional] containers list[V1Container] List of containers belonging to the pod. Containers cannot currently be added or removed. There must be at least one container in a Pod. Cannot be updated. dns_config V1PodDNSConfig [optional] dns_policy str Set DNS policy for the pod. Defaults to "ClusterFirst". Valid values are 'ClusterFirstWithHostNet', 'ClusterFirst', 'Default' or 'None'. DNS parameters given in DNSConfig will be merged with the policy selected with DNSPolicy. To have DNS options set along with hostNetwork, you have to specify DNS policy explicitly to 'ClusterFirstWithHostNet'. [optional] enable_service_links bool EnableServiceLinks indicates whether information about services should be injected into pod's environment variables, matching the syntax of Docker links. Optional: Defaults to true. [optional] ephemeral_containers list[V1EphemeralContainer] List of ephemeral containers run in this pod. Ephemeral containers may be run in an existing pod to perform user-initiated actions such as debugging. This list cannot be specified when creating a pod, and it cannot be modified by updating the pod spec. 
In order to add an ephemeral container to an existing pod, use the pod's ephemeralcontainers subresource. This field is alpha-level and is only honored by servers that enable the EphemeralContainers feature. [optional] host_aliases list[V1HostAlias] HostAliases is an optional list of hosts and IPs that will be injected into the pod's hosts file if specified. This is only valid for non-hostNetwork pods. [optional] host_ipc bool Use the host's ipc namespace. Optional: Default to false. [optional] host_network bool Host networking requested for this pod. Use the host's network namespace. If this option is set, the ports that will be used must be specified. Default to false. [optional] host_pid bool Use the host's pid namespace. Optional: Default to false. [optional] hostname str Specifies the hostname of the Pod. If not specified, the pod's hostname will be set to a system-defined value. [optional] image_pull_secrets list[V1LocalObjectReference] ImagePullSecrets is an optional list of references to secrets in the same namespace to use for pulling any of the images used by this PodSpec. If specified, these secrets will be passed to individual puller implementations for them to use. For example, in the case of docker, only DockerConfig type secrets are honored. More info: https://kubernetes.io/docs/concepts/containers/images#specifying-imagepullsecrets-on-a-pod [optional] init_containers list[V1Container] List of initialization containers belonging to the pod. Init containers are executed in order prior to containers being started. If any init container fails, the pod is considered to have failed and is handled according to its restartPolicy. The name for an init container or normal container must be unique among all containers. Init containers may not have Lifecycle actions, Readiness probes, Liveness probes, or Startup probes.
The resourceRequirements of an init container are taken into account during scheduling by finding the highest request/limit for each resource type, and then using the max of that value or the sum of the normal containers. Limits are applied to init containers in a similar fashion. Init containers cannot currently be added or removed. Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/init-containers/ [optional] node_name str NodeName is a request to schedule this pod onto a specific node. If it is non-empty, the scheduler simply schedules this pod onto that node, assuming that it fits resource requirements. [optional] node_selector dict(str, str) NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node's labels for the pod to be scheduled on that node. More info: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ [optional] overhead dict(str, ResourceQuantity) Overhead represents the resource overhead associated with running a pod for a given RuntimeClass. This field will be autopopulated at admission time by the RuntimeClass admission controller. If the RuntimeClass admission controller is enabled, overhead must not be set in Pod create requests. The RuntimeClass admission controller will reject Pod create requests which have the overhead already set. If RuntimeClass is configured and selected in the PodSpec, Overhead will be set to the value defined in the corresponding RuntimeClass, otherwise it will remain unset and treated as zero. More info: https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/688-pod-overhead This field is beta-level as of Kubernetes v1.18, and is only honored by servers that enable the PodOverhead feature. [optional] preemption_policy str PreemptionPolicy is the Policy for preempting pods with lower priority. One of Never, PreemptLowerPriority. Defaults to PreemptLowerPriority if unset.
This field is beta-level, gated by the NonPreemptingPriority feature-gate. [optional] priority int The priority value. Various system components use this field to find the priority of the pod. When Priority Admission Controller is enabled, it prevents users from setting this field. The admission controller populates this field from PriorityClassName. The higher the value, the higher the priority. [optional] priority_class_name str If specified, indicates the pod's priority. "system-node-critical" and "system-cluster-critical" are two special keywords which indicate the highest priorities with the former being the highest priority. Any other name must be defined by creating a PriorityClass object with that name. If not specified, the pod priority will be default or zero if there is no default. [optional] readiness_gates list[V1PodReadinessGate] If specified, all readiness gates will be evaluated for pod readiness. A pod is ready when all its containers are ready AND all conditions specified in the readiness gates have status equal to "True" More info: https://git.k8s.io/enhancements/keps/sig-network/580-pod-readiness-gates [optional] restart_policy str Restart policy for all containers within the pod. One of Always, OnFailure, Never. Default to Always. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#restart-policy [optional] runtime_class_name str RuntimeClassName refers to a RuntimeClass object in the node.k8s.io group, which should be used to run this pod. If no RuntimeClass resource matches the named class, the pod will not be run. If unset or empty, the "legacy" RuntimeClass will be used, which is an implicit class with an empty definition that uses the default runtime handler. More info: https://git.k8s.io/enhancements/keps/sig-node/585-runtime-class This is a beta feature as of Kubernetes v1.14. [optional] scheduler_name str If specified, the pod will be dispatched by specified scheduler. 
If not specified, the pod will be dispatched by default scheduler. [optional] security_context V1PodSecurityContext [optional] service_account str DeprecatedServiceAccount is a deprecated alias for ServiceAccountName. Deprecated: Use serviceAccountName instead. [optional] service_account_name str ServiceAccountName is the name of the ServiceAccount to use to run this pod. More info: https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/ [optional] set_hostname_as_fqdn bool If true the pod's hostname will be configured as the pod's FQDN, rather than the leaf name (the default). In Linux containers, this means setting the FQDN in the hostname field of the kernel (the nodename field of struct utsname). In Windows containers, this means setting the registry value of hostname for the registry key HKEY_LOCAL_MACHINE\\SYSTEM\\CurrentControlSet\\Services\\Tcpip\\Parameters to FQDN. If a pod does not have FQDN, this has no effect. Default to false. [optional] share_process_namespace bool Share a single process namespace between all of the containers in a pod. When this is set containers will be able to view and signal processes from other containers in the same pod, and the first process in each container will not be assigned PID 1. HostPID and ShareProcessNamespace cannot both be set. Optional: Default to false. [optional] subdomain str If specified, the fully qualified Pod hostname will be "<hostname>.<subdomain>.<pod namespace>.svc.<cluster domain>". If not specified, the pod will not have a domainname at all. [optional] termination_grace_period_seconds int Optional duration in seconds the pod needs to terminate gracefully. May be decreased in delete request. Value must be non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). If this value is nil, the default grace period will be used instead.
The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. Defaults to 30 seconds. [optional] tolerations list[V1Toleration] If specified, the pod's tolerations. [optional] topology_spread_constraints list[V1TopologySpreadConstraint] TopologySpreadConstraints describes how a group of pods ought to spread across topology domains. Scheduler will schedule pods in a way which abides by the constraints. All topologySpreadConstraints are ANDed. [optional] volumes list[V1Volume] List of volumes that can be mounted by containers belonging to the pod. More info: https://kubernetes.io/docs/concepts/storage/volumes [optional] [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/V1beta1CustomPredictor/","text":"V1beta1CustomPredictor \u00b6 CustomPredictor defines arguments for configuring a custom server. Properties \u00b6 Name Type Description Notes active_deadline_seconds int Optional duration in seconds the pod may be active on the node relative to StartTime before the system will actively try to mark it failed and kill associated containers. Value must be a positive integer. [optional] affinity V1Affinity [optional] automount_service_account_token bool AutomountServiceAccountToken indicates whether a service account token should be automatically mounted. [optional] containers list[V1Container] List of containers belonging to the pod. Containers cannot currently be added or removed. There must be at least one container in a Pod. Cannot be updated. dns_config V1PodDNSConfig [optional] dns_policy str Set DNS policy for the pod. Defaults to "ClusterFirst". Valid values are 'ClusterFirstWithHostNet', 'ClusterFirst', 'Default' or 'None'. DNS parameters given in DNSConfig will be merged with the policy selected with DNSPolicy. 
To have DNS options set along with hostNetwork, you have to specify DNS policy explicitly to 'ClusterFirstWithHostNet'. [optional] enable_service_links bool EnableServiceLinks indicates whether information about services should be injected into pod's environment variables, matching the syntax of Docker links. Optional: Defaults to true. [optional] ephemeral_containers list[V1EphemeralContainer] List of ephemeral containers run in this pod. Ephemeral containers may be run in an existing pod to perform user-initiated actions such as debugging. This list cannot be specified when creating a pod, and it cannot be modified by updating the pod spec. In order to add an ephemeral container to an existing pod, use the pod's ephemeralcontainers subresource. This field is alpha-level and is only honored by servers that enable the EphemeralContainers feature. [optional] host_aliases list[V1HostAlias] HostAliases is an optional list of hosts and IPs that will be injected into the pod's hosts file if specified. This is only valid for non-hostNetwork pods. [optional] host_ipc bool Use the host's ipc namespace. Optional: Default to false. [optional] host_network bool Host networking requested for this pod. Use the host's network namespace. If this option is set, the ports that will be used must be specified. Default to false. [optional] host_pid bool Use the host's pid namespace. Optional: Default to false. [optional] hostname str Specifies the hostname of the Pod If not specified, the pod's hostname will be set to a system-defined value. [optional] image_pull_secrets list[V1LocalObjectReference] ImagePullSecrets is an optional list of references to secrets in the same namespace to use for pulling any of the images used by this PodSpec. If specified, these secrets will be passed to individual puller implementations for them to use. For example, in the case of docker, only DockerConfig type secrets are honored. 
More info: https://kubernetes.io/docs/concepts/containers/images#specifying-imagepullsecrets-on-a-pod [optional] init_containers list[V1Container] List of initialization containers belonging to the pod. Init containers are executed in order prior to containers being started. If any init container fails, the pod is considered to have failed and is handled according to its restartPolicy. The name for an init container or normal container must be unique among all containers. Init containers may not have Lifecycle actions, Readiness probes, Liveness probes, or Startup probes. The resourceRequirements of an init container are taken into account during scheduling by finding the highest request/limit for each resource type, and then using the max of that value or the sum of the normal containers. Limits are applied to init containers in a similar fashion. Init containers cannot currently be added or removed. Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/init-containers/ [optional] node_name str NodeName is a request to schedule this pod onto a specific node. If it is non-empty, the scheduler simply schedules this pod onto that node, assuming that it fits resource requirements. [optional] node_selector dict(str, str) NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node's labels for the pod to be scheduled on that node. More info: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ [optional] overhead dict(str, ResourceQuantity) Overhead represents the resource overhead associated with running a pod for a given RuntimeClass. This field will be autopopulated at admission time by the RuntimeClass admission controller. If the RuntimeClass admission controller is enabled, overhead must not be set in Pod create requests. The RuntimeClass admission controller will reject Pod create requests which have the overhead already set.
If RuntimeClass is configured and selected in the PodSpec, Overhead will be set to the value defined in the corresponding RuntimeClass, otherwise it will remain unset and treated as zero. More info: https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/688-pod-overhead This field is beta-level as of Kubernetes v1.18, and is only honored by servers that enable the PodOverhead feature. [optional] preemption_policy str PreemptionPolicy is the Policy for preempting pods with lower priority. One of Never, PreemptLowerPriority. Defaults to PreemptLowerPriority if unset. This field is beta-level, gated by the NonPreemptingPriority feature-gate. [optional] priority int The priority value. Various system components use this field to find the priority of the pod. When Priority Admission Controller is enabled, it prevents users from setting this field. The admission controller populates this field from PriorityClassName. The higher the value, the higher the priority. [optional] priority_class_name str If specified, indicates the pod's priority. "system-node-critical" and "system-cluster-critical" are two special keywords which indicate the highest priorities with the former being the highest priority. Any other name must be defined by creating a PriorityClass object with that name. If not specified, the pod priority will be default or zero if there is no default. [optional] readiness_gates list[V1PodReadinessGate] If specified, all readiness gates will be evaluated for pod readiness. A pod is ready when all its containers are ready AND all conditions specified in the readiness gates have status equal to "True" More info: https://git.k8s.io/enhancements/keps/sig-network/580-pod-readiness-gates [optional] restart_policy str Restart policy for all containers within the pod. One of Always, OnFailure, Never. Default to Always. 
More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#restart-policy [optional] runtime_class_name str RuntimeClassName refers to a RuntimeClass object in the node.k8s.io group, which should be used to run this pod. If no RuntimeClass resource matches the named class, the pod will not be run. If unset or empty, the "legacy" RuntimeClass will be used, which is an implicit class with an empty definition that uses the default runtime handler. More info: https://git.k8s.io/enhancements/keps/sig-node/585-runtime-class This is a beta feature as of Kubernetes v1.14. [optional] scheduler_name str If specified, the pod will be dispatched by specified scheduler. If not specified, the pod will be dispatched by default scheduler. [optional] security_context V1PodSecurityContext [optional] service_account str DeprecatedServiceAccount is a deprecated alias for ServiceAccountName. Deprecated: Use serviceAccountName instead. [optional] service_account_name str ServiceAccountName is the name of the ServiceAccount to use to run this pod. More info: https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/ [optional] set_hostname_as_fqdn bool If true the pod's hostname will be configured as the pod's FQDN, rather than the leaf name (the default). In Linux containers, this means setting the FQDN in the hostname field of the kernel (the nodename field of struct utsname). In Windows containers, this means setting the registry value of hostname for the registry key HKEY_LOCAL_MACHINE\\SYSTEM\\CurrentControlSet\\Services\\Tcpip\\Parameters to FQDN. If a pod does not have FQDN, this has no effect. Default to false. [optional] share_process_namespace bool Share a single process namespace between all of the containers in a pod. When this is set containers will be able to view and signal processes from other containers in the same pod, and the first process in each container will not be assigned PID 1.
HostPID and ShareProcessNamespace cannot both be set. Optional: Default to false. [optional] subdomain str If specified, the fully qualified Pod hostname will be "<hostname>.<subdomain>.<pod namespace>.svc.<cluster domain>". If not specified, the pod will not have a domainname at all. [optional] termination_grace_period_seconds int Optional duration in seconds the pod needs to terminate gracefully. May be decreased in delete request. Value must be non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). If this value is nil, the default grace period will be used instead. The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. Defaults to 30 seconds. [optional] tolerations list[V1Toleration] If specified, the pod's tolerations. [optional] topology_spread_constraints list[V1TopologySpreadConstraint] TopologySpreadConstraints describes how a group of pods ought to spread across topology domains. Scheduler will schedule pods in a way which abides by the constraints. All topologySpreadConstraints are ANDed. [optional] volumes list[V1Volume] List of volumes that can be mounted by containers belonging to the pod. More info: https://kubernetes.io/docs/concepts/storage/volumes [optional] [Back to Model list] [Back to API list] [Back to README]","title":"V1beta1CustomPredictor"},{"location":"sdk_docs/docs/V1beta1CustomPredictor/#v1beta1custompredictor","text":"CustomPredictor defines arguments for configuring a custom server.","title":"V1beta1CustomPredictor"},{"location":"sdk_docs/docs/V1beta1CustomPredictor/#properties","text":"Name Type Description Notes active_deadline_seconds int Optional duration in seconds the pod may be active on the node relative to StartTime before the system will actively try to mark it failed and kill associated containers.
Value must be a positive integer. [optional] affinity V1Affinity [optional] automount_service_account_token bool AutomountServiceAccountToken indicates whether a service account token should be automatically mounted. [optional] containers list[V1Container] List of containers belonging to the pod. Containers cannot currently be added or removed. There must be at least one container in a Pod. Cannot be updated. dns_config V1PodDNSConfig [optional] dns_policy str Set DNS policy for the pod. Defaults to "ClusterFirst". Valid values are 'ClusterFirstWithHostNet', 'ClusterFirst', 'Default' or 'None'. DNS parameters given in DNSConfig will be merged with the policy selected with DNSPolicy. To have DNS options set along with hostNetwork, you have to specify DNS policy explicitly to 'ClusterFirstWithHostNet'. [optional] enable_service_links bool EnableServiceLinks indicates whether information about services should be injected into pod's environment variables, matching the syntax of Docker links. Optional: Defaults to true. [optional] ephemeral_containers list[V1EphemeralContainer] List of ephemeral containers run in this pod. Ephemeral containers may be run in an existing pod to perform user-initiated actions such as debugging. This list cannot be specified when creating a pod, and it cannot be modified by updating the pod spec. In order to add an ephemeral container to an existing pod, use the pod's ephemeralcontainers subresource. This field is alpha-level and is only honored by servers that enable the EphemeralContainers feature. [optional] host_aliases list[V1HostAlias] HostAliases is an optional list of hosts and IPs that will be injected into the pod's hosts file if specified. This is only valid for non-hostNetwork pods. [optional] host_ipc bool Use the host's ipc namespace. Optional: Default to false. [optional] host_network bool Host networking requested for this pod. Use the host's network namespace. 
If this option is set, the ports that will be used must be specified. Default to false. [optional] host_pid bool Use the host's pid namespace. Optional: Default to false. [optional] hostname str Specifies the hostname of the Pod. If not specified, the pod's hostname will be set to a system-defined value. [optional] image_pull_secrets list[V1LocalObjectReference] ImagePullSecrets is an optional list of references to secrets in the same namespace to use for pulling any of the images used by this PodSpec. If specified, these secrets will be passed to individual puller implementations for them to use. For example, in the case of docker, only DockerConfig type secrets are honored. More info: https://kubernetes.io/docs/concepts/containers/images#specifying-imagepullsecrets-on-a-pod [optional] init_containers list[V1Container] List of initialization containers belonging to the pod. Init containers are executed in order prior to containers being started. If any init container fails, the pod is considered to have failed and is handled according to its restartPolicy. The name for an init container or normal container must be unique among all containers. Init containers may not have Lifecycle actions, Readiness probes, Liveness probes, or Startup probes. The resourceRequirements of an init container are taken into account during scheduling by finding the highest request/limit for each resource type, and then using the max of that value or the sum of the normal containers. Limits are applied to init containers in a similar fashion. Init containers cannot currently be added or removed. Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/init-containers/ [optional] node_name str NodeName is a request to schedule this pod onto a specific node. If it is non-empty, the scheduler simply schedules this pod onto that node, assuming that it fits resource requirements.
[optional] node_selector dict(str, str) NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node's labels for the pod to be scheduled on that node. More info: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ [optional] overhead dict(str, ResourceQuantity) Overhead represents the resource overhead associated with running a pod for a given RuntimeClass. This field will be autopopulated at admission time by the RuntimeClass admission controller. If the RuntimeClass admission controller is enabled, overhead must not be set in Pod create requests. The RuntimeClass admission controller will reject Pod create requests which have the overhead already set. If RuntimeClass is configured and selected in the PodSpec, Overhead will be set to the value defined in the corresponding RuntimeClass, otherwise it will remain unset and treated as zero. More info: https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/688-pod-overhead This field is beta-level as of Kubernetes v1.18, and is only honored by servers that enable the PodOverhead feature. [optional] preemption_policy str PreemptionPolicy is the Policy for preempting pods with lower priority. One of Never, PreemptLowerPriority. Defaults to PreemptLowerPriority if unset. This field is beta-level, gated by the NonPreemptingPriority feature-gate. [optional] priority int The priority value. Various system components use this field to find the priority of the pod. When Priority Admission Controller is enabled, it prevents users from setting this field. The admission controller populates this field from PriorityClassName. The higher the value, the higher the priority. [optional] priority_class_name str If specified, indicates the pod's priority. "system-node-critical" and "system-cluster-critical" are two special keywords which indicate the highest priorities with the former being the highest priority. 
Any other name must be defined by creating a PriorityClass object with that name. If not specified, the pod priority will be default or zero if there is no default. [optional] readiness_gates list[V1PodReadinessGate] If specified, all readiness gates will be evaluated for pod readiness. A pod is ready when all its containers are ready AND all conditions specified in the readiness gates have status equal to "True". More info: https://git.k8s.io/enhancements/keps/sig-network/580-pod-readiness-gates [optional] restart_policy str Restart policy for all containers within the pod. One of Always, OnFailure, Never. Default to Always. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#restart-policy [optional] runtime_class_name str RuntimeClassName refers to a RuntimeClass object in the node.k8s.io group, which should be used to run this pod. If no RuntimeClass resource matches the named class, the pod will not be run. If unset or empty, the "legacy" RuntimeClass will be used, which is an implicit class with an empty definition that uses the default runtime handler. More info: https://git.k8s.io/enhancements/keps/sig-node/585-runtime-class This is a beta feature as of Kubernetes v1.14. [optional] scheduler_name str If specified, the pod will be dispatched by specified scheduler. If not specified, the pod will be dispatched by default scheduler. [optional] security_context V1PodSecurityContext [optional] service_account str DeprecatedServiceAccount is a deprecated alias for ServiceAccountName. Deprecated: Use serviceAccountName instead. [optional] service_account_name str ServiceAccountName is the name of the ServiceAccount to use to run this pod. More info: https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/ [optional] set_hostname_as_fqdn bool If true the pod's hostname will be configured as the pod's FQDN, rather than the leaf name (the default).
In Linux containers, this means setting the FQDN in the hostname field of the kernel (the nodename field of struct utsname). In Windows containers, this means setting the registry value of hostname for the registry key HKEY_LOCAL_MACHINE\\SYSTEM\\CurrentControlSet\\Services\\Tcpip\\Parameters to FQDN. If a pod does not have FQDN, this has no effect. Default to false. [optional] share_process_namespace bool Share a single process namespace between all of the containers in a pod. When this is set containers will be able to view and signal processes from other containers in the same pod, and the first process in each container will not be assigned PID 1. HostPID and ShareProcessNamespace cannot both be set. Optional: Default to false. [optional] subdomain str If specified, the fully qualified Pod hostname will be "<hostname>.<subdomain>.<pod namespace>.svc.<cluster domain>". If not specified, the pod will not have a domainname at all. [optional] termination_grace_period_seconds int Optional duration in seconds the pod needs to terminate gracefully. May be decreased in delete request. Value must be non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). If this value is nil, the default grace period will be used instead. The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. Defaults to 30 seconds. [optional] tolerations list[V1Toleration] If specified, the pod's tolerations. [optional] topology_spread_constraints list[V1TopologySpreadConstraint] TopologySpreadConstraints describes how a group of pods ought to spread across topology domains. Scheduler will schedule pods in a way which abides by the constraints. All topologySpreadConstraints are ANDed. [optional] volumes list[V1Volume] List of volumes that can be mounted by containers belonging to the pod.
More info: https://kubernetes.io/docs/concepts/storage/volumes [optional] [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/V1beta1CustomTransformer/","text":"V1beta1CustomTransformer \u00b6 CustomTransformer defines arguments for configuring a custom transformer. Properties \u00b6 Name Type Description Notes active_deadline_seconds int Optional duration in seconds the pod may be active on the node relative to StartTime before the system will actively try to mark it failed and kill associated containers. Value must be a positive integer. [optional] affinity V1Affinity [optional] automount_service_account_token bool AutomountServiceAccountToken indicates whether a service account token should be automatically mounted. [optional] containers list[V1Container] List of containers belonging to the pod. Containers cannot currently be added or removed. There must be at least one container in a Pod. Cannot be updated. dns_config V1PodDNSConfig [optional] dns_policy str Set DNS policy for the pod. Defaults to "ClusterFirst". Valid values are 'ClusterFirstWithHostNet', 'ClusterFirst', 'Default' or 'None'. DNS parameters given in DNSConfig will be merged with the policy selected with DNSPolicy. To have DNS options set along with hostNetwork, you have to specify DNS policy explicitly to 'ClusterFirstWithHostNet'. [optional] enable_service_links bool EnableServiceLinks indicates whether information about services should be injected into pod's environment variables, matching the syntax of Docker links. Optional: Defaults to true. [optional] ephemeral_containers list[V1EphemeralContainer] List of ephemeral containers run in this pod. Ephemeral containers may be run in an existing pod to perform user-initiated actions such as debugging. This list cannot be specified when creating a pod, and it cannot be modified by updating the pod spec. 
In order to add an ephemeral container to an existing pod, use the pod's ephemeralcontainers subresource. This field is alpha-level and is only honored by servers that enable the EphemeralContainers feature. [optional] host_aliases list[V1HostAlias] HostAliases is an optional list of hosts and IPs that will be injected into the pod's hosts file if specified. This is only valid for non-hostNetwork pods. [optional] host_ipc bool Use the host's ipc namespace. Optional: Default to false. [optional] host_network bool Host networking requested for this pod. Use the host's network namespace. If this option is set, the ports that will be used must be specified. Default to false. [optional] host_pid bool Use the host's pid namespace. Optional: Default to false. [optional] hostname str Specifies the hostname of the Pod. If not specified, the pod's hostname will be set to a system-defined value. [optional] image_pull_secrets list[V1LocalObjectReference] ImagePullSecrets is an optional list of references to secrets in the same namespace to use for pulling any of the images used by this PodSpec. If specified, these secrets will be passed to individual puller implementations for them to use. For example, in the case of docker, only DockerConfig type secrets are honored. More info: https://kubernetes.io/docs/concepts/containers/images#specifying-imagepullsecrets-on-a-pod [optional] init_containers list[V1Container] List of initialization containers belonging to the pod. Init containers are executed in order prior to containers being started. If any init container fails, the pod is considered to have failed and is handled according to its restartPolicy. The name for an init container or normal container must be unique among all containers. Init containers may not have Lifecycle actions, Readiness probes, Liveness probes, or Startup probes. 
The resourceRequirements of an init container are taken into account during scheduling by finding the highest request/limit for each resource type, and then using the max of that value or the sum of the normal containers. Limits are applied to init containers in a similar fashion. Init containers cannot currently be added or removed. Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/init-containers/ [optional] node_name str NodeName is a request to schedule this pod onto a specific node. If it is non-empty, the scheduler simply schedules this pod onto that node, assuming that it fits resource requirements. [optional] node_selector dict(str, str) NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node's labels for the pod to be scheduled on that node. More info: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ [optional] overhead dict(str, ResourceQuantity) Overhead represents the resource overhead associated with running a pod for a given RuntimeClass. This field will be autopopulated at admission time by the RuntimeClass admission controller. If the RuntimeClass admission controller is enabled, overhead must not be set in Pod create requests. The RuntimeClass admission controller will reject Pod create requests which have the overhead already set. If RuntimeClass is configured and selected in the PodSpec, Overhead will be set to the value defined in the corresponding RuntimeClass, otherwise it will remain unset and treated as zero. More info: https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/688-pod-overhead This field is beta-level as of Kubernetes v1.18, and is only honored by servers that enable the PodOverhead feature. [optional] preemption_policy str PreemptionPolicy is the Policy for preempting pods with lower priority. One of Never, PreemptLowerPriority. Defaults to PreemptLowerPriority if unset. 
This field is beta-level, gated by the NonPreemptingPriority feature-gate. [optional] priority int The priority value. Various system components use this field to find the priority of the pod. When Priority Admission Controller is enabled, it prevents users from setting this field. The admission controller populates this field from PriorityClassName. The higher the value, the higher the priority. [optional] priority_class_name str If specified, indicates the pod's priority. "system-node-critical" and "system-cluster-critical" are two special keywords which indicate the highest priorities with the former being the highest priority. Any other name must be defined by creating a PriorityClass object with that name. If not specified, the pod priority will be default or zero if there is no default. [optional] readiness_gates list[V1PodReadinessGate] If specified, all readiness gates will be evaluated for pod readiness. A pod is ready when all its containers are ready AND all conditions specified in the readiness gates have status equal to "True". More info: https://git.k8s.io/enhancements/keps/sig-network/580-pod-readiness-gates [optional] restart_policy str Restart policy for all containers within the pod. One of Always, OnFailure, Never. Default to Always. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#restart-policy [optional] runtime_class_name str RuntimeClassName refers to a RuntimeClass object in the node.k8s.io group, which should be used to run this pod. If no RuntimeClass resource matches the named class, the pod will not be run. If unset or empty, the "legacy" RuntimeClass will be used, which is an implicit class with an empty definition that uses the default runtime handler. More info: https://git.k8s.io/enhancements/keps/sig-node/585-runtime-class This is a beta feature as of Kubernetes v1.14. [optional] scheduler_name str If specified, the pod will be dispatched by specified scheduler. 
If not specified, the pod will be dispatched by default scheduler. [optional] security_context V1PodSecurityContext [optional] service_account str DeprecatedServiceAccount is a deprecated alias for ServiceAccountName. Deprecated: Use serviceAccountName instead. [optional] service_account_name str ServiceAccountName is the name of the ServiceAccount to use to run this pod. More info: https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/ [optional] set_hostname_as_fqdn bool If true the pod's hostname will be configured as the pod's FQDN, rather than the leaf name (the default). In Linux containers, this means setting the FQDN in the hostname field of the kernel (the nodename field of struct utsname). In Windows containers, this means setting the registry value of hostname for the registry key HKEY_LOCAL_MACHINE\\SYSTEM\\CurrentControlSet\\Services\\Tcpip\\Parameters to FQDN. If a pod does not have FQDN, this has no effect. Default to false. [optional] share_process_namespace bool Share a single process namespace between all of the containers in a pod. When this is set containers will be able to view and signal processes from other containers in the same pod, and the first process in each container will not be assigned PID 1. HostPID and ShareProcessNamespace cannot both be set. Optional: Default to false. [optional] subdomain str If specified, the fully qualified Pod hostname will be "<hostname>.<subdomain>.<pod namespace>.svc.<cluster domain>". If not specified, the pod will not have a domainname at all. [optional] termination_grace_period_seconds int Optional duration in seconds the pod needs to terminate gracefully. May be decreased in delete request. Value must be non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). If this value is nil, the default grace period will be used instead. 
The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. Defaults to 30 seconds. [optional] tolerations list[V1Toleration] If specified, the pod's tolerations. [optional] topology_spread_constraints list[V1TopologySpreadConstraint] TopologySpreadConstraints describes how a group of pods ought to spread across topology domains. Scheduler will schedule pods in a way which abides by the constraints. All topologySpreadConstraints are ANDed. [optional] volumes list[V1Volume] List of volumes that can be mounted by containers belonging to the pod. More info: https://kubernetes.io/docs/concepts/storage/volumes [optional] [Back to Model list] [Back to API list] [Back to README]","title":"V1beta1CustomTransformer"},{"location":"sdk_docs/docs/V1beta1CustomTransformer/#v1beta1customtransformer","text":"CustomTransformer defines arguments for configuring a custom transformer.","title":"V1beta1CustomTransformer"},{"location":"sdk_docs/docs/V1beta1CustomTransformer/#properties","text":"Name Type Description Notes active_deadline_seconds int Optional duration in seconds the pod may be active on the node relative to StartTime before the system will actively try to mark it failed and kill associated containers. Value must be a positive integer. [optional] affinity V1Affinity [optional] automount_service_account_token bool AutomountServiceAccountToken indicates whether a service account token should be automatically mounted. [optional] containers list[V1Container] List of containers belonging to the pod. Containers cannot currently be added or removed. There must be at least one container in a Pod. Cannot be updated. dns_config V1PodDNSConfig [optional] dns_policy str Set DNS policy for the pod. Defaults to "ClusterFirst". 
Valid values are 'ClusterFirstWithHostNet', 'ClusterFirst', 'Default' or 'None'. DNS parameters given in DNSConfig will be merged with the policy selected with DNSPolicy. To have DNS options set along with hostNetwork, you have to specify DNS policy explicitly to 'ClusterFirstWithHostNet'. [optional] enable_service_links bool EnableServiceLinks indicates whether information about services should be injected into pod's environment variables, matching the syntax of Docker links. Optional: Defaults to true. [optional] ephemeral_containers list[V1EphemeralContainer] List of ephemeral containers run in this pod. Ephemeral containers may be run in an existing pod to perform user-initiated actions such as debugging. This list cannot be specified when creating a pod, and it cannot be modified by updating the pod spec. In order to add an ephemeral container to an existing pod, use the pod's ephemeralcontainers subresource. This field is alpha-level and is only honored by servers that enable the EphemeralContainers feature. [optional] host_aliases list[V1HostAlias] HostAliases is an optional list of hosts and IPs that will be injected into the pod's hosts file if specified. This is only valid for non-hostNetwork pods. [optional] host_ipc bool Use the host's ipc namespace. Optional: Default to false. [optional] host_network bool Host networking requested for this pod. Use the host's network namespace. If this option is set, the ports that will be used must be specified. Default to false. [optional] host_pid bool Use the host's pid namespace. Optional: Default to false. [optional] hostname str Specifies the hostname of the Pod. If not specified, the pod's hostname will be set to a system-defined value. [optional] image_pull_secrets list[V1LocalObjectReference] ImagePullSecrets is an optional list of references to secrets in the same namespace to use for pulling any of the images used by this PodSpec. 
If specified, these secrets will be passed to individual puller implementations for them to use. For example, in the case of docker, only DockerConfig type secrets are honored. More info: https://kubernetes.io/docs/concepts/containers/images#specifying-imagepullsecrets-on-a-pod [optional] init_containers list[V1Container] List of initialization containers belonging to the pod. Init containers are executed in order prior to containers being started. If any init container fails, the pod is considered to have failed and is handled according to its restartPolicy. The name for an init container or normal container must be unique among all containers. Init containers may not have Lifecycle actions, Readiness probes, Liveness probes, or Startup probes. The resourceRequirements of an init container are taken into account during scheduling by finding the highest request/limit for each resource type, and then using the max of that value or the sum of the normal containers. Limits are applied to init containers in a similar fashion. Init containers cannot currently be added or removed. Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/init-containers/ [optional] node_name str NodeName is a request to schedule this pod onto a specific node. If it is non-empty, the scheduler simply schedules this pod onto that node, assuming that it fits resource requirements. [optional] node_selector dict(str, str) NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node's labels for the pod to be scheduled on that node. More info: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ [optional] overhead dict(str, ResourceQuantity) Overhead represents the resource overhead associated with running a pod for a given RuntimeClass. This field will be autopopulated at admission time by the RuntimeClass admission controller. 
If the RuntimeClass admission controller is enabled, overhead must not be set in Pod create requests. The RuntimeClass admission controller will reject Pod create requests which have the overhead already set. If RuntimeClass is configured and selected in the PodSpec, Overhead will be set to the value defined in the corresponding RuntimeClass, otherwise it will remain unset and treated as zero. More info: https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/688-pod-overhead This field is beta-level as of Kubernetes v1.18, and is only honored by servers that enable the PodOverhead feature. [optional] preemption_policy str PreemptionPolicy is the Policy for preempting pods with lower priority. One of Never, PreemptLowerPriority. Defaults to PreemptLowerPriority if unset. This field is beta-level, gated by the NonPreemptingPriority feature-gate. [optional] priority int The priority value. Various system components use this field to find the priority of the pod. When Priority Admission Controller is enabled, it prevents users from setting this field. The admission controller populates this field from PriorityClassName. The higher the value, the higher the priority. [optional] priority_class_name str If specified, indicates the pod's priority. "system-node-critical" and "system-cluster-critical" are two special keywords which indicate the highest priorities with the former being the highest priority. Any other name must be defined by creating a PriorityClass object with that name. If not specified, the pod priority will be default or zero if there is no default. [optional] readiness_gates list[V1PodReadinessGate] If specified, all readiness gates will be evaluated for pod readiness. 
A pod is ready when all its containers are ready AND all conditions specified in the readiness gates have status equal to "True". More info: https://git.k8s.io/enhancements/keps/sig-network/580-pod-readiness-gates [optional] restart_policy str Restart policy for all containers within the pod. One of Always, OnFailure, Never. Default to Always. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#restart-policy [optional] runtime_class_name str RuntimeClassName refers to a RuntimeClass object in the node.k8s.io group, which should be used to run this pod. If no RuntimeClass resource matches the named class, the pod will not be run. If unset or empty, the "legacy" RuntimeClass will be used, which is an implicit class with an empty definition that uses the default runtime handler. More info: https://git.k8s.io/enhancements/keps/sig-node/585-runtime-class This is a beta feature as of Kubernetes v1.14. [optional] scheduler_name str If specified, the pod will be dispatched by specified scheduler. If not specified, the pod will be dispatched by default scheduler. [optional] security_context V1PodSecurityContext [optional] service_account str DeprecatedServiceAccount is a deprecated alias for ServiceAccountName. Deprecated: Use serviceAccountName instead. [optional] service_account_name str ServiceAccountName is the name of the ServiceAccount to use to run this pod. More info: https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/ [optional] set_hostname_as_fqdn bool If true the pod's hostname will be configured as the pod's FQDN, rather than the leaf name (the default). In Linux containers, this means setting the FQDN in the hostname field of the kernel (the nodename field of struct utsname). In Windows containers, this means setting the registry value of hostname for the registry key HKEY_LOCAL_MACHINE\\SYSTEM\\CurrentControlSet\\Services\\Tcpip\\Parameters to FQDN. If a pod does not have FQDN, this has no effect. 
Default to false. [optional] share_process_namespace bool Share a single process namespace between all of the containers in a pod. When this is set containers will be able to view and signal processes from other containers in the same pod, and the first process in each container will not be assigned PID 1. HostPID and ShareProcessNamespace cannot both be set. Optional: Default to false. [optional] subdomain str If specified, the fully qualified Pod hostname will be "<hostname>.<subdomain>.<pod namespace>.svc.<cluster domain>". If not specified, the pod will not have a domainname at all. [optional] termination_grace_period_seconds int Optional duration in seconds the pod needs to terminate gracefully. May be decreased in delete request. Value must be non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). If this value is nil, the default grace period will be used instead. The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. Defaults to 30 seconds. [optional] tolerations list[V1Toleration] If specified, the pod's tolerations. [optional] topology_spread_constraints list[V1TopologySpreadConstraint] TopologySpreadConstraints describes how a group of pods ought to spread across topology domains. Scheduler will schedule pods in a way which abides by the constraints. All topologySpreadConstraints are ANDed. [optional] volumes list[V1Volume] List of volumes that can be mounted by containers belonging to the pod. 
More info: https://kubernetes.io/docs/concepts/storage/volumes [optional] [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/V1beta1ExplainerConfig/","text":"V1beta1ExplainerConfig \u00b6 Properties \u00b6 Name Type Description Notes default_image_version str default explainer docker image version [default to ''] image str explainer docker image name [default to ''] [Back to Model list] [Back to API list] [Back to README]","title":"V1beta1ExplainerConfig"},{"location":"sdk_docs/docs/V1beta1ExplainerConfig/#v1beta1explainerconfig","text":"","title":"V1beta1ExplainerConfig"},{"location":"sdk_docs/docs/V1beta1ExplainerConfig/#properties","text":"Name Type Description Notes default_image_version str default explainer docker image version [default to ''] image str explainer docker image name [default to ''] [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/V1beta1ExplainerExtensionSpec/","text":"V1beta1ExplainerExtensionSpec \u00b6 ExplainerExtensionSpec defines configuration shared across all explainer frameworks Properties \u00b6 Name Type Description Notes args list[str] Arguments to the entrypoint. The docker image's CMD is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell [optional] command list[str] Entrypoint array. Not executed within a shell. The docker image's ENTRYPOINT is used if this is not provided. 
Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell [optional] config dict(str, str) Inline custom parameter settings for explainer [optional] env list[V1EnvVar] List of environment variables to set in the container. Cannot be updated. [optional] env_from list[V1EnvFromSource] List of sources to populate environment variables in the container. The keys defined within a source must be a C_IDENTIFIER. All invalid keys will be reported as an event when the container is starting. When a key exists in multiple sources, the value associated with the last source will take precedence. Values defined by an Env with a duplicate key will take precedence. Cannot be updated. [optional] image str Docker image name. More info: https://kubernetes.io/docs/concepts/containers/images This field is optional to allow higher level config management to default or override container images in workload controllers like Deployments and StatefulSets. [optional] image_pull_policy str Image pull policy. One of Always, Never, IfNotPresent. Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images [optional] lifecycle V1Lifecycle [optional] liveness_probe V1Probe [optional] name str Name of the container specified as a DNS_LABEL. Each container in a pod must have a unique name (DNS_LABEL). Cannot be updated. 
[optional] [default to ''] ports list[V1ContainerPort] List of ports to expose from the container. Exposing a port here gives the system additional information about the network connections a container uses, but is primarily informational. Not specifying a port here DOES NOT prevent that port from being exposed. Any port which is listening on the default "0.0.0.0" address inside a container will be accessible from the network. Cannot be updated. [optional] readiness_probe V1Probe [optional] resources V1ResourceRequirements [optional] runtime_version str Defaults to latest Explainer Version [optional] security_context V1SecurityContext [optional] startup_probe V1Probe [optional] stdin bool Whether this container should allocate a buffer for stdin in the container runtime. If this is not set, reads from stdin in the container will always result in EOF. Default is false. [optional] stdin_once bool Whether the container runtime should close the stdin channel after it has been opened by a single attach. When stdin is true the stdin stream will remain open across multiple attach sessions. If stdinOnce is set to true, stdin is opened on container start, is empty until the first client attaches to stdin, and then remains open and accepts data until the client disconnects, at which time stdin is closed and remains closed until the container is restarted. If this flag is false, a container process that reads from stdin will never receive an EOF. Default is false. [optional] storage V1beta1StorageSpec [optional] storage_uri str The location of a trained explanation model [optional] termination_message_path str Optional: Path at which the file to which the container's termination message will be written is mounted into the container's filesystem. Message written is intended to be brief final status, such as an assertion failure message. Will be truncated by the node if greater than 4096 bytes. The total message length across all containers will be limited to 12kb. 
Defaults to /dev/termination-log. Cannot be updated. [optional] termination_message_policy str Indicate how the termination message should be populated. File will use the contents of terminationMessagePath to populate the container status message on both success and failure. FallbackToLogsOnError will use the last chunk of container log output if the termination message file is empty and the container exited with an error. The log output is limited to 2048 bytes or 80 lines, whichever is smaller. Defaults to File. Cannot be updated. [optional] tty bool Whether this container should allocate a TTY for itself, also requires 'stdin' to be true. Default is false. [optional] volume_devices list[V1VolumeDevice] volumeDevices is the list of block devices to be used by the container. [optional] volume_mounts list[V1VolumeMount] Pod volumes to mount into the container's filesystem. Cannot be updated. [optional] working_dir str Container's working directory. If not specified, the container runtime's default will be used, which might be configured in the container image. Cannot be updated. [optional] [Back to Model list] [Back to API list] [Back to README]","title":"V1beta1ExplainerExtensionSpec"},{"location":"sdk_docs/docs/V1beta1ExplainerExtensionSpec/#v1beta1explainerextensionspec","text":"ExplainerExtensionSpec defines configuration shared across all explainer frameworks","title":"V1beta1ExplainerExtensionSpec"},{"location":"sdk_docs/docs/V1beta1ExplainerExtensionSpec/#properties","text":"Name Type Description Notes args list[str] Arguments to the entrypoint. The docker image's CMD is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". 
Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell [optional] command list[str] Entrypoint array. Not executed within a shell. The docker image's ENTRYPOINT is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell [optional] config dict(str, str) Inline custom parameter settings for explainer [optional] env list[V1EnvVar] List of environment variables to set in the container. Cannot be updated. [optional] env_from list[V1EnvFromSource] List of sources to populate environment variables in the container. The keys defined within a source must be a C_IDENTIFIER. All invalid keys will be reported as an event when the container is starting. When a key exists in multiple sources, the value associated with the last source will take precedence. Values defined by an Env with a duplicate key will take precedence. Cannot be updated. [optional] image str Docker image name. More info: https://kubernetes.io/docs/concepts/containers/images This field is optional to allow higher level config management to default or override container images in workload controllers like Deployments and StatefulSets. [optional] image_pull_policy str Image pull policy. One of Always, Never, IfNotPresent. Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. 
Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images [optional] lifecycle V1Lifecycle [optional] liveness_probe V1Probe [optional] name str Name of the container specified as a DNS_LABEL. Each container in a pod must have a unique name (DNS_LABEL). Cannot be updated. [optional] [default to ''] ports list[V1ContainerPort] List of ports to expose from the container. Exposing a port here gives the system additional information about the network connections a container uses, but is primarily informational. Not specifying a port here DOES NOT prevent that port from being exposed. Any port which is listening on the default "0.0.0.0" address inside a container will be accessible from the network. Cannot be updated. [optional] readiness_probe V1Probe [optional] resources V1ResourceRequirements [optional] runtime_version str Defaults to latest Explainer Version [optional] security_context V1SecurityContext [optional] startup_probe V1Probe [optional] stdin bool Whether this container should allocate a buffer for stdin in the container runtime. If this is not set, reads from stdin in the container will always result in EOF. Default is false. [optional] stdin_once bool Whether the container runtime should close the stdin channel after it has been opened by a single attach. When stdin is true the stdin stream will remain open across multiple attach sessions. If stdinOnce is set to true, stdin is opened on container start, is empty until the first client attaches to stdin, and then remains open and accepts data until the client disconnects, at which time stdin is closed and remains closed until the container is restarted. If this flag is false, a container process that reads from stdin will never receive an EOF. 
Default is false [optional] storage V1beta1StorageSpec [optional] storage_uri str The location of a trained explanation model [optional] termination_message_path str Optional: Path at which the file to which the container's termination message will be written is mounted into the container's filesystem. Message written is intended to be brief final status, such as an assertion failure message. Will be truncated by the node if greater than 4096 bytes. The total message length across all containers will be limited to 12kb. Defaults to /dev/termination-log. Cannot be updated. [optional] termination_message_policy str Indicate how the termination message should be populated. File will use the contents of terminationMessagePath to populate the container status message on both success and failure. FallbackToLogsOnError will use the last chunk of container log output if the termination message file is empty and the container exited with an error. The log output is limited to 2048 bytes or 80 lines, whichever is smaller. Defaults to File. Cannot be updated. [optional] tty bool Whether this container should allocate a TTY for itself, also requires 'stdin' to be true. Default is false. [optional] volume_devices list[V1VolumeDevice] volumeDevices is the list of block devices to be used by the container. [optional] volume_mounts list[V1VolumeMount] Pod volumes to mount into the container's filesystem. Cannot be updated. [optional] working_dir str Container's working directory. If not specified, the container runtime's default will be used, which might be configured in the container image. Cannot be updated. [optional] [Back to Model list] [Back to API list] [Back to README]","title":"Properties"},{"location":"sdk_docs/docs/V1beta1ExplainerSpec/","text":"V1beta1ExplainerSpec \u00b6 ExplainerSpec defines the container spec for a model explanation server, The following fields follow a \"1-of\" semantic. Users must specify exactly one spec. 
Properties \u00b6 Name Type Description Notes active_deadline_seconds int Optional duration in seconds the pod may be active on the node relative to StartTime before the system will actively try to mark it failed and kill associated containers. Value must be a positive integer. [optional] affinity V1Affinity [optional] aix V1beta1AIXExplainerSpec [optional] alibi V1beta1AlibiExplainerSpec [optional] art V1beta1ARTExplainerSpec [optional] automount_service_account_token bool AutomountServiceAccountToken indicates whether a service account token should be automatically mounted. [optional] batcher V1beta1Batcher [optional] canary_traffic_percent int CanaryTrafficPercent defines the traffic split percentage between the candidate revision and the last ready revision [optional] container_concurrency int ContainerConcurrency specifies how many requests can be processed concurrently, this sets the hard limit of the container concurrency(https://knative.dev/docs/serving/autoscaling/concurrency). [optional] containers list[V1Container] List of containers belonging to the pod. Containers cannot currently be added or removed. There must be at least one container in a Pod. Cannot be updated. [optional] dns_config V1PodDNSConfig [optional] dns_policy str Set DNS policy for the pod. Defaults to "ClusterFirst". Valid values are 'ClusterFirstWithHostNet', 'ClusterFirst', 'Default' or 'None'. DNS parameters given in DNSConfig will be merged with the policy selected with DNSPolicy. To have DNS options set along with hostNetwork, you have to specify DNS policy explicitly to 'ClusterFirstWithHostNet'. [optional] enable_service_links bool EnableServiceLinks indicates whether information about services should be injected into pod's environment variables, matching the syntax of Docker links. Optional: Defaults to true. [optional] ephemeral_containers list[V1EphemeralContainer] List of ephemeral containers run in this pod. 
Ephemeral containers may be run in an existing pod to perform user-initiated actions such as debugging. This list cannot be specified when creating a pod, and it cannot be modified by updating the pod spec. In order to add an ephemeral container to an existing pod, use the pod's ephemeralcontainers subresource. This field is alpha-level and is only honored by servers that enable the EphemeralContainers feature. [optional] host_aliases list[V1HostAlias] HostAliases is an optional list of hosts and IPs that will be injected into the pod's hosts file if specified. This is only valid for non-hostNetwork pods. [optional] host_ipc bool Use the host's ipc namespace. Optional: Default to false. [optional] host_network bool Host networking requested for this pod. Use the host's network namespace. If this option is set, the ports that will be used must be specified. Default to false. [optional] host_pid bool Use the host's pid namespace. Optional: Default to false. [optional] hostname str Specifies the hostname of the Pod If not specified, the pod's hostname will be set to a system-defined value. [optional] image_pull_secrets list[V1LocalObjectReference] ImagePullSecrets is an optional list of references to secrets in the same namespace to use for pulling any of the images used by this PodSpec. If specified, these secrets will be passed to individual puller implementations for them to use. For example, in the case of docker, only DockerConfig type secrets are honored. More info: https://kubernetes.io/docs/concepts/containers/images#specifying-imagepullsecrets-on-a-pod [optional] init_containers list[V1Container] List of initialization containers belonging to the pod. Init containers are executed in order prior to containers being started. If any init container fails, the pod is considered to have failed and is handled according to its restartPolicy. The name for an init container or normal container must be unique among all containers. 
Init containers may not have Lifecycle actions, Readiness probes, Liveness probes, or Startup probes. The resourceRequirements of an init container are taken into account during scheduling by finding the highest request/limit for each resource type, and then using the max of of that value or the sum of the normal containers. Limits are applied to init containers in a similar fashion. Init containers cannot currently be added or removed. Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/init-containers/ [optional] logger V1beta1LoggerSpec [optional] max_replicas int Maximum number of replicas for autoscaling. [optional] min_replicas int Minimum number of replicas, defaults to 1 but can be set to 0 to enable scale-to-zero. [optional] node_name str NodeName is a request to schedule this pod onto a specific node. If it is non-empty, the scheduler simply schedules this pod onto that node, assuming that it fits resource requirements. [optional] node_selector dict(str, str) NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node's labels for the pod to be scheduled on that node. More info: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ [optional] overhead dict(str, ResourceQuantity) Overhead represents the resource overhead associated with running a pod for a given RuntimeClass. This field will be autopopulated at admission time by the RuntimeClass admission controller. If the RuntimeClass admission controller is enabled, overhead must not be set in Pod create requests. The RuntimeClass admission controller will reject Pod create requests which have the overhead already set. If RuntimeClass is configured and selected in the PodSpec, Overhead will be set to the value defined in the corresponding RuntimeClass, otherwise it will remain unset and treated as zero. 
More info: https://github.com/kubernetes/enhancements/blob/master/keps/sig-node/688-pod-overhead This field is alpha-level as of Kubernetes v1.16, and is only honored by servers that enable the PodOverhead feature. [optional] preemption_policy str PreemptionPolicy is the Policy for preempting pods with lower priority. One of Never, PreemptLowerPriority. Defaults to PreemptLowerPriority if unset. This field is beta-level, gated by the NonPreemptingPriority feature-gate. [optional] priority int The priority value. Various system components use this field to find the priority of the pod. When Priority Admission Controller is enabled, it prevents users from setting this field. The admission controller populates this field from PriorityClassName. The higher the value, the higher the priority. [optional] priority_class_name str If specified, indicates the pod's priority. "system-node-critical" and "system-cluster-critical" are two special keywords which indicate the highest priorities with the former being the highest priority. Any other name must be defined by creating a PriorityClass object with that name. If not specified, the pod priority will be default or zero if there is no default. [optional] readiness_gates list[V1PodReadinessGate] If specified, all readiness gates will be evaluated for pod readiness. A pod is ready when all its containers are ready AND all conditions specified in the readiness gates have status equal to "True" More info: https://github.com/kubernetes/enhancements/tree/master/keps/sig-network/580-pod-readiness-gates [optional] restart_policy str Restart policy for all containers within the pod. One of Always, OnFailure, Never. Default to Always. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#restart-policy [optional] runtime_class_name str RuntimeClassName refers to a RuntimeClass object in the node.k8s.io group, which should be used to run this pod. 
If no RuntimeClass resource matches the named class, the pod will not be run. If unset or empty, the "legacy" RuntimeClass will be used, which is an implicit class with an empty definition that uses the default runtime handler. More info: https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/585-runtime-class This is a beta feature as of Kubernetes v1.14. [optional] scale_metric str ScaleMetric defines the scaling metric type watched by autoscaler possible values are concurrency, rps, cpu, memory. concurrency, rps are supported via Knative Pod Autoscaler(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics). [optional] scale_target int ScaleTarget specifies the integer target value of the metric type the Autoscaler watches for. concurrency and rps targets are supported by Knative Pod Autoscaler (https://knative.dev/docs/serving/autoscaling/autoscaling-targets/). [optional] scheduler_name str If specified, the pod will be dispatched by specified scheduler. If not specified, the pod will be dispatched by default scheduler. [optional] security_context V1PodSecurityContext [optional] service_account str DeprecatedServiceAccount is a depreciated alias for ServiceAccountName. Deprecated: Use serviceAccountName instead. [optional] service_account_name str ServiceAccountName is the name of the ServiceAccount to use to run this pod. More info: https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/ [optional] set_hostname_as_fqdn bool If true the pod's hostname will be configured as the pod's FQDN, rather than the leaf name (the default). In Linux containers, this means setting the FQDN in the hostname field of the kernel (the nodename field of struct utsname). In Windows containers, this means setting the registry value of hostname for the registry key HKEY_LOCAL_MACHINE\\SYSTEM\\CurrentControlSet\\Services\\Tcpip\\Parameters to FQDN. If a pod does not have FQDN, this has no effect. Default to false. 
[optional] share_process_namespace bool Share a single process namespace between all of the containers in a pod. When this is set containers will be able to view and signal processes from other containers in the same pod, and the first process in each container will not be assigned PID 1. HostPID and ShareProcessNamespace cannot both be set. Optional: Default to false. [optional] subdomain str If specified, the fully qualified Pod hostname will be "...svc.". If not specified, the pod will not have a domainname at all. [optional] termination_grace_period_seconds int Optional duration in seconds the pod needs to terminate gracefully. May be decreased in delete request. Value must be non-negative integer. The value zero indicates delete immediately. If this value is nil, the default grace period will be used instead. The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. Defaults to 30 seconds. [optional] timeout int TimeoutSeconds specifies the number of seconds to wait before timing out a request to the component. [optional] tolerations list[V1Toleration] If specified, the pod's tolerations. [optional] topology_spread_constraints list[V1TopologySpreadConstraint] TopologySpreadConstraints describes how a group of pods ought to spread across topology domains. Scheduler will schedule pods in a way which abides by the constraints. All topologySpreadConstraints are ANDed. [optional] volumes list[V1Volume] List of volumes that can be mounted by containers belonging to the pod. 
More info: https://kubernetes.io/docs/concepts/storage/volumes [optional] [Back to Model list] [Back to API list] [Back to README]","title":"V1beta1ExplainerSpec"},{"location":"sdk_docs/docs/V1beta1ExplainerSpec/#v1beta1explainerspec","text":"ExplainerSpec defines the container spec for a model explanation server, The following fields follow a \"1-of\" semantic. Users must specify exactly one spec.","title":"V1beta1ExplainerSpec"},{"location":"sdk_docs/docs/V1beta1ExplainerSpec/#properties","text":"Name Type Description Notes active_deadline_seconds int Optional duration in seconds the pod may be active on the node relative to StartTime before the system will actively try to mark it failed and kill associated containers. Value must be a positive integer. [optional] affinity V1Affinity [optional] aix V1beta1AIXExplainerSpec [optional] alibi V1beta1AlibiExplainerSpec [optional] art V1beta1ARTExplainerSpec [optional] automount_service_account_token bool AutomountServiceAccountToken indicates whether a service account token should be automatically mounted. [optional] batcher V1beta1Batcher [optional] canary_traffic_percent int CanaryTrafficPercent defines the traffic split percentage between the candidate revision and the last ready revision [optional] container_concurrency int ContainerConcurrency specifies how many requests can be processed concurrently, this sets the hard limit of the container concurrency(https://knative.dev/docs/serving/autoscaling/concurrency). [optional] containers list[V1Container] List of containers belonging to the pod. Containers cannot currently be added or removed. There must be at least one container in a Pod. Cannot be updated. [optional] dns_config V1PodDNSConfig [optional] dns_policy str Set DNS policy for the pod. Defaults to "ClusterFirst". Valid values are 'ClusterFirstWithHostNet', 'ClusterFirst', 'Default' or 'None'. DNS parameters given in DNSConfig will be merged with the policy selected with DNSPolicy. 
To have DNS options set along with hostNetwork, you have to specify DNS policy explicitly to 'ClusterFirstWithHostNet'. [optional] enable_service_links bool EnableServiceLinks indicates whether information about services should be injected into pod's environment variables, matching the syntax of Docker links. Optional: Defaults to true. [optional] ephemeral_containers list[V1EphemeralContainer] List of ephemeral containers run in this pod. Ephemeral containers may be run in an existing pod to perform user-initiated actions such as debugging. This list cannot be specified when creating a pod, and it cannot be modified by updating the pod spec. In order to add an ephemeral container to an existing pod, use the pod's ephemeralcontainers subresource. This field is alpha-level and is only honored by servers that enable the EphemeralContainers feature. [optional] host_aliases list[V1HostAlias] HostAliases is an optional list of hosts and IPs that will be injected into the pod's hosts file if specified. This is only valid for non-hostNetwork pods. [optional] host_ipc bool Use the host's ipc namespace. Optional: Default to false. [optional] host_network bool Host networking requested for this pod. Use the host's network namespace. If this option is set, the ports that will be used must be specified. Default to false. [optional] host_pid bool Use the host's pid namespace. Optional: Default to false. [optional] hostname str Specifies the hostname of the Pod If not specified, the pod's hostname will be set to a system-defined value. [optional] image_pull_secrets list[V1LocalObjectReference] ImagePullSecrets is an optional list of references to secrets in the same namespace to use for pulling any of the images used by this PodSpec. If specified, these secrets will be passed to individual puller implementations for them to use. For example, in the case of docker, only DockerConfig type secrets are honored. 
More info: https://kubernetes.io/docs/concepts/containers/images#specifying-imagepullsecrets-on-a-pod [optional] init_containers list[V1Container] List of initialization containers belonging to the pod. Init containers are executed in order prior to containers being started. If any init container fails, the pod is considered to have failed and is handled according to its restartPolicy. The name for an init container or normal container must be unique among all containers. Init containers may not have Lifecycle actions, Readiness probes, Liveness probes, or Startup probes. The resourceRequirements of an init container are taken into account during scheduling by finding the highest request/limit for each resource type, and then using the max of of that value or the sum of the normal containers. Limits are applied to init containers in a similar fashion. Init containers cannot currently be added or removed. Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/init-containers/ [optional] logger V1beta1LoggerSpec [optional] max_replicas int Maximum number of replicas for autoscaling. [optional] min_replicas int Minimum number of replicas, defaults to 1 but can be set to 0 to enable scale-to-zero. [optional] node_name str NodeName is a request to schedule this pod onto a specific node. If it is non-empty, the scheduler simply schedules this pod onto that node, assuming that it fits resource requirements. [optional] node_selector dict(str, str) NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node's labels for the pod to be scheduled on that node. More info: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ [optional] overhead dict(str, ResourceQuantity) Overhead represents the resource overhead associated with running a pod for a given RuntimeClass. This field will be autopopulated at admission time by the RuntimeClass admission controller. 
If the RuntimeClass admission controller is enabled, overhead must not be set in Pod create requests. The RuntimeClass admission controller will reject Pod create requests which have the overhead already set. If RuntimeClass is configured and selected in the PodSpec, Overhead will be set to the value defined in the corresponding RuntimeClass, otherwise it will remain unset and treated as zero. More info: https://github.com/kubernetes/enhancements/blob/master/keps/sig-node/688-pod-overhead This field is alpha-level as of Kubernetes v1.16, and is only honored by servers that enable the PodOverhead feature. [optional] preemption_policy str PreemptionPolicy is the Policy for preempting pods with lower priority. One of Never, PreemptLowerPriority. Defaults to PreemptLowerPriority if unset. This field is beta-level, gated by the NonPreemptingPriority feature-gate. [optional] priority int The priority value. Various system components use this field to find the priority of the pod. When Priority Admission Controller is enabled, it prevents users from setting this field. The admission controller populates this field from PriorityClassName. The higher the value, the higher the priority. [optional] priority_class_name str If specified, indicates the pod's priority. "system-node-critical" and "system-cluster-critical" are two special keywords which indicate the highest priorities with the former being the highest priority. Any other name must be defined by creating a PriorityClass object with that name. If not specified, the pod priority will be default or zero if there is no default. [optional] readiness_gates list[V1PodReadinessGate] If specified, all readiness gates will be evaluated for pod readiness. 
A pod is ready when all its containers are ready AND all conditions specified in the readiness gates have status equal to "True" More info: https://github.com/kubernetes/enhancements/tree/master/keps/sig-network/580-pod-readiness-gates [optional] restart_policy str Restart policy for all containers within the pod. One of Always, OnFailure, Never. Default to Always. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#restart-policy [optional] runtime_class_name str RuntimeClassName refers to a RuntimeClass object in the node.k8s.io group, which should be used to run this pod. If no RuntimeClass resource matches the named class, the pod will not be run. If unset or empty, the "legacy" RuntimeClass will be used, which is an implicit class with an empty definition that uses the default runtime handler. More info: https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/585-runtime-class This is a beta feature as of Kubernetes v1.14. [optional] scale_metric str ScaleMetric defines the scaling metric type watched by autoscaler possible values are concurrency, rps, cpu, memory. concurrency, rps are supported via Knative Pod Autoscaler(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics). [optional] scale_target int ScaleTarget specifies the integer target value of the metric type the Autoscaler watches for. concurrency and rps targets are supported by Knative Pod Autoscaler (https://knative.dev/docs/serving/autoscaling/autoscaling-targets/). [optional] scheduler_name str If specified, the pod will be dispatched by specified scheduler. If not specified, the pod will be dispatched by default scheduler. [optional] security_context V1PodSecurityContext [optional] service_account str DeprecatedServiceAccount is a depreciated alias for ServiceAccountName. Deprecated: Use serviceAccountName instead. [optional] service_account_name str ServiceAccountName is the name of the ServiceAccount to use to run this pod. 
More info: https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/ [optional] set_hostname_as_fqdn bool If true the pod's hostname will be configured as the pod's FQDN, rather than the leaf name (the default). In Linux containers, this means setting the FQDN in the hostname field of the kernel (the nodename field of struct utsname). In Windows containers, this means setting the registry value of hostname for the registry key HKEY_LOCAL_MACHINE\\SYSTEM\\CurrentControlSet\\Services\\Tcpip\\Parameters to FQDN. If a pod does not have FQDN, this has no effect. Default to false. [optional] share_process_namespace bool Share a single process namespace between all of the containers in a pod. When this is set containers will be able to view and signal processes from other containers in the same pod, and the first process in each container will not be assigned PID 1. HostPID and ShareProcessNamespace cannot both be set. Optional: Default to false. [optional] subdomain str If specified, the fully qualified Pod hostname will be "..