From b92a8fdd31c6cf2da9c0a8060d697607f057d927 Mon Sep 17 00:00:00 2001
From: faph
Date: Thu, 11 Apr 2024 15:14:51 +0100
Subject: [PATCH 1/6] Add FAQ on model_fn function

---
 docs/index.rst                   |  3 ++-
 src/inference_server/__init__.py | 13 +++++++++++--
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/docs/index.rst b/docs/index.rst
index c76d669..744b98d 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -4,7 +4,7 @@ inference-server
 Deploy your AI/ML model to Amazon SageMaker for Real-Time Inference and Batch Transform using your own Docker container image.
 
 .. toctree::
-   :maxdepth: 2
+   :maxdepth: 1
    :caption: Contents
 
    introduction
@@ -12,6 +12,7 @@ Deploy your AI/ML model to Amazon SageMaker for Real-Time Inference and Batch Tr
    batch_transform
    deployment
    testing
+   faq
    modules
 
 
diff --git a/src/inference_server/__init__.py b/src/inference_server/__init__.py
index 5ab9c29..9f7ff0b 100644
--- a/src/inference_server/__init__.py
+++ b/src/inference_server/__init__.py
@@ -42,6 +42,7 @@
     "MIMEAccept",  # Exporting for plugin developers' convenience
     "create_app",
     "plugin_hook",
+    "warmup",
 )
 
 #: Library version, e.g. 1.0.0, taken from Git tags
@@ -70,12 +71,20 @@ class BatchStrategy(enum.Enum):
 
 
 def create_app() -> "WSGIApplication":
-    """Initialize and return the WSGI application"""
+    """
+    Initialize and return the WSGI application
+
+    This is the WSGI application factory function that needs to be passed to a WSGI-compatible web server.
+    """
     return _app
 
 
 def warmup() -> None:
-    """Initialize any additional resources upfront"""
+    """
+    Initialize any additional resources upfront
+
+    This will call the ``model_fn`` plugin hook.
+    """
     _model()
 
 

From c8ce5c74ae4adf6ff670d83d70b31e2536b662fe Mon Sep 17 00:00:00 2001
From: faph
Date: Thu, 11 Apr 2024 15:16:14 +0100
Subject: [PATCH 2/6] Add FAQ on model_fn function

---
 docs/faq.rst | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 docs/faq.rst

diff --git a/docs/faq.rst b/docs/faq.rst
new file mode 100644
index 0000000..e5cbf9e
--- /dev/null
+++ b/docs/faq.rst
@@ -0,0 +1,32 @@
+Frequently Asked Questions (FAQ)
+================================
+
+
+Is my "model_fn" called at each invocation?
+-------------------------------------------
+
+No.
+
+The :func:`model_fn` function is called during the very first invocation only.
+Once the model has been loaded, it is retained in memory for as long as the service runs.
+
+To speed up the very first invocation, it is possible to trigger the ``model_fn`` hook in advance.
+To do this, simply call :func:`inference_server.warmup`.
+
+For example, when using Gunicorn, this could be done from a post-fork Gunicorn hook::
+
+    def post_fork(server, worker):
+        worker.log.info("Warming up worker...")
+        inference_server.warmup()
+
+
+Does **inference-server** support async/ASGI webservers?
+--------------------------------------------------------
+
+
+My model is leaking memory, how do I address that?
+--------------------------------------------------
+
+
+How do I invoke my model using a data stream from my favourite message queue system?
+------------------------------------------------------------------------------------

From 90e6c30481e5a499db480d8c5d73951984238dd2 Mon Sep 17 00:00:00 2001
From: faph
Date: Thu, 11 Apr 2024 15:21:46 +0100
Subject: [PATCH 3/6] Add FAQ on ASGI vs WSGI

---
 docs/faq.rst | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/docs/faq.rst b/docs/faq.rst
index e5cbf9e..88bb4b6 100644
--- a/docs/faq.rst
+++ b/docs/faq.rst
@@ -23,6 +23,15 @@ For example, when using Gunicorn, this could be done from a post-fork Gunicorn h
 Does **inference-server** support async/ASGI webservers?
 --------------------------------------------------------
 
+No.
+
+**inference-server** is a WSGI application to be used by synchronous webservers.
+
+For most ML models this is the correct choice, as model inference is typically CPU-bound.
+Therefore, a multi-process WSGI server is a good choice, with the number of workers equal to the number of available CPU cores.
+
+For more details see :ref:`deployment:Configuring Gunicorn workers`.
+
 
 My model is leaking memory, how do I address that?
 --------------------------------------------------

From e44d7a1fd7cab5e3af9759b2d64ba2d915752d74 Mon Sep 17 00:00:00 2001
From: faph
Date: Thu, 11 Apr 2024 15:30:28 +0100
Subject: [PATCH 4/6] Add FAQ on memory leaks

---
 docs/faq.rst | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/docs/faq.rst b/docs/faq.rst
index 88bb4b6..cbb84a4 100644
--- a/docs/faq.rst
+++ b/docs/faq.rst
@@ -36,6 +36,13 @@ For more details see :ref:`deployment:Configuring Gunicorn workers`.
 My model is leaking memory, how do I address that?
 --------------------------------------------------
 
+If the memory leak is outside your control, one approach would be to restart the webserver workers periodically.
+
+For example, when using Gunicorn, it is possible to specify a maximum number of HTTP requests (``max_requests``) after which a given worker is restarted.
+Gunicorn additionally allows a random offset (``max_requests_jitter``) to be added so that worker restarts are staggered.
+
+For more details see the `Gunicorn settings documentation <https://docs.gunicorn.org/en/stable/settings.html>`_.
+
 
 How do I invoke my model using a data stream from my favourite message queue system?
 ------------------------------------------------------------------------------------

From 2b838756ce24c7dda2b7602746392eb8b4208577 Mon Sep 17 00:00:00 2001
From: faph
Date: Fri, 12 Apr 2024 10:39:57 +0100
Subject: [PATCH 5/6] Add FAQ on integration with message stream

---
 docs/faq.rst | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/docs/faq.rst b/docs/faq.rst
index cbb84a4..07e5c1a 100644
--- a/docs/faq.rst
+++ b/docs/faq.rst
@@ -46,3 +46,14 @@ For more details see the `Gunicorn settings documentation
 
 How do I invoke my model using a data stream from my favourite message queue system?
 ------------------------------------------------------------------------------------

From 6b8ca0b4cb201a5057ab7a441b7b7823a2a66a17 Mon Sep 17 00:00:00 2001
From: faph
Date: Fri, 12 Apr 2024 10:43:07 +0100
Subject: [PATCH 6/6] Tweak API ref docs landing page

---
 docs/modules.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/modules.rst b/docs/modules.rst
index e85375c..0555b36 100644
--- a/docs/modules.rst
+++ b/docs/modules.rst
@@ -1,8 +1,8 @@
-API Documentation
-=================
+API reference documentation
+===========================
 
 .. toctree::
-   :maxdepth: 2
+   :maxdepth: 1
 
    inference_server
    inference_server_testing
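
As a quick illustration of the ``create_app`` factory documented in patch 1: the returned WSGI application can be handed to any WSGI-compatible server. Below is a minimal local-run sketch using only the Python standard library; the port number and the upfront ``warmup()`` call are illustrative assumptions, not project defaults::

    from wsgiref.simple_server import make_server

    import inference_server

    app = inference_server.create_app()
    inference_server.warmup()  # optional: load the model before the first request

    # Serve on an assumed local port until interrupted
    with make_server("", 8080, app) as httpd:
        httpd.serve_forever()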
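The Gunicorn guidance from patches 3 and 4 (one synchronous worker per CPU core, staggered worker recycling to contain memory leaks, per-worker warmup) could be collected into a single configuration file. A sketch under those assumptions; the request thresholds are arbitrary example values, not recommendations from the project::

    # gunicorn.conf.py (illustrative values, not project defaults)
    import multiprocessing

    import inference_server

    # One synchronous worker per CPU core for CPU-bound inference
    workers = multiprocessing.cpu_count()

    # Recycle each worker after roughly 1000 requests to contain slow memory
    # leaks; the jitter staggers restarts so workers do not recycle at once
    max_requests = 1000
    max_requests_jitter = 100


    def post_fork(server, worker):
        # Trigger the model_fn plugin hook before the first invocation
        worker.log.info("Warming up worker...")
        inference_server.warmup()

With Gunicorn's factory syntax this could then be launched as, for example, ``gunicorn -c gunicorn.conf.py "inference_server:create_app()"``.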
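For the message-queue question raised in patch 5, one possible bridge is a consumer loop that forwards each message to the model over the SageMaker-style ``/invocations`` endpoint the server exposes. In this sketch, ``poll_messages``, the server address, and the JSON content type are all hypothetical stand-ins for a real queue client and model::

    import urllib.request
    from typing import Iterator


    def poll_messages() -> Iterator[bytes]:
        """Hypothetical stand-in for a real message queue consumer."""
        yield b'{"features": [1.0, 2.0, 3.0]}'  # example payload


    for message in poll_messages():
        request = urllib.request.Request(
            "http://localhost:8080/invocations",  # assumed local server address
            data=message,  # POST body: serialized model input
            headers={"Content-Type": "application/json"},
        )
        with urllib.request.urlopen(request) as response:
            prediction = response.read()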