From 9d7df0d310aa2edf15db9483270e3b9100c8addb Mon Sep 17 00:00:00 2001
From: Morgan Funtowicz
Date: Mon, 8 Apr 2024 15:04:08 +0200
Subject: [PATCH 1/4] Fix rule and instructions for TGI

---
 Makefile                      |  4 ++--
 docs/source/howto/serving.mdx | 36 ++++++++++++++++++++++++++++++++++-
 2 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index 82a4acea..c06e50d6 100644
--- a/Makefile
+++ b/Makefile
@@ -46,8 +46,8 @@ tpu-tgi:
 	docker build --rm -f text-generation-inference/Dockerfile \
 		--build-arg VERSION=$(VERSION) \
 		--build-arg TGI_VERSION=$(TGI_VERSION) \
-		-t tpu-tgi:$(VERSION) .
-	docker tag tpu-tgi:$(VERSION) tpu-tgi:latest
+		-t huggingface/optimum-tpu:$(VERSION)-tgi .
+	docker tag huggingface/optimum-tpu:$(VERSION)-tgi tpu-tgi:latest
 
 # Run code quality checks
 style_check:
diff --git a/docs/source/howto/serving.mdx b/docs/source/howto/serving.mdx
index 8a406f06..bcd078ec 100644
--- a/docs/source/howto/serving.mdx
+++ b/docs/source/howto/serving.mdx
@@ -1,3 +1,37 @@
 # Deploying a Text-Generation Inference server on a Google Cloud TPU instance
 
-Stay tuned!
+## Context
+
+Text-Generation-Inference (TGI) is a highly optimized serving engine for serving Large Language Models (LLMs) in a way
+that better leverages the underlying hardware, Cloud TPU in this case.
+
+TGI comes as ready-to-use Docker containers.
+
+## Deploy TGI on Cloud TPU instance
+
+We assume the reader already has a Cloud TPU instance up and running.
+If this is not the case, please see our guide to deploy one [here](./deploy.mdx).
+
+### Docker Container Build
+
+Optimum-TPU provides a `make tpu-tgi` command at the root level to help you create a local Docker image.
+
+### Docker Container Run
+
+```
+docker run -p 8080:80 \
+  --net=host --privileged \
+  -v $(pwd)/data:/data \
+  -e HF_TOKEN=${HF_TOKEN} \
+  -e HF_BATCH_SIZE=1 \
+  -e HF_SEQUENCE_LENGTH=1024 \
+  huggingface/tpu-tgi \
+  --model-id google/gemma-2b \
+  --max-concurrent-requests 4 \
+  --max-input-length 512 \
+  --max-total-tokens 1024 \
+  --max-batch-prefill-tokens 512 \
+  --max-batch-total-tokens 1024
+```
+
+### Executing requests against
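For reference, the `tpu-tgi` rule renamed above is driven by the `VERSION` and `TGI_VERSION` Makefile variables. A minimal sketch of a local build from the repository root, with placeholder version values (`0.1.0b1` appears later in this series; the TGI version here is purely illustrative):

```
# Build the serving image locally; VERSION and TGI_VERSION feed the
# --build-arg flags of the Makefile rule above. The values below are
# illustrative placeholders, not pinned by these patches.
make tpu-tgi VERSION=0.1.0b1 TGI_VERSION=1.4.5

# The rule tags the result as huggingface/optimum-tpu:<VERSION>-tgi
# (plus a tpu-tgi:latest alias); verify both tags exist.
docker images | grep -E 'optimum-tpu|tpu-tgi'
```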
From fb3c06fb94f93ce1eec92b3e412c58bce96c2a7a Mon Sep 17 00:00:00 2001
From: Morgan Funtowicz
Date: Mon, 8 Apr 2024 15:16:47 +0200
Subject: [PATCH 2/4] Add remainder of sentence

---
 docs/source/howto/serving.mdx | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/docs/source/howto/serving.mdx b/docs/source/howto/serving.mdx
index bcd078ec..28903454 100644
--- a/docs/source/howto/serving.mdx
+++ b/docs/source/howto/serving.mdx
@@ -34,4 +34,20 @@ docker run -p 8080:80 \
   --max-batch-total-tokens 1024
 ```
 
-### Executing requests against
+### Executing requests against the service
+
+You can query the model using either the `/generate` or `/generate_stream` routes:
+
+```
+curl 127.0.0.1:8080/generate \
+    -X POST \
+    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
+    -H 'Content-Type: application/json'
+```
+
+```
+curl 127.0.0.1:8080/generate_stream \
+    -X POST \
+    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
+    -H 'Content-Type: application/json'
+```

From b166fff68cc0a1b9931cb4b82d7795f01e11114c Mon Sep 17 00:00:00 2001
From: Morgan Funtowicz
Date: Mon, 8 Apr 2024 15:22:25 +0200
Subject: [PATCH 3/4] Address comments

---
 docs/source/howto/serving.mdx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/source/howto/serving.mdx b/docs/source/howto/serving.mdx
index 28903454..9b894c95 100644
--- a/docs/source/howto/serving.mdx
+++ b/docs/source/howto/serving.mdx
@@ -19,13 +19,14 @@ Optimum-TPU provides a `make tpu-tgi` command at the root level to help you crea
 ### Docker Container Run
 
 ```
+OPTIMUM_TPU_VERSION=0.1.0b1
 docker run -p 8080:80 \
   --net=host --privileged \
   -v $(pwd)/data:/data \
   -e HF_TOKEN=${HF_TOKEN} \
   -e HF_BATCH_SIZE=1 \
   -e HF_SEQUENCE_LENGTH=1024 \
-  huggingface/tpu-tgi \
+  huggingface/optimum-tpu:${OPTIMUM_TPU_VERSION}-tgi \
   --model-id google/gemma-2b \
   --max-concurrent-requests 4 \
   --max-input-length 512 \

From bb5cdab613cdce6772bdc4b71be418a3f88b1cbe Mon Sep 17 00:00:00 2001
From: Morgan Funtowicz
Date: Mon, 8 Apr 2024 15:23:03 +0200
Subject: [PATCH 4/4] Again

---
 docs/source/howto/serving.mdx | 2 --
 1 file changed, 2 deletions(-)

diff --git a/docs/source/howto/serving.mdx b/docs/source/howto/serving.mdx
index 9b894c95..77b894b9 100644
--- a/docs/source/howto/serving.mdx
+++ b/docs/source/howto/serving.mdx
@@ -5,8 +5,6 @@
 Text-Generation-Inference (TGI) is a highly optimized serving engine for serving Large Language Models (LLMs) in a way
 that better leverages the underlying hardware, Cloud TPU in this case.
 
-TGI comes as ready-to-use Docker containers.
-
 ## Deploy TGI on Cloud TPU instance
 
 We assume the reader already has a Cloud TPU instance up and running.
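With the series applied, the documented flow can be smoke-tested end to end. A minimal sketch, assuming the container from the "Docker Container Run" section is already up and that the TGI server exposes its usual `/health` route on the mapped port (the route and the retry loop are assumptions, not part of these patches):

```
#!/usr/bin/env bash
set -euo pipefail

# Wait up to ~60 seconds for the server to report ready.
for _ in $(seq 1 30); do
  curl -sf 127.0.0.1:8080/health > /dev/null && break
  sleep 2
done

# Issue a single generation request, mirroring the docs' curl example;
# TGI responds with a JSON body of the form {"generated_text": "..."}.
curl 127.0.0.1:8080/generate \
  -X POST \
  -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
  -H 'Content-Type: application/json'
```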