From 9d7df0d310aa2edf15db9483270e3b9100c8addb Mon Sep 17 00:00:00 2001
From: Morgan Funtowicz
Date: Mon, 8 Apr 2024 15:04:08 +0200
Subject: [PATCH 1/4] Fix rule and instructions for TGI

---
 Makefile                      |  4 ++--
 docs/source/howto/serving.mdx | 36 ++++++++++++++++++++++++++++++++++-
 2 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index 82a4acea..c06e50d6 100644
--- a/Makefile
+++ b/Makefile
@@ -46,8 +46,8 @@ tpu-tgi:
 	docker build --rm -f text-generation-inference/Dockerfile \
 		--build-arg VERSION=$(VERSION) \
 		--build-arg TGI_VERSION=$(TGI_VERSION) \
-		-t tpu-tgi:$(VERSION) .
-	docker tag tpu-tgi:$(VERSION) tpu-tgi:latest
+		-t huggingface/optimum-tpu:$(VERSION)-tgi .
+	docker tag huggingface/optimum-tpu:$(VERSION)-tgi tpu-tgi:latest
 
 # Run code quality checks
 style_check:
diff --git a/docs/source/howto/serving.mdx b/docs/source/howto/serving.mdx
index 8a406f06..bcd078ec 100644
--- a/docs/source/howto/serving.mdx
+++ b/docs/source/howto/serving.mdx
@@ -1,3 +1,37 @@
 # Deploying a Text-Generation Inference server on a Google Cloud TPU instance
 
-Stay tuned!
+## Context
+
+Text-Generation-Inference (TGI) is a highly optimized serving engine for serving Large Language Models (LLMs) in a way
+that better leverages the underlying hardware, Cloud TPU in this case.
+
+TGI comes as ready-to-use Docker containers.
+
+## Deploy TGI on Cloud TPU instance
+
+We assume the reader already has a Cloud TPU instance up and running.
+If this is not the case, please see our guide to deploy one [here](./deploy.mdx).
+
+### Docker Container Build
+
+Optimum-TPU provides a `make tpu-tgi` command at the root level to help you create a local Docker image.
+
+### Docker Container Run
+
+```
+docker run -p 8080:80 \
+  --net=host --privileged \
+  -v $(pwd)/data:/data \
+  -e HF_TOKEN=${HF_TOKEN} \
+  -e HF_BATCH_SIZE=1 \
+  -e HF_SEQUENCE_LENGTH=1024 \
+  huggingface/tpu-tgi \
+  --model-id google/gemma-2b \
+  --max-concurrent-requests 4 \
+  --max-input-length 512 \
+  --max-total-tokens 1024 \
+  --max-batch-prefill-tokens 512 \
+  --max-batch-total-tokens 1024
+```
+
+### Executing requests against
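For reference, the `tpu-tgi` rule renamed above is driven by the `VERSION` and `TGI_VERSION` Makefile variables. A minimal sketch of a local build from the repository root, with placeholder version values (`0.1.0b1` appears later in this series; the TGI version here is purely illustrative):

```
# Build the serving image locally; VERSION and TGI_VERSION feed the
# --build-arg flags of the Makefile rule above. The values below are
# illustrative placeholders, not pinned by these patches.
make tpu-tgi VERSION=0.1.0b1 TGI_VERSION=1.4.5

# The rule tags the result as huggingface/optimum-tpu:<VERSION>-tgi
# (plus a tpu-tgi:latest alias); verify both tags exist.
docker images | grep -E 'optimum-tpu|tpu-tgi'
```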
From fb3c06fb94f93ce1eec92b3e412c58bce96c2a7a Mon Sep 17 00:00:00 2001
From: Morgan Funtowicz
Date: Mon, 8 Apr 2024 15:16:47 +0200
Subject: [PATCH 2/4] Add remainder of sentence

---
 docs/source/howto/serving.mdx | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/docs/source/howto/serving.mdx b/docs/source/howto/serving.mdx
index bcd078ec..28903454 100644
--- a/docs/source/howto/serving.mdx
+++ b/docs/source/howto/serving.mdx
@@ -34,4 +34,20 @@ docker run -p 8080:80 \
   --max-batch-total-tokens 1024
 ```
 
-### Executing requests against
+### Executing requests against the service
+
+You can query the model using either the `/generate` or `/generate_stream` routes:
+
+```
+curl 127.0.0.1:8080/generate \
+    -X POST \
+    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
+    -H 'Content-Type: application/json'
+```
+
+```
+curl 127.0.0.1:8080/generate_stream \
+    -X POST \
+    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
+    -H 'Content-Type: application/json'
+```

From b166fff68cc0a1b9931cb4b82d7795f01e11114c Mon Sep 17 00:00:00 2001
From: Morgan Funtowicz
Date: Mon, 8 Apr 2024 15:22:25 +0200
Subject: [PATCH 3/4] Address comments

---
 docs/source/howto/serving.mdx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/source/howto/serving.mdx b/docs/source/howto/serving.mdx
index 28903454..9b894c95 100644
--- a/docs/source/howto/serving.mdx
+++ b/docs/source/howto/serving.mdx
@@ -19,13 +19,14 @@ Optimum-TPU provides a `make tpu-tgi` command at the root level to help you crea
 ### Docker Container Run
 
 ```
+OPTIMUM_TPU_VERSION=0.1.0b1
 docker run -p 8080:80 \
   --net=host --privileged \
   -v $(pwd)/data:/data \
   -e HF_TOKEN=${HF_TOKEN} \
   -e HF_BATCH_SIZE=1 \
   -e HF_SEQUENCE_LENGTH=1024 \
-  huggingface/tpu-tgi \
+  huggingface/optimum-tpu:${OPTIMUM_TPU_VERSION}-tgi \
   --model-id google/gemma-2b \
   --max-concurrent-requests 4 \
   --max-input-length 512 \

From bb5cdab613cdce6772bdc4b71be418a3f88b1cbe Mon Sep 17 00:00:00 2001
From: Morgan Funtowicz
Date: Mon, 8 Apr 2024 15:23:03 +0200
Subject: [PATCH 4/4] Again

---
 docs/source/howto/serving.mdx | 2 --
 1 file changed, 2 deletions(-)

diff --git a/docs/source/howto/serving.mdx b/docs/source/howto/serving.mdx
index 9b894c95..77b894b9 100644
--- a/docs/source/howto/serving.mdx
+++ b/docs/source/howto/serving.mdx
@@ -5,8 +5,6 @@
 Text-Generation-Inference (TGI) is a highly optimized serving engine for serving Large Language Models (LLMs) in a way
 that better leverages the underlying hardware, Cloud TPU in this case.
 
-TGI comes as ready-to-use Docker containers.
-
 ## Deploy TGI on Cloud TPU instance
 
 We assume the reader already has a Cloud TPU instance up and running.
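With the series applied, the documented flow can be smoke-tested end to end. A minimal sketch, assuming the container from the "Docker Container Run" section is already up and that the TGI server exposes its usual `/health` route on the mapped port (the route and the retry loop are assumptions, not part of these patches):

```
#!/usr/bin/env bash
set -euo pipefail

# Wait up to ~60 seconds for the server to report ready.
for _ in $(seq 1 30); do
  curl -sf 127.0.0.1:8080/health > /dev/null && break
  sleep 2
done

# Issue a single generation request, mirroring the docs' curl example;
# TGI responds with a JSON body of the form {"generated_text": "..."}.
curl 127.0.0.1:8080/generate \
  -X POST \
  -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
  -H 'Content-Type: application/json'
```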