From 951dd3b62c1dd4f57f574afa0a7e6706b361e928 Mon Sep 17 00:00:00 2001 From: Baptiste Colle <32412211+baptistecolle@users.noreply.github.com> Date: Wed, 18 Dec 2024 16:27:29 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=94=96=20New=20release=20for=20optimum=20?= =?UTF-8?q?tpu=200.2.2=20(#130)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(tpu): add training Dockerfile for 0.2.2 * feat(tpu): Add TGI Dockerfile for 0.2.2 * docs(tpu): update documentation for TPU * Update containers/tgi/tpu/0.2.2/entrypoint.sh Co-authored-by: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com> * Update containers/pytorch/training/tpu/2.5.1/transformers/4.46.3/py310/Dockerfile Co-authored-by: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com> * docs(readme) modify docs to state that building tpu image does not require TPUs * docs: fix typos and add comments * docs(readme) modify phrasing for docker build for tpu to be clearer --------- Co-authored-by: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com> --- README.md | 7 +- containers/pytorch/training/README.md | 14 +- .../transformers/4.46.3/py310/Dockerfile | 56 +++++ containers/tgi/README.md | 40 +++- containers/tgi/tpu/0.2.2/Dockerfile | 196 ++++++++++++++++++ containers/tgi/tpu/0.2.2/entrypoint.sh | 45 ++++ 6 files changed, 345 insertions(+), 13 deletions(-) create mode 100644 containers/pytorch/training/tpu/2.5.1/transformers/4.46.3/py310/Dockerfile create mode 100644 containers/tgi/tpu/0.2.2/Dockerfile create mode 100644 containers/tgi/tpu/0.2.2/entrypoint.sh diff --git a/README.md b/README.md index 3719173b..b5d13024 100644 --- a/README.md +++ b/README.md @@ -9,15 +9,14 @@ The [Google-Cloud-Containers](https://github.com/huggingface/Google-Cloud-Contai - Training - [PyTorch](./containers/pytorch/training/README.md) - GPU - - TPU (soon) + - TPU - Inference - [PyTorch](./containers/pytorch/inference/README.md) - CPU - GPU - - TPU (soon) - [Text Generation Inference](./containers/tgi/README.md) - GPU - - TPU (soon) + - TPU - [Text Embeddings Inference](./containers/tei/README.md) - CPU - GPU @@ -32,6 +31,8 @@ The [Google-Cloud-Containers](https://github.com/huggingface/Google-Cloud-Contai | us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-pytorch-training-cu121.2-3.transformers.4-42.ubuntu2204.py310 | [huggingface-pytorch-training-gpu.2.3.0.transformers.4.42.3.py310](./containers/pytorch/training/gpu/2.3.0/transformers/4.42.3/py310/Dockerfile) | PyTorch | Training | GPU | | us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-pytorch-inference-cu121.2-2.transformers.4-44.ubuntu2204.py311 | [huggingface-pytorch-inference-gpu.2.2.2.transformers.4.44.0.py311](./containers/pytorch/inference/gpu/2.2.2/transformers/4.44.0/py311/Dockerfile) | PyTorch | Inference | GPU | | us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-pytorch-inference-cpu.2-2.transformers.4-44.ubuntu2204.py311 | [huggingface-pytorch-inference-cpu.2.2.2.transformers.4.44.0.py311](./containers/pytorch/inference/cpu/2.2.2/transformers/4.44.0/py311/Dockerfile) | PyTorch | Inference | CPU | +| us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-tpu.0.2.2.py310 | [huggingface-text-generation-inference-tpu.0.2.2.py310](./containers/tgi/tpu/0.2.2/Dockerfile) | TGI | Inference | TPU | +| us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-pytorch-training-tpu.2.5.1.transformers.4.46.3.py310 | 
[huggingface-pytorch-training-tpu.2.5.1.transformers.4.46.3.py310](./containers/pytorch/training/tpu/2.5.1/transformers/4.46.3/py310/Dockerfile) | PyTorch | Training | TPU |

> [!NOTE]
> The listing above only contains the latest version of each of the Hugging Face DLCs, the full listing of the available published containers in Google Cloud can be found either in the [Deep Learning Containers Documentation](https://cloud.google.com/deep-learning-containers/docs/choosing-container#hugging-face), in the [Google Cloud Artifact Registry](https://console.cloud.google.com/artifacts/docker/deeplearning-platform-release/us/gcr.io) or via the `gcloud container images list --repository="us-docker.pkg.dev/deeplearning-platform-release/gcr.io" | grep "huggingface-"` command.

diff --git a/containers/pytorch/training/README.md b/containers/pytorch/training/README.md
index 781a8dfc..1fd7e915 100644
--- a/containers/pytorch/training/README.md
+++ b/containers/pytorch/training/README.md
@@ -79,8 +79,18 @@ The PyTorch Training containers come with two different containers depending on
  docker build -t us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-pytorch-training-cu121.2-3.transformers.4-42.ubuntu2204.py310 -f containers/pytorch/training/gpu/2.3.0/transformers/4.42.3/py310/Dockerfile .
  ```

-- **TPU**: To build the PyTorch Training container for Google Cloud TPUs, an instance with at least one TPU available is required to install `optimum-tpu` which is a Python library with Google TPU optimizations for `transformers` models, making its integration seamless.
+- **TPU**: You can build the PyTorch Training container for Google Cloud TPUs on any machine with `docker build`; you do not need to build it on a TPU VM.

  ```bash
-  docker build -t us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-pytorch-training-tpu.2.4.0.transformers.4.41.1.py310 -f containers/pytorch/training/tpu/2.4.0/transformers/4.41.1/py310/Dockerfile .
+  docker build -t us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-pytorch-training-tpu.2.5.1.transformers.4.46.3.py310 -f containers/pytorch/training/tpu/2.5.1/transformers/4.46.3/py310/Dockerfile .
+  ```
+
+  To run the example notebook for fine-tuning Gemma, use the command below. You can skip the “Environment Setup” step, as you should already be on a TPU-enabled machine. For better security, consider omitting the `--allow-root` and `--NotebookApp.token=''` options when running the notebook.
+ + ```bash + docker run --rm --net host --privileged \ + -v$(pwd)/artifacts:/tmp/output \ + -e HF_TOKEN=${HF_TOKEN} \ + us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-pytorch-training-tpu.2.5.1.transformers.4.46.3.py310 \ + jupyter notebook --allow-root --NotebookApp.token='' /notebooks ``` diff --git a/containers/pytorch/training/tpu/2.5.1/transformers/4.46.3/py310/Dockerfile b/containers/pytorch/training/tpu/2.5.1/transformers/4.46.3/py310/Dockerfile new file mode 100644 index 00000000..dcfd60f9 --- /dev/null +++ b/containers/pytorch/training/tpu/2.5.1/transformers/4.46.3/py310/Dockerfile @@ -0,0 +1,56 @@ +FROM us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.5.1_3.10_tpuvm +# Google maintained pytorch xla image with PyTorch=2.5.1 and Python=3.10 +# Read more about it here: https://github.com/pytorch/xla?tab=readme-ov-file#docker + +LABEL maintainer="Hugging Face" +ARG DEBIAN_FRONTEND=noninteractive + +# Versions +ARG OPTIMUM_TPU='0.2.2' +ARG TRANSFORMERS='4.46.3' +ARG PEFT='0.13.2' +ARG TRL='0.12.1' +ARG DATASETS='3.1.0' +ARG ACCELERATE='1.1.0' +ARG EVALUATE='0.4.3' +ARG SAFETENSORS='0.4.5' + +# Update pip +RUN pip install --upgrade pip + +# Install Hugging Face Libraries +RUN pip install --upgrade --no-cache-dir \ + transformers[sklearn,sentencepiece]==${TRANSFORMERS} \ + datasets==${DATASETS} \ + accelerate==${ACCELERATE} \ + evaluate==${EVALUATE} \ + peft==${PEFT} \ + trl==${TRL} \ + safetensors==${SAFETENSORS} \ + jupyter notebook + +# Install Optimum TPU +RUN pip install git+https://github.com/huggingface/optimum-tpu.git@v${OPTIMUM_TPU} +# Add example +ADD https://raw.githubusercontent.com/huggingface/optimum-tpu/v${OPTIMUM_TPU}/examples/language-modeling/gemma_tuning.ipynb \ + /notebooks/gemma_tuning.ipynb + +# Install Google CLI single command +RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" \ + | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \ + curl https://packages.cloud.google.com/apt/doc/apt-key.gpg \ + | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \ + apt-get update -y && \ + apt-get install google-cloud-sdk -y + +# Install Google Cloud Python dependencies +RUN pip install --upgrade --no-cache-dir \ + google-cloud-storage \ + google-cloud-bigquery \ + google-cloud-aiplatform \ + google-cloud-pubsub \ + google-cloud-logging \ + "protobuf<4.0.0" + +# Override pytorch xla base image with empty entrypoint +ENTRYPOINT [""] diff --git a/containers/tgi/README.md b/containers/tgi/README.md index 6c9aa503..98b39673 100644 --- a/containers/tgi/README.md +++ b/containers/tgi/README.md @@ -14,13 +14,13 @@ gcloud container images list --repository="us-docker.pkg.dev/deeplearning-platfo Below you will find the instructions on how to run and test the TGI containers available within this repository. Note that before proceeding you need to first ensure that you have Docker installed either on your local or remote instance, if not, please follow the instructions on how to install Docker [here](https://docs.docker.com/get-docker/). -To run the Docker container in GPUs you need to ensure that your hardware is supported (NVIDIA drivers on your device need to be compatible with CUDA version 12.2 or higher) and also install the NVIDIA Container Toolkit. +### Run -To find the supported models and hardware before running the TGI DLC, feel free to check [TGI Documentation](https://huggingface.co/docs/text-generation-inference/supported_models). 
+The TGI containers support two different accelerator types: GPU and TPU. Depending on your infrastructure, you'll use different approaches to run the containers.

-### Run
+- **GPU**: To run this DLC, you need to have GPU accelerators available within the instance that you want to run TGI, not only because those are required, but also to enable the best performance due to the optimized inference CUDA kernels. Additionally, you need to ensure that your hardware is supported (NVIDIA drivers on your device need to be compatible with CUDA version 12.2 or higher) and also install the NVIDIA Container Toolkit.

-To run this DLC, you need to have GPU accelerators available within the instance that you want to run TGI, not only because those are required, but also to enable the best performance due to the optimized inference CUDA kernels.
+To find the supported models and hardware before running the TGI DLC, feel free to check [TGI Documentation](https://huggingface.co/docs/text-generation-inference/supported_models).

 Besides that, you also need to define the model to deploy, as well as the generation configuration. For the model selection, you can pick any model from the Hugging Face Hub that contains the tag `text-generation-inference` which means that it's supported by TGI; to explore all the available models within the Hub, please check [here](https://huggingface.co/models?other=text-generation-inference&sort=trending). Then, to select the best configuration for that model you can either keep the default values defined within TGI, or just select the recommended ones based on our instance specification via the Hugging Face Recommender API for TGI as follows:

@@ -60,6 +60,22 @@ docker run --gpus all -ti --shm-size 1g -p 8080:8080 \
   us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-3.ubuntu2204.py311
 ```

+- **TPU**: This example showcases how to deploy a TGI server on a TPU instance using the TGI container. Note that TPU support for TGI is currently experimental and may have limitations compared to GPU deployments.
+
+```bash
+docker run --rm -p 8080:8080 \
+  --shm-size 16G --ipc host --privileged \
+  -e MODEL_ID=google/gemma-7b-it \
+  -e HF_TOKEN=$(cat ~/.cache/huggingface/token) \
+  -e MAX_INPUT_LENGTH=4000 \
+  -e MAX_TOTAL_TOKENS=4096 \
+  us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-tpu.0.2.2.py310
+```
+
+> [!NOTE]
+> Check the [Hugging Face Optimum TPU documentation](https://huggingface.co/docs/optimum-tpu/) for more information on TPU model serving.
+
 ### Test

 Once the Docker container is running, as it has been deployed with `text-generation-launcher`, the API will expose the following endpoints listed within the [TGI OpenAPI Specification](https://huggingface.github.io/text-generation-inference/).

@@ -108,8 +124,16 @@ curl 0.0.0.0:8080/generate \

 > [!WARNING]
 > Building the containers is not recommended since those are already built by Hugging Face and Google Cloud teams and provided openly, so the recommended approach is to use the pre-built containers available in [Google Cloud's Artifact Registry](https://console.cloud.google.com/artifacts/docker/deeplearning-platform-release/us/gcr.io) instead.
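Following that recommendation, here is a minimal sketch of pulling the pre-built TPU image from the Artifact Registry instead of building it yourself. The exact image tag and whether you need to authenticate Docker against `us-docker.pkg.dev` depend on your environment, so treat the commands below as an assumption-laden example rather than the canonical workflow:

```bash
# List the published Hugging Face DLCs and locate the TPU TGI image
gcloud container images list --repository="us-docker.pkg.dev/deeplearning-platform-release/gcr.io" | grep "huggingface-text-generation-inference"

# If your environment requires it, let gcloud configure Docker credentials for the registry
gcloud auth configure-docker us-docker.pkg.dev --quiet

# Pull the pre-built TPU image instead of building it (append a tag if the listing shows one)
docker pull us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-tpu.0.2.2.py310
```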
-In order to build TGI Docker container, you will need an instance with at least 4 NVIDIA GPUs available with at least 24 GiB of VRAM each, since TGI needs to build and compile the kernels required for the optimized inference. Also note that the build process may take ~30 minutes to complete, depending on the instance's specifications.
+The TGI containers come with two different variants depending on the accelerator used:

-```bash
-docker build -t us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-3.ubuntu2204.py311 -f containers/tgi/gpu/2.3.1/Dockerfile .
-```
+- **GPU**: To build the TGI container for GPU, you will need an instance with at least 4 NVIDIA GPUs available with at least 24 GiB of VRAM each, since TGI needs to build and compile the kernels required for the optimized inference. The build process may take ~30 minutes to complete.
+
+  ```bash
+  docker build -t us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-3.ubuntu2204.py311 -f containers/tgi/gpu/2.3.1/Dockerfile .
+  ```
+
+- **TPU**: You can build the TGI container for Google Cloud TPUs on any machine with `docker build`; you do not need to build it on a TPU VM.
+
+  ```bash
+  docker build --ulimit nofile=100000:100000 -t us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-tpu.0.2.2.py310 -f containers/tgi/tpu/0.2.2/Dockerfile .
+  ```
diff --git a/containers/tgi/tpu/0.2.2/Dockerfile b/containers/tgi/tpu/0.2.2/Dockerfile
new file mode 100644
index 00000000..8cf284d7
--- /dev/null
+++ b/containers/tgi/tpu/0.2.2/Dockerfile
@@ -0,0 +1,196 @@
+# Enable GCP integration by default
+ARG ENABLE_GOOGLE_FEATURE=1
+
+# Fetch and extract the TGI sources
+FROM alpine AS tgi
+# TGI version 2.4.1 by default
+ARG TGI_VERSION=2.4.1
+RUN test -n ${TGI_VERSION:?}
+RUN mkdir -p /tgi
+ADD https://github.com/huggingface/text-generation-inference/archive/v${TGI_VERSION}.tar.gz /tgi/sources.tar.gz
+RUN tar -C /tgi -xf /tgi/sources.tar.gz --strip-components=1
+
+# Build cargo components (adapted from TGI original Dockerfile)
+# Note that the build image is aligned on the same Linux version as the base image (Debian bookworm/ Ubuntu 22.04)
+FROM lukemathwalker/cargo-chef:latest-rust-1.80.1-bookworm AS chef
+WORKDIR /usr/src
+
+ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
+
+FROM chef AS planner
+COPY --from=tgi /tgi/Cargo.toml Cargo.toml
+COPY --from=tgi /tgi/Cargo.lock Cargo.lock
+COPY --from=tgi /tgi/rust-toolchain.toml rust-toolchain.toml
+COPY --from=tgi /tgi/proto proto
+COPY --from=tgi /tgi/benchmark benchmark
+COPY --from=tgi /tgi/router router
+COPY --from=tgi /tgi/backends backends
+COPY --from=tgi /tgi/launcher launcher
+RUN cargo chef prepare --recipe-path recipe.json
+
+FROM chef AS builder
+ARG ENABLE_GOOGLE_FEATURE
+RUN echo "Google Feature Status: ${ENABLE_GOOGLE_FEATURE}"
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    python3.11-dev
+RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
+    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
+    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
+    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
+    rm -f $PROTOC_ZIP
+
+COPY --from=planner /usr/src/recipe.json recipe.json
+RUN cargo chef cook --profile release-opt --recipe-path recipe.json
+
+COPY --from=tgi /tgi/Cargo.toml Cargo.toml
+COPY --from=tgi /tgi/Cargo.lock Cargo.lock
+COPY --from=tgi 
/tgi/rust-toolchain.toml rust-toolchain.toml +COPY --from=tgi /tgi/proto proto +COPY --from=tgi /tgi/benchmark benchmark +COPY --from=tgi /tgi/router router +COPY --from=tgi /tgi/backends backends +COPY --from=tgi /tgi/launcher launcher +RUN if [ -n "$ENABLE_GOOGLE_FEATURE" ]; then \ + cargo build --profile release-opt --features google; \ + else \ + cargo build --profile release-opt; \ + fi + +# Python base image +FROM ubuntu:22.04 AS base + +RUN apt-get update -y \ + && apt-get install -y --no-install-recommends \ + python3-pip \ + python3-setuptools \ + python-is-python3 \ + git \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean +RUN pip3 --no-cache-dir install --upgrade pip + +ARG ENABLE_GOOGLE_FEATURE +ARG VERSION='0.2.2' +RUN test -n ${VERSION:?} + +FROM base AS optimum-tpu-installer + +COPY . /tmp/src + +RUN if [ -n "$ENABLE_GOOGLE_FEATURE" ]; then \ + # If we are building for GCP, we need to clone the optimum-tpu repo as this is built from the huggingface/Google-Cloud-Containers repository and not the huggingface/optimum-tpu repository + git clone https://github.com/huggingface/optimum-tpu.git /opt/optimum-tpu && \ + cd /opt/optimum-tpu && git checkout v${VERSION}; \ + fi && \ + # Check if the optimum-tpu repo is cloned properly + cp -a /tmp/src /opt/optimum-tpu && \ + if [ ! -d "/opt/optimum-tpu/optimum" ]; then \ + echo "Error: Building from incorrect repository. This build must be run from optimum-tpu repo. If building from google-cloud-containers repo, set ENABLE_GOOGLE_FEATURE=1 to automatically clone optimum-tpu" && \ + exit 1; \ + fi + + +# Python server build image +FROM base AS pyserver + +RUN apt-get update -y \ + && apt-get install -y --no-install-recommends \ + make \ + python3-venv \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +RUN install -d /pyserver +WORKDIR /pyserver +COPY --from=optimum-tpu-installer /opt/optimum-tpu/text-generation-inference/server server +COPY --from=tgi /tgi/proto proto +RUN pip3 install -r server/build-requirements.txt +RUN VERBOSE=1 BUILDDIR=/pyserver/build PROTODIR=/pyserver/proto VERSION=${VERSION} make -C server gen-server + +# TPU base image (used for deployment) +FROM base AS tpu_base + +ARG VERSION=${VERSION} + +# Install system prerequisites +RUN apt-get update -y \ + && apt-get install -y --no-install-recommends \ + libpython3.10 \ + libpython3.11 \ + python3.11 \ + git \ + gnupg2 \ + wget \ + curl \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +# Update pip +RUN pip install --upgrade pip + +# Install HuggingFace packages +ARG TRANSFORMERS_VERSION='4.46.3' +ARG ACCELERATE_VERSION='1.1.1' +ARG SAFETENSORS_VERSION='0.4.5' + +ARG ENABLE_GOOGLE_FEATURE + +ENV HF_HUB_ENABLE_HF_TRANSFER=1 +ENV VERSION=${VERSION} + +ENV PORT=${ENABLE_GOOGLE_FEATURE:+8080} +ENV PORT=${PORT:-80} + +ENV HF_HOME=${ENABLE_GOOGLE_FEATURE:+/tmp} +ENV HF_HOME=${HF_HOME:-/data} + +# Install requirements for TGI, that uses python3.11 +RUN python3.11 -m pip install transformers==${TRANSFORMERS_VERSION} + +# Install requirements for optimum-tpu, then for TGI then optimum-tpu +RUN python3 -m pip install hf_transfer safetensors==${SAFETENSORS_VERSION} typer +COPY --from=optimum-tpu-installer /opt/optimum-tpu /opt/optimum-tpu +RUN python3 /opt/optimum-tpu/optimum/tpu/cli.py install-jetstream-pytorch --yes +RUN python3 -m pip install -e /opt/optimum-tpu \ + -f https://storage.googleapis.com/libtpu-releases/index.html + +# Install benchmarker +COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark 
/usr/local/bin/text-generation-benchmark
+# Install router
+COPY --from=builder /usr/src/target/release-opt/text-generation-router-v2 /usr/local/bin/text-generation-router
+# Install launcher
+COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
+# Install python server
+COPY --from=pyserver /pyserver/build/dist dist
+RUN pip install dist/text_generation_server*.tar.gz
+
+
+# TPU compatible image for Inference Endpoints
+FROM tpu_base AS inference-endpoint
+
+COPY text-generation-inference/docker/entrypoint.sh entrypoint.sh
+RUN chmod +x entrypoint.sh
+ENTRYPOINT ["./entrypoint.sh"]
+
+FROM tpu_base AS google-cloud-containers
+
+# Install Google specific components if ENABLE_GOOGLE_FEATURE is set
+RUN if [ -n "$ENABLE_GOOGLE_FEATURE" ]; then \
+    apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    ca-certificates \
+    curl \
+    git && \
+    rm -rf /var/lib/apt/lists/* && \
+    echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" \
+    | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \
+    curl https://packages.cloud.google.com/apt/doc/apt-key.gpg \
+    | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \
+    apt-get update -y && \
+    apt-get install google-cloud-sdk -y; \
+    fi
+
+# Custom entrypoint for Google
+COPY --chmod=775 containers/tgi/tpu/${VERSION}/entrypoint.sh* entrypoint.sh
+ENTRYPOINT ["./entrypoint.sh"]
\ No newline at end of file
diff --git a/containers/tgi/tpu/0.2.2/entrypoint.sh b/containers/tgi/tpu/0.2.2/entrypoint.sh
new file mode 100644
index 00000000..cd283a13
--- /dev/null
+++ b/containers/tgi/tpu/0.2.2/entrypoint.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+# This is required by GKE, see
+# https://cloud.google.com/kubernetes-engine/docs/how-to/tpus#privileged-mode
+ulimit -l 68719476736
+
+# Check if AIP_STORAGE_URI is set and starts with "gs://"
+if [[ $AIP_STORAGE_URI == gs://* ]]; then
+    echo "AIP_STORAGE_URI set and starts with 'gs://', proceeding to download from GCS."
+    echo "AIP_STORAGE_URI: $AIP_STORAGE_URI"
+
+    # Define the target directory
+    TARGET_DIR="/tmp/model"
+    mkdir -p "$TARGET_DIR"
+
+    # Use gcloud storage to copy the content from GCS to the target directory
+    echo "Running: gcloud storage cp $AIP_STORAGE_URI/* $TARGET_DIR --recursive"
+    gcloud storage cp "$AIP_STORAGE_URI/*" "$TARGET_DIR" --recursive
+
+    # Check if the gcloud storage command was successful
+    if [ $? -eq 0 ]; then
+        echo "Model downloaded successfully to ${TARGET_DIR}."
+        # Update MODEL_ID to point to the local directory
+        echo "Updating MODEL_ID to point to the local directory."
+        export MODEL_ID="$TARGET_DIR"
+    else
+        echo "Failed to download model from GCS."
+        exit 1
+    fi
+fi
+
+if [[ -z "${MAX_BATCH_SIZE}" ]]; then
+    # Default to a batch size of 4 if no value is provided
+    export MAX_BATCH_SIZE="4"
+fi
+
+if [[ -n "${QUANTIZATION}" ]]; then
+    # If quantization is set, we use jetstream_int8 (this is the only option supported by optimum-tpu at the moment)
+    QUANTIZATION="jetstream_int8"
+    export QUANTIZATION="${QUANTIZATION}"
+fi
+
+ldconfig 2>/dev/null || echo 'unable to refresh ld cache, not a big deal in most cases'
+
+exec text-generation-launcher "$@"
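To illustrate how the entrypoint above is typically exercised, here is a minimal sketch of running the resulting image on a TPU VM. The model, token, and batching values are illustrative assumptions: `MAX_BATCH_SIZE` falls back to 4 when unset, any non-empty `QUANTIZATION` is coerced to `jetstream_int8` by the entrypoint, and `AIP_STORAGE_URI` is normally injected by Vertex AI rather than set by hand.

```bash
# Assumes HF_TOKEN is exported in the shell and the image tag matches the one built above
docker run --rm -p 8080:8080 \
  --shm-size 16G --ipc host --privileged \
  -e MODEL_ID=google/gemma-7b-it \
  -e HF_TOKEN=${HF_TOKEN} \
  -e MAX_BATCH_SIZE=4 \
  -e QUANTIZATION=jetstream_int8 \
  us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-tpu.0.2.2.py310
```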