From 37019594ca6e488be4f9460e19fd42e87f65fc42 Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Fri, 1 Mar 2024 18:51:17 -0800 Subject: [PATCH] Improvements to LLM examples (#616) * typo * improved LLM serving examples -- faster, updated deps, text tweaks --- .../llm-serving/falcon_bitsandbytes.py | 16 ++--- 06_gpu_and_ml/llm-serving/falcon_gptq.py | 20 +++--- 06_gpu_and_ml/llm-serving/openllama.py | 19 +++-- .../llm-serving/text_generation_inference.py | 27 +++---- 06_gpu_and_ml/llm-serving/tgi_mixtral.py | 10 +-- 06_gpu_and_ml/llm-serving/vllm_gemma.py | 41 ++++++++--- 06_gpu_and_ml/llm-serving/vllm_inference.py | 70 ++++++++++++++----- 06_gpu_and_ml/llm-serving/vllm_mixtral.py | 52 +++++++++----- 8 files changed, 165 insertions(+), 90 deletions(-) diff --git a/06_gpu_and_ml/llm-serving/falcon_bitsandbytes.py b/06_gpu_and_ml/llm-serving/falcon_bitsandbytes.py index c2c7826c3..b281ca503 100644 --- a/06_gpu_and_ml/llm-serving/falcon_bitsandbytes.py +++ b/06_gpu_and_ml/llm-serving/falcon_bitsandbytes.py @@ -4,14 +4,14 @@ # # Run Falcon-40B with bitsandbytes # # In this example, we download the full-precision weights of the Falcon-40B LLM but load it in 4-bit using -# Tim Dettmer's [`bitsandbytes`](https://github.com/TimDettmers/bitsandbytes) library. This enables it to fit +# Tim Dettmers' [`bitsandbytes`](https://github.com/TimDettmers/bitsandbytes) library. This enables it to fit # into a single GPU (A100 40GB). # # Due to the current limitations of the library, the inference speed is a little over 2 tokens/second and due # to the sheer size of the model, the cold start time on Modal is around 2 minutes. # # For faster cold start at the expense of inference speed, check out -# [Running Falcon-40B with AutoGPTQ](/docs/examples/falcon_gptq). +# [Running Falcon-40B with AutoGPTQ](https://modal.com/docs/examples/falcon_gptq). # # ## Setup # @@ -43,7 +43,7 @@ def download_falcon_40b(): "bitsandbytes-cuda117==0.26.0.post2", "peft==0.6.2", "transformers==4.31.0", - "accelerate==0.26.2", + "accelerate==0.26.1", "hf-transfer==0.1.5", "torch==2.0.0", "torchvision==0.15.1", @@ -62,15 +62,15 @@ def download_falcon_40b(): # ## The model class # # Next, we write the model code. We want Modal to load the model into memory just once every time a container starts up, -# so we use [class syntax](/docs/guide/lifecycle-functions) and the `@enter` decorator. +# so we use [class syntax](https://modal.com/docs/guide/lifecycle-functions) and the `@enter` decorator. # -# Within the [@stub.cls](/docs/reference/modal.Stub#cls) decorator, we use the [gpu parameter](/docs/guide/gpu) -# to specify that we want to run our function on an [A100 GPU](/pricing). We also allow each call 10 mintues to complete, +# Within the [@stub.cls](https://modal.com/docs/reference/modal.Stub#cls) decorator, we use the [gpu parameter](/docs/guide/gpu) +# to specify that we want to run our function on an [A100 GPU](https://modal.com/docs/guide/gpu). We also allow each call 10 mintues to complete, # and request the runner to stay live for 5 minutes after its last request. # # We load the model in 4-bit using the `bitsandbytes` library. # -# The rest is just using the [pipeline()](https://huggingface.co/docs/transformers/en/main_classes/pipelines) +# The rest is just using the [`pipeline`](https://huggingface.co/docs/transformers/en/main_classes/pipelines) # abstraction from the `transformers` library. Refer to the documentation for more parameters and tuning. 
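For readers who want to see the pattern in isolation: the sketch below shows roughly what 4-bit loading with `bitsandbytes` plus a `transformers` `pipeline` looks like, independent of the Modal scaffolding. It is illustrative only; the model name is a small stand-in rather than the Falcon-40B checkpoint used in this example, and exact keyword arguments can vary across `transformers` versions.

```python
# Minimal sketch: 4-bit loading via bitsandbytes plus a text-generation pipeline.
# The model name is a small placeholder, not the Falcon-40B model from this example.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_name = "facebook/opt-350m"  # illustrative stand-in; some models also need trust_remote_code=True
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,  # quantize weights to 4 bits on load (requires bitsandbytes)
    device_map="auto",  # place weights on the available GPU(s)
    torch_dtype=torch.bfloat16,
)
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
print(generator("Falcon is a family of language models that", max_new_tokens=32)[0]["generated_text"])
```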
 @stub.cls(
     gpu=gpu.A100(),  # Use A100s
@@ -157,7 +157,7 @@ def generate(self, prompt: str):
 
 
 # ## Run the model
-# We define a [`local_entrypoint`](/docs/guide/apps#entrypoints-for-ephemeral-apps) to call our remote function
+# We define a [`local_entrypoint`](https://modal.com/docs/guide/apps#entrypoints-for-ephemeral-apps) to call our remote function
 # sequentially for a list of inputs. You can run this locally with `modal run -q falcon_bitsandbytes.py`. The `-q` flag
 # enables streaming to work in the terminal output.
 prompt_template = (
diff --git a/06_gpu_and_ml/llm-serving/falcon_gptq.py b/06_gpu_and_ml/llm-serving/falcon_gptq.py
index e61cdc378..88481500e 100644
--- a/06_gpu_and_ml/llm-serving/falcon_gptq.py
+++ b/06_gpu_and_ml/llm-serving/falcon_gptq.py
@@ -8,8 +8,8 @@
 # cold start time on Modal is around 25s.
 #
 # For faster inference at the expense of a slower cold start, check out
-# [Running Falcon-40B with `bitsandbytes` quantization](/docs/examples/falcon_bitsandbytes). You can also
-# run a smaller, 7-billion-parameter model with the [OpenLLaMa example](/docs/examples/openllama).
+# [Running Falcon-40B with `bitsandbytes` quantization](https://modal.com/docs/examples/falcon_bitsandbytes). You can also
+# run a smaller model via the [Gemma 7B example](https://modal.com/docs/examples/vllm_gemma).
 #
 # ## Setup
 #
@@ -33,8 +33,8 @@ def download_model():
 
 
 # Now, we define our image. We'll use the `debian-slim` base image, and install the dependencies we need
-# using [`pip_install`](/docs/reference/modal.Image#pip_install). At the end, we'll use
-# [`run_function`](/docs/guide/custom-container#run-a-modal-function-during-your-build-with-run_function-beta) to run the
+# using [`pip_install`](https://modal.com/docs/reference/modal.Image#pip_install). At the end, we'll use
+# [`run_function`](https://modal.com/docs/guide/custom-container#run-a-modal-function-during-your-build-with-run_function-beta) to run the
 # function defined above as part of the image build.
 
 image = (
@@ -52,21 +52,21 @@ def download_model():
     .run_function(download_model)
 )
 
-# Let's instantiate and name our [Stub](/docs/guide/apps).
+# Let's instantiate and name our [`Stub`](https://modal.com/docs/guide/apps).
 
 stub = Stub(name="example-falcon-gptq", image=image)
 
 
 # ## The model class
 #
 # Next, we write the model code. We want Modal to load the model into memory just once every time a container starts up,
-# so we use [class syntax](/docs/guide/lifecycle-functions) and the `@enter` decorator.
+# so we use [class syntax](https://modal.com/docs/guide/lifecycle-functions) and the `@enter` decorator.
 #
-# Within the [@stub.cls](/docs/reference/modal.Stub#cls) decorator, we use the [gpu parameter](/docs/guide/gpu)
-# to specify that we want to run our function on an [A100 GPU](/pricing). We also allow each call 10 mintues to complete,
+# Within the [`@stub.cls`](https://modal.com/docs/reference/modal.Stub#cls) decorator, we use the [`gpu` parameter](https://modal.com/docs/guide/gpu)
+# to specify that we want to run our function on an [A100 GPU](https://modal.com/docs/guide/gpu#a100-gpus). We also allow each call 10 minutes to complete,
 # and request the runner to stay live for 5 minutes after its last request.
 #
 # The rest is just using the `transformers` library to run the model.
Refer to the -# [documentation](https://huggingface.co/docs/transformers/v4.29.1/en/main_classes/text_generation#transformers.GenerationMixin.generate) +# [documentation](https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/text_generation#transformers.GenerationMixin.generate) # for more parameters and tuning. # # Note that we need to create a separate thread to call the `generate` function because we need to @@ -121,7 +121,7 @@ def generate(self, prompt: str): # ## Run the model -# We define a [`local_entrypoint`](/docs/guide/apps#entrypoints-for-ephemeral-apps) to call our remote function +# We define a [`local_entrypoint`](https://modal.com/docs/guide/apps#entrypoints-for-ephemeral-apps) to call our remote function # sequentially for a list of inputs. You can run this locally with `modal run -q falcon_gptq.py`. The `-q` flag # enables streaming to work in the terminal output. prompt_template = ( diff --git a/06_gpu_and_ml/llm-serving/openllama.py b/06_gpu_and_ml/llm-serving/openllama.py index d6f124ca3..a21891737 100644 --- a/06_gpu_and_ml/llm-serving/openllama.py +++ b/06_gpu_and_ml/llm-serving/openllama.py @@ -16,7 +16,7 @@ # inside our container image. # # To do this, we have to define a function that loads both the model and tokenizer using -# [from_pretrained](https://huggingface.co/docs/transformers/main_classes/model#transformers.PreTrainedModel.from_pretrained). +# [`from_pretrained`](https://huggingface.co/docs/transformers/main_classes/model#transformers.PreTrainedModel.from_pretrained). # Since HuggingFace stores this model into a local cache, when Modal snapshots the image after running this function, # the model weights will be saved and available for use when the container starts up next time. @@ -37,7 +37,7 @@ def download_models(): # function defined above as part of the image build. image = ( - # Python 3.11+ not yet supported for torch.compile + # Python 3.11+ not yet supported for `torch.compile` Image.debian_slim(python_version="3.10") .pip_install( "accelerate~=0.18.0", @@ -48,7 +48,7 @@ def download_models(): .run_function(download_models) ) -# Let's instantiate and name our [Stub](/docs/guide/apps). +# Let's instantiate and name our [`Stub`](https://modal.com/docs/guide/apps). stub = Stub(name="example-open-llama", image=image) @@ -61,7 +61,7 @@ def download_models(): # Within the [@stub.cls](/docs/reference/modal.Stub#cls) decorator, we use the [gpu parameter](/docs/guide/gpu) # to specify that we want to run our function on an [A100 GPU with 20 GB of VRAM](/pricing). # -# The rest is just using the [generate](https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate) function +# The rest is just using the [`generate`](https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate) function # from the `transformers` library. Refer to the documentation for more parameters and tuning. @@ -115,7 +115,7 @@ def generate( # ## Run the model -# Finally, we define a [`local_entrypoint`](/docs/guide/apps#entrypoints-for-ephemeral-apps) to call our remote function +# Finally, we define a [`local_entrypoint`](https://modal.com/docs/guide/apps#entrypoints-for-ephemeral-apps) to call our remote function # sequentially for a list of inputs. You can run this locally with `modal run openllama.py`. @@ -139,9 +139,8 @@ def main(): # ## Next steps # The above is a simple example of how to run a basic model. 
Note that OpenLLaMa has not been fine-tuned on an instruction-following dataset,
 # so the results aren't amazing out of the box. Refer to [DoppelBot, our Slack fine-tuning demo](https://github.com/modal-labs/doppel-bot) for how
-# you could use OpenLLaMa to perform a more useful downstream task.
+# you could use fine-tuning to make an LLM more useful for downstream tasks.
 #
-# If you're looking for useful responses out-of-the-box like ChatGPT, you could try Vicuna-13B, which is larger and has been instruction-tuned.
-# However, note that this model is not permissively licensed due to the dataset it was trained on. Refer to our [LLM voice chat](/docs/examples/llm-voice-chat)
-# post for how to build a complete voice chat app using Vicuna, or go straight to the [file](https://github.com/modal-labs/quillman/blob/main/src/llm_vicuna.py)
-# if you want to run it by itself.
+# If you're looking for responses more in the style of ChatGPT, you could try [Gemma 7B](https://modal.com/docs/examples/vllm_gemma), which has been trained to follow instructions.
+# However, note that Gemma is not permissively licensed: it is distributed under Google's own terms of use. Refer to our [LLM voice chat](https://modal.com/docs/examples/llm-voice-chat)
+# post for how to build a complete voice chat app using LLMs.
diff --git a/06_gpu_and_ml/llm-serving/text_generation_inference.py b/06_gpu_and_ml/llm-serving/text_generation_inference.py
index 130eb245c..e650ce557 100644
--- a/06_gpu_and_ml/llm-serving/text_generation_inference.py
+++ b/06_gpu_and_ml/llm-serving/text_generation_inference.py
@@ -3,10 +3,10 @@
 # In this example, we show how to run an optimized inference server using [Text Generation Inference (TGI)](https://github.com/huggingface/text-generation-inference)
 # with performance advantages over standard text generation pipelines including:
 # - continuous batching, so multiple generations can take place at the same time on a single container
-# - PagedAttention, an optimization that increases throughput.
+# - PagedAttention, which applies memory paging to the attention mechanism's key-value cache, increasing throughput
 #
 # This example deployment, [accessible here](https://modal-labs--tgi-app.modal.run), can serve LLaMA 2 70B with
-# 70 second cold starts, up to 200 tokens/s of throughput and per-token latency of 55ms.
+# 70 second cold starts, up to 200 tokens/s of throughput, and a per-token latency of 55ms.
 
 # ## Setup
 #
@@ -24,7 +24,6 @@
 #
 # Any model supported by TGI can be chosen here.
 
-GPU_CONFIG = gpu.A100(memory=80, count=2)
 MODEL_ID = "meta-llama/Llama-2-70b-chat-hf"
 REVISION = "e1ce257bd76895e0864f3b4d6c7ed3c4cdec93e2"
 # Add `["--quantize", "gptq"]` for TheBloke GPTQ models.
@@ -68,16 +67,17 @@ def download_model():
 
 # ### Image definition
-# We’ll start from a Dockerhub image recommended by TGI, and override the default `ENTRYPOINT` for
+# We’ll start from a Docker Hub image recommended by TGI, and override the default `ENTRYPOINT` for
 # Modal to run its own which enables seamless serverless deployments.
 #
 # Next we run the download step to pre-populate the image with our model weights.
 #
-# For this step to work on a gated model such as LLaMA 2, the HF_TOKEN environment
-# variable must be set ([reference](https://github.com/huggingface/text-generation-inference#using-a-private-or-gated-model)).
+# For this step to work on a [gated model](https://github.com/huggingface/text-generation-inference#using-a-private-or-gated-model)
+# such as LLaMA 2, the `HF_TOKEN` environment variable must be set.
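As a reminder of how that environment variable gets populated on Modal: a secret's key/value pairs are injected into any function that references it. The sketch below is a minimal illustration; it assumes a secret named `huggingface-secret` that defines `HF_TOKEN` (the examples variously read `HF_TOKEN` or `HUGGING_FACE_HUB_TOKEN`, depending on the library).

```python
# Minimal sketch: a Modal secret's keys appear as environment variables at runtime.
import os

from modal import Secret, Stub

stub = Stub("secret-check")


@stub.function(secrets=[Secret.from_name("huggingface-secret")])
def check_token():
    # Assumes the secret was created with a key named HF_TOKEN.
    print("HF_TOKEN present:", "HF_TOKEN" in os.environ)
```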
# -# After [creating a HuggingFace access token](https://huggingface.co/settings/tokens), -# head to the [secrets page](https://modal.com/secrets) to create a Modal secret. +# After [creating a HuggingFace access token](https://huggingface.co/settings/tokens) +# and accepting the [LLaMA 2 license](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf), +# head to the [secrets page](https://modal.com/secrets) to share it with Modal # # Finally, we install the `text-generation` client to interface with TGI's Rust webserver over `localhost`. @@ -97,7 +97,7 @@ def download_model(): # ## The model class # -# The inference function is best represented with Modal's [class syntax](/docs/guide/lifecycle-functions). +# The inference function is best represented with Modal's [class syntax](https://modal.com/docs/guide/lifecycle-functions). # The class syntax is a special representation for a Modal function which splits logic into two parts: # 1. the `@enter()` function, which runs once per container when it starts up, and # 2. the `@method()` function, which runs per inference request. @@ -108,11 +108,14 @@ def download_model(): # container ready. # # Here, we also -# - specify the secret so the `HUGGING_FACE_HUB_TOKEN` environment variable is set +# - specify the secret so the `HUGGING_FACE_HUB_TOKEN` environment variable can be set # - specify how many A100s we need per container # - specify that each container is allowed to handle up to 10 inputs (i.e. requests) simultaneously # - keep idle containers for 10 minutes before spinning down -# - lift the timeout of each request. +# - increase the timeout limit + + +GPU_CONFIG = gpu.A100(memory=80, count=2) # 2 A100s for LLaMA 2 70B @stub.cls( @@ -188,7 +191,7 @@ async def generate_stream(self, question: str): # ## Run the model -# We define a [`local_entrypoint`](/docs/guide/apps#entrypoints-for-ephemeral-apps) to invoke +# We define a [`local_entrypoint`](https://modal.com/docs/guide/apps#entrypoints-for-ephemeral-apps) to invoke # our remote function. You can run this script locally with `modal run text_generation_inference.py`. @stub.local_entrypoint() def main(): diff --git a/06_gpu_and_ml/llm-serving/tgi_mixtral.py b/06_gpu_and_ml/llm-serving/tgi_mixtral.py index 4c4067c9c..bfc9ff757 100644 --- a/06_gpu_and_ml/llm-serving/tgi_mixtral.py +++ b/06_gpu_and_ml/llm-serving/tgi_mixtral.py @@ -3,7 +3,7 @@ # In this example, we show how to run an optimized inference server using [Text Generation Inference (TGI)](https://github.com/huggingface/text-generation-inference) # with performance advantages over standard text generation pipelines including: # - continuous batching, so multiple generations can take place at the same time on a single container -# - PagedAttention, an optimization that increases throughput. +# - PagedAttention, which applies memory paging to the attention mechanism's key-value cache, increasing throughput # # This example deployment, [accessible here](https://modal-labs--tgi-mixtral.modal.run), can serve Mixtral 8x7B on two 80GB A100s, with # up to 500 tokens/s of throughput and per-token latency of 78ms. @@ -38,7 +38,7 @@ # ## Define a container image # -# We want to create a Modal image which has the Huggingface model cache pre-populated. +# We want to create a Modal image which has the Hugging Face model cache pre-populated. # The benefit of this is that the container no longer has to re-download the model from Huggingface - # instead, it will take advantage of Modal's internal filesystem for faster cold starts. 
# The 95GB model can be loaded in as little as 70 seconds. @@ -62,7 +62,7 @@ def download_model(): # ### Image definition -# We’ll start from a Dockerhub image recommended by TGI, and override the default `ENTRYPOINT` for +# We’ll start from a Docker Hub image recommended by TGI, and override the default `ENTRYPOINT` for # Modal to run its own which enables seamless serverless deployments. # # Next we run the download step to pre-populate the image with our model weights. @@ -81,7 +81,7 @@ def download_model(): # ## The model class # -# The inference function is best represented with Modal's [class syntax](/docs/guide/lifecycle-functions). +# The inference function is best represented with Modal's [class syntax](https://modal.com/docs/guide/lifecycle-functions). # The class syntax is a special representation for a Modal function which splits logic into two parts: # 1. the `@enter()` function, which runs once per container when it starts up, and # 2. the `@method()` function, which runs per inference request. @@ -155,7 +155,7 @@ async def generate_stream(self, question: str): # ## Run the model -# We define a [`local_entrypoint`](/docs/guide/apps#entrypoints-for-ephemeral-apps) to invoke +# We define a [`local_entrypoint`](https://modal.com/docs/guide/apps#entrypoints-for-ephemeral-apps) to invoke # our remote function. You can run this script locally with `modal run text_generation_inference.py`. @stub.local_entrypoint() def main(): diff --git a/06_gpu_and_ml/llm-serving/vllm_gemma.py b/06_gpu_and_ml/llm-serving/vllm_gemma.py index f08a5ea6d..96bf0819c 100644 --- a/06_gpu_and_ml/llm-serving/vllm_gemma.py +++ b/06_gpu_and_ml/llm-serving/vllm_gemma.py @@ -1,7 +1,7 @@ # # Fast inference with vLLM (Gemma 7B) # # In this example, we show how to run basic LLM inference, using [`vLLM`](https://github.com/vllm-project/vllm) -# to take advantage of [PagedAttention](https://arxiv.org/abs/2309.06180), which speeds up sequential inferences with optimized key-value caching. +# to take advantage of [PagedAttention](https://arxiv.org/abs/2309.06180), which speeds up inference on longer sequences with optimized key-value caching. # You can read more about PagedAttention [here](https://charlesfrye.github.io/programming/2023/11/10/llms-systems.html). # # We'll run the [Gemma 7B Instruct](https://huggingface.co/google/gemma-7b-it) large language model. @@ -9,7 +9,7 @@ # # The "7B" in the name refers to the number of parameters (floating point numbers used to control inference) # in the model. Applying those 7,000,000,000 numbers onto an input is a lot of work, -# so we'll use a GPU to speed up the process -- specifically, a top-of-the-line [NVIDIA H100](/blog/introducing-h100). +# so we'll use a GPU to speed up the process -- specifically, a top-of-the-line [NVIDIA H100](https://modal.com/blog/introducing-h100). # # "Instruct" means that this version of Gemma is not simply a statistical model of language, # but has been fine-tuned to follow instructions -- like ChatGPT or Claude, @@ -22,7 +22,8 @@ # # To run # [any of the other supported models](https://vllm.readthedocs.io/en/latest/models/supported_models.html), -# just change the model name. You may also need to enable `trust_remote_code` for some models (see comment below). +# just change the model name. You may also need to change engine configuration, like `trust_remote_code`, +# or GPU configuration, in order to run some models. 
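Concretely, switching models in the vLLM examples usually means touching only a couple of constants and, for some architectures, the engine arguments. The sketch below is hypothetical; the model name is not the one served in this example, and whether you need `trust_remote_code` or a bigger GPU depends entirely on the model you choose.

```python
# Hypothetical reconfiguration for a different model, not the one used in this example.
from modal import gpu
from vllm import LLM

MODEL_DIR = "/model"
BASE_MODEL = "mosaicml/mpt-7b-instruct"  # illustrative: a family that needs trust_remote_code
GPU_CONFIG = gpu.A100(memory=80, count=1)  # larger models may need more memory or more GPUs

llm = LLM(
    MODEL_DIR,
    trust_remote_code=True,  # only required by some model families
    tensor_parallel_size=GPU_CONFIG.count,  # shard across GPUs when count > 1
)
```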
 #
 # ## Setup
 #
 # First we import the components we need from `modal`.
@@ -30,11 +31,10 @@
 import os
 
-from modal import Image, Secret, Stub, enter, method
+from modal import Image, Secret, Stub, enter, exit, gpu, method
 
 MODEL_DIR = "/model"
 BASE_MODEL = "google/gemma-7b-it"
-GPU_TYPE = "H100"
 
 
 # ## Define a container image
@@ -103,20 +103,33 @@ def download_model_to_folder():
 # on the GPU for each subsequent invocation of the function.
 #
 # The `vLLM` library allows the code to remain quite clean!
-@stub.cls(gpu=GPU_TYPE, secrets=[Secret.from_name("huggingface-secret")])
+
+GPU_CONFIG = gpu.H100(count=1)
+
+
+@stub.cls(gpu=GPU_CONFIG, secrets=[Secret.from_name("huggingface-secret")])
 class Model:
     @enter()
     def load(self):
         from vllm import LLM
 
+        if GPU_CONFIG.count > 1:
+            # Patch issue from https://github.com/vllm-project/vllm/issues/1116
+            import ray
+
+            ray.shutdown()
+            ray.init(num_gpus=GPU_CONFIG.count)
+
+        self.template = (
+            "<start_of_turn>user\n{user}<end_of_turn>\n<start_of_turn>model\n"
+        )
+
         # Load the model. Tip: Some models, like MPT, may require `trust_remote_code=true`.
         self.llm = LLM(
             MODEL_DIR,
             enforce_eager=True,  # skip graph capturing for faster cold starts
+            tensor_parallel_size=GPU_CONFIG.count,
         )
-        self.template = """<start_of_turn>user
-{user}<end_of_turn>
-<start_of_turn>model"""
 
     @method()
     def generate(self, user_questions):
@@ -153,10 +166,18 @@ def generate(self, user_questions):
                 "\n\n",
                 sep=COLOR["ENDC"],
             )
+            time.sleep(0.01)
         print(
-            f"{COLOR['HEADER']}{COLOR['GREEN']}Generated {num_tokens} tokens from {BASE_MODEL} in {duration_s:.1f} seconds, throughput = {num_tokens / duration_s:.0f} tokens/second on GPU={GPU_TYPE}.{COLOR['ENDC']}"
+            f"{COLOR['HEADER']}{COLOR['GREEN']}Generated {num_tokens} tokens from {BASE_MODEL} in {duration_s:.1f} seconds, throughput = {num_tokens / duration_s:.0f} tokens/second on {GPU_CONFIG}.{COLOR['ENDC']}"
         )
 
+    @exit()
+    def stop_engine(self):
+        if GPU_CONFIG.count > 1:
+            import ray
+
+            ray.shutdown()
+
 
 # ## Run the model
 # We define a [`local_entrypoint`](/docs/guide/apps#entrypoints-for-ephemeral-apps) to call our remote function
diff --git a/06_gpu_and_ml/llm-serving/vllm_inference.py b/06_gpu_and_ml/llm-serving/vllm_inference.py
index 7a3a4aab7..fc4546d5b 100644
--- a/06_gpu_and_ml/llm-serving/vllm_inference.py
+++ b/06_gpu_and_ml/llm-serving/vllm_inference.py
@@ -6,9 +6,9 @@
 # `vLLM` also supports a use case as a FastAPI server which we will explore in a future guide. This example
 # walks through setting up an environment that works with `vLLM ` for basic inference.
 #
-# We are running the [Mistral 7B Instruct](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) model here, which is an instruct fine-tuned version of Mistral's 7B model best fit for conversation.
-# You can expect 20 second cold starts and well over 100 tokens/second. The larger the batch of prompts, the higher the throughput.
-# For example, with the 60 prompts below, we can produce 19k tokens in 15 seconds, which is around 1.25k tokens/second.
+# We are running the [Mistral 7B Instruct](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) model here, which is a fine-tuned version of Mistral's 7B model trained to follow instructions.
+# You can expect 20 second cold starts and well over 100 tokens/second. The larger the batch of prompts, the higher the throughput.
+# For example, with the 64 prompts below, we can produce 15k tokens in less than 7 seconds, a throughput of over 2k tokens/second.
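Those throughput figures come from batched offline generation: vLLM accepts a whole list of prompts in a single call and schedules them together with continuous batching. A bare-bones sketch of that call shape (paths and parameters are illustrative, and the full Modal version follows below):

```python
# Minimal sketch of batched generation with vLLM; paths and parameters are illustrative.
from vllm import LLM, SamplingParams

llm = LLM("/model")  # a directory of downloaded weights, as in this example
sampling_params = SamplingParams(temperature=0.75, max_tokens=128)

prompts = [f"Question {i}: what is {i} times {i}?" for i in range(64)]
outputs = llm.generate(prompts, sampling_params)  # one call, many prompts

num_tokens = sum(len(o.outputs[0].token_ids) for o in outputs)
print(f"generated {num_tokens} tokens across {len(prompts)} prompts")
```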
# # To run # [any of the other supported models](https://vllm.readthedocs.io/en/latest/models/supported_models.html), @@ -20,7 +20,7 @@ import os -from modal import Image, Secret, Stub, enter, method +from modal import Image, Secret, Stub, enter, gpu, method MODEL_DIR = "/model" BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.1" @@ -33,10 +33,6 @@ # advantage of Modal's internal filesystem for faster cold starts. # # ### Download the weights -# Make sure you have created a [HuggingFace access token](https://huggingface.co/settings/tokens). -# To access the token in a Modal function, we can create a secret on the [secrets page](https://modal.com/secrets). -# Now the token will be available via the environment variable named `HF_TOKEN`. Functions that inject this secret will have access to the environment variable. -# # We can download the model to a particular directory using the HuggingFace utility function `snapshot_download`. # # Tip: avoid using global variables in this function. Changes to code outside this function will not be detected and the download step will not re-run. @@ -49,18 +45,18 @@ def download_model_to_folder(): snapshot_download( BASE_MODEL, local_dir=MODEL_DIR, - token=os.environ["HF_TOKEN"], + ignore_patterns=["*.pt", "*.bin"], # Using safetensors ) move_cache() # ### Image definition -# We’ll start from a recommended Dockerhub image and install `vLLM`. -# Then we’ll use run_function to run the function defined above to ensure the weights of +# We’ll start from a recommended Docker Hub image and install `vLLM`. +# Then we’ll use `run_function` to run the function defined above to ensure the weights of # the model are saved within the container image. image = ( Image.from_registry( - "nvidia/cuda:12.1.0-base-ubuntu22.04", add_python="3.10" + "nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10" ) .pip_install( "vllm==0.2.5", @@ -82,19 +78,31 @@ def download_model_to_folder(): # ## The model class # -# The inference function is best represented with Modal's [class syntax](/docs/guide/lifecycle-functions) and the `@enter` decorator. +# The inference function is best represented with Modal's [class syntax](https://modal.com/docs/guide/lifecycle-functions) and the `@enter` decorator. # This enables us to load the model into memory just once every time a container starts up, and keep it cached # on the GPU for each subsequent invocation of the function. # # The `vLLM` library allows the code to remain quite clean. -@stub.cls(gpu="A100", secrets=[Secret.from_name("huggingface-secret")]) + +# try out an H100 if you've got a large model or big batches! +GPU_CONFIG = gpu.A100(count=1) # 40GB A100 by default + + +@stub.cls(gpu=GPU_CONFIG, secrets=[Secret.from_name("huggingface-secret")]) class Model: @enter() def load_model(self): from vllm import LLM + if GPU_CONFIG.count > 1: + # Patch issue from https://github.com/vllm-project/vllm/issues/1116 + import ray + + ray.shutdown() + ray.init(num_gpus=GPU_CONFIG.count) + # Load the model. Tip: MPT models may require `trust_remote_code=true`. 
-        self.llm = LLM(MODEL_DIR)
+        self.llm = LLM(MODEL_DIR, tensor_parallel_size=GPU_CONFIG.count)
         self.template = """[INST] <<SYS>>
 {system}
 <</SYS>>
@@ -103,6 +111,8 @@ def load_model(self):
 
     @method()
     def generate(self, user_questions):
+        import time
+
         from vllm import SamplingParams
 
         prompts = [
@@ -112,19 +122,38 @@
         sampling_params = SamplingParams(
             temperature=0.75,
             top_p=1,
-            max_tokens=800,
+            max_tokens=256,
             presence_penalty=1.15,
         )
+        start = time.monotonic_ns()
         result = self.llm.generate(prompts, sampling_params)
+        duration_s = (time.monotonic_ns() - start) / 1e9
         num_tokens = 0
+
+        COLOR = {
+            "HEADER": "\033[95m",
+            "BLUE": "\033[94m",
+            "GREEN": "\033[92m",
+            "RED": "\033[91m",
+            "ENDC": "\033[0m",
+        }
+
         for output in result:
             num_tokens += len(output.outputs[0].token_ids)
-            print(output.prompt, output.outputs[0].text, "\n\n", sep="")
-        print(f"Generated {num_tokens} tokens")
+            print(
+                f"{COLOR['HEADER']}{COLOR['GREEN']}{output.prompt}",
+                f"\n{COLOR['BLUE']}{output.outputs[0].text}",
+                "\n\n",
+                sep=COLOR["ENDC"],
+            )
+            time.sleep(0.01)
+        print(
+            f"{COLOR['HEADER']}{COLOR['GREEN']}Generated {num_tokens} tokens from {BASE_MODEL} in {duration_s:.1f} seconds, throughput = {num_tokens / duration_s:.0f} tokens/second on {GPU_CONFIG}.{COLOR['ENDC']}"
+        )
 
 
 # ## Run the model
-# We define a [`local_entrypoint`](/docs/guide/apps#entrypoints-for-ephemeral-apps) to call our remote function
+# We define a [`local_entrypoint`](https://modal.com/docs/guide/apps#entrypoints-for-ephemeral-apps) to call our remote function
 # sequentially for a list of inputs. You can run this locally with `modal run vllm_inference.py`.
 @stub.local_entrypoint()
 def main():
@@ -177,6 +206,7 @@ def main():
         "In a dystopian future where water is the most valuable commodity, how would society function?",
         "If a scientist discovers immortality, how could this impact society, economy, and the environment?",
         "What could be the potential implications of contact with an advanced alien civilization?",
+        "Describe how you would mediate a conflict between two roommates about doing the dishes using techniques of non-violent communication.",
         # Math
         "What is the product of 9 and 8?",
         "If a train travels 120 kilometers in 2 hours, what is its average speed?",
@@ -195,5 +225,9 @@ def main():
         "What are 'zombie stars' in the context of astronomy?",
         "Who were the 'Dog-Headed Saint' and the 'Lion-Faced Saint' in medieval Christian traditions?",
         "What is the story of the 'Globsters', unidentified organic masses washed up on the shores?",
+        # Multilingual
+        "战国时期最重要的人物是谁?",
+        "Tuende hatua kwa hatua. Hesabu jumla ya mfululizo wa kihesabu wenye neno la kwanza 2, neno la mwisho 42, na jumla ya maneno 21.",
+        "Kannst du die wichtigsten Eigenschaften und Funktionen des NMDA-Rezeptors beschreiben?",
     ]
     model.generate.remote(questions)
diff --git a/06_gpu_and_ml/llm-serving/vllm_mixtral.py b/06_gpu_and_ml/llm-serving/vllm_mixtral.py
index 7d78efed8..c8dc3a5c9 100644
--- a/06_gpu_and_ml/llm-serving/vllm_mixtral.py
+++ b/06_gpu_and_ml/llm-serving/vllm_mixtral.py
@@ -3,14 +3,11 @@
 # In this example, we show how to run basic inference, using [`vLLM`](https://github.com/vllm-project/vllm)
 # to take advantage of PagedAttention, which speeds up sequential inferences with optimized key-value caching.
 #
-# `vLLM` also supports a use case as a FastAPI server which we will explore in a future guide. This example
-# walks through setting up an environment that works with `vLLM ` for basic inference.
-# -# We are running the [Mixtral 8x7B Instruct](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) model here, which is a mixture-of-experts model finetuned for conversation. -# You can expect 3 minute cold starts. -# For a single request, the throughput is about 11 tokens/second, but there are upcoming `vLLM` optimizations to improve this. -# The larger the batch of prompts, the higher the throughput (up to about 300 tokens/second). -# For example, with the 60 prompts below, we can produce 30k tokens in 100 seconds. +# We are running the [Mixtral 8x7B Instruct](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) model here, +# which is a mixture-of-experts model finetuned for conversation. +# You can expect ~3 minute cold starts. +# For a single request, the throughput is over 50 tokens/second. +# The larger the batch of prompts, the higher the throughput (up to hundreds of tokens per second). # # ## Setup # @@ -19,7 +16,7 @@ import os import time -from modal import Image, Stub, enter, gpu, method +from modal import Image, Stub, enter, exit, gpu, method MODEL_DIR = "/model" BASE_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1" @@ -36,6 +33,8 @@ # # We can download the model to a particular directory using the HuggingFace utility function `snapshot_download`. # +# Mixtral is beefy, at nearly 100 GB in `safetensors` format, so this can take some time -- at least a few minutes. +# # Tip: avoid using global variables in this function. Changes to code outside this function will not be detected and the download step will not re-run. def download_model_to_folder(): from huggingface_hub import snapshot_download @@ -46,7 +45,7 @@ def download_model_to_folder(): snapshot_download( BASE_MODEL, local_dir=MODEL_DIR, - ignore_patterns="*.pt", # Using safetensors + ignore_patterns=["*.pt"], # Using safetensors ) move_cache() @@ -58,10 +57,10 @@ def download_model_to_folder(): vllm_image = ( Image.from_registry( - "nvidia/cuda:12.1.0-base-ubuntu22.04", add_python="3.10" + "nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10" ) .pip_install( - "vllm==0.2.6", + "vllm==0.3.2", "huggingface_hub==0.19.4", "hf-transfer==0.1.4", "torch==2.1.2", @@ -90,9 +89,14 @@ def download_model_to_folder(): class Model: @enter() def start_engine(self): + import time + from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine + print("🥶 cold starting inference") + start = time.monotonic_ns() + if GPU_CONFIG.count > 1: # Patch issue from https://github.com/vllm-project/vllm/issues/1116 import ray @@ -104,10 +108,16 @@ def start_engine(self): model=MODEL_DIR, tensor_parallel_size=GPU_CONFIG.count, gpu_memory_utilization=0.90, + enforce_eager=False, # capture the graph for faster inference, but slower cold starts + disable_log_stats=True, # disable logging so we can stream tokens + disable_log_requests=True, ) + self.template = " [INST] {user} [/INST] " + # this can take some time! 
self.engine = AsyncLLMEngine.from_engine_args(engine_args) - self.template = " [INST] {user} [/INST] " + duration_s = (time.monotonic_ns() - start) / 1e9 + print(f"🏎️ engine started in {duration_s:.0f}s") @method() async def completion_stream(self, user_question): @@ -116,11 +126,10 @@ async def completion_stream(self, user_question): sampling_params = SamplingParams( temperature=0.75, - max_tokens=1024, + max_tokens=128, repetition_penalty=1.1, ) - t0 = time.time() request_id = random_uuid() result_generator = self.engine.generate( self.template.format(user=user_question), @@ -128,6 +137,7 @@ async def completion_stream(self, user_question): request_id, ) index, num_tokens = 0, 0 + start = time.monotonic_ns() async for output in result_generator: if ( output.outputs[0].text @@ -139,8 +149,16 @@ async def completion_stream(self, user_question): num_tokens = len(output.outputs[0].token_ids) yield text_delta + duration_s = (time.monotonic_ns() - start) / 1e9 - print(f"Generated {num_tokens} tokens in {time.time() - t0:.2f}s") + yield f"\n\tGenerated {num_tokens} tokens from {BASE_MODEL} in {duration_s:.1f}s, throughput = {num_tokens / duration_s:.0f} tokens/second on {GPU_CONFIG}.\n" + + @exit() + def stop_engine(self): + if GPU_CONFIG.count > 1: + import ray + + ray.shutdown() # ## Run the model @@ -159,7 +177,7 @@ def main(): "Who was Emperor Norton I, and what was his significance in San Francisco's history?", ] for question in questions: - print("Sending new request:", question) + print("Sending new request:", question, "\n\n") for text in model.completion_stream.remote_gen(question): print(text, end="", flush=True)
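Once an app like this has been deployed with `modal deploy vllm_mixtral.py`, the same streaming method can also be called from other Python code. The sketch below is an assumption-laden illustration: the app name is a guess (check the name your `Stub` was deployed under), and it relies on `modal.Cls.lookup`, which may differ in newer client versions.

```python
import modal

# Hypothetical app name; use the name reported by `modal deploy` for your deployment.
Model = modal.Cls.lookup("example-vllm-mixtral", "Model")

for text in Model().completion_stream.remote_gen(
    "Explain mixture-of-experts language models in two sentences."
):
    print(text, end="", flush=True)
```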