Improvements to LLM examples (#616)
* typo

* improved LLM serving examples -- faster, updated deps, text tweaks
charlesfrye authored Mar 2, 2024
1 parent 927f0d0 commit 3701959
Showing 8 changed files with 165 additions and 90 deletions.
16 changes: 8 additions & 8 deletions 06_gpu_and_ml/llm-serving/falcon_bitsandbytes.py
@@ -4,14 +4,14 @@
# # Run Falcon-40B with bitsandbytes
#
# In this example, we download the full-precision weights of the Falcon-40B LLM but load it in 4-bit using
# Tim Dettmer's [`bitsandbytes`](https://github.com/TimDettmers/bitsandbytes) library. This enables it to fit
# Tim Dettmers' [`bitsandbytes`](https://github.com/TimDettmers/bitsandbytes) library. This enables it to fit
# into a single GPU (A100 40GB).
#
# Due to the current limitations of the library, the inference speed is a little over 2 tokens/second, and due
# to the sheer size of the model, the cold start time on Modal is around 2 minutes.
#
# For faster cold start at the expense of inference speed, check out
# [Running Falcon-40B with AutoGPTQ](/docs/examples/falcon_gptq).
# [Running Falcon-40B with AutoGPTQ](https://modal.com/docs/examples/falcon_gptq).
#
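# As a rough illustration, 4-bit loading with `transformers` and `bitsandbytes` generally
# follows the pattern sketched below; the arguments are illustrative and may differ from the
# exact ones used in this example.
#
#     import torch
#     from transformers import AutoModelForCausalLM, BitsAndBytesConfig
#
#     quant_config = BitsAndBytesConfig(
#         load_in_4bit=True,
#         bnb_4bit_quant_type="nf4",  # 4-bit "normal float" quantization
#         bnb_4bit_compute_dtype=torch.bfloat16,  # compute in bf16 while weights stay 4-bit
#     )
#     model = AutoModelForCausalLM.from_pretrained(
#         "tiiuae/falcon-40b",
#         quantization_config=quant_config,
#         device_map="auto",  # spread layers across available GPU memory
#         trust_remote_code=True,
#     )
#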
# ## Setup
#
@@ -43,7 +43,7 @@ def download_falcon_40b():
"bitsandbytes-cuda117==0.26.0.post2",
"peft==0.6.2",
"transformers==4.31.0",
"accelerate==0.26.2",
"accelerate==0.26.1",
"hf-transfer==0.1.5",
"torch==2.0.0",
"torchvision==0.15.1",
@@ -62,15 +62,15 @@ def download_falcon_40b():
# ## The model class
#
# Next, we write the model code. We want Modal to load the model into memory just once every time a container starts up,
# so we use [class syntax](/docs/guide/lifecycle-functions) and the `@enter` decorator.
# so we use [class syntax](https://modal.com/docs/guide/lifecycle-functions) and the `@enter` decorator.
#
# Within the [@stub.cls](/docs/reference/modal.Stub#cls) decorator, we use the [gpu parameter](/docs/guide/gpu)
# to specify that we want to run our function on an [A100 GPU](/pricing). We also allow each call 10 mintues to complete,
# Within the [@stub.cls](https://modal.com/docs/reference/modal.Stub#cls) decorator, we use the [gpu parameter](/docs/guide/gpu)
# to specify that we want to run our function on an [A100 GPU](https://modal.com/docs/guide/gpu). We also allow each call 10 minutes to complete,
# and request the runner to stay live for 5 minutes after its last request.
#
# We load the model in 4-bit using the `bitsandbytes` library.
#
# The rest is just using the [pipeline()](https://huggingface.co/docs/transformers/en/main_classes/pipelines)
# The rest is just using the [`pipeline`](https://huggingface.co/docs/transformers/en/main_classes/pipelines)
# abstraction from the `transformers` library. Refer to the documentation for more parameters and tuning.
@stub.cls(
gpu=gpu.A100(), # Use A100s
@@ -157,7 +157,7 @@ def generate(self, prompt: str):
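# The collapsed class body boils down to a pattern like the sketch below. It is simplified
# and non-streaming, and the pipeline arguments and generation settings are illustrative
# rather than the exact ones used here.

import transformers

# built once per container, inside the method decorated with `@enter`
pipe = transformers.pipeline(
    "text-generation",
    model="tiiuae/falcon-40b",
    device_map="auto",
    model_kwargs={"load_in_4bit": True},  # 4-bit loading via bitsandbytes
    trust_remote_code=True,
)

# called per request, inside the method decorated with `@method`
def generate(prompt: str) -> str:
    outputs = pipe(prompt, max_new_tokens=128, do_sample=True, temperature=0.7)
    return outputs[0]["generated_text"]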


# ## Run the model
# We define a [`local_entrypoint`](/docs/guide/apps#entrypoints-for-ephemeral-apps) to call our remote function
# We define a [`local_entrypoint`](https://modal.com/docs/guide/apps#entrypoints-for-ephemeral-apps) to call our remote function
# sequentially for a list of inputs. You can run this locally with `modal run -q falcon_bitsandbytes.py`. The `-q` flag
# enables streaming to work in the terminal output.
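# The collapsed entrypoint looks roughly like the sketch below; the class name is a
# placeholder, since the actual class and `prompt_template` are defined elsewhere in the file.

@stub.local_entrypoint()
def main():
    questions = [
        "What are the main differences between Python and JavaScript?",
        "Write a story in the style of James Joyce about a trip to the zoo.",
    ]
    model = Falcon40B()  # hypothetical name for the model class defined above
    for question in questions:
        prompt = prompt_template.format(question)  # assumes a single positional placeholder
        # stream text back from the remote generator as it is produced
        for text in model.generate.remote_gen(prompt):
            print(text, end="", flush=True)
        print()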
prompt_template = (
20 changes: 10 additions & 10 deletions 06_gpu_and_ml/llm-serving/falcon_gptq.py
@@ -8,8 +8,8 @@
# cold start time on Modal is around 25s.
#
# For faster inference at the expense of a slower cold start, check out
# [Running Falcon-40B with `bitsandbytes` quantization](/docs/examples/falcon_bitsandbytes). You can also
# run a smaller, 7-billion-parameter model with the [OpenLLaMa example](/docs/examples/openllama).
# [Running Falcon-40B with `bitsandbytes` quantization](https://modal.com/docs/examples/falcon_bitsandbytes). You can also
# run a smaller model via the [Gemma 7B example](https://modal.com/docs/examples/vllm_gemma).
#
# ## Setup
#
@@ -33,8 +33,8 @@ def download_model():


# Now, we define our image. We'll use the `debian-slim` base image, and install the dependencies we need
# using [`pip_install`](/docs/reference/modal.Image#pip_install). At the end, we'll use
# [`run_function`](/docs/guide/custom-container#run-a-modal-function-during-your-build-with-run_function-beta) to run the
# using [`pip_install`](https://modal.com/docs/reference/modal.Image#pip_install). At the end, we'll use
# [`run_function`](https://modal.com/docs/guide/custom-container#run-a-modal-function-during-your-build-with-run_function-beta) to run the
# function defined above as part of the image build.

image = (
@@ -52,21 +52,21 @@ def download_model():
.run_function(download_model)
)

# Let's instantiate and name our [Stub](/docs/guide/apps).
# Let's instantiate and name our [`Stub`](https://modal.com/docs/guide/apps).
stub = Stub(name="example-falcon-gptq", image=image)


# ## The model class
#
# Next, we write the model code. We want Modal to load the model into memory just once every time a container starts up,
# so we use [class syntax](/docs/guide/lifecycle-functions) and the `@enter` decorator.
# so we use [class syntax](https://modal.com/docs/guide/lifecycle-functions) and the `@enter` decorator.
#
# Within the [@stub.cls](/docs/reference/modal.Stub#cls) decorator, we use the [gpu parameter](/docs/guide/gpu)
# to specify that we want to run our function on an [A100 GPU](/pricing). We also allow each call 10 mintues to complete,
# Within the [`@stub.cls`](https://modal.com/docs/reference/modal.Stub#cls) decorator, we use the [`gpu` parameter](https://modal.com/docs/guide/gpu)
# to specify that we want to run our function on an [A100 GPU](https://modal.com/docs/guide/gpu#a100-gpus). We also allow each call 10 minutes to complete,
# and request the runner to stay live for 5 minutes after its last request.
#
# The rest is just using the `transformers` library to run the model. Refer to the
# [documentation](https://huggingface.co/docs/transformers/v4.29.1/en/main_classes/text_generation#transformers.GenerationMixin.generate)
# [documentation](https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/text_generation#transformers.GenerationMixin.generate)
# for more parameters and tuning.
#
# Note that we need to create a separate thread to call the `generate` function because we need to
@@ -121,7 +121,7 @@ def generate(self, prompt: str):
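# The threaded pattern typically looks like the sketch below, using `TextIteratorStreamer`
# from `transformers`; this is a rough outline rather than the exact collapsed code.

from threading import Thread

from transformers import TextIteratorStreamer

def stream_generate(model, tokenizer, prompt: str):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    # `generate` blocks until it finishes, so run it on a background thread
    # and yield new text from the streamer as it arrives
    thread = Thread(
        target=model.generate,
        kwargs=dict(**inputs, streamer=streamer, max_new_tokens=512),
    )
    thread.start()
    for new_text in streamer:
        yield new_text
    thread.join()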


# ## Run the model
# We define a [`local_entrypoint`](/docs/guide/apps#entrypoints-for-ephemeral-apps) to call our remote function
# We define a [`local_entrypoint`](https://modal.com/docs/guide/apps#entrypoints-for-ephemeral-apps) to call our remote function
# sequentially for a list of inputs. You can run this locally with `modal run -q falcon_gptq.py`. The `-q` flag
# enables streaming to work in the terminal output.
prompt_template = (
19 changes: 9 additions & 10 deletions 06_gpu_and_ml/llm-serving/openllama.py
@@ -16,7 +16,7 @@
# inside our container image.
#
# To do this, we have to define a function that loads both the model and tokenizer using
# [from_pretrained](https://huggingface.co/docs/transformers/main_classes/model#transformers.PreTrainedModel.from_pretrained).
# [`from_pretrained`](https://huggingface.co/docs/transformers/main_classes/model#transformers.PreTrainedModel.from_pretrained).
# Since Hugging Face stores this model in a local cache, when Modal snapshots the image after running this function,
# the model weights will be saved and available for use when the container starts up next time.
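# The collapsed `download_models` below follows a shape like this sketch; the model ID is
# illustrative, and the example pins its own.

from transformers import AutoModelForCausalLM, AutoTokenizer

def download_models():
    base_model = "openlm-research/open_llama_7b"  # illustrative OpenLLaMA checkpoint
    # from_pretrained writes into the Hugging Face cache; Modal snapshots the filesystem
    # after this function runs, so the weights ship inside the image
    AutoModelForCausalLM.from_pretrained(base_model)
    AutoTokenizer.from_pretrained(base_model)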

@@ -37,7 +37,7 @@ def download_models():
# function defined above as part of the image build.

image = (
# Python 3.11+ not yet supported for torch.compile
# Python 3.11+ not yet supported for `torch.compile`
Image.debian_slim(python_version="3.10")
.pip_install(
"accelerate~=0.18.0",
@@ -48,7 +48,7 @@ def download_models():
.run_function(download_models)
)

# Let's instantiate and name our [Stub](/docs/guide/apps).
# Let's instantiate and name our [`Stub`](https://modal.com/docs/guide/apps).

stub = Stub(name="example-open-llama", image=image)

@@ -61,7 +61,7 @@ def download_models():
# Within the [@stub.cls](/docs/reference/modal.Stub#cls) decorator, we use the [gpu parameter](/docs/guide/gpu)
# to specify that we want to run our function on an [A100 GPU with 20 GB of VRAM](/pricing).
#
# The rest is just using the [generate](https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate) function
# The rest is just using the [`generate`](https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate) function
# from the `transformers` library. Refer to the documentation for more parameters and tuning.


@@ -115,7 +115,7 @@ def generate(


# ## Run the model
# Finally, we define a [`local_entrypoint`](/docs/guide/apps#entrypoints-for-ephemeral-apps) to call our remote function
# Finally, we define a [`local_entrypoint`](https://modal.com/docs/guide/apps#entrypoints-for-ephemeral-apps) to call our remote function
# sequentially for a list of inputs. You can run this locally with `modal run openllama.py`.


@@ -139,9 +139,8 @@ def main():
# ## Next steps
# The above is a simple example of how to run a basic model. Note that OpenLLaMa has not been fine-tuned on an instruction-following dataset,
# so the results aren't amazing out of the box. Refer to [DoppelBot, our Slack fine-tuning demo](https://github.com/modal-labs/doppel-bot) for how
# you could use OpenLLaMa to perform a more useful downstream task.
# you could use fine-tuning to make an LLM more useful for downstream tasks.
#
# If you're looking for useful responses out-of-the-box like ChatGPT, you could try Vicuna-13B, which is larger and has been instruction-tuned.
# However, note that this model is not permissively licensed due to the dataset it was trained on. Refer to our [LLM voice chat](/docs/examples/llm-voice-chat)
# post for how to build a complete voice chat app using Vicuna, or go straight to the [file](https://github.com/modal-labs/quillman/blob/main/src/llm_vicuna.py)
# if you want to run it by itself.
# If you're looking for responses more in the style of ChatGPT, you could try [Gemma 7B](https://modal.com/docs/examples/vllm_gemma), which has been trained to follow instructions.
# However, note that Gemma is released under its own terms of use rather than a standard permissive license. Refer to our [LLM voice chat](https://modal.com/docs/examples/llm-voice-chat)
# post for how to build a complete voice chat app using LLMs.
27 changes: 15 additions & 12 deletions 06_gpu_and_ml/llm-serving/text_generation_inference.py
@@ -3,10 +3,10 @@
# In this example, we show how to run an optimized inference server using [Text Generation Inference (TGI)](https://github.com/huggingface/text-generation-inference)
# with performance advantages over standard text generation pipelines including:
# - continuous batching, so multiple generations can take place at the same time on a single container
# - PagedAttention, an optimization that increases throughput.
# - PagedAttention, which applies memory paging to the attention mechanism's key-value cache, increasing throughput
#
# This example deployment, [accessible here](https://modal-labs--tgi-app.modal.run), can serve LLaMA 2 70B with
# 70 second cold starts, up to 200 tokens/s of throughput and per-token latency of 55ms.
# 70 second cold starts, up to 200 tokens/s of throughput, and a per-token latency of 55ms.

# ## Setup
#
@@ -24,7 +24,6 @@
#
# Any model supported by TGI can be chosen here.

GPU_CONFIG = gpu.A100(memory=80, count=2)
MODEL_ID = "meta-llama/Llama-2-70b-chat-hf"
REVISION = "e1ce257bd76895e0864f3b4d6c7ed3c4cdec93e2"
# Add `["--quantize", "gptq"]` for TheBloke GPTQ models.
@@ -68,16 +67,17 @@ def download_model():


# ### Image definition
# We’ll start from a Dockerhub image recommended by TGI, and override the default `ENTRYPOINT` for
# We’ll start from a Docker Hub image recommended by TGI, and override the default `ENTRYPOINT` for
# Modal to run its own, which enables seamless serverless deployments.
#
# Next we run the download step to pre-populate the image with our model weights.
#
# For this step to work on a gated model such as LLaMA 2, the HF_TOKEN environment
# variable must be set ([reference](https://github.com/huggingface/text-generation-inference#using-a-private-or-gated-model)).
# For this step to work on a [gated model](https://github.com/huggingface/text-generation-inference#using-a-private-or-gated-model)
# such as LLaMA 2, the `HF_TOKEN` environment variable must be set.
#
# After [creating a HuggingFace access token](https://huggingface.co/settings/tokens),
# head to the [secrets page](https://modal.com/secrets) to create a Modal secret.
# After [creating a HuggingFace access token](https://huggingface.co/settings/tokens)
# and accepting the [LLaMA 2 license](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf),
# head to the [secrets page](https://modal.com/secrets) to share it with Modal.
#
# Finally, we install the `text-generation` client to interface with TGI's Rust webserver over `localhost`.

@@ -97,7 +97,7 @@ def download_model():

# ## The model class
#
# The inference function is best represented with Modal's [class syntax](/docs/guide/lifecycle-functions).
# The inference function is best represented with Modal's [class syntax](https://modal.com/docs/guide/lifecycle-functions).
# The class syntax is a special representation for a Modal function which splits logic into two parts:
# 1. the `@enter()` function, which runs once per container when it starts up, and
# 2. the `@method()` function, which runs per inference request.
@@ -108,11 +108,14 @@ def download_model():
# container ready.
#
# Here, we also
# - specify the secret so the `HUGGING_FACE_HUB_TOKEN` environment variable is set
# - specify the secret so the `HUGGING_FACE_HUB_TOKEN` environment variable can be set
# - specify how many A100s we need per container
# - specify that each container is allowed to handle up to 10 inputs (i.e. requests) simultaneously
# - keep idle containers for 10 minutes before spinning down
# - lift the timeout of each request.
# - increase the timeout limit
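# Concretely, those choices map onto decorator arguments and lifecycle hooks roughly as in
# the sketch below; the values are illustrative, and `GPU_CONFIG` is defined just after.

from modal import Secret, enter, method

@stub.cls(
    secrets=[Secret.from_name("huggingface-secret")],  # exposes HUGGING_FACE_HUB_TOKEN
    gpu=GPU_CONFIG,  # two 80GB A100s
    allow_concurrent_inputs=10,  # up to 10 in-flight requests per container
    container_idle_timeout=60 * 10,  # keep idle containers warm for 10 minutes
    timeout=60 * 60,  # generous per-request limit
)
class Model:
    @enter()
    def start_server(self):
        ...  # launch the TGI process and connect a local client, once per container

    @method()
    async def generate_stream(self, question: str):
        ...  # forward the prompt to TGI and stream back the generated text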


GPU_CONFIG = gpu.A100(memory=80, count=2) # 2 A100s for LLaMA 2 70B


@stub.cls(
@@ -188,7 +191,7 @@ async def generate_stream(self, question: str):


# ## Run the model
# We define a [`local_entrypoint`](/docs/guide/apps#entrypoints-for-ephemeral-apps) to invoke
# We define a [`local_entrypoint`](https://modal.com/docs/guide/apps#entrypoints-for-ephemeral-apps) to invoke
# our remote function. You can run this script locally with `modal run text_generation_inference.py`.
@stub.local_entrypoint()
def main():
10 changes: 5 additions & 5 deletions 06_gpu_and_ml/llm-serving/tgi_mixtral.py
@@ -3,7 +3,7 @@
# In this example, we show how to run an optimized inference server using [Text Generation Inference (TGI)](https://github.com/huggingface/text-generation-inference)
# with performance advantages over standard text generation pipelines including:
# - continuous batching, so multiple generations can take place at the same time on a single container
# - PagedAttention, an optimization that increases throughput.
# - PagedAttention, which applies memory paging to the attention mechanism's key-value cache, increasing throughput
#
# This example deployment, [accessible here](https://modal-labs--tgi-mixtral.modal.run), can serve Mixtral 8x7B on two 80GB A100s, with
# up to 500 tokens/s of throughput and per-token latency of 78ms.
@@ -38,7 +38,7 @@

# ## Define a container image
#
# We want to create a Modal image which has the Huggingface model cache pre-populated.
# We want to create a Modal image which has the Hugging Face model cache pre-populated.
# The benefit of this is that the container no longer has to re-download the model from Hugging Face;
# instead, it will take advantage of Modal's internal filesystem for faster cold starts.
# The 95GB model can be loaded in as little as 70 seconds.
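# The collapsed `download_model` below follows a shape like this sketch, which pre-populates
# the Hugging Face cache with `huggingface_hub`; exact arguments in the example may differ.

from huggingface_hub import snapshot_download

MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1"  # illustrative; the example pins its own ID

def download_model():
    # download the weights into the cache; Modal then snapshots the image,
    # so containers start with the files already present
    snapshot_download(MODEL_ID, ignore_patterns=["*.pt"])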
@@ -62,7 +62,7 @@ def download_model():


# ### Image definition
# We’ll start from a Dockerhub image recommended by TGI, and override the default `ENTRYPOINT` for
# We’ll start from a Docker Hub image recommended by TGI, and override the default `ENTRYPOINT` for
# Modal to run its own, which enables seamless serverless deployments.
#
# Next we run the download step to pre-populate the image with our model weights.
@@ -81,7 +81,7 @@ def download_model():

# ## The model class
#
# The inference function is best represented with Modal's [class syntax](/docs/guide/lifecycle-functions).
# The inference function is best represented with Modal's [class syntax](https://modal.com/docs/guide/lifecycle-functions).
# The class syntax is a special representation for a Modal function which splits logic into two parts:
# 1. the `@enter()` function, which runs once per container when it starts up, and
# 2. the `@method()` function, which runs per inference request.
@@ -155,7 +155,7 @@ async def generate_stream(self, question: str):


# ## Run the model
# We define a [`local_entrypoint`](/docs/guide/apps#entrypoints-for-ephemeral-apps) to invoke
# We define a [`local_entrypoint`](https://modal.com/docs/guide/apps#entrypoints-for-ephemeral-apps) to invoke
# our remote function. You can run this script locally with `modal run tgi_mixtral.py`.
@stub.local_entrypoint()
def main():
