From 37019594ca6e488be4f9460e19fd42e87f65fc42 Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Fri, 1 Mar 2024 18:51:17 -0800 Subject: [PATCH] Improvements to LLM examples (#616) * typo * improved LLM serving examples -- faster, updated deps, text tweaks --- .../llm-serving/falcon_bitsandbytes.py | 16 ++--- 06_gpu_and_ml/llm-serving/falcon_gptq.py | 20 +++--- 06_gpu_and_ml/llm-serving/openllama.py | 19 +++-- .../llm-serving/text_generation_inference.py | 27 +++---- 06_gpu_and_ml/llm-serving/tgi_mixtral.py | 10 +-- 06_gpu_and_ml/llm-serving/vllm_gemma.py | 41 ++++++++--- 06_gpu_and_ml/llm-serving/vllm_inference.py | 70 ++++++++++++++----- 06_gpu_and_ml/llm-serving/vllm_mixtral.py | 52 +++++++++----- 8 files changed, 165 insertions(+), 90 deletions(-) diff --git a/06_gpu_and_ml/llm-serving/falcon_bitsandbytes.py b/06_gpu_and_ml/llm-serving/falcon_bitsandbytes.py index c2c7826c3..b281ca503 100644 --- a/06_gpu_and_ml/llm-serving/falcon_bitsandbytes.py +++ b/06_gpu_and_ml/llm-serving/falcon_bitsandbytes.py @@ -4,14 +4,14 @@ # # Run Falcon-40B with bitsandbytes # # In this example, we download the full-precision weights of the Falcon-40B LLM but load it in 4-bit using -# Tim Dettmer's [`bitsandbytes`](https://github.com/TimDettmers/bitsandbytes) library. This enables it to fit +# Tim Dettmers' [`bitsandbytes`](https://github.com/TimDettmers/bitsandbytes) library. This enables it to fit # into a single GPU (A100 40GB). # # Due to the current limitations of the library, the inference speed is a little over 2 tokens/second and due # to the sheer size of the model, the cold start time on Modal is around 2 minutes. # # For faster cold start at the expense of inference speed, check out -# [Running Falcon-40B with AutoGPTQ](/docs/examples/falcon_gptq). +# [Running Falcon-40B with AutoGPTQ](https://modal.com/docs/examples/falcon_gptq). # # ## Setup # @@ -43,7 +43,7 @@ def download_falcon_40b(): "bitsandbytes-cuda117==0.26.0.post2", "peft==0.6.2", "transformers==4.31.0", - "accelerate==0.26.2", + "accelerate==0.26.1", "hf-transfer==0.1.5", "torch==2.0.0", "torchvision==0.15.1", @@ -62,15 +62,15 @@ def download_falcon_40b(): # ## The model class # # Next, we write the model code. We want Modal to load the model into memory just once every time a container starts up, -# so we use [class syntax](/docs/guide/lifecycle-functions) and the `@enter` decorator. +# so we use [class syntax](https://modal.com/docs/guide/lifecycle-functions) and the `@enter` decorator. # -# Within the [@stub.cls](/docs/reference/modal.Stub#cls) decorator, we use the [gpu parameter](/docs/guide/gpu) -# to specify that we want to run our function on an [A100 GPU](/pricing). We also allow each call 10 mintues to complete, +# Within the [@stub.cls](https://modal.com/docs/reference/modal.Stub#cls) decorator, we use the [gpu parameter](/docs/guide/gpu) +# to specify that we want to run our function on an [A100 GPU](https://modal.com/docs/guide/gpu). We also allow each call 10 mintues to complete, # and request the runner to stay live for 5 minutes after its last request. # # We load the model in 4-bit using the `bitsandbytes` library. # -# The rest is just using the [pipeline()](https://huggingface.co/docs/transformers/en/main_classes/pipelines) +# The rest is just using the [`pipeline`](https://huggingface.co/docs/transformers/en/main_classes/pipelines) # abstraction from the `transformers` library. Refer to the documentation for more parameters and tuning. 
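For readers who want to see the pattern in isolation: the sketch below shows roughly what 4-bit loading with `bitsandbytes` plus a `transformers` `pipeline` looks like, independent of the Modal scaffolding. It is illustrative only; the model name is a small stand-in rather than the Falcon-40B checkpoint used in this example, and exact keyword arguments can vary across `transformers` versions.

```python
# Minimal sketch: 4-bit loading via bitsandbytes plus a text-generation pipeline.
# The model name is a small placeholder, not the Falcon-40B model from this example.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_name = "facebook/opt-350m"  # illustrative stand-in; some models also need trust_remote_code=True
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,  # quantize weights to 4 bits on load (requires bitsandbytes)
    device_map="auto",  # place weights on the available GPU(s)
    torch_dtype=torch.bfloat16,
)
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
print(generator("Falcon is a family of language models that", max_new_tokens=32)[0]["generated_text"])
```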
 @stub.cls(
     gpu=gpu.A100(),  # Use A100s
@@ -157,7 +157,7 @@ def generate(self, prompt: str):
 
 
 # ## Run the model
-# We define a [`local_entrypoint`](/docs/guide/apps#entrypoints-for-ephemeral-apps) to call our remote function
+# We define a [`local_entrypoint`](https://modal.com/docs/guide/apps#entrypoints-for-ephemeral-apps) to call our remote function
 # sequentially for a list of inputs. You can run this locally with `modal run -q falcon_bitsandbytes.py`. The `-q` flag
 # enables streaming to work in the terminal output.
 prompt_template = (
diff --git a/06_gpu_and_ml/llm-serving/falcon_gptq.py b/06_gpu_and_ml/llm-serving/falcon_gptq.py
index e61cdc378..88481500e 100644
--- a/06_gpu_and_ml/llm-serving/falcon_gptq.py
+++ b/06_gpu_and_ml/llm-serving/falcon_gptq.py
@@ -8,8 +8,8 @@
 # cold start time on Modal is around 25s.
 #
 # For faster inference at the expense of a slower cold start, check out
-# [Running Falcon-40B with `bitsandbytes` quantization](/docs/examples/falcon_bitsandbytes). You can also
-# run a smaller, 7-billion-parameter model with the [OpenLLaMa example](/docs/examples/openllama).
+# [Running Falcon-40B with `bitsandbytes` quantization](https://modal.com/docs/examples/falcon_bitsandbytes). You can also
+# run a smaller model via the [Gemma 7B example](https://modal.com/docs/examples/vllm_gemma).
 #
 # ## Setup
 #
@@ -33,8 +33,8 @@ def download_model():
 
 
 # Now, we define our image. We'll use the `debian-slim` base image, and install the dependencies we need
-# using [`pip_install`](/docs/reference/modal.Image#pip_install). At the end, we'll use
-# [`run_function`](/docs/guide/custom-container#run-a-modal-function-during-your-build-with-run_function-beta) to run the
+# using [`pip_install`](https://modal.com/docs/reference/modal.Image#pip_install). At the end, we'll use
+# [`run_function`](https://modal.com/docs/guide/custom-container#run-a-modal-function-during-your-build-with-run_function-beta) to run the
 # function defined above as part of the image build.
 
 image = (
@@ -52,21 +52,21 @@ def download_model():
     .run_function(download_model)
 )
 
-# Let's instantiate and name our [Stub](/docs/guide/apps).
+# Let's instantiate and name our [`Stub`](https://modal.com/docs/guide/apps).
 
 stub = Stub(name="example-falcon-gptq", image=image)
 
 
 # ## The model class
 #
 # Next, we write the model code. We want Modal to load the model into memory just once every time a container starts up,
-# so we use [class syntax](/docs/guide/lifecycle-functions) and the `@enter` decorator.
+# so we use [class syntax](https://modal.com/docs/guide/lifecycle-functions) and the `@enter` decorator.
 #
-# Within the [@stub.cls](/docs/reference/modal.Stub#cls) decorator, we use the [gpu parameter](/docs/guide/gpu)
-# to specify that we want to run our function on an [A100 GPU](/pricing). We also allow each call 10 mintues to complete,
+# Within the [`@stub.cls`](https://modal.com/docs/reference/modal.Stub#cls) decorator, we use the [`gpu` parameter](https://modal.com/docs/guide/gpu)
+# to specify that we want to run our function on an [A100 GPU](https://modal.com/docs/guide/gpu#a100-gpus). We also allow each call 10 minutes to complete,
 # and request the runner to stay live for 5 minutes after its last request.
 #
 # The rest is just using the `transformers` library to run the model.
Refer to the -# [documentation](https://huggingface.co/docs/transformers/v4.29.1/en/main_classes/text_generation#transformers.GenerationMixin.generate) +# [documentation](https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/text_generation#transformers.GenerationMixin.generate) # for more parameters and tuning. # # Note that we need to create a separate thread to call the `generate` function because we need to @@ -121,7 +121,7 @@ def generate(self, prompt: str): # ## Run the model -# We define a [`local_entrypoint`](/docs/guide/apps#entrypoints-for-ephemeral-apps) to call our remote function +# We define a [`local_entrypoint`](https://modal.com/docs/guide/apps#entrypoints-for-ephemeral-apps) to call our remote function # sequentially for a list of inputs. You can run this locally with `modal run -q falcon_gptq.py`. The `-q` flag # enables streaming to work in the terminal output. prompt_template = ( diff --git a/06_gpu_and_ml/llm-serving/openllama.py b/06_gpu_and_ml/llm-serving/openllama.py index d6f124ca3..a21891737 100644 --- a/06_gpu_and_ml/llm-serving/openllama.py +++ b/06_gpu_and_ml/llm-serving/openllama.py @@ -16,7 +16,7 @@ # inside our container image. # # To do this, we have to define a function that loads both the model and tokenizer using -# [from_pretrained](https://huggingface.co/docs/transformers/main_classes/model#transformers.PreTrainedModel.from_pretrained). +# [`from_pretrained`](https://huggingface.co/docs/transformers/main_classes/model#transformers.PreTrainedModel.from_pretrained). # Since HuggingFace stores this model into a local cache, when Modal snapshots the image after running this function, # the model weights will be saved and available for use when the container starts up next time. @@ -37,7 +37,7 @@ def download_models(): # function defined above as part of the image build. image = ( - # Python 3.11+ not yet supported for torch.compile + # Python 3.11+ not yet supported for `torch.compile` Image.debian_slim(python_version="3.10") .pip_install( "accelerate~=0.18.0", @@ -48,7 +48,7 @@ def download_models(): .run_function(download_models) ) -# Let's instantiate and name our [Stub](/docs/guide/apps). +# Let's instantiate and name our [`Stub`](https://modal.com/docs/guide/apps). stub = Stub(name="example-open-llama", image=image) @@ -61,7 +61,7 @@ def download_models(): # Within the [@stub.cls](/docs/reference/modal.Stub#cls) decorator, we use the [gpu parameter](/docs/guide/gpu) # to specify that we want to run our function on an [A100 GPU with 20 GB of VRAM](/pricing). # -# The rest is just using the [generate](https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate) function +# The rest is just using the [`generate`](https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate) function # from the `transformers` library. Refer to the documentation for more parameters and tuning. @@ -115,7 +115,7 @@ def generate( # ## Run the model -# Finally, we define a [`local_entrypoint`](/docs/guide/apps#entrypoints-for-ephemeral-apps) to call our remote function +# Finally, we define a [`local_entrypoint`](https://modal.com/docs/guide/apps#entrypoints-for-ephemeral-apps) to call our remote function # sequentially for a list of inputs. You can run this locally with `modal run openllama.py`. @@ -139,9 +139,8 @@ def main(): # ## Next steps # The above is a simple example of how to run a basic model. 
Note that OpenLLaMa has not been fine-tuned on an instruction-following dataset,
 # so the results aren't amazing out of the box. Refer to [DoppelBot, our Slack fine-tuning demo](https://github.com/modal-labs/doppel-bot) for how
-# you could use OpenLLaMa to perform a more useful downstream task.
+# you could use fine-tuning to make an LLM more useful for downstream tasks.
 #
-# If you're looking for useful responses out-of-the-box like ChatGPT, you could try Vicuna-13B, which is larger and has been instruction-tuned.
-# However, note that this model is not permissively licensed due to the dataset it was trained on. Refer to our [LLM voice chat](/docs/examples/llm-voice-chat)
-# post for how to build a complete voice chat app using Vicuna, or go straight to the [file](https://github.com/modal-labs/quillman/blob/main/src/llm_vicuna.py)
-# if you want to run it by itself.
+# If you're looking for responses more in the style of ChatGPT, you could try [Gemma 7B](https://modal.com/docs/examples/vllm_gemma), which has been trained to follow instructions.
+# However, note that Gemma is not permissively licensed: it is distributed under Google's own terms of use. Refer to our [LLM voice chat](https://modal.com/docs/examples/llm-voice-chat)
+# post for how to build a complete voice chat app using LLMs.
diff --git a/06_gpu_and_ml/llm-serving/text_generation_inference.py b/06_gpu_and_ml/llm-serving/text_generation_inference.py
index 130eb245c..e650ce557 100644
--- a/06_gpu_and_ml/llm-serving/text_generation_inference.py
+++ b/06_gpu_and_ml/llm-serving/text_generation_inference.py
@@ -3,10 +3,10 @@
 # In this example, we show how to run an optimized inference server using [Text Generation Inference (TGI)](https://github.com/huggingface/text-generation-inference)
 # with performance advantages over standard text generation pipelines including:
 # - continuous batching, so multiple generations can take place at the same time on a single container
-# - PagedAttention, an optimization that increases throughput.
+# - PagedAttention, which applies memory paging to the attention mechanism's key-value cache, increasing throughput
 #
 # This example deployment, [accessible here](https://modal-labs--tgi-app.modal.run), can serve LLaMA 2 70B with
-# 70 second cold starts, up to 200 tokens/s of throughput and per-token latency of 55ms.
+# 70 second cold starts, up to 200 tokens/s of throughput, and a per-token latency of 55ms.
 
 # ## Setup
 #
@@ -24,7 +24,6 @@
 #
 # Any model supported by TGI can be chosen here.
 
-GPU_CONFIG = gpu.A100(memory=80, count=2)
 MODEL_ID = "meta-llama/Llama-2-70b-chat-hf"
 REVISION = "e1ce257bd76895e0864f3b4d6c7ed3c4cdec93e2"
 # Add `["--quantize", "gptq"]` for TheBloke GPTQ models.
@@ -68,16 +67,17 @@ def download_model():
 
 # ### Image definition
-# We’ll start from a Dockerhub image recommended by TGI, and override the default `ENTRYPOINT` for
+# We’ll start from a Docker Hub image recommended by TGI, and override the default `ENTRYPOINT` for
 # Modal to run its own which enables seamless serverless deployments.
 #
 # Next we run the download step to pre-populate the image with our model weights.
 #
-# For this step to work on a gated model such as LLaMA 2, the HF_TOKEN environment
-# variable must be set ([reference](https://github.com/huggingface/text-generation-inference#using-a-private-or-gated-model)).
+# For this step to work on a [gated model](https://github.com/huggingface/text-generation-inference#using-a-private-or-gated-model)
+# such as LLaMA 2, the `HF_TOKEN` environment variable must be set.
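As a reminder of how that environment variable gets populated on Modal: a secret's key/value pairs are injected into any function that references it. The sketch below is a minimal illustration; it assumes a secret named `huggingface-secret` that defines `HF_TOKEN` (the examples variously read `HF_TOKEN` or `HUGGING_FACE_HUB_TOKEN`, depending on the library).

```python
# Minimal sketch: a Modal secret's keys appear as environment variables at runtime.
import os

from modal import Secret, Stub

stub = Stub("secret-check")


@stub.function(secrets=[Secret.from_name("huggingface-secret")])
def check_token():
    # Assumes the secret was created with a key named HF_TOKEN.
    print("HF_TOKEN present:", "HF_TOKEN" in os.environ)
```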
# -# After [creating a HuggingFace access token](https://huggingface.co/settings/tokens), -# head to the [secrets page](https://modal.com/secrets) to create a Modal secret. +# After [creating a HuggingFace access token](https://huggingface.co/settings/tokens) +# and accepting the [LLaMA 2 license](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf), +# head to the [secrets page](https://modal.com/secrets) to share it with Modal # # Finally, we install the `text-generation` client to interface with TGI's Rust webserver over `localhost`. @@ -97,7 +97,7 @@ def download_model(): # ## The model class # -# The inference function is best represented with Modal's [class syntax](/docs/guide/lifecycle-functions). +# The inference function is best represented with Modal's [class syntax](https://modal.com/docs/guide/lifecycle-functions). # The class syntax is a special representation for a Modal function which splits logic into two parts: # 1. the `@enter()` function, which runs once per container when it starts up, and # 2. the `@method()` function, which runs per inference request. @@ -108,11 +108,14 @@ def download_model(): # container ready. # # Here, we also -# - specify the secret so the `HUGGING_FACE_HUB_TOKEN` environment variable is set +# - specify the secret so the `HUGGING_FACE_HUB_TOKEN` environment variable can be set # - specify how many A100s we need per container # - specify that each container is allowed to handle up to 10 inputs (i.e. requests) simultaneously # - keep idle containers for 10 minutes before spinning down -# - lift the timeout of each request. +# - increase the timeout limit + + +GPU_CONFIG = gpu.A100(memory=80, count=2) # 2 A100s for LLaMA 2 70B @stub.cls( @@ -188,7 +191,7 @@ async def generate_stream(self, question: str): # ## Run the model -# We define a [`local_entrypoint`](/docs/guide/apps#entrypoints-for-ephemeral-apps) to invoke +# We define a [`local_entrypoint`](https://modal.com/docs/guide/apps#entrypoints-for-ephemeral-apps) to invoke # our remote function. You can run this script locally with `modal run text_generation_inference.py`. @stub.local_entrypoint() def main(): diff --git a/06_gpu_and_ml/llm-serving/tgi_mixtral.py b/06_gpu_and_ml/llm-serving/tgi_mixtral.py index 4c4067c9c..bfc9ff757 100644 --- a/06_gpu_and_ml/llm-serving/tgi_mixtral.py +++ b/06_gpu_and_ml/llm-serving/tgi_mixtral.py @@ -3,7 +3,7 @@ # In this example, we show how to run an optimized inference server using [Text Generation Inference (TGI)](https://github.com/huggingface/text-generation-inference) # with performance advantages over standard text generation pipelines including: # - continuous batching, so multiple generations can take place at the same time on a single container -# - PagedAttention, an optimization that increases throughput. +# - PagedAttention, which applies memory paging to the attention mechanism's key-value cache, increasing throughput # # This example deployment, [accessible here](https://modal-labs--tgi-mixtral.modal.run), can serve Mixtral 8x7B on two 80GB A100s, with # up to 500 tokens/s of throughput and per-token latency of 78ms. @@ -38,7 +38,7 @@ # ## Define a container image # -# We want to create a Modal image which has the Huggingface model cache pre-populated. +# We want to create a Modal image which has the Hugging Face model cache pre-populated. # The benefit of this is that the container no longer has to re-download the model from Huggingface - # instead, it will take advantage of Modal's internal filesystem for faster cold starts. 
# The 95GB model can be loaded in as little as 70 seconds. @@ -62,7 +62,7 @@ def download_model(): # ### Image definition -# We’ll start from a Dockerhub image recommended by TGI, and override the default `ENTRYPOINT` for +# We’ll start from a Docker Hub image recommended by TGI, and override the default `ENTRYPOINT` for # Modal to run its own which enables seamless serverless deployments. # # Next we run the download step to pre-populate the image with our model weights. @@ -81,7 +81,7 @@ def download_model(): # ## The model class # -# The inference function is best represented with Modal's [class syntax](/docs/guide/lifecycle-functions). +# The inference function is best represented with Modal's [class syntax](https://modal.com/docs/guide/lifecycle-functions). # The class syntax is a special representation for a Modal function which splits logic into two parts: # 1. the `@enter()` function, which runs once per container when it starts up, and # 2. the `@method()` function, which runs per inference request. @@ -155,7 +155,7 @@ async def generate_stream(self, question: str): # ## Run the model -# We define a [`local_entrypoint`](/docs/guide/apps#entrypoints-for-ephemeral-apps) to invoke +# We define a [`local_entrypoint`](https://modal.com/docs/guide/apps#entrypoints-for-ephemeral-apps) to invoke # our remote function. You can run this script locally with `modal run text_generation_inference.py`. @stub.local_entrypoint() def main(): diff --git a/06_gpu_and_ml/llm-serving/vllm_gemma.py b/06_gpu_and_ml/llm-serving/vllm_gemma.py index f08a5ea6d..96bf0819c 100644 --- a/06_gpu_and_ml/llm-serving/vllm_gemma.py +++ b/06_gpu_and_ml/llm-serving/vllm_gemma.py @@ -1,7 +1,7 @@ # # Fast inference with vLLM (Gemma 7B) # # In this example, we show how to run basic LLM inference, using [`vLLM`](https://github.com/vllm-project/vllm) -# to take advantage of [PagedAttention](https://arxiv.org/abs/2309.06180), which speeds up sequential inferences with optimized key-value caching. +# to take advantage of [PagedAttention](https://arxiv.org/abs/2309.06180), which speeds up inference on longer sequences with optimized key-value caching. # You can read more about PagedAttention [here](https://charlesfrye.github.io/programming/2023/11/10/llms-systems.html). # # We'll run the [Gemma 7B Instruct](https://huggingface.co/google/gemma-7b-it) large language model. @@ -9,7 +9,7 @@ # # The "7B" in the name refers to the number of parameters (floating point numbers used to control inference) # in the model. Applying those 7,000,000,000 numbers onto an input is a lot of work, -# so we'll use a GPU to speed up the process -- specifically, a top-of-the-line [NVIDIA H100](/blog/introducing-h100). +# so we'll use a GPU to speed up the process -- specifically, a top-of-the-line [NVIDIA H100](https://modal.com/blog/introducing-h100). # # "Instruct" means that this version of Gemma is not simply a statistical model of language, # but has been fine-tuned to follow instructions -- like ChatGPT or Claude, @@ -22,7 +22,8 @@ # # To run # [any of the other supported models](https://vllm.readthedocs.io/en/latest/models/supported_models.html), -# just change the model name. You may also need to enable `trust_remote_code` for some models (see comment below). +# just change the model name. You may also need to change engine configuration, like `trust_remote_code`, +# or GPU configuration, in order to run some models. 
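Concretely, switching models in the vLLM examples usually means touching only a couple of constants and, for some architectures, the engine arguments. The sketch below is hypothetical; the model name is not the one served in this example, and whether you need `trust_remote_code` or a bigger GPU depends entirely on the model you choose.

```python
# Hypothetical reconfiguration for a different model, not the one used in this example.
from modal import gpu
from vllm import LLM

MODEL_DIR = "/model"
BASE_MODEL = "mosaicml/mpt-7b-instruct"  # illustrative: a family that needs trust_remote_code
GPU_CONFIG = gpu.A100(memory=80, count=1)  # larger models may need more memory or more GPUs

llm = LLM(
    MODEL_DIR,
    trust_remote_code=True,  # only required by some model families
    tensor_parallel_size=GPU_CONFIG.count,  # shard across GPUs when count > 1
)
```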
 #
 # ## Setup
 #
 # First we import the components we need from `modal`.
@@ -30,11 +31,10 @@
 import os
 
-from modal import Image, Secret, Stub, enter, method
+from modal import Image, Secret, Stub, enter, exit, gpu, method
 
 MODEL_DIR = "/model"
 BASE_MODEL = "google/gemma-7b-it"
-GPU_TYPE = "H100"
 
 
 # ## Define a container image
@@ -103,20 +103,33 @@ def download_model_to_folder():
 # on the GPU for each subsequent invocation of the function.
 #
 # The `vLLM` library allows the code to remain quite clean!
-@stub.cls(gpu=GPU_TYPE, secrets=[Secret.from_name("huggingface-secret")])
+
+GPU_CONFIG = gpu.H100(count=1)
+
+
+@stub.cls(gpu=GPU_CONFIG, secrets=[Secret.from_name("huggingface-secret")])
 class Model:
     @enter()
     def load(self):
         from vllm import LLM
 
+        if GPU_CONFIG.count > 1:
+            # Patch issue from https://github.com/vllm-project/vllm/issues/1116
+            import ray
+
+            ray.shutdown()
+            ray.init(num_gpus=GPU_CONFIG.count)
+
+        self.template = (
+            "<start_of_turn>user\n{user}<end_of_turn>\n<start_of_turn>model\n"
+        )
+
         # Load the model. Tip: Some models, like MPT, may require `trust_remote_code=true`.
         self.llm = LLM(
             MODEL_DIR,
             enforce_eager=True,  # skip graph capturing for faster cold starts
+            tensor_parallel_size=GPU_CONFIG.count,
         )
-        self.template = """<start_of_turn>user
-{user}<end_of_turn>
-<start_of_turn>model"""
 
     @method()
     def generate(self, user_questions):
@@ -153,10 +166,18 @@ def generate(self, user_questions):
                 "\n\n",
                 sep=COLOR["ENDC"],
             )
+            time.sleep(0.01)
         print(
-            f"{COLOR['HEADER']}{COLOR['GREEN']}Generated {num_tokens} tokens from {BASE_MODEL} in {duration_s:.1f} seconds, throughput = {num_tokens / duration_s:.0f} tokens/second on GPU={GPU_TYPE}.{COLOR['ENDC']}"
+            f"{COLOR['HEADER']}{COLOR['GREEN']}Generated {num_tokens} tokens from {BASE_MODEL} in {duration_s:.1f} seconds, throughput = {num_tokens / duration_s:.0f} tokens/second on {GPU_CONFIG}.{COLOR['ENDC']}"
         )
 
+    @exit()
+    def stop_engine(self):
+        if GPU_CONFIG.count > 1:
+            import ray
+
+            ray.shutdown()
+
 
 # ## Run the model
 # We define a [`local_entrypoint`](/docs/guide/apps#entrypoints-for-ephemeral-apps) to call our remote function
diff --git a/06_gpu_and_ml/llm-serving/vllm_inference.py b/06_gpu_and_ml/llm-serving/vllm_inference.py
index 7a3a4aab7..fc4546d5b 100644
--- a/06_gpu_and_ml/llm-serving/vllm_inference.py
+++ b/06_gpu_and_ml/llm-serving/vllm_inference.py
@@ -6,9 +6,9 @@
 # `vLLM` also supports a use case as a FastAPI server which we will explore in a future guide. This example
 # walks through setting up an environment that works with `vLLM ` for basic inference.
 #
-# We are running the [Mistral 7B Instruct](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) model here, which is an instruct fine-tuned version of Mistral's 7B model best fit for conversation.
-# You can expect 20 second cold starts and well over 100 tokens/second. The larger the batch of prompts, the higher the throughput.
-# For example, with the 60 prompts below, we can produce 19k tokens in 15 seconds, which is around 1.25k tokens/second.
+# We are running the [Mistral 7B Instruct](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) model here, which is a fine-tuned version of Mistral's 7B model trained to follow instructions.
+# You can expect 20 second cold starts and well over 100 tokens/second. The larger the batch of prompts, the higher the throughput.
+# For example, with the 64 prompts below, we can produce 15k tokens in less than 7 seconds, a throughput of over 2k tokens/second.
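Those throughput figures come from batched offline generation: vLLM accepts a whole list of prompts in a single call and schedules them together with continuous batching. A bare-bones sketch of that call shape (paths and parameters are illustrative, and the full Modal version follows below):

```python
# Minimal sketch of batched generation with vLLM; paths and parameters are illustrative.
from vllm import LLM, SamplingParams

llm = LLM("/model")  # a directory of downloaded weights, as in this example
sampling_params = SamplingParams(temperature=0.75, max_tokens=128)

prompts = [f"Question {i}: what is {i} times {i}?" for i in range(64)]
outputs = llm.generate(prompts, sampling_params)  # one call, many prompts

num_tokens = sum(len(o.outputs[0].token_ids) for o in outputs)
print(f"generated {num_tokens} tokens across {len(prompts)} prompts")
```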
# # To run # [any of the other supported models](https://vllm.readthedocs.io/en/latest/models/supported_models.html), @@ -20,7 +20,7 @@ import os -from modal import Image, Secret, Stub, enter, method +from modal import Image, Secret, Stub, enter, gpu, method MODEL_DIR = "/model" BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.1" @@ -33,10 +33,6 @@ # advantage of Modal's internal filesystem for faster cold starts. # # ### Download the weights -# Make sure you have created a [HuggingFace access token](https://huggingface.co/settings/tokens). -# To access the token in a Modal function, we can create a secret on the [secrets page](https://modal.com/secrets). -# Now the token will be available via the environment variable named `HF_TOKEN`. Functions that inject this secret will have access to the environment variable. -# # We can download the model to a particular directory using the HuggingFace utility function `snapshot_download`. # # Tip: avoid using global variables in this function. Changes to code outside this function will not be detected and the download step will not re-run. @@ -49,18 +45,18 @@ def download_model_to_folder(): snapshot_download( BASE_MODEL, local_dir=MODEL_DIR, - token=os.environ["HF_TOKEN"], + ignore_patterns=["*.pt", "*.bin"], # Using safetensors ) move_cache() # ### Image definition -# We’ll start from a recommended Dockerhub image and install `vLLM`. -# Then we’ll use run_function to run the function defined above to ensure the weights of +# We’ll start from a recommended Docker Hub image and install `vLLM`. +# Then we’ll use `run_function` to run the function defined above to ensure the weights of # the model are saved within the container image. image = ( Image.from_registry( - "nvidia/cuda:12.1.0-base-ubuntu22.04", add_python="3.10" + "nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10" ) .pip_install( "vllm==0.2.5", @@ -82,19 +78,31 @@ def download_model_to_folder(): # ## The model class # -# The inference function is best represented with Modal's [class syntax](/docs/guide/lifecycle-functions) and the `@enter` decorator. +# The inference function is best represented with Modal's [class syntax](https://modal.com/docs/guide/lifecycle-functions) and the `@enter` decorator. # This enables us to load the model into memory just once every time a container starts up, and keep it cached # on the GPU for each subsequent invocation of the function. # # The `vLLM` library allows the code to remain quite clean. -@stub.cls(gpu="A100", secrets=[Secret.from_name("huggingface-secret")]) + +# try out an H100 if you've got a large model or big batches! +GPU_CONFIG = gpu.A100(count=1) # 40GB A100 by default + + +@stub.cls(gpu=GPU_CONFIG, secrets=[Secret.from_name("huggingface-secret")]) class Model: @enter() def load_model(self): from vllm import LLM + if GPU_CONFIG.count > 1: + # Patch issue from https://github.com/vllm-project/vllm/issues/1116 + import ray + + ray.shutdown() + ray.init(num_gpus=GPU_CONFIG.count) + # Load the model. Tip: MPT models may require `trust_remote_code=true`. 
-        self.llm = LLM(MODEL_DIR)
+        self.llm = LLM(MODEL_DIR, tensor_parallel_size=GPU_CONFIG.count)
         self.template = """[INST] <<SYS>>
 {system}
 <</SYS>>
@@ -103,6 +111,8 @@ def load_model(self):
 
     @method()
     def generate(self, user_questions):
+        import time
+
         from vllm import SamplingParams
 
         prompts = [
@@ -112,19 +122,38 @@
         sampling_params = SamplingParams(
             temperature=0.75,
             top_p=1,
-            max_tokens=800,
+            max_tokens=256,
             presence_penalty=1.15,
         )
+        start = time.monotonic_ns()
         result = self.llm.generate(prompts, sampling_params)
+        duration_s = (time.monotonic_ns() - start) / 1e9
         num_tokens = 0
+
+        COLOR = {
+            "HEADER": "\033[95m",
+            "BLUE": "\033[94m",
+            "GREEN": "\033[92m",
+            "RED": "\033[91m",
+            "ENDC": "\033[0m",
+        }
+
         for output in result:
             num_tokens += len(output.outputs[0].token_ids)
-            print(output.prompt, output.outputs[0].text, "\n\n", sep="")
-        print(f"Generated {num_tokens} tokens")
+            print(
+                f"{COLOR['HEADER']}{COLOR['GREEN']}{output.prompt}",
+                f"\n{COLOR['BLUE']}{output.outputs[0].text}",
+                "\n\n",
+                sep=COLOR["ENDC"],
+            )
+            time.sleep(0.01)
+        print(
+            f"{COLOR['HEADER']}{COLOR['GREEN']}Generated {num_tokens} tokens from {BASE_MODEL} in {duration_s:.1f} seconds, throughput = {num_tokens / duration_s:.0f} tokens/second on {GPU_CONFIG}.{COLOR['ENDC']}"
+        )
 
 
 # ## Run the model
-# We define a [`local_entrypoint`](/docs/guide/apps#entrypoints-for-ephemeral-apps) to call our remote function
+# We define a [`local_entrypoint`](https://modal.com/docs/guide/apps#entrypoints-for-ephemeral-apps) to call our remote function
 # sequentially for a list of inputs. You can run this locally with `modal run vllm_inference.py`.
 @stub.local_entrypoint()
 def main():
@@ -177,6 +206,7 @@ def main():
         "In a dystopian future where water is the most valuable commodity, how would society function?",
         "If a scientist discovers immortality, how could this impact society, economy, and the environment?",
         "What could be the potential implications of contact with an advanced alien civilization?",
+        "Describe how you would mediate a conflict between two roommates about doing the dishes using techniques of non-violent communication.",
         # Math
         "What is the product of 9 and 8?",
         "If a train travels 120 kilometers in 2 hours, what is its average speed?",
@@ -195,5 +225,9 @@ def main():
         "What are 'zombie stars' in the context of astronomy?",
         "Who were the 'Dog-Headed Saint' and the 'Lion-Faced Saint' in medieval Christian traditions?",
         "What is the story of the 'Globsters', unidentified organic masses washed up on the shores?",
+        # Multilingual
+        "战国时期最重要的人物是谁?",
+        "Tuende hatua kwa hatua. Hesabu jumla ya mfululizo wa kihesabu wenye neno la kwanza 2, neno la mwisho 42, na jumla ya maneno 21.",
+        "Kannst du die wichtigsten Eigenschaften und Funktionen des NMDA-Rezeptors beschreiben?",
     ]
     model.generate.remote(questions)
diff --git a/06_gpu_and_ml/llm-serving/vllm_mixtral.py b/06_gpu_and_ml/llm-serving/vllm_mixtral.py
index 7d78efed8..c8dc3a5c9 100644
--- a/06_gpu_and_ml/llm-serving/vllm_mixtral.py
+++ b/06_gpu_and_ml/llm-serving/vllm_mixtral.py
@@ -3,14 +3,11 @@
 # In this example, we show how to run basic inference, using [`vLLM`](https://github.com/vllm-project/vllm)
 # to take advantage of PagedAttention, which speeds up sequential inferences with optimized key-value caching.
 #
-# `vLLM` also supports a use case as a FastAPI server which we will explore in a future guide. This example
-# walks through setting up an environment that works with `vLLM ` for basic inference.
-# -# We are running the [Mixtral 8x7B Instruct](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) model here, which is a mixture-of-experts model finetuned for conversation. -# You can expect 3 minute cold starts. -# For a single request, the throughput is about 11 tokens/second, but there are upcoming `vLLM` optimizations to improve this. -# The larger the batch of prompts, the higher the throughput (up to about 300 tokens/second). -# For example, with the 60 prompts below, we can produce 30k tokens in 100 seconds. +# We are running the [Mixtral 8x7B Instruct](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) model here, +# which is a mixture-of-experts model finetuned for conversation. +# You can expect ~3 minute cold starts. +# For a single request, the throughput is over 50 tokens/second. +# The larger the batch of prompts, the higher the throughput (up to hundreds of tokens per second). # # ## Setup # @@ -19,7 +16,7 @@ import os import time -from modal import Image, Stub, enter, gpu, method +from modal import Image, Stub, enter, exit, gpu, method MODEL_DIR = "/model" BASE_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1" @@ -36,6 +33,8 @@ # # We can download the model to a particular directory using the HuggingFace utility function `snapshot_download`. # +# Mixtral is beefy, at nearly 100 GB in `safetensors` format, so this can take some time -- at least a few minutes. +# # Tip: avoid using global variables in this function. Changes to code outside this function will not be detected and the download step will not re-run. def download_model_to_folder(): from huggingface_hub import snapshot_download @@ -46,7 +45,7 @@ def download_model_to_folder(): snapshot_download( BASE_MODEL, local_dir=MODEL_DIR, - ignore_patterns="*.pt", # Using safetensors + ignore_patterns=["*.pt"], # Using safetensors ) move_cache() @@ -58,10 +57,10 @@ def download_model_to_folder(): vllm_image = ( Image.from_registry( - "nvidia/cuda:12.1.0-base-ubuntu22.04", add_python="3.10" + "nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10" ) .pip_install( - "vllm==0.2.6", + "vllm==0.3.2", "huggingface_hub==0.19.4", "hf-transfer==0.1.4", "torch==2.1.2", @@ -90,9 +89,14 @@ def download_model_to_folder(): class Model: @enter() def start_engine(self): + import time + from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine + print("🥶 cold starting inference") + start = time.monotonic_ns() + if GPU_CONFIG.count > 1: # Patch issue from https://github.com/vllm-project/vllm/issues/1116 import ray @@ -104,10 +108,16 @@ def start_engine(self): model=MODEL_DIR, tensor_parallel_size=GPU_CONFIG.count, gpu_memory_utilization=0.90, + enforce_eager=False, # capture the graph for faster inference, but slower cold starts + disable_log_stats=True, # disable logging so we can stream tokens + disable_log_requests=True, ) + self.template = " [INST] {user} [/INST] " + # this can take some time! 
self.engine = AsyncLLMEngine.from_engine_args(engine_args) - self.template = " [INST] {user} [/INST] " + duration_s = (time.monotonic_ns() - start) / 1e9 + print(f"🏎️ engine started in {duration_s:.0f}s") @method() async def completion_stream(self, user_question): @@ -116,11 +126,10 @@ async def completion_stream(self, user_question): sampling_params = SamplingParams( temperature=0.75, - max_tokens=1024, + max_tokens=128, repetition_penalty=1.1, ) - t0 = time.time() request_id = random_uuid() result_generator = self.engine.generate( self.template.format(user=user_question), @@ -128,6 +137,7 @@ async def completion_stream(self, user_question): request_id, ) index, num_tokens = 0, 0 + start = time.monotonic_ns() async for output in result_generator: if ( output.outputs[0].text @@ -139,8 +149,16 @@ async def completion_stream(self, user_question): num_tokens = len(output.outputs[0].token_ids) yield text_delta + duration_s = (time.monotonic_ns() - start) / 1e9 - print(f"Generated {num_tokens} tokens in {time.time() - t0:.2f}s") + yield f"\n\tGenerated {num_tokens} tokens from {BASE_MODEL} in {duration_s:.1f}s, throughput = {num_tokens / duration_s:.0f} tokens/second on {GPU_CONFIG}.\n" + + @exit() + def stop_engine(self): + if GPU_CONFIG.count > 1: + import ray + + ray.shutdown() # ## Run the model @@ -159,7 +177,7 @@ def main(): "Who was Emperor Norton I, and what was his significance in San Francisco's history?", ] for question in questions: - print("Sending new request:", question) + print("Sending new request:", question, "\n\n") for text in model.completion_stream.remote_gen(question): print(text, end="", flush=True)
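Once an app like this has been deployed with `modal deploy vllm_mixtral.py`, the same streaming method can also be called from other Python code. The sketch below is an assumption-laden illustration: the app name is a guess (check the name your `Stub` was deployed under), and it relies on `modal.Cls.lookup`, which may differ in newer client versions.

```python
import modal

# Hypothetical app name; use the name reported by `modal deploy` for your deployment.
Model = modal.Cls.lookup("example-vllm-mixtral", "Model")

for text in Model().completion_stream.remote_gen(
    "Explain mixture-of-experts language models in two sentences."
):
    print(text, end="", flush=True)
```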