diff --git a/06_gpu_and_ml/llm-serving/trtllm_llama.py b/06_gpu_and_ml/llm-serving/trtllm_llama.py
index e0ca2b772..add12a5bf 100644
--- a/06_gpu_and_ml/llm-serving/trtllm_llama.py
+++ b/06_gpu_and_ml/llm-serving/trtllm_llama.py
@@ -2,53 +2,53 @@
 # deploy: true
 # tags: ["use-case-lm-inference"]
 # ---
+
 # # Serverless TensorRT-LLM (LLaMA 3 8B)
-#
+
 # In this example, we demonstrate how to use the TensorRT-LLM framework to serve Meta's LLaMA 3 8B model
-# at a total throughput of roughly 4,500 output tokens per second on a single NVIDIA A100 40GB GPU.
-# At [Modal's on-demand rate](https://modal.com/pricing) of ~$4/hr, that's under $0.20 per million tokens --
+# at very high throughput.
+
+# We achieve a total throughput of over 25,000 output tokens per second on a single NVIDIA H100 GPU.
+# At [Modal's on-demand rate](https://modal.com/pricing) of ~$4.50/hr, that's under $0.05 per million tokens --
 # on auto-scaling infrastructure and served via a customizable API.
-#
-# Additional optimizations like speculative sampling and FP8 quantization can further improve throughput.
-# For more on the throughput levels that are possible with TensorRT-LLM for different combinations
-# of model, hardware, and workload, see the
-# [official benchmarks](https://github.com/NVIDIA/TensorRT-LLM/blob/71d8d4d3dc655671f32535d6d2b60cab87f36e87/docs/source/performance.md).
-#
+
+# Additional optimizations like speculative sampling can further improve throughput.
+
 # ## Overview
-#
+
 # This guide is intended to document two things:
 # the general process for building TensorRT-LLM on Modal
 # and a specific configuration for serving the LLaMA 3 8B model.
-#
+
 # ### Build process
-#
+
 # Any given TensorRT-LLM service requires a multi-stage build process,
 # starting from model weights and ending with a compiled engine.
 # Because that process touches many sharp-edged high-performance components
 # across the stack, it can easily go wrong in subtle and hard-to-debug ways
 # that are idiosyncratic to specific systems.
 # And debugging GPU workloads is expensive!
-#
+
 # This example builds an entire service from scratch, from downloading weight tensors
 # to responding to requests, and so serves as living, interactive documentation of a TensorRT-LLM
 # build process that works on Modal.
-#
+
 # ### Engine configuration
-#
+
 # TensorRT-LLM is the Lamborghini of inference engines: it achieves seriously
 # impressive performance, but only if you tune it carefully.
 # We carefully document the choices we made here and point to additional resources
 # so you know where and how you might adjust the parameters for your use case.
-#
+
 # ## Installing TensorRT-LLM
-#
+
 # To run TensorRT-LLM, we must first install it. Easier said than done!
-#
+
 # In Modal, we define [container images](https://modal.com/docs/guide/custom-container) that run our serverless workloads.
 # All Modal containers have access to GPU drivers via the underlying host environment,
 # but we still need to install the software stack on top of the drivers, from the CUDA runtime up.
-#
-# We start from the official `nvidia/cuda:12.1.1-devel-ubuntu22.04` image,
+
+# We start from an official `nvidia/cuda` image,
 # which includes the CUDA runtime & development libraries
 # and the environment configuration necessary to run them.
@@ -58,8 +58,9 @@ import pydantic  # for typing, used later
 
 tensorrt_image = modal.Image.from_registry(
-    "nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10"
-)
+    "nvidia/cuda:12.4.1-devel-ubuntu22.04",
+    add_python="3.10",  # TRT-LLM requires Python 3.10
+).entrypoint([])  # remove the base image's verbose logging on entry
 
 # On top of that, we add some system dependencies of TensorRT-LLM,
 # including OpenMPI for distributed communication, some core software like `git`,
@@ -68,7 +69,7 @@ tensorrt_image = tensorrt_image.apt_install(
     "openmpi-bin", "libopenmpi-dev", "git", "git-lfs", "wget"
 ).pip_install(
-    "tensorrt_llm==0.10.0.dev2024042300",
+    "tensorrt_llm==0.14.0",
     pre=True,
     extra_index_url="https://pypi.nvidia.com",
 )
@@ -76,21 +77,21 @@
 # Note that we're doing this by [method-chaining](https://quanticdev.com/articles/method-chaining/)
 # a number of calls to methods on the `modal.Image`. If you're familiar with
 # Dockerfiles, you can think of this as a Pythonic interface to instructions like `RUN` and `CMD`.
-#
+
 # End-to-end, this step takes five minutes.
 # If you're reading this from top to bottom,
 # you might want to stop here and execute the example
 # with `modal run trtllm_llama.py`
 # so that it runs in the background while you read the rest.
-#
+
 # ## Downloading the Model
-#
+
 # Next, we download the model we want to serve. In this case, we're using the instruction-tuned
 # version of Meta's LLaMA 3 8B model.
 # We use the function below to download the model from the Hugging Face Hub.
 
 MODEL_DIR = "/root/model/model_input"
-MODEL_ID = "NousResearch/Meta-Llama-3-8B-Instruct"
+MODEL_ID = "NousResearch/Meta-Llama-3-8B-Instruct"  # fork without repo gating
 MODEL_REVISION = "b1532e4dee724d9ba63fe17496f298254d87ca64"  # pin model revisions to prevent unexpected changes!
@@ -117,11 +118,11 @@ def download_model():
 MINUTES = 60  # seconds
 tensorrt_image = (  # update the image by downloading the model we're using
     tensorrt_image.pip_install(  # add utilities for downloading the model
-        "hf-transfer==0.1.6",
-        "huggingface_hub==0.22.2",
+        "hf-transfer==0.1.8",
+        "huggingface_hub==0.26.2",
         "requests~=2.31.0",
     )
-    .env(  # hf-transfer: faster downloads, but fewer comforts
+    .env(  # hf-transfer for faster downloads
         {"HF_HUB_ENABLE_HF_TRANSFER": "1"}
     )
     .run_function(  # download the model
@@ -130,93 +131,103 @@ def download_model():
     )
 )
 
-# ## Configuring the model
-#
-# Now that we have the model downloaded, we need to convert it to a format that TensorRT-LLM can use.
-# We use a convenience script provided by the TensorRT-LLM team.
-# This script takes a few minutes to run.
-
-GIT_HASH = "71d8d4d3dc655671f32535d6d2b60cab87f36e87"
-CHECKPOINT_SCRIPT_URL = f"https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/{GIT_HASH}/examples/llama/convert_checkpoint.py"
+# ## Quantization
 
-# TensorRT-LLM requires that a GPU be present to load the model, even though it isn't used directly during this conversion process.
-# We'll use a single A100-40GB GPU for this example, but we have also tested it successfully with A10G, A100-80GB, and H100 GPUs.
-#
-# The most important feature to track when selecting hardware to run on is GPU RAM:
-# larger models, longer sequences, and bigger batches all require more memory,
-# We tuned all three to maximize throughput on this example.
-#
 # The amount of GPU RAM on a single card is a tight constraint for most LLMs:
-# RAM is measured in tens of gigabytes and
-# models have billions of floating point parameters,
-# each consuming one to four bytes of memory.
+# RAM is measured in billions of bytes and models have billions of parameters,
+# each of which occupies one or more bytes of memory.
 # The performance cliff if you need to spill to CPU memory is steep,
-# so the only solution is to split the model across multiple GPUs.
-# This is particularly important when serving larger models (e.g. 70B or 8x22B).
+# so all of those parameters must fit in the GPU memory,
+# along with other things like the KV cache.
+
+# The simplest way to reduce LLM inference's RAM requirements is to make the model's parameters smaller,
+# to fit their values in a smaller number of bits, like four or eight. This is known as _quantization_.
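+
+# To make that concrete, here's the back-of-envelope weight arithmetic for our model --
+# a rough sketch, not part of the example's code, with the 8B parameter count as its only input:
+
+# ```python
+# n_params = 8e9  # LLaMA 3 8B
+# print(f"{n_params * 2 / 1e9:.0f} GB")  # ~16 GB of weights in fp16 (two bytes per parameter)
+# print(f"{n_params * 1 / 1e9:.0f} GB")  # ~8 GB in fp8 (one byte per parameter)
+# ```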
+
+# We use a quantization script provided by the TensorRT-LLM team.
+# This script takes a few minutes to run.
+
+GIT_HASH = "b0880169d0fb8cd0363049d91aa548e58a41be07"
+CONVERSION_SCRIPT_URL = f"https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/{GIT_HASH}/examples/quantization/quantize.py"
+
+# NVIDIA's Ada Lovelace/Hopper chips, like the 4090, L40S, and H100,
+# are capable of native calculations in 8bit floating point numbers,
+# so we choose that as our quantization format (`qformat`).
+# These GPUs can execute twice as many floating point operations per second in 8bit as in 16bit --
+# nearly two quadrillion per second on an H100.
 
 N_GPUS = 1  # Heads up: this example has not yet been tested with multiple GPUs
-GPU_CONFIG = modal.gpu.A100(count=N_GPUS)
+GPU_CONFIG = modal.gpu.H100(count=N_GPUS)
+
+DTYPE = "float16"  # format we download in, regular fp16
+QFORMAT = "fp8"  # format we quantize the weights to
+KV_CACHE_DTYPE = "fp8"  # format we quantize the KV cache to
 
-# This is also the point where we specify the data type for this model.
-# We use IEEE 754-compliant half-precision floats, (`float16`), because we found that it resulted in marginally higher throughput,
-# but the model is provided in Google's
-# [`bfloat16` format](https://en.wikipedia.org/wiki/Bfloat16_floating-point_format).
-# On the latest Ada Lovelace chips, you might use `float8` to reduce GPU RAM usage and speed up inference,
-# but note that the FP8 format is very new, so expect rough edges.
+# Quantization is lossy, but the impact on model quality can be minimized by
+# tuning the quantization parameters on a small calibration dataset.
 
-DTYPE = "float16"
+CALIB_SIZE = "512"  # size of calibration dataset
 
 # We put that all together with another invocation of `.run_commands`.
 
+QUANTIZATION_ARGS = f"--dtype={DTYPE} --qformat={QFORMAT} --kv_cache_dtype={KV_CACHE_DTYPE} --calib_size={CALIB_SIZE}"
+
 CKPT_DIR = "/root/model/model_ckpt"
-tensorrt_image = (  # update the image by converting the model to TensorRT format
-    tensorrt_image.run_commands(  # takes ~5 minutes
+tensorrt_image = (  # update the image by quantizing the model
+    tensorrt_image.run_commands(  # takes ~2 minutes
         [
-            f"wget {CHECKPOINT_SCRIPT_URL} -O /root/convert_checkpoint.py",
-            f"python /root/convert_checkpoint.py --model_dir={MODEL_DIR} --output_dir={CKPT_DIR}"
-            + f" --tp_size={N_GPUS} --dtype={DTYPE}",
+            f"wget {CONVERSION_SCRIPT_URL} -O /root/convert.py",
+            f"python /root/convert.py --model_dir={MODEL_DIR} --output_dir={CKPT_DIR}"
+            + f" --tp_size={N_GPUS}"
+            + f" {QUANTIZATION_ARGS}",
         ],
-        gpu=GPU_CONFIG,  # GPU must be present to load tensorrt_llm
+        gpu=GPU_CONFIG,
     )
 )
 
 # ## Compiling the engine
-#
+
 # TensorRT-LLM achieves its high throughput primarily by compiling the model:
 # making concrete choices of CUDA kernels to execute for each operation.
 # These kernels are much more specific than `matrix_multiply` or `softmax` --
 # they have names like `maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148t_nt`.
 # They are optimized for the specific types and shapes of tensors that the model uses
 # and for the specific hardware that the model runs on.
-#
+
 # That means we need to know all of that information a priori --
 # more like the original TensorFlow, which defined static graphs, than like PyTorch,
 # which builds up a graph of kernels dynamically at runtime.
-#
-# This extra layer of constraint on our LLM service is precisely
+
+# This extra layer of constraint on our LLM service is an important part of
 # what allows TensorRT-LLM to achieve its high throughput.
-#
+
 # So we need to specify things like the maximum batch size and the lengths of inputs and outputs.
 # The closer these are to the actual values we'll use in production, the better the throughput we'll get.
+
+# Since we want to maximize throughput and can assume a steady stream of requests,
+# we set the batch size to the largest value we can fit in GPU RAM.
+# Quantization helps us again here, since it allows us to fit more tokens in the same RAM.
+
 MAX_INPUT_LEN, MAX_OUTPUT_LEN = 256, 256
+MAX_NUM_TOKENS = 2**17
 MAX_BATCH_SIZE = (
-    128  # better throughput at larger batch sizes, limited by GPU RAM
+    1024  # better throughput at larger batch sizes, limited by GPU RAM
 )
 
 ENGINE_DIR = "/root/model/model_output"
-SIZE_ARGS = f"--max_batch_size={MAX_BATCH_SIZE} --max_input_len={MAX_INPUT_LEN} --max_output_len={MAX_OUTPUT_LEN}"
+SIZE_ARGS = f"--max_input_len={MAX_INPUT_LEN} --max_num_tokens={MAX_NUM_TOKENS} --max_batch_size={MAX_BATCH_SIZE}"
 
 # There are many additional options you can pass to `trtllm-build` to tune the engine for your specific workload.
 # You can find the document we used for LLaMA
-# [here](https://github.com/NVIDIA/TensorRT-LLM/tree/66ef1df492f7bc9c8eeb01d7e14db01838e3f0bd/examples/llama),
+# [here](https://github.com/NVIDIA/TensorRT-LLM/tree/b0880169d0fb8cd0363049d91aa548e58a41be07/examples/llama),
 # which you can use to adjust the arguments to fit your workloads,
 # e.g. adjusting rotary embeddings and block sizes for longer contexts.
-#
-# We selected plugins that accelerate two core components of the model: dense matrix multiplication and attention.
-# You can read more about the plugin options [here](https://fetch.ai/blog/advancing-llm-optimization).
+# We also recommend the [official TRT-LLM best practices guide](https://nvidia.github.io/TensorRT-LLM/performance/perf-best-practices.html).
+
+# To make best use of our 8bit floating point hardware and the weights and KV cache we have quantized,
+# we activate the 8bit floating point fused multi-head attention plugin.
 
-PLUGIN_ARGS = f"--gemm_plugin={DTYPE} --gpt_attention_plugin={DTYPE}"
+# Because we are targeting maximum throughput, we do not activate the low latency 8bit floating point matrix multiplication plugin
+# or the 8bit floating point matrix multiplication (`gemm`) plugin, which the documentation indicates are aimed at smaller batch sizes.
+
+PLUGIN_ARGS = "--use_fp8_context_fmha enable"
 
 # We put all of this together with another invocation of `.run_commands`.
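+
+# Before we do, it's worth sanity-checking the token budget we just chose.
+# The back-of-envelope arithmetic below is a rough sketch, not part of the build,
+# assuming LLaMA 3 8B's architecture (32 layers, 8 KV heads of dimension 128) and our fp8 KV cache:
+
+# ```python
+# n_layers, n_kv_heads, head_dim = 32, 8, 128  # LLaMA 3 8B
+# kv_bytes_per_token = 2 * n_layers * n_kv_heads * head_dim * 1  # K and V, one byte each in fp8
+# print(kv_bytes_per_token)  # 65536, i.e. 64 KiB per token
+# print(kv_bytes_per_token * 2**17 / 2**30)  # 8.0 GiB for a full MAX_NUM_TOKENS worth of cache
+# ```
+
+# Together with the ~8 GB of fp8 weights, that fits comfortably in an H100's 80 GB of RAM.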
@@ -224,7 +235,7 @@ def download_model():
     tensorrt_image.run_commands(  # takes ~5 minutes
         [
             f"trtllm-build --checkpoint_dir {CKPT_DIR} --output_dir {ENGINE_DIR}"
-            + f" --tp_size={N_GPUS} --workers={N_GPUS}"
+            + f" --workers={N_GPUS}"
             + f" {SIZE_ARGS}"
             + f" {PLUGIN_ARGS}"
         ],
@@ -234,21 +245,20 @@ def download_model():
     )
 )
 
-# ## Serving inference at thousands of tokens per second
-#
+# ## Serving inference at tens of thousands of tokens per second
+
 # Now that we have the engine compiled, we can serve it with Modal by creating an `App`.
 
 app = modal.App(
     f"example-trtllm-{MODEL_ID.split('/')[-1]}", image=tensorrt_image
 )
 
-# Thanks to our custom container runtime system, even this
-# large, many gigabyte container boots in seconds.
-#
+# Thanks to our custom container runtime system, even this large, many-gigabyte container boots in seconds.
+
 # At container start time, we boot up the engine, which completes in under 30 seconds.
 # Container starts are triggered when Modal scales up your infrastructure,
 # like the first time you run this code or the first time a request comes in after a period of inactivity.
-#
+
 # Container lifecycles in Modal are managed via our `Cls` interface, so we define one below
 # to manage the engine and run inference.
 # For details, see [this guide](https://modal.com/docs/guide/lifecycle-functions).
 
@@ -257,6 +267,7 @@ def download_model():
 @app.cls(
     gpu=GPU_CONFIG,
     container_idle_timeout=10 * MINUTES,
+    image=tensorrt_image,
 )
 class Model:
     @modal.enter()
@@ -289,6 +300,7 @@ def load(self):
             engine_dir=f"{ENGINE_DIR}",
             lora_dir=None,
             rank=tensorrt_llm.mpi_rank(),  # this will need to be adjusted to use multiple GPUs
+            max_output_len=MAX_OUTPUT_LEN,
         )
 
         self.model = ModelRunner.from_dir(**runner_kwargs)
@@ -375,7 +387,7 @@ def generate(self, prompts: list[str], settings=None):
                 "\n\n",
                 sep=COLOR["ENDC"],
             )
-            time.sleep(0.01)  # to avoid log truncation
+            time.sleep(0.05)  # to avoid log truncation
 
         print(
             f"{COLOR['HEADER']}{COLOR['GREEN']}Generated {num_tokens} tokens from {MODEL_ID} in {duration_s:.1f} seconds,"
@@ -386,22 +398,25 @@
 
 # ## Calling our inference function
-#
+
 # Now, how do we actually run the model?
-#
+
 # There are two basic methods: from Python via our SDK or from anywhere, by setting up an API.
-#
+
 # ### Calling inference from Python
-#
+
 # To run our `Model`'s `.generate` method from Python, we just need to call it --
 # with `.remote` appended to run it on Modal.
-#
+
 # We wrap that logic in a `local_entrypoint` so you can run it from the command line with
 
 # ```bash
 # modal run trtllm_llama.py
 # ```
-#
-# For simplicity, we hard-code a batch of 128 questions to ask the model.
+
+# For simplicity, we hard-code a batch of 128 questions to ask the model,
+# and then bulk it up to a batch size of 1024 by prepending seven distinct prefixes.
+# These prefixes ensure KV cache misses for the remainder of the generations,
+# to keep the benchmark closer to what can be expected in a real workload.
 
 @app.local_entrypoint()
@@ -547,6 +562,19 @@ def main():
         "What is TensorRT? What role does it play in neural network inference?",
     ]
 
+    prefixes = [
+        "Hi! ",
+        "Hello! ",
+        "Hi. ",
+        "Hello. ",
+        "Hi: ",
+        "Hello: ",
+        "Greetings. ",
", + ] + # prepending any string that causes a tokenization change is enough to invalidate KV cache + for ii, prefix in enumerate(prefixes): + questions += [prefix + question for question in questions[:128]] + model = Model() model.generate.remote(questions) # if you're calling this service from another Python project, @@ -554,11 +582,11 @@ def main(): # ### Calling inference via an API -# + # We can use `modal.web_endpoint` and `app.function` to turn any Python function into a web API. -# + # This API wrapper doesn't need all the dependencies of the core inference service, -# so we switch images here to a basic Linux image, `debian_slim`, and add the FastAPI stack +# so we switch images here to a basic Linux image, `debian_slim`, and add the FastAPI stack. web_image = modal.Image.debian_slim(python_version="3.10").pip_install( "fastapi[standard]==0.115.4", @@ -587,16 +615,16 @@ def generate_web(data: GenerateRequest) -> list[str]: # To set our function up as a web endpoint, we need to run this file -- # with `modal serve` to create a hot-reloading development server or `modal deploy` to deploy it to production. -# + # ```bash # modal serve trtllm_llama.py # ``` -# + # The URL for the endpoint appears in the output of the `modal serve` or `modal deploy` command. # Add `/docs` to the end of this URL to see the interactive Swagger documentation for the endpoint. -# + # You can also test the endpoint by sending a POST request with `curl` from another terminal: -# + # ```bash # curl -X POST url-from-output-of-modal-serve-here \ # -H "Content-Type: application/json" \ @@ -604,11 +632,11 @@ def generate_web(data: GenerateRequest) -> list[str]: # "prompts": ["Tell me a joke", "Describe a dream you had recently", "Share your favorite childhood memory"] # }' | python -m json.tool # python for pretty-printing, optional # ``` -# -# And now you have a high-throughput, low-latency, autoscaling API for serving LLaMA 3 8B completions! -# + +# And now you have a high-throughput, low-latency, autoscaling API for serving LLM completions! + # ## Footer -# + # The rest of the code in this example is utility code.