From eee86872eb992b6906fea6f59499ce607c282025 Mon Sep 17 00:00:00 2001 From: cathyzbn Date: Fri, 9 Aug 2024 21:56:37 +0000 Subject: [PATCH 1/4] batched whisper initial --- 06_gpu_and_ml/llm-serving/batched_whisper.py | 156 +++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 06_gpu_and_ml/llm-serving/batched_whisper.py diff --git a/06_gpu_and_ml/llm-serving/batched_whisper.py b/06_gpu_and_ml/llm-serving/batched_whisper.py new file mode 100644 index 000000000..f3cabca62 --- /dev/null +++ b/06_gpu_and_ml/llm-serving/batched_whisper.py @@ -0,0 +1,156 @@ +# # Fast Whisper inference using dynamic batching +# +# In this example, we demonstrate how to run batched inference for [OpenAI's Whisper](https://openai.com/index/whisper/), +# a speech recognition model. By batching multiple audio samples together or batching chunks of a single audio sample, +# we can achieve up to a 2.5x speedup in inference throughput with on an A100. +# +# We will be running the [Whisper Large V3](https://huggingface.co/openai/whisper-large-v3) model. +# To run [any of the other HuggingFace Whisper models](https://huggingface.co/models?search=openai/whisper), +# simply replace the `MODEL_NAME` and `MODEL_REVISION` variables. +# +# ## Setup +# +# First, we import the Modal client and define the model that we want to serve. + +import asyncio +import os +import time + +import modal +from datasets import load_dataset + +MODEL_DIR = "/model" +MODEL_NAME = "openai/whisper-large-v3" +MODEL_REVISION = "afda370583db9c5359511ed5d989400a6199dfe1" + +# ## Define a container image +# +# We want to create a Modal image that has the model weights pre-saved to a directory. The benefit of this +# is that the container no longer has to re-download the model from Hugging Face. Instead, it will take +# advantage of Modal's internal filesystem for faster cold starts. +# +# ### Download the weights +# We can download the model to a specific directory using the Hugging Face utility function `snapshot_download`. +# +# If you adapt this example to run another model, note that for this step to work on a +# [gated model](https://huggingface.co/docs/hub/en/models-gated), +# the `HF_TOKEN` environment variable must be set and provided as a [Modal Secret](https://modal.com/secrets). + +def download_model_to_image(model_dir, model_name, model_revision): + from huggingface_hub import snapshot_download + from transformers.utils import move_cache + + os.makedirs(model_dir, exist_ok=True) + + snapshot_download( + model_name, + local_dir=model_dir, + ignore_patterns=["*.pt", "*.bin"], # Using safetensors + revision=model_revision, + ) + move_cache() + +# ### Image Definition +# +# We’ll start with Modal's baseline `debian_slim` image and install the relevant libraries. +# Then we’ll use `run_function` with `download_model_to_image` to write the model into the container image. + +image = ( + modal.Image.debian_slim(python_version="3.11") + .pip_install( + "torch==2.1.2", + "transformers==4.39.3", + "hf-transfer==0.1.6", + "huggingface_hub==0.22.2", + "librosa==0.10.2", + "soundfile==0.12.1", + "datasets==2.20.0", + "accelerate==0.33.0", + ) + # Use the barebones `hf-transfer` package for maximum download speeds. No progress bar, but expect 700MB/s. 
+ .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}) + .run_function( + download_model_to_image, + timeout=60 * 20, + kwargs={ + "model_dir": MODEL_DIR, + "model_name": MODEL_NAME, + "model_revision": MODEL_REVISION, + }, + ) +) + +app = modal.App("example-whisper-batched-inference", image=image) + +# ## The model class +# +# The inference function is best represented using Modal's [class syntax](https://modal.com/docs/guide/lifecycle-functions), +# with a `load_model` method decorated with `@modal.enter`. This enables us to load the model into memory just once, +# every time a container starts up, and keep it cached on the GPU for subsequent invocations of the function. +# +# We also define a `transcribe` method that uses the `@modal.batched` decorator to enable dynamic batching. +# This allows us to invoke the function with individual audio samples, and the function will automatically batch them +# together before running inference. The `max_batch_size` parameter limits the batch size to a maximum of 128 audio samples +# at a time. The `wait_ms` parameter sets the maximum time to wait for more inputs before running the batched transcription. +# +# We selected a batch size of 128 because it is the largest power of 2 that fits within the 40GB A100 GPU memory. +# This number will vary depending on the model and the GPU you are using. To tune the `wait_ms` parameter, you can set it to +# `(targeted latency) - (execution time)`. Most applications have a targeted latency, and this allows the latency of +# any request to stay within that limit. +# +# Hint: Try using an H100 if you've got a large model or big batches! + +GPU_CONFIG = modal.gpu.A100(count=1) # 40GB A100 by default + +@app.cls(gpu=GPU_CONFIG, concurrency_limit=1) +class Model: + @modal.enter() + def load_model(self): + import torch + from transformers import ( + AutoModelForSpeechSeq2Seq, + AutoProcessor, + pipeline, + ) + self.processor = AutoProcessor.from_pretrained(MODEL_NAME) + self.model = AutoModelForSpeechSeq2Seq.from_pretrained( + MODEL_NAME, + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + use_safetensors=True, + ).to("cuda") + + # Create a pipeline for preprocessing speech data and transcribing it + self.pipeline = pipeline( + "automatic-speech-recognition", + model=self.model, + tokenizer=self.processor.tokenizer, + feature_extractor=self.processor.feature_extractor, + torch_dtype=torch.float16, + device="cuda" + ) + + @modal.batched(max_batch_size=128, wait_ms=4000) + def transcribe(self, audio_samples): + transcription = self.pipeline(audio_samples, batch_size=len(audio_samples)) + return transcription + + +# ## Run the model +# +# We define a [`local_entrypoint`](https://modal.com/docs/guide/apps#entrypoints-for-ephemeral-apps) +# to call our remote function sequentially for a list of inputs. You can run this locally with +# `modal run batched_whisper.py`. +# +# In this example, we use the [librispeech_asr_dummy dataset](https://huggingface.co/datasets/hf-internal-testing/librispeech_asr_dummy) +# from Hugging Face's Datasets library to test the model. +# +# We use [`map.aio`](/docs/reference/modal.Function#map) to asynchronously map over the audio files. +# This allows us to invoke the batched transcription method on each audio sample in parallel. 
+ +@app.local_entrypoint() +async def main(): + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + batched_whisper = Model() + async for transcription in batched_whisper.transcribe.map.aio(ds["audio"][:20]): + print("Transcription for audio 📻", transcription) From 99274ffec33d1bad098898693a3dc5a2ba6002b4 Mon Sep 17 00:00:00 2001 From: cathyzbn Date: Fri, 16 Aug 2024 14:26:01 +0000 Subject: [PATCH 2/4] fix tests + nit --- 06_gpu_and_ml/llm-serving/batched_whisper.py | 131 +++++++++---------- 1 file changed, 64 insertions(+), 67 deletions(-) diff --git a/06_gpu_and_ml/llm-serving/batched_whisper.py b/06_gpu_and_ml/llm-serving/batched_whisper.py index f3cabca62..5b5f933fa 100644 --- a/06_gpu_and_ml/llm-serving/batched_whisper.py +++ b/06_gpu_and_ml/llm-serving/batched_whisper.py @@ -1,8 +1,8 @@ # # Fast Whisper inference using dynamic batching # -# In this example, we demonstrate how to run batched inference for [OpenAI's Whisper](https://openai.com/index/whisper/), -# a speech recognition model. By batching multiple audio samples together or batching chunks of a single audio sample, -# we can achieve up to a 2.5x speedup in inference throughput with on an A100. +# In this example, we demonstrate how to run batched inference for OpenAI's speech recognition model, +# [Whisper](https://openai.com/index/whisper/). Batching multiple audio samples together or batching chunks +# of a single audio sample can help to achieve a 2.5x speedup in inference throughput on an A100! # # We will be running the [Whisper Large V3](https://huggingface.co/openai/whisper-large-v3) model. # To run [any of the other HuggingFace Whisper models](https://huggingface.co/models?search=openai/whisper), @@ -10,50 +10,20 @@ # # ## Setup # -# First, we import the Modal client and define the model that we want to serve. +# Let's start by importing the Modal client and defining the model that we want to serve. -import asyncio import os -import time import modal -from datasets import load_dataset MODEL_DIR = "/model" MODEL_NAME = "openai/whisper-large-v3" MODEL_REVISION = "afda370583db9c5359511ed5d989400a6199dfe1" -# ## Define a container image -# -# We want to create a Modal image that has the model weights pre-saved to a directory. The benefit of this -# is that the container no longer has to re-download the model from Hugging Face. Instead, it will take -# advantage of Modal's internal filesystem for faster cold starts. -# -# ### Download the weights -# We can download the model to a specific directory using the Hugging Face utility function `snapshot_download`. -# -# If you adapt this example to run another model, note that for this step to work on a -# [gated model](https://huggingface.co/docs/hub/en/models-gated), -# the `HF_TOKEN` environment variable must be set and provided as a [Modal Secret](https://modal.com/secrets). - -def download_model_to_image(model_dir, model_name, model_revision): - from huggingface_hub import snapshot_download - from transformers.utils import move_cache - os.makedirs(model_dir, exist_ok=True) - - snapshot_download( - model_name, - local_dir=model_dir, - ignore_patterns=["*.pt", "*.bin"], # Using safetensors - revision=model_revision, - ) - move_cache() - -# ### Image Definition +# ## Define a container image # # We’ll start with Modal's baseline `debian_slim` image and install the relevant libraries. -# Then we’ll use `run_function` with `download_model_to_image` to write the model into the container image. 
image = ( modal.Image.debian_slim(python_version="3.11") @@ -64,39 +34,35 @@ def download_model_to_image(model_dir, model_name, model_revision): "huggingface_hub==0.22.2", "librosa==0.10.2", "soundfile==0.12.1", - "datasets==2.20.0", "accelerate==0.33.0", + "datasets==2.20.0", ) # Use the barebones `hf-transfer` package for maximum download speeds. No progress bar, but expect 700MB/s. .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}) - .run_function( - download_model_to_image, - timeout=60 * 20, - kwargs={ - "model_dir": MODEL_DIR, - "model_name": MODEL_NAME, - "model_revision": MODEL_REVISION, - }, - ) ) app = modal.App("example-whisper-batched-inference", image=image) + # ## The model class # -# The inference function is best represented using Modal's [class syntax](https://modal.com/docs/guide/lifecycle-functions), -# with a `load_model` method decorated with `@modal.enter`. This enables us to load the model into memory just once, -# every time a container starts up, and keep it cached on the GPU for subsequent invocations of the function. +# The inference function is best represented using Modal's [class syntax](https://modal.com/docs/guide/lifecycle-functions). + +# We define a `@modal.build` method to download the model and a `@modal.enter` method to load the model. This allows +# the container to download the model from HuggingFace just once when it launches, load the model into memory just once +# every time a container starts up by caching it on the GPU for subsequent invocations of the function. # # We also define a `transcribe` method that uses the `@modal.batched` decorator to enable dynamic batching. # This allows us to invoke the function with individual audio samples, and the function will automatically batch them -# together before running inference. The `max_batch_size` parameter limits the batch size to a maximum of 128 audio samples -# at a time. The `wait_ms` parameter sets the maximum time to wait for more inputs before running the batched transcription. +# together before running inference. +# +# The `max_batch_size` parameter limits the maximum number of audio samples combined into a single batch. +# We used a `max_batch_size` of 128, the largest power of 2 that can be accommodated by the 40GB A100 GPU memory. This number +# will vary depending on the model and the GPU you are using. # -# We selected a batch size of 128 because it is the largest power of 2 that fits within the 40GB A100 GPU memory. -# This number will vary depending on the model and the GPU you are using. To tune the `wait_ms` parameter, you can set it to -# `(targeted latency) - (execution time)`. Most applications have a targeted latency, and this allows the latency of -# any request to stay within that limit. +# The `wait_ms` parameter sets the maximum time to wait for more inputs before running the batched transcription. +# To tune this parameter, you can set it to the target latency of your application minus the execution time of an inference batch. +# This allows the latency of any request to stay within your target latency. # # Hint: Try using an H100 if you've got a large model or big batches! 
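
To make the dynamic-batching contract described in the comments above concrete, here is a minimal, self-contained sketch. It is not part of this patch: the app name and the toy `batch_square` function are illustrative, and the call pattern simply mirrors how `@modal.batched` is used elsewhere in this example. Callers submit individual inputs, while the decorated function always receives a list of up to `max_batch_size` inputs and must return a list of results of the same length and order:

```python
import modal

app = modal.App("dynamic-batching-sketch")  # illustrative app name


@app.function()
@modal.batched(max_batch_size=4, wait_ms=1000)
def batch_square(xs: list[int]) -> list[int]:
    # The body always sees a list (one entry per queued call, up to
    # max_batch_size) and must return one output per input, in order.
    print(f"running a batch of {len(xs)} inputs")
    return [x * x for x in xs]


@app.local_entrypoint()
def main():
    # Callers send single items; Modal groups them into batches,
    # waiting at most `wait_ms` for a batch to fill up.
    print(list(batch_square.map(range(10))))
```

Applying the tuning rule above with purely illustrative numbers: a target latency of about 5 seconds and a batch that takes roughly 1 second to transcribe gives `wait_ms ≈ 4000`, which is the value this patch uses for the Whisper transcriber.
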
@@ -104,6 +70,21 @@ def download_model_to_image(model_dir, model_name, model_revision): @app.cls(gpu=GPU_CONFIG, concurrency_limit=1) class Model: + @modal.build() + def download_model(self): + from huggingface_hub import snapshot_download + from transformers.utils import move_cache + + os.makedirs(MODEL_DIR, exist_ok=True) + + snapshot_download( + MODEL_NAME, + local_dir=MODEL_DIR, + ignore_patterns=["*.pt", "*.bin"], # Using safetensors + revision=MODEL_REVISION, + ) + move_cache() + @modal.enter() def load_model(self): import torch @@ -112,6 +93,7 @@ def load_model(self): AutoProcessor, pipeline, ) + self.processor = AutoProcessor.from_pretrained(MODEL_NAME) self.model = AutoModelForSpeechSeq2Seq.from_pretrained( MODEL_NAME, @@ -120,37 +102,52 @@ def load_model(self): use_safetensors=True, ).to("cuda") - # Create a pipeline for preprocessing speech data and transcribing it + # Create a pipeline for preprocessing and transcribing speech data self.pipeline = pipeline( "automatic-speech-recognition", model=self.model, tokenizer=self.processor.tokenizer, feature_extractor=self.processor.feature_extractor, torch_dtype=torch.float16, - device="cuda" + device="cuda", ) @modal.batched(max_batch_size=128, wait_ms=4000) def transcribe(self, audio_samples): - transcription = self.pipeline(audio_samples, batch_size=len(audio_samples)) + transcription = self.pipeline( + audio_samples, batch_size=len(audio_samples) + ) return transcription -# ## Run the model -# -# We define a [`local_entrypoint`](https://modal.com/docs/guide/apps#entrypoints-for-ephemeral-apps) -# to call our remote function sequentially for a list of inputs. You can run this locally with -# `modal run batched_whisper.py`. -# +# ## Transcribe a dataset # In this example, we use the [librispeech_asr_dummy dataset](https://huggingface.co/datasets/hf-internal-testing/librispeech_asr_dummy) # from Hugging Face's Datasets library to test the model. # # We use [`map.aio`](/docs/reference/modal.Function#map) to asynchronously map over the audio files. # This allows us to invoke the batched transcription method on each audio sample in parallel. + +@app.function() +async def transcribe_hf_dataset(dataset_name): + from datasets import load_dataset + + ds = load_dataset( + dataset_name, "clean", split="validation" + ) + batched_whisper = Model() + async for transcription in batched_whisper.transcribe.map.aio( + ds["audio"] + ): + print("Transcription for audio 📻", transcription["text"]) + + +# ## Run the model +# +# We define a [`local_entrypoint`](https://modal.com/docs/guide/apps#entrypoints-for-ephemeral-apps) +# to run the transcription. You can run this locally with `modal run batched_whisper.py`. 
+ + @app.local_entrypoint() async def main(): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - batched_whisper = Model() - async for transcription in batched_whisper.transcribe.map.aio(ds["audio"][:20]): - print("Transcription for audio 📻", transcription) + transcribe_hf_dataset.remote("hf-internal-testing/librispeech_asr_dummy") From 7fe08745fb43ef1279d71c599a39c80c6cd69fae Mon Sep 17 00:00:00 2001 From: cathyzbn Date: Fri, 16 Aug 2024 14:26:48 +0000 Subject: [PATCH 3/4] ruff --- 06_gpu_and_ml/llm-serving/batched_whisper.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/06_gpu_and_ml/llm-serving/batched_whisper.py b/06_gpu_and_ml/llm-serving/batched_whisper.py index 5b5f933fa..e40146e73 100644 --- a/06_gpu_and_ml/llm-serving/batched_whisper.py +++ b/06_gpu_and_ml/llm-serving/batched_whisper.py @@ -68,6 +68,7 @@ GPU_CONFIG = modal.gpu.A100(count=1) # 40GB A100 by default + @app.cls(gpu=GPU_CONFIG, concurrency_limit=1) class Model: @modal.build() @@ -132,13 +133,9 @@ def transcribe(self, audio_samples): async def transcribe_hf_dataset(dataset_name): from datasets import load_dataset - ds = load_dataset( - dataset_name, "clean", split="validation" - ) + ds = load_dataset(dataset_name, "clean", split="validation") batched_whisper = Model() - async for transcription in batched_whisper.transcribe.map.aio( - ds["audio"] - ): + async for transcription in batched_whisper.transcribe.map.aio(ds["audio"]): print("Transcription for audio 📻", transcription["text"]) From f2a16c4a9cd0f4182df718d6acc2aaf18d6166c9 Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Sun, 18 Aug 2024 03:13:17 +0000 Subject: [PATCH 4/4] minor adjustments --- 06_gpu_and_ml/llm-serving/batched_whisper.py | 54 +++++++++++++------- 1 file changed, 36 insertions(+), 18 deletions(-) diff --git a/06_gpu_and_ml/llm-serving/batched_whisper.py b/06_gpu_and_ml/llm-serving/batched_whisper.py index e40146e73..e13a5892b 100644 --- a/06_gpu_and_ml/llm-serving/batched_whisper.py +++ b/06_gpu_and_ml/llm-serving/batched_whisper.py @@ -1,8 +1,9 @@ # # Fast Whisper inference using dynamic batching # -# In this example, we demonstrate how to run batched inference for OpenAI's speech recognition model, -# [Whisper](https://openai.com/index/whisper/). Batching multiple audio samples together or batching chunks -# of a single audio sample can help to achieve a 2.5x speedup in inference throughput on an A100! +# In this example, we demonstrate how to run [dynamically batched inference](https://modal.com/docs/guide/dynamic-batching) +# for OpenAI's speech recognition model, [Whisper](https://openai.com/index/whisper/), on Modal. +# Batching multiple audio samples together or batching chunks of a single audio sample can help to achieve a 2.5x increase +# in inference throughput on an A100! # # We will be running the [Whisper Large V3](https://huggingface.co/openai/whisper-large-v3) model. # To run [any of the other HuggingFace Whisper models](https://huggingface.co/models?search=openai/whisper), @@ -47,29 +48,30 @@ # ## The model class # # The inference function is best represented using Modal's [class syntax](https://modal.com/docs/guide/lifecycle-functions). - -# We define a `@modal.build` method to download the model and a `@modal.enter` method to load the model. 
This allows
-the container to download the model from HuggingFace just once when it launches, load the model into memory just once
-every time a container starts up by caching it on the GPU for subsequent invocations of the function.
+#
+# We define a `@modal.build` method to download the model and a `@modal.enter` method to load the model.
+# `build` downloads the model from HuggingFace just once when our app is first run or deployed
+# and `enter` loads the model into memory just once when our inference function is first invoked.
 #
 # We also define a `transcribe` method that uses the `@modal.batched` decorator to enable dynamic batching.
 # This allows us to invoke the function with individual audio samples, and the function will automatically batch them
-# together before running inference.
+# together before running inference. Batching is critical for making good use of the GPU, since GPUs are designed
+# for running parallel operations at high throughput.
 #
 # The `max_batch_size` parameter limits the maximum number of audio samples combined into a single batch.
-# We used a `max_batch_size` of 128, the largest power of 2 that can be accommodated by the 40GB A100 GPU memory. This number
-# will vary depending on the model and the GPU you are using.
+# We used a `max_batch_size` of `64`, the largest power-of-2 batch size that can be accommodated by the A10G's 24 GB of GPU memory.
+# This number will vary depending on the model and the GPU you are using.
 #
 # The `wait_ms` parameter sets the maximum time to wait for more inputs before running the batched transcription.
 # To tune this parameter, you can set it to the target latency of your application minus the execution time of an inference batch.
 # This allows the latency of any request to stay within your target latency.
 #
-# Hint: Try using an H100 if you've got a large model or big batches!
 
-GPU_CONFIG = modal.gpu.A100(count=1)  # 40GB A100 by default
-
-@app.cls(gpu=GPU_CONFIG, concurrency_limit=1)
+@app.cls(
+    gpu="a10g",  # Try using an A100 or H100 if you've got a large model or need big batches!
+    concurrency_limit=10,  # default max GPUs for Modal's free tier
+)
 class Model:
     @modal.build()
     def download_model(self):
@@ -103,6 +105,8 @@ def load_model(self):
             use_safetensors=True,
         ).to("cuda")
 
+        self.model.generation_config.language = "<|en|>"
+
         # Create a pipeline for preprocessing and transcribing speech data
         self.pipeline = pipeline(
             "automatic-speech-recognition",
@@ -113,11 +117,19 @@ def load_model(self):
             device="cuda",
         )
 
-    @modal.batched(max_batch_size=128, wait_ms=4000)
+    @modal.batched(max_batch_size=64, wait_ms=1000)
     def transcribe(self, audio_samples):
+        import time
+
+        start = time.monotonic_ns()
+        print(f"Transcribing {len(audio_samples)} audio samples")
         transcription = self.pipeline(
             audio_samples, batch_size=len(audio_samples)
         )
+        end = time.monotonic_ns()
+        print(
+            f"Transcribed {len(audio_samples)} samples in {round((end - start) / 1e9, 2)}s"
+        )
         return transcription
 
 
@@ -133,10 +145,13 @@ def transcribe(self, audio_samples):
 async def transcribe_hf_dataset(dataset_name):
     from datasets import load_dataset
 
+    print("📂 Loading dataset", dataset_name)
     ds = load_dataset(dataset_name, "clean", split="validation")
+    print("📂 Dataset loaded")
     batched_whisper = Model()
+    print("📣 Sending data for transcription")
     async for transcription in batched_whisper.transcribe.map.aio(ds["audio"]):
-        print("Transcription for audio 📻", transcription["text"])
+        yield transcription
 
 
 # ## Run the model
@@ -146,5 +161,8 @@ def transcribe(self, audio_samples):
 
 
 @app.local_entrypoint()
-async def main():
-    transcribe_hf_dataset.remote("hf-internal-testing/librispeech_asr_dummy")
+async def main(dataset_name: str = None):
+    if dataset_name is None:
+        dataset_name = "hf-internal-testing/librispeech_asr_dummy"
+    for result in transcribe_hf_dataset.remote_gen(dataset_name):
+        print(result["text"])
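
As a usage sketch to accompany the final version of the example: once the app is deployed (for instance with `modal deploy batched_whisper.py`), the batched transcriber can also be invoked from outside the app. The snippet below is illustrative rather than part of this PR. It assumes `modal` and `librosa` are installed locally, looks the class up by the app name defined above (the exact lookup helper may differ across Modal client versions), uses a hypothetical local audio file, and splits a long recording into 30-second chunks so that Modal can batch the chunks together, as described in the example's introduction. The dict-of-waveform input follows the accepted input formats of the Hugging Face ASR pipeline.

```python
# transcribe_local_file.py -- illustrative client for the deployed app above.
import modal

SAMPLE_RATE = 16_000  # Whisper's feature extractor expects 16 kHz audio
CHUNK_SECONDS = 30  # Whisper's context window; chunks get batched on the GPU

# Look up the deployed class by the app name used in batched_whisper.py.
Model = modal.Cls.lookup("example-whisper-batched-inference", "Model")


def transcribe_file(path: str) -> str:
    import librosa  # assumed to be installed locally

    # Load and resample locally so the container never needs to resample.
    waveform, _ = librosa.load(path, sr=SAMPLE_RATE)

    # Split one long recording into 30-second chunks. The Hugging Face ASR
    # pipeline accepts dicts carrying a raw waveform and its sampling rate.
    chunk_len = CHUNK_SECONDS * SAMPLE_RATE
    chunks = [
        {"raw": waveform[i : i + chunk_len], "sampling_rate": SAMPLE_RATE}
        for i in range(0, len(waveform), chunk_len)
    ]

    # .map() submits chunks one at a time; @modal.batched regroups them
    # into GPU-sized batches on the server, as in the example above.
    pieces = Model().transcribe.map(chunks)
    return " ".join(piece["text"].strip() for piece in pieces)


if __name__ == "__main__":
    print(transcribe_file("meeting_recording.wav"))  # hypothetical file
```
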