diff --git a/06_gpu_and_ml/openai_whisper/batched_whisper.py b/06_gpu_and_ml/openai_whisper/batched_whisper.py
index 448f93d8b..73770a38e 100644
--- a/06_gpu_and_ml/openai_whisper/batched_whisper.py
+++ b/06_gpu_and_ml/openai_whisper/batched_whisper.py
@@ -1,16 +1,16 @@
 # # Fast Whisper inference using dynamic batching
-#
+
 # In this example, we demonstrate how to run [dynamically batched inference](https://modal.com/docs/guide/dynamic-batching)
 # for OpenAI's speech recognition model, [Whisper](https://openai.com/index/whisper/), on Modal.
 # Batching multiple audio samples together or batching chunks of a single audio sample can help to achieve a 2.8x increase
 # in inference throughput on an A10G!
-#
+
 # We will be running the [Whisper Large V3](https://huggingface.co/openai/whisper-large-v3) model.
 # To run [any of the other HuggingFace Whisper models](https://huggingface.co/models?search=openai/whisper),
 # simply replace the `MODEL_NAME` and `MODEL_REVISION` variables.
-#
+
 # ## Setup
-#
+
 # Let's start by importing the Modal client and defining the model that we want to serve.

 import os
@@ -23,20 +23,20 @@


 # ## Define a container image
-#
+
 # We’ll start with Modal's baseline `debian_slim` image and install the relevant libraries.

 image = (
     modal.Image.debian_slim(python_version="3.11")
     .pip_install(
-        "torch==2.1.2",
-        "transformers==4.39.3",
-        "hf-transfer==0.1.6",
-        "huggingface_hub==0.22.2",
+        "torch==2.5.1",
+        "transformers==4.47.1",
+        "hf-transfer==0.1.8",
+        "huggingface_hub==0.27.0",
         "librosa==0.10.2",
         "soundfile==0.12.1",
-        "accelerate==0.33.0",
-        "datasets==2.20.0",
+        "accelerate==1.2.1",
+        "datasets==3.2.0",
     )
     # Use the barebones `hf-transfer` package for maximum download speeds. No progress bar, but expect 700MB/s.
     .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
@@ -46,26 +46,25 @@


 # ## The model class
-#
+
 # The inference function is best represented using Modal's [class syntax](https://modal.com/docs/guide/lifecycle-functions).
-#
+
 # We define a `@modal.build` method to download the model and a `@modal.enter` method to load the model.
 # `build` downloads the model from HuggingFace just once when our app is first run or deployed
 # and `enter` loads the model into memory just once when our inference function is first invoked.
-#
+
 # We also define a `transcribe` method that uses the `@modal.batched` decorator to enable dynamic batching.
 # This allows us to invoke the function with individual audio samples, and the function will automatically batch them
 # together before running inference. Batching is critical for making good use of the GPU, since GPUs are designed
 # for running parallel operations at high throughput.
-#
+
 # The `max_batch_size` parameter limits the maximum number of audio samples combined into a single batch.
 # We used a `max_batch_size` of `64`, the largest power-of-2 batch size that can be accommodated by the 24 GB of A10G GPU memory.
 # This number will vary depending on the model and the GPU you are using.
-#
+
 # The `wait_ms` parameter sets the maximum time to wait for more inputs before running the batched transcription.
 # To tune this parameter, you can set it to the target latency of your application minus the execution time of an inference batch.
 # This allows the latency of any request to stay within your target latency.
-#
 @app.cls(
@@ -134,9 +133,10 @@ def transcribe(self, audio_samples):


 # ## Transcribe a dataset
+
 # In this example, we use the [librispeech_asr_dummy dataset](https://huggingface.co/datasets/hf-internal-testing/librispeech_asr_dummy)
 # from Hugging Face's Datasets library to test the model.
-#
+
 # We use [`map.aio`](/docs/reference/modal.Function#map) to asynchronously map over the audio files.
 # This allows us to invoke the batched transcription method on each audio sample in parallel.

@@ -155,7 +155,7 @@ async def transcribe_hf_dataset(dataset_name):


 # ## Run the model
-#
+
 # We define a [`local_entrypoint`](https://modal.com/docs/guide/apps#entrypoints-for-ephemeral-apps)
 # to run the transcription. You can run this locally with `modal run batched_whisper.py`.
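
For context on the dynamic-batching wiring the comments describe, here is a minimal, self-contained sketch of a `@modal.batched` method on a Modal class. It is illustrative only: the `wait_ms` value, the pipeline setup, and the method body are assumptions, not the exact contents of `batched_whisper.py`; only `max_batch_size=64` is taken from the prose above.

```python
# Minimal sketch of dynamic batching on a Modal class. Illustrative only:
# the wait_ms value and the pipeline details are assumptions, not the
# exact implementation in batched_whisper.py.
import modal

app = modal.App("batched-whisper-sketch")
image = modal.Image.debian_slim(python_version="3.11").pip_install(
    "torch==2.5.1",
    "transformers==4.47.1",
    "librosa==0.10.2",
    "soundfile==0.12.1",
    "datasets==3.2.0",
)


@app.cls(gpu="a10g", image=image)
class Model:
    @modal.enter()
    def load(self):
        # Load the Whisper pipeline once per container start.
        from transformers import pipeline

        self.pipeline = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-large-v3",
            device="cuda",
        )

    # Modal collects up to `max_batch_size` individual calls, waiting at most
    # `wait_ms` milliseconds for the batch to fill, then runs them together.
    @modal.batched(max_batch_size=64, wait_ms=1000)
    def transcribe(self, audio_samples: list) -> list[str]:
        # A batched function receives a list of inputs and must return a list
        # of outputs with one entry per input.
        outputs = self.pipeline(audio_samples, batch_size=len(audio_samples))
        return [out["text"] for out in outputs]
```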
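The fan-out side works the same way as described in the comments: `map.aio` submits each audio sample as its own call, and the `@modal.batched` method groups those calls into batches on the server. The sketch below continues from the one above; the dataset loading, field names, and function name are assumptions for illustration, not necessarily identical to `transcribe_hf_dataset` in the example.

```python
# Sketch of fanning out transcription with map.aio and running it from a
# local entrypoint. Dataset handling here is illustrative.
@app.function(image=image)
async def transcribe_dataset_sketch(dataset_name: str):
    from datasets import load_dataset

    ds = load_dataset(dataset_name, "clean", split="validation")
    audio_arrays = [row["audio"]["array"] for row in ds]

    model = Model()
    # map.aio submits one call per sample; @modal.batched groups the calls
    # into batches of up to max_batch_size before inference runs.
    async for text in model.transcribe.map.aio(audio_arrays):
        print(text)


@app.local_entrypoint()
def main(dataset_name: str = "hf-internal-testing/librispeech_asr_dummy"):
    # Kick off the remote fan-out, e.g. via `modal run` on this sketch file.
    transcribe_dataset_sketch.remote(dataset_name)
```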