From eee86872eb992b6906fea6f59499ce607c282025 Mon Sep 17 00:00:00 2001 From: cathyzbn Date: Fri, 9 Aug 2024 21:56:37 +0000 Subject: [PATCH 1/4] batched whisper initial --- 06_gpu_and_ml/llm-serving/batched_whisper.py | 156 +++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 06_gpu_and_ml/llm-serving/batched_whisper.py diff --git a/06_gpu_and_ml/llm-serving/batched_whisper.py b/06_gpu_and_ml/llm-serving/batched_whisper.py new file mode 100644 index 000000000..f3cabca62 --- /dev/null +++ b/06_gpu_and_ml/llm-serving/batched_whisper.py @@ -0,0 +1,156 @@ +# # Fast Whisper inference using dynamic batching +# +# In this example, we demonstrate how to run batched inference for [OpenAI's Whisper](https://openai.com/index/whisper/), +# a speech recognition model. By batching multiple audio samples together or batching chunks of a single audio sample, +# we can achieve up to a 2.5x speedup in inference throughput with on an A100. +# +# We will be running the [Whisper Large V3](https://huggingface.co/openai/whisper-large-v3) model. +# To run [any of the other HuggingFace Whisper models](https://huggingface.co/models?search=openai/whisper), +# simply replace the `MODEL_NAME` and `MODEL_REVISION` variables. +# +# ## Setup +# +# First, we import the Modal client and define the model that we want to serve. + +import asyncio +import os +import time + +import modal +from datasets import load_dataset + +MODEL_DIR = "/model" +MODEL_NAME = "openai/whisper-large-v3" +MODEL_REVISION = "afda370583db9c5359511ed5d989400a6199dfe1" + +# ## Define a container image +# +# We want to create a Modal image that has the model weights pre-saved to a directory. The benefit of this +# is that the container no longer has to re-download the model from Hugging Face. Instead, it will take +# advantage of Modal's internal filesystem for faster cold starts. +# +# ### Download the weights +# We can download the model to a specific directory using the Hugging Face utility function `snapshot_download`. +# +# If you adapt this example to run another model, note that for this step to work on a +# [gated model](https://huggingface.co/docs/hub/en/models-gated), +# the `HF_TOKEN` environment variable must be set and provided as a [Modal Secret](https://modal.com/secrets). + +def download_model_to_image(model_dir, model_name, model_revision): + from huggingface_hub import snapshot_download + from transformers.utils import move_cache + + os.makedirs(model_dir, exist_ok=True) + + snapshot_download( + model_name, + local_dir=model_dir, + ignore_patterns=["*.pt", "*.bin"], # Using safetensors + revision=model_revision, + ) + move_cache() + +# ### Image Definition +# +# We’ll start with Modal's baseline `debian_slim` image and install the relevant libraries. +# Then we’ll use `run_function` with `download_model_to_image` to write the model into the container image. + +image = ( + modal.Image.debian_slim(python_version="3.11") + .pip_install( + "torch==2.1.2", + "transformers==4.39.3", + "hf-transfer==0.1.6", + "huggingface_hub==0.22.2", + "librosa==0.10.2", + "soundfile==0.12.1", + "datasets==2.20.0", + "accelerate==0.33.0", + ) + # Use the barebones `hf-transfer` package for maximum download speeds. No progress bar, but expect 700MB/s. 
+ .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}) + .run_function( + download_model_to_image, + timeout=60 * 20, + kwargs={ + "model_dir": MODEL_DIR, + "model_name": MODEL_NAME, + "model_revision": MODEL_REVISION, + }, + ) +) + +app = modal.App("example-whisper-batched-inference", image=image) + +# ## The model class +# +# The inference function is best represented using Modal's [class syntax](https://modal.com/docs/guide/lifecycle-functions), +# with a `load_model` method decorated with `@modal.enter`. This enables us to load the model into memory just once, +# every time a container starts up, and keep it cached on the GPU for subsequent invocations of the function. +# +# We also define a `transcribe` method that uses the `@modal.batched` decorator to enable dynamic batching. +# This allows us to invoke the function with individual audio samples, and the function will automatically batch them +# together before running inference. The `max_batch_size` parameter limits the batch size to a maximum of 128 audio samples +# at a time. The `wait_ms` parameter sets the maximum time to wait for more inputs before running the batched transcription. +# +# We selected a batch size of 128 because it is the largest power of 2 that fits within the 40GB A100 GPU memory. +# This number will vary depending on the model and the GPU you are using. To tune the `wait_ms` parameter, you can set it to +# `(targeted latency) - (execution time)`. Most applications have a targeted latency, and this allows the latency of +# any request to stay within that limit. +# +# Hint: Try using an H100 if you've got a large model or big batches! + +GPU_CONFIG = modal.gpu.A100(count=1) # 40GB A100 by default + +@app.cls(gpu=GPU_CONFIG, concurrency_limit=1) +class Model: + @modal.enter() + def load_model(self): + import torch + from transformers import ( + AutoModelForSpeechSeq2Seq, + AutoProcessor, + pipeline, + ) + self.processor = AutoProcessor.from_pretrained(MODEL_NAME) + self.model = AutoModelForSpeechSeq2Seq.from_pretrained( + MODEL_NAME, + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + use_safetensors=True, + ).to("cuda") + + # Create a pipeline for preprocessing speech data and transcribing it + self.pipeline = pipeline( + "automatic-speech-recognition", + model=self.model, + tokenizer=self.processor.tokenizer, + feature_extractor=self.processor.feature_extractor, + torch_dtype=torch.float16, + device="cuda" + ) + + @modal.batched(max_batch_size=128, wait_ms=4000) + def transcribe(self, audio_samples): + transcription = self.pipeline(audio_samples, batch_size=len(audio_samples)) + return transcription + + +# ## Run the model +# +# We define a [`local_entrypoint`](https://modal.com/docs/guide/apps#entrypoints-for-ephemeral-apps) +# to call our remote function sequentially for a list of inputs. You can run this locally with +# `modal run batched_whisper.py`. +# +# In this example, we use the [librispeech_asr_dummy dataset](https://huggingface.co/datasets/hf-internal-testing/librispeech_asr_dummy) +# from Hugging Face's Datasets library to test the model. +# +# We use [`map.aio`](/docs/reference/modal.Function#map) to asynchronously map over the audio files. +# This allows us to invoke the batched transcription method on each audio sample in parallel. 
+ +@app.local_entrypoint() +async def main(): + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + batched_whisper = Model() + async for transcription in batched_whisper.transcribe.map.aio(ds["audio"][:20]): + print("Transcription for audio 📻", transcription) From 99274ffec33d1bad098898693a3dc5a2ba6002b4 Mon Sep 17 00:00:00 2001 From: cathyzbn Date: Fri, 16 Aug 2024 14:26:01 +0000 Subject: [PATCH 2/4] fix tests + nit --- 06_gpu_and_ml/llm-serving/batched_whisper.py | 131 +++++++++---------- 1 file changed, 64 insertions(+), 67 deletions(-) diff --git a/06_gpu_and_ml/llm-serving/batched_whisper.py b/06_gpu_and_ml/llm-serving/batched_whisper.py index f3cabca62..5b5f933fa 100644 --- a/06_gpu_and_ml/llm-serving/batched_whisper.py +++ b/06_gpu_and_ml/llm-serving/batched_whisper.py @@ -1,8 +1,8 @@ # # Fast Whisper inference using dynamic batching # -# In this example, we demonstrate how to run batched inference for [OpenAI's Whisper](https://openai.com/index/whisper/), -# a speech recognition model. By batching multiple audio samples together or batching chunks of a single audio sample, -# we can achieve up to a 2.5x speedup in inference throughput with on an A100. +# In this example, we demonstrate how to run batched inference for OpenAI's speech recognition model, +# [Whisper](https://openai.com/index/whisper/). Batching multiple audio samples together or batching chunks +# of a single audio sample can help to achieve a 2.5x speedup in inference throughput on an A100! # # We will be running the [Whisper Large V3](https://huggingface.co/openai/whisper-large-v3) model. # To run [any of the other HuggingFace Whisper models](https://huggingface.co/models?search=openai/whisper), @@ -10,50 +10,20 @@ # # ## Setup # -# First, we import the Modal client and define the model that we want to serve. +# Let's start by importing the Modal client and defining the model that we want to serve. -import asyncio import os -import time import modal -from datasets import load_dataset MODEL_DIR = "/model" MODEL_NAME = "openai/whisper-large-v3" MODEL_REVISION = "afda370583db9c5359511ed5d989400a6199dfe1" -# ## Define a container image -# -# We want to create a Modal image that has the model weights pre-saved to a directory. The benefit of this -# is that the container no longer has to re-download the model from Hugging Face. Instead, it will take -# advantage of Modal's internal filesystem for faster cold starts. -# -# ### Download the weights -# We can download the model to a specific directory using the Hugging Face utility function `snapshot_download`. -# -# If you adapt this example to run another model, note that for this step to work on a -# [gated model](https://huggingface.co/docs/hub/en/models-gated), -# the `HF_TOKEN` environment variable must be set and provided as a [Modal Secret](https://modal.com/secrets). - -def download_model_to_image(model_dir, model_name, model_revision): - from huggingface_hub import snapshot_download - from transformers.utils import move_cache - os.makedirs(model_dir, exist_ok=True) - - snapshot_download( - model_name, - local_dir=model_dir, - ignore_patterns=["*.pt", "*.bin"], # Using safetensors - revision=model_revision, - ) - move_cache() - -# ### Image Definition +# ## Define a container image # # We’ll start with Modal's baseline `debian_slim` image and install the relevant libraries. -# Then we’ll use `run_function` with `download_model_to_image` to write the model into the container image. 
image = ( modal.Image.debian_slim(python_version="3.11") @@ -64,39 +34,35 @@ def download_model_to_image(model_dir, model_name, model_revision): "huggingface_hub==0.22.2", "librosa==0.10.2", "soundfile==0.12.1", - "datasets==2.20.0", "accelerate==0.33.0", + "datasets==2.20.0", ) # Use the barebones `hf-transfer` package for maximum download speeds. No progress bar, but expect 700MB/s. .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}) - .run_function( - download_model_to_image, - timeout=60 * 20, - kwargs={ - "model_dir": MODEL_DIR, - "model_name": MODEL_NAME, - "model_revision": MODEL_REVISION, - }, - ) ) app = modal.App("example-whisper-batched-inference", image=image) + # ## The model class # -# The inference function is best represented using Modal's [class syntax](https://modal.com/docs/guide/lifecycle-functions), -# with a `load_model` method decorated with `@modal.enter`. This enables us to load the model into memory just once, -# every time a container starts up, and keep it cached on the GPU for subsequent invocations of the function. +# The inference function is best represented using Modal's [class syntax](https://modal.com/docs/guide/lifecycle-functions). + +# We define a `@modal.build` method to download the model and a `@modal.enter` method to load the model. This allows +# the container to download the model from HuggingFace just once when it launches, load the model into memory just once +# every time a container starts up by caching it on the GPU for subsequent invocations of the function. # # We also define a `transcribe` method that uses the `@modal.batched` decorator to enable dynamic batching. # This allows us to invoke the function with individual audio samples, and the function will automatically batch them -# together before running inference. The `max_batch_size` parameter limits the batch size to a maximum of 128 audio samples -# at a time. The `wait_ms` parameter sets the maximum time to wait for more inputs before running the batched transcription. +# together before running inference. +# +# The `max_batch_size` parameter limits the maximum number of audio samples combined into a single batch. +# We used a `max_batch_size` of 128, the largest power of 2 that can be accommodated by the 40GB A100 GPU memory. This number +# will vary depending on the model and the GPU you are using. # -# We selected a batch size of 128 because it is the largest power of 2 that fits within the 40GB A100 GPU memory. -# This number will vary depending on the model and the GPU you are using. To tune the `wait_ms` parameter, you can set it to -# `(targeted latency) - (execution time)`. Most applications have a targeted latency, and this allows the latency of -# any request to stay within that limit. +# The `wait_ms` parameter sets the maximum time to wait for more inputs before running the batched transcription. +# To tune this parameter, you can set it to the target latency of your application minus the execution time of an inference batch. +# This allows the latency of any request to stay within your target latency. # # Hint: Try using an H100 if you've got a large model or big batches! 
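
To make the dynamic-batching contract described in the comments above concrete, here is a minimal, self-contained sketch. It is not part of this patch: the app name and the toy `batch_square` function are illustrative, and the call pattern simply mirrors how `@modal.batched` is used elsewhere in this example. Callers submit individual inputs, while the decorated function always receives a list of up to `max_batch_size` inputs and must return a list of results of the same length and order:

```python
import modal

app = modal.App("dynamic-batching-sketch")  # illustrative app name


@app.function()
@modal.batched(max_batch_size=4, wait_ms=1000)
def batch_square(xs: list[int]) -> list[int]:
    # The body always sees a list (one entry per queued call, up to
    # max_batch_size) and must return one output per input, in order.
    print(f"running a batch of {len(xs)} inputs")
    return [x * x for x in xs]


@app.local_entrypoint()
def main():
    # Callers send single items; Modal groups them into batches,
    # waiting at most `wait_ms` for a batch to fill up.
    print(list(batch_square.map(range(10))))
```

Applying the tuning rule above with purely illustrative numbers: a target latency of about 5 seconds and a batch that takes roughly 1 second to transcribe gives `wait_ms ≈ 4000`, which is the value this patch uses for the Whisper transcriber.
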
@@ -104,6 +70,21 @@ def download_model_to_image(model_dir, model_name, model_revision): @app.cls(gpu=GPU_CONFIG, concurrency_limit=1) class Model: + @modal.build() + def download_model(self): + from huggingface_hub import snapshot_download + from transformers.utils import move_cache + + os.makedirs(MODEL_DIR, exist_ok=True) + + snapshot_download( + MODEL_NAME, + local_dir=MODEL_DIR, + ignore_patterns=["*.pt", "*.bin"], # Using safetensors + revision=MODEL_REVISION, + ) + move_cache() + @modal.enter() def load_model(self): import torch @@ -112,6 +93,7 @@ def load_model(self): AutoProcessor, pipeline, ) + self.processor = AutoProcessor.from_pretrained(MODEL_NAME) self.model = AutoModelForSpeechSeq2Seq.from_pretrained( MODEL_NAME, @@ -120,37 +102,52 @@ def load_model(self): use_safetensors=True, ).to("cuda") - # Create a pipeline for preprocessing speech data and transcribing it + # Create a pipeline for preprocessing and transcribing speech data self.pipeline = pipeline( "automatic-speech-recognition", model=self.model, tokenizer=self.processor.tokenizer, feature_extractor=self.processor.feature_extractor, torch_dtype=torch.float16, - device="cuda" + device="cuda", ) @modal.batched(max_batch_size=128, wait_ms=4000) def transcribe(self, audio_samples): - transcription = self.pipeline(audio_samples, batch_size=len(audio_samples)) + transcription = self.pipeline( + audio_samples, batch_size=len(audio_samples) + ) return transcription -# ## Run the model -# -# We define a [`local_entrypoint`](https://modal.com/docs/guide/apps#entrypoints-for-ephemeral-apps) -# to call our remote function sequentially for a list of inputs. You can run this locally with -# `modal run batched_whisper.py`. -# +# ## Transcribe a dataset # In this example, we use the [librispeech_asr_dummy dataset](https://huggingface.co/datasets/hf-internal-testing/librispeech_asr_dummy) # from Hugging Face's Datasets library to test the model. # # We use [`map.aio`](/docs/reference/modal.Function#map) to asynchronously map over the audio files. # This allows us to invoke the batched transcription method on each audio sample in parallel. + +@app.function() +async def transcribe_hf_dataset(dataset_name): + from datasets import load_dataset + + ds = load_dataset( + dataset_name, "clean", split="validation" + ) + batched_whisper = Model() + async for transcription in batched_whisper.transcribe.map.aio( + ds["audio"] + ): + print("Transcription for audio 📻", transcription["text"]) + + +# ## Run the model +# +# We define a [`local_entrypoint`](https://modal.com/docs/guide/apps#entrypoints-for-ephemeral-apps) +# to run the transcription. You can run this locally with `modal run batched_whisper.py`. 
+ + @app.local_entrypoint() async def main(): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - batched_whisper = Model() - async for transcription in batched_whisper.transcribe.map.aio(ds["audio"][:20]): - print("Transcription for audio 📻", transcription) + transcribe_hf_dataset.remote("hf-internal-testing/librispeech_asr_dummy") From 7fe08745fb43ef1279d71c599a39c80c6cd69fae Mon Sep 17 00:00:00 2001 From: cathyzbn Date: Fri, 16 Aug 2024 14:26:48 +0000 Subject: [PATCH 3/4] ruff --- 06_gpu_and_ml/llm-serving/batched_whisper.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/06_gpu_and_ml/llm-serving/batched_whisper.py b/06_gpu_and_ml/llm-serving/batched_whisper.py index 5b5f933fa..e40146e73 100644 --- a/06_gpu_and_ml/llm-serving/batched_whisper.py +++ b/06_gpu_and_ml/llm-serving/batched_whisper.py @@ -68,6 +68,7 @@ GPU_CONFIG = modal.gpu.A100(count=1) # 40GB A100 by default + @app.cls(gpu=GPU_CONFIG, concurrency_limit=1) class Model: @modal.build() @@ -132,13 +133,9 @@ def transcribe(self, audio_samples): async def transcribe_hf_dataset(dataset_name): from datasets import load_dataset - ds = load_dataset( - dataset_name, "clean", split="validation" - ) + ds = load_dataset(dataset_name, "clean", split="validation") batched_whisper = Model() - async for transcription in batched_whisper.transcribe.map.aio( - ds["audio"] - ): + async for transcription in batched_whisper.transcribe.map.aio(ds["audio"]): print("Transcription for audio 📻", transcription["text"]) From f2a16c4a9cd0f4182df718d6acc2aaf18d6166c9 Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Sun, 18 Aug 2024 03:13:17 +0000 Subject: [PATCH 4/4] minor adjustments --- 06_gpu_and_ml/llm-serving/batched_whisper.py | 54 +++++++++++++------- 1 file changed, 36 insertions(+), 18 deletions(-) diff --git a/06_gpu_and_ml/llm-serving/batched_whisper.py b/06_gpu_and_ml/llm-serving/batched_whisper.py index e40146e73..e13a5892b 100644 --- a/06_gpu_and_ml/llm-serving/batched_whisper.py +++ b/06_gpu_and_ml/llm-serving/batched_whisper.py @@ -1,8 +1,9 @@ # # Fast Whisper inference using dynamic batching # -# In this example, we demonstrate how to run batched inference for OpenAI's speech recognition model, -# [Whisper](https://openai.com/index/whisper/). Batching multiple audio samples together or batching chunks -# of a single audio sample can help to achieve a 2.5x speedup in inference throughput on an A100! +# In this example, we demonstrate how to run [dynamically batched inference](https://modal.com/docs/guide/dynamic-batching) +# for OpenAI's speech recognition model, [Whisper](https://openai.com/index/whisper/), on Modal. +# Batching multiple audio samples together or batching chunks of a single audio sample can help to achieve a 2.5x increase +# in inference throughput on an A100! # # We will be running the [Whisper Large V3](https://huggingface.co/openai/whisper-large-v3) model. # To run [any of the other HuggingFace Whisper models](https://huggingface.co/models?search=openai/whisper), @@ -47,29 +48,30 @@ # ## The model class # # The inference function is best represented using Modal's [class syntax](https://modal.com/docs/guide/lifecycle-functions). - -# We define a `@modal.build` method to download the model and a `@modal.enter` method to load the model. 
This allows
-the container to download the model from HuggingFace just once when it launches, load the model into memory just once
-every time a container starts up by caching it on the GPU for subsequent invocations of the function.
+#
+# We define a `@modal.build` method to download the model and a `@modal.enter` method to load the model.
+# `build` downloads the model from HuggingFace just once when our app is first run or deployed
+# and `enter` loads the model into memory just once when our inference function is first invoked.
 #
 # We also define a `transcribe` method that uses the `@modal.batched` decorator to enable dynamic batching.
 # This allows us to invoke the function with individual audio samples, and the function will automatically batch them
-# together before running inference.
+# together before running inference. Batching is critical for making good use of the GPU, since GPUs are designed
+# for running parallel operations at high throughput.
 #
 # The `max_batch_size` parameter limits the maximum number of audio samples combined into a single batch.
-# We used a `max_batch_size` of 128, the largest power of 2 that can be accommodated by the 40GB A100 GPU memory. This number
-# will vary depending on the model and the GPU you are using.
+# We used a `max_batch_size` of `64`, the largest power-of-2 batch size that can be accommodated by the A10G's 24 GB of GPU memory.
+# This number will vary depending on the model and the GPU you are using.
 #
 # The `wait_ms` parameter sets the maximum time to wait for more inputs before running the batched transcription.
 # To tune this parameter, you can set it to the target latency of your application minus the execution time of an inference batch.
 # This allows the latency of any request to stay within your target latency.
 #
-# Hint: Try using an H100 if you've got a large model or big batches!
 
-GPU_CONFIG = modal.gpu.A100(count=1)  # 40GB A100 by default
-
-@app.cls(gpu=GPU_CONFIG, concurrency_limit=1)
+@app.cls(
+    gpu="a10g",  # Try using an A100 or H100 if you've got a large model or need big batches!
+    concurrency_limit=10,  # default max GPUs for Modal's free tier
+)
 class Model:
     @modal.build()
     def download_model(self):
@@ -103,6 +105,8 @@ def load_model(self):
             use_safetensors=True,
         ).to("cuda")
 
+        self.model.generation_config.language = "<|en|>"
+
         # Create a pipeline for preprocessing and transcribing speech data
         self.pipeline = pipeline(
             "automatic-speech-recognition",
@@ -113,11 +117,19 @@ def load_model(self):
             device="cuda",
         )
 
-    @modal.batched(max_batch_size=128, wait_ms=4000)
+    @modal.batched(max_batch_size=64, wait_ms=1000)
     def transcribe(self, audio_samples):
+        import time
+
+        start = time.monotonic_ns()
+        print(f"Transcribing {len(audio_samples)} audio samples")
         transcription = self.pipeline(
             audio_samples, batch_size=len(audio_samples)
         )
+        end = time.monotonic_ns()
+        print(
+            f"Transcribed {len(audio_samples)} samples in {round((end - start) / 1e9, 2)}s"
+        )
         return transcription
 
 
@@ -133,10 +145,13 @@ def transcribe(self, audio_samples):
 async def transcribe_hf_dataset(dataset_name):
     from datasets import load_dataset
 
+    print("📂 Loading dataset", dataset_name)
     ds = load_dataset(dataset_name, "clean", split="validation")
+    print("📂 Dataset loaded")
     batched_whisper = Model()
+    print("📣 Sending data for transcription")
     async for transcription in batched_whisper.transcribe.map.aio(ds["audio"]):
-        print("Transcription for audio 📻", transcription["text"])
+        yield transcription
 
 
 # ## Run the model
@@ -146,5 +161,8 @@ def transcribe(self, audio_samples):
 
 
 @app.local_entrypoint()
-async def main():
-    transcribe_hf_dataset.remote("hf-internal-testing/librispeech_asr_dummy")
+async def main(dataset_name: str = None):
+    if dataset_name is None:
+        dataset_name = "hf-internal-testing/librispeech_asr_dummy"
+    for result in transcribe_hf_dataset.remote_gen(dataset_name):
+        print(result["text"])
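
As a usage sketch to accompany the final version of the example: once the app is deployed (for instance with `modal deploy batched_whisper.py`), the batched transcriber can also be invoked from outside the app. The snippet below is illustrative rather than part of this PR. It assumes `modal` and `librosa` are installed locally, looks the class up by the app name defined above (the exact lookup helper may differ across Modal client versions), uses a hypothetical local audio file, and splits a long recording into 30-second chunks so that Modal can batch the chunks together, as described in the example's introduction. The dict-of-waveform input follows the accepted input formats of the Hugging Face ASR pipeline.

```python
# transcribe_local_file.py -- illustrative client for the deployed app above.
import modal

SAMPLE_RATE = 16_000  # Whisper's feature extractor expects 16 kHz audio
CHUNK_SECONDS = 30  # Whisper's context window; chunks get batched on the GPU

# Look up the deployed class by the app name used in batched_whisper.py.
Model = modal.Cls.lookup("example-whisper-batched-inference", "Model")


def transcribe_file(path: str) -> str:
    import librosa  # assumed to be installed locally

    # Load and resample locally so the container never needs to resample.
    waveform, _ = librosa.load(path, sr=SAMPLE_RATE)

    # Split one long recording into 30-second chunks. The Hugging Face ASR
    # pipeline accepts dicts carrying a raw waveform and its sampling rate.
    chunk_len = CHUNK_SECONDS * SAMPLE_RATE
    chunks = [
        {"raw": waveform[i : i + chunk_len], "sampling_rate": SAMPLE_RATE}
        for i in range(0, len(waveform), chunk_len)
    ]

    # .map() submits chunks one at a time; @modal.batched regroups them
    # into GPU-sized batches on the server, as in the example above.
    pieces = Model().transcribe.map(chunks)
    return " ".join(piece["text"].strip() for piece in pieces)


if __name__ == "__main__":
    print(transcribe_file("meeting_recording.wav"))  # hypothetical file
```
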