
Commit

Migrate examples away from @modal.build (#1038)

* Migrate batched_whisper example to use a Volume for the HuggingFace cache

* Update webcam example to use a Volume cache

* Migrate wikipedia example to use a Volume

* Migrate sentence transformers example to use run_function

* Shorten the name used for the cache volume

* Migrate cloud bucket loras example to use a Volume cache

* Fix vestigial comment

* Migrate pdf vision example to use a cache volume

* Migrate text_to_image example to use a cache volume

* Migrate trellis3d example to use volume caching

* Migrate segment_anything example

* Migrate the flux example

* Migrate image_to_image example

* Remove a couple vestigial references to '@build' in comments

* Fixes
mwaskom authored Jan 10, 2025
1 parent 9365ed0 commit 38449e5
Showing 11 changed files with 225 additions and 177 deletions.
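
Most of the diffs below follow one basic pattern: instead of downloading model weights into the container image with a `@modal.build` step, each example now mounts a named `modal.Volume` and points the Hugging Face cache at it (one example, `gpu_packing.py`, instead bakes the model into the image with `run_function`). The following is a rough sketch of that Volume-cache pattern; the model name, app name, and cache path are illustrative and not taken from any single file in this commit:

import modal

CACHE_DIR = "/hf-hub-cache"  # illustrative mount point for the Hugging Face cache

image = (
    modal.Image.debian_slim(python_version="3.12")
    .pip_install("huggingface-hub", "transformers", "timm", "torch")
    # Hugging Face libraries write their downloads into this directory, i.e. the Volume below.
    .env({"HF_HUB_CACHE": CACHE_DIR})
)

cache_volume = modal.Volume.from_name("hf-hub-cache", create_if_missing=True)

app = modal.App("volume-cache-sketch", image=image)


@app.function(volumes={CACHE_DIR: cache_volume})
def download_model():
    # Optional warm-up: running this once populates the cache before deployment;
    # otherwise the weights are downloaded on the first inference and cached then.
    from huggingface_hub import snapshot_download

    snapshot_download("facebook/detr-resnet-50")


@app.cls(gpu="any", volumes={CACHE_DIR: cache_volume})
class Model:
    # What used to be a separate @modal.build download step is now just @modal.enter:
    # weights are read from the Volume-backed cache when the container starts.
    @modal.enter()
    def load(self):
        from transformers import DetrForObjectDetection

        self.model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")
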
11 changes: 7 additions & 4 deletions 06_gpu_and_ml/embeddings/wikipedia/main.py
@@ -18,13 +18,17 @@
)

## Dataset-Specific Configuration
MODEL_CACHE_VOLUME = modal.Volume.from_name(
"embedding-model-cache", create_if_missing=True
)
DATASET_NAME = "wikipedia"
DATASET_READ_VOLUME = modal.Volume.from_name(
"embedding-wikipedia", create_if_missing=True
)
EMBEDDING_CHECKPOINT_VOLUME = modal.Volume.from_name(
"checkpoint", create_if_missing=True
)
MODEL_DIR = "/model"
DATASET_DIR = "/data"
CHECKPOINT_DIR = "/checkpoint"
SAVE_TO_DISK = True
@@ -44,6 +48,8 @@
str(BATCH_SIZE),
"--max-batch-tokens",
str(BATCH_SIZE * 512),
"--huggingface-hub-cache",
MODEL_DIR,
]


@@ -128,10 +134,6 @@ def generate_batches(xs, batch_size):
retries=3,
)
class TextEmbeddingsInference:
@modal.build()
def download_model(self):
spawn_server()

@modal.enter()
def open_connection(self):
# If the process is running for a long time, the client does not seem to close the connections, which results in a pool timeout
@@ -262,6 +264,7 @@ def upload_result_to_hf(batch_size: int) -> None:
volumes={
DATASET_DIR: DATASET_READ_VOLUME,
CHECKPOINT_DIR: EMBEDDING_CHECKPOINT_VOLUME,
MODEL_DIR: MODEL_CACHE_VOLUME,
},
timeout=86400,
secrets=[modal.Secret.from_name("huggingface-secret")],
21 changes: 15 additions & 6 deletions 06_gpu_and_ml/gpu_packing.py
@@ -12,7 +12,21 @@

import modal

image = modal.Image.debian_slim().pip_install("sentence-transformers==3.2.0")
MODEL_PATH = "/model.bge"


def download_model():
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("BAAI/bge-small-en-v1.5")
model.save(MODEL_PATH)


image = (
modal.Image.debian_slim()
.pip_install("sentence-transformers==3.2.0")
.run_function(download_model)
)

app = modal.App("gpu-packing", image=image)

@@ -49,11 +63,6 @@ def __init__(self, n_models=10):
self.model_pool = ModelPool()
self.n_models = n_models

@modal.build()
def download(self):
model = SentenceTransformer("BAAI/bge-small-en-v1.5")
model.save("/model.bge")

@modal.enter()
async def load_models(self):
# Boot N models onto the gpu, and place into the pool
78 changes: 38 additions & 40 deletions 06_gpu_and_ml/llm-serving/chat_with_pdf_vision.py
@@ -13,7 +13,6 @@

# First, we’ll import the libraries we need locally and define some constants.

import os
from pathlib import Path
from urllib.request import urlopen
from uuid import uuid4
@@ -22,11 +21,15 @@

MINUTES = 60 # seconds

app = modal.App("chat-with-pdf")

# ## Setting up dependencies

# In Modal, we define [container images](https://modal.com/docs/guide/custom-container) that run our serverless workloads.
# We install the packages required for our application in those images.

CACHE_DIR = "/hf-cache"

model_image = (
modal.Image.debian_slim(python_version="3.12")
.apt_install("git")
@@ -38,6 +41,7 @@
"torchvision==0.19.1",
]
)
.env({"HF_HUB_ENABLE_HF_TRANSFER": "1", "HF_HUB_CACHE": CACHE_DIR})
)

# These dependencies are only installed remotely, so we can't import them locally.
@@ -49,40 +53,13 @@
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

# ## Downloading ColQwen2
# ## Specifying the ColQwen2 model

# Vision-language models (VLMs) for embedding and generation add another layer of simplification
# to RAG apps based on vector search: we only need one model.
# Here, we use the Qwen2-VL-2B-Instruct model from Alibaba.
# The function below downloads the model from the Hugging Face Hub.


def download_model(model_dir, model_name, model_revision):
from huggingface_hub import snapshot_download
from transformers.utils import move_cache

os.makedirs(model_dir, exist_ok=True)
snapshot_download(
model_name,
local_dir=model_dir,
revision=model_revision,
ignore_patterns=["*.pt", "*.bin"], # using safetensors
)
move_cache()


# We can also include other files that our application needs in the container image.
# Here, we add the model weights to the image by executing our `download_model` function.

model_image = model_image.env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}).run_function(
download_model,
timeout=20 * MINUTES,
kwargs={
"model_dir": "/model-qwen2-VL-2B-Instruct",
"model_name": "Qwen/Qwen2-VL-2B-Instruct",
"model_revision": "aca78372505e6cb469c4fa6a35c60265b00ff5a4",
},
)
MODEL_NAME = "Qwen/Qwen2-VL-2B-Instruct"
MODEL_REVISION = "aca78372505e6cb469c4fa6a35c60265b00ff5a4"

# ## Managing state with Modal Volumes and Dicts

@@ -140,6 +117,30 @@ def __init__(self):
pdf_volume = modal.Volume.from_name("colqwen-chat-pdfs", create_if_missing=True)
PDF_ROOT = Path("/vol/pdfs/")

# ### Caching the model weights

# We'll also use a Volume to cache the model weights.

cache_volume = modal.Volume.from_name("hf-hub-cache", create_if_missing=True)


# Running this function will download the model weights to the cache volume.
# Otherwise, the model weights will be downloaded on the first query.


@app.function(
image=model_image, volumes={CACHE_DIR: cache_volume}, timeout=20 * MINUTES
)
def download_model():
from huggingface_hub import snapshot_download

result = snapshot_download(
MODEL_NAME,
revision=MODEL_REVISION,
ignore_patterns=["*.pt", "*.bin"], # using safetensors
)
print(f"Downloaded model weights to {result}")


# ## Defining a Chat with PDF service

@@ -148,25 +149,20 @@ def __init__(self):

# It uses [Modal `@app.cls`](https://modal.com/docs/guide/lifecycle-functions) decorators
# to organize the "lifecycle" of the app:
# to ensure all model files are downloaded (`@modal.build`)
# to load the model on container start (`@modal.enter`)
# and to run inference on request (`@modal.method`).
# loading the model on container start (`@modal.enter`) and running inference on request (`@modal.method`).

# We include in the arguments to the `@app.cls` decorator
# all the information about this service's infrastructure:
# the container image, the remote storage, and the GPU requirements.

app = modal.App("chat-with-pdf")


@app.cls(
image=model_image,
gpu=modal.gpu.A100(size="80GB"),
container_idle_timeout=10 * MINUTES, # spin down when inactive
volumes={"/vol/pdfs/": pdf_volume},
volumes={"/vol/pdfs/": pdf_volume, CACHE_DIR: cache_volume},
)
class Model:
@modal.build()
@modal.enter()
def load_models(self):
self.colqwen2_model = ColQwen2.from_pretrained(
@@ -178,7 +174,9 @@ def load_models(self):
"vidore/colqwen2-v0.1"
)
self.qwen2_vl_model = Qwen2VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen2-VL-2B-Instruct", torch_dtype=torch.bfloat16
MODEL_NAME,
revision=MODEL_REVISION,
torch_dtype=torch.bfloat16,
)
self.qwen2_vl_model.to("cuda:0")
self.qwen2_vl_processor = AutoProcessor.from_pretrained(
@@ -320,7 +318,7 @@ def generate_response(self, message, session, image):
pdf_image = (
modal.Image.debian_slim(python_version="3.12")
.apt_install("poppler-utils")
.pip_install("pdf2image==1.17.0")
.pip_install("pdf2image==1.17.0", "pillow==10.4.0")
)


39 changes: 23 additions & 16 deletions 06_gpu_and_ml/obj_detection_webcam/webcam.py
@@ -40,22 +40,24 @@
# [Pillow](https://python-pillow.org/) which lets us work with images from Python,
# and a system font for drawing.

# This example uses the `facebook/detr-resnet-50` pre-trained model, which is downloaded
# once at image build time using the `@build` hook and saved into the image.
# This example uses the `facebook/detr-resnet-50` pre-trained model,
# which we'll cache to a Volume for fast cold starts.

model_repo_id = "facebook/detr-resnet-50"
MODEL_REPO_ID = "facebook/detr-resnet-50"
MODEL_DIR = "/cache"


app = modal.App("example-webcam-object-detection")
image = (
modal.Image.debian_slim(python_version="3.12")
.pip_install(
"huggingface-hub==0.16.4",
"huggingface-hub==0.27.1",
"Pillow",
"timm",
"transformers",
)
.apt_install("fonts-freefont-ttf")
.env({"HF_HUB_CACHE": MODEL_DIR})
)


@@ -67,9 +69,6 @@
# which runs on every container start. This lets us load the model only once per
# container, so that it's reused for subsequent function calls.

# * Above we stored the model in the container image. This lets us download the model only
# when the image is (re)built, and not every time the function is called.

# * We're running it on multiple CPUs for extra performance

# Note that the function takes an image and returns a new image.
@@ -86,21 +85,29 @@
from transformers import DetrForObjectDetection, DetrImageProcessor


@app.cls(image=image)
class ObjectDetection:
@modal.build()
def download_model(self):
snapshot_download(repo_id=model_repo_id, cache_dir="/cache")
# We'll store the model weights in a Volume and provide a function that you can
# `modal run` against to download the model weights prior to deploying the App.
# Otherwise, the model weights will be downloaded for the first inference
# and cached to the Volume when the first container exits.

cache_volume = modal.Volume.from_name("hf-hub-cache", create_if_missing=True)


@app.function(image=image, volumes={MODEL_DIR: cache_volume})
def download_model():
loc = snapshot_download(repo_id=MODEL_REPO_ID)
print(f"Saved model to {loc}")


@app.cls(image=image, volumes={MODEL_DIR: cache_volume})
class ObjectDetection:
@modal.enter()
def load_model(self):
self.feature_extractor = DetrImageProcessor.from_pretrained(
model_repo_id,
cache_dir="/cache",
MODEL_REPO_ID,
)
self.model = DetrForObjectDetection.from_pretrained(
model_repo_id,
cache_dir="/cache",
MODEL_REPO_ID,
)

@modal.method()
49 changes: 28 additions & 21 deletions 06_gpu_and_ml/openai_whisper/batched_whisper.py
@@ -13,7 +13,6 @@

# Let's start by importing the Modal client and defining the model that we want to serve.

import os

import modal

@@ -39,19 +38,42 @@
"datasets==3.2.0",
)
# Use the barebones `hf-transfer` package for maximum download speeds. No progress bar, but expect 700MB/s.
.env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
.env({"HF_HUB_ENABLE_HF_TRANSFER": "1", "HF_HUB_CACHE": MODEL_DIR})
)

app = modal.App("example-whisper-batched-inference", image=image)
model_cache = modal.Volume.from_name("hf-hub-cache", create_if_missing=True)
app = modal.App(
"example-whisper-batched-inference",
image=image,
volumes={MODEL_DIR: model_cache},
)

# ## Caching the model weights

# We'll define a function to download the model and cache it in a volume.
# You can `modal run` against this function prior to deploying the App.


@app.function()
def download_model():
from huggingface_hub import snapshot_download
from transformers.utils import move_cache

snapshot_download(
MODEL_NAME,
ignore_patterns=["*.pt", "*.bin"], # Using safetensors
revision=MODEL_REVISION,
)
move_cache()


# ## The model class

# The inference function is best represented using Modal's [class syntax](https://modal.com/docs/guide/lifecycle-functions).

# We define a `@modal.build` method to download the model and a `@modal.enter` method to load the model.
# `build` downloads the model from HuggingFace just once when our app is first run or deployed
# and `enter` loads the model into memory just once when our inference function is first invoked.
# We define a `@modal.enter` method to load the model when the container starts, before it picks up any inputs.
# The weights will be loaded from the Hugging Face cache volume so that we don't need to download them when
# we start a new container.

# We also define a `transcribe` method that uses the `@modal.batched` decorator to enable dynamic batching.
# This allows us to invoke the function with individual audio samples, and the function will automatically batch them
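
# The `transcribe` method itself is not visible in the collapsed hunks below. As a rough sketch of how `@modal.batched` dynamic batching works (the parameter values, method body, and `self.pipeline` attribute are assumptions for illustration, not taken from this commit):

    @modal.batched(max_batch_size=64, wait_ms=1000)
    def transcribe(self, audio_samples: list) -> list:
        # Each caller passes a single sample; Modal groups concurrent calls into one
        # batch of up to max_batch_size items, waiting at most wait_ms to fill it.
        # A batched method receives a list and must return a list of equal length.
        outputs = self.pipeline(audio_samples, batch_size=len(audio_samples))
        return [output["text"] for output in outputs]
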
@@ -72,21 +94,6 @@
concurrency_limit=10, # default max GPUs for Modal's free tier
)
class Model:
@modal.build()
def download_model(self):
from huggingface_hub import snapshot_download
from transformers.utils import move_cache

os.makedirs(MODEL_DIR, exist_ok=True)

snapshot_download(
MODEL_NAME,
local_dir=MODEL_DIR,
ignore_patterns=["*.pt", "*.bin"], # Using safetensors
revision=MODEL_REVISION,
)
move_cache()

@modal.enter()
def load_model(self):
import torch