
Commit

Migrate examples away from @modal.build (#1038)

* Migrate batched_whisper example to use a Volume for the HuggingFace cache

* Update webcam example to use a Volume cache

* Migrate wikipedia example to use a Volume

* Migrate sentence transformers example to use run_function

* Shorten the name used for the cache volume

* Migrate cloud bucket loras example to use a Volume cache

* Fix vestigial comment

* Migrate pdf vision example to use a cache volume

* Migrate text_to_image example to use a cache volume

* Migrate trellis3d example to use volume caching

* Migrate segment_anything example

* Migrate the flux example

* Migrate image_to_image example

* Remove a couple vestigial references to '@build' in comments

* Fixes
mwaskom authored Jan 10, 2025
1 parent 9365ed0 commit 38449e5
Showing 11 changed files with 225 additions and 177 deletions.
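
Most of the diffs below follow one basic pattern: instead of downloading model weights into the container image with a `@modal.build` step, each example now mounts a named `modal.Volume` and points the Hugging Face cache at it (one example, `gpu_packing.py`, instead bakes the model into the image with `run_function`). The following is a rough sketch of that Volume-cache pattern; the model name, app name, and cache path are illustrative and not taken from any single file in this commit:

import modal

CACHE_DIR = "/hf-hub-cache"  # illustrative mount point for the Hugging Face cache

image = (
    modal.Image.debian_slim(python_version="3.12")
    .pip_install("huggingface-hub", "transformers", "timm", "torch")
    # Hugging Face libraries write their downloads into this directory, i.e. the Volume below.
    .env({"HF_HUB_CACHE": CACHE_DIR})
)

cache_volume = modal.Volume.from_name("hf-hub-cache", create_if_missing=True)

app = modal.App("volume-cache-sketch", image=image)


@app.function(volumes={CACHE_DIR: cache_volume})
def download_model():
    # Optional warm-up: running this once populates the cache before deployment;
    # otherwise the weights are downloaded on the first inference and cached then.
    from huggingface_hub import snapshot_download

    snapshot_download("facebook/detr-resnet-50")


@app.cls(gpu="any", volumes={CACHE_DIR: cache_volume})
class Model:
    # What used to be a separate @modal.build download step is now just @modal.enter:
    # weights are read from the Volume-backed cache when the container starts.
    @modal.enter()
    def load(self):
        from transformers import DetrForObjectDetection

        self.model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")
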
11 changes: 7 additions & 4 deletions 06_gpu_and_ml/embeddings/wikipedia/main.py
@@ -18,13 +18,17 @@
)

## Dataset-Specific Configuration
MODEL_CACHE_VOLUME = modal.Volume.from_name(
"embedding-model-cache", create_if_missing=True
)
DATASET_NAME = "wikipedia"
DATASET_READ_VOLUME = modal.Volume.from_name(
"embedding-wikipedia", create_if_missing=True
)
EMBEDDING_CHECKPOINT_VOLUME = modal.Volume.from_name(
"checkpoint", create_if_missing=True
)
MODEL_DIR = "/model"
DATASET_DIR = "/data"
CHECKPOINT_DIR = "/checkpoint"
SAVE_TO_DISK = True
@@ -44,6 +48,8 @@
str(BATCH_SIZE),
"--max-batch-tokens",
str(BATCH_SIZE * 512),
"--huggingface-hub-cache",
MODEL_DIR,
]


@@ -128,10 +134,6 @@ def generate_batches(xs, batch_size):
retries=3,
)
class TextEmbeddingsInference:
@modal.build()
def download_model(self):
spawn_server()

@modal.enter()
def open_connection(self):
# If the process is running for a long time, the client does not seem to close the connections, which results in a pool timeout
@@ -262,6 +264,7 @@ def upload_result_to_hf(batch_size: int) -> None:
volumes={
DATASET_DIR: DATASET_READ_VOLUME,
CHECKPOINT_DIR: EMBEDDING_CHECKPOINT_VOLUME,
MODEL_DIR: MODEL_CACHE_VOLUME,
},
timeout=86400,
secrets=[modal.Secret.from_name("huggingface-secret")],
21 changes: 15 additions & 6 deletions 06_gpu_and_ml/gpu_packing.py
@@ -12,7 +12,21 @@

import modal

image = modal.Image.debian_slim().pip_install("sentence-transformers==3.2.0")
MODEL_PATH = "/model.bge"


def download_model():
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("BAAI/bge-small-en-v1.5")
model.save(MODEL_PATH)


image = (
modal.Image.debian_slim()
.pip_install("sentence-transformers==3.2.0")
.run_function(download_model)
)

app = modal.App("gpu-packing", image=image)

@@ -49,11 +63,6 @@ def __init__(self, n_models=10):
self.model_pool = ModelPool()
self.n_models = n_models

@modal.build()
def download(self):
model = SentenceTransformer("BAAI/bge-small-en-v1.5")
model.save("/model.bge")

@modal.enter()
async def load_models(self):
# Boot N models onto the gpu, and place into the pool
78 changes: 38 additions & 40 deletions 06_gpu_and_ml/llm-serving/chat_with_pdf_vision.py
@@ -13,7 +13,6 @@

# First, we’ll import the libraries we need locally and define some constants.

import os
from pathlib import Path
from urllib.request import urlopen
from uuid import uuid4
@@ -22,11 +21,15 @@

MINUTES = 60 # seconds

app = modal.App("chat-with-pdf")

# ## Setting up dependencies

# In Modal, we define [container images](https://modal.com/docs/guide/custom-container) that run our serverless workloads.
# We install the packages required for our application in those images.

CACHE_DIR = "/hf-cache"

model_image = (
modal.Image.debian_slim(python_version="3.12")
.apt_install("git")
@@ -38,6 +41,7 @@
"torchvision==0.19.1",
]
)
.env({"HF_HUB_ENABLE_HF_TRANSFER": "1", "HF_HUB_CACHE": CACHE_DIR})
)

# These dependencies are only installed remotely, so we can't import them locally.
@@ -49,40 +53,13 @@
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

# ## Downloading ColQwen2
# ## Specifying the ColQwen2 model

# Vision-language models (VLMs) for embedding and generation add another layer of simplification
# to RAG apps based on vector search: we only need one model.
# Here, we use the Qwen2-VL-2B-Instruct model from Alibaba.
# The function below downloads the model from the Hugging Face Hub.


def download_model(model_dir, model_name, model_revision):
from huggingface_hub import snapshot_download
from transformers.utils import move_cache

os.makedirs(model_dir, exist_ok=True)
snapshot_download(
model_name,
local_dir=model_dir,
revision=model_revision,
ignore_patterns=["*.pt", "*.bin"], # using safetensors
)
move_cache()


# We can also include other files that our application needs in the container image.
# Here, we add the model weights to the image by executing our `download_model` function.

model_image = model_image.env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}).run_function(
download_model,
timeout=20 * MINUTES,
kwargs={
"model_dir": "/model-qwen2-VL-2B-Instruct",
"model_name": "Qwen/Qwen2-VL-2B-Instruct",
"model_revision": "aca78372505e6cb469c4fa6a35c60265b00ff5a4",
},
)
MODEL_NAME = "Qwen/Qwen2-VL-2B-Instruct"
MODEL_REVISION = "aca78372505e6cb469c4fa6a35c60265b00ff5a4"

# ## Managing state with Modal Volumes and Dicts

@@ -140,6 +117,30 @@ def __init__(self):
pdf_volume = modal.Volume.from_name("colqwen-chat-pdfs", create_if_missing=True)
PDF_ROOT = Path("/vol/pdfs/")

# ### Caching the model weights

# We'll also use a Volume to cache the model weights.

cache_volume = modal.Volume.from_name("hf-hub-cache", create_if_missing=True)


# Running this function will download the model weights to the cache volume.
# Otherwise, the model weights will be downloaded on the first query.


@app.function(
image=model_image, volumes={CACHE_DIR: cache_volume}, timeout=20 * MINUTES
)
def download_model():
from huggingface_hub import snapshot_download

result = snapshot_download(
MODEL_NAME,
revision=MODEL_REVISION,
ignore_patterns=["*.pt", "*.bin"], # using safetensors
)
print(f"Downloaded model weights to {result}")


# ## Defining a Chat with PDF service

@@ -148,25 +149,20 @@ def __init__(self):

# It uses [Modal `@app.cls`](https://modal.com/docs/guide/lifecycle-functions) decorators
# to organize the "lifecycle" of the app:
# to ensure all model files are downloaded (`@modal.build`)
# to load the model on container start (`@modal.enter`)
# and to run inference on request (`@modal.method`).
# loading the model on container start (`@modal.enter`) and running inference on request (`@modal.method`).

# We include in the arguments to the `@app.cls` decorator
# all the information about this service's infrastructure:
# the container image, the remote storage, and the GPU requirements.

app = modal.App("chat-with-pdf")


@app.cls(
image=model_image,
gpu=modal.gpu.A100(size="80GB"),
container_idle_timeout=10 * MINUTES, # spin down when inactive
volumes={"/vol/pdfs/": pdf_volume},
volumes={"/vol/pdfs/": pdf_volume, CACHE_DIR: cache_volume},
)
class Model:
@modal.build()
@modal.enter()
def load_models(self):
self.colqwen2_model = ColQwen2.from_pretrained(
@@ -178,7 +174,9 @@ def load_models(self):
"vidore/colqwen2-v0.1"
)
self.qwen2_vl_model = Qwen2VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen2-VL-2B-Instruct", torch_dtype=torch.bfloat16
MODEL_NAME,
revision=MODEL_REVISION,
torch_dtype=torch.bfloat16,
)
self.qwen2_vl_model.to("cuda:0")
self.qwen2_vl_processor = AutoProcessor.from_pretrained(
@@ -320,7 +318,7 @@ def generate_response(self, message, session, image):
pdf_image = (
modal.Image.debian_slim(python_version="3.12")
.apt_install("poppler-utils")
.pip_install("pdf2image==1.17.0")
.pip_install("pdf2image==1.17.0", "pillow==10.4.0")
)


39 changes: 23 additions & 16 deletions 06_gpu_and_ml/obj_detection_webcam/webcam.py
@@ -40,22 +40,24 @@
# [Pillow](https://python-pillow.org/) which lets us work with images from Python,
# and a system font for drawing.

# This example uses the `facebook/detr-resnet-50` pre-trained model, which is downloaded
# once at image build time using the `@build` hook and saved into the image.
# This example uses the `facebook/detr-resnet-50` pre-trained model,
# which we'll cache to a Volume for fast cold starts.

model_repo_id = "facebook/detr-resnet-50"
MODEL_REPO_ID = "facebook/detr-resnet-50"
MODEL_DIR = "/cache"


app = modal.App("example-webcam-object-detection")
image = (
modal.Image.debian_slim(python_version="3.12")
.pip_install(
"huggingface-hub==0.16.4",
"huggingface-hub==0.27.1",
"Pillow",
"timm",
"transformers",
)
.apt_install("fonts-freefont-ttf")
.env({"HF_HUB_CACHE": MODEL_DIR})
)


@@ -67,9 +69,6 @@
# which runs on every container start. This lets us load the model only once per
# container, so that it's reused for subsequent function calls.

# * Above we stored the model in the container image. This lets us download the model only
# when the image is (re)built, and not every time the function is called.

# * We're running it on multiple CPUs for extra performance

# Note that the function takes an image and returns a new image.
@@ -86,21 +85,29 @@
from transformers import DetrForObjectDetection, DetrImageProcessor


@app.cls(image=image)
class ObjectDetection:
@modal.build()
def download_model(self):
snapshot_download(repo_id=model_repo_id, cache_dir="/cache")
# We'll store the model weights in a Volume and provide a function that you can
# `modal run` against to download the model weights prior to deploying the App.
# Otherwise, the model weights will be downloaded for the first inference
# and cached to the Volume when the first container exits.

cache_volume = modal.Volume.from_name("hf-hub-cache", create_if_missing=True)


@app.function(image=image, volumes={MODEL_DIR: cache_volume})
def download_model():
loc = snapshot_download(repo_id=MODEL_REPO_ID)
print(f"Saved model to {loc}")


@app.cls(image=image, volumes={MODEL_DIR: cache_volume})
class ObjectDetection:
@modal.enter()
def load_model(self):
self.feature_extractor = DetrImageProcessor.from_pretrained(
model_repo_id,
cache_dir="/cache",
MODEL_REPO_ID,
)
self.model = DetrForObjectDetection.from_pretrained(
model_repo_id,
cache_dir="/cache",
MODEL_REPO_ID,
)

@modal.method()
49 changes: 28 additions & 21 deletions 06_gpu_and_ml/openai_whisper/batched_whisper.py
@@ -13,7 +13,6 @@

# Let's start by importing the Modal client and defining the model that we want to serve.

import os

import modal

@@ -39,19 +38,42 @@
"datasets==3.2.0",
)
# Use the barebones `hf-transfer` package for maximum download speeds. No progress bar, but expect 700MB/s.
.env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
.env({"HF_HUB_ENABLE_HF_TRANSFER": "1", "HF_HUB_CACHE": MODEL_DIR})
)

app = modal.App("example-whisper-batched-inference", image=image)
model_cache = modal.Volume.from_name("hf-hub-cache", create_if_missing=True)
app = modal.App(
"example-whisper-batched-inference",
image=image,
volumes={MODEL_DIR: model_cache},
)

# ## Caching the model weights

# We'll define a function to download the model and cache it in a volume.
# You can `modal run` against this function prior to deploying the App.


@app.function()
def download_model():
from huggingface_hub import snapshot_download
from transformers.utils import move_cache

snapshot_download(
MODEL_NAME,
ignore_patterns=["*.pt", "*.bin"], # Using safetensors
revision=MODEL_REVISION,
)
move_cache()


# ## The model class

# The inference function is best represented using Modal's [class syntax](https://modal.com/docs/guide/lifecycle-functions).

# We define a `@modal.build` method to download the model and a `@modal.enter` method to load the model.
# `build` downloads the model from HuggingFace just once when our app is first run or deployed
# and `enter` loads the model into memory just once when our inference function is first invoked.
# We define a `@modal.enter` method to load the model when the container starts, before it picks up any inputs.
# The weights will be loaded from the Hugging Face cache volume so that we don't need to download them when
# we start a new container.

# We also define a `transcribe` method that uses the `@modal.batched` decorator to enable dynamic batching.
# This allows us to invoke the function with individual audio samples, and the function will automatically batch them
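
# The `transcribe` method itself is not visible in the collapsed hunks below. As a rough sketch of how `@modal.batched` dynamic batching works (the parameter values, method body, and `self.pipeline` attribute are assumptions for illustration, not taken from this commit):

    @modal.batched(max_batch_size=64, wait_ms=1000)
    def transcribe(self, audio_samples: list) -> list:
        # Each caller passes a single sample; Modal groups concurrent calls into one
        # batch of up to max_batch_size items, waiting at most wait_ms to fill it.
        # A batched method receives a list and must return a list of equal length.
        outputs = self.pipeline(audio_samples, batch_size=len(audio_samples))
        return [output["text"] for output in outputs]
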
@@ -72,21 +94,6 @@
concurrency_limit=10, # default max GPUs for Modal's free tier
)
class Model:
@modal.build()
def download_model(self):
from huggingface_hub import snapshot_download
from transformers.utils import move_cache

os.makedirs(MODEL_DIR, exist_ok=True)

snapshot_download(
MODEL_NAME,
local_dir=MODEL_DIR,
ignore_patterns=["*.pt", "*.bin"], # Using safetensors
revision=MODEL_REVISION,
)
move_cache()

@modal.enter()
def load_model(self):
import torch