Merge branch 'main' into charlesfrye/build-docs
charlesfrye authored Sep 27, 2024
2 parents 3a7f240 + c163a6e commit de80d55
Showing 29 changed files with 405 additions and 15 deletions.
7 changes: 7 additions & 0 deletions 01_getting_started/generators.py
@@ -1,3 +1,10 @@
# # Run a generator function on Modal

# This example shows how you can run a generator function on Modal. We define a
# function that uses `yield` to produce values and then call it with the
# [`remote_gen`](https://modal.com/docs/reference/modal.Function#remote_gen) method. The
# `remote_gen` method returns a generator object that can be used to iterate over
# the values produced by the function.

import modal

app = modal.App("example-generators")
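The generator function itself is collapsed in this diff. A rough sketch of the pattern the comment describes, assuming the `app` defined above (the function name and body are illustrative, not the file's exact contents):

```python
@app.function()
def f(i: int):
    # a Modal Function that yields values instead of returning a single result
    for j in range(i):
        yield j


@app.local_entrypoint()
def main():
    # remote_gen returns a generator object; values stream back as the remote function yields them
    for value in f.remote_gen(10):
        print(value)
```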
15 changes: 15 additions & 0 deletions 05_scheduling/schedule_simple.py
@@ -1,6 +1,21 @@
# ---
# cmd: ["python", "-m", "05_scheduling.schedule_simple"]
# ---

# # Scheduling remote jobs

# This example shows how you can schedule remote jobs on Modal.
# You can do this either with:
#
# - [`modal.Period`](https://modal.com/docs/reference/modal.Period) - a time interval between function calls.
# - [`modal.Cron`](https://modal.com/docs/reference/modal.Cron) - a cron expression to specify the schedule.

# In the code below, the first function runs every
# 5 seconds, and the second function runs every minute. We use the `schedule`
# argument to specify the schedule for each function. The `schedule` argument can
# take a `modal.Period` object to specify a time interval or a `modal.Cron` object
# to specify a cron expression.

import time
from datetime import datetime

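The scheduled functions are collapsed in this diff. A rough sketch of the two schedule types described above, assuming `modal` is imported and `app` is defined in the collapsed portion (function names and bodies are illustrative):

```python
@app.function(schedule=modal.Period(seconds=5))
def print_time_1():
    # runs roughly every 5 seconds
    print(f"Period schedule fired at {datetime.now().isoformat()}")


@app.function(schedule=modal.Cron("* * * * *"))
def print_time_2():
    # runs at the start of every minute, per the cron expression
    print(f"Cron schedule fired at {datetime.now().isoformat()}")
```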
4 changes: 4 additions & 0 deletions 06_gpu_and_ml/embeddings/instructor.py
@@ -1,3 +1,7 @@
# # Create Instructor Embeddings on Modal
#
# This example runs the [Instructor](https://github.com/xlang-ai/instructor-embedding) embedding model and performs a simple sentence similarity computation.

import modal

MODEL_DIR = "/model"
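The model-loading and inference code is collapsed in this diff. A rough, self-contained sketch of the kind of similarity computation described above, using the `InstructorEmbedding` package directly (the instruction strings and sentences are illustrative, and this omits the Modal-side serving code):

```python
import numpy as np
from InstructorEmbedding import INSTRUCTOR

MODEL_DIR = "/model"  # matches the constant above; assumes the weights are already downloaded there

model = INSTRUCTOR(MODEL_DIR)

# Instructor pairs each sentence with a natural-language instruction
sentences = [
    ["Represent the sentence for retrieval:", "Modal runs serverless functions in the cloud."],
    ["Represent the sentence for retrieval:", "Serverless cloud functions are what Modal provides."],
]
embeddings = model.encode(sentences)

# cosine similarity between the two sentence embeddings
a, b = embeddings
similarity = float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
print(f"similarity: {similarity:.3f}")
```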
4 changes: 4 additions & 0 deletions 06_gpu_and_ml/embeddings/text_embeddings_inference.py
@@ -1,6 +1,10 @@
# ---
# cmd: ["modal", "run", "06_gpu_and_ml/embeddings/text_embeddings_inference.py::embed_dataset"]
# ---
# # Run TextEmbeddingsInference (TEI) on Modal
#
# This example runs the [Text Embeddings Inference (TEI)](https://github.com/huggingface/text-embeddings-inference) toolkit on the Hacker News BigQuery public dataset.

import json
import os
import socket
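The rest of the example is collapsed in this diff. A rough sketch of what a client request to a running TEI container looks like, assuming the server is reachable on localhost port 8000 (the host, port, and inputs are assumptions; TEI's `/embed` route accepts a JSON body with an `inputs` field):

```python
import json
import urllib.request

payload = json.dumps(
    {"inputs": ["What is up with HN today?", "Show HN: my weekend project"]}
).encode()

request = urllib.request.Request(
    "http://127.0.0.1:8000/embed",  # assumed address of the TEI container
    data=payload,
    headers={"Content-Type": "application/json"},
)

with urllib.request.urlopen(request) as response:
    embeddings = json.loads(response.read())  # one embedding (list of floats) per input

print(len(embeddings), "embeddings of dimension", len(embeddings[0]))
```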
47 changes: 47 additions & 0 deletions 06_gpu_and_ml/gpu_fallbacks.py
@@ -0,0 +1,47 @@
# # Set "fallback" GPUs
#
# GPU availability on Modal can fluctuate, especially for
# tightly constrained requests, like eight co-located GPUs
# in a specific region.
#
# If your code can run on multiple different GPUs, you can specify
# your GPU request as a list, in order of preference, and whenever
# your Function scales up, we will try to schedule it on each requested GPU type in order.
#
# The code below demonstrates the usage of the `gpu` parameter with a list of GPUs.

import subprocess

import modal

app = modal.App("example-gpu-fallbacks")


@app.function(
gpu=["h100", "a100", "any"], # "any" means any of L4, A10, or T4
max_inputs=1, # new container each input, so we re-roll the GPU dice every time
)
async def remote(_idx):
gpu = subprocess.run(
["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"],
check=True,
text=True,
stdout=subprocess.PIPE,
).stdout.strip()
print(gpu)
return gpu


@app.local_entrypoint()
def local(count: int = 32):
from collections import Counter

gpu_counter = Counter(
remote.map([i for i in range(count)], order_outputs=False)
)
print(f"ran {gpu_counter.total()} times")
print(f"on the following {len(gpu_counter.keys())} GPUs:", end="\n")
print(
*[f"{gpu.rjust(32)}: {'🔥' * ct}" for gpu, ct in gpu_counter.items()],
sep="\n",
)
2 changes: 1 addition & 1 deletion 06_gpu_and_ml/llm-serving/openai_compatible/locustfile.py
@@ -25,7 +25,7 @@ class WebsiteUser(locust.HttpUser):
@locust.task
def chat_completion(self):
payload = {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"model": "Llama-3.2-3B-Instruct-quantized.w8a8",
"messages": messages,
}

30 changes: 18 additions & 12 deletions 06_gpu_and_ml/llm-serving/vllm_inference.py
@@ -32,17 +32,18 @@
import modal

vllm_image = modal.Image.debian_slim(python_version="3.10").pip_install(
"vllm==0.5.3post1"
"vllm==0.6.2"
)

# ## Download the model weights
#
- # We'll be running a pretrained foundation model -- Meta's LLaMA 3.1 8B
- # in the Instruct variant that's trained to chat and follow instructions.
+ # We'll be running a pretrained foundation model -- Meta's LLaMA 3.2 3B
+ # in the Instruct variant that's trained to chat and follow instructions,
+ # quantized to 8-bit by [Neural Magic](https://neuralmagic.com/) and uploaded to Hugging Face.

MODELS_DIR = "/llamas"
MODEL_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"
MODEL_REVISION = "8c22764a7e3675c50d4c7c9a4edb474456022b16"
MODEL_NAME = "neuralmagic/Llama-3.2-3B-Instruct-quantized.w8a8"
MODEL_REVISION = "1c42cac61b517e84efa30e3e90f00076045d5a89"

# We need to make the weights of that model available to our Modal Functions.
#
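The download logic itself is collapsed in this diff. A rough, self-contained sketch of how weights like these are typically pulled into a Modal Volume with `huggingface_hub` (the app name, Volume name, and timeout are assumptions, not values from this commit):

```python
import modal

MODELS_DIR = "/llamas"
MODEL_NAME = "neuralmagic/Llama-3.2-3B-Instruct-quantized.w8a8"
MODEL_REVISION = "1c42cac61b517e84efa30e3e90f00076045d5a89"

volume = modal.Volume.from_name("llamas", create_if_missing=True)  # assumed Volume name

download_image = modal.Image.debian_slim(python_version="3.10").pip_install("huggingface_hub")

app = modal.App("example-download-llama", image=download_image)


@app.function(volumes={MODELS_DIR: volume}, timeout=60 * 60)
def download_model():
    from huggingface_hub import snapshot_download

    # pull the pinned revision of the model into the mounted Volume
    snapshot_download(
        repo_id=MODEL_NAME,
        local_dir=f"{MODELS_DIR}/{MODEL_NAME}",
        revision=MODEL_REVISION,
    )
    volume.commit()  # persist the files so other Functions using this Volume can see them
```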
@@ -85,10 +86,10 @@

@app.function(
image=vllm_image,
- gpu=modal.gpu.A100(count=N_GPU, size="40GB"),
+ gpu=modal.gpu.L4(count=N_GPU),
container_idle_timeout=5 * MINUTES,
timeout=24 * HOURS,
- allow_concurrent_inputs=100,
+ allow_concurrent_inputs=500,
volumes={MODELS_DIR: volume},
)
@modal.asgi_app()
@@ -102,14 +103,15 @@ def serve():
from vllm.entrypoints.openai.serving_completion import (
OpenAIServingCompletion,
)
+ from vllm.entrypoints.openai.serving_engine import BaseModelPath
from vllm.usage.usage_lib import UsageContext

volume.reload() # ensure we have the latest version of the weights

# create a fastAPI app that uses vLLM's OpenAI-compatible router
web_app = fastapi.FastAPI(
title=f"OpenAI-compatible {MODEL_NAME} server",
description="Run an OpenAI-compatible LLM server with vLLM on modal.com",
description="Run an OpenAI-compatible LLM server with vLLM on modal.com 🚀",
version="0.0.1",
docs_url="/docs",
)
@@ -159,20 +161,24 @@ async def is_authenticated(api_key: str = fastapi.Security(http_bearer)):

request_logger = RequestLogger(max_log_len=2048)

- api_server.openai_serving_chat = OpenAIServingChat(
+ base_model_paths = [
+     BaseModelPath(name=MODEL_NAME.split("/")[1], model_path=MODEL_NAME)
+ ]
+
+ api_server.chat = lambda s: OpenAIServingChat(
engine,
model_config=model_config,
- served_model_names=[MODEL_NAME],
+ base_model_paths=base_model_paths,
chat_template=None,
response_role="assistant",
lora_modules=[],
prompt_adapters=[],
request_logger=request_logger,
)
- api_server.openai_serving_completion = OpenAIServingCompletion(
+ api_server.completion = lambda s: OpenAIServingCompletion(
engine,
model_config=model_config,
- served_model_names=[MODEL_NAME],
+ base_model_paths=base_model_paths,
lora_modules=[],
prompt_adapters=[],
request_logger=request_logger,
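Once deployed, the server speaks the OpenAI API, so any OpenAI-compatible client can call it. A rough sketch with the `openai` Python package (the base URL and API key are placeholders, not values from this commit):

```python
from openai import OpenAI

client = OpenAI(
    base_url="https://your-workspace--example-vllm-openai-compatible-serve.modal.run/v1",  # placeholder URL
    api_key="your-token",  # placeholder; the server validates it via its bearer-token dependency
)

response = client.chat.completions.create(
    # the served model name is the repo name without the org prefix, per base_model_paths above
    model="Llama-3.2-3B-Instruct-quantized.w8a8",
    messages=[{"role": "user", "content": "Say hello from Modal!"}],
)
print(response.choices[0].message.content)
```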
159 changes: 159 additions & 0 deletions 06_gpu_and_ml/long-training/long-training.py
@@ -0,0 +1,159 @@
# ---
# cmd: ["modal", "run", "--detach", "06_gpu_and_ml.long-training.long-training"]
# deploy: true
# ---

# # Running long training jobs on Modal

# While Modal functions typically have a [maximum timeout of 24 hours](/docs/guide/timeouts), you can still run long training jobs on Modal by implementing a checkpointing mechanism in your code.
# This allows you to save the model's state periodically and resume from the last saved state.
# In fact, we recommend implementing checkpointing logic regardless of the duration of your training jobs. This prevents loss of progress in case of interruptions or [preemptions](/docs/guide/preemption).

# In this example, we'll walk through how to implement this pattern using PyTorch Lightning.

# ## Pattern

# The core pattern for long-duration training on Modal:

# 1. Periodically save checkpoints to a Modal [volume](/docs/guide/volumes)
# 2. Handle interruptions/timeouts and resume from the last checkpoint


# ## Setup

# Let's start by importing the Modal client and defining the Modal app and image. Since we are using PyTorch Lightning, we use an officially supported CUDA Docker image as our base image.
# Then we install `lightning` and `torchvision` on top of it.

import os

import modal

app = modal.App("interrupt-resume-lightning")

# Set up the environment
image = modal.Image.from_registry(
"nvidia/cuda:12.1.0-base-ubuntu22.04", add_python="3.10"
).pip_install("lightning", "torchvision")


# ## Define Modal Volume

# Next, we set up a Modal [volume](/docs/guide/volumes) for storing both the training data and the checkpoints

volume = modal.Volume.from_name("training-checkpoints", create_if_missing=True)

VOLUME_PATH = "/vol"
DATA_PATH = f"{VOLUME_PATH}/data"
CHECKPOINTS_PATH = f"{VOLUME_PATH}/checkpoints"

# ## Model training

# We implement the actual model training class/functions and the checkpointing logic.
# PyTorch Lightning offers some [built-in checkpointing](https://pytorch-lightning.readthedocs.io/en/1.2.10/common/weights_loading.html#:~:text=Lightning%20automates%20saving%20and%20loading,having%20to%20retrain%20the%20model.) functionality.
# You can specify the checkpoint file path that you want to resume from using the `ckpt_path` parameter of [`trainer.fit`](https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.trainer.trainer.Trainer.html).
# Additionally, you can specify the checkpointing interval with the `every_n_epochs` parameter of [`ModelCheckpoint`](https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.ModelCheckpoint.html).
# In the code below, we save checkpoints every 10 epochs, but you can adjust this interval based on how long each epoch takes. The goal is to minimize the disruption from job failures: a job that takes a few days should perhaps be checkpointed every few hours. Exactly how checkpointing is implemented will vary with the training framework you use.


def get_checkpoint(checkpoint_dir):
from lightning.pytorch.callbacks import ModelCheckpoint

return ModelCheckpoint(
dirpath=checkpoint_dir,
save_last=True,
every_n_epochs=10,
filename="epoch{epoch:02d}",
)


def train_model(data_dir, checkpoint_dir, resume_from_checkpoint=None):
import lightning as L

from .train import get_autoencoder, get_train_loader

# train the model (hint: here are some helpful Trainer arguments for rapid idea iteration)
autoencoder = get_autoencoder()
train_loader = get_train_loader(data_dir=data_dir)
checkpoint_callback = get_checkpoint(checkpoint_dir)
trainer = L.Trainer(
limit_train_batches=100, max_epochs=100, callbacks=[checkpoint_callback]
)
if resume_from_checkpoint:
print(f"Resuming from checkpoint: {resume_from_checkpoint}")
trainer.fit(
model=autoencoder,
train_dataloaders=train_loader,
ckpt_path=resume_from_checkpoint,
)
else:
print("Starting training from scratch")
trainer.fit(autoencoder, train_loader)
print("Done training")
return


# ## Training function deployed on Modal
#
# Next, we define the training function that runs on Modal infrastructure. Note that this function has the volume mounted on it.
# The training function checks the volume for the latest checkpoint file and resumes training from that checkpoint if it finds one.
# The `timeout` parameter in the `@app.function` decorator is set to 30 seconds for demonstration purposes. In a real scenario, you'd set this to a larger value (e.g., several hours) based on your needs.
@app.function(
image=image,
volumes={VOLUME_PATH: volume},
gpu="any",
timeout=30,
)
def train():
last_checkpoint = os.path.join(CHECKPOINTS_PATH, "last.ckpt")

try:
if os.path.exists(last_checkpoint):
# Resume from the latest checkpoint
print(
f"Resuming training from the latest checkpoint: {last_checkpoint}"
)
train_model(
DATA_PATH,
CHECKPOINTS_PATH,
resume_from_checkpoint=last_checkpoint,
)
print("Training resumed successfully")
else:
print("Starting training from scratch")
train_model(DATA_PATH, CHECKPOINTS_PATH)
except Exception as e:
raise e

return


# ## Run the model
#
# We define a [`local_entrypoint`](https://modal.com/docs/guide/apps#entrypoints-for-ephemeral-apps)
# to run the training.
# If the function times out, or if the job is [preempted](/docs/guide/preemption#preemption), the loop will catch the exception and attempt to resume training from the last checkpoint.

# You can run this locally with `modal run --detach 06_gpu_and_ml.long-training.long-training`.
# This runs the code in detached mode, allowing it to continue running even if you close your terminal or shut down your computer. This is important since training jobs can be long.


@app.local_entrypoint()
def main():
while True:
try:
print("Starting new training run")
train.remote()

print("Finished training")
break # Exit the loop if training completes successfully
except KeyboardInterrupt:
print("Job was preempted")
print("Will attempt to resume in the next iteration.")
continue
except modal.exception.FunctionTimeoutError:
print("Function timed out")
print("Will attempt to resume in the next iteration.")
continue
except Exception as e:
print(f"Error: {str(e)}")
break
46 changes: 46 additions & 0 deletions 06_gpu_and_ml/long-training/train.py
@@ -0,0 +1,46 @@
# ---
# lambda-test: false
# pytest: false
# ---
import lightning as L
from torch import nn, optim, utils
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor


# define the LightningModule
class LitAutoEncoder(L.LightningModule):
def __init__(self, encoder, decoder):
super().__init__()
self.encoder = encoder
self.decoder = decoder

def training_step(self, batch, batch_idx):
x, _ = batch
x = x.view(x.size(0), -1)
z = self.encoder(x)
x_hat = self.decoder(z)
loss = nn.functional.mse_loss(x_hat, x)
self.log("train_loss", loss)
return loss

def configure_optimizers(self):
optimizer = optim.Adam(self.parameters(), lr=1e-3)
return optimizer


def get_autoencoder(checkpoint_path=None):
# define any number of nn.Modules (or use your current ones)
print("Defining encoder and decoder")
encoder = nn.Sequential(nn.Linear(28 * 28, 64), nn.ReLU(), nn.Linear(64, 3))
decoder = nn.Sequential(nn.Linear(3, 64), nn.ReLU(), nn.Linear(64, 28 * 28))

return LitAutoEncoder(encoder, decoder)


def get_train_loader(data_dir):
# setup data
print("Setting up data")
dataset = MNIST(data_dir, download=True, transform=ToTensor())
train_loader = utils.data.DataLoader(dataset)
return train_loader