diff --git a/01_getting_started/generators.py b/01_getting_started/generators.py
index 33a7b057b..d6df729c1 100644
--- a/01_getting_started/generators.py
+++ b/01_getting_started/generators.py
@@ -1,3 +1,10 @@
+# # Run a generator function on Modal
+
+# This example shows how you can run a generator function on Modal. We define a
+# function that `yield`s values and then call it with the [`remote_gen`](https://modal.com/docs/reference/modal.Function#remote_gen) method. The
+# `remote_gen` method returns a generator object that can be used to iterate over
+# the values produced by the function.
+
 import modal
 
 app = modal.App("example-generators")
diff --git a/05_scheduling/schedule_simple.py b/05_scheduling/schedule_simple.py
index 42a9b08ad..47d498c5d 100644
--- a/05_scheduling/schedule_simple.py
+++ b/05_scheduling/schedule_simple.py
@@ -1,6 +1,21 @@
 # ---
 # cmd: ["python", "-m", "05_scheduling.schedule_simple"]
 # ---
+
+# # Scheduling remote jobs
+
+# This example shows how you can schedule remote jobs on Modal.
+# You can do this either with:
+#
+# - [`modal.Period`](https://modal.com/docs/reference/modal.Period) - a time interval between function calls.
+# - [`modal.Cron`](https://modal.com/docs/reference/modal.Cron) - a cron expression to specify the schedule.
+
+# In the code below, the first function runs every
+# 5 seconds and the second runs every minute. We use the `schedule`
+# argument to specify when each function should run, passing either a
+# `modal.Period` (a time interval between calls) or a `modal.Cron`
+# (a cron expression).
+
 import time
 from datetime import datetime
 
diff --git a/06_gpu_and_ml/embeddings/instructor.py b/06_gpu_and_ml/embeddings/instructor.py
index ab04e5c73..968cd3a96 100644
--- a/06_gpu_and_ml/embeddings/instructor.py
+++ b/06_gpu_and_ml/embeddings/instructor.py
@@ -1,3 +1,7 @@
+# # Create Instructor Embeddings on Modal
+#
+# This example runs the [Instructor](https://github.com/xlang-ai/instructor-embedding) embedding model on Modal and uses it for a simple sentence similarity computation.
+
 import modal
 
 MODEL_DIR = "/model"
diff --git a/06_gpu_and_ml/embeddings/text_embeddings_inference.py b/06_gpu_and_ml/embeddings/text_embeddings_inference.py
index 23836a43c..8d3677088 100644
--- a/06_gpu_and_ml/embeddings/text_embeddings_inference.py
+++ b/06_gpu_and_ml/embeddings/text_embeddings_inference.py
@@ -1,6 +1,10 @@
 # ---
 # cmd: ["modal", "run", "06_gpu_and_ml/embeddings/text_embeddings_inference.py::embed_dataset"]
 # ---
+# # Run TextEmbeddingsInference (TEI) on Modal
+#
+# This example runs the [Text Embeddings Inference (TEI)](https://github.com/huggingface/text-embeddings-inference) toolkit on the Hacker News BigQuery public dataset.
+
 import json
 import os
 import socket
diff --git a/06_gpu_and_ml/gpu_fallbacks.py b/06_gpu_and_ml/gpu_fallbacks.py
new file mode 100644
index 000000000..fb3c82c51
--- /dev/null
+++ b/06_gpu_and_ml/gpu_fallbacks.py
@@ -0,0 +1,47 @@
+# # Set "fallback" GPUs
+#
+# GPU availability on Modal can fluctuate, especially for
+# tightly-constrained requests, like for eight co-located GPUs
+# in a specific region.
+#
+# If your code can run on multiple different GPUs, you can specify
+# your GPU request as a list, in order of preference, and whenever
+# your Function scales up, we will try to schedule it on each requested GPU type in order.
+#
+# The code below demonstrates the usage of the `gpu` parameter with a list of GPUs.
+
+import subprocess
+
+import modal
+
+app = modal.App("example-gpu-fallbacks")
+
+
+@app.function(
+    gpu=["h100", "a100", "any"],  # "any" means any of L4, A10, or T4
+    max_inputs=1,  # new container each input, so we re-roll the GPU dice every time
+)
+async def remote(_idx):
+    gpu = subprocess.run(
+        ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"],
+        check=True,
+        text=True,
+        stdout=subprocess.PIPE,
+    ).stdout.strip()
+    print(gpu)
+    return gpu
+
+
+@app.local_entrypoint()
+def local(count: int = 32):
+    from collections import Counter
+
+    gpu_counter = Counter(
+        remote.map([i for i in range(count)], order_outputs=False)
+    )
+    print(f"ran {gpu_counter.total()} times")
+    print(f"on the following {len(gpu_counter.keys())} GPUs:", end="\n")
+    print(
+        *[f"{gpu.rjust(32)}: {'🔥' * ct}" for gpu, ct in gpu_counter.items()],
+        sep="\n",
+    )
diff --git a/06_gpu_and_ml/llm-serving/openai_compatible/locustfile.py b/06_gpu_and_ml/llm-serving/openai_compatible/locustfile.py
index a536ee1cf..66af64258 100644
--- a/06_gpu_and_ml/llm-serving/openai_compatible/locustfile.py
+++ b/06_gpu_and_ml/llm-serving/openai_compatible/locustfile.py
@@ -25,7 +25,7 @@ class WebsiteUser(locust.HttpUser):
     @locust.task
     def chat_completion(self):
         payload = {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "Llama-3.2-3B-Instruct-quantized.w8a8",
             "messages": messages,
         }
 
diff --git a/06_gpu_and_ml/llm-serving/vllm_inference.py b/06_gpu_and_ml/llm-serving/vllm_inference.py
index b52b66893..f7eeb2b15 100644
--- a/06_gpu_and_ml/llm-serving/vllm_inference.py
+++ b/06_gpu_and_ml/llm-serving/vllm_inference.py
@@ -32,17 +32,18 @@ import modal
 
 vllm_image = modal.Image.debian_slim(python_version="3.10").pip_install(
-    "vllm==0.5.3post1"
+    "vllm==0.6.2"
 )
 
 # ## Download the model weights
 #
-# We'll be running a pretrained foundation model -- Meta's LLaMA 3.1 8B
-# in the Instruct variant that's trained to chat and follow instructions.
+# We'll be running a pretrained foundation model -- Meta's LLaMA 3.2 3B
+# in the Instruct variant that's trained to chat and follow instructions,
+# quantized to 8-bit by [Neural Magic](https://neuralmagic.com/) and uploaded to Hugging Face.
 
 MODELS_DIR = "/llamas"
-MODEL_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"
-MODEL_REVISION = "8c22764a7e3675c50d4c7c9a4edb474456022b16"
+MODEL_NAME = "neuralmagic/Llama-3.2-3B-Instruct-quantized.w8a8"
+MODEL_REVISION = "1c42cac61b517e84efa30e3e90f00076045d5a89"
 
 # We need to make the weights of that model available to our Modal Functions.
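+#
+# One way to do that (shown here only as a sketch; the names below reuse the
+# constants and the Volume defined in this file) is to download the weights
+# into a Modal Volume ahead of time with `huggingface_hub`:
+#
+# ```python
+# download_image = modal.Image.debian_slim().pip_install("huggingface_hub")
+#
+#
+# @app.function(image=download_image, volumes={MODELS_DIR: volume})
+# def download_model():
+#     from huggingface_hub import snapshot_download
+#
+#     snapshot_download(
+#         MODEL_NAME,
+#         revision=MODEL_REVISION,
+#         local_dir=f"{MODELS_DIR}/{MODEL_NAME}",
+#     )
+#     volume.commit()  # persist the downloaded weights in the Volume
+# ```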
 #
@@ -85,10 +86,10 @@
 
 @app.function(
     image=vllm_image,
-    gpu=modal.gpu.A100(count=N_GPU, size="40GB"),
+    gpu=modal.gpu.L4(count=N_GPU),
     container_idle_timeout=5 * MINUTES,
     timeout=24 * HOURS,
-    allow_concurrent_inputs=100,
+    allow_concurrent_inputs=500,
     volumes={MODELS_DIR: volume},
 )
 @modal.asgi_app()
@@ -102,6 +103,7 @@ def serve():
     from vllm.entrypoints.openai.serving_completion import (
         OpenAIServingCompletion,
     )
+    from vllm.entrypoints.openai.serving_engine import BaseModelPath
     from vllm.usage.usage_lib import UsageContext
 
     volume.reload()  # ensure we have the latest version of the weights
@@ -109,7 +111,7 @@ def serve():
     # create a fastAPI app that uses vLLM's OpenAI-compatible router
     web_app = fastapi.FastAPI(
         title=f"OpenAI-compatible {MODEL_NAME} server",
-        description="Run an OpenAI-compatible LLM server with vLLM on modal.com",
+        description="Run an OpenAI-compatible LLM server with vLLM on modal.com 🚀",
         version="0.0.1",
         docs_url="/docs",
     )
@@ -159,20 +161,24 @@ async def is_authenticated(api_key: str = fastapi.Security(http_bearer)):
 
     request_logger = RequestLogger(max_log_len=2048)
 
-    api_server.openai_serving_chat = OpenAIServingChat(
+    base_model_paths = [
+        BaseModelPath(name=MODEL_NAME.split("/")[1], model_path=MODEL_NAME)
+    ]
+
+    api_server.chat = lambda s: OpenAIServingChat(
         engine,
         model_config=model_config,
-        served_model_names=[MODEL_NAME],
+        base_model_paths=base_model_paths,
         chat_template=None,
         response_role="assistant",
         lora_modules=[],
         prompt_adapters=[],
         request_logger=request_logger,
     )
-    api_server.openai_serving_completion = OpenAIServingCompletion(
+    api_server.completion = lambda s: OpenAIServingCompletion(
         engine,
         model_config=model_config,
-        served_model_names=[MODEL_NAME],
+        base_model_paths=base_model_paths,
         lora_modules=[],
         prompt_adapters=[],
         request_logger=request_logger,
diff --git a/06_gpu_and_ml/long-training/long-training.py b/06_gpu_and_ml/long-training/long-training.py
new file mode 100644
index 000000000..79f124257
--- /dev/null
+++ b/06_gpu_and_ml/long-training/long-training.py
@@ -0,0 +1,159 @@
+# ---
+# cmd: ["modal", "run", "--detach", "06_gpu_and_ml.long-training.long-training"]
+# deploy: true
+# ---
+
+# # Running long training jobs on Modal
+
+# While Modal functions typically have a [maximum timeout of 24 hours](/docs/guide/timeouts), you can still run long training jobs on Modal by implementing a checkpointing mechanism in your code.
+# This allows you to save the model's state periodically and resume from the last saved state.
+# In fact, we recommend implementing checkpointing logic regardless of the duration of your training jobs. This prevents loss of progress in case of interruptions or [preemptions](/docs/guide/preemption).
+
+# In this example, we'll walk through how to implement this pattern using PyTorch Lightning.
+
+# ## Pattern
+
+# The core pattern for long-duration training on Modal is:
+
+# 1. Periodically save checkpoints to a Modal [volume](/docs/guide/volumes)
+# 2. Handle interruptions/timeouts and resume from the last checkpoint
+
+
+# ## Setup
+
+# Let's start by importing the Modal client and defining the Modal app and image. Since we are using PyTorch Lightning, we use an officially supported CUDA Docker image as our base image.
+# Then we install `pytorch` and `lightning` on top of that.
+
+import os
+
+import modal
+
+app = modal.App("interrupt-resume-lightning")
+
+# Set up the environment
+image = modal.Image.from_registry(
+    "nvidia/cuda:12.1.0-base-ubuntu22.04", add_python="3.10"
+).pip_install("lightning", "torchvision")
+
+
+# ## Define Modal Volume
+
+# Next, we set up a Modal [volume](/docs/guide/volumes) for storing both the training data and the checkpoints.
+
+volume = modal.Volume.from_name("training-checkpoints", create_if_missing=True)
+
+VOLUME_PATH = "/vol"
+DATA_PATH = f"{VOLUME_PATH}/data"
+CHECKPOINTS_PATH = f"{VOLUME_PATH}/checkpoints"
+
+# ## Model training
+
+# We implement the actual model training class/functions and the checkpointing logic.
+# PyTorch Lightning offers some [built-in checkpointing](https://pytorch-lightning.readthedocs.io/en/1.2.10/common/weights_loading.html#:~:text=Lightning%20automates%20saving%20and%20loading,having%20to%20retrain%20the%20model.) functionality.
+# You can specify the checkpoint file path that you want to resume from using the `ckpt_path` parameter of [`trainer.fit`](https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.trainer.trainer.Trainer.html).
+# Additionally, you can specify the checkpointing interval with the `every_n_epochs` parameter of [`ModelCheckpoint`](https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.ModelCheckpoint.html).
+# In the code below, we save checkpoints every 10 epochs, but this number can be adjusted depending on how long the epochs take. The goal is to minimize the disruption from job failures. Something that takes a few days should be checkpointed perhaps every few hours. How exactly this checkpointing gets implemented will vary depending on the training framework you are using.
+
+
+def get_checkpoint(checkpoint_dir):
+    from lightning.pytorch.callbacks import ModelCheckpoint
+
+    return ModelCheckpoint(
+        dirpath=checkpoint_dir,
+        save_last=True,
+        every_n_epochs=10,
+        filename="epoch{epoch:02d}",
+    )
+
+
+def train_model(data_dir, checkpoint_dir, resume_from_checkpoint=None):
+    import lightning as L
+
+    from .train import get_autoencoder, get_train_loader
+
+    # train the model (hint: here are some helpful Trainer arguments for rapid idea iteration)
+    autoencoder = get_autoencoder()
+    train_loader = get_train_loader(data_dir=data_dir)
+    checkpoint_callback = get_checkpoint(checkpoint_dir)
+    trainer = L.Trainer(
+        limit_train_batches=100, max_epochs=100, callbacks=[checkpoint_callback]
+    )
+    if resume_from_checkpoint:
+        print(f"Resuming from checkpoint: {resume_from_checkpoint}")
+        trainer.fit(
+            model=autoencoder,
+            train_dataloaders=train_loader,
+            ckpt_path=resume_from_checkpoint,
+        )
+    else:
+        print("Starting training from scratch")
+        trainer.fit(autoencoder, train_loader)
+    print("Done training")
+    return
+
+
+# ## Training function deployed on Modal
+#
+# Next, we define the training function running on Modal infrastructure. Note that this function has the volume mounted on it.
+# The training function checks the volume for the latest checkpoint file and resumes training from that checkpoint if it finds one.
+# The `timeout` parameter in the `@app.function` decorator is set to 30 seconds for demonstration purposes. In a real scenario, you'd set this to a larger value (e.g., several hours) based on your needs.
+@app.function(
+    image=image,
+    volumes={VOLUME_PATH: volume},
+    gpu="any",
+    timeout=30,
+)
+def train():
+    last_checkpoint = os.path.join(CHECKPOINTS_PATH, "last.ckpt")
+
+    try:
+        if os.path.exists(last_checkpoint):
+            # Resume from the latest checkpoint
+            print(
+                f"Resuming training from the latest checkpoint: {last_checkpoint}"
+            )
+            train_model(
+                DATA_PATH,
+                CHECKPOINTS_PATH,
+                resume_from_checkpoint=last_checkpoint,
+            )
+            print("Training resumed successfully")
+        else:
+            print("Starting training from scratch")
+            train_model(DATA_PATH, CHECKPOINTS_PATH)
+    except Exception as e:
+        raise e
+
+    return
+
+
+# ## Run the model
+#
+# We define a [`local_entrypoint`](https://modal.com/docs/guide/apps#entrypoints-for-ephemeral-apps)
+# to run the training.
+# If the function times out, or if the job is [preempted](/docs/guide/preemption#preemption), the loop will catch the exception and attempt to resume training from the last checkpoint.
+
+# You can run this locally with `modal run --detach 06_gpu_and_ml.long-training.long-training`.
+# This runs the code in detached mode, allowing it to continue running even if you close your terminal or computer. This is important since training jobs can be long.
+
+
+@app.local_entrypoint()
+def main():
+    while True:
+        try:
+            print("Starting new training run")
+            train.remote()
+
+            print("Finished training")
+            break  # Exit the loop if training completes successfully
+        except KeyboardInterrupt:
+            print("Job was preempted")
+            print("Will attempt to resume in the next iteration.")
+            continue
+        except modal.exception.FunctionTimeoutError:
+            print("Function timed out")
+            print("Will attempt to resume in the next iteration.")
+            continue
+        except Exception as e:
+            print(f"Error: {str(e)}")
+            break
diff --git a/06_gpu_and_ml/long-training/train.py b/06_gpu_and_ml/long-training/train.py
new file mode 100644
index 000000000..9e96890fa
--- /dev/null
+++ b/06_gpu_and_ml/long-training/train.py
@@ -0,0 +1,46 @@
+# ---
+# lambda-test: false
+# pytest: false
+# ---
+import lightning as L
+from torch import nn, optim, utils
+from torchvision.datasets import MNIST
+from torchvision.transforms import ToTensor
+
+
+# define the LightningModule
+class LitAutoEncoder(L.LightningModule):
+    def __init__(self, encoder, decoder):
+        super().__init__()
+        self.encoder = encoder
+        self.decoder = decoder
+
+    def training_step(self, batch, batch_idx):
+        x, _ = batch
+        x = x.view(x.size(0), -1)
+        z = self.encoder(x)
+        x_hat = self.decoder(z)
+        loss = nn.functional.mse_loss(x_hat, x)
+        self.log("train_loss", loss)
+        return loss
+
+    def configure_optimizers(self):
+        optimizer = optim.Adam(self.parameters(), lr=1e-3)
+        return optimizer
+
+
+def get_autoencoder(checkpoint_path=None):
+    # define any number of nn.Modules (or use your current ones)
+    print("Defining encoder and decoder")
+    encoder = nn.Sequential(nn.Linear(28 * 28, 64), nn.ReLU(), nn.Linear(64, 3))
+    decoder = nn.Sequential(nn.Linear(3, 64), nn.ReLU(), nn.Linear(64, 28 * 28))
+
+    return LitAutoEncoder(encoder, decoder)
+
+
+def get_train_loader(data_dir):
+    # setup data
+    print("Setting up data")
+    dataset = MNIST(data_dir, download=True, transform=ToTensor())
+    train_loader = utils.data.DataLoader(dataset)
+    return train_loader
diff --git a/06_gpu_and_ml/stable_diffusion/flux.py b/06_gpu_and_ml/stable_diffusion/flux.py
index 3869248f9..91def06e4 100644
--- a/06_gpu_and_ml/stable_diffusion/flux.py
+++ b/06_gpu_and_ml/stable_diffusion/flux.py
@@ -1,7 +1,11 @@
 # ---
 # output-directory: "/tmp/flux"
 # ---
-# example originally contributed by [@Arro](https://github.com/Arro)
+# # Run Flux.1 (Schnell) on Modal
+#
+# This example runs the popular [Flux.1-schnell](https://huggingface.co/black-forest-labs/FLUX.1-schnell) text-to-image model on Modal.
+#
+# Thanks to [@Arro](https://github.com/Arro) for the original contribution.
 
 from io import BytesIO
 from pathlib import Path
diff --git a/06_gpu_and_ml/stable_diffusion/playground.py b/06_gpu_and_ml/stable_diffusion/playground.py
index ab5c77ce8..3af85c7f3 100644
--- a/06_gpu_and_ml/stable_diffusion/playground.py
+++ b/06_gpu_and_ml/stable_diffusion/playground.py
@@ -3,7 +3,10 @@
 # output-directory: "/tmp/playground-2-5"
 # args: ["--prompt", "A cinematic shot of a baby raccoon wearing an intricate Italian priest robe."]
 # ---
-
+# # Run Playground v2.5 on Modal
+#
+# This example runs the popular [Playground v2.5](https://huggingface.co/playgroundai/playground-v2.5-1024px-aesthetic) text-to-image model on Modal.
+#
 from pathlib import Path
 
 import fastapi.staticfiles
diff --git a/06_gpu_and_ml/stable_diffusion/stable_diffusion_xl_lightning.py b/06_gpu_and_ml/stable_diffusion/stable_diffusion_xl_lightning.py
index f91479abb..8ca34d91e 100644
--- a/06_gpu_and_ml/stable_diffusion/stable_diffusion_xl_lightning.py
+++ b/06_gpu_and_ml/stable_diffusion/stable_diffusion_xl_lightning.py
@@ -1,3 +1,11 @@
+# ---
+# output-directory: "/tmp/stable-diffusion-xl-lightning"
+# ---
+# # Run SDXL Lightning on Modal
+#
+# This example runs [SDXL-Lightning](https://huggingface.co/ByteDance/SDXL-Lightning) by ByteDance, a fast text-to-image model that generates high-quality images in just a few steps.
+#
+
 from pathlib import Path
 
 import modal
diff --git a/06_gpu_and_ml/stable_diffusion/stable_video_diffusion.py b/06_gpu_and_ml/stable_diffusion/stable_video_diffusion.py
index 5e85c5364..20025c382 100644
--- a/06_gpu_and_ml/stable_diffusion/stable_video_diffusion.py
+++ b/06_gpu_and_ml/stable_diffusion/stable_video_diffusion.py
@@ -1,6 +1,10 @@
 # ---
 # cmd: ["modal", "serve", "06_gpu_and_ml/stable_diffusion/stable_video_diffusion.py"]
 # ---
+# # Run Stable Video Diffusion in a Streamlit app
+#
+# This example runs the [Stable Video Diffusion](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt) image-to-video model.
+
 import os
 import sys
 
diff --git a/07_web_endpoints/count_faces.py b/07_web_endpoints/count_faces.py
index 20d0e96c1..4d0041ec4 100644
--- a/07_web_endpoints/count_faces.py
+++ b/07_web_endpoints/count_faces.py
@@ -2,6 +2,16 @@
 # lambda-test: false
 # ---
 
+# # Run OpenCV face detection on an image
+
+# This example shows how you can use OpenCV on Modal to detect faces in an image. We use
+# the `opencv-python` package both to load the image and to run the face
+# detection. The function `count_faces` takes an image as input and returns
+# the number of faces detected in the image.
+
+# The code below also shows how you can create a web server with Sanic to upload
+# an image and get back the number of faces it contains.
+
 import os
 
 import modal
diff --git a/07_web_endpoints/fastapi_app.py b/07_web_endpoints/fastapi_app.py
index a1ab0782c..5871178df 100644
--- a/07_web_endpoints/fastapi_app.py
+++ b/07_web_endpoints/fastapi_app.py
@@ -2,6 +2,11 @@
 # lambda-test: false
 # ---
 
+# # Deploy a FastAPI app with Modal
+
+# This example shows how you can deploy a [FastAPI](https://fastapi.tiangolo.com/) app with Modal.
+# You can serve any app written in an ASGI-compatible web framework (like FastAPI) using this pattern, or you can serve WSGI-compatible frameworks like Flask with [`wsgi_app`](https://modal.com/docs/guide/webhooks#wsgi).
+
 from typing import Optional
 
 import modal
diff --git a/07_web_endpoints/fasthtml_app.py b/07_web_endpoints/fasthtml_app.py
index 9c2c9c7cf..04a90eb3f 100644
--- a/07_web_endpoints/fasthtml_app.py
+++ b/07_web_endpoints/fasthtml_app.py
@@ -2,6 +2,13 @@
 # deploy: true
 # cmd: ["modal", "serve", "07_web_endpoints/fasthtml_app.py"]
 # ---
+
+# # Deploy a FastHTML app with Modal
+
+# This example shows how you can deploy a FastHTML app with Modal.
+# [FastHTML](https://www.fastht.ml/) is a library that allows you to create entire web applications using only Python.
+
+
 import modal
 
 app = modal.App("example-fasthtml")
diff --git a/07_web_endpoints/flask_app.py b/07_web_endpoints/flask_app.py
index ed43a108d..7ea0345ad 100644
--- a/07_web_endpoints/flask_app.py
+++ b/07_web_endpoints/flask_app.py
@@ -2,6 +2,11 @@
 # lambda-test: false
 # ---
 
+# # Deploy a Flask app with Modal
+
+# This example shows how you can deploy a [Flask](https://flask.palletsprojects.com/en/3.0.x/) app with Modal.
+# You can serve any app written in a WSGI-compatible web framework (like Flask) on Modal with this pattern. You can serve an app written in an ASGI-compatible framework, like FastAPI, with [`asgi_app`](https://modal.com/docs/guide/webhooks#asgi).
+
 import modal
 
 app = modal.App(
diff --git a/07_web_endpoints/flask_streaming.py b/07_web_endpoints/flask_streaming.py
index 1cd4c98f5..241069d73 100644
--- a/07_web_endpoints/flask_streaming.py
+++ b/07_web_endpoints/flask_streaming.py
@@ -2,6 +2,10 @@
 # lambda-test: false
 # ---
 
+# # Deploy a Flask app with streaming results on Modal
+
+# This example shows how you can deploy a [Flask](https://flask.palletsprojects.com/en/3.0.x/) app on Modal that streams results back to the client.
+
 import modal
 
 app = modal.App(
diff --git a/07_web_endpoints/streaming.py b/07_web_endpoints/streaming.py
index 764c09f1d..38850efc9 100644
--- a/07_web_endpoints/streaming.py
+++ b/07_web_endpoints/streaming.py
@@ -2,6 +2,10 @@
 # cmd: ["modal", "serve", "07_web_endpoints/streaming.py"]
 # deploy: true
 # ---
+
+# # Deploy a FastAPI app with streaming results on Modal
+# This example shows how you can deploy a [FastAPI](https://fastapi.tiangolo.com/) app on Modal that streams results back to the client.
+
 import asyncio
 import time
 
diff --git a/08_advanced/generators_async.py b/08_advanced/generators_async.py
index d7cb85abf..46663eda1 100644
--- a/08_advanced/generators_async.py
+++ b/08_advanced/generators_async.py
@@ -1,3 +1,8 @@
+# # Run an async generator function on Modal
+
+# This example shows how you can run an async generator function on Modal.
+# Modal natively supports async/await syntax using asyncio.
+
 import modal
 
 app = modal.App("example-generators-async")
diff --git a/08_advanced/parallel_execution.py b/08_advanced/parallel_execution.py
index 5ddad3559..908f237ba 100644
--- a/08_advanced/parallel_execution.py
+++ b/08_advanced/parallel_execution.py
@@ -1,3 +1,9 @@
+# # Parallel execution on Modal with `spawn`
+
+# This example shows how you can run multiple functions in parallel on Modal.
+# We use the `spawn` method to start a function call and get a handle to its result.
+# The `get` method on that handle retrieves the result of the function call.
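+#
+# The pattern looks roughly like this (a sketch using a hypothetical Modal Function `step`):
+#
+# ```python
+# calls = [step.spawn(i) for i in range(10)]  # all ten calls start running remotely
+# results = [call.get(timeout=600) for call in calls]  # block until each result is ready
+# ```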
+
 import time
 
 import modal
diff --git a/08_advanced/poll_delayed_result.py b/08_advanced/poll_delayed_result.py
index 6990d7285..285adea79 100644
--- a/08_advanced/poll_delayed_result.py
+++ b/08_advanced/poll_delayed_result.py
@@ -1,6 +1,14 @@
 # ---
 # lambda-test: false
 # ---
+
+# # Polling for a delayed result on Modal
+
+# This example shows how you can poll for a delayed result on Modal.
+
+# The function `factor_number` takes a number as input and returns its prime factors. The function could take a long time to run, so we don't want to block the web server while waiting for the result.
+# Instead, we return a URL that the client can poll to get the result.
+
 import fastapi
 import modal
 from modal.functions import FunctionCall
diff --git a/10_integrations/pyjulia.py b/10_integrations/pyjulia.py
index d98b52395..df7cca4f7 100644
--- a/10_integrations/pyjulia.py
+++ b/10_integrations/pyjulia.py
@@ -1,3 +1,7 @@
+# # Run Julia code from Python using PyJulia
+
+# This example shows how you can run Julia code from Python using the PyJulia package.
+
 import modal
 
 image = image = (
diff --git a/10_integrations/webscraper.py b/10_integrations/webscraper.py
index ad3c0d534..bb351763b 100644
--- a/10_integrations/webscraper.py
+++ b/10_integrations/webscraper.py
@@ -1,6 +1,11 @@
 # ---
 # runtimes: ["runc", "gvisor"]
 # ---
+
+# # Web Scraping on Modal
+
+# This example shows how you can scrape links from a website and post them to a Slack channel using Modal.
+
 import os
 
 import modal
diff --git a/misc/lmdeploy_oai_compatible.py b/misc/lmdeploy_oai_compatible.py
index d82d41b86..c8794874e 100644
--- a/misc/lmdeploy_oai_compatible.py
+++ b/misc/lmdeploy_oai_compatible.py
@@ -1,3 +1,7 @@
+# # Deploy a model with `lmdeploy`
+#
+# This example deploys a model using [lmdeploy](https://github.com/InternLM/lmdeploy) with an OpenAI-compatible API.
+
 import subprocess
 
 import modal
diff --git a/misc/say_hello_cron.py b/misc/say_hello_cron.py
index ab45bc0eb..e6c1ac566 100644
--- a/misc/say_hello_cron.py
+++ b/misc/say_hello_cron.py
@@ -2,6 +2,10 @@
 # lambda-test: false
 # ---
 
+# # Deploy a cron job with Modal
+
+# This example shows how you can deploy a cron job with Modal.
+
 import time
 from datetime import datetime, timezone
 
diff --git a/misc/stable_lm.py b/misc/stable_lm.py
index 3d4655357..48060eec9 100644
--- a/misc/stable_lm.py
+++ b/misc/stable_lm.py
@@ -1,3 +1,7 @@
+# # Run the StableLM text completion model
+
+# This example shows how you can run [`stabilityai/stablelm-tuned-alpha-7b`](https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b) on Modal.
+
 import os
 import time
 from pathlib import Path
diff --git a/misc/tgi_oai_compatible.py b/misc/tgi_oai_compatible.py
index dab03639c..aebce5e0b 100644
--- a/misc/tgi_oai_compatible.py
+++ b/misc/tgi_oai_compatible.py
@@ -1,3 +1,7 @@
+# # Run TGI on Modal
+
+# This example shows how you can run LLMs with the [Text Generation Inference (TGI)](https://huggingface.co/docs/text-generation-inference/en/index) inference framework on Modal.
+
 import subprocess
 
 import modal
diff --git a/misc/tqdm_progress_bar.py b/misc/tqdm_progress_bar.py
index 46453c338..ce22a3a99 100644
--- a/misc/tqdm_progress_bar.py
+++ b/misc/tqdm_progress_bar.py
@@ -1,3 +1,7 @@
+# # Show a progress bar with tqdm on Modal
+
+# This example shows how you can display a progress bar with [tqdm](https://github.com/tqdm/tqdm) on Modal.
+
 import time
 
 import modal
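+
+# A minimal sketch of the idea (assuming an `app` and an image with `tqdm` installed,
+# neither of which is shown in this diff):
+#
+# ```python
+# @app.function(image=modal.Image.debian_slim().pip_install("tqdm"))
+# def long_job():
+#     from tqdm import tqdm
+#
+#     for _ in tqdm(range(100)):  # progress output is streamed back to your terminal
+#         time.sleep(0.1)
+# ```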