diff --git a/06_gpu_and_ml/stable_diffusion/demo_images/dog.png b/06_gpu_and_ml/stable_diffusion/demo_images/dog.png deleted file mode 100644 index 022c2d13a..000000000 Binary files a/06_gpu_and_ml/stable_diffusion/demo_images/dog.png and /dev/null differ diff --git a/06_gpu_and_ml/stable_diffusion/foo.json b/06_gpu_and_ml/stable_diffusion/foo.json deleted file mode 100644 index cb61dad86..000000000 --- a/06_gpu_and_ml/stable_diffusion/foo.json +++ /dev/null @@ -1 +0,0 @@ -{"last_node_id": 9, "last_link_id": 9, "nodes": [{"id": 7, "type": "CLIPTextEncode", "pos": [413, 389], "size": {"0": 425.27801513671875, "1": 180.6060791015625}, "flags": {}, "order": 3, "mode": 0, "inputs": [{"name": "clip", "type": "CLIP", "link": 5}], "outputs": [{"name": "CONDITIONING", "type": "CONDITIONING", "links": [6], "slot_index": 0}], "properties": {"Node name for S&R": "CLIPTextEncode"}, "widgets_values": ["text, watermark"]}, {"id": 6, "type": "CLIPTextEncode", "pos": [415, 186], "size": {"0": 422.84503173828125, "1": 164.31304931640625}, "flags": {}, "order": 2, "mode": 0, "inputs": [{"name": "clip", "type": "CLIP", "link": 3}], "outputs": [{"name": "CONDITIONING", "type": "CONDITIONING", "links": [4], "slot_index": 0}], "properties": {"Node name for S&R": "CLIPTextEncode"}, "widgets_values": ["beautiful scenery nature glass bottle landscape, , purple galaxy bottle,"]}, {"id": 5, "type": "EmptyLatentImage", "pos": [473, 609], "size": {"0": 315, "1": 106}, "flags": {}, "order": 0, "mode": 0, "outputs": [{"name": "LATENT", "type": "LATENT", "links": [2], "slot_index": 0}], "properties": {"Node name for S&R": "EmptyLatentImage"}, "widgets_values": [512, 512, 1]}, {"id": 3, "type": "KSampler", "pos": [863, 186], "size": {"0": 315, "1": 262}, "flags": {}, "order": 4, "mode": 0, "inputs": [{"name": "model", "type": "MODEL", "link": 1}, {"name": "positive", "type": "CONDITIONING", "link": 4}, {"name": "negative", "type": "CONDITIONING", "link": 6}, {"name": "latent_image", "type": "LATENT", "link": 2}], "outputs": [{"name": "LATENT", "type": "LATENT", "links": [7], "slot_index": 0}], "properties": {"Node name for S&R": "KSampler"}, "widgets_values": [156680208700286, "randomize", 20, 8, "euler", "normal", 1]}, {"id": 8, "type": "VAEDecode", "pos": [1209, 188], "size": {"0": 210, "1": 46}, "flags": {}, "order": 5, "mode": 0, "inputs": [{"name": "samples", "type": "LATENT", "link": 7}, {"name": "vae", "type": "VAE", "link": 8}], "outputs": [{"name": "IMAGE", "type": "IMAGE", "links": [9], "slot_index": 0}], "properties": {"Node name for S&R": "VAEDecode"}}, {"id": 9, "type": "SaveImage", "pos": [1451, 189], "size": {"0": 210, "1": 58}, "flags": {}, "order": 6, "mode": 0, "inputs": [{"name": "images", "type": "IMAGE", "link": 9}], "properties": {}, "widgets_values": ["ComfyUI"]}, {"id": 4, "type": "CheckpointLoaderSimple", "pos": [26, 474], "size": {"0": 315, "1": 98}, "flags": {}, "order": 1, "mode": 0, "outputs": [{"name": "MODEL", "type": "MODEL", "links": [1], "slot_index": 0}, {"name": "CLIP", "type": "CLIP", "links": [3, 5], "slot_index": 1}, {"name": "VAE", "type": "VAE", "links": [8], "slot_index": 2}], "properties": {"Node name for S&R": "CheckpointLoaderSimple"}, "widgets_values": ["dreamlike-photoreal-2.0.safetensors"]}], "links": [[1, 4, 0, 3, 0, "MODEL"], [2, 5, 0, 3, 3, "LATENT"], [3, 4, 1, 6, 0, "CLIP"], [4, 6, 0, 3, 1, "CONDITIONING"], [5, 4, 1, 7, 0, "CLIP"], [6, 7, 0, 3, 2, "CONDITIONING"], [7, 3, 0, 8, 0, "LATENT"], [8, 4, 2, 8, 1, "VAE"], [9, 8, 0, 9, 0, "IMAGE"]], "groups": [], "config": {}, 
"extra": {}, "version": 0.4} \ No newline at end of file diff --git a/06_gpu_and_ml/stable_diffusion/huggingface_token.png b/06_gpu_and_ml/stable_diffusion/huggingface_token.png deleted file mode 100644 index 618b48d24..000000000 Binary files a/06_gpu_and_ml/stable_diffusion/huggingface_token.png and /dev/null differ diff --git a/06_gpu_and_ml/stable_diffusion/stable_diffusion_cli.py b/06_gpu_and_ml/stable_diffusion/stable_diffusion_cli.py index 9bfc2c84d..2494abbd7 100644 --- a/06_gpu_and_ml/stable_diffusion/stable_diffusion_cli.py +++ b/06_gpu_and_ml/stable_diffusion/stable_diffusion_cli.py @@ -2,12 +2,13 @@ # output-directory: "/tmp/stable-diffusion" # args: ["--prompt", "A 1600s oil painting of the New York City skyline"] # tags: ["use-case-image-video-3d"] +# deploy: true # --- -# # Run Stable Diffusion 3.5 Large Turbo from the command line +# # Run Stable Diffusion 3.5 Large Turbo as a CLI, API, and web UI # This example shows how to run [Stable Diffusion 3.5 Large Turbo](https://huggingface.co/stabilityai/stable-diffusion-3.5-large-turbo) on Modal -# and generate images from your local command line. +# to generate images from your local command line, via an API, and as a web UI. # Inference takes about one minute to cold start, # at which point images are generated at a rate of one image every 1-2 seconds @@ -47,19 +48,20 @@ .pip_install( "accelerate==0.33.0", "diffusers==0.31.0", + "fastapi[standard]==0.115.4", "huggingface-hub[hf_transfer]==0.25.2", "sentencepiece==0.2.0", "torch==2.5.1", "torchvision==0.20.1", "transformers~=4.44.0", ) - .entrypoint([]) # deactivate default entrypoint to reduce log verbosity .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}) # faster downloads ) with image.imports(): import diffusers import torch + from fastapi import Response # ## Implementing SD3.5 Large Turbo inference on Modal @@ -67,9 +69,14 @@ # that ensures models are downloaded when we `build` our container image (just like our dependencies) # and that models are loaded and then moved to the GPU when a new container starts. -# The `run_inference` function just wraps a `diffusers` pipeline. +# The `run` function just wraps a `diffusers` pipeline. # It sends the output image back to the client as bytes. +# We also include a `web` wrapper that makes it possible +# to trigger inference via an API call. +# See the `/docs` route of the URL ending in `inference-web.modal.run` +# that appears when you deploy the app for details. + model_id = "adamo1139/stable-diffusion-3.5-large-turbo-ungated" model_revision_id = "9ad870ac0b0e5e48ced156bb02f85d324b7275d2" @@ -79,7 +86,7 @@ gpu="H100", timeout=10 * MINUTES, ) -class StableDiffusion: +class Inference: @modal.build() @modal.enter() def initialize(self): @@ -94,7 +101,7 @@ def move_to_gpu(self): self.pipe.to("cuda") @modal.method() - def run_inference( + def run( self, prompt: str, batch_size: int = 4, seed: int = None ) -> list[bytes]: seed = seed if seed is not None else random.randint(0, 2**32 - 1) @@ -116,8 +123,17 @@ def run_inference( torch.cuda.empty_cache() # reduce fragmentation return image_output + @modal.web_endpoint(docs=True) + def web(self, prompt: str, seed: int = None): + return Response( + content=self.run.local( # run in the same container + prompt, batch_size=1, seed=seed + )[0], + media_type="image/png", + ) + -# ## Generating images from the command line +# ## Generating Stable Diffusion images from the command line # This is the command we'll use to generate images. 
It takes a text `prompt`, # a `batch_size` that determines the number of images to generate per prompt, @@ -125,6 +141,11 @@ def run_inference( # You can also provide a `seed` to make sampling more deterministic. +# Run it with +# ```bash +# modal run stable_diffusion_cli.py +# ``` + @app.local_entrypoint() def entrypoint( @@ -144,11 +165,11 @@ def entrypoint( output_dir = Path("/tmp/stable-diffusion") output_dir.mkdir(exist_ok=True, parents=True) - sd = StableDiffusion() + inference_service = Inference() for sample_idx in range(samples): start = time.time() - images = sd.run_inference.remote(prompt, batch_size, seed) + images = inference_service.run.remote(prompt, batch_size, seed) duration = time.time() - start print(f"Run {sample_idx+1} took {duration:.3f}s") if sample_idx: @@ -169,5 +190,56 @@ def entrypoint( output_path.write_bytes(image_bytes) +# ## Generating Stable Diffusion images in a web UI + +# Lastly, we add a simple web application that exposes a front-end (written in Alpine.js) for +# our image generation backend. + +# The `Inference` class will serve multiple users from its own shared pool of warm GPU containers automatically. + +# We can deploy this with `modal deploy stable_diffusion_cli.py`. + +frontend_path = Path(__file__).parent / "frontend" + +web_image = modal.Image.debian_slim(python_version="3.12").pip_install( + "jinja2", "fastapi[standard]==0.115.4" +) + + +@app.function( + image=web_image, + mounts=[modal.Mount.from_local_dir(frontend_path, remote_path="/assets")], + allow_concurrent_inputs=1000, +) +@modal.asgi_app() +def ui(): + import fastapi.staticfiles + from fastapi import FastAPI, Request + from fastapi.templating import Jinja2Templates + + web_app = FastAPI() + templates = Jinja2Templates(directory="/assets") + + @web_app.get("/") + async def read_root(request: Request): + return templates.TemplateResponse( + "index.html", + { + "request": request, + "inference_url": Inference.web.web_url, + "model_name": "Stable Diffusion 3.5 Large Turbo", + "default_prompt": "A cinematic shot of a baby raccoon wearing an intricate italian priest robe.", + }, + ) + + web_app.mount( + "/static", + fastapi.staticfiles.StaticFiles(directory="/assets"), + name="static", + ) + + return web_app + + def slugify(s: str) -> str: return "".join(c if c.isalnum() else "-" for c in s).strip("-") diff --git a/06_gpu_and_ml/stable_diffusion/stable_diffusion_latencies.png b/06_gpu_and_ml/stable_diffusion/stable_diffusion_latencies.png deleted file mode 100644 index 1a70edc1f..000000000 Binary files a/06_gpu_and_ml/stable_diffusion/stable_diffusion_latencies.png and /dev/null differ diff --git a/06_gpu_and_ml/stable_diffusion/stable_diffusion_montage.png b/06_gpu_and_ml/stable_diffusion/stable_diffusion_montage.png deleted file mode 100644 index 593919725..000000000 Binary files a/06_gpu_and_ml/stable_diffusion/stable_diffusion_montage.png and /dev/null differ diff --git a/06_gpu_and_ml/stable_diffusion/stable_diffusion_turbo_input.png b/06_gpu_and_ml/stable_diffusion/stable_diffusion_turbo_input.png deleted file mode 100644 index 022c2d13a..000000000 Binary files a/06_gpu_and_ml/stable_diffusion/stable_diffusion_turbo_input.png and /dev/null differ diff --git a/06_gpu_and_ml/stable_diffusion/stable_diffusion_turbo_output.png b/06_gpu_and_ml/stable_diffusion/stable_diffusion_turbo_output.png deleted file mode 100644 index 6123597cf..000000000 Binary files a/06_gpu_and_ml/stable_diffusion/stable_diffusion_turbo_output.png and /dev/null differ diff --git 
a/06_gpu_and_ml/stable_diffusion/stable_diffusion_xl.py b/06_gpu_and_ml/stable_diffusion/stable_diffusion_xl.py deleted file mode 100644 index 14802cef9..000000000 --- a/06_gpu_and_ml/stable_diffusion/stable_diffusion_xl.py +++ /dev/null @@ -1,220 +0,0 @@ -# --- -# output-directory: "/tmp/stable-diffusion-xl" -# args: ["--prompt", "An astronaut riding a green horse"] -# runtimes: ["runc", "gvisor"] -# tags: ["use-case-image-video-3d"] -# --- -# # Stable Diffusion XL 1.0 -# -# This example is similar to the [Stable Diffusion CLI](/docs/examples/stable_diffusion_cli) -# example, but it generates images from the larger SDXL 1.0 model. Specifically, it runs the -# first set of steps with the base model, followed by the refiner model. -# -# [Try out the live demo here!](https://modal-labs--stable-diffusion-xl-ui.modal.run/) The first -# generation may include a cold-start, which takes around 20 seconds. The inference speed depends on the GPU -# and step count (for reference, an A100 runs 40 steps in 8 seconds). - -# ## Basic setup - -import io -from pathlib import Path - -import modal - -# ## Define a container image -# -# To take advantage of Modal's blazing fast cold-start times, we'll need to download our model weights -# inside our container image with a download function. We ignore binaries, ONNX weights and 32-bit weights. -# -# Tip: avoid using global variables in this function to ensure the download step detects model changes and -# triggers a rebuild. - - -sdxl_image = ( - modal.Image.debian_slim(python_version="3.10") - .apt_install( - "libglib2.0-0", "libsm6", "libxrender1", "libxext6", "ffmpeg", "libgl1" - ) - .pip_install( - "diffusers==0.26.3", - "huggingface-hub~=0.25.2", - "invisible_watermark==0.2.0", - "transformers~=4.38.2", - "accelerate==0.27.2", - "safetensors==0.4.2", - "fastapi[standard]==0.115.4", - "pydantic==2.9.2", - "starlette==0.41.2", - ) -) - -app = modal.App("stable-diffusion-xl") - -with sdxl_image.imports(): - import torch - from diffusers import DiffusionPipeline - from fastapi import Response - -# ## Load model and run inference -# -# The container lifecycle [`@enter` decorator](https://modal.com/docs/guide/lifecycle-functions#container-lifecycle-beta) -# loads the model at startup. Then, we evaluate it in the `run_inference` function. -# -# To avoid excessive cold-starts, we set the idle timeout to 240 seconds, meaning once a GPU has loaded the model it will stay -# online for 4 minutes before spinning down. This can be adjusted for cost/experience trade-offs. 
- - -@app.cls(gpu=modal.gpu.A10G(), container_idle_timeout=240, image=sdxl_image) -class Model: - @modal.build() - def build(self): - from huggingface_hub import snapshot_download - - ignore = [ - "*.bin", - "*.onnx_data", - "*/diffusion_pytorch_model.safetensors", - ] - snapshot_download( - "stabilityai/stable-diffusion-xl-base-1.0", ignore_patterns=ignore - ) - snapshot_download( - "stabilityai/stable-diffusion-xl-refiner-1.0", - ignore_patterns=ignore, - ) - - @modal.enter() - def enter(self): - load_options = dict( - torch_dtype=torch.float16, - use_safetensors=True, - variant="fp16", - device_map="auto", - ) - - # Load base model - self.base = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", **load_options - ) - - # Load refiner model - self.refiner = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-refiner-1.0", - text_encoder_2=self.base.text_encoder_2, - vae=self.base.vae, - **load_options, - ) - - # Compiling the model graph is JIT so this will increase inference time for the first run - # but speed up subsequent runs. Uncomment to enable. - # self.base.unet = torch.compile(self.base.unet, mode="reduce-overhead", fullgraph=True) - # self.refiner.unet = torch.compile(self.refiner.unet, mode="reduce-overhead", fullgraph=True) - - def _inference(self, prompt, n_steps=24, high_noise_frac=0.8): - negative_prompt = "disfigured, ugly, deformed" - image = self.base( - prompt=prompt, - negative_prompt=negative_prompt, - num_inference_steps=n_steps, - denoising_end=high_noise_frac, - output_type="latent", - ).images - image = self.refiner( - prompt=prompt, - negative_prompt=negative_prompt, - num_inference_steps=n_steps, - denoising_start=high_noise_frac, - image=image, - ).images[0] - - byte_stream = io.BytesIO() - image.save(byte_stream, format="JPEG") - - return byte_stream - - @modal.method() - def inference(self, prompt, n_steps=24, high_noise_frac=0.8): - return self._inference( - prompt, n_steps=n_steps, high_noise_frac=high_noise_frac - ).getvalue() - - @modal.web_endpoint(docs=True) - def web_inference( - self, prompt: str, n_steps: int = 24, high_noise_frac: float = 0.8 - ): - return Response( - content=self._inference( - prompt, n_steps=n_steps, high_noise_frac=high_noise_frac - ).getvalue(), - media_type="image/jpeg", - ) - - -# And this is our entrypoint; where the CLI is invoked. Explore CLI options -# with: `modal run stable_diffusion_xl.py --help - - -@app.local_entrypoint() -def main(prompt: str = "Unicorns and leprechauns sign a peace treaty"): - image_bytes = Model().inference.remote(prompt) - - dir = Path("/tmp/stable-diffusion-xl") - if not dir.exists(): - dir.mkdir(exist_ok=True, parents=True) - - output_path = dir / "output.jpg" - print(f"Saving it to {output_path}") - with open(output_path, "wb") as f: - f.write(image_bytes) - - -# ## A user interface -# -# Here we ship a simple web application that exposes a front-end (written in Alpine.js) for -# our backend deployment. -# -# The Model class will serve multiple users from its own shared pool of warm GPU containers automatically. -# -# We can deploy this with `modal deploy stable_diffusion_xl.py`. -# -# Because the `web_endpoint` decorator on our `web_inference` function has the `docs` flag set to `True`, -# we also get interactive documentation for our endpoint at `/docs`. 
- -frontend_path = Path(__file__).parent / "frontend" - -web_image = modal.Image.debian_slim().pip_install("jinja2") - - -@app.function( - image=web_image, - mounts=[modal.Mount.from_local_dir(frontend_path, remote_path="/assets")], - allow_concurrent_inputs=20, -) -@modal.asgi_app() -def ui(): - import fastapi.staticfiles - from fastapi import FastAPI, Request - from fastapi.templating import Jinja2Templates - - web_app = FastAPI() - templates = Jinja2Templates(directory="/assets") - - @web_app.get("/") - async def read_root(request: Request): - return templates.TemplateResponse( - "index.html", - { - "request": request, - "inference_url": Model.web_inference.web_url, - "model_name": "Stable Diffusion XL", - "default_prompt": "A cinematic shot of a baby raccoon wearing an intricate italian priest robe.", - }, - ) - - web_app.mount( - "/static", - fastapi.staticfiles.StaticFiles(directory="/assets"), - name="static", - ) - - return web_app diff --git a/06_gpu_and_ml/stable_diffusion/stable_diffusion_xl_lightning.py b/06_gpu_and_ml/stable_diffusion/stable_diffusion_xl_lightning.py deleted file mode 100644 index 1ba491114..000000000 --- a/06_gpu_and_ml/stable_diffusion/stable_diffusion_xl_lightning.py +++ /dev/null @@ -1,165 +0,0 @@ -# --- -# output-directory: "/tmp/stable-diffusion-xl-lightning" -# tags: ["use-case-image-video-3d"] -# --- -# # Run SDXL Lightning on Modal -# -# This example runs [SDXL-Lightning](https://huggingface.co/ByteDance/SDXL-Lightning) by ByteDance, a fast text-to-image model that generates high quality images in just a few steps. -# - -from pathlib import Path - -import modal - -app = modal.App("stable-diffusion-xl-lightning") - -image = modal.Image.debian_slim(python_version="3.11").pip_install( - "diffusers==0.26.3", - "huggingface-hub==0.25.2", - "transformers~=4.37.2", - "accelerate==0.27.2", - "fastapi[standard]==0.115.4", - "pydantic==2.9.2", - "starlette==0.41.2", -) - -base = "stabilityai/stable-diffusion-xl-base-1.0" -repo = "ByteDance/SDXL-Lightning" -ckpt = "sdxl_lightning_4step_unet.safetensors" - - -with image.imports(): - import io - - import torch - from diffusers import ( - EulerDiscreteScheduler, - StableDiffusionXLPipeline, - UNet2DConditionModel, - ) - from fastapi import Response - from huggingface_hub import hf_hub_download - from safetensors.torch import load_file - - -@app.cls(image=image, gpu="a100") -class Model: - @modal.build() - @modal.enter() - def load_weights(self): - unet = UNet2DConditionModel.from_config(base, subfolder="unet").to( - "cuda", torch.float16 - ) - unet.load_state_dict( - load_file(hf_hub_download(repo, ckpt), device="cuda") - ) - self.pipe = StableDiffusionXLPipeline.from_pretrained( - base, unet=unet, torch_dtype=torch.float16, variant="fp16" - ).to("cuda") - - self.pipe.scheduler = EulerDiscreteScheduler.from_config( - self.pipe.scheduler.config, timestep_spacing="trailing" - ) - - def _inference(self, prompt, n_steps=4): - negative_prompt = "disfigured, ugly, deformed" - image = self.pipe( - prompt=prompt, - guidance_scale=0, - negative_prompt=negative_prompt, - num_inference_steps=n_steps, - ).images[0] - - byte_stream = io.BytesIO() - image.save(byte_stream, format="JPEG") - - return byte_stream - - @modal.method() - def inference(self, prompt, n_steps=4): - return self._inference( - prompt, - n_steps=n_steps, - ).getvalue() - - @modal.web_endpoint(docs=True) - def web_inference(self, prompt, n_steps=4): - return Response( - content=self._inference( - prompt, - n_steps=n_steps, - ).getvalue(), - 
media_type="image/jpeg", - ) - - -# And this is our entrypoint; where the CLI is invoked. Run this example -# with: `modal run stable_diffusion_xl_lightning.py --prompt "An astronaut riding a green horse"` - - -@app.local_entrypoint() -def main( - prompt: str = "in the style of Dali, a surrealist painting of a weasel in a tuxedo riding a bicycle in the rain", -): - image_bytes = Model().inference.remote(prompt) - - dir = Path("/tmp/stable-diffusion-xl-lightning") - if not dir.exists(): - dir.mkdir(exist_ok=True, parents=True) - - output_path = dir / "output.png" - print(f"Saving it to {output_path}") - with open(output_path, "wb") as f: - f.write(image_bytes) - - -# ## A user interface -# -# Here we ship a simple web application that exposes a front-end (written in Alpine.js) for -# our backend deployment. -# -# The Model class will serve multiple users from a its own shared pool of warm GPU containers automatically. -# -# We can deploy this with `modal deploy stable_diffusion_xl_lightning.py`. -# -# Because the `web_endpoint` decorator on our `web_inference` function has the `docs` flag set to `True`, -# we also get interactive documentation for our endpoint at `/docs`. - -frontend_path = Path(__file__).parent / "frontend" - -web_image = modal.Image.debian_slim().pip_install("jinja2") - - -@app.function( - image=web_image, - mounts=[modal.Mount.from_local_dir(frontend_path, remote_path="/assets")], - allow_concurrent_inputs=20, -) -@modal.asgi_app() -def ui(): - import fastapi.staticfiles - from fastapi import FastAPI, Request - from fastapi.templating import Jinja2Templates - - web_app = FastAPI() - templates = Jinja2Templates(directory="/assets") - - @web_app.get("/") - async def read_root(request: Request): - return templates.TemplateResponse( - "index.html", - { - "request": request, - "inference_url": Model.web_inference.web_url, - "model_name": "Stable Diffusion XL Lightning", - "default_prompt": "A cinematic shot of a baby raccoon wearing an intricate Italian priest robe.", - }, - ) - - web_app.mount( - "/static", - fastapi.staticfiles.StaticFiles(directory="/assets"), - name="static", - ) - - return web_app diff --git a/06_gpu_and_ml/stable_diffusion/stable_diffusion_xl_turbo.py b/06_gpu_and_ml/stable_diffusion/stable_diffusion_xl_turbo.py deleted file mode 100644 index 16e577ce1..000000000 --- a/06_gpu_and_ml/stable_diffusion/stable_diffusion_xl_turbo.py +++ /dev/null @@ -1,132 +0,0 @@ -# --- -# output-directory: "/tmp/stable-diffusion-xl-turbo" -# args: [] -# runtimes: ["runc", "gvisor"] -# tags: ["use-case-image-video-3d"] -# --- -# # Stable Diffusion XL Turbo Image-to-image -# -# This example is similar to the [Stable Diffusion XL](/docs/examples/stable_diffusion_xl) -# example, but it's a distilled model trained for real-time synthesis and is image-to-image. Learn more about it [here](https://stability.ai/news/stability-ai-sdxl-turbo). 
-# -# Input prompt: -# `dog wizard, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney, 8k` -# -# Input | Output -# :-------------------------:|:-------------------------: -# ![](./stable_diffusion_turbo_input.png) | ![](./stable_diffusion_turbo_output.png) - -# ## Basic setup - -from io import BytesIO -from pathlib import Path - -import modal - -# ## Define a container image - - -image = modal.Image.debian_slim().pip_install( - "Pillow~=10.1.0", - "diffusers~=0.24.0", - "huggingface-hub~=0.25.2", - "transformers~=4.35.2", # This is needed for `import torch` - "accelerate~=0.25.0", # Allows `device_map="auto"``, which allows computation of optimized device_map - "safetensors~=0.4.1", # Enables safetensor format as opposed to using unsafe pickle format -) - -app = modal.App("stable-diffusion-xl-turbo", image=image) - -with image.imports(): - import torch - from diffusers import AutoPipelineForImage2Image - from diffusers.utils import load_image - from huggingface_hub import snapshot_download - from PIL import Image - - -# ## Load model and run inference -# -# The container lifecycle [`@enter` decorator](https://modal.com/docs/guide/lifecycle-functions#container-lifecycle-beta) -# loads the model at startup. Then, we evaluate it in the `inference` function. -# -# To avoid excessive cold-starts, we set the idle timeout to 240 seconds, meaning once a GPU has loaded the model it will stay -# online for 4 minutes before spinning down. This can be adjusted for cost/experience trade-offs. - - -@app.cls(gpu=modal.gpu.A10G(), container_idle_timeout=240) -class Model: - @modal.build() - def download_models(self): - # Ignore files that we don't need to speed up download time. - ignore = [ - "*.bin", - "*.onnx_data", - "*/diffusion_pytorch_model.safetensors", - ] - - snapshot_download("stabilityai/sdxl-turbo", ignore_patterns=ignore) - - @modal.enter() - def enter(self): - self.pipe = AutoPipelineForImage2Image.from_pretrained( - "stabilityai/sdxl-turbo", - torch_dtype=torch.float16, - variant="fp16", - device_map="auto", - ) - - @modal.method() - def inference(self, image_bytes, prompt): - init_image = load_image(Image.open(BytesIO(image_bytes))).resize( - (512, 512) - ) - num_inference_steps = 4 - strength = 0.9 - # "When using SDXL-Turbo for image-to-image generation, make sure that num_inference_steps * strength is larger or equal to 1" - # See: https://huggingface.co/stabilityai/sdxl-turbo - assert num_inference_steps * strength >= 1 - - image = self.pipe( - prompt, - image=init_image, - num_inference_steps=num_inference_steps, - strength=strength, - guidance_scale=0.0, - ).images[0] - - byte_stream = BytesIO() - image.save(byte_stream, format="PNG") - image_bytes = byte_stream.getvalue() - - return image_bytes - - -DEFAULT_IMAGE_PATH = Path(__file__).parent / "demo_images/dog.png" - - -@app.local_entrypoint() -def main( - image_path=DEFAULT_IMAGE_PATH, - prompt="dog wizard, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney, 8k", -): - with open(image_path, "rb") as image_file: - input_image_bytes = image_file.read() - output_image_bytes = Model().inference.remote(input_image_bytes, prompt) - - dir = Path("/tmp/stable-diffusion-xl-turbo") - if not dir.exists(): - dir.mkdir(exist_ok=True, parents=True) - - output_path = dir / "output.png" - print(f"Saving it to {output_path}") - with open(output_path, "wb") as f: - f.write(output_image_bytes) - - -# ## Running the model -# -# We can run the model with different parameters using the following 
command, -# ``` -# modal run stable_diffusion_xl_turbo.py --prompt="harry potter, glasses, wizard" --image-path="dog.png" -# ```
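
Since this diff adds a `@modal.web_endpoint(docs=True)` method (`Inference.web`) to `stable_diffusion_cli.py`, a quick way to exercise it after `modal deploy stable_diffusion_cli.py` is a plain HTTP request. This is a minimal client sketch, not part of the PR: the endpoint URL below is a placeholder (your deploy prints the real one, ending in `inference-web.modal.run`), and the prompt/seed values are arbitrary.

```python
# Minimal sketch of calling the newly added `web` endpoint over HTTP.
# Assumption: ENDPOINT_URL is a placeholder -- substitute the
# `...-inference-web.modal.run` URL printed by `modal deploy`.
import urllib.parse
import urllib.request
from pathlib import Path

ENDPOINT_URL = "https://your-workspace--your-app-inference-web.modal.run"  # placeholder

params = urllib.parse.urlencode(
    {"prompt": "A 1600s oil painting of the New York City skyline", "seed": 42}
)
# The endpoint returns the generated image as raw PNG bytes.
image_bytes = urllib.request.urlopen(f"{ENDPOINT_URL}?{params}").read()

output_path = Path("/tmp/stable-diffusion/web_output.png")
output_path.parent.mkdir(exist_ok=True, parents=True)
output_path.write_bytes(image_bytes)
print(f"Saved {len(image_bytes)} bytes to {output_path}")
```

The interactive `/docs` page on the same URL (enabled by `docs=True`) shows the same `prompt` and `seed` query parameters and can be used instead of a script for one-off tests.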