From 69ab06b9bdd7a65f0e45145f85cc38efb16d5819 Mon Sep 17 00:00:00 2001
From: Charles Frye
Date: Tue, 12 Nov 2024 19:53:23 -0800
Subject: [PATCH] update dreambooth example to emphasize lora, download weights
 in volume (#969)

---
 06_gpu_and_ml/dreambooth/dreambooth_app.py | 219 +++++++++++----------
 1 file changed, 116 insertions(+), 103 deletions(-)

diff --git a/06_gpu_and_ml/dreambooth/dreambooth_app.py b/06_gpu_and_ml/dreambooth/dreambooth_app.py
index 73e07969d..ddf1a818a 100644
--- a/06_gpu_and_ml/dreambooth/dreambooth_app.py
+++ b/06_gpu_and_ml/dreambooth/dreambooth_app.py
@@ -2,79 +2,81 @@
 # deploy: true
 # tags: ["use-case-image-video-3d", "use-case-finetuning", "featured"]
 # ---
-#
-# # Custom Pet Art from Flux with Hugging Face and Gradio
-#
+
+# # Train a character LoRA for Flux with Hugging Face and Gradio
+
 # This example finetunes the [Flux.1-dev model](https://huggingface.co/black-forest-labs/FLUX.1-dev)
 # on images of a pet (by default, a puppy named Qwerty)
 # using a technique called textual inversion from [the "Dreambooth" paper](https://dreambooth.github.io/).
 # Effectively, it teaches a general image generation model a new "proper noun",
 # allowing for the personalized generation of art and photos.
-#
+# We supplement textual inversion with low-rank adaptation (LoRA)
+# for increased efficiency during training.
+
 # It then makes the model shareable with others -- without costing $25/day for a GPU server --
 # by hosting a [Gradio app](https://gradio.app/) on Modal.
-#
+
 # It demonstrates a simple, productive, and cost-effective pathway
 # to building on large pretrained models using Modal's building blocks, like
-# [GPU-accelerated](https://modal.com/docs/guide/gpu) Modal functions and classes for compute-intensive work,
-# [volumes](https://modal.com/docs/guide/volumes) for storage,
+# [GPU-accelerated](https://modal.com/docs/guide/gpu) Modal Functions and Classes for compute-intensive work,
+# [Volumes](https://modal.com/docs/guide/volumes) for storage,
 # and [web endpoints](https://modal.com/docs/guide/webhooks) for serving.
-#
+
 # And with some light customization, you can use it to generate images of your pet!
-#
+
 # ![Gradio.app image generation interface](./gradio-image-generate.png)
-#
+
 # You can find a video walkthrough of this example on the Modal YouTube channel
 # [here](https://www.youtube.com/watch?v=df-8fiByXMI).
-#
+
 # ## Imports and setup
-#
+
 # We start by importing the necessary libraries and setting up the environment.
-# By installing Modal, we already brought in the FastAPI library we'll use to serve our app,
-# so we import it here.
 
 from dataclasses import dataclass
 from pathlib import Path
 
 import modal
-from fastapi import FastAPI
-from fastapi.responses import FileResponse
 
 # ## Building up the environment
-#
+
 # Machine learning environments are complex, and the dependencies can be hard to manage.
-# Modal makes creating and working with environments easy via containers and container images.
-#
+# Modal makes creating and working with environments easy via
+# [containers and container images](https://modal.com/docs/guide/custom-container).
+
 # We start from a base image and specify all of our dependencies.
 # We'll call out the interesting ones as they come up below.
 # Note that these dependencies are not installed locally
-# -- they are only installed in the remote environment where our app runs.
+# -- they are only installed in the remote environment where our Modal App runs.
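As a quick orientation before the patch continues, this is the minimal shape of the pattern
the example builds on: an App, a container Image, and a Function that runs remotely inside
that Image. A hedged sketch with illustrative names, separate from the patch itself -- it
shows why dependencies only ever need to exist in the remote environment.

import modal

sketch_app = modal.App("environment-sketch")
sketch_image = modal.Image.debian_slim(python_version="3.10").pip_install("torch~=2.2.0")


@sketch_app.function(image=sketch_image)
def check_torch() -> str:
    import torch  # importable here because it was installed into the remote image

    return torch.__version__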
 app = modal.App(name="example-dreambooth-flux")
 
 image = modal.Image.debian_slim(python_version="3.10").pip_install(
     "accelerate==0.31.0",
     "datasets~=2.13.0",
-    "ftfy~=6.1.0",
-    "gradio~=4.29.0",
     "fastapi[standard]==0.115.4",
+    "ftfy~=6.1.0",
+    "gradio~=5.5.0",
+    "huggingface-hub==0.26.2",
+    "hf_transfer==0.1.8",
+    "numpy<2",
+    "peft==0.11.1",
     "pydantic==2.9.2",
-    "starlette==0.41.2",
+    "sentencepiece>=0.1.91,!=0.1.92",
     "smart_open~=6.4.0",
+    "starlette==0.41.2",
     "transformers~=4.41.2",
-    "sentencepiece>=0.1.91,!=0.1.92",
     "torch~=2.2.0",
     "torchvision~=0.16",
     "triton~=2.2.0",
-    "peft==0.11.1",
     "wandb==0.17.6",
 )
 
 # ### Downloading scripts and installing a git repo with `run_commands`
-#
+
 # We'll use an example script from the `diffusers` library to train the model.
 # We acquire it from GitHub and install it in our environment with a series of commands.
-# The container environments Modal functions run in are highly flexible --
+# The container environments Modal Functions run in are highly flexible --
 # see [the docs](https://modal.com/docs/guide/custom-container) for more details.
 
 GIT_SHA = (
@@ -92,8 +94,9 @@
         "cd /root && pip install -e .",
     )
 )
 
+
 # ### Configuration with `dataclass`es
-#
+
 # Machine learning apps often have a lot of configuration information.
 # We collect all of our configuration into dataclasses to avoid scattering special/magic values throughout code.
 
@@ -111,56 +114,57 @@ class SharedConfig:
     model_name: str = "black-forest-labs/FLUX.1-dev"
 
-# ### Downloading weights with `run_function`
-#
-# Not everything we need for an ML app like Pet Dreambooth is available as a Python package
-# or even on GitHub. Sometimes, there is nothing to be done but to execute some code inside the environment.
-# We can do this on Modal with `run_function`.
-#
-# In our case, we use it to download the pretrained model weights for the Stable Diffusion XL model
-# that we'll be finetuning.
-#
+# ### Storing data created by our app with `modal.Volume`
+
+# The tools we've used so far work well for fetching external information,
+# which defines the environment our app runs in,
+# but what about data that we create or modify during the app's execution?
+# A persisted [`modal.Volume`](https://modal.com/docs/guide/volumes) can store and share data across Modal Apps and Functions.
+
+# We'll use one to store both the original and fine-tuned weights we create during training
+# and then load them back in for inference.
+
+volume = modal.Volume.from_name(
+    "dreambooth-finetuning-volume-flux", create_if_missing=True
+)
+MODEL_DIR = "/model"
+
 # Note that access to the Flux.1-dev model on Hugging Face is
 # [gated by a license agreement](https://huggingface.co/docs/hub/en/models-gated) which
 # you must agree to [here](https://huggingface.co/black-forest-labs/FLUX.1-dev).
 # After you have accepted the license, [create a Modal Secret](https://modal.com/secrets)
 # with the name `huggingface` following the instructions in the template.
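Since the Volume defined above is the app's only persistent state, it's worth seeing the
round trip it enables. A hedged sketch, separate from the patch: the marker file is
hypothetical, but `Volume.commit` and `Volume.reload` are the Modal calls for flushing
writes and picking up changes made elsewhere.

@app.function(volumes={MODEL_DIR: volume}, image=image)
def write_marker():
    (Path(MODEL_DIR) / "marker.txt").write_text("weights will go here")
    volume.commit()  # flush changes so other Functions and Apps can see them


@app.function(volumes={MODEL_DIR: volume}, image=image)
def read_marker() -> str:
    volume.reload()  # fetch the latest committed state of the Volume
    return (Path(MODEL_DIR) / "marker.txt").read_text()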
+huggingface_secret = modal.Secret.from_name("huggingface")
 
-def download_models():
-    from diffusers import DiffusionPipeline
-    from transformers.utils import move_cache
-
-    config = SharedConfig()
-
-    DiffusionPipeline.from_pretrained(config.model_name)
-    move_cache()
+image = image.env(
+    {"HF_HUB_ENABLE_HF_TRANSFER": "1"}  # turn on faster downloads from HF
+)
 
-image = image.run_function(
-    download_models, secrets=[modal.Secret.from_name("huggingface")]
+
+@app.function(
+    volumes={MODEL_DIR: volume},
+    image=image,
+    secrets=[huggingface_secret],
+    timeout=600,  # 10 minutes
 )
+def download_models(config):
+    import torch
+    from diffusers import DiffusionPipeline
+    from huggingface_hub import snapshot_download
 
+    snapshot_download(
+        config.model_name,
+        local_dir=MODEL_DIR,
+        ignore_patterns=["*.pt", "*.bin"],  # using safetensors
+    )
 
-# ### Storing data generated by our app with `modal.Volume`
-#
-# The tools we've used so far work well for fetching external information,
-# which defines the environment our app runs in,
-# but what about data that we create or modify during the app's execution?
-# A persisted `modal.Volume` can store and share data across Modal apps or runs of the same app.
-#
-# We'll use one to store the fine-tuned weights we create during training
-# and then load them back in for inference.
+    DiffusionPipeline.from_pretrained(MODEL_DIR, torch_dtype=torch.bfloat16)
 
-volume = modal.Volume.from_name(
-    "dreambooth-finetuning-volume-flux", create_if_missing=True
-)
-MODEL_DIR = "/model"
 
+# ### Load fine-tuning dataset
 
-# ### Load finetuning dataset
-#
-# Part of the magic of the Dreambooth approach is that we only need 3-10 images for finetuning.
+# Part of the magic of low-rank fine-tuning is that we only need 3-10 images for fine-tuning.
 # So we can fetch just a few images, stored on consumer platforms like Imgur or Google Drive,
 # whenever we need them -- no need for expensive, hard-to-maintain data pipelines.
 
@@ -181,38 +185,38 @@ def load_images(image_urls: list[str]) -> Path:
     return img_path
 
-# ## Finetuning a text-to-image model
-#
+# ## Low-Rank Adaptation (LoRA) fine-tuning for a text-to-image model
+
 # The base model we start from is trained to do a sort of "reverse [ekphrasis](https://en.wikipedia.org/wiki/Ekphrasis)":
 # it attempts to recreate a visual work of art or image from only its description.
-#
+
 # We can use the model to synthesize wholly new images
 # by combining the concepts it has learned from the training data.
-#
-# We use a pretrained model, the XL version of Stability AI's Stable Diffusion.
-# In this example, we "finetune" SDXL, making only small adjustments to the weights.
+
+# We use a pretrained model, the Flux model from Black Forest Labs.
+# In this example, we "finetune" Flux, making only small adjustments to the weights.
 # Furthermore, we don't change all the weights in the model.
 # Instead, using a technique called [_low-rank adaptation_](https://arxiv.org/abs/2106.09685),
 # we change a much smaller matrix that works "alongside" the existing weights, nudging the model in the direction we want.
-#
+
 # We can get away with such a small and simple training process because we're just teaching the model the meaning of a single new word: the name of our pet.
-#
+
 # The result is a model that can generate novel images of our pet:
 # as an astronaut in space, as painted by Van Gogh or Basquiat, etc.
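To make "a much smaller matrix that works alongside the existing weights" concrete, here
is a from-scratch sketch of a LoRA layer in PyTorch. It is illustrative only -- the
training run below gets its LoRA implementation from the `diffusers` script and the
`peft` library -- but the math is the same: a frozen weight matrix W is supplemented by
a trainable low-rank product BA.

import torch


class LoRALinear(torch.nn.Module):
    def __init__(self, base: torch.nn.Linear, rank: int = 16, alpha: float = 16.0):
        super().__init__()
        self.base = base
        for param in self.base.parameters():
            param.requires_grad = False  # the pretrained weights stay frozen
        self.A = torch.nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
        self.B = torch.nn.Parameter(torch.zeros(base.out_features, rank))  # zeros: no effect at start
        self.scale = alpha / rank

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # W x plus the low-rank update B(Ax); only A and B are trained
        return self.base(x) + self.scale * (x @ self.A.T @ self.B.T)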
-#
+
 # ### Finetuning with Hugging Face 🧨 Diffusers and Accelerate
-#
+
 # The model weights, training libraries, and training script are all provided by [🤗 Hugging Face](https://huggingface.co).
-#
+
 # You can kick off a training job with the command `modal run dreambooth_app.py::app.train`.
 # It should take about ten minutes.
-#
+
 # Training machine learning models takes time and produces a lot of metadata --
 # metrics for performance and resource utilization,
 # metrics for model quality and training stability,
 # and model inputs and outputs like images and text.
 # This is especially important if you're fiddling around with the configuration parameters.
-#
+
 # This example can optionally use [Weights & Biases](https://wandb.ai) to track all of this training information.
 # Just sign up for an account, switch the flag below, and add your API key as a [Modal secret](https://modal.com/docs/guide/secrets).
 
@@ -222,10 +226,10 @@ def load_images(image_urls: list[str]) -> Path:
 # Check out [this run](https://wandb.ai/cfrye59/dreambooth-lora-sd-xl/runs/ca3v1lsh?workspace=user-cfrye59),
 # which [despite having high GPU utilization](https://wandb.ai/cfrye59/dreambooth-lora-sd-xl/runs/ca3v1lsh/system)
 # suffered from numerical instability during training and produced only black images -- hard to debug without experiment management logs!
-#
+
 # You can read more about how the values in `TrainConfig` are chosen and adjusted [in this blog post on Hugging Face](https://huggingface.co/blog/dreambooth).
 # To run training on images of your own pet, upload the images to separate URLs and edit the contents of the file at `TrainConfig.instance_example_urls_file` to point to them.
-#
+
 # Tip: if the results you're seeing don't match the prompt too well, and instead produce an image
 # of your subject without taking the prompt into account, the model has likely overfit. In this case, repeat training with a lower
 # value of `max_train_steps`. If you used W&B, look back at results earlier in training to determine where to stop.
 
@@ -260,7 +264,7 @@ class TrainConfig(SharedConfig):
 
 @app.function(
     image=image,
-    gpu=modal.gpu.A100(  # fine-tuning is VRAM-heavy and requires an A100 GPU
+    gpu=modal.gpu.A100(  # fine-tuning is VRAM-heavy and requires a high-VRAM GPU
         count=1, size="80GB"
     ),
     volumes={MODEL_DIR: volume},  # stores fine-tuned model
@@ -311,7 +315,7 @@ def _exec_subprocess(cmd: list[str]):
         "launch",
         "examples/dreambooth/train_dreambooth_lora_flux.py",
         "--mixed_precision=bf16",  # half-precision floats most of the time for faster training
-        f"--pretrained_model_name_or_path={config.model_name}",
+        f"--pretrained_model_name_or_path={MODEL_DIR}",
         f"--instance_data_dir={img_path}",
         f"--output_dir={MODEL_DIR}",
         f"--instance_prompt={prompt}",
@@ -343,12 +347,12 @@ def _exec_subprocess(cmd: list[str]):
 
 # ## Running our model
-#
-# To generate images from prompts using our fine-tuned model, we define a Modal function called `inference`.
-#
+
+# To generate images from prompts using our fine-tuned model, we define a Modal Function called `inference`.
+
 # Naively, this would seem to be a bad fit for the flexible, serverless infrastructure of Modal:
 # wouldn't you need to include the steps to load the model and spin it up in every function call?
-#
+
 # In order to initialize the model just once on container startup,
 # we use Modal's [container lifecycle](https://modal.com/docs/guide/lifecycle-functions) features, which require the function to be part
 # of a class.
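For reference, this is the lifecycle pattern in miniature -- a hedged sketch with
hypothetical names (the example's real inference class appears next in the patch).
The method under `@modal.enter()` runs once per container start, so the expensive
model load is amortized across many requests.

@app.cls(image=image, gpu="A100", volumes={MODEL_DIR: volume})
class SketchModel:
    @modal.enter()
    def load(self):
        # runs once, when the container boots
        self.pipeline = load_expensive_pipeline(MODEL_DIR)  # hypothetical loader

    @modal.method()
    def generate(self, prompt: str):
        # runs per request, reusing the already-loaded pipeline
        return self.pipeline(prompt)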
 # Note that the `modal.Volume` we saved the model to is mounted here as well,
@@ -362,14 +366,12 @@ def load_model(self):
         import torch
         from diffusers import DiffusionPipeline
 
-        config = TrainConfig()
-
         # Reload the modal.Volume to ensure the latest state is accessible.
         volume.reload()
 
         # set up a hugging face inference pipeline using our model
         pipe = DiffusionPipeline.from_pretrained(
-            config.model_name,
+            MODEL_DIR,
             torch_dtype=torch.bfloat16,
         ).to("cuda")
         pipe.load_lora_weights(MODEL_DIR)
@@ -387,27 +389,24 @@ def inference(self, text, config):
 
 # ## Wrap the trained model in a Gradio web UI
-#
+
 # [Gradio](https://gradio.app) makes it super easy to expose a model's functionality
 # in an easy-to-use, responsive web interface.
-#
+
 # This model is a text-to-image generator,
 # so we set up an interface that includes a user-entry text box
 # and a frame for displaying images.
-#
+
 # We also provide some example text inputs to help
 # guide users and to kick-start their creative juices.
-#
+
 # And we couldn't resist adding some Modal style to it as well!
-#
+
 # You can deploy the app on Modal with the command
 # `modal deploy dreambooth_app.py`.
 # You'll be able to come back days, weeks, or months later and find it still ready to go,
 # even though you don't have to pay for a server to run while you're not using it.
 
-web_app = FastAPI()
-assets_path = Path(__file__).parent / "assets"
-
 
 @dataclass
 class AppConfig(SharedConfig):
@@ -417,6 +416,9 @@ class AppConfig(SharedConfig):
     guidance_scale: float = 6
 
 
+assets_path = Path(__file__).parent / "assets"
+
+
 @app.function(
     image=image,
     concurrency_limit=1,
@@ -426,8 +428,12 @@ class AppConfig(SharedConfig):
 @modal.asgi_app()
 def fastapi_app():
     import gradio as gr
+    from fastapi import FastAPI
+    from fastapi.responses import FileResponse
     from gradio.routes import mount_gradio_app
 
+    web_app = FastAPI()
+
     # Call out to the inference in a separate Modal environment with a GPU
     def go(text=""):
         if not text:
@@ -473,10 +479,12 @@ async def background():
 
     # add a gradio UI around inference
     with gr.Blocks(
-        theme=theme, css=css, title="Pet Dreambooth on Modal"
+        theme=theme,
+        css=css,
+        title=f"Generate images of {config.instance_name} on Modal",
     ) as interface:
         gr.Markdown(
-            f"# Dream up images of {instance_phrase}.\n\n{description}",
+            f"# Generate images of {instance_phrase}.\n\n{description}",
         )
         with gr.Row():
             inp = gr.Textbox(  # input text component
@@ -513,16 +521,17 @@ async def background():
     )
 
 
-# ## Running your own Dreambooth from the command line
-#
+# ## Running your fine-tuned model from the command line
+
 # You can use the `modal` command-line interface to set up, customize, and deploy this app:
-#
+
 # - `modal run dreambooth_app.py` will train the model. Change the `instance_example_urls_file` to point to your own pet's images.
 # - `modal serve dreambooth_app.py` will [serve](https://modal.com/docs/guide/webhooks#developing-with-modal-serve) the Gradio interface at a temporary location. Great for iterating on code!
 # - `modal shell dreambooth_app.py` is a convenient helper to open a bash [shell](https://modal.com/docs/guide/developing-debugging#interactive-shell) in our image. Great for debugging environment issues.
-#
-# Remember, once you've trained your own fine-tuned model, you can deploy it using `modal deploy dreambooth_app.py`.
-#
+
+# Remember, once you've trained your own fine-tuned model, you can deploy it permanently -- for no cost when it is not being used! --
+# using `modal deploy dreambooth_app.py`.
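Once deployed, the model can also be called from any other Python process, not just
through the web UI. A hedged sketch: the class name "Model", the local import, and the
assumption that `inference` returns PNG-encoded bytes are all illustrative -- substitute
the actual names from the inference class defined above.

import modal

from dreambooth_app import AppConfig  # assumes the script is on your local path

Model = modal.Cls.lookup("example-dreambooth-flux", "Model")  # "Model" is assumed
image_bytes = Model().inference.remote("Qwerty the puppy as an astronaut", AppConfig())

with open("qwerty.png", "wb") as f:
    f.write(image_bytes)  # assuming inference returns PNG-encoded bytes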
+
 # If you just want to try the app out, you can find our deployment [here](https://modal-labs--example-dreambooth-flux-fastapi-app.modal.run).
 
@@ -530,8 +539,12 @@
 def run(  # add more config params here to make training configurable
     max_train_steps: int = 250,
 ):
+    print("🎨 loading model")
+    download_models.remote(SharedConfig())
+    print("🎨 setting up training")
     config = TrainConfig(max_train_steps=max_train_steps)
 
     instance_example_urls = (
         Path(TrainConfig.instance_example_urls_file).read_text().splitlines()
     )
     train.remote(instance_example_urls, config)
+    print("🎨 training finished")
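One usage note to close: Modal turns the parameters of a local entrypoint into
kebab-cased command-line flags, so the training length above can be adjusted
without editing the file:

    modal run dreambooth_app.py --max-train-steps=500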