From b785a155d6bec03b7c6416ce70d103b177e3099f Mon Sep 17 00:00:00 2001
From: Linoy Tsaban <57615435+linoytsaban@users.noreply.github.com>
Date: Mon, 4 Dec 2023 06:11:25 +0200
Subject: [PATCH 1/2] [advanced dreambooth lora sdxl training script] improve
 help tags (#6035)

* improve help tags

* style fix

---------

Co-authored-by: Linoy
---
 .../train_dreambooth_lora_sdxl_advanced.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py b/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py
index b7cb43449a4e..abd169b8bc97 100644
--- a/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py
+++ b/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py
@@ -232,20 +232,26 @@ def parse_args(input_args=None):
         help=(
             "The name of the Dataset (from the HuggingFace hub) containing the training data of instance images (could be your own, possibly private,"
             " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
-            " or to a folder containing files that 🤗 Datasets can understand."
+            " or to a folder containing files that 🤗 Datasets can understand. To load custom captions, the training set directory needs to follow the structure of a "
+            "datasets ImageFolder, containing both the images and the corresponding caption for each image. See "
+            "https://huggingface.co/docs/datasets/image_dataset for more information."
         ),
     )
     parser.add_argument(
         "--dataset_config_name",
         type=str,
         default=None,
-        help="The config of the Dataset, leave as None if there's only one config.",
+        help="The config of the Dataset. In some cases, a dataset may have more than one configuration (for example, "
+        "if it contains different subsets of data and you only wish to load a specific subset); in that case, specify "
+        "the desired configuration using --dataset_config_name. Leave as None if there's only one config.",
     )
     parser.add_argument(
         "--instance_data_dir",
         type=str,
         default=None,
-        help=("A folder containing the training data. "),
+        help="A path to a local folder containing the training data of instance images. Specify this arg instead of "
+        "--dataset_name if you wish to train using a local folder without custom captions. If you wish to train with "
+        "custom captions, please specify --dataset_name instead.",
     )
     parser.add_argument(
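The `--dataset_name` help text above refers to the 🤗 Datasets `ImageFolder` structure for custom captions. As a minimal sketch of that layout (the folder name, file names, and the `prompt` caption column below are illustrative assumptions, not part of the patch):

```python
# Hypothetical captioned training set in 🤗 Datasets ImageFolder layout:
#
#   my_dataset/
#     metadata.jsonl   # one JSON object per image, e.g.
#                      # {"file_name": "0001.png", "prompt": "a photo of TOK dog"}
#     0001.png
#     0002.png

from datasets import load_dataset

# "imagefolder" pairs each image with the extra columns found in metadata.jsonl.
dataset = load_dataset("imagefolder", data_dir="./my_dataset", split="train")
print(dataset[0]["image"])   # PIL image
print(dataset[0]["prompt"])  # the caption stored alongside it
```

A folder like this is passed through `--dataset_name` (with the caption column selected via the script's `--caption_column` argument), while `--instance_data_dir` remains the path for caption-less training, as the updated help strings describe.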
From bf92e746c05b7da59a3ddf233dbc35b2587b3d2f Mon Sep 17 00:00:00 2001
From: gujing <925973396@qq.com>
Date: Mon, 4 Dec 2023 12:36:23 +0800
Subject: [PATCH 2/2] fix StableDiffusionTensorRT super args error (#6009)

---
 .../community/stable_diffusion_tensorrt_img2img.py | 13 +++++++++++--
 .../community/stable_diffusion_tensorrt_inpaint.py | 13 +++++++++++--
 .../community/stable_diffusion_tensorrt_txt2img.py | 13 +++++++++++--
 3 files changed, 33 insertions(+), 6 deletions(-)

diff --git a/examples/community/stable_diffusion_tensorrt_img2img.py b/examples/community/stable_diffusion_tensorrt_img2img.py
index 041cf3a12dbd..507177791f5e 100755
--- a/examples/community/stable_diffusion_tensorrt_img2img.py
+++ b/examples/community/stable_diffusion_tensorrt_img2img.py
@@ -41,7 +41,7 @@
     save_engine,
 )
 from polygraphy.backend.trt import util as trt_util
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.pipelines.stable_diffusion import (
@@ -709,6 +709,7 @@ def __init__(
         scheduler: DDIMScheduler,
         safety_checker: StableDiffusionSafetyChecker,
         feature_extractor: CLIPFeatureExtractor,
+        image_encoder: CLIPVisionModelWithProjection = None,
         requires_safety_checker: bool = True,
         stages=["clip", "unet", "vae", "vae_encoder"],
         image_height: int = 512,
@@ -724,7 +725,15 @@ def __init__(
         timing_cache: str = "timing_cache",
     ):
         super().__init__(
-            vae, text_encoder, tokenizer, unet, scheduler, safety_checker, feature_extractor, requires_safety_checker
+            vae,
+            text_encoder,
+            tokenizer,
+            unet,
+            scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+            requires_safety_checker=requires_safety_checker,
         )

         self.vae.forward = self.vae.decode

diff --git a/examples/community/stable_diffusion_tensorrt_inpaint.py b/examples/community/stable_diffusion_tensorrt_inpaint.py
index 71fa1b0a5f11..b4e16c76159c 100755
--- a/examples/community/stable_diffusion_tensorrt_inpaint.py
+++ b/examples/community/stable_diffusion_tensorrt_inpaint.py
@@ -41,7 +41,7 @@
     save_engine,
 )
 from polygraphy.backend.trt import util as trt_util
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.pipelines.stable_diffusion import (
@@ -710,6 +710,7 @@ def __init__(
         scheduler: DDIMScheduler,
         safety_checker: StableDiffusionSafetyChecker,
         feature_extractor: CLIPFeatureExtractor,
+        image_encoder: CLIPVisionModelWithProjection = None,
         requires_safety_checker: bool = True,
         stages=["clip", "unet", "vae", "vae_encoder"],
         image_height: int = 512,
@@ -725,7 +726,15 @@ def __init__(
         timing_cache: str = "timing_cache",
     ):
         super().__init__(
-            vae, text_encoder, tokenizer, unet, scheduler, safety_checker, feature_extractor, requires_safety_checker
+            vae,
+            text_encoder,
+            tokenizer,
+            unet,
+            scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+            requires_safety_checker=requires_safety_checker,
         )

         self.vae.forward = self.vae.decode
diff --git a/examples/community/stable_diffusion_tensorrt_txt2img.py b/examples/community/stable_diffusion_tensorrt_txt2img.py
index b51f3176b958..c38261463384 100755
--- a/examples/community/stable_diffusion_tensorrt_txt2img.py
+++ b/examples/community/stable_diffusion_tensorrt_txt2img.py
@@ -40,7 +40,7 @@
     save_engine,
 )
 from polygraphy.backend.trt import util as trt_util
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.pipelines.stable_diffusion import (
@@ -624,6 +624,7 @@ def __init__(
         scheduler: DDIMScheduler,
         safety_checker: StableDiffusionSafetyChecker,
         feature_extractor: CLIPFeatureExtractor,
+        image_encoder: CLIPVisionModelWithProjection = None,
         requires_safety_checker: bool = True,
         stages=["clip", "unet", "vae"],
         image_height: int = 768,
@@ -639,7 +640,15 @@ def __init__(
         timing_cache: str = "timing_cache",
     ):
         super().__init__(
-            vae, text_encoder, tokenizer, unet, scheduler, safety_checker, feature_extractor, requires_safety_checker
+            vae,
+            text_encoder,
+            tokenizer,
+            unet,
+            scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+            requires_safety_checker=requires_safety_checker,
         )

         self.vae.forward = self.vae.decode
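For context on the second patch: `StableDiffusionPipeline.__init__` gained an `image_encoder` parameter between `feature_extractor` and `requires_safety_checker`, so the old positional `super().__init__(...)` call bound `requires_safety_checker` to the new `image_encoder` slot. A toy reduction of the failure mode (a standalone sketch with made-up class names, not the real diffusers signatures):

```python
class Parent:
    # The parent signature later gained `image_encoder` *before* `requires_safety_checker`.
    def __init__(self, feature_extractor, image_encoder=None, requires_safety_checker=True):
        self.image_encoder = image_encoder
        self.requires_safety_checker = requires_safety_checker


class BrokenChild(Parent):
    def __init__(self, feature_extractor, requires_safety_checker=True):
        # Positional call written against the *old* parent signature:
        # `requires_safety_checker` now silently lands in the `image_encoder` slot.
        super().__init__(feature_extractor, requires_safety_checker)


class FixedChild(Parent):
    def __init__(self, feature_extractor, image_encoder=None, requires_safety_checker=True):
        # Keyword arguments, as in the patch, keep each value bound to its name.
        super().__init__(
            feature_extractor,
            image_encoder=image_encoder,
            requires_safety_checker=requires_safety_checker,
        )


broken = BrokenChild("fe", requires_safety_checker=False)
print(broken.image_encoder, broken.requires_safety_checker)  # False True  (both wrong)

fixed = FixedChild("fe", requires_safety_checker=False)
print(fixed.image_encoder, fixed.requires_safety_checker)    # None False  (as intended)
```

Forwarding `safety_checker`, `feature_extractor`, and `image_encoder` by keyword, as all three community pipelines now do, stays correct even if further parameters are inserted upstream.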