From 7d7bd7f77b5fd9d6ea22ee9cc2489791ebd679a1 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Tue, 12 Dec 2023 10:37:04 +0000 Subject: [PATCH 1/3] update --- src/diffusers/loaders/single_file.py | 4 + .../stable_diffusion/convert_from_ckpt.py | 113 ++++++++---------- 2 files changed, 56 insertions(+), 61 deletions(-) diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py index a49280adfcfe..742984449e4f 100644 --- a/src/diffusers/loaders/single_file.py +++ b/src/diffusers/loaders/single_file.py @@ -169,10 +169,12 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs): load_safety_checker = kwargs.pop("load_safety_checker", True) prediction_type = kwargs.pop("prediction_type", None) text_encoder = kwargs.pop("text_encoder", None) + text_encoder_2 = kwargs.pop("text_encoder_2", None) vae = kwargs.pop("vae", None) controlnet = kwargs.pop("controlnet", None) adapter = kwargs.pop("adapter", None) tokenizer = kwargs.pop("tokenizer", None) + tokenizer_2 = kwargs.pop("tokenizer_2", None) torch_dtype = kwargs.pop("torch_dtype", None) @@ -274,8 +276,10 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs): load_safety_checker=load_safety_checker, prediction_type=prediction_type, text_encoder=text_encoder, + text_encoder_2=text_encoder_2, vae=vae, tokenizer=tokenizer, + tokenizer_2=tokenizer_2, original_config_file=original_config_file, config_files=config_files, local_files_only=local_files_only, diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index 761391189f8f..6a76a7a96a2b 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -1149,11 +1149,13 @@ def download_from_original_stable_diffusion_ckpt( adapter: Optional[bool] = None, load_safety_checker: bool = True, pipeline_class: DiffusionPipeline = None, - local_files_only=False, + local_files_only: bool = False, vae_path=None, vae=None, text_encoder=None, + text_encoder_2=None, tokenizer=None, + tokenizer_2=None, config_files=None, ) -> DiffusionPipeline: """ @@ -1232,7 +1234,9 @@ def download_from_original_stable_diffusion_ckpt( StableDiffusionInpaintPipeline, StableDiffusionPipeline, StableDiffusionUpscalePipeline, + StableDiffusionXLControlNetInpaintPipeline, StableDiffusionXLImg2ImgPipeline, + StableDiffusionXLInpaintPipeline, StableDiffusionXLPipeline, StableUnCLIPImg2ImgPipeline, StableUnCLIPPipeline, @@ -1339,7 +1343,11 @@ def download_from_original_stable_diffusion_ckpt( else: pipeline_class = StableDiffusionXLPipeline if model_type == "SDXL" else StableDiffusionXLImg2ImgPipeline - if num_in_channels is None and pipeline_class == StableDiffusionInpaintPipeline: + if num_in_channels is None and pipeline_class in [ + StableDiffusionInpaintPipeline, + StableDiffusionXLInpaintPipeline, + StableDiffusionXLControlNetInpaintPipeline, + ]: num_in_channels = 9 if num_in_channels is None and pipeline_class == StableDiffusionUpscalePipeline: num_in_channels = 7 @@ -1686,7 +1694,9 @@ def download_from_original_stable_diffusion_ckpt( feature_extractor=feature_extractor, ) elif model_type in ["SDXL", "SDXL-Refiner"]: - if model_type == "SDXL": + is_refiner = model_type == "SDXL-Refiner" + + if tokenizer is None: try: tokenizer = CLIPTokenizer.from_pretrained( "openai/clip-vit-large-patch14", local_files_only=local_files_only @@ -1695,7 +1705,10 @@ def download_from_original_stable_diffusion_ckpt( raise ValueError( f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'openai/clip-vit-large-patch14'." ) + if text_encoder is None: text_encoder = convert_ldm_clip_checkpoint(checkpoint, local_files_only=local_files_only) + + if tokenizer_2 is None: try: tokenizer_2 = CLIPTokenizer.from_pretrained( "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!", local_files_only=local_files_only @@ -1705,35 +1718,51 @@ def download_from_original_stable_diffusion_ckpt( f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'laion/CLIP-ViT-bigG-14-laion2B-39B-b160k' with `pad_token` set to '!'." ) + if text_encoder_2 is None: config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k" config_kwargs = {"projection_dim": 1280} + prefix = "conditioner.embedders.0.model." if is_refiner else "conditioner.embedders.1.model." + text_encoder_2 = convert_open_clip_checkpoint( checkpoint, config_name, - prefix="conditioner.embedders.1.model.", + prefix=prefix, has_projection=True, local_files_only=local_files_only, **config_kwargs, ) - if is_accelerate_available(): # SBM Now move model to cpu. - if model_type in ["SDXL", "SDXL-Refiner"]: - for param_name, param in converted_unet_checkpoint.items(): - set_module_tensor_to_device(unet, param_name, "cpu", value=param) + if is_accelerate_available(): # SBM Now move model to cpu. + for param_name, param in converted_unet_checkpoint.items(): + set_module_tensor_to_device(unet, param_name, "cpu", value=param) - if controlnet: - pipe = pipeline_class( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - text_encoder_2=text_encoder_2, - tokenizer_2=tokenizer_2, - unet=unet, - controlnet=controlnet, - scheduler=scheduler, - force_zeros_for_empty_prompt=True, - ) - elif adapter: + if controlnet: + pipe = pipeline_class( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + text_encoder_2=text_encoder_2, + tokenizer_2=tokenizer_2, + unet=unet, + controlnet=controlnet, + scheduler=scheduler, + force_zeros_for_empty_prompt=True, + ) + elif adapter: + pipe = pipeline_class( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + text_encoder_2=text_encoder_2, + tokenizer_2=tokenizer_2, + unet=unet, + adapter=adapter, + scheduler=scheduler, + force_zeros_for_empty_prompt=True, + ) + + else: + if pipeline_class == StableDiffusionXLImg2ImgPipeline: pipe = pipeline_class( vae=vae, text_encoder=text_encoder, @@ -1741,9 +1770,9 @@ def download_from_original_stable_diffusion_ckpt( text_encoder_2=text_encoder_2, tokenizer_2=tokenizer_2, unet=unet, - adapter=adapter, scheduler=scheduler, - force_zeros_for_empty_prompt=True, + force_zeros_for_empty_prompt=False, + requires_aesthetics_score=is_refiner, ) else: pipe = pipeline_class( @@ -1756,44 +1785,6 @@ def download_from_original_stable_diffusion_ckpt( scheduler=scheduler, force_zeros_for_empty_prompt=True, ) - else: - tokenizer = None - text_encoder = None - try: - tokenizer_2 = CLIPTokenizer.from_pretrained( - "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!", local_files_only=local_files_only - ) - except Exception: - raise ValueError( - f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'laion/CLIP-ViT-bigG-14-laion2B-39B-b160k' with `pad_token` set to '!'." - ) - config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k" - config_kwargs = {"projection_dim": 1280} - text_encoder_2 = convert_open_clip_checkpoint( - checkpoint, - config_name, - prefix="conditioner.embedders.0.model.", - has_projection=True, - local_files_only=local_files_only, - **config_kwargs, - ) - - if is_accelerate_available(): # SBM Now move model to cpu. - if model_type in ["SDXL", "SDXL-Refiner"]: - for param_name, param in converted_unet_checkpoint.items(): - set_module_tensor_to_device(unet, param_name, "cpu", value=param) - - pipe = StableDiffusionXLImg2ImgPipeline( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - text_encoder_2=text_encoder_2, - tokenizer_2=tokenizer_2, - unet=unet, - scheduler=scheduler, - requires_aesthetics_score=True, - force_zeros_for_empty_prompt=False, - ) else: text_config = create_ldm_bert_config(original_config) text_model = convert_ldm_bert_checkpoint(checkpoint, text_config) From 61481575c0d90fd52be6935d4c6e9e8015b93d58 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Tue, 12 Dec 2023 12:33:53 +0000 Subject: [PATCH 2/3] update --- .../stable_diffusion/convert_from_ckpt.py | 46 ++++++++----------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index 6a76a7a96a2b..81cc58b907f9 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -1149,7 +1149,7 @@ def download_from_original_stable_diffusion_ckpt( adapter: Optional[bool] = None, load_safety_checker: bool = True, pipeline_class: DiffusionPipeline = None, - local_files_only: bool = False, + local_files_only=False, vae_path=None, vae=None, text_encoder=None, @@ -1696,7 +1696,7 @@ def download_from_original_stable_diffusion_ckpt( elif model_type in ["SDXL", "SDXL-Refiner"]: is_refiner = model_type == "SDXL-Refiner" - if tokenizer is None: + if (is_refiner is False) and (tokenizer is None): try: tokenizer = CLIPTokenizer.from_pretrained( "openai/clip-vit-large-patch14", local_files_only=local_files_only @@ -1705,7 +1705,8 @@ def download_from_original_stable_diffusion_ckpt( raise ValueError( f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'openai/clip-vit-large-patch14'." ) - if text_encoder is None: + + if (is_refiner is False) and (text_encoder is None): text_encoder = convert_ldm_clip_checkpoint(checkpoint, local_files_only=local_files_only) if tokenizer_2 is None: @@ -1762,29 +1763,22 @@ def download_from_original_stable_diffusion_ckpt( ) else: - if pipeline_class == StableDiffusionXLImg2ImgPipeline: - pipe = pipeline_class( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - text_encoder_2=text_encoder_2, - tokenizer_2=tokenizer_2, - unet=unet, - scheduler=scheduler, - force_zeros_for_empty_prompt=False, - requires_aesthetics_score=is_refiner, - ) - else: - pipe = pipeline_class( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - text_encoder_2=text_encoder_2, - tokenizer_2=tokenizer_2, - unet=unet, - scheduler=scheduler, - force_zeros_for_empty_prompt=True, - ) + pipeline_kwargs = { + "vae": vae, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "text_encoder_2": text_encoder_2, + "tokenizer_2": tokenizer_2, + "unet": unet, + "scheduler": scheduler, + } + + if (pipeline_class == StableDiffusionXLImg2ImgPipeline) or ( + pipeline_class == StableDiffusionXLInpaintPipeline + ): + pipeline_kwargs.update({"requires_aesthetics_score": is_refiner}) + + pipe = pipeline_class(**pipeline_kwargs) else: text_config = create_ldm_bert_config(original_config) text_model = convert_ldm_bert_checkpoint(checkpoint, text_config) From cfd73b4153733d41e976f04fb272f84054c09fd9 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Tue, 12 Dec 2023 14:24:30 +0000 Subject: [PATCH 3/3] update --- src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index 81cc58b907f9..5aa23252b86a 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -1778,6 +1778,9 @@ def download_from_original_stable_diffusion_ckpt( ): pipeline_kwargs.update({"requires_aesthetics_score": is_refiner}) + if is_refiner: + pipeline_kwargs.update({"force_zeros_for_empty_prompt": False}) + pipe = pipeline_class(**pipeline_kwargs) else: text_config = create_ldm_bert_config(original_config)