Merge commit '8f9f020e8f90efb3787f0feb0e544539110e40ce' into feature/add-nsfw-filter

# Conflicts:
#	modules/async_worker.py
mashb1t committed Nov 18, 2023
2 parents 52ae346 + 8f9f020 commit 6893c8d
Showing 27 changed files with 603 additions and 220 deletions.
2 changes: 2 additions & 0 deletions args_manager.py
@@ -16,6 +16,8 @@
                               "Some Mac users may need this.")

 fcbh_cli.parser.add_argument("--theme", type=str, help="launches the UI with light or dark theme", default=None)
+fcbh_cli.parser.add_argument("--disable-image-log", action='store_true',
+                             help="Prevent writing images and logs to hard drive.")

 fcbh_cli.parser.set_defaults(
     disable_cuda_malloc=True,
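For reference, the new switch behaves like any argparse store_true flag: it defaults to False, and argparse converts the dashes to an underscored attribute name. A minimal standalone sketch (not part of the commit):

    import argparse

    # store_true flags default to False and flip to True when passed;
    # argparse maps "--disable-image-log" to args.disable_image_log
    parser = argparse.ArgumentParser()
    parser.add_argument("--disable-image-log", action='store_true',
                        help="Prevent writing images and logs to hard drive.")
    print(parser.parse_args([]).disable_image_log)                       # False
    print(parser.parse_args(["--disable-image-log"]).disable_image_log)  # True
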
7 changes: 7 additions & 0 deletions backend/headless/fcbh/cli_args.py
@@ -62,6 +62,13 @@ def __call__(self, parser, namespace, values, option_string=None):
 fpvae_group.add_argument("--fp32-vae", action="store_true", help="Run the VAE in full precision fp32.")
 fpvae_group.add_argument("--bf16-vae", action="store_true", help="Run the VAE in bf16.")

+fpte_group = parser.add_mutually_exclusive_group()
+fpte_group.add_argument("--fp8_e4m3fn-text-enc", action="store_true", help="Store text encoder weights in fp8 (e4m3fn variant).")
+fpte_group.add_argument("--fp8_e5m2-text-enc", action="store_true", help="Store text encoder weights in fp8 (e5m2 variant).")
+fpte_group.add_argument("--fp16-text-enc", action="store_true", help="Store text encoder weights in fp16.")
+fpte_group.add_argument("--fp32-text-enc", action="store_true", help="Store text encoder weights in fp32.")
+
+
 parser.add_argument("--directml", type=int, nargs="?", metavar="DIRECTML_DEVICE", const=-1, help="Use torch-directml.")

 parser.add_argument("--disable-ipex-optimize", action="store_true", help="Disables ipex.optimize when loading models with Intel GPUs.")
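Because the four text-encoder precision flags are registered in one mutually exclusive group, argparse exits with an error if two of them are combined. A minimal sketch of the same pattern (not part of the commit):

    import argparse

    parser = argparse.ArgumentParser()
    fpte_group = parser.add_mutually_exclusive_group()
    fpte_group.add_argument("--fp16-text-enc", action="store_true")
    fpte_group.add_argument("--fp32-text-enc", action="store_true")
    parser.parse_args(["--fp16-text-enc"])  # accepted
    # parser.parse_args(["--fp16-text-enc", "--fp32-text-enc"])
    # -> error: argument --fp32-text-enc: not allowed with argument --fp16-text-enc
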
4 changes: 2 additions & 2 deletions backend/headless/fcbh/controlnet.py
@@ -33,7 +33,7 @@ def __init__(self, device=None):
         self.cond_hint_original = None
         self.cond_hint = None
         self.strength = 1.0
-        self.timestep_percent_range = (1.0, 0.0)
+        self.timestep_percent_range = (0.0, 1.0)
         self.timestep_range = None

         if device is None:
@@ -42,7 +42,7 @@ def __init__(self, device=None):
         self.previous_controlnet = None
         self.global_average_pooling = False

-    def set_cond_hint(self, cond_hint, strength=1.0, timestep_percent_range=(1.0, 0.0)):
+    def set_cond_hint(self, cond_hint, strength=1.0, timestep_percent_range=(0.0, 1.0)):
         self.cond_hint_original = cond_hint
         self.strength = strength
         self.timestep_percent_range = timestep_percent_range
backend/headless/fcbh/ldm/modules/diffusionmodules/openaimodel.py
@@ -255,7 +255,10 @@ def apply_control(h, control, name):
     if control is not None and name in control and len(control[name]) > 0:
         ctrl = control[name].pop()
         if ctrl is not None:
-            h += ctrl
+            try:
+                h += ctrl
+            except:
+                print("warning control could not be applied", h.shape, ctrl.shape)
     return h

 class UNetModel(nn.Module):
@@ -630,6 +633,10 @@ def forward(self, x, timesteps=None, context=None, y=None, control=None, transfo
                     h = p(h, transformer_options)

             hs.append(h)
+            if "input_block_patch_after_skip" in transformer_patches:
+                patch = transformer_patches["input_block_patch_after_skip"]
+                for p in patch:
+                    h = p(h, transformer_options)

         transformer_options["block"] = ("middle", 0)
         h = forward_timestep_embed(self.middle_block, h, emb, context, transformer_options)
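The new hook fires immediately after each input block's output is pushed onto the skip-connection stack (hs.append(h)) and uses the same h = p(h, transformer_options) calling convention as the existing input_block_patch hook. A hedged sketch of a conforming patch function (the gating logic is illustrative, not from this commit):

    def example_input_block_patch(h, transformer_options):
        # at this call site transformer_options["block"] is ("input", block_index)
        block_kind, block_index = transformer_options["block"]
        if block_kind == "input" and block_index == 3:
            h = h * 1.0  # placeholder transform; the return value feeds the next block
        return h
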
76 changes: 44 additions & 32 deletions backend/headless/fcbh/model_detection.py
@@ -186,17 +186,24 @@ def convert_config(unet_config):

 def unet_config_from_diffusers_unet(state_dict, dtype):
     match = {}
-    attention_resolutions = []
+    transformer_depth = []

-    attn_res = 1
-    for i in range(5):
-        k = "down_blocks.{}.attentions.1.transformer_blocks.0.attn2.to_k.weight".format(i)
-        if k in state_dict:
-            match["context_dim"] = state_dict[k].shape[1]
-            attention_resolutions.append(attn_res)
+    down_blocks = count_blocks(state_dict, "down_blocks.{}")
+    for i in range(down_blocks):
+        attn_blocks = count_blocks(state_dict, "down_blocks.{}.attentions.".format(i) + '{}')
+        for ab in range(attn_blocks):
+            transformer_count = count_blocks(state_dict, "down_blocks.{}.attentions.{}.transformer_blocks.".format(i, ab) + '{}')
+            transformer_depth.append(transformer_count)
+            if transformer_count > 0:
+                match["context_dim"] = state_dict["down_blocks.{}.attentions.{}.transformer_blocks.0.attn2.to_k.weight".format(i, ab)].shape[1]

-        attn_res *= 2
+        if attn_blocks == 0:
+            transformer_depth.append(0)
+            transformer_depth.append(0)

-    match["attention_resolutions"] = attention_resolutions
+    match["transformer_depth"] = transformer_depth

     match["model_channels"] = state_dict["conv_in.weight"].shape[0]
     match["in_channels"] = state_dict["conv_in.weight"].shape[1]
@@ -208,50 +215,55 @@ def unet_config_from_diffusers_unet(state_dict, dtype):

     SDXL = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False,
             'num_classes': 'sequential', 'adm_in_channels': 2816, 'dtype': dtype, 'in_channels': 4, 'model_channels': 320,
-            'num_res_blocks': 2, 'attention_resolutions': [2, 4], 'transformer_depth': [0, 2, 10], 'channel_mult': [1, 2, 4],
-            'transformer_depth_middle': 10, 'use_linear_in_transformer': True, 'context_dim': 2048, "num_head_channels": 64}
+            'num_res_blocks': [2, 2, 2], 'transformer_depth': [0, 0, 2, 2, 10, 10], 'channel_mult': [1, 2, 4], 'transformer_depth_middle': 10,
+            'use_linear_in_transformer': True, 'context_dim': 2048, 'num_head_channels': 64, 'transformer_depth_output': [0, 0, 0, 2, 2, 2, 10, 10, 10]}

     SDXL_refiner = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False,
                     'num_classes': 'sequential', 'adm_in_channels': 2560, 'dtype': dtype, 'in_channels': 4, 'model_channels': 384,
-                    'num_res_blocks': 2, 'attention_resolutions': [2, 4], 'transformer_depth': [0, 4, 4, 0], 'channel_mult': [1, 2, 4, 4],
-                    'transformer_depth_middle': 4, 'use_linear_in_transformer': True, 'context_dim': 1280, "num_head_channels": 64}
+                    'num_res_blocks': [2, 2, 2, 2], 'transformer_depth': [0, 0, 4, 4, 4, 4, 0, 0], 'channel_mult': [1, 2, 4, 4], 'transformer_depth_middle': 4,
+                    'use_linear_in_transformer': True, 'context_dim': 1280, 'num_head_channels': 64, 'transformer_depth_output': [0, 0, 0, 4, 4, 4, 4, 4, 4, 0, 0, 0]}

     SD21 = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False,
-            'adm_in_channels': None, 'dtype': dtype, 'in_channels': 4, 'model_channels': 320, 'num_res_blocks': 2,
-            'attention_resolutions': [1, 2, 4], 'transformer_depth': [1, 1, 1, 0], 'channel_mult': [1, 2, 4, 4],
-            'transformer_depth_middle': 1, 'use_linear_in_transformer': True, 'context_dim': 1024, "num_head_channels": 64}
+            'adm_in_channels': None, 'dtype': dtype, 'in_channels': 4, 'model_channels': 320, 'num_res_blocks': [2, 2, 2, 2],
+            'transformer_depth': [1, 1, 1, 1, 1, 1, 0, 0], 'channel_mult': [1, 2, 4, 4], 'transformer_depth_middle': 1, 'use_linear_in_transformer': True,
+            'context_dim': 1024, 'num_head_channels': 64, 'transformer_depth_output': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]}

     SD21_uncliph = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False,
                     'num_classes': 'sequential', 'adm_in_channels': 2048, 'dtype': dtype, 'in_channels': 4, 'model_channels': 320,
-                    'num_res_blocks': 2, 'attention_resolutions': [1, 2, 4], 'transformer_depth': [1, 1, 1, 0], 'channel_mult': [1, 2, 4, 4],
-                    'transformer_depth_middle': 1, 'use_linear_in_transformer': True, 'context_dim': 1024, "num_head_channels": 64}
+                    'num_res_blocks': [2, 2, 2, 2], 'transformer_depth': [1, 1, 1, 1, 1, 1, 0, 0], 'channel_mult': [1, 2, 4, 4], 'transformer_depth_middle': 1,
+                    'use_linear_in_transformer': True, 'context_dim': 1024, 'num_head_channels': 64, 'transformer_depth_output': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]}

     SD21_unclipl = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False,
                     'num_classes': 'sequential', 'adm_in_channels': 1536, 'dtype': dtype, 'in_channels': 4, 'model_channels': 320,
-                    'num_res_blocks': 2, 'attention_resolutions': [1, 2, 4], 'transformer_depth': [1, 1, 1, 0], 'channel_mult': [1, 2, 4, 4],
-                    'transformer_depth_middle': 1, 'use_linear_in_transformer': True, 'context_dim': 1024}
+                    'num_res_blocks': [2, 2, 2, 2], 'transformer_depth': [1, 1, 1, 1, 1, 1, 0, 0], 'channel_mult': [1, 2, 4, 4], 'transformer_depth_middle': 1,
+                    'use_linear_in_transformer': True, 'context_dim': 1024, 'num_head_channels': 64, 'transformer_depth_output': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]}

-    SD15 = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False,
-            'adm_in_channels': None, 'dtype': dtype, 'in_channels': 4, 'model_channels': 320, 'num_res_blocks': 2,
-            'attention_resolutions': [1, 2, 4], 'transformer_depth': [1, 1, 1, 0], 'channel_mult': [1, 2, 4, 4],
-            'transformer_depth_middle': 1, 'use_linear_in_transformer': False, 'context_dim': 768, "num_heads": 8}
+    SD15 = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False, 'adm_in_channels': None,
+            'dtype': dtype, 'in_channels': 4, 'model_channels': 320, 'num_res_blocks': [2, 2, 2, 2], 'transformer_depth': [1, 1, 1, 1, 1, 1, 0, 0],
+            'channel_mult': [1, 2, 4, 4], 'transformer_depth_middle': 1, 'use_linear_in_transformer': False, 'context_dim': 768, 'num_heads': 8,
+            'transformer_depth_output': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]}

     SDXL_mid_cnet = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False,
-                     'num_classes': 'sequential', 'adm_in_channels': 2816, 'dtype': dtype, 'in_channels': 4, 'model_channels': 320,
-                     'num_res_blocks': 2, 'attention_resolutions': [4], 'transformer_depth': [0, 0, 1], 'channel_mult': [1, 2, 4],
-                     'transformer_depth_middle': 1, 'use_linear_in_transformer': True, 'context_dim': 2048, "num_head_channels": 64}
+                     'num_classes': 'sequential', 'adm_in_channels': 2816, 'dtype': dtype, 'in_channels': 4, 'model_channels': 320,
+                     'num_res_blocks': [2, 2, 2], 'transformer_depth': [0, 0, 0, 0, 1, 1], 'channel_mult': [1, 2, 4], 'transformer_depth_middle': 1,
+                     'use_linear_in_transformer': True, 'context_dim': 2048, 'num_head_channels': 64, 'transformer_depth_output': [0, 0, 0, 0, 0, 0, 1, 1, 1]}

     SDXL_small_cnet = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False,
-                       'num_classes': 'sequential', 'adm_in_channels': 2816, 'dtype': dtype, 'in_channels': 4, 'model_channels': 320,
-                       'num_res_blocks': 2, 'attention_resolutions': [], 'transformer_depth': [0, 0, 0], 'channel_mult': [1, 2, 4],
-                       'transformer_depth_middle': 0, 'use_linear_in_transformer': True, "num_head_channels": 64, 'context_dim': 1}
+                       'num_classes': 'sequential', 'adm_in_channels': 2816, 'dtype': dtype, 'in_channels': 4, 'model_channels': 320,
+                       'num_res_blocks': [2, 2, 2], 'transformer_depth': [0, 0, 0, 0, 0, 0], 'channel_mult': [1, 2, 4], 'transformer_depth_middle': 0,
+                       'use_linear_in_transformer': True, 'num_head_channels': 64, 'context_dim': 1, 'transformer_depth_output': [0, 0, 0, 0, 0, 0, 0, 0, 0]}

     SDXL_diffusers_inpaint = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False,
-                              'num_classes': 'sequential', 'adm_in_channels': 2816, 'dtype': dtype, 'in_channels': 9, 'model_channels': 320,
-                              'num_res_blocks': 2, 'attention_resolutions': [2, 4], 'transformer_depth': [0, 2, 10], 'channel_mult': [1, 2, 4],
-                              'transformer_depth_middle': 10, 'use_linear_in_transformer': True, 'context_dim': 2048, "num_head_channels": 64}
+                              'num_classes': 'sequential', 'adm_in_channels': 2816, 'dtype': dtype, 'in_channels': 9, 'model_channels': 320,
+                              'num_res_blocks': [2, 2, 2], 'transformer_depth': [0, 0, 2, 2, 10, 10], 'channel_mult': [1, 2, 4], 'transformer_depth_middle': 10,
+                              'use_linear_in_transformer': True, 'context_dim': 2048, 'num_head_channels': 64, 'transformer_depth_output': [0, 0, 0, 2, 2, 2, 10, 10, 10]}

+    SSD_1B = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False,
+              'num_classes': 'sequential', 'adm_in_channels': 2816, 'dtype': dtype, 'in_channels': 4, 'model_channels': 320,
+              'num_res_blocks': [2, 2, 2], 'transformer_depth': [0, 0, 2, 2, 4, 4], 'transformer_depth_output': [0, 0, 0, 1, 1, 2, 10, 4, 4],
+              'channel_mult': [1, 2, 4], 'transformer_depth_middle': -1, 'use_linear_in_transformer': True, 'context_dim': 2048, 'num_head_channels': 64}

-    supported_models = [SDXL, SDXL_refiner, SD21, SD15, SD21_uncliph, SD21_unclipl, SDXL_mid_cnet, SDXL_small_cnet, SDXL_diffusers_inpaint]
+    supported_models = [SDXL, SDXL_refiner, SD21, SD15, SD21_uncliph, SD21_unclipl, SDXL_mid_cnet, SDXL_small_cnet, SDXL_diffusers_inpaint, SSD_1B]

     for unet_config in supported_models:
         matches = True
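The detection logic now records one transformer depth per res block (and per output block) instead of one per resolution stage, which is what lets configs like SSD-1B, whose transformer_depth_output [0, 0, 0, 1, 1, 2, 10, 4, 4] varies within a stage, be matched at all. A hedged sketch of how the old stage-wise lists relate to the new per-block lists (the helper name is hypothetical):

    def expand_stage_depths(stage_depths, num_res_blocks):
        # hypothetical helper: repeat each stage's depth once per res block
        out = []
        for depth, blocks in zip(stage_depths, num_res_blocks):
            out.extend([depth] * blocks)
        return out

    # the old SDXL form [0, 2, 10] with 2 res blocks per stage becomes the
    # new per-block form seen in the SDXL config above
    assert expand_stage_depths([0, 2, 10], [2, 2, 2]) == [0, 0, 2, 2, 10, 10]
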
15 changes: 15 additions & 0 deletions backend/headless/fcbh/model_management.py
@@ -482,6 +482,21 @@ def text_encoder_device():
     else:
         return torch.device("cpu")

+def text_encoder_dtype(device=None):
+    if args.fp8_e4m3fn_text_enc:
+        return torch.float8_e4m3fn
+    elif args.fp8_e5m2_text_enc:
+        return torch.float8_e5m2
+    elif args.fp16_text_enc:
+        return torch.float16
+    elif args.fp32_text_enc:
+        return torch.float32
+
+    if should_use_fp16(device, prioritize_performance=False):
+        return torch.float16
+    else:
+        return torch.float32
+
 def vae_device():
     return get_torch_device()
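One caveat worth noting as an assumption, not something stated in the diff: torch.float8_e4m3fn and torch.float8_e5m2 only exist in newer PyTorch builds (roughly 2.1 onward), so text_encoder_dtype would raise AttributeError on older installs when the fp8 flags are passed. A small probe:

    import torch

    # check for fp8 support before launching with --fp8_e4m3fn-text-enc
    if hasattr(torch, "float8_e4m3fn"):
        print("fp8 e4m3fn available:", torch.float8_e4m3fn)
    else:
        print("no fp8 dtypes in this PyTorch build; use the fp16/fp32 flags instead")
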
5 changes: 4 additions & 1 deletion backend/headless/fcbh/model_patcher.py
@@ -37,7 +37,7 @@ def model_size(self):
         return size

     def clone(self):
-        n = ModelPatcher(self.model, self.load_device, self.offload_device, self.size, self.current_device)
+        n = ModelPatcher(self.model, self.load_device, self.offload_device, self.size, self.current_device, weight_inplace_update=self.weight_inplace_update)
         n.patches = {}
         for k in self.patches:
             n.patches[k] = self.patches[k][:]
@@ -99,6 +99,9 @@ def set_model_attn2_output_patch(self, patch):
     def set_model_input_block_patch(self, patch):
         self.set_model_patch(patch, "input_block_patch")

+    def set_model_input_block_patch_after_skip(self, patch):
+        self.set_model_patch(patch, "input_block_patch_after_skip")
+
     def set_model_output_block_patch(self, patch):
         self.set_model_patch(patch, "output_block_patch")
5 changes: 5 additions & 0 deletions backend/headless/fcbh/model_sampling.py
@@ -76,5 +76,10 @@ def sigma(self, timestep):
         return log_sigma.exp()

     def percent_to_sigma(self, percent):
+        if percent <= 0.0:
+            return torch.tensor(999999999.9)
+        if percent >= 1.0:
+            return torch.tensor(0.0)
+        percent = 1.0 - percent
         return self.sigma(torch.tensor(percent * 999.0))

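The added clamps pin the endpoints of the percent-to-sigma mapping: percent <= 0.0 yields an effectively infinite sigma (the range starts at the very first step) and percent >= 1.0 yields sigma 0.0 (the range runs to the very last step). In between, the percent is inverted before the lookup because sigmas run from high to low over the schedule while percents now run 0.0 -> 1.0, matching the controlnet change above. A standalone sketch of just this mapping, with a stub in place of the class's sigma():

    import torch

    def percent_to_sigma(percent, sigma_fn):
        # mirrors the clamping added above; sigma_fn stands in for self.sigma
        if percent <= 0.0:
            return torch.tensor(999999999.9)
        if percent >= 1.0:
            return torch.tensor(0.0)
        return sigma_fn(torch.tensor((1.0 - percent) * 999.0))

    identity = lambda t: t
    print(percent_to_sigma(0.0, identity))  # tensor(1.0000e+09), always active
    print(percent_to_sigma(0.5, identity))  # tensor(499.5000)
    print(percent_to_sigma(1.0, identity))  # tensor(0.), active to the end
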
2 changes: 2 additions & 0 deletions backend/headless/fcbh/samplers.py
@@ -220,6 +220,8 @@ def calc_cond_uncond_batch(model, cond, uncond, x_in, timestep, model_options):
                 transformer_options["patches"] = patches

         transformer_options["cond_or_uncond"] = cond_or_uncond[:]
+        transformer_options["sigmas"] = timestep
+
         c['transformer_options'] = transformer_options

         if 'model_function_wrapper' in model_options:
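Exposing the current timestep as transformer_options["sigmas"] is what allows patches to gate themselves on the noise level; the PatchModelAddDownscale node below reads it via transformer_options["sigmas"][0].item(). A minimal sketch of a sigma-aware patch (the threshold is illustrative):

    def sigma_gated_patch(h, transformer_options):
        sigma = transformer_options["sigmas"][0].item()  # current noise level
        if sigma > 5.0:  # act only during the high-noise phase (illustrative)
            pass         # transform h here
        return h
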
5 changes: 1 addition & 4 deletions backend/headless/fcbh/sd.py
@@ -95,10 +95,7 @@ def __init__(self, target=None, embedding_directory=None, no_init=False):
         load_device = model_management.text_encoder_device()
         offload_device = model_management.text_encoder_offload_device()
         params['device'] = offload_device
-        if model_management.should_use_fp16(load_device, prioritize_performance=False):
-            params['dtype'] = torch.float16
-        else:
-            params['dtype'] = torch.float32
+        params['dtype'] = model_management.text_encoder_dtype(load_device)

         self.cond_stage_model = clip(**(params))
2 changes: 1 addition & 1 deletion backend/headless/fcbh/utils.py
@@ -258,7 +258,7 @@ def set_attr(obj, attr, value):
     for name in attrs[:-1]:
         obj = getattr(obj, name)
     prev = getattr(obj, attrs[-1])
-    setattr(obj, attrs[-1], torch.nn.Parameter(value))
+    setattr(obj, attrs[-1], torch.nn.Parameter(value, requires_grad=False))
     del prev

 def copy_to_param(obj, attr, value):
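Context for the one-argument change: torch.nn.Parameter defaults to requires_grad=True, so replaced weights were being re-wrapped as grad-tracking parameters during inference-time weight swaps. A two-line demonstration:

    import torch

    w = torch.zeros(4)
    print(torch.nn.Parameter(w).requires_grad)                       # True (default)
    print(torch.nn.Parameter(w, requires_grad=False).requires_grad)  # False
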
5 changes: 5 additions & 0 deletions backend/headless/fcbh_extras/nodes_model_advanced.py
@@ -66,6 +66,11 @@ def sigma(self, timestep):
         return log_sigma.exp()

     def percent_to_sigma(self, percent):
+        if percent <= 0.0:
+            return torch.tensor(999999999.9)
+        if percent >= 1.0:
+            return torch.tensor(0.0)
+        percent = 1.0 - percent
         return self.sigma(torch.tensor(percent * 999.0))

49 changes: 49 additions & 0 deletions backend/headless/fcbh_extras/nodes_model_downscale.py
@@ -0,0 +1,49 @@
+import torch
+
+class PatchModelAddDownscale:
+    @classmethod
+    def INPUT_TYPES(s):
+        return {"required": { "model": ("MODEL",),
+                              "block_number": ("INT", {"default": 3, "min": 1, "max": 32, "step": 1}),
+                              "downscale_factor": ("FLOAT", {"default": 2.0, "min": 0.1, "max": 9.0, "step": 0.001}),
+                              "start_percent": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001}),
+                              "end_percent": ("FLOAT", {"default": 0.35, "min": 0.0, "max": 1.0, "step": 0.001}),
+                              "downscale_after_skip": ("BOOLEAN", {"default": True}),
+                              }}
+    RETURN_TYPES = ("MODEL",)
+    FUNCTION = "patch"
+
+    CATEGORY = "_for_testing"
+
+    def patch(self, model, block_number, downscale_factor, start_percent, end_percent, downscale_after_skip):
+        sigma_start = model.model.model_sampling.percent_to_sigma(start_percent).item()
+        sigma_end = model.model.model_sampling.percent_to_sigma(end_percent).item()
+
+        def input_block_patch(h, transformer_options):
+            if transformer_options["block"][1] == block_number:
+                sigma = transformer_options["sigmas"][0].item()
+                if sigma <= sigma_start and sigma >= sigma_end:
+                    h = torch.nn.functional.interpolate(h, scale_factor=(1.0 / downscale_factor), mode="bicubic", align_corners=False)
+            return h
+
+        def output_block_patch(h, hsp, transformer_options):
+            if h.shape[2] != hsp.shape[2]:
+                h = torch.nn.functional.interpolate(h, size=(hsp.shape[2], hsp.shape[3]), mode="bicubic", align_corners=False)
+            return h, hsp
+
+        m = model.clone()
+        if downscale_after_skip:
+            m.set_model_input_block_patch_after_skip(input_block_patch)
+        else:
+            m.set_model_input_block_patch(input_block_patch)
+        m.set_model_output_block_patch(output_block_patch)
+        return (m, )
+
+NODE_CLASS_MAPPINGS = {
+    "PatchModelAddDownscale": PatchModelAddDownscale,
+}
+
+NODE_DISPLAY_NAME_MAPPINGS = {
+    # Sampling
+    "PatchModelAddDownscale": "PatchModelAddDownscale (Kohya Deep Shrink)",
+}
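
A hedged usage sketch of the new node outside the graph UI, assuming `model` is an already-loaded ModelPatcher instance (model loading omitted):

    node = PatchModelAddDownscale()
    (patched,) = node.patch(model, block_number=3, downscale_factor=2.0,
                            start_percent=0.0, end_percent=0.35,
                            downscale_after_skip=True)
    # `patched` now downscales input block 3's hidden states for sigmas in the
    # first 35% of the schedule; output_block_patch rescales h to match the
    # skip tensor hsp whenever their spatial sizes diverge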
