From 80e9f7223477418731478f2fa6e6e9d00b41e358 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 1 Apr 2024 01:50:22 +0000
Subject: [PATCH 01/12] Bump crate-ci/typos from 1.17.2 to 1.19.0

Bumps [crate-ci/typos](https://github.com/crate-ci/typos) from 1.17.2 to 1.19.0.
- [Release notes](https://github.com/crate-ci/typos/releases)
- [Changelog](https://github.com/crate-ci/typos/blob/master/CHANGELOG.md)
- [Commits](https://github.com/crate-ci/typos/compare/v1.17.2...v1.19.0)

---
updated-dependencies:
- dependency-name: crate-ci/typos
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 .github/workflows/typos.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/typos.yml b/.github/workflows/typos.yml
index c9edf2650..e8b06483f 100644
--- a/.github/workflows/typos.yml
+++ b/.github/workflows/typos.yml
@@ -18,4 +18,4 @@ jobs:
       - uses: actions/checkout@v4
 
       - name: typos-action
-        uses: crate-ci/typos@v1.17.2
+        uses: crate-ci/typos@v1.19.0

From b748b48dbbaf6ddd011f032f5ede47aea094a208 Mon Sep 17 00:00:00 2001
From: Kohya S <ykumeykume@gmail.com>
Date: Wed, 3 Apr 2024 12:43:08 +0900
Subject: [PATCH 02/12] fix attention couple+deep shrink cause error in some
 reso

---
 networks/lora.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/networks/lora.py b/networks/lora.py
index 948b30b0e..d1208040f 100644
--- a/networks/lora.py
+++ b/networks/lora.py
@@ -247,14 +247,13 @@ def get_mask_for_x(self, x):
             area = x.size()[1]
 
         mask = self.network.mask_dic.get(area, None)
-        if mask is None:
-            # raise ValueError(f"mask is None for resolution {area}")
+        if mask is None or len(x.size()) == 2:
             # emb_layers in SDXL doesn't have mask
             # if "emb" not in self.lora_name:
             #     print(f"mask is None for resolution {self.lora_name}, {area}, {x.size()}")
             mask_size = (1, x.size()[1]) if len(x.size()) == 2 else (1, *x.size()[1:-1], 1)
             return torch.ones(mask_size, dtype=x.dtype, device=x.device) / self.network.num_sub_prompts
-        if len(x.size()) != 4:
+        if len(x.size()) == 3:
             mask = torch.reshape(mask, (1, -1, 1))
         return mask
 

From cd587ce62cd340de32f8bf72e6eae209ce5d5580 Mon Sep 17 00:00:00 2001
From: ykume <ykumeykume@gmail.com>
Date: Fri, 5 Apr 2024 08:23:03 +0900
Subject: [PATCH 03/12] verify command line args if wandb is enabled

---
 fine_tune.py                         |  1 +
 library/train_util.py                | 56 +++++++++++++++++++++++++++-
 sdxl_train.py                        |  1 +
 sdxl_train_control_net_lllite.py     |  1 +
 sdxl_train_control_net_lllite_old.py |  1 +
 sdxl_train_network.py                |  1 +
 sdxl_train_textual_inversion.py      |  1 +
 train_controlnet.py                  |  1 +
 train_db.py                          |  1 +
 train_network.py                     |  1 +
 train_textual_inversion.py           |  1 +
 train_textual_inversion_XTI.py       |  1 +
 12 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/fine_tune.py b/fine_tune.py
index a0350ce18..3c4a5a26b 100644
--- a/fine_tune.py
+++ b/fine_tune.py
@@ -520,6 +520,7 @@ def setup_parser() -> argparse.ArgumentParser:
     parser = setup_parser()
 
     args = parser.parse_args()
+    train_util.verify_command_line_training_args(args)
     args = train_util.read_config_from_file(args, parser)
 
     train(args)
diff --git a/library/train_util.py b/library/train_util.py
index 1a46f6a7d..c13bb68ee 100644
--- a/library/train_util.py
+++ b/library/train_util.py
@@ -1890,7 +1890,7 @@ def __init__(
                 subset.image_dir,
                 False,
                 None,
-                subset.caption_extension, 
+                subset.caption_extension,
                 subset.cache_info,
                 subset.num_repeats,
                 subset.shuffle_caption,
@@ -3358,6 +3358,60 @@ def add_masked_loss_arguments(parser: argparse.ArgumentParser):
     )
 
 
+# verify command line args for training
+def verify_command_line_training_args(args: argparse.Namespace):
+    # if wandb is enabled, the command line is exposed to the public
+    # check whether sensitive options are included in the command line arguments
+    # if so, warn or inform the user to move them to the configuration file
+    # wandbが有効な場合、コマンドラインが公開される
+    # 学習用のコマンドライン引数に敏感なオプションが含まれているかどうかを確認し、
+    # 含まれている場合は設定ファイルに移動するようにユーザーに警告または通知する
+
+    wandb_enabled = args.log_with is not None and args.log_with != "tensorboard"  # "all" or "wandb"
+    if not wandb_enabled:
+        return
+
+    sensitive_args = ["wandb_api_key", "huggingface_token"]
+    sensitive_path_args = [
+        "pretrained_model_name_or_path",
+        "vae",
+        "tokenizer_cache_dir",
+        "train_data_dir",
+        "conditioning_data_dir",
+        "reg_data_dir",
+        "output_dir",
+        "logging_dir",
+    ]
+
+    for arg in sensitive_args:
+        if getattr(args, arg, None) is not None:
+            logger.warning(
+                f"wandb is enabled, but option `{arg}` is included in the command line. Because the command line is exposed to the public, it is recommended to move it to the `.toml` file."
+                + f" / wandbが有効で、かつオプション `{arg}` がコマンドラインに含まれています。コマンドラインは公開されるため、`.toml`ファイルに移動することをお勧めします。"
+            )
+
+    # if path is absolute, it may include sensitive information
+    for arg in sensitive_path_args:
+        if getattr(args, arg, None) is not None and os.path.isabs(getattr(args, arg)):
+            logger.info(
+                f"wandb is enabled, but option `{arg}` is included in the command line and it is an absolute path. Because the command line is exposed to the public, it is recommended to move it to the `.toml` file or use relative path."
+                + f" / wandbが有効で、かつオプション `{arg}` がコマンドラインに含まれており、絶対パスです。コマンドラインは公開されるため、`.toml`ファイルに移動するか、相対パスを使用することをお勧めします。"
+            )
+
+    if getattr(args, "config_file", None) is not None:
+        logger.info(
+            f"wandb is enabled, but option `config_file` is included in the command line. Because the command line is exposed to the public, please be careful about the information included in the path."
+            + f" / wandbが有効で、かつオプション `config_file` がコマンドラインに含まれています。コマンドラインは公開されるため、パスに含まれる情報にご注意ください。"
+        )
+
+    # other sensitive options
+    if args.huggingface_repo_id is not None and args.huggingface_repo_visibility != "public":
+        logger.info(
+            f"wandb is enabled, but option huggingface_repo_id is included in the command line and huggingface_repo_visibility is not 'public'. Because the command line is exposed to the public, it is recommended to move it to the `.toml` file."
+            + f" / wandbが有効で、かつオプション huggingface_repo_id がコマンドラインに含まれており、huggingface_repo_visibility が 'public' ではありません。コマンドラインは公開されるため、`.toml`ファイルに移動することをお勧めします。"
+        )
+
+
 def verify_training_args(args: argparse.Namespace):
     r"""
     Verify training arguments. Also reflect highvram option to global variable
diff --git a/sdxl_train.py b/sdxl_train.py
index 816598e04..f6d277494 100644
--- a/sdxl_train.py
+++ b/sdxl_train.py
@@ -812,6 +812,7 @@ def setup_parser() -> argparse.ArgumentParser:
     parser = setup_parser()
 
     args = parser.parse_args()
+    train_util.verify_command_line_training_args(args)
     args = train_util.read_config_from_file(args, parser)
 
     train(args)
diff --git a/sdxl_train_control_net_lllite.py b/sdxl_train_control_net_lllite.py
index 9eaaa19f2..e880b57de 100644
--- a/sdxl_train_control_net_lllite.py
+++ b/sdxl_train_control_net_lllite.py
@@ -612,6 +612,7 @@ def setup_parser() -> argparse.ArgumentParser:
     parser = setup_parser()
 
     args = parser.parse_args()
+    train_util.verify_command_line_training_args(args)
     args = train_util.read_config_from_file(args, parser)
 
     train(args)
diff --git a/sdxl_train_control_net_lllite_old.py b/sdxl_train_control_net_lllite_old.py
index e55a58896..0ea64b824 100644
--- a/sdxl_train_control_net_lllite_old.py
+++ b/sdxl_train_control_net_lllite_old.py
@@ -580,6 +580,7 @@ def setup_parser() -> argparse.ArgumentParser:
     parser = setup_parser()
 
     args = parser.parse_args()
+    train_util.verify_command_line_training_args(args)
     args = train_util.read_config_from_file(args, parser)
 
     train(args)
diff --git a/sdxl_train_network.py b/sdxl_train_network.py
index d33239d92..83969bb1d 100644
--- a/sdxl_train_network.py
+++ b/sdxl_train_network.py
@@ -178,6 +178,7 @@ def setup_parser() -> argparse.ArgumentParser:
     parser = setup_parser()
 
     args = parser.parse_args()
+    train_util.verify_command_line_training_args(args)
     args = train_util.read_config_from_file(args, parser)
 
     trainer = SdxlNetworkTrainer()
diff --git a/sdxl_train_textual_inversion.py b/sdxl_train_textual_inversion.py
index 257d181ad..5df739e28 100644
--- a/sdxl_train_textual_inversion.py
+++ b/sdxl_train_textual_inversion.py
@@ -131,6 +131,7 @@ def setup_parser() -> argparse.ArgumentParser:
     parser = setup_parser()
 
     args = parser.parse_args()
+    train_util.verify_command_line_training_args(args)
     args = train_util.read_config_from_file(args, parser)
 
     trainer = SdxlTextualInversionTrainer()
diff --git a/train_controlnet.py b/train_controlnet.py
index 0cb0405fd..90cac0410 100644
--- a/train_controlnet.py
+++ b/train_controlnet.py
@@ -617,6 +617,7 @@ def setup_parser() -> argparse.ArgumentParser:
     parser = setup_parser()
 
     args = parser.parse_args()
+    train_util.verify_command_line_training_args(args)
     args = train_util.read_config_from_file(args, parser)
 
     train(args)
diff --git a/train_db.py b/train_db.py
index 0a152f224..c3b7339f3 100644
--- a/train_db.py
+++ b/train_db.py
@@ -523,6 +523,7 @@ def setup_parser() -> argparse.ArgumentParser:
     parser = setup_parser()
 
     args = parser.parse_args()
+    train_util.verify_command_line_training_args(args)
     args = train_util.read_config_from_file(args, parser)
 
     train(args)
diff --git a/train_network.py b/train_network.py
index 8fe98f126..fcf4cd9b6 100644
--- a/train_network.py
+++ b/train_network.py
@@ -1101,6 +1101,7 @@ def setup_parser() -> argparse.ArgumentParser:
     parser = setup_parser()
 
     args = parser.parse_args()
+    train_util.verify_command_line_training_args(args)
     args = train_util.read_config_from_file(args, parser)
 
     trainer = NetworkTrainer()
diff --git a/train_textual_inversion.py b/train_textual_inversion.py
index e7083596f..02edf9525 100644
--- a/train_textual_inversion.py
+++ b/train_textual_inversion.py
@@ -806,6 +806,7 @@ def setup_parser() -> argparse.ArgumentParser:
     parser = setup_parser()
 
     args = parser.parse_args()
+    train_util.verify_command_line_training_args(args)
     args = train_util.read_config_from_file(args, parser)
 
     trainer = TextualInversionTrainer()
diff --git a/train_textual_inversion_XTI.py b/train_textual_inversion_XTI.py
index 861d48d1d..f0723f2a7 100644
--- a/train_textual_inversion_XTI.py
+++ b/train_textual_inversion_XTI.py
@@ -714,6 +714,7 @@ def setup_parser() -> argparse.ArgumentParser:
     parser = setup_parser()
 
     args = parser.parse_args()
+    train_util.verify_command_line_training_args(args)
     args = train_util.read_config_from_file(args, parser)
 
     train(args)

From 089727b5ee40193464d5f81662b523aa7c52bee2 Mon Sep 17 00:00:00 2001
From: Kohya S <ykumeykume@gmail.com>
Date: Sun, 7 Apr 2024 12:42:49 +0900
Subject: [PATCH 04/12] update readme

---
 README.md | 77 +++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 55 insertions(+), 22 deletions(-)

diff --git a/README.md b/README.md
index 87cdf0b76..0cecc5676 100644
--- a/README.md
+++ b/README.md
@@ -139,18 +139,34 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser
 
 ### Mar XX, 2024 / 2024/3/XX: v0.8.6
 
+#### Highlights
 
 - The dependent libraries are updated. Please see [Upgrade](#upgrade) and update the libraries.
   - Especially `imagesize` is newly added, so if you cannot update the libraries immediately, please install with `pip install imagesize==1.4.1` separately.
   - `bitsandbytes==0.43.0`, `prodigyopt==1.0`, `lion-pytorch==0.0.6` are included in the requirements.txt.
   - Also, the PyTorch version is updated to 2.1.2 (PyTorch does not need to be updated immediately). In the upgrade procedure, PyTorch is not updated, so please manually install or update torch, torchvision, xformers if necessary (see [Upgrade PyTorch](#upgrade-pytorch)).
+- When logging to wandb is enabled, the entire command line is exposed. Therefore, it is recommended to write the API key of wandb and the token of HuggingFace in the configuration file (`.toml`). Thanks to bghira for raising the issue.
+  - A warning is displayed at the start of training if such information is included in the command line.
+  - Also, if there is an absolute path, the path may be exposed, so it is recommended to specify a relative path or write it in the configuration file. In such cases, an INFO log is displayed.
+  - See [#1123](https://github.com/kohya-ss/sd-scripts/pull/1123) and PR [#1240](https://github.com/kohya-ss/sd-scripts/pull/1240) for details.
 - Colab seems to stop with log output. Try specifying `--console_log_simple` option in the training script to disable rich logging.
-- The `.toml` file for the dataset config is now read in UTF-8 encoding. PR [#1167](https://github.com/kohya-ss/sd-scripts/pull/1167) Thanks to Horizon1704!
-- Fixed a bug that the last subset settings are applied to all images when multiple subsets of regularization images are specified in the dataset settings. The settings for each subset are correctly applied to each image. PR [#1205](https://github.com/kohya-ss/sd-scripts/pull/1205) Thanks to feffy380!
+- Other improvements include the addition of masked loss, DeepSpeed support, dataset settings improvements, and image tagging improvements. See below for details.
+
+#### Training scripts
+
 - `train_network.py` and `sdxl_train_network.py` are modified to record some dataset settings in the metadata of the trained model (`caption_prefix`, `caption_suffix`, `keep_tokens_separator`, `secondary_separator`, `enable_wildcard`).
 - Fixed a bug that U-Net and Text Encoders are included in the state in `train_network.py` and `sdxl_train_network.py`. The saving and loading of the state are faster, the file size is smaller, and the memory usage when loading is reduced.
 - DeepSpeed is supported. PR [#1101](https://github.com/kohya-ss/sd-scripts/pull/1101)  and [#1139](https://github.com/kohya-ss/sd-scripts/pull/1139) Thanks to BootsofLagrangian! See PR [#1101](https://github.com/kohya-ss/sd-scripts/pull/1101) for details.
 - The masked loss is supported in each training script. PR [#1207](https://github.com/kohya-ss/sd-scripts/pull/1207) See [Masked loss](#masked-loss) for details.
+- The options `--noise_offset_random_strength` and `--ip_noise_gamma_random_strength` are added to each training script. These options can be used to vary the noise offset and ip noise gamma in the range of 0 to the specified value. PR [#1177](https://github.com/kohya-ss/sd-scripts/pull/1177) Thanks to KohakuBlueleaf!
+- The option `--save_state_on_train_end` is added to each training script. PR [#1168](https://github.com/kohya-ss/sd-scripts/pull/1168) Thanks to gesen2egee!
+- The options `--sample_every_n_epochs` and `--sample_every_n_steps` in each training script now display a warning and ignore them when a number less than or equal to `0` is specified. Thanks to S-Del for raising the issue.
+
+#### Dataset settings
+
+- The [English version of the dataset settings documentation](./docs/config_README-en.md) is added. PR [#1175](https://github.com/kohya-ss/sd-scripts/pull/1175) Thanks to darkstorm2150!
+- The `.toml` file for the dataset config is now read in UTF-8 encoding. PR [#1167](https://github.com/kohya-ss/sd-scripts/pull/1167) Thanks to Horizon1704!
+- Fixed a bug that the last subset settings are applied to all images when multiple subsets of regularization images are specified in the dataset settings. The settings for each subset are correctly applied to each image. PR [#1205](https://github.com/kohya-ss/sd-scripts/pull/1205) Thanks to feffy380!
 - Some features are added to the dataset subset settings.
   - `secondary_separator` is added to specify the tag separator that is not the target of shuffling or dropping. 
     - Specify `secondary_separator=";;;"`. When you specify `secondary_separator`, the part is not shuffled or dropped. 
@@ -159,6 +175,9 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser
   - The existing features `caption_prefix` and `caption_suffix` can be used together. `caption_prefix` and `caption_suffix` are processed first, and then `enable_wildcard`, `keep_tokens_separator`, shuffling and dropping, and `secondary_separator` are processed in order.
   - See [Dataset config](./docs/config_README-en.md) for details.
 - The dataset with DreamBooth method supports caching image information (size, caption). PR [#1178](https://github.com/kohya-ss/sd-scripts/pull/1178) and [#1206](https://github.com/kohya-ss/sd-scripts/pull/1206) Thanks to KohakuBlueleaf! See [DreamBooth method specific options](./docs/config_README-en.md#dreambooth-specific-options) for details.
+
+#### Image tagging
+
 - The support for v3 repositories is added to `tag_image_by_wd14_tagger.py` (`--onnx` option only). PR [#1192](https://github.com/kohya-ss/sd-scripts/pull/1192) Thanks to sdbds!
   - Onnx may need to be updated. Onnx is not installed by default, so please install or update it with `pip install onnx==1.15.0 onnxruntime-gpu==1.17.1` etc. Please also check the comments in `requirements.txt`.
 - The model is now saved in the subdirectory as `--repo_id` in `tag_image_by_wd14_tagger.py` . This caches multiple repo_id models. Please delete unnecessary files under `--model_dir`.
@@ -171,22 +190,43 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser
   - Replace tags `--tag_replacement`
   - See [Tagging documentation](./docs/wd14_tagger_README-en.md) for details.
 - Fixed an error when specifying `--beam_search` and a value of 2 or more for `--num_beams` in `make_captions.py`.
-- The options `--noise_offset_random_strength` and `--ip_noise_gamma_random_strength` are added to each training script. These options can be used to vary the noise offset and ip noise gamma in the range of 0 to the specified value. PR [#1177](https://github.com/kohya-ss/sd-scripts/pull/1177) Thanks to KohakuBlueleaf!
-- The options `--save_state_on_train_end` are added to each training script. PR [#1168](https://github.com/kohya-ss/sd-scripts/pull/1168) Thanks to gesen2egee!
-- The options `--sample_every_n_epochs` and `--sample_every_n_steps` in each training script now display a warning and ignore them when a number less than or equal to `0` is specified. Thanks to S-Del for raising the issue.
-- The [English version of the dataset settings documentation](./docs/config_README-en.md) is added. PR [#1175](https://github.com/kohya-ss/sd-scripts/pull/1175) Thanks to darkstorm2150!
+
+#### About Masked loss
+
+The masked loss is supported in each training script. To enable the masked loss, specify the `--masked_loss` option.
+
+The feature is not fully tested, so there may be bugs. If you find any issues, please open an Issue.
+
+ControlNet dataset is used to specify the mask. The mask images should be the RGB images. The pixel value 255 in R channel is treated as the mask (the loss is calculated only for the pixels with the mask), and 0 is treated as the non-mask. The pixel values 0-255 are converted to 0-1 (i.e., the pixel value 128 is treated as the half weight of the loss). See details for the dataset specification in the [LLLite documentation](./docs/train_lllite_README.md#preparing-the-dataset).
+
+
+#### 主要な変更点
 
 - 依存ライブラリが更新されました。[アップグレード](./README-ja.md#アップグレード) を参照しライブラリを更新してください。
   - 特に `imagesize` が新しく追加されていますので、すぐにライブラリの更新ができない場合は `pip install imagesize==1.4.1` で個別にインストールしてください。
   - `bitsandbytes==0.43.0`、`prodigyopt==1.0`、`lion-pytorch==0.0.6` が requirements.txt に含まれるようになりました。
   - また PyTorch のバージョンを 2.1.2 に更新しました。PyTorch はすぐに更新する必要はありません。更新時は、アップグレードの手順では PyTorch が更新されませんので、torch、torchvision、xformers を手動でインストールしてください。
+- wandb へのログ出力が有効の場合、コマンドライン全体が公開されます。そのため、コマンドラインに wandb の API キーや HuggingFace のトークンなどが含まれる場合、設定ファイル（`.toml`）への記載をお勧めします。問題提起していただいた bghira 氏に感謝します。
+  - このような場合には学習開始時に警告が表示されます。
+  - また絶対パスの指定がある場合、そのパスが公開される可能性がありますので、相対パスを指定するか設定ファイルに記載することをお勧めします。このような場合は INFO ログが表示されます。
+  - 詳細は [#1123](https://github.com/kohya-ss/sd-scripts/pull/1123) および PR [#1240](https://github.com/kohya-ss/sd-scripts/pull/1240) をご覧ください。
 - Colab での動作時、ログ出力で停止してしまうようです。学習スクリプトに `--console_log_simple` オプションを指定し、rich のロギングを無効してお試しください。
-- データセット設定の `.toml` ファイルが UTF-8 encoding で読み込まれるようになりました。PR [#1167](https://github.com/kohya-ss/sd-scripts/pull/1167) Horizon1704 氏に感謝します。
-- データセット設定で、正則化画像のサブセットを複数指定した時、最後のサブセットの各種設定がすべてのサブセットの画像に適用される不具合が修正されました。それぞれのサブセットの設定が、それぞれの画像に正しく適用されます。PR [#1205](https://github.com/kohya-ss/sd-scripts/pull/1205) feffy380 氏に感謝します。
+- その他、マスクロス追加、DeepSpeed 対応、データセット設定の改善、画像タグ付けの改善などがあります。詳細は以下をご覧ください。
+
+#### 学習スクリプト
+
 - `train_network.py` および `sdxl_train_network.py` で、学習したモデルのメタデータに一部のデータセット設定が記録されるよう修正しました（`caption_prefix`、`caption_suffix`、`keep_tokens_separator`、`secondary_separator`、`enable_wildcard`）。
 - `train_network.py` および `sdxl_train_network.py` で、state に U-Net および Text Encoder が含まれる不具合を修正しました。state の保存、読み込みが高速化され、ファイルサイズも小さくなり、また読み込み時のメモリ使用量も削減されます。
 - DeepSpeed がサポートされました。PR [#1101](https://github.com/kohya-ss/sd-scripts/pull/1101) 、[#1139](https://github.com/kohya-ss/sd-scripts/pull/1139) BootsofLagrangian 氏に感謝します。詳細は PR [#1101](https://github.com/kohya-ss/sd-scripts/pull/1101) をご覧ください。
 - 各学習スクリプトでマスクロスをサポートしました。PR [#1207](https://github.com/kohya-ss/sd-scripts/pull/1207) 詳細は [Masked loss](#masked-loss) をご覧ください。
+- 各学習スクリプトに、noise offset、ip noise gammaを、それぞれ 0~指定した値の範囲で変動させるオプション `--noise_offset_random_strength` および `--ip_noise_gamma_random_strength` が追加されました。 PR [#1177](https://github.com/kohya-ss/sd-scripts/pull/1177) KohakuBlueleaf 氏に感謝します。
+- 各学習スクリプトに、学習終了時に state を保存する `--save_state_on_train_end` オプションが追加されました。 PR [#1168](https://github.com/kohya-ss/sd-scripts/pull/1168) gesen2egee 氏に感謝します。
+- 各学習スクリプトで `--sample_every_n_epochs` および `--sample_every_n_steps` オプションに `0` 以下の数値を指定した時、警告を表示するとともにそれらを無視するよう変更しました。問題提起していただいた S-Del 氏に感謝します。
+
+#### データセット設定
+
+- データセット設定の `.toml` ファイルが UTF-8 encoding で読み込まれるようになりました。PR [#1167](https://github.com/kohya-ss/sd-scripts/pull/1167) Horizon1704 氏に感謝します。
+- データセット設定で、正則化画像のサブセットを複数指定した時、最後のサブセットの各種設定がすべてのサブセットの画像に適用される不具合が修正されました。それぞれのサブセットの設定が、それぞれの画像に正しく適用されます。PR [#1205](https://github.com/kohya-ss/sd-scripts/pull/1205) feffy380 氏に感謝します。
 - データセットのサブセット設定にいくつかの機能を追加しました。
   - シャッフルの対象とならないタグ分割識別子の指定 `secondary_separator` を追加しました。`secondary_separator=";;;"` のように指定します。`secondary_separator` で区切ることで、その部分はシャッフル、drop 時にまとめて扱われます。
   - `enable_wildcard` を追加しました。`true` にするとワイルドカード記法 `{aaa|bbb|ccc}` が使えます。また複数行キャプションも有効になります。
@@ -194,6 +234,10 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser
   - 既存の機能 `caption_prefix` と `caption_suffix` とあわせて使えます。`caption_prefix` と `caption_suffix` は一番最初に処理され、その後、ワイルドカード、`keep_tokens_separator`、シャッフルおよび drop、`secondary_separator` の順に処理されます。
   - 詳細は [データセット設定](./docs/config_README-ja.md) をご覧ください。
 - DreamBooth 方式の DataSet で画像情報（サイズ、キャプション）をキャッシュする機能が追加されました。PR [#1178](https://github.com/kohya-ss/sd-scripts/pull/1178)、[#1206](https://github.com/kohya-ss/sd-scripts/pull/1206) KohakuBlueleaf 氏に感謝します。詳細は [データセット設定](./docs/config_README-ja.md#dreambooth-方式専用のオプション) をご覧ください。
+- データセット設定の[英語版ドキュメント](./docs/config_README-en.md) が追加されました。PR [#1175](https://github.com/kohya-ss/sd-scripts/pull/1175) darkstorm2150 氏に感謝します。
+
+#### 画像のタグ付け
+
 - `tag_image_by_wd14_tagger.py` で v3 のリポジトリがサポートされました（`--onnx` 指定時のみ有効）。 PR [#1192](https://github.com/kohya-ss/sd-scripts/pull/1192) sdbds 氏に感謝します。
   - Onnx のバージョンアップが必要になるかもしれません。デフォルトでは Onnx はインストールされていませんので、`pip install onnx==1.15.0 onnxruntime-gpu==1.17.1` 等でインストール、アップデートしてください。`requirements.txt` のコメントもあわせてご確認ください。
 - `tag_image_by_wd14_tagger.py` で、モデルを`--repo_id` のサブディレクトリに保存するようにしました。これにより複数のモデルファイルがキャッシュされます。`--model_dir` 直下の不要なファイルは削除願います。
@@ -206,21 +250,8 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser
   - タグを置換する `--tag_replacement`
   - 詳細は [タグ付けに関するドキュメント](./docs/wd14_tagger_README-ja.md) をご覧ください。
 - `make_captions.py` で `--beam_search` を指定し `--num_beams` に2以上の値を指定した時のエラーを修正しました。
-- 各学習スクリプトに、noise offset、ip noise gammaを、それぞれ 0~指定した値の範囲で変動させるオプション `--noise_offset_random_strength` および `--ip_noise_gamma_random_strength` が追加されました。 PR [#1177](https://github.com/kohya-ss/sd-scripts/pull/1177) KohakuBlueleaf 氏に感謝します。
-- 各学習スクリプトに、学習終了時に state を保存する `--save_state_on_train_end` オプションが追加されました。 PR [#1168](https://github.com/kohya-ss/sd-scripts/pull/1168) gesen2egee 氏に感謝します。
-- 各学習スクリプトで `--sample_every_n_epochs` および `--sample_every_n_steps` オプションに `0` 以下の数値を指定した時、警告を表示するとともにそれらを無視するよう変更しました。問題提起していただいた S-Del 氏に感謝します。
-- データセット設定の[英語版ドキュメント](./docs/config_README-en.md) が追加されました。PR [#1175](https://github.com/kohya-ss/sd-scripts/pull/1175) darkstorm2150 氏に感謝します。
-
-Please read [Releases](https://github.com/kohya-ss/sd-scripts/releases) for recent updates.
-最近の更新情報は [Release](https://github.com/kohya-ss/sd-scripts/releases) をご覧ください。
-
-#### Masked loss
 
-The masked loss is supported in each training script. To enable the masked loss, specify the `--masked_loss` option.
-
-The feature is not fully tested, so there may be bugs. If you find any issues, please open an Issue.
-
-ControlNet dataset is used to specify the mask. The mask images should be the RGB images. The pixel value 255 in R channel is treated as the mask (the loss is calculated only for the pixels with the mask), and 0 is treated as the non-mask. The pixel values 0-255 are converted to 0-1 (i.e., the pixel value 128 is treated as the half weight of the loss). See details for the dataset specification in the [LLLite documentation](./docs/train_lllite_README.md#preparing-the-dataset).
+#### マスクロスについて
 
 各学習スクリプトでマスクロスをサポートしました。マスクロスを有効にするには `--masked_loss` オプションを指定してください。
 
@@ -228,6 +259,8 @@ ControlNet dataset is used to specify the mask. The mask images should be the RG
 
 マスクの指定には ControlNet データセットを使用します。マスク画像は RGB 画像である必要があります。R チャンネルのピクセル値 255 がロス計算対象、0 がロス計算対象外になります。0-255 の値は、0-1 の範囲に変換されます（つまりピクセル値 128 の部分はロスの重みが半分になります）。データセットの詳細は [LLLite ドキュメント](./docs/train_lllite_README-ja.md#データセットの準備) をご覧ください。
 
+Please read [Releases](https://github.com/kohya-ss/sd-scripts/releases) for recent updates.
+最近の更新情報は [Release](https://github.com/kohya-ss/sd-scripts/releases) をご覧ください。
 
 ## Additional Information
 

From 90b18795fce516cb00735dc43a6ee76ecae8ec83 Mon Sep 17 00:00:00 2001
From: kabachuha <artemkhrapov2001@yandex.ru>
Date: Sun, 7 Apr 2024 07:54:21 +0300
Subject: [PATCH 05/12] Add option to use Scheduled Huber Loss in all training
 pipelines to improve resilience to data corruption (#1228)

* add huber loss and huber_c compute to train_util

* add reduction modes

* add huber_c retrieval from timestep getter

* move get timesteps and huber to own function

* add conditional loss to all training scripts

* add cond loss to train network

* add (scheduled) huber_loss to args

* fixup twice timesteps getting

* PHL-schedule should depend on noise scheduler's num timesteps

* *2 multiplier to huber loss cause of 1/2 a^2 conv.

The Taylor expansion of sqrt near zero gives 1/2 a^2, which differs from a^2 of the standard MSE loss. This change scales them better against one another

* add option for smooth l1 (huber / delta)

* unify huber scheduling

* add snr huber scheduler

---------

Co-authored-by: Kohya S <52813779+kohya-ss@users.noreply.github.com>
---
 fine_tune.py                         |  6 +--
 library/train_util.py                | 79 ++++++++++++++++++++++++++--
 sdxl_train.py                        |  6 +--
 sdxl_train_control_net_lllite.py     |  4 +-
 sdxl_train_control_net_lllite_old.py |  4 +-
 train_controlnet.py                  | 11 ++--
 train_db.py                          |  4 +-
 train_network.py                     |  4 +-
 train_textual_inversion.py           |  4 +-
 train_textual_inversion_XTI.py       |  4 +-
 10 files changed, 96 insertions(+), 30 deletions(-)

diff --git a/fine_tune.py b/fine_tune.py
index 3c4a5a26b..c7e6bbd2e 100644
--- a/fine_tune.py
+++ b/fine_tune.py
@@ -354,7 +354,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module):
 
                 # Sample noise, sample a random timestep for each image, and add noise to the latents,
                 # with noise offset and/or multires noise if specified
-                noise, noisy_latents, timesteps = train_util.get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents)
+                noise, noisy_latents, timesteps, huber_c = train_util.get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents)
 
                 # Predict the noise residual
                 with accelerator.autocast():
@@ -368,7 +368,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module):
 
                 if args.min_snr_gamma or args.scale_v_pred_loss_like_noise_pred or args.debiased_estimation_loss:
                     # do not mean over batch dimension for snr weight or scale v-pred loss
-                    loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none")
+                    loss = train_util.conditional_loss(noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c)
                     loss = loss.mean([1, 2, 3])
 
                     if args.min_snr_gamma:
@@ -380,7 +380,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module):
 
                     loss = loss.mean()  # mean over batch dimension
                 else:
-                    loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="mean")
+                    loss = train_util.conditional_loss(noise_pred.float(), target.float(), reduction="mean", loss_type=args.loss_type, huber_c=huber_c)
 
                 accelerator.backward(loss)
                 if accelerator.sync_gradients and args.max_grad_norm != 0.0:
diff --git a/library/train_util.py b/library/train_util.py
index c13bb68ee..90e6818ad 100644
--- a/library/train_util.py
+++ b/library/train_util.py
@@ -3236,6 +3236,26 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth:
         default=None,
         help="set maximum time step for U-Net training (1~1000, default is 1000) / U-Net学習時のtime stepの最大値を設定する（1~1000で指定、省略時はデフォルト値(1000)）",
     )
+    parser.add_argument(
+        "--loss_type",
+        type=str,
+        default="l2",
+        choices=["l2", "huber", "smooth_l1"],
+        help="The type of loss to use and whether it's scheduled based on the timestep"
+    )
+    parser.add_argument(
+        "--huber_schedule",
+        type=str,
+        default="exponential",
+        choices=["constant", "exponential", "snr"],
+        help="The scheduling method for the Huber loss parameter (constant, exponential, or SNR-based). Only used if loss_type is huber or smooth_l1"
+    )
+    parser.add_argument(
+        "--huber_c",
+        type=float,
+        default=0.1,
+        help="The huber loss parameter. Only used if one of the huber loss modes (huber or smooth l1) is selected with loss_type.",
+    )
 
     parser.add_argument(
         "--lowram",
@@ -4842,6 +4862,38 @@ def save_sd_model_on_train_end_common(
         if args.huggingface_repo_id is not None:
             huggingface_util.upload(args, out_dir, "/" + model_name, force_sync_upload=True)
 
+def get_timesteps_and_huber_c(args, min_timestep, max_timestep, noise_scheduler, b_size, device):
+
+    #TODO: if a huber loss is selected, it will use constant timesteps for each batch
+    # as. In the future there may be a smarter way
+
+    if args.loss_type == 'huber' or args.loss_type == 'smooth_l1':
+        timesteps = torch.randint(
+            min_timestep, max_timestep, (1,), device='cpu'
+        )
+        timestep = timesteps.item()
+
+        if args.huber_schedule == "exponential":
+            alpha = - math.log(args.huber_c) / noise_scheduler.config.num_train_timesteps
+            huber_c = math.exp(-alpha * timestep)
+        elif args.huber_schedule == "snr":
+            alphas_cumprod = noise_scheduler.alphas_cumprod[timestep]
+            sigmas = ((1.0 - alphas_cumprod) / alphas_cumprod) ** 0.5
+            huber_c = (1 - args.huber_c) / (1 + sigmas)**2 + args.huber_c
+        elif args.huber_schedule == "constant":
+            huber_c = args.huber_c
+        else:
+            raise NotImplementedError(f'Unknown Huber loss schedule {args.huber_schedule}!')
+
+        timesteps = timesteps.repeat(b_size).to(device)
+    elif args.loss_type == 'l2':
+        timesteps = torch.randint(min_timestep, max_timestep, (b_size,), device=device)
+        huber_c = 1 # may be anything, as it's not used
+    else:
+        raise NotImplementedError(f'Unknown loss type {args.loss_type}')
+    timesteps = timesteps.long()
+
+    return timesteps, huber_c
 
 def get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents):
     # Sample noise that we'll add to the latents
@@ -4862,8 +4914,7 @@ def get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents):
     min_timestep = 0 if args.min_timestep is None else args.min_timestep
     max_timestep = noise_scheduler.config.num_train_timesteps if args.max_timestep is None else args.max_timestep
 
-    timesteps = torch.randint(min_timestep, max_timestep, (b_size,), device=latents.device)
-    timesteps = timesteps.long()
+    timesteps, huber_c = get_timesteps_and_huber_c(args, min_timestep, max_timestep, noise_scheduler, b_size, latents.device)
 
     # Add noise to the latents according to the noise magnitude at each timestep
     # (this is the forward diffusion process)
@@ -4876,8 +4927,28 @@ def get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents):
     else:
         noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
 
-    return noise, noisy_latents, timesteps
-
+    return noise, noisy_latents, timesteps, huber_c
+
+# NOTE: if you're using the scheduled version, huber_c has to depend on the timesteps already
+def conditional_loss(model_pred:torch.Tensor, target:torch.Tensor, reduction:str="mean", loss_type:str="l2", huber_c:float=0.1):
+    
+    if loss_type == 'l2':
+        loss = torch.nn.functional.mse_loss(model_pred, target, reduction=reduction)
+    elif loss_type == 'huber':
+        loss = 2 * huber_c * (torch.sqrt((model_pred - target) ** 2 + huber_c**2) - huber_c)
+        if reduction == "mean":
+            loss = torch.mean(loss)
+        elif reduction == "sum":
+            loss = torch.sum(loss)
+    elif loss_type == 'smooth_l1':
+        loss = 2 * (torch.sqrt((model_pred - target) ** 2 + huber_c**2) - huber_c)
+        if reduction == "mean":
+            loss = torch.mean(loss)
+        elif reduction == "sum":
+            loss = torch.sum(loss)
+    else:
+        raise NotImplementedError(f'Unsupported Loss Type {loss_type}')
+    return loss
 
 def append_lr_to_logs(logs, lr_scheduler, optimizer_type, including_unet=True):
     names = []
diff --git a/sdxl_train.py b/sdxl_train.py
index f6d277494..46d7860be 100644
--- a/sdxl_train.py
+++ b/sdxl_train.py
@@ -582,7 +582,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module):
 
                 # Sample noise, sample a random timestep for each image, and add noise to the latents,
                 # with noise offset and/or multires noise if specified
-                noise, noisy_latents, timesteps = train_util.get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents)
+                noise, noisy_latents, timesteps, huber_c = train_util.get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents)
 
                 noisy_latents = noisy_latents.to(weight_dtype)  # TODO check why noisy_latents is not weight_dtype
 
@@ -600,7 +600,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module):
                     or args.masked_loss
                 ):
                     # do not mean over batch dimension for snr weight or scale v-pred loss
-                    loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none")
+                    loss = train_util.conditional_loss(noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c)
                     if args.masked_loss:
                         loss = apply_masked_loss(loss, batch)
                     loss = loss.mean([1, 2, 3])
@@ -616,7 +616,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module):
 
                     loss = loss.mean()  # mean over batch dimension
                 else:
-                    loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="mean")
+                    loss = train_util.conditional_loss(noise_pred.float(), target.float(), reduction="mean", loss_type=args.loss_type, huber_c=huber_c)
 
                 accelerator.backward(loss)
                 if accelerator.sync_gradients and args.max_grad_norm != 0.0:
diff --git a/sdxl_train_control_net_lllite.py b/sdxl_train_control_net_lllite.py
index e880b57de..f89c3628f 100644
--- a/sdxl_train_control_net_lllite.py
+++ b/sdxl_train_control_net_lllite.py
@@ -439,7 +439,7 @@ def remove_model(old_ckpt_name):
 
                 # Sample noise, sample a random timestep for each image, and add noise to the latents,
                 # with noise offset and/or multires noise if specified
-                noise, noisy_latents, timesteps = train_util.get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents)
+                noise, noisy_latents, timesteps, huber_c = train_util.get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents)
 
                 noisy_latents = noisy_latents.to(weight_dtype)  # TODO check why noisy_latents is not weight_dtype
 
@@ -458,7 +458,7 @@ def remove_model(old_ckpt_name):
                 else:
                     target = noise
 
-                loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none")
+                loss = train_util.conditional_loss(noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c)
                 loss = loss.mean([1, 2, 3])
 
                 loss_weights = batch["loss_weights"]  # 各sampleごとのweight
diff --git a/sdxl_train_control_net_lllite_old.py b/sdxl_train_control_net_lllite_old.py
index 0ea64b824..e85e978c1 100644
--- a/sdxl_train_control_net_lllite_old.py
+++ b/sdxl_train_control_net_lllite_old.py
@@ -406,7 +406,7 @@ def remove_model(old_ckpt_name):
 
                 # Sample noise, sample a random timestep for each image, and add noise to the latents,
                 # with noise offset and/or multires noise if specified
-                noise, noisy_latents, timesteps = train_util.get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents)
+                noise, noisy_latents, timesteps, huber_c = train_util.get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents)
 
                 noisy_latents = noisy_latents.to(weight_dtype)  # TODO check why noisy_latents is not weight_dtype
 
@@ -426,7 +426,7 @@ def remove_model(old_ckpt_name):
                 else:
                     target = noise
 
-                loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none")
+                loss = train_util.conditional_loss(noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c)
                 loss = loss.mean([1, 2, 3])
 
                 loss_weights = batch["loss_weights"]  # 各sampleごとのweight
diff --git a/train_controlnet.py b/train_controlnet.py
index 90cac0410..f4c94e8d9 100644
--- a/train_controlnet.py
+++ b/train_controlnet.py
@@ -420,13 +420,8 @@ def remove_model(old_ckpt_name):
                     )
 
                 # Sample a random timestep for each image
-                timesteps = torch.randint(
-                    0,
-                    noise_scheduler.config.num_train_timesteps,
-                    (b_size,),
-                    device=latents.device,
-                )
-                timesteps = timesteps.long()
+                timesteps, huber_c = train_util.get_timesteps_and_huber_c(args, 0, noise_scheduler.config.num_train_timesteps, noise_scheduler, b_size, latents.device)
+
                 # Add noise to the latents according to the noise magnitude at each timestep
                 # (this is the forward diffusion process)
                 noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
@@ -457,7 +452,7 @@ def remove_model(old_ckpt_name):
                 else:
                     target = noise
 
-                loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none")
+                loss = train_util.conditional_loss(noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c)
                 loss = loss.mean([1, 2, 3])
 
                 loss_weights = batch["loss_weights"]  # 各sampleごとのweight
diff --git a/train_db.py b/train_db.py
index c3b7339f3..1de504ed8 100644
--- a/train_db.py
+++ b/train_db.py
@@ -346,7 +346,7 @@ def train(args):
 
                 # Sample noise, sample a random timestep for each image, and add noise to the latents,
                 # with noise offset and/or multires noise if specified
-                noise, noisy_latents, timesteps = train_util.get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents)
+                noise, noisy_latents, timesteps, huber_c = train_util.get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents)
 
                 # Predict the noise residual
                 with accelerator.autocast():
@@ -358,7 +358,7 @@ def train(args):
                 else:
                     target = noise
 
-                loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none")
+                loss = train_util.conditional_loss(noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c)
                 if args.masked_loss:
                     loss = apply_masked_loss(loss, batch)
                 loss = loss.mean([1, 2, 3])
diff --git a/train_network.py b/train_network.py
index fcf4cd9b6..31d89276c 100644
--- a/train_network.py
+++ b/train_network.py
@@ -843,7 +843,7 @@ def remove_model(old_ckpt_name):
 
                     # Sample noise, sample a random timestep for each image, and add noise to the latents,
                     # with noise offset and/or multires noise if specified
-                    noise, noisy_latents, timesteps = train_util.get_noise_noisy_latents_and_timesteps(
+                    noise, noisy_latents, timesteps, huber_c = train_util.get_noise_noisy_latents_and_timesteps(
                         args, noise_scheduler, latents
                     )
 
@@ -873,7 +873,7 @@ def remove_model(old_ckpt_name):
                     else:
                         target = noise
 
-                    loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none")
+                    loss = train_util.conditional_loss(noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c)
                     if args.masked_loss:
                         loss = apply_masked_loss(loss, batch)
                     loss = loss.mean([1, 2, 3])
diff --git a/train_textual_inversion.py b/train_textual_inversion.py
index 02edf9525..10fce2677 100644
--- a/train_textual_inversion.py
+++ b/train_textual_inversion.py
@@ -572,7 +572,7 @@ def remove_model(old_ckpt_name):
 
                     # Sample noise, sample a random timestep for each image, and add noise to the latents,
                     # with noise offset and/or multires noise if specified
-                    noise, noisy_latents, timesteps = train_util.get_noise_noisy_latents_and_timesteps(
+                    noise, noisy_latents, timesteps, huber_c = train_util.get_noise_noisy_latents_and_timesteps(
                         args, noise_scheduler, latents
                     )
 
@@ -588,7 +588,7 @@ def remove_model(old_ckpt_name):
                     else:
                         target = noise
 
-                    loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none")
+                    loss = train_util.conditional_loss(noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c)
                     if args.masked_loss:
                         loss = apply_masked_loss(loss, batch)
                     loss = loss.mean([1, 2, 3])
diff --git a/train_textual_inversion_XTI.py b/train_textual_inversion_XTI.py
index f0723f2a7..ddd03d532 100644
--- a/train_textual_inversion_XTI.py
+++ b/train_textual_inversion_XTI.py
@@ -461,7 +461,7 @@ def remove_model(old_ckpt_name):
 
                 # Sample noise, sample a random timestep for each image, and add noise to the latents,
                 # with noise offset and/or multires noise if specified
-                noise, noisy_latents, timesteps = train_util.get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents)
+                noise, noisy_latents, timesteps, huber_c = train_util.get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents)
 
                 # Predict the noise residual
                 with accelerator.autocast():
@@ -473,7 +473,7 @@ def remove_model(old_ckpt_name):
                 else:
                     target = noise
 
-                loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none")
+                loss = train_util.conditional_loss(noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c)
                 if args.masked_loss:
                     loss = apply_masked_loss(loss, batch)
                 loss = loss.mean([1, 2, 3])

From d30ebb205cbec29010d35222e2f478cf1813e151 Mon Sep 17 00:00:00 2001
From: Kohya S <ykumeykume@gmail.com>
Date: Sun, 7 Apr 2024 14:58:17 +0900
Subject: [PATCH 06/12] update readme, add metadata for network module

---
 README.md             | 47 +++++++++++++++++++++++++++++++++++++++----
 library/train_util.py | 45 +++++++++++++++++++++++------------------
 train_network.py      | 11 ++++++++--
 3 files changed, 77 insertions(+), 26 deletions(-)

diff --git a/README.md b/README.md
index 0cecc5676..5282c1f69 100644
--- a/README.md
+++ b/README.md
@@ -150,14 +150,15 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser
   - Also, if there is an absolute path, the path may be exposed, so it is recommended to specify a relative path or write it in the configuration file. In such cases, an INFO log is displayed.
   - See [#1123](https://github.com/kohya-ss/sd-scripts/pull/1123) and PR [#1240](https://github.com/kohya-ss/sd-scripts/pull/1240) for details.
 - Colab seems to stop with log output. Try specifying `--console_log_simple` option in the training script to disable rich logging.
-- Other improvements include the addition of masked loss, DeepSpeed support, dataset settings improvements, and image tagging improvements. See below for details.
+- Other improvements include the addition of masked loss, scheduled Huber Loss, DeepSpeed support, dataset settings improvements, and image tagging improvements. See below for details.
 
 #### Training scripts
 
 - `train_network.py` and `sdxl_train_network.py` are modified to record some dataset settings in the metadata of the trained model (`caption_prefix`, `caption_suffix`, `keep_tokens_separator`, `secondary_separator`, `enable_wildcard`).
 - Fixed a bug that U-Net and Text Encoders are included in the state in `train_network.py` and `sdxl_train_network.py`. The saving and loading of the state are faster, the file size is smaller, and the memory usage when loading is reduced.
 - DeepSpeed is supported. PR [#1101](https://github.com/kohya-ss/sd-scripts/pull/1101)  and [#1139](https://github.com/kohya-ss/sd-scripts/pull/1139) Thanks to BootsofLagrangian! See PR [#1101](https://github.com/kohya-ss/sd-scripts/pull/1101) for details.
-- The masked loss is supported in each training script. PR [#1207](https://github.com/kohya-ss/sd-scripts/pull/1207) See [Masked loss](#masked-loss) for details.
+- The masked loss is supported in each training script. PR [#1207](https://github.com/kohya-ss/sd-scripts/pull/1207) See [Masked loss](#about-masked-loss) for details.
+- Scheduled Huber Loss has been introduced to each training script. PR [#1228](https://github.com/kohya-ss/sd-scripts/pull/1228/) Thanks to kabachuha for the PR and cheald, drhead, and others for the discussion! See [Scheduled Huber Loss](#about-scheduled-huber-loss) for details.
 - The options `--noise_offset_random_strength` and `--ip_noise_gamma_random_strength` are added to each training script. These options can be used to vary the noise offset and ip noise gamma in the range of 0 to the specified value. PR [#1177](https://github.com/kohya-ss/sd-scripts/pull/1177) Thanks to KohakuBlueleaf!
 - The options `--save_state_on_train_end` are added to each training script. PR [#1168](https://github.com/kohya-ss/sd-scripts/pull/1168) Thanks to gesen2egee!
 - The options `--sample_every_n_epochs` and `--sample_every_n_steps` in each training script now display a warning and ignore them when a number less than or equal to `0` is specified. Thanks to S-Del for raising the issue.
@@ -199,6 +200,23 @@ The feature is not fully tested, so there may be bugs. If you find any issues, p
 
 ControlNet dataset is used to specify the mask. The mask images should be the RGB images. The pixel value 255 in R channel is treated as the mask (the loss is calculated only for the pixels with the mask), and 0 is treated as the non-mask. The pixel values 0-255 are converted to 0-1 (i.e., the pixel value 128 is treated as the half weight of the loss). See details for the dataset specification in the [LLLite documentation](./docs/train_lllite_README.md#preparing-the-dataset).
 
+#### About Scheduled Huber Loss
+
+Scheduled Huber Loss has been introduced to each training script. This is a method to improve robustness against outliers or anomalies (data corruption) in the training data.
+
+With the traditional MSE (L2) loss function, the impact of outliers could be significant, potentially leading to a degradation in the quality of generated images. On the other hand, while the Huber loss function can suppress the influence of outliers, it tends to compromise the reproduction of fine details in images.
+
+To address this, the proposed method employs a clever application of the Huber loss function. By scheduling the loss over the diffusion timesteps, behaving like Huber loss at high-noise timesteps (the early stages of denoising) and like MSE at low-noise timesteps, it strikes a balance between outlier robustness and fine detail reproduction.
+
+Experimental results have confirmed that this method achieves higher accuracy on data containing outliers compared to pure Huber loss or MSE. The increase in computational cost is minimal.
+
+The newly added arguments loss_type, huber_schedule, and huber_c allow for the selection of the loss function type (Huber, smooth L1, MSE), scheduling method (exponential, constant, SNR), and Huber's parameter. This enables optimization based on the characteristics of the dataset.
+
+See PR [#1228](https://github.com/kohya-ss/sd-scripts/pull/1228/) for details.
+
+- `loss_type`: Specify the loss function type. Choose `huber` for Huber loss, `smooth_l1` for smooth L1 loss, and `l2` for MSE loss. The default is `l2`, which is the same as before.
+- `huber_schedule`: Specify the scheduling method. Choose `exponential`, `constant`, or `snr`. The default is `exponential`.
+- `huber_c`: Specify the Huber loss parameter. The default is `0.1`.
 
 #### 主要な変更点
 
@@ -211,14 +229,15 @@ ControlNet dataset is used to specify the mask. The mask images should be the RG
   - また絶対パスの指定がある場合、そのパスが公開される可能性がありますので、相対パスを指定するか設定ファイルに記載することをお勧めします。このような場合は INFO ログが表示されます。
   - 詳細は [#1123](https://github.com/kohya-ss/sd-scripts/pull/1123) および PR [#1240](https://github.com/kohya-ss/sd-scripts/pull/1240) をご覧ください。
 - Colab での動作時、ログ出力で停止してしまうようです。学習スクリプトに `--console_log_simple` オプションを指定し、rich のロギングを無効してお試しください。
-- その他、マスクロス追加、DeepSpeed 対応、データセット設定の改善、画像タグ付けの改善などがあります。詳細は以下をご覧ください。
+- その他、マスクロス追加、Scheduled Huber Loss 追加、DeepSpeed 対応、データセット設定の改善、画像タグ付けの改善などがあります。詳細は以下をご覧ください。
 
 #### 学習スクリプト
 
 - `train_network.py` および `sdxl_train_network.py` で、学習したモデルのメタデータに一部のデータセット設定が記録されるよう修正しました（`caption_prefix`、`caption_suffix`、`keep_tokens_separator`、`secondary_separator`、`enable_wildcard`）。
 - `train_network.py` および `sdxl_train_network.py` で、state に U-Net および Text Encoder が含まれる不具合を修正しました。state の保存、読み込みが高速化され、ファイルサイズも小さくなり、また読み込み時のメモリ使用量も削減されます。
 - DeepSpeed がサポートされました。PR [#1101](https://github.com/kohya-ss/sd-scripts/pull/1101) 、[#1139](https://github.com/kohya-ss/sd-scripts/pull/1139) BootsofLagrangian 氏に感謝します。詳細は PR [#1101](https://github.com/kohya-ss/sd-scripts/pull/1101) をご覧ください。
-- 各学習スクリプトでマスクロスをサポートしました。PR [#1207](https://github.com/kohya-ss/sd-scripts/pull/1207) 詳細は [Masked loss](#masked-loss) をご覧ください。
+- 各学習スクリプトでマスクロスをサポートしました。PR [#1207](https://github.com/kohya-ss/sd-scripts/pull/1207) 詳細は [マスクロスについて](#マスクロスについて) をご覧ください。
+- 各学習スクリプトに Scheduled Huber Loss を追加しました。PR [#1228](https://github.com/kohya-ss/sd-scripts/pull/1228/) ご提案いただいた kabachuha 氏、および議論を深めてくださった cheald 氏、drhead 氏を始めとする諸氏に感謝します。詳細は [Scheduled Huber Loss について](#scheduled-huber-loss-について) をご覧ください。
 - 各学習スクリプトに、noise offset、ip noise gammaを、それぞれ 0~指定した値の範囲で変動させるオプション `--noise_offset_random_strength` および `--ip_noise_gamma_random_strength` が追加されました。 PR [#1177](https://github.com/kohya-ss/sd-scripts/pull/1177) KohakuBlueleaf 氏に感謝します。
 - 各学習スクリプトに、学習終了時に state を保存する `--save_state_on_train_end` オプションが追加されました。 PR [#1168](https://github.com/kohya-ss/sd-scripts/pull/1168) gesen2egee 氏に感謝します。
 - 各学習スクリプトで `--sample_every_n_epochs` および `--sample_every_n_steps` オプションに `0` 以下の数値を指定した時、警告を表示するとともにそれらを無視するよう変更しました。問題提起していただいた S-Del 氏に感謝します。
@@ -262,6 +281,26 @@ ControlNet dataset is used to specify the mask. The mask images should be the RG
 Please read [Releases](https://github.com/kohya-ss/sd-scripts/releases) for recent updates.
 最近の更新情報は [Release](https://github.com/kohya-ss/sd-scripts/releases) をご覧ください。
 
+#### Scheduled Huber Loss について
+
+各学習スクリプトに、学習データ中の異常値や外れ値（data corruption）への耐性を高めるための手法、Scheduled Huber Lossが導入されました。
+
+従来のMSE（L2）損失関数では、異常値の影響を大きく受けてしまい、生成画像の品質低下を招く恐れがありました。一方、Huber損失関数は異常値の影響を抑えられますが、画像の細部再現性が損なわれがちでした。
+
+この手法ではHuber損失関数の適用を工夫し、学習の初期段階（ノイズが大きい場合）ではHuber損失を、後期段階ではMSEを用いるようスケジューリングすることで、異常値耐性と細部再現性のバランスを取ります。
+
+実験の結果では、この手法が純粋なHuber損失やMSEと比べ、異常値を含むデータでより高い精度を達成することが確認されています。また計算コストの増加はわずかです。
+
+具体的には、新たに追加された引数loss_type、huber_schedule、huber_cで、損失関数の種類（Huber, smooth L1, MSE）とスケジューリング方法（exponential, constant, SNR）を選択できます。これによりデータセットに応じた最適化が可能になります。
+
+詳細は PR [#1228](https://github.com/kohya-ss/sd-scripts/pull/1228/) をご覧ください。
+
+- `loss_type` : 損失関数の種類を指定します。`huber` で Huber損失、`smooth_l1` で smooth L1 損失、`l2` で MSE 損失を選択します。デフォルトは `l2` で、従来と同様です。
+- `huber_schedule` : スケジューリング方法を指定します。`exponential` で指数関数的、`constant` で一定、`snr` で信号対雑音比に基づくスケジューリングを選択します。デフォルトは `exponential` です。
+- `huber_c` : Huber損失のパラメータを指定します。デフォルトは `0.1` です。
+
+PR 内でいくつかの比較が共有されています。この機能を試す場合、最初は `--loss_type smooth_l1 --huber_schedule snr --huber_c 0.1` などで試してみるとよいかもしれません。
+
 ## Additional Information
 
 ### Naming of LoRA
diff --git a/library/train_util.py b/library/train_util.py
index 90e6818ad..9ce129bd9 100644
--- a/library/train_util.py
+++ b/library/train_util.py
@@ -3241,20 +3241,21 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth:
         type=str,
         default="l2",
         choices=["l2", "huber", "smooth_l1"],
-        help="The type of loss to use and whether it's scheduled based on the timestep"
+        help="The type of loss function to use (L2, Huber, or smooth L1), default is L2 / 使用する損失関数の種類（L2、Huber、またはsmooth L1）、デフォルトはL2",
     )
     parser.add_argument(
         "--huber_schedule",
         type=str,
         default="exponential",
         choices=["constant", "exponential", "snr"],
-        help="The type of loss to use and whether it's scheduled based on the timestep"
+        help="The scheduling method for Huber loss (constant, exponential, or SNR-based). Only used when loss_type is 'huber' or 'smooth_l1'. default is exponential"
+        + " / Huber損失のスケジューリング方法（constant、exponential、またはSNRベース）。loss_typeが'huber'または'smooth_l1'の場合に有効、デフォルトはexponential",
     )
     parser.add_argument(
         "--huber_c",
         type=float,
         default=0.1,
-        help="The huber loss parameter. Only used if one of the huber loss modes (huber or smooth l1) is selected with loss_type.",
+        help="The huber loss parameter. Only used if one of the huber loss modes (huber or smooth l1) is selected with loss_type. default is 0.1 / Huber損失のパラメータ。loss_typeがhuberまたはsmooth l1の場合に有効。デフォルトは0.1",
     )
 
     parser.add_argument(
@@ -4862,39 +4863,39 @@ def save_sd_model_on_train_end_common(
         if args.huggingface_repo_id is not None:
             huggingface_util.upload(args, out_dir, "/" + model_name, force_sync_upload=True)
 
+
 def get_timesteps_and_huber_c(args, min_timestep, max_timestep, noise_scheduler, b_size, device):
 
-    #TODO: if a huber loss is selected, it will use constant timesteps for each batch
+    # TODO: if a huber loss is selected, it will use constant timesteps for each batch
     # as. In the future there may be a smarter way
 
-    if args.loss_type == 'huber' or args.loss_type == 'smooth_l1':
-        timesteps = torch.randint(
-            min_timestep, max_timestep, (1,), device='cpu'
-        )
+    if args.loss_type == "huber" or args.loss_type == "smooth_l1":
+        timesteps = torch.randint(min_timestep, max_timestep, (1,), device="cpu")
         timestep = timesteps.item()
 
         if args.huber_schedule == "exponential":
-            alpha = - math.log(args.huber_c) / noise_scheduler.config.num_train_timesteps
+            alpha = -math.log(args.huber_c) / noise_scheduler.config.num_train_timesteps
             huber_c = math.exp(-alpha * timestep)
         elif args.huber_schedule == "snr":
             alphas_cumprod = noise_scheduler.alphas_cumprod[timestep]
             sigmas = ((1.0 - alphas_cumprod) / alphas_cumprod) ** 0.5
-            huber_c = (1 - args.huber_c) / (1 + sigmas)**2 + args.huber_c
+            huber_c = (1 - args.huber_c) / (1 + sigmas) ** 2 + args.huber_c
         elif args.huber_schedule == "constant":
             huber_c = args.huber_c
         else:
-            raise NotImplementedError(f'Unknown Huber loss schedule {args.huber_schedule}!')
+            raise NotImplementedError(f"Unknown Huber loss schedule {args.huber_schedule}!")
 
         timesteps = timesteps.repeat(b_size).to(device)
-    elif args.loss_type == 'l2':
+    elif args.loss_type == "l2":
         timesteps = torch.randint(min_timestep, max_timestep, (b_size,), device=device)
-        huber_c = 1 # may be anything, as it's not used
+        huber_c = 1  # may be anything, as it's not used
     else:
-        raise NotImplementedError(f'Unknown loss type {args.loss_type}')
+        raise NotImplementedError(f"Unknown loss type {args.loss_type}")
     timesteps = timesteps.long()
 
     return timesteps, huber_c
 
+
 def get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents):
     # Sample noise that we'll add to the latents
     noise = torch.randn_like(latents, device=latents.device)
@@ -4929,27 +4930,31 @@ def get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents):
 
     return noise, noisy_latents, timesteps, huber_c
 
+
 # NOTE: if you're using the scheduled version, huber_c has to depend on the timesteps already
-def conditional_loss(model_pred:torch.Tensor, target:torch.Tensor, reduction:str="mean", loss_type:str="l2", huber_c:float=0.1):
-    
-    if loss_type == 'l2':
+def conditional_loss(
+    model_pred: torch.Tensor, target: torch.Tensor, reduction: str = "mean", loss_type: str = "l2", huber_c: float = 0.1
+):
+
+    if loss_type == "l2":
         loss = torch.nn.functional.mse_loss(model_pred, target, reduction=reduction)
-    elif loss_type == 'huber':
+    elif loss_type == "huber":
         loss = 2 * huber_c * (torch.sqrt((model_pred - target) ** 2 + huber_c**2) - huber_c)
         if reduction == "mean":
             loss = torch.mean(loss)
         elif reduction == "sum":
             loss = torch.sum(loss)
-    elif loss_type == 'smooth_l1':
+    elif loss_type == "smooth_l1":
         loss = 2 * (torch.sqrt((model_pred - target) ** 2 + huber_c**2) - huber_c)
         if reduction == "mean":
             loss = torch.mean(loss)
         elif reduction == "sum":
             loss = torch.sum(loss)
     else:
-        raise NotImplementedError(f'Unsupported Loss Type {loss_type}')
+        raise NotImplementedError(f"Unsupported Loss Type {loss_type}")
     return loss
 
+
 def append_lr_to_logs(logs, lr_scheduler, optimizer_type, including_unet=True):
     names = []
     if including_unet:
diff --git a/train_network.py b/train_network.py
index 31d89276c..c99d37247 100644
--- a/train_network.py
+++ b/train_network.py
@@ -476,7 +476,7 @@ def save_model_hook(models, weights, output_dir):
             # pop weights of other models than network to save only network weights
             if accelerator.is_main_process:
                 remove_indices = []
-                for i,model in enumerate(models):
+                for i, model in enumerate(models):
                     if not isinstance(model, type(accelerator.unwrap_model(network))):
                         remove_indices.append(i)
                 for i in reversed(remove_indices):
@@ -569,6 +569,11 @@ def load_model_hook(models, input_dir):
             "ss_scale_weight_norms": args.scale_weight_norms,
             "ss_ip_noise_gamma": args.ip_noise_gamma,
             "ss_debiased_estimation": bool(args.debiased_estimation_loss),
+            "ss_noise_offset_random_strength": args.noise_offset_random_strength,
+            "ss_ip_noise_gamma_random_strength": args.ip_noise_gamma_random_strength,
+            "ss_loss_type": args.loss_type,
+            "ss_huber_schedule": args.huber_schedule,
+            "ss_huber_c": args.huber_c,
         }
 
         if use_user_config:
@@ -873,7 +878,9 @@ def remove_model(old_ckpt_name):
                     else:
                         target = noise
 
-                    loss = train_util.conditional_loss(noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c)
+                    loss = train_util.conditional_loss(
+                        noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c
+                    )
                     if args.masked_loss:
                         loss = apply_masked_loss(loss, batch)
                     loss = loss.mean([1, 2, 3])

From 9c4d7d56482f08618a337ab0733824ac5704b6c3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9D=92=E9=BE=8D=E8=81=96=E8=80=85=40bdsqlsz?=
 <qinglongshengzhe@gmail.com>
Date: Sun, 7 Apr 2024 17:15:49 +0800
Subject: [PATCH 07/12] init

---
 fine_tune.py                         |  5 ++++-
 library/train_util.py                | 15 +++++++++++++++
 sdxl_train.py                        |  5 ++++-
 sdxl_train_control_net_lllite.py     |  7 ++++++-
 sdxl_train_control_net_lllite_old.py |  7 ++++++-
 train_db.py                          |  5 ++++-
 train_network.py                     |  5 ++++-
 train_textual_inversion_XTI.py       |  7 ++++++-
 8 files changed, 49 insertions(+), 7 deletions(-)

diff --git a/fine_tune.py b/fine_tune.py
index a0350ce18..aab7596e3 100644
--- a/fine_tune.py
+++ b/fine_tune.py
@@ -322,6 +322,8 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module):
 
         for m in training_models:
             m.train()
+            if (args.optimizer_type.lower().endswith("schedulefree")):
+                optimizer.train()
 
         for step, batch in enumerate(train_dataloader):
             current_step.value = global_step
@@ -390,7 +392,8 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module):
                     accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
 
                 optimizer.step()
-                lr_scheduler.step()
+                if not args.optimizer_type.lower().endswith("scheduleFree"):
+                    lr_scheduler.step()
                 optimizer.zero_grad(set_to_none=True)
 
             # Checks if the accelerator has performed an optimization step behind the scenes
diff --git a/library/train_util.py b/library/train_util.py
index 1a46f6a7d..035870134 100644
--- a/library/train_util.py
+++ b/library/train_util.py
@@ -4012,6 +4012,21 @@ def get_optimizer(args, trainable_params):
         logger.info(f"use AdamW optimizer | {optimizer_kwargs}")
         optimizer_class = torch.optim.AdamW
         optimizer = optimizer_class(trainable_params, lr=lr, **optimizer_kwargs)
+        
+    elif optimizer_type.endswith("schedulefree".lower()):
+        try:
+            import schedulefree as sf
+        except ImportError:
+            raise ImportError("No schedulefree / schedulefreeがインストールされていないようです")
+        if optimizer_type == "AdamWScheduleFree".lower():
+                optimizer_class = sf.AdamWScheduleFree
+                logger.info(f"use AdamWScheduleFree optimizer | {optimizer_kwargs}")
+        elif optimizer_type == "SGDScheduleFree".lower():
+            optimizer_class = sf.SGDScheduleFree 
+            logger.info(f"use SGDScheduleFree optimizer | {optimizer_kwargs}")
+        else:
+            raise ValueError(f"Unknown optimizer type: {optimizer_type}")
+        optimizer = optimizer_class(trainable_params, lr=lr, **optimizer_kwargs)
 
     if optimizer is None:
         # 任意のoptimizerを使う
diff --git a/sdxl_train.py b/sdxl_train.py
index 816598e04..3ab0513d0 100644
--- a/sdxl_train.py
+++ b/sdxl_train.py
@@ -501,6 +501,8 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module):
 
         for m in training_models:
             m.train()
+            if (args.optimizer_type.lower().endswith("schedulefree")):
+                optimizer.train()
 
         for step, batch in enumerate(train_dataloader):
             current_step.value = global_step
@@ -626,7 +628,8 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module):
                     accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
 
                 optimizer.step()
-                lr_scheduler.step()
+                if not args.optimizer_type.lower().endswith("scheduleFree"):
+                    lr_scheduler.step()
                 optimizer.zero_grad(set_to_none=True)
 
             # Checks if the accelerator has performed an optimization step behind the scenes
diff --git a/sdxl_train_control_net_lllite.py b/sdxl_train_control_net_lllite.py
index 9eaaa19f2..2c0c9818e 100644
--- a/sdxl_train_control_net_lllite.py
+++ b/sdxl_train_control_net_lllite.py
@@ -290,8 +290,12 @@ def train(args):
 
     if args.gradient_checkpointing:
         unet.train()  # according to TI example in Diffusers, train is required -> これオリジナルのU-Netしたので本当は外せる
+        if (args.optimizer_type.lower().endswith("schedulefree")):
+            optimizer.train()
     else:
         unet.eval()
+        if (args.optimizer_type.lower().endswith("schedulefree")):
+            optimizer.eval()
 
     # TextEncoderの出力をキャッシュするときにはCPUへ移動する
     if args.cache_text_encoder_outputs:
@@ -481,7 +485,8 @@ def remove_model(old_ckpt_name):
                     accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
 
                 optimizer.step()
-                lr_scheduler.step()
+                if not args.optimizer_type.lower().endswith("scheduleFree"):
+                    lr_scheduler.step()
                 optimizer.zero_grad(set_to_none=True)
 
             # Checks if the accelerator has performed an optimization step behind the scenes
diff --git a/sdxl_train_control_net_lllite_old.py b/sdxl_train_control_net_lllite_old.py
index e55a58896..a383948b5 100644
--- a/sdxl_train_control_net_lllite_old.py
+++ b/sdxl_train_control_net_lllite_old.py
@@ -261,8 +261,12 @@ def train(args):
 
     if args.gradient_checkpointing:
         unet.train()  # according to TI example in Diffusers, train is required -> これオリジナルのU-Netしたので本当は外せる
+        if (args.optimizer_type.lower().endswith("schedulefree")):
+            optimizer.train()
     else:
         unet.eval()
+        if (args.optimizer_type.lower().endswith("schedulefree")):
+            optimizer.eval()
 
     network.prepare_grad_etc()
 
@@ -449,7 +453,8 @@ def remove_model(old_ckpt_name):
                     accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
 
                 optimizer.step()
-                lr_scheduler.step()
+                if not args.optimizer_type.lower().endswith("scheduleFree"):
+                    lr_scheduler.step()
                 optimizer.zero_grad(set_to_none=True)
 
             # Checks if the accelerator has performed an optimization step behind the scenes
diff --git a/train_db.py b/train_db.py
index 0a152f224..15e1a63c5 100644
--- a/train_db.py
+++ b/train_db.py
@@ -302,6 +302,8 @@ def train(args):
 
         # 指定したステップ数までText Encoderを学習する：epoch最初の状態
         unet.train()
+        if (args.optimizer_type.lower().endswith("schedulefree")):
+            optimizer.train()
         # train==True is required to enable gradient_checkpointing
         if args.gradient_checkpointing or global_step < args.stop_text_encoder_training:
             text_encoder.train()
@@ -384,7 +386,8 @@ def train(args):
                     accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
 
                 optimizer.step()
-                lr_scheduler.step()
+                if not args.optimizer_type.lower().endswith("scheduleFree"):
+                    lr_scheduler.step()
                 optimizer.zero_grad(set_to_none=True)
 
             # Checks if the accelerator has performed an optimization step behind the scenes
diff --git a/train_network.py b/train_network.py
index 8fe98f126..a6b67128f 100644
--- a/train_network.py
+++ b/train_network.py
@@ -446,6 +446,8 @@ def train(self, args):
         if args.gradient_checkpointing:
             # according to TI example in Diffusers, train is required
             unet.train()
+            if (args.optimizer_type.lower().endswith("schedulefree")):
+                optimizer.train()
             for t_enc in text_encoders:
                 t_enc.train()
 
@@ -900,7 +902,8 @@ def remove_model(old_ckpt_name):
                             accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
 
                     optimizer.step()
-                    lr_scheduler.step()
+                    if not args.optimizer_type.lower().endswith("scheduleFree"):
+                        lr_scheduler.step()
                     optimizer.zero_grad(set_to_none=True)
 
                 if args.scale_weight_norms:
diff --git a/train_textual_inversion_XTI.py b/train_textual_inversion_XTI.py
index 861d48d1d..c6921c4e4 100644
--- a/train_textual_inversion_XTI.py
+++ b/train_textual_inversion_XTI.py
@@ -354,8 +354,12 @@ def train(args):
     unet.to(accelerator.device, dtype=weight_dtype)
     if args.gradient_checkpointing:  # according to TI example in Diffusers, train is required
         unet.train()
+        if (args.optimizer_type.lower().endswith("schedulefree")):
+            optimizer.train()
     else:
         unet.eval()
+        if (args.optimizer_type.lower().endswith("schedulefree")):
+            optimizer.eval()
 
     if not cache_latents:
         vae.requires_grad_(False)
@@ -496,7 +500,8 @@ def remove_model(old_ckpt_name):
                     accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
 
                 optimizer.step()
-                lr_scheduler.step()
+                if not args.optimizer_type.lower().endswith("scheduleFree"):
+                    lr_scheduler.step()
                 optimizer.zero_grad(set_to_none=True)
 
                 # Let's make sure we don't update any embedding weights besides the newly added token

From f5f3bb01fa69e307c283705f3554a3074d26648c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9D=92=E9=BE=8D=E8=81=96=E8=80=85=40bdsqlsz?=
 <qinglongshengzhe@gmail.com>
Date: Sun, 7 Apr 2024 17:57:32 +0800
Subject: [PATCH 08/12] use no schedule

---
 fine_tune.py                         | 27 ++++++++++++++++++++-------
 sdxl_train.py                        | 16 ++++++++++++----
 sdxl_train_control_net_lllite.py     |  5 ++++-
 sdxl_train_control_net_lllite_old.py | 11 ++++++++---
 train_controlnet.py                  | 11 ++++++++---
 train_db.py                          | 27 ++++++++++++++++++++-------
 train_network.py                     | 24 +++++++++++++++++-------
 train_textual_inversion.py           | 22 ++++++++++++++++------
 train_textual_inversion_XTI.py       | 11 ++++++++---
 9 files changed, 113 insertions(+), 41 deletions(-)

diff --git a/fine_tune.py b/fine_tune.py
index cecb41b19..17d091408 100644
--- a/fine_tune.py
+++ b/fine_tune.py
@@ -255,18 +255,31 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module):
             ds_model = deepspeed_utils.prepare_deepspeed_model(args, unet=unet, text_encoder=text_encoder)
         else:
             ds_model = deepspeed_utils.prepare_deepspeed_model(args, unet=unet)
-        ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
-            ds_model, optimizer, train_dataloader, lr_scheduler
-        )
+        if args.optimizer_type.lower().endswith("scheduleFree"):
+            ds_model, optimizer, train_dataloader = accelerator.prepare(
+                ds_model, optimizer, train_dataloader
+            )
+        else:
+            ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+                ds_model, optimizer, train_dataloader, lr_scheduler
+            )
         training_models = [ds_model]
     else:
         # acceleratorがなんかよろしくやってくれるらしい
         if args.train_text_encoder:
-            unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
-                unet, text_encoder, optimizer, train_dataloader, lr_scheduler
-            )
+            if args.optimizer_type.lower().endswith("scheduleFree"):
+                unet, text_encoder, optimizer, train_dataloader  = accelerator.prepare(
+                    unet, text_encoder, optimizer, train_dataloader
+                )
+            else:
+                unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+                    unet, text_encoder, optimizer, train_dataloader, lr_scheduler
+                )
         else:
-            unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(unet, optimizer, train_dataloader, lr_scheduler)
+            if args.optimizer_type.lower().endswith("scheduleFree"):
+                unet, optimizer, train_dataloader = accelerator.prepare(unet, optimizer, train_dataloader)
+            else:
+                unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(unet, optimizer, train_dataloader, lr_scheduler)
 
     # 実験的機能：勾配も含めたfp16学習を行う　PyTorchにパッチを当ててfp16でのgrad scaleを有効にする
     if args.full_fp16:
diff --git a/sdxl_train.py b/sdxl_train.py
index 6acd8a6ac..2590d36c1 100644
--- a/sdxl_train.py
+++ b/sdxl_train.py
@@ -415,9 +415,14 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module):
             text_encoder2=text_encoder2 if train_text_encoder2 else None,
         )
         # most of ZeRO stage uses optimizer partitioning, so we have to prepare optimizer and ds_model at the same time. # pull/1139#issuecomment-1986790007
-        ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
-            ds_model, optimizer, train_dataloader, lr_scheduler
-        )
+        if args.optimizer_type.lower().endswith("scheduleFree"):
+            ds_model, optimizer, train_dataloader = accelerator.prepare(
+                ds_model, optimizer, train_dataloader
+            )
+        else:
+            ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+                ds_model, optimizer, train_dataloader, lr_scheduler
+            )
         training_models = [ds_model]
 
     else:
@@ -428,7 +433,10 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module):
             text_encoder1 = accelerator.prepare(text_encoder1)
         if train_text_encoder2:
             text_encoder2 = accelerator.prepare(text_encoder2)
-        optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler)
+        if args.optimizer_type.lower().endswith("scheduleFree"):
+            optimizer, train_dataloader = accelerator.prepare(optimizer, train_dataloader)
+        else:
+            optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler)
 
     # TextEncoderの出力をキャッシュするときにはCPUへ移動する
     if args.cache_text_encoder_outputs:
diff --git a/sdxl_train_control_net_lllite.py b/sdxl_train_control_net_lllite.py
index d788bacf5..6cbb4741b 100644
--- a/sdxl_train_control_net_lllite.py
+++ b/sdxl_train_control_net_lllite.py
@@ -286,7 +286,10 @@ def train(args):
     unet.to(weight_dtype)
 
     # acceleratorがなんかよろしくやってくれるらしい
-    unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(unet, optimizer, train_dataloader, lr_scheduler)
+    if args.optimizer_type.lower().endswith("scheduleFree"):
+        unet, optimizer, train_dataloader = accelerator.prepare(unet, optimizer, train_dataloader)
+    else:
+        unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(unet, optimizer, train_dataloader, lr_scheduler)
 
     if args.gradient_checkpointing:
         unet.train()  # according to TI example in Diffusers, train is required -> これオリジナルのU-Netしたので本当は外せる
diff --git a/sdxl_train_control_net_lllite_old.py b/sdxl_train_control_net_lllite_old.py
index 3e81f2c94..bb48fcb14 100644
--- a/sdxl_train_control_net_lllite_old.py
+++ b/sdxl_train_control_net_lllite_old.py
@@ -254,9 +254,14 @@ def train(args):
         network.to(weight_dtype)
 
     # acceleratorがなんかよろしくやってくれるらしい
-    unet, network, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
-        unet, network, optimizer, train_dataloader, lr_scheduler
-    )
+    if args.optimizer_type.lower().endswith("scheduleFree"):
+        unet, network, optimizer, train_dataloader = accelerator.prepare(
+            unet, network, optimizer, train_dataloader
+        )
+    else:
+        unet, network, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, network, optimizer, train_dataloader, lr_scheduler
+        )
     network: control_net_lllite.ControlNetLLLite
 
     if args.gradient_checkpointing:
diff --git a/train_controlnet.py b/train_controlnet.py
index f4c94e8d9..6b71799dc 100644
--- a/train_controlnet.py
+++ b/train_controlnet.py
@@ -276,9 +276,14 @@ def train(args):
         controlnet.to(weight_dtype)
 
     # acceleratorがなんかよろしくやってくれるらしい
-    controlnet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
-        controlnet, optimizer, train_dataloader, lr_scheduler
-    )
+    if args.optimizer_type.lower().endswith("scheduleFree"):
+        controlnet, optimizer, train_dataloader = accelerator.prepare(
+            controlnet, optimizer, train_dataloader
+        )
+    else:
+        controlnet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            controlnet, optimizer, train_dataloader, lr_scheduler
+        )
 
     unet.requires_grad_(False)
     text_encoder.requires_grad_(False)
diff --git a/train_db.py b/train_db.py
index 62f9852f0..ad55d6ce0 100644
--- a/train_db.py
+++ b/train_db.py
@@ -229,19 +229,32 @@ def train(args):
             ds_model = deepspeed_utils.prepare_deepspeed_model(args, unet=unet, text_encoder=text_encoder)
         else:
             ds_model = deepspeed_utils.prepare_deepspeed_model(args, unet=unet)
-        ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
-            ds_model, optimizer, train_dataloader, lr_scheduler
-        )
+        if args.optimizer_type.lower().endswith("scheduleFree"):
+            ds_model, optimizer, train_dataloader = accelerator.prepare(
+                ds_model, optimizer, train_dataloader
+            )
+        else:
+            ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+                ds_model, optimizer, train_dataloader, lr_scheduler
+            )
         training_models = [ds_model]
 
     else:
         if train_text_encoder:
-            unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
-                unet, text_encoder, optimizer, train_dataloader, lr_scheduler
-            )
+            if args.optimizer_type.lower().endswith("scheduleFree"):
+                unet, text_encoder, optimizer, train_dataloader  = accelerator.prepare(
+                    unet, text_encoder, optimizer, train_dataloader
+                )
+            else:
+                unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+                    unet, text_encoder, optimizer, train_dataloader, lr_scheduler
+                )
             training_models = [unet, text_encoder]
         else:
-            unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(unet, optimizer, train_dataloader, lr_scheduler)
+            if args.optimizer_type.lower().endswith("scheduleFree"):
+                unet, optimizer, train_dataloader = accelerator.prepare(unet, optimizer, train_dataloader)
+            else:
+                unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(unet, optimizer, train_dataloader, lr_scheduler)
             training_models = [unet]
 
     if not train_text_encoder:
diff --git a/train_network.py b/train_network.py
index d47042805..e7db8168c 100644
--- a/train_network.py
+++ b/train_network.py
@@ -420,9 +420,14 @@ def train(self, args):
                 text_encoder2=text_encoders[1] if train_text_encoder and len(text_encoders) > 1 else None,
                 network=network,
             )
-            ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
-                ds_model, optimizer, train_dataloader, lr_scheduler
-            )
+            if args.optimizer_type.lower().endswith("scheduleFree"):
+                ds_model, optimizer, train_dataloader = accelerator.prepare(
+                    ds_model, optimizer, train_dataloader
+                )    
+            else:
+                ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+                    ds_model, optimizer, train_dataloader, lr_scheduler
+                )
             training_model = ds_model
         else:
             if train_unet:
@@ -437,10 +442,15 @@ def train(self, args):
                     text_encoders = [text_encoder]
             else:
                 pass  # if text_encoder is not trained, no need to prepare. and device and dtype are already set
-
-            network, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
-                network, optimizer, train_dataloader, lr_scheduler
-            )
+            
+            if args.optimizer_type.lower().endswith("scheduleFree"):
+                network, optimizer, train_dataloader = accelerator.prepare(
+                    network, optimizer, train_dataloader
+                )  
+            else:
+                network, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+                    network, optimizer, train_dataloader, lr_scheduler
+                )
             training_model = network
 
         if args.gradient_checkpointing:
diff --git a/train_textual_inversion.py b/train_textual_inversion.py
index 10fce2677..4adbc642f 100644
--- a/train_textual_inversion.py
+++ b/train_textual_inversion.py
@@ -416,14 +416,24 @@ def train(self, args):
 
         # acceleratorがなんかよろしくやってくれるらしい
         if len(text_encoders) == 1:
-            text_encoder_or_list, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
-                text_encoder_or_list, optimizer, train_dataloader, lr_scheduler
-            )
+            if args.optimizer_type.lower().endswith("scheduleFree"):
+                text_encoder_or_list, optimizer, train_dataloader = accelerator.preparet(
+                    text_encoder_or_list, optimizer, train_dataloader
+                )   
+            else:
+                text_encoder_or_list, optimizer, train_dataloader, lr_scheduler = accelerator.preparet(
+                    text_encoder_or_list, optimizer, train_dataloader, lr_scheduler
+                )
 
         elif len(text_encoders) == 2:
-            text_encoder1, text_encoder2, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
-                text_encoders[0], text_encoders[1], optimizer, train_dataloader, lr_scheduler
-            )
+            if args.optimizer_type.lower().endswith("scheduleFree"):
+                text_encoder1, text_encoder2, optimizer, train_dataloader = accelerator.prepare(
+                    text_encoders[0], text_encoders[1], optimizer, train_dataloader
+                )  
+            else:
+                text_encoder1, text_encoder2, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+                    text_encoders[0], text_encoders[1], optimizer, train_dataloader, lr_scheduler
+                )
 
             text_encoder_or_list = text_encoders = [text_encoder1, text_encoder2]
 
diff --git a/train_textual_inversion_XTI.py b/train_textual_inversion_XTI.py
index 032a36e21..701fd1467 100644
--- a/train_textual_inversion_XTI.py
+++ b/train_textual_inversion_XTI.py
@@ -335,9 +335,14 @@ def train(args):
     lr_scheduler = train_util.get_scheduler_fix(args, optimizer, accelerator.num_processes)
 
     # acceleratorがなんかよろしくやってくれるらしい
-    text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
-        text_encoder, optimizer, train_dataloader, lr_scheduler
-    )
+    if args.optimizer_type.lower().endswith("scheduleFree"):
+        text_encoder, optimizer, train_dataloader = accelerator.prepare(
+            text_encoder, optimizer, train_dataloader
+        )   
+    else:
+        text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            text_encoder, optimizer, train_dataloader, lr_scheduler
+        )
 
     index_no_updates = torch.arange(len(tokenizer)) < token_ids_XTI[0]
     # logger.info(len(index_no_updates), torch.sum(index_no_updates))

From dfa30790a99754f2f20e4980e91a000cce51d8a4 Mon Sep 17 00:00:00 2001
From: Kohya S <ykumeykume@gmail.com>
Date: Sun, 7 Apr 2024 20:34:26 +0900
Subject: [PATCH 09/12] update readme

---
 README-ja.md | 44 ++++++++++++++++++++++++++++++++++++++++++
 README.md    | 54 ++++++++++------------------------------------------
 2 files changed, 54 insertions(+), 44 deletions(-)

diff --git a/README-ja.md b/README-ja.md
index f70f882d7..4ae6b2334 100644
--- a/README-ja.md
+++ b/README-ja.md
@@ -111,3 +111,47 @@ Conv2d 3x3への拡大は [cloneofsimo氏](https://github.com/cloneofsimo/lora)
 
 [BLIP](https://github.com/salesforce/BLIP): BSD-3-Clause
 
+## その他の情報
+
+### LoRAの名称について
+
+`train_network.py` がサポートするLoRAについて、混乱を避けるため名前を付けました。ドキュメントは更新済みです。以下は当リポジトリ内の独自の名称です。
+
+1. __LoRA-LierLa__ : (LoRA for __Li__ n __e__ a __r__  __La__ yers、リエラと読みます)
+
+    Linear 層およびカーネルサイズ 1x1 の Conv2d 層に適用されるLoRA
+
+2. __LoRA-C3Lier__ : (LoRA for __C__ onvolutional layers with __3__ x3 Kernel and  __Li__ n __e__ a __r__ layers、セリアと読みます)
+
+    1.に加え、カーネルサイズ 3x3 の Conv2d 層に適用されるLoRA
+
+デフォルトではLoRA-LierLaが使われます。LoRA-C3Lierを使う場合は `--network_args` に `conv_dim` を指定してください。
+
+<!-- 
+LoRA-LierLa は[Web UI向け拡張](https://github.com/kohya-ss/sd-webui-additional-networks)、またはAUTOMATIC1111氏のWeb UIのLoRA機能で使用することができます。
+
+LoRA-C3Lierを使いWeb UIで生成するには拡張を使用してください。
+-->
+
+### 学習中のサンプル画像生成
+
+プロンプトファイルは例えば以下のようになります。
+
+```
+# prompt 1
+masterpiece, best quality, (1girl), in white shirts, upper body, looking at viewer, simple background --n low quality, worst quality, bad anatomy,bad composition, poor, low effort --w 768 --h 768 --d 1 --l 7.5 --s 28
+
+# prompt 2
+masterpiece, best quality, 1boy, in business suit, standing at street, looking back --n (low quality, worst quality), bad anatomy,bad composition, poor, low effort --w 576 --h 832 --d 2 --l 5.5 --s 40
+```
+
+  `#` で始まる行はコメントになります。`--n` のように「ハイフン二個＋英小文字」の形でオプションを指定できます。以下が使用できます。
+
+  * `--n` Negative prompt up to the next option.
+  * `--w` Specifies the width of the generated image.
+  * `--h` Specifies the height of the generated image.
+  * `--d` Specifies the seed of the generated image.
+  * `--l` Specifies the CFG scale of the generated image.
+  * `--s` Specifies the number of steps in the generation.
+
+  `( )` や `[ ]` などの重みづけも動作します。
diff --git a/README.md b/README.md
index 5282c1f69..1ca699be5 100644
--- a/README.md
+++ b/README.md
@@ -137,15 +137,16 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser
 
 ## Change History
 
-### Mar XX, 2024 / 2024/3/XX: v0.8.6
+### Apr 7, 2024 / 2024-04-07: v0.8.6
 
 #### Highlights
 
 - The dependent libraries are updated. Please see [Upgrade](#upgrade) and update the libraries.
   - Especially `imagesize` is newly added, so if you cannot update the libraries immediately, please install with `pip install imagesize==1.4.1` separately.
   - `bitsandbytes==0.43.0`, `prodigyopt==1.0`, `lion-pytorch==0.0.6` are included in the requirements.txt.
+    - `bitsandbytes` no longer requires complex procedures as it now officially supports Windows.  
   - Also, the PyTorch version is updated to 2.1.2 (PyTorch does not need to be updated immediately). In the upgrade procedure, PyTorch is not updated, so please manually install or update torch, torchvision, xformers if necessary (see [Upgrade PyTorch](#upgrade-pytorch)).
-- When logging to wandb is enabled, the entire command line is exposed. Therefore, it is recommended to write the API key of wandb and the token of HuggingFace in the configuration file (`.toml`). Thanks to bghira for raising the issue.
+- When logging to wandb is enabled, the entire command line is exposed. Therefore, it is recommended to write the wandb API key and the HuggingFace token in the configuration file (`.toml`). Thanks to bghira for raising the issue.
   - A warning is displayed at the start of training if such information is included in the command line.
   - Also, if there is an absolute path, the path may be exposed, so it is recommended to specify a relative path or write it in the configuration file. In such cases, an INFO log is displayed.
   - See [#1123](https://github.com/kohya-ss/sd-scripts/pull/1123) and PR [#1240](https://github.com/kohya-ss/sd-scripts/pull/1240) for details.
@@ -223,6 +224,7 @@ See PR [#1228](https://github.com/kohya-ss/sd-scripts/pull/1228/) for details.
 - 依存ライブラリが更新されました。[アップグレード](./README-ja.md#アップグレード) を参照しライブラリを更新してください。
   - 特に `imagesize` が新しく追加されていますので、すぐにライブラリの更新ができない場合は `pip install imagesize==1.4.1` で個別にインストールしてください。
   - `bitsandbytes==0.43.0`、`prodigyopt==1.0`、`lion-pytorch==0.0.6` が requirements.txt に含まれるようになりました。
+    - `bitsandbytes` が公式に Windows をサポートしたため複雑な手順が不要になりました。
   - また PyTorch のバージョンを 2.1.2 に更新しました。PyTorch はすぐに更新する必要はありません。更新時は、アップグレードの手順では PyTorch が更新されませんので、torch、torchvision、xformers を手動でインストールしてください。
 - wandb へのログ出力が有効の場合、コマンドライン全体が公開されます。そのため、コマンドラインに wandb の API キーや HuggingFace のトークンなどが含まれる場合、設定ファイル（`.toml`）への記載をお勧めします。問題提起していただいた bghira 氏に感謝します。
   - このような場合には学習開始時に警告が表示されます。
@@ -315,27 +317,14 @@ The LoRA supported by `train_network.py` has been named to avoid confusion. The
 
     In addition to 1., LoRA for Conv2d layers with 3x3 kernel 
     
-LoRA-LierLa is the default LoRA type for `train_network.py` (without `conv_dim` network arg). LoRA-LierLa can be used with [our extension](https://github.com/kohya-ss/sd-webui-additional-networks) for AUTOMATIC1111's Web UI, or with the built-in LoRA feature of the Web UI.
-
-To use LoRA-C3Lier with Web UI, please use our extension.
-
-### LoRAの名称について
-
-`train_network.py` がサポートするLoRAについて、混乱を避けるため名前を付けました。ドキュメントは更新済みです。以下は当リポジトリ内の独自の名称です。
-
-1. __LoRA-LierLa__ : (LoRA for __Li__ n __e__ a __r__  __La__ yers、リエラと読みます)
-
-    Linear 層およびカーネルサイズ 1x1 の Conv2d 層に適用されるLoRA
-
-2. __LoRA-C3Lier__ : (LoRA for __C__ olutional layers with __3__ x3 Kernel and  __Li__ n __e__ a __r__ layers、セリアと読みます)
-
-    1.に加え、カーネルサイズ 3x3 の Conv2d 層に適用されるLoRA
-
-LoRA-LierLa は[Web UI向け拡張](https://github.com/kohya-ss/sd-webui-additional-networks)、またはAUTOMATIC1111氏のWeb UIのLoRA機能で使用することができます。
+LoRA-LierLa is the default LoRA type for `train_network.py` (without `conv_dim` network arg). 
+<!-- 
+LoRA-LierLa can be used with [our extension](https://github.com/kohya-ss/sd-webui-additional-networks) for AUTOMATIC1111's Web UI, or with the built-in LoRA feature of the Web UI.
 
-LoRA-C3Lierを使いWeb UIで生成するには拡張を使用してください。
+To use LoRA-C3Lier with Web UI, please use our extension. 
+-->
 
-## Sample image generation during training
+### Sample image generation during training
   A prompt file might look like this, for example
 
 ```
@@ -356,26 +345,3 @@ masterpiece, best quality, 1boy, in business suit, standing at street, looking b
   * `--s` Specifies the number of steps in the generation.
 
   The prompt weighting such as `( )` and `[ ]` are working.
-
-## サンプル画像生成
-プロンプトファイルは例えば以下のようになります。
-
-```
-# prompt 1
-masterpiece, best quality, (1girl), in white shirts, upper body, looking at viewer, simple background --n low quality, worst quality, bad anatomy,bad composition, poor, low effort --w 768 --h 768 --d 1 --l 7.5 --s 28
-
-# prompt 2
-masterpiece, best quality, 1boy, in business suit, standing at street, looking back --n (low quality, worst quality), bad anatomy,bad composition, poor, low effort --w 576 --h 832 --d 2 --l 5.5 --s 40
-```
-
-  `#` で始まる行はコメントになります。`--n` のように「ハイフン二個＋英小文字」の形でオプションを指定できます。以下が使用可能できます。
-
-  * `--n` Negative prompt up to the next option.
-  * `--w` Specifies the width of the generated image.
-  * `--h` Specifies the height of the generated image.
-  * `--d` Specifies the seed of the generated image.
-  * `--l` Specifies the CFG scale of the generated image.
-  * `--s` Specifies the number of steps in the generation.
-
-  `( )` や `[ ]` などの重みづけも動作します。
-

From c973b29da422911893f62b8acbe7c455f0c8c78b Mon Sep 17 00:00:00 2001
From: Kohya S <ykumeykume@gmail.com>
Date: Sun, 7 Apr 2024 20:51:52 +0900
Subject: [PATCH 10/12] update readme

---
 README.md | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 1ca699be5..83fa81e03 100644
--- a/README.md
+++ b/README.md
@@ -159,7 +159,7 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser
 - Fixed a bug that U-Net and Text Encoders are included in the state in `train_network.py` and `sdxl_train_network.py`. The saving and loading of the state are faster, the file size is smaller, and the memory usage when loading is reduced.
 - DeepSpeed is supported. PR [#1101](https://github.com/kohya-ss/sd-scripts/pull/1101)  and [#1139](https://github.com/kohya-ss/sd-scripts/pull/1139) Thanks to BootsofLagrangian! See PR [#1101](https://github.com/kohya-ss/sd-scripts/pull/1101) for details.
 - The masked loss is supported in each training script. PR [#1207](https://github.com/kohya-ss/sd-scripts/pull/1207) See [Masked loss](#about-masked-loss) for details.
-- Scheduled Huber Loss has been introduced to each training scripts. PR [#1228](https://github.com/kohya-ss/sd-scripts/pull/1228/) Thanks to kabachuha for the PR and cheald, drhead, and others for the discussion! See [Scheduled Huber Loss](#about-scheduled-huber-loss) for details.
+- Scheduled Huber Loss has been introduced to each training script. PR [#1228](https://github.com/kohya-ss/sd-scripts/pull/1228/) Thanks to kabachuha for the PR and cheald, drhead, and others for the discussion! See the PR and [Scheduled Huber Loss](#about-scheduled-huber-loss) for details.
 - The options `--noise_offset_random_strength` and `--ip_noise_gamma_random_strength` are added to each training script. These options can be used to vary the noise offset and ip noise gamma in the range of 0 to the specified value. PR [#1177](https://github.com/kohya-ss/sd-scripts/pull/1177) Thanks to KohakuBlueleaf!
 - The options `--save_state_on_train_end` are added to each training script. PR [#1168](https://github.com/kohya-ss/sd-scripts/pull/1168) Thanks to gesen2egee!
 - The options `--sample_every_n_epochs` and `--sample_every_n_steps` in each training script now display a warning and ignore them when a number less than or equal to `0` is specified. Thanks to S-Del for raising the issue.
@@ -219,6 +219,8 @@ See PR [#1228](https://github.com/kohya-ss/sd-scripts/pull/1228/) for details.
 - `huber_schedule`: Specify the scheduling method. Choose `exponential`, `constant`, or `SNR`. The default is `exponential`.
 - `huber_c`: Specify the Huber's parameter. The default is `0.1`.
 
+Please read [Releases](https://github.com/kohya-ss/sd-scripts/releases) for recent updates.
+
 #### 主要な変更点
 
 - 依存ライブラリが更新されました。[アップグレード](./README-ja.md#アップグレード) を参照しライブラリを更新してください。
@@ -239,7 +241,7 @@ See PR [#1228](https://github.com/kohya-ss/sd-scripts/pull/1228/) for details.
 - `train_network.py` および `sdxl_train_network.py` で、state に U-Net および Text Encoder が含まれる不具合を修正しました。state の保存、読み込みが高速化され、ファイルサイズも小さくなり、また読み込み時のメモリ使用量も削減されます。
 - DeepSpeed がサポートされました。PR [#1101](https://github.com/kohya-ss/sd-scripts/pull/1101) 、[#1139](https://github.com/kohya-ss/sd-scripts/pull/1139) BootsofLagrangian 氏に感謝します。詳細は PR [#1101](https://github.com/kohya-ss/sd-scripts/pull/1101) をご覧ください。
 - 各学習スクリプトでマスクロスをサポートしました。PR [#1207](https://github.com/kohya-ss/sd-scripts/pull/1207) 詳細は [マスクロスについて](#マスクロスについて) をご覧ください。
-- 各学習スクリプトに Scheduled Huber Loss を追加しました。PR [#1228](https://github.com/kohya-ss/sd-scripts/pull/1228/) ご提案いただいた kabachuha 氏、および議論を深めてくださった cheald 氏、drhead 氏を始めとする諸氏に感謝します。詳細は [Scheduled Huber Loss について](#scheduled-huber-loss-について) をご覧ください。
+- 各学習スクリプトに Scheduled Huber Loss を追加しました。PR [#1228](https://github.com/kohya-ss/sd-scripts/pull/1228/) ご提案いただいた kabachuha 氏、および議論を深めてくださった cheald 氏、drhead 氏を始めとする諸氏に感謝します。詳細は当該 PR および [Scheduled Huber Loss について](#scheduled-huber-loss-について) をご覧ください。
 - 各学習スクリプトに、noise offset、ip noise gammaを、それぞれ 0~指定した値の範囲で変動させるオプション `--noise_offset_random_strength` および `--ip_noise_gamma_random_strength` が追加されました。 PR [#1177](https://github.com/kohya-ss/sd-scripts/pull/1177) KohakuBlueleaf 氏に感謝します。
 - 各学習スクリプトに、学習終了時に state を保存する `--save_state_on_train_end` オプションが追加されました。 PR [#1168](https://github.com/kohya-ss/sd-scripts/pull/1168) gesen2egee 氏に感謝します。
 - 各学習スクリプトで `--sample_every_n_epochs` および `--sample_every_n_steps` オプションに `0` 以下の数値を指定した時、警告を表示するとともにそれらを無視するよう変更しました。問題提起していただいた S-Del 氏に感謝します。
@@ -280,9 +282,6 @@ See PR [#1228](https://github.com/kohya-ss/sd-scripts/pull/1228/) for details.
 
 マスクの指定には ControlNet データセットを使用します。マスク画像は RGB 画像である必要があります。R チャンネルのピクセル値 255 がロス計算対象、0 がロス計算対象外になります。0-255 の値は、0-1 の範囲に変換されます（つまりピクセル値 128 の部分はロスの重みが半分になります）。データセットの詳細は [LLLite ドキュメント](./docs/train_lllite_README-ja.md#データセットの準備) をご覧ください。
 
-Please read [Releases](https://github.com/kohya-ss/sd-scripts/releases) for recent updates.
-最近の更新情報は [Release](https://github.com/kohya-ss/sd-scripts/releases) をご覧ください。
-
 #### Scheduled Huber Loss について
 
 各学習スクリプトに、学習データ中の異常値や外れ値（data corruption）への耐性を高めるための手法、Scheduled Huber Lossが導入されました。
@@ -303,6 +302,8 @@ Please read [Releases](https://github.com/kohya-ss/sd-scripts/releases) for rece
 
 PR 内でいくつかの比較が共有されています。この機能を試す場合、最初は `--loss_type smooth_l1 --huber_schedule snr --huber_c 0.1` などで試してみるとよいかもしれません。
 
+最近の更新情報は [Release](https://github.com/kohya-ss/sd-scripts/releases) をご覧ください。
+
 ## Additional Information
 
 ### Naming of LoRA

From bfb352bc433326a77aca3124248331eb60c49e8c Mon Sep 17 00:00:00 2001
From: Kohya S <ykumeykume@gmail.com>
Date: Sun, 7 Apr 2024 21:07:52 +0900
Subject: [PATCH 11/12] change huber_schedule from `exponential` to `snr`

---
 README.md             | 10 ++++++++--
 library/train_util.py |  6 +++---
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 83fa81e03..a7047a360 100644
--- a/README.md
+++ b/README.md
@@ -137,6 +137,12 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser
 
 ## Change History
 
+### Apr 7, 2024 / 2024-04-07: v0.8.7
+
+- The default value of `huber_schedule` in Scheduled Huber Loss is changed from `exponential` to `snr`, which is expected to give better results.
+
+- Scheduled Huber Loss の `huber_schedule` のデフォルト値を `exponential` から、より良い結果が期待できる `snr` に変更しました。
+
 ### Apr 7, 2024 / 2024-04-07: v0.8.6
 
 #### Highlights
@@ -216,7 +222,7 @@ The newly added arguments loss_type, huber_schedule, and huber_c allow for the s
 See PR [#1228](https://github.com/kohya-ss/sd-scripts/pull/1228/) for details.
 
 - `loss_type`: Specify the loss function type. Choose `huber` for Huber loss, `smooth_l1` for smooth L1 loss, and `l2` for MSE loss. The default is `l2`, which is the same as before.
-- `huber_schedule`: Specify the scheduling method. Choose `exponential`, `constant`, or `SNR`. The default is `exponential`.
+- `huber_schedule`: Specify the scheduling method. Choose `exponential`, `constant`, or `snr`. The default is `snr`.
 - `huber_c`: Specify the Huber's parameter. The default is `0.1`.
 
 Please read [Releases](https://github.com/kohya-ss/sd-scripts/releases) for recent updates.
@@ -297,7 +303,7 @@ Please read [Releases](https://github.com/kohya-ss/sd-scripts/releases) for rece
 詳細は PR [#1228](https://github.com/kohya-ss/sd-scripts/pull/1228/) をご覧ください。
 
 - `loss_type` : 損失関数の種類を指定します。`huber` で Huber損失、`smooth_l1` で smooth L1 損失、`l2` で MSE 損失を選択します。デフォルトは `l2` で、従来と同様です。
-- `huber_schedule` : スケジューリング方法を指定します。`exponential` で指数関数的、`constant` で一定、`snr` で信号対雑音比に基づくスケジューリングを選択します。デフォルトは `exponential` です。
+- `huber_schedule` : スケジューリング方法を指定します。`exponential` で指数関数的、`constant` で一定、`snr` で信号対雑音比に基づくスケジューリングを選択します。デフォルトは `snr` です。
 - `huber_c` : Huber損失のパラメータを指定します。デフォルトは `0.1` です。
 
 PR 内でいくつかの比較が共有されています。この機能を試す場合、最初は `--loss_type smooth_l1 --huber_schedule snr --huber_c 0.1` などで試してみるとよいかもしれません。
diff --git a/library/train_util.py b/library/train_util.py
index 9ce129bd9..15c23f3cc 100644
--- a/library/train_util.py
+++ b/library/train_util.py
@@ -3246,10 +3246,10 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth:
     parser.add_argument(
         "--huber_schedule",
         type=str,
-        default="exponential",
+        default="snr",
         choices=["constant", "exponential", "snr"],
-        help="The scheduling method for Huber loss (constant, exponential, or SNR-based). Only used when loss_type is 'huber' or 'smooth_l1'. default is exponential"
-        + " / Huber損失のスケジューリング方法（constant、exponential、またはSNRベース）。loss_typeが'huber'または'smooth_l1'の場合に有効、デフォルトはexponential",
+        help="The scheduling method for Huber loss (constant, exponential, or SNR-based). Only used when loss_type is 'huber' or 'smooth_l1'. default is snr"
+        + " / Huber損失のスケジューリング方法（constant、exponential、またはSNRベース）。loss_typeが'huber'または'smooth_l1'の場合に有効、デフォルトは snr",
     )
     parser.add_argument(
         "--huber_c",

From 19f533d3766505dac03893ede260ba848ee207f6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9D=92=E9=BE=8D=E8=81=96=E8=80=85=40bdsqlsz?=
 <qinglongshengzhe@gmail.com>
Date: Tue, 9 Apr 2024 17:19:19 +0800
Subject: [PATCH 12/12] fix typo

---
 library/train_util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/library/train_util.py b/library/train_util.py
index 29bb4d487..664a23a2b 100644
--- a/library/train_util.py
+++ b/library/train_util.py
@@ -3087,7 +3087,7 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth:
     )
     parser.add_argument("--seed", type=int, default=None, help="random seed for training / 学習時の乱数のseed")
     parser.add_argument(
-        "--gradient_checkpointing", action="store_true", help="enable gradient checkpointing / grandient checkpointingを有効にする"
+        "--gradient_checkpointing", action="store_true", help="enable gradient checkpointing / gradient checkpointingを有効にする"
     )
     parser.add_argument(
         "--gradient_accumulation_steps",