From 65e6594ac553447dece948f4730c1de4a64d24c9 Mon Sep 17 00:00:00 2001 From: shigabeev Date: Fri, 23 Aug 2024 08:37:42 +0000 Subject: [PATCH 1/3] added english readme files --- docs/config_README-en copy.md | 268 +++++++++ docs/fine_tune_README-en.md | 81 +++ docs/gen_img_README-en.md | 457 +++++++++++++++ docs/model_conversion-README-en.md | 58 ++ docs/train_README-en.md | 873 +++++++++++++++++++++++++++++ docs/train_db_README-en.md | 148 +++++ docs/train_network_README-en.md | 444 +++++++++++++++ docs/train_ti_README-en.md | 106 ++++ 8 files changed, 2435 insertions(+) create mode 100644 docs/config_README-en copy.md create mode 100644 docs/fine_tune_README-en.md create mode 100644 docs/gen_img_README-en.md create mode 100644 docs/model_conversion-README-en.md create mode 100644 docs/train_README-en.md create mode 100644 docs/train_db_README-en.md create mode 100644 docs/train_network_README-en.md create mode 100644 docs/train_ti_README-en.md diff --git a/docs/config_README-en copy.md b/docs/config_README-en copy.md new file mode 100644 index 000000000..20471a923 --- /dev/null +++ b/docs/config_README-en copy.md @@ -0,0 +1,268 @@ +## This is a ChatGPT-4 English adaptation of the original document by kohya-ss ([config_README-ja.md](https://github.com/kohya-ss/sd-scripts/blob/main/docs/config_README-ja.md)) + +This documentation explains the configuration file that can be passed using the `--dataset_config` option. + +## Overview + +By providing a configuration file, users can fine-tune various settings. + +* Multiple datasets can be configured. + * For example, you can set the `resolution` for each dataset and train them together. + * In learning methods that support both DreamBooth and fine-tuning techniques, it is possible to mix datasets using DreamBooth and fine-tuning techniques. +* Settings can be changed for each subset. + * A dataset is a collection of subsets, which are created by dividing the dataset into separate image directories or metadata. + * Options such as `keep_tokens` and `flip_aug` can be set for each subset. On the other hand, options such as `resolution` and `batch_size` can be set for each dataset, and the values are shared among subsets belonging to the same dataset. More details are provided later. + +The configuration file can be written in JSON or TOML format. Considering ease of writing, we recommend using [TOML](https://toml.io/ja/v1.0.0-rc.2). The following explanations assume the use of TOML. + +Here is an example of a configuration file written in TOML: + +```toml +[general] +shuffle_caption = true +caption_extension = '.txt' +keep_tokens = 1 + +# This is a DreamBooth-style dataset +[[datasets]] +resolution = 512 +batch_size = 4 +keep_tokens = 2 + + [[datasets.subsets]] + image_dir = 'C:\hoge' + class_tokens = 'hoge girl' + # This subset has keep_tokens = 2 (using the value of the parent datasets) + + [[datasets.subsets]] + image_dir = 'C:\fuga' + class_tokens = 'fuga boy' + keep_tokens = 3 + + [[datasets.subsets]] + is_reg = true + image_dir = 'C:\reg' + class_tokens = 'human' + keep_tokens = 1 + +# This is a fine-tuning-style dataset +[[datasets]] +resolution = [768, 768] +batch_size = 2 + + [[datasets.subsets]] + image_dir = 'C:\piyo' + metadata_file = 'C:\piyo\piyo_md.json' + # This subset has keep_tokens = 1 (using the general value) +``` + +In this example, three directories are trained as DreamBooth-style datasets at 512x512 (batch size 4), and one directory is trained as a fine-tuning-style dataset at 768x768 (batch size 2). 
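Once written, save the file (for example as `dataset_config.toml`) and pass it to the training script with the `--dataset_config` option. The command below is only a minimal sketch: `train_network.py` and the other options shown are placeholders, so substitute the script and settings you actually use (see the individual training READMEs).

```
accelerate launch --num_cpu_threads_per_process 1 train_network.py 
    --dataset_config=dataset_config.toml 
    --pretrained_model_name_or_path=<.ckpt or .safetensors or Diffusers version model directory> 
    --output_dir=<output folder for the trained model> 
    --output_name=<output file name for the trained model>
```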
+ +## Dataset and Subset Configuration Settings + +The settings for datasets and subsets are divided into several sections. + +* `[general]` + * This section specifies options that apply to all datasets or all subsets. + * If an option with the same name exists in the dataset-specific and subset-specific settings, the dataset and subset-specific settings take precedence. +* `[[datasets]]` + * `datasets` is the registration section for dataset settings. This section specifies options that apply individually to each dataset. + * If subset-specific settings exist, the subset-specific settings take precedence. +* `[[datasets.subsets]]` + * `datasets.subsets` is the registration section for subset settings. This section specifies options that apply individually to each subset. + +The following is a conceptual diagram of the correspondence between the image directories and registration sections in the previous example: + +``` +C:\ +├─ hoge -> [[datasets.subsets]] No.1 ┐ ┐ +├─ fuga -> [[datasets.subsets]] No.2 |-> [[datasets]] No.1 |-> [general] +├─ reg -> [[datasets.subsets]] No.3 ┘ | +└─ piyo -> [[datasets.subsets]] No.4 --> [[datasets]] No.2 ┘ +``` + +Each image directory corresponds to one `[[datasets.subsets]]`. One or more `[[datasets.subsets]]` are combined to form a `[[datasets]]`. The `[general]` section includes all `[[datasets]]` and `[[datasets.subsets]]`. + +Different options can be specified for each registration section, but if an option with the same name is specified, the value in the lower registration section takes precedence. It may be easier to understand by checking how the `keep_tokens` option is handled in the previous example. + +In addition, the available options vary depending on the supported techniques of the learning method. + +* DreamBooth-specific options +* Fine-tuning-specific options +* Options available when the caption dropout technique can be used + +In learning methods that support both DreamBooth and fine-tuning techniques, both can be used together. +When using both, note that whether a dataset is a DreamBooth-style or fine-tuning-style is determined on a dataset-by-dataset basis, so it is not possible to mix DreamBooth-style subsets and fine-tuning-style subsets within the same dataset. +In other words, if you want to use both of these techniques, you need to set the subsets with different techniques to belong to different datasets. + +Regarding the program's behavior, it is determined that a subset is a fine-tuning-style subset if the `metadata_file` option, which will be explained later, exists. +Therefore, for subsets belonging to the same dataset, there is no problem as long as they are either "all have the `metadata_file` option" or "all do not have the `metadata_file` option". + +The following describes the available options. For options with the same name as command-line arguments, the basic explanation is omitted. Please refer to the other READMEs. + +### Common Options for All Learning Methods + +These options can be specified regardless of the learning method. + +#### Dataset-specific Options + +These options are related to dataset settings and cannot be written in `datasets.subsets`. 
+ +| Option Name | Example Setting | `[general]` | `[[datasets]]` | +| ---- | ---- | ---- | ---- | +| `batch_size` | `1` | o | o | +| `bucket_no_upscale` | `true` | o | o | +| `bucket_reso_steps` | `64` | o | o | +| `enable_bucket` | `true` | o | o | +| `max_bucket_reso` | `1024` | o | o | +| `min_bucket_reso` | `128` | o | o | +| `resolution` | `256`, `[512, 512]` | o | o | + +* `batch_size` + * Equivalent to the command line argument `--train_batch_size`. + +These settings are fixed for each dataset. In other words, subsets belonging to the same dataset will share these settings. For example, if you want to prepare datasets with different resolutions, you can define them as separate datasets, as shown in the example above, and set different resolutions. + +#### Subset-specific options + +These are options related to the configuration of subsets. + +| Option name | Example | `[general]` | `[[datasets]]` | `[[dataset.subsets]]` | +| ---- | ---- | ---- | ---- | ---- | +| `color_aug` | `false` | o | o | o | +| `face_crop_aug_range` | `[1.0, 3.0]` | o | o | o | +| `flip_aug` | `true` | o | o | o | +| `keep_tokens` | `2` | o | o | o | +| `num_repeats` | `10` | o | o | o | +| `random_crop` | `false` | o | o | o | +| `shuffle_caption` | `true` | o | o | o | + +* `num_repeats` + * Specifies the number of times the images in the subset are repeated. It corresponds to `--dataset_repeats` in fine-tuning, but `num_repeats` can be specified for any learning method. + +### Options exclusive to DreamBooth method + +The options for the DreamBooth method exist only for subset-specific options. + +#### Subset-specific options + +These are options related to the configuration of subsets in the DreamBooth method. + +| Option name | Example | `[general]` | `[[datasets]]` | `[[dataset.subsets]]` | +| ---- | ---- | ---- | ---- | ---- | +| `image_dir` | `‘C:\hoge’` | - | - | o (required) | +| `caption_extension` | `".txt"` | o | o | o | +| `class_tokens` | `“sks girl”` | - | - | o | +| `is_reg` | `false` | - | - | o | + +Please note that the `image_dir` must specify a path where the image files are placed directly. In the traditional DreamBooth method, images needed to be placed in subdirectories, but this is not compatible with that specification. Also, even if you name the folder like `5_cat`, the repetition count and class name of the images will not be reflected. If you want to set these individually, you need to explicitly specify `num_repeats` and `class_tokens`. + +* `image_dir` + * Specifies the path of the image directory. This is a required option. + * Images must be placed directly in the directory. +* `class_tokens` + * Sets the class tokens. + * It will be used during training only if there is no corresponding caption file for the image. The determination of whether to use it is made on a per-image basis. If you do not specify `class_tokens` and no caption file is found, an error will occur. +* `is_reg` + * Specifies whether the images in the subset are for normalization or not. If not specified, it is treated as `false`, meaning the images are not for normalization. + +### Options exclusive to fine-tuning method + +The options for the fine-tuning method exist only for subset-specific options. + +#### Subset-specific options + +These are options related to the configuration of subsets in the fine-tuning method. 
+ +| Option name | Example | `[general]` | `[[datasets]]` | `[[dataset.subsets]]` | +| ---- | ---- | ---- | ---- | ---- | +| `image_dir` | `‘C:\hoge’` | - | - | o | +| `metadata_file` | `'C:\piyo\piyo_md.json'` | - | - | o (required) | + +* `image_dir` + * Specifies the path of the image directory. Unlike the DreamBooth method, this is not a required specification, but it is recommended to set it. + * The situation where you do not need to specify it is when you have executed with `--full_path` when creating the metadata file. + * Images must be placed directly in the directory. +* `metadata_file` + * Specifies the path of the metadata file used in the subset. This is a required option. + * Equivalent to the command line argument `--in_json`. + * Since the specification requires you to specify the metadata file for each subset, it is better to avoid creating metadata that spans directories in a single metadata file. It is strongly recommended to prepare a metadata file for each image directory and register them as separate subsets. + +### Options available when the caption dropout method can be used + +Caption dropout method options exist only for subset-specific options. Regardless of whether it is the DreamBooth method or the fine-tuning method, you can specify it if the learning method supports caption dropout. + +#### Subset-specific options + +These are options related to the configuration of subsets when the caption dropout method can be used. + +| Option name | `[general]` | `[[datasets]]` | `[[dataset.subsets]]` | +| ---- | ---- | ---- | ---- | +| `caption_dropout_every_n_epochs` | o | o | o | +| `caption_dropout_rate` | o | o | o | +| `caption_tag_dropout_rate` | o | o | o | + +## Behavior when duplicate subsets exist + +For DreamBooth method datasets, subsets with the same `image_dir` are considered duplicates. For fine-tuning method datasets, subsets with the same `metadata_file` are considered duplicates. If duplicate subsets exist within the dataset, the second and subsequent ones will be ignored. + +On the other hand, if they belong to different datasets, they are not considered duplicates. For example, if you put subsets with the same `image_dir` in different datasets, they are not considered duplicates. This is useful when you want to train the same images at different resolutions. + +```toml +# If they exist in separate datasets, they are not considered duplicates and both will be used for training + +[[datasets]] +resolution = 512 + + [[datasets.subsets]] + image_dir = 'C:\hoge' + +[[datasets]] +resolution = 768 + + [[datasets.subsets]] + image_dir = 'C:\hoge' +``` + +## Usage with command line arguments + +Some options in the configuration file have overlapping roles with command line arguments. + +The following command line argument options are ignored when passing a configuration file: + +* `--train_data_dir` +* `--reg_data_dir` +* `--in_json` + +For the following command line argument options, if they are specified simultaneously in the command line argument and configuration file, the value in the configuration file takes precedence. Unless otherwise stated, the options have the same name. 
+ +| Command line argument option | Preferred configuration file option | +| ---------------------------------- | ---------------------------------- | +| `--bucket_no_upscale` | | +| `--bucket_reso_steps` | | +| `--caption_dropout_every_n_epochs` | | +| `--caption_dropout_rate` | | +| `--caption_extension` | | +| `--caption_tag_dropout_rate` | | +| `--color_aug` | | +| `--dataset_repeats` | `num_repeats` | +| `--enable_bucket` | | +| `--face_crop_aug_range` | | +| `--flip_aug` | | +| `--keep_tokens` | | +| `--min_bucket_reso` | | +| `--random_crop` | | +| `--resolution` | | +| `--shuffle_caption` | | +| `--train_batch_size` | `batch_size` | + +## Error Handling Guide + +Currently, we are using an external library to check whether the configuration file is written correctly or not. However, the system is not well-maintained, and the error messages can be difficult to understand. We plan to address this issue in the future. + +As a temporary solution, we provide a list of frequently encountered errors and their solutions. If you encounter an error even though you believe everything is correct, or if you cannot understand the error message, please contact us as it may be a bug. + +* `voluptuous.error.MultipleInvalid: required key not provided @ ...`: This error indicates that a required option has not been specified. You might have forgotten to include the option or may have entered the option name incorrectly. + * The location of the error is indicated by the `...` part of the message. For example, if you see the error `voluptuous.error.MultipleInvalid: required key not provided @ data['datasets'][0]['subsets'][0]['image_dir']`, it means that the `image_dir` setting is missing from the 0th `subsets` configuration within the 0th `datasets`. +* `voluptuous.error.MultipleInvalid: expected int for dictionary value @ ...`: This error indicates that the value format is incorrect. The format of the value is likely incorrect. The `int` part will vary depending on the target option. The "Example Settings" for the options listed in this README may be helpful. +* `voluptuous.error.MultipleInvalid: extra keys not allowed @ ...`: This error occurs when there are unsupported option names present. You may have entered the option name incorrectly or accidentally included it. diff --git a/docs/fine_tune_README-en.md b/docs/fine_tune_README-en.md new file mode 100644 index 000000000..e19017574 --- /dev/null +++ b/docs/fine_tune_README-en.md @@ -0,0 +1,81 @@ +## This is a ChatGPT-4 English adaptation of the original document by kohya-ss ([fine_tune_README_ja.md](https://github.com/kohya-ss/sd-scripts/blob/main/docs/fine_tune_README_ja.md)) + +This is a fine-tuning method proposed by NovelAI, which is compatible with their learning approach, automatic captioning, tagging, and a Windows + VRAM 12GB (for SD v1.x) environment. Fine-tuning in this context refers to training the model using images and captions (LoRA, Textual Inversion, and Hypernetworks are not included). + +Please also refer to the [common document on training](./train_README-en.md). + +# Overview + +We will perform fine-tuning of the U-Net in Stable Diffusion using Diffusers. We have implemented the following improvements proposed by NovelAI's article (regarding Aspect Ratio Bucketing, we referred to NovelAI's code, but the final code is entirely original): + +* Use the second-to-last layer's output of the CLIP (Text Encoder) instead of the last layer's output. +* Train with non-square resolutions (Aspect Ratio Bucketing). 
+* Extend the token length from 75 to 225. +* Perform automatic captioning with BLIP and automatic tagging with DeepDanbooru or WD14Tagger. +* Support Hypernetwork training. +* Compatible with Stable Diffusion v2.0 (base and 768/v). +* Reduce memory usage and speed up training by pre-fetching VAE outputs and saving them to disk. + +By default, training for the Text Encoder is not performed. In general, it seems that only the U-Net is trained when fine-tuning the entire model (this is also the case with NovelAI). The Text Encoder can be included in the training with an optional setting. + +# Additional Features + +## Changing the CLIP output + +The CLIP (Text Encoder) is responsible for converting text features to reflect the prompt in the image. Stable Diffusion uses the output of the last layer of CLIP, but this can be changed to use the output of the second-to-last layer instead. According to NovelAI, this results in a more accurate reflection of the prompt. It is also possible to use the original last layer output. + +*Note: In Stable Diffusion 2.0, the second-to-last layer is used by default. Do not specify the clip_skip option. + +## Training with non-square resolutions + +In addition to the 512x512 resolution used in Stable Diffusion, we also train with resolutions such as 256x1024 and 384x640. This reduces the amount of cropping and is expected to improve the learning of the relationship between prompts and images. The training resolution is adjusted in 64-pixel increments vertically and horizontally, within the range that does not exceed the area (i.e., memory usage) of the specified resolution. + +In machine learning, it is common to unify input sizes across all inputs, but there is no specific constraint. In practice, it is sufficient to have a consistent image size within a single batch. NovelAI's bucketing seems to refer to pre-classifying training data by aspect ratio and learning resolution. Then, by creating batches with images from each bucket, the image size within the batch is unified. + +## Extension of token length from 75 to 225 + +In Stable Diffusion, the maximum token length is 75 (77 tokens, including the start and end tokens), but this is extended to 225 tokens. However, since the maximum length accepted by CLIP is 75 tokens, in the case of 225 tokens, the input is simply divided into three parts, and CLIP is called for each part, then the results are concatenated. + +*Note: It is not entirely clear whether this implementation is desirable. It seems to be working for now. There is no reference implementation for 2.0, so it has been implemented independently. + +*Note: In Automatic1111's Web UI, it seems that some additional processing, such as splitting based on commas, is performed. In my case, the implementation is simpler and only involves basic splitting. + +# Training Procedure + +Please refer to the README of this repository and set up your environment beforehand. + +## Data Preparation + +Please refer to the [instructions for preparing training data](./train_README-en.md). Fine-tuning only supports the metadata-based fine-tuning method. + +## Executing the Training + +For example, run the following command. Modify each line according to your needs. 
+ +``` +accelerate launch --num_cpu_threads_per_process 1 fine_tune.py + --pretrained_model_name_or_path=<.ckpt or .safetensord or Diffusers version model directory> + --output_dir= + --output_name= + --dataset_config=<.toml file created during data preparation> + --save_model_as=safetensors + --learning_rate=5e-6 --max_train_steps=10000 + --use_8bit_adam --xformers --gradient_checkpointing + --mixed_precision=fp16 +``` + +It is generally a good idea to specify `1` for `num_cpu_threads_per_process`. + +Specify the base model for additional training in `pretrained_model_name_or_path`. You can specify the Stable Diffusion checkpoint file (.ckpt or .safetensors), the Diffusers local disk model directory, or the Diffusers model ID (e.g., "stabilityai/stable-diffusion-2"). + +Specify the folder to save the trained model in `output_dir`. Specify the model file name without the extension in `output_name`. Save the model in safetensors format by specifying `save_model_as`. + +Specify the `.toml` file in `dataset_config`. To begin with, set the batch size specified in the file to `1` to minimize memory consumption. + +Set the number of training steps to `10000` with `max_train_steps`. In this example, a learning_rate of `5e-6` is specified. + +Enable mixed precision with `mixed_precision="fp16"` to reduce memory usage (for RTX 30 series and later, you can also specify `bf16`. Match the settings with the accelerate configuration made during environment setup). Also, enable `gradient_checkpointing`. + +For more information on commonly used options, refer to the separate documentation. + +In summary, this fine-tuning approach is compatible with NovelAI's learning method, automatic captioning, tagging, and a Windows + VRAM 12GB environment. It includes several improvements, such as using the second-to-last layer of the CLIP, training with non-square resolutions, and extending the token length to 225. The training procedure involves preparing the data, executing the training, and using additional features such as training the Text Encoder. diff --git a/docs/gen_img_README-en.md b/docs/gen_img_README-en.md new file mode 100644 index 000000000..ec329c33c --- /dev/null +++ b/docs/gen_img_README-en.md @@ -0,0 +1,457 @@ +## This is a ChatGPT-4 English adaptation of the original document by kohya-ss ([gen_img_README-ja.md](https://github.com/kohya-ss/sd-scripts/blob/main/docs/gen_img_README-ja.md)) + +This is a Diffusers-based inference (image generation) script compatible with SD 1.x and 2.x models, as well as LoRA and ControlNet (only confirmed to work with v1.0) trained in this repository. It is used from the command line. + +# Overview + +* Diffusers (v0.10.2) based inference (image generation) script. +* Supports SD 1.x and 2.x (base/v-parameterization) models. +* Supports txt2img, img2img, and inpainting. +* Supports interactive mode, prompt loading from files, and continuous generation. +* Allows specifying the number of images generated per prompt line. +* Allows specifying the total number of iterations. +* Supports not only `fp16` but also `bf16`. +* Supports xformers for fast generation. + * Although xformers enable memory-efficient generation, the implementation is not as optimized as Automatic 1111's Web UI, and it uses approximately 6GB of VRAM for generating 512x512 images. +* Extension to 225 tokens for prompts. Supports negative prompts and weighting. +* Supports various Diffusers samplers (fewer than Web UI). +* Supports text encoder clip skip (using output from the nth-last layer). 
* Supports separate loading of VAE.
* Supports CLIP Guided Stable Diffusion, VGG16 Guided Stable Diffusion, Highres. fix, and upscale.
  * Highres. fix is an independent implementation without fully confirming the Web UI's implementation, so the output may differ.
* Supports LoRA. Supports application rate specification, simultaneous use of multiple LoRAs, and weight merging.
  * It is not possible to specify different application rates for Text Encoder and U-Net.
* Supports Attention Couple.
* Supports ControlNet v1.0.
* Does not allow switching models during execution, but can be handled by creating a batch file.
* Adds various features that were personally desired.

Not all tests are performed when adding new features, so some previous features may be affected and may not work. Please let us know if you have any issues.

# Basic Usage

## Generating Images in Interactive Mode

Please enter the following:

```batchfile
python gen_img_diffusers.py --ckpt <model name> --outdir <image output destination> --xformers --fp16 --interactive
```

Specify the model (Stable Diffusion checkpoint file or Diffusers model folder) with the `--ckpt` option and the image output destination folder with the `--outdir` option.

Specify the use of xformers with the `--xformers` option (remove it if not using xformers). Perform inference in fp16 (half precision) with the `--fp16` option. Inference in bf16 (bfloat16) can also be performed on RTX 30 series GPUs with the `--bf16` option.

The `--interactive` option specifies interactive mode.

Please add the `--v2` option if using Stable Diffusion 2.0 (or additional training models from it). If using a model with v-parameterization (e.g., `768-v-ema.ckpt` and additional training models from it), also add the `--v_parameterization` option.

If the presence or absence of `--v2` is incorrect, an error will occur when loading the model. If the presence or absence of `--v_parameterization` is incorrect, a brown image will be displayed.

Please enter the prompt when "Type prompt:" is displayed.

![image](https://user-images.githubusercontent.com/52813779/235343115-f3b8ac82-456d-4aab-9724-0cc73c4534aa.png)

*If an error occurs and no image is displayed, headless (no display functionality) OpenCV may be installed. Install regular OpenCV with `pip install opencv-python` or stop displaying images with the `--no_preview` option.

Select the image window and press any key to close the window and enter the next prompt. Press Ctrl+Z followed by Enter in the prompt to close the script.

## Generating Multiple Images with a Single Prompt

Enter the following (on one line):

```batchfile
python gen_img_diffusers.py --ckpt <model name> --outdir <image output destination> 
    --xformers --fp16 --images_per_prompt <number of images> --prompt "<prompt>"
```

Specify the number of images generated per prompt with the `--images_per_prompt` option. Specify the prompt with the `--prompt` option. Enclose the prompt in double quotes if it contains spaces.

You can specify the batch size with the `--batch_size` option (described later).

## Batch Generation by Loading Prompts from a File

Enter the following:

```batchfile
python gen_img_diffusers.py --ckpt <model name> --outdir <image output destination> 
    --xformers --fp16 --from_file <prompt file>
```

Specify the file containing the prompts with the `--from_file` option. Write one prompt per line. You can specify the number of images generated per prompt line with the `--images_per_prompt` option.
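For reference, the prompt file is plain text with one prompt per line. The file contents below are only an illustration; prompt options such as `--n` (negative prompt, described in the next section) and `--d`, `--w`, `--h` (described under "Prompt Options" below) can be appended to each line.

```
beautiful scenery, mountain lake at sunrise --n lowres, worst quality
1girl, school uniform, standing under cherry blossoms --n lowres, bad anatomy --d 42
portrait of an old fisherman, dramatic lighting --w 640 --h 832
```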
## Using Negative Prompts and Weighting

By writing `--n` as a prompt option (prompt options are specified inside the prompt in the form `--x`), everything that follows it becomes the negative prompt.

Also, like AUTOMATIC1111's Web UI, you can use `()` or `[]`, or `(xxx:1.3)` for weighting (implementation copied from Diffusers' [Long Prompt Weighting Stable Diffusion](https://github.com/huggingface/diffusers/blob/main/examples/community/README.md#long-prompt-weighting-stable-diffusion)).

You can specify the same options when entering prompts from the command line or loading prompts from a file.

![image](https://user-images.githubusercontent.com/52813779/235343128-e79cd768-ec59-46f5-8395-fce9bdc46208.png)

# Main Options

Specify these options from the command line.

## Model Specification

- `--ckpt <model name>`: Specifies the model name. The `--ckpt` option is required. You can specify the Stable Diffusion checkpoint file, Diffusers model folder, or Hugging Face model ID.

- `--v2`: Specifies the use of a Stable Diffusion 2.x series model. No specification is needed for 1.x series models.

- `--v_parameterization`: Specifies the use of a model with v-parameterization (`768-v-ema.ckpt` and additional training models from it, Waifu Diffusion v1.5, etc.).

  If the presence or absence of `--v2` is incorrect, an error will occur when loading the model. If the presence or absence of `--v_parameterization` is incorrect, a brown image will be displayed.

- `--vae`: Specifies the VAE to use. If not specified, the VAE within the model will be used.

## Image Generation and Output

- `--interactive`: Operates in interactive mode. Generates images when a prompt is entered.

- `--prompt <prompt>`: Specifies the prompt. Enclose the prompt in double quotes if it contains spaces.

- `--from_file <prompt file>`: Specifies the file containing the prompts. Write one prompt per line. You can specify the number of images generated per prompt line with the `--images_per_prompt` option.

- `--W <int>`: Specifies the image width. The default is `512`.

- `--H <int>`: Specifies the image height. The default is `512`.

- `--steps <int>`: Specifies the sampling step count. The default is `50`.

- `--scale <float>`: Specifies the unconditional guidance scale. The default is `7.5`.

- `--sampler <sampler name>`: Specifies the sampler. The default is `ddim`. You can specify ddim, pndm, dpmsolver, dpmsolver++, lms, euler, euler_a, which are provided by Diffusers (you can also specify the last three as k_lms, k_euler, k_euler_a).

- `--outdir <folder>`: Specifies the destination folder for the images.

- `--images_per_prompt <int>`: Specifies the number of images generated per prompt. The default is `1`.

- `--clip_skip <int>`: Specifies which layer from the back of CLIP to use. The default is the last layer.

- `--max_embeddings_multiples <int>`: Specifies the multiplier applied to the default CLIP input/output length (75). If not specified, the length stays at 75. For example, specifying 3 sets the input/output length to 225.

- `--negative_scale`: Specifies the guidance scale for unconditioning. This implementation is based on [the article by gcem156](https://note.com/gcem156/n/ne9a53e4a6f43).

## Adjusting Memory Usage and Generation Speed

- `--batch_size <int>`: Specifies the batch size. The default is `1`. A larger batch size consumes more memory but increases generation speed.

- `--vae_batch_size <float>`: Specifies the VAE batch size. The default is the same as the batch size. VAE consumes more memory, and there may be cases where memory is insufficient after denoising (when the step is 100%). In such cases, reduce the VAE batch size.

- `--xformers`: Specify this option when using xformers.

- `--fp16`: Performs inference in fp16 (half precision). If neither `fp16` nor `bf16` is specified, inference is performed in fp32 (single precision).

- `--bf16`: Performs inference in bf16 (bfloat16). This option can only be specified on RTX 30 series GPUs. The `--bf16` option will result in an error on non-RTX 30 series GPUs. The likelihood of inference results becoming NaN (resulting in a completely black image) seems lower with `bf16` than with `fp16`.

## Using Additional Networks (such as LoRA)

- `--network_module`: Specifies the additional network to use. For LoRA, specify `--network_module networks.lora`. To use multiple LoRAs, specify them like `--network_module networks.lora networks.lora networks.lora`.

- `--network_weights`: Specifies the weight file of the additional network to use. Specify it like `--network_weights model.safetensors`. To use multiple LoRAs, specify them like `--network_weights model1.safetensors model2.safetensors model3.safetensors`. The number of arguments should be the same as the number specified in `--network_module`.

- `--network_mul`: Specifies how many times to multiply the weights of the additional network to use. The default is `1`. Specify it like `--network_mul 0.8`. To use multiple LoRAs, specify them like `--network_mul 0.4 0.5 0.7`. The number of arguments should be the same as the number specified in `--network_module`.

- `--network_merge`: Merges the weights of the additional network to use with the weights specified in `--network_mul` beforehand. This option cannot be used in combination with `--network_pre_calc`. The prompt option `--am` and Regional LoRA will no longer be available, but generation will be accelerated to the same extent as when not using LoRA.

- `--network_pre_calc`: Calculates the weights of the additional network to use beforehand for each generation. The prompt option `--am` can be used. Generation will be accelerated to the same extent as when not using LoRA, but additional time will be required to calculate the weights before generation, and memory usage will also increase slightly. This option is invalidated when using Regional LoRA.

# Examples of Main Option Specifications

The following example generates 64 images in a single prompt with a batch size of 4.

```batchfile
python gen_img_diffusers.py --ckpt model.ckpt --outdir outputs 
    --xformers --fp16 --W 512 --H 704 --scale 12.5 --sampler k_euler_a 
    --steps 32 --batch_size 4 --images_per_prompt 64 
    --prompt "beautiful flowers --n monochrome"
```

The following example generates 10 images for each prompt written in a file with a batch size of 4.

```batchfile
python gen_img_diffusers.py --ckpt model.ckpt --outdir outputs 
    --xformers --fp16 --W 512 --H 704 --scale 12.5 --sampler k_euler_a 
    --steps 32 --batch_size 4 --images_per_prompt 10 
    --from_file prompts.txt
```

Here's an example using Textual Inversion (explained later) and LoRA.
+ +```batchfile +python gen_img_diffusers.py --ckpt model.safetensors + --scale 8 --steps 48 --outdir txt2img --xformers + --W 512 --H 768 --fp16 --sampler k_euler_a + --textual_inversion_embeddings goodembed.safetensors negprompt.pt + --network_module networks.lora networks.lora + --network_weights model1.safetensors model2.safetensors + --network_mul 0.4 0.8 + --clip_skip 2 --max_embeddings_multiples 1 + --batch_size 8 --images_per_prompt 1 --interactive +``` + +# Prompt Options + +Various options can be specified in the prompt using the format `--n` (two hyphens followed by an alphabet letter). This is valid whether specifying the prompt interactively, via the command line, or from a file. + +Please insert a space before and after the prompt option `--n`. + +- `--n`: Specifies a negative prompt. + +- `--w`: Specifies the image width. This will overwrite the specification from the command line. + +- `--h`: Specifies the image height. This will overwrite the specification from the command line. + +- `--s`: Specifies the number of steps. This will overwrite the specification from the command line. + +- `--d`: Specifies the random seed for this image. If `--images_per_prompt` is specified, please specify multiple seeds separated by commas, like `--d 1,2,3,4`. + * Due to various reasons, the generated image may differ from the Web UI even with the same random seed. + +- `--l`: Specifies the guidance scale. This will overwrite the specification from the command line. + +- `--t`: Specifies the strength for img2img (explained later). This will overwrite the specification from the command line. + +- `--nl`: Specifies the guidance scale for negative prompts (explained later). This will overwrite the specification from the command line. + +- `--am`: Specifies the weights of the additional network. This will overwrite the specification from the command line. To use multiple additional networks, specify them separated by commas, like `--am 0.8,0.5,0.3`. + +* When these options are specified, the batch may be executed with a smaller size than the batch size (as items with different values cannot be generated in bulk). (You don't need to worry too much about this, but when generating from a file with prompts, arranging prompts with the same values will improve efficiency.) + +Example: +``` +(masterpiece, best quality), 1girl, in shirt and plated skirt, standing at street under cherry blossoms, upper body, [from below], kind smile, looking at another, [goodembed] --n realistic, real life, (negprompt), (lowres:1.1), (worst quality:1.2), (low quality:1.1), bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, normal quality, jpeg artifacts, signature, watermark, username, blurry --w 960 --h 640 --s 28 --d 1 +``` + +![image](https://user-images.githubusercontent.com/52813779/235343446-25654172-fff4-4aaf-977a-20d262b51676.png) + +# Image-to-Image Conversion + +## Options + +- `--image_path`: Specifies the image to be used for img2img conversion. Specify it like `--image_path template.png`. If a folder is specified, the images in the folder will be used sequentially. + +- `--strength`: Specifies the strength of img2img. Specify it like `--strength 0.8`. The default value is `0.8`. + +- `--sequential_file_name`: Specifies whether to use a sequential file name. If specified, the generated file names will be in the format `im_000001.png` and so on. + +- `--use_original_file_name`: If specified, the generated file name will be the same as the original file name. 
+ +## Example of Execution from the Command Line + +```batchfile +python gen_img_diffusers.py --ckpt trinart_characters_it4_v1_vae_merged.ckpt + --outdir outputs --xformers --fp16 --scale 12.5 --sampler k_euler --steps 32 + --image_path template.png --strength 0.8 + --prompt "1girl, cowboy shot, brown hair, pony tail, brown eyes, + sailor school uniform, outdoors + --n lowres, bad anatomy, bad hands, error, missing fingers, cropped, + worst quality, low quality, normal quality, jpeg artifacts, (blurry), + hair ornament, glasses" + --batch_size 8 --images_per_prompt 32 +``` + +When specifying a folder for the `--image_path` option, the images in the folder will be loaded sequentially. The number of generated images is not based on the number of images, but on the number of prompts, so please match the number of images for img2img and the number of prompts using the `--images_per_prompt` option. + +Files are sorted and loaded by file name. Note that the sort order is in string order (not `1.jpg→2.jpg→10.jpg`, but `1.jpg→10.jpg→2.jpg`), so please adjust accordingly by zero-padding the file names (e.g., `01.jpg→02.jpg→10.jpg`). + +## Using img2img for Upscaling + +When specifying the generated image size with the `--W` and `--H` command-line options during img2img, the original image will be resized to that size before performing img2img. + +Additionally, if the source image for img2img is an image generated by this script, and the prompt is omitted, the prompt will be automatically retrieved from the metadata of the source image, allowing only the 2nd stage of Highres. fix to be performed. + +## Inpainting during img2img + +You can perform inpainting by specifying an image and a mask image (note that this does not support inpainting models and simply performs img2img only on the masked area). + +The options are as follows: + +- `--mask_image`: Specifies the mask image. Like `--img_path`, if you specify a folder, the images in that folder will be used sequentially. + +The mask image is a grayscale image with the white parts being inpainted. It is recommended to use a gradient at the boundary to make the inpainted area look smoother. + +![image](https://user-images.githubusercontent.com/52813779/235343795-9eaa6d98-02ff-4f32-b089-80d1fc482453.png) + +# Other Functions + +## Textual Inversion + +Specify the embeddings to be used with the `--textual_inversion_embeddings` option (multiple can be specified). By using the file name without the extension in the prompt, those embeddings will be used (same usage as in the Web UI). This can also be used within negative prompts. + +As models, you can use the Textual Inversion models trained in this repository and the Textual Inversion models trained in the Web UI (image embedding is not supported). + +## Extended Textual Inversion + +Please specify the `--XTI_embeddings` option instead of the `--textual_inversion_embeddings` option. The usage is the same as with `--textual_inversion_embeddings`. + +## Highres. Fix + +This is a similar feature to the one in AUTOMATIC1111's Web UI (it may differ in various aspects due to being a custom implementation). A smaller image is generated initially, and then img2img is performed on that image to prevent inconsistencies in the entire image while generating a higher resolution image. + +The number of steps for the 2nd stage is calculated from the values of the `--steps` and `--strength` options (`steps*strength`). + +img2img cannot be combined with this feature. 
+ +The following options are available: + +- `--highres_fix_scale`: Enables Highres. fix and specifies the size of the image generated in the 1st stage as a ratio. If the final output is 1024x1024 and a 512x512 image is generated initially, specify it as `--highres_fix_scale 0.5`. Please note that this is the reciprocal of the value used in the Web UI. + +- `--highres_fix_steps`: Specifies the number of steps for the 1st stage image. The default is `28`. + +- `--highres_fix_save_1st`: Specifies whether to save the 1st stage image. + +- `--highres_fix_latents_upscaling`: If specified, the 1st stage image is upscaled at the latent level during the 2nd stage image generation (only bilinear is supported). If not specified, LANCZOS4 upscaling is used. + +- `--highres_fix_upscaler`: Specifies an arbitrary upscaler for the 2nd stage. Currently, only `--highres_fix_upscaler tools.latent_upscaler` is supported. + +- `--highres_fix_upscaler_args`: Specifies the arguments to be passed to the upscaler specified with `--highres_fix_upscaler`. + In the case of `tools.latent_upscaler`, specify the weight file like `--highres_fix_upscaler_args "weights=D:\Work\SD\Models\others\etc\upscaler-v1-e100-220.safetensors"`. + +Example of a command line: + +```batchfile +python gen_img_diffusers.py --ckpt trinart_characters_it4_v1_vae_merged.ckpt + --n_iter 1 --scale 7.5 --W 1024 --H 1024 --batch_size 1 --outdir ../txt2img + --steps 48 --sampler ddim --fp16 + --xformers + --images_per_prompt 1 --interactive + --highres_fix_scale 0.5 --highres_fix_steps 28 --strength 0.5 +``` + +## ControlNet + +Currently, only ControlNet 1.0 has been tested. Only Canny preprocessing is supported. + +The following options are available: + +- `--control_net_models`: Specifies the ControlNet model file. + Multiple models can be specified, and they will be switched between steps (different implementation from the ControlNet extension in the Web UI). Both diff and regular models are supported. + +- `--guide_image_path`: Specifies the guide image to be used for ControlNet. Like `--img_path`, if you specify a folder, the images in that folder will be used sequentially. If you are using a model other than Canny, please preprocess the image beforehand. + +- `--control_net_preps`: Specifies the preprocessing for ControlNet. Multiple can be specified, like `--control_net_models`. Currently, only Canny is supported. If preprocessing is not used for the target model, specify `none`. + For Canny, specify the thresholds 1 and 2 separated by '_' like `--control_net_preps canny_63_191`. + +- `--control_net_weights`: Specifies the weights when applying ControlNet (normal with `1.0`, half the influence with `0.5`). Multiple can be specified, like `--control_net_models`. + +- `--control_net_ratios`: Specifies the range of steps where ControlNet is applied. If `0.5` is set, ControlNet will be applied up to half of the total number of steps. Multiple can be specified, like `--control_net_models`. + +Example of a command line: + +```batchfile +python gen_img_diffusers.py --ckpt model_ckpt --scale 8 --steps 48 --outdir txt2img --xformers + --W 512 --H 768 --bf16 --sampler k_euler_a + --control_net_models diff_control_sd15_canny.safetensors --control_net_weights 1.0 + --guide_image_path guide.png --control_net_ratios 1.0 --interactive +``` + +## Attention Couple + Regional LoRA + +This feature allows you to divide the prompt into several parts and specify which area of the image to apply each prompt to. 
There are no individual options, but `mask_path` and the prompt are used for specification.

First, use `AND` in the prompt to define multiple parts. The first three parts can be assigned to specific regions, while the remaining parts will be applied to the entire image. Negative prompts will be applied to the entire image.

In the following example, three parts are defined with AND:

`shs 2girls, looking at viewer, smile AND bsb 2girls, looking back AND 2girls --n bad quality, worst quality`

Next, prepare a mask image. The mask image is a color image, with each RGB channel corresponding to a separated part of the prompt by AND. If the value of a channel is all 0, it will be applied to the entire image.

In the example above, the R channel corresponds to "shs 2girls, looking at viewer, smile", the G channel to "bsb 2girls, looking back", and the B channel to "2girls". By using a mask image like the one below, since there is no specification in the B channel, "2girls" will be applied to the entire image.

![image](https://user-images.githubusercontent.com/52813779/235343061-b4dc9392-3dae-4831-8347-1e9ae5054251.png)

The mask image can be specified using `--mask_path`. Currently, only one image is supported. The specified image size will be automatically resized and applied.

It is also possible to combine this with ControlNet (recommended for fine-grained position control).

When specifying LoRA, multiple LoRA specified by `--network_weights` will correspond to each part of the AND. As a current constraint, the number of LoRA must be the same as the number of AND parts.

## CLIP Guided Stable Diffusion

This is a modified version of the custom pipeline from the Community Examples of Diffusers [here](https://github.com/huggingface/diffusers/blob/main/examples/community/README.md#clip-guided-stable-diffusion).

In addition to the regular prompt-based generation, it retrieves the text features from a larger CLIP model and controls the generated image to make its features closer to the text features. This increases VRAM usage considerably (may be difficult for 512x512 with 8GB VRAM) and takes longer to generate.

Only DDIM, PNDM, and LMS samplers can be selected.

You can specify the degree to which the CLIP features are reflected with the `--clip_guidance_scale` option. Starting from around 100 and adjusting up or down seems to work well.

By default, the first 75 tokens of the prompt (excluding special characters for weighting) are passed to CLIP. You can use the `--c` option in the prompt to specify a separate text for CLIP instead of the regular prompt (for example, CLIP may not recognize DreamBooth's identifiers or model-specific words like "1girl").

Here's an example command line:

```batchfile
python gen_img_diffusers.py --ckpt v1-5-pruned-emaonly.ckpt --n_iter 1 
    --scale 2.5 --W 512 --H 512 --batch_size 1 --outdir ../txt2img --steps 36 
    --sampler ddim --fp16 --opt_channels_last --xformers --images_per_prompt 1 
    --interactive --clip_guidance_scale 100
```

## CLIP Image Guided Stable Diffusion

This feature allows you to control the generation by passing another image to CLIP instead of text, making the generated image's features closer to the guide image.
Specify the application amount with the `--clip_image_guidance_scale` option and the guide image (file or folder) with the `--guide_image_path` option. + +Here's an example command line: + +```batchfile +python gen_img_diffusers.py --ckpt trinart_characters_it4_v1_vae_merged.ckpt + --n_iter 1 --scale 7.5 --W 512 --H 512 --batch_size 1 --outdir ../txt2img + --steps 80 --sampler ddim --fp16 --opt_channels_last --xformers + --images_per_prompt 1 --interactive --clip_image_guidance_scale 100 + --guide_image_path YUKA160113420I9A4104_TP_V.jpg +``` + +### VGG16 Guided Stable Diffusion + +This feature allows you to generate images that are closer to the specified image. In addition to the regular prompt-based generation, it retrieves the features from VGG16 and controls the generated image to make it closer to the specified guide image. This is recommended for use with img2img (as the generated image may be blurry with a regular generation). It is an original feature that utilizes the mechanism of CLIP Guided Stable Diffusion and the idea is borrowed from style transfer using VGG. + +Only DDIM, PNDM, and LMS samplers can be selected. + +You can specify the degree to which the VGG16 features are reflected with the `--vgg16_guidance_scale` option. Starting from around 100 and adjusting up or down seems to work well. Specify the guide image (file or folder) with the `--guide_image_path` option. + +To convert multiple images in bulk with img2img and use the original image as the guide image, specify the same value for `--guide_image_path` and `--image_path`. + +Here's an example command line: + +```batchfile +python gen_img_diffusers.py --ckpt wd-v1-3-full-pruned-half.ckpt + --n_iter 1 --scale 5.5 --steps 60 --outdir ../txt2img + --xformers --sampler ddim --fp16 --W 512 --H 704 + --batch_size 1 --images_per_prompt 1 + --prompt "picturesque, 1girl, solo, anime face, skirt, beautiful face + --n lowres, bad anatomy, bad hands, error, missing fingers, + cropped, worst quality, low quality, normal quality, + jpeg artifacts, blurry, 3d, bad face, monochrome --d 1" + --strength 0.8 --image_path ..\src_image + --vgg16_guidance_scale 100 --guide_image_path ..\src_image +``` + +You can specify the VGG16 layer number to be used for feature extraction with `--vgg16_guidance_layer`. The default is 20 for the ReLU of conv4-2. It is said that the higher layers represent the style, while the lower layers represent the content. + +![image](https://user-images.githubusercontent.com/52813779/235343813-3c1f0d7a-4fb3-4274-98e4-b92d76b551df.png) + +# Other Options + +- `--no_preview`: Do not display the preview image in interactive mode. Specify this if OpenCV is not installed or if you want to check the output file directly. + +- `--n_iter`: Specify the number of times to repeat the generation. The default is 1. Specify this when you want to perform multiple generations while reading prompts from a file. + +- `--tokenizer_cache_dir`: Specify the cache directory for the tokenizer. (Work in progress) + +- `--seed`: Specify the random seed. This seed will be used for the image when generating one image and as the seed for generating the seeds for each image when generating multiple images. + +- `--iter_same_seed`: If the prompt does not specify a random seed, use the same seed for all iterations within `--n_iter`. Use this when you want to unify seeds for comparison between multiple prompts specified with `--from_file`. + +- `--diffusers_xformers`: Use Diffuser's xformers. 
+ +- `--opt_channels_last`: Place the tensor channels last during inference. This may result in faster performance in some cases. + +- `--network_show_meta`: Display the metadata for the additional network. + diff --git a/docs/model_conversion-README-en.md b/docs/model_conversion-README-en.md new file mode 100644 index 000000000..566cb19e5 --- /dev/null +++ b/docs/model_conversion-README-en.md @@ -0,0 +1,58 @@ +# Script for Mutual Conversion of Stable Diffusion Checkpoint and Diffusers Model (Compatible with SD2.0) + +## Introduction +While Diffusers officially provides conversion scripts for Stable Diffusion v1.x, as of 12/3, they have not yet released one compatible with 2.0. Since I have already implemented model conversion itself in my DreamBooth training script, I added code to it to create a script that performs mutual conversion. +I made it compatible with both v1.x and v2.0. + +## Downloading the Script +Please download from here. The .zip contains two scripts, so place them in the same folder. + +* [12/10 (v4) update](https://note.com/api/v2/attachments/download/3778af0a4024e079d7895109991e79d9): Added support for safetensors format of Diffusers model. Requires DiffUsers 0.10.2 (works with 0.10.0 or later, but 0.10.0 seems to have issues, so use 0.10.2). Update within the virtual environment with `pip install -U diffusers[torch]==0.10.2`. + +* [12/5 (v3) update](https://note.com/api/v2/attachments/download/82e3c2b9013c8b9b11cfc1b4e8a67c2d): Made a tentative fix as there were reports of errors when saving in safetensors format for some models. If you get an error with v2, please try v3 (other specifications are the same). I will delete v2 once I have thoroughly tested it. + +* [12/5 (v2) update](https://note.com/api/v2/attachments/download/b6b7158cd911ffd35f78948260d58e4a): Added support for safetensors format. Install safetensors with `pip install safetensors`. + +## Usage +### Converting from Diffusers to Stable Diffusion .ckpt/.safetensors +Specify the source model folder and destination .ckpt file as follows (actually written on one line). The v1/v2 version is automatically detected. + +``` +python convert_diffusers20_original_sd.py ..\models\diffusers_model + ..\models\sd.ckpt +``` + +Note that the Text Encoder of Diffusers v2 only has 22 layers, so if you convert it directly to Stable Diffusion, the weights will be insufficient. Therefore, the weights of the 22nd layer are copied as the 23rd layer to add weights. The weights of the 23rd layer are not used during image generation, so there is no impact. Similarly, dummy weights are added to text_projection and logit_scale (they do not seem to be used for image generation). + +If you use the .safetensors extension, it will automatically save in safetensors format. + +### Converting from Stable Diffusion .ckpt/.safetensors to Diffusers +Enter as follows. + +``` +python convert_diffusers20_original_sd.py ..\models\sd.ckpt + ..\models\diffusers_model + --v2 --reference_model stabilityai/stable-diffusion-2 +``` + +Specify the .ckpt file (or .safetensors file) and output destination folder as arguments (the read format is automatically determined by the extension). +Automatic model detection is not possible, so use the `--v1` or `--v2` option according to the model. + +Also, since .ckpt does not contain scheduler and tokenizer information, it is necessary to copy that information from some existing Diffusers model. Specify it with `--reference_model`. You can specify a HuggingFace id or a local model directory. 
+ +If you don't have a local model, for v2, specifying `"stabilityai/stable-diffusion-2"` or `"stabilityai/stable-diffusion-2-base"` should work well. +For v1.4/1.5, `"CompVis/stable-diffusion-v1-4"` should be fine (v1.4 and v1.5 seem to be the same). + +Use the `--use_safetensors` option to save the Diffusers model in safetensors format. + +## Other Options +### --fp16 / --bf16 / --float +You can specify the data format when saving the checkpoint. Only `--fp16` is also valid when loading the Diffusers model. + +### --epoch / --global_step +When saving the checkpoint, the specified values are written for epoch and global_step. If not specified, both will be 0. + +## Conclusion +I think some people may be having trouble with the Diffusers model due to the poor inference environment. I hope this helps even a little. + +(It is also possible to convert the data format from checkpoint to checkpoint, although untested.) diff --git a/docs/train_README-en.md b/docs/train_README-en.md new file mode 100644 index 000000000..180256be6 --- /dev/null +++ b/docs/train_README-en.md @@ -0,0 +1,873 @@ +## This is a ChatGPT-4 English adaptation of the original document by kohya-ss ([train_README-ja.md](https://github.com/kohya-ss/sd-scripts/blob/main/docs/train_README-ja.md)) + +There may be errors in the description due to document updates. + +# Common Learning Guide + +In this repository, we support the fine-tuning of models, DreamBooth, and the learning of LoRA and Textual Inversion (including [XTI:P+](https://github.com/kohya-ss/sd-scripts/pull/327)). This document explains the common preparation methods for learning data and options. + +# Overview + +Please refer to the README of this repository beforehand and set up the environment. + +The following will be explained: + +1. Preparation of learning data (using the new format with configuration files) +2. A brief explanation of the terms used in learning +3. Previous specification method (specifying from the command line without using a configuration file) +4. Sample image generation during learning +5. Commonly used options in each script +6. Metadata preparation for fine-tuning method: captioning, etc. + +You can learn by just executing 1 (see each script's documentation for learning). Refer to 2 and later as needed. + +# Preparing Learning Data + +Prepare image files for learning data in any folder (multiple folders are also acceptable). Supported formats are `.png`, `.jpg`, `.jpeg`, `.webp`, and `.bmp`. Preprocessing such as resizing is generally not necessary. + +However, it is recommended to avoid using images that are significantly smaller than the learning resolution (described later) or to enlarge them in advance using super-resolution AI. Also, it seems that errors may occur with images larger than extremely large images (around 3000x3000 pixels), so please reduce their size beforehand. + +When learning, you need to organize the image data to be learned by the model and specify it to the script. You can specify the learning data in several ways, depending on the number of learning data, learning target, whether captions (image descriptions) can be prepared, etc. The following methods are available (the names of each method are not general, but are unique to this repository). Regularization images will be discussed later. + +1. DreamBooth, class+identifier method (regularization images can be used) + + Learn by associating the learning target with a specific word (identifier). There is no need to prepare captions. 
For example, when learning a particular character, it is easy to use because there is no need to prepare captions. However, since all elements of the learning data, such as hairstyle, clothing, and background, are learned by associating them with the identifier, situations such as not being able to change clothes in the generated prompt may occur. + +2. DreamBooth, caption method (regularization images can be used) + + Prepare a text file with captions recorded for each image and learn. For example, when learning a specific character, by describing the details of the image in the caption (character A in white clothes, character A in red clothes, etc.), the character and other elements are separated, and the model can be expected to learn the character more precisely. + +3. Fine-tuning method (regularization images cannot be used) + + The captions are collected in a metadata file in advance. It supports functions such as managing tags and captions separately and speeding up learning by pre-caching latents (explained in separate documents). (Although it is called the fine-tuning method, it can also be used for non-fine-tuning.) + +The combination of what you want to learn and the available learning methods are as follows: + +| Learning target or method | Script | DB / class+identifier | DB / Caption | fine-tuning | +| ----- | ----- | ----- | ----- | ----- | +| Fine-tuning the model | `fine_tune.py`| x | x | o | +| DreamBooth the model | `train_db.py`| o | o | x | +| LoRA | `train_network.py`| o | o | o | +| Textual Inversion | `train_textual_inversion.py`| o | o | o | + +## Which one to choose + +For LoRA and Textual Inversion, if you want to learn without having to prepare a caption file, the DreamBooth class+identifier method is a good choice. If you can prepare, the DreamBooth caption method is recommended. If you have a large number of learning data and do not use regularization images, consider the fine-tuning method as well. + +For DreamBooth, the same applies, but the fine-tuning method cannot be used. In the case of fine-tuning, only the fine-tuning method is available. + +# How to specify each method + +Here, only typical patterns for each specification method are explained. For more detailed specification methods, please see [Dataset Configuration](./config_README-en.md). + +# DreamBooth, class+identifier method (regularization images can be used) + +In this method, each image is learned as if it was learned with the caption `class identifier` (e.g., `shs dog`). + +## Step 1. Determine the identifier and class + +Decide on the word identifier to associate with the learning target and the class to which the target belongs. + +(There are various names such as instance, but for now, we will follow the original paper.) + +Here is a brief explanation (please investigate further for details). + +The class is a general category of the learning target. For example, if you want to learn a specific dog breed, the class would be dog. For anime characters, depending on the model, it could be boy, girl, 1boy, or 1girl. + +The identifier is used to identify and learn the learning target. Any word is acceptable, but according to the original paper, "a rare word of 3 characters or less that is 1 token in the tokenizer" is preferred. + +By using the identifier and class, for example, learning the model with "shs dog" allows the learning target to be identified and learned from the class. + +When generating images, specifying "shs dog" will generate images of the learned dog breed. 
+ +(For reference, here are some identifiers I've been using recently: ``shs sts scs cpc coc cic msm usu ici lvl cic dii muk ori hru rik koo yos wny``. Ideally, it should not be included in the Danbooru Tag.) + +## Step 2. Decide whether to use regularization images, and if so, generate them + +Regularization images are images used to prevent the entire class from being pulled by the learning target (language drift). If you do not use regularization images, for example, if you learn a specific character with `shs 1girl`, even if you generate a simple `1girl` prompt, it will resemble that character. This is because `1girl` is included in the learning caption. + +By learning the target images and regularization images together, the class remains as a class, and the learning target is generated only when the identifier is included in the prompt. + +If you only need a specific character to come out in LoRA or DreamBooth, you don't need to use regularization images. + +Textual Inversion does not need to be used (because nothing is learned when the token string to be learned is not included in the caption). + +For regularization images, it is common to use images generated with only the class name for the learning target model (e.g., `1girl`). However, if the quality of the generated image is poor, you can also use images downloaded separately from the internet or modify the prompt. + +(Since the regularization images are also learned, their quality affects the model.) + +Generally, it is desirable to prepare about several hundred images (if the number is small, the class images will not be generalized, and their features will be learned). + +For generated images, usually, match the size of the generated images to the learning resolution (more precisely, the resolution of the bucket, described later). + +## Step 2: Write the configuration file + +Create a text file and set the extension to `.toml`. For example, you can write it as follows: + +```toml +[general] +enable_bucket = true # Whether to use Aspect Ratio Bucketing + +[[datasets]] +resolution = 512 # Training resolution +batch_size = 4 # Batch size + + [[datasets.subsets]] + image_dir = 'C:\hoge' # Specify the folder containing the training images + caption_extension = '.caption' # Caption file extension; change this if using .txt + num_repeats = 10 # Number of repetitions for training images + + # Write the following only when using regularization images. Remove it if not using them. + [[datasets.subsets]] + is_reg = true + image_dir = 'C:\reg' # Specify the folder containing the regularization images + class_tokens = 'girl' # Specify the class + num_repeats = 1 # Number of repetitions for regularization images; 1 is usually sufficient +``` + +You can start training by simply changing the following: + +1. Training resolution +2. Batch size +3. Folder specification +4. Caption file extension + + Any extension can be specified. +5. Number of repetitions + +## Step 3: Train + +Please refer to each document for training. + +# Fine-tuning method + +## Step 1: Prepare metadata + +Metadata is a management file that compiles captions and tags. It is in JSON format with the extension `.json`. The creation method is lengthy, so it is written at the end of this document. + +## Step 2: Write the configuration file + +Create a text file and set the extension to `.toml`. 
For example, you can write it as follows: + +```toml +[general] +shuffle_caption = true +keep_tokens = 1 + +[[datasets]] +resolution = 512 # Training resolution +batch_size = 4 # Batch size + + [[datasets.subsets]] + image_dir = 'C:\piyo' # Specify the folder containing the training images + metadata_file = 'C:\piyo\piyo_md.json' # Metadata file name +``` + +You can start training by simply changing the following. The parts not specifically mentioned are the same as DreamBooth and class+identifier methods: + +1. Training resolution +2. Batch size +3. Folder specification +4. Metadata file name + + Specify the metadata file created using the method described later. +---CUT--- +## Step 3: Learning + +Please conduct the learning process based on each document. + +# Brief explanations of terms used in learning + +These explanations are simplified, and I don't fully understand every detail, so please research further on your own. + +## Fine-tuning + +This term refers to the process of training and adjusting a model. Fine-tuning can have different meanings depending on the context. In a narrow sense, fine-tuning in the case of Stable Diffusion means training the model on images and captions. DreamBooth is a specific method of fine-tuning in this narrow sense. In a broader sense, fine-tuning includes methods such as LoRA, Textual Inversion, and Hypernetworks, and encompasses all aspects of model training. + +## Steps + +Roughly speaking, one step involves performing calculations on the training data once. One step is defined as "feeding the captions of the training data into the current model, comparing the resulting images with the images in the training data, and slightly modifying the model to make it closer to the training data." + +## Batch size + +The batch size is a value that specifies how many data points are calculated together in one step. By calculating data points together, the speed of the process is relatively improved. Generally, it is believed that a larger batch size leads to higher accuracy. + +The total number of data points used in training is the product of the batch size and the number of steps. Therefore, if you increase the batch size, you should reduce the number of steps. + +However, for example, "1,600 steps with a batch size of 1" and "400 steps with a batch size of 4" will not produce the same results. With the same learning rate, the latter is generally more prone to undertraining. You may need to adjust the learning rate slightly (e.g., `2e-6`) or increase the number of steps to 500 to compensate. + +Increasing the batch size consumes more GPU memory. If you run out of memory, you will encounter errors, and if you are on the edge of running out, the training speed will decrease. It is a good idea to adjust the batch size while monitoring the memory usage in the Task Manager or with the `nvidia-smi` command. + +Note that a "batch" is a unit of data. + +## Learning rate + +Roughly speaking, the learning rate indicates how much the model changes at each step. A larger value speeds up the learning process but may cause the model to change too much, resulting in a broken model or failure to reach an optimal state. A smaller value slows down the learning process and may still fail to reach an optimal state. + +The learning rate varies significantly for fine-tuning, DreamBooth, and LoRA, as well as for the training data, the model to be trained, and the batch size and number of steps. Start with a general value and adjust it based on the learning progress. 
+
+By default, the learning rate is constant throughout the training process. Depending on the scheduler specified, the learning rate can change over the course of training, which may affect the results.
+
+## Epoch
+
+An epoch is completed when the training data has been processed once (i.e., when the data has made one round). If you specify a number of repetitions, one epoch is completed after the repeated data has made one round.
+
+The number of steps in one epoch is generally `number of data points / batch size`. However, with Aspect Ratio Bucketing, the number of steps increases slightly (since data from different buckets cannot be combined into the same batch).
+
+## Aspect Ratio Bucketing
+
+Stable Diffusion v1 was trained at 512x512 resolution, but with this feature it also trains at other resolutions such as 256x1024 and 384x640. This reduces the amount of cropping and allows for more accurate learning of the relationship between captions and images.
+
+Furthermore, it is no longer necessary to standardize the aspect ratio of image data in advance, as any resolution can be used for training.
+
+You can enable or disable this feature in the settings. In the examples of configuration files provided so far, it is enabled (`true` is set).
+
+Training resolutions are created and adjusted in increments of 64 pixels (default, changeable) in width and height, within the range that does not exceed the area (i.e., memory usage) of the specified resolution.
+
+In machine learning, it is common to standardize input sizes, but there is no particular constraint on this, and it is actually sufficient to standardize batch sizes. NovelAI's bucketing seems to mean pre-classifying the training data by learning resolution according to the aspect ratio. By creating batches from images within each bucket, the batch image size is standardized.
+
+# Previous specification format (specifying via command line without using a configuration file)
+
+This method involves specifying options directly from the command line without using a `.toml` file. There are DreamBooth class+identifier methods, DreamBooth caption methods, and fine-tuning methods.
+
+## DreamBooth, class+identifier method
+
+Create a folder for the training images. __Within this folder__, create directories named as follows:
+
+```
+<number of repetitions>_<identifier> <class>
+```
+
+Don't forget the underscore (_) between them.
+
+For example, if you want to train with the "sls frog" prompt and repeat the data 20 times, the directory will be named "20_sls frog". The structure will look like this:
+
+![image](https://user-images.githubusercontent.com/52813779/210770636-1c851377-5936-4c15-90b7-8ac8ad6c2074.png)
+
+### Training with multiple classes and multiple targets (identifiers)
+
+To train with multiple classes or targets, simply prepare multiple folders in the format "<number of repetitions>_<identifier> <class>" within the training image folder, and multiple folders in the format "<number of repetitions>_<class>" within the regularization image folder.
+
+For example, if you want to train with both "sls frog" and "cpc rabbit" prompts, the structure will look like this:
+
+![image](https://user-images.githubusercontent.com/52813779/210777933-a22229db-b219-4cd8-83ca-e87320fc4192.png)
+
+If there is only one class with multiple targets, you only need one regularization image folder. For example, if there are characters A and B in the 1girl class, do the following:
+
+- train_girls
+  - 10_sls 1girl
+  - 10_cpc 1girl
+- reg_girls
+  - 1_1girl
+
+### Step 2: Preparing regularization images
+
+This step is for when you want to use regularization images.
+
+Create a folder for regularization images. __Within this folder__, create a directory named as follows:
+
+```
+<number of repetitions>_<class>
+```
+
+For example, if you want to train with the "frog" prompt and not repeat the data (only use it once), the structure will look like this:
+
+![image](https://user-images.githubusercontent.com/52813779/210770897-329758e5-3675-49f1-b345-c135f1725832.png)
+
+### Step 3: Executing the learning process
+
+Run the respective learning scripts. Use the `--train_data_dir` option to specify the parent folder of the training data (not the folder containing the images), and the `--reg_data_dir` option to specify the parent folder of the regularization images (not the folder containing the images).
+
+## DreamBooth, caption method
+
+By placing files with the same file name as the images and with the .caption extension (changeable with options) in the training image and regularization image folders, the script will read the captions from those files and use them for training.
+
+Please note that the folder names (identifier and class) will not be used for training these images.
+
+The caption file extension is .caption by default, but it can be changed using the `--caption_extension` option in the learning script. The `--shuffle_caption` option allows you to shuffle the comma-separated parts of the caption during training.
+
+---CUT---
+## Fine-tuning method
+
+The process of creating metadata is the same as when using a configuration file. Specify the metadata file with the `in_json` option.
+
+# Sample output during training
+
+You can check the progress of the training by generating images with the model during training. Specify the following options in the training script:
+
+- `--sample_every_n_steps` / `--sample_every_n_epochs`
+
+    Specify the number of steps or epochs for sample output. Samples will be output every specified number. If both are specified, the number of epochs will take precedence.
+
+- `--sample_prompts`
+
+    Specify the file containing the prompts for sample output.
+
+- `--sample_sampler`
+
+    Specify the sampler to be used for sample output. You can choose from `'ddim', 'pndm', 'heun', 'dpmsolver', 'dpmsolver++', 'dpmsingle', 'k_lms', 'k_euler', 'k_euler_a', 'k_dpm_2', 'k_dpm_2_a'`.
+
+To generate sample output, you need to prepare a text file with prompts written in advance. Write one prompt per line.
+
+For example, the following:
+
+```txt
+# prompt 1
+masterpiece, best quality, 1girl, in white shirts, upper body, looking at viewer, simple background --n low quality, worst quality, bad anatomy, bad composition, poor, low effort --w 768 --h 768 --d 1 --l 7.5 --s 28
+
+# prompt 2
+masterpiece, best quality, 1boy, in business suit, standing at street, looking back --n low quality, worst quality, bad anatomy, bad composition, poor, low effort --w 576 --h 832 --d 2 --l 5.5 --s 40
+```
+
+Lines starting with `#` are treated as comments. You can specify options for the generated image with "`--` followed by a lowercase letter", such as `--n`. The following can be used:
+
+- `--n` Treat everything until the next option as a negative prompt.
+- `--w` Specify the width of the generated image.
+- `--h` Specify the height of the generated image.
+- `--d` Specify the seed for the generated image.
+- `--l` Specify the CFG scale for the generated image. +- `--s` Specify the number of steps during generation. + +# Commonly used options across scripts + +After updating the script, there may be cases where the documentation has not caught up. In that case, check the available options with the `--help` option. + +## Specifying the model to be used for training + +- `--v2` / `--v_parameterization` + + If you want to use Hugging Face's stable-diffusion-2-base or its fine-tuned model as the target model for training (models that are instructed to use `v2-inference.yaml` during inference), specify the `--v2` option. If you want to use stable-diffusion-2 or 768-v-ema.ckpt and their fine-tuned models (models that are instructed to use `v2-inference-v.yaml` during inference), specify both the `--v2` and `--v_parameterization` options. + + The main differences in Stable Diffusion 2.0 are: + + 1. Tokenizer used + 2. Text Encoder and output layer used (2.0 uses the second-to-last layer) + 3. Output dimension of Text Encoder (768->1024) + 4. U-Net structure (such as the number of heads in CrossAttention) + 5. v-parameterization (sampling method seems to have changed) + + The base version adopts 1-4, while the non-base version (768-v) adopts 1-5. The v2 option enables 1-4, and the v_parameterization option enables 5. + +- `--pretrained_model_name_or_path` + + Specifies the base model for additional training. You can specify Stable Diffusion checkpoint files (.ckpt or .safetensors), Diffusers model directories on your local disk, or Diffusers model IDs (such as "stabilityai/stable-diffusion-2"). + +## Training settings + +- `--output_dir` + + Specifies the folder where the trained model will be saved. + +- `--output_name` + + Specifies the file name of the model without the extension. + +- `--dataset_config` + + Specifies the `.toml` file containing the dataset settings. + +- `--max_train_steps` / `--max_train_epochs` + + Specifies the number of training steps or epochs. If both are specified, the number of epochs takes precedence. + +- `--mixed_precision` + + Uses mixed precision (mixed accuracy) for training to save memory. Specify as `--mixed_precision="fp16"`. Compared to no mixed precision (default), the accuracy may be lower, but the GPU memory required for training will be significantly reduced. + + (For RTX 30 series and later, you can also specify `bf16`. Please match the settings made to accelerate during environment preparation). + +- `--gradient_checkpointing` + + Reduces the GPU memory required for training by calculating weights in small increments rather than all at once. Turning this on or off does not affect accuracy, but turning it on allows for a larger batch size, which can impact performance. + + Generally, turning it on slows down the speed, but since it allows for a larger batch size, the total training time may actually be faster. + +- `--xformers` / `--mem_eff_attn` + + Specifying the xformers option will use xformers' CrossAttention. If you do not have xformers installed, or if it causes an error (depending on the environment, such as when `mixed_precision="no"`), you can specify the `mem_eff_attn` option instead to use the memory-efficient CrossAttention (slower than xformers). + +- `--clip_skip` + + Specify `2` to use the output of the second-to-last layer of the Text Encoder (CLIP). If you specify 1 or omit the option, the last layer will be used. + + ※Do not specify this option for SD2.0 training, as it uses the second-to-last layer by default. 
+ + If the target model for training has already been trained to use the second-to-last layer, specifying 2 may be better. + + If not, and the last layer was used, the entire model has been trained with that assumption. In that case, retraining with the second-to-last layer may require a certain amount of training data and a longer training time to achieve desirable results. + +- `--max_token_length` + + The default is 75. You can extend the token length to 150 or 225 for training. Specify this option when training with long captions. + + However, since the token extension during training is slightly different from Automatic1111's Web UI (such as splitting specifications), it is recommended to train with 75 if not necessary. + + Like clip_skip, it is assumed that training with a different length than the model's training state will require a certain amount of training data and a longer training time. + +- `--weighted_captions` + + When specified, the same weighted captions as Automatic1111's Web UI will be enabled. This can be used for training methods other than "Textual Inversion and XTI". It is also effective for token strings of the DreamBooth method. + + The notation for weighted captions is almost the same as that of the Web UI, with (abc), [abc], (abc:1.23), etc. available. Nesting is also possible. Do not include commas within parentheses, as this will cause the correspondence of parentheses to be incorrect in the shuffle/dropout of prompts. + +- `--persistent_data_loader_workers` + + Specifying this in a Windows environment significantly reduces the waiting time between epochs. + +- `--max_data_loader_n_workers` + + This option sets the number of processes for data loading. Having more processes speeds up data loading and allows for more efficient use of the GPU, but it consumes main memory. By default, it is set to the smaller of either `8` or `the number of CPU concurrent threads - 1`. If you have limited main memory or if the GPU usage rate is around 90% or higher, consider lowering this value to `2` or `1` while monitoring those numbers. + +---CUT--- + +- `--logging_dir` / `--log_prefix` + + These options are related to saving the training logs. Please specify the log saving folder with the `logging_dir` option. Logs in the TensorBoard format will be saved. + + For example, specifying `--logging_dir=logs` will create a `logs` folder in the working directory, and logs will be saved in a datetime folder within it. By specifying the `--log_prefix` option, the specified string will be added before the datetime. Use it for identification purposes, such as `--logging_dir=logs --log_prefix=db_style1_`. + + To view logs in TensorBoard, open a separate command prompt and enter the following in the working directory: + + ``` + tensorboard --logdir=logs + ``` + + (I think TensorBoard will be installed along with the environment setup, but if it's not, please install it with `pip install tensorboard`.) + + Then open a browser and access http://localhost:6006/ to view the logs. + +- `--log_with` / `--log_tracker_name` + + These options are related to saving the training logs. In addition to `tensorboard`, you can save logs to `wandb`. For more details, please refer to [PR#428](https://github.com/kohya-ss/sd-scripts/pull/428). + +- `--noise_offset` + + This option implements the following article: https://www.crosslabs.org//blog/diffusion-with-offset-noise + + It seems that the quality of the generated images, which are overall dark or bright, may improve. 
It also appears to be effective in LoRA training. It is recommended to specify a value of about `0.1`. + +- `--adaptive_noise_scale` (experimental option) + + This option automatically adjusts the value of the noise offset according to the absolute value of the average of each channel of the latents. It is activated by specifying it simultaneously with `--noise_offset`. The noise offset value is calculated as `noise_offset + abs(mean(latents, dim=(2,3))) * adaptive_noise_scale`. Since the latents are close to a normal distribution, it might be a good idea to specify a value of about 1/10 of the noise offset. + + Negative values can also be specified, and in that case, the noise offset will be clipped to a minimum of 0. + +- `--multires_noise_iterations` / `--multires_noise_discount` + + This option sets the Multi-Resolution Noise (pyramid noise). For more details, please refer to [PR#471](https://github.com/kohya-ss/sd-scripts/pull/471) and this page [Multi-Resolution Noise for Diffusion Model Training](https://wandb.ai/johnowhitaker/multires_noise/reports/Multi-Resolution-Noise-for-Diffusion-Model-Training--VmlldzozNjYyOTU2). + + Specifying a numeric value for `--multires_noise_iterations` will enable this feature. A value of around 6-10 seems to be suitable. Specify a value of 0.1-0.3 (recommended by the PR author for relatively small datasets such as LoRA training) or around 0.8 (recommended by the original article) for `--multires_noise_discount` (default is 0.3). + +- `--debug_dataset` + + By adding this option, you can check the image data and captions that will be used for training before actually starting the training. Press the Esc key to exit and return to the command line. Press the `S` key to advance to the next step (batch) and the `E` key to advance to the next epoch. + + *Note: Images will not be displayed in Linux environments, including Colab. + +- `--vae` + + When you specify either a Stable Diffusion checkpoint, VAE checkpoint file, or Diffusers model or VAE (both local or Hugging Face model IDs are acceptable) in the `vae` option, the VAE will be used for training (during latents caching or latents acquisition). + + In DreamBooth and fine-tuning, the saved model will be a model that incorporates this VAE. + +- `--cache_latents` / `--cache_latents_to_disk` + + Caches the VAE output to main memory to reduce VRAM usage. This disables the use of augmentations other than `flip_aug`. It also slightly speeds up overall training time. + + By specifying `cache_latents_to_disk`, the cache will be saved to disk. The cache will still be effective even if the script is terminated and restarted. + +- `--min_snr_gamma` + + Specifies the Min-SNR Weighting strategy. For more details, please refer to [this link](https://github.com/kohya-ss/sd-scripts/pull/308). The recommended value in the paper is `5`. + +## Settings for saving models + +- `--save_precision` + + This option specifies the data precision when saving. By specifying one of float, fp16, or bf16 in the `save_precision` option, the model will be saved in that format (this does not apply when saving a Diffusers format model in DreamBooth or fine-tuning). Use this when you want to reduce the size of the model. + +- `--save_every_n_epochs` / `--save_state` / `--resume` + + By specifying a numeric value for the `save_every_n_epochs` option, the model will be saved at each specified epoch during training. 
+ + When specifying the `save_state` option at the same time, the training state including the optimizer and other states will also be saved (although the model can be resumed from the saved state, this can be expected to improve accuracy and shorten training time). The saved state will be stored in a folder. + + The training state is output as a folder named `-??????-state` (where ?????? is the epoch number) in the saved location. Please use this option during long-lasting training sessions. + + To resume training from a saved state, use the `resume` option. Specify the folder of the training state (not the `output_dir`, but the state folder inside it). + + Please note that due to the Accelerator specification, the epoch numbers and global steps are not saved, so they will start from 1 again when resumed. + +- `--save_every_n_steps` + + By specifying a numeric value for the `save_every_n_steps` option, the model will be saved at each specified step during training. It can be specified simultaneously with `save_every_n_epochs`. + +---CUT--- +- `--save_model_as` (DreamBooth, fine-tuning only) + + Choose the model save format from `ckpt, safetensors, diffusers, diffusers_safetensors`. + + Specify it as `--save_model_as=safetensors`. If you load a Stable Diffusion format (ckpt or safetensors) and save it in the Diffusers format, the missing information will be supplemented by downloading the v1.5 or v2.1 information from Hugging Face. + +- `--huggingface_repo_id` etc. + + If a huggingface_repo_id is specified, the model will be uploaded to HuggingFace at the same time as it is saved. Please be careful with the handling of access tokens (refer to HuggingFace's documentation). + + Specify other arguments as follows: + + - `--huggingface_repo_id "your-hf-name/your-model" --huggingface_path_in_repo "path" --huggingface_repo_type model --huggingface_repo_visibility private --huggingface_token hf_YourAccessTokenHere` + + If you specify `public` for huggingface_repo_visibility, the repository will be published. If omitted or specified as `private` (or anything other than public), it will be private. + + If you specify the `--save_state` option and also specify `--save_state_to_huggingface`, the state will be uploaded. + + If you specify the `--resume` option and also specify `--resume_from_huggingface`, the state will be downloaded from HuggingFace and resumed. At that time, the `--resume` option will be `--resume {repo_id}/{path_in_repo}:{revision}:{repo_type}`. + + Example: `--resume_from_huggingface --resume your-hf-name/your-model/path/test-000002-state:main:model` + + If you specify the `--async_upload` option, the upload will be asynchronous. + +## Optimizer-related + +- `--optimizer_type` + + Specify the type of optimizer. 
The following can be specified: + - AdamW: [torch.optim.AdamW](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html) + - Same as the unspecified option in previous versions + - AdamW8bit: Same arguments as above + - Same as the --use_8bit_adam option in previous versions + - Lion: https://github.com/lucidrains/lion-pytorch + - Same as the --use_lion_optimizer option in previous versions + - Lion8bit: Same arguments as above + - SGDNesterov: [torch.optim.SGD](https://pytorch.org/docs/stable/generated/torch.optim.SGD.html), nesterov=True + - SGDNesterov8bit: Same arguments as above + - DAdaptation(DAdaptAdam): https://github.com/facebookresearch/dadaptation + - DAdaptAdaGrad: Same arguments as above + - DAdaptAdan: Same arguments as above + - DAdaptSGD: Same arguments as above + - AdaFactor: [Transformers AdaFactor](https://huggingface.co/docs/transformers/main_classes/optimizer_schedules) + - Any optimizer + +- `--learning_rate` + + Specify the learning rate. The appropriate learning rate depends on the training script, so please refer to each description. + +- `--lr_scheduler` / `--lr_warmup_steps` / `--lr_scheduler_num_cycles` / `--lr_scheduler_power` + + These are the specifications for learning rate scheduler-related options. + + With the lr_scheduler option, you can choose the learning rate scheduler from linear, cosine, cosine_with_restarts, polynomial, constant, constant_with_warmup, or any scheduler. The default is constant. + + With lr_warmup_steps, you can specify the number of warm-up steps for the scheduler. + + lr_scheduler_num_cycles is the number of restarts for the cosine with restarts scheduler, and lr_scheduler_power is the polynomial power for the polynomial scheduler. + + For more details, please do your own research. + + To use any scheduler, specify the optional arguments with `--scheduler_args`, as with any optimizer. + +### Specifying the optimizer + +Specify the optional arguments for the optimizer with the --optimizer_args option. You can specify multiple values in the key=value format, and multiple values can be specified for the value, separated by commas. For example, to specify arguments for the AdamW optimizer, use `--optimizer_args weight_decay=0.01 betas=.9,.999`. + +When specifying optional arguments, please check the specifications of each optimizer. + +Some optimizers have required arguments that are automatically added if omitted (such as the momentum for SGDNesterov). Please check the console output. + +The D-Adaptation optimizer automatically adjusts the learning rate. The value specified for the learning rate option is not the learning rate itself, but the application rate of the learning rate determined by D-Adaptation, so normally specify 1.0. If you want to specify half the learning rate for the Text Encoder and the full learning rate for U-Net, use `--text_encoder_lr=0.5 --unet_lr=1.0`. + +The AdaFactor optimizer can automatically adjust the learning rate by specifying relative_step=True (added by default when omitted). When adjusting automatically, the adafactor_scheduler is forcibly used for the learning rate scheduler. Also, it seems to be good to specify scale_parameter and warmup_init. + +When adjusting automatically, the option specification is like `--optimizer_args "relative_step=True" "scale_parameter=True" "warmup_init=True"`. + +If you do not want to adjust the learning rate automatically, add the optional argument `relative_step=False`. 
In that case, it is recommended to use constant_with_warmup for the learning rate scheduler and not to clip the gradient norm. Therefore, the arguments are like `--optimizer_type=adafactor --optimizer_args "relative_step=False" --lr_scheduler="constant_with_warmup" --max_grad_norm=0.0`. + +### Using any optimizer + +If you want to use an optimizer from `torch.optim`, specify only the class name (e.g. `--optimizer_type=RMSprop`) or use the "module name.class name" (e.g. `--optimizer_type=bitsandbytes.optim.lamb.LAMB`) for optimizers from other modules. + +(Only importlib is used internally, and the operation is unconfirmed. Please install the necessary packages if needed.) + +# Creating metadata files + +## Preparing teacher data + +As previously mentioned, prepare the image data you want to train and put it in any folder. + +For example, store the images as follows: + +![Folder of teacher data](https://user-images.githubusercontent.com/52813779/208907739-8e89d5fa-6ca8-4b60-8927-f484d2a9ae04.png) + +## Automatic captioning + +Skip this step if you are training with tags only and not using captions. + +If you are preparing captions manually, put the captions in the same directory as the teacher data images, with the same file name and a .caption extension. Each file should be a single-line text file. + +### Captioning with BLIP + +In the latest version, downloading BLIP, downloading weights, and adding a virtual environment are no longer necessary. It works as is. + +Run the make_captions.py script in the finetune folder. + +``` +python finetune\make_captions.py --batch_size +``` + +If the batch size is 8 and the teacher data is placed in the parent folder train_data, it will look like this: + +``` +python finetune\make_captions.py --batch_size 8 ..\train_data +``` + +The caption files will be created in the same directory as the training data images, with the same file name and a ".caption" extension. + +Adjust the batch size according to your GPU's VRAM capacity. A larger batch size will speed up the process (it can be increased further for VRAM 12GB). You can specify the maximum length of captions using the "max_length" option. The default is 75, but you may want to increase it if you are training the model with a token length of 225. You can also change the caption file extension using the "caption_extension" option. The default is ".caption" (changing it to ".txt" may cause conflicts with DeepDanbooru mentioned later). + +If you have multiple training data folders, run the command for each folder separately. + +---CUT--- +Please note that the inference has randomness, so the results may change each time it is executed. To fix the results, specify a random seed with the `--seed` option, such as `--seed 42`. + +For other options, please refer to the help with `--help`. Unfortunately, there seems to be no comprehensive documentation on the meaning of the parameters, so you may need to look at the source code. + +By default, the caption files are generated with the .caption extension. + +![Caption generated folder](https://user-images.githubusercontent.com/52813779/208908845-48a9d36c-f6ee-4dae-af71-9ab462d1459e.png) + +For example, the following captions will be attached. + +![Caption and image](https://user-images.githubusercontent.com/52813779/208908947-af936957-5d73-4339-b6c8-945a52857373.png) + +## Tagging with DeepDanbooru + +If you do not want to perform tagging with danbooru tags, please proceed to "Preprocessing of captions and tag information." 
+ +Tagging is done with either DeepDanbooru or WD14Tagger. WD14Tagger seems to have better accuracy. If you want to tag with WD14Tagger, please proceed to the next chapter. + +### Setting up the environment + +Clone DeepDanbooru (https://github.com/KichangKim/DeepDanbooru) into your working folder or download and unzip the zip file. I unzipped the zip file. +Also, download deepdanbooru-v3-20211112-sgd-e28.zip from the Assets of "DeepDanbooru Pretrained Model v3-20211112-sgd-e28" on DeepDanbooru's Releases page (https://github.com/KichangKim/DeepDanbooru/releases) and unzip it into the DeepDanbooru folder. + +Download from the following link. Click Assets to open and download from there. + +![DeepDanbooru download page](https://user-images.githubusercontent.com/52813779/208909417-10e597df-7085-41ee-bd06-3e856a1339df.png) + +Please set up the directory structure like this: + +![DeepDanbooru directory structure](https://user-images.githubusercontent.com/52813779/208909486-38935d8b-8dc6-43f1-84d3-fef99bc471aa.png) + +Install the necessary libraries for Diffusers' environment. Move to the DeepDanbooru folder and install (I think this will effectively only add tensorflow-io). + +``` +pip install -r requirements.txt +``` + +Next, install DeepDanbooru itself. + +``` +pip install . +``` + +Now the environment for tagging is ready. + +### Performing tagging + +Move to the DeepDanbooru folder and run deepdanbooru to perform tagging. + +``` +deepdanbooru evaluate --project-path deepdanbooru-v3-20211112-sgd-e28 --allow-folder --save-txt +``` + +If you placed the teacher data in the parent folder train_data, it would look like this: + +``` +deepdanbooru evaluate ../train_data --project-path deepdanbooru-v3-20211112-sgd-e28 --allow-folder --save-txt +``` + +Tag files are created in the same directory as the teacher data images, with the same file name and a .txt extension. Since they are processed one by one, it is relatively slow. + +If you have multiple teacher data folders, please run for each folder. + +The files are generated as follows: + +![Generated files of DeepDanbooru](https://user-images.githubusercontent.com/52813779/208909855-d21b9c98-f2d3-4283-8238-5b0e5aad6691.png) + +Tags are attached like this (with a lot of information...). + +![DeepDanbooru tags and image](https://user-images.githubusercontent.com/52813779/208909908-a7920174-266e-48d5-aaef-940aba709519.png) + +## Tagging with WD14Tagger + +This is the procedure for using WD14Tagger instead of DeepDanbooru. + +We will use the tagger used in Automatic1111's WebUI. I referred to the information on this GitHub page (https://github.com/toriato/stable-diffusion-webui-wd14-tagger#mrsmilingwolfs-model-aka-waifu-diffusion-14-tagger). + +The necessary modules are already installed in the initial environment setup. Also, the weights will be automatically downloaded from Hugging Face. + +### Performing tagging + +Run the script to perform tagging. +``` +python tag_images_by_wd14_tagger.py --batch_size +``` + +If the teacher data is placed in the parent folder train_data, it would look like this: +``` +python tag_images_by_wd14_tagger.py --batch_size 4 ..\train_data +``` + +On the first launch, the model file will be automatically downloaded to the wd14_tagger_model folder (you can change the folder with an option). 
It will look like this: + +![Downloaded files](https://user-images.githubusercontent.com/52813779/208910447-f7eb0582-90d6-49d3-a666-2b508c7d1842.png) + +Tag files are created in the same directory as the teacher data images, with the same file name and a .txt extension. + +![Generated tag files](https://user-images.githubusercontent.com/52813779/208910534-ea514373-1185-4b7d-9ae3-61eb50bc294e.png) + +![Tags and image](https://user-images.githubusercontent.com/52813779/208910599-29070c15-7639-474f-b3e4-06bd5a3df29e.png) + +You can specify the threshold for tagging with the thresh option. The default is 0.35, the same as WD14Tagger's sample. Lowering the value will result in more tags being attached, but the accuracy will decrease. + +Please adjust the batch size according to your GPU's VRAM capacity. The larger the size, the faster the process. You can change the tag file extension with the caption_extension option. The default is .txt. + +You can specify the model's save folder with the model_dir option. + +If you specify the force_download option, the model will be redownloaded even if the save folder exists. + +If you have multiple teacher data folders, please run for each folder. + +## Preprocessing captions and tag information + +To make it easier to process captions and tags with a script, they are combined into a single file as metadata. + +### Preprocessing captions + +To include captions in metadata, execute the following in your working folder (if you don't want to use captions for learning, you don't need to run this) (actually, write in one line, same for below). Specify the image file location in full path with the `--full_path` option. If you omit this option, the path will be recorded as a relative path, but you will need to specify the folder separately in the `.toml` file. + +``` +python merge_captions_to_metadata.py --full_path + --in_json +``` + +The metadata file name can be any name. +If the teacher data is train_data, there is no input metadata file, and the metadata file is meta_cap.json, it looks like this: + +``` +python merge_captions_to_metadata.py --full_path train_data meta_cap.json +``` + +You can specify the caption extension with the caption_extension option. + +If you have multiple teacher data folders, specify the full_path argument and run for each folder. + +---CUT--- +``` +python merge_captions_to_metadata.py --full_path + train_data1 meta_cap1.json +python merge_captions_to_metadata.py --full_path --in_json meta_cap1.json + train_data2 meta_cap2.json +``` + +If you omit the `in_json` option, the script will read from and overwrite the existing metadata file. + +__*Note: It is safer to change the `in_json` option and the output metadata file for each run.__ + +### Preprocessing Tags + +Similarly, you can also merge tags into the metadata (this step is not necessary if you don't want to use tags for training). + +``` +python merge_dd_tags_to_metadata.py --full_path + --in_json +``` + +If you have the same directory structure as before, you would read from `meta_cap.json` and write to `meta_cap_dd.json` as follows: + +``` +python merge_dd_tags_to_metadata.py --full_path train_data --in_json meta_cap.json meta_cap_dd.json +``` + +If you have multiple supervised data folders, execute the script for each folder by specifying the `full_path` argument. 
+ +``` +python merge_dd_tags_to_metadata.py --full_path --in_json meta_cap2.json + train_data1 meta_cap_dd1.json +python merge_dd_tags_to_metadata.py --full_path --in_json meta_cap_dd1.json + train_data2 meta_cap_dd2.json +``` + +If you omit the `in_json` option, the script will read from and overwrite the existing metadata file. + +__*Note: It is safer to change the `in_json` option and the output metadata file for each run.__ + +### Cleaning Captions and Tags + +By this point, both captions and DeepDanbooru tags should be combined in the metadata file. However, automatically generated captions may have inconsistent notation (e.g., girl/girls/woman/women), and tags may contain underscores or ratings (in the case of DeepDanbooru). It is recommended to clean captions and tags using a text editor's replace feature or a similar tool. + +There is a script provided for cleaning captions and tags. Edit the script according to your needs and run it as follows: + +(No need to specify the supervised data folder. The script will clean all data within the metadata file.) + +``` +python clean_captions_and_tags.py +``` + +Please note that the `--in_json` option is not used in this case. For example: + +``` +python clean_captions_and_tags.py meta_cap_dd.json meta_clean.json +``` + +This completes the preprocessing of captions and tags. + +## Pre-fetching Latents + +*Note: This step is not mandatory. You can also train while fetching latents during training. Pre-fetching latents is not possible if you apply `random_crop` or `color_aug` during training (as the images change for each training iteration). In that case, you can proceed with the metadata obtained so far. + +By pre-fetching the latent representations of the images and saving them to disk, you can speed up the training process. Additionally, this step performs bucketing (classifying supervised data according to aspect ratio). + +In the working folder, enter the following command: + +``` +python prepare_buckets_latents.py --full_path + + + --batch_size + --max_resolution + --mixed_precision +``` + +For example, if your model is `model.ckpt`, batch size is 4, training resolution is 512x512, and precision is "no" (float32), you would read from `meta_clean.json` and write to `meta_lat.json` as follows: + +``` +python prepare_buckets_latents.py --full_path + train_data meta_clean.json meta_lat.json model.ckpt + --batch_size 4 --max_resolution 512,512 --mixed_precision no +``` + +The latents will be saved in the supervised data folder as numpy npz files. + +You can specify the minimum bucket resolution with the `--min_bucket_reso` option and the maximum with the `--max_bucket_reso` option. The defaults are 256 and 1024, respectively. For example, if you set the minimum size to 384, resolutions like 256x1024 or 320x768 will no longer be used. + +If you increase the resolution to, say, 768x768, you might want to set the maximum size to 1280 or similar. + +The `--flip_aug` option enables left-right flip augmentation, which can effectively double the amount of data. However, this may not work well for non-symmetrical data (e.g., character appearance or hairstyle). + +If you have multiple supervised data folders, execute the script for each folder by specifying the `full_path` argument. 
+ +``` +python prepare_buckets_latents.py --full_path + train_data1 meta_clean.json meta_lat1.json model.ckpt + --batch_size 4 --max_resolution 512,512 --mixed_precision no + +python prepare_buckets_latents.py --full_path + train_data2 meta_lat1.json meta_lat2.json model.ckpt + --batch_size 4 --max_resolution 512,512 --mixed_precision no + +``` + +It is possible to use the same input and output metadata files, but using separate files is safer. + +__*Note: It is safer to change the input and output metadata filenames for each run.__ diff --git a/docs/train_db_README-en.md b/docs/train_db_README-en.md new file mode 100644 index 000000000..a5e89a692 --- /dev/null +++ b/docs/train_db_README-en.md @@ -0,0 +1,148 @@ +## This is a ChatGPT-4 English adaptation of the original document by kohya-ss ([train_db_README-ja.md](https://github.com/kohya-ss/sd-scripts/blob/main/docs/train_db_README-ja.md)) + +Introducing the DreamBooth Guide. + +Please also see the [common document for learning](./train_README-en.md). + +# Overview + +DreamBooth is a technology that adds specific themes to image generation models through additional learning and generates them with specific identifiers. [View the paper here](https://arxiv.org/abs/2208.12242). + +Specifically, it teaches the Stable Diffusion model characters and art styles, and allows them to be called with specific words like `shs` (to appear in the generated image). + +The script is based on [Diffusers' DreamBooth](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth), but has added features like the following (some of which have been implemented in the original script as well). + +The main features of the script are as follows: + +- Memory-saving with 8-bit Adam optimizer and caching latents (similar to [Shivam Shrirao's version](https://github.com/ShivamShrirao/diffusers/tree/main/examples/dreambooth)). +- Memory-saving with xformers. +- Learning in sizes other than 512x512. +- Quality improvement with augmentation. +- Support for fine-tuning DreamBooth and Text Encoder + U-Net. +- Reading and writing models in Stable Diffusion format. +- Aspect Ratio Bucketing. +- Support for Stable Diffusion v2.0. + +# Learning Procedure + +Please refer to this repository's README for environment preparation. + +## Data Preparation + +Please refer to [Preparing Learning Data](./train_README-en.md). + +## Running the Learning + +Execute the script. The command for maximum memory saving is as follows (actually entered in one line). Please modify each line as necessary. It seems to work with about 12GB of VRAM. + +``` +accelerate launch --num_cpu_threads_per_process 1 train_db.py + --pretrained_model_name_or_path=<.ckpt or .safetensord or Diffusers model directory> + --dataset_config=<.toml file created in data preparation> + --output_dir= + --output_name= + --save_model_as=safetensors + --prior_loss_weight=1.0 + --max_train_steps=1600 + --learning_rate=1e-6 + --optimizer_type="AdamW8bit" + --xformers + --mixed_precision="fp16" + --cache_latents + --gradient_checkpointing +``` + +It is usually best to specify 1 for `num_cpu_threads_per_process`. + +Specify the base model for additional learning in `pretrained_model_name_or_path`. You can specify a Stable Diffusion checkpoint file (.ckpt or .safetensors), a Diffusers local disk model directory, or a Diffusers model ID (such as "stabilityai/stable-diffusion-2"). + +Specify the folder to save the learned model in `output_dir`. Specify the model's file name without the extension in `output_name`. 
Specify saving in safetensors format with `save_model_as`. + +Specify the `.toml` file in `dataset_config`. For the initial batch size specification in the file, set it to `1` to keep memory consumption low. + +`prior_loss_weight` is the weight of the regularization image loss. Normally, specify 1.0. + +Set the number of training steps, `max_train_steps`, to 1600. The learning rate `learning_rate` is specified as 1e-6. + +Specify `mixed_precision="fp16"` for memory saving (you can also specify `bf16` for RTX 30 series and later. Match the settings made in accelerate during environment preparation). Also, specify `gradient_checkpointing`. + +To use the memory-efficient 8-bit AdamW optimizer, specify `optimizer_type="AdamW8bit"`. + +Specify the `xformers` option and use xformers' CrossAttention. If you have not installed xformers or encounter an error (depending on the environment, such as when `mixed_precision="no"`), you can specify the `mem_eff_attn` option instead to use the memory-efficient CrossAttention (which will be slower). + +Cache the VAE output for memory saving by specifying the `cache_latents` option. + +If you have enough memory, edit the `.toml` file to increase the batch size to, for example, `4` (which may potentially speed up and improve accuracy). Additionally, removing `cache_latents` enables augmentation. + +### Commonly Used Options + +Please refer to the [common document for learning](./train_README-en.md) for the following cases: + +- Learning Stable Diffusion 2.x or derived models +- Learning models that assume a clip skip of 2 or more +- Learning with captions exceeding 75 tokens + +### Step Count in DreamBooth + +In this script, for memory saving, the number of learning times per step is half that of the original script (because the target image and regularization image are divided into separate batches for learning). + +To perform almost the same learning as the original Diffusers version and XavierXiao's Stable Diffusion version, double the number of steps. + +(Strictly speaking, the order of the data changes because the learning image and regularization image are shuffled together, but it is thought to have little effect on learning.) + +### Batch Size in DreamBooth + +As the whole model is learned (similar to fine-tuning), memory consumption is higher compared to learning using LoRA and other methods. + +### Learning Rate + +The Diffusers version is 5e-6, but the Stable Diffusion version is 1e-6, so the sample above specifies 1e-6. + +### Command Line for Specifying Dataset in Previous Format + +Specify resolution and batch size as options. An example of the command line is as follows: + +``` +accelerate launch --num_cpu_threads_per_process 1 train_db.py + --pretrained_model_name_or_path=<.ckpt or .safetensors or Diffusers version model directory> + --train_data_dir= + --reg_data_dir= + --output_dir= + --output_name= + --prior_loss_weight=1.0 + --resolution=512 + --train_batch_size=1 + --learning_rate=1e-6 + --max_train_steps=1600 + --use_8bit_adam + --xformers + --mixed_precision="bf16" + --cache_latents + --gradient_checkpointing +``` + +## Generating images with the trained model + +Once the training is complete, a safetensors file will be output in the specified folder with the specified name. + +For v1.4/1.5 and other derivative models, you can infer with this model using Automatic1111's WebUI. Please place it in the models\Stable-diffusion folder. 
+
+To generate images with a v2.x model in the WebUI, a separate .yaml file describing the model specifications is required. For v2.x base models, place v2-inference.yaml in the same folder; for 768/v models, place v2-inference-v.yaml; and give the .yaml file the same name as the model (the part before the extension).
+
+![image](https://user-images.githubusercontent.com/52813779/210776915-061d79c3-6582-42c2-8884-8b91d2f07313.png)
+
+Each yaml file can be found in [Stability AI's SD2.0 repository](https://github.com/Stability-AI/stablediffusion/tree/main/configs/stable-diffusion).
+
+# Other major options specific to DreamBooth
+
+Please refer to the separate document for all options.
+
+## Stop training the Text Encoder partway through: --stop_text_encoder_training
+
+By specifying a number for the stop_text_encoder_training option, the Text Encoder will no longer be trained after that many steps, and only the U-Net will be trained. In some cases, this may lead to improved accuracy.
+
+(It is suspected that the Text Encoder alone may overfit first, and this option may help prevent that, but the exact impact is unknown.)
+
+## Do not pad the Tokenizer output: --no_token_padding
+
+By specifying the no_token_padding option, the output of the Tokenizer will not be padded (this is the same behavior as the old DreamBooth of the Diffusers version).
diff --git a/docs/train_network_README-en.md b/docs/train_network_README-en.md
new file mode 100644
index 000000000..8bb59dea4
--- /dev/null
+++ b/docs/train_network_README-en.md
@@ -0,0 +1,444 @@
+## This is a ChatGPT-4 English adaptation of the original document by kohya-ss ([train_network_README-ja.md](https://github.com/kohya-ss/sd-scripts/blob/main/docs/train_network_README-ja.md))
+
+# Learning LoRA
+
+This is an implementation of [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) (arxiv) and [LoRA](https://github.com/microsoft/LoRA) (github) for Stable Diffusion.
+
+We greatly appreciate the work of [cloneofsimo's repository](https://github.com/cloneofsimo/lora). Thank you.
+
+While conventional LoRA is applied only to Linear and Conv2d with a kernel size of 1x1, it can also be extended to Conv2d with a kernel size of 3x3.
+
+The expansion to Conv2d 3x3 was first released by [cloneofsimo](https://github.com/cloneofsimo/lora), and its effectiveness was demonstrated by KohakuBlueleaf in [LoCon](https://github.com/KohakuBlueleaf/LoCon). We deeply appreciate KohakuBlueleaf's work.
+
+It seems to work just fine with 8GB VRAM.
+
+Please also refer to the [common documentation on learning](./train_README-en.md).
+
+# Types of LoRA that can be learned
+
+We support the following two types, which have unique names within this repository:
+
+1. __LoRA-LierLa__: LoRA applied to Linear and Conv2d with a kernel size of 1x1.
+
+2. __LoRA-C3Lier__: LoRA applied to Linear, Conv2d with a kernel size of 1x1, and Conv2d with a kernel size of 3x3.
+
+Compared to LoRA-LierLa, LoRA-C3Lier may offer higher accuracy due to the increased number of layers it is applied to.
+
+During training, you can also use __DyLoRA__ (described later).
+
+## Notes on trained models
+
+LoRA-LierLa can be used with AUTOMATIC1111's Web UI LoRA feature.
+
+To generate with LoRA-C3Lier in the Web UI, use this [WebUI extension for additional networks](https://github.com/kohya-ss/sd-webui-additional-networks).
+
+Both LoRA models can also be merged with the Stable Diffusion model using a script within this repository.
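+
+As a rough illustration, merging is done with `networks\merge_lora.py`; the option names below reflect my reading of that script and may change, so confirm them with `--help` before running:
+
+```
+python networks\merge_lora.py --sd_model ..\model\model.ckpt
+    --save_to ..\model\merged.safetensors --models lora1.safetensors --ratios 0.8
+```
+
+Here `--models` takes one or more LoRA files and `--ratios` gives the merge strength for each (the 0.8 above is only an example value).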
There is currently no compatibility with cloneofsimo's repository or d8ahazard's [Dreambooth Extension for Stable-Diffusion-WebUI](https://github.com/d8ahazard/sd_dreambooth_extension) because of some feature extensions (described later).

# Learning Procedure

Please refer to this repository's README and prepare the environment accordingly.

## Data Preparation

Refer to [preparing learning data](./train_README-en.md).

## Execute Learning

Use `train_network.py`.

Specify the module to use with the `--network_module` option of `train_network.py`. `networks.lora` is the module for LoRA, so specify that.

A learning rate higher than for usual DreamBooth or fine-tuning, roughly `1e-4` to `1e-3`, seems to work well.

Here is an example command line (the bracketed values are placeholders):

```
accelerate launch --num_cpu_threads_per_process 1 train_network.py
    --pretrained_model_name_or_path=<.ckpt, .safetensors, or directory of Diffusers version model>
    --dataset_config=<.toml file created during data preparation>
    --output_dir=<output folder for the trained model>
    --output_name=<filename for the trained model>
    --save_model_as=safetensors
    --prior_loss_weight=1.0
    --max_train_steps=400
    --learning_rate=1e-4
    --optimizer_type="AdamW8bit"
    --xformers
    --mixed_precision="fp16"
    --cache_latents
    --gradient_checkpointing
    --save_every_n_epochs=1
    --network_module=networks.lora
```

This command line trains LoRA-LierLa.

The LoRA model will be saved in the folder specified by the `--output_dir` option. For other options, optimizers, etc., please refer to the "Commonly Used Options" section in the [common learning documentation](./train_README-en.md).

Other available options include:

* `--network_dim`: Specifies the rank of LoRA (e.g., `--network_dim=4`). The default is 4. A higher number increases expressiveness but requires more memory and time for training. It is not always better to simply increase this number.
* `--network_alpha`: Specifies the `alpha` value used for stable training and preventing underflow. The default is 1. Specifying the same value as `network_dim` results in the same behavior as previous versions.
* `--persistent_data_loader_workers`: Significantly reduces waiting time between epochs on Windows environments.
* `--max_data_loader_n_workers`: Specifies the number of processes for data loading. More processes result in faster data loading and more efficient GPU usage but consume main memory. By default, it is set to the smaller of `8` and `number of concurrent CPU threads - 1`. If you have limited main memory or GPU usage is above 90%, consider reducing this value to `2` or `1`, based on your hardware.
* `--network_weights`: Loads the weights of a pre-trained LoRA and continues training from there.
* `--network_train_unet_only`: Enables only the LoRA modules related to the U-Net. This may be useful for fine-tuning-type training.
* `--network_train_text_encoder_only`: Enables only the LoRA modules related to the Text Encoder. This may have a Textual Inversion-like effect.
* `--unet_lr`: Specifies a learning rate for the U-Net-related LoRA modules that differs from the standard learning rate set by the `--learning_rate` option.
* `--text_encoder_lr`: Specifies a learning rate for the Text Encoder-related LoRA modules that differs from the standard learning rate set by the `--learning_rate` option. There is some discussion that a slightly lower learning rate (e.g., 5e-5) may be better for the Text Encoder (see the example after this list).
* `--network_args`: Allows specifying multiple arguments. Described later.
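For example, to set the rank, alpha, and separate learning rates together, you might append options like the following to the command line above (the numbers here are purely illustrative, not recommendations from this document):

```
--network_dim=16 --network_alpha=8 --unet_lr=1e-4 --text_encoder_lr=5e-5
```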
If both `--network_train_unet_only` and `--network_train_text_encoder_only` are unspecified (the default), the LoRA modules of both the Text Encoder and the U-Net are enabled.

# Other Learning Methods

## Learning LoRA-C3Lier

In `--network_args`, specify `conv_dim` for the rank of the Conv2d (3x3) modules and `conv_alpha` for their alpha, like this:

```
--network_args "conv_dim=4" "conv_alpha=1"
```

If alpha is omitted, it defaults to 1.

```
--network_args "conv_dim=4"
```

## DyLoRA

DyLoRA is proposed in this paper: [DyLoRA: Parameter Efficient Tuning of Pre-trained Models using Dynamic Search-Free Low-Rank Adaptation](https://arxiv.org/abs/2210.07558). The official implementation is available [here](https://github.com/huawei-noah/KD-NLP/tree/main/DyLoRA).

According to the paper, a higher LoRA rank is not always better; instead, it is necessary to find an appropriate rank depending on the target model, dataset, task, and so on. With DyLoRA, you can simultaneously train LoRA at various ranks up to the specified dim (rank), which eliminates the need to search for the optimal rank separately for each.

Our implementation is based on the official implementation with some custom extensions (which may therefore contain bugs).

### Features of DyLoRA in this repository

DyLoRA model files are compatible with LoRA. In addition, LoRA models of multiple dims (ranks) up to the specified dim can be extracted from a DyLoRA model file.

Both DyLoRA-LierLa and DyLoRA-C3Lier can be learned.

### Learning with DyLoRA

Specify `networks.dylora`, which supports DyLoRA, for the `--network_module` option.

Also, specify `unit` in `--network_args`, such as `--network_args "unit=4"`. `unit` is the unit into which the rank is divided. For example, specify `--network_dim=16 --network_args "unit=4"`. Please make sure that `unit` is a divisor of `network_dim` (i.e., `network_dim` is a multiple of `unit`).

If `unit` is not specified, it is treated as `unit=1`.

For example, when training with dim=16 and unit=4, four LoRA models with ranks 4, 8, 12, and 16 can be extracted and used (as described later). By generating images with each extracted model and comparing them, you can select the LoRA with the optimal rank.

Other options are the same as for regular LoRA.

*Note: `unit` is a custom extension of this repository. DyLoRA is expected to take longer to train than regular LoRA of the same dim (rank), so the division unit was made coarser.

### Extracting LoRA models from DyLoRA models

Use `extract_lora_from_dylora.py` in the `networks` folder. This script extracts LoRA models from a DyLoRA model at the specified `unit` intervals.

The command line looks like this:

```powershell
python networks\extract_lora_from_dylora.py --model "foldername/dylora-model.safetensors" --save_to "foldername/dylora-model-split.safetensors" --unit 4
```

Specify the DyLoRA model file in `--model`. Specify the file name to save the extracted models in `--save_to` (the rank value will be appended to the file name). Specify the `unit` used during DyLoRA training in `--unit`.

## Hierarchical Learning Rate

For details, please refer to [PR #355](https://github.com/kohya-ss/sd-scripts/pull/355).

You can specify weights for the 25 blocks of the full model. There is no LoRA corresponding to the first block, but 25 values are used for compatibility with other hierarchical-LoRA applications.
Also, some blocks have no LoRA unless it is extended to Conv2d 3x3, but always specify 25 values to keep the description consistent.

Specify the following arguments in `--network_args`.

- `down_lr_weight`: Specify the learning rate weights of the down blocks of the U-Net. You can specify the following:
  - Weight for each block: Specify 12 values, such as `"down_lr_weight=0,0,0,0,0,0,1,1,1,1,1,1"`.
  - Specification from a preset: Specify, for example, `"down_lr_weight=sine"` (weights follow a sine curve). You can specify sine, cosine, linear, reverse_linear, or zeros. In addition, if you append `+number` as in `"down_lr_weight=cosine+.25"`, the specified value is added (the weights then become 0.25~1.25).
- `mid_lr_weight`: Specify the learning rate weight of the mid block of the U-Net. Specify a single number, such as `"mid_lr_weight=0.5"`.
- `up_lr_weight`: Specify the learning rate weights of the up blocks of the U-Net. The specification is the same as for down_lr_weight.
- Any unspecified parts are treated as 1.0. If a weight is set to 0, the LoRA module for that block is not created.
- `block_lr_zero_threshold`: If the weight is less than or equal to this value, the LoRA module is not created. The default is 0.

### Example of hierarchical learning rate command-line specification:

```powershell
--network_args "down_lr_weight=0.5,0.5,0.5,0.5,1.0,1.0,1.0,1.0,1.5,1.5,1.5,1.5" "mid_lr_weight=2.0" "up_lr_weight=1.5,1.5,1.5,1.5,1.0,1.0,1.0,1.0,0.5,0.5,0.5,0.5"

--network_args "block_lr_zero_threshold=0.1" "down_lr_weight=sine+.5" "mid_lr_weight=1.5" "up_lr_weight=cosine+.5"
```

### Example of hierarchical learning rate toml file specification:

```toml
network_args = [ "down_lr_weight=0.5,0.5,0.5,0.5,1.0,1.0,1.0,1.0,1.5,1.5,1.5,1.5", "mid_lr_weight=2.0", "up_lr_weight=1.5,1.5,1.5,1.5,1.0,1.0,1.0,1.0,0.5,0.5,0.5,0.5",]

network_args = [ "block_lr_zero_threshold=0.1", "down_lr_weight=sine+.5", "mid_lr_weight=1.5", "up_lr_weight=cosine+.5", ]
```

## Hierarchical dim (rank)

You can specify the dim (rank) of the 25 blocks of the full model. As with the hierarchical learning rate, LoRA may not exist in some blocks, but always specify 25 values.

Specify the following arguments in `--network_args`.

- `block_dims`: Specify the dim (rank) of each block. Specify 25 values, such as `"block_dims=2,2,2,2,4,4,4,4,6,6,6,6,8,6,6,6,6,4,4,4,4,2,2,2,2"`.
- `block_alphas`: Specify the alpha of each block. Specify 25 values, as with block_dims. If omitted, the value of network_alpha is used.
- `conv_block_dims`: Extend LoRA to Conv2d 3x3 and specify the dim (rank) of each block.
- `conv_block_alphas`: Specify the alpha of each block when LoRA is extended to Conv2d 3x3. If omitted, the value of conv_alpha is used.
+ +### Example of hierarchical dim (rank) command-line specification: + +```powershell +--network_args "block_dims=2,4,4,4,8,8,8,8,12,12,12,12,16,12,12,12,12,8,8,8,8,4,4,4,2" + +--network_args "block_dims=2,4,4,4,8,8,8,8,12,12,12,12,16,12,12,12,12,8,8,8,8,4,4,4,2" "conv_block_dims=2,2,2,2,4,4,4,4,6,6,6,6,8,6,6,6,6,4,4,4,4,2,2,2,2" + +--network_args "block_dims=2,4,4,4,8,8,8,8,12,12,12,12,16,12,12,12,12,8,8,8,8,4,4,4,2" "block_alphas=2,2,2,2,4,4,4,4,6,6,6,6,8,6,6,6,6,4,4,4,4,2,2,2,2" +``` + +### Example of hierarchical dim (rank) toml file specification: + +```toml +network_args = [ "block_dims=2,4,4,4,8,8,8,8,12,12,12,12,16,12,12,12,12,8,8,8,8,4,4,4,2",] + +network_args = [ "block_dims=2,4,4,4,8,8,8,8,12,12,12,12,16,12,12,12,12,8,8,8,8,4,4,4,2", "block_alphas=2,2,2,2,4,4,4,4,6,6,6,6,8,6,6,6,6,4,4,4,4,2,2,2,2",] +``` + +# Other scripts + +These are a set of scripts related to merging and LoRA. + +## About the merge script + +With merge_lora.py, you can merge the learning results of LoRA into the Stable Diffusion model or merge multiple LoRA models. + +### Merging LoRA model into Stable Diffusion model + +The merged model can be treated like a normal Stable Diffusion ckpt. For example, the command line will be as follows: + +``` +python networks\merge_lora.py --sd_model ..\model\model.ckpt + --save_to ..\lora_train1\model-char1-merged.safetensors + --models ..\lora_train1\last.safetensors --ratios 0.8 +``` + +If you are training with a Stable Diffusion v2.x model and want to merge it, please specify the --v2 option. + +Specify the Stable Diffusion model file to be merged in the --sd_model option (.ckpt or .safetensors only, Diffusers are not currently supported). + +Specify the destination to save the merged model in the --save_to option (.ckpt or .safetensors, automatically determined by the extension). + +Specify the learned LoRA model file in --models. Multiple specifications are also possible, in which case they will be merged in order. + +Specify the application rate (how much weight to reflect in the original model) for each model in --ratios with a value between 0 and 1.0. For example, if it seems close to overlearning, reducing the application rate may make it better. Please specify the same number as the number of models. + +When specifying multiple models, it will be as follows: + +``` +python networks\merge_lora.py --sd_model ..\model\model.ckpt + --save_to ..\lora_train1\model-char1-merged.safetensors + --models ..\lora_train1\last.safetensors ..\lora_train2\last.safetensors --ratios 0.8 0.5 +``` + +### Merging Multiple LoRA Models + +There will be subtle differences in the results due to the order of calculations when applying multiple LoRA models one by one to the SD model compared to merging multiple LoRA models and then applying them to the SD model. + +For example, the command line would look like this: + +``` +python networks\merge_lora.py + --save_to ..\lora_train1\model-char1-style1-merged.safetensors + --models ..\lora_train1\last.safetensors ..\lora_train2\last.safetensors --ratios 0.6 0.4 +``` + +The --sd_model option is not required. + +Specify the save destination for the merged LoRA model with the --save_to option (automatically determined by .ckpt or .safetensors extension). + +Specify the trained LoRA model files with the --models option. You can specify three or more models. + +Specify the ratio of each model (how much weight to reflect in the original model) with the --ratios option as a numeric value between 0 and 1.0. 
If you want to merge two models one-to-one, use "0.5 0.5". With "1.0 1.0", the total weight will be too high, and the result will likely be undesirable. + +LoRA models trained with v1 and v2, and those with different ranks (dimensions) or ``alpha`` values cannot be merged. LoRA models with only U-Net and those with U-Net + Text Encoder should be mergeable, but the results are unknown. + +### Other Options + +* precision + * Specify the precision during merge calculation from float, fp16, and bf16. If omitted, it defaults to float to ensure accuracy. If you want to reduce memory usage, please specify fp16/bf16. +* save_precision + * Specify the precision when saving the model from float, fp16, and bf16. If omitted, the precision will be the same as the precision option. + +## Merging Multiple LoRA Models with Different Ranks + +Approximate multiple LoRA models with a single LoRA model (an exact reproduction is not possible). Use `svd_merge_lora.py`. For example, the command line would look like this: + +``` +python networks\svd_merge_lora.py + --save_to ..\lora_train1\model-char1-style1-merged.safetensors + --models ..\lora_train1\last.safetensors ..\lora_train2\last.safetensors + --ratios 0.6 0.4 --new_rank 32 --device cuda +``` + +The main options are the same as `merge_lora.py`. The following options have been added: + +- `--new_rank` + - Specify the rank of the created LoRA model. +- `--new_conv_rank` + - Specify the rank of the created Conv2d 3x3 LoRA model. If omitted, it defaults to the same value as `new_rank`. +- `--device` + - Specify cuda with `--device cuda` to perform calculations on the GPU. This will speed up the process. + +## Generate Images Using the Scripts in This Repository + +Add the --network_module and --network_weights options to gen_img_diffusers.py. Their meanings are the same as during training. + +Specify a numerical value between 0 and 1.0 with the --network_mul option to change the application rate of LoRA. + +## Generate Images Using the Diffusers Pipeline + +Refer to the example below. The only required file is networks/lora.py. The script may not work with Diffusers versions other than 0.10.2. + +```python +import torch +from diffusers import StableDiffusionPipeline +from networks.lora import LoRAModule, create_network_from_weights +from safetensors.torch import load_file + +# if the ckpt is CompVis based, convert it to Diffusers beforehand with tools/convert_diffusers20_original_sd.py. See --help for more details. + +model_id_or_dir = r"model_id_on_hugging_face_or_dir" +device = "cuda" + +# create pipe +print(f"creating pipe from {model_id_or_dir}...") +pipe = StableDiffusionPipeline.from_pretrained(model_id_or_dir, revision="fp16", torch_dtype=torch.float16) +pipe = pipe.to(device) +vae = pipe.vae +text_encoder = pipe.text_encoder +unet = pipe.unet + +# load lora networks +print(f"loading lora networks...") + +lora_path1 = r"lora1.safetensors" +sd = load_file(lora_path1) # If the file is .ckpt, use torch.load instead. +network1, sd = create_network_from_weights(0.5, None, vae, text_encoder,unet, sd) +network1.apply_to(text_encoder, unet) +network1.load_state_dict(sd) +network1.to(device, dtype=torch.float16) + +# # You can merge weights instead of apply_to+load_state_dict. 
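# (Note) merging bakes the LoRA weights into text_encoder/unet (presumably at the multiplier
# passed to create_network_from_weights above), so the strength cannot be changed afterwards.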
# network.set_multiplier does not work in that case.
# network.merge_to(text_encoder, unet, sd)

lora_path2 = r"lora2.safetensors"
sd = load_file(lora_path2)
network2, sd = create_network_from_weights(0.7, None, vae, text_encoder, unet, sd)
network2.apply_to(text_encoder, unet)
network2.load_state_dict(sd)
network2.to(device, dtype=torch.float16)

lora_path3 = r"lora3.safetensors"
sd = load_file(lora_path3)
network3, sd = create_network_from_weights(0.5, None, vae, text_encoder, unet, sd)
network3.apply_to(text_encoder, unet)
network3.load_state_dict(sd)
network3.to(device, dtype=torch.float16)

# prompts
prompt = "masterpiece, best quality, 1girl, in white shirt, looking at viewer"
negative_prompt = "bad quality, worst quality, bad anatomy, bad hands"

# execute pipeline
print("generating image...")
with torch.autocast("cuda"):
    image = pipe(prompt, guidance_scale=7.5, negative_prompt=negative_prompt).images[0]

# if not merged, you can use set_multiplier
# network1.set_multiplier(0.8)
# and generate image again...

# save image
image.save(r"by_diffusers.png")
```

## Creating a LoRA Model from the Difference of Two Models

This implementation is based on [this discussion](https://github.com/cloneofsimo/lora/discussions/56). The equations are used as is (I don't fully understand them, but it seems that singular value decomposition is used for the approximation).

The difference between two models (e.g., the base model and a fine-tuned model) is approximated with a LoRA.

### How to Run the Script

Specify as follows:
```
python networks\extract_lora_from_models.py --model_org base-model.ckpt
    --model_tuned fine-tuned-model.ckpt
    --save_to lora-weights.safetensors --dim 4
```

Specify the original Stable Diffusion model with the `--model_org` option. When applying the created LoRA model, specify this model. Both `.ckpt` and `.safetensors` formats are accepted.

Specify the Stable Diffusion model from which to extract the difference with the `--model_tuned` option. For example, specify a model after fine-tuning or DreamBooth. Both `.ckpt` and `.safetensors` formats are accepted.

Specify the destination for saving the LoRA model with `--save_to`, and specify the dimension (rank) of the LoRA with `--dim`.

The generated LoRA model can be used in the same way as a trained LoRA model.

If the Text Encoder is the same in both models, the resulting LoRA will be a LoRA for the U-Net only.

### Other Options

- `--v2`
  - Specify this if you are using v2.x Stable Diffusion models.
- `--device`
  - Specify `--device cuda` to perform the calculations on the GPU. This speeds up processing, though the CPU is not that slow either (the GPU is only about two to several times faster).
- `--save_precision`
  - Specify the save precision of the LoRA from "float", "fp16", or "bf16". The default is "float".
- `--conv_dim`
  - Specify this to extend the range of LoRA application to Conv2d 3x3, giving the rank of the Conv2d 3x3 modules.

## Image Resize Script

(Documentation will be organized later, but for now, here's an explanation.)

With the extension of Aspect Ratio Bucketing, it is now possible to use small images as training data as they are, without enlarging them. We received a preprocessing script along with a report that adding resized versions of the original training images improved accuracy, so we've added and refined it. Thanks to bmaltais.

### How to Run the Script

Specify as follows. Both the original images and the resized images are saved in the destination folder.
The resized image has the resolution, such as `+512x512`, added to the file name (which is different from the image size). Images smaller than the destination resolution will not be enlarged. + +``` +python tools\resize_images_to_resolution.py --max_resolution 512x512,384x384,256x256 --save_as_png + --copy_associated_files source_image_folder destination_folder +``` + +Images in the source_image_folder are resized so that their area is the same as the specified resolution and saved in the destination_folder. Files other than images are copied as is. + +Specify the destination size in the `--max_resolution` option as shown in the example. The images will be resized so that their area is the same as the specified size. If you specify multiple resolutions, the images will be resized for each resolution. If you specify `512x512,384x384,256x256`, the destination folder will contain four images: the original size and three resized sizes. + +Specify the `--save_as_png` option to save the images in PNG format. If omitted, the images will be saved in JPEG format (quality=100). + +When the `--copy_associated_files` option is specified, files with the same name as the image (excluding the extension, such as captions) are copied with the same name as the resized image file. + +### Other Options + +- divisible_by + - The image is cropped from the center so that the size (both height and width) of the resized image is divisible by this value. +- interpolation + - Specify the interpolation method for downsampling. Choose from `area, cubic, lanczos4`. The default is `area`. + +## Differences from Cloneofsimo's Repository + +As of December 25, 2022, this repository has expanded the application of LoRA to Text Encoder's MLP, U-Net's FFN, and Transformer's in/out projection, increasing its expressiveness. However, this has increased memory usage, making it barely fit within 8GB. + +The module replacement mechanisms are also completely different. + +## Future Extensions + +In addition to LoRA, other extensions can also be supported, and they will be added in the future. diff --git a/docs/train_ti_README-en.md b/docs/train_ti_README-en.md new file mode 100644 index 000000000..323660cfd --- /dev/null +++ b/docs/train_ti_README-en.md @@ -0,0 +1,106 @@ +## This is a ChatGPT-4 English adaptation of the original document by kohya-ss ([train_ti_README-ja.md](https://github.com/kohya-ss/sd-scripts/blob/main/docs/train_ti_README-ja.md)) + +This is an explanation about learning Textual Inversion (https://textual-inversion.github.io/). + +Please also refer to the [common documentation on learning](./train_README-en.md). + +The implementation was greatly inspired by https://github.com/huggingface/diffusers/tree/main/examples/textual_inversion. + +The learned model can be used directly in the Web UI. + +# Learning procedure + +Please refer to this repository's README beforehand and set up the environment. + +## Data preparation + +Refer to [Preparing Training Data](./train_README-en.md) for more information. + +## Executing the training + +Use `train_textual_inversion.py`. The following is an example of a command-line (DreamBooth method). 
```
accelerate launch --num_cpu_threads_per_process 1 train_textual_inversion.py
    --pretrained_model_name_or_path=<.ckpt or .safetensors or Diffusers version model directory>
    --dataset_config=<.toml file created during data preparation>
    --output_dir=<output folder for the trained model>
    --output_name=<filename for the trained model>
    --save_model_as=safetensors
    --prior_loss_weight=1.0
    --max_train_steps=10000
    --learning_rate=5e-6
    --optimizer_type="AdamW8bit"
    --xformers
    --mixed_precision="fp16"
    --cache_latents
    --gradient_checkpointing
    --token_string=mychar4 --init_word=cute --num_vectors_per_token=4
```

Specify the token string used during training with `--token_string`. __Make sure your training prompts include this string (e.g., if the token_string is mychar4, use "mychar4 1girl")__. This part of the prompt is replaced with new tokens for Textual Inversion and learned. For a DreamBooth class+identifier-style dataset, it is easiest and most reliable to use this token string as the class token.

You can check whether the token string is included in the prompts by using `--debug_dataset`. The token ids after replacement are displayed, so you can confirm that token ids of `49408` and above are present, as shown below.

```
input ids: tensor([[49406, 49408, 49409, 49410, 49411, 49412, 49413, 49414, 49415, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407]])
```

You cannot use a string that the tokenizer already knows as a word (a common word) as the token string.

Specify the source token string used to initialize the embeddings with `--init_word`. It is better to choose something close to the concept you want to learn. You cannot specify a string that consists of two or more tokens.

Specify how many tokens to use in this training with `--num_vectors_per_token`. The more tokens you use, the more expressive the result, but the more of the prompt's token budget is consumed. For example, if num_vectors_per_token=8, the specified token string consumes 8 tokens (out of the prompt's 77-token limit).

These are the main options for Textual Inversion. The rest is similar to the other training scripts.

Usually, it is better to specify `1` for `num_cpu_threads_per_process`.

Specify the base model for additional training with `pretrained_model_name_or_path`. You can specify a Stable Diffusion checkpoint file (.ckpt or .safetensors), a Diffusers model directory on your local disk, or a Diffusers model ID (e.g., "stabilityai/stable-diffusion-2").

Specify the folder in which to save the trained model with `output_dir`. Specify the model's filename without the extension in `output_name`. Specify saving the model in safetensors format with `save_model_as`.

Specify the `.toml` file in `dataset_config`. Set the batch size in the file to `1` initially to keep memory consumption low.

Set the number of training steps to 10000 with `max_train_steps`. Set the learning rate to 5e-6 with `learning_rate`.

To save memory, specify `mixed_precision="fp16"` (for RTX 30 series and later, you can also specify `bf16`; match the setting you made in accelerate when setting up the environment). Also, specify `gradient_checkpointing`.

To use the memory-efficient 8-bit AdamW optimizer, specify `optimizer_type="AdamW8bit"`.
+ +Specify the `xformers` option to use xformers' CrossAttention. If you have not installed xformers or if it causes errors (depending on the environment, such as when `mixed_precision="no"`), you can alternatively specify the `mem_eff_attn` option to use the memory-efficient CrossAttention (although it will be slower). + +If you have enough memory, edit the `.toml` file to increase the batch size to, for example, `8` (this may speed up and potentially improve accuracy). + +### Commonly used options + +Please refer to the documentation on options in the following cases: + +- Training a Stable Diffusion 2.x or derived model +- Training a model with a clip skip of 2 or more +- Training with captions exceeding 75 tokens + +### Batch size for Textual Inversion + +Compared to DreamBooth and fine-tuning, which train the entire model, Textual Inversion uses less memory, so you can set a larger batch size. + +# Other main options for Textual Inversion + +Please refer to another document for all options. + +* `--weights` + * Load pre-trained embeddings before training and learn further from them. +* `--use_object_template` + * Learn with a default object template string (e.g., "a photo of a {}") instead of captions. This will be the same as the official implementation. Captions will be ignored. +* `--use_style_template` + * Learn with a default style template string (e.g., "a painting in the style of {}") instead of captions. This will be the same as the official implementation. Captions will be ignored. + +## Generating images with the script in this repository + +Specify the learned embeddings file with the `--textual_inversion_embeddings` option in gen_img_diffusers.py (multiple files allowed). Use the filename (without the extension) of the embeddings file in the prompt, and the embeddings will be applied. 
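As a rough sketch, assuming the embeddings were saved as `mychar4.safetensors` (the model path, output folder, and prompt below are placeholders; see the script's `--help` for the full option list):

```
python gen_img_diffusers.py --ckpt model.safetensors --outdir outputs
    --textual_inversion_embeddings mychar4.safetensors
    --prompt "mychar4 1girl, masterpiece, best quality" --steps 30 --xformers --fp16
```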
From cf67f8470896959cd07b84a19d06f4ce583be7c0 Mon Sep 17 00:00:00 2001 From: shigabeev Date: Thu, 29 Aug 2024 19:00:20 +0000 Subject: [PATCH 2/3] add new scheduler --- gen_img_diffusers.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/gen_img_diffusers.py b/gen_img_diffusers.py index 2c40f1a06..63d5c1387 100644 --- a/gen_img_diffusers.py +++ b/gen_img_diffusers.py @@ -76,6 +76,7 @@ EulerAncestralDiscreteScheduler, DPMSolverMultistepScheduler, DPMSolverSinglestepScheduler, + DPMSolverSDEScheduler, LMSDiscreteScheduler, PNDMScheduler, DDIMScheduler, @@ -2323,7 +2324,18 @@ def main(args): scheduler_cls = KDPM2AncestralDiscreteScheduler scheduler_module = diffusers.schedulers.scheduling_k_dpm_2_ancestral_discrete scheduler_num_noises_per_step = 2 - + elif args.sampler == "dpmpp_sde": + scheduler_cls = DPMSolverSDEScheduler + sched_init_args["algorithm_type"] = "dpmsolver++" + sched_init_args["use_karras_sigmas"] = True + scheduler_module = diffusers.schedulers.scheduling_dpmsolver_sde + elif args.sampler == "dpmpp_2m_sde": + scheduler_cls = DPMSolverMultistepScheduler + sched_init_args["algorithm_type"] = "dpmsolver++" + sched_init_args["use_karras_sigmas"] = True + sched_init_args["solver_order"] = 2 + scheduler_module = diffusers.schedulers.scheduling_dpmsolver_multistep + if args.v_parameterization: sched_init_args["prediction_type"] = "v_prediction" @@ -3568,6 +3580,8 @@ def setup_parser() -> argparse.ArgumentParser: "k_euler_a", "k_dpm_2", "k_dpm_2_a", + "dpmpp_sde", + "dpmpp_2m_sde", ], help=f"sampler (scheduler) type / サンプラー(スケジューラ)の種類", ) From cb8e5e6a695d0be3f6a82538741a0d0e9870bed1 Mon Sep 17 00:00:00 2001 From: shigabeev Date: Mon, 9 Sep 2024 07:14:29 +0000 Subject: [PATCH 3/3] Add new samplers --- gen_img.py | 16 +++++++++++++++- gen_img_diffusers.py | 8 ++++---- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/gen_img.py b/gen_img.py index 4fe898716..29d0f328f 100644 --- a/gen_img.py +++ b/gen_img.py @@ -31,6 +31,7 @@ EulerAncestralDiscreteScheduler, DPMSolverMultistepScheduler, DPMSolverSinglestepScheduler, + DPMSolverSDEScheduler, LMSDiscreteScheduler, PNDMScheduler, DDIMScheduler, @@ -1628,7 +1629,18 @@ def main(args): scheduler_module = diffusers.schedulers.scheduling_k_dpm_2_ancestral_discrete scheduler_num_noises_per_step = 2 has_clip_sample = False - + elif args.sampler == "k_dpm++_sde": + scheduler_cls = DPMSolverSDEScheduler + sched_init_args["algorithm_type"] = "dpmsolver++" + sched_init_args["use_karras_sigmas"] = True + scheduler_module = diffusers.schedulers.scheduling_dpmsolver_sde + elif args.sampler == "k_dpm++_2m_sde": + scheduler_cls = DPMSolverMultistepScheduler + sched_init_args["algorithm_type"] = "dpmsolver++" + sched_init_args["use_karras_sigmas"] = True + sched_init_args["solver_order"] = 2 + scheduler_module = diffusers.schedulers.scheduling_dpmsolver_multistep + if args.v_parameterization: sched_init_args["prediction_type"] = "v_prediction" @@ -3041,6 +3053,8 @@ def setup_parser() -> argparse.ArgumentParser: "k_euler_a", "k_dpm_2", "k_dpm_2_a", + "k_dpm++_sde", + "k_dpm++_2m_sde" ], help=f"sampler (scheduler) type / サンプラー(スケジューラ)の種類", ) diff --git a/gen_img_diffusers.py b/gen_img_diffusers.py index 63d5c1387..9d91d1b70 100644 --- a/gen_img_diffusers.py +++ b/gen_img_diffusers.py @@ -2324,12 +2324,12 @@ def main(args): scheduler_cls = KDPM2AncestralDiscreteScheduler scheduler_module = diffusers.schedulers.scheduling_k_dpm_2_ancestral_discrete scheduler_num_noises_per_step = 2 - elif args.sampler == 
"dpmpp_sde": + elif args.sampler == "k_dpm++_sde": scheduler_cls = DPMSolverSDEScheduler sched_init_args["algorithm_type"] = "dpmsolver++" sched_init_args["use_karras_sigmas"] = True scheduler_module = diffusers.schedulers.scheduling_dpmsolver_sde - elif args.sampler == "dpmpp_2m_sde": + elif args.sampler == "k_dpm++_2m_sde": scheduler_cls = DPMSolverMultistepScheduler sched_init_args["algorithm_type"] = "dpmsolver++" sched_init_args["use_karras_sigmas"] = True @@ -3580,8 +3580,8 @@ def setup_parser() -> argparse.ArgumentParser: "k_euler_a", "k_dpm_2", "k_dpm_2_a", - "dpmpp_sde", - "dpmpp_2m_sde", + "k_dpm++_sde", + "k_dpm++_2m_sde" ], help=f"sampler (scheduler) type / サンプラー(スケジューラ)の種類", )