From cc21474d94f301dd1f9f8ccdc43319628e33f238 Mon Sep 17 00:00:00 2001
From: lawrence-cj <cjs1020440147@icloud.com>
Date: Wed, 4 Dec 2024 16:22:48 +0800
Subject: [PATCH 1/4] Fix bug where training stops after about 4 hours.

Signed-off-by: lawrence-cj <cjs1020440147@icloud.com>
---
 diffusion/utils/config.py | 1 +
 train_scripts/train.py    | 7 +++++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/diffusion/utils/config.py b/diffusion/utils/config.py
index 209a076..37dc2f6 100644
--- a/diffusion/utils/config.py
+++ b/diffusion/utils/config.py
@@ -141,6 +141,7 @@ class TrainingConfig(BaseConfig):
     load_mask_index: bool = False
     snr_loss: bool = False
     real_prompt_ratio: float = 1.0
+    training_hours: float = 10000.0
     save_image_epochs: int = 1
     save_model_epochs: int = 1
     save_model_steps: int = 1000000
diff --git a/train_scripts/train.py b/train_scripts/train.py
index 2f6e391..72a3ec0 100755
--- a/train_scripts/train.py
+++ b/train_scripts/train.py
@@ -448,7 +448,10 @@ def train(config, args, accelerator, model, optimizer, lr_scheduler, train_datal
 
             if loss_nan_timer > 20:
                 raise ValueError("Loss is NaN too much times. Break here.")
-            if global_step % config.train.save_model_steps == 0 or (time.time() - training_start_time) / 3600 > 3.8:
+            if (
+                global_step % config.train.save_model_steps == 0
+                or (time.time() - training_start_time) / 3600 > config.train.training_hours
+            ):
                 accelerator.wait_for_everyone()
                 if accelerator.is_main_process:
                     os.umask(0o000)
@@ -469,7 +472,7 @@ def train(config, args, accelerator, model, optimizer, lr_scheduler, train_datal
                             f.write(osp.join(config.work_dir, "config.py") + "\n")
                             f.write(ckpt_saved_path)
 
-                if (time.time() - training_start_time) / 3600 > 3.8:
+                if (time.time() - training_start_time) / 3600 > config.train.training_hours:
                     logger.info(f"Stopping training at epoch {epoch}, step {global_step} due to time limit.")
                     return
             if config.train.visualize and (global_step % config.train.eval_sampling_steps == 0 or (step + 1) == 1):

From 7b28bf3b28f98fa393199d6b0170b90715c93217 Mon Sep 17 00:00:00 2001
From: lawrence-cj <cjs1020440147@icloud.com>
Date: Fri, 6 Dec 2024 01:43:21 +0800
Subject: [PATCH 2/4] add comfyui md

Signed-off-by: lawrence-cj <cjs1020440147@icloud.com>
---
 asset/docs/ComfyUI/comfyui.md | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 asset/docs/ComfyUI/comfyui.md

diff --git a/asset/docs/ComfyUI/comfyui.md b/asset/docs/ComfyUI/comfyui.md
new file mode 100644
index 0000000..e69de29

From 666b3d7b7a0e5a42d14c55c590c438034324f5ec Mon Sep 17 00:00:00 2001
From: lawrence-cj <cjs1020440147@icloud.com>
Date: Mon, 9 Dec 2024 21:43:42 +0800
Subject: [PATCH 3/4] 1. update README.md; 2. add workflow JSON; 3. add ComfyUI
 readme; 4. fix the bug of 600M online ckpt path; 5. add Sana ComfyUI
 workflow (Sana+CFG+FlowEuler)

Signed-off-by: lawrence-cj <cjs1020440147@icloud.com>
---
 README.md                              |   17 +-
 asset/docs/ComfyUI/Sana_CogVideoX.json | 1142 ++++++++++++++++++++++++
 asset/docs/ComfyUI/Sana_FlowEuler.json |  556 ++++++++++++
 asset/docs/ComfyUI/comfyui.md          |   33 +
 tests/bash/test_inference.sh           |    2 +-
 5 files changed, 1743 insertions(+), 7 deletions(-)
 create mode 100644 asset/docs/ComfyUI/Sana_CogVideoX.json
 create mode 100644 asset/docs/ComfyUI/Sana_FlowEuler.json

diff --git a/README.md b/README.md
index 516965a..3b89334 100644
--- a/README.md
+++ b/README.md
@@ -36,9 +36,10 @@ As a result, Sana-0.6B is very competitive with modern giant diffusion model (e.
 
 ## 🔥🔥 News
 
-- (🔥 New) \[2024/11/30\] All multi-linguistic (Emoji & Chinese & English) SFT models are released: [1.6B-512px](https://huggingface.co/Efficient-Large-Model/Sana_1600M_512px_MultiLing), [1.6B-1024px](https://huggingface.co/Efficient-Large-Model/Sana_1600M_1024px_MultiLing), [600M-512px](https://huggingface.co/Efficient-Large-Model/Sana_600M_512px), [600M-1024px](https://huggingface.co/Efficient-Large-Model/Sana_600M_1024px). The metric performance is shown [here](#performance)
-- (🔥 New) \[2024/11/27\] Sana Replicate API is launching at [Sana-API](https://replicate.com/chenxwh/sana).
-- (🔥 New) \[2024/11/27\] Sana code-base license changed to Apache 2.0.
+- (🔥 New) \[2024/12/9\] We release the [ComfyUI node](https://github.com/Efficient-Large-Model/ComfyUI_ExtraModels) for Sana. [\[Guidance\]](asset/docs/ComfyUI/comfyui.md)
+- (🔥 New) \[2024/11\] All multilingual (Emoji & Chinese & English) SFT models are released: [1.6B-512px](https://huggingface.co/Efficient-Large-Model/Sana_1600M_512px_MultiLing), [1.6B-1024px](https://huggingface.co/Efficient-Large-Model/Sana_1600M_1024px_MultiLing), [600M-512px](https://huggingface.co/Efficient-Large-Model/Sana_600M_512px), [600M-1024px](https://huggingface.co/Efficient-Large-Model/Sana_600M_1024px). Performance metrics are shown [here](#performance).
+- (🔥 New) \[2024/11\] Sana Replicate API is launching at [Sana-API](https://replicate.com/chenxwh/sana).
+- (🔥 New) \[2024/11\] Sana code-base license changed to Apache 2.0.
 - (🔥 New) \[2024/11\] 1.6B [Sana models](https://huggingface.co/collections/Efficient-Large-Model/sana-673efba2a57ed99843f11f9e) are released.
 - (🔥 New) \[2024/11\] Training & Inference & Metrics code are released.
 - (🔥 New) \[2024/11\] Working on [`diffusers`](https://github.com/huggingface/diffusers/pull/9982).
@@ -235,13 +236,17 @@ We will try our best to release
 - \[x\] Training code
 - \[x\] Inference code
 - \[+\] Model zoo
-- \[ \] working on Diffusers(https://github.com/huggingface/diffusers/pull/9982)
-- \[ \] ComfyUI
+- \[x\] ComfyUI
+- \[x\] DC-AE Diffusers
+- \[ \] Sana in [Diffusers](https://github.com/huggingface/diffusers/pull/9982) (work in progress)
 - \[ \] Laptop development
 
 # 🤗Acknowledgements
 
-- Thanks to [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha), [PixArt-Σ](https://github.com/PixArt-alpha/PixArt-sigma) and [Efficient-ViT](https://github.com/mit-han-lab/efficientvit) for their wonderful work and codebase!
+- Thanks to [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha), [PixArt-Σ](https://github.com/PixArt-alpha/PixArt-sigma),
+[Efficient-ViT](https://github.com/mit-han-lab/efficientvit) and
+[ComfyUI_ExtraModels](https://github.com/city96/ComfyUI_ExtraModels)
+for their wonderful work and codebase!
 
 # 📖BibTeX
 
diff --git a/asset/docs/ComfyUI/Sana_CogVideoX.json b/asset/docs/ComfyUI/Sana_CogVideoX.json
new file mode 100644
index 0000000..77585ab
--- /dev/null
+++ b/asset/docs/ComfyUI/Sana_CogVideoX.json
@@ -0,0 +1,1142 @@
+{
+  "last_node_id": 37,
+  "last_link_id": 48,
+  "nodes": [
+    {
+      "id": 5,
+      "type": "GemmaLoader",
+      "pos": [
+        283.376953125,
+        603.7484741210938
+      ],
+      "size": [
+        315,
+        106
+      ],
+      "flags": {},
+      "order": 0,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {
+          "name": "GEMMA",
+          "type": "GEMMA",
+          "links": [
+            9,
+            11
+          ],
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "GemmaLoader"
+      },
+      "widgets_values": [
+        "google/gemma-2-2b-it",
+        "cuda",
+        "BF16"
+      ]
+    },
+    {
+      "id": 12,
+      "type": "SanaTextEncode",
+      "pos": [
+        670.9176635742188,
+        797.39501953125
+      ],
+      "size": [
+        400,
+        200
+      ],
+      "flags": {},
+      "order": 7,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "GEMMA",
+          "type": "GEMMA",
+          "link": 11
+        }
+      ],
+      "outputs": [
+        {
+          "name": "CONDITIONING",
+          "type": "CONDITIONING",
+          "links": [
+            3
+          ],
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "SanaTextEncode"
+      },
+      "widgets_values": [
+        "\"\""
+      ]
+    },
+    {
+      "id": 4,
+      "type": "SanaResolutionSelect",
+      "pos": [
+        300.2852783203125,
+        392.79766845703125
+      ],
+      "size": [
+        315,
+        102
+      ],
+      "flags": {},
+      "order": 1,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {
+          "name": "width",
+          "type": "INT",
+          "links": [
+            7
+          ],
+          "slot_index": 0
+        },
+        {
+          "name": "height",
+          "type": "INT",
+          "links": [
+            8
+          ],
+          "slot_index": 1
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "SanaResolutionSelect"
+      },
+      "widgets_values": [
+        "1024px",
+        "1.46"
+      ]
+    },
+    {
+      "id": 7,
+      "type": "SanaTextEncode",
+      "pos": [
+        674.2115478515625,
+        504.2879638671875
+      ],
+      "size": [
+        400,
+        200
+      ],
+      "flags": {},
+      "order": 6,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "GEMMA",
+          "type": "GEMMA",
+          "link": 9
+        }
+      ],
+      "outputs": [
+        {
+          "name": "CONDITIONING",
+          "type": "CONDITIONING",
+          "links": [
+            2
+          ],
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "SanaTextEncode"
+      },
+      "widgets_values": [
+        "A cyberpunk cat with a neon sign that says 'Sana'."
+      ]
+    },
+    {
+      "id": 24,
+      "type": "PreviewImage",
+      "pos": [
+        1443.0323486328125,
+        352.056396484375
+      ],
+      "size": [
+        210,
+        246
+      ],
+      "flags": {},
+      "order": 13,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "images",
+          "type": "IMAGE",
+          "link": 47
+        }
+      ],
+      "outputs": [],
+      "properties": {
+        "Node name for S&R": "PreviewImage"
+      },
+      "widgets_values": []
+    },
+    {
+      "id": 25,
+      "type": "VHS_VideoCombine",
+      "pos": [
+        2825.935546875,
+        -102.76895904541016
+      ],
+      "size": [
+        767.7372436523438,
+        310
+      ],
+      "flags": {},
+      "order": 18,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "images",
+          "type": "IMAGE",
+          "link": 30
+        },
+        {
+          "name": "audio",
+          "type": "AUDIO",
+          "link": null,
+          "shape": 7
+        },
+        {
+          "name": "meta_batch",
+          "type": "VHS_BatchManager",
+          "link": null,
+          "shape": 7
+        },
+        {
+          "name": "vae",
+          "type": "VAE",
+          "link": null,
+          "shape": 7
+        }
+      ],
+      "outputs": [
+        {
+          "name": "Filenames",
+          "type": "VHS_FILENAMES",
+          "links": null,
+          "shape": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "VHS_VideoCombine"
+      },
+      "widgets_values": {
+        "frame_rate": 8,
+        "loop_count": 0,
+        "filename_prefix": "CogVideoX_Fun",
+        "format": "video/h264-mp4",
+        "pix_fmt": "yuv420p",
+        "crf": 19,
+        "save_metadata": true,
+        "pingpong": false,
+        "save_output": true,
+        "videopreview": {
+          "hidden": false,
+          "paused": false,
+          "params": {
+            "filename": "CogVideoX_Fun_00005.mp4",
+            "subfolder": "",
+            "type": "output",
+            "format": "video/h264-mp4",
+            "frame_rate": 8
+          },
+          "muted": false
+        }
+      }
+    },
+    {
+      "id": 27,
+      "type": "CogVideoTextEncode",
+      "pos": [
+        1713.936279296875,
+        174.2305450439453
+      ],
+      "size": [
+        471.90142822265625,
+        168.08047485351562
+      ],
+      "flags": {},
+      "order": 9,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "clip",
+          "type": "CLIP",
+          "link": 35
+        }
+      ],
+      "outputs": [
+        {
+          "name": "conditioning",
+          "type": "CONDITIONING",
+          "links": [
+            32
+          ],
+          "slot_index": 0,
+          "shape": 3
+        },
+        {
+          "name": "clip",
+          "type": "CLIP",
+          "links": [
+            36
+          ],
+          "slot_index": 1
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "CogVideoTextEncode"
+      },
+      "widgets_values": [
+        "fireworks display over night city. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.",
+        1,
+        false
+      ]
+    },
+    {
+      "id": 28,
+      "type": "CogVideoTextEncode",
+      "pos": [
+        1720.936279296875,
+        393.230712890625
+      ],
+      "size": [
+        463.01251220703125,
+        144
+      ],
+      "flags": {},
+      "order": 11,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "clip",
+          "type": "CLIP",
+          "link": 36
+        }
+      ],
+      "outputs": [
+        {
+          "name": "conditioning",
+          "type": "CONDITIONING",
+          "links": [
+            33
+          ],
+          "slot_index": 0,
+          "shape": 3
+        },
+        {
+          "name": "clip",
+          "type": "CLIP",
+          "links": null
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "CogVideoTextEncode"
+      },
+      "widgets_values": [
+        "The video is not of a high quality, it has a low resolution. Watermark present in each frame. Strange motion trajectory. ",
+        1,
+        true
+      ]
+    },
+    {
+      "id": 30,
+      "type": "CogVideoImageEncodeFunInP",
+      "pos": [
+        2088.93603515625,
+        595.230712890625
+      ],
+      "size": [
+        253.60000610351562,
+        146
+      ],
+      "flags": {},
+      "order": 15,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "vae",
+          "type": "VAE",
+          "link": 37
+        },
+        {
+          "name": "start_image",
+          "type": "IMAGE",
+          "link": 38
+        },
+        {
+          "name": "end_image",
+          "type": "IMAGE",
+          "link": null,
+          "shape": 7
+        }
+      ],
+      "outputs": [
+        {
+          "name": "image_cond_latents",
+          "type": "LATENT",
+          "links": [
+            34
+          ],
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "CogVideoImageEncodeFunInP"
+      },
+      "widgets_values": [
+        49,
+        true,
+        0
+      ]
+    },
+    {
+      "id": 33,
+      "type": "CogVideoDecode",
+      "pos": [
+        2442.93603515625,
+        -105.76895904541016
+      ],
+      "size": [
+        315,
+        198
+      ],
+      "flags": {},
+      "order": 17,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "vae",
+          "type": "VAE",
+          "link": 40
+        },
+        {
+          "name": "samples",
+          "type": "LATENT",
+          "link": 41
+        }
+      ],
+      "outputs": [
+        {
+          "name": "images",
+          "type": "IMAGE",
+          "links": [
+            30
+          ]
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "CogVideoDecode"
+      },
+      "widgets_values": [
+        true,
+        240,
+        360,
+        0.2,
+        0.2,
+        true
+      ]
+    },
+    {
+      "id": 34,
+      "type": "DownloadAndLoadCogVideoModel",
+      "pos": [
+        1714.936279296875,
+        -138.76895141601562
+      ],
+      "size": [
+        362.1656799316406,
+        218
+      ],
+      "flags": {},
+      "order": 2,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "block_edit",
+          "type": "TRANSFORMERBLOCKS",
+          "link": null,
+          "shape": 7
+        },
+        {
+          "name": "lora",
+          "type": "COGLORA",
+          "link": null,
+          "shape": 7
+        },
+        {
+          "name": "compile_args",
+          "type": "COMPILEARGS",
+          "link": null,
+          "shape": 7
+        }
+      ],
+      "outputs": [
+        {
+          "name": "model",
+          "type": "COGVIDEOMODEL",
+          "links": [
+            31
+          ]
+        },
+        {
+          "name": "vae",
+          "type": "VAE",
+          "links": [
+            37,
+            40
+          ],
+          "slot_index": 1
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "DownloadAndLoadCogVideoModel"
+      },
+      "widgets_values": [
+        "alibaba-pai/CogVideoX-Fun-V1.1-5b-InP",
+        "bf16",
+        "disabled",
+        false,
+        "sdpa",
+        "main_device"
+      ]
+    },
+    {
+      "id": 31,
+      "type": "ImageResizeKJ",
+      "pos": [
+        1722.936279296875,
+        615.230712890625
+      ],
+      "size": [
+        315,
+        266
+      ],
+      "flags": {},
+      "order": 14,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "image",
+          "type": "IMAGE",
+          "link": 48
+        },
+        {
+          "name": "get_image_size",
+          "type": "IMAGE",
+          "link": null,
+          "shape": 7
+        },
+        {
+          "name": "width_input",
+          "type": "INT",
+          "link": null,
+          "widget": {
+            "name": "width_input"
+          },
+          "shape": 7
+        },
+        {
+          "name": "height_input",
+          "type": "INT",
+          "link": null,
+          "widget": {
+            "name": "height_input"
+          },
+          "shape": 7
+        }
+      ],
+      "outputs": [
+        {
+          "name": "IMAGE",
+          "type": "IMAGE",
+          "links": [
+            38
+          ],
+          "slot_index": 0,
+          "shape": 3
+        },
+        {
+          "name": "width",
+          "type": "INT",
+          "links": null,
+          "shape": 3
+        },
+        {
+          "name": "height",
+          "type": "INT",
+          "links": null,
+          "shape": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "ImageResizeKJ"
+      },
+      "widgets_values": [
+        720,
+        480,
+        "lanczos",
+        false,
+        2,
+        0,
+        0,
+        "disabled"
+      ]
+    },
+    {
+      "id": 29,
+      "type": "CLIPLoader",
+      "pos": [
+        1216.935791015625,
+        -8.769308090209961
+      ],
+      "size": [
+        451.30548095703125,
+        82
+      ],
+      "flags": {},
+      "order": 3,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {
+          "name": "CLIP",
+          "type": "CLIP",
+          "links": [
+            35
+          ],
+          "slot_index": 0,
+          "shape": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "CLIPLoader"
+      },
+      "widgets_values": [
+        "text_encoders/t5xxl_fp16.safetensors",
+        "sd3"
+      ]
+    },
+    {
+      "id": 26,
+      "type": "CogVideoSampler",
+      "pos": [
+        2423.935791015625,
+        152.23048400878906
+      ],
+      "size": [
+        330,
+        574
+      ],
+      "flags": {},
+      "order": 16,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "model",
+          "type": "COGVIDEOMODEL",
+          "link": 31
+        },
+        {
+          "name": "positive",
+          "type": "CONDITIONING",
+          "link": 32
+        },
+        {
+          "name": "negative",
+          "type": "CONDITIONING",
+          "link": 33
+        },
+        {
+          "name": "samples",
+          "type": "LATENT",
+          "link": null,
+          "shape": 7
+        },
+        {
+          "name": "image_cond_latents",
+          "type": "LATENT",
+          "link": 34,
+          "shape": 7
+        },
+        {
+          "name": "context_options",
+          "type": "COGCONTEXT",
+          "link": null,
+          "shape": 7
+        },
+        {
+          "name": "controlnet",
+          "type": "COGVIDECONTROLNET",
+          "link": null,
+          "shape": 7
+        },
+        {
+          "name": "tora_trajectory",
+          "type": "TORAFEATURES",
+          "link": null,
+          "shape": 7
+        },
+        {
+          "name": "fastercache",
+          "type": "FASTERCACHEARGS",
+          "link": null,
+          "shape": 7
+        }
+      ],
+      "outputs": [
+        {
+          "name": "samples",
+          "type": "LATENT",
+          "links": [
+            41
+          ],
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "CogVideoSampler"
+      },
+      "widgets_values": [
+        49,
+        25,
+        6,
+        1123398248636718,
+        "randomize",
+        "CogVideoXDDIM",
+        1
+      ]
+    },
+    {
+      "id": 35,
+      "type": "SanaCheckpointLoader",
+      "pos": [
+        286.5307922363281,
+        235.45753479003906
+      ],
+      "size": [
+        315,
+        82
+      ],
+      "flags": {},
+      "order": 4,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {
+          "name": "model",
+          "type": "MODEL",
+          "links": [
+            43
+          ],
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "SanaCheckpointLoader"
+      },
+      "widgets_values": [
+        "Efficient-Large-Model/Sana_1600M_1024px_MultiLing",
+        "SanaMS_1600M_P1_D20"
+      ]
+    },
+    {
+      "id": 37,
+      "type": "ExtraVAELoader",
+      "pos": [
+        1070.8033447265625,
+        747.4982299804688
+      ],
+      "size": [
+        315,
+        106
+      ],
+      "flags": {},
+      "order": 5,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {
+          "name": "VAE",
+          "type": "VAE",
+          "links": [
+            46
+          ],
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "ExtraVAELoader"
+      },
+      "widgets_values": [
+        "mit-han-lab/dc-ae-f32c32-sana-1.0-diffusers",
+        "dcae-f32c32-sana-1.0-diffusers",
+        "BF16"
+      ]
+    },
+    {
+      "id": 1,
+      "type": "KSampler",
+      "pos": [
+        1101.390625,
+        196.0309600830078
+      ],
+      "size": [
+        300,
+        480
+      ],
+      "flags": {},
+      "order": 10,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "model",
+          "type": "MODEL",
+          "link": 43
+        },
+        {
+          "name": "positive",
+          "type": "CONDITIONING",
+          "link": 2
+        },
+        {
+          "name": "negative",
+          "type": "CONDITIONING",
+          "link": 3
+        },
+        {
+          "name": "latent_image",
+          "type": "LATENT",
+          "link": 4
+        }
+      ],
+      "outputs": [
+        {
+          "name": "LATENT",
+          "type": "LATENT",
+          "links": [
+            5
+          ],
+          "slot_index": 0,
+          "shape": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "KSampler"
+      },
+      "widgets_values": [
+        869595936769725,
+        "randomize",
+        28,
+        5,
+        "euler",
+        "normal",
+        1
+      ]
+    },
+    {
+      "id": 6,
+      "type": "EmptyDCAELatentImage",
+      "pos": [
+        723.0592041015625,
+        317.112548828125
+      ],
+      "size": [
+        315,
+        106
+      ],
+      "flags": {},
+      "order": 8,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "width",
+          "type": "INT",
+          "link": 7,
+          "widget": {
+            "name": "width"
+          }
+        },
+        {
+          "name": "height",
+          "type": "INT",
+          "link": 8,
+          "widget": {
+            "name": "height"
+          }
+        }
+      ],
+      "outputs": [
+        {
+          "name": "LATENT",
+          "type": "LATENT",
+          "links": [
+            4
+          ],
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "EmptyDCAELatentImage"
+      },
+      "widgets_values": [
+        512,
+        512,
+        1
+      ]
+    },
+    {
+      "id": 2,
+      "type": "VAEDecode",
+      "pos": [
+        1452.4869384765625,
+        217.9922637939453
+      ],
+      "size": [
+        200,
+        50
+      ],
+      "flags": {},
+      "order": 12,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "samples",
+          "type": "LATENT",
+          "link": 5
+        },
+        {
+          "name": "vae",
+          "type": "VAE",
+          "link": 46
+        }
+      ],
+      "outputs": [
+        {
+          "name": "IMAGE",
+          "type": "IMAGE",
+          "links": [
+            47,
+            48
+          ],
+          "slot_index": 0,
+          "shape": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "VAEDecode"
+      },
+      "widgets_values": []
+    }
+  ],
+  "links": [
+    [
+      2,
+      7,
+      0,
+      1,
+      1,
+      "CONDITIONING"
+    ],
+    [
+      3,
+      12,
+      0,
+      1,
+      2,
+      "CONDITIONING"
+    ],
+    [
+      4,
+      6,
+      0,
+      1,
+      3,
+      "LATENT"
+    ],
+    [
+      5,
+      1,
+      0,
+      2,
+      0,
+      "LATENT"
+    ],
+    [
+      7,
+      4,
+      0,
+      6,
+      0,
+      "INT"
+    ],
+    [
+      8,
+      4,
+      1,
+      6,
+      1,
+      "INT"
+    ],
+    [
+      9,
+      5,
+      0,
+      7,
+      0,
+      "GEMMA"
+    ],
+    [
+      11,
+      5,
+      0,
+      12,
+      0,
+      "GEMMA"
+    ],
+    [
+      30,
+      33,
+      0,
+      25,
+      0,
+      "IMAGE"
+    ],
+    [
+      31,
+      34,
+      0,
+      26,
+      0,
+      "COGVIDEOMODEL"
+    ],
+    [
+      32,
+      27,
+      0,
+      26,
+      1,
+      "CONDITIONING"
+    ],
+    [
+      33,
+      28,
+      0,
+      26,
+      2,
+      "CONDITIONING"
+    ],
+    [
+      34,
+      30,
+      0,
+      26,
+      4,
+      "LATENT"
+    ],
+    [
+      35,
+      29,
+      0,
+      27,
+      0,
+      "CLIP"
+    ],
+    [
+      36,
+      27,
+      1,
+      28,
+      0,
+      "CLIP"
+    ],
+    [
+      37,
+      34,
+      1,
+      30,
+      0,
+      "VAE"
+    ],
+    [
+      38,
+      31,
+      0,
+      30,
+      1,
+      "IMAGE"
+    ],
+    [
+      40,
+      34,
+      1,
+      33,
+      0,
+      "VAE"
+    ],
+    [
+      41,
+      26,
+      0,
+      33,
+      1,
+      "LATENT"
+    ],
+    [
+      43,
+      35,
+      0,
+      1,
+      0,
+      "MODEL"
+    ],
+    [
+      46,
+      37,
+      0,
+      2,
+      1,
+      "VAE"
+    ],
+    [
+      47,
+      2,
+      0,
+      24,
+      0,
+      "IMAGE"
+    ],
+    [
+      48,
+      2,
+      0,
+      31,
+      0,
+      "IMAGE"
+    ]
+  ],
+  "groups": [],
+  "config": {},
+  "extra": {
+    "ds": {
+      "scale": 0.5644739300537776,
+      "offset": [
+        515.970442108866,
+        435.7565370847522
+      ]
+    },
+    "groupNodes": {}
+  },
+  "version": 0.4
+}
\ No newline at end of file
diff --git a/asset/docs/ComfyUI/Sana_FlowEuler.json b/asset/docs/ComfyUI/Sana_FlowEuler.json
new file mode 100644
index 0000000..0e858a7
--- /dev/null
+++ b/asset/docs/ComfyUI/Sana_FlowEuler.json
@@ -0,0 +1,556 @@
+{
+  "last_node_id": 29,
+  "last_link_id": 24,
+  "nodes": [
+    {
+      "id": 6,
+      "type": "SanaResolutionSelect",
+      "pos": [
+        -364.7435607910156,
+        301.5954284667969
+      ],
+      "size": [
+        315,
+        102
+      ],
+      "flags": {},
+      "order": 0,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {
+          "name": "width",
+          "type": "INT",
+          "links": [
+            1
+          ],
+          "slot_index": 0
+        },
+        {
+          "name": "height",
+          "type": "INT",
+          "links": [
+            2
+          ],
+          "slot_index": 1
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "SanaResolutionSelect"
+      },
+      "widgets_values": [
+        "1024px",
+        "1.00"
+      ]
+    },
+    {
+      "id": 3,
+      "type": "EmptyDCAELatentImage",
+      "pos": [
+        57.19669723510742,
+        202.29898071289062
+      ],
+      "size": [
+        315,
+        106
+      ],
+      "flags": {},
+      "order": 5,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "width",
+          "type": "INT",
+          "link": 1,
+          "widget": {
+            "name": "width"
+          }
+        },
+        {
+          "name": "height",
+          "type": "INT",
+          "link": 2,
+          "widget": {
+            "name": "height"
+          }
+        }
+      ],
+      "outputs": [
+        {
+          "name": "LATENT",
+          "type": "LATENT",
+          "links": [
+            18
+          ],
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "EmptyDCAELatentImage"
+      },
+      "widgets_values": [
+        512,
+        512,
+        1
+      ]
+    },
+    {
+      "id": 8,
+      "type": "SanaTextEncode",
+      "pos": [
+        5.8887786865234375,
+        706.19287109375
+      ],
+      "size": [
+        400,
+        200
+      ],
+      "flags": {},
+      "order": 6,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "GEMMA",
+          "type": "GEMMA",
+          "link": 12
+        }
+      ],
+      "outputs": [
+        {
+          "name": "CONDITIONING",
+          "type": "CONDITIONING",
+          "links": [
+            17
+          ],
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "SanaTextEncode"
+      },
+      "widgets_values": [
+        "\"\""
+      ]
+    },
+    {
+      "id": 13,
+      "type": "PreviewImage",
+      "pos": [
+        802.6994018554688,
+        217.20889282226562
+      ],
+      "size": [
+        605.93505859375,
+        665.570068359375
+      ],
+      "flags": {},
+      "order": 10,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "images",
+          "type": "IMAGE",
+          "link": 11
+        }
+      ],
+      "outputs": [],
+      "properties": {
+        "Node name for S&R": "PreviewImage"
+      },
+      "widgets_values": []
+    },
+    {
+      "id": 9,
+      "type": "GemmaLoader",
+      "pos": [
+        -381.6518859863281,
+        512.5463256835938
+      ],
+      "size": [
+        315,
+        106
+      ],
+      "flags": {},
+      "order": 1,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {
+          "name": "GEMMA",
+          "type": "GEMMA",
+          "links": [
+            12,
+            13
+          ],
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "GemmaLoader"
+      },
+      "widgets_values": [
+        "google/gemma-2-2b-it",
+        "cuda",
+        "BF16"
+      ]
+    },
+    {
+      "id": 15,
+      "type": "EmptyDCAELatentImage",
+      "pos": [
+        62.19669723510742,
+        207.29898071289062
+      ],
+      "size": [
+        315,
+        106
+      ],
+      "flags": {},
+      "order": 2,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "width",
+          "type": "INT",
+          "link": null,
+          "widget": {
+            "name": "width"
+          }
+        },
+        {
+          "name": "height",
+          "type": "INT",
+          "link": null,
+          "widget": {
+            "name": "height"
+          }
+        }
+      ],
+      "outputs": [
+        {
+          "name": "LATENT",
+          "type": "LATENT",
+          "links": [],
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "EmptyDCAELatentImage"
+      },
+      "widgets_values": [
+        512,
+        512,
+        1
+      ]
+    },
+    {
+      "id": 4,
+      "type": "VAEDecode",
+      "pos": [
+        776.332763671875,
+        105.08650970458984
+      ],
+      "size": [
+        200,
+        50
+      ],
+      "flags": {},
+      "order": 9,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "samples",
+          "type": "LATENT",
+          "link": 3
+        },
+        {
+          "name": "vae",
+          "type": "VAE",
+          "link": 24
+        }
+      ],
+      "outputs": [
+        {
+          "name": "IMAGE",
+          "type": "IMAGE",
+          "links": [
+            11
+          ],
+          "slot_index": 0,
+          "shape": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "VAEDecode"
+      },
+      "widgets_values": []
+    },
+    {
+      "id": 29,
+      "type": "ExtraVAELoader",
+      "pos": [
+        460.67730712890625,
+        695.5695190429688
+      ],
+      "size": [
+        315,
+        106
+      ],
+      "flags": {},
+      "order": 3,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {
+          "name": "VAE",
+          "type": "VAE",
+          "links": [
+            24
+          ],
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "ExtraVAELoader"
+      },
+      "widgets_values": [
+        "mit-han-lab/dc-ae-f32c32-sana-1.0-diffusers",
+        "dcae-f32c32-sana-1.0-diffusers",
+        "BF16"
+      ]
+    },
+    {
+      "id": 25,
+      "type": "SanaCheckpointLoader",
+      "pos": [
+        -334.1000671386719,
+        133.43394470214844
+      ],
+      "size": [
+        315,
+        82
+      ],
+      "flags": {},
+      "order": 4,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {
+          "name": "model",
+          "type": "MODEL",
+          "links": [
+            20
+          ],
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "SanaCheckpointLoader"
+      },
+      "widgets_values": [
+        "Efficient-Large-Model/Sana_1600M_1024px_MultiLing",
+        "SanaMS_1600M_P1_D20"
+      ]
+    },
+    {
+      "id": 14,
+      "type": "SanaTextEncode",
+      "pos": [
+        9.182632446289062,
+        413.08575439453125
+      ],
+      "size": [
+        400,
+        200
+      ],
+      "flags": {},
+      "order": 7,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "GEMMA",
+          "type": "GEMMA",
+          "link": 13
+        }
+      ],
+      "outputs": [
+        {
+          "name": "CONDITIONING",
+          "type": "CONDITIONING",
+          "links": [
+            16
+          ],
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "SanaTextEncode"
+      },
+      "widgets_values": [
+        "make me a logo that says \"So Fast\"  with a really cool flying dragon shape with lightning sparks all over the sides and all of it contains Indonesian language"
+      ]
+    },
+    {
+      "id": 10,
+      "type": "KSampler",
+      "pos": [
+        429.7785339355469,
+        99.45759582519531
+      ],
+      "size": [
+        300,
+        480
+      ],
+      "flags": {},
+      "order": 8,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "model",
+          "type": "MODEL",
+          "link": 20
+        },
+        {
+          "name": "positive",
+          "type": "CONDITIONING",
+          "link": 16
+        },
+        {
+          "name": "negative",
+          "type": "CONDITIONING",
+          "link": 17
+        },
+        {
+          "name": "latent_image",
+          "type": "LATENT",
+          "link": 18
+        }
+      ],
+      "outputs": [
+        {
+          "name": "LATENT",
+          "type": "LATENT",
+          "links": [
+            3
+          ],
+          "slot_index": 0,
+          "shape": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "KSampler"
+      },
+      "widgets_values": [
+        750392164154046,
+        "randomize",
+        28,
+        4.5,
+        "euler",
+        "normal",
+        1
+      ]
+    }
+  ],
+  "links": [
+    [
+      1,
+      6,
+      0,
+      3,
+      0,
+      "INT"
+    ],
+    [
+      2,
+      6,
+      1,
+      3,
+      1,
+      "INT"
+    ],
+    [
+      3,
+      10,
+      0,
+      4,
+      0,
+      "LATENT"
+    ],
+    [
+      11,
+      4,
+      0,
+      13,
+      0,
+      "IMAGE"
+    ],
+    [
+      12,
+      9,
+      0,
+      8,
+      0,
+      "GEMMA"
+    ],
+    [
+      13,
+      9,
+      0,
+      14,
+      0,
+      "GEMMA"
+    ],
+    [
+      16,
+      14,
+      0,
+      10,
+      1,
+      "CONDITIONING"
+    ],
+    [
+      17,
+      8,
+      0,
+      10,
+      2,
+      "CONDITIONING"
+    ],
+    [
+      18,
+      3,
+      0,
+      10,
+      3,
+      "LATENT"
+    ],
+    [
+      20,
+      25,
+      0,
+      10,
+      0,
+      "MODEL"
+    ],
+    [
+      24,
+      29,
+      0,
+      4,
+      1,
+      "VAE"
+    ]
+  ],
+  "groups": [],
+  "config": {},
+  "extra": {
+    "ds": {
+      "scale": 0.7513148009015777,
+      "offset": [
+        628.4872538544141,
+        156.35225366732607
+      ]
+    }
+  },
+  "version": 0.4
+}
\ No newline at end of file
diff --git a/asset/docs/ComfyUI/comfyui.md b/asset/docs/ComfyUI/comfyui.md
index e69de29..105b026 100644
--- a/asset/docs/ComfyUI/comfyui.md
+++ b/asset/docs/ComfyUI/comfyui.md
@@ -0,0 +1,33 @@
+## 🖌️ Sana-ComfyUI
+
+[Original Repo](https://github.com/city96/ComfyUI_ExtraModels)
+
+### Model info / implementation
+- Uses Gemma2 2B as the text encoder
+- Multiple resolutions and models available
+- Compressed latent space (32 channels, /32 compression) - needs custom VAE
+
+### Usage
+1. All the checkpoints will be downloaded automatically.
+2. KSampler(Flow Euler) is available for now; Flow DPM-Solver will be available soon.
+3. For more information, check the [original city96/ComfyUI_ExtraModels](https://github.com/city96/ComfyUI_ExtraModels).
+
+```bash
+git clone https://github.com/comfyanonymous/ComfyUI.git
+cd ComfyUI
+git clone https://github.com/Efficient-Large-Model/ComfyUI_ExtraModels.git custom_nodes/ComfyUI_ExtraModels
+
+python main.py
+```
+
+### A sample workflow for Sana
+
+[Sana workflow](Sana_FlowEuler.json)
+
+![Sana](https://raw.githubusercontent.com/NVlabs/Sana/refs/heads/page/asset/content/comfyui/sana.jpg)
+
+### A sample workflow for T2I (Sana) + I2V (CogVideoX)
+
+[Sana + CogVideoX workflow](Sana_CogVideoX.json)
+
+![Sample T2I + I2V workflow](https://raw.githubusercontent.com/NVlabs/Sana/refs/heads/page/asset/content/comfyui/sana-cogvideox.jpg)
diff --git a/tests/bash/test_inference.sh b/tests/bash/test_inference.sh
index a94b190..429c4d1 100644
--- a/tests/bash/test_inference.sh
+++ b/tests/bash/test_inference.sh
@@ -3,7 +3,7 @@ set -e
 
 python scripts/inference.py \
     --config=configs/sana_config/1024ms/Sana_600M_img1024.yaml \
-    --model_path=hf://Efficient-Large-Model/Sana_600M_1024px/checkpoints/Sana_600M_1024px.pth
+    --model_path=hf://Efficient-Large-Model/Sana_600M_1024px/checkpoints/Sana_600M_1024px_MultiLing.pth
 
 
 python scripts/inference.py \

From 6429e471ae289c26eb94da0c30d53aba07f3cc8c Mon Sep 17 00:00:00 2001
From: lawrence-cj <cjs1020440147@icloud.com>
Date: Mon, 9 Dec 2024 23:00:54 +0800
Subject: [PATCH 4/4] pre-commit;

Signed-off-by: lawrence-cj <cjs1020440147@icloud.com>
---
 README.md                              | 8 ++++----
 asset/docs/ComfyUI/Sana_CogVideoX.json | 2 +-
 asset/docs/ComfyUI/Sana_FlowEuler.json | 2 +-
 asset/docs/ComfyUI/comfyui.md          | 6 ++++--
 4 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 3b89334..bef8373 100644
--- a/README.md
+++ b/README.md
@@ -243,10 +243,10 @@ We will try our best to release
 
 # 🤗Acknowledgements
 
-- Thanks to [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha), [PixArt-Σ](https://github.com/PixArt-alpha/PixArt-sigma), 
-[Efficient-ViT](https://github.com/mit-han-lab/efficientvit) and
-[ComfyUI_ExtraModels](https://github.com/city96/ComfyUI_ExtraModels)
-for their wonderful work and codebase!
+- Thanks to [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha), [PixArt-Σ](https://github.com/PixArt-alpha/PixArt-sigma),
+  [Efficient-ViT](https://github.com/mit-han-lab/efficientvit) and
+  [ComfyUI_ExtraModels](https://github.com/city96/ComfyUI_ExtraModels)
+  for their wonderful work and codebase!
 
 # 📖BibTeX
 
diff --git a/asset/docs/ComfyUI/Sana_CogVideoX.json b/asset/docs/ComfyUI/Sana_CogVideoX.json
index 77585ab..2230329 100644
--- a/asset/docs/ComfyUI/Sana_CogVideoX.json
+++ b/asset/docs/ComfyUI/Sana_CogVideoX.json
@@ -1139,4 +1139,4 @@
     "groupNodes": {}
   },
   "version": 0.4
-}
\ No newline at end of file
+}
diff --git a/asset/docs/ComfyUI/Sana_FlowEuler.json b/asset/docs/ComfyUI/Sana_FlowEuler.json
index 0e858a7..6ffe5d3 100644
--- a/asset/docs/ComfyUI/Sana_FlowEuler.json
+++ b/asset/docs/ComfyUI/Sana_FlowEuler.json
@@ -553,4 +553,4 @@
     }
   },
   "version": 0.4
-}
\ No newline at end of file
+}
diff --git a/asset/docs/ComfyUI/comfyui.md b/asset/docs/ComfyUI/comfyui.md
index 105b026..6bd2ee3 100644
--- a/asset/docs/ComfyUI/comfyui.md
+++ b/asset/docs/ComfyUI/comfyui.md
@@ -3,14 +3,16 @@
 [Original Repo](https://github.com/city96/ComfyUI_ExtraModels)
 
 ### Model info / implementation
+
 - Uses Gemma2 2B as the text encoder
 - Multiple resolutions and models available
 - Compressed latent space (32 channels, /32 compression) - needs custom VAE
 
 ### Usage
+
 1. All the checkpoints will be downloaded automatically.
-2. KSampler(Flow Euler) is available for now; Flow DPM-Solver will be available soon.
-3. For more information, check the [original city96/ComfyUI_ExtraModels](https://github.com/city96/ComfyUI_ExtraModels).
+1. KSampler(Flow Euler) is available for now; Flow DPM-Solver will be available soon.
+1. For more information, check the [original city96/ComfyUI_ExtraModels](https://github.com/city96/ComfyUI_ExtraModels).
 
 ```bash
 git clone https://github.com/comfyanonymous/ComfyUI.git