From cc21474d94f301dd1f9f8ccdc43319628e33f238 Mon Sep 17 00:00:00 2001 From: lawrence-cj Date: Wed, 4 Dec 2024 16:22:48 +0800 Subject: [PATCH 1/4] fix the bug of stop training around 4 hours. Signed-off-by: lawrence-cj --- diffusion/utils/config.py | 1 + train_scripts/train.py | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/diffusion/utils/config.py b/diffusion/utils/config.py index 209a076..37dc2f6 100644 --- a/diffusion/utils/config.py +++ b/diffusion/utils/config.py @@ -141,6 +141,7 @@ class TrainingConfig(BaseConfig): load_mask_index: bool = False snr_loss: bool = False real_prompt_ratio: float = 1.0 + training_hours: float = 10000.0 save_image_epochs: int = 1 save_model_epochs: int = 1 save_model_steps: int = 1000000 diff --git a/train_scripts/train.py b/train_scripts/train.py index 2f6e391..72a3ec0 100755 --- a/train_scripts/train.py +++ b/train_scripts/train.py @@ -448,7 +448,10 @@ def train(config, args, accelerator, model, optimizer, lr_scheduler, train_datal if loss_nan_timer > 20: raise ValueError("Loss is NaN too much times. Break here.") - if global_step % config.train.save_model_steps == 0 or (time.time() - training_start_time) / 3600 > 3.8: + if ( + global_step % config.train.save_model_steps == 0 + or (time.time() - training_start_time) / 3600 > config.train.training_hours + ): accelerator.wait_for_everyone() if accelerator.is_main_process: os.umask(0o000) @@ -469,7 +472,7 @@ def train(config, args, accelerator, model, optimizer, lr_scheduler, train_datal f.write(osp.join(config.work_dir, "config.py") + "\n") f.write(ckpt_saved_path) - if (time.time() - training_start_time) / 3600 > 3.8: + if (time.time() - training_start_time) / 3600 > config.train.training_hours: logger.info(f"Stopping training at epoch {epoch}, step {global_step} due to time limit.") return if config.train.visualize and (global_step % config.train.eval_sampling_steps == 0 or (step + 1) == 1): From 7b28bf3b28f98fa393199d6b0170b90715c93217 Mon Sep 17 00:00:00 2001 From: lawrence-cj Date: Fri, 6 Dec 2024 01:43:21 +0800 Subject: [PATCH 2/4] add comfyui md Signed-off-by: lawrence-cj --- asset/docs/ComfyUI/comfyui.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 asset/docs/ComfyUI/comfyui.md diff --git a/asset/docs/ComfyUI/comfyui.md b/asset/docs/ComfyUI/comfyui.md new file mode 100644 index 0000000..e69de29 From 666b3d7b7a0e5a42d14c55c590c438034324f5ec Mon Sep 17 00:00:00 2001 From: lawrence-cj Date: Mon, 9 Dec 2024 21:43:42 +0800 Subject: [PATCH 3/4] 1. update README.md; 2. add workflow json 3. add comfyui readme 4. fix the bug of 600M online ckpt path 5. add Sana ComfyUI workflow(Sana+CFG+FlowEuler) Signed-off-by: lawrence-cj --- README.md | 17 +- asset/docs/ComfyUI/Sana_CogVideoX.json | 1142 ++++++++++++++++++++++++ asset/docs/ComfyUI/Sana_FlowEuler.json | 556 ++++++++++++ asset/docs/ComfyUI/comfyui.md | 33 + tests/bash/test_inference.sh | 2 +- 5 files changed, 1743 insertions(+), 7 deletions(-) create mode 100644 asset/docs/ComfyUI/Sana_CogVideoX.json create mode 100644 asset/docs/ComfyUI/Sana_FlowEuler.json diff --git a/README.md b/README.md index 516965a..3b89334 100644 --- a/README.md +++ b/README.md @@ -36,9 +36,10 @@ As a result, Sana-0.6B is very competitive with modern giant diffusion model (e. ## 🔥🔥 News -- (🔥 New) \[2024/11/30\] All multi-linguistic (Emoji & Chinese & English) SFT models are released: [1.6B-512px](https://huggingface.co/Efficient-Large-Model/Sana_1600M_512px_MultiLing), [1.6B-1024px](https://huggingface.co/Efficient-Large-Model/Sana_1600M_1024px_MultiLing), [600M-512px](https://huggingface.co/Efficient-Large-Model/Sana_600M_512px), [600M-1024px](https://huggingface.co/Efficient-Large-Model/Sana_600M_1024px). The metric performance is shown [here](#performance) -- (🔥 New) \[2024/11/27\] Sana Replicate API is launching at [Sana-API](https://replicate.com/chenxwh/sana). -- (🔥 New) \[2024/11/27\] Sana code-base license changed to Apache 2.0. +- (🔥 New) \[2024/12/9\] We release the [ComfyUI node](https://github.com/Efficient-Large-Model/ComfyUI_ExtraModels) for Sana. [\[Guidance\]](asset/docs/ComfyUI/comfyui.md) +- (🔥 New) \[2024/11\] All multi-linguistic (Emoji & Chinese & English) SFT models are released: [1.6B-512px](https://huggingface.co/Efficient-Large-Model/Sana_1600M_512px_MultiLing), [1.6B-1024px](https://huggingface.co/Efficient-Large-Model/Sana_1600M_1024px_MultiLing), [600M-512px](https://huggingface.co/Efficient-Large-Model/Sana_600M_512px), [600M-1024px](https://huggingface.co/Efficient-Large-Model/Sana_600M_1024px). The metric performance is shown [here](#performance) +- (🔥 New) \[2024/11\] Sana Replicate API is launching at [Sana-API](https://replicate.com/chenxwh/sana). +- (🔥 New) \[2024/11\] Sana code-base license changed to Apache 2.0. - (🔥 New) \[2024/11\] 1.6B [Sana models](https://huggingface.co/collections/Efficient-Large-Model/sana-673efba2a57ed99843f11f9e) are released. - (🔥 New) \[2024/11\] Training & Inference & Metrics code are released. - (🔥 New) \[2024/11\] Working on [`diffusers`](https://github.com/huggingface/diffusers/pull/9982). @@ -235,13 +236,17 @@ We will try our best to release - \[x\] Training code - \[x\] Inference code - \[+\] Model zoo -- \[ \] working on Diffusers(https://github.com/huggingface/diffusers/pull/9982) -- \[ \] ComfyUI +- \[x\] ComfyUI +- \[x\] DC-AE Diffusers +- \[ \] Sana working on Diffusers(https://github.com/huggingface/diffusers/pull/9982) - \[ \] Laptop development # 🤗Acknowledgements -- Thanks to [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha), [PixArt-Σ](https://github.com/PixArt-alpha/PixArt-sigma) and [Efficient-ViT](https://github.com/mit-han-lab/efficientvit) for their wonderful work and codebase! +- Thanks to [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha), [PixArt-Σ](https://github.com/PixArt-alpha/PixArt-sigma), +[Efficient-ViT](https://github.com/mit-han-lab/efficientvit) and +[ComfyUI_ExtraModels](https://github.com/city96/ComfyUI_ExtraModels) +for their wonderful work and codebase! # 📖BibTeX diff --git a/asset/docs/ComfyUI/Sana_CogVideoX.json b/asset/docs/ComfyUI/Sana_CogVideoX.json new file mode 100644 index 0000000..77585ab --- /dev/null +++ b/asset/docs/ComfyUI/Sana_CogVideoX.json @@ -0,0 +1,1142 @@ +{ + "last_node_id": 37, + "last_link_id": 48, + "nodes": [ + { + "id": 5, + "type": "GemmaLoader", + "pos": [ + 283.376953125, + 603.7484741210938 + ], + "size": [ + 315, + 106 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "GEMMA", + "type": "GEMMA", + "links": [ + 9, + 11 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "GemmaLoader" + }, + "widgets_values": [ + "google/gemma-2-2b-it", + "cuda", + "BF16" + ] + }, + { + "id": 12, + "type": "SanaTextEncode", + "pos": [ + 670.9176635742188, + 797.39501953125 + ], + "size": [ + 400, + 200 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "name": "GEMMA", + "type": "GEMMA", + "link": 11 + } + ], + "outputs": [ + { + "name": "CONDITIONING", + "type": "CONDITIONING", + "links": [ + 3 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "SanaTextEncode" + }, + "widgets_values": [ + "\"\"" + ] + }, + { + "id": 4, + "type": "SanaResolutionSelect", + "pos": [ + 300.2852783203125, + 392.79766845703125 + ], + "size": [ + 315, + 102 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "width", + "type": "INT", + "links": [ + 7 + ], + "slot_index": 0 + }, + { + "name": "height", + "type": "INT", + "links": [ + 8 + ], + "slot_index": 1 + } + ], + "properties": { + "Node name for S&R": "SanaResolutionSelect" + }, + "widgets_values": [ + "1024px", + "1.46" + ] + }, + { + "id": 7, + "type": "SanaTextEncode", + "pos": [ + 674.2115478515625, + 504.2879638671875 + ], + "size": [ + 400, + 200 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "name": "GEMMA", + "type": "GEMMA", + "link": 9 + } + ], + "outputs": [ + { + "name": "CONDITIONING", + "type": "CONDITIONING", + "links": [ + 2 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "SanaTextEncode" + }, + "widgets_values": [ + "A cyberpunk cat with a neon sign that says 'Sana'." + ] + }, + { + "id": 24, + "type": "PreviewImage", + "pos": [ + 1443.0323486328125, + 352.056396484375 + ], + "size": [ + 210, + 246 + ], + "flags": {}, + "order": 13, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 47 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "PreviewImage" + }, + "widgets_values": [] + }, + { + "id": 25, + "type": "VHS_VideoCombine", + "pos": [ + 2825.935546875, + -102.76895904541016 + ], + "size": [ + 767.7372436523438, + 310 + ], + "flags": {}, + "order": 18, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 30 + }, + { + "name": "audio", + "type": "AUDIO", + "link": null, + "shape": 7 + }, + { + "name": "meta_batch", + "type": "VHS_BatchManager", + "link": null, + "shape": 7 + }, + { + "name": "vae", + "type": "VAE", + "link": null, + "shape": 7 + } + ], + "outputs": [ + { + "name": "Filenames", + "type": "VHS_FILENAMES", + "links": null, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "VHS_VideoCombine" + }, + "widgets_values": { + "frame_rate": 8, + "loop_count": 0, + "filename_prefix": "CogVideoX_Fun", + "format": "video/h264-mp4", + "pix_fmt": "yuv420p", + "crf": 19, + "save_metadata": true, + "pingpong": false, + "save_output": true, + "videopreview": { + "hidden": false, + "paused": false, + "params": { + "filename": "CogVideoX_Fun_00005.mp4", + "subfolder": "", + "type": "output", + "format": "video/h264-mp4", + "frame_rate": 8 + }, + "muted": false + } + } + }, + { + "id": 27, + "type": "CogVideoTextEncode", + "pos": [ + 1713.936279296875, + 174.2305450439453 + ], + "size": [ + 471.90142822265625, + 168.08047485351562 + ], + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "name": "clip", + "type": "CLIP", + "link": 35 + } + ], + "outputs": [ + { + "name": "conditioning", + "type": "CONDITIONING", + "links": [ + 32 + ], + "slot_index": 0, + "shape": 3 + }, + { + "name": "clip", + "type": "CLIP", + "links": [ + 36 + ], + "slot_index": 1 + } + ], + "properties": { + "Node name for S&R": "CogVideoTextEncode" + }, + "widgets_values": [ + "fireworks display over night city. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.", + 1, + false + ] + }, + { + "id": 28, + "type": "CogVideoTextEncode", + "pos": [ + 1720.936279296875, + 393.230712890625 + ], + "size": [ + 463.01251220703125, + 144 + ], + "flags": {}, + "order": 11, + "mode": 0, + "inputs": [ + { + "name": "clip", + "type": "CLIP", + "link": 36 + } + ], + "outputs": [ + { + "name": "conditioning", + "type": "CONDITIONING", + "links": [ + 33 + ], + "slot_index": 0, + "shape": 3 + }, + { + "name": "clip", + "type": "CLIP", + "links": null + } + ], + "properties": { + "Node name for S&R": "CogVideoTextEncode" + }, + "widgets_values": [ + "The video is not of a high quality, it has a low resolution. Watermark present in each frame. Strange motion trajectory. ", + 1, + true + ] + }, + { + "id": 30, + "type": "CogVideoImageEncodeFunInP", + "pos": [ + 2088.93603515625, + 595.230712890625 + ], + "size": [ + 253.60000610351562, + 146 + ], + "flags": {}, + "order": 15, + "mode": 0, + "inputs": [ + { + "name": "vae", + "type": "VAE", + "link": 37 + }, + { + "name": "start_image", + "type": "IMAGE", + "link": 38 + }, + { + "name": "end_image", + "type": "IMAGE", + "link": null, + "shape": 7 + } + ], + "outputs": [ + { + "name": "image_cond_latents", + "type": "LATENT", + "links": [ + 34 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "CogVideoImageEncodeFunInP" + }, + "widgets_values": [ + 49, + true, + 0 + ] + }, + { + "id": 33, + "type": "CogVideoDecode", + "pos": [ + 2442.93603515625, + -105.76895904541016 + ], + "size": [ + 315, + 198 + ], + "flags": {}, + "order": 17, + "mode": 0, + "inputs": [ + { + "name": "vae", + "type": "VAE", + "link": 40 + }, + { + "name": "samples", + "type": "LATENT", + "link": 41 + } + ], + "outputs": [ + { + "name": "images", + "type": "IMAGE", + "links": [ + 30 + ] + } + ], + "properties": { + "Node name for S&R": "CogVideoDecode" + }, + "widgets_values": [ + true, + 240, + 360, + 0.2, + 0.2, + true + ] + }, + { + "id": 34, + "type": "DownloadAndLoadCogVideoModel", + "pos": [ + 1714.936279296875, + -138.76895141601562 + ], + "size": [ + 362.1656799316406, + 218 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "name": "block_edit", + "type": "TRANSFORMERBLOCKS", + "link": null, + "shape": 7 + }, + { + "name": "lora", + "type": "COGLORA", + "link": null, + "shape": 7 + }, + { + "name": "compile_args", + "type": "COMPILEARGS", + "link": null, + "shape": 7 + } + ], + "outputs": [ + { + "name": "model", + "type": "COGVIDEOMODEL", + "links": [ + 31 + ] + }, + { + "name": "vae", + "type": "VAE", + "links": [ + 37, + 40 + ], + "slot_index": 1 + } + ], + "properties": { + "Node name for S&R": "DownloadAndLoadCogVideoModel" + }, + "widgets_values": [ + "alibaba-pai/CogVideoX-Fun-V1.1-5b-InP", + "bf16", + "disabled", + false, + "sdpa", + "main_device" + ] + }, + { + "id": 31, + "type": "ImageResizeKJ", + "pos": [ + 1722.936279296875, + 615.230712890625 + ], + "size": [ + 315, + 266 + ], + "flags": {}, + "order": 14, + "mode": 0, + "inputs": [ + { + "name": "image", + "type": "IMAGE", + "link": 48 + }, + { + "name": "get_image_size", + "type": "IMAGE", + "link": null, + "shape": 7 + }, + { + "name": "width_input", + "type": "INT", + "link": null, + "widget": { + "name": "width_input" + }, + "shape": 7 + }, + { + "name": "height_input", + "type": "INT", + "link": null, + "widget": { + "name": "height_input" + }, + "shape": 7 + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 38 + ], + "slot_index": 0, + "shape": 3 + }, + { + "name": "width", + "type": "INT", + "links": null, + "shape": 3 + }, + { + "name": "height", + "type": "INT", + "links": null, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "ImageResizeKJ" + }, + "widgets_values": [ + 720, + 480, + "lanczos", + false, + 2, + 0, + 0, + "disabled" + ] + }, + { + "id": 29, + "type": "CLIPLoader", + "pos": [ + 1216.935791015625, + -8.769308090209961 + ], + "size": [ + 451.30548095703125, + 82 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "CLIP", + "type": "CLIP", + "links": [ + 35 + ], + "slot_index": 0, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "CLIPLoader" + }, + "widgets_values": [ + "text_encoders/t5xxl_fp16.safetensors", + "sd3" + ] + }, + { + "id": 26, + "type": "CogVideoSampler", + "pos": [ + 2423.935791015625, + 152.23048400878906 + ], + "size": [ + 330, + 574 + ], + "flags": {}, + "order": 16, + "mode": 0, + "inputs": [ + { + "name": "model", + "type": "COGVIDEOMODEL", + "link": 31 + }, + { + "name": "positive", + "type": "CONDITIONING", + "link": 32 + }, + { + "name": "negative", + "type": "CONDITIONING", + "link": 33 + }, + { + "name": "samples", + "type": "LATENT", + "link": null, + "shape": 7 + }, + { + "name": "image_cond_latents", + "type": "LATENT", + "link": 34, + "shape": 7 + }, + { + "name": "context_options", + "type": "COGCONTEXT", + "link": null, + "shape": 7 + }, + { + "name": "controlnet", + "type": "COGVIDECONTROLNET", + "link": null, + "shape": 7 + }, + { + "name": "tora_trajectory", + "type": "TORAFEATURES", + "link": null, + "shape": 7 + }, + { + "name": "fastercache", + "type": "FASTERCACHEARGS", + "link": null, + "shape": 7 + } + ], + "outputs": [ + { + "name": "samples", + "type": "LATENT", + "links": [ + 41 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "CogVideoSampler" + }, + "widgets_values": [ + 49, + 25, + 6, + 1123398248636718, + "randomize", + "CogVideoXDDIM", + 1 + ] + }, + { + "id": 35, + "type": "SanaCheckpointLoader", + "pos": [ + 286.5307922363281, + 235.45753479003906 + ], + "size": [ + 315, + 82 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "model", + "type": "MODEL", + "links": [ + 43 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "SanaCheckpointLoader" + }, + "widgets_values": [ + "Efficient-Large-Model/Sana_1600M_1024px_MultiLing", + "SanaMS_1600M_P1_D20" + ] + }, + { + "id": 37, + "type": "ExtraVAELoader", + "pos": [ + 1070.8033447265625, + 747.4982299804688 + ], + "size": [ + 315, + 106 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "VAE", + "type": "VAE", + "links": [ + 46 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "ExtraVAELoader" + }, + "widgets_values": [ + "mit-han-lab/dc-ae-f32c32-sana-1.0-diffusers", + "dcae-f32c32-sana-1.0-diffusers", + "BF16" + ] + }, + { + "id": 1, + "type": "KSampler", + "pos": [ + 1101.390625, + 196.0309600830078 + ], + "size": [ + 300, + 480 + ], + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "name": "model", + "type": "MODEL", + "link": 43 + }, + { + "name": "positive", + "type": "CONDITIONING", + "link": 2 + }, + { + "name": "negative", + "type": "CONDITIONING", + "link": 3 + }, + { + "name": "latent_image", + "type": "LATENT", + "link": 4 + } + ], + "outputs": [ + { + "name": "LATENT", + "type": "LATENT", + "links": [ + 5 + ], + "slot_index": 0, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "KSampler" + }, + "widgets_values": [ + 869595936769725, + "randomize", + 28, + 5, + "euler", + "normal", + 1 + ] + }, + { + "id": 6, + "type": "EmptyDCAELatentImage", + "pos": [ + 723.0592041015625, + 317.112548828125 + ], + "size": [ + 315, + 106 + ], + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "name": "width", + "type": "INT", + "link": 7, + "widget": { + "name": "width" + } + }, + { + "name": "height", + "type": "INT", + "link": 8, + "widget": { + "name": "height" + } + } + ], + "outputs": [ + { + "name": "LATENT", + "type": "LATENT", + "links": [ + 4 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "EmptyDCAELatentImage" + }, + "widgets_values": [ + 512, + 512, + 1 + ] + }, + { + "id": 2, + "type": "VAEDecode", + "pos": [ + 1452.4869384765625, + 217.9922637939453 + ], + "size": [ + 200, + 50 + ], + "flags": {}, + "order": 12, + "mode": 0, + "inputs": [ + { + "name": "samples", + "type": "LATENT", + "link": 5 + }, + { + "name": "vae", + "type": "VAE", + "link": 46 + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 47, + 48 + ], + "slot_index": 0, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "VAEDecode" + }, + "widgets_values": [] + } + ], + "links": [ + [ + 2, + 7, + 0, + 1, + 1, + "CONDITIONING" + ], + [ + 3, + 12, + 0, + 1, + 2, + "CONDITIONING" + ], + [ + 4, + 6, + 0, + 1, + 3, + "LATENT" + ], + [ + 5, + 1, + 0, + 2, + 0, + "LATENT" + ], + [ + 7, + 4, + 0, + 6, + 0, + "INT" + ], + [ + 8, + 4, + 1, + 6, + 1, + "INT" + ], + [ + 9, + 5, + 0, + 7, + 0, + "GEMMA" + ], + [ + 11, + 5, + 0, + 12, + 0, + "GEMMA" + ], + [ + 30, + 33, + 0, + 25, + 0, + "IMAGE" + ], + [ + 31, + 34, + 0, + 26, + 0, + "COGVIDEOMODEL" + ], + [ + 32, + 27, + 0, + 26, + 1, + "CONDITIONING" + ], + [ + 33, + 28, + 0, + 26, + 2, + "CONDITIONING" + ], + [ + 34, + 30, + 0, + 26, + 4, + "LATENT" + ], + [ + 35, + 29, + 0, + 27, + 0, + "CLIP" + ], + [ + 36, + 27, + 1, + 28, + 0, + "CLIP" + ], + [ + 37, + 34, + 1, + 30, + 0, + "VAE" + ], + [ + 38, + 31, + 0, + 30, + 1, + "IMAGE" + ], + [ + 40, + 34, + 1, + 33, + 0, + "VAE" + ], + [ + 41, + 26, + 0, + 33, + 1, + "LATENT" + ], + [ + 43, + 35, + 0, + 1, + 0, + "MODEL" + ], + [ + 46, + 37, + 0, + 2, + 1, + "VAE" + ], + [ + 47, + 2, + 0, + 24, + 0, + "IMAGE" + ], + [ + 48, + 2, + 0, + 31, + 0, + "IMAGE" + ] + ], + "groups": [], + "config": {}, + "extra": { + "ds": { + "scale": 0.5644739300537776, + "offset": [ + 515.970442108866, + 435.7565370847522 + ] + }, + "groupNodes": {} + }, + "version": 0.4 +} \ No newline at end of file diff --git a/asset/docs/ComfyUI/Sana_FlowEuler.json b/asset/docs/ComfyUI/Sana_FlowEuler.json new file mode 100644 index 0000000..0e858a7 --- /dev/null +++ b/asset/docs/ComfyUI/Sana_FlowEuler.json @@ -0,0 +1,556 @@ +{ + "last_node_id": 29, + "last_link_id": 24, + "nodes": [ + { + "id": 6, + "type": "SanaResolutionSelect", + "pos": [ + -364.7435607910156, + 301.5954284667969 + ], + "size": [ + 315, + 102 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "width", + "type": "INT", + "links": [ + 1 + ], + "slot_index": 0 + }, + { + "name": "height", + "type": "INT", + "links": [ + 2 + ], + "slot_index": 1 + } + ], + "properties": { + "Node name for S&R": "SanaResolutionSelect" + }, + "widgets_values": [ + "1024px", + "1.00" + ] + }, + { + "id": 3, + "type": "EmptyDCAELatentImage", + "pos": [ + 57.19669723510742, + 202.29898071289062 + ], + "size": [ + 315, + 106 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "name": "width", + "type": "INT", + "link": 1, + "widget": { + "name": "width" + } + }, + { + "name": "height", + "type": "INT", + "link": 2, + "widget": { + "name": "height" + } + } + ], + "outputs": [ + { + "name": "LATENT", + "type": "LATENT", + "links": [ + 18 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "EmptyDCAELatentImage" + }, + "widgets_values": [ + 512, + 512, + 1 + ] + }, + { + "id": 8, + "type": "SanaTextEncode", + "pos": [ + 5.8887786865234375, + 706.19287109375 + ], + "size": [ + 400, + 200 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "name": "GEMMA", + "type": "GEMMA", + "link": 12 + } + ], + "outputs": [ + { + "name": "CONDITIONING", + "type": "CONDITIONING", + "links": [ + 17 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "SanaTextEncode" + }, + "widgets_values": [ + "\"\"" + ] + }, + { + "id": 13, + "type": "PreviewImage", + "pos": [ + 802.6994018554688, + 217.20889282226562 + ], + "size": [ + 605.93505859375, + 665.570068359375 + ], + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 11 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "PreviewImage" + }, + "widgets_values": [] + }, + { + "id": 9, + "type": "GemmaLoader", + "pos": [ + -381.6518859863281, + 512.5463256835938 + ], + "size": [ + 315, + 106 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "GEMMA", + "type": "GEMMA", + "links": [ + 12, + 13 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "GemmaLoader" + }, + "widgets_values": [ + "google/gemma-2-2b-it", + "cuda", + "BF16" + ] + }, + { + "id": 15, + "type": "EmptyDCAELatentImage", + "pos": [ + 62.19669723510742, + 207.29898071289062 + ], + "size": [ + 315, + 106 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "name": "width", + "type": "INT", + "link": null, + "widget": { + "name": "width" + } + }, + { + "name": "height", + "type": "INT", + "link": null, + "widget": { + "name": "height" + } + } + ], + "outputs": [ + { + "name": "LATENT", + "type": "LATENT", + "links": [], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "EmptyDCAELatentImage" + }, + "widgets_values": [ + 512, + 512, + 1 + ] + }, + { + "id": 4, + "type": "VAEDecode", + "pos": [ + 776.332763671875, + 105.08650970458984 + ], + "size": [ + 200, + 50 + ], + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "name": "samples", + "type": "LATENT", + "link": 3 + }, + { + "name": "vae", + "type": "VAE", + "link": 24 + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 11 + ], + "slot_index": 0, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "VAEDecode" + }, + "widgets_values": [] + }, + { + "id": 29, + "type": "ExtraVAELoader", + "pos": [ + 460.67730712890625, + 695.5695190429688 + ], + "size": [ + 315, + 106 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "VAE", + "type": "VAE", + "links": [ + 24 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "ExtraVAELoader" + }, + "widgets_values": [ + "mit-han-lab/dc-ae-f32c32-sana-1.0-diffusers", + "dcae-f32c32-sana-1.0-diffusers", + "BF16" + ] + }, + { + "id": 25, + "type": "SanaCheckpointLoader", + "pos": [ + -334.1000671386719, + 133.43394470214844 + ], + "size": [ + 315, + 82 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "model", + "type": "MODEL", + "links": [ + 20 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "SanaCheckpointLoader" + }, + "widgets_values": [ + "Efficient-Large-Model/Sana_1600M_1024px_MultiLing", + "SanaMS_1600M_P1_D20" + ] + }, + { + "id": 14, + "type": "SanaTextEncode", + "pos": [ + 9.182632446289062, + 413.08575439453125 + ], + "size": [ + 400, + 200 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "name": "GEMMA", + "type": "GEMMA", + "link": 13 + } + ], + "outputs": [ + { + "name": "CONDITIONING", + "type": "CONDITIONING", + "links": [ + 16 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "SanaTextEncode" + }, + "widgets_values": [ + "make me a logo that says \"So Fast\" with a really cool flying dragon shape with lightning sparks all over the sides and all of it contains Indonesian language" + ] + }, + { + "id": 10, + "type": "KSampler", + "pos": [ + 429.7785339355469, + 99.45759582519531 + ], + "size": [ + 300, + 480 + ], + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "name": "model", + "type": "MODEL", + "link": 20 + }, + { + "name": "positive", + "type": "CONDITIONING", + "link": 16 + }, + { + "name": "negative", + "type": "CONDITIONING", + "link": 17 + }, + { + "name": "latent_image", + "type": "LATENT", + "link": 18 + } + ], + "outputs": [ + { + "name": "LATENT", + "type": "LATENT", + "links": [ + 3 + ], + "slot_index": 0, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "KSampler" + }, + "widgets_values": [ + 750392164154046, + "randomize", + 28, + 4.5, + "euler", + "normal", + 1 + ] + } + ], + "links": [ + [ + 1, + 6, + 0, + 3, + 0, + "INT" + ], + [ + 2, + 6, + 1, + 3, + 1, + "INT" + ], + [ + 3, + 10, + 0, + 4, + 0, + "LATENT" + ], + [ + 11, + 4, + 0, + 13, + 0, + "IMAGE" + ], + [ + 12, + 9, + 0, + 8, + 0, + "GEMMA" + ], + [ + 13, + 9, + 0, + 14, + 0, + "GEMMA" + ], + [ + 16, + 14, + 0, + 10, + 1, + "CONDITIONING" + ], + [ + 17, + 8, + 0, + 10, + 2, + "CONDITIONING" + ], + [ + 18, + 3, + 0, + 10, + 3, + "LATENT" + ], + [ + 20, + 25, + 0, + 10, + 0, + "MODEL" + ], + [ + 24, + 29, + 0, + 4, + 1, + "VAE" + ] + ], + "groups": [], + "config": {}, + "extra": { + "ds": { + "scale": 0.7513148009015777, + "offset": [ + 628.4872538544141, + 156.35225366732607 + ] + } + }, + "version": 0.4 +} \ No newline at end of file diff --git a/asset/docs/ComfyUI/comfyui.md b/asset/docs/ComfyUI/comfyui.md index e69de29..105b026 100644 --- a/asset/docs/ComfyUI/comfyui.md +++ b/asset/docs/ComfyUI/comfyui.md @@ -0,0 +1,33 @@ +## 🖌️ Sana-ComfyUI + +[Original Repo](https://github.com/city96/ComfyUI_ExtraModels) + +### Model info / implementation +- Uses Gemma2 2B as the text encoder +- Multiple resolutions and models available +- Compressed latent space (32 channels, /32 compression) - needs custom VAE + +### Usage +1. All the checkpoints will be downloaded automatically. +2. KSampler(Flow Euler) is available for now; Flow DPM-Solver will be available soon. +3. For more information, check the [original city96/ComfyUI_ExtraModels](https://github.com/city96/ComfyUI_ExtraModels). + +```bash +git clone https://github.com/comfyanonymous/ComfyUI.git +cd ComfyUI +git clone https://github.com/Efficient-Large-Model/ComfyUI_ExtraModels.git custom_nodes/ComfyUI_ExtraModels + +python main.py +``` + +### A sample workflow for Sana + +[Sana workflow](Sana_FlowEuler.json) + +![Sana](https://raw.githubusercontent.com/NVlabs/Sana/refs/heads/page/asset/content/comfyui/sana.jpg) + +### A sample for T2I(Sana) + I2V(CogVideoX) + +[Sana + CogVideoX workflow](Sana_CogVideoX.json) + +![Sample T2I + I2V workflow](https://raw.githubusercontent.com/NVlabs/Sana/refs/heads/page/asset/content/comfyui/sana-cogvideox.jpg) diff --git a/tests/bash/test_inference.sh b/tests/bash/test_inference.sh index a94b190..429c4d1 100644 --- a/tests/bash/test_inference.sh +++ b/tests/bash/test_inference.sh @@ -3,7 +3,7 @@ set -e python scripts/inference.py \ --config=configs/sana_config/1024ms/Sana_600M_img1024.yaml \ - --model_path=hf://Efficient-Large-Model/Sana_600M_1024px/checkpoints/Sana_600M_1024px.pth + --model_path=hf://Efficient-Large-Model/Sana_600M_1024px/checkpoints/Sana_600M_1024px_MultiLing.pth python scripts/inference.py \ From 6429e471ae289c26eb94da0c30d53aba07f3cc8c Mon Sep 17 00:00:00 2001 From: lawrence-cj Date: Mon, 9 Dec 2024 23:00:54 +0800 Subject: [PATCH 4/4] pre-commit; Signed-off-by: lawrence-cj --- README.md | 8 ++++---- asset/docs/ComfyUI/Sana_CogVideoX.json | 2 +- asset/docs/ComfyUI/Sana_FlowEuler.json | 2 +- asset/docs/ComfyUI/comfyui.md | 6 ++++-- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 3b89334..bef8373 100644 --- a/README.md +++ b/README.md @@ -243,10 +243,10 @@ We will try our best to release # 🤗Acknowledgements -- Thanks to [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha), [PixArt-Σ](https://github.com/PixArt-alpha/PixArt-sigma), -[Efficient-ViT](https://github.com/mit-han-lab/efficientvit) and -[ComfyUI_ExtraModels](https://github.com/city96/ComfyUI_ExtraModels) -for their wonderful work and codebase! +- Thanks to [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha), [PixArt-Σ](https://github.com/PixArt-alpha/PixArt-sigma), + [Efficient-ViT](https://github.com/mit-han-lab/efficientvit) and + [ComfyUI_ExtraModels](https://github.com/city96/ComfyUI_ExtraModels) + for their wonderful work and codebase! # 📖BibTeX diff --git a/asset/docs/ComfyUI/Sana_CogVideoX.json b/asset/docs/ComfyUI/Sana_CogVideoX.json index 77585ab..2230329 100644 --- a/asset/docs/ComfyUI/Sana_CogVideoX.json +++ b/asset/docs/ComfyUI/Sana_CogVideoX.json @@ -1139,4 +1139,4 @@ "groupNodes": {} }, "version": 0.4 -} \ No newline at end of file +} diff --git a/asset/docs/ComfyUI/Sana_FlowEuler.json b/asset/docs/ComfyUI/Sana_FlowEuler.json index 0e858a7..6ffe5d3 100644 --- a/asset/docs/ComfyUI/Sana_FlowEuler.json +++ b/asset/docs/ComfyUI/Sana_FlowEuler.json @@ -553,4 +553,4 @@ } }, "version": 0.4 -} \ No newline at end of file +} diff --git a/asset/docs/ComfyUI/comfyui.md b/asset/docs/ComfyUI/comfyui.md index 105b026..6bd2ee3 100644 --- a/asset/docs/ComfyUI/comfyui.md +++ b/asset/docs/ComfyUI/comfyui.md @@ -3,14 +3,16 @@ [Original Repo](https://github.com/city96/ComfyUI_ExtraModels) ### Model info / implementation + - Uses Gemma2 2B as the text encoder - Multiple resolutions and models available - Compressed latent space (32 channels, /32 compression) - needs custom VAE ### Usage + 1. All the checkpoints will be downloaded automatically. -2. KSampler(Flow Euler) is available for now; Flow DPM-Solver will be available soon. -3. For more information, check the [original city96/ComfyUI_ExtraModels](https://github.com/city96/ComfyUI_ExtraModels). +1. KSampler(Flow Euler) is available for now; Flow DPM-Solver will be available soon. +1. For more information, check the [original city96/ComfyUI_ExtraModels](https://github.com/city96/ComfyUI_ExtraModels). ```bash git clone https://github.com/comfyanonymous/ComfyUI.git