-
Notifications
You must be signed in to change notification settings - Fork 82
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
9 changed files
with
154 additions
and
22 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
104 changes: 104 additions & 0 deletions
104
configs/sana_config/1024ms/Sana_1600M_img1024_AdamW.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
data: | ||
data_dir: [data/data_public/dir1] | ||
image_size: 1024 | ||
caption_proportion: | ||
prompt: 1 | ||
external_caption_suffixes: ['', _InternVL2-26B, _VILA1-5-13B] | ||
external_clipscore_suffixes: | ||
- _InternVL2-26B_clip_score | ||
- _VILA1-5-13B_clip_score | ||
- _prompt_clip_score | ||
clip_thr_temperature: 0.1 | ||
clip_thr: 25.0 | ||
load_text_feat: false | ||
load_vae_feat: false | ||
transform: default_train | ||
type: SanaWebDatasetMS | ||
sort_dataset: false | ||
# model config | ||
model: | ||
model: SanaMS_1600M_P1_D20 | ||
image_size: 1024 | ||
mixed_precision: fp16 # one of: ['fp16', 'fp32', 'bf16'] | ||
fp32_attention: true | ||
load_from: | ||
resume_from: | ||
aspect_ratio_type: ASPECT_RATIO_1024 | ||
multi_scale: true | ||
#pe_interpolation: 1. | ||
attn_type: linear | ||
ffn_type: glumbconv | ||
mlp_acts: | ||
- silu | ||
- silu | ||
- | ||
mlp_ratio: 2.5 | ||
use_pe: false | ||
qk_norm: false | ||
class_dropout_prob: 0.1 | ||
# PAG (presumably perturbed-attention guidance): layers it is applied to | ||
pag_applied_layers: | ||
- 8 | ||
# VAE setting | ||
vae: | ||
vae_type: dc-ae | ||
vae_pretrained: mit-han-lab/dc-ae-f32c32-sana-1.0 | ||
scale_factor: 0.41407 | ||
vae_latent_dim: 32 | ||
vae_downsample_rate: 32 | ||
sample_posterior: true | ||
# text encoder | ||
text_encoder: | ||
text_encoder_name: gemma-2-2b-it | ||
y_norm: true | ||
y_norm_scale_factor: 0.01 | ||
model_max_length: 300 | ||
# CHI (presumably "complex human instruction"): prompt lines prepended to the user prompt | ||
chi_prompt: | ||
- 'Given a user prompt, generate an "Enhanced prompt" that provides detailed visual descriptions suitable for image generation. Evaluate the level of detail in the user prompt:' | ||
- '- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, and spatial relationships to create vivid and concrete scenes.' | ||
- '- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.' | ||
- 'Here are examples of how to transform or refine prompts:' | ||
- '- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.' | ||
- '- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus passing by towering glass skyscrapers.' | ||
- 'Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:' | ||
- 'User Prompt: ' | ||
# Sana schedule Flow | ||
scheduler: | ||
predict_v: true | ||
noise_schedule: linear_flow | ||
pred_sigma: false | ||
flow_shift: 3.0 | ||
# logit-normal timestep | ||
weighting_scheme: logit_normal | ||
logit_mean: 0.0 | ||
logit_std: 1.0 | ||
vis_sampler: flow_dpm-solver | ||
# training setting | ||
train: | ||
num_workers: 10 | ||
seed: 1 | ||
train_batch_size: 64 | ||
num_epochs: 100 | ||
gradient_accumulation_steps: 1 | ||
grad_checkpointing: true | ||
gradient_clip: 0.1 | ||
optimizer: | ||
lr: 1.0e-4 | ||
type: AdamW | ||
weight_decay: 0.01 | ||
eps: 1.0e-8 | ||
betas: [0.9, 0.999] | ||
lr_schedule: constant | ||
lr_schedule_args: | ||
num_warmup_steps: 2000 | ||
local_save_vis: true # whether to save logged images locally | ||
visualize: true | ||
eval_sampling_steps: 500 | ||
log_interval: 20 | ||
save_model_epochs: 5 | ||
save_model_steps: 500 | ||
work_dir: output/debug | ||
online_metric: false | ||
eval_metric_step: 2000 | ||
online_metric_dir: metric_helper |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters