From ce9dd7a78a491a5e8dddda3c0f95a64fe81deeed Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Fri, 12 Apr 2024 01:44:09 -0400
Subject: [PATCH] add bigstral z3 config and make sure to use full_state_dict
 for fsdp

---
 .../zero3_bf16_cpuoffload_all.json            |  2 +
 .../zero3_bf16_cpuoffload_params.json         |  2 +
 examples/dbrx/16bit-lora.yaml                 |  2 +-
 examples/dbrx/8bit-lora.yaml                  |  2 +-
 examples/llama-2/qlora-fsdp.yml               |  2 +-
 examples/mistral/bigstral-ds-zero3.yaml       | 63 +++++++++++++++++++
 examples/mistral/mistral-qlora-fsdp.yml       |  2 +-
 examples/mistral/mixtral-8x22b-qlora-fsdp.yml |  2 +-
 examples/mistral/mixtral-qlora-fsdp.yml       |  2 +-
 9 files changed, 73 insertions(+), 6 deletions(-)
 create mode 100644 examples/mistral/bigstral-ds-zero3.yaml

diff --git a/deepspeed_configs/zero3_bf16_cpuoffload_all.json b/deepspeed_configs/zero3_bf16_cpuoffload_all.json
index 72fde6e5f1..09ca6785b2 100644
--- a/deepspeed_configs/zero3_bf16_cpuoffload_all.json
+++ b/deepspeed_configs/zero3_bf16_cpuoffload_all.json
@@ -1,4 +1,6 @@
 {
+  "zero_force_ds_cpu_optimizer": false,
+  "zero_allow_untested_optimizer": true,
   "zero_optimization": {
     "stage": 3,
     "offload_optimizer": {
diff --git a/deepspeed_configs/zero3_bf16_cpuoffload_params.json b/deepspeed_configs/zero3_bf16_cpuoffload_params.json
index ca051e03ba..41d4a21323 100644
--- a/deepspeed_configs/zero3_bf16_cpuoffload_params.json
+++ b/deepspeed_configs/zero3_bf16_cpuoffload_params.json
@@ -1,4 +1,6 @@
 {
+  "zero_force_ds_cpu_optimizer": false,
+  "zero_allow_untested_optimizer": true,
   "zero_optimization": {
     "stage": 3,
     "offload_param": {
diff --git a/examples/dbrx/16bit-lora.yaml b/examples/dbrx/16bit-lora.yaml
index 5e0faa5477..e5e3ea9216 100644
--- a/examples/dbrx/16bit-lora.yaml
+++ b/examples/dbrx/16bit-lora.yaml
@@ -77,5 +77,5 @@ fsdp_config:
   fsdp_cpu_ram_efficient_loading: true
   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
   fsdp_transformer_layer_cls_to_wrap: DbrxBlock
-  fsdp_state_dict_type: SHARDED_STATE_DICT
+  fsdp_state_dict_type: FULL_STATE_DICT
   fsdp_activation_checkpointing: true
diff --git a/examples/dbrx/8bit-lora.yaml b/examples/dbrx/8bit-lora.yaml
index 5ed20c93a7..89e24db058 100644
--- a/examples/dbrx/8bit-lora.yaml
+++ b/examples/dbrx/8bit-lora.yaml
@@ -77,5 +77,5 @@ fsdp_config:
   fsdp_cpu_ram_efficient_loading: true
   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
   fsdp_transformer_layer_cls_to_wrap: DbrxBlock
-  fsdp_state_dict_type: SHARDED_STATE_DICT
+  fsdp_state_dict_type: FULL_STATE_DICT
   fsdp_activation_checkpointing: true
diff --git a/examples/llama-2/qlora-fsdp.yml b/examples/llama-2/qlora-fsdp.yml
index 719644b56b..93b3b2a60a 100644
--- a/examples/llama-2/qlora-fsdp.yml
+++ b/examples/llama-2/qlora-fsdp.yml
@@ -74,5 +74,5 @@ fsdp_config:
   fsdp_cpu_ram_efficient_loading: true
   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
   fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
-  fsdp_state_dict_type: SHARDED_STATE_DICT
+  fsdp_state_dict_type: FULL_STATE_DICT
 special_tokens:
diff --git a/examples/mistral/bigstral-ds-zero3.yaml b/examples/mistral/bigstral-ds-zero3.yaml
new file mode 100644
index 0000000000..cc0a44b2a4
--- /dev/null
+++ b/examples/mistral/bigstral-ds-zero3.yaml
@@ -0,0 +1,63 @@
+base_model: mistral-community/Mixtral-8x22B-v0.1
+model_type: AutoModelForCausalLM
+tokenizer_type: LlamaTokenizer
+trust_remote_code: true
+
+load_in_8bit: false
+load_in_4bit: false
+strict: false
+
+unfrozen_parameters:
+  - ^lm_head.weight$
+  - ^model.embed_tokens.weight$
+  - model.layers.4[4-9]+.block_sparse_moe.gate
+  - model.layers.4[4-9]+.block_sparse_moe.experts
+  - model.layers.5[0-5]+.block_sparse_moe.gate
+  - model.layers.5[0-5]+.block_sparse_moe.experts
+
+model_config:
+  output_router_logits: true
+
+datasets:
+  - path: tatsu-lab/alpaca
+    type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.05
+output_dir: ./out
+
+sequence_len: 2048
+sample_packing: true
+pad_to_sequence_len: true
+
+gradient_accumulation_steps: 1
+micro_batch_size: 1
+num_epochs: 3
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0001
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16:
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+save_total_limit: 1
+save_steps:
+debug:
+deepspeed: deepspeed_configs/zero3_bf16_cpuoffload_params.json
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+  eos_token: "<|im_end|>"
+tokens:
+  - "<|im_start|>"
diff --git a/examples/mistral/mistral-qlora-fsdp.yml b/examples/mistral/mistral-qlora-fsdp.yml
index 46ebaf47f1..71ac1e701f 100644
--- a/examples/mistral/mistral-qlora-fsdp.yml
+++ b/examples/mistral/mistral-qlora-fsdp.yml
@@ -77,6 +77,6 @@ fsdp_config:
   fsdp_use_orig_params: false
   fsdp_cpu_ram_efficient_loading: false
   fsdp_transformer_layer_cls_to_wrap: MistralDecoderLayer
-  fsdp_state_dict_type: SHARDED_STATE_DICT
+  fsdp_state_dict_type: FULL_STATE_DICT
   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
 special_tokens:
diff --git a/examples/mistral/mixtral-8x22b-qlora-fsdp.yml b/examples/mistral/mixtral-8x22b-qlora-fsdp.yml
index 1bc104f700..ac80a2a756 100644
--- a/examples/mistral/mixtral-8x22b-qlora-fsdp.yml
+++ b/examples/mistral/mixtral-8x22b-qlora-fsdp.yml
@@ -76,6 +76,6 @@ fsdp_config:
   fsdp_use_orig_params: false
   fsdp_cpu_ram_efficient_loading: true
   fsdp_transformer_layer_cls_to_wrap: MixtralSparseMoeBlock
-  fsdp_state_dict_type: SHARDED_STATE_DICT
+  fsdp_state_dict_type: FULL_STATE_DICT
   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
 special_tokens:
diff --git a/examples/mistral/mixtral-qlora-fsdp.yml b/examples/mistral/mixtral-qlora-fsdp.yml
index 94ca02b0c6..b6a07ae51c 100644
--- a/examples/mistral/mixtral-qlora-fsdp.yml
+++ b/examples/mistral/mixtral-qlora-fsdp.yml
@@ -77,7 +77,7 @@ fsdp_config:
   fsdp_use_orig_params: false
   fsdp_cpu_ram_efficient_loading: true
   fsdp_transformer_layer_cls_to_wrap: MixtralSparseMoeBlock
-  fsdp_state_dict_type: SHARDED_STATE_DICT
+  fsdp_state_dict_type: FULL_STATE_DICT
   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
   fsdp_sharding_strategy: FULL_SHARD
   fsdp_forward_prefetch: false