From 52be465f6e2bfe17ad9a8163cf52bd83ac9a445c Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Tue, 19 Sep 2023 10:11:34 -0400 Subject: [PATCH 1/4] update README w deepspeed info --- README.md | 18 ++++++++++ deepspeed/zero1.json | 75 +++++++++++++++++++------------------- deepspeed/zero2.json | 81 +++++++++++++++++++++--------------------- deepspeed/zero3.json | 1 + examples/phi/README.md | 1 + 5 files changed, 99 insertions(+), 77 deletions(-) diff --git a/README.md b/README.md index c9b935f53d..cdc37f3821 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,7 @@ Features: - [How to Use Custom Pretokenized Dataset](#how-to-use-your-custom-pretokenized-dataset) - [Config](#config) - [Train](#train) + - [Training w/ Deepspeed](#training-with-deepspeed) - [Inference](#inference) - [Merge LORA to Base](#merge-lora-to-base) - [Common Errors](#common-errors-) @@ -88,6 +89,8 @@ cd axolotl pip3 install packaging pip3 install -e .[flash-attn] pip3 install -U git+https://github.com/huggingface/peft.git +# optionally install deepspeed +pip3 install deepspeed # finetune lora accelerate launch -m axolotl.cli.train examples/openllama-3b/lora.yml @@ -732,6 +735,21 @@ wandb_run_id: wandb_log_model: ``` +### Training with Deepspeed + +Deepspeed is an optimization suite for multi-gpu systems allowing you to train much larger models than you +might typically be able to fit into your GPU's VRAM. More information about the various optimization types +for deepspeed is available at https://huggingface.co/docs/accelerate/main/en/usage_guides/deepspeed#what-is-integrated + +We provide several default deepspeed JSON configurations for ZeRO stage 1, 2, and 3. + +```shell +# install deepspeed if you haven't already +pip3 install deepspeed + +accelerate launch -m axolotl.cli.train examples/llama-2/config.py --deepspeed deepspeed/zero1.json +``` + ### Inference Pass the appropriate flag to the train command: diff --git a/deepspeed/zero1.json b/deepspeed/zero1.json index 39d648b2b5..4e41c55913 100644 --- a/deepspeed/zero1.json +++ b/deepspeed/zero1.json @@ -1,39 +1,40 @@ { - "zero_optimization": { - "stage": 1, - "overlap_comm": true - }, - "bf16": { - "enabled": "auto" - }, - "fp16": { - "enabled": "auto", - "auto_cast": false, - "loss_scale": 0, - "initial_scale_power": 32, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - "optimizer": { - "type": "AdamW", - "params": { - "lr": "auto", - "betas": "auto", - "eps": "auto", - "weight_decay": "auto" - } - }, - "scheduler": { - "type": "WarmupDecayLR", - "params": { - "warmup_min_lr": "auto", - "warmup_max_lr": "auto", - "warmup_num_steps": "auto", - "total_num_steps": "auto" - } - }, - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "wall_clock_breakdown": false + "zero_optimization": { + "stage": 1, + "overlap_comm": true + }, + "bf16": { + "enabled": "auto" + }, + "fp16": { + "enabled": "auto", + "auto_cast": false, + "loss_scale": 0, + "initial_scale_power": 32, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + "scheduler": { + "type": "WarmupDecayLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto", + "total_num_steps": "auto" + } + }, + "gradient_accumulation_steps": "auto", + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false } diff --git 
a/deepspeed/zero2.json b/deepspeed/zero2.json index 16f2710f69..40b32beead 100644 --- a/deepspeed/zero2.json +++ b/deepspeed/zero2.json @@ -1,43 +1,44 @@ { - "zero_optimization": { - "stage": 2, - "offload_optimizer": { - "device": "cpu" - }, - "contiguous_gradients": true, - "overlap_comm": true + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "cpu" }, - "bf16": { - "enabled": "auto" - }, - "fp16": { - "enabled": "auto", - "auto_cast": false, - "loss_scale": 0, - "initial_scale_power": 32, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - "optimizer": { - "type": "AdamW", - "params": { - "lr": "auto", - "betas": "auto", - "eps": "auto", - "weight_decay": "auto" - } - }, - "scheduler": { - "type": "WarmupDecayLR", - "params": { - "warmup_min_lr": "auto", - "warmup_max_lr": "auto", - "warmup_num_steps": "auto", - "total_num_steps": "auto" - } - }, - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "wall_clock_breakdown": false + "contiguous_gradients": true, + "overlap_comm": true + }, + "bf16": { + "enabled": "auto" + }, + "fp16": { + "enabled": "auto", + "auto_cast": false, + "loss_scale": 0, + "initial_scale_power": 32, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + "scheduler": { + "type": "WarmupDecayLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto", + "total_num_steps": "auto" + } + }, + "gradient_accumulation_steps": "auto", + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false } diff --git a/deepspeed/zero3.json b/deepspeed/zero3.json index 3f5dff3302..b2b971dfe3 100644 --- a/deepspeed/zero3.json +++ b/deepspeed/zero3.json @@ -48,6 +48,7 @@ "warmup_num_steps": "auto" } }, + "gradient_accumulation_steps": "auto", "train_batch_size": "auto", "train_micro_batch_size_per_gpu": "auto", "wall_clock_breakdown": false diff --git a/examples/phi/README.md b/examples/phi/README.md index 1109db0b50..b315a991fd 100644 --- a/examples/phi/README.md +++ b/examples/phi/README.md @@ -3,6 +3,7 @@ Due to some nuances with the phi code, please use deepspeed when training phi for full finetune. 
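The `"auto"` placeholders in these ZeRO configs, including the `"gradient_accumulation_steps": "auto"` entries this patch adds, are filled in from the trainer's arguments at launch time, so the JSON files do not duplicate values already set in the axolotl config. A minimal sketch of the axolotl YAML fields those placeholders roughly correspond to (the field names and the mapping are illustrative assumptions, not part of this patch):

```yaml
# Illustrative sketch: axolotl settings that the "auto" values in
# deepspeed/zero*.json are expected to inherit when training is launched.
micro_batch_size: 1             # -> train_micro_batch_size_per_gpu
gradient_accumulation_steps: 4  # -> gradient_accumulation_steps
learning_rate: 0.0002           # -> optimizer "lr" (and scheduler warmup_max_lr)
```

Leaving these as `"auto"` keeps DeepSpeed's effective batch size and schedule consistent with what axolotl computes, rather than hard-coding values that could drift out of sync.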
```shell +# You may need to install deepspeed with `pip3 install deepspeed` accelerate launch -m axolotl.cli.train examples/phi/phi-ft.yml --deepspeed deepspeed/zero1.json # OR From 9bdb423b99784cfe739b59706494c596c1d2ec38 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Tue, 19 Sep 2023 14:57:00 -0400 Subject: [PATCH 2/4] use linear warmup for deepspeed --- deepspeed/zero1.json | 1 + deepspeed/zero2.json | 1 + deepspeed/zero3.json | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/deepspeed/zero1.json b/deepspeed/zero1.json index 4e41c55913..85bc586091 100644 --- a/deepspeed/zero1.json +++ b/deepspeed/zero1.json @@ -30,6 +30,7 @@ "warmup_min_lr": "auto", "warmup_max_lr": "auto", "warmup_num_steps": "auto", + "warmup_type": "linear", "total_num_steps": "auto" } }, diff --git a/deepspeed/zero2.json b/deepspeed/zero2.json index 40b32beead..18ec4bd0e3 100644 --- a/deepspeed/zero2.json +++ b/deepspeed/zero2.json @@ -34,6 +34,7 @@ "warmup_min_lr": "auto", "warmup_max_lr": "auto", "warmup_num_steps": "auto", + "warmup_type": "linear", "total_num_steps": "auto" } }, diff --git a/deepspeed/zero3.json b/deepspeed/zero3.json index b2b971dfe3..5da9c07e28 100644 --- a/deepspeed/zero3.json +++ b/deepspeed/zero3.json @@ -45,7 +45,8 @@ "params": { "warmup_min_lr": "auto", "warmup_max_lr": "auto", - "warmup_num_steps": "auto" + "warmup_num_steps": "auto", + "warmup_type": "linear" } }, "gradient_accumulation_steps": "auto", From 2bb1f9d4d7a0c80a1fb551485af11907b6f9e608 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Wed, 20 Sep 2023 22:00:45 -0400 Subject: [PATCH 3/4] update requirements to include deepspeed and update readme --- README.md | 19 +++++++++---------- requirements.txt | 1 + setup.py | 9 +++++++-- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index cdc37f3821..cec07ff1b1 100644 --- a/README.md +++ b/README.md @@ -87,10 +87,8 @@ git clone https://github.com/OpenAccess-AI-Collective/axolotl cd axolotl pip3 install packaging -pip3 install -e .[flash-attn] +pip3 install -e .[flash-attn,deepspeed] pip3 install -U git+https://github.com/huggingface/peft.git -# optionally install deepspeed -pip3 install deepspeed # finetune lora accelerate launch -m axolotl.cli.train examples/openllama-3b/lora.yml @@ -124,7 +122,7 @@ accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \ 3. 
Install axolotl along with python dependencies ```bash pip3 install packaging - pip3 install -e .[flash-attn] + pip3 install -e .[flash-attn,deepspeed] ``` - LambdaLabs @@ -160,7 +158,7 @@ accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \ cd axolotl pip3 install packaging - pip3 install -e .[flash-attn] + pip3 install -e .[flash-attn,deepspeed] pip3 install protobuf==3.20.3 pip3 install -U --ignore-installed requests Pillow psutil scipy ``` @@ -718,11 +716,6 @@ fsdp_config: fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer ``` -- llama Deepspeed -```yaml -deepspeed: deepspeed/zero3.json -``` - ##### Weights & Biases Logging - wandb options @@ -750,6 +743,12 @@ pip3 install deepspeed accelerate launch -m axolotl.cli.train examples/llama-2/config.py --deepspeed deepspeed/zero1.json ``` +or + +```yaml +deepspeed: deepspeed/zero1.json +``` + ### Inference Pass the appropriate flag to the train command: diff --git a/requirements.txt b/requirements.txt index 4ef9f5fd2d..e1c4401fa7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,6 +7,7 @@ peft @ git+https://github.com/huggingface/peft.git transformers @ git+https://github.com/huggingface/transformers.git bitsandbytes>=0.41.1 accelerate @ git+https://github.com/huggingface/accelerate +deepspeed addict evaluate fire diff --git a/setup.py b/setup.py index fca5088da1..3109d4af1a 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,12 @@ def parse_requirements(): # Handle custom index URLs _, url = line.split() _dependency_links.append(url) - elif "flash-attn" not in line and line and line[0] != "#": + elif ( + "flash-attn" not in line + and "deepspeed" not in line + and line + and line[0] != "#" + ): # Handle standard packages _install_requires.append(line) return _install_requires, _dependency_links @@ -35,7 +40,7 @@ def parse_requirements(): "flash-attn": [ "flash-attn>=2.2.1", ], - "extras": [ + "deepspeed": [ "deepspeed", ], }, From e7d8c46005d050d6dba8619c52200506e2442956 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 21 Sep 2023 21:58:18 -0400 Subject: [PATCH 4/4] don't need redundant deepspeed install instructions --- README.md | 3 --- examples/phi/README.md | 1 - 2 files changed, 4 deletions(-) diff --git a/README.md b/README.md index cec07ff1b1..6e91158934 100644 --- a/README.md +++ b/README.md @@ -737,9 +737,6 @@ for deepspeed is available at https://huggingface.co/docs/accelerate/main/en/usa We provide several default deepspeed JSON configurations for ZeRO stage 1, 2, and 3. ```shell -# install deepspeed if you haven't already -pip3 install deepspeed - accelerate launch -m axolotl.cli.train examples/llama-2/config.py --deepspeed deepspeed/zero1.json ``` diff --git a/examples/phi/README.md b/examples/phi/README.md index b315a991fd..1109db0b50 100644 --- a/examples/phi/README.md +++ b/examples/phi/README.md @@ -3,7 +3,6 @@ Due to some nuances with the phi code, please use deepspeed when training phi for full finetune. ```shell -# You may need to install deepspeed with `pip3 install deepspeed` accelerate launch -m axolotl.cli.train examples/phi/phi-ft.yml --deepspeed deepspeed/zero1.json # OR
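Per the README change in PATCH 3/4, the DeepSpeed config can also be selected in the training YAML instead of on the `accelerate launch` command line. A minimal sketch, assuming the key is added to `examples/phi/phi-ft.yml` (or any other axolotl config):

```yaml
# Equivalent to passing --deepspeed deepspeed/zero1.json on the command line
deepspeed: deepspeed/zero1.json
```

Either form ends up loading the same JSON file from the `deepspeed/` directory these patches update.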