diff --git a/README.md b/README.md index c9b935f53d..6e91158934 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,7 @@ Features: - [How to Use Custom Pretokenized Dataset](#how-to-use-your-custom-pretokenized-dataset) - [Config](#config) - [Train](#train) + - [Training w/ Deepspeed](#training-with-deepspeed) - [Inference](#inference) - [Merge LORA to Base](#merge-lora-to-base) - [Common Errors](#common-errors-) @@ -86,7 +87,7 @@ git clone https://github.com/OpenAccess-AI-Collective/axolotl cd axolotl pip3 install packaging -pip3 install -e .[flash-attn] +pip3 install -e .[flash-attn,deepspeed] pip3 install -U git+https://github.com/huggingface/peft.git # finetune lora @@ -121,7 +122,7 @@ accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \ 3. Install axolotl along with python dependencies ```bash pip3 install packaging - pip3 install -e .[flash-attn] + pip3 install -e .[flash-attn,deepspeed] ``` - LambdaLabs @@ -157,7 +158,7 @@ accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \ cd axolotl pip3 install packaging - pip3 install -e .[flash-attn] + pip3 install -e .[flash-attn,deepspeed] pip3 install protobuf==3.20.3 pip3 install -U --ignore-installed requests Pillow psutil scipy ``` @@ -715,11 +716,6 @@ fsdp_config: fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer ``` -- llama Deepspeed -```yaml -deepspeed: deepspeed/zero3.json -``` - ##### Weights & Biases Logging - wandb options @@ -732,6 +728,24 @@ wandb_run_id: wandb_log_model: ``` +### Training with Deepspeed + +DeepSpeed is an optimization suite for multi-GPU systems, allowing you to train much larger models than you +might typically be able to fit into your GPU's VRAM. More information about the various optimization types +for DeepSpeed is available at https://huggingface.co/docs/accelerate/main/en/usage_guides/deepspeed#what-is-integrated + +We provide several default DeepSpeed JSON configurations for ZeRO stage 1, 2, and 3. 
+ +```shell +accelerate launch -m axolotl.cli.train examples/llama-2/config.yml --deepspeed deepspeed/zero1.json +``` + +or + +```yaml +deepspeed: deepspeed/zero1.json +``` + ### Inference Pass the appropriate flag to the train command: diff --git a/deepspeed/zero1.json b/deepspeed/zero1.json index 39d648b2b5..85bc586091 100644 --- a/deepspeed/zero1.json +++ b/deepspeed/zero1.json @@ -1,39 +1,41 @@ { - "zero_optimization": { - "stage": 1, - "overlap_comm": true - }, - "bf16": { - "enabled": "auto" - }, - "fp16": { - "enabled": "auto", - "auto_cast": false, - "loss_scale": 0, - "initial_scale_power": 32, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - "optimizer": { - "type": "AdamW", - "params": { - "lr": "auto", - "betas": "auto", - "eps": "auto", - "weight_decay": "auto" - } - }, - "scheduler": { - "type": "WarmupDecayLR", - "params": { - "warmup_min_lr": "auto", - "warmup_max_lr": "auto", - "warmup_num_steps": "auto", - "total_num_steps": "auto" - } - }, - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "wall_clock_breakdown": false + "zero_optimization": { + "stage": 1, + "overlap_comm": true + }, + "bf16": { + "enabled": "auto" + }, + "fp16": { + "enabled": "auto", + "auto_cast": false, + "loss_scale": 0, + "initial_scale_power": 32, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + "scheduler": { + "type": "WarmupDecayLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto", + "warmup_type": "linear", + "total_num_steps": "auto" + } + }, + "gradient_accumulation_steps": "auto", + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false } diff --git a/deepspeed/zero2.json b/deepspeed/zero2.json index 16f2710f69..18ec4bd0e3 100644 --- a/deepspeed/zero2.json 
+++ b/deepspeed/zero2.json @@ -1,43 +1,45 @@ { - "zero_optimization": { - "stage": 2, - "offload_optimizer": { - "device": "cpu" - }, - "contiguous_gradients": true, - "overlap_comm": true + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "cpu" }, - "bf16": { - "enabled": "auto" - }, - "fp16": { - "enabled": "auto", - "auto_cast": false, - "loss_scale": 0, - "initial_scale_power": 32, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - "optimizer": { - "type": "AdamW", - "params": { - "lr": "auto", - "betas": "auto", - "eps": "auto", - "weight_decay": "auto" - } - }, - "scheduler": { - "type": "WarmupDecayLR", - "params": { - "warmup_min_lr": "auto", - "warmup_max_lr": "auto", - "warmup_num_steps": "auto", - "total_num_steps": "auto" - } - }, - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "wall_clock_breakdown": false + "contiguous_gradients": true, + "overlap_comm": true + }, + "bf16": { + "enabled": "auto" + }, + "fp16": { + "enabled": "auto", + "auto_cast": false, + "loss_scale": 0, + "initial_scale_power": 32, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + "scheduler": { + "type": "WarmupDecayLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto", + "warmup_type": "linear", + "total_num_steps": "auto" + } + }, + "gradient_accumulation_steps": "auto", + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false } diff --git a/deepspeed/zero3.json b/deepspeed/zero3.json index 3f5dff3302..5da9c07e28 100644 --- a/deepspeed/zero3.json +++ b/deepspeed/zero3.json @@ -45,9 +45,11 @@ "params": { "warmup_min_lr": "auto", "warmup_max_lr": "auto", - "warmup_num_steps": "auto" + "warmup_num_steps": "auto", + "warmup_type": "linear" } }, + 
"gradient_accumulation_steps": "auto", "train_batch_size": "auto", "train_micro_batch_size_per_gpu": "auto", "wall_clock_breakdown": false diff --git a/requirements.txt b/requirements.txt index 1e95b716ec..5aba20b161 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,6 +7,7 @@ peft @ git+https://github.com/huggingface/peft.git transformers @ git+https://github.com/huggingface/transformers.git bitsandbytes>=0.41.1 accelerate @ git+https://github.com/huggingface/accelerate +deepspeed addict evaluate fire diff --git a/setup.py b/setup.py index fca5088da1..3109d4af1a 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,12 @@ def parse_requirements(): # Handle custom index URLs _, url = line.split() _dependency_links.append(url) - elif "flash-attn" not in line and line and line[0] != "#": + elif ( + "flash-attn" not in line + and "deepspeed" not in line + and line + and line[0] != "#" + ): # Handle standard packages _install_requires.append(line) return _install_requires, _dependency_links @@ -35,7 +40,7 @@ def parse_requirements(): "flash-attn": [ "flash-attn>=2.2.1", ], - "extras": [ + "deepspeed": [ "deepspeed", ], },