From 086e48b1ac36f6056175fabd14cc57fabe00a63b Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sat, 30 Mar 2024 12:00:46 -0400 Subject: [PATCH] update to use v2 of the converted model --- examples/dbrx/16bit-lora.yaml | 6 ++++-- examples/dbrx/README.md | 13 ++++++++++--- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/examples/dbrx/16bit-lora.yaml b/examples/dbrx/16bit-lora.yaml index 4b770025b3..55754191c6 100644 --- a/examples/dbrx/16bit-lora.yaml +++ b/examples/dbrx/16bit-lora.yaml @@ -1,4 +1,4 @@ -base_model: LnL-AI/dbrx-base-converted +base_model: LnL-AI/dbrx-base-converted-v2 trust_remote_code: true load_in_8bit: false @@ -29,7 +29,9 @@ lora_alpha: 16 lora_dropout: 0.05 # w1, w2, & v1 will hang the trainer lora_target_modules: - - Wqkv # attn + - q_proj # attn + - k_proj # attn + - v_proj # attn - out_proj # attn - layer # router # - w1 diff --git a/examples/dbrx/README.md b/examples/dbrx/README.md index b391af9b74..2efeee3f19 100644 --- a/examples/dbrx/README.md +++ b/examples/dbrx/README.md @@ -1,18 +1,25 @@ # DBRX MoE -Currently, for LoRA, only the `Wqkv`, `out_proj` and `layer` Linear layers are trainable. +Currently, for LoRA, only the `q_proj`, `k_proj`, `v_proj`, `out_proj` and `layer` Linear layers are trainable. We are using the "converted" base models based on [this issue](https://huggingface.co/databricks/dbrx-instruct/discussions/10) where the Experts are fused as an `nn.Parameter` rather than a `nn.Linear` layer. However, the implementation is still a bit buggy and attempting to train a LoRA adapter over those `w1`, `w2` and `v1` layers results in the trainer hanging. -We recommend using the [`LnL-AI/dbrx-base-converted`](https://huggingface.co/LnL-AI/dbrx-base-converted) model as your base model for the time being. +We recommend using the [`LnL-AI/dbrx-base-converted-v2`](https://huggingface.co/LnL-AI/dbrx-base-converted-v2) model as your base model for the time being. 
+### FSDP +The high memory usage seen w/ FSDP is due to FSDP not supporting 8-bit optimizers. - 16-bit LoRA w/ FSDP - - ✅ w/o CPU Offload - 8x80GB uses ~62GiB/gpu + - ✅ w/o CPU Offload - 8x80GB uses ~80GiB/gpu - ❌ w/ CPU Offload - `paged_adamw_8bit` optimizer errors from being on cpu - ❓ 8-bit LoRA w/ FSDP - WIP, need to handle loading 8-bit quantized weights - ❌ 4-bit QLoRA w/ FSDP - errors w/: `Error an illegal memory access was encountered at line 90 in file /src/csrc/ops.cu` - ✅ bf16 full finetune w/ FSDP, freezing all but first 8 layers (8x80GB uses ~78GiB/gpu) + + +### DeepSpeed + +WIP