From 20da9291b28be8b8e06cfa0d1007aa9303d2946c Mon Sep 17 00:00:00 2001
From: NanoCode012
Date: Sat, 7 Oct 2023 00:11:19 +0900
Subject: [PATCH 1/3] Fix: Higher vram usage for mistral and sample_packing

---
 src/axolotl/utils/models.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
index aa6049bd3e..a5d1ebd4cb 100644
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -81,7 +81,8 @@ def load_tokenizer(cfg):
         tokenizer.add_special_tokens({"pad_token": "[PAD]"})
         os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
-    if cfg.is_mistral_derived_model:
+    # Mistral's FA requires left padding
+    if cfg.is_mistral_derived_model and cfg.flash_attention and not cfg.sample_packing:
         tokenizer.padding_side = "left"
 
     if cfg.special_tokens:

From ec9a1170488e80a472ba2ebfc8078c5072ae11d2 Mon Sep 17 00:00:00 2001
From: NanoCode012
Date: Sat, 7 Oct 2023 00:12:58 +0900
Subject: [PATCH 2/3] chore: update comment

---
 src/axolotl/utils/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
index a5d1ebd4cb..2c60f00c2b 100644
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -81,7 +81,7 @@ def load_tokenizer(cfg):
         tokenizer.add_special_tokens({"pad_token": "[PAD]"})
         os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
-    # Mistral's FA requires left padding
+    # Mistral's official FA implementation requires left padding
     if cfg.is_mistral_derived_model and cfg.flash_attention and not cfg.sample_packing:
         tokenizer.padding_side = "left"
 
     if cfg.special_tokens:

From 1b2886dfad9c44bdd007ed648ced3ccabfa5e500 Mon Sep 17 00:00:00 2001
From: NanoCode012
Date: Sat, 7 Oct 2023 00:43:17 +0900
Subject: [PATCH 3/3] chore: lint

---
 examples/mistral/qlora.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/mistral/qlora.yml b/examples/mistral/qlora.yml
index 5a131c5f36..9c64a8c2dd 100644
--- a/examples/mistral/qlora.yml
+++ b/examples/mistral/qlora.yml
@@ -36,10 +36,10 @@ lora_target_modules:
   - k_proj
   - o_proj
 
-wandb_project: 
-wandb_entity: 
+wandb_project:
+wandb_entity:
 wandb_watch:
-wandb_run_id: 
+wandb_run_id:
 wandb_log_model:
 
 gradient_accumulation_steps: 4
@@ -76,4 +76,4 @@ fsdp_config:
 special_tokens:
   bos_token: "<s>"
   eos_token: "</s>"
-  unk_token: "<unk>"
\ No newline at end of file
+  unk_token: "<unk>"
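
Note (not part of the patch series): a minimal sketch of the tokenizer-padding behaviour the commits above introduce. The cfg attribute names mirror the axolotl options touched by the diff; the configure_padding_side helper and the SimpleNamespace stand-ins are illustrative assumptions, not axolotl API.

    from types import SimpleNamespace

    def configure_padding_side(tokenizer, cfg):
        # Per the patch: Mistral's official flash-attention implementation expects
        # left padding, but forcing left padding together with sample_packing led to
        # the higher VRAM usage this series fixes, so left padding is only applied
        # when flash attention is enabled and sample packing is disabled.
        if cfg.is_mistral_derived_model and cfg.flash_attention and not cfg.sample_packing:
            tokenizer.padding_side = "left"
        return tokenizer

    # Packed Mistral training keeps the tokenizer's default padding side.
    cfg = SimpleNamespace(is_mistral_derived_model=True, flash_attention=True, sample_packing=True)
    tok = SimpleNamespace(padding_side="right")
    print(configure_padding_side(tok, cfg).padding_side)  # -> right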