use flash_attn xentropy when available
tmm1 committed Sep 4, 2023
1 parent 09f1543 commit e37d3b2
Showing 1 changed file with 14 additions and 0 deletions.
src/axolotl/monkeypatch/llama_attn_hijack_flash.py (14 additions, 0 deletions)
@@ -2,7 +2,9 @@
 
 # copied from https://github.com/lm-sys/FastChat/blob/main/fastchat/train/llama_flash_attn_monkey_patch.py
 
+import logging
 import warnings
+from functools import partial
 from typing import List, Optional, Tuple, Union
 
 import torch
@@ -33,6 +35,9 @@
 )
 
 
+LOG = logging.getLogger("axolotl")
+
+
 def replace_llama_attn_with_flash_attn(packed: Optional[bool] = False):
     transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = (  # pylint: disable=protected-access
         _prepare_decoder_attention_mask
@@ -43,6 +48,15 @@ def replace_llama_attn_with_flash_attn(packed: Optional[bool] = False):
         transformers.models.llama.modeling_llama.LlamaModel.forward = (
             llama_model_forward
         )
+    try:
+        from flash_attn.losses.cross_entropy import CrossEntropyLoss
+
+        LOG.info("patching with flash_attention.losses.cross_entropy")
+        transformers.models.llama.modeling_llama.CrossEntropyLoss = partial(
+            CrossEntropyLoss, inplace_backward=True
+        )
+    except ImportError:
+        pass
 
 
 # Disable the transformation of the attention mask in LlamaModel as the flash attention
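Below is a minimal usage sketch, not part of the commit, showing how the monkeypatch is intended to be applied before model construction and how a caller might check whether the fused loss was swapped in. The isinstance check and the print statements are illustrative assumptions; the commit itself only replaces the module-level CrossEntropyLoss symbol and silently falls back when flash_attn is not importable.

# Minimal sketch (not from the commit): apply the monkeypatch before the model
# is built so modeling_llama resolves the (possibly) patched CrossEntropyLoss.
import functools

import transformers
from axolotl.monkeypatch.llama_attn_hijack_flash import (
    replace_llama_attn_with_flash_attn,
)

replace_llama_attn_with_flash_attn()  # packed defaults to False

patched = transformers.models.llama.modeling_llama.CrossEntropyLoss
if isinstance(patched, functools.partial):
    # flash_attn's xentropy kernel was importable: the symbol now wraps
    # flash_attn.losses.cross_entropy.CrossEntropyLoss with inplace_backward=True,
    # which computes the logits gradient in place to save memory.
    print("using flash_attn fused cross entropy:", patched.func)
else:
    # flash_attn (or its xentropy extension) is missing: nothing was patched,
    # and the loss is still torch.nn.CrossEntropyLoss.
    print("using torch.nn.CrossEntropyLoss")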
