From 5fe30b1497f6ebe75b46ee32f3b4c99b4cc9d672 Mon Sep 17 00:00:00 2001 From: Aman Gupta Karmani Date: Mon, 4 Sep 2023 17:49:16 -0400 Subject: [PATCH] use flash_attn xentropy when available (#525) * use flash_attn xentropy when available * log when xentropy is not found --- .../monkeypatch/llama_attn_hijack_flash.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/axolotl/monkeypatch/llama_attn_hijack_flash.py b/src/axolotl/monkeypatch/llama_attn_hijack_flash.py index cb0aa3fe6f..b0163a6556 100644 --- a/src/axolotl/monkeypatch/llama_attn_hijack_flash.py +++ b/src/axolotl/monkeypatch/llama_attn_hijack_flash.py @@ -2,7 +2,9 @@ # copied from https://github.com/lm-sys/FastChat/blob/main/fastchat/train/llama_flash_attn_monkey_patch.py +import logging import warnings +from functools import partial from typing import List, Optional, Tuple, Union import torch @@ -33,6 +35,9 @@ ) +LOG = logging.getLogger("axolotl") + + def replace_llama_attn_with_flash_attn(packed: Optional[bool] = False): transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = ( # pylint: disable=protected-access _prepare_decoder_attention_mask @@ -44,6 +49,18 @@ def replace_llama_attn_with_flash_attn(packed: Optional[bool] = False): llama_model_forward ) + try: + from flash_attn.losses.cross_entropy import CrossEntropyLoss + + LOG.info("patching with flash_attn.losses.cross_entropy") + transformers.models.llama.modeling_llama.CrossEntropyLoss = partial( + CrossEntropyLoss, inplace_backward=True + ) + except ImportError: + LOG.info( + "optimized flash-attention CrossEntropyLoss not found (run `pip install git+https://github.com/Dao-AILab/flash-attention.git#egg=xentropy_cuda_lib&subdirectory=csrc/xentropy`)" + ) + # Disable the transformation of the attention mask in LlamaModel as the flash attention # requires the attention mask to be the same as the key_padding_mask