[core] Integrate Flash attention 2 in most used models #25598
I'd add a link to https://github.com/huggingface/transformers/issues/new
Maybe also print the version they currently have installed.
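A hedged sketch of what such a check could look like; the function name and minimum version below are placeholders, not the PR's actual code:

```python
import importlib.metadata

from packaging import version


def _check_flash_attn_version(minimum: str = "2.0.0") -> None:
    """Raise with the installed flash-attn version included in the message."""
    installed = importlib.metadata.version("flash_attn")
    if version.parse(installed) < version.parse(minimum):
        raise ImportError(
            f"Flash Attention 2 requires flash-attn>={minimum}, "
            f"but flash-attn=={installed} is installed. Please upgrade it."
        )
```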
Makes sense!
Should use `module.named_children()` instead of `module.named_modules()`: `named_modules()` also yields the module itself under a blank name.
Is this the place where we copy the original weights into the new module?
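For reference, a hedged sketch of what such a weight transfer typically looks like when swapping a submodule for its Flash Attention counterpart; the helper name is hypothetical and not taken from the PR:

```python
import torch.nn as nn


def swap_submodule(parent: nn.Module, name: str, new_module: nn.Module) -> None:
    old_module = getattr(parent, name)
    # Carry the trained parameters over so the swap does not change the outputs.
    new_module.load_state_dict(old_module.state_dict(), strict=False)
    setattr(parent, name, new_module)
```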