From 257c25d5c9af61e8e36e10cf8805c3144093ffd1 Mon Sep 17 00:00:00 2001
From: Daniel King <43149077+dakinggg@users.noreply.github.com>
Date: Mon, 11 Mar 2024 22:07:46 -0700
Subject: [PATCH] Bump version to 0.6.0 (#1023)

---
 README.md                                     |  8 ++------
 llmfoundry/__init__.py                        | 13 ++-----------
 llmfoundry/models/layers/attention.py         |  7 -------
 llmfoundry/models/mpt/configuration_mpt.py    |  8 --------
 llmfoundry/models/mpt/modeling_mpt.py         |  9 +--------
 mcli/mcli-llama2-finetune.yaml                |  4 ++--
 setup.py                                      | 10 ++++------
 tests/models/layers/test_huggingface_flash.py | 15 +++------------
 8 files changed, 14 insertions(+), 60 deletions(-)

diff --git a/README.md b/README.md
index 16846c7aa8..7a97d400c6 100644
--- a/README.md
+++ b/README.md
@@ -132,9 +132,7 @@ We *strongly* recommend working with LLM Foundry inside a Docker container (see
 ```bash
 git clone https://github.com/mosaicml/llm-foundry.git
 cd llm-foundry
-pip install -e ".[gpu-flash2]" # or `pip install -e .` if no NVIDIA GPU.
-# Note: Currently, `pip install -e ".[gpu-flash2]"` installs Flash Attention v2, and `pip install -e ".[gpu]"` installs Flash Attention v1.
-# However, once the support for Flash Attention v1 is removed, both of these commands will install Flash Attention v2.
+pip install -e ".[gpu]" # or `pip install -e .` if no NVIDIA GPU.
 ```
 
 ### Without Docker (not recommended)
@@ -152,9 +150,7 @@ source llmfoundry-venv/bin/activate
 
 pip install cmake packaging torch # setup.py requires these be installed
 
-pip install -e ".[gpu-flash2]" # or `pip install -e .` if no NVIDIA GPU.
-# Note: Currently, `pip install -e ".[gpu-flash2]"` installs Flash Attention v2, and `pip install -e ".[gpu]"` installs Flash Attention v1.
-# However, once the support for Flash Attention v1 is removed, both of these commands will install Flash Attention v2.
+pip install -e ".[gpu]" # or `pip install -e .` if no NVIDIA GPU.
 ```
 
 ### TransformerEngine and amp_fp8 support
diff --git a/llmfoundry/__init__.py b/llmfoundry/__init__.py
index f65338c987..f4f0d68e9c 100644
--- a/llmfoundry/__init__.py
+++ b/llmfoundry/__init__.py
@@ -19,11 +19,6 @@
 
 hf_dynamic_modules_logger.addFilter(new_files_warning_filter)
 
-# Before importing any transformers models, we need to disable transformers flash attention if
-# we are in an environment with flash attention version <2. Transformers hard errors on a not properly
-# gated import otherwise.
-import transformers
-
 from llmfoundry import optim, utils
 from llmfoundry.data import (ConcatTokensDataset, MixtureOfDenoisersCollator,
                              NoConcatDataset, Seq2SeqFinetuningCollator,
@@ -33,8 +28,7 @@
                                   ComposerHFT5)
 from llmfoundry.models.layers.attention import (
     MultiheadAttention, attn_bias_shape, build_alibi_bias, build_attn_bias,
-    flash_attn_fn, is_flash_v1_installed,
-    scaled_multihead_dot_product_attention, triton_flash_attn_fn)
+    flash_attn_fn, scaled_multihead_dot_product_attention, triton_flash_attn_fn)
 from llmfoundry.models.layers.blocks import MPTBlock
 from llmfoundry.models.layers.ffn import FFN_CLASS_REGISTRY, MPTMLP, build_ffn
 from llmfoundry.models.model_registry import COMPOSER_MODEL_REGISTRY
@@ -42,9 +36,6 @@
                                    MPTForCausalLM, MPTModel, MPTPreTrainedModel)
 from llmfoundry.tokenizers import TiktokenTokenizerWrapper
 
-if is_flash_v1_installed():
-    transformers.utils.is_flash_attn_available = lambda: False
-
 __all__ = [
     'build_text_denoising_dataloader',
     'build_finetuning_dataloader',
@@ -77,4 +68,4 @@
     'TiktokenTokenizerWrapper',
 ]
 
-__version__ = '0.5.0'
+__version__ = '0.6.0'
diff --git a/llmfoundry/models/layers/attention.py b/llmfoundry/models/layers/attention.py
index 8fa8e6bc66..79780bccee 100644
--- a/llmfoundry/models/layers/attention.py
+++ b/llmfoundry/models/layers/attention.py
@@ -44,13 +44,6 @@ def check_alibi_support(attention_impl: str) -> bool:
         v2_version='v2.4.2')
 
 
-# Before importing any transformers models, we need to disable transformers flash attention if
-# we are in an environment with flash attention version <2. Transformers hard errors on a not properly
-# gated import otherwise.
-if is_flash_v1_installed():
-    import transformers
-    transformers.utils.is_flash_attn_available = lambda: False
-
 from transformers.models.llama.modeling_llama import apply_rotary_pos_emb
 
 
diff --git a/llmfoundry/models/mpt/configuration_mpt.py b/llmfoundry/models/mpt/configuration_mpt.py
index 68fe5befd5..b5a099002e 100644
--- a/llmfoundry/models/mpt/configuration_mpt.py
+++ b/llmfoundry/models/mpt/configuration_mpt.py
@@ -9,7 +9,6 @@
 from transformers import PretrainedConfig
 
 from llmfoundry.models.layers.attention import (check_alibi_support,
-                                                is_flash_v1_installed,
                                                 is_flash_v2_installed)
 from llmfoundry.models.layers.blocks import attn_config_defaults
 
@@ -230,13 +229,6 @@ def _validate_config(self) -> None:
             raise NotImplementedError(
                 'prefix_lm only implemented with torch and triton attention.')
 
-        if self.attn_config['attn_impl'] == 'flash' and is_flash_v1_installed():
-            warnings.warn(
-                VersionedDeprecationWarning(
-                    'Support for Flash Attention v1 is deprecated. Please upgrade to Flash Attention v2.4.2. To install Flash Attention v2.4.2, please run `pip install -e ".[gpu-flash2]"` from the root directory of the llm-foundry repository.',
-                    remove_version='0.6.0',
-                ))
-
         if self.attn_config[
                 'attn_impl'] == 'triton' and not self.attn_config['prefix_lm']:
             warnings.warn(
diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py
index ff33990d7a..0a5e2ac13d 100644
--- a/llmfoundry/models/mpt/modeling_mpt.py
+++ b/llmfoundry/models/mpt/modeling_mpt.py
@@ -27,8 +27,7 @@
 from composer.utils import dist
 
 from llmfoundry.metrics import TokenAccuracy
-from llmfoundry.models.layers.attention import (is_flash_v1_installed,
-                                                is_flash_v2_installed)
+from llmfoundry.models.layers.attention import is_flash_v2_installed
 from llmfoundry.models.layers.norm import NORM_CLASS_REGISTRY
 
 if is_flash_v2_installed():
@@ -39,12 +38,6 @@
     except Exception as e:
         raise e
 
-if is_flash_v1_installed():
-    try: # This try...except is needed because transformers requires it despite the 'if' statement above
-        from flash_attn import bert_padding
-    except Exception as e:
-        raise e
-
 from omegaconf import DictConfig
 from omegaconf import OmegaConf as om
 from transformers import PreTrainedModel, PreTrainedTokenizerBase
diff --git a/mcli/mcli-llama2-finetune.yaml b/mcli/mcli-llama2-finetune.yaml
index 36de709aed..091fc5a84e 100644
--- a/mcli/mcli-llama2-finetune.yaml
+++ b/mcli/mcli-llama2-finetune.yaml
@@ -38,7 +38,7 @@ parameters:
     pretrained: true
     # Note: you must have set the HUGGING_FACE_HUB_TOKEN environment variable and have access to the llama2 models
     use_auth_token: true
-    attention_patch_type: triton
+    use_flash_attention_2: true
 
   # Tokenizer
   tokenizer:
@@ -62,7 +62,7 @@ parameters:
       # # Or use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...`
       # # to profile this run's optimal packing_ratio as it depends on GPU count,
       # # batch size, sequence length
-      # packing_ratio:
+      # packing_ratio: auto
     drop_last: true
     num_workers: 8
     pin_memory: false
diff --git a/setup.py b/setup.py
index 7534d24503..4ecd34861a 100644
--- a/setup.py
+++ b/setup.py
@@ -3,6 +3,7 @@
 
 """MosaicML LLM Foundry package setup."""
 
+import copy
 import os
 import re
 
@@ -98,16 +99,13 @@
     'mosaicml[tensorboard]>=0.20.1,<0.21',
 ]
 
-extra_deps['gpu'] = [
-    'flash-attn==1.0.9',
-    # PyPI does not support direct dependencies, so we remove this line before uploading from PyPI
-    'xentropy-cuda-lib@git+https://github.com/HazyResearch/flash-attention.git@v1.0.9#subdirectory=csrc/xentropy',
-]
-
+# Flash 2 group kept for backwards compatibility
 extra_deps['gpu-flash2'] = [
     'flash-attn==2.5.0',
 ]
 
+extra_deps['gpu'] = copy.deepcopy(extra_deps['gpu-flash2'])
+
 extra_deps['peft'] = [
     'mosaicml[peft]>=0.20.1,<0.21',
 ]
diff --git a/tests/models/layers/test_huggingface_flash.py b/tests/models/layers/test_huggingface_flash.py
index 411aab77a2..e9c0d32877 100644
--- a/tests/models/layers/test_huggingface_flash.py
+++ b/tests/models/layers/test_huggingface_flash.py
@@ -12,23 +12,14 @@
 from composer.utils import reproducibility
 from omegaconf import DictConfig
 from omegaconf import OmegaConf as om
+from transformers.models.llama.modeling_llama import LlamaAttention
 
 from llmfoundry import COMPOSER_MODEL_REGISTRY
 from llmfoundry.models.hf.hf_fsdp import rgetattr
-from llmfoundry.models.layers.attention import (is_flash_v1_installed,
-                                                is_flash_v2_installed)
-from llmfoundry.utils.builders import build_tokenizer
-
-# Before importing any transformers models, we need to disable transformers flash attention if
-# we are in an environment with flash attention version <2. Transformers hard errors on a not properly
-# gated import otherwise.
-if is_flash_v1_installed():
-    transformers.utils.is_flash_attn_available = lambda: False
-
-from transformers.models.llama.modeling_llama import LlamaAttention
-
+from llmfoundry.models.layers.attention import is_flash_v2_installed
 from llmfoundry.models.layers.llama_attention_monkeypatch import (
     llama_attention_patch_torch, llama_attention_patch_triton)
+from llmfoundry.utils.builders import build_tokenizer
 
 
 @pytest.mark.parametrize('patch_fn_name', ['torch', 'triton'])